In [27]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [28]:
#function to extract product title
def is_advertisement(link):
    ad_keywords = ['ref=', 'pd_rd', 'content-id', 'aax-us-iad', 'redirect']
    for keyword in ad_keywords:
        if keyword in link:
            return True

    return False

def get_title(soup):

    try:
        title = soup.find("span", attrs={"id":'productTitle'}) #outer tag object

        title_value = title.text #title as a string value

        title_string = title_value.strip() #function to extract product title

    except AttributeError:
        title_string = ""
    
    return title_string

#function to extract product Price
def get_price(soup):

    try: 
        price = soup.find("span", attrs={'id':'priceblock_ourprice'}).string.strip()
    
    except AttributeError:
        price =""
    
    return price

#Function to extract Product Rating
def get_rating(soup):
    
    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()

    except AttributeError:
        try:
            rating = soup.find("span", attrs ={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""
    return rating



#Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={"id":'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""
    
    return review_count



#Function to extract Availability Status
def get_availability(soup):

    try:
        available = soup.find("div", attrs={"id","availiblity"}).find("span").string.strip()
    
    except AttributeError:
        available = "Not Available"
    
    return available



In [29]:
if __name__ == '__main__':

    #add user agent
  HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept-Language' : 'en-US, en;q=0.5'})

    #The webpage URL
  URL = "https://www.amazon.com/s?k=ps5&crid=ORGXEVG0D6L2&sprefix=ps5%2Caps%2C152&ref=nb_sb_noss_1"

    # HTTP Request
  webpage = requests.get(URL, headers=HEADERS)
   
   #soup Object containing all data
  soup = BeautifulSoup(webpage.content, "html.parser")

   #Fetch links as List of Tag Objects
  links = soup.find_all("a", attrs={'class':"a-link-normal s-no-outline"})

   #store the links
  links_list = []

   #loop for extracting links from Tag Objects
  for link in links:
    href = link.get('href')
    if href and not is_advertisement(href):
        links_list.append(href)

    
  d = {"title":[], "price":[], "rating":[], "reviews":[], "availability":[]}
   
   #loop for extracting product details from each link
  # for link in links_list:
  #     # new_webpage = requests.get("https://amazon.com" + link, headers=HEADERS)
  #     new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)

  
  for link in links_list:
      
      full_url = "https://www.amazon.com" + link
      print(full_url)  # Debugging: print the URL being accessed
      new_webpage = requests.get(full_url, headers=HEADERS)


      new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to display all necessary product information
      d['title'].append(get_title(new_soup))
      d['price'].append(get_price(new_soup))
      d['rating'].append(get_rating(new_soup))
      d['reviews'].append(get_review_count(new_soup))
      d['availability'].append(get_availability(new_soup))

    
  amazon_df =pd.DataFrame.from_dict(d)
  amazon_df['title'].replace('', np.nan, inplace=True)
  amazon_df = amazon_df.dropna(subset=['title'])
  amazon_df.to_csv("amazon_data.csv", header=True, index=False)
   



https://www.amazon.com/sspa/click?ie=UTF8&spc=MTozNzk2OTQxMzA3MDIzMDI3OjE3MDY0MzM4Nzk6c3BfYXRmOjMwMDAwMTAyODc2NjgwMjo6MDo6&url=%2FAvatar-Frontiers-Pandora-Limited-PlayStation-5%2Fdp%2FB0C7SM1W4K%2Fref%3Dsr_1_1_sspa%3Fcrid%3DORGXEVG0D6L2%26keywords%3Dps5%26qid%3D1706433879%26sprefix%3Dps5%252Caps%252C152%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1
https://www.amazon.com/sspa/click?ie=UTF8&spc=MTozNzk2OTQxMzA3MDIzMDI3OjE3MDY0MzM4Nzk6c3BfYXRmOjMwMDEwNjExMTkzMTIwMjo6MDo6&url=%2Fwabracket-Accessories-Compatiable-Version-Charging%2Fdp%2FB0CPQ2H6HX%2Fref%3Dsr_1_2_sspa%3Fcrid%3DORGXEVG0D6L2%26keywords%3Dps5%26qid%3D1706433879%26sprefix%3Dps5%252Caps%252C152%26sr%3D8-2-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1
https://www.amazon.com/sspa/click?ie=UTF8&spc=MTozNzk2OTQxMzA3MDIzMDI3OjE3MDY0MzM4Nzk6c3BfbXRmOjMwMDAxOTI1MTI0NjQwMjo6MDo6&url=%2FPlaystation-Detachable-Controller-Headphone-Dissipation-5%2Fdp%2FB0C18YY48J%2Fref%3Dsr_1_12_sspa%3Fcrid%3DORGXEVG0D6L2%26keywords%3D

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


In [30]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability
0,Avatar: Frontiers of Pandora - Limited Edition...,,4.4 out of 5 stars,226 ratings,Not Available
1,wabracket All -in -One with 11 Accessories Wal...,,5.0 out of 5 stars,2 ratings,Not Available
2,"Wall Mount Holder for Playstation 5(PS5), 5 in...",,4.5 out of 5 stars,319 ratings,Not Available
3,"Travel Backpack for PS5, Protective Carrying C...",,1.0 out of 5 stars,2 ratings,Not Available
4,"KAPEYDESI Wireless Gaming Headset for PC, PS5,...",,4.8 out of 5 stars,267 ratings,Not Available
5,Faceplate for PS5 Disc Edition with Cooling Ve...,,4.9 out of 5 stars,16 ratings,Not Available
