In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [2]:
# Function to extract Product Name
def get_name(soup):
    """Return the product title found on a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The text of the ``<span id="productTitle">`` element with leading and
        trailing whitespace removed, or ``""`` when the lookup raises
        ``AttributeError`` (for example because ``find`` returned ``None``).
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        # No usable title element on this page.
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the product price string found on a parsed product page.

    The ``<span class="a-price">`` element is tried first; if that lookup
    raises ``AttributeError`` (no such element, or its ``.string`` is
    ``None``), ``<span class="a-offscreen">`` is tried instead.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped price text, or ``""`` when neither lookup yields a string.

    Only ``AttributeError`` is treated as "price not found". Anything else
    (including ``KeyboardInterrupt``) propagates to the caller instead of
    being silently turned into an empty price.
    """
    for css_class in ('a-price', 'a-offscreen'):
        try:
            return soup.find("span", attrs={'class': css_class}).string.strip()
        except AttributeError:
            # Element missing or without a single string child: try the next one.
            continue
    return ""

# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text found on a parsed product page.

    The ``<i class="a-icon a-icon-star a-star-4-5">`` element is tried first;
    if that lookup raises ``AttributeError`` (no such element, or its
    ``.string`` is ``None``), ``<span class="a-icon-alt">`` is tried instead.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped rating text, or ``""`` when neither lookup yields a string.

    Only ``AttributeError`` is treated as "rating not found". Anything else
    (including ``KeyboardInterrupt``) propagates to the caller instead of
    being silently turned into an empty rating.
    """
    lookups = (
        ("i", 'a-icon a-icon-star a-star-4-5'),
        ("span", 'a-icon-alt'),
    )
    for tag_name, css_class in lookups:
        try:
            return soup.find(tag_name, attrs={'class': css_class}).string.strip()
        except AttributeError:
            # Element missing or without a single string child: try the next one.
            continue
    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text found on a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped ``.string`` of the ``<span id="acrCustomerReviewText">``
        element, or ``""`` when the lookup raises ``AttributeError``.
    """
    try:
        count_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        count_text = count_tag.string
        return count_text.strip()
    except AttributeError:
        # Element missing, or it has no single string child.
        return ""


#Function to extract Description
def get_description(soup):
    """Return the bullet-list description text found on a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text of the first
        ``<ul class="a-unordered-list a-vertical a-spacing-mini">`` element,
        or ``""`` when the lookup raises ``AttributeError``.
    """
    bullet_list_class = "a-unordered-list a-vertical a-spacing-mini"
    try:
        bullet_list = soup.find("ul", attrs={"class": bullet_list_class})
        return bullet_list.text.strip()
    except AttributeError:
        # No description list on this page.
        return ""

#Function to extract Product Details
def get_details(soup):
    """Return the product detail bullets as ``"label: value"`` lines.

    Walks every ``<li>`` inside ``<div id="detailBullets_feature_div">``; for
    each ``<span class="a-list-item">`` in it, the first nested ``<span>``
    supplies the label and the second supplies the value.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        One ``"label: value\\n"`` line per detail entry, concatenated into a
        single string. Returns ``""`` when the details container is missing
        (any ``AttributeError`` during the walk yields ``""``).

    Entries with fewer than two nested spans are skipped, so a single
    malformed bullet no longer raises ``IndexError`` and aborts the caller.
    """
    try:
        details_div = soup.find("div", attrs={"id": "detailBullets_feature_div"})
        lines = []
        for list_item in details_div.find_all('li'):
            for entry in list_item.find_all('span', attrs={'class': 'a-list-item'}):
                spans = entry.find_all('span')
                if len(spans) < 2:
                    # No label/value pair in this entry; nothing to record.
                    continue
                # The label text is cut at the first double space to drop the
                # trailing separator characters that follow the label.
                label = spans[0].text.split('  ')[0].strip()
                value = spans[1].text.strip()
                lines.append(label + ": " + value + "\n")
        details = "".join(lines)

    except AttributeError:
        details = ""

    return details


In [None]:
if __name__ == '__main__':

    # Browser-like headers sent with every request.
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # URL of the first search-results page.
    baseURL = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%252+C283&ref=sr_pg_1"

    # Seconds to wait for a response; without a timeout a stalled
    # connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30
    # Consecutive failed search-page requests tolerated before giving up.
    MAX_PAGE_RETRIES = 5

    # Column name -> list of values, one entry per scraped product.
    d = {"Name":[], "URL":[], "Price":[], "Rating":[], "Reviews":[],"Description":[], "ASIN":[], "Details":[],"Manufacturer":[]}

    count = 0            # product links discovered so far
    i = 0                # search-result pages fetched so far
    failed_attempts = 0  # consecutive failed search-page requests

    # Keep fetching search pages until at least 200 product links were found.
    while(count<200):

        if(i==0):
            page_url = baseURL
        else:
            # baseURL (ref=sr_pg_1) already served as the first page, so the
            # i-th follow-up request must ask for page i + 1; asking for
            # page i would fetch the first page a second time.
            page_url = "https://www.amazon.in/s?k=bags&page={}&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%252+C283&ref=sr_pg_{}".format(i + 1, i + 1)

        # HTTP request for the search page (the first page is guarded too).
        try:
            webpage = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            failed_attempts += 1
            if failed_attempts >= MAX_PAGE_RETRIES:
                print("Giving up on the search page after {} failed attempts.".format(failed_attempts))
                break
            print("Connection Error! Sleeping for 5 sec ... ")
            time.sleep(5)
            continue
        failed_attempts = 0
        i += 1

        # Soup object containing all data of the search page.
        soup = BeautifulSoup(webpage.content, "html.parser")

        # Product links; anchors without an href are ignored because they
        # cannot be turned into a product URL.
        links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
        links_list = [link.get('href') for link in links if link.get('href')]

        if not links_list:
            # A page without product links never advances `count`, so
            # continuing would loop forever.
            print("No product links found on search page {}; stopping.".format(i))
            break

        count += len(links_list)

        # Fetch every product page and extract its fields.
        for link in links_list:
            new_url = "https://www.amazon.in" + link
            try:
                new_webpage = requests.get(new_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            except requests.exceptions.RequestException:
                # Best effort: skip this product and carry on with the next.
                print("Connection Error! Sleeping for 5 seconds ...")
                time.sleep(5)
                continue
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            details = get_details(new_soup)

            # Pull ASIN and manufacturer out of the "label: value" lines.
            prod_asin = "Not found"
            prod_manufacturer = "Not found"
            for detail in details.split("\n"):
                if "ASIN" in detail:
                    try:
                        prod_asin = detail.split(":")[1].strip()
                    except IndexError:
                        # Line mentions ASIN but has no ":" separator.
                        prod_asin = ""
                if "Manufacturer" in detail:
                    try:
                        prod_manufacturer = detail.split(":")[1].strip()
                    except IndexError:
                        # Line mentions Manufacturer but has no ":" separator.
                        prod_manufacturer = ""

            d['Name'].append(get_name(new_soup))
            d['URL'].append(new_url)
            d['Price'].append(get_price(new_soup))
            d['Rating'].append(get_rating(new_soup))
            d['Reviews'].append(get_review_count(new_soup))
            d['Description'].append(get_description(new_soup))
            d['ASIN'].append(prod_asin)
            d['Details'].append(details)
            d['Manufacturer'].append(prod_manufacturer)


    amazon_df = pd.DataFrame.from_dict(d)
    # Assign the result back: a chained `amazon_df['Name'].replace(..., inplace=True)`
    # works on an intermediate object and does not update the frame under
    # pandas Copy-on-Write, which would leave the empty names in place.
    amazon_df['Name'] = amazon_df['Name'].replace('', np.nan)
    # Rows without a product name are dropped before saving.
    amazon_df = amazon_df.dropna(subset=['Name'])
    amazon_df.to_csv("AmazonProductData.csv", header=True, index=False)

In [None]:
# Display the scraped product table.
amazon_df

In [None]:
# Preview the first rows of the scraped product table.
amazon_df.head()

In [None]:
# Show every column of the first row (positional index 0).
amazon_df.iloc[0]

In [None]:
# Show the Description value of the first row.
amazon_df.iloc[0].Description

In [None]:
# Show the Details value ("label: value" lines) of the first row.
amazon_df.iloc[0].Details

In [None]:
# Spot-check the Details value of the row at positional index 13;
# this requires the table to hold at least 14 rows.
amazon_df.iloc[13].Details