**Basic Imports**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re
from dateutil.parser import parse

**Website and useragents**

In [None]:
headers = {

    "authority": "www.amazon.com",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-language": "en-US;q=0.9,bn;q=0.8",
    "sec-ch-ua": '" Not A;Brand";v="99","Chromium";v="102", "Google Chrome";v="102"'
}

Function to get HTML Data

In [28]:
def reviewsHtml(url, len_page):
    soups = []
    check=0
    for page_no in range(1, len_page+1):
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no
        }
        response = requests.get(url, headers=headers, params=params)
        soup = BeautifulSoup(response.text, 'lxml')

        # Check if the page is empty by examining the HTML content
        if not soup.select('div[data-hook="review"]'):
            # If no review elements are found, the page is likely empty
            print(f"Page {page_no} is empty. Stopping the loop.")
            check=1
            break

        soups.append(soup)

    return (soups,check)


Function to get product name

In [None]:
def get_prod_name(html_data):
  try:

    # Find the anchor tag with 'class="a-link-normal"'
    product_link_element = html_data.select_one('a[class="a-link-normal"]')

    # If found, extract the text content of the anchor tag (assuming product name is within)
    if product_link_element:
        prod_name = product_link_element.text.strip()
    else:
        prod_name = 'N/A'  # Handle cases where product link element is not found
    # print(prod_name)
  except Exception as e:
    print(f"Error extracting product name: {e}")

  return(prod_name)

Function to parse the reviews from HTML data

In [None]:
def getReviews(html_data, prod_name):

    data_dicts=[]
    boxes=html_data.select('div[data-hook="review"]')

    for box in boxes:

      try:
          name=box.select_one('[class="a-profile-name"]').text.strip()
      except Exception as e:
          name='N/A'

      try:
          stars=box.select_one('[data-hook="review-star-rating"]').text.strip().split('out')[0]
      except Exception as e:
          stars='N/A'

      try:
          title=box.select_one('[data-hook="review-title"]').text.strip()
      except Exception as e:
          title='N/A'

      try:
          datetime_str = box.select_one('[data-hook="review-date"]').text.strip()

          match = re.search(r"on ([\w ,]+)", datetime_str)
          # print('Unparsed ',datetime_str,'\n','Match ',match)
          try:
              date = parse(match[1]).strftime("%d/%m/%Y")
          except Exception as e:
              date = 'N/A'
          # print('Parsed ',date,'\n')
      except Exception as e:
          date = 'N/A'

      try:
          description=box.select_one('[data-hook="review-body"]').text.strip()
      except Exception as e:
          description='N/A'

      data_dict={'Name':name,
                  'Product Name':prod_name,
                'Stars':stars,
                'Title':title,
                'Date':date,
                'Description':description}
      data_dicts.append(data_dict)
    return data_dicts

List of URLs to be scrapped:- give the reviews section url

In [30]:
shoe = "https://www.amazon.in/Nike-Revolution-Mountain-Blue-T-Blue-908999-403/product-reviews/B078NKQVZZ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
iphn = "https://www.amazon.in/Apple-iPhone-15-128-GB/product-reviews/B0CHX1W1XY/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
head = "https://www.amazon.in/boAt-BassHeads-100-Headphones-Black/product-reviews/B071Z8M4KX/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
bag = "https://www.amazon.in/Bosca-Leather-Black-Utility-Kit/product-reviews/B003X4XVEC/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
url_lst = [shoe,iphn,head]

**Scrapping for specified number of pages**

In [None]:
start = datetime.datetime.now()

len_page = 40  # Define the number of pages you want to scrape
j=0
for i in url_lst:

  # prod_name =

  reviews = []
  html_datas = reviewsHtml(i, len_page)


  for html_data in html_datas:
      prod_name = get_prod_name(html_data)
      # print(len(html_data))
      review = getReviews( html_data, prod_name)  # Assuming you have a function getReviews() to extract reviews from HTML
      reviews += review
      if j==0:
        df_reviews = pd.DataFrame(reviews)
      else:
        df_reviews = pd.concat([df_reviews, pd.DataFrame(reviews)], ignore_index=True)
  j+=1

print("Time taken is ", datetime.datetime.now()-start)
print("Length of Dataframe is ", len(df_reviews))
# print(df_reviews.head(5))

**Scrapping for all the avilable pages**

In [29]:
start = datetime.datetime.now()
# creating element to check if the page is empty or not
flag = 0
for i, url in enumerate(url_lst):  # Use enumerate to keep track of the index
    reviews = []
    page_num = 1  # Start with page 1

    while True:
        html_datas, flag = reviewsHtml(url, page_num)  # Fetch HTML data for the current page
        if flag == 1:  # Stop if no more pages are available
            break

        for html_data in html_datas:
            prod_name = get_prod_name(html_data)
            review = getReviews(html_data, prod_name)  # Assuming you have a function getReviews() to extract reviews from HTML
            reviews += review

        page_num += 1  # Move to the next page

    if i == 0:
        df_reviews = pd.DataFrame(reviews)
    else:
        df_reviews = pd.concat([df_reviews, pd.DataFrame(reviews)], ignore_index=True)

print("Time taken is ", datetime.datetime.now()-start)
print("Length of Dataframe is ", len(df_reviews))


Page 5 is empty. Stopping the loop.
Time taken is  0:00:12.051148
Length of Dataframe is  93


In [None]:
# 20 sec for 1000 records from 3 different links

# Extrass

In [None]:
print(prod_name)

boAt BassHeads 100 in-Ear Wired Headphones with Mic (Black)


In [None]:
df_reviews.tail()

Unnamed: 0,Name,Product Name,Stars,Title,Date,Description
1077,Shanmugaraj,boAt BassHeads 100 in-Ear Wired Headphones wit...,2.0,2.0 out of 5 stars\nBuds are not fixed well,28/01/2024,"After using this headset, the buds are stuck i..."
1078,Yashik,boAt BassHeads 100 in-Ear Wired Headphones wit...,2.0,2.0 out of 5 stars\nWorst music quality. Too m...,17/11/2023,Here is an honest review. Too much hype in rat...
1079,Balaji M,boAt BassHeads 100 in-Ear Wired Headphones wit...,2.0,2.0 out of 5 stars\nMisfit,16/02/2024,Very poor quality. The jack seems to be a misf...
1080,Aravinda S.,boAt BassHeads 100 in-Ear Wired Headphones wit...,3.0,3.0 out of 5 stars\nBoat wired Earphones,16/11/2023,The media could not be loaded.\n ...
1081,Suhail,boAt BassHeads 100 in-Ear Wired Headphones wit...,3.0,3.0 out of 5 stars\nDurability,02/05/2023,"Bought in September 2021 , got a replacement i..."


In [None]:
df_reviews['Product Name'].value_counts()

Product Name
boAt BassHeads 100 in-Ear Wired Headphones with Mic (Black)             550
Apple iPhone 15 (128 GB) - Black                                        499
Nike Women's WMNS Revolution 4 Obsdn/Mountain Blue-T.Blue Running...     33
Name: count, dtype: int64

In [None]:
df_reviews.isna().sum()

Name            0
Product Name    0
Stars           0
Title           0
Date            0
Description     0
dtype: int64

In [None]:
df_reviews['Product Name'].isna().sum()
# df_reviews[df_reviews['Product Name']!='N/A']
# df_reviews['Product Name'].value_counts()

0

In [None]:
len(df_reviews)

1082

In [None]:
print(df_reviews['Description'][1],"\n\n",df_reviews["Description"][11],"\n\n",df_reviews["Description"][518],"\n\n",df_reviews["Description"][827])

Have to return it because of quality issues also they deliverd the wrong size than the size orderd.later i bought it from flipkart at 1000rs less than here. 

 I am usually a size 8. 8 was too small 8.5 too large. I am still giving 3 stars because they felt super bouncy and comfortable just wish they fit better. Returned 

 Eak no ka bekar phone hai. PTA nhi Maine Kyu buy kiya, heat krta hai, camera quality to Eakdm bekar hai 

 Mic quality is not that good for making phone calls. Sound quality is ok at that price. And durability is also good.


In [None]:
df_reviews[df_reviews['Date']!='N/A']

Unnamed: 0,Name,Stars,Title,Date,Description,Product Name
20,Ashish R.,1.0,1.0 out of 5 stars\nHeating,23/03/2024,Without use kiye hi phone heat hota rhta h Bat...,
21,sai chand,1.0,1.0 out of 5 stars\nHeating Issue,15/03/2024,Facing heating issues. Camera quality is not i...,
22,Balwant Rathore,1.0,1.0 out of 5 stars\nआइफ़ोन १५,19/01/2024,"camera quality very poor, performance इस लो",
23,Naveen,1.0,1.0 out of 5 stars\nBad,15/10/2023,Worst iphone I ever seen. I hate this phone . ...,
24,"Strongly recommend don’t buy this, intolerabl...",1.0,1.0 out of 5 stars\nTotally waste,23/11/2023,Heating issues even after 17.1.1 update,
...,...,...,...,...,...,...
232,Placeholder,1.0,1.0 out of 5 stars\nPro quality,04/12/2021,Quality not good bro quality,Verified Purchase
233,Rahul,1.0,1.0 out of 5 stars\nExpired item,06/05/2021,Delivered about to expire item.,Verified Purchase
234,JAGADISH BARMAN,1.0,1.0 out of 5 stars\nVery bad,28/08/2019,Faltu no 1..,Verified Purchase
235,Puneet Gothwal,1.0,1.0 out of 5 stars\nLow quality and item shown...,15/08/2023,Fake Amazon sale. Lack of customer support.,Verified Purchase


In [None]:
df_reviews.to_csv("Diff_Prod_Reviews.csv",index=False)

In [None]:
df_reviews.shape

(1082, 6)