# Amazon Product Review Scraping

This project uses Python to scrape the customer reviews for any Amazon product link you provide.

In [2]:
%%capture requirements
import sys
import os
!{sys.executable} -m pip install bs4
!{sys.executable} -m pip install lxml
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install dateparser
import dateparser
import time
import requests
import lxml
from bs4 import BeautifulSoup
import pandas as pd

def get_page(link, timeout=30):
    """Download ``link`` and return its HTML parsed with BeautifulSoup.

    Args:
        link: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree of the response body, built with the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    # A desktop-browser User-Agent is sent with every request
    # (presumably the default python-requests agent is rejected -- confirm).
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.64 Safari/534.3'}
    page = requests.get(link, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page as if
    # it were the requested document.
    page.raise_for_status()
    return BeautifulSoup(page.text, 'lxml')

In [3]:
# Ask for the product URL, then keep only its "<scheme>://<host>" part: the
# first three "/"-separated pieces are the scheme, an empty string and the host.
product_link = input('Enter the product link: ')
relative_url_prefix = "/".join(product_link.split("/")[:3])
print(relative_url_prefix)

Enter the product link: https://www.amazon.in/Test-Exclusive-608/dp/B07HGBMJT6/ref=sr_1_1?keywords=oneplus+7&qid=1563085781&s=gateway&smid=A35FCS7U51TK3C&sr=8-1
https://www.amazon.in


In [4]:
# Download the product page entered above and parse it into a BeautifulSoup
# tree; the tree is searched afterwards for the link to the full review list.
product_page_soup = get_page(product_link)
# Debug aid: uncomment to inspect the parsed HTML.
#print(product_page_soup)

In [5]:
# Locate the link to the full review listing. ``find`` returns the first match
# or None, so a missing link can be reported clearly instead of surfacing as a
# bare IndexError.
see_all_reviews = product_page_soup.find("a", {"data-hook" : "see-all-reviews-link-foot"})
if see_all_reviews is None:
    raise RuntimeError(
        "Could not find the 'see all reviews' link on the product page; "
        "the page layout may have changed or the request may have been blocked."
    )
# The href is site-relative, so prepend the scheme and host of the product link.
all_reviews_url =  relative_url_prefix + see_all_reviews['href']
print(all_reviews_url)

https://www.amazon.in/Test-Exclusive-608/product-reviews/B07HGBMJT6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews


In [6]:
# Maximum number of review pages processed by the pagination loop, counting
# the first page fetched here.
review_page_limit = 5

# Each run writes to its own timestamped CSV file in the working directory.
timestr = time.strftime("%Y%m%d-%H%M%S")
csvoutput_file = os.path.join('.', f'review_{timestr}.csv')

# First page of the full review listing; later pages replace this variable.
all_reviews_page_soup = get_page(all_reviews_url)

def parse_review_date(review_date_text):
    """Convert the text of a review's date element into an ISO 8601 string.

    Only the last three whitespace-separated tokens are handed to
    ``dateparser`` (e.g. ``"... on 14 July 2019"`` -> ``"14 July 2019"``), so
    any leading words in the element are ignored.

    Args:
        review_date_text: Raw text of the review-date element.

    Returns:
        The parsed date formatted with ``datetime.isoformat()``.

    Raises:
        ValueError: If ``dateparser`` cannot interpret the reduced text.
    """
    reduced_date_text = " ".join(review_date_text.split()[-3:])
    parsed_date = dateparser.parse(reduced_date_text)
    # dateparser.parse returns None when it cannot parse; report the offending
    # text rather than failing with an AttributeError on None.
    if parsed_date is None:
        raise ValueError(f"Could not parse review date from {review_date_text!r}")
    return parsed_date.isoformat()

def parse_reviews(soup):
    """Collect every review in ``soup`` as a list of dictionaries.

    Each ``div`` with ``data-hook="review"`` yields a dict with the keys
    ``id``, ``name``, ``rating``, ``title``, ``date`` and ``text``. If any
    field of a review cannot be extracted, the exception is printed and that
    review is skipped; the remaining reviews are still processed.

    Args:
        soup: Parsed review page exposing ``find_all``.

    Returns:
        List of review dicts in page order (possibly empty).
    """
    review_nodes = soup.find_all("div", {"data-hook" : "review"})
    records = []
    for node in review_nodes:
        try:
            # Fields are read in the same order as the output keys so the first
            # missing element is the one whose error gets reported.
            review_id = node['id']
            reviewer = node.find("span", {"class" : "a-profile-name"}).text
            rating_widget = node.find(class_='a-section celwidget')
            # Keep only the first three characters of the icon's alt text.
            rating = rating_widget.find(class_='a-icon-alt').text[:3]
            title_link = node.find("a", {"data-hook" : "review-title"})
            title = title_link.find("span").text
            date_text = node.find("span", {"data-hook" : "review-date"}).text
            iso_date = parse_review_date(date_text)
            body_span = node.find("span", {"data-hook": "review-body"})
            body = body_span.find("span").text.strip()
        except Exception as e:
            print(e)
        else:
            records.append({
                'id': review_id,
                'name': reviewer,
                'rating': rating,
                'title': title,
                'date': iso_date,
                'text': body,
            })
    return records

def has_next_page(soup):
    """Return the absolute URL of the next review page, or ``None``.

    Despite its name this returns a URL string rather than a boolean; the
    caller relies on the result being falsy when there is nothing more to
    fetch.

    Args:
        soup: Parsed review page exposing ``find``.

    Returns:
        ``relative_url_prefix`` joined with the href of the anchor inside the
        ``li`` element of class ``a-last``, or ``None`` when that element, its
        anchor, or the anchor's href is missing.
    """
    last_item = soup.find("li", { "class" : "a-last"})
    if last_item is None:
        return None
    next_anchor = last_item.find("a")
    if next_anchor is None:
        # The pagination item exists but carries no link.
        return None
    href = next_anchor.get("href")
    if not href:
        return None
    return relative_url_prefix + href

def append_to_csv(list):
    """Append the given review records to the run's CSV output file.

    The records are turned into a DataFrame and written to ``csvoutput_file``
    in append mode, without an index column and without a header row.

    Args:
        list: Records to write (the parameter name shadows the builtin
            ``list`` inside this function).
    """
    rows = pd.DataFrame(list)
    rows.to_csv(csvoutput_file, mode='a', index=False, header=False)

# Scrape the first review page, then follow "next page" links until either
# review_page_limit pages have been processed or no further link is found.
# Any exception ends the run and is printed; rows already written stay in the CSV.
try:
    page_counter = 0
    parsed_page = parse_reviews(all_reviews_page_soup)
    append_to_csv(parsed_page)

    # page_counter is the number of pages processed once the current loop
    # iteration completes; the first page has just been handled.
    page_counter = 1
    while page_counter < review_page_limit:
        next_page_link = has_next_page(all_reviews_page_soup)
        if not next_page_link:
            break

        all_reviews_page_soup = get_page(next_page_link)
        parsed_page = parse_reviews(all_reviews_page_soup)
        append_to_csv(parsed_page)
        page_counter += 1

except Exception as e:
    print(e)


Saving the reviews DataFrame to a CSV file.