<a href="https://colab.research.google.com/github/sanghakim/project_emba/blob/main/amazon_scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
# Request headers passed to requests.get() for each review page so the
# request presents itself as a desktop browser instead of a script.
headers = {
    # NOTE(review): presumably copied from browser dev tools (mirrors the
    # HTTP/2 ':authority' pseudo-header) - confirm it is actually needed.
    'authority': 'www.amazon.com',
    # Content types the client claims to accept, HTML first.
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # Preferred response languages, US English first.
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    # Client-hint brand list; kept consistent with the Chrome 102 user agent below.
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    # User agent string of Chrome 102 on 64-bit Windows 10.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

In [None]:
# Load the product list and build one review-page URL per product
def getProduct(product_file):
  """Read the tab-separated product file and derive the review URLs.

  The first line of the file is treated as a header row and discarded;
  the two columns are always read as ``name`` and ``id``.

  Args:
    product_file: Path (or file-like object) accepted by ``pd.read_csv``.

  Returns:
    A tuple ``(p_names, p_ids, review_urls)`` of three lists in file order,
    where ``review_urls[i]`` is built from ``p_names[i]`` and ``p_ids[i]``.
  """
  catalog = pd.read_csv(product_file, sep='\t', header=0, names=['name', 'id'])
  p_names = catalog['name'].to_list()
  p_ids = catalog['id'].to_list()

  # One URL per (name, id) pair; the name is the URL slug, the id the product key
  url_template = 'https://www.amazon.com/{0}/product-reviews/{1}'
  review_urls = [
    url_template.format(slug, product_id)
    for slug, product_id in zip(p_names, p_ids)
  ]

  return p_names, p_ids, review_urls

In [None]:
# Extract data as HTML objects from the Amazon review pages
def reviewsHtml(url, len_page):
    """Download up to ``len_page`` critical-review pages for one product.

    Pages are requested in order starting at page 1. Fetching stops early
    when a request does not return HTTP 200 or when a page contains no
    review boxes, so short listings do not cost ``len_page`` requests.

    Args:
        url: Review-listing URL of the product.
        len_page: Maximum number of pages to request.

    Returns:
        A list of parsed BeautifulSoup documents, one per fetched page that
        contained at least one review box, in page order.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on a
            network failure or when the timeout expires.
    """
    # Parsed pages collected so far
    soups = []

    for page_no in range(1, len_page + 1):

        # Query string selecting the review filter and the page to fetch
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }

        # params must be sent with the request; without it every iteration
        # downloads the same default page. The timeout keeps one stalled
        # connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code != 200:
            print('break in loop: page {0} returned HTTP {1}'.format(page_no, response.status_code))
            break

        # Parse the page with BeautifulSoup4 and the lxml parser
        soup = BeautifulSoup(response.text, 'lxml')

        # BeautifulSoup never returns None, so "no more data" is detected by
        # the absence of review boxes on the page.
        if not soup.select('div[data-hook="review"]'):
            print('break in loop: no reviews on page {0}'.format(page_no))
            break

        soups.append(soup)

    return soups

In [None]:
# Pull reviewer name, stars, title, date and body text out of one parsed page
def getReviews(html_data):
    """Turn one parsed review page into a list of flat review records.

    Args:
        html_data: Parsed page exposing ``select``; every element matched by
            ``div[data-hook="review"]`` is treated as one review box.

    Returns:
        A list of dicts with the keys ``Name``, ``Stars``, ``Title``,
        ``Date`` (formatted dd/mm/yyyy) and ``Description``. Any field that
        cannot be extracted or converted is reported as ``'N/A'``.
    """

    def _leading_rating(star_text):
        # Keep only the part before ' out', e.g. '2.0 out of 5 stars' -> '2.0'
        return star_text.split(' out')[0]

    def _as_dd_mm_yyyy(date_line):
        # Keep what follows the last ' on ' and reformat it,
        # e.g. '... on June 5, 2023' -> '05/06/2023'
        raw_date = date_line.split(' on ')[-1]
        return datetime.strptime(raw_date, '%B %d, %Y').strftime("%d/%m/%Y")

    def _read(node, selector, convert=None):
        # Any failure (missing element, unexpected text) yields the placeholder
        try:
            text = node.select_one(selector).text.strip()
            return text if convert is None else convert(text)
        except Exception:
            return 'N/A'

    # One record per review box, in page order
    return [
        {
            'Name': _read(box, '[class="a-profile-name"]'),
            'Stars': _read(box, '[data-hook="review-star-rating"]', _leading_rating),
            'Title': _read(box, '[data-hook="review-title"]'),
            'Date': _read(box, '[data-hook="review-date"]', _as_dd_mm_yyyy),
            'Description': _read(box, '[data-hook="review-body"]'),
        }
        for box in html_data.select('div[data-hook="review"]')
    ]

In [None]:
# Tab-separated product list that seeds the crawl; each row becomes one review URL
product_file = 'product.csv'
p_names, p_ids, review_urls = getProduct(product_file)

# Report how many products (and therefore review URLs) were loaded
print('#product:', len(review_urls))

#product: 138


In [None]:
import random
import time

# Upper bound on the number of review pages requested per product
len_page = 50

# Running count of reviews collected across all products
total_reviews = 0

# Walk product names and review URLs together; idx numbers the output files.
# enumerate/zip replaces a hand-maintained counter that had to stay in step
# with the loop.
for idx, (p_name, reviews_url) in enumerate(zip(p_names, review_urls)):

  print('[review] {0}'.format(reviews_url))

  # Grab all HTML pages of this product
  html_datas = reviewsHtml(reviews_url, len_page)

  # Flatten the per-page review records into one list for this product
  reviews = []
  for html_data in html_datas:
    reviews.extend(getReviews(html_data))

  total_reviews += len(reviews)

  # Write this product's reviews to their own CSV file
  df_reviews = pd.DataFrame(reviews)
  review_file = '{0}_review_of_{1}.csv'.format(idx, p_name)
  df_reviews.to_csv(review_file, encoding='utf8')
  print('please check the fiel, {0}'.format(review_file))
  print('#acc. of reviews: {0}'.format(len(reviews)))

  # Random pause between products; announce it before sleeping so the log
  # shows why the crawl is idle instead of reporting the wait afterwards.
  pause = random.uniform(1, 6)
  print('get reviews of {0} \n after {1} seconds, we try again'.format(p_name, pause))
  time.sleep(pause)

print('Total number of reviews: {0}'.format(total_reviews))
print('Crawling reviews from amazon is completed !!!')

[review] https://www.amazon.com/MONFOOT-5-Pairs-Breathable-No-Show-Non-slip/product-reviews/B0BKRCZQHR
please check the fiel, 0_review_of_MONFOOT-5-Pairs-Breathable-No-Show-Non-slip.csv
#acc. of reviews: 0
get reviews of MONFOOT-5-Pairs-Breathable-No-Show-Non-slip 
 after 3.9107984521586245 seconds, we try again
[review] https://www.amazon.com/MONFOOT-Athletic-Cushioned-Mountain-Glitter/product-reviews/B0C9Q6FPF8
please check the fiel, 1_review_of_MONFOOT-Athletic-Cushioned-Mountain-Glitter.csv
#acc. of reviews: 8
get reviews of MONFOOT-Athletic-Cushioned-Mountain-Glitter 
 after 3.036039969518602 seconds, we try again
[review] https://www.amazon.com/MONFOOT-Athletic-Cushioned-Running-Performance/product-reviews/B08CK1QS7T
please check the fiel, 2_review_of_MONFOOT-Athletic-Cushioned-Running-Performance.csv
#acc. of reviews: 50
get reviews of MONFOOT-Athletic-Cushioned-Running-Performance 
 after 1.6484383050749427 seconds, we try again
[review] https://www.amazon.com/MONFOOT-Athletic-