## Importing Libraries

In [43]:
import time
from dotenv import load_dotenv
import os
import selectorlib
import requests
from dateutil import parser as dateparser

## Scraping

Since the user agent is private information that allows the user to access a website, I decided to store it in an environment variable in order to hide it from the public.

In [44]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Both settings come from the environment; either is None when not defined.
USER_AGENT, YML_PATH = (
    os.environ.get(setting) for setting in ("USER_AGENT", "YML_PATH")
)

Here we have the Scraper class.
In the init function I declared 3 variables:
- **Session**: a `requests` session object that allows the user to communicate with the website;
- **Asin**: the unique code that represents an Amazon product inside the marketplace. It is unique only within a specific country, so a product has one asin on amazon.com, while the same product on Amazon UK must be accessed with another asin;
- **Url**: the url that links directly to the reviews. I already formatted the string in order to access all the pages by only changing a parameter at the end of the string.

Then we have the **check_page** function. Its role is to assess whether there are reviews on the page I want to scrape. I pass only **i**, which represents the number of the page, and using the CSS selectors I check for the presence of reviews. So the function returns the reviews if it finds them, otherwise it returns False.

The last function, **scrape**, does the actual scraping. If there are reviews on the specific page, I iterate over the list and, again through the CSS selectors, extract the various parts of each review. In the end I create a dictionary whose keys are the parts of the review and whose values are lists with all the corresponding parts.

In [49]:
class Scraper:
    """Fetch and normalise Amazon.com review pages for a single product.

    Attributes:
        session: ``requests`` session reused for every page request.
        asin: Product identifier inserted into the review URL.
        url: URL of the first review page (``pageNumber=1``).
    """

    # Review-listing URL template; ``{page}`` is the page number to request.
    _REVIEWS_URL = (
        "https://www.amazon.com/product-reviews/{asin}/ref=cm_cr_dp_d_show_all_btm"
        "?ie=UTF8&reviewerType=all_reviews&sortBy=recent&pageNumber={page}"
    )

    # Fixed browser-like headers sent with every request. The USER_AGENT
    # environment value loaded elsewhere in the notebook is not used here.
    _HEADERS = {
        'authority': 'www.amazon.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    def __init__(self, asin) -> None:
        """Create a scraper for the product identified by ``asin``."""
        self.session = requests.session()
        self.asin = asin
        self.url = self._page_url(1)

    def _page_url(self, page):
        """Return the review-listing URL for page number ``page``."""
        return self._REVIEWS_URL.format(asin=self.asin, page=page)

    @staticmethod
    def _parse_count(text):
        """Return the leading integer of ``text`` (e.g. ``'1,234 customer reviews'``).

        Thousands separators are removed. Returns None when ``text`` is None
        or blank; raises ValueError when the first token is not a number.
        """
        if text is None:
            return None
        tokens = text.split()
        if not tokens:
            return None
        return int(tokens[0].replace(',', ''))

    def check_page(self, i):
        """Download review page ``i`` and run the selectorlib extractor on it.

        Args:
            i: Page number to request.

        Returns:
            The dict produced by the extractor when the response status is
            200; None when the status code is greater than 500 (a message is
            printed, as the page is presumed blocked); False for any other
            status code.
        """
        # Build the URL from the template so the page number replaces the
        # ``pageNumber`` value instead of being appended to it
        # (appending to ``self.url`` would turn page 1 into ``pageNumber=11``).
        page_url = self._page_url(i)
        r = self.session.get(page_url, headers=dict(self._HEADERS))

        # Simple check to see if the page was blocked (usually 503).
        # Note: a plain 500 is not covered by this branch and yields False.
        if r.status_code > 500:
            if "To discuss automated access to Amazon data please contact" in r.text:
                print("Page %s was blocked by Amazon. Please try using better proxies\n" % page_url)
            else:
                print("Page %s must have been blocked by Amazon as the status code was %d" % (page_url, r.status_code))
            return None

        if r.status_code == 200:
            # YML_PATH is a module-level setting holding the selector file path.
            extractor = selectorlib.Extractor.from_yaml_file(str(YML_PATH))
            return extractor.extract(r.text, base_url=page_url)

        return False

    def scrape(self, data):
        """Normalise an extraction result in place and return it.

        Fields that the extractor could not find (value None) are tolerated:
        missing reviews/histogram become empty containers and missing scalar
        fields stay None.

        Args:
            data: Dict with the keys ``product_title``, ``reviews``,
                ``histogram``, ``average_rating`` and ``number_of_reviews``.

        Returns:
            The same ``data`` dict, with:
              * ``reviews``: list of review dicts, each tagged with
                ``product`` and ``url`` (always ``self.url``, i.e. the first
                review page), ``verified_purchase`` converted to bool,
                ``rating`` cut before ``' out of'``, ``images`` joined with
                newlines and ``date`` reformatted as ``'%d %b %Y'``;
              * ``histogram``: dict mapping each entry's ``key`` to its ``value``;
              * ``average_rating``: float, or None;
              * ``number_of_reviews``: int, or None.

        Raises:
            ValueError: if a present rating/count/date cannot be parsed.
        """
        product = data.get("product_title")
        reviews = []
        for review in data.get('reviews') or []:
            review["product"] = product
            review['url'] = self.url
            if 'verified_purchase' in review:
                # A None value (selector found nothing) counts as not verified.
                review['verified_purchase'] = 'Verified Purchase' in (review['verified_purchase'] or '')
            rating = review.get('rating')
            if rating is not None:
                review['rating'] = rating.split(' out of')[0]
            if review.get('images'):
                review['images'] = "\n".join(review['images'])
            posted = review.get('date')
            if posted:
                # Keep only the text after the last 'on ' before parsing the date.
                date_posted = posted.split('on ')[-1]
                review['date'] = dateparser.parse(date_posted).strftime('%d %b %Y')
            reviews.append(review)

        data['histogram'] = {h['key']: h['value'] for h in data.get('histogram') or []}

        average = data.get('average_rating')
        data['average_rating'] = float(average.split(' out')[0]) if average is not None else None

        data['reviews'] = reviews
        data['number_of_reviews'] = self._parse_count(data.get('number_of_reviews'))
        return data
        

## Main

In [50]:
# Scraper for one amazon.com product, identified by its ASIN.
scraper = Scraper(asin='B08D6VD9TR')

# Container for collected results; nothing is added to it in this cell.
results = list()

# Request review page 1 and show what the extractor returned.
data = scraper.check_page(i=1)
print(data)


{'product_title': None, 'reviews': None, 'next_page': None, 'average_rating': None, 'number_of_reviews': None, 'histogram': None}
