# **Setup**

**Import Libraries**

In [1]:
import re
import time
import requests
import random
import pandas as pd
from collections import OrderedDict
from bs4 import BeautifulSoup

**User Configurable Options**

In [2]:
# Run scrape or retrieve scraped dataset
run = False
save = True
save_name = "RAM_2.csv"
start_url = "/workspace/Python-Scraper-main/RAM.csv"
dataset_url = ""

# How many times to run the scrape again on rows with missing info
run_again = True
run_again_count = 1

# Hardcoded for convenience
category = "RAM"

# Add Random delay (in ms) to improve scraping reliability
random_delay = True
r_min = 500
r_max = 2000

**Notifications via Ntfy**

_Download the ntfy app and subscribe to the topic name to get notifications_

In [3]:
# Push notification when scraping is complete?
notify = True
topic = "inf1101-python"

if notify:
    import ntfy

    def notify_me():
        requests.post(f"https://ntfy.sh/{topic}",
        data = "Completed Scrape!".encode(encoding='utf-8'))


**Multiple user agents to prevent HTTP 503**

**Get a list of all products**

# **Scraping Functions**

In [4]:
headers_list = [
    # Firefox 77 Mac
     {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    # Firefox 77 Windows
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    # Chrome 83 Mac
    {
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
    },
    # Chrome 83 Windows 
    {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9"
    }
]

**Scapes a given URL**

In [5]:
def scrape(URL):

    # Get a random user agaent
    headers = random.choice(headers_list)

    resp = requests.get(URL, headers=headers)

    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text,'html.parser')

        # Wait random 0.5s to 2s
        if random_delay:
            time.sleep((random.randrange(r_min, r_max)/1000))
        
        return soup
    else:
        print("Unable to retrieve")
        print(resp)

**Retrieves all product URLs from one page**

In [6]:
def geturls(soup):

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    # Store the links
    all_links = []

    # Loop for extracting links from Tag Objects
    for link in links:
        variable = link.get('href') # Returns half a URL eg: /GeForce-GTX-1660-Graphics-ZT-T16620F-10L/dp/B07Z8PWC6R.............
        variable = 'https://www.amazon.sg' + variable # Complete the URL
        all_links.append(variable)

    return all_links

**Checks for next page, and returns URL if available**

In [7]:
def checknextpage(soup):

    # From soup, retrieve the section with the page buttons
    pages = soup.find('span', {'class': 's-pagination-strip'})
    
    # For single page listing, the page buttons are remvoved, so we need to catch the error
    try: 
        # Within this section, check if there is a grey 'Next' button
        lastpage = pages.find('span', {'class': 's-pagination-item s-pagination-next s-pagination-disabled'})
        
    except AttributeError:
        return False, ""

    # The button does not exsist, there is a next page
    if not lastpage: 

        # retrieve the half URL inside 'href' for the corresponding tag from the 'next' button
        half_link = pages.find('a', {'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
        
        # Format the link nicely
        nextpage = 'https://www.amazon.sg' + half_link 

        return True, nextpage

    # The button exsists, this is the last page
    else:

        return False, ""

**Returns URL of reviews page**

In [8]:
def checkreviews(soup):

    try:

        # Check for button
        are_there_reviews = soup.find('span', {'data-hook':'top-customer-reviews-title'})

        # Retrieve the half URL inside 'href' for the corresponding tag from the 'see more reviews' button
        half_link = soup.find('a', {'data-hook': 'see-all-reviews-link-foot'})['href']
    
    except:
        return False, "No Reviews"

    # There is no such element, therefore there are reviews
    if not are_there_reviews:

        # Format the link nicely
        reviews_link = 'https://www.amazon.sg' + half_link 

        print(f"Review Link: {reviews_link}")
        return True, reviews_link
    
    # The element exsists so there are no reviews available
    else:

        print("Product does have any reviews")
        return False, "No Reviews"

**Returns all reviews from a given URL**

In [9]:
def getreviews(soup):

    reviews_list = []

    # Find all matching elements
    all_reviews = soup.find_all('span',{'data-hook':'review-body'})

    # Iterate over each review and append text to list
    for reviews in all_reviews:
        reviews_list.append(reviews.text.strip())

    print(f"Total Reviews: {len(reviews_list)}")

    return reviews_list


**Returns a list of all product URL from page 1 to end**

In [10]:
def getallurls(start_url):

    # Run the first cycle manually to get things going
    url = start_url
    soup = scrape(url)
    page_count = 1

    url_list = []

    while True:
        print(f"Starting URL scrape for page {page_count}")
        print(f"Current page is: {url}\n")

        # Add all URLs from one page to the master list
        for links in geturls(soup):
           url_list.append(links)

        # At the end of every page, check for exsistance of a next page button
        nextpage, next_url = checknextpage(soup)
        if nextpage:
            
            # Update the URL with the next page if true
            url = next_url

            # Wait
            if random_delay:
                time.sleep((random.randrange(r_min, r_max)/1000))

            # Scrape next page
            soup = scrape(url)

            page_count += 1

        else:
            return url_list

**Retrives product information**

In [11]:
def getproductinfo(soup):

    # Create output list
    ASIN = product = price = stars = ""
    mylist = [ASIN, product, price, stars]

    # Define each attribute its own function for use later

    # Checks a list of possible elements for text that macthes the ASIN format
    def _asin(soup):
        table = soup.find_all('td', {'class':'a-size-base prodDetAttrValue'})
        for i in table:
            # Search table for ASIN
            if re.search('([A-Z0-9]{10})', (i.text.strip())) and len(i.text.strip()) <= 11:
                return i.text.strip()
        return "No ASIN"
    
    # Retrives product name
    def _product(soup):
        return soup.find('span', {"id": 'productTitle'}).text.strip()

    # Retrives product price   
    def _price(soup):
        return soup.find('span', {'class' : 'a-offscreen'}).text.strip()

    # Retrives stars in numerical (0.0 to 5.0)     
    def _stars(soup):
        rating = soup.find('div', {'id':'averageCustomerReviews'})
        return rating.find('span',{'class':['a-size-base', 'a-color-base']}).text.strip()

    i = 0

    # Sometimes the element is missing, so we need to catch the errors here so the program keeps running
    # I think this method is more elegant than having 4x Try-Except statements
    for task in [_asin, _product, _price, _stars]:
        try:
            mylist[i] = task(soup)
            i += 1
        except AttributeError:
            if task == _asin:
                mylist[i] = "No ASIN"
            elif task == _stars :
                mylist[i] = "No Reviews"
            else:
                mylist[i] = "Error"

    return mylist

**Function to process and store scraped info inside a dataframe**

In [12]:
# Scape a given index from the dataframe
def scrape_product(i):

    # Retrive reviews URL and add to dataframe
    soup = scrape(data['URL'][i])

    attributes = getproductinfo(soup)

    # Store information
    data['ASIN'][i] = attributes[0]
    data['Product'][i] = attributes[1]
    data['Price'][i] = attributes[2]
    data['Stars'][i] = attributes[3]
    data['Category'][i] = category
    data['Reviews URL'][i] = checkreviews(soup)[1]

    # Retrieve ASIN from URL if unable to obtain from scraping
    if data['ASIN'][i] == "No ASIN":
        split = data['URL'][i].split("/")
        if re.search('([A-Z0-9]{10})', split[5]) and len(split[5]) <= 11:
            print(f"*ASIN retrived from URL*")
            data['ASIN'][i] = split[5]

    # Print informaton for debug:
    print(f"Product: {attributes[1]}")
    print(f"ASIN: {data['ASIN'][i]}")

    # Check if there are no reviews for the current product
    if data['Reviews URL'][i] == "No Reviews":
        data['Reviews'][i] = "No Reviews"
        print(f"No reviews for this product")

    else:
        # Retrieve review text and add to dataframe
        soup = scrape(data['Reviews URL'][i])
        data['Reviews'][i] = getreviews(soup)

**Function to retrieve ASIN from URL if unable to find in product table**

# **Main Code**

**Starts the URL scraping process with an empty dataframe, or load a CSV file**

In [13]:
if run:

    # Create an empty dataframe with named columns
    data = pd.DataFrame(columns=['ASIN', 'Product', 'Category', 'Price', 'URL', 'Reviews URL', 'Stars', 'Reviews'])

    # Retrives and stores all URLs for the next part
    data['URL'] = getallurls(start_url)
    print(f"Found {len(data)} Products")

else:

    # Load a dataset locally or from github
    data = pd.read_csv(dataset_url)

FileNotFoundError: [Errno 2] No such file or directory: ''

**Starts the product info and review scraping process**

In [None]:
if run:
    total_scrapes = 0
    
    for i in data.index:
        print(f"\n[URL {i+1}]")
        scrape_product(i)
        total_scrapes += 1

    print(f'\nScraped a total of {total_scrapes} products')
    
    if notify:
        notify_me()


[URL 1]
Review Link: https://www.amazon.sg/2022-Apple-iPhone-128-Generation/product-reviews/B09V4VDV1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
*ASIN retrived from URL*
Product: 2022 Apple iPhone SE (128 GB) - (PRODUCT) RED (3rd Generation)
ASIN: B09V4VDV1R
Total Reviews: 2

[URL 2]
Review Link: https://www.amazon.sg/Nothing-Phone-256-White-Smartphone/product-reviews/B0B76J71DJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Product: Nothing Phone (1) 8+256 White Smartphone
ASIN: B0B76J71DJ
Total Reviews: 10

[URL 3]
Review Link: https://www.amazon.sg/Samsung-Galaxy-S23-Ultra-256GB/product-reviews/B0BTMBLLBJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Product: Samsung Galaxy S23 5G Ultra 256GB - Light Pink
ASIN: B0BTMBLLBJ
Total Reviews: 10

[URL 4]
Review Link: https://www.amazon.sg/Google-Pixel-Unlocked-Smartphone-megapixel/product-reviews/B0BCQXXRJJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Product: Google Pixel 7

# **Verify Scraped Information**

**Save and sample dataset**

In [None]:
if save:
    data.to_csv(save_name, index=False)

# Preview scraped data
data.sample(20)

Unnamed: 0,ASIN,Product,Category,Price,URL,Reviews URL,Stars,Reviews
365,B07VXC97KS,Samsung Galaxy A10e 32GB A102U GSM/CDMA Unlock...,Smartphones,S$271.32,https://www.amazon.sg/Samsung-Galaxy-A102U-Unl...,https://www.amazon.sg/Samsung-Galaxy-A102U-Unl...,4.4,[This phone checks off all the boxes of a phon...
411,‎01076NARTL,"Moto Z Unlocked Smartphone, 5.5"" Quad HD scree...",Smartphones,S$267.77,https://www.amazon.sg/Unlocked-Smartphone-scre...,https://www.amazon.sg/Unlocked-Smartphone-scre...,3.1,[I wish I could give it a better review but th...
19,‎5011101810,OnePlus Nord 2 5G Dual-SIM 256GB ROM + 12GB RA...,Smartphones,S$679.15,https://www.amazon.sg/OnePlus-Dual-SIM-256GB-F...,https://www.amazon.sg/OnePlus-Dual-SIM-256GB-F...,4.5,[Téléphone reçu en parfait état littéralement ...
369,‎B07SCJNSRT,Samsung Galaxy A10 32GB A105G/DS LTE Unlocked ...,Smartphones,S$297.03,https://www.amazon.sg/Samsung-Galaxy-DS-Unlock...,https://www.amazon.sg/Samsung-Galaxy-DS-Unlock...,4.3,[This phone replaced my 5 year old ZTE Obsidia...
69,B07R9PTDTZ,Google - Pixel 3a with 64GB Memory Cell Phone ...,Smartphones,S$220.47,https://www.amazon.sg/Google-Pixel-Memory-Unlo...,https://www.amazon.sg/Google-Pixel-Memory-Unlo...,4.4,[Likes- it's light!- charges really fast; batt...
60,‎B09999QN1L,Samsung Galaxy S20 FE 5G UW 128GB for Verizon ...,Smartphones,S$238.84,https://www.amazon.sg/Samsung-Galaxy-128GB-Ver...,https://www.amazon.sg/Samsung-Galaxy-128GB-Ver...,3.9,[This phone was a refurbished unit. It was in ...
237,‎B09QXBSJ6S,"Xiaomi 11T PRO 5G + 4G Volte (128GB, 8GB) 6.67...",Smartphones,S$676.45,https://www.amazon.sg/Xiaomi-11T-Compatible-Un...,https://www.amazon.sg/Xiaomi-11T-Compatible-Un...,4.6,[I fell in love with Xiaomi phones a while bac...
283,‎B08ZDH9BD2,OSAT Inmarsat IsatPhone Prepaid SIM Card with ...,Smartphones,S$493.90,https://www.amazon.sg/OSAT-Inmarsat-IsatPhone-...,https://www.amazon.sg/OSAT-Inmarsat-IsatPhone-...,1.0,"[Lors de la demande d’activation, l’opérateur ..."
156,B0BBXX3RLK,"Simple Mobile Samsung Galaxy A23 5G, 64GB, Bla...",Smartphones,S$302.72,https://www.amazon.sg/Simple-Mobile-Samsung-Ga...,https://www.amazon.sg/Simple-Mobile-Samsung-Ga...,4.3,[Phone didn’t come with charger card or port. ...
329,‎B088KS8MNR,"HUAWEI P40 Lite 5G - Smartphone 128GB, 6GB RAM...",Smartphones,S$611.74,https://www.amazon.sg/HUAWEI-P40-Lite-5G-Smart...,https://www.amazon.sg/HUAWEI-P40-Lite-5G-Smart...,3.8,"[Bon tel., Para regalo y no dejarse mucha past..."


**Reattempt scrape for those links with missing info**
- *Notes: miscategorised books don't have ASIN, will always give error*
- *Can only obtain ASIN of apple products from URL*

In [None]:
if run_again:

    # Count number of retries needed
    try:
        retry = data['ASIN'].value_counts()['No ASIN']
        print(f"Retrying scrape on {retry} URLs")
    
    # Exit loop if no retry needed
    except KeyError:
        print(f"All products have ASIN, skipping")

    # Otherwise scrape again
    else:

        # Rerun product and review scraping for x cycles
        for count in range(run_again_count):
            
            # For debugging
            total_scrapes = 0

            for i in data.index:

                # Only run on rows with missing ASIN 
                if data['ASIN'][i] == "No ASIN":
                    print(f"\n[URL {i+1}] {total_scrapes} of {retry}")
                    scrape_product(i)
                    total_scrapes += 1

            print(f'\nScraped a total of {total_scrapes} products\n')
        
        if notify:
            notify_me()

All products have ASIN, skipping


**Save dataset again if scraping is reattempted**

In [None]:
if run_again and save :
    data.to_csv(save_name, index=False)

**Preview dataset**

In [None]:
data

Unnamed: 0,ASIN,Product,Category,Price,URL,Reviews URL,Stars,Reviews
0,B09V4VDV1R,2022 Apple iPhone SE (128 GB) - (PRODUCT) RED ...,Smartphones,S$779.00,https://www.amazon.sg/2022-Apple-iPhone-128-Ge...,https://www.amazon.sg/2022-Apple-iPhone-128-Ge...,4.9,"[Everything is as described, except that this ..."
1,B0B76J71DJ,Nothing Phone (1) 8+256 White Smartphone,Smartphones,S$650.00,https://www.amazon.sg/Nothing-Phone-256-White-...,https://www.amazon.sg/Nothing-Phone-256-White-...,4.6,[Estoy completamente enamorado del equipo func...
2,B0BTMBLLBJ,Samsung Galaxy S23 5G Ultra 256GB - Light Pink,Smartphones,"S$1,338.00",https://www.amazon.sg/Samsung-Galaxy-S23-Ultra...,https://www.amazon.sg/Samsung-Galaxy-S23-Ultra...,4.4,[So far no issue and working seamlessly. Batte...
3,‎B0BCQXXRJJ,Google Pixel 7 – Unlocked Android 5G Smartphon...,Smartphones,S$930.00,https://www.amazon.sg/Google-Pixel-Unlocked-Sm...,https://www.amazon.sg/Google-Pixel-Unlocked-Sm...,4.4,[My 1-month review: If you are like me and wat...
4,B0BDJ9J4C7,Apple iPhone 14 (128 GB) - Blue,Smartphones,"S$1,149.00",https://www.amazon.sg/Apple-iPhone-14-128-GB/d...,https://www.amazon.sg/Apple-iPhone-14-128-GB/p...,4.7,[Just another Iphone and hope it will last as ...
...,...,...,...,...,...,...,...,...
453,B00B5XEBT2,BLU Vivo 4.65 HD D930a Unlocked GSM Phone with...,Smartphones,S$468.60,https://www.amazon.sg/BLU-D930a-Touchscreen-Du...,https://www.amazon.sg/BLU-D930a-Touchscreen-Du...,3.7,[This is a very good product for the price. I ...
454,B009C7WH8K,BLU Vivo 4.3 D910a Unlocked GSM Dual-SIM Andro...,Smartphones,S$353.04,https://www.amazon.sg/BLU-D910a-Unlocked-Dual-...,https://www.amazon.sg/BLU-D910a-Unlocked-Dual-...,3.3,[After growing tired of waiting for Google to ...
455,B008OP859Q,"BLU Studio 5.3-Inch Display, Dual SIM, Android...",Smartphones,S$317.78,https://www.amazon.sg/BLU-5-3-Inch-Display-Gin...,https://www.amazon.sg/BLU-5-3-Inch-Display-Gin...,2.0,[the phone arrive early when i order it for my...
456,B0079NIPW6,Nokia Lumia 800 Unlocked Phone With - Purity H...,Smartphones,"S$1,370.20",https://www.amazon.sg/Nokia-Lumia-800-Unlocked...,https://www.amazon.sg/Nokia-Lumia-800-Unlocked...,3.0,[This was a nice gift for my self!I love it sp...


**Quickly check dataset for number of missing values**

In [None]:
rows = len(data)

try:
    asin_err = data['ASIN'].value_counts()['No ASIN']
except KeyError:
    asin_err = 0
asin_err += data['ASIN'].isna().sum()

try:
    price_err = data['Price'].value_counts()['Error']
except KeyError:
    price_err = 0
price_err += data['Price'].isna().sum()

try:
    reviews_err = data['Reviews URL'].value_counts()['No Reviews']
except KeyError:
    reviews_err = 0
reviews_err += data['Reviews URL'].isna().sum()

try:
    stars_err = data['Stars'].value_counts()['No Reviews']
except KeyError:
    stars_err = 0
stars_err += data['Stars'].isna().sum()

print(f"No ASIN: {asin_err}({data['ASIN'].isna().sum()}) of {rows}")
print(f"No Price: {price_err}({data['Price'].isna().sum()}) of {rows}")
print(f"No Reviews: {reviews_err}({data['Reviews'].isna().sum()}) of {rows}")
print(f"No Stars: {stars_err}({data['Stars'].isna().sum()}) of {rows}")

No ASIN: 0(0) of 458
No Price: 0(0) of 458
No Reviews: 21(0) of 458
No Stars: 0(0) of 458


In [None]:
data.to_pickle('laptops_pickle.pkl')