## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from fake_useragent import UserAgent  # To spoof user-agent when doing requests
import urllib.request # To download images
import os  # To download images to /data/data directory and to 
           # delete images once they are not being used
# To display images. Not currently used
# from PIL import Image
# from io import BytesIO
import pickle

# Function Definitions

## Functions to extract info from input URL

In [2]:
def get_search_keyword(url):
    """Return the search keywords embedded in a product URL.

    The result is the text between the first ``keywords=`` marker and the
    next ``&`` (or the end of the URL). It is returned exactly as it appears
    in the URL, e.g. ``'arduino+ultrasonic+sensor'``.

    Raises:
        IndexError: if ``url`` contains no ``keywords=`` marker.
    """
    # Everything that follows the first 'keywords=' marker
    after_marker = url.split('keywords=')[1]

    # Cut the value off at the next query-string separator, if there is one
    keyword, _, _ = after_marker.partition('&')
    return keyword

In [3]:
def get_product_name(url):
    """Request a product page and return the product title shown on it.

    Args:
        url: URL of the product page to request.

    Returns:
        The text of the ``<span id="productTitle">`` element with
        surrounding whitespace removed.

    Raises:
        ValueError: if the response contains no ``productTitle`` element.
        requests.exceptions.RequestException: if the request itself fails
            or does not complete within the timeout.
    """
    # What Amazon sees when we do the request
    headers = {'User-Agent': UserAgent().random,  # Use random user agent
               "Accept-Encoding":"gzip, deflate", 
               "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
               "DNT":"1","Connection":"close", 
               "Upgrade-Insecure-Requests":"1"}

    # Without a timeout a stalled connection would block the notebook forever
    r = requests.get(url, headers=headers, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed
    soup = BeautifulSoup(r.content, 'html.parser')

    title = soup.find('span', attrs={'id': 'productTitle'})
    if title is None:
        # Fail with a readable message instead of an AttributeError on None
        raise ValueError(
            'No productTitle element found at {} (HTTP status {})'.format(url, r.status_code)
        )

    # Return the product name, removed of whitespace
    return title.text.strip()

## Functions to format text

In [4]:
def get_text(series):
    """Return the ``.text`` of a parsed HTML element, or NaN if it has none.

    Despite the parameter name, this is applied to one cell value at a time
    (see the ``.apply(get_text)`` calls). Values without a ``text``
    attribute, such as ``None`` or ``np.nan``, yield ``np.nan``.

    Only the missing-attribute case is converted to NaN; any other error
    raised while reading ``.text`` propagates instead of being hidden.
    """
    return getattr(series, 'text', np.nan)

In [5]:
def clean_rating(series):
    """Convert a rating string such as ``'4.5 out of 5 stars'`` to a float.

    The rating is the first whitespace-separated token of the text, so
    leading and trailing newlines are ignored. Values that are not strings
    (e.g. ``np.nan`` for products without a rating), empty strings and text
    whose first token is not a number all yield ``np.nan``.

    Note: the suffix must not be removed with ``str.rstrip``, because
    ``rstrip`` treats its argument as a set of characters. That set contains
    ``'5'``, so ``'4.5 out of 5 stars'`` would be cut down to ``'4.'``.
    """
    try:
        return float(series.split()[0])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: blank; ValueError: not numeric
        return np.nan

In [6]:
def clean_price(series):
    """Convert a price string such as ``'$1,299.99'`` to a float.

    Surrounding whitespace, the leading dollar sign and thousands separators
    are removed before conversion. Values that are not strings (e.g.
    ``np.nan`` for products without a price) and text that is not a number
    yield ``np.nan``.
    """
    try:
        # float() rejects ',' so thousands separators must be dropped first
        return float(series.strip().lstrip('$').replace(',', ''))
    except (AttributeError, ValueError):
        # AttributeError: not a string; ValueError: not numeric
        return np.nan

## Function to create a DF  (for a single search result page)

In [7]:
def _parse_list_view_product(product, page_num):
    """Collect the raw fields of one product from a list-view results page."""
    # Container holding the product's text details
    details = product.find('div', attrs={'class': 'sg-col-4-of-12 sg-col-8-of-16 sg-col-16-of-24 sg-col-12-of-20 sg-col-24-of-32 sg-col sg-col-28-of-36 sg-col-20-of-28'})

    # name/price/rating stay as elements (or None when absent); they are
    # converted to text and numbers once the DataFrame has been built
    return {
        'url': 'https://amazon.com' + details.find('a', attrs={'class': 'a-link-normal a-text-normal'})['href'],
        'name': details.find('span', attrs={'class': 'a-size-medium a-color-base a-text-normal'}),
        'page': page_num,
        'price': details.find('span', attrs={'class': 'a-offscreen'}),
        'rating': details.find('span', attrs={'class': 'a-icon-alt'}),
        'image_url': product.find('div', attrs = {'class': 'a-section aok-relative s-image-fixed-height'}).find('img')['src'],
    }


def _parse_grid_view_product(product, page_num):
    """Collect the raw fields of one product from a grid-view results page."""
    # The last 'sg-row' div of the tile holds the product details
    details = product.find_all('div', attrs={'class': 'sg-row'})[-1]

    # Some products do not have ratings: the rating row is then missing
    rating_row = details.find('div', attrs={'class': 'a-row a-size-small'})
    rating = rating_row.find('span') if rating_row is not None else None

    return {
        'url': 'https://amazon.com' + details.find('a', attrs={'class': 'a-link-normal a-text-normal'})['href'],
        'name': details.find('span', attrs={'class': 'a-size-base-plus a-color-base a-text-normal'}),
        'page': page_num,
        'price': details.find('span', attrs={'class': 'a-offscreen'}),  # None if no price
        'rating': rating,
        'image_url': details.find('div', attrs = {'class': 'a-section aok-relative s-image-square-aspect'}).find('img')['src'],
    }


def get_product_details(page_num, search_keyword):
    """Scrape one Amazon search-result page into a DataFrame.

    Args:
        page_num: number of the result page to request.
        search_keyword: search terms; spaces are replaced with ``+``.

    Returns:
        A DataFrame with one row per product found and the columns
        ``url``, ``name``, ``page``, ``price``, ``rating`` and ``image_url``.
        ``price`` and ``rating`` are floats, NaN when the product has none.
        If no product is found on the page, an empty DataFrame with these
        same columns is returned.

    Side effects:
        Prints the requested URL and performs one HTTP GET request.
    """
    # What Amazon sees when we do the request
    headers = {'User-Agent': UserAgent().random,  # Use random user agent
               "Accept-Encoding":"gzip, deflate", 
               "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 
               "DNT":"1","Connection":"close", 
               "Upgrade-Insecure-Requests":"1"}

    # Replace whitespace with + so that it can be used for searching
    search_keyword = search_keyword.replace(' ', '+')

    # Create URL
    url = "https://www.amazon.com/s?k={}&page={}".format(str(search_keyword), str(page_num))
    print(url)

    # Without a timeout a stalled connection would block the whole scrape
    r = requests.get(url, headers=headers, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed
    soup = BeautifulSoup(r.content, 'html.parser')

    # Check if the search result is list or grid view
    list_view_check = soup.find('div', attrs={'class': 'sg-col-20-of-24 s-result-item sg-col-0-of-12 sg-col-28-of-32 sg-col-16-of-20 sg-col sg-col-32-of-36 sg-col-12-of-16 sg-col-24-of-28'})

    if list_view_check is not None:  # Search results load in list view format
        parse_product = _parse_list_view_product
        products_on_page = soup.find_all('div', attrs={'class': 's-include-content-margin s-border-bottom'})  # Contains all products on page
    else:  # Search results load in grid view format
        parse_product = _parse_grid_view_product
        products_on_page = soup.find_all('div', attrs={'class': 's-expand-height s-include-content-margin s-border-bottom'})  # Contains all products on page

    # List of dicts, one per product on the page
    data = [parse_product(product, page_num) for product in products_on_page]

    if not data:
        # A frame built from an empty list has no columns, so the column
        # clean-up below would fail; return an empty, correctly typed frame
        empty = pd.DataFrame(columns=['url', 'name', 'page', 'price', 'rating', 'image_url'])
        return empty.astype({'page': 'int64', 'price': 'float64', 'rating': 'float64'})

    # DF containing all our data
    df = pd.DataFrame(data)

    # Given HTML elements, update them to formatted text
    df['name'] = df['name'].apply(get_text)

    # Remove $ signs, convert to float
    df['price'] = df['price'].apply(get_text).apply(clean_price)

    # Remove 'out of 5 stars' and whitespace, convert to float
    df['rating'] = df['rating'].apply(get_text).apply(clean_rating)

    return df

## Function to create DataFrame (all pages)

In [8]:
def create_product_df(search_keyword, num_pages=20):
    """Return a DataFrame of all products on the first pages of a search.

    Args:
        search_keyword: search terms passed on to ``get_product_details``.
        num_pages: number of result pages to scrape, starting at page 1.
            Defaults to 20 (Amazon only lets you go thru 20 pages).

    Returns:
        The per-page DataFrames stacked in page order with a fresh
        0..n-1 index. An empty DataFrame if ``num_pages`` is less than 1.
    """
    # Collect the pages first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call
    page_frames = [
        get_product_details(page, search_keyword)
        for page in range(1, num_pages + 1)
    ]

    if not page_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    return pd.concat(page_frames, ignore_index=True)

## Functions to download/delete images

In [9]:
def get_product_images(df, image_dir=None):
    """Download the image of every product in ``df``.

    Each image is saved as ``<index>.jpg``, where ``<index>`` is the row's
    index label left-padded with zeros to the number of digits in
    ``len(df)`` (e.g. 277 rows -> ``000.jpg`` ... ``276.jpg``). The padding
    keeps alphanumeric sorting (as done by Keras) in numeric order instead
    of 1, 10, 100, ...

    Args:
        df: DataFrame with an ``image_url`` column.
        image_dir: directory to save the images in. Defaults to
            ``data/data`` below the current working directory. The
            directory is created if it does not exist.
    """
    if image_dir is None:
        image_dir = os.path.join(os.getcwd(), 'data', 'data')

    # urlretrieve cannot write into a directory that does not exist
    os.makedirs(image_dir, exist_ok=True)

    # Get the digits place so that we can do left-zero padding
    # e.g. 123 will have digits_places=3
    digits_places = len(str(len(df)))

    # Iterate thru all the image URLs (Series.iteritems was removed in pandas 2.0)
    for index, url in df['image_url'].items():
        # zfill pads the left of the string, such that the total number of
        # digits is equal to digits_places
        filename = str(index).zfill(digits_places) + '.jpg'

        # Download image
        urllib.request.urlretrieve(url, os.path.join(image_dir, filename))

In [10]:
def delete_product_images(df, image_dir=None):
    """Delete the image files that belong to the products in ``df``.

    File names are built like the downloaded ones: ``<index>.jpg``, where
    ``<index>`` is the row's index label left-padded with zeros to the
    number of digits in ``len(df)``. Files that are already gone are
    skipped, so the clean-up can be re-run safely.

    Args:
        df: DataFrame whose index labels identify the images.
        image_dir: directory containing the images. Defaults to
            ``data/data`` below the current working directory.
    """
    if image_dir is None:
        image_dir = os.path.join(os.getcwd(), 'data', 'data')

    # Get the digits place so that we can do left-zero padding
    # e.g. 123 will have digits_places=3
    digits_places = len(str(len(df)))

    # Use the index labels (not positions) so the names match the files
    # that were written from the same labels when downloading
    for index in df.index:
        # zfill pads the left of the string, such that the total number of
        # digits is equal to digits_places
        filename = str(index).zfill(digits_places) + '.jpg'

        # Delete image
        try:
            os.remove(os.path.join(image_dir, filename))
        except FileNotFoundError:
            # Already deleted or never downloaded; nothing left to clean up
            continue

# Download Data

In [11]:
# Enter the URL of your preferred product.
# The URL must contain a `keywords=` query parameter, because the search
# keywords are parsed out of it below.
url = 'https://www.amazon.com/SainSmart-HC-SR04-Ranging-Detector-Distance/dp/B004U8TOE6/ref=sr_1_5?keywords=arduino+ultrasonic+sensor&qid=1577313071&sr=8-5'

# Extract search keywords from URL (kept exactly as they appear in the URL)
search_keyword = get_search_keyword(url)

# Request the product page and read the product name from it
# (this performs an HTTP request; it is not parsed from the URL text)
input_product = get_product_name(url)

In [12]:
# Show the search keywords that were parsed from the URL
search_keyword

'arduino+ultrasonic+sensor'

In [13]:
# Show the product name that was read from the product page
input_product

'SainSmart HC-SR04 Ranging Detector Mod Distance Sensor (Blue)'

In [14]:
# Create DF containing all the products from first 20 pages of
# Amazon search results.
# One HTTP request is made per page; each requested URL is printed.
df = create_product_df(search_keyword)

https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=1
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=2
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=3
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=4
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=5
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=6
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=7
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=8
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=9
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=10
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=11
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=12
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=13
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=14
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=15
https://www.amazon.com/s?k=arduino+ultrasonic+sensor&page=16
https://www.amazon.com/s?k=arduin

In [15]:
# Number of scraped listings (duplicates have not been removed yet)
len(df)

426

In [16]:
# Preview the first rows of the scraped listings
df.head()

Unnamed: 0,image_url,name,page,price,rating,url
0,https://m.media-amazon.com/images/I/81pEIMrGSi...,Smraza 5pcs Ultrasonic Module HC-SR04 Distance...,1,9.59,4.6,https://amazon.com/gp/slredirect/picassoRedire...
1,https://m.media-amazon.com/images/I/61AMuR0djY...,ELEGOO 5PCS HC-SR04 Ultrasonic Module Distance...,1,8.98,4.4,https://amazon.com/gp/slredirect/picassoRedire...
2,https://m.media-amazon.com/images/I/616pDRD0zJ...,Aceirmc HC-SR04 Ultrasonic Sensor Distance Mod...,1,6.99,4.6,https://amazon.com/Organizer-Ultrasonic-Distan...
3,https://m.media-amazon.com/images/I/81pEIMrGSi...,Smraza 5pcs Ultrasonic Module HC-SR04 Distance...,1,9.59,4.6,https://amazon.com/Smraza-Ultrasonic-Distance-...
4,https://m.media-amazon.com/images/I/61AMuR0djY...,ELEGOO 5PCS HC-SR04 Ultrasonic Module Distance...,1,8.98,4.4,https://amazon.com/ELEGOO-HC-SR04-Ultrasonic-D...


In [17]:
# Drop the listings with duplicate entries: rows sharing a product name are
# reduced to their first occurrence.
# The result is assigned rather than using inplace=True, which pandas
# discourages and which offers no performance benefit.
df = df.drop_duplicates(subset=['name'])

len(df)

277

In [18]:
# Reset index so the remaining rows are labelled 0..len(df)-1 again.
# The image file names are derived from these index labels.
# The result is assigned rather than using inplace=True, which pandas
# discourages and which offers no performance benefit.
df = df.reset_index(drop=True)

In [19]:
# Index of input image
# This image will be compared to every other image
_input_matches = df.index[df['name'] == input_product]

# The input product is only found if its page title is identical to the
# name scraped from the search results; fail with a readable message
# instead of a bare IndexError when it is not
if len(_input_matches) == 0:
    raise ValueError(
        'Input product {!r} was not found among the scraped search results'.format(input_product)
    )

input_index = _input_matches[0]

In [20]:
# Download all the product images to data/data below the current working
# directory, one zero-padded <index>.jpg file per row of df
get_product_images(df)

In [21]:
# Pickle dataframe to use in main project file.
# The file 'products_data' (written to the current working directory) holds
# the list [df, input_index].
with open('products_data', 'wb') as picklefile:
    pickle.dump([df, input_index], picklefile)

In [22]:
# Delete all the downloaded images (the <index>.jpg files in data/data that
# correspond to the rows of df)
delete_product_images(df)