# Importing libraries

In [7]:
from bs4 import BeautifulSoup
import re # import Regular expression operations module
import requests
from time import gmtime, strftime
import time
import pandas as pd
import datetime
from bs4 import Tag

# Functions to import data attributes

In [3]:
def get_data_asin(bsElement: Tag) -> str:
    """Return the value of the element's ``data-asin`` attribute as a string.

    Args:
        bsElement: Element whose ``attrs`` mapping is read.

    Raises:
        KeyError: If the element has no ``data-asin`` attribute.
    """
    return str(bsElement.attrs['data-asin'])

def get_data_uuid(bsElement: Tag) -> str:
    """Return the value of the element's ``data-uuid`` attribute as a string.

    Args:
        bsElement: Element whose ``attrs`` mapping is read.

    Raises:
        KeyError: If the element has no ``data-uuid`` attribute.
    """
    return str(bsElement.attrs['data-uuid'])

def get_hyperlink(bsElement: Tag) -> str:
    """Return the absolute amazon.com URL of the product link in the element.

    The first ``<a>`` carrying the expected link classes is looked up and its
    ``href`` is appended to ``https://www.amazon.com``.

    Args:
        bsElement: Search-result element to look inside.

    Returns:
        The absolute URL, or ``' '`` when the link (or its ``href``) is
        missing, matching the placeholder used by the other extractors.
    """
    anchor = bsElement.find(
        'a',
        attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'},
    )
    # find() returns None when nothing matches; do not let one result
    # without a link abort the whole scrape.
    if anchor is None:
        return ' '
    href = anchor.attrs.get('href')
    if not href:
        return ' '
    return 'https://www.amazon.com' + str(href)

def get_product_name(bsElement: Tag) -> str:
    """Return the text of the first product-title span in the element.

    Args:
        bsElement: Search-result element to look inside.

    Returns:
        The span text, or ``' '`` when no matching span exists, matching the
        placeholder used by the other extractors.
    """
    title_span = bsElement.find(
        'span', attrs={'class': 'a-size-medium a-color-base a-text-normal'}
    )
    # Indexing an empty find_all() result raised IndexError; fall back instead.
    if title_span is None:
        return ' '
    return str(title_span.text)

def get_product_image(bsElement: Tag) -> str:
    """Return the ``src`` of the first ``<img class="s-image">`` in the element.

    Args:
        bsElement: Search-result element to look inside.

    Returns:
        The image URL as a string, or ``' '`` when the image (or its ``src``)
        is missing, matching the placeholder used by the other extractors.
    """
    image = bsElement.find('img', attrs={'class': 's-image'})
    # find() returns None when nothing matches.
    if image is None:
        return ' '
    src = image.attrs.get('src')
    if not src:
        return ' '
    return str(src)

def get_product_ratings(bsElement: Tag) -> str:
    """Return the first space-separated token of the rating span's text.

    The rating is read from the first ``<span class="a-icon-alt">`` inside
    ``bsElement`` itself; no module-level state is consulted.

    Args:
        bsElement: Search-result element to look inside.

    Returns:
        The leading token of the span text, or ``''`` when the element has no
        rating span.
    """
    rating_span = bsElement.find('span', attrs={'class': 'a-icon-alt'})
    if rating_span is None:
        # The ' ' placeholder split on " " yields an empty first token.
        return ''
    return str(rating_span.text.split(" ")[0])

def get_total_reviews(bsElement: Tag) -> str:
    """Return the text of the review-count span in the element.

    The value is read from the first
    ``<span class="a-size-base s-underline-text">`` inside ``bsElement``
    itself; no module-level state is consulted.

    Args:
        bsElement: Search-result element to look inside.

    Returns:
        The span text, or ``' '`` when the element has no such span.
    """
    reviews_span = bsElement.find(
        'span', attrs={'class': 'a-size-base s-underline-text'}
    )
    if reviews_span is None:
        return ' '
    return str(reviews_span.text)

def get_product_price(bsElement: Tag) -> str:
    """Return the price text built from the whole and fraction price spans.

    Both ``<span class="a-price-whole">`` and
    ``<span class="a-price-fraction">`` are read from ``bsElement`` itself; no
    module-level state is consulted.

    Args:
        bsElement: Search-result element to look inside.

    Returns:
        The whole-part text followed by the fraction text, just the whole-part
        text when the fraction span is missing, or ``' '`` when the element
        has no whole-part span.
    """
    whole_span = bsElement.find('span', attrs={'class': 'a-price-whole'})
    if whole_span is None:
        return ' '
    fraction_span = bsElement.find('span', attrs={'class': 'a-price-fraction'})
    # A price without a fraction span should not raise AttributeError.
    fraction_text = fraction_span.text if fraction_span is not None else ''
    return str(whole_span.text + fraction_text)

# Function to get product details

In [4]:
def get_product_details(bsElement: Tag) -> dict:
    """Collect every extracted attribute of one search result into a dict.

    Each extractor is called once with ``bsElement``, in the order listed, and
    its return value is stored under the paired key.

    Args:
        bsElement: Search-result element passed to every extractor.

    Returns:
        A dict with the keys ``data_asin``, ``data_uuid``, ``hyperlink``,
        ``product_name``, ``product_image``, ``product_rating``,
        ``totalReviews`` and ``product_price``.
    """
    field_extractors = (
        ("data_asin", get_data_asin),
        ("data_uuid", get_data_uuid),
        ("hyperlink", get_hyperlink),
        ("product_name", get_product_name),
        ("product_image", get_product_image),
        ("product_rating", get_product_ratings),
        ("totalReviews", get_total_reviews),
        ("product_price", get_product_price),
    )
    return {field: extract(bsElement) for field, extract in field_extractors}

# Amazon Data Scraping

In [5]:
# Headers sent with every search request.
HEADERS = {
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
    "accept-language": "en"
}
start = time.time()
all_results = []
# Request search pages 1 through 19 of the "iphone" query.
for page_number in range(1, 20):
    URL = 'https://www.amazon.com/s?k=iphone&page={}&ref=sr_pg_{}'.format(page_number, page_number)
    print(URL)
    try:
        # Without a timeout a stalled connection would block the run forever.
        page = requests.get(URL, headers=HEADERS, timeout=30)
    except requests.RequestException as error:
        # One failed page must not discard the results collected so far.
        print('Request failed for {}: {}'.format(URL, error))
    else:
        if page.status_code == 200:
            product_page = BeautifulSoup(page.text, "html.parser")
            main_div = product_page.find_all("div", attrs={"data-component-type": "s-search-result"})
            # `main_div` and `i` stay module-level names so that any helper
            # still looking up main_div[i] sees the element being processed;
            # the page counter has its own name and is no longer shadowed.
            for i, result in enumerate(main_div):
                data = get_product_details(result)
                all_results.append(data)
        else:
            # Report pages that were skipped instead of dropping them silently.
            print('Skipping {}: HTTP status {}'.format(URL, page.status_code))
    # Pause between consecutive page requests.
    time.sleep(10)
end = time.time()
total_time = end-start
print("The time of execution of above program is :",
      (total_time) * 10**3, "ms")

https://www.amazon.com/s?k=iphone&page=1&ref=sr_pg_1
https://www.amazon.com/s?k=iphone&page=2&ref=sr_pg_2
https://www.amazon.com/s?k=iphone&page=3&ref=sr_pg_3
https://www.amazon.com/s?k=iphone&page=4&ref=sr_pg_4
https://www.amazon.com/s?k=iphone&page=5&ref=sr_pg_5
https://www.amazon.com/s?k=iphone&page=6&ref=sr_pg_6
https://www.amazon.com/s?k=iphone&page=7&ref=sr_pg_7
https://www.amazon.com/s?k=iphone&page=8&ref=sr_pg_8
https://www.amazon.com/s?k=iphone&page=9&ref=sr_pg_9
https://www.amazon.com/s?k=iphone&page=10&ref=sr_pg_10
https://www.amazon.com/s?k=iphone&page=11&ref=sr_pg_11
https://www.amazon.com/s?k=iphone&page=12&ref=sr_pg_12
https://www.amazon.com/s?k=iphone&page=13&ref=sr_pg_13
https://www.amazon.com/s?k=iphone&page=14&ref=sr_pg_14
https://www.amazon.com/s?k=iphone&page=15&ref=sr_pg_15
https://www.amazon.com/s?k=iphone&page=16&ref=sr_pg_16
https://www.amazon.com/s?k=iphone&page=17&ref=sr_pg_17
https://www.amazon.com/s?k=iphone&page=18&ref=sr_pg_18
https://www.amazon.com/s?k=i

# Saving results to a csv file

In [6]:
# Write every scraped product record to a CSV file in the working directory.
# The path is handed straight to pandas; the previous extra open() call only
# created an empty "amazon_products.csv" and left its file handle unclosed.
out = pd.DataFrame.from_records(all_results)
save_name = "products_amazon.csv"
out.to_csv(save_name)