#### Automating web scraping of Amazon e-commerce search results

###### Packages:
1. AutoScraper (pip install autoscraper)
2. Pandas (pip install pandas)
3. openpyxl (pip install openpyxl)

In [1]:
from pathlib import Path

from autoscraper import AutoScraper
import pandas as pd

# Destination of the CSV export written by the last cell. The parent folder is
# created here, up front, so the export cannot fail on a missing directory.
filepath = Path('download/earbudds_title_list.csv')
filepath.parent.mkdir(exist_ok=True, parents=True)

In [2]:
# Search-result pages: the scraper is built against the first URL and the
# resulting rules are then applied to the second.
amazon_url1 = 'https://www.amazon.co.uk/s?k=headphones'
amazon_url2 = 'https://www.amazon.co.uk/s?k=earbuds'

# Sample values copied from one product on the first page (title, price and,
# presumably, its review count). They must match the live page text exactly,
# so refresh them when the listing changes.
data_wanted_list = [
    'Sony MDR-ZX110 Overhead Headphones - Black , BASIC, Pack of 1',
    '£12.75',
    '39,914',
]
# Alternative sample taken from an earbuds listing (unused):
# data_wanted_list2 = ['JLab Go Air Pop True Wireless Earbuds, Headphones In Ear, Bluetooth Earphones with Microphone, Wireless Ear Buds, TWS Bluetooth Earbuds with Mic, USB Charging Case, Dual Connect, EQ3 Sound, Black', '£24.99', '34,239']

In [3]:
# Create an AutoScraper object
scraper = AutoScraper()

# Optional proxy: replace BOTH placeholders with a real proxy endpoint to route
# the request through it. While a placeholder is still present, no proxy
# settings are sent and the request is made directly.
proxy = {
    "http": 'PROXY_ENDPOINT_HERE',
    "https": 'PROXY_ENDPOINT_HERE',
}
proxy_configured = 'PROXY_ENDPOINT_HERE' not in proxy.values()
request_args = {"proxies": proxy} if proxy_configured else None

# Learn the scraping rules from the first search page (amazon_url1) using the
# sample values in data_wanted_list. The call is kept as the last expression
# of the cell so its return value is displayed as the cell output.
scraper.build(url=amazon_url1, wanted_list=data_wanted_list, request_args=request_args)


['Amazon\'s Choicefor "headphones"',
 'KVIDIO Bluetooth Headphones Over Ear, 65 Hours Playtime Wireless Headphones with Microphone, Foldable Lightweight Headset with Deep Bass,HiFi Stereo Sound for Travel Work PC Cellphone (Black)',
 'Sony WH-CH520 Wireless Bluetooth Headphones - up to 50 Hours Battery Life with Quick Charge, On-ear style - Black',
 'Sony MDR-ZX110 Overhead Headphones - Black , BASIC, Pack of 1',
 '3 Pack Earphones, In-Ear Headphones Wired Earphones with Microphone and Volume Control, Noise Isolating and Deep Bass, Lightweight Earphones, 3.5 mm Earbuds Compatible with iPhone, iPad, Android',
 'Best Sellerin Noise Cancelling Headphones',
 'soundcore by Anker Q20i Hybrid Active Noise Cancelling Foldable Headphones, Wireless Over-Ear Bluetooth, 40H Long ANC Playtime, Hi-Res Audio, Big Bass, Customize via an App, Transparency Mode',
 'JBL Tune510BT - Wireless on-ear headphones featuring Bluetooth 5.0, up to 40 hours battery life and speed charge, in black',
 'Wireless Earb

In [4]:
# Run the learned rules against the first search page. With grouped=True the
# result is a dict; its keys are what set_rule_aliases / keep_rules take below.
result1 = scraper.get_result_similar(amazon_url1, grouped=True)

# Keys of result1, in order
result1_key_list = list(result1)

# Fail with an actionable message instead of a bare IndexError when nothing was
# scraped, and before any rules are modified or saved.
if not result1_key_list:
    raise RuntimeError(
        "The scraper returned no results for amazon_url1. The request may have "
        "been blocked or the sample values in data_wanted_list may no longer "
        "appear on the page; refresh them and rebuild the scraper."
    )

# The first key is treated as the title rule
title_rule_id = result1_key_list[0]

# Expose that rule under the alias 'Title'
scraper.set_rule_aliases({title_rule_id: 'Title'})

# Drop every other rule so only the title rule remains
scraper.keep_rules([title_rule_id])

# Save the scraper rules configuration with the alias and kept rule
scraper.save('amazon-search')

# Load the saved configuration back (shows how to restore it in a later session)
scraper.load('amazon-search')


In [5]:
# Apply the kept rule to the earbuds search page. With group_by_alias=True the
# returned dict is keyed by rule alias ('Title'), as the printed output shows.
result2 = scraper.get_result_similar(
    url=amazon_url2,
    group_by_alias=True,
)
print(result2)

{'Title': ['Amazon\'s Choicefor "earbuds"', 'Wireless Earbuds, Bluetooth 5.3 Headphones In Ear with 4 ENC Noise Cancelling Mic, Btootos New Bluetooth Earbuds Mini Deep Bass Stereo Sound, 36H Playtime LED Display Wireless Earphones IP7 Waterproof', 'Wireless Earbuds, Bluetooth 5.3 Headphones in Ear with HiFi Stereo Deep Bass, 4 ENC Noise Cancelling Mic Wireless Earphones 40H Playtime, Bluetooth Earbuds Dual LED Display, IP7 Waterproof, USB-C', 'Wireless Earbuds - Bluetooth 5.3 Headphones with 4 ENC Noise Cancelling Mics - HiFi Stereo Deep Bass - 40 Hour Playtime In Ear Earphones - USB C Charging - Perfect for Sports, Work, and Leisure', 'soundcore by Anker P20i True Wireless Earbuds, 10mm Drivers with Big Bass, Bluetooth 5.3, 30H Long Playtime, IPX5 Water-Resistant, 2 Mics for AI Clear Calls, 22 Preset EQs, Customization via App', 'soundcore by Anker A20i True Wireless Earbuds, Bluetooth 5.3, With App for Custom Sound, 28H Long Playtime, Water-Resistant, 2 Mics for Clear Calls, Single E

In [6]:
# Flatten every group of scraped values in result2 into one flat list of titles
ear_budds_list = [
    title
    for titles in result2.values()
    for title in titles
]

# Single-column DataFrame holding the scraped titles
df = pd.DataFrame(ear_budds_list, columns=["earbuds Title"])

# Export twice: an Excel workbook named "ear_budds_list.xlsx" (relative path)
# and a CSV at `filepath`
df.to_excel("ear_budds_list.xlsx")
df.to_csv(filepath)