# Amazon Orders Web Scraping

## Create scraper class

In [None]:
import numpy as np
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import requests
import csv
import datetime
import time

import os  
from selenium import webdriver  
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.chrome.options import Options 

class AmazonOrderScraper:
    """Scrape an Amazon.co.uk order history into a pandas DataFrame.

    A Chrome window is opened on the Amazon sign-in page and the user logs in
    by hand. The scraper then walks the order-history pages of each requested
    year and collects, for every order, the date, the cost and the order id.
    The collected values are kept on the instance (``date``, ``cost``,
    ``order_id``) and accumulate across calls.
    """

    # Seconds granted to the user to complete the manual sign-in.
    # Increase if you need more time.
    LOGIN_WAIT_SECONDS = 30
    # Seconds to wait before reading the source of a freshly opened page.
    PAGE_LOAD_WAIT_SECONDS = 2
    # Step between the ``startIndex`` values of consecutive history pages.
    ORDERS_PER_PAGE = 10
    # ``class`` attribute of the spans that hold the per-order header values.
    ORDER_VALUE_CLASS = "a-color-secondary value"

    def __init__(self):
        self.date = np.array([])
        self.cost = np.array([])
        self.order_id = np.array([])

    def URL(self, year: int, start_index: int) -> str:
        """Return the order-history URL for ``year`` starting at ``start_index``."""
        return (
            "https://www.amazon.co.uk/gp/your-account/order-history/"
            "ref=ppx_yo_dt_b_pagination_1_4?ie=UTF8&orderFilter=year-"
            f"{year}&search=&startIndex={start_index}"
        )

    def scrape_order_data(self, start_year: int, end_year: int) -> pd.DataFrame:
        """Scrape every order from ``start_year`` to ``end_year`` inclusive.

        The prepared data set is also written to ``amazon-orders.csv`` in the
        current working directory.

        Returns:
            The prepared order data, indexed by order id.
        """
        driver = self.start_driver_and_manually_login_to_amazon()

        try:
            for year in range(start_year, end_year + 1):
                driver.get(self.URL(year, 0))

                # Must be read while the first page of the year is displayed.
                number_of_pages = self.find_max_number_of_pages(driver)
                self.scrape_first_page_before_progressing(driver)

                for i in range(number_of_pages):
                    self.scrape_page(driver, year, i)

                print(f"Order data extracted for { year }")
        finally:
            # quit() ends the whole browser session, even when scraping fails
            # part-way through, so no driver process is left behind.
            driver.quit()

        print("Scraping done :)")

        order_data = pd.DataFrame({
            "Date": self.date,
            "Cost £": self.cost,
            "Order ID": self.order_id,
        })
        order_data = self.prepare_dataset(order_data)
        order_data.to_csv(r"amazon-orders.csv")

        return order_data

    def start_driver_and_manually_login_to_amazon(self) -> "webdriver.Chrome":
        """Open Chrome on the Amazon sign-in page and wait for a manual login.

        Returns:
            The driver of the opened browser window.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")

        # NOTE(review): recent Selenium 4 releases appear to no longer accept
        # the driver path as a positional argument - confirm against the
        # installed Selenium version before upgrading.
        driver = webdriver.Chrome("chromedriver.exe", options=options)
        driver.get("https://www.amazon.co.uk/ap/signin?_encoding=UTF8&accountStatusPolicy=P1&openid.assoc_handle=gbflex&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.co.uk%2Fgp%2Fcss%2Forder-history%3Fie%3DUTF8%26ref_%3Dnav_orders_first&pageId=webcs-yourorder&showRmrMe=1")
        time.sleep(self.LOGIN_WAIT_SECONDS)

        return driver

    def find_max_number_of_pages(self, driver: "webdriver.Chrome") -> int:
        """Return how many pages follow the currently displayed one.

        The result is the number of ``li.a-normal`` and ``li.a-selected``
        elements minus one (the page already shown), and never negative.

        NOTE(review): presumably this undercounts when the pagination bar does
        not list every page - verify against a year with many orders.
        """
        time.sleep(self.PAGE_LOAD_WAIT_SECONDS)
        page_content = BeautifulSoup(driver.page_source, "html.parser")

        listed_pages = (
            len(page_content.find_all("li", {"class": "a-normal"}))
            + len(page_content.find_all("li", {"class": "a-selected"}))
        )

        # A page without any pagination entries would otherwise yield -1.
        return max(listed_pages - 1, 0)

    def scrape_first_page_before_progressing(self, driver: "webdriver.Chrome") -> None:
        """Collect the orders of the page the driver is currently showing."""
        time.sleep(self.PAGE_LOAD_WAIT_SECONDS)
        self._collect_orders(driver.page_source)

    def scrape_page(self, driver: "webdriver.Chrome", year: int, i: int) -> None:
        """Open follow-up page ``i`` (0-based) of ``year`` and collect its orders.

        Follow-up page ``i`` starts at order ``(i + 1) * ORDERS_PER_PAGE``; the
        first page (start index 0) is handled separately.
        """
        start_index = (i + 1) * self.ORDERS_PER_PAGE

        driver.get(self.URL(year, start_index))
        time.sleep(self.PAGE_LOAD_WAIT_SECONDS)

        self._collect_orders(driver.page_source)

    def _collect_orders(self, page_source: str) -> None:
        """Append the date, cost and order id of every order in ``page_source``."""
        page_content = BeautifulSoup(page_source, "html.parser")
        values = [
            tag.text.strip()
            for tag in page_content.find_all("span", {"class": self.ORDER_VALUE_CLASS})
        ]
        if not values:
            return

        # The values are read in repeating groups of three:
        # date, cost, order id.
        self.date = np.append(self.date, values[0::3])
        self.cost = np.append(self.cost, values[1::3])
        self.order_id = np.append(self.order_id, values[2::3])

    def prepare_dataset(self, order_data: pd.DataFrame) -> pd.DataFrame:
        """Index by order id, convert the cost to a number and add date parts.

        ``order_data`` needs the columns ``Date``, ``Cost £`` and ``Order ID``
        and is modified in place.

        Returns:
            The same frame with the numeric ``Cost £`` column plus the columns
            ``Order Date``, ``Year``, ``Month Number``, ``Day``,
            ``Day Of Week`` and ``Month``.
        """
        order_data.set_index("Order ID", inplace=True)

        # astype(str) keeps the .str accessor usable when nothing was scraped
        # (the column is then an empty float column).
        cost_text = order_data['Cost £'].astype(str)
        for symbol in ("£", ","):  # "," is the thousands separator
            cost_text = cost_text.str.replace(symbol, "", regex=False)
        order_data['Cost £'] = cost_text.astype(float)

        order_dates = pd.to_datetime(order_data['Date'])
        order_data['Order Date'] = order_dates
        order_data['Year'] = order_dates.dt.year
        order_data['Month Number'] = order_dates.dt.month
        order_data['Day'] = order_dates.dt.dayofweek  # Monday is 0
        order_data['Day Of Week'] = order_dates.dt.day_name()
        order_data['Month'] = order_dates.dt.month_name()

        return order_data
    

## Scrape data

In [None]:
# Scrape every order placed from 2010 to 2021 (both years included).
# A browser window opens first and waits for a manual Amazon sign-in.
aos = AmazonOrderScraper()
order_data = aos.scrape_order_data(start_year=2010, end_year=2021)

## Analyse data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Configure seaborn's plotting defaults, with a white figure background.
sns.set(rc={'figure.facecolor':'white'})

In [None]:
# Report the size of the scraped data set.
row_count, column_count = order_data.shape
print(f"Columns: { column_count }")
print(f"Rows: { row_count }")

In [None]:
# Preview the first rows of the prepared order data.
order_data.head()

In [None]:
# Preview the last rows of the prepared order data.
order_data.tail()

In [None]:
# Summary statistics of the order data columns.
order_data.describe()

### Total amount spent

In [None]:
# Sum of all order costs, printed as a currency amount with two decimals so
# that binary floating-point artefacts of the sum do not show up.
total_amount_spent = order_data["Cost £"].sum()
print(f"Total amount spent: £{total_amount_spent:.2f}")

In [None]:
# Mean order cost, always printed with two decimals (e.g. 12.30, not 12.3).
average_amount_spent_per_order = order_data["Cost £"].mean()
print(f"Average amount spent per order: £{average_amount_spent_per_order:.2f}")

### Most and least expensive orders

In [None]:
# Every order whose cost equals the highest cost (ties included).
highest_cost = order_data['Cost £'].max()
order_data[order_data['Cost £'] == highest_cost]

In [None]:
# Every order whose cost equals the lowest cost (ties included).
lowest_cost = order_data['Cost £'].min()
order_data[order_data['Cost £'] == lowest_cost]

### Top five most expensive orders

In [None]:
# Orders sorted from most to least expensive; head() keeps the first five.
order_data.sort_values("Cost £", ascending=False).head()

### Total spending per year

In [None]:
# Total spend per year. Only the cost column is aggregated: summing the whole
# frame would also try to add up the datetime and text columns, which current
# pandas versions no longer silently drop.
fig, ax = plt.subplots(figsize=(15,6))
yoy_cost = order_data.groupby('Year', as_index=False)['Cost £'].sum()
sns.lineplot(x=yoy_cost['Year'], y=yoy_cost['Cost £'], color="grey");
plt.title("How much spending per year?");
plt.ylabel('Spending £')

### Count of orders per year

In [None]:
# Number of orders per year: after count(), the 'Cost £' column holds the
# number of cost entries recorded for each year.
yoy_order_count = order_data.groupby('Year', as_index=False).count()

fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=yoy_order_count, x='Year', y='Cost £', color="Grey", ax=ax);
ax.set_title("How many orders per year?");
ax.set_ylabel('Count of Orders')

### Total monthly spend

In [None]:
# Calendar order for the bars (groupby sorts the month names alphabetically).
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Total spend per month name, across all years. Only the cost column is
# summed, so the datetime and text columns never enter the aggregation.
fig, ax = plt.subplots(figsize=(15,6))
monthly_cost = order_data.groupby('Month', as_index=False)['Cost £'].sum()
sns.barplot(x=monthly_cost['Month'], y=monthly_cost['Cost £'], order=months, color="Grey");
plt.ylabel('Spending £')
plt.title("How much overall spending per month?");

### Average monthly spend

In [None]:
# Calendar order for the bars (groupby sorts the month names alphabetically).
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Mean order cost per month name, across all years. Only the cost column is
# averaged: a mean over the whole frame would also hit the text columns.
fig, ax = plt.subplots(figsize=(15,6))
monthly_cost = order_data.groupby('Month', as_index=False)['Cost £'].mean()
sns.barplot(x=monthly_cost['Month'], y=monthly_cost['Cost £'], order=months, color="Grey");
plt.ylabel('Spending £')
plt.title("Average spending per month?");

### Day of the week with highest spending

In [None]:
# Monday-first order for the bars (groupby sorts the day names alphabetically).
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Total spend per weekday. Only the cost column is summed, so the datetime
# and text columns never enter the aggregation.
fig, ax = plt.subplots(figsize=(15,6))
day_of_week_cost = order_data.groupby('Day Of Week', as_index=False)['Cost £'].sum()
sns.barplot(x=day_of_week_cost['Day Of Week'], y=day_of_week_cost['Cost £'], order=days_of_week, color="Grey");
plt.ylabel('Spending £')
plt.title("Which day of the week has the highest spend?");

### All data time series

In [None]:
# Cost of the orders plotted against their order date.
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=order_data, x='Order Date', y='Cost £', color="Grey", ax=ax);
ax.set_ylabel('Spending £')
ax.set_title("Spending Time Series");