# Gathering Data from Amazon Website by using Pandas, BeautifulSoup and urllib
Source tutorial: [Amazon Web Scraping Using BeautifulSoup (DataCamp)](https://www.datacamp.com/tutorial/amazon-web-scraping-using-beautifulsoup)


# What is pandas?
<span style="color: green"> 
pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series. It is free software released under the three-clause BSD license. </span>

# What is Beautiful Soup in Python?
<span style="color: green"> 
Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work.
These instructions illustrate all major features of Beautiful Soup 4, with examples. I show you what the library is good for, how it works, how to use it, how to make it do what you want, and what to do when it violates your expectations.
This document covers Beautiful Soup version 4.8.1. The examples in this documentation should work the same way in Python 2.7 and Python 3.2.</span>

# What is urllib in Python?
<span style="color: green"> 
The urllib package is Python's URL handling module. It is used to fetch URLs (Uniform Resource Locators) through its urlopen function, which can fetch URLs using a variety of different protocols. urllib is a package that collects several modules for working with URLs, such as urllib.request, urllib.parse, urllib.error and urllib.robotparser.</span>

In [17]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


##### importing the libraries

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [22]:
no_pages = 2

def _text_or_default(tag, default):
    """Return the text of a BeautifulSoup tag, or ``default`` when the tag was not found."""
    return tag.text if tag is not None else default


def get_data(pageNo):
    """Scrape one page of the Amazon India books bestseller list.

    Parameters
    ----------
    pageNo : int
        Page number; it is interpolated into the bestseller URL.

    Returns
    -------
    list of list of str
        One ``[name, author, rating, users_rated, price]`` row per product
        container found on the page. Missing fields fall back to
        ``"unknown-product"`` (name), ``'-1'`` (rating) and ``'0'``
        (author, users rated, price). The list is empty when the page
        contains no matching product containers.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    url = ('https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_pg_' + str(pageNo)
           + '?ie=UTF8&pg=' + str(pageNo))

    # NOTE(review): the HTTP status is not checked here, so a blocked or failed
    # request simply produces an empty result list.
    r = requests.get(url, headers=headers)

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, 'html.parser')

    alls = []
    for d in soup.find_all('div', attrs={'class': 'a-section a-spacing-none aok-relative'}):
        name = d.find('span', attrs={'class': 'zg-text-center-align'})
        # The product name is read from the alt text of the first image inside
        # the name span. Only look for images once the span is known to exist.
        n = name.find_all('img', alt=True) if name is not None else []

        author = d.find('a', attrs={'class': 'a-size-small a-link-child'})
        if author is None:
            # Fall back to the plain-text variant of the author element.
            author = d.find('span', attrs={'class': 'a-size-small a-color-base'})

        rating = d.find('span', attrs={'class': 'a-icon-alt'})
        users_rated = d.find('a', attrs={'class': 'a-size-small a-link-normal'})
        price = d.find('span', attrs={'class': 'p13n-sc-price'})

        alls.append([
            n[0]['alt'] if n else "unknown-product",
            _text_or_default(author, '0'),
            _text_or_default(rating, '-1'),
            _text_or_default(users_rated, '0'),
            _text_or_default(price, '0'),
        ])
    return alls


In [23]:
# Scrape every page in turn; each get_data() call returns that page's rows.
results = [get_data(page) for page in range(1, no_pages + 1)]


def flatten(l):
    """Concatenate a list of lists into a single flat list."""
    return [item for sublist in l for item in sublist]


df = pd.DataFrame(flatten(results), columns=['Book Name', 'Author', 'Rating', 'Customers_Rated', 'Price'])
df.to_csv('amazon_products.csv', index=False, encoding='utf-8')


In [24]:
df = pd.read_csv("amazon_products.csv")

In [25]:
df.shape

(0, 5)

In [26]:
(100, 5)

(100, 5)

In [27]:
df.head(61)

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price


In [32]:
# Keep the first whitespace-separated token of the rating text and make it
# numeric (presumably the score itself -- confirm against the scraped values).
# Casting to str first keeps the cell re-runnable and protects against
# read_csv having inferred a numeric dtype for the column.
df['Rating'] = pd.to_numeric(df['Rating'].astype(str).str.split().str[0])

# Drop everything that is not a digit or a decimal point (currency symbol,
# thousands separators, spaces). Matching by pattern works whether the rupee
# sign arrives as a single character or as a mis-decoded byte sequence.
price_text = df["Price"].astype(str).str.replace(r'[^\d.]', '', regex=True)
# Keep only the integer part of the price.
df['Price'] = price_text.str.split('.').str[0].astype(int)

# Remove thousands separators, then convert. Unparseable entries become NaN
# so the column is always numeric for the comparisons made further down
# (errors='ignore' is deprecated in recent pandas releases).
df["Customers_Rated"] = pd.to_numeric(
    df["Customers_Rated"].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

df.head()

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price


In [33]:
df.dtypes

Book Name          object
Author             object
Rating              int64
Customers_Rated     int64
Price               int32
dtype: object

In [34]:
# Turn both the string "0" and the numeric 0 into NaN, in every column.
# NOTE(review): the '-1' rating placeholder written by get_data() is not
# covered by this replacement -- confirm that is intended.
df = df.replace(str(0), np.nan).replace(0, np.nan)


In [36]:
# Number of missing values in each column.
count_nan = df.isna().sum()
count_nan


Book Name          0
Author             0
Rating             0
Customers_Rated    0
Price              0
dtype: int64

In [37]:
df = df.dropna()

In [38]:
# The 15 highest-priced rows, most expensive first.
data = df.sort_values(by="Price", ascending=False).head(15)

In [39]:
data

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price


In [40]:
from bokeh.models import ColumnDataSource
from bokeh.transform import dodge
import math
from bokeh.io import curdoc
curdoc().clear()
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models import Legend
output_notebook()


In [41]:
# One bar per author. `data` is sorted by price (descending), so keeping the
# first row per author keeps that author's highest-priced book. A categorical
# x_range must not contain duplicate factors.
top_priced = data.drop_duplicates(subset="Author")
authors = top_priced["Author"].tolist()

# Columns are selected by name rather than by position so the chart does not
# depend on the column order of the CSV. `width`/`height` are used instead of
# `plot_width`/`plot_height`, which Bokeh 3.0 removed.
p = figure(x_range=authors, width=800, height=550, title="Authors Highest Priced Book", toolbar_location=None, tools="")

p.vbar(x=authors, top=top_priced["Price"].tolist(), width=0.9)

p.xgrid.grid_line_color = None
p.y_range.start = 0
# Rotate the author labels by 90 degrees so long names stay readable.
p.xaxis.major_label_orientation = math.pi/2


In [42]:
show(p)

In [43]:
# Keep only the rows with more than 1000 customer ratings.
data = df.loc[df['Customers_Rated'].gt(1000)]

In [44]:
# The 15 best-rated rows of the current selection, highest rating first.
data = data.sort_values(by='Rating', ascending=False).head(15)

In [46]:
data

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price


In [47]:
# One bar per book title; a categorical x_range must not contain duplicate
# factors, so repeated titles are collapsed to their first (best-rated) row.
top_books = data.drop_duplicates(subset="Book Name")
book_names = top_books["Book Name"].tolist()

# Columns are selected by name rather than by position so the chart does not
# depend on the column order of the CSV. `width`/`height` are used instead of
# `plot_width`/`plot_height`, which Bokeh 3.0 removed.
p = figure(x_range=book_names, width=800, height=600, title="Top Rated Books with more than 1000 Customers Rating", toolbar_location=None, tools="")

p.vbar(x=book_names, top=top_books["Rating"].tolist(), width=0.9)

p.xgrid.grid_line_color = None
p.y_range.start = 0
# Rotate the title labels by 90 degrees so long names stay readable.
p.xaxis.major_label_orientation = math.pi/2


In [48]:
show(p)

In [49]:
# One bar per author. `data` is sorted by rating (descending), so keeping the
# first row per author keeps that author's best-rated book. A categorical
# x_range must not contain duplicate factors.
top_authors = data.drop_duplicates(subset="Author")
author_names = top_authors["Author"].tolist()

# Columns are selected by name rather than by position so the chart does not
# depend on the column order of the CSV. `width`/`height` are used instead of
# `plot_width`/`plot_height`, which Bokeh 3.0 removed.
p = figure(x_range=author_names, width=800, height=600, title="Top Rated Books with more than 1000 Customers Rating", toolbar_location=None, tools="")

p.vbar(x=author_names, top=top_authors["Rating"].tolist(), width=0.9)

p.xgrid.grid_line_color = None
p.y_range.start = 0
# Rotate the author labels by 90 degrees so long names stay readable.
p.xaxis.major_label_orientation = math.pi/2


In [50]:
show(p)


In [51]:
# The 20 rows with the most customer ratings, largest count first.
data = df.sort_values(by="Customers_Rated", ascending=False).head(20)


In [52]:
data

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price


# Top Authors: Rating vs. Customers Rated


In [54]:
from bokeh.transform import factor_cmap
from bokeh.models import Legend
from bokeh.palettes import Dark2_5 as palette
import itertools
from bokeh.palettes import d3
# `colors` is an endless iterator over whatever `palette` refers to at this
# point, i.e. the Dark2_5 palette imported under that name above.
colors = itertools.cycle(palette)

# Rebind `palette` to the 20-entry d3 Category20 palette. `colors` is not
# affected and keeps cycling over the Dark2_5 values.
palette = d3['Category20'][20]


In [57]:
# Map each distinct author to one palette colour. The factors are passed as a
# plain list of unique names (in order of first appearance) so that repeated
# authors do not use up palette slots.
index_cmap = factor_cmap('Author', palette=palette, factors=data["Author"].unique().tolist())

In [56]:
# `width`/`height` are used instead of `plot_width`/`plot_height`, which
# Bokeh 3.0 removed.
p = figure(width=700, height=700, title = "Top Authors: Rating vs. Customers Rated")
# `legend_field` names the column used for legend entries; it replaces the
# deprecated `legend=` keyword, which was removed in Bokeh 3.0.
p.scatter('Rating','Customers_Rated',source=data,fill_alpha=0.6, fill_color=index_cmap,size=20,legend_field='Author')
p.xaxis.axis_label = 'RATING'
p.yaxis.axis_label = 'CUSTOMERS RATED'
p.legend.location = 'top_left'




In [59]:
show(p)