# Building a WebScaper to gather Data of Books Selling on Amazon.in

#### This Jupyter Notebook was used to test and write code, Final program can also be ran from 'main.py'

In [2]:
#importing BeautifulSoup 4 and selenium
from bs4 import BeautifulSoup
import selenium

In [5]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


### Start the webdriver

In [25]:
# startup the webdriver
driver = webdriver.Chrome(ChromeDriverManager().install())



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\boraw\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [7]:
url = "https://www.amazon.in/"
driver.get(url)

In [274]:
def get_url(search_term):
    """Generates a URL from the search term."""
    generic_url = 'https://www.amazon.in/s?k={}'
    search_term = search_term.replace(' ','+') # conforming to the amazon's url format for search terms with scapes
   
    url = generic_url.format(search_term)
    
    # added page query
    url += '&page={}'
    
    return url

In [252]:
url = get_url('Books')
url

'https://www.amazon.in/s?k=Books&page{}'

In [253]:
driver.get(url)

### Extract the Collection

In [129]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [61]:
results = soup.find_all(class_= "a-section a-spacing-small s-padding-left-small s-padding-right-small")

In [159]:
results = soup.find_all('div',{'data-component-type':'s-search-result'})

In [161]:
len(results)

22

In [224]:
item = results[0]

In [225]:
atag = item.h2.a

In [226]:
title = atag.text.strip()
title

'How To Win Friends & Influence People, International Bestseller'

In [227]:
product_url = "https:amazon.in" + atag.get('href')
product_url

'https:amazon.in/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A03269371LC14WGRP6KNN&url=%2FFriends-Influence-People-International-Bestseller%2Fdp%2F8194899133%2Fref%3Dsr_1_1_sspa%3Fkeywords%3DBooks%26qid%3D1649696266%26sr%3D8-1-spons%26psc%3D1&qualifier=1649696266&id=6708276933000838&widgetName=sp_atf'

In [228]:
price_parent = item.find('span','a-price')

In [267]:
price_parent.find('span','a-offscreen').text[1:]

'109'

In [230]:
rating = item.i.text
rating

'4.5 out of 5 stars'

In [231]:
rating_count  = item.find('span',{"class":"a-size-base s-underline-text"}).text
rating_count

'63,852'

In [233]:
author = item.find('a',{'class':'a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style'}).text.strip()
author

'Dale Carnegie'

### General Function for getting details

In [237]:
def extract_details(item):
    """Extracts details of a single record"""
    atag = item.h2.a 
    title = atag.text.strip()
    
    product_url = "https://amazon.in" + atag.get('href')
    price_parent = item.find('span','a-price')
    price = price_parent.find('span','a-offscreen').text[1:]
    rating = item.i.text
    rating_count  = item.find('span',{"class":"a-size-base s-underline-text"}).text
    
    try: # used try block as some author names are missing
        author = item.find('a',{'class':'a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style'}).text.strip()
    except AttributeError:
        author = 'unknown'
        
    output = (title,author,product_url,price,rating, rating_count)
    
    return output


In [238]:
records = []
results = soup.find_all('div',{'data-component-type':'s-search-result'})

for item in results:
    records.append(extract_details(item))

In [1]:
records[0]

NameError: name 'records' is not defined


# Put it all together

In [281]:
def get_url(search_term):
    """Generates a URL from the search term."""
    generic_url = 'https://www.amazon.in/s?k={}'
    search_term = search_term.replace(' ','+') # conforming to the amazon's url format for search terms with scapes
   
    url = generic_url.format(search_term)
    
    # added page query
    url += '&page={}'
    
    return url
def extract_details(item):
    """Extracts details of a single record"""
    atag = item.h2.a 
    title = atag.text.strip()
    
    product_url = "https://amazon.in" + atag.get('href')
    price_parent = item.find('span','a-price')
    
    try: # used try block for errors due to missing values
        price = price_parent.find('span','a-offscreen').text[1:]
        author = item.find('a',{'class':'a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style'}).text.strip()
        rating = item.i.text
        rating_count  = item.find('span',{"class":"a-size-base s-underline-text"}).text
    except AttributeError:
        price = 'unkown'
        author = 'unknown'
        rating = 'unknown'
        rating_count = 'unknown'
        
    output = (title,author,product_url,price,rating, rating_count)
    return output


def main(search_term):
    """Run Main program"""
    # startup the webdriver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    records = []
    url = get_url(search_term)
    
    for page in range(1,21):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div',{'data-component-type':'s-search-result'})
        
        for item in results:
            record = extract_details(item)
            if record:
                records.append(record)
    
    driver.close()
    df = pd.DataFrame(records)
    df.columns = ['Title','Author','Link','Price',"Rating",'Rating Count']
    df.to_csv('amazon_books_records.csv', sep=',')


In [291]:
url = get_url('Books')
url

'https://www.amazon.in/s?k=Books&page={}'

In [292]:
for page in range(1,21):
    print(url.format(page))

https://www.amazon.in/s?k=Books&page=1
https://www.amazon.in/s?k=Books&page=2
https://www.amazon.in/s?k=Books&page=3
https://www.amazon.in/s?k=Books&page=4
https://www.amazon.in/s?k=Books&page=5
https://www.amazon.in/s?k=Books&page=6
https://www.amazon.in/s?k=Books&page=7
https://www.amazon.in/s?k=Books&page=8
https://www.amazon.in/s?k=Books&page=9
https://www.amazon.in/s?k=Books&page=10
https://www.amazon.in/s?k=Books&page=11
https://www.amazon.in/s?k=Books&page=12
https://www.amazon.in/s?k=Books&page=13
https://www.amazon.in/s?k=Books&page=14
https://www.amazon.in/s?k=Books&page=15
https://www.amazon.in/s?k=Books&page=16
https://www.amazon.in/s?k=Books&page=17
https://www.amazon.in/s?k=Books&page=18
https://www.amazon.in/s?k=Books&page=19
https://www.amazon.in/s?k=Books&page=20


In [282]:
main('Books')



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\boraw\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


---END OF NOTEBOOK ---