## scrapping book info from homepage or 1 page

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
website_link = 'https://books.toscrape.com/'

In [3]:
res  = requests.get(website_link)
soup = BeautifulSoup(res.text,'html.parser')

In [4]:
data= []
for sp in soup.find_all('li',class_="col-xs-6 col-sm-4 col-md-3 col-lg-3"):
    
    
    img_link = 'https://books.toscrape.com/'+ sp.find('img').get('src')
    title    = sp.find_all('a')[-1].get('title')
    book_link= 'https://books.toscrape.com/'+ sp.find_all('a')[-1].get('href')
    rating   = sp.find('p').get('class')[-1]
    price    = sp.find('p',class_="price_color").text[1:]
    stock    = sp.find('p',class_="instock availability").text.strip()

    data.append([title,img_link,book_link,rating,price,stock])



In [5]:
data

[['A Light in the Attic',
  'https://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg',
  'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
  'Three',
  '£51.77',
  'In stock'],
 ['Tipping the Velvet',
  'https://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg',
  'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
  'One',
  '£53.74',
  'In stock'],
 ['Soumission',
  'https://books.toscrape.com/media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg',
  'https://books.toscrape.com/catalogue/soumission_998/index.html',
  'One',
  '£50.10',
  'In stock'],
 ['Sharp Objects',
  'https://books.toscrape.com/media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg',
  'https://books.toscrape.com/catalogue/sharp-objects_997/index.html',
  'Four',
  '£47.82',
  'In stock'],
 ['Sapiens: A Brief History of Humankind',
  'https://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jp

## scrapping books info from multiplepages

In [6]:
from tqdm import tqdm

data = []

for page in tqdm(range(1, 51)):
    multiple_page = 'https://books.toscrape.com/catalogue/page-' + str(page) + '.html'
    res = requests.get(multiple_page)
    soup = BeautifulSoup(res.text, 'html.parser')

    
    for sp in soup.find_all('li', class_="col-xs-6 col-sm-4 col-md-3 col-lg-3"):
        img_link = 'https://books.toscrape.com/' + sp.find('img').get('src')
        title = sp.find_all('a')[-1].get('title')
        book_link = 'https://books.toscrape.com/catalogue/' + sp.find_all('a')[-1].get('href')
        rating = sp.find('p').get('class')[-1]
        price = sp.find('p', class_="price_color").text[1:]
        stock = sp.find('p', class_="instock availability").text.strip()

        data.append([title, img_link, book_link, rating, price, stock])

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:54<00:00,  2.28s/it]


In [7]:
import pandas as pd

In [8]:
df= pd.DataFrame(data,columns=['title', 'img_link', 'book_link', 'rating', 'price','stock'])



In [9]:
df.head()

Unnamed: 0,title,img_link,book_link,rating,price,stock
0,A Light in the Attic,https://books.toscrape.com/../media/cache/2c/d...,https://books.toscrape.com/catalogue/a-light-i...,Three,£51.77,In stock
1,Tipping the Velvet,https://books.toscrape.com/../media/cache/26/0...,https://books.toscrape.com/catalogue/tipping-t...,One,£53.74,In stock
2,Soumission,https://books.toscrape.com/../media/cache/3e/e...,https://books.toscrape.com/catalogue/soumissio...,One,£50.10,In stock
3,Sharp Objects,https://books.toscrape.com/../media/cache/32/5...,https://books.toscrape.com/catalogue/sharp-obj...,Four,£47.82,In stock
4,Sapiens: A Brief History of Humankind,https://books.toscrape.com/../media/cache/be/a...,https://books.toscrape.com/catalogue/sapiens-a...,Five,£54.23,In stock


In [11]:
df.to_excel('book scrapping.xlsx',index=False)

## scrapping individual book info from each book

In [20]:
df = pd.read_excel('book scrapping.xlsx')

In [21]:
data=[]
for link in tqdm(df['book_link']):
    res = requests.get(link,timeout=10)
    soup = BeautifulSoup(res.text,'html.parser')

    typ                = soup.find('ul',class_='breadcrumb').find_all('a')[2].text
    stock_availability = soup.find('p',class_='instock availability').text.strip()
    
    UPC = soup.find_all('td')[0].text
    prod_typ = soup.find_all('td')[1].text
    price_ex=soup.find_all('td')[2].text[1:]
    price_in=soup.find_all('td')[3].text[1:]
    tax=soup.find_all('td')[4].text[1:]
    reviews=soup.find_all('td')[6].text

    
    

    data.append([typ,stock_availability,UPC,prod_typ,price_ex,price_in,tax,reviews])
    
    

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [23:50<00:00,  1.43s/it]


In [22]:
df1= pd.DataFrame(data,columns=['typ','stock_availability','UPC','prod_typ','price_ex','price_in','tax','reviews'])

In [23]:
df1.to_excel('book scrapping1.xlsx')

## combining the dataframes

In [26]:
df_1 = pd.read_excel('book scrapping.xlsx')
df_2 = pd.read_excel('book scrapping1.xlsx')


df3 = pd.DataFrame()

df3['title']         = df_1['title']
df3['upc']           = df_2['UPC']
df3['category']      = df_2['prod_typ']
df3['price_e_tax']   = df_2['price_ex']
df3['price_i_tax']   = df_2['price_in']
df3['tax']           = df_2['tax']
df3['rating']        = df_1['rating']
df3['reviews']       = df_2['reviews']
df3['stock']         = df_2['stock_availability']

df3['book_link']     = df_1['book_link']
df3['img_link']      = df_1['img_link']


In [29]:
df3.to_excel('combined book scrapping.xlsx',index=False)