## Scrape books' information from online book store (https://books.toscrape.com/)


In [1]:
import pandas as pd
import requests
import bs4

In [2]:
base_url = 'https://books.toscrape.com/catalogue/page-{}.html'

In [3]:
page_num = 12

In [4]:
base_url.format(page_num)

'https://books.toscrape.com/catalogue/page-12.html'

In [5]:
res = requests.get(base_url.format(1))

In [8]:
# res.text

In [6]:
soup = bs4.BeautifulSoup(res.text,'lxml')

In [26]:
#soup

In [9]:
products = soup.select(".product_pod")

In [10]:
example = products[0]
example

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [11]:
# extract book title
example.select('a')[1]['title']

'A Light in the Attic'

In [12]:
# extract book price
example.select('p')

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>,
 <p class="price_color">Â£51.77</p>,
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>]

In [13]:
example.select('p')[1].text[1:]

'£51.77'

In [14]:
# extract book availability text
example.select('p')[2]

<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>

In [15]:
example.select('p')[2].text.strip()

'In stock'

In [16]:
# extract book rating
example.select('p')[0]

<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>

In [17]:
example.select('p')[0]['class'][1]

'Three'

In [18]:
# get all books' title, price,in stock availability, and rating
book_titles = []
book_prices = []
book_availability = []
book_ratings = []

for n in range(1,50):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = bs4.BeautifulSoup(res.text,'lxml')
    books = soup.select(".product_pod")
    
    for book in books:
        book_title = book.select('a')[1]['title']
        book_titles.append(book_title)
        book_price = book.select('p')[1].text[1:]
        book_prices.append(book_price)
        book_avail = book.select('p')[2].text.strip()
        book_availability.append(book_avail)
        book_rating = book.select('p')[0]['class'][1]
        book_ratings.append(book_rating)

In [27]:
book_titles[:10]

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria']

In [28]:
book_prices[:10]

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15']

In [29]:
book_availability[:10]

['In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock']

In [30]:
book_ratings[:10]

['Three', 'One', 'One', 'Four', 'Five', 'One', 'Four', 'Three', 'Four', 'One']

In [23]:
book_info = pd.DataFrame(list(zip(book_titles,book_prices, book_availability,book_ratings)),
              columns=['book_title','book_price', 'book_availability','book_rating_stars'])

In [24]:
book_info

Unnamed: 0,book_title,book_price,book_availability,book_rating_stars
0,A Light in the Attic,£51.77,In stock,Three
1,Tipping the Velvet,£53.74,In stock,One
2,Soumission,£50.10,In stock,One
3,Sharp Objects,£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,£54.23,In stock,Five
...,...,...,...,...
975,Icing (Aces Hockey #2),£40.44,In stock,Four
976,"Hawkeye, Vol. 1: My Life as a Weapon (Hawkeye #1)",£45.24,In stock,Three
977,Having the Barbarian's Baby (Ice Planet Barbar...,£34.96,In stock,Four
978,"Giant Days, Vol. 1 (Giant Days #1-4)",£56.76,In stock,Four


In [25]:
book_info.to_csv(r'C:\bootcamp_ml\Sample_data.csv', encoding='utf-8', header=True,index= False)