## Scrape books' information from online book store (https://books.toscrape.com/)


In [1]:
import pandas as pd
import requests
import bs4

In [2]:
base_url = 'https://books.toscrape.com/catalogue/page-{}.html'

In [3]:
page_num = 12

In [4]:
base_url.format(page_num)

'https://books.toscrape.com/catalogue/page-12.html'

In [5]:
res = requests.get(base_url.format(1))

In [8]:
# res.text

In [6]:
soup = bs4.BeautifulSoup(res.text,'lxml')

In [7]:
soup

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--><!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]--><!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:30" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="../static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="../static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="

In [9]:
products = soup.select(".product_pod")

In [10]:
example = products[0]
example

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [11]:
# extract book title
example.select('a')[1]['title']

'A Light in the Attic'

In [12]:
# extract book price
example.select('p')

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>,
 <p class="price_color">Â£51.77</p>,
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>]

In [13]:
example.select('p')[1].text[1:]

'£51.77'

In [14]:
# extract book availability text
example.select('p')[2]

<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>

In [15]:
example.select('p')[2].text.strip()

'In stock'

In [16]:
# extract book rating
example.select('p')[0]

<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>

In [17]:
example.select('p')[0]['class'][1]

'Three'

In [18]:
# get all books' title, price,in stock availability, and rating
book_titles = []
book_prices = []
book_availability = []
book_ratings = []

for n in range(1,50):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = bs4.BeautifulSoup(res.text,'lxml')
    books = soup.select(".product_pod")
    
    for book in books:
        book_title = book.select('a')[1]['title']
        book_titles.append(book_title)
        book_price = book.select('p')[1].text[1:]
        book_prices.append(book_price)
        book_avail = book.select('p')[2].text.strip()
        book_availability.append(book_avail)
        book_rating = book.select('p')[0]['class'][1]
        book_ratings.append(book_rating)

In [19]:
book_titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 'In Her Wake',
 'How Music Works',
 'Foolproof Preserving: A Guide to Small Batch Jams, Jellies, Pickles, Condiments, and More: A Foolproof Guide to Making Small Batch Jams, Jellies, Pickles, 

In [20]:
book_prices

['£51.77',
 '£53.74',
 '£50.10',
 '£47.82',
 '£54.23',
 '£22.65',
 '£33.34',
 '£17.93',
 '£22.60',
 '£52.15',
 '£13.99',
 '£20.66',
 '£17.46',
 '£52.29',
 '£35.02',
 '£57.25',
 '£23.88',
 '£37.59',
 '£51.33',
 '£45.17',
 '£12.84',
 '£37.32',
 '£30.52',
 '£25.27',
 '£34.53',
 '£54.64',
 '£22.50',
 '£53.13',
 '£40.30',
 '£44.18',
 '£17.66',
 '£31.05',
 '£23.82',
 '£36.89',
 '£15.94',
 '£33.29',
 '£18.02',
 '£19.63',
 '£52.22',
 '£33.63',
 '£57.31',
 '£26.41',
 '£47.61',
 '£23.11',
 '£45.07',
 '£31.77',
 '£50.27',
 '£14.27',
 '£44.18',
 '£18.78',
 '£25.52',
 '£16.28',
 '£31.12',
 '£19.49',
 '£17.27',
 '£19.09',
 '£56.13',
 '£56.41',
 '£56.50',
 '£45.22',
 '£38.16',
 '£54.11',
 '£42.96',
 '£23.89',
 '£16.77',
 '£20.59',
 '£37.13',
 '£56.06',
 '£58.11',
 '£49.05',
 '£40.76',
 '£19.73',
 '£32.24',
 '£41.83',
 '£39.58',
 '£39.25',
 '£25.02',
 '£51.04',
 '£19.83',
 '£50.40',
 '£13.61',
 '£13.34',
 '£18.97',
 '£36.28',
 '£10.16',
 '£15.44',
 '£48.41',
 '£46.35',
 '£14.07',
 '£14.86',
 '£33.37',

In [21]:
book_availability

['In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',

In [22]:
book_ratings

['Three',
 'One',
 'One',
 'Four',
 'Five',
 'One',
 'Four',
 'Three',
 'Four',
 'One',
 'Two',
 'Four',
 'Five',
 'Five',
 'Five',
 'Three',
 'One',
 'One',
 'Two',
 'Two',
 'One',
 'Two',
 'Three',
 'Five',
 'Five',
 'Three',
 'Three',
 'Three',
 'Five',
 'Four',
 'Five',
 'Three',
 'Five',
 'One',
 'Five',
 'Three',
 'Two',
 'One',
 'Four',
 'Two',
 'Three',
 'Two',
 'Five',
 'Five',
 'Two',
 'One',
 'Five',
 'Four',
 'Four',
 'Three',
 'One',
 'One',
 'Three',
 'Four',
 'Five',
 'One',
 'One',
 'One',
 'Four',
 'Three',
 'Four',
 'Three',
 'Four',
 'Four',
 'Three',
 'Five',
 'One',
 'One',
 'Four',
 'Three',
 'Three',
 'One',
 'Five',
 'Four',
 'Two',
 'Two',
 'Three',
 'Two',
 'Two',
 'Three',
 'Five',
 'Five',
 'One',
 'Two',
 'Three',
 'Four',
 'One',
 'One',
 'Three',
 'Two',
 'Two',
 'Two',
 'Four',
 'Two',
 'Three',
 'Two',
 'One',
 'Two',
 'Five',
 'Four',
 'Five',
 'Two',
 'Three',
 'One',
 'One',
 'Two',
 'Three',
 'Four',
 'One',
 'Two',
 'Two',
 'Four',
 'Three',
 'Four

In [23]:
book_info = pd.DataFrame(list(zip(book_titles,book_prices, book_availability,book_ratings)),
              columns=['book_title','book_price', 'book_availability','book_rating_stars'])

In [24]:
book_info

Unnamed: 0,book_title,book_price,book_availability,book_rating_stars
0,A Light in the Attic,£51.77,In stock,Three
1,Tipping the Velvet,£53.74,In stock,One
2,Soumission,£50.10,In stock,One
3,Sharp Objects,£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,£54.23,In stock,Five
...,...,...,...,...
975,Icing (Aces Hockey #2),£40.44,In stock,Four
976,"Hawkeye, Vol. 1: My Life as a Weapon (Hawkeye #1)",£45.24,In stock,Three
977,Having the Barbarian's Baby (Ice Planet Barbar...,£34.96,In stock,Four
978,"Giant Days, Vol. 1 (Giant Days #1-4)",£56.76,In stock,Four


In [25]:
book_info.to_csv(r'C:\bootcamp_ml\Sample_data.csv', encoding='utf-8', header=True,index= False)