In [1]:
#Importing libraries to scrape data off the web and create a dataframe of the collected information
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
#Set up an empty dataframe that will hold one row per book.
#The columns are the fields we want for data analysis (the analysis itself happens in a future project).
book_columns = [
    'Book Name',
    'Category',
    'Number of Reviews',
    'Rating out of 5',
    'Book Price',
    'Stock on Hand',
]
df = pd.DataFrame(columns=book_columns)

In [5]:
#We scrape from "http://books.toscrape.com/catalogue/category/books_1/page-1.html".
#The catalogue is spread over 50 pages (1000 books in total) and we need to visit every one of them.
#range() excludes its upper limit, so we add 1 to the last page number to make sure page 50 is included.
first_page, last_page = 1, 50
pages = range(first_page, last_page + 1)

In [6]:
#This cell visits every catalogue page, follows the link of every book on it and collects one row of data per book.
#It is still 1 cell with a FOR LOOP inside another FOR LOOP, but the parsing of a single book page lives in a
#small helper function so the loops themselves stay short.

#Seconds to wait for the server before giving up on a request.
#Without a timeout a stalled connection would make the cell hang forever.
REQUEST_TIMEOUT = 30


def parse_book_page(page_soup):
    """Extract one dataframe row from the parsed HTML of a single book page.

    Parameters
    ----------
    page_soup : BeautifulSoup
        The parsed product page of one book.

    Returns
    -------
    list
        [book title, category, number of reviews, rating word, price, stock on hand],
        in the same order as the dataframe columns.

    Every value is read through BeautifulSoup (tag text and attributes) instead of slicing the
    HTML source as a string. Slicing the source keeps HTML escapes such as '&amp;' in the title
    and breaks as soon as attribute order or quoting changes. If an expected element is missing,
    the lookup raises an exception instead of producing a silently wrong row.
    """
    #The <div> with class="col-sm-6 product_main" holds the price, the stock message and the star rating
    metrics = page_soup.find("div", class_="col-sm-6 product_main")

    #PRICE - the text starts with a currency symbol, so keep only the digits and the decimal point before converting to a float
    price_text = metrics.find("p", class_="price_color").text
    price = float("".join(ch for ch in price_text if ch.isdigit() or ch == "."))

    #STOCK ON HAND - the text looks like "In stock (22 available)"; the digits are the number of available books
    stock_text = metrics.find("p", class_="instock availability").text
    stock_num = int("".join(ch for ch in stock_text if ch.isdigit()))

    #RATING OUT OF 5 - the rating is stored as a second CSS class next to 'star-rating', e.g. class="star-rating Three"
    star_classes = metrics.find("p", class_="star-rating")["class"]
    rating = next(name for name in star_classes if name != "star-rating")

    #NUMBER OF REVIEWS - in the product information table, the <td> next to the <th> labelled 'Number of reviews'
    info_table = page_soup.find("table", class_="table table-striped")
    review_header = info_table.find("th", string="Number of reviews")
    review_num = int(review_header.find_next_sibling("td").text)

    #CATEGORY - the third link in the breadcrumb - this will be used for analysis later
    breadcrumb = page_soup.find("ul", class_="breadcrumb")
    cat = breadcrumb.find_all("a", href=True)[2].text

    #BOOK NAME - the active (last) breadcrumb entry; .text gives the title with HTML escapes already decoded
    booktitle = breadcrumb.find("li", class_="active").text.strip()

    return [booktitle, cat, review_num, rating, price, stock_num]


#Rows are collected in a plain list and turned into a dataframe once at the end.
#Starting from an empty list on every run means re-running this cell does not add every book a second time.
book_rows = []

#A Session reuses the same connection for all requests (50 catalogue pages + 1 page per book)
#and the 'with' block closes it again, even if the cell stops with an error.
with requests.Session() as session:

    #1st FOR loop: Loop through the "pages" range to recreate each catalogue URL and request it
    for page in pages:

        #create a URL string using the current page number and the static portions of the URL
        #We need to convert 'page' to a string so it can be concatenated with the static string portions of the URL
        URL = "http://books.toscrape.com/catalogue/category/books_1/page-" + str(page) + ".html"

        #get the page; raise_for_status() stops the cell on an HTTP error instead of parsing an error page
        r = session.get(URL, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()

        #Now, using Beautiful Soup, we parse the response so we can interact with the HTML code in Python
        soup = BeautifulSoup(r.content, 'html.parser')

        #A bit of context:
        #If you use the inspector in your browser and search for the name of a book, you will find it under an <li> tag
        #with a class="col-xs-6 col-sm-4 col-md-3 col-lg-3" attribute.
        #Each book has its own <li> tag so we need a collection of <li> tags - we can use the find_all() method for this!
        #The <li> tags live under a single <ol> tag with a class='row' attribute - we can use the find() method for this!
        BooksOnPage = soup.find("ol", class_='row').find_all("li",
                                                     class_="col-xs-6 col-sm-4 col-md-3 col-lg-3")

        #2nd FOR LOOP: iterate through each <li> tag, follow the link to the book's own page and parse it
        for bts in BooksOnPage:

            #the <a> tag inside the <h3> tag carries a link relative to the catalogue page (it starts with '../../')
            #urljoin() resolves it against the URL of the current catalogue page to get the full book URL
            href = bts.find("h3").find("a", href=True)["href"]
            book_url = urljoin(URL, href)

            p = session.get(book_url, timeout=REQUEST_TIMEOUT)
            p.raise_for_status()
            page_soup = BeautifulSoup(p.content, 'html.parser')

            book_rows.append(parse_book_page(page_soup))

        #this portion just allows you to keep track of pages as they are scraped
        #When I didn't have this piece in, I was not able to see what was going on so I thought my code was not working
        if page == pages[-1]:
            print('All books on pages have been processed!')
        else:
            print('All books for page ',page,' have been processed. Moving on to page',page + 1,'...')

#assign the collected rows to df, reusing the column names of the dataframe created earlier
#building the dataframe in one go also gives the numeric columns numeric dtypes
df = pd.DataFrame(book_rows, columns=df.columns)

All books for page  1  have been processed. Moving on to page 2 ...
All books for page  2  have been processed. Moving on to page 3 ...
All books for page  3  have been processed. Moving on to page 4 ...
All books for page  4  have been processed. Moving on to page 5 ...
All books for page  5  have been processed. Moving on to page 6 ...
All books for page  6  have been processed. Moving on to page 7 ...
All books for page  7  have been processed. Moving on to page 8 ...
All books for page  8  have been processed. Moving on to page 9 ...
All books for page  9  have been processed. Moving on to page 10 ...
All books for page  10  have been processed. Moving on to page 11 ...
All books for page  11  have been processed. Moving on to page 12 ...
All books for page  12  have been processed. Moving on to page 13 ...
All books for page  13  have been processed. Moving on to page 14 ...
All books for page  14  have been processed. Moving on to page 15 ...
All books for page  15  have been pro

In [7]:
#Quick look at the first five rows of the dataframe to check that the scraped data looks the way we expect
df.head(n=5)

Unnamed: 0,Book Name,Category,Number of Reviews,Rating out of 5,Book Price,Stock on Hand
0,A Light in the Attic,Poetry,0,Three,51.77,22
1,Tipping the Velvet,Historical Fiction,0,One,53.74,20
2,Soumission,Fiction,0,One,50.1,20
3,Sharp Objects,Mystery,0,Four,47.82,20
4,Sapiens: A Brief History of Humankind,History,0,Five,54.23,20


In [8]:
#last sanity check: the website we scraped from has 1000 books, so our dataframe should have exactly 1000 rows
EXPECTED_BOOKS = 1000

#positive = books are missing, negative = the dataframe has more rows than the website has books
row_gap = EXPECTED_BOOKS - len(df)

if row_gap == 0:
    print('Dataframe has 1000 rows therefore all books were successfully process')
elif row_gap > 0:
    print('Oops! It looks like we are missing ', row_gap, 'books!')
else:
    #more rows than books means some books were added more than once; report that instead of a negative "missing" count
    print('Oops! It looks like we have ', -row_gap, 'more rows than books - the dataframe probably contains duplicates!')

Dataframe has 1000 rows therefore all books were successfully process
