In [3]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests
import json
import warnings
import time
warnings.filterwarnings("ignore")

#### Getting all links from the given files

In [4]:
""" build links list  """

def get_links(paperback_category_file, hardback_category_file):
    all_links = []
    paperback_links = []
    hardback_links = []

    with open(paperback_category_file, 'r') as f:
        paperback_links = f.read().split('\n')

    with open(hardback_category_file, 'r') as f:
        hardback_links = f.read().split('\n')

    if paperback_links[-1] == '':
        paperback_links = paperback_links[:-1]

    if hardback_links[-1] == '':
        hardback_links = hardback_links[:-1]    

    print(len(paperback_links)) # for debugging purposes 
    print(len(hardback_links))  # for debugging purposes 

    all_links.extend(paperback_links)
    all_links.extend(hardback_links)
    
    print(len(all_links))   # for debugging purposes 
    
    return all_links

#### Getting book name:

In [5]:
def get_book_name(soup):
    book_name = soup.find("h1", attrs = {"itemprop":"name"})
    
    if book_name != None:
        book_name_lower_cased = book_name.text.lower().strip()
        
        if "untitled" in book_name_lower_cased:
            book_name = None
        else:
            book_name = book_name.text.strip()
            
    return book_name

#### Getting book rate:

In [6]:
def get_book_rate(soup):
    books_rate = None
    book_rate_option1 = soup.find("span", attrs = {"itemprop":"ratingValue"})
    
    if book_rate_option1 != None: 
        books_rate = book_rate_option1.text.strip()
    else:
        book_rate_option2 = soup.find("div", attrs = {"itemprop":"ratingValue"})
        if book_rate_option2 != None:
            books_rate = book_rate_option2.text.strip().split()[0]# getting only the rate number out of the sentence ("xxx out of 5 stars").  
   
    return books_rate

#### Getting book format "paperback" vs "hardback":

In [7]:
# we decided that paperback book will be 1,
# and hardback book will be 0

def get_book_format_cover(soup): 
    paper_back, hard_back = None, None
    
    book_cover_option1 = soup.find("ul", attrs = {"class":"meta-info"}).find("li")
    book_cover_option2 = soup.find("label", text = "Format").parent.find("span")
    
    if book_cover_option1 != None:
        book_cover_option1 = book_cover_option1.text.strip()   
    elif book_cover_option2 != None:
        book_cover_option2 = book_cover_option2.text.split()[0].strip()    
      

    if book_cover_option1 == "Paperback" or book_cover_option2 == "Paperback":
        paper_back = 1
        hard_back = 0
    elif book_cover_option1 == "Hardback" or book_cover_option2 == "Hardback":
        paper_back = 0
        hard_back = 1
    
    return paper_back, hard_back  

#### Getting book author:

In [8]:
def get_book_author(soup):
    author_name_temp = None
    author_name = soup.find("span", attrs = {"itemprop":"name"})
    
    if author_name != None:
        author_name_temp = author_name.text.strip()
    
    return author_name_temp

#### Getting book dimensions:

In [9]:
def get_book_dimensions(soup):
    length_in_mm, width_in_mm, depth_in_mm, weight_in_grams = None,None,None,None
    book_dimensions = soup.find("label", text = "Dimensions").parent.find("span")
    
    if book_dimensions != None:
        book_dimensions = book_dimensions.text.replace("\n", "").replace(" ", "").replace("x", " ").replace("|"," ")
        book_dimensions = book_dimensions.replace("mm","").replace("g","").strip()
        
        width_in_mm = book_dimensions.split()[0]
        length_in_mm = book_dimensions.split()[1]
        depth_in_mm = book_dimensions.split()[2]
        weight_in_grams = book_dimensions.split()[3]
    
    return length_in_mm, width_in_mm, depth_in_mm, weight_in_grams

#### Getting book publication date:

In [10]:
def get_book_publication_date(soup):
    day, month, year = None,None,None
    publication_date = soup.find("span", attrs = {"itemprop":"datePublished"})
    
    if publication_date != None:
        publication_date = publication_date.text.strip()
        day = publication_date.split()[0]
        month = publication_date.split()[1]
        year = publication_date.split()[2]
    
    return day, month, year

#### Getting book publisher:

In [11]:
def get_book_publisher(soup):
    publisher = soup.find("span", attrs = {"itemprop":"publisher"}).find("span")
    
    if publisher != None:
        publisher = publisher.text.strip()
        
    return publisher   

#### Getting publication location:

In [12]:
def get_publication_location(soup):
    city , country = None, None
    location = soup.find('label', text='Publication City/Country').parent.find("span")

    if location != None:
        location = location.text.strip()
        if ',' in location:
            city = location.split(',')[0].strip()
            country = location.split(',')[1].strip()
        else:
            city = location.strip()
            country = city 
            
    return city, country

#### Getting book language:

In [13]:
def get_book_language(soup):
    language = None
    book_language_first_attemp = soup.find("span", attrs = {"itemprop":"inLanguage"}).find("a")
    book_language_second_attemp= soup.find("label", text = "Language").parent.find("span")
    if book_language_first_attemp != None:
        language = book_language_first_attemp.text.strip()
    elif book_language_second_attemp != None:
        language = book_language_second_attemp.text.strip()
   
    return language

#### Getting book best seller rank:

In [14]:
def get_book_best_sellers_rank(soup):
    rank = soup.find("label", text = "Bestsellers rank").parent.find("span")
    if rank != None:
        rank = rank.text.strip()
   
    return rank   

#### Getting book price:

In [15]:
def get_book_price(soup):
    price = soup.find("span", attrs = {"class":"sale-price"})
    if price != None:
        price = price.text.replace("₪", "").strip()
 
    return price     

#### Getting book length:

In [16]:
def get_book_length(soup):
    number_of_pages = soup.find("span", attrs = {"itemprop":"numberOfPages"})
    if number_of_pages != None:
        number_of_pages = number_of_pages.text.split()[0].strip()
    
    return number_of_pages

## Now we proceed to building the dataframes

In [17]:
#saving resources

session = requests.Session()

In [18]:
# return the next soup for the given url, got the session to prevent network delay  

def get_soup(session, next_url):
    response = session.get(next_url) 
    soup =  BeautifulSoup(response.content, "html.parser")  # convert to BeautifulSoup format
     
    return soup

### Getting book product details:

In [19]:
def get_book_details(soup, genre_type):
    
    ## Book Name ##    
    try:
        name_of_the_book = get_book_name(soup)
        if name_of_the_book == None:
            name_of_the_book = ""
    except:
        name_of_the_book = "" 


    ## BOOK RATE ## 
    try:
        rate_of_the_book = get_book_rate(soup)
        if rate_of_the_book == None:
            rate_of_the_book = ""
    except:
        rate_of_the_book = ""
    
    
    ## IS THE SUCCESSFUL BOOK ##
    is_successfull_book = 0
    if rate_of_the_book != "" and float(rate_of_the_book) >= 4:
        is_successfull_book = 1

        
    ## BOOK_COVER ##
    try:
        paper_back, hard_back = get_book_format_cover(soup)
        if paper_back == None:
            paper_back = ""
        if hard_back == None:
            hard_back = ""
    except:
        paper_back, hard_back = "", ""

        
    ## Author_Name ##  
    try:
        author_name = get_book_author(soup)
        if author_name == None:
            author_name = ""
    except:
        author_name = ""
    
    
    
    ## Book length ##
    try:
        pages = get_book_length(soup)
        if pages == None:
            pages = ""
    except:
        pages = ""
    
    
    ## Book dimensions ##
    try:
        length_in_mm, width_in_mm, depth_in_mm, weight_in_grams = get_book_dimensions(soup)
        if length_in_mm == None:
            length_in_mm = ""
        if width_in_mm == None:
            width_in_mm = ""
        if depth_in_mm == None:
            depth_in_mm = "" 
        if weight_in_grams == None:
            weight_in_grams = "" 
    except:
        length_in_mm, width_in_mm, depth_in_mm, weight_in_grams = "", "", "", "" 
    
   
    ## Book publication date ##
    try:
        day, month, year = get_book_publication_date(soup)
        if day == None:
            day = ""
        if month == None:
            month = ""
        if year == None:
            year = "" 
    except:
        day, month, year = "","",""
    
   
    ## Book publisher ##
    try:
        publisher_name = get_book_publisher(soup)
        if publisher_name == None:
            publisher_name = ""
    except:
        publisher_name = ""
    
   
    ## Book publication location ##
    try:
        city, country = get_publication_location(soup)
        if city == None:
            city = ""
        if country == None:
            country = ""
    except:
        city, country = "", ""
    
    
    ## Book language ##
    try:
        book_language = get_book_language(soup)
        if book_language == None:
            book_language = ""
    except:
        book_language = ""
    
    
    ##Books best sellers rank ##
    try:
        best_seller_rank = get_book_best_sellers_rank(soup)
        if best_seller_rank == None:
            best_seller_rank = ""
    except:
        best_seller_rank = ""
    
    
    ## Book price ## 
    try:
        price = get_book_price(soup)
        if price == None:
            price = ""
    except:
        price = ""
        
           
    genre.append(genre_type)
    book_names.append(name_of_the_book)
    rates.append(rate_of_the_book)
    successfull_book.append(is_successfull_book) # a column that we invented which will indicate if the book's rank greater or equal to 4 
    paperback.append(paper_back)
    hardback.append(hard_back)
    author.append(author_name)
    number_of_pages.append(pages)
    book_length_mm.append(length_in_mm)
    book_width_mm.append(width_in_mm)
    book_depth_mm.append(depth_in_mm)
    book_weight_grams.append(weight_in_grams)
    publication_day.append(day)
    publication_month.append(month)
    publication_year.append(year)
    publisher.append(publisher_name)
    publication_city.append(city)
    publication_country.append(country)
    language.append(book_language)
    bestsellers_rank.append(best_seller_rank)
    price_nis.append(price)

In [20]:
""" first we get all the data for each book , then we build a dictionary that will hold the data"""

def getting_dict_data_for_given_genre(links_list, genre_type_of_the_book):
    for link in links_list:
        current_soup = get_soup(session, link) 
        get_book_details(current_soup, genre_type_of_the_book)
        time.sleep(2)


    dict_data = {'book_title' : book_names, 'book_genre' : genre, 'book_rate' : rates , 'successfull_book' : successfull_book,
             'paperback' : paperback, 'hardback' : hardback, 'author' : author, 'number_of_pages' : number_of_pages,
             'book_length_mm' : book_length_mm, 'book_width_mm' : book_width_mm, 'book_depth_mm' : book_depth_mm, 
             'book_weight_grams' : book_weight_grams, 'publication_day' : publication_day, 
             'publication_month' : publication_month, 'publication_year' : publication_year,
             'publisher' : publisher, 'publication_city' : publication_city, 'publication_country' : publication_country,
             'language' : language,  'bestsellers_rank' : bestsellers_rank, 'price_nis' : price_nis}
   
    return dict_data  

## Fiction DataFrame

In [19]:
""" every time we want to clear all the links before creating a new dataframe """
genre = []
book_names = []
rates = []
successfull_book = [] # a column that we invented which will indicate if the book's rank greater or equal to 4 
paperback = []
hardback = []
author = []
number_of_pages = []
book_length_mm = []
book_width_mm = []
book_depth_mm = []
book_weight_grams = []
publication_day = []
publication_month = []
publication_year = []
publisher = []
publication_city = []
publication_country = []
language = []
bestsellers_rank = []
price_nis = []

In [20]:
fiction_links =  get_links('fiction_paperback_links.txt', 'fiction_hardback_links.txt')
genre_type_of_the_book = "contemporary fiction"
dict_fiction_data = getting_dict_data_for_given_genre(fiction_links, genre_type_of_the_book)

# creating the fiction dataframe
fiction_df = pd.DataFrame(data = dict_fiction_data)

fiction_df.head()

1650
1650
3300


Unnamed: 0,book_title,book_genre,book_rate,successfull_book,paperback,hardback,author,number_of_pages,book_length_mm,book_width_mm,...,book_weight_grams,publication_day,publication_month,publication_year,publisher,publication_city,publication_country,language,bestsellers_rank,price_nis
0,It Ends With Us: The most heartbreaking novel ...,contemporary fiction,4.44,1,1,0,Colleen Hoover,384,198,130,...,270,2,Aug,2016,Simon & Schuster Ltd,London,United Kingdom,English,1,52.42
1,Where the Crawdads Sing,contemporary fiction,4.46,1,1,0,Delia Owens,384,126,195,...,265,14,Jan,2020,"Little, Brown Book Group",London,United Kingdom,English,19,45.74
2,The Midnight Library : The No.1 Sunday Times b...,contemporary fiction,4.08,1,1,0,Matt Haig,304,198,129,...,224,18,Feb,2021,Canongate Books Ltd,Edinburgh,United Kingdom,English,9,49.88
3,Normal People : One million copies sold,contemporary fiction,3.85,0,1,0,Sally Rooney,288,198,129,...,240,23,Jul,2019,Faber & Faber,London,United Kingdom,English,11,50.21
4,1984 : The dystopian classic reimagined with c...,contemporary fiction,4.19,1,1,0,George Orwell,336,181,111,...,184,1,Oct,2008,Penguin Books Ltd,London,United Kingdom,English,59,40.41


In [47]:
# saving fiction df to csv file.

fiction_df.to_csv('final_fiction_dataframe.csv')

In [48]:
fiction_df = pd.read_csv("final_fiction_dataframe.csv", index_col = [0])

## Mind,Body & Spirit DataFrame
### for convenience purposes we'll refer Mind,Body & Spirit as mbs from now on.

In [21]:
""" every time we want to clear all the links before creating a new dataframe """
genre = []
book_names = []
rates = []
successfull_book = [] # a column that we invented which will indicate if the book's rank greater or equal to 4 
paperback = []
hardback = []
author = []
number_of_pages = []
book_length_mm = []
book_width_mm = []
book_depth_mm = []
book_weight_grams = []
publication_day = []
publication_month = []
publication_year = []
publisher = []
publication_city = []
publication_country = []
language = []
bestsellers_rank = []
price_nis = []

In [22]:
mbs_links = get_links('mbs_paperback_links.txt', 'mbs_hardback_links.txt')
genre_type_of_the_book = "mind,body & spirit"
dict_mbs_data = getting_dict_data_for_given_genre(mbs_links, genre_type_of_the_book)

# creating the mbs dataframe
mbs_df = pd.DataFrame(data = dict_mbs_data)

mbs_df.head()

1650
1650
3300


Unnamed: 0,book_title,book_genre,book_rate,successfull_book,paperback,hardback,author,number_of_pages,book_length_mm,book_width_mm,...,book_weight_grams,publication_day,publication_month,publication_year,publisher,publication_city,publication_country,language,bestsellers_rank,price_nis
0,Milk and Honey,"mind,body & spirit",4.03,1,1,0,Rupi Kaur,208,196,127,...,204,8,Jul,2016,Andrews McMeel Publishing,Kansas City,United States,English,597,54.15
1,The Power of Now : (20th Anniversary Edition),"mind,body & spirit",4.13,1,1,0,Eckhart Tolle,224,231,189,...,162,7,Jan,2016,Hodder & Stoughton,London,United Kingdom,English,95,58.12
2,"The Body Keeps the Score : Mind, Brain and Bod...","mind,body & spirit",4.45,1,1,0,Bessel van der Kolk,560,198,129,...,385,24,Sep,2015,Penguin Books Ltd,London,United Kingdom,English,37,67.39
3,"The Happiness Trap : Stop Struggling, Start Li...","mind,body & spirit",4.06,1,1,0,Russ Harris,288,133,200,...,204,26,Jun,2008,"Little, Brown Book Group",London,United Kingdom,English,456,49.35
4,The Four Agreements : A Practical Guide to Per...,"mind,body & spirit",4.16,1,1,0,Don Miguel Ruiz,160,184,130,...,176,7,Nov,1997,"Amber-Allen Publishing,U.S.",San Rafael,CA,English,85,53.27


In [23]:
# saving mbs df to csv file.

mbs_df.to_csv('final_mbs_dataframe.csv')

## Sport DataFrame

In [31]:
""" every time we want to clear all the links before creating a new dataframe """
genre = []
book_names = []
rates = []
successfull_book = [] # a column that we invented which will indicate if the book's rank greater or equal to 4 
paperback = []
hardback = []
author = []
number_of_pages = []
book_length_mm = []
book_width_mm = []
book_depth_mm = []
book_weight_grams = []
publication_day = []
publication_month = []
publication_year = []
publisher = []
publication_city = []
publication_country = []
language = []
bestsellers_rank = []
price_nis = []

In [32]:
sport_links = get_links('sport_paperback_links.txt', 'sport_hardback_links.txt')
dict_sport_data = getting_dict_data_for_given_genre(sport_links, "sport")

# creating the sport dataframe
sport_df = pd.DataFrame(data = dict_sport_data)

sport_df.head()

1650
1650
3300


Unnamed: 0,book_title,book_genre,book_rate,successfull_book,paperback,hardback,author,number_of_pages,book_length_mm,book_width_mm,...,book_weight_grams,publication_day,publication_month,publication_year,publisher,publication_city,publication_country,language,bestsellers_rank,price_nis
0,Legacy,sport,4.19,1,1,0,James Kerr,224,215,136,...,244.0,20,Aug,2015,"Little, Brown Book Group",London,United Kingdom,English,6262,55.04
1,"Born to Run : The hidden tribe, the ultra-runn...",sport,4.29,1,1,0,Christopher McDougall,304,194,126,...,238.0,15,Apr,2010,Profile Books Ltd,London,United Kingdom,English,1826,49.5
2,Yoga Anatomy,sport,4.27,1,1,0,Leslie Kaminoff,288,254,178,...,707.6,1,Nov,2011,Human Kinetics Publishers,Champaign,IL,English,2794,124.25
3,The Salt Path : The 80-week Sunday Times bests...,sport,4.09,1,1,0,Raynor Winn,288,198,129,...,202.0,31,Jan,2019,Penguin Books Ltd,London,United Kingdom,English,587,54.11
4,The 4-Hour Body : An Uncommon Guide to Rapid F...,sport,3.74,0,1,0,Timothy Ferriss,592,234,153,...,758.0,28,Jun,2011,Ebury Publishing,London,United Kingdom,English,11987,89.71


In [33]:
# saving sport df to csv file.

sport_df.to_csv('final_sport_dataframe.csv')