![](book.jpg)
* Photo by [Fallon Michael](https://unsplash.com/@fallonmichaeltx) on [Unsplash](https://unsplash.com/photos/qmlGWIaIgpo)

# Web scraping for goodreads.com.
* We have scrapped goodreads.com website to get two hundred best book details, as voted on by the general Goodreads community.
* Total 2 pages were scrapped since each page contains 100 books information.
* We have used BeautifulSoup library for our scrapping.

In [1]:
# web scraping
import requests
from bs4 import BeautifulSoup

# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks as cf

# Data science imports
import pandas as pd
import numpy as np

%matplotlib inline

# Options for pandas
pd.options.display.max_columns = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
cf.go_offline(connected=True)

# Set global theme
cf.set_config_file(world_readable=True, theme='polar')

##  Importing Beautiful soup and creating the soup. Extracting data

In [2]:
url="https://www.goodreads.com"
headers ={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Connection' : 'keep-alive'}
books=[]
for i in range(1,3):
    response = requests.get(url+'/list/show/1.Best_Books_Ever?page='+str(i), headers = headers)
    gr_soup = BeautifulSoup(response.text, "html.parser")
    tables = gr_soup.find_all('table', attrs={'class':'tableList'})
    table=tables[0]
    books = books+table.find_all('tr',attrs={'itemtype':'http://schema.org/Book'})

## Checking the type and length of books

In [3]:
print(type(books))
print(len(books))

<class 'list'>
200


## Extracting the data we require for our analysis

In [4]:
import re
book_list = []
for book in books:
    item = {}
    book_div = book.find('div', attrs = {'data-resource-type':'Book'})
    item['id'] = book_div['data-resource-id']
    item['book_url']=book_div.a['href']
    item['name'] = book.find('span', attrs = {'role':'heading'}).text
    item['author'] = book.find('a', attrs = {'class':'authorName'}).text
    item['author_url']=book.find('a',attrs={'class':'authorName'})['href']
    item['score'] =re.sub('[^0-9]','', book.find('a', attrs = {'href':'#'}).text) 
    book_response = requests.get(url+item['book_url'], headers = headers)
    book_soup = BeautifulSoup(book_response.text, 'html.parser')
    item['isbn'] = book_soup.select("meta[property = 'books:isbn']")[0]['content']
    item['rating']=re.findall(r'\d+\.\d+', book_soup.find('span',attrs={'itemprop':'ratingValue'}).text)[0]
    item['rating_count'] = book_soup.select("meta[itemprop='ratingCount']")[0]['content']
    item['review_count'] = book_soup.select("meta[itemprop='reviewCount']")[0]['content']
    item['page_count'] = book_soup.select("meta[property = 'books:page_count']")[0]['content']
    item['year'] = re.findall(r'\d{4}', book_soup.select("div#details div.row")[1].text.strip())[0]
    item['genres'] = book_soup.find('a', attrs = {'class':"actionLinkLite bookPageGenreLink"}).text
    book_list.append(item)


## Creating our dataframe

In [5]:
import pandas as pd
df = pd.DataFrame(book_list)

In [6]:
df.head()

Unnamed: 0,id,book_url,name,author,author_url,score,isbn,rating,rating_count,review_count,page_count,year,genres
0,2767052,/book/show/2767052-the-hunger-games,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,https://www.goodreads.com/author/show/153394.S...,2959964,9780439023481.0,4.33,6227649,170045,374,2008,Young Adult
1,2,/book/show/2.Harry_Potter_and_the_Order_of_the...,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,https://www.goodreads.com/author/show/1077326....,2604164,9780439358071.0,4.5,2432082,40814,870,2004,Fantasy
2,2657,/book/show/2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,Harper Lee,https://www.goodreads.com/author/show/1825.Har...,2236599,,4.28,4380949,89315,324,2006,Classics
3,1885,/book/show/1885.Pride_and_Prejudice,Pride and Prejudice,Jane Austen,https://www.goodreads.com/author/show/1265.Jan...,1943333,,4.26,2908472,64576,279,2000,Classics
4,41865,/book/show/41865.Twilight,"Twilight (Twilight, #1)",Stephenie Meyer,https://www.goodreads.com/author/show/941441.S...,1452749,9780316015844.0,3.59,4842942,102549,501,2006,Young Adult


##  Creating .csv file from our dataframe

In [7]:
df = df.to_csv('goodreads.csv', index = False)

# Importing the data

In [8]:
df = pd.read_csv('goodreads.csv')
df.head()

Unnamed: 0,id,book_url,name,author,author_url,score,isbn,rating,rating_count,review_count,page_count,year,genres
0,2767052,/book/show/2767052-the-hunger-games,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,https://www.goodreads.com/author/show/153394.S...,2959964,9780439000000.0,4.33,6227649,170045,374,2008,Young Adult
1,2,/book/show/2.Harry_Potter_and_the_Order_of_the...,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,https://www.goodreads.com/author/show/1077326....,2604164,9780439000000.0,4.5,2432082,40814,870,2004,Fantasy
2,2657,/book/show/2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,Harper Lee,https://www.goodreads.com/author/show/1825.Har...,2236599,,4.28,4380949,89315,324,2006,Classics
3,1885,/book/show/1885.Pride_and_Prejudice,Pride and Prejudice,Jane Austen,https://www.goodreads.com/author/show/1265.Jan...,1943333,,4.26,2908472,64576,279,2000,Classics
4,41865,/book/show/41865.Twilight,"Twilight (Twilight, #1)",Stephenie Meyer,https://www.goodreads.com/author/show/941441.S...,1452749,9780316000000.0,3.59,4842942,102549,501,2006,Young Adult


## Checking the columns

In [9]:
df.columns

Index(['id', 'book_url', 'name', 'author', 'author_url', 'score', 'isbn',
       'rating', 'rating_count', 'review_count', 'page_count', 'year',
       'genres'],
      dtype='object')

## Checking the info

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            200 non-null    int64  
 1   book_url      200 non-null    object 
 2   name          200 non-null    object 
 3   author        200 non-null    object 
 4   author_url    200 non-null    object 
 5   score         200 non-null    int64  
 6   isbn          150 non-null    float64
 7   rating        200 non-null    float64
 8   rating_count  200 non-null    int64  
 9   review_count  200 non-null    int64  
 10  page_count    200 non-null    int64  
 11  year          200 non-null    int64  
 12  genres        200 non-null    object 
dtypes: float64(2), int64(6), object(5)
memory usage: 20.4+ KB


## Converting the data type

In [11]:
df.rating = pd.to_numeric(df.rating, downcast="float")
df['page_count'] = pd.to_numeric(df.page_count, downcast="float")
df['review_count'] = pd.to_numeric(df.review_count, downcast="float")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            200 non-null    int64  
 1   book_url      200 non-null    object 
 2   name          200 non-null    object 
 3   author        200 non-null    object 
 4   author_url    200 non-null    object 
 5   score         200 non-null    int64  
 6   isbn          150 non-null    float64
 7   rating        200 non-null    float32
 8   rating_count  200 non-null    int64  
 9   review_count  200 non-null    float32
 10  page_count    200 non-null    float32
 11  year          200 non-null    int64  
 12  genres        200 non-null    object 
dtypes: float32(3), float64(1), int64(4), object(5)
memory usage: 18.1+ KB


# EDA
Finding the correlation between various entities.
1. page_count: Total number of pages in the book.
2. rating: Average rating of the book.
3. rating_count: The total numberr of people who have rated the book.
4. review_count: Total number of review for the book.
5. score: It is based on multiple factors, including the number of people who have voted for it and how highly those voters ranked the book.

## Describing the data

In [12]:
df[['score', 'rating', 'rating_count', 'review_count', 'page_count']].describe()

Unnamed: 0,score,rating,rating_count,review_count,page_count
count,200.0,200.0,200.0,200.0,200.0
mean,370580.3,4.112999,1002200.0,28038.259766,434.829987
std,421307.8,0.237567,1002924.0,25833.224609,321.96579
min,71255.0,3.41,8387.0,131.0,26.0
25%,104615.5,3.97,348629.0,10857.5,238.5
50%,203116.5,4.12,654952.0,19916.5,354.0
75%,500287.0,4.28,1301533.0,36699.5,507.75
max,2959964.0,4.62,6816505.0,170045.0,2700.0


## Number of authors

In [13]:
df.author.nunique()

154

## Books published per author

In [14]:
df_author = df.groupby('author')['name'].count().sort_values(ascending = False)
df_author = pd.DataFrame(df_author)
df_author

Unnamed: 0_level_0,name
author,Unnamed: 1_level_1
J.K. Rowling,7
Rick Riordan,5
Dr. Seuss,5
Richelle Mead,4
Charles Dickens,4
...,...
Marion Zimmer Bradley,1
Mark Haddon,1
Markus Zusak,1
Mary Wollstonecraft Shelley,1


In [15]:
df.groupby("author").id.count().iplot(kind = 'bar',
                xTitle='Authors',
                yTitle='Number of books',
                title='Number of books author-wise',
                colors='navy')

## Rating of books

In [16]:
df.iplot(kind = 'bar',
                x='name',
                y='rating',
                #xTitle='Book name',
                yTitle='Rating',
                title='Rating of top books')

## Rating of books after sorting the rating

In [17]:
df_gr_top=df.sort_values('rating',ascending=False)

df_gr_top.iplot(kind = 'bar',
                x='name',
                y='rating',
                #xTitle='Book name',
                yTitle='Rating',
                title='Rating of books')

## Number of books published per year

In [18]:
df.groupby('year')['id'].count().iplot(kind = 'bar',
    xTitle = 'Date',
    yTitle = 'Number of Books',
    title = 'Number of books published per year',
    colors = 'navy')

## Dividing years to bins

In [19]:
bins = [1000,1970, 1980, 1990, 2000, 2010, np.inf]
names = ['<1970', '1970-1980', '1980-1990', '1990-2000', '2000-2010','2010>']

df['YearRange'] = pd.cut(df['year'], bins, labels=names)
df.head()

Unnamed: 0,id,book_url,name,author,author_url,score,isbn,rating,rating_count,review_count,page_count,year,genres,YearRange
0,2767052,/book/show/2767052-the-hunger-games,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,https://www.goodreads.com/author/show/153394.S...,2959964,9780439000000.0,4.33,6227649,170045.0,374.0,2008,Young Adult,2000-2010
1,2,/book/show/2.Harry_Potter_and_the_Order_of_the...,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,https://www.goodreads.com/author/show/1077326....,2604164,9780439000000.0,4.5,2432082,40814.0,870.0,2004,Fantasy,2000-2010
2,2657,/book/show/2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,Harper Lee,https://www.goodreads.com/author/show/1825.Har...,2236599,,4.28,4380949,89315.0,324.0,2006,Classics,2000-2010
3,1885,/book/show/1885.Pride_and_Prejudice,Pride and Prejudice,Jane Austen,https://www.goodreads.com/author/show/1265.Jan...,1943333,,4.26,2908472,64576.0,279.0,2000,Classics,1990-2000
4,41865,/book/show/41865.Twilight,"Twilight (Twilight, #1)",Stephenie Meyer,https://www.goodreads.com/author/show/941441.S...,1452749,9780316000000.0,3.59,4842942,102549.0,501.0,2006,Young Adult,2000-2010


## Number of books published in the given year range

In [20]:
df_year = df.groupby('YearRange')['id'].count().sort_values(ascending = False)
df_year = pd.DataFrame(df_year)
df_year

Unnamed: 0_level_0,id
YearRange,Unnamed: 1_level_1
2000-2010,119
1990-2000,38
2010>,22
1980-1990,14
1970-1980,4
<1970,3


In [21]:
year_portions = df.YearRange.value_counts(normalize=True)*100

labels = year_portions.index.tolist()
values = year_portions.values.tolist()

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])

fig.update_layout(title="Year Proportions",
                 template="ggplot2",
                    font=dict(
                        family="Courier New, monospace",
                              size=18,
                              color="#7f7f7f"))

## Heatmap

In [22]:
df[['page_count',
       'rating', 'rating_count', 'review_count','score']].corr().iplot(kind=
        'heatmap',colorscale='rdpu', 
        title = 'Correlation between different columns')

## Finding books based on genre

In [23]:
df.groupby('genres')['id'].count().sort_values().iplot(
kind = 'bar',
xTitle = 'Genres',
yTitle = 'Number of books',
title = 'Number of books by Genres')

## Finding the percentage of each genre

In [24]:
df.groupby('genres', as_index= False)['id'].count().iplot(
    kind='pie', 
    labels='genres',
    values='id',
    title='Percentage of genres')

## Checking the review count of each book

In [25]:
df.iplot(kind = 'bar',
                x='name',
                y='review_count',
                #xTitle='Book name',
                yTitle='Review Count',
                title='Review count of each book',colors = 'mediumvioletred')

## Books with their rating and rating count

In [26]:
df.groupby('name')[['rating_count', 'rating']].sum().sort_values('rating', ascending = False).iplot(
 
xTitle = 'Book Name',
yTitle = 'Rating Count',
secondary_y = 'rating',
secondary_y_title = 'Rating',
title = 'Books with their rating and rating count')

## Maximum rating and rating count for each genre

In [27]:
df.groupby(['genres']).agg(max_rating=('rating',
np.max),max_rating_count=('rating_count',np.max)).sort_values('max_rating')

Unnamed: 0_level_0,max_rating,max_rating_count
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Nonfiction,4.11,1443225
Romance,4.18,1890385
Cultural,4.22,605518
Poetry,4.23,227070
Mystery,4.26,778107
Horror,4.34,1091201
Science Fiction,4.36,1391859
Sequential Art,4.36,479982
Historical,4.37,1780696
Childrens,4.38,882692


In [28]:
df.groupby(['genres']).agg(max_rating=('rating',
np.max),max_rating_count=('rating_count',np.max)).sort_values('max_rating').iplot(
xTitle = 'Genres',
yTitle = 'Max rating Count',
secondary_y = 'max_rating',
secondary_y_title = 'Max rating',
title = 'Max rating and max rating count')

## Book with rating_count and review_count

In [29]:
df.groupby('name')[['rating', 'review_count']].sum().sort_values('rating', 
                                                    ascending = False).iplot(
xTitle = 'Book Name',
yTitle = 'Review Count',
secondary_y = 'rating',
secondary_y_title = 'Rating',
title = 'Books with their raiting and review count')

## Genres and their page count

In [30]:
df.pivot(
    columns='genres', values='page_count').iplot(
        kind='box',
        layout=dict(
            height=600,
            yaxis=dict(title='page_count'),
            xaxis = dict(title = 'genres'),
            title='Page count by genres',
            margin=dict(b=140)))

## Rating Count vs Review Count by Genres

In [31]:
df.iplot(
    x='rating_count',
    y='review_count',
    categories='genres',
    xTitle='Rating Count',
    yTitle='Review Count',
    title='Rating Count vs Review Count by Genres')


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



## Score of Author

In [32]:
df.groupby(
    'author', as_index=False)['score'].sum().sort_values('score', ascending = False)[:10].iplot(
 kind='pie', labels='author', values='score', 
    title='Score by author')

## Sunburst Charts

In [33]:
import plotly.express as px
#df = px.data.tips()
fig = px.sunburst(df, path=['genres', 'rating', 'page_count'], values='score', color='review_count')
fig.show()

![](thankyou.jpg)

![](questions.jpg)