# Web scrapping for goodreads.com.
* We have scrapped goodreads.com website for gettimg two hundred best book details,
  as voted on by the general Goodreads community.
* Total 2 pages were scrapped since each page contains 100 books information.
* We have used BeautifulSoup library for our scrapping.

In [1]:
'''
!conda install -c anaconda beautifulsoup4

To install BeautifulSoup library
'''

'\nconda install -c conda-forge BeautifulSoup\nTo install BeautifulSoup library\n'

In [5]:
# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.figure_factory as ff

# Cufflinks wrapper on plotly
import cufflinks as cf

%matplotlib inline

# Options for pandas
#pd.options.display.max_columns = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
cf.go_offline(connected=True)

# Set global theme
cf.set_config_file(world_readable=True, theme='pearl')

## Importing Beautiful soup and creating the soup. Etracting data. 

In [154]:
from bs4 import BeautifulSoup
import requests

base_url="https://www.goodreads.com"
books=[]
for i in range(1,3):
    response = requests.get(base_url+'/list/show/1.Best_Books_Ever?page='+str(i), headers=headers)
    gr_soup = BeautifulSoup(response.text, "html.parser")
    tables = gr_soup.find_all('table', attrs={'class':'tableList'})
    table=tables[0]
    books = books+table.find_all('tr',attrs={'itemtype':'http://schema.org/Book'})

## Checking the total number of books.

In [167]:
print(len(books))

200


## Extracting the data we require for our analysis.

In [168]:
import re
from time import sleep

book_list=[]
for book in books:
    item={}
    book_div=book.find('div',attrs={'data-resource-type':'Book'})
    item['id']=book_div['data-resource-id']
    item['name']=book.find('span',attrs={'role':'heading'}).text
    item['book_url']=book_div.a['href']
    item['author']=book.find('a',attrs={'class':'authorName'}).text
    item['author_url']=book.find('a',attrs={'class':'authorName'})['href']
    item['score'] =re.sub('[^0-9]','', book.find('a', attrs = {'href':'#'}).text)
    headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Connection' : 'keep-alive'}
    book_response = requests.get(base_url+item['book_url'], headers=headers)
    book_soup = BeautifulSoup(book_response.text, "html.parser")
    item['rating']=re.findall(r'\d+\.\d+', book_soup.find('span',attrs={'itemprop':'ratingValue'}).text)[0]
    item['isbn']=str(book_soup.select("meta[property='books:isbn']")[0]['content'])
    item['page_count']=book_soup.select("meta[property='books:page_count']")[0]['content']
    item['rating_count']=book_soup.select("meta[itemprop='ratingCount']")[0]['content']
    item['review_count']=book_soup.select("meta[itemprop='reviewCount']")[0]['content']
    item['year']=re.findall(r'\d{4}',book_soup.select("div#details div.row")[1].text.strip())[0]
    item['genres'] = book_soup.find('a', attrs = {'class':"actionLinkLite bookPageGenreLink"}).text
    book_list.append(item)
    
    sleep(1) # We delay our request by 1 sec so that we do not flood the website with huge number of requests

## Creating our dataframe.

In [170]:
import pandas as pd
df_book=pd.DataFrame(book_list)
df_book.head()

Unnamed: 0,author,author_url,book_url,genres,id,isbn,name,page_count,rating,rating_count,review_count,score,year
0,Suzanne Collins,https://www.goodreads.com/author/show/153394.S...,/book/show/2767052-the-hunger-games,Young Adult,2767052,9780439023481.0,"The Hunger Games (The Hunger Games, #1)",374,4.33,6223460,169961,2959568,2008
1,J.K. Rowling,https://www.goodreads.com/author/show/1077326....,/book/show/2.Harry_Potter_and_the_Order_of_the...,Fantasy,2,9780439358071.0,Harry Potter and the Order of the Phoenix (Har...,870,4.5,2429708,40758,2603171,2004
2,Harper Lee,https://www.goodreads.com/author/show/1825.Har...,/book/show/2657.To_Kill_a_Mockingbird,Classics,2657,,To Kill a Mockingbird,324,4.28,4377360,89244,2235010,2006
3,Jane Austen,https://www.goodreads.com/author/show/1265.Jan...,/book/show/1885.Pride_and_Prejudice,Classics,1885,,Pride and Prejudice,279,4.26,2905605,64490,1942638,2000
4,Stephenie Meyer,https://www.goodreads.com/author/show/941441.S...,/book/show/41865.Twilight,Young Adult,41865,9780316015844.0,"Twilight (Twilight, #1)",501,3.59,4839503,102492,1452846,2006


## Creating .csv file from our dataframe.

In [171]:
df_book.to_csv('goodreads.csv',index=False)

In [1]:
import pandas as pd
df_goodreads=pd.read_csv("goodreads.csv")

In [2]:
df_goodreads.head()

Unnamed: 0,id,book_url,name,authour,author_url,score,isbn,rating,rating_count,review_count,page_count,year,genres
0,2767052,/book/show/2767052-the-hunger-games,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,https://www.goodreads.com/author/show/153394.S...,2959668,9780439000000.0,4.33,6223460,169961,374,2008,Young Adult
1,2,/book/show/2.Harry_Potter_and_the_Order_of_the...,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,https://www.goodreads.com/author/show/1077326....,2603171,9780439000000.0,4.5,2429708,40758,870,2004,Fantasy
2,2657,/book/show/2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,Harper Lee,https://www.goodreads.com/author/show/1825.Har...,2235010,,4.28,4377280,89241,324,2006,Classics
3,1885,/book/show/1885.Pride_and_Prejudice,Pride and Prejudice,Jane Austen,https://www.goodreads.com/author/show/1265.Jan...,1942736,,4.26,2905431,64486,279,2000,Classics
4,41865,/book/show/41865.Twilight,"Twilight (Twilight, #1)",Stephenie Meyer,https://www.goodreads.com/author/show/941441.S...,1452846,9780316000000.0,3.59,4839503,102492,501,2006,Young Adult


## Checking the columns.

In [3]:
df_goodreads.columns

Index(['id', 'book_url', 'name', 'authour', 'author_url', 'score', 'isbn',
       'rating', 'rating_count', 'review_count', 'page_count', 'year',
       'genres'],
      dtype='object')

## Finding the correlation between various entities.
* page_count: Total number of pages in the book.
* rating: Average rating of the book.
* rating_count: The total numberr of people who have rated the book.
* review_count: Total number of review for the book.
* score: It is based on multiple factors, including the number of people who have voted for it and 
  how highly those voters ranked the book.

In [6]:
corr_mat = df_goodreads[['page_count',
       'rating', 'rating_count', 'review_count','score']].corr()
figure = ff.create_annotated_heatmap(
    z=corr_mat.values,
    x=list(corr_mat.columns),
    y=list(corr_mat.index),
    annotation_text=corr_mat.round(2).values,
    showscale=True)
figure

## Year wise book count.

In [7]:
df_goodreads.groupby("year").id.count().iplot(kind = 'bar',
                xTitle='Year',
                yTitle='Number of books',
                title='Number of books year-wise',
                colors='navy')

## Finding books based on genre.

In [8]:
genre_wise=df_goodreads.groupby("genres").id.count()
genre_wise

genres
Childrens          11
Classics           76
Cultural            1
Fantasy            32
Fiction            37
Historical          6
Horror              4
Mystery             1
Nonfiction          4
Plays               1
Poetry              1
Romance             3
Science Fiction     4
Sequential Art      1
Young Adult        18
Name: id, dtype: int64

In [9]:
genre_wise.iplot(kind = 'bar',
                xTitle='Genres',
                yTitle='Number of books',
                title='Number of books genre-wise',
                colors='navy')

## Finding the percentage of each genre.

In [190]:
genre_wise=genre_wise.reset_index()
genre_wise

Unnamed: 0,genres,id
0,Childrens,11
1,Classics,76
2,Cultural,1
3,Fantasy,32
4,Fiction,37
5,Historical,6
6,Horror,4
7,Mystery,1
8,Nonfiction,4
9,Plays,1


In [191]:
genre_wise.iplot(
    kind='pie', 
    labels='genres',
    values='id',
    title='Percentage of genres')

## Number of books per author.

In [200]:
df_goodreads.groupby("author").id.count().iplot(kind = 'bar',
                xTitle='Authors',
                yTitle='Number of books',
                title='Number of books author-wise',
                colors='navy')

## Showing the average ratings of each book.

In [221]:
df_goodreads.iplot(kind = 'bar',
                x='name',
                y='rating',
                #xTitle='Book name',
                yTitle='Rating',
                title='Rating of top ten books')

## Average rating of books after sorting the rating.

In [226]:
df_gr_top=df_goodreads.sort_values('rating',ascending=False)

df_gr_top.iplot(kind = 'bar',
                x='name',
                y='rating',
                #xTitle='Book name',
                yTitle='Rating',
                title='Rating of top ten books')

## Checking the review count of each book.

In [224]:
df_goodreads.iplot(kind = 'bar',
                x='name',
                y='review_count',
                #xTitle='Book name',
                yTitle='Review Count',
                title='Review count of each book',colors = 'mediumvioletred')