## Web Scraping - Best Books of 2020s (Top 100)

#### Import relevant libraries

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

#### Getting response from website

In [3]:
url = 'https://www.goodreads.com/list/show/143500.Best_Books_of_the_Decade_2020_s'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
page = requests.get(url, headers=headers)
page

<Response [200]>

#### Getting the HTML script

In [4]:
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="desktop withSiteHeaderTopFullImage">
 <head>
  <title>
   Best Books of the Decade: 2020's (2320 books)
  </title>
  <meta content="2,320 books based on 3888 votes: The Invisible Life of Addie LaRue by Victoria Schwab, The House in the Cerulean Sea by T.J. Klune, Project Hail Mary by ..." name="description"/>
  <meta content="telephone=no" name="format-detection"/>
  <link href="https://www.goodreads.com/list/show/143500.Best_Books_of_the_Decade_2020_s" rel="canonical"/>
  <script type="text/javascript">
   var ue_t0=window.ue_t0||+new Date();
  </script>
  <script type="text/javascript">
   var ue_mid = "A1PQBFHBHS6YH1";
    var ue_sn = "www.goodreads.com";
    var ue_furl = "fls-na.amazon.com";
    var ue_sid = "774-6633560-7856956";
    var ue_id = "M1CP7BW4V3ZH9NHJAWTC";

    (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a

#### Storing the script in a table.
Since the data is stored in an HTML table, it's best to find it by searching the table

In [5]:
main_table = soup.find('table', class_ = 'tableList js-dataTooltip')
main_table

<table class="tableList js-dataTooltip">
<!-- Add query string params -->
<tr itemscope="" itemtype="http://schema.org/Book">
<td class="number" valign="top">1</td>
<td valign="top" width="5%">
<div class="u-anchorTarget" id="50623864"></div>
<div class="js-tooltipTrigger tooltipTrigger" data-resource-id="50623864" data-resource-type="Book">
<a href="/book/show/50623864-the-invisible-life-of-addie-larue" title="The Invisible Life of Addie LaRue">
<img alt="The Invisible Life of Addie..." class="bookCover" itemprop="image" src="https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1584633432i/50623864._SY75_.jpg"/>
</a> </div>
</td>
<td valign="top" width="100%">
<a class="bookTitle" href="/book/show/50623864-the-invisible-life-of-addie-larue" itemprop="url">
<span aria-level="4" itemprop="name" role="heading">The Invisible Life of Addie LaRue</span>
</a> <br/>
<span class="by">by</span>
<span itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<div class

#### Extracting the book titles from the table

In [6]:
titles = main_table.find_all('a', class_='bookTitle')
book_titles = [title.text.strip() for title in titles]
book_titles

['The Invisible Life of Addie LaRue',
 'The House in the Cerulean Sea (Cerulean Chronicles, #1)',
 'Project Hail Mary',
 'The Midnight Library',
 "I'm Glad My Mom Died",
 'Fourth Wing (The Empyrean, #1)',
 'The Ballad of Songbirds and Snakes (The Hunger Games, #0)',
 'Piranesi',
 'Lessons in Chemistry',
 'House of Earth and Blood (Crescent City, #1)',
 'Tomorrow, and Tomorrow, and Tomorrow',
 'My Dark Vanessa',
 'The Vanishing Half',
 'A \u200bCourt of Silver Flames (A Court of Thorns and Roses, #4)',
 'Beach Read',
 'The Love Hypothesis',
 'Babel',
 'Malibu Rising',
 'The Inheritance Games (The Inheritance Games, #1)',
 'Book Lovers',
 'The Four Winds',
 'Under the Whispering Door',
 'Remarkably Bright Creatures',
 'American Dirt',
 'Crying in H Mart',
 'Mexican Gothic',
 'Heartstopper: Volume Three (Heartstopper, #3)',
 'Carrie Soto Is Back',
 'The Thursday Murder Club (Thursday Murder Club, #1)',
 'Sea of Tranquility',
 'Hamnet',
 'The Guest List',
 'House of Sky and Breath (Crescen

#### Extracting the authors' names

In [7]:
authors = main_table.find_all('a', class_ = 'authorName')
book_authors = [author.text.strip() for author in authors]
book_authors

['Victoria Schwab',
 'T.J. Klune',
 'Andy Weir',
 'Matt Haig',
 'Jennette McCurdy',
 'Rebecca Yarros',
 'Suzanne Collins',
 'Susanna Clarke',
 'Bonnie Garmus',
 'Sarah J. Maas',
 'Gabrielle Zevin',
 'Kate Elizabeth Russell',
 'Brit Bennett',
 'Sarah J. Maas',
 'Emily Henry',
 'Ali Hazelwood',
 'R.F. Kuang',
 'Taylor Jenkins Reid',
 'Jennifer Lynn Barnes',
 'Emily Henry',
 'Kristin Hannah',
 'T.J. Klune',
 'Shelby Van Pelt',
 'Jeanine Cummins',
 'Michelle Zauner',
 'Silvia Moreno-Garcia',
 'Alice Oseman',
 'Taylor Jenkins Reid',
 'Richard Osman',
 'Emily St. John Mandel',
 "Maggie O'Farrell",
 'Lucy Foley',
 'Sarah J. Maas',
 'Naomi Novik',
 'Rebecca   Ross',
 'Kazuo Ishiguro',
 'Isabel Wilkerson',
 'Lancelot Schaubert',
 'Emily Henry',
 'Alice Oseman',
 'Barbara Kingsolver',
 'Jennifer L. Armentrout',
 'Martha Wells',
 'Rebecca Yarros',
 'Karen M. McManus',
 'Alice Oseman',
 'Tracy Deonn',
 'Cassandra Clare',
 'Anthony Doerr',
 'Becky  Chambers',
 'Emily St. John Mandel',
 'Brandon San

#### Extracting the books' ratings

In [8]:
ratings = main_table.find_all('span', class_ = 'minirating')
book_ratings = [rating.text.strip() for rating in ratings]
book_ratings

['4.18 avg rating — 1,176,362 ratings',
 '4.40 avg rating — 711,792 ratings',
 '4.51 avg rating — 652,030 ratings',
 '3.99 avg rating — 1,955,059 ratings',
 '4.46 avg rating — 1,144,921 ratings',
 '4.57 avg rating — 2,038,598 ratings',
 '3.96 avg rating — 848,906 ratings',
 '4.23 avg rating — 320,700 ratings',
 '4.29 avg rating — 1,434,499 ratings',
 '4.47 avg rating — 806,921 ratings',
 '4.15 avg rating — 1,027,167 ratings',
 '4.10 avg rating — 358,308 ratings',
 '4.14 avg rating — 784,507 ratings',
 '4.47 avg rating — 1,419,734 ratings',
 'really liked it 4.00 avg rating — 1,295,486 ratings',
 '4.13 avg rating — 1,506,796 ratings',
 '4.18 avg rating — 279,260 ratings',
 '4.03 avg rating — 1,114,505 ratings',
 '4.15 avg rating — 824,243 ratings',
 '4.13 avg rating — 1,218,968 ratings',
 '4.30 avg rating — 747,721 ratings',
 '4.15 avg rating — 254,206 ratings',
 '4.39 avg rating — 736,195 ratings',
 '4.37 avg rating — 604,715 ratings',
 '4.26 avg rating — 469,819 ratings',
 '3.67 avg r

In [9]:
df = pd.DataFrame(columns=['Title', 'Author', 'Avg. Rating'])
df

Unnamed: 0,Title,Author,Avg. Rating


In [10]:
length = len(book_authors)
length

100

#### Putting the extracted data into a DataFrame

In [11]:
df['Title'] = book_titles
df['Author'] = book_authors
df['Avg. Rating'] = book_ratings
df

Unnamed: 0,Title,Author,Avg. Rating
0,The Invisible Life of Addie LaRue,Victoria Schwab,"4.18 avg rating — 1,176,362 ratings"
1,The House in the Cerulean Sea (Cerulean Chroni...,T.J. Klune,"4.40 avg rating — 711,792 ratings"
2,Project Hail Mary,Andy Weir,"4.51 avg rating — 652,030 ratings"
3,The Midnight Library,Matt Haig,"3.99 avg rating — 1,955,059 ratings"
4,I'm Glad My Mom Died,Jennette McCurdy,"4.46 avg rating — 1,144,921 ratings"
...,...,...,...
95,Oona Out of Order,Margarita Montimore,"3.86 avg rating — 86,798 ratings"
96,The Death of Vivek Oji,Akwaeke Emezi,"4.13 avg rating — 62,571 ratings"
97,The Hollow Places,T. Kingfisher,"3.80 avg rating — 39,280 ratings"
98,The Switch,Beth O'Leary,"3.95 avg rating — 134,320 ratings"


### Data Cleaning

#### Splitting the 'Avg. Rating' column into 'Avg. Rating' and 'No. of Reviews' columns

In [12]:
df[['Avg. Rating', 'No. of Reviews']] = df['Avg. Rating'].str.split('—', expand=True)
df

Unnamed: 0,Title,Author,Avg. Rating,No. of Reviews
0,The Invisible Life of Addie LaRue,Victoria Schwab,4.18 avg rating,"1,176,362 ratings"
1,The House in the Cerulean Sea (Cerulean Chroni...,T.J. Klune,4.40 avg rating,"711,792 ratings"
2,Project Hail Mary,Andy Weir,4.51 avg rating,"652,030 ratings"
3,The Midnight Library,Matt Haig,3.99 avg rating,"1,955,059 ratings"
4,I'm Glad My Mom Died,Jennette McCurdy,4.46 avg rating,"1,144,921 ratings"
...,...,...,...,...
95,Oona Out of Order,Margarita Montimore,3.86 avg rating,"86,798 ratings"
96,The Death of Vivek Oji,Akwaeke Emezi,4.13 avg rating,"62,571 ratings"
97,The Hollow Places,T. Kingfisher,3.80 avg rating,"39,280 ratings"
98,The Switch,Beth O'Leary,3.95 avg rating,"134,320 ratings"


#### Cleaning and Correcting the 'Avg. Rating' column

In [13]:
df['Avg. Rating'] = df['Avg. Rating'].str.replace("[^0-9]", "", regex=True)
df['Avg. Rating'] = df['Avg. Rating'].apply(lambda x: float(x))
df

Unnamed: 0,Title,Author,Avg. Rating,No. of Reviews
0,The Invisible Life of Addie LaRue,Victoria Schwab,418.0,"1,176,362 ratings"
1,The House in the Cerulean Sea (Cerulean Chroni...,T.J. Klune,440.0,"711,792 ratings"
2,Project Hail Mary,Andy Weir,451.0,"652,030 ratings"
3,The Midnight Library,Matt Haig,399.0,"1,955,059 ratings"
4,I'm Glad My Mom Died,Jennette McCurdy,446.0,"1,144,921 ratings"
...,...,...,...,...
95,Oona Out of Order,Margarita Montimore,386.0,"86,798 ratings"
96,The Death of Vivek Oji,Akwaeke Emezi,413.0,"62,571 ratings"
97,The Hollow Places,T. Kingfisher,380.0,"39,280 ratings"
98,The Switch,Beth O'Leary,395.0,"134,320 ratings"


In [14]:
df['Avg. Rating'] = df['Avg. Rating']/100
df

Unnamed: 0,Title,Author,Avg. Rating,No. of Reviews
0,The Invisible Life of Addie LaRue,Victoria Schwab,4.18,"1,176,362 ratings"
1,The House in the Cerulean Sea (Cerulean Chroni...,T.J. Klune,4.40,"711,792 ratings"
2,Project Hail Mary,Andy Weir,4.51,"652,030 ratings"
3,The Midnight Library,Matt Haig,3.99,"1,955,059 ratings"
4,I'm Glad My Mom Died,Jennette McCurdy,4.46,"1,144,921 ratings"
...,...,...,...,...
95,Oona Out of Order,Margarita Montimore,3.86,"86,798 ratings"
96,The Death of Vivek Oji,Akwaeke Emezi,4.13,"62,571 ratings"
97,The Hollow Places,T. Kingfisher,3.80,"39,280 ratings"
98,The Switch,Beth O'Leary,3.95,"134,320 ratings"


#### Cleaning the 'No. of Reviews' column

In [15]:
df['No. of Reviews'] = df['No. of Reviews'].str.replace("[^0-9]", "", regex=True)
df['No. of Reviews'] = df['No. of Reviews'].apply(lambda x: int(x))
df

Unnamed: 0,Title,Author,Avg. Rating,No. of Reviews
0,The Invisible Life of Addie LaRue,Victoria Schwab,4.18,1176362
1,The House in the Cerulean Sea (Cerulean Chroni...,T.J. Klune,4.40,711792
2,Project Hail Mary,Andy Weir,4.51,652030
3,The Midnight Library,Matt Haig,3.99,1955059
4,I'm Glad My Mom Died,Jennette McCurdy,4.46,1144921
...,...,...,...,...
95,Oona Out of Order,Margarita Montimore,3.86,86798
96,The Death of Vivek Oji,Akwaeke Emezi,4.13,62571
97,The Hollow Places,T. Kingfisher,3.80,39280
98,The Switch,Beth O'Leary,3.95,134320


### New Stats

In [16]:
df.describe()

Unnamed: 0,Avg. Rating,No. of Reviews
count,100.0,100.0
mean,4.1796,451881.4
std,0.225455,445727.2
min,3.67,657.0
25%,4.01,143261.2
50%,4.195,250305.5
75%,4.35,634314.2
max,4.62,2038598.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           100 non-null    object 
 1   Author          100 non-null    object 
 2   Avg. Rating     100 non-null    float64
 3   No. of Reviews  100 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.3+ KB


In [19]:
df.to_csv(r'C:\Users\rajan\Downloads\Datasets\Best Books of 2020s.csv', index=False)