In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('../input/best-books-of-the-21st-century-dataset/Best_Book_21st.csv')

In [3]:
df.sample(5)

Unnamed: 0,id,title,series,author,book_link,genre,date_published,publisher,num_of_page,lang,review_count,rating_count,rate,award
8093,8093,"Secrets of the Dragon Sanctuary (Fablehaven, #4)",\n (Fablehaven #4)\n,Brandon Mull,https://www.goodreads.com/book/show/5217282-se...,"Fantasy,Young Adult,Fiction,Childrens,Middle G...",March 24th 2009,"Shadow Mountain\n\n ,",535.0,English,3162,79798,4.38,Goodreads Choice Award Nominee for Young Adult...
1804,1804,"Fool's Errand (Tawny Man, #1)",\n (The Tawny Man #1)\n,Robin Hobb,https://www.goodreads.com/book/show/68488.Fool...,"Fantasy,Fiction,Fantasy,Epic Fantasy,Fantasy,H...",October 2002,"Voyager\n\n ,",661.0,English,2321,84373,4.29,
6538,6538,"Thumped (Bumped, #2)",\n (Bumped #2)\n,Megan McCafferty,https://www.goodreads.com/book/show/12924279-t...,"Young Adult,Science Fiction,Dystopia,Fiction,S...",April 24th 2012,"Balzer + Bray\n\n ,",293.0,English,643,5377,3.37,
4214,4214,As I SAW The beginning of rendezvous,,Umar Hasan,https://www.goodreads.com/book/show/36142844-a...,,August 17th 2017,"Prowess Publishing\n\n ,",41.0,English,3,4,4.0,
8713,8713,Captive: My Time as a Prisoner of the Taliban,,Jere Van Dyk,https://www.goodreads.com/book/show/7705934-ca...,"Nonfiction,Autobiography,Memoir,War,War,Terror...",June 22nd 2010,"Times Books\n\n ,",269.0,,69,329,3.37,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018 entries, 0 to 10017
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10018 non-null  int64  
 1   title           10018 non-null  object 
 2   series          4341 non-null   object 
 3   author          10003 non-null  object 
 4   book_link       10018 non-null  object 
 5   genre           9019 non-null   object 
 6   date_published  9970 non-null   object 
 7   publisher       9624 non-null   object 
 8   num_of_page     9737 non-null   float64
 9   lang            9503 non-null   object 
 10  review_count    10003 non-null  object 
 11  rating_count    10003 non-null  object 
 12  rate            10003 non-null  float64
 13  award           4024 non-null   object 
dtypes: float64(2), int64(1), object(11)
memory usage: 1.1+ MB


In [5]:
df.isnull().sum()

id                   0
title                0
series            5677
author              15
book_link            0
genre              999
date_published      48
publisher          394
num_of_page        281
lang               515
review_count        15
rating_count        15
rate                15
award             5994
dtype: int64

# Cleaning & Feature Engineering

In [6]:
def rem_clean(row):
    # remove [\n,()] from text
    try:
        a = re.sub("\n",'', row)
        a = re.sub(",",'', a)
        a = re.sub("[()]",'', a)
        a = a.strip()
        return a
    except:
        return None
    
def award_count(row):
    # Split award with ',' and count the length
    try:
        count = len(row.split(','))
        return count
    except:
        return 0

def get_series_name(row):
    try:
        return (row.split('#')[0]).strip()
    except:
        return None

def get_series_num(row):
    try:
        return (row.split('#')[1]).strip()
    except:
        return None

def get_year(row):
    try:
        return re.findall('[0-9]{4}$', row)[0]
    except:
        return 'Unknown'

def rating(row):
    try:
        return re.sub(',','',row)
    except:
        return 0

In [7]:
df['publisher'] = df['publisher'].apply(rem_clean)
df['series'] = df['series'].apply(rem_clean)
df['clean_title'] = df['title'].apply(lambda x: (re.sub(r'\([^()]*\)', '', x)).strip())
df['series_name'] = df['series'].apply(get_series_name)
df['series_num'] = df['series'].apply(get_series_num)
df['year_published'] = df['date_published'].apply(get_year)
df['total_award'] = df['award'].apply(award_count)
df['review_count'] = df['review_count'].apply(rating).astype('int64')
df['rating_count'] = df['rating_count'].apply(rating).astype('int64')
df['author'] = df['author'].fillna('Unknown')
df['rate'] = df['rate'].fillna(0)

In [8]:
new_df = df[['clean_title','series_name','series_num','author','genre','date_published',
             'year_published','publisher','num_of_page','lang','review_count','rating_count',
             'rate','award','total_award']]

In [9]:
new_df.sample(5)

Unnamed: 0,clean_title,series_name,series_num,author,genre,date_published,year_published,publisher,num_of_page,lang,review_count,rating_count,rate,award,total_award
6611,Sita: An Illustrated Retelling of the Ramayana,The Great Indian Epics Retold,,Devdutt Pattanaik,"Fantasy,Mythology,Cultural,India,Fiction,Asian...",October 2013,2013,Penguin,328.0,English,520,6864,4.07,,0
73,The Nix,,,Nathan Hill,"Fiction,Historical,Historical Fiction,Audioboo...",August 30th 2016,2016,Knopf,640.0,English,7975,67243,4.05,Los Angeles Times Book Prize for Art Seidenbau...,2
5259,Seventh Heaven,The Princess Diaries,7.0,Meg Cabot,"Young Adult,Romance,Womens Fiction,Chick Lit,C...",2007,2007,Macmillan,284.0,English,708,25627,3.69,,0
5973,March of the Hooligans,,,Dougie Brimson,Football,October 16th 2007,2007,Virgin Books,224.0,English,6,82,3.49,,0
1827,First Light,Globiuz,1.0,R.L. Douglas,,October 31st 2018,2018,Createspace Independent Publishing Platform,314.0,English,1,4,4.0,,0


In [10]:
# Drop duplicate book
new_df.drop_duplicates(subset=['clean_title','series_name','series_num','author','publisher'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [11]:
new_df.to_csv('BestBookOf21Century.csv',index=False)

# Visualization

In [12]:
from IPython.core.display import display, HTML
display(HTML("<div class='tableauPlaceholder' id='viz1638350585321' style='position: relative'><noscript><a href='#'><img alt='Best Book Of 21st Century Analysis ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Be&#47;BestBookOf21stCenturyAnalysis&#47;Story1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='BestBookOf21stCenturyAnalysis&#47;Story1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Be&#47;BestBookOf21stCenturyAnalysis&#47;Story1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en-US' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1638350585321');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>"))

In [13]:
print('You can see the visualization here')
display(HTML("<a href='https://public.tableau.com/views/BestBookOf21stCenturyAnalysis/Story1?:language=en-US&publish=yes&:display_count=n&:origin=viz_share_link'>Click Here</a>"))


You can see the visualization here
