In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install pyforest

In [None]:
pip install sci_analysis

# **Importing Libraries and dataset**

In [None]:
from pyforest import *
from datetime import datetime
from scipy.stats import norm
from scipy import stats
from scipy.stats import skew
import matplotlib.lines as mlines
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
from sci_analysis import analyze
import pandas_profiling as pp

# Notebook-wide display settings.
# NOTE(review): `sns` and `pd` are not imported explicitly in this cell;
# presumably they are supplied by the `from pyforest import *` star-import — confirm.
# seaborn "darkgrid" theme with the axes background set to ".9"
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
# Render floats with thousands separators and four decimal places
pd.options.display.float_format = '{:,.4f}'.format
# Allow up to 500 columns and 500 rows before pandas truncates the display
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

In [None]:
class color:
    """Namespace of ANSI escape sequences for styling printed text.

    Prefix a string with one of the codes and append ``END`` to reset the
    styling afterwards.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset code
    END = '\033[0m'

In [None]:
# Load the Goodreads books CSV; error_bad_lines=False makes pandas skip rows
# that have too many fields instead of raising.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# on newer pandas the equivalent is on_bad_lines='skip' — confirm the target version.
books = pd.read_csv('../input/goodreadsbooks/books.csv',error_bad_lines=False)
# Preview the first rows
books.head()

The below lines were skipped from the data while importing due to the following reason:

* Skipping line 3350: expected 12 fields, saw 13
* Skipping line 4704: expected 12 fields, saw 13
* Skipping line 5879: expected 12 fields, saw 13
* Skipping line 8981: expected 12 fields, saw 13

# **Stucture of the dataset**

In [None]:
# Report the dimensions of the loaded frame
n_rows, n_cols = books.shape
print("no. of rows: ", n_rows, "\n""no. of columns: ", n_cols)

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage
books.info()

**Observations**
* The data seems to have no missing values. There are total 11123 rows with total 12 columns with the range index from 0 to 11122.
* There are 6 numerical columns with 5 int64 values and 1 float64 and 6 categorical (object) values. We can see that the column publication_date has the dtype as object. We'll need to change that to datetime.
* In the column names, we can see that num_pages is indented.
* Overall the data looks in order. Memory usage: 1.0 MB

In [None]:
# Strip stray whitespace from every column label (the raw header carries
# '  num_pages' with leading spaces). Matching one exact padded label silently
# does nothing if the padding differs; stripping all labels is safe to re-run.
books.rename(
    columns=lambda label: label.strip() if isinstance(label, str) else label,
    inplace=True,
)

In [None]:
# Display the current column labels
books.columns

In [None]:
# Split each publication date on '/' and keep the first and third components
# (presumably the format is month/day/year — the middle part is discarded).
date = books.publication_date.tolist()
date_parts = []
for raw_date in date:
    pieces = raw_date.split('/')
    del pieces[1]  # drop the middle component
    date_parts.append(pieces)

Month = [int(pieces[0]) for pieces in date_parts]          # month number as int
Year = [pieces[1] for pieces in date_parts]                # year, kept as a string
Month_Year = [pieces[0] + '/' + pieces[1] for pieces in date_parts]

In [None]:
#change to month
# NOTE: this import rebinds `datetime` to the module, replacing the class that
# the earlier `from datetime import datetime` bound to the same name.
import datetime
# The year (1900) and day (1) are placeholders: only the month name is kept
Month_name = [datetime.date(1900, month_number, 1).strftime('%B') for month_number in Month]

In [None]:
# Attach the derived date parts as new columns, aligned on the frame's index
derived_columns = (
    ('Month', Month_name),
    ('Year', Year),
    ('Month_Year', Month_Year),
)
for column_name, column_values in derived_columns:
    books[column_name] = pd.Series(column_values, index=books.index)

In [None]:
# Snapshot of the frame at this point; later changes to `books` do not affect `book2`
book2 = books.copy()

In [None]:
#Let's create a categorical column for average_rating

def create_cat(i):
    """Map a rating to the label of the one-point bucket it falls in.

    Buckets are checked in ascending order and the first match wins, so a
    value sitting exactly on a boundary goes to the lower bucket
    (e.g. 1 -> '0-1', 2 -> '1-2').

    Parameters
    ----------
    i : number
        Rating value.

    Returns
    -------
    str or None
        One of '0-1', '1-2', '2-3', '3-4', '4-5'; None when the value is
        below 0, above 5, or NaN.
    """
    # Negative values and NaN never satisfy any bucket
    if not i >= 0:
        return None
    buckets = ((1, '0-1'), (2, '1-2'), (3, '2-3'), (4, '3-4'), (5, '4-5'))
    for upper_bound, label in buckets:
        if i <= upper_bound:
            return label
    return None

In [None]:
# Bucket every average_rating into its one-point range label via create_cat
books['avg_ratings'] = books['average_rating'].apply(create_cat)
# Preview the first three rows with the new column
books.head(3)

In [None]:
#Dividing numeric and categorical columns

# float64/int64 columns are treated as numerical, object columns as categorical
numerical = list(books.select_dtypes(include=['float64', 'int64']).columns)
categorical = list(books.select_dtypes(include=['object']).columns)
print('Numerical Columns\n\n', numerical)
print()
print('Categorical Columns\n\n', categorical)


# Descriptive statistics

In [None]:
# Summary table for the numeric columns: describe() output plus variance,
# inter-quartile range and range, with readable column labels.
descr_stats = (
    books[numerical]
    .describe()
    .T
    .assign(
        Variance=books[numerical].var(),
        IQR=lambda summary: summary['75%'] - summary['25%'],
        Range=lambda summary: summary['max'] - summary['min'],
    )
    .rename(columns={
        'count': 'Count',
        'mean': 'Mean',
        'std': 'Standard Deviation',
        '25%': 'Q1 (25%)',
        '50%': 'Q2 (50%)',
        '75%': 'Q3 (75%)',
    })
)
# Show statistics as rows and data columns as columns
descr_stats.T

In [None]:
# Report the skewness of every numeric column together with its direction.
# Skewness only measures asymmetry: a value of exactly 0 does not establish
# normality, and a NaN result (e.g. from missing values) must not be reported
# as "normally distributed" either.
for column in books.describe().columns:
    column_skew = skew(books[column])
    if column_skew > 0:
        verdict = 'right-skewed (longer right tail); not normally distributed.'
    elif column_skew < 0:
        verdict = 'left-skewed (longer left tail); not normally distributed.'
    elif column_skew == 0:
        verdict = 'symmetric (zero skew); skewness alone cannot confirm normality.'
    else:
        # Only reached when the skewness is NaN
        verdict = 'skewness is undefined (NaN).'
    print('\n', column, ':\n\nSkewness of the data = {}\n\t{}'.format(column_skew, verdict))

**Observations**

The numeric data is not normally distributed, and the columns ratings_count (skewness 17.70) and text_reviews_count (skewness 16.17) are highly skewed.

num_pages, ratings_count and text_reviews_count have a higher standard deviation and variance; which indicates that the data points are very spread out from the mean, and from one another.

The categorical data shows the most frequent title and author, along with the frequencies for the other columns. The most frequent title is The Brothers Karamazov, which appears 9 times, and the most frequent author is Stephen King, who appears 40 times.

Bad data check

In [None]:
# Are there any duplicates?
# Boolean mask marking rows that repeat an earlier row
dups = books.duplicated()
duplicate_count = dups.sum()
print('Number of duplicate rows = %d' % duplicate_count)
# Show the duplicated rows themselves (empty when there are none)
books.loc[dups]

In [None]:
#Any missing values?
# True when at least one cell anywhere in the frame is null
books.isnull().values.any()

In [None]:
#Let's check the unique values
# Print the distinct values of every column, one column at a time
for column in books.columns:
    distinct_values = books[column].unique()
    print('\n', column, '\n\n', distinct_values)


**Observations**

The data does not have any missing values or duplicate values. Also there aren't any visible anomalies in the values of the dataset.

**Authors = NOT A BOOK**

In [None]:
# Count the rows whose author field is the placeholder 'NOT A BOOK'
not_a_book_rows = books[books['authors'] == 'NOT A BOOK']
print('\nBooks with authors name as `NOT A BOOK`:', len(not_a_book_rows))

In [None]:
# Show the first three columns of the rows whose author is 'NOT A BOOK'
books[books['authors']=='NOT A BOOK'].iloc[:,:3]

In [None]:
# Keep only the rows whose author is something other than 'NOT A BOOK'
is_real_book = books['authors'].ne('NOT A BOOK')
books = books.loc[is_real_book]

In [None]:
# Count the rows that report zero pages
zero_page_rows = books[books['num_pages'] == 0]
print('\nBooks with total number of pages = 0:', len(zero_page_rows))

In [None]:
# Keep only the rows with a non-zero page count
has_pages = books['num_pages'].ne(0)
books = books.loc[has_pages]

In [None]:
# Compare the current size against the 11123 rows the dataset had when loaded
original_row_count = 11123
rows_left, cols_left = books.shape
print("no. of rows after removing bad data: ", rows_left, "\n""no. of columns after removing bad data: ", cols_left)

rows_dropped = original_row_count - rows_left
share_dropped = round((rows_dropped / original_row_count) * 100, 4)
print('total no of rows dropped:', rows_dropped, 'i.e.',
      share_dropped, '% of the original dataset')

**Observations**

We have filtered out the rows whose author is `NOT A BOOK` and the rows that have 0 num_pages, leaving us with a dataset of 11044 entries. That means we have dropped 79 rows from the original dataset, i.e. about 0.7102% of the data.

**Book with maximum rating count**

In [None]:
# Find the largest ratings_count and display the row(s) that reach it
highest_ratings_count = books['ratings_count'].max()
print('\nBook with maximum rating count=', highest_ratings_count)
books[books['ratings_count'] == highest_ratings_count]


**Observations**

Twilight by Stephenie Meyer, published on 9/6/2006 by Little Brown and Company, has the highest ratings_count: 4597666.

The average_rating for this book is 3.5900 and its text_reviews_count is 94265.

**Total number of books with highest and lowest average rating**

In [None]:
# Print the largest and smallest average_rating present in the data
print('highest average rating = ',books['average_rating'].max())
print('lowest average rating =',books['average_rating'].min())

In [None]:
# Sanity check: rows with an average_rating above 5
books[books['average_rating'] > 5]

In [None]:
# Sanity check: rows with a negative average_rating
books[books['average_rating'] < 0 ]

In [None]:
# Rows whose average_rating equals the maximum found in the data
top_rating = books['average_rating'].max()
avg_rat = books.loc[books['average_rating'] == top_rating]
print('Total number of books with highest average rating = ', len(avg_rat))
# First ten matches, first three columns only
avg_rat.iloc[:10, :3]

In [None]:
# Rows whose average_rating equals the minimum found in the data
bottom_rating = books['average_rating'].min()
avg_rat = books.loc[books['average_rating'] == bottom_rating]
print('Total number of books with lowest average rating = ', len(avg_rat))
# First ten matches, first three columns only
avg_rat.iloc[:10, :3]

**Observation**

The highest average rating a book can get is 5.0. There are 22 books that have maximum average rating.

The lowest average rating a book can get is 0. There are 25 books that have lowest average rating.

In [None]:
# Build a pandas_profiling report for the cleaned frame
profile = pp.ProfileReport(books)
# Write the report to output.html in the current working directory
profile.to_file("output.html")

# Univariate
**Numerical**

In [None]:
# Identifying outliers with IQR
# Quartiles are only defined for numeric columns. Passing numeric_only=True
# makes that explicit, so the cell does not depend on the pandas version's
# default handling of object columns.
Q1 = books.quantile(0.25, numeric_only=True)
Q3 = books.quantile(0.75, numeric_only=True)
# Inter-quartile range per numeric column
IQR = Q3 - Q1
print(IQR)

In [None]:
# Flag the columns that hold at least one value outside the fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Only the columns the quartiles were computed
# for are compared, so both operands carry the same labels and text columns
# are never compared against numbers.
quartile_columns = books[Q1.index]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
iqr = ((quartile_columns < lower_fence) | (quartile_columns > upper_fence)).any()
iqr = iqr.to_frame().reset_index().rename(columns={'index':'Columns',0:'Outliers'})
# Keep only the columns where an outlier was found
outliers = iqr[iqr['Outliers']]
outliers

In [None]:
# Keep the rows whose ratings_count lies strictly below the 95th percentile.
# NOTE: re-running this cell trims the already-trimmed frame again.
ratings_count_cutoff = books.ratings_count.quantile(.95)
books = books.loc[books['ratings_count'] < ratings_count_cutoff]

In [None]:
# Compare the current size against the 11123 rows the dataset had when loaded
original_row_count = 11123
rows_left, cols_left = books.shape
print("no. of rows after outliers treatment: ", rows_left, "\n""no. of columns after outliers treatment: ", cols_left)

rows_dropped = original_row_count - rows_left
# The share is taken of the original row count, so numerator and denominator
# use the same base and match the "% of the original dataset" label.
share_dropped = round((rows_dropped / original_row_count) * 100, 4)
print('total no of rows dropped:', rows_dropped, 'i.e.', share_dropped, '% of the original dataset')


# **Observations**

The columns average_rating, isbn13, num_pages, ratings_count and text_reviews_count have outliers.

We have kept only the rows below the 95th percentile of ratings_count, leaving us with a dataset of 10491 entries. That means we have dropped 632 rows in total from the original 11123, i.e. about 5.68% of the data.

In [None]:
# Run sci_analysis on each numeric column of interest, printing its name first
columns_to_analyze = ('average_rating', 'num_pages', 'ratings_count', 'text_reviews_count')
for column_name in columns_to_analyze:
    print('\n\n' + column_name)
    analyze(books[column_name])


**Observations**

We can conclude from the Shapiro-Wilk test for normality that the data is not normally distributed and is highly skewed. Also the columns have a higher standard deviation and variance; which indicates that the data points are very spread out from the mean, and from one another.

# Univariate
**Categorical**

In [None]:
# Ten most frequent language codes with their book counts.
# rename_axis + reset_index(name=...) produces the columns
# ['language_code', 'Count'] regardless of how the pandas version names the
# value_counts() result, whereas renaming an 'index' column depends on it.
most_readlang = (
    books.language_code.value_counts()
    .rename_axis('language_code')
    .reset_index(name='Count')
    .iloc[:10, :]
)
most_readlang.sort_values(by='Count',ascending=False)

In [None]:
# Bar chart of book counts for the ten most frequent language codes.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, and the keyword form also works on older releases.
plt.figure(figsize =[15,5])
sns.barplot(x=most_readlang.language_code, y=most_readlang.Count, palette='BrBG')
plt.title('Most read language',fontsize=14);


**Observations**

English is by far the most common language the readers prefer to read in.

Spanish is the 3rd most popular language, after eng-US.

In [None]:
# Ten most frequent authors with their book counts.
# rename_axis + reset_index(name=...) produces the columns
# ['authors', 'Count'] regardless of how the pandas version names the
# value_counts() result, whereas renaming an 'index' column depends on it.
most_popauth = (
    books.authors.value_counts()
    .rename_axis('authors')
    .reset_index(name='Count')
    .iloc[:10, :]
)
most_popauth.sort_values(by='Count',ascending=False)

In [None]:
# Bar chart of book counts for the ten most frequent authors.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, and the keyword form also works on older releases.
plt.figure(figsize =[15,5])
sns.barplot(x=most_popauth.authors, y=most_popauth.Count, palette='BrBG')
plt.title('Most published author',fontsize=14)
# Tilt the author names so long labels do not overlap
plt.xticks(rotation=25);

# Bivariate

**Categorical**

In [None]:
# Author x publication-month counts. The 'All' margins are used to order the
# rows and columns by their totals and are dropped afterwards; only the ten
# authors with the largest totals are kept.
month_aurt = (
    pd.crosstab(books.authors, books.Month, margins=True)
    .sort_values(by='All', ascending=False, axis=0)
    .sort_values(by='All', ascending=False, axis=1)
    .drop('All', axis=0)
    .drop('All', axis=1)
    .iloc[:10, :]
)
month_aurt