In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Basic EDA: Amazon Top 50 Bestselling Books [Adi's Book]**

![https://i.guim.co.uk/img/media/c72db316049112ebb6c4997889a15c5b7706a861/0_182_5472_3283/master/5472.jpg?width=620&quality=85&auto=format&fit=max&s=49220534c638eafbc34e5aa07de0a240](http://)

# Table of contents:
    1. Data Insights
    2. Questions to address            
    3. Importing relevant libraries and data
    4. Data cleaning/manipulation
    5. Exploratory Data Analytics
    6. Correlation Testing
    7. References

# ***Dataset Insights***

**Source-** https://www.kaggle.com/sootersaalu/amazon-top-50-bestselling-books-2009-2019

**Description-** Dataset on Amazon's Top 50 bestselling books from 2009 to 2019. Contains 550 books, data has been categorized into fiction and non-fiction using Goodreads

**Columns- **
1. Name
2. Author
3. User Rating
4. Reviews
5. Price
6. Year
7. Genre


# ***Questions to address***

1.) What are the Top 20 Bestselling Authors having highest bestselling books? (genre wise)

2.) Which book has the most reviews? (top 10 books, genre wise)

3.) Which book has the most reviews? (overall)

4.) Which Genre sold the most between 2009-2019?

5.) Which Year saw the most reviews?

(Correlation Testing)

6.) Are user rating and reviews related?

7.) Are user rating and price related?

# ***Importing relevant libraries and data***

In [2]:
import pandas as pd #reading files, dealing with tabular representation of data
import numpy as np #dealing with numpy array functions
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
import missingno as msno # visualisation of missing values in the form of visual matrix
%matplotlib inline 
sns.set(color_codes=True)

# Manage warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv')
# To display the top 5 rows
df.head(5)

FileNotFoundError: [Errno 2] No such file or directory: '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'

In [None]:
# checking the datatypes
df.dtypes

In [None]:
print('-----------------------------------------------------------------------------------------------------------')
print(f'This dataset contains Bestselling Books from {df.Year.min()} to {df.Year.max()}.')
print('-----------------------------------------------------------------------------------------------------------\n')
df.info()

# ***Data cleaning/manipulation***

In [None]:
print(df.isnull().sum()) #summing up all the rows having missing values corresponding to their columns
msno.matrix(df, figsize=(12,6))
plt.show

So no data cleaning is required

In [None]:
df.describe(include='O')

In [None]:
# But a same book could be sold at a different price in the coming years. So we must keep the final values.
print(f'Before: {df.shape}')
df=df.drop_duplicates(subset='Name',keep='last')
print(f'After: {df.shape}')

# ***Exploratory Data Analytics***

Checking the distribution of various parameters

In [None]:
sns.distplot(df['User Rating'])

In [None]:
sns.distplot(df['Reviews'])

In [None]:
sns.distplot(df['Price'])

# 1. What are the Top 10 Bestselling Authors having highest bestselling books? (genre wise)

In [None]:
# Splitting the data into Fiction and Non-Fiction
Fiction = df[df['Genre'] == 'Fiction']
Non_Fiction = df[df['Genre'] == 'Non Fiction']

In [None]:
Top_Fiction = Fiction.groupby('Author')[['Name']].count().sort_values(by='Name', ascending=False).reset_index()
Top_10_Fiction_Auth = Top_Fiction.head(10)
Top_10_Fiction_Auth

In [None]:
Top_Non_Fiction = Non_Fiction.groupby('Author')[['Name']].count().sort_values(by='Name', ascending=False).reset_index()
Top_10_Non_Fiction_Auth = Top_Non_Fiction.head(10)
Top_10_Non_Fiction_Auth

In [None]:
plt.figure(figsize=(28,7))
plt.subplot(1,2,1)
plt.barh(Top_10_Fiction_Auth['Author'], Top_10_Fiction_Auth['Name'])
plt.title('Top 10 books by # of best-sellers (Fiction)',fontweight='bold',fontsize=18)
plt.xlabel('No. of Bestsellers',fontweight='bold')
plt.ylabel('Authors',fontweight='bold')
plt.subplot(1,2,2)
plt.barh(Top_10_Non_Fiction_Auth['Author'], Top_10_Non_Fiction_Auth['Name'])
plt.ylabel('Authors',fontweight='bold')
plt.xlabel('No. of Bestsellers',fontweight='bold')
plt.title('Top 10 books by # of best-sellers (Non-Fiction)',fontweight='bold',fontsize=18)
plt.show

# 2.) Which book has the most reviews? (top 10 books, genre wise)

In [None]:
Top_Review_Book_Fiction = Fiction.groupby('Name')[['Reviews']].agg(np.mean).sort_values(by='Reviews',ascending=False).reset_index()
Top_10_Review_Book_Fiction = Top_Review_Book_Fiction.head(10)
Top_10_Review_Book_Fiction

In [None]:
Top_Review_Book_Non_Fiction = Non_Fiction.groupby('Name')[['Reviews']].agg(np.mean).sort_values(by='Reviews',ascending=False).reset_index()
Top_10_Review_Book_Non_Fiction = Top_Review_Book_Non_Fiction.head(10)
Top_10_Review_Book_Non_Fiction

In [None]:
plt.figure(figsize=(15,20))
plt.subplot(2,1,1)
plt.barh(Top_10_Review_Book_Fiction['Name'], Top_10_Review_Book_Fiction['Reviews'])
plt.title('Top 10 books by # of best-sellers (Fiction)',fontweight='bold',fontsize=18)
plt.xlabel('No. of Reviews',fontweight='bold')
plt.ylabel('Book Name',fontweight='bold')
plt.subplot(2,1,2)
plt.barh(Top_10_Review_Book_Non_Fiction['Name'], Top_10_Review_Book_Non_Fiction['Reviews'])
plt.ylabel('Book Name',fontweight='bold')
plt.xlabel('No. of Reviews',fontweight='bold')
plt.title('Top 10 books by # of best-sellers (Non-Fiction)',fontweight='bold',fontsize=18)
plt.show

# 3.) Which book has the most reviews? (overall)

In [None]:
Top_Review_Book = df.groupby('Name')[['Reviews']].agg(np.mean).sort_values(by='Reviews',ascending=False).reset_index()
Top_10_Review_Book = Top_Review_Book.head(10)
Top_10_Review_Book

In [None]:
plt.barh(Top_10_Review_Book['Name'], Top_10_Review_Book['Reviews'])
plt.title('Top 10 books by # of reviews (Overall)',fontweight='bold',fontsize=18)
plt.xlabel('No. of Reviews',fontweight='bold')
plt.ylabel('Authors',fontweight='bold')

Clearly the book ***"Where the Crawdads Sing"*** with ***87841 reviews*** has the highest number of reviews overall.

# 4.) Which Genre sold the most between 2009-2019?

In [None]:
plt.pie(df['Genre'].value_counts(),labels=['Non Fiction','Fiction'],autopct='%.1f%%')

***Non-Fiction*** genre sold the most books from 2009-2019 ***by 54.4%.***

# 5.) Which Year saw the most reviews?

In [None]:
sns.boxplot(df['Year'],df['Reviews'])

Year ***2014*** saw the ***most number of reviews.***

# ***Correlation Testing***

# 6.) Are user rating and reviews related?

In [None]:
# Kendall's Rank Correlation for User Rating and Reviews
from scipy.stats import kendalltau
stat, p = kendalltau(df['User Rating'],df['Reviews'])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably User Rating and Reviews are independent')
else:
    print('Probably User Rating and Reviews are dependent')

# 7.) Are user rating and price related?

In [None]:
# Kendall's Rank Correlation for User Rating and Price
from scipy.stats import kendalltau
stat, p = kendalltau(df['User Rating'],df['Price'])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably User Rating and Price are independent')
else:
    print('Probably User Rating and Price are dependent')

# ***References***

# Useful Links to visit.

1. https://www.kaggle.com/muladpham/amazon-top-50-bestselling-books-moransbook/notebook

2. https://www.kaggle.com/ivannatarov/amazon-s-books-eda-plotly-hypothesis-test/notebook

3. https://www.python-graph-gallery.com/

4. https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/