In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide matplotlib defaults: base font size and default figure size (inches).
plt.rcParams.update({
    'font.size': 15,
    'figure.figsize': (10, 7),
})
%matplotlib inline

**Importing Data**

In [None]:
# Location of the bestsellers CSV inside the Kaggle input mount.
DATA_PATH = "/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv"
df = pd.read_csv(DATA_PATH)

**Checking the no. of observations and columns in the data:**

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Five random rows; the seed is fixed so a fresh "Restart & Run All" shows the same rows.
df.sample(5, random_state=42)

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe() 

* Avg. rating for bestselling books is 4.6
* Min. price is 0 and max. price is 105 (the minimum price needs further analysis).

In [None]:
# Column labels, for reference in the plots below.
df.columns

# **Exploratory data analysis :**

In [None]:
# Bar chart of how many list entries fall on each user-rating value.
plt.figure(figsize=(10, 5))
sns.set_style("darkgrid")
rating_ax = sns.countplot(x='User Rating', data=df, palette='viridis_r')
rating_ax.set_title("Distribution of user rating per book", fontsize=16)

In [None]:
# Empirical CDF of user ratings; the title is set on the axes before seaborn draws on them.
plt.figure(figsize=(10, 5))
rating_cdf_ax = plt.gca()
rating_cdf_ax.set_title("Cummalative density frequency plot for User Rating", fontsize=16)
sns.ecdfplot(df['User Rating'], linewidth=3, ax=rating_cdf_ax)

* 70% of the user ratings were above 4.5

In [None]:
# Histogram of the number of reviews per list entry.
# No `palette` argument: seaborn only applies a palette when a `hue` variable is given,
# so the original `palette='viridis_r'` was ignored (newer seaborn versions warn about it).
plt.figure(figsize=(10, 5))
sns.histplot(df['Reviews'])
plt.title("Distribution of no. of reviews per book", fontsize=16)

In [None]:
# Empirical CDF of review counts; label and title are set first, then seaborn draws on the same axes.
plt.figure(figsize=(10, 5))
reviews_cdf_ax = plt.gca()
reviews_cdf_ax.set_xlabel("No. of reviews")
reviews_cdf_ax.set_title("Cumalative density plot for no. of reviews", fontsize=16)
sns.ecdfplot(df['Reviews'], linewidth=3, ax=reviews_cdf_ax)

80% of books have fewer than 20K reviews.

In [None]:
# Histogram of book prices in 20 bins with outlined bars.
plt.figure(figsize=(10, 5))
price_ax = sns.histplot(df['Price'], edgecolor='Black', bins=20)
price_ax.set_title("Price Distribution", fontsize=16)

In [None]:
# Empirical CDF of book prices.
plt.figure(figsize=(10, 5))
price_cdf_ax = sns.ecdfplot(df['Price'], linewidth=3)
price_cdf_ax.set_xlabel('Price')
price_cdf_ax.set_title("CDF plot for price", fontsize=16)

85% of the books are priced below $20.

In [None]:
# User rating per year as a seaborn line plot (seaborn aggregates repeated x values).
plt.figure(figsize=(10, 5))
sns.set_palette("Dark2")  # set before the axes exist so the new colour cycle is picked up
rating_trend_ax = plt.gca()
rating_trend_ax.set_title('Year Wise User Rating', fontsize=16)
sns.lineplot(data=df, x='Year', y='User Rating', linewidth=3, ax=rating_trend_ax)

User ratings improve slightly over time.

In [None]:
# Price per year, one line per genre, drawn with a two-colour palette.
plt.figure(figsize=(10, 5))
custom_palette = ['crimson', "dodgerblue"]
sns.set_palette(custom_palette)  # set before the axes exist so the new colour cycle is picked up
price_trend_ax = plt.gca()
price_trend_ax.set_title("Year wise price trend ", fontsize=16)
sns.lineplot(data=df, x='Year', y='Price', hue='Genre', linewidth=3, ax=price_trend_ax)

* Non-fiction books are costly
* Price of the books is decreasing over time

In [None]:
# Review count per year, one line per genre, drawn with a two-colour palette.
plt.figure(figsize=(10, 5))
custom_palette = ['crimson', "dodgerblue"]
sns.set_palette(custom_palette)  # set before the axes exist so the new colour cycle is picked up
reviews_trend_ax = plt.gca()
reviews_trend_ax.set_title('Year wise trend in Reviews', fontsize=16)
sns.lineplot(data=df, x='Year', y='Reviews', hue='Genre', linewidth=3, ax=reviews_trend_ax)

* Fiction books receive more reviews than non-fiction books in every year except 2018
* This suggests that fiction readers are more likely to post reviews than non-fiction readers

In [None]:
# Pie chart of the share of list entries per genre.
plt.figure(figsize=(10, 5))
sns.set_palette('viridis_r')  # set before the axes exist so the wedges use this colour cycle
genre = df['Genre'].value_counts()
genre_ax = plt.gca()
genre_ax.pie(genre, labels=genre.index, startangle=270, autopct='%0.2f%%')
genre_ax.set_title("Genre Distribution", fontsize=16)

Among the bestsellers, Non-Fiction books appear slightly more often than Fiction books.

# Asking Questions :
* Top 5 best selling books by genre from 2009 to 2019:

In [None]:
# Split the list by genre, then rank titles by how many times they appear.
non_fiction = df[df['Genre'].eq('Non Fiction')]
fiction = df[df['Genre'].eq('Fiction')]
# value_counts() sorts by frequency (descending); keep the five most frequent titles.
top_5_nonfict = non_fiction['Name'].value_counts().iloc[:5]
top_5_fict = fiction['Name'].value_counts().iloc[:5]

In [None]:
# Horizontal bars: number of list appearances for the five most frequent fiction titles.
plt.figure(figsize=(10, 5))
fiction_books_ax = sns.barplot(x=top_5_fict, y=top_5_fict.index, edgecolor='Black', palette='Accent_r')
fiction_books_ax.set_title("Top 5 best selling fiction books", fontsize=15)
fiction_books_ax.set_xlabel("Bestseller")

Oh, the Places You'll Go! is the best selling book in the Fiction genre, appearing 8 times from 2009-2019.

In [None]:
# Horizontal bars: number of list appearances for the five most frequent non-fiction titles.
plt.figure(figsize=(10, 5))
nonfiction_books_ax = sns.barplot(x=top_5_nonfict, y=top_5_nonfict.index, edgecolor='Black', palette='Paired_r')
nonfiction_books_ax.set_title("Top 5 best selling Non-Fiction Books", fontsize=15)
nonfiction_books_ax.set_xlabel("Bestseller")

Publication Manual of the American Psychological Association is the bestseller in the Non-Fiction category, appearing over 10 times from 2009-2019.

In [None]:
# Five authors with the most list entries in each genre (value_counts() sorts descending).
nonfic_author = non_fiction['Author'].value_counts().iloc[:5]
fic_author = fiction['Author'].value_counts().iloc[:5]

In [None]:
# Horizontal bars: number of list entries for the five most frequent non-fiction authors.
plt.figure(figsize=(10, 5))
nonfic_author_ax = sns.barplot(x=nonfic_author, y=nonfic_author.index, edgecolor='Black', palette='PiYG')
nonfic_author_ax.set_title("Top 5 Best selling author in Non-Fiction Genre", fontsize=15)
nonfic_author_ax.set_xlabel("Best Seller")

Gary Chapman is the best-selling author in the Non-Fiction category, appearing over 10 times from 2009-2019.

In [None]:
# Horizontal bars: number of list entries for the five most frequent fiction authors.
plt.figure(figsize=(10, 5))
fic_author_ax = sns.barplot(x=fic_author, y=fic_author.index, edgecolor='Black', palette='PuOr_r')
fic_author_ax.set_title("Top 5 Best selling author in Fiction Genre", fontsize=15)
fic_author_ax.set_xlabel("Best Seller")

Jeff Kinney is the best-selling author in the Fiction category, appearing 12 times from 2009-2019.

In [None]:
# Count list entries per (Year, Genre) and pivot the genres into columns.
# 'Name' is selected before counting so the other columns are not aggregated and then discarded.
genre_yearwise = df.groupby(['Year', 'Genre'])['Name'].count().unstack()

fig, axes = plt.subplots(1, 1, figsize=(10, 5))
sns.set_style('darkgrid')
# One line is drawn per genre column; the returned handles are kept for the legend.
genre_lines = axes.plot(genre_yearwise, markersize=3, markerfacecolor='black', marker='o')
axes.set_xlabel('Year')
axes.set_ylabel('No. of Entries')
axes.set_xticks(genre_yearwise.index)
# Fixed y-range; any yearly count outside 10-40 would fall outside the visible area.
axes.set_ylim(10, 40)
axes.spines['right'].set_color('none')
axes.spines['top'].set_color('none')
# Pair each line with its genre explicitly instead of relying on DataFrame
# iteration happening to yield the column names.
axes.legend(genre_lines, list(genre_yearwise.columns))
axes.set_title('Fiction Vs. Non Fiction Books In The Bestsellers List (2009-2019)',fontdict={'size':15})
axes.grid(False)

plt.show()

Except for 2014, there has been a higher proportion of Non-Fiction books in the best selling category.

In [None]:
# Distinct book titles, then distinct authors, one count per line.
unique_titles = df['Name'].nunique()
unique_authors = df['Author'].nunique()
print(unique_titles, unique_authors, sep='\n')

Out of 550 entries, there are 351 unique books and 248 unique authors in the list.

In [None]:
# Rows priced at the dataset maximum, one row per title (the first occurrence is kept).
# drop_duplicates is chained so it returns a new frame; calling it with inplace=True
# on a filtered slice of df raises SettingWithCopyWarning.
max_price = df[df['Price'] == df['Price'].max()].drop_duplicates(subset=['Name'], keep='first')
max_price[['Name','Author','Price']]

In [None]:
# Title(s) of the most expensive book(s) as a plain list.
max_price['Name'].tolist()

'Diagnostic and Statistical Manual of Mental Disorders, 5th Edition: DSM-5' is the most expensive book

In [None]:
# Rows priced at the dataset minimum, one row per title (the first occurrence is kept).
# drop_duplicates is chained so it returns a new frame; calling it with inplace=True
# on a filtered slice of df raises SettingWithCopyWarning.
min_price = df[df['Price'] == df['Price'].min()].drop_duplicates(subset=['Name'], keep='first')
min_price[['Name','Author','Price']]

In [None]:
# Number of distinct titles at the minimum price (one row per title after de-duplication).
len(min_price)

There are 9 books with price $0

In [None]:
# Rows with the highest user rating, one row per title (the first occurrence is kept).
# drop_duplicates is chained so it returns a new frame; calling it with inplace=True
# on a filtered slice of df raises SettingWithCopyWarning.
max_rating = df[df['User Rating'] == df['User Rating'].max()].drop_duplicates(subset=['Name'], keep='first')
max_rating[['Name','Author','User Rating']]

In [None]:
# Number of distinct titles that share the highest rating.
len(max_rating)

28 books received highest rating of 4.9

In [None]:
# Rows with the lowest user rating, one row per title (the first occurrence is kept).
# drop_duplicates is chained so it returns a new frame; calling it with inplace=True
# on a filtered slice of df raises SettingWithCopyWarning.
min_rating = df[df['User Rating'] == df['User Rating'].min()].drop_duplicates(subset=['Name'], keep='first')
min_rating[['Name','Author','User Rating']]

In [None]:
# Title(s) of the lowest-rated book(s) as a plain list.
min_rating.loc[:, 'Name'].tolist()

'The Casual Vacancy' is the lowest-rated book, with a user rating of 3.3.

In [None]:
# Character count of every title in row order; a book listed in several years is counted each time.
name = df['Name'].tolist()
name_length = [len(title) for title in name]

In [None]:
# Empirical CDF of title lengths (in characters).
plt.figure(figsize=(10, 5))
sns.set_palette("PRGn")
title_len_ax = sns.ecdfplot(data=name_length, linewidth=3)
title_len_ax.set_xlabel('Title length of books', fontsize=15)
title_len_ax.set_title('CDF plot for title length of books');

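About 40% of the bestselling books have a title of 30 characters or fewer. (The CDF describes the titles already on the list; it does not give a book's chance of becoming a bestseller.)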