# Amazon's Top 50 Bestsellers (2009-2019)
![amazon-best-seller-list.png](https://scribemedia.com/wp-content/uploads/2016/08/amazon-best-seller-list.png)

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from matplotlib.lines import Line2D
from warnings import filterwarnings
sns.set_style('whitegrid')
%matplotlib inline

# Loading the data

In [None]:
# Location of the bestsellers CSV, relative to the notebook's working directory.
DATA_PATH = "../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv"

# `df` is the raw table that every later cell reads from.
df = pd.read_csv(DATA_PATH)

# Data Preprocessing

In [None]:
# Preview the first rows to see which columns the dataset provides.
df.head()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# List every distinct author string; presumably used to spot the same author
# written with different spellings (the following cell merges two such cases).
df['Author'].unique()

In [None]:
# Map alternative spellings of an author's name to a single canonical spelling,
# so that per-author grouping later on treats them as one author.
author_aliases = {
    'J. K. Rowling': 'J.K. Rowling',
    'George R. R. Martin': 'George R.R. Martin',
}
for alias, canonical in author_aliases.items():
    df.loc[df['Author'] == alias, 'Author'] = canonical

# EDA

In [None]:
# Ten rows with the highest review counts; the same title may occupy several rows.
rows_by_reviews = df.sort_values(by='Reviews', ascending=False)
rows_by_reviews.head(10)

In [None]:
# Keep only the first row of each title, then show the ten most-reviewed titles.
unique_titles = df.drop_duplicates(subset='Name')
unique_titles.sort_values(by='Reviews', ascending=False).head(10)

# Data Visualization

In [None]:
# Genre counts among the ten most-reviewed distinct titles.
pie_1 = (
    df.drop_duplicates('Name')
      .sort_values('Reviews', ascending=False)['Genre']
      .head(10)
      .value_counts()
)

# value_counts() orders genres by frequency, not alphabetically, so the wedge
# labels must come from its index; a hard-coded label list would mislabel the
# chart whenever 'Non Fiction' is the larger group.
genre_labels = pie_1.index
# Pull every wedge except the largest one out of the pie. Sized from the data
# so the call also works when only one genre is present.
explode = [0] + [0.15] * (len(pie_1) - 1)

sns.set_palette('viridis_r')
plt.figure(figsize=(8,8))
plt.pie(pie_1,explode=explode,labels=genre_labels,autopct='%.1f%%',shadow=True,startangle=20)
plt.title('Genre Pie Chart for the top 10 Bestselling Books on Amazon (2009-2019)',fontdict={'size':14},y=0);

## Price of the books:

In [None]:
plt.figure(figsize=(12,6))
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent plot from its documented replacement, `histplot`.
sns.histplot(df['Price'],kde=True,stat='density',color='purple')
plt.title('Price Distribution Plot',fontsize=16)
plt.show()
# Runs after the plot, so it only silences warnings in cells executed later.
filterwarnings('ignore')

##  Reviews of the books:

In [None]:
plt.figure(figsize=(12,6))
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent plot from its documented replacement, `histplot`.
sns.histplot(df['Reviews'],kde=True,stat='density',color='red')
plt.title('Reviews Distribution Plot',fontsize=16)
plt.show()
# Runs after the plot, so it only silences warnings in cells executed later.
filterwarnings('ignore')

## User Rating across all the books:

In [None]:
# Bar chart of how many rows carry each distinct user rating.
plt.figure(figsize=(10,5))
rating_ax = sns.countplot(data=df, x='User Rating', palette='viridis_r')
rating_ax.set_title('User Rating Distribution Plot', fontsize=16)
plt.show()

## Correlation between the Price and Reviews features:

In [None]:
# Scatter of Price against Reviews with each variable's distribution on the margins.
price_reviews_joint = sns.jointplot(data=df, x='Price', y='Reviews')
plt.show()

In [None]:
from collections import Counter

In [None]:
# Correlate the remaining columns after removing the text columns and Year.
corr_input = df.drop(columns=['Name', 'Genre', 'Author', 'Year'])
corr_matrix = corr_input.corr()

plt.figure(figsize=(15,7))
sns.heatmap(corr_matrix, cmap='Wistia', annot=True)
plt.show()

In [None]:
# (author, number of distinct titles) for the author with the most distinct titles.
titles_per_author = Counter(df.drop_duplicates('Name')['Author'])
titles_per_author.most_common(1)[0]

In [None]:
# One row per title, then the titles written by Jeff Kinney.
new_df = df.drop_duplicates(subset='Name')
written_by_kinney = new_df['Author'].eq('Jeff Kinney')
new_df.loc[written_by_kinney]

## The 10 most famous authors in the Fiction and Non Fiction genres, based on the number of appearances in the top 50 bestseller list from 2009-2019.

In [None]:
# Appearances per author with one column per genre, computed once and shared
# by both panels. Authors with no entry in a genre are NaN in that column and
# are dropped so they can never reach the top ten or the tick-range maximum.
appearances = df.groupby(['Author', 'Genre'])['Name'].count().unstack()
best_nf_books = appearances['Non Fiction'].dropna().sort_values(ascending=False).head(10)
best_f_books = appearances['Fiction'].dropna().sort_values(ascending=False).head(10)

color=sns.color_palette('viridis')

sns.set_style('whitegrid')


def _draw_author_panel(ax, counts, title, side, colors):
    """Draw one half of the mirrored horizontal bar chart.

    ax     -- matplotlib Axes to draw on.
    counts -- Series of appearance counts indexed by author, largest first.
    title  -- panel title.
    side   -- 'left' mirrors the x axis and puts author names on the left;
              anything else puts the names on the right.
    colors -- colours passed to `barh` for the bars.
    """
    if side == 'left':
        # Mirror the bars so that they grow towards the centre of the figure.
        ax.invert_xaxis()
        ax.yaxis.tick_left()
    else:
        ax.yaxis.tick_right()
    # Largest count at the top.
    ax.invert_yaxis()
    # Explicitly off: grid(None) would toggle the grid, making the result
    # depend on whichever seaborn style happens to be active.
    ax.grid(False)
    ax.set_xticks(range(1, int(counts.values.max()) + 1))
    ax.barh(y=counts.index, width=counts.values, height=0.5, tick_label=counts.index, color=colors)
    ax.set_yticklabels(counts.index, fontweight='semibold', fontsize=12)
    ax.set_title(title, fontdict={'size':16})
    ax.set_xlabel('Number Of Times Selected', fontsize=12)


fig,axes = plt.subplots(1,2,figsize=(8,8))
plt.subplots_adjust(wspace=0)

_draw_author_panel(axes[0], best_nf_books, 'Non Fiction Authors', 'left', color)
_draw_author_panel(axes[1], best_f_books, 'Fiction Authors', 'right', color)

plt.show()

## Trend of the genres of the books selected in the top 50 each year:

In [None]:
# Set the style before the figure is created: seaborn styles only affect axes
# created afterwards, so setting it later made the first run and a re-run of
# this cell look different.
sns.set_style('white')

# Number of list entries per year, one column per genre.
books = df.groupby(['Year','Genre'])['Name'].count().unstack()

fig,axes=plt.subplots(1,1,figsize=(12,6))
axes.plot(books,markersize=5,markerfacecolor='grey',marker='o')
axes.set_xlabel('Year')
axes.set_ylabel('Number Of Entries')
axes.set_xticks(books.index)
axes.set_ylim(10,40)
axes.spines['right'].set_color('none')
axes.spines['top'].set_color('none')
# One legend entry per plotted column, in column order.
axes.legend(list(books.columns))
axes.set_title('Fiction Vs. Non Fiction Books In The Bestsellers List (2009-2019)',fontdict={'size':15})
axes.grid(False)

plt.show()

## Top 20 authors based on their number of appearances in the top 50 list to find out their:

## 1. Average User Rating
## 2. Average Price Of The Books
## 3. Total Reviews

In [None]:
color=sns.color_palette('viridis')
c1=sns.color_palette('magma')
c2=sns.color_palette('magma_r')

# One row per title, so a book listed in several years is counted once below.
new_df = df.drop_duplicates('Name')

# The 20 authors with the most rows in the full (non-deduplicated) list.
df1 = df.groupby('Author').count().sort_values('Name',ascending=False)[:20]
index = df1.index

# Distinct titles of those authors, most frequent author first. The mask is
# built from `new_df` itself (a mask built from `df` has a different index and
# is misaligned), and the pieces are concatenated once instead of in a loop.
main_df = pd.concat([new_df[new_df['Author']==author] for author in index])

# Aggregate numeric columns only: summing or averaging the text columns
# (Name, Genre) is meaningless and raises on recent pandas versions.
main_df1 = main_df.groupby('Author').sum(numeric_only=True)
main_df2 = main_df.groupby('Author').mean(numeric_only=True)

fig, ax = plt.subplots(1,3,figsize=(15,12),sharey=True)
plt.subplots_adjust(wspace=0.05)

# Panel 1: mean user rating per author, drawn as a dotted stem with a marker.
ax[0].hlines(xmin=0,xmax=main_df2['User Rating'].values,y=main_df2['User Rating'].index,linestyle='dotted',color=c2)
ax[0].plot(main_df2['User Rating'].values,main_df2.index,'o',markersize=5,markerfacecolor='purple',markeredgecolor='black')
ax[0].set_xticks(np.linspace(0,5,11))
ax[0].set_xlabel('Rating',fontsize=13)
ax[0].set_title('Average User Rating',fontsize=15)
ax[0].set_yticklabels(main_df2['User Rating'].index,fontweight='bold',fontsize=12)

# Panel 2: mean price per author.
ax[1].hlines(xmin=0,xmax=main_df2['Price'].values,y=main_df2['Price'].index,linestyle='dotted',color=c1)
ax[1].plot(main_df2['Price'].values,main_df2.index,'o',markersize=5,markerfacecolor='purple',markeredgecolor='black')
ax[1].set_xticks(list(range(0,51,5)))
ax[1].set_xlabel('Price',fontsize=13)
ax[1].set_title('Average Price',fontsize=15)

# Panel 3: review counts summed over each author's distinct titles.
ax[2].barh(y=main_df1['Reviews'].index,width=main_df1['Reviews'].values,color=color)
ax[2].set_xlabel('Number Of Reviews',fontsize=13)
ax[2].set_title('Total Number Of Reviews',fontsize=15)

plt.show()
# Runs after the plot, so it only silences warnings in cells executed later.
filterwarnings('ignore')

In [None]:
# Reviews vs. Price scatter, with one colour per distinct user rating.
rating_grid = sns.FacetGrid(df, hue='User Rating')
rating_grid = rating_grid.map(sns.scatterplot, 'Reviews', 'Price')
rating_grid.add_legend()

In [None]:
# Summary statistics of the columns; the quartiles of Reviews are what the
# outlier fences in the next section are built from.
df.describe()

## Outliers:

In [None]:
# Tukey fences for the Reviews column.
q1_r = df['Reviews'].quantile(0.25)
q3_r = df['Reviews'].quantile(0.75)
IQR_r = q3_r - q1_r

# Upper fences extend from the third quartile, lower fences from the FIRST
# quartile; measuring the lower fences from the third quartile would place
# them too high.
upper_limit = q3_r + IQR_r*1.5
lower_limit = q1_r - IQR_r*1.5

# Same construction with 3 * IQR for extreme outliers.
ex_lower_limit = q1_r - IQR_r*3
ex_upper_limit = q3_r + IQR_r*3

print('Outlier\'s range :\t',[lower_limit,upper_limit])
print('Extreme Outlier\'s range:',[ex_lower_limit,ex_upper_limit])