# Best Seller Analysis using Plotly, Seaborn and Matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline

In [None]:
# Load the Amazon bestsellers CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    "../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv"
)
# Preview the first ten rows to check the columns were parsed as expected.
df.head(10)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Reviews per Year, with one line per Genre (Fiction vs Non Fiction).
# Several books share each year, so lineplot aggregates them with its default estimator.
sns.lineplot(
    data=df,
    x="Year",
    y="Reviews",
    hue="Genre",
)

The graph above indicates that Fiction books received more reviews on average than Non Fiction books. This suggests that Fiction is the more popular genre.

In [None]:
# Price over the years, drawn as one panel per Genre so the two trends can be compared side by side.
g = sns.FacetGrid(df, col="Genre")
g.map(sns.lineplot, "Year", "Price")

The price of Non Fiction books is higher than that of Fiction books over the years.

In [None]:
# Mean book price for each Genre; selecting with a list keeps the result a one-column DataFrame.
df.groupby("Genre")[["Price"]].mean()

In [None]:
# Hypothesis: Fiction is cheaper, so more people buy it, which would explain its higher review counts.
# Next, compare how User Rating is distributed within each Genre (one histogram panel per genre).
g = sns.FacetGrid(df, col="Genre")
g.map(
    sns.histplot,
    "User Rating",
    kde=False,
    bins=50,
)

Note that there are no ratings below 3 for Non Fiction.


In [None]:
# Keep only the rows whose User Rating is strictly greater than 4.
# NOTE(review): the message says "4 +", but rows rated exactly 4.0 are excluded by `>` - confirm intent.
above_four = df["User Rating"] > 4
rating = df.loc[above_four]
# count() tallies the non-null Name entries, i.e. one per remaining row.
print("Number of Books with Rating 4 + is", rating["Name"].count())

In [None]:
# Per (Genre, Author) summary across all years: mean rating, total reviews and mean price.
# Rows are ordered by rating, then reviews (both ascending), and the group keys are
# turned back into ordinary columns.
avg_rating = (
    df.groupby(["Genre", "Author"])
    .agg({"User Rating": "mean", "Reviews": "sum", "Price": "mean"})
    .sort_values(["User Rating", "Reviews"])
    .reset_index()
)

In [None]:
# Round each author's mean rating to one decimal (stored back on avg_rating, so later
# cells see the rounded value), then plot how many authors fall on each rating.
avg_rating["User Rating"] = avg_rating["User Rating"].round(1)
sns.histplot(
    avg_rating["User Rating"],
    kde=False,
)

The above Histogram suggests that the number of Authors with an average rating of 4.8 is the highest

In [None]:
# Dense-rank authors inside each Genre by their average rating; rank 1 is the highest
# rating, and tied authors share the same rank.
rating_by_genre = avg_rating.groupby("Genre")["User Rating"]
avg_rating["Rank"] = rating_by_genre.rank(method="dense", ascending=False)

# Authors holding the top rating of their genre.
rank1 = avg_rating.loc[avg_rating["Rank"] == 1]

fig = px.bar(
    rank1,
    x="Author",
    y="User Rating",
    color="Genre",
    barmode="group",
    title="Authors with Highest Rating",
)
fig.show()

From this graph we can see that 11 Fiction authors and 4 Non Fiction authors have the highest rating in their genre.

In [None]:
#Top 10 Reviews
# Order authors by Genre, then by total Reviews (largest first), and take the first
# ten rows of every genre.
# A single sort replaces the previous groupby(...).apply(sort_values): apply operating
# on the grouping column is deprecated since pandas 2.2, and on versions that exclude it
# the 'Genre' column is lost after reset_index(drop=True), breaking the groupby below.
df2 = (
    avg_rating
    .sort_values(["Genre", "Reviews"], ascending=[True, False])
    .reset_index(drop=True)
)
df3 = df2.groupby("Genre").head(10)

fig = px.bar(
    df3,
    x="Author",
    y="Reviews",
    color="Genre",
    barmode="group",
    title="Top 10 Author Reviews by Genre",
)
fig.show()

In [None]:
#Top 10 highest priced books
# TODO: replace this chart with a word cloud.
# Order rows by Genre, then by Price (highest first), and keep the first ten of each genre.
# A single sort replaces the previous groupby(...).apply(sort_values): apply operating
# on the grouping column is deprecated since pandas 2.2, and on versions that exclude it
# the 'Genre' column is lost after reset_index(drop=True), breaking the groupby below.
# NOTE(review): rows tied on Price at the 10th position are cut by sort order, so which
# of the tied rows is kept is arbitrary (this was also true of the previous code).
price1 = (
    df
    .sort_values(["Genre", "Price"], ascending=[True, False])
    .reset_index(drop=True)
)
price2 = price1.groupby("Genre").head(10)

fig = px.bar(
    price2.sort_values(["Year"]),
    x="Name",
    y="Price",
    color="Genre",
    title="Top 10 Priced Books by Genre",
    hover_data=["User Rating", "Reviews"],
)
# Tick labels are hidden; the book name is still available on hover.
fig.update_xaxes(showticklabels=False)
fig.show()

Note: Hovering over the bar graph above would show the book names

In [None]:
# Yearly summary per (Genre, Author, Year): mean rating, total reviews and mean price.
# It is used to look at Price vs Reviews, Price vs User Rating and User Rating vs Reviews by Year.
rating_year = (
    df.groupby(["Genre", "Author", "Year"])
    .agg({"User Rating": "mean", "Reviews": "sum", "Price": "mean"})
    .sort_values(["User Rating", "Reviews"])
    .reset_index()
)

In [None]:
# Pairwise scatter plots of the three metrics, coloured by Year; Author is shown on hover.
fig = px.scatter_matrix(
    rating_year,
    dimensions=["Price", "Reviews", "User Rating"],
    color="Year",
    hover_data=["Author"],
)
# Hide the diagonal panels, which would plot each metric against itself.
fig.update_traces(diagonal_visible=False)
fig.show()

The above plot shows the relationships between the metrics. It is used to check whether there is any obvious relationship between them.

In [None]:
# Reviews against Price for every book, animated one frame per Year.
# Marker size encodes User Rating; each Genre gets its own panel and colour.
books_by_year = df.sort_values(["Year"])  # sorted so the animation frames follow Year order
fig = px.scatter(
    books_by_year,
    x="Price",
    y="Reviews",
    animation_frame="Year",
    size="User Rating",
    color="Genre",
    hover_name="Name",
    facet_col="Genre",
    hover_data=["Author"],
    title="Reviews vs Price of Books over Year",
)
fig.show()

In [None]:
#lets find the top authors who were in top 10 Reviews with the maximum rating
# Keep the top-10-by-reviews authors (df3) that are also top-rated in their genre (rank1).
# The join key is (Genre, Author). The previous key (float "User Rating" + Author) left
# Genre out, so an author present in both genres could be cross-matched and duplicated,
# and every shared column came back with _x/_y suffixes.
topauthors = (
    df3
    .merge(rank1[["Genre", "Author"]], on=["Genre", "Author"], how="inner")
    .reset_index(drop=True)
)

# Pull the individual book rows of those authors. Only the key columns of topauthors are
# joined, so df's own columns (including "Reviews", used below) keep their names.
top_authorbooks = df.merge(
    topauthors[["Genre", "Author"]],
    on=["Genre", "Author"],
    how="inner",
)

# Sunburst: inner ring is the author, outer ring their books, sized by Reviews.
fig1 = px.sunburst(
    top_authorbooks,
    path=['Author', 'Name'],
    values='Reviews',
    title="Top Author and their Books",
)
fig1.show()

These are the top authors who have the maximum rating and whose reviews fall within the top 10.