# Amazon Top 50 Bestselling Books From 2009 - 2019

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

Import all modules that are needed, then load the dataset.

In [None]:
# Load the Kaggle bestsellers CSV into the frame that every later cell reads.
df = pd.read_csv(
    filepath_or_buffer='../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
)
# Show a single randomly drawn row as a quick sanity check of the load.
df.sample(n=1)

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts) to check its schema.
df.info()

I use 'Year' as the grouping variable to describe the dataset.

In [None]:
# List the distinct years covered by the dataset.
df.loc[:, 'Year'].unique()

In [None]:
# Average every numeric column per year.
# numeric_only=True keeps the text columns (Name, Author, Genre) out of the
# aggregation; without it pandas >= 2.0 raises a TypeError on those columns.
df.groupby('Year').mean(numeric_only=True).reset_index()

# Summary Of Fiction and Non Fiction Books In Every Year

In [None]:
# Contingency table: one row per year, one column per genre, cells are book counts.
pd.crosstab(index=df['Year'], columns=df['Genre'])

In [None]:
# Grouped bar chart: number of bestsellers per year, split by genre.
plt.figure(figsize=(15, 8))
# The theme is applied after the figure is created, exactly as before, so the
# rendered styling does not change.
sns.set_theme(style="darkgrid")
genre_ax = sns.countplot(data=df, x='Year', hue='Genre')
genre_ax.set_title('Fiction and Non Fiction In Every Year')
genre_ax.set_ylabel('Sum')
plt.show()

Based on this graph, non-fiction books outnumber fiction books among the bestsellers in almost every year, except in 2014.

# Summary Of Reviews In Every Year Based On Book's Genre

In [None]:
# Mean number of reviews for each (year, genre) pair, rounded to two decimals.
df.pivot_table(index=['Year', 'Genre'], values='Reviews', aggfunc='mean').round(2)

In [None]:
# Reviews over the years, one line per genre.
plt.figure(figsize=(8, 8))
reviews_ax = sns.lineplot(x='Year', y='Reviews', hue='Genre', data=df)
reviews_ax.set_title('Summary Of Reviews In Every Year Based On Genre')
plt.show()

# Summary Of Price In Every Year Based On Book's Genre

In [None]:
# Mean price for each (year, genre) pair, rounded to two decimals.
df.pivot_table(index=['Year', 'Genre'], values='Price', aggfunc='mean').round(2)

In [None]:
# Price over the years, one line per genre.
plt.figure(figsize=(8, 8))
price_ax = sns.lineplot(x='Year', y='Price', hue='Genre', data=df)
price_ax.set_title('Summary Of Price In Every Year Based On Genre')
plt.show()

# Summary Of User Rating In Every Year Based On Book's Genre

In [None]:
# Mean user rating for each (year, genre) pair, rounded to two decimals.
df.pivot_table(index=['Year', 'Genre'], values='User Rating', aggfunc='mean').round(2)

In [None]:
# User rating over the years, one line per genre.
plt.figure(figsize=(8, 8))
rating_ax = sns.lineplot(x='Year', y='User Rating', hue='Genre', data=df)
rating_ax.set_title('Summary Of User Rating In Every Year Based On Genre')
plt.show()

# Top 10 Authors With The Most Written Books In Amazon's Catalogue

In [None]:
# Count each title once per author. The data holds one row per book per year,
# so a title that appears in several yearly lists would otherwise be counted
# once per appearance instead of once per book written.
df.drop_duplicates(subset=['Name', 'Author'])['Author'].value_counts().head(10)

# Top 3 Based On Highest User Rating In Amazon's Catalogue

In [None]:
# Three rows with the highest user rating, restricted to the descriptive columns.
(
    df.loc[:, ['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Genre', 'Year']]
    .sort_values(by='User Rating', ascending=False)
    .iloc[:3]
)

# Top 3 Based On Highest Reviews Given In Amazon's Catalogue

In [None]:
# Three rows with the largest number of reviews, restricted to the descriptive columns.
(
    df.loc[:, ['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Genre', 'Year']]
    .sort_values(by='Reviews', ascending=False)
    .iloc[:3]
)

# Top 3 Based On Highest Selling Price In Amazon's Catalogue

In [None]:
# Three rows with the highest price, restricted to the descriptive columns.
(
    df.loc[:, ['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Genre', 'Year']]
    .sort_values(by='Price', ascending=False)
    .iloc[:3]
)

# Top 3 Based On Newest Year In Amazon's Catalogue

In [None]:
# First three rows after ordering by year, newest first.
(
    df.loc[:, ['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Genre', 'Year']]
    .sort_values(by='Year', ascending=False)
    .iloc[:3]
)

# Top 3 Based On Oldest Year In Amazon's Catalogue

In [None]:
# First three rows after ordering by year, oldest first.
(
    df.loc[:, ['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Genre', 'Year']]
    .sort_values(by='Year', ascending=True)
    .iloc[:3]
)

# Correlation Table and Heatmap

In [None]:
# Pairwise correlation of the numeric columns only.
# The frame also holds text columns (Name, Author, Genre); pandas >= 2.0 raises
# on them inside corr(), so they are filtered out explicitly first.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 8))
# Only numeric columns can be correlated; selecting them explicitly avoids the
# TypeError that pandas >= 2.0 raises for the text columns (Name, Author, Genre).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
# The original title stopped mid-phrase ('Heatmap Correlation Between'); complete it.
plt.title('Heatmap Correlation Between Numeric Columns')
plt.show()

Based on this graph, there is a correlation between user rating and year.

In this section, I test whether the number of reviews a book receives differs by genre. I state the hypotheses as follows:
# H0: U0 <= U1
# H1: U0 > U1

U0 is the mean number of reviews given for fiction books and U1 is the mean number of reviews given for non-fiction books.

I use an independent two-sample t-test to test the hypothesis.

In [None]:
# Build the two samples compared by the t-test.
# The hypothesis is about the reviews given per genre, so each sample must hold
# the 'Reviews' values of that genre. Counting books per year (the previous
# Year.value_counts()) tested a different quantity.
dffiction = df.loc[df['Genre'] == 'Fiction', 'Reviews']
dfnonfiction = df.loc[df['Genre'] == 'Non Fiction', 'Reviews']

In [None]:
# Display the Fiction sample that is passed to the t-test below.
dffiction

In [None]:
# Display the Non Fiction sample that is passed to the t-test below.
dfnonfiction

In [None]:
from scipy import stats

In [None]:
import scipy
# Show the installed SciPy version; the notebook text notes that the
# `alternative` argument of ttest_ind needs SciPy 1.6.0.
scipy.__version__

Unfortunately, SciPy cannot be upgraded to 1.6.0 in this environment, so the `alternative` argument of `ttest_ind` is not available here. The code I intended to use is:

`ttest, pval = stats.ttest_ind(dffiction, dfnonfiction, alternative = 'greater')`

When I executed that code, the value of the statistic was -4.591768433773377 and the one-tailed p-value was 0.9999116525388123. Because this p-value is far above 0.05, H0 is not rejected. A correct one-tailed workaround must reach the same conclusion.

In [None]:
# Independent two-sample t-test (two-tailed by default): fiction first, non-fiction second.
ttest, pval = stats.ttest_ind(a=dffiction, b=dfnonfiction)

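By default, `ttest_ind` performs a two-tailed test, but in this case I need a one-tailed test. The two-tailed p-value can be divided by 2 only when the t statistic points in the direction of H1 (t > 0); when the statistic is negative, the one-tailed p-value is 1 - p/2.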

In [None]:
# Convert the two-tailed p-value into the one-tailed p-value for H1: U0 > U1.
# Halving is only valid when the statistic points in H1's direction (ttest > 0).
# For a negative statistic the one-tailed p-value is the complement, 1 - pval/2;
# halving unconditionally would report a strongly negative t as significant.
val = pval / 2 if ttest > 0 else 1 - pval / 2

In [None]:
# Compare the one-tailed p-value with the 0.05 significance level;
# True means H0 is rejected at that level.
val < 0.05

If the check above returns True, the one-tailed p-value is below 0.05, so the H0 hypothesis is rejected.
That means there is a difference between the reviews of Fiction and Non-Fiction: the number of reviews given for fiction books is **higher** than the number of reviews given for non-fiction books. If the check returns False, H0 cannot be rejected and no such conclusion can be drawn.

Thank you for reading this notebook. If you found it useful, please leave some feedback and an upvote!