In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix,accuracy_score,f1_score




# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible in the notebook output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)


In [None]:
# Use seaborn's dark grid theme for the plots that follow.
sns.set_style("darkgrid")

# Read the bestsellers CSV into the main DataFrame used throughout the notebook.
csv_path = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

Get the basic info about the dataset

In [None]:
# Print a summary of the frame: column names, dtypes, non-null counts and memory usage.
df.info()

Check the nulls

In [None]:
# Count missing entries per column; the bare last expression is rendered as the cell output.
print("Number of null values in each column")
null_counts = df.isna().sum()
null_counts

Check the distribution of the column `Genre`

In [None]:
# Number of rows for each distinct value of `Genre`, most frequent first.
df['Genre'].value_counts()

# How has the distribution of the `genre` been changing over time?

In [None]:
# Bar chart of row counts per year, split by genre.
dataframe = df
feature_1 = 'Year'
feature_2 = 'Genre'
fig, ax = plt.subplots(figsize=(7,7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

As can be seen, in almost every year the bestseller list contained more non-fiction books than fiction ones. The only year where the trend was reversed was 2014.

In [None]:
# Histogram of the `Reviews` column on an explicitly created axes.
dataframe = df
feature = 'Reviews'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10,7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of the feature `{feature}`", fontsize=25)
plt.show()

In [None]:
# Histogram of the `Price` column on an explicitly created axes.
dataframe = df
feature = 'Price'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10,7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of the feature `{feature}`", fontsize=25)
plt.show()

In [None]:
# Histogram of the `User Rating` column on an explicitly created axes.
dataframe = df
feature = 'User Rating'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10,7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of the feature `{feature}`", fontsize=25)
plt.show()

Distribution of the column `Year`

In [None]:
# Number of rows recorded for each value of `Year`.
df['Year'].value_counts()

# Does the genre affect the number of reviews?

In [None]:
# HISTOGRAM: overlay one density/histogram of `Reviews` per genre value.
dataframe = df
cat_feat = 'Genre'
cont_feat = 'Reviews'
plt.figure(figsize=(7,7))
for value in dataframe[cat_feat].unique():
    # Rows belonging to the current category only.
    in_category = dataframe[cat_feat] == value
    sns.distplot(dataframe.loc[in_category, cont_feat], label=value)
plt.legend()
plt.title(f"Distribution of `{cont_feat}` conditional on `{cat_feat}`")
plt.show()

In [None]:
# STRIP: one semi-transparent marker per row, `Reviews` on the y axis grouped by `Genre`.
dataframe = df
feature_1 = 'Genre'
feature_2 = 'Reviews'
hue = None
fig, ax = plt.subplots(figsize=(7,7))
sns.stripplot(
    data=dataframe,
    x=feature_1,
    y=feature_2,
    hue=hue,
    palette="Set2",
    size=20,
    marker=".",
    edgecolor="gray",
    alpha=.25,
    ax=ax,
)
plt.show()

In [None]:
# BOX: distribution of `Reviews` for each genre.
dataframe = df
feature_1 = 'Genre'
feature_2 = 'Reviews'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

It seems that fiction books tend to get more reviews than non-fiction ones.

# How does the number of reviews change per year?

In [None]:
# BOX: distribution of `Reviews` for each year.
dataframe = df
feature_1 = 'Year'
feature_2 = 'Reviews'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

# Is there any relation between `Reviews` and `Price`?

In [None]:
# Joint KDE plot of `Reviews` against `Price`, resized after creation.
dataframe = df
feature1 = 'Reviews'
feature2 = 'Price'

g = sns.jointplot(data=dataframe, x=feature1, y=feature2, kind="kde")
g.fig.set_size_inches(11, 13)
plt.show()

In [None]:
# Correlation between `Reviews` and `Price`, taken from the 2x2 correlation matrix.
# The cell is selected by label: integer indexing (`[0]`) on a string-labelled
# Series is treated as positional access, which pandas 2.x deprecates.
corr_c = df[['Reviews','Price']].corr().loc['Reviews', 'Price']
print(f"Correlation between Reviews and Price: {corr_c}" )

As we've seen, the correlation between the two variables is pretty weak. However, it seems like there is a non-linear relation: the price of books with fewer than 20k reviews is more spread out (i.e., has larger variance) than for those with more than 20k reviews. More notably, books with more than 20k reviews tend to have a lower price.

# Is there any relation between user rating and reviews?

In [None]:
# Joint KDE plot of `User Rating` against `Reviews`, resized after creation.
dataframe = df
feature1 = 'User Rating'
feature2 = 'Reviews'

g = sns.jointplot(data=dataframe, x=feature1, y=feature2, kind="kde")
g.fig.set_size_inches(11, 13)
plt.show()

In [None]:
# Correlation between `Reviews` and `User Rating`, taken from the 2x2 correlation matrix.
# The cell is selected by label: integer indexing (`[0]`) on a string-labelled
# Series is treated as positional access, which pandas 2.x deprecates.
corr_c = df[['Reviews','User Rating']].corr().loc['Reviews', 'User Rating']
# The message names the two columns actually being correlated
# (it previously said "Reviews and Price").
print(f"Correlation between Reviews and User Rating: {corr_c}" )

Similarly, we see that the correlation between reviews and user rating is weak.

# Does the price distribution differ for the books with different genre?

In [None]:
# BOX: distribution of `Price` for each genre.
dataframe = df
feature_1 = 'Genre'
feature_2 = 'Price'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

In [None]:
# HISTOGRAM: overlay one density/histogram of `Price` per genre value.
dataframe = df
cat_feat = 'Genre'
cont_feat = 'Price'
plt.figure(figsize=(7,7))
for value in dataframe[cat_feat].unique():
    # Rows belonging to the current category only.
    in_category = dataframe[cat_feat] == value
    sns.distplot(dataframe.loc[in_category, cont_feat], label=value)
plt.legend()
plt.title(f"Distribution of `{cont_feat}` conditional on `{cat_feat}`")
plt.show()

As the graphs show, there doesn't seem to be a substantial difference between the two conditional distributions.

# Does the price increase with time?

In [None]:
# BOX: distribution of `Price` for each year.
dataframe = df
feature_1 = 'Year'
feature_2 = 'Price'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

No, the price doesn't seem to be increasing with time.

# Is there any relation between user rating and price?

In [None]:
# Joint KDE plot of `User Rating` against `Price`, resized after creation.
dataframe = df
feature1 = 'User Rating'
feature2 = 'Price'

g = sns.jointplot(data=dataframe, x=feature1, y=feature2, kind="kde")
g.fig.set_size_inches(11, 13)
plt.show()

In [None]:
# Correlation between `User Rating` and `Price`, taken from the 2x2 correlation matrix.
# The cell is selected by label: integer indexing (`[0]`) on a string-labelled
# Series is treated as positional access, which pandas 2.x deprecates.
corr_c = df[['User Rating','Price']].corr().loc['User Rating', 'Price']
# The message names the two columns actually being correlated
# (it previously said "Reviews and Price").
print(f"Correlation between User Rating and Price: {corr_c}" )

And yet again, the correlation between the two is pretty weak.

# Do the conditional distributions of user rating based on genre differ?

In [None]:
# BOX: distribution of `User Rating` for each genre, with a descriptive title.
dataframe = df
feature_1 = 'Genre'
feature_2 = 'User Rating'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
ax.set_title("Conditional distributions of `User Rating` based on `Genre`")
plt.show()

Fiction books tend to have higher user rating (on average), although fiction books have more outliers (those books with very low user rating)

# Let's have a closer look at how books with the most reviews (top 25%) differ from the rest.

In [None]:
# Boolean flag: True for rows whose review count is strictly above the 75th percentile.
reviews_q75 = df['Reviews'].quantile(0.75)
df['top_25%'] = df['Reviews'].gt(reviews_q75)

In [None]:
# Row counts per genre, split by whether the book is in the top quarter by reviews.
dataframe = df
feature_1 = 'top_25%'
feature_2 = 'Genre'
fig, ax = plt.subplots(figsize=(7,7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

In [None]:
# BOX: `User Rating` for books inside vs. outside the top quarter by reviews.
dataframe = df
feature_1 = 'top_25%'
feature_2 = 'User Rating'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

In [None]:
# BOX: `Price` for books inside vs. outside the top quarter by reviews.
dataframe = df
feature_1 = 'top_25%'
feature_2 = 'Price'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

From the graphs above, we can observe a couple of things:
1. The books with most reviews have more fictional books. The books with reviews below 75th percentile have more non-fictional books.

2. Books with the most reviews tend to have a higher rating (signified by the higher median and the fact that Q1 of the books with the most reviews is on the same level as the median of the books below the 75th percentile).

3. Contrary to what some people might expect,  books with most reviews have lower price (generally speaking).

# Now let's see how the most expensive books differ from cheaper counterparts.

Again, we will define *most expensive* book as the one that has a price **above** 75% percentile.

In [None]:
# Boolean flag: True for rows whose price is strictly above the 75th percentile.
price_q75 = df['Price'].quantile(0.75)
df['top_25%_price'] = df['Price'].gt(price_q75)

In [None]:
# Row counts per genre, split by whether the book is in the top quarter by price.
dataframe = df
feature_1 = 'top_25%_price'
feature_2 = 'Genre'
fig, ax = plt.subplots(figsize=(7,7))
sns.countplot(data=dataframe, x=feature_1, hue=feature_2, ax=ax)
plt.show()

In [None]:
# BOX: `User Rating` for books inside vs. outside the top quarter by price.
dataframe = df
feature_1 = 'top_25%_price'
feature_2 = 'User Rating'
fig, ax = plt.subplots(figsize=(7,7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, ax=ax)
plt.show()

A couple of notes can be made here:
1. As far as the books with the price below 75% percentile are concerned, the proportions of the fictional and non-fictional books are roughly the same. While for the most expensive books, the proportion of fictional books is way larger.

2. Most expensive books tend to have lower user ratings (partly explained by the fact that buyers generally have higher expectation for the books with heftier price)

# Let's have a look at the most prolific authors (i.e., those who have written the most books)

In [None]:
# Count distinct titles per author (each title once, regardless of how many
# years it appears in) and keep the six authors with the most titles.
books_per_author = (
    df.drop_duplicates(['Name'])
      .groupby("Author")
      .count()['Name']
      .sort_values(ascending=False)
      .head(6)
)
df1 = books_per_author.reset_index().rename(columns={'Name': 'Number of books'})

# Interactive bar chart of the result.
fig = px.bar(
    df1,
    x='Author',
    y='Number of books',
    title='Most prolific authors',
)
fig.show()

# Let's have a look at the worst performing books.

By "worst performing", we mean the books with a rating below (or equal to) the 10th percentile, considering only books with at least 1,000 reviews.

In [None]:
# Build the "worst performing" subset:
#   1. keep each title once,
#   2. keep only titles with at least 1000 reviews,
#   3. keep titles whose rating is at or below the 10th percentile of that
#      filtered set (the percentile is computed after step 2).
df1 = df.drop_duplicates(subset=['Name'])
df1 = df1[df1['Reviews'] >= 1000]
rating_cutoff = df1['User Rating'].quantile(0.10)
# Take an explicit copy so that later column assignments on `df1` operate on an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning).
df1 = df1[df1['User Rating'] <= rating_cutoff].copy()

# How many of the worst performing books got a lot of reviews (number of reviews above the 75th percentile)?

In [None]:
# Turn the boolean `top_25%` flag into readable labels for the pie chart.
# The bool-dtype guard makes the cell safe to re-run: mapping the already
# labelled strings a second time would turn every value into NaN and leave
# the pie empty. `assign` builds a new frame rather than writing into a slice.
if pd.api.types.is_bool_dtype(df1['top_25%']):
    df1 = df1.assign(**{'top_25%': df1['top_25%'].map({True: 'Above 75%', False: "Below 75%"})})

dataframe = df1
feature = 'top_25%'
series = dataframe[feature].value_counts()

labels = series.index
sizes = series.values
explode = [0] * series.size  # no slice is pulled out of the pie

fig1, ax1 = plt.subplots(figsize=(10,15))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('How many of the worst performing books got large number of reviews?',fontsize=20)
plt.show()

As we see, worst performing books are unlikely to get a lot of reviews.

# How many of the worst performing books are expensive (price above the 75th percentile)?

In [None]:
# Turn the boolean `top_25%_price` flag into readable labels for the pie chart.
# The bool-dtype guard makes the cell safe to re-run: mapping the already
# labelled strings a second time would turn every value into NaN and leave
# the pie empty. `assign` builds a new frame rather than writing into a slice.
if pd.api.types.is_bool_dtype(df1['top_25%_price']):
    df1 = df1.assign(**{'top_25%_price': df1['top_25%_price'].map({True: 'Above 75%', False: "Below 75%"})})

dataframe = df1
feature = 'top_25%_price'
series = dataframe[feature].value_counts()

labels = series.index
sizes = series.values
explode = [0] * series.size  # no slice is pulled out of the pie

fig1, ax1 = plt.subplots(figsize=(10,15))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('How many of the worst performing books are relatively expensive?',fontsize=20)
plt.show()

# What is the distribution of `Genre` for worst performing books?

In [None]:
# Pie chart of the genre split among the worst performing books.
dataframe = df1
feature = 'Genre'
series = dataframe[feature].value_counts()

labels = series.index
sizes = series.values
explode = [0] * series.size  # every slice stays flush with the pie

fig1, ax1 = plt.subplots(figsize=(10,15))
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # keep the pie circular
plt.title('Distribution of `Genre` for worst performing books',fontsize=20)
plt.show()