In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the bestsellers CSV inside the Kaggle input directory.
csv_path = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'

# Read the whole file into the working DataFrame and preview the first rows.
df = pd.read_csv(csv_path)
df.head()

**Exploratory Data Analysis (EDA)**

In [None]:
# Quick structural overview of the dataset: size, cardinality, duplicates,
# completeness, and correlations between the numeric columns.
print("Shape of Dataset")
print(df.shape)
print()
print("unique elements in Features")
print()
print(df.nunique())
print()
print("duplicated Series values")
print(df.duplicated().sum())
print()
print("About Features : ")
print()
# Percentage of non-missing values per column (100 means no missing values).
x = df.count() / df.isna().count() * 100
print(x)
plt.hist(x)
plt.ylabel("Features of Dataset")
plt.xlabel("Dataset Present")
plt.show()
# or we can use df.info() to get basic info about data
print()
# Reset the ANSI bold attribute after the heading so later output is not bold.
print('\033[1m' + "Pairwise correlation of all columns in the dataframe" + '\033[0m')
print()
# Correlation is only defined for numeric columns. Selecting them explicitly
# avoids the ValueError that pandas >= 2.0 raises when string columns are
# present, and the matrix is computed once and reused for the heatmap.
numeric_corr = df.select_dtypes(include="number").corr()
# A bare expression in the middle of a cell is not displayed, so print it.
print(numeric_corr)
print()
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm")
plt.show()
sns.pairplot(df)
plt.show()


In [None]:
# Share of each genre among the bestsellers: pie chart first, raw counts as
# the cell output.
# NOTE(review): the original note here read "Almost 56% rated as best selling
# books are Fiction" - confirm which genre the 56% slice belongs to.
genre_counts = df['Genre'].value_counts()
genre_counts.plot.pie(autopct="%.1f%%");
genre_counts

The count plot below shows, for each year, how many of the best sellers were fiction and how many were non-fiction. <br>
For all the years except 2014, the number of fiction best sellers has been greater than the number of non-fiction best sellers.

In [None]:
# Number of bestsellers per year, split by genre.
plt.figure(figsize=(12,7))
# Pass the columns by keyword: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as x.
sns.countplot(x='Year', hue='Genre', data=df)
plt.show()

In [None]:
# Headline statistics of the User Rating column: maximum, mean and mode.
user_rating = df['User Rating']
rating_stats = (
    ("max User Rating", user_rating.max()),
    ("Avg User Rating", user_rating.mean()),
    ("Most Often User Rating", user_rating.mode()),
)

for position, (label, value) in enumerate(rating_stats):
    # A blank line separates consecutive statistics (none before the first).
    if position:
        print()
    print('\033[1m' + label)
    print(value)


In [None]:
# Count of books for each distinct User Rating value.

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8"
# and later removed the old name; use whichever name this installation has.
style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(style_name)

# Only one figure is created; the former extra 12x6 figure was never drawn on
# and only produced an empty output.
plt.figure(figsize=(20,20))
plt.subplot(221)
# Keyword `x=` keeps the call valid on seaborn releases where the first
# positional argument is `data`.
fund = sns.countplot(x=df["User Rating"], palette="magma", edgecolor='black', saturation=0.50)
fund.tick_params(axis='x', labelsize=10)
plt.title("COUNT OF RATINGS", fontsize=20)
# The x axis carries the rating values and the y axis the counts
# (the labels were previously swapped).
fund.set_xlabel("USER RATING", fontsize=20)
fund.set_ylabel("Counts", fontsize=20)
plt.show()

In [None]:
# Number of rows (bestseller entries) per author; print the first ten
# (author, count) pairs in value_counts order.
author_counts = dict(df['Author'].value_counts())
print(list(author_counts.items())[:10])

In [None]:
# Books whose User Rating equals 4.9 (described by the author as the maximum
# rating); `maxrating` is reused by later cells.
is_top_rated = df['User Rating'] == 4.9
maxrating = df[is_top_rated]

# How many such rows each author has, largest counts first (top 20 shown).
aumax = (
    maxrating
    .groupby(['Author'])
    .size()
    .reset_index(name="Count")
)
aumax.sort_values(by='Count', ascending=False).head(20)

In [None]:
# Row(s) with the highest number of reviews in the whole dataset.
# Author's observation: 'Where the Crawdads Sing' by Delia Owens has the
# maximum user reviews (87841).
most_reviews = df['Reviews'].max()
print(df[df['Reviews'] == most_reviews])

In [None]:
# Row(s) with the highest number of reviews among the 4.9-rated books.
# Author's observation: 'Oh, the Places You'll Go!' by Dr. Seuss has the most
# reviews (21834) among the highest-rated bestsellers.
top_rated_most_reviews = maxrating['Reviews'].max()
print(maxrating[maxrating['Reviews'] == top_rated_most_reviews])

In [None]:
# Price distribution of the 4.9-rated books, followed by the most common price.
# Author's observation: most of the books rated 4.9 are priced at 8.
plt.figure(figsize=(12,6))
# `distplot` is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is the documented replacement for its default output.
sns.histplot(maxrating['Price'], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Price Distribution Plot',fontsize=20)
plt.show()
maxrating['Price'].mode()

In [None]:
from wordcloud import WordCloud, STOPWORDS


In [None]:
# Word cloud of author names; names that occur often (including those of the
# 4.9-rated authors) appear larger.
imp_words = df['Author'].to_list()

# Join the names into plain text. str(list) would hand the Python list repr
# (brackets, quotes, backslash escapes) to the word-cloud tokenizer.
author_text = " ".join(str(name) for name in imp_words)

wordcloud = WordCloud(width = 1000, height = 500,
                background_color ='White',
                min_font_size = 15).generate(author_text)
plt.figure(figsize = (7,7))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the words used in book titles.
imp_words = df['Name'].to_list()

# Join the titles into plain text. str(list) would hand the Python list repr
# (brackets, quotes, backslash escapes) to the word-cloud tokenizer.
title_text = " ".join(str(title) for title in imp_words)

wordcloud = WordCloud(width = 1000, height = 500,
                background_color ='White',
                min_font_size = 15).generate(title_text)
plt.figure(figsize = (7,7))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# pip install autoviz
# pip install xlrd

In [None]:
import autoviz

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class


In [None]:
# Instantiate the AutoViz driver object; its AutoViz(...) method is invoked on
# the dataset CSV in a later cell.
av=AutoViz_Class()


In [None]:
# Run AutoViz directly on the CSV file (same path the DataFrame was read from).
# NOTE(review): verbose=0 presumably selects the least chatty output - confirm
# against the AutoViz documentation.
autoviz_source = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
autoviz_eda = av.AutoViz(autoviz_source, verbose=0)

**Predict User Rating of a Book**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics


# Replace the Genre labels with integer codes so the column can be used as a
# model feature. The column is overwritten in place: from this cell onward
# df['Genre'] holds integers, not the original strings.
LE = LabelEncoder()
df['Genre'] = LE.fit_transform(df['Genre'])

df.head()

In [None]:
# Feature columns used to predict the User Rating.
col = ["Reviews", "Price", "Year", "Genre"]

# Select the features by name instead of a positional slice (iloc[:, 3:8]),
# which silently depends on the CSV column order and ran past the last column.
X = df[col].values

# 1-D target: a single-column DataFrame makes some scikit-learn regressors
# warn that a column vector was passed where a 1-D array was expected.
y = df["User Rating"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline model: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

# Build the estimator and fit it on the training split in one step
# (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

In [None]:
# Predict the target for the held-out rows with the linear regression model.
Prediction=model.predict(X_test)

In [None]:
# Sanity check: dimensions of the prediction array for the test split.
Prediction.shape

In [None]:
# Root mean squared error of the linear regression predictions on the test set.
linear_mse = metrics.mean_squared_error(y_test, Prediction)
print('RMSE: ', np.sqrt(linear_mse))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the forest, and therefore its RMSE, is reproducible between
# runs (same seed as the train/test split).
m1 = RandomForestRegressor(random_state=42)
# np.ravel flattens a single-column target to 1-D, which avoids the
# column-vector warning; an already 1-D target is passed through unchanged.
m1.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the target for the held-out rows with the random forest
# (overwrites the previous model's predictions).
Prediction=m1.predict(X_test)

In [None]:
# Root mean squared error of the current predictions on the test set.
forest_mse = metrics.mean_squared_error(y_test, Prediction)
print('RMSE: ', np.sqrt(forest_mse))

In [None]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

# Fix the seed so repeated runs build the same ensemble (same seed as the
# train/test split).
m2 = GradientBoostingRegressor(random_state=42)
# np.ravel flattens a single-column target to 1-D, which avoids the
# column-vector warning; an already 1-D target is passed through unchanged.
m2.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the target for the held-out rows with the gradient boosting model
# (overwrites the previous model's predictions).
Prediction=m2.predict(X_test)

In [None]:
# Root mean squared error of the current predictions on the test set.
boosting_mse = metrics.mean_squared_error(y_test, Prediction)
print('RMSE: ', np.sqrt(boosting_mse))

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree. The seed makes tie-breaking between equally
# good splits deterministic, so the reported RMSE is reproducible.
dt = DecisionTreeRegressor(max_depth=11, random_state=42).fit(X_train , y_train)

In [None]:
# Predict the target for the held-out rows with the decision tree
# (overwrites the previous model's predictions).
Prediction=dt.predict(X_test)

In [None]:
# Root mean squared error of the current predictions on the test set.
tree_mse = metrics.mean_squared_error(y_test, Prediction)
print('RMSE: ', np.sqrt(tree_mse))

So According to RMSE,
Root Mean Square Error (RMSE) is the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are; RMSE is a measure of how spread out these residuals are. In other words, it tells you how concentrated the data is around the line of best fit.[(Source Google)](https://www.google.com/search?q=RMSE)<br>

We can see that the Decision Tree was the worst of the models at predicting User Rating. Linear Regression did quite well, but Gradient Boosting and Random Forest have much lower RMSE values, which shows that they predict User Rating better than the other two models. <br>
But we can still improve the results by removing outliers.


In [None]:
# general trend + Outlier
# One box plot per numeric column to show its spread and outliers. The data is
# passed as `x=` because newer seaborn treats the first positional argument as
# `data`.
for column in ("User Rating", "Reviews", "Price"):
    sns.boxplot(x=df[column])
    plt.show()

# User Rating per genre. By this point df['Genre'] has been label-encoded to
# integers, so comparing it with 'Fiction' / 'Non Fiction' selects no rows.
# Grouping on the column works for either representation; integer codes are
# decoded back to their names with the fitted encoder for the tick labels.
genre_labels = []
genre_ratings = []
for genre_key, ratings in df.groupby('Genre')['User Rating']:
    if isinstance(genre_key, str):
        genre_labels.append(genre_key)
    else:
        genre_labels.append(str(LE.inverse_transform([genre_key])[0]))
    genre_ratings.append(ratings.values)

plt.boxplot(genre_ratings)
# Boxes are drawn at positions 1..n; label them with the genre names.
plt.xticks(range(1, len(genre_labels) + 1), genre_labels)
plt.ylabel("User Rating")
plt.show()