In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing required libraries

from sklearn.model_selection import train_test_split                
from sklearn.tree import DecisionTreeClassifier                     
from sklearn.metrics import accuracy_score                          
from sklearn.metrics import classification_report                   
from sklearn import tree    
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import math
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from wordcloud import WordCloud , ImageColorGenerator
from PIL import Image

The points discussed :
1. Total number of unique books
2. Total number of unique authors
3. Percentage of Fictional and non-fictional books
4. Variations in books with respect to year
5. Distribution of books :
    1. wrt Price
    2. wrt Reviews
    3. wrt User Rating
6. First 10 Books with the Most Reviews
7. First 10 Books with the Highest user ratings
8. First 10 Authors with the Highest reviews
9. First 10 Authors with the Highest user ratings
10. Linear Regression Model
11. Calculation of errors (MAE,R2 score)
12. Decision Tree Classifier for Genre Prediction
13. Decision Tree Classifier for User Rating Prediction

# **1. Import Data**

In [None]:
# Load the bestsellers CSV from the Kaggle input directory into the raw frame `df`.
# Later cells copy `df` before encoding, so it keeps the original text columns.
df = pd.read_csv('../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv')

In [None]:
# Explore the data: print column names, dtypes and non-null counts
df.info()

In [None]:
# Count the distinct book titles (the same title can appear in several rows)
Books = df['Name'].nunique()
Books

In [None]:
# Count the distinct authors
Authors = df['Author'].nunique()
Authors

# **2. Data Interpretation**
*Checking Statistics, Null values if any, Total no of rows and columns*

In [None]:
# Number of missing values in each column (isna is the same check as isnull)
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# (number of rows, number of columns) of the raw frame
df.shape

# **3. Data Cleaning : Label Encoding**

In [None]:
# Label encoding, step 1 of 4: Genre.
# The result goes into a new frame (df1) so `df` keeps its original text labels.
from sklearn.preprocessing import LabelEncoder

Genre = LabelEncoder()
df1 = df.assign(Genre=Genre.fit_transform(df['Genre']))

In [None]:
# Label encoding, step 2 of 4: Author, written to a new frame (df2) built from df1.
from sklearn.preprocessing import LabelEncoder

Author = LabelEncoder()
df2 = df1.assign(Author=Author.fit_transform(df1['Author']))

In [None]:
# Label encoding, step 3 of 4: book Name, written to a new frame (df3) built from df2.
from sklearn.preprocessing import LabelEncoder

Name = LabelEncoder()
df3 = df2.assign(Name=Name.fit_transform(df2['Name']))

In [None]:
# Label encoding, step 4 of 4: User Rating.
from sklearn.preprocessing import LabelEncoder

df4 = df3.copy(deep = True)

# Use a dedicated encoder object. Binding it to `Name` would overwrite the
# encoder fitted on book titles, making it impossible to map title codes back
# to titles afterwards.
UserRating = LabelEncoder()

# NOTE: after this step df4['User Rating'] holds integer class codes (the index
# of each rating among the sorted distinct ratings), not the original rating.
# Use UserRating.inverse_transform(...) to recover the real ratings.
df4['User Rating'] = UserRating.fit_transform(df3['User Rating'])

# **4. Data Visualization**

In [None]:
# Pairwise correlations of the fully encoded frame, drawn as an annotated heatmap
corr1 = df4.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr1, annot=True, ax=ax)
plt.show()

In [None]:
# Genre-wise split: share of rows per genre, drawn as a pie chart
pie_df = df.Genre.value_counts().reset_index()
# Name the columns explicitly; the names reset_index() produces differ between pandas versions
pie_df.columns = ['Genre', 'count']
# 'lightgreen' (no space) is the CSS colour name; 'light green' is not a named colour
fig = px.pie(pie_df, values='count', names='Genre', title='Genre',
             color_discrete_sequence=['blue', 'lightgreen'])
fig.show()

In [None]:
# Number of rows per year.
# The column must be passed as x=...: on seaborn >= 0.12 the first positional
# parameter is `data`, so countplot('Year', data=df) raises a TypeError.
sns.countplot(x='Year', data=df)
plt.show()

In [None]:
#import seaborn as sns
#sns.set_theme(style="darkgrid")
#p = sns.load_dataset("df")
#ax = sns.countplot(x="Price", data=p)

In [None]:
# How much do the books cost?
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent histogram-plus-curve figure.
ax = sns.histplot(df["Price"], kde=True, stat="density", kde_kws=dict(cut=3))
ax.set_title("Distribution of Price")
plt.show()

In [None]:
# How many reviews do the books receive?
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent histogram-plus-curve figure.
ax = sns.histplot(df["Reviews"], kde=True, stat="density", kde_kws=dict(cut=3))
ax.set_title("Distribution of Reviews")
plt.show()

In [None]:
# How do people rate the books?
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent histogram-plus-curve figure.
ax = sns.histplot(df["User Rating"], kde=True, stat="density", kde_kws=dict(cut=3))
ax.set_title("Distribution of User Rating")
plt.show()

In [None]:
# Keep one row per title so a book listed in several years is counted once
df_new1 = df.drop_duplicates(subset=['Name'])

# Review count per book, largest first
highest_reviews = (
    df_new1.groupby('Name')[['Reviews']]
    .sum()
    .sort_values('Reviews', ascending=False)
)

# Horizontal bars for the ten most-reviewed books, biggest at the top
ax = highest_reviews.head(10).plot(kind='barh', color=['skyblue', 'blue'])
ax.figure.set_size_inches(8, 5)
ax.set_title('10 Books with the Most Reviews')
ax.invert_yaxis()
ax.set_xlabel('Number of Reviews')
ax.set_ylabel('Book');

In [None]:
# Keep one row per title so a book listed in several years is counted once
df_new1 = df.drop_duplicates(subset=['Name'])

# User rating per book, highest first
highest_rating = (
    df_new1.groupby('Name')[['User Rating']]
    .sum()
    .sort_values('User Rating', ascending=False)
)

# Horizontal bars for the ten highest-rated books, best at the top
ax = highest_rating.head(10).plot(kind='barh', color=['skyblue', 'blue'])
ax.figure.set_size_inches(8, 5)
ax.set_title('First 10 Books with the Highest ratings')
ax.invert_yaxis()
ax.set_xlabel('Ratings')
ax.set_ylabel('Book');

In [None]:
# Count each book once (the same title appears in several yearly lists).
# Deduplicating on 'Author' instead would keep a single row per author, so the
# sum below would only report the reviews of each author's first listed book.
df_new2 = df.drop_duplicates(subset=['Name'])

# Authors with the highest number of reviews: total over all of their unique books
highest_rating_auth = df_new2[['Author','Reviews']].groupby('Author').sum().sort_values('Reviews', ascending=False)

highest_rating_auth.iloc[:10].plot(kind='barh',color=['lightgreen', 'blue'])
plt.gcf().set_size_inches(8,5)
plt.title('First 10 Authors with the Highest reviews')
plt.gca().invert_yaxis()   # largest bar at the top
plt.xlabel('Reviews')
_ = plt.ylabel('Author Name')

In [None]:
# Count each book once (the same title appears in several yearly lists).
# Deduplicating on 'Author' instead would keep a single row per author, so the
# ranking would reflect only each author's first listed book.
df_new = df.drop_duplicates(subset=['Name'])

# Authors with the highest-rated books: average rating over each author's unique
# books. A sum would reward authors for having many books, not for high ratings.
highest_rating_auth = df_new[['Author','User Rating']].groupby('Author').mean().sort_values('User Rating', ascending=False)

highest_rating_auth.iloc[:10].plot(kind='barh',color=['lightgreen', 'blue'])
plt.gcf().set_size_inches(8,5)
plt.title('First 10 Authors with the Highest rating books')
plt.gca().invert_yaxis()   # best author at the top
plt.xlabel('User Rating')
_ = plt.ylabel('Author Name')

# **5. Data Modelling**

In [None]:
# Independent variables (features): every encoded column except the target
X = df4[['Name', 'Author', 'Reviews', 'Price', 'Year', 'Genre']].to_numpy(copy=True)

# Dependent variable (target): the label-encoded user rating
y = df4["User Rating"].to_numpy(copy=True)

In [None]:
# Display the feature matrix
X

In [None]:
# Display the target vector
y

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=100,
)

**Regression Model**

In [None]:
# Linear regression: fit on the training split, then predict the held-out rows
from sklearn import linear_model

reg = linear_model.LinearRegression().fit(X_train, y_train)
y_1 = reg.predict(X_test)

In [None]:
# Display the regression predictions for the test rows
y_1

In [None]:
# Mean absolute error of the regression on the test split
print(f"MAE: {mean_absolute_error(y_test, y_1)!s}")

In [None]:
# Coefficient of determination of the regression on the test split
print(f"R2_score: {r2_score(y_test, y_1)!s}")

**Decision Tree : To predict Genre**

In [None]:
# Features: every encoded column except Genre; target: the encoded Genre
X1 = np.array(df4[['Name', 'Author','Reviews', 'Price', 'Year', 'User Rating']])
y1 = np.array(df4[['Genre']])
from sklearn import tree

# Split BEFORE fitting so the tree never sees the rows it is later asked to
# predict. Fitting on all of X1 leaks the test rows into training and makes
# every later evaluation look better than it really is.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.30, random_state = 1)

# A shallow tree (max_depth=4) limits overfitting; random_state makes the fitted tree repeatable
tree1 = tree.DecisionTreeClassifier(max_depth=4, random_state=1)
tree1 = tree1.fit(X1_train, y1_train)

In [None]:
# 70/30 train/test split of the genre data with a fixed seed
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Accuracy on the held-out test split. Scoring on the full (X1, y1) mixes in
# training rows and overstates how well the tree generalises.
tree1.score(X1_test, y1_test)

In [None]:
# Predicted genre codes for the test rows (displayed as the cell output)
y1_predicted = tree1.predict(X1_test)
y1_predicted

**Decision Tree : To predict User Rating**

In [None]:
# Features: every encoded column except User Rating; target: the encoded User Rating
X2 = np.array(df4[['Name', 'Author','Reviews', 'Price', 'Year', 'Genre']])
y2 = np.array(df4[['User Rating']])
from sklearn import tree

# Split BEFORE fitting so the tree never sees the rows it is later asked to
# predict. Fitting on all of X2 leaks the test rows into training and makes
# every later evaluation look better than it really is.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.30, random_state = 100)

# A shallow tree (max_depth=4) limits overfitting; random_state makes the fitted tree repeatable
tree2 = tree.DecisionTreeClassifier(max_depth=4, random_state=100)
tree2 = tree2.fit(X2_train, y2_train)

In [None]:
# 70/30 train/test split of the user-rating data with a fixed seed
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.30,
    random_state=100,
)

In [None]:
# Accuracy on the held-out test split. Scoring on the full (X2, y2) mixes in
# training rows and overstates how well the tree generalises.
tree2.score(X2_test, y2_test)

In [None]:
# Predicted user-rating class codes for the test rows (displayed as the cell output)
y2_predicted = tree2.predict(X2_test)
y2_predicted

# **Conclusion**

1. Total unique Books : 351.
2. Total unique Authors : 248.
3. Average User Rating : 4.6.
4. Average Price : 13.10.
5. Min rating : 3.3 | Max rating : 4.9.
6. The most books appear in 2019 and the fewest in 2009.
7. 56.4% of the books are non-fiction, while 43.6% are fiction.
8. The decision tree classifier works well enough for Genre prediction, whereas it is not as accurate for User Rating prediction.
9. The linear regression model performs well in User Rating prediction.