# Our goal is to predict which rating users will give to a book
## Importing data

In [None]:
# Important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the bestsellers CSV from the Kaggle input directory and preview the first rows
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv"
)
df.head()

In [None]:
# Print column names, dtypes and non-null counts
df.info()

In [None]:
# Largest number of missing values found in any single column (0 means nothing is missing)
df.isna().sum().max()

In [None]:
# No missing data
# Summary statistics of the numeric columns
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

# Exploratory data analysis

In [None]:
# The book title is not used as a feature here, so the 'Name' column is removed
df.drop(columns=['Name'], inplace=True)
df.head()

In [None]:
# Number of distinct authors
df['Author'].nunique()

In [None]:
# 248 distinct authors for 550 books is roughly 2 books per author, which is not a lot,
# so many authors probably appear only once; check the median number of appearances per author
df.groupby('Author').size().median()

In [None]:
# The median above means that a lot of authors have only one book in the list.
# Build an (Author, Occurrences) lookup table and show the authors with more than one occurrence.
authors = df['Author'].value_counts()
authors_data = authors.rename_axis('Author').reset_index(name='Occurrences')
authors_data.loc[authors_data['Occurrences'].gt(1)]

In [None]:
# We see that 118 authors have more than one book on the list and 130 authors have only one book
# I will change the author column to reflect that fact
def has_more_than_one_book(author, authors_data):
    """Label an author by how often they appear in the occurrence table.

    Parameters
    ----------
    author : str
        Author name to look up.
    authors_data : pandas.DataFrame
        Lookup table with an 'Author' column and a numeric 'Occurrences' column.

    Returns
    -------
    str
        'One Book' when the author's occurrence count equals 1,
        otherwise 'More than one book'.

    Raises
    ------
    KeyError
        If ``author`` does not appear in ``authors_data``.
    """
    occurrences = authors_data.loc[authors_data['Author'] == author, 'Occurrences']
    if occurrences.empty:
        raise KeyError(f"author {author!r} not found in authors_data")
    # Pull the scalar out explicitly: calling int() on a one-element Series
    # is deprecated since pandas 2.1.
    if int(occurrences.iloc[0]) == 1:
        return 'One Book'
    return 'More than one book'
    
# Replace every author name with its 'One Book' / 'More than one book' label
df['Author'] = df['Author'].map(lambda name: has_more_than_one_book(name, authors_data))
df.head()

In [None]:
# Check whether the author label affects the user rating: summary statistics per label
df.groupby('Author')['User Rating'].describe()

In [None]:
# Distribution of user ratings for each author label
sns.boxplot(data=df, x='Author', y='User Rating')

In [None]:
# We can see that only authors that got more than one book on the list got a rating lower than 4.0
# Histogram of the number of reviews per book (30 bins)
df['Reviews'].hist(bins=30)

In [None]:
# Most reviews are within range 0-30000
# Mean of every numeric column for each user rating value.
# numeric_only=True skips the text columns (e.g. 'Author'); without it
# pandas >= 2.0 raises TypeError when it tries to average strings.
average_reviews = df.groupby('User Rating').mean(numeric_only=True)
average_reviews

In [None]:
# Bar chart of the mean number of reviews per user rating, in ascending order
plt.figure(figsize=(12, 6))
average_reviews['Reviews'].sort_values().plot.bar()

In [None]:
# We can see that there are a ton of reviews for 3.8 user rating and a lot for 3.9 and 4.1 user rating
# Line plot of Reviews against User Rating over the full data frame
sns.lineplot(data=df, x='User Rating', y='Reviews')

In [None]:
# Bar chart of the mean price per user rating, in ascending order
plt.figure(figsize=(12, 6))
average_reviews['Price'].sort_values().plot.bar()

In [None]:
# The highest price is for books that have 4.5 rating and lowest for books that have 4.9 rating
# Line plot of Price against User Rating over the full data frame
sns.lineplot(data=df, x='User Rating', y='Price')

In [None]:
# Bar chart of the mean publication year per user rating, in ascending order;
# the y-axis is restricted to 2011-2017
plt.figure(figsize=(12, 6))
average_reviews['Year'].sort_values().plot.bar(ylim=(2011, 2017))

In [None]:
# We can see that the oldest books have the rating of 4.2 and the newest (highest average year) around 4.9
# Line plot of Year against User Rating over the full data frame
sns.lineplot(data=df, x='User Rating', y='Year')

In [None]:
# Number of books in each genre
df['Genre'].value_counts()

In [None]:
# Both genres are represented by a similar number of books.
# Select the rating column before averaging: calling mean() on the whole grouped
# frame raises TypeError in pandas >= 2.0 because the text 'Author' column
# cannot be averaged.
df.groupby('Genre')['User Rating'].mean()

In [None]:
# Fiction novels tend to have a higher rating than non-fiction ones
# Distribution of user ratings for each genre
sns.boxplot(data=df, x='Genre', y='User Rating')

In [None]:
# Looking at the boxplot, we can see that fiction books have much better ratings overall, but they also
# include some ratings that are very low compared with non-fiction books

# Training Data

In [None]:
# The models need numeric inputs, so each categorical column is replaced by its
# dummy (indicator) columns; drop_first removes the redundant first level.
for column in ('Author', 'Genre'):
    data = pd.get_dummies(df[column], drop_first=True)
    df = pd.concat([df, data], axis=1).drop(columns=[column])

df.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the user rating; every remaining column is a feature
y = df['User Rating']
X = df.drop(columns=['User Rating'])

# Hold out 33% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

In [None]:
# We will use a few different machine learning algorithms and check which one of them returns the best results

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Fit the linear model on the training split, then predict ratings for the test split
lm = LinearRegression().fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
# Display the linear regression predictions for the test split
predictions

In [None]:
# Scatter the predicted ratings against the real ones to judge visually
# how well linear regression recovered the user ratings
ax = sns.scatterplot(x=y_test, y=predictions)
ax.set_xlabel('Real values')
ax.set_ylabel('Predictions')
plt.show()

In [None]:
# The scatter plot shows no clear linear relationship between predictions and real values,
# so quantify the error with the root mean squared error on the test split
RMSE_linear_regression = np.sqrt(
    metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
)
print('RMSE: ', RMSE_linear_regression)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit a decision tree (fixed seed) on the training split, then predict the test split
dtree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
predictions = dtree.predict(X_test)

In [None]:
# Display the decision tree predictions for the test split
predictions

In [None]:
# Root mean squared error of the decision tree on the test split
RMSE_decision_tree = np.sqrt(
    metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
)
print('RMSE: ', RMSE_decision_tree)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest (fixed seed) on the training split, then predict the test split
rtree = RandomForestRegressor(random_state=42).fit(X_train, y_train)
predictions = rtree.predict(X_test)

In [None]:
# Display the random forest predictions for the test split
predictions

In [None]:
# Root mean squared error of the random forest on the test split
RMSE_random_forest = np.sqrt(
    metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
)
print('RMSE: ', RMSE_random_forest)

# Gradient Boosting Tree

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Fit gradient boosting (fixed seed) on the training split, then predict the test split
gbtree = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
predictions = gbtree.predict(X_test)

In [None]:
# Display the gradient boosting predictions for the test split
predictions

In [None]:
# Root mean squared error of gradient boosting on the test split
RMSE_boosting_tree = np.sqrt(
    metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
)
print('RMSE: ', RMSE_boosting_tree)

# Summary of the machine learning models

In [None]:
# Number of predictions, i.e. one per row of the test split
# (predictions currently holds the gradient boosting output)
len(predictions)

In [None]:
# Collect the test RMSE of every model into one table for comparison
RMSE_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting Tree'],
    'RMSE': [RMSE_linear_regression, RMSE_decision_tree, RMSE_random_forest, RMSE_boosting_tree],
})
RMSE_df

In [None]:
# Bar chart of RMSE per model, best (lowest) first.
# The y-axis is restricted to 0.2-0.3, so bar heights are not drawn from zero.
plt.figure(figsize=(12, 6))
sns.barplot(data=RMSE_df, x='Model', y='RMSE', order=RMSE_df.sort_values('RMSE')['Model'])
plt.ylim(0.2, 0.3)

### We can see that the decision tree was the worst model at predicting user ratings
### Linear regression is the second worst (although it did quite well for a simple machine learning algorithm)
### Random Forest and Gradient Boosting Tree were the best in predicting the user rating

In [None]:
# Final ranking: order the models by RMSE (lowest first) and renumber the rows
RMSE_df = RMSE_df.sort_values('RMSE').reset_index(drop=True)
RMSE_df