## Project Name: Amazon Top 50 Bestselling Books 2009 - 2019

The main goal of this project is to perform exploratory data analysis (EDA) on the dataset and to predict each book's user rating.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the bestsellers CSV into a DataFrame.
dataset = pd.read_csv(
    '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
)

# Report (rows, columns) as a quick sanity check on the load.
print(dataset.shape)

In [None]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

In [None]:
# Summarize the DataFrame: column names, dtypes and non-null counts.
dataset.info()

## Missing Values

In [None]:
# Columns that contain at least one missing value.
# (The threshold is > 0 so that a column with exactly one NaN is not skipped.)
features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]

# For each such column, print the share of missing entries as a percentage.
# isnull().mean() is the fraction of missing rows; it must be *called* and
# scaled by 100 to match the '% missing values' label.
for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean() * 100, 4), '% missing values')

From the step above, we can conclude that there are no missing values in any of the features.

## Numerical Variables

In [None]:
# Every column whose dtype is not object ('O') is treated as numerical.
numerical_features = [
    column
    for column in dataset.columns
    if dataset[column].dtype != 'O'
]

print(
    'There are {} numerical columns in the dataset'.format(
        len(numerical_features)
    )
)

# Preview only the numerical columns.
dataset[numerical_features].head()


## Categorical Variables

In [None]:
# Every column with object ('O') dtype is treated as categorical.
categorical_features = [
    column
    for column in dataset.columns
    if dataset[column].dtype == 'O'
]

# Preview only the categorical columns.
dataset[categorical_features].head()

In [None]:
# Report how many distinct values each categorical column holds.
for feature in categorical_features:
    distinct_values = dataset[feature].unique()
    print(
        'The feature {} has {} number of categories'.format(
            feature, len(distinct_values)
        )
    )

### Relationship between Genre and dependent variable User Rating

In [None]:
# Bar chart of the median User Rating within each Genre.
dataset.groupby('Genre')['User Rating'].median().plot.bar()
plt.xlabel('Genre')
plt.ylabel('User Rating')
# Use an explicit title: the previous `plt.title(feature)` relied on a loop
# variable leaked from an earlier cell, so the title depended on hidden state.
plt.title('Median User Rating by Genre')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numerical columns.
sns.heatmap(dataset[numerical_features].corr(), annot=True)

In [None]:
# Scatter-plot matrix over the numerical columns.
sns.pairplot(dataset[numerical_features])

In [None]:
# Draw one kernel density estimate per numerical column, each on its own figure.
for feature in numerical_features:
    plt.figure()
    # `fill=True` replaces the deprecated `shade=True` argument of kdeplot.
    sns.kdeplot(data = dataset[feature], fill = True, color = 'seagreen')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Genre labels with integer codes so the column can be used as a
# model input; the fitted encoder is kept in `lencoder`.
lencoder = LabelEncoder()
dataset['Genre'] = lencoder.fit_transform(dataset['Genre'])

# Preview the frame with the encoded Genre column.
dataset.head()

In [None]:
# Predictor matrix: the Reviews, Genre, Price and Year columns.
X = dataset[[
    'Reviews',
    'Genre',
    'Price',
    'Year',
]]

# Target vector: the User Rating column.
y = dataset['User Rating']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [None]:
# Shapes of the predictor splits.
print('Shape of X_train is: ', X_train.shape)
print('Shape of X_test is: ', X_test.shape)

# Shapes of the target splits.
print('Shape of y_train is: ', y_train.shape)
print('Shape of y_test is: ', y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
lmodel = LinearRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
lmodel

In [None]:
# Predict the target for the held-out test predictors using the fitted model.
y_predicted = lmodel.predict(X_test)

In [None]:
from sklearn import metrics

# MSE between the held-out targets and the predictions.
mean_square_error = metrics.mean_squared_error(y_test, y_predicted)
# RMSE is the square root of the MSE.
root_mean_square_error = np.sqrt(mean_square_error)

print('Mean Squared Error is: ', mean_square_error)
print('Root Mean Squared Error is: ', root_mean_square_error)

In [None]:
# Square canvas for the predicted-vs-actual scatter plot.
plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_predicted, c='teal', marker='+')

# Endpoints of the identity line: the smallest and largest value found in
# either the predictions or the actual targets.
p2 = min(min(y_predicted), min(y_test))
p1 = max(max(y_predicted), max(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')

plt.xlabel('Actuals', fontsize=12)
plt.ylabel('Predictions', fontsize=12)
# Use the same scale on both axes so the identity line sits at 45 degrees.
plt.axis('equal')
plt.show()