In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Show floats with exactly three decimals in pandas output.
pd.set_option('display.float_format', lambda value: format(value, '.3f'))

import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Read the bestsellers CSV into `data` and preview the first rows.
csv_path = '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Display pandas' default summary statistics for the loaded table.
data.describe()

**Check for null values**

In [None]:
# Per column: does it contain at least one missing value?
data.isnull().any()

**Check for and remove duplicates**

In [None]:
# Report whether any row repeats an earlier row on the columns 'Name' through
# 'Reviews'. This is the same column range the de-duplication step keys on, so
# the answer shown here matches what will actually be removed.
data.loc[:, 'Name':'Reviews'].duplicated().any()

In [None]:
# Flag rows whose 'Name' .. 'Reviews' values repeat an earlier row, keep the
# unflagged rows, and renumber the index from zero.
is_repeat = data.loc[:, 'Name':'Reviews'].duplicated()
data = data.loc[~is_repeat].reset_index(drop=True)

## EDA

In [None]:
sns.set(rc={'figure.figsize': (16, 12)})

# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# drops non-numeric columns silently and raises on string data instead.
numeric_corr = data.select_dtypes(include='number').corr()

sns.heatmap(numeric_corr)
plt.show()

In [None]:
# Bar chart of 'Reviews' (seaborn's default aggregation) per 'User Rating' value.
sns.barplot(data=data, x='User Rating', y='Reviews')
plt.show()

In [None]:
# One stacked panel per variable: (column, panel title, overlay a KDE curve?).
panels = [
    ('User Rating', 'Rating distribution', True),
    ('Year', 'Books over year distribution', False),
    ('Price', 'Price distribution', True),
    ('Reviews', 'Reviews distribution', True),
]

fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(16, 24))

for panel_ax, (column, title, with_kde) in zip(axes, panels):
    sns.histplot(data=data, x=column, kde=with_kde, ax=panel_ax)
    panel_ax.title.set_text(title)

# Stretch the subplot area upward so the four panels get more vertical room.
plt.subplots_adjust(top=1.5)

plt.show()

In [None]:
# Observed rating range; used below to pad the y-axis by 0.1 on each side.
ratings = data['User Rating']
min_rating = ratings.min()
max_rating = ratings.max()

plt.gcf().set_size_inches(16, 10)
sns.boxplot(data=data, x='Year', y='User Rating')
plt.ylim(min_rating - 0.1, max_rating + 0.1)
plt.show()

In [None]:
# Split the table by genre; these two frames are reused by later cells.
group_1 = data[data['Genre'] == 'Fiction']
group_2 = data[data['Genre'] == 'Non Fiction']

fig, axes = plt.subplots(2, 3, figsize=(30, 12))

# Variable pairs shown column by column: (x variable, y variable).
kde_pairs = [
    ('User Rating', 'Price'),
    ('User Rating', 'Reviews'),
    ('Price', 'Reviews'),
]

for col, (x_name, y_name) in enumerate(kde_pairs):
    # Top row: joint density over all books.
    sns.kdeplot(x=x_name, y=y_name, data=data, ax=axes[0, col])

    # Bottom row: one density per genre, overlaid on the same axes.
    sns.kdeplot(x=x_name, y=y_name, data=group_1, color='red', ax=axes[1, col])
    sns.kdeplot(x=x_name, y=y_name, data=group_2, color='blue', ax=axes[1, col])

    # Build the legend from explicit proxy handles. Passing only a list of
    # labels makes matplotlib pair them with whichever artists come first on
    # the axes, which for contour plots is not one artist per genre.
    axes[1, col].legend(handles=[
        plt.Line2D([], [], color='red', label='Fiction'),
        plt.Line2D([], [], color='blue', label='Non Fiction'),
    ])

# Stretch the subplot area upward so both rows get more vertical room.
plt.subplots_adjust(top=1.5)

plt.show()

In [None]:
from scipy import stats

# Overlay the rating histograms of both genres. The label is attached to each
# plot so the legend entry is guaranteed to match that plot's colour instead
# of depending on the order in which artists were added to the axes.
sns.histplot(x='User Rating', data=group_1, kde=True, color='red', label='Fiction')
sns.histplot(x='User Rating', data=group_2, kde=True, color='blue', label='Non Fiction')
plt.legend()

# Significance level of the test. The conventional 5 % is used; the previous
# value of 0.5 would only reject H0 when the p-value is below one half.
alpha = 0.05

# Two-sample t-test (H0: both genres have the same mean rating). Passing the
# columns as Series makes `stat` and `pval` plain scalars rather than
# one-element arrays.
stat, pval = stats.ttest_ind(group_1['User Rating'], group_2['User Rating'])
plt.show()

print('Stat', f'{stat:.5f}')
print('P-Value:', f'{pval:.10f}')

if pval > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')

## Removing outliers

In [None]:
# Thresholds for the outlier filter: rows are kept when Reviews is at most
# MAX_REVIEWS and Price is strictly below PRICE_LIMIT.
MAX_REVIEWS = 40000
PRICE_LIMIT = 50

within_limits = (data['Reviews'] <= MAX_REVIEWS) & (data['Price'] < PRICE_LIMIT)

# .copy() makes the filtered frame independent of the original, so columns
# added to it afterwards do not trigger pandas' SettingWithCopy warning
# (which the global warnings filter would otherwise hide).
data = data.loc[within_limits].copy()

## Creating X and y

In [None]:
# pd.cut builds right-closed bins from these edges:
# (-1, 5], (5, 10], (10, 15], (15, 20], (20, 25], (25, inf).
price_edges = [-1, 5, 10, 15, 20, 25, np.inf]

# One label per bin. The last two labels previously read '20-25$' and '>26$',
# which overlapped the '16-20$' bin and left a price of 26 unnamed; they now
# describe the actual bins (20, 25] and (25, inf).
# NOTE(review): the 'a-b$' naming assumes whole-dollar prices - confirm.
price_labels = ['0-5$', '6-10$', '11-15$', '16-20$', '21-25$', '>25$']

# assign() returns a new frame instead of writing into `data` in place, so the
# cell can be re-run safely and never writes into a slice of another frame.
data = data.assign(**{
    'Price bins': pd.cut(data['Price'], bins=price_edges, labels=price_labels),
    'name_len': data['Name'].map(len),  # number of characters in the title
})

# Feature matrix: raw numeric columns plus one-hot genre and price-bin columns.
X = pd.concat(
    [
        data[['Reviews', 'Price', 'Year', 'name_len']],
        pd.get_dummies(data['Genre']),
        pd.get_dummies(data['Price bins']),
    ],
    axis=1,
)
# Target: the book's user rating.
y = data['User Rating']

X.head()

## Scaler and Grid Search

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 20 % of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the per-feature min/max on the training rows only, then apply that
# same scaling to both partitions (both become NumPy arrays).
sc = MinMaxScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 6 * 4 * 4 * 4 * 5 = 1920 candidates, each fitted on
# 3 folds. 'oob_score' is deliberately not searched: it only adds an
# out-of-bag score attribute after fitting and does not change the fitted
# trees, so including it merely doubled the number of fits.
params = {
    'n_estimators': [3, 5, 7, 10, 12, 15],
    'max_depth': [3, 5, 7, 15],
    'min_samples_split': [3, 5, 7, 10],
    'min_samples_leaf': [3, 5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10, 12],
}

# A fixed seed makes the forests, and therefore the selected best model,
# reproducible across runs (same seed value as the train/test split).
rfr_model = RandomForestRegressor(n_jobs=-1, random_state=42)

# Exhaustive search scored by negative MSE with 3-fold cross-validation.
grid = GridSearchCV(
    rfr_model,
    params,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [None]:
# Display the estimator that the grid search selected as best.
grid.best_estimator_

## Evaluation and Visualising results

In [None]:
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword is deprecated in scikit-learn 1.4
    and removed in 1.6, whereas this form works on every version.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Compare the error on seen (train) and unseen (test) rows.
print('Train Data: {:.10f}'.format(rmse(y_train, grid.best_estimator_.predict(X_train))))
print('Test Data:  {:.10f}'.format(rmse(y_test, grid.best_estimator_.predict(X_test))))

In [None]:
# Predictions of the selected model on the held-out rows; used by the plots below.
predicted = grid.best_estimator_.predict(X_test)

In [None]:
fig, ax = plt.subplots(1, figsize=(100, 12))

# First feature column of the scaled test matrix ('Reviews' after scaling).
reviews_scaled = X_test[:, 0]

# x/y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. The labels are attached to the plots themselves, so the legend
# does not depend on the order in which artists were added.
sns.scatterplot(x=reviews_scaled, y=y_test, alpha=0.3, s=1000, color='blue', label='Real', ax=ax)
sns.scatterplot(x=reviews_scaled, y=predicted, alpha=0.3, s=1000, color='red', label='Predicted', ax=ax)
ax.legend()
ax.set_xlabel('Reviews ratio')

for line in range(X_test.shape[0]):
    x_pos = X_test[line, 0]
    real = y_test.iloc[line]
    pred = predicted[line]

    # Numbered bubbles: the same index is written on the real and the
    # predicted marker so the pair can be matched by eye.
    ax.text(x_pos, real - 0.005, str(line), horizontalalignment='center',
            size='medium', color='blue', weight='semibold', alpha=0.7)
    ax.text(x_pos, pred - 0.005, str(line), horizontalalignment='center',
            size='medium', color='red', weight='semibold', alpha=0.7)

    # Vertical segment joining real and predicted rating. Drawn with ax.plot:
    # sns.lineplot aggregates observations sharing an x value, so two points
    # at the same x collapse into one and no segment is drawn.
    ax.plot([x_pos, x_pos], [real, pred], alpha=0.3)

    # Real/predicted values written around the midpoint of the segment, the
    # lower of the two first. Local names avoid shadowing the built-in
    # min()/max() for the rest of the notebook.
    midpoint = (real + pred) / 2
    if real <= pred:
        ax.text(x_pos, midpoint - 0.02, real, color='blue')
        ax.text(x_pos, midpoint + 0.02, '/{:.2f}'.format(round(pred, 2)), color='red')
    else:
        ax.text(x_pos, midpoint - 0.02, '{:.2f}/'.format(round(pred, 2)), color='red')
        ax.text(x_pos, midpoint + 0.02, real, color='blue')

plt.show()

**Same visualisation, less detailed**

In [None]:
fig, ax = plt.subplots(1, figsize=(29, 12))

# x/y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments. Each plot carries its own label, so the legend cannot end up
# with 'Real' and 'Predicted' swapped when the artist ordering changes.
sns.scatterplot(x=X_test[:, 0], y=y_test, s=100, color='blue', label='Real', ax=ax)
sns.regplot(x=X_test[:, 0], y=predicted, scatter_kws={'s': 100}, lowess=True,
            color='red', label='Predicted', ax=ax)
ax.legend()
ax.set_xlabel('Reviews ratio')
plt.show()