In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the Kaggle input mount, then echo them one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the bestsellers CSV inside the read-only Kaggle input mount.
dataset_path = (
    '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/'
    'bestsellers with categories.csv'
)
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows ordered by year, then by title.
df.sort_values(by=['Year', 'Name']).head(n=5)

# Check and drop duplicates

In [None]:
# Count how many rows each title has; any count above 1 means the title appears more than once.
df['Name'].value_counts()

In [None]:
# Inspect all rows of one repeated title.
df.loc[df['Name'].eq('StrengthsFinder 2.0')]

In [None]:
# Keep one row per title (the last occurrence in file order).
# Rebinding instead of `inplace=True` avoids mutating the frame that earlier
# cells already displayed.
# NOTE(review): this discards the other rows of any repeated title, so later
# per-year views only count each book once - confirm this is intended.
df = df.drop_duplicates(subset='Name', keep='last')
df.info()

# EDA

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline


# Distribution of the target variable. Series.hist returns the Axes it drew on,
# which lets us label the figure so it stands on its own.
ax = df['User Rating'].hist(bins=50)
ax.set(title='Distribution of User Rating', xlabel='User Rating', ylabel='Number of books')
plt.show()

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(16, 10))

# Continuous features: binned histograms.
for ax, column in zip(axs[0], ['Reviews', 'Price']):
    ax.hist(df[column], bins=50)
    ax.set(title='Distribution of ' + column, xlabel=column, ylabel='Number of books')

# Discrete / categorical features: one bar per distinct value. A 50-bin
# histogram over a handful of categories only produces thin, gappy bars.
for ax, column in zip(axs[1], ['Year', 'Genre']):
    counts = df[column].value_counts().sort_index()
    ax.bar(counts.index.astype(str), counts.values)
    ax.set(title='Books per ' + column, xlabel=column, ylabel='Number of books')

fig.tight_layout()
plt.show()


In [None]:
# Reviews and Price live on very different scales, so each gets its own x-axis;
# the y-axis (User Rating) is shared so the two panels stay comparable.
fig, axs = plt.subplots(1, 2, figsize=(16, 4), sharey=True)

axs[0].scatter('Reviews', 'User Rating', data=df, color='r')
axs[0].set(xlabel='Reviews', ylabel='User Rating', title='User Rating vs. Reviews')

axs[1].scatter('Price', 'User Rating', data=df, color='b')
axs[1].set(xlabel='Price', title='User Rating vs. Price')

plt.show()

In [None]:
# Boolean masks selecting each genre.
Fiction = df['Genre'] == 'Fiction'
Non_Fiction = df['Genre'] == 'Non Fiction'

# Compare the rating distribution of the two genres. `.loc[mask, column]`
# replaces chained indexing (`df[mask][column]`).
plt.boxplot([df.loc[Fiction, 'User Rating'], df.loc[Non_Fiction, 'User Rating']])
# boxplot places the boxes at x = 1, 2, ...; name them so the groups are identifiable.
plt.xticks([1, 2], ['Fiction', 'Non Fiction'])
plt.ylabel('User Rating')
plt.title('User Rating by Genre')
plt.show()

# Generate dummies

In [None]:
# One-hot encode Year and Genre; the first level of each is dropped and acts as the baseline.
df_dummies = pd.get_dummies(
    data=df,
    columns=['Year', 'Genre'],
    drop_first=True,
)
df_dummies.head()

In [None]:
# Show the column names of the one-hot encoded frame.
df_dummies.columns

# More EDA

In [None]:
import seaborn as sns 

# Numeric columns to correlate: target, raw features, then the year and genre dummies.
year_dummy_cols = ['Year_' + str(year) for year in range(2010, 2020)]
column_list = ['User Rating', 'Reviews', 'Price'] + year_dummy_cols + ['Genre_Non Fiction']

corr_matrix = df_dummies.loc[:, column_list].corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

The correlations do not seem to show any notable relationships.

# Model 1: Simple Linear Regression

I'm choosing Reviews as the independent variable (with User Rating as the target), even though the correlation is almost non-existent (-0.056). This is based on business intuition. Judging from the chart, Year looks like a better candidate. But realistically, do we really believe that there is an intrinsic relationship between user rating and the year the book was published?

In [None]:
# Single-feature design matrix and target, both as (n_samples, 1) arrays.
X0 = df[['Reviews']].to_numpy()
y0 = df[['User Rating']].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from scipy import stats

In [None]:
def _plot_regression_panel(ax, X_part, y_true, y_hat, draw_line, xlabel, ylabel, title):
    """Plot actual targets and model output against the first feature on `ax`."""
    # Accept both DataFrames and NumPy arrays; only the first feature is used as x.
    if hasattr(X_part, 'iloc'):
        first_feature = X_part.iloc[:, 0]
    else:
        first_feature = np.asarray(X_part)[:, 0]

    ax.scatter(first_feature, np.ravel(y_true))
    if draw_line:
        # With one feature the predictions lie on a straight line, so draw it.
        ax.plot(first_feature, np.ravel(y_hat), color='r')
    else:
        # With several features the predictions do not form a line in this
        # 2-D projection, so show them as points instead.
        ax.scatter(first_feature, np.ravel(y_hat), color='r')
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)


def run_lr(X, y):
    """
    Fit a linear regression on a 70/30 train/test split, report test metrics and plot the fit.

    Prints the test-set RMSE and R squared as "<RMSE>,<R_squared>: RMSE, R_squared"
    and shows a two-panel figure: test data on the left, train data on the right.
    With a single feature the fitted line is drawn; with several features the
    predictions are drawn as red points against the first feature.

    Args:
        X: 2-D feature matrix (n_samples, n_features), as a DataFrame or NumPy array.
        y: Target values, 1-D or of shape (n_samples, 1).

    Returns:
        None. Results are printed and plotted only.

    Raises:
        ValueError: If X is not 2-dimensional.
    """
    if np.ndim(X) != 2:
        raise ValueError('X must be 2-dimensional (n_samples, n_features); '
                         'use .reshape(-1, 1) for a single feature')

    # Train and fit (fixed seed so the split is reproducible):
    reg = LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    y_pred_train = reg.predict(X_train)

    # Regression scores on the held-out test set:
    R_squared = reg.score(X_test, y_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

    print(str(RMSE) +','+str(R_squared) + ': RMSE, R_squared')

    # Plot test (left) and train (right) panels.
    fig, axs = plt.subplots(1, 2, figsize=(16, 4))
    single_feature = X.shape[1] == 1
    title_prefix = 'Regression line' if single_feature else 'Predicted values'

    _plot_regression_panel(axs[0], X_test, y_test, y_pred, single_feature,
                           'X_test', 'y_pred', title_prefix + ' on test data')
    _plot_regression_panel(axs[1], X_train, y_train, y_pred_train, single_feature,
                           'X_train', 'y_pred_train', title_prefix + ' on train data')

    plt.show()


run_lr(X0, y0)

# Remove outliers

In [None]:
# Flag review counts more than three standard deviations above the mean as outliers.
reviews = df['Reviews']
outlier_limit = reviews.mean() + 3 * reviews.std()

# Histogram of review counts with the cut-off marked in red.
fig, ax = plt.subplots()
ax.hist(reviews, bins=100)
ax.axvline(x=outlier_limit, color='r')
plt.show()

In [None]:
# Keep only the rows whose review count is at or below the outlier cut-off.
within_limit = df_dummies['Reviews'].le(outlier_limit)
df_no_outlier = df_dummies.loc[within_limit]

In [None]:
# Reviews -> User Rating on the outlier-free data, both as (n_samples, 1) arrays.
X1 = df_no_outlier[['Reviews']].to_numpy()
y1 = df_no_outlier[['User Rating']].to_numpy()
print(len(X1), len(y1))

In [None]:
# Re-run the single-feature regression (Reviews -> User Rating) on the outlier-free data.
run_lr(X1, y1)

How about Price as the independent variable?

In [None]:
# Price -> User Rating on the outlier-free data, both as (n_samples, 1) arrays.
X2 = df_no_outlier[['Price']].to_numpy()
y2 = df_no_outlier[['User Rating']].to_numpy()
print(len(X2), len(y2))

In [None]:
# Single-feature regression with Price as the predictor of User Rating.
run_lr(X2, y2)

Now Genre (fiction vs. non-fiction) as the independent variable:

In [None]:
# Genre dummy -> User Rating on the outlier-free data, both as (n_samples, 1) arrays.
X3 = df_no_outlier[['Genre_Non Fiction']].to_numpy()
y3 = df_no_outlier[['User Rating']].to_numpy()
run_lr(X3, y3)

# Model 2: Multiple Linear Regression

In [None]:
# Target as an (n_samples, 1) array.
y = df_no_outlier[['User Rating']].to_numpy()

# Dummy variables to add one at a time, on top of the two base features.
full_var_list = ['Year_' + str(year) for year in range(2010, 2020)] + ['Genre_Non Fiction']
main_list = ['Reviews', 'Price']

# Test-set metrics collected after each added variable.
R2, rMSE = [], []

for var in full_var_list:
    # Grow the feature set by one variable per iteration.
    main_list += [var]
    X = df_no_outlier[main_list]

    # Same seeded 70/30 split on every iteration, so results are comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    y_pred_train = reg.predict(X_train)

    # Record the test-set scores for the summary chart.
    R2.append(reg.score(X_test, y_test))
    rMSE.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print('Add ' + var + ' to the model')

    # Actual (default colour) vs. predicted (red), plotted against the first feature.
    fig, axs = plt.subplots(1, 2, figsize=(16, 4))
    panels = [
        (axs[0], X_test, y_test, y_pred, 'X_test', 'y_pred', 'Predicted values on test data'),
        (axs[1], X_train, y_train, y_pred_train, 'X_train', 'y_pred_train', 'Predicted values on train data'),
    ]
    for ax, features, actual, predicted, x_label, y_label, panel_title in panels:
        first_feature = features.iloc[:, 0]
        ax.scatter(first_feature, actual)
        ax.scatter(first_feature, predicted, color='r')
        ax.set(xlabel=x_label, ylabel=y_label, title=panel_title)

    plt.show()

In [None]:
# Test-set R squared and RMSE after each variable is added to the model.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(full_var_list, R2, color='r', label='R Squared')
ax.plot(full_var_list, rMSE, color='b', label='RMSE')
ax.legend()
ax.set_title('Impact of adding variables to model', fontsize=16)
plt.show()

Looking at the chart above, let's just add all the variables to the model.

# Model 3: All variables, with feature scaling

In [None]:
# Take another look at the data: the boxplot puts the column ranges side by side.
fig, ax = plt.subplots(figsize=(16, 4))
df_no_outlier.boxplot(ax=ax)
plt.show()

In [None]:
# Summary statistics of the outlier-free frame.
df_no_outlier.describe()

In [None]:
# Select the target and features by name rather than by position
# (`iloc[:, 3:]`), so a reordered or extra column cannot leak the target or a
# text column into the feature matrix.
y = df_no_outlier['User Rating']

# Features: the two raw numeric columns followed by every Year_/Genre_ dummy.
# 'Reviews' stays first because a later plot uses column 0 as the x-axis.
dummy_cols = [col for col in df_no_outlier.columns if col.startswith(('Year_', 'Genre_'))]
X = df_no_outlier[['Reviews', 'Price'] + dummy_cols]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then fit an ordinary least-squares model.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('Ln', LinearRegression()),
])

# Same seeded 70/30 split used by the earlier models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Pipeline.fit returns the fitted pipeline itself, so both names refer to the same object.
ln_scaled = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Test-set metrics.
R_squared = ln_scaled.score(X_test, y_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

print(str(RMSE) + ',' + str(R_squared) + ': RMSE, R_squared')

In [None]:
# Predicted vs. actual ratings on the test set, plotted against the first feature column.
fig, ax = plt.subplots(figsize=(16, 4))
first_feature = X_test.iloc[:, 0]
ax.scatter(first_feature, y_pred, color='#ff6037', alpha=0.8, label='Predicted')
ax.scatter(first_feature, y_test, marker='x', label='Actual')
ax.set_title('Visual Regression result')
ax.set_xlabel('Num of Reviews')
ax.set_ylabel('User Rating')
ax.legend()
plt.show()

# Model 4: Tree

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Hyper-parameter candidates, addressed through the pipeline step name 'xgb_model'.
a = {'xgb_model__n_estimators': [50, 100, 200], 'xgb_model__max_depth': [2, 5]}

steps = [('scaler', StandardScaler()), ('xgb_model', xgb.XGBRegressor())]

pipeline = Pipeline(steps)

# n_iter=5 samples 5 of the 6 candidate combinations at random, so the search
# is seeded (same seed as the train/test splits above) to make the chosen
# model reproducible across runs.
randomized_rmse = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=a,
    n_iter=5,
    scoring={'MSE': 'neg_mean_squared_error', 'R_squared': 'r2'},
    refit='MSE',  # pick the best candidate by (negative) mean squared error
    cv=10,
    verbose=1,
    random_state=1,
)

# NOTE(review): the search is cross-validated on the full X, y; no separate
# hold-out set is kept, so the score is not directly comparable to the
# test-split metrics of the linear models.
randomized_rmse.fit(X, y)

print(randomized_rmse.best_estimator_)
print(randomized_rmse.best_score_)
# best_score_ is the mean cross-validated *negative* MSE (the refit metric);
# convert it to an RMSE so it reads on the same scale as the earlier models.
print('Cross-validated RMSE: ' + str(np.sqrt(-randomized_rmse.best_score_)))