In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#imports
import seaborn as sns
# Use the fully qualified option name: the bare pattern 'max_rows' also matches
# 'styler.render.max_rows' on recent pandas and raises OptionError.
pd.set_option('display.max_rows', 1000)
# Default figure size (in inches) for every seaborn/matplotlib plot below.
sns.set(rc={"figure.figsize": (15, 10)})
import warnings
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.simplefilter("ignore")
from sklearn.preprocessing import LabelEncoder,StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

In [None]:
# Load the bestsellers dataset from the Kaggle input mount and preview the first rows.
DATA_PATH = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

# EDA

Let's look at the correlation between the numeric features.

In [None]:
# Correlate numeric columns only: the frame also holds string columns
# (Name, Author, Genre), and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Number of rows per author, most frequent first.
df['Author'].value_counts()

In [None]:
# Number of rows per genre, most frequent first.
df['Genre'].value_counts()

In [None]:
# Distribution of user ratings: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; this is the histplot equivalent from seaborn's
# migration notes (default bin selection may differ slightly).
sns.histplot(df['User Rating'], kde=True, stat='density')

In [None]:
# Average user rating for each genre.
sns.barplot(data=df, x='Genre', y='User Rating', palette='rocket_r')

User ratings for both Genres are pretty much evened out. Everyone has their taste.

In [None]:
# How many times each title appears in the dataset, most frequent first.
df['Name'].value_counts()

In [None]:
# Show every row recorded for one frequently repeated title.
apa_manual_title = 'Publication Manual of the American Psychological Association, 6th Edition'
df[df['Name'] == apa_manual_title]

Some books are listed multiple times in the dataset; the only difference is the year in which they were featured as a bestseller.

In [None]:
# Show every row recorded for a second repeated title.
places_title = "Oh, the Places You'll Go!"
df[df['Name'] == places_title]

In [None]:
# Distinct author names, in order of first appearance.
df["Author"].unique()

In [None]:
# Number of rows per year.
df['Year'].value_counts()

Let's do some visualizations.

In [None]:
# Average user rating for each year.
sns.barplot(data=df, x='Year', y='User Rating', palette='rocket_r')

In [None]:
# Spread of user ratings within each year.
sns.boxplot(data=df, x='Year', y='User Rating', palette='magma_r')

In [None]:
# Distribution of review counts: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; this is the histplot equivalent from seaborn's
# migration notes (default bin selection may differ slightly).
sns.histplot(df['Reviews'], kde=True, stat='density')

In [None]:
# Average number of reviews for each year.
sns.barplot(data=df, x='Year', y='Reviews', palette='rocket_r')

Most reviews were recorded in 2014/2019, least in 2009.

In [None]:
# Spread of review counts within each year.
sns.boxplot(data=df, x='Year', y='Reviews', palette='summer_r')

**Doing some groupwise feature analysis**

In [None]:
# Per-author averages of the numeric columns.
author_metrics = ['Reviews', 'Year', 'User Rating', 'Price']
by_author = df.groupby('Author')
g1 = by_author[author_metrics].mean()
g1

In [None]:
# Per-genre averages of the numeric columns.
genre_metrics = ['Reviews', 'Year', 'User Rating', 'Price']
by_genre = df.groupby('Genre')
g2 = by_genre[genre_metrics].mean()
g2

Fiction books recorded more reviews on average and have a lower mean price.

In [None]:
# Per-year averages of reviews, rating and price.
year_metrics = ['Reviews', 'User Rating', 'Price']
by_year = df.groupby('Year')
g3 = by_year[year_metrics].mean()
g3

Books are becoming cheaper as the years progress, yet more popular, garnering more reviews.
A positive trend, I believe.

In [None]:
# Distribution of log1p(Price): density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; this is the histplot equivalent from seaborn's
# migration notes (default bin selection may differ slightly).
sns.histplot(np.log1p(df['Price']), kde=True, stat='density')

Most book prices seem to lie below $40.

In [None]:
# Average price for each year.
sns.barplot(data=df, x='Year', y='Price', palette='magma_r')

Again, books are becoming cheaper.

In [None]:
# Average price for each genre.
sns.barplot(data=df, x='Genre', y='Price', palette='magma_r')

In [None]:
# Per-title averages of the numeric columns (titles can appear in several years).
title_metrics = ['Reviews', 'Year', 'User Rating', 'Price']
by_title = df.groupby('Name')
g4 = by_title[title_metrics].mean()
g4

In [None]:
# Price against user rating, with points coloured by year.
sns.scatterplot(data=df, x='Price', y='User Rating', hue='Year', palette='rocket_r')

# MODELLING

Let us try to predict the price of books by training several ML models on different features.

In [None]:
# Integer-encode the three text columns into new '<column>_enc' features.
# One encoder instance is re-fitted per column, so afterwards `enc` only
# holds the mapping for the last column ('Name').
enc = LabelEncoder()
for text_col in ('Author', 'Genre', 'Name'):
    df[text_col + '_enc'] = enc.fit_transform(df[text_col])
df.head(10)

In [None]:
# Feature matrix: encoded categoricals plus the raw numeric columns.
feature_cols = ['Author_enc', 'Reviews', 'Year', 'User Rating', 'Genre_enc', 'Name_enc']
train = df[feature_cols]
# The models are trained on log1p(Price); predictions are mapped back with expm1.
target = np.log1p(df['Price'])
print(train.shape, target.shape)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Standardise the features. The original cell discarded the transformed
# arrays, so nothing was scaled; the results are now assigned back.
# The scaler is fitted on the training split only to avoid leaking test
# statistics, and the arrays are wrapped back into DataFrames so column
# names and the row index survive for the cells below.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
print('Done.')

In [None]:
#models
# Every stochastic estimator gets an explicit seed so "Restart & Run All"
# reproduces the same scores.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
xgb = XGBRegressor(random_state=42)
gbr = GradientBoostingRegressor(learning_rate=0.01, random_state=42)
# `criterion` is left at its default (mean squared error): the old spelling
# 'mse' was removed in scikit-learn 1.2, and the default works in every version.
dtr = DecisionTreeRegressor(random_state=42, max_depth=35,
                            max_features='sqrt', min_samples_leaf=15, min_samples_split=10)
abr = AdaBoostRegressor(dtr, learning_rate=0.01, random_state=42)
cat = CatBoostRegressor(learning_rate=0.1, eval_metric='RMSE', random_seed=42)
# Fixed two misspelled LightGBM parameters: 'mertic' -> 'metric' and
# 'bagging_frequency' -> 'bagging_freq'. Without a positive bagging_freq,
# bagging_fraction has no effect.
# NOTE(review): colsample_bytree is an alias of feature_fraction, and
# colsample_bylevel does not appear in LightGBM's parameter list - confirm
# and drop the redundant/unknown ones.
lgb = LGBMRegressor(max_depth=25, num_leaves=120, learning_rate=0.01, n_jobs=-1, boosting_type='gbdt',
                    objective='regression', metric='rmse', verbosity=1, bagging_fraction=0.7,
                    feature_fraction=0.5, bagging_freq=6, bagging_seed=42, seed=42,
                    colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1)

In [None]:
# Fit XGBoost on the log1p(Price) target and map predictions back to price units.
xgb.fit(X_train, y_train)
xgb_pred = np.expm1(xgb.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, xgb_pred)), mean_absolute_error(y_test_price, xgb_pred))

In [None]:
# Fit the random forest on the log1p(Price) target and map predictions back to price units.
rf.fit(X_train, y_train)
rf_pred = np.expm1(rf.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, rf_pred)), mean_absolute_error(y_test_price, rf_pred))

In [None]:
# Fit the decision tree on the log1p(Price) target and map predictions back to price units.
dtr.fit(X_train, y_train)
dtr_pred = np.expm1(dtr.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, dtr_pred)), mean_absolute_error(y_test_price, dtr_pred))

In [None]:
# Fit AdaBoost on the log1p(Price) target and map predictions back to price units.
abr.fit(X_train, y_train)
abr_pred = np.expm1(abr.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, abr_pred)), mean_absolute_error(y_test_price, abr_pred))

In [None]:
# Fit gradient boosting on the log1p(Price) target and map predictions back to price units.
gbr.fit(X_train, y_train)
gbr_pred = np.expm1(gbr.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, gbr_pred)), mean_absolute_error(y_test_price, gbr_pred))

In [None]:
# Fit CatBoost (silently) on the log1p(Price) target and map predictions back to price units.
cat.fit(X_train, y_train, verbose=0)
cat_pred = np.expm1(cat.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, cat_pred)), mean_absolute_error(y_test_price, cat_pred))

In [None]:
# Fit LightGBM on the log1p(Price) target and map predictions back to price units.
lgb.fit(X_train, y_train)
lgb_pred = np.expm1(lgb.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, lgb_pred)), mean_absolute_error(y_test_price, lgb_pred))

In [None]:
# Average the predictions of five base regressors.
VR = VotingRegressor([('xgb', xgb), ('gbr', gbr), ('dtr', dtr), ('lgb', lgb), ('abr', abr)], n_jobs=-1)
VR.fit(X_train, y_train)
VR_pred = np.expm1(VR.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, VR_pred)), mean_absolute_error(y_test_price, VR_pred))

In [None]:
# Stack the base regressors (and the voting ensemble) under a linear meta-model.
level0 = [('xgb', xgb), ('gbr', gbr), ('dtr', dtr), ('lgb', lgb), ('abr', abr), ('vr', VR)]
level1 = LinearRegression()
# Pass level1 explicitly: it was previously created but never used, so
# StackingRegressor silently fell back to its default final estimator.
stack = StackingRegressor(estimators=level0, final_estimator=level1, cv=5, n_jobs=-1)
stack.fit(X_train, y_train)
stack_pred = np.expm1(stack.predict(X_test))
# y_test is still in log1p scale; invert it so RMSE / MAE compare like with like.
y_test_price = np.expm1(y_test)
print(np.sqrt(mean_squared_error(y_test_price, stack_pred)), mean_absolute_error(y_test_price, stack_pred))

In [None]:
# Side-by-side table of actual prices and each model's predictions, all in price units.
predictions_by_model = {
    'Predicted (xgb)': xgb_pred,
    'Predicted (lgb)': lgb_pred,
    'Predicted (abr)': abr_pred,
    'Predicted (gbr)': gbr_pred,
    'Predicted (dtr)': dtr_pred,
    'Predicted (rf)': rf_pred,
    'Predicted (cat)': cat_pred,
    'Predicted (VR)': VR_pred,
    'Predicted (stack)': stack_pred,
}
# 'Actual' goes first; y_test holds log1p(Price), so it is inverted here.
output = pd.DataFrame({'Actual': np.expm1(y_test), **predictions_by_model})
output.head(10)

In [None]:
# Write the comparison table to the working directory, without the row index.
predictions_csv = 'Amazon_bestseller_prediction.csv'
output.to_csv(predictions_csv, index=False)