<a href="https://colab.research.google.com/github/santoshmahanti/Rossmann-Sales-Prediction/blob/main/Baseline_Model_Building_Notebook_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder


import warnings    
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/Retail Sales Prediction/df_train.csv', parse_dates=['Date'])

In [4]:
test_df = pd.read_csv('/content/drive/MyDrive/Retail Sales Prediction/df_test.csv',parse_dates=['Date'])

In [5]:
#setting date and store as index
train_df.set_index(['Date','Store'], inplace=True)
train_df.sort_values(by=['Date','Store'], inplace=True)

test_df.set_index(['Date','Store'], inplace=True)
test_df.sort_values(by=['Date','Store'], inplace=True)

In [6]:
train_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,Month,WeekOfYear,DayOfYear,CompetitionOpen,Promo2Open,Promo2running
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2013-01-01,85,2,8.34759,619,0,1,1,b,a,1870.0,0,2013,1,1,1,15.0,0.0,0


In [7]:
train_df.columns

Index(['DayOfWeek', 'Sales', 'Customers', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'Promo2', 'Year', 'Month', 'WeekOfYear', 'DayOfYear', 'CompetitionOpen',
       'Promo2Open', 'Promo2running'],
      dtype='object')

In [8]:
test_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,Month,WeekOfYear,DayOfYear,CompetitionOpen,Promo2Open,Promo2running
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-15,1,1,8.615771,586,1,0,0,c,a,1270.0,0,2015,6,25,166,81.0,0.0,0


## Test-Train-Split

In [9]:
# dividing train df into train_x and train_y 
X_train = train_df.drop('Sales', axis=1)
y_train = train_df[['Sales']]

#dividing test df into test_x and test_y
X_test = test_df.drop("Sales", axis=1)
y_test = test_df[['Sales']]

One Hot Encoding of categorical columns to conver them into numerical columns

In [10]:
# making a list of categorical columns
Categorical_columns = ['DayOfWeek','StoreType','Assortment']

In [11]:
# assigning one hot encoder
enc = OneHotEncoder(sparse=False)

In [12]:
enc.fit(X_train[Categorical_columns])

OneHotEncoder(sparse=False)

In [13]:
# getting a new list of encoded columns from Categorical_columns in train data
encoded_cols = enc.get_feature_names(Categorical_columns).tolist()

In [14]:
encoded_cols

['DayOfWeek_1',
 'DayOfWeek_2',
 'DayOfWeek_3',
 'DayOfWeek_4',
 'DayOfWeek_5',
 'DayOfWeek_6',
 'DayOfWeek_7',
 'StoreType_a',
 'StoreType_b',
 'StoreType_c',
 'StoreType_d',
 'Assortment_a',
 'Assortment_b',
 'Assortment_c']

In [15]:
#performing transformation on training data and creating new features for encoded columns
X_train[encoded_cols] = enc.transform(X_train[Categorical_columns])

In [16]:
#dropping old categorical columns from training data
X_train.drop(Categorical_columns, axis=1, inplace=True)

In [17]:
#performing transformation on testing data and creating new features for encoded columns
X_test[encoded_cols] = enc.transform(X_test[Categorical_columns])

In [18]:
#dropping old categorical columns from testing data
X_test.drop(Categorical_columns, axis=1, inplace=True)

In [19]:
X_train.columns

Index(['Customers', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'CompetitionDistance', 'Promo2', 'Year', 'Month', 'WeekOfYear',
       'DayOfYear', 'CompetitionOpen', 'Promo2Open', 'Promo2running',
       'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4',
       'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7', 'StoreType_a',
       'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_a',
       'Assortment_b', 'Assortment_c'],
      dtype='object')

### **Time for transformations in our data**

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train[list(X_train.columns)] = scaler.fit_transform(X_train[list(X_train.columns)])
X_test[list(X_test.columns)] = scaler.fit_transform(X_test[list(X_test.columns)])

scaler = StandardScaler()
y_train[list(y_train.columns)] = scaler.fit_transform(y_train[list(y_train.columns)])
y_test[list(y_test.columns)] = scaler.transform(y_test[list(y_train.columns)])

## **Model Selection**

Going through the assumptions of linear models, we can confidently conclude that we can go for them. Given Our Data has a lot of genuine multicollinearity and also some columns have way too many outliers than others. Dataset has patterns such as peak days, festive seasons etc which would most likely be considered as outliers in simple linear regression.

Linear models like Linear regression and Logistic ones can't be used for our purpose, so we will move ahead with decision trees and randomforests, but, before that just to check we are implementing linear regression

In [21]:
lrreg = LinearRegression()

In [22]:
lrreg.fit(X_train,y_train)

LinearRegression()

In [23]:
y_train_pred = lrreg.predict(X_train)

In [24]:
y_test_pred = lrreg.predict(X_test)

In [25]:
# importing evaluation matrices
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [26]:
print(f'r2_score for training data is {r2_score(y_train_pred,y_train)}')
print(f'r2_score for testing data is {r2_score(y_test_pred,y_test)}')

r2_score for training data is 0.6656650622618923
r2_score for testing data is -9.769518527491527e-12


As expected high bias and high variance.

### **Model 1(Baseline): DecisionTree**

In [27]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor 

In [28]:
# assigning a variable
dtree = DecisionTreeRegressor(random_state=68)

In [29]:
dtree.fit(X_train,y_train)

DecisionTreeRegressor(random_state=68)

In [30]:
# predicting Y_train
y_pred_train = dtree.predict(X_train)

In [31]:
# predicting Y_train
y_pred_test = dtree.predict(X_test)

In [33]:
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test,y_test)}')

print(f'root Mean Squared Error for training data is {(mean_squared_error(y_pred_train,y_train))**0.5}')
print(f'root Mean Squared Error for testing data is {(mean_squared_error(y_pred_test,y_test))**0.5}')


r2_score for training data is 0.9999987542867533
r2_score for testing data is 0.91439936525088
Adjusted r2_score for training data is 0.999999
Adjusted r2_score for testing data is 0.912579
Mean Absolute Error for training data is 4.189885590080446e-06
Mean Absolute Error for testing data is 0.20466152827539516
Mean Squared Error for training data is 1.2457116948123794e-06
Mean Squared Error for testing data is 0.08229968168791212
root Mean Squared Error for training data is 0.0011161145527285178
root Mean Squared Error for testing data is 0.286879210971991


As it is famously assumed, Decision trees are indeed prone to overfitting, with r2_score of 99% on traing data, it has completely fitted to the training data, while for test data, it's accuracy is around 91%.

To ease this problem of overfitting we will use Random Forest to improve our model accuracy.

### **Model 2: Random Forest**

A single tree was not able to lift much weight, so now we will use entire forest.

In [34]:
# importing random forest from ScikitLearn
from sklearn.ensemble import RandomForestRegressor

In [35]:
rf = RandomForestRegressor(n_estimators=100, random_state=68)

In [36]:
rf.fit(X_train,y_train)

RandomForestRegressor(random_state=68)

In [37]:
y_pred_train_rf = rf.predict(X_train)

In [38]:
y_pred_test_rf = rf.predict(X_test)

In [39]:
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train_rf,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test_rf,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train_rf)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test_rf)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train_rf,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test_rf,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train_rf,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test_rf,y_test)}')

print(f'root Mean Squared Error for training data is {((mean_squared_error(y_pred_train_rf,y_train))**0.5)}')
print(f'root Mean Squared Error for testing data is {((mean_squared_error(y_pred_test_rf,y_test))**0.5)}')


r2_score for training data is 0.9964392053414902
r2_score for testing data is 0.9508258091334019
Adjusted r2_score for training data is 0.996503
Adjusted r2_score for testing data is 0.952213
Mean Absolute Error for training data is 0.043316193490068176
Mean Absolute Error for testing data is 0.15393337647510844
Mean Squared Error for training data is 0.0034968064640474084
Mean Squared Error for testing data is 0.044987260239829074
root Mean Squared Error for training data is 0.05913380136645545
root Mean Squared Error for testing data is 0.21210200432770332


Our r2_score for training data is almost 99% while for test data it is 
95%, which is better than baseline model. Try finding best parameters to see if we can further improve our score.

### **LGBM**

In [40]:
# importing lgbm
from lightgbm import LGBMRegressor

In [41]:
# assigning variable
lgbm_reg = LGBMRegressor(random_state=68)

In [42]:
# fitting on training data
lgbm_reg.fit(X_train,y_train)

LGBMRegressor(random_state=68)

In [43]:
# predicting on y_train
y_train_pred_lgbm = lgbm_reg.predict(X_train)

In [44]:
# predicting on y_test
y_test_pred_lgbm = lgbm_reg.predict(X_test)

In [45]:
# printing accuracy
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_train_pred_lgbm,y_train)}')
print(f'r2_score for testing data is {r2_score(y_test_pred_lgbm,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_train_pred_lgbm)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_test_pred_lgbm)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_train_pred_lgbm,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_test_pred_lgbm,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_train_pred_lgbm,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_test_pred_lgbm,y_test)}')

print(f'root Mean Squared Error for training data is {((mean_squared_error(y_train_pred_lgbm,y_train))**0.5)}')
print(f'root Mean Squared Error for testing data is {((mean_squared_error(y_test_pred_lgbm,y_test))**0.5)}')


r2_score for training data is 0.9154403064676544
r2_score for testing data is 0.8900099640724496
Adjusted r2_score for training data is 0.925163
Adjusted r2_score for testing data is 0.901768
Mean Absolute Error for training data is 0.21481748224056066
Mean Absolute Error for testing data is 0.2383248799366392
Mean Squared Error for training data is 0.07483406048523196
Mean Squared Error for testing data is 0.09247695917510892
root Mean Squared Error for training data is 0.2735581482705861
root Mean Squared Error for testing data is 0.3041002452730167


In [None]:
# Due to lack of ram colab is crashing, we have continued on next notebook.