<a href="https://colab.research.google.com/github/santoshmahanti/Rossmann-Sales-Prediction/blob/main/Model_Building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder


import warnings    
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/Retail Sales Prediction/df_train.csv', parse_dates=['Date'])

In [4]:
test_df = pd.read_csv('/content/drive/MyDrive/Retail Sales Prediction/df_test.csv',parse_dates=['Date'])

In [5]:
#setting date and store as index
train_df.set_index(['Date','Store'], inplace=True)
train_df.sort_values(by=['Date','Store'], inplace=True)

test_df.set_index(['Date','Store'], inplace=True)
test_df.sort_values(by=['Date','Store'], inplace=True)

In [6]:
train_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,Month,WeekOfYear,DayOfYear,CompetitionOpen,Promo2Open,Promo2running
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2013-01-01,85,2,8.34759,619,0,1,1,b,a,1870.0,0,2013,1,1,1,15.0,0.0,0


In [7]:
train_df.columns

Index(['DayOfWeek', 'Sales', 'Customers', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'Promo2', 'Year', 'Month', 'WeekOfYear', 'DayOfYear', 'CompetitionOpen',
       'Promo2Open', 'Promo2running'],
      dtype='object')

In [8]:
test_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,Month,WeekOfYear,DayOfYear,CompetitionOpen,Promo2Open,Promo2running
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-15,1,1,8.615771,586,1,0,0,c,a,1270.0,0,2015,6,25,166,81.0,0.0,0


## Test-Train-Split

In [9]:
# dividing train df into train_x and train_y 
X_train = train_df.drop('Sales', axis=1)
y_train = train_df[['Sales']]

#dividing test df into test_x and test_y
X_test = test_df.drop("Sales", axis=1)
y_test = test_df[['Sales']]

One Hot Encoding of categorical columns to conver them into numerical columns

In [10]:
# making a list of categorical columns
Categorical_columns = ['DayOfWeek','StoreType','Assortment']

In [11]:
# assigning one hot encoder
enc = OneHotEncoder(sparse=False)

In [12]:
enc.fit(X_train[Categorical_columns])

OneHotEncoder(sparse=False)

In [13]:
# getting a new list of encoded columns from Categorical_columns in train data
encoded_cols = enc.get_feature_names(Categorical_columns).tolist()

In [14]:
encoded_cols

['DayOfWeek_1',
 'DayOfWeek_2',
 'DayOfWeek_3',
 'DayOfWeek_4',
 'DayOfWeek_5',
 'DayOfWeek_6',
 'DayOfWeek_7',
 'StoreType_a',
 'StoreType_b',
 'StoreType_c',
 'StoreType_d',
 'Assortment_a',
 'Assortment_b',
 'Assortment_c']

In [15]:
#performing transformation on training data and creating new features for encoded columns
X_train[encoded_cols] = enc.transform(X_train[Categorical_columns])

In [16]:
#dropping old categorical columns from training data
X_train.drop(Categorical_columns, axis=1, inplace=True)

In [17]:
#performing transformation on testing data and creating new features for encoded columns
X_test[encoded_cols] = enc.transform(X_test[Categorical_columns])

In [18]:
#dropping old categorical columns from testing data
X_test.drop(Categorical_columns, axis=1, inplace=True)

In [19]:
X_train.columns

Index(['Customers', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'CompetitionDistance', 'Promo2', 'Year', 'Month', 'WeekOfYear',
       'DayOfYear', 'CompetitionOpen', 'Promo2Open', 'Promo2running',
       'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4',
       'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7', 'StoreType_a',
       'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_a',
       'Assortment_b', 'Assortment_c'],
      dtype='object')

### **Time for transformations in our data**

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train[list(X_train.columns)] = scaler.fit_transform(X_train[list(X_train.columns)])
X_test[list(X_test.columns)] = scaler.fit_transform(X_test[list(X_test.columns)])

scaler = StandardScaler()
y_train[list(y_train.columns)] = scaler.fit_transform(y_train[list(y_train.columns)])
y_test[list(y_test.columns)] = scaler.transform(y_test[list(y_train.columns)])

## **Model Selection**

Going through the assumptions of linear models, we can confidently conclude that we can go for them. Given Our Data has a lot of genuine multicollinearity and also some columns have way too many outliers than others. 

Linear models like Linear regression and Logistic ones can't be used for our purpose, so we will move ahead with decision trees and randomforests, but, before that just to check.

In [48]:
lrreg = LinearRegression()

In [49]:
lrreg.fit(X_train,y_train)

LinearRegression()

In [50]:
y_train_pred = lrreg.predict(X_train)

In [51]:
y_test_pred = lrreg.predict(X_test)

In [52]:
# importing evaluation matrices
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score



In [53]:
print(f'r2_score for training data is {r2_score(y_train_pred,y_train)}')
print(f'r2_score for training data is {r2_score(y_test_pred,y_test)}')

r2_score for training data is 0.6656650622618923
r2_score for training data is -9.769518527491527e-12


As expected high bias and high variance.

In [54]:
from sklearn.linear_model import Ridge
ridge= Ridge(alpha=1.0)

In [55]:
ridge.fit(X_train,y_train)

Ridge()

In [56]:
ridge.score(X_train, y_train)

0.7504345340938793

In [57]:
y_pred_train = ridge.predict(X_train)

In [58]:
y_test_pred = ridge.predict(X_test)

In [59]:
print(f'r2_score for training data is {r2_score(y_train_pred,y_train)}')
print(f'r2_score for training data is {r2_score(y_test_pred,y_test)}')

r2_score for training data is 0.6656650622618923
r2_score for training data is 0.6581498736176206


### **Model 1(Baseline): DecisionTree**

In [30]:
# import the regressor
from sklearn.tree import DecisionTreeRegressor 

In [31]:
# assigning a variable
dtree = DecisionTreeRegressor(random_state=68)

In [32]:
dtree.fit(X_train,y_train)

DecisionTreeRegressor(random_state=68)

In [33]:
# predicting Y_train
y_pred_train = dtree.predict(X_train)

In [34]:
# predicting Y_train
y_pred_test = dtree.predict(X_test)

In [35]:
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test,y_test)}')

r2_score for training data is 0.9999987542867533
r2_score for testing data is 0.91439936525088
Adjusted r2_score for training data is 0.999999
Adjusted r2_score for testing data is 0.912579
Mean Absolute Error for training data is 4.189885590080446e-06
Mean Absolute Error for testing data is 0.20466152827539516
Mean Squared Error for training data is 1.2457116948123794e-06
Mean Squared Error for testing data is 0.08229968168791212


As it is famously assumed, Decision trees are indeed prone to overfitting, with r2_score of 1 on traing data, it has completely overfitted the data, while for test data, it's accuracy is around 93%.

To ease this problem of overfitting we will use Random Forest to improve our model accuracy.

### **Model 2: Random Forest**

A single tree was not able to lift much weight, so now we will use entire forest.

In [36]:
# importing random forest from ScikitLearn
from sklearn.ensemble import RandomForestRegressor

In [37]:
rf = RandomForestRegressor(n_estimators=100, random_state=68)

In [38]:
rf.fit(X_train,y_train)

RandomForestRegressor(random_state=68)

In [39]:
y_pred_train_rf = rf.predict(X_train)

In [40]:
y_pred_test_rf = rf.predict(X_test)

In [41]:
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train_rf,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test_rf,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train_rf)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test_rf)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train_rf,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test_rf,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train_rf,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test_rf,y_test)}')

r2_score for training data is 0.9964392053414902
r2_score for testing data is 0.9508258091334019
Adjusted r2_score for training data is 0.996503
Adjusted r2_score for testing data is 0.952213
Mean Absolute Error for training data is 0.043316193490068176
Mean Absolute Error for testing data is 0.15393337647510844
Mean Squared Error for training data is 0.0034968064640474084
Mean Squared Error for testing data is 0.044987260239829074


Our r2_score for training data is almost 99% while for test data it is 95%. It seems all good, but let's try finding best parameters to see if we can further improve our score.

## **XGBoost**

In [60]:
# importing XGBoost
import xgboost as xgb

In [61]:
# assigning variable for it
xgb_reg = xgb.XGBRegressor(random_state=68)

In [62]:
# importing GridSearch CV 
from sklearn.model_selection import GridSearchCV

In [63]:
# assigning grid
param_grid = {"max_depth":[4, 5],
              "n_estimators":[100,150,None],
              "learning_rate":[0.01, 0.015]}

In [64]:
#assigning variable to GridSearchCV
search = GridSearchCV(xgb_reg, param_grid, cv=3)

In [65]:
search.fit(X_train, y_train)



GridSearchCV(cv=3, estimator=XGBRegressor(random_state=68),
             param_grid={'learning_rate': [0.01, 0.015], 'max_depth': [4, 5],
                         'n_estimators': [100, 150, None]})

In [66]:
search.best_params_

{'learning_rate': 0.015, 'max_depth': 5, 'n_estimators': 150}

In [67]:
y_pred_train=search.predict(X_train)

In [68]:
y_pred_test=search.predict(X_test)

In [69]:
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test,y_test)}')

r2_score for training data is 0.7791871087051665
r2_score for testing data is 0.7714000899632978
Adjusted r2_score for training data is 0.857216
Adjusted r2_score for testing data is 0.851248
Mean Absolute Error for training data is 0.2985578982514212
Mean Absolute Error for testing data is 0.29644968830022556
Mean Squared Error for training data is 0.14277883891376167
Mean Squared Error for testing data is 0.14003768897104113
