In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

%matplotlib inline

In [2]:
# Location of the training data. The default is the original local download
# path; set the TRAIN_CSV_PATH environment variable to point at the file on
# another machine instead of editing this cell.
import os

TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV_PATH", "C:/Users/sahay/Downloads/TRAIN.csv")
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [3]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
#Order           0
Sales            0
dtype: int64

In [4]:
# Summarise the raw frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             188340 non-null  object 
 1   Store_id       188340 non-null  int64  
 2   Store_Type     188340 non-null  object 
 3   Location_Type  188340 non-null  object 
 4   Region_Code    188340 non-null  object 
 5   Date           188340 non-null  object 
 6   Holiday        188340 non-null  int64  
 7   Discount       188340 non-null  object 
 8   #Order         188340 non-null  int64  
 9   Sales          188340 non-null  float64
dtypes: float64(1), int64(3), object(6)
memory usage: 14.4+ MB


In [5]:
# Normalise every column label to lower case so later cells can rely on a
# single spelling, then display the resulting labels.
df.columns = [label.lower() for label in df.columns]
df.columns

Index(['id', 'store_id', 'store_type', 'location_type', 'region_code', 'date',
       'holiday', 'discount', '#order', 'sales'],
      dtype='object')

In [6]:
# store_id is an identifier, not a quantity: store it as text so that it is
# handled as a categorical feature later on.
# The date column is parsed into real datetimes so the .dt accessors work.
df = df.assign(
    store_id=df['store_id'].astype(str),
    date=pd.to_datetime(df['date']),
)

In [7]:
# Calendar features derived from the parsed date column.
# NOTE: 'day' is the day of the WEEK (pandas dt.dayofweek: Monday=0 ... Sunday=6),
# not the day of the month.
df['day'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['qtr'] = df['date'].dt.quarter

# Weekend flag: 1 for Saturday/Sunday (dayofweek 5 or 6), otherwise 0.
# A vectorised comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the dtype the lambda-based version produced.
df['is_weekend'] = (df['day'] >= 5).astype('int64')

In [8]:
# Encode the Yes/No discount flag as a 1/0 integer indicator.
# The mapping also accepts already-encoded 1/0 values, so re-running this cell
# leaves the column unchanged instead of turning every value into NaN.
discount_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
discount_encoded = df['discount'].map(discount_codes)

# Any label outside the mapping becomes NaN after .map(); fail loudly here
# rather than passing NaNs on to the models.
if discount_encoded.isna().any():
    unexpected = df.loc[discount_encoded.isna(), 'discount'].unique()
    raise ValueError(f"Unexpected values in 'discount': {unexpected!r}")

df['discount'] = discount_encoded.astype('int64')

In [20]:
# Re-inspect dtypes after the conversions and the added calendar features.
# NOTE(review): this cell's execution count (20) is out of sequence with the
# cells around it — re-run the notebook top to bottom before relying on the
# output shown below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   id             188340 non-null  object        
 1   store_id       188340 non-null  object        
 2   store_type     188340 non-null  object        
 3   location_type  188340 non-null  object        
 4   region_code    188340 non-null  object        
 5   date           188340 non-null  datetime64[ns]
 6   holiday        188340 non-null  int64         
 7   discount       188340 non-null  int64         
 8   #order         188340 non-null  int64         
 9   sales          188340 non-null  float64       
 10  day            188340 non-null  int32         
 11  month          188340 non-null  int32         
 12  qtr            188340 non-null  int32         
 13  is_weekend     188340 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(4)

In [9]:
# Target: the sales amount to be predicted.
y = df['sales']

# Features: every column except the target, the row identifier and the raw
# date.
# NOTE(review): '#order' stays in the feature set — confirm that the order
# count is actually known at prediction time.
X = df.drop(columns=['sales', 'id', 'date'])

In [10]:
# Column groups used when preprocessing the feature frame.
# Numeric / binary-indicator features:
numerical_cols = [
    'holiday', '#order', 'discount',
    'day', 'month', 'qtr', 'is_weekend',
]
# Nominal features; 'store_id' is an identifier, so it is listed here as well.
categorical_cols = [
    'store_id',
    'store_type',
    'location_type',
    'region_code',
]


In [11]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [12]:
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones, dropping the first level of each categorical feature.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [13]:
# Build the model matrix straight from df instead of from X itself. The
# previous `X = preprocessor.fit_transform(X)` overwrote its own input, so
# running the cell a second time fed the already-transformed matrix (which no
# longer has the named columns the transformer selects) back into it.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split, so the scaler statistics include the rows later held out for testing.
X = preprocessor.fit_transform(df.drop(columns=['sales', 'id', 'date']))

In [14]:
# Sanity check: the feature matrix and the target must have the same row count.
print(X.shape, y.shape, sep='\n')

(188340, 381)
(188340,)


#### Splitting the dataset into train and test sets

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


In [17]:
#importing the necessary performance metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [18]:
# Candidate regressors, keyed by the short name used when reporting results.
# The estimators with internal randomness (forest, boosting, single tree) get
# the same seed as the train/test split so the reported metrics are
# reproducible across runs.
models = {
    'lr': LinearRegression(),
    'rf': RandomForestRegressor(random_state=42),
    'gb': GradientBoostingRegressor(random_state=42),
    'xgb': XGBRegressor(random_state=42),
    'dt': DecisionTreeRegressor(random_state=42),
    'l1': Lasso(),
    'l2': Ridge(),
}

In [19]:
# Fit every candidate model and report MAE / MSE / R2 on both the training
# and the test split.
model_list = []   # model keys, in evaluation order
Accuracy = []     # test-set R2 of each model, aligned with model_list

# Iterate the dict directly instead of rebuilding list(models.keys()) /
# list(models.values()) on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Making predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    # Test set performance
    mae_test = mean_absolute_error(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)

    print(name)
    model_list.append(name)
    # Previously Accuracy was created but never filled; record the test R2 so
    # it stays the same length as model_list.
    Accuracy.append(r2_test)

    print('Model Performance for Training Set')
    print(" - MAE: {:.4f}".format(mae_train))
    print(" - MSE: {:.4f}".format(mse_train))
    print(" - R2 Score: {:.4f}".format(r2_train))

    print ('-------------------------------------------')
    print('Model Performance for Test Set')
    print(" - MAE: {:.4f}".format(mae_test))
    print(" - MSE: {:.4f}".format(mse_test))
    print(" - R2 Score: {:.4f}".format(r2_test))

    print('='*35)
    print('\n')



lr
Model Performance for Training Set
 - MAE: 3103.3839
 - MSE: 17920357.9665
 - R2 Score: 0.9475
-------------------------------------------
Model Performance for Test Set
 - MAE: 3108.2821
 - MSE: 18220010.0349
 - R2 Score: 0.9461


rf
Model Performance for Training Set
 - MAE: 817.4554
 - MSE: 1654423.5481
 - R2 Score: 0.9952
-------------------------------------------
Model Performance for Test Set
 - MAE: 2139.1747
 - MSE: 11211202.8417
 - R2 Score: 0.9668


gb
Model Performance for Training Set
 - MAE: 2957.6532
 - MSE: 17013157.1199
 - R2 Score: 0.9502
-------------------------------------------
Model Performance for Test Set
 - MAE: 2991.4902
 - MSE: 17521497.5506
 - R2 Score: 0.9482


xgb
Model Performance for Training Set
 - MAE: 2050.7442
 - MSE: 9090006.3540
 - R2 Score: 0.9734
-------------------------------------------
Model Performance for Test Set
 - MAE: 2132.5732
 - MSE: 10188388.4907
 - R2 Score: 0.9699


dt
Model Performance for Training Set
 - MAE: 57.0643
 - MSE: 