In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

%matplotlib inline

In [93]:
df=pd.read_csv("C:/Users/sahay/Downloads/TRAIN.csv")
df.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [94]:
df.isnull().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
#Order           0
Sales            0
dtype: int64

In [95]:
df. info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             188340 non-null  object 
 1   Store_id       188340 non-null  int64  
 2   Store_Type     188340 non-null  object 
 3   Location_Type  188340 non-null  object 
 4   Region_Code    188340 non-null  object 
 5   Date           188340 non-null  object 
 6   Holiday        188340 non-null  int64  
 7   Discount       188340 non-null  object 
 8   #Order         188340 non-null  int64  
 9   Sales          188340 non-null  float64
dtypes: float64(1), int64(3), object(6)
memory usage: 14.4+ MB


In [96]:
df.columns=df.columns.str.lower()
df.columns

Index(['id', 'store_id', 'store_type', 'location_type', 'region_code', 'date',
       'holiday', 'discount', '#order', 'sales'],
      dtype='object')

In [97]:
#converting the store-id to str since we want this column to be categorical
df['store_id']=df['store_id'].astype(str)
#converting the data column to proper datetime format
df['date']=pd.to_datetime(df['date'])

In [98]:
#creating new features
df['day']=df['date'].dt.dayofweek
df['month']=df['date'].dt.month
df['qtr']=df['date'].dt.quarter
df['day1']=df['date'].dt.day
df['day_count'] = (df['date'] - df['date'].min()).dt.days
#creating a new column weekend
df['is_weekend'] = df['day'].apply(lambda x: 1 if x >= 5 else 0)

In [99]:
#converting discount to proper categorical data type
df['discount']=df['discount'].map({'Yes':1,'No':0})

In [100]:
X=df.drop(['id','date','#order','sales'],axis=1)
y=df['#order']

In [101]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   id             188340 non-null  object        
 1   store_id       188340 non-null  object        
 2   store_type     188340 non-null  object        
 3   location_type  188340 non-null  object        
 4   region_code    188340 non-null  object        
 5   date           188340 non-null  datetime64[ns]
 6   holiday        188340 non-null  int64         
 7   discount       188340 non-null  int64         
 8   #order         188340 non-null  int64         
 9   sales          188340 non-null  float64       
 10  day            188340 non-null  int32         
 11  month          188340 non-null  int32         
 12  qtr            188340 non-null  int32         
 13  day1           188340 non-null  int32         
 14  day_count      188340 non-null  int64         
 15  

In [103]:
# Step 4: Identifying numerical and categorical columns
numerical_cols = ['holiday', 'discount', 'day', 'month', 'qtr','is_weekend','day1','day_count']
categorical_cols = ['store_id', 'store_type', 'location_type', 'region_code']  # 'store_id' treated as categorical


In [104]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Standardize numerical columns
        ('cat', OneHotEncoder(drop='first'), categorical_cols)  # One-hot encode categorical columns
    ])

In [105]:
X=preprocessor.fit_transform(X)

In [106]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state = 42)

In [107]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(150672, 382)
(37668, 382)
(150672,)
(37668,)


In [108]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


In [109]:
#importing the necessary performance metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [110]:
#creating instance for the class
models={
    'lr' : LinearRegression(),
    'rf' : RandomForestRegressor(),
    'gb' : GradientBoostingRegressor(),
    'xgb' : XGBRegressor(),
    'dt' : DecisionTreeRegressor(),
    'l1' : Lasso(),
    'l2' : Ridge()

}

In [111]:
model_list=[]
Accuracy=[]

for i in  range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    #Making Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Training set performance
    mae_train = mean_absolute_error(y_train,y_train_pred)
    mse_train = mean_squared_error(y_train,y_train_pred)
    r2_train = r2_score(y_train,y_train_pred)

    #Test Set Performance
    mae_test = mean_absolute_error(y_test,y_test_pred)
    mse_test = mean_squared_error(y_test,y_test_pred)
    r2_test = r2_score(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


    print('Model Performance for Training Set')
    print(" - MAE: {:.4f}".format(mae_train))
    print(" - MSE: {:.4f}".format(mse_train))
    print(" - R2 Score: {:.4f}".format(r2_train))

    print ('-------------------------------------------')
    print('Model Performance for Test Set')
    print(" - MAE: {:.4f}".format(mae_test))
    print(" - MSE: {:.4f}".format(mse_test))
    print(" - R2 Score: {:.4f}".format(r2_test))

    print('='*35)
    print('\n')



lr
Model Performance for Training Set
 - MAE: 10.5988
 - MSE: 236.3967
 - R2 Score: 0.7453
-------------------------------------------
Model Performance for Test Set
 - MAE: 10.6414
 - MSE: 235.6623
 - R2 Score: 0.7464


rf
Model Performance for Training Set
 - MAE: 3.1083
 - MSE: 21.5985
 - R2 Score: 0.9767
-------------------------------------------
Model Performance for Test Set
 - MAE: 8.4246
 - MSE: 153.5648
 - R2 Score: 0.8348


gb
Model Performance for Training Set
 - MAE: 11.4399
 - MSE: 255.3839
 - R2 Score: 0.7248
-------------------------------------------
Model Performance for Test Set
 - MAE: 11.4634
 - MSE: 257.5735
 - R2 Score: 0.7228


xgb
Model Performance for Training Set
 - MAE: 8.7971
 - MSE: 152.0078
 - R2 Score: 0.8362
-------------------------------------------
Model Performance for Test Set
 - MAE: 9.1460
 - MSE: 167.0866
 - R2 Score: 0.8202


dt
Model Performance for Training Set
 - MAE: 0.0000
 - MSE: 0.0000
 - R2 Score: 1.0000
--------------------------------

In [112]:
params= {
    'n_estimators': [5,10,20,50,70,100,200,250,300,350,400,450],
    'max_depth': [3, 4, 5, 6, 7],

    'learning_rate':[0.3,0.4,0.6,0.8,0.9,1.0,2.0],
    'booster':['gbtree'],
          
}


In [113]:
randomcv_models = [
    ('xgb',XGBRegressor(),params)
]
randomcv_models

[('xgb',
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
  {'n_estimators': [5, 10, 20, 50, 70, 100, 200, 250, 300, 350, 400, 450],
   'max_depth': [3, 4, 5, 6, 7],
   'learning_rate': [0.3, 0.4, 0.6, 0.8, 0.9, 1.0, 2.0],
   'booster': ['gbtree']})]

In [114]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name,model,params in randomcv_models:
        random = RandomizedSearchCV(estimator = model,
                           param_distributions = params,
                           n_iter = 100,
                           scoring = 'neg_mean_squared_error',
                           cv=3)
    
        random.fit(X_train,y_train)
        model_param[name]= random.best_params_
    
for model_name in model_param:
    print(f"-------------------- Best Params for {model_name}-----------")
    print(model_param[model_name])

-------------------- Best Params for xgb-----------
{'n_estimators': 450, 'max_depth': 4, 'learning_rate': 0.6, 'booster': 'gbtree'}


In [120]:

random.fit(X_train,y_train)

# Predict orders
predicted_orders = random.predict(X)
df['predicted_orders'] = predicted_orders

In [116]:
X_sales = df[['predicted_orders', 'holiday', 'discount', 'day', 'month', 'qtr', 'is_weekend', 'day1', 'day_count',
              'store_id', 'store_type', 'location_type', 'region_code']]
y_sales = df['sales']

# Preprocessing
X_sales = preprocessor.fit_transform(X_sales)

In [117]:
X_train_sales, X_test_sales, y_train_sales, y_test_sales = train_test_split(X_sales, y_sales, test_size=0.2, random_state=42)


In [121]:
# Predicting Sales
random.fit(X_train_sales,y_train_sales)
df['predicted_sales'] =random.predict(X_sales)

In [122]:
r2 = r2_score(y_sales, df['predicted_sales'])
r2

0.8742869624195531

In [74]:
random.fit(X_train,y_train)
y_pred = random.predict(X_test)
r2_score(y_pred,y_test)

0.8285684546246751