# **Initialisation**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from category_encoders import BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import all libraries necessary for model training and evaluation
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the Walmart weekly-sales CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/walmart-dataset/Walmart.csv")

# **Data Collection and Exploration**

# **Data Preprocessing**

In [2]:
# Count missing values per column before any feature engineering.
data.isna().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

In [3]:
# Inspect the raw column dtypes as loaded from the CSV.
data.dtypes

Store             int64
Date             object
Weekly_Sales    float64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object

In [4]:
def get_season(quarter):
    """Map a calendar quarter number to a season label.

    Quarters 1, 2 and 3 map to 'Winter', 'Spring' and 'Summer' respectively;
    every other value falls through to 'Autumn'.
    """
    season_by_quarter = {1: 'Winter', 2: 'Spring', 3: 'Summer'}
    return season_by_quarter.get(quarter, 'Autumn')
    
# Parse the day-first date strings, then derive calendar features from the parsed dates.
data['Date'] = pd.to_datetime(data['Date'], format="%d-%m-%Y")
date_parts = data['Date'].dt

data['Year'] = date_parts.year
data['Quarter'] = date_parts.quarter
data['Season'] = data['Quarter'].apply(get_season)
data['Month'] = date_parts.month
data['Month_Name'] = date_parts.month_name()
# ISO week number, stored as a plain int32 column.
data['Week'] = date_parts.isocalendar().week.astype('int32')
data['Day_of_Week'] = date_parts.day_name()

In [5]:
# Build the modelling frame without the raw date and the calendar fields that
# are not used as features; `drop` returns a new frame, so `data` is unchanged.
data_preprocess = data.drop(columns=['Date', 'Year', 'Quarter', 'Month', 'Day_of_Week'])

data_preprocess.dtypes

Store             int64
Weekly_Sales    float64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
Season           object
Month_Name       object
Week              int32
dtype: object

In [6]:
# Store, Holiday_Flag and Week are labels rather than quantities: cast them to
# object so they are picked up as categorical features further down.
data_preprocess = data_preprocess.astype({
    'Store': 'object',
    'Holiday_Flag': 'object',
    'Week': 'object',
})

data_preprocess.dtypes

Store            object
Weekly_Sales    float64
Holiday_Flag     object
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
Season           object
Month_Name       object
Week             object
dtype: object

In [7]:
# Separate the target (Weekly_Sales) from the predictors.
y = data_preprocess['Weekly_Sales']
X = data_preprocess.drop(columns='Weekly_Sales')

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature groups are read from X, so the target can never appear among them.
num_feats = X.select_dtypes('number').columns.to_list()
cat_feats = X.select_dtypes('object').columns.to_list()

print(f'Numerical Features : {num_feats}')
print(f'Categorical Features: {cat_feats}')

Numerical Features : ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
Categorical Features: ['Store', 'Holiday_Flag', 'Season', 'Month_Name', 'Week']


In [8]:
# Scale the numeric columns and binary-encode the categorical ones.
# The transformer is fitted on the training split only, then applied to both splits.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_features', StandardScaler(), num_feats),
        ('cat_features', BinaryEncoder(), cat_feats),
    ]
).fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# **Model Building and Evaluation**

In [9]:
# Shared helper: report regression metrics for any fitted model.
def evaluateModel(model, X, y_test):
    """Print MAE, MSE, RMSE and R2 for `model`'s predictions on `X`.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features passed to ``model.predict``.
    y_test : true target values for the rows of ``X``. Despite the name,
        callers in this notebook pass training targets as well as test targets.

    The function only prints; it returns None.
    """
    predictions = model.predict(X)
    squared_error = mean_squared_error(y_test, predictions)

    report_lines = [
        f"Mean Absolute Error: {mean_absolute_error(y_test, predictions)}",
        f"Mean Squared Error: {squared_error}",
        f"Root Mean Squared Error: {np.sqrt(squared_error)}",
        f"\nR2 Score: {r2_score(y_test, predictions)}",
    ]
    for report_line in report_lines:
        print(report_line)

## **1. Decision Tree Regressor** 

In [10]:
# Seed the tree: scikit-learn permutes the features at every split, so ties
# between equally good splits would otherwise be broken differently on each run.
# The same seed is used for the train/test split.
decisionTree_regressor = DecisionTreeRegressor(random_state=42)
decisionTree_regressor.fit(X_train_transformed, y_train)

# NOTE: these metrics are computed on the training split the tree was just
# fitted on, so they show how well it memorises, not how well it generalises.
print("\nEvaluation for Decision Tree Regressor Model\n")
evaluateModel(decisionTree_regressor, X_train_transformed, y_train)


Evaluation for Decision Tree Regressor Model

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0

R2 Score: 1.0


## **2. Linear Regression** 

In [11]:
# Baseline: ordinary least squares on the transformed features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
linearRegression_regressor = LinearRegression().fit(X_train_transformed, y_train)

# Metrics are reported on the training split.
print("\nEvaluation for Linear Regression Model\n")
evaluateModel(linearRegression_regressor, X_train_transformed, y_train)


Evaluation for Linear Regression Model

Mean Absolute Error: 445396.7314952883
Mean Squared Error: 282687248083.4086
Root Mean Squared Error: 531683.4096371718

R2 Score: 0.10969474712530514


## **3. K-Nearest Neighbors Regressor** 

In [12]:
# Baseline: k-nearest-neighbours regressor with the library's default settings.
# `fit` returns the estimator itself, so construction and fitting can be chained.
knn_regressor = KNeighborsRegressor().fit(X_train_transformed, y_train)

# Metrics are reported on the training split.
print("\nEvaluation for K-Nearest Neighbors Regressor Model\n")
evaluateModel(knn_regressor, X_train_transformed, y_train)


Evaluation for K-Nearest Neighbors Regressor Model

Mean Absolute Error: 280699.1654696969
Mean Squared Error: 134593881455.65982
Root Mean Squared Error: 366870.38781517895

R2 Score: 0.576105252439928


# **Hyperparameter Tuning**

In [13]:
# Shared helper: cross-validated grid search over a parameter grid.
def tuneModel(estimator, X_train, y_train, param_grid, cv=5):
    """Grid-search `param_grid` for `estimator` and return the best model found.

    Parameters
    ----------
    estimator : estimator (or pipeline) to tune.
    X_train, y_train : data the grid search is fitted on.
    param_grid : mapping of parameter names to candidate values.
    cv : cross-validation setting forwarded to GridSearchCV (default 5).

    Prints the best parameter combination and its R2 score as reported by the
    search, then returns the search's ``best_estimator_``.
    """
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='r2', cv=cv)
    search.fit(X_train, y_train)

    print(f"Best parameters: {search.best_params_} \n")
    print(f"Best R2 score: {search.best_score_}")

    return search.best_estimator_

## **1. Decision Tree Regressor** 

In [14]:
# Candidate tree depths (2 to 14) combined with the minimum number of samples
# a node must hold before it may be split.
dt_param_grid = dict(
    max_depth=np.arange(2, 15),
    min_samples_split=[10, 20, 30, 40, 50, 100, 200, 300],
)

decisionTree_regressor_tuned = tuneModel(
    estimator=decisionTree_regressor,
    X_train=X_train_transformed,
    y_train=y_train,
    param_grid=dt_param_grid,
    cv=5,
)

Best parameters: {'max_depth': 10, 'min_samples_split': 30} 

Best R2 score: 0.9203988527764982


In [15]:
# Score the tuned tree on the training split, i.e. the data the grid search was fitted on.
print("\nEvaluation for tuned Decision Tree Regressor Model\n")
evaluateModel(decisionTree_regressor_tuned, X_train_transformed, y_train)


Evaluation for tuned Decision Tree Regressor Model

Mean Absolute Error: 63044.414837232376
Mean Squared Error: 13153013615.362638
Root Mean Squared Error: 114686.58864646136

R2 Score: 0.9585754320639375


In [16]:
# Score the tuned tree on the held-out test split from train_test_split.
print("\nTesting the tuned Decision Tree Regressor Model\n")
evaluateModel(decisionTree_regressor_tuned, X_test_transformed, y_test)


Testing the tuned Decision Tree Regressor Model

Mean Absolute Error: 85068.27495429495
Mean Squared Error: 25487857012.172134
Root Mean Squared Error: 159649.16852953582

R2 Score: 0.9208831417889422


## **2. Linear Regression** 

In [17]:
# Expand the features polynomially before the linear fit; the polynomial
# degree is the hyperparameter being tuned.
LR_pipe = Pipeline(steps=[
    ('poly_feat', PolynomialFeatures()),
    ('lin_reg', LinearRegression()),
])

# '<step name>__<parameter>' addresses a parameter of a named pipeline step.
param_grid = dict(poly_feat__degree=[2, 3, 4])

linearRegression_regressor_tuned = tuneModel(
    estimator=LR_pipe,
    X_train=X_train_transformed,
    y_train=y_train,
    param_grid=param_grid,
    cv=5,
)

Best parameters: {'poly_feat__degree': 3} 

Best R2 score: 0.9598657981553396


In [18]:
# Score the tuned polynomial regression on the training split it was fitted on.
print("\nEvaluation for tuned Linear Regression Model\n")
evaluateModel(linearRegression_regressor_tuned, X_train_transformed, y_train)


Evaluation for tuned Linear Regression Model

Mean Absolute Error: 50286.66308857808
Mean Squared Error: 5368562630.278031
Root Mean Squared Error: 73270.47584312546

R2 Score: 0.983092058299308


In [19]:
# Score the tuned polynomial regression on the held-out test split.
print("\nTesting the tuned Linear Regression Model\n")
evaluateModel(linearRegression_regressor_tuned, X_test_transformed, y_test)


Testing the tuned Linear Regression Model

Mean Absolute Error: 73083.6624009324
Mean Squared Error: 11479116180.884602
Root Mean Squared Error: 107140.63739256268

R2 Score: 0.9643676748956342


## **3. K-Nearest Neighbors Regressor** 

In [20]:
# Odd neighbour counts from 1 to 13.
knn_param_grid = dict(n_neighbors=list(range(1, 14, 2)))

knn_regressor_tuned = tuneModel(
    estimator=knn_regressor,
    X_train=X_train_transformed,
    y_train=y_train,
    param_grid=knn_param_grid,
    cv=5,
)

Best parameters: {'n_neighbors': 13} 

Best R2 score: 0.3530567607408912


In [21]:
# Score the tuned KNN regressor on the training split it was fitted on.
print("\nEvaluation for tuned KNN Regressor Model\n")
evaluateModel(knn_regressor_tuned, X_train_transformed, y_train)


Evaluation for tuned KNN Regressor Model

Mean Absolute Error: 334245.1421195684
Mean Squared Error: 170681912401.75092
Root Mean Squared Error: 413136.67520779476

R2 Score: 0.46244832686212667


In [22]:
# Score the tuned KNN regressor on the held-out test split.
print("\nTesting the tuned KNN Regressor Model\n")
evaluateModel(knn_regressor_tuned, X_test_transformed, y_test)


Testing the tuned KNN Regressor Model

Mean Absolute Error: 372400.97612814535
Mean Squared Error: 209049193400.26334
Root Mean Squared Error: 457218.9775154388

R2 Score: 0.3510903884353235
