# 5. Data Preprocessing

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Read the data
data= pd.read_csv('Train_cleaned.csv')

In [8]:
# create a copy of original dataset
df = data.copy()
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [9]:
# Check the null values
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Impute the null values of Item_Weight and Outlet_Size columns.
In many cases to impute the null values we can use mean values for numerical data and mode mode values for categorical data.


But here the data the null values are high we follow the traditional method the data will get baised and this may affect the prediction of Item_Outlet_Sales.


1.   Predict the null values in the Item_Weight column using the independent features.
2.   Next, predict the null values in the Outlet_Size column using the independent features.



### 5.1 Predict null values in Item_Weight column

In [10]:
# Slice the required columns to predict null values Item_Weight column
df1= df[['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP']]

In [11]:
df1.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095
4,NCD19,8.93,Low Fat,0.0,Household,53.8614


In [12]:
# Null values
df1.isnull().sum()

Item_Identifier        0
Item_Weight         1463
Item_Fat_Content       0
Item_Visibility        0
Item_Type              0
Item_MRP               0
dtype: int64

In [13]:
# Save the null records seperately which will be used for prediction
null_records1 = df1[df1.isnull().any(axis=1)]

In [14]:
# Check the shape of null records
null_records1.shape

(1463, 6)

In [15]:
# Drop the null records in the df1 dataframe
df1.dropna(inplace=True)

In [16]:
# Now df1 is free from null values
df1.isnull().sum()

Item_Identifier     0
Item_Weight         0
Item_Fat_Content    0
Item_Visibility     0
Item_Type           0
Item_MRP            0
dtype: int64

In [17]:
# Import required libraries for model building
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [18]:
# OnehotEncoding for categorical columns
categorical_processor=Pipeline([('oe',OneHotEncoder(sparse_output=False, drop= 'first', handle_unknown= 'ignore'))])

In [19]:
# Use Column Transformer
preprocessor= ColumnTransformer([
    ('scaler', MinMaxScaler(),['Item_Visibility','Item_MRP']),
    ('categorical_processor',categorical_processor,['Item_Identifier', 'Item_Fat_Content','Item_Type'])
])

Use "pycaret" library for quick view on the performace metrics to know which algorithm will perform better on the data.
Choose the best algorithm.

Linear Regressin performs better.

In [20]:
# Make the pipeline
pipeline1 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [21]:
# Split the features and target columm
x1 = df1.drop(columns=['Item_Weight'],axis=1)
y1 = df1['Item_Weight']

# Split the train,test data
x1_train,x1_test,y1_train,y1_test = train_test_split(x1,y1,test_size=0.3,random_state=42)

In [22]:
# Shape of x_train1,y_train1
print(x1_train.shape)
print(x1_test.shape)

(4942, 5)
(2118, 5)


In [23]:
# Fit the model on train data
model1 = pipeline1.fit(x1_train,y1_train)

In [24]:
from sklearn import set_config

set_config(display='diagram')  # Enable graphviz visualization
pipeline1


In [25]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Check the r2_socre
from sklearn.metrics import r2_score

y1_pred= model1.predict(x1_test)
print(r2_score(y1_test,y1_pred))

0.9595626388203002


In [27]:
# Check the training and testing accuracy
print(model1.score(x1_train,y1_train))
print(model1.score(x1_test,y1_test))

1.0
0.9595626388203002


In [28]:
# Now we are going to fit the model on whole data in df1. Next we will predict the null values
model1_final=pipeline1.fit(x1,y1)

In [29]:
null_records1.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622
18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834
21,FDW12,,Regular,0.0354,Baking Goods,144.5444
23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938
29,FDC14,,Regular,0.072222,Canned,43.6454


In [30]:
# Drop the target column
null_records1.drop(columns=['Item_Weight'],inplace=True)

In [31]:
# Predict the null records
Item_weight_nan = model1_final.predict(null_records1)

Predict the null records in Item_Weight column, impute the values which are near to the existing Item_Weights.

In [32]:
# Unique weights
unique_weights=np.array(df['Item_Weight'].unique())

In [33]:
pd.DataFrame(unique_weights).nunique()

0    415
dtype: int64

We have 415 unique item_weights, now impute the nearest weights to the predicted values

In [34]:
nan_indices1 = df['Item_Weight'].isnull()

In [35]:
# Find closest unique values for each predicted weight
closest_values = [min(unique_weights, key=lambda x: abs(x - pred)) for pred in Item_weight_nan]


In [36]:
# Insert the predicted null values
df.loc[nan_indices1, 'Item_Weight'] = closest_values

In [37]:
pd.DataFrame(closest_values).nunique()

0    357
dtype: int64

In [38]:
# Compare the value_counts before and apply replacing null values
df['Item_Weight'].value_counts()

Item_Weight
12.150    105
17.600    102
11.800     90
13.650     89
15.100     86
         ... 
5.675       2
6.520       2
8.800       2
7.890       2
5.210       2
Name: count, Length: 415, dtype: int64

In [39]:
# Once again check the dataframe whether all null values in Item_Weight are filled
df.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

### 5.2 Predict the null values in Outlet_Size column

In [40]:
# Slice the columns required for Outlet_Size prediction
df2=df[['Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type']]

In [41]:
df2.head()

Unnamed: 0,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,OUT010,1998,,Tier 3,Grocery Store
4,OUT013,1987,High,Tier 3,Supermarket Type1


In [42]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Outlet_Identifier          8523 non-null   object
 1   Outlet_Establishment_Year  8523 non-null   int64 
 2   Outlet_Size                6113 non-null   object
 3   Outlet_Location_Type       8523 non-null   object
 4   Outlet_Type                8523 non-null   object
dtypes: int64(1), object(4)
memory usage: 333.1+ KB


In [43]:
# Value counts
df['Outlet_Size'].value_counts()

Outlet_Size
Medium    2793
Small     2388
High       932
Name: count, dtype: int64

In [44]:
# Seperate the null records in Outlet_size column
null_records2= df2[df2['Outlet_Size'].isnull()]

In [45]:
# Drop the null value
df2.dropna(inplace=True)

In [46]:
# Drop the Outlet_size column from null_records2 dataframe
null_records2.drop(columns='Outlet_Size',inplace=True)

In [47]:
df2.shape

(6113, 5)

In [48]:
# Seperate the Outlet_Size column which will be useful for prediction
x2= df2.drop(columns='Outlet_Size',axis=1)
y2=df2['Outlet_Size']

In [49]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [50]:
cat_processor2=Pipeline([('oe',OneHotEncoder(sparse_output=False, drop= 'first', handle_unknown= 'ignore'))])

In [51]:
# Column Transformer
preprocessor2= ColumnTransformer([
    ('scaler', MinMaxScaler(),['Outlet_Establishment_Year']),
    ('categorical_processor',cat_processor2,['Outlet_Identifier', 'Outlet_Location_Type','Outlet_Type'])
])

Use "pycaret" library for quick view on the performace metrics to know which algorithm will perform better on the data.
Choose the best algorithm.

Logistic Regression performs better.

In [52]:
# Pipeline
pipeline2 = Pipeline([
    ('preprocessor', preprocessor2),
    ('regressor', LogisticRegression())
])

In [53]:
from sklearn import set_config

set_config(display='diagram')  # Enable graphviz visualization
pipeline2


In [54]:
# Split the data into train and test
from sklearn.model_selection import train_test_split
x2_train,x2_test,y2_train,y2_test= train_test_split(x2,y2,test_size=0.3,random_state=42)

In [55]:
# Fit the model on train data
model2 = pipeline2.fit(x2_train,y2_train)

In [56]:
#2 Check the training and testing accuaracy
print(model2.score(x2_train,y2_train))
print(model2.score(x2_test,y2_test))

1.0
1.0


In [57]:
# Predict x2_test values
y2_pred = model2.predict(x2_test)

In [58]:
# Performance metrics
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y2_test,y2_pred))
print(classification_report(y2_test,y2_pred))

[[288   0   0]
 [  0 826   0]
 [  0   0 720]]
              precision    recall  f1-score   support

        High       1.00      1.00      1.00       288
      Medium       1.00      1.00      1.00       826
       Small       1.00      1.00      1.00       720

    accuracy                           1.00      1834
   macro avg       1.00      1.00      1.00      1834
weighted avg       1.00      1.00      1.00      1834



In [59]:
# Fit and Predict the null records
model2_final = pipeline2.fit(x2,y2)
outlet_size_nan = model2_final.predict(null_records2)

Impute the predicted values in the place of null_values

In [60]:
# Assign the indices
nan_indices2 = df['Outlet_Size'].isnull()

In [61]:
# Replace the null records
df.loc[nan_indices2, 'Outlet_Size'] = outlet_size_nan

In [62]:
# Check the null values
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

Now, the data has no null values.

In [63]:
# Check VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
col_list = []
for col in df.columns:
    if ((df[col].dtype != 'object') & (col !='Item_Outlet_Sales')):
        col_list.append(col)

X = df[col_list]
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,feature,VIF
0,Item_Weight,8.68679
1,Item_Visibility,2.640883
2,Item_MRP,6.130671
3,Outlet_Establishment_Year,15.188819


Outlet_Establishment_Year VIF is more than 5.

In [64]:
# Remove the Outlet_Establishment_Year column
df_final = df.drop(columns=['Outlet_Establishment_Year'],axis=1)

In [65]:
# Once again check VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
col_list = []
for col in df_final.columns:
  if df_final[col].dtype != 'object' and col !='Item_Outlet_Sales':
    col_list.append(col)

X = df_final[col_list]
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif_data

Unnamed: 0,feature,VIF
0,Item_Weight,4.444135
1,Item_Visibility,2.345004
2,Item_MRP,4.189643


VIF values are below 5. So, there is no multicollinerity.

In [66]:
# Final_dataset
df_final= df_final.drop(columns= ['Item_Identifier'])

In [67]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           8523 non-null   float64
 1   Item_Fat_Content      8523 non-null   object 
 2   Item_Visibility       8523 non-null   float64
 3   Item_Type             8523 non-null   object 
 4   Item_MRP              8523 non-null   float64
 5   Outlet_Identifier     8523 non-null   object 
 6   Outlet_Size           8523 non-null   object 
 7   Outlet_Location_Type  8523 non-null   object 
 8   Outlet_Type           8523 non-null   object 
 9   Item_Outlet_Sales     8523 non-null   float64
dtypes: float64(4), object(6)
memory usage: 666.0+ KB


In [68]:
# Numeric features
nf = []
for i in df_final.columns:
  if df_final[i].dtype!='object' and i!='Item_Outlet_Sales':
    nf.append(i)
nf

['Item_Weight', 'Item_Visibility', 'Item_MRP']

In [69]:
# Categorical features
cf = []
for i in df_final.columns:
  if df_final[i].dtype=='object':
    cf.append(i)
cf

['Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [70]:
# OneHotEncoding
df_enc = pd.get_dummies(df_final,columns= cf,prefix='',prefix_sep='',drop_first=True)

# Standardization
scaler = MinMaxScaler()

In [71]:
# Remove the target column

X= df_enc.drop(columns=['Item_Outlet_Sales'])
y= df_final['Item_Outlet_Sales']

In [72]:
# Split the train and test data

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_trainScaled = scaler.fit_transform(X_train)
X_testScaled = scaler.transform(X_test)

# Predictive Modelling:

In [73]:
# OLS
import statsmodels.api as sm

sm_X= sm.add_constant(X_trainScaled)
sm_model_lr = sm.OLS(y_train, sm_X).fit()

In [74]:
print(sm_model_lr.summary())

                            OLS Regression Results                            
Dep. Variable:      Item_Outlet_Sales   R-squared:                       0.559
Model:                            OLS   Adj. R-squared:                  0.558
Method:                 Least Squares   F-statistic:                     307.9
Date:                Thu, 28 Dec 2023   Prob (F-statistic):               0.00
Time:                        15:47:34   Log-Likelihood:                -57674.
No. Observations:                6818   AIC:                         1.154e+05
Df Residuals:                    6789   BIC:                         1.156e+05
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.536e+15   2.06e+15      0.744      0.4

In [75]:
# LinearRegression
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_trainScaled,y_train)
lr.score(X_testScaled,y_test)

0.5792880197197388

In [76]:
# Use shuffle split to test for different shuffle species
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=123)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.57151682, 0.55588944, 0.56601721, 0.55307925, 0.57083349])

In [77]:
# Ridge
from sklearn.linear_model import Ridge

alpha_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
scores_ridge = []

for alpha in alpha_values:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    scores_ridge.append(ridge.score(X_test, y_test))

best_alpha = alpha_values[scores_ridge.index(max(scores_ridge))]
best_score = max(scores_ridge)

print("Best alpha:", best_alpha)
print("Best score:", best_score)

Best alpha: 10.0
Best score: 0.5797266829470489


In [78]:
# Lasso
from sklearn.linear_model import Lasso

alpha_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
scores_lasso = []

for alpha in alpha_values:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    scores_lasso.append(lasso.score(X_test, y_test))

best_alpha = alpha_values[scores_lasso.index(max(scores_lasso))]
best_score = max(scores_lasso)

print("Best alpha:", best_alpha)
print("Best score:", best_score)

Best alpha: 1.0
Best score: 0.5803232906675687


In [79]:
# Elastic Net
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV

en_model = ElasticNet()
alpha_values = [1e-5,1e-4,1e-3,1e-2,1e-1,1,10,100,1000,10000]
l1_ratios= np.linspace(0,1,11)

# Reduce the number of iterations for faster results
n_iter = 10

param_grid_en= {'alpha':alpha_values,'l1_ratio': l1_ratios}
random_search_en= RandomizedSearchCV(en_model,param_grid_en,cv=5,scoring= 'neg_mean_squared_error')
random_search_en.fit(X_train,y_train)
test_score_en= random_search_en.best_estimator_.score(X_train,y_train)

print(f"Best alpha value: { random_search_en.best_params_['alpha']}")
print(f"Best l1 ratio value: { random_search_en.best_params_['l1_ratio']}")
print(f"Best negative mean squared error: {random_search_en.best_score_}")
print(f"Best model test R^2 score: {test_score_en}")

Best alpha value: 0.001
Best l1 ratio value: 0.4
Best negative mean squared error: -1317530.5215841718
Best model test R^2 score: 0.5594378939498093


In [80]:
# KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

r2_scores = []

for n in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=n)
    knn.fit(X_trainScaled, y_train)
    y_pred_knn = knn.predict(X_testScaled)
    r2 = r2_score(y_test, y_pred_knn)
    r2_scores.append(r2)
    print(f"R2 score for n_neighbors={n}: {r2}")

best_n = np.argmax(r2_scores) + 1  # Get the best n_neighbors (adding 1 to convert 0-based index to 1-based)
print(f"\nBest R2 score achieved with n_neighbors={best_n}: {max(r2_scores)}")

R2 score for n_neighbors=1: 0.1943016231903334
R2 score for n_neighbors=2: 0.4024981740379282
R2 score for n_neighbors=3: 0.4603463080193373
R2 score for n_neighbors=4: 0.47181583719090736
R2 score for n_neighbors=5: 0.48286969083107745
R2 score for n_neighbors=6: 0.4862908964534176
R2 score for n_neighbors=7: 0.49573242531745054
R2 score for n_neighbors=8: 0.490492565692387
R2 score for n_neighbors=9: 0.4913682385773357
R2 score for n_neighbors=10: 0.49104650489431545
R2 score for n_neighbors=11: 0.4907324474000443
R2 score for n_neighbors=12: 0.4857451929440798
R2 score for n_neighbors=13: 0.4860325711711335
R2 score for n_neighbors=14: 0.48128796536482354
R2 score for n_neighbors=15: 0.47804123209831306
R2 score for n_neighbors=16: 0.4704359498046744
R2 score for n_neighbors=17: 0.4663565210346907
R2 score for n_neighbors=18: 0.4605756598905949
R2 score for n_neighbors=19: 0.4572290062227332
R2 score for n_neighbors=20: 0.45161567211534626

Best R2 score achieved with n_neighbors=7:

In [81]:
# SVR
from sklearn.svm import SVR

# Initialize an empty list to store R2 scores for different C values (SVM parameter)
r2_scores_svm = []

for c_value in [0.1, 1, 10, 100]:  # You can adjust the range of C values as needed
    svm = SVR(C=c_value)
    svm.fit(X_trainScaled, y_train)
    y_pred_svm = svm.predict(X_testScaled)
    r2_svm = r2_score(y_test, y_pred_svm)
    r2_scores_svm.append(r2_svm)
    print(f"R2 score for C={c_value}: {r2_svm}")

best_c = [0.1, 1, 10, 100][np.argmax(r2_scores_svm)]  # Get the best C value
best_r2_svm = max(r2_scores_svm)
print(f"Best R2 score achieved with C={best_c}: {best_r2_svm}")

R2 score for C=0.1: -0.026295016442235752
R2 score for C=1: 0.01843115535533424
R2 score for C=10: 0.27535924385400545
R2 score for C=100: 0.5566482505658119
Best R2 score achieved with C=100: 0.5566482505658119


In [82]:
# DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid_dt = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10]
}

dt_reg = DecisionTreeRegressor()

dt_reg = GridSearchCV(dt_reg, param_grid=param_grid_dt, cv=5, scoring='r2')
dt_reg.fit(X_train, y_train)

best_dt = dt_reg.best_estimator_
best_score_dt = dt_reg.best_score_

print("Best Decision Tree parameters:", best_dt)
print("Best Decision Tree R2 score:", best_score_dt)

Best Decision Tree parameters: DecisionTreeRegressor(max_depth=5, min_samples_split=10)
Best Decision Tree R2 score: 0.5790425934663475


In [83]:
# RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

rf_reg = RandomForestRegressor()

rf_reg =RandomizedSearchCV(rf_reg, param_distributions=param_grid_rf, cv=5, scoring='r2')
rf_reg.fit(X_train, y_train)

best_rf = rf_reg.best_estimator_
best_score_rf = rf_reg.best_score_

print("Best Random Forest parameters:", best_rf)
print("Best Random Forest R2 score:", best_score_rf)


Best Random Forest parameters: RandomForestRegressor(max_depth=5, min_samples_split=10, n_estimators=300)
Best Random Forest R2 score: 0.5927189115507503


In [84]:
# GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

gb_reg = GradientBoostingRegressor()

gb_reg = RandomizedSearchCV(gb_reg, param_distributions=param_grid_gb, cv=5, scoring='r2')
gb_reg.fit(X_trainScaled, y_train)

best_gb = gb_reg.best_estimator_
best_score_gb = gb_reg.best_score_

print("Best Gradient Boosted Regressor parameters:", best_gb)
print("Best Gradient Boosted Regressor R2 score:", best_score_gb)

Best Gradient Boosted Regressor parameters: GradientBoostingRegressor()
Best Gradient Boosted Regressor R2 score: 0.5835646285084145


In [85]:
# XGBRegressor
from xgboost import XGBRegressor

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

xgb_reg = XGBRegressor()

xgb_reg = RandomizedSearchCV(xgb_reg, param_distributions=param_grid_xgb, cv=5, scoring='r2')
xgb_reg.fit(X_trainScaled, y_train)

best_xgb = xgb_reg.best_estimator_
best_score_xgb = xgb_reg.best_score_

print("Best XGBoost Regressor parameters:", best_xgb)
print("\nBest XGBoost Regressor R2 score:", best_score_xgb)


Best XGBoost Regressor parameters: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=300, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

Best XGBoost Regressor R2 score: 0.5876922837921044


In [88]:
# Choosing n_estimators=200, learning_rate=0.01, max_depth=5
import xgboost as xgb

final_model_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.01, max_depth=5)

# Retraining the model with the best parameters on the entire training set
final_model_xgb.fit(X_trainScaled, y_train)

In [89]:
# Predicting the test values

y_pred = final_model_xgb.predict(X_testScaled)

score = r2_score(y_test,y_pred)
print('r2_score:',score)

r2_score: 0.6016341149262132


Adjusted R2 = 1 – [(1-R2)*(n-1)/(n-k-1)]

where:

R2: The R2 of the model n: The number of observations k: The number of predictor variables

In [90]:
# Adjusted R-squared
1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)

0.5932801269228684

In [91]:
from sklearn.preprocessing import OrdinalEncoder

In [92]:
# Initailize preprocessor
numerical_processor = Pipeline([('scaler',MinMaxScaler())])
categorical_processor = Pipeline([('onehot',OneHotEncoder(handle_unknown='ignore'))])
ordinal_processor = Pipeline([('ordinal',OrdinalEncoder())])

In [93]:
preprocessor = ColumnTransformer([
    ('numerical',numerical_processor,['Item_Weight','Item_Visibility','Item_MRP']),
    ('onehot',categorical_processor,['Item_Fat_Content','Item_Type','Outlet_Type']),
    ('ordinal',ordinal_processor,['Outlet_Size','Outlet_Location_Type'])
])

In [94]:
# Pipeline3
pipeline3 = Pipeline([
    ('preprocessor',preprocessor),
    ('model',xgb.XGBRegressor(n_estimators=200, learning_rate=0.01, max_depth=5))
])

In [95]:
# Split the target column
X_final = df.drop(columns=['Item_Outlet_Sales'])
y_final = df['Item_Outlet_Sales']

In [96]:
# Split the train,test data
X_trainFinal,X_testFinal,y_trainFinal,y_testFinal = train_test_split(X_final,y_final,test_size=0.2,random_state=42)

In [97]:
model = pipeline3.fit(X_trainFinal,y_trainFinal)

In [98]:
# Enable graphviz visualization
from sklearn import set_config

set_config(display='diagram')
pipeline3

In [99]:
# Predict the test values
y_predFinal = model.predict(X_testFinal)
print(r2_score(y_testFinal,y_predFinal))

0.6042946874501158


In [100]:
# Fit the model using whole data
model3_final = pipeline3.fit(X_final,y_final)

Now build a pipeline with final model.

In [101]:
# Use Base Estimator, Transform Mixin for preprocessing errors in Item_Fat_Content
from sklearn.base import BaseEstimator, TransformerMixin

class FatContent_pcsr(BaseEstimator,TransformerMixin):
  def fit(self,X,y=None):
    return self
  def transform(self,X):
    X['Item_Fat_Content'] = X['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})
    return X

In [102]:
# Pipeline1 which we made to predict the null values in the Item_Weight column
from sklearn import set_config

set_config(display='diagram')  # Enable graphviz visualization
pipeline1


In [103]:
# Use Base Estimator, Transform Mixin for imputing null values in Item_Weight column
class ItemWeightImputer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        null_records = X[X['Item_Weight'].isnull()].copy()
        null_records.drop(columns=['Item_Weight'], inplace=True)
        Item_weight_nan = self.model.predict(null_records)
        unique_weights = np.array(X['Item_Weight'].unique())
        nan_indices = X['Item_Weight'].isnull()
        closest_values = [min(unique_weights, key=lambda x: abs(x - pred)) for pred in Item_weight_nan]
        X.loc[nan_indices, 'Item_Weight'] = closest_values
        return X

In [104]:
# Pipeline2 which we made to predict the null values in the Oultet_Size column
from sklearn import set_config

set_config(display='diagram')  # Enable graphviz visualization
pipeline2


In [105]:
# Use Base Estimator, Transform Mixin for imputing null values in Outlet_Size column
class OutletSizeImputer(BaseEstimator, TransformerMixin):
  def __init__(self,model):
    self.model = model

  def fit(self, X, y=None):
    return self

  def transform(self,X):
    null_records = X[X['Outlet_Size'].isnull()].copy()
    null_records.drop(columns=['Outlet_Size'], inplace=True)
    Outlet_Size_nan = self.model.predict(null_records)
    nan_indices = X['Outlet_Size'].isnull()
    X.loc[nan_indices, 'Outlet_Size'] = Outlet_Size_nan
    return X


In [106]:
# Pipeline3 to predict the Item_Outlet_Size using the cleaned data
from sklearn import set_config

set_config(display='diagram')
pipeline3

In [107]:
# Final pipeline
from sklearn.pipeline import make_pipeline
final_pipeline = make_pipeline(FatContent_pcsr(),
                               ItemWeightImputer(model=model1_final),
                               OutletSizeImputer(model2_final),
                               model3_final)

In [108]:
# Final Pipeline from start to end including all steps
from sklearn import set_config

set_config(display='diagram')
final_pipeline

Test the final pipeline on the raw data and check whether it is giving the same r2_score.

In [109]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [110]:
# Split the data into train and test
p_train,p_test,q_train,q_test = train_test_split(data.drop(columns=['Item_Outlet_Sales']),df['Item_Outlet_Sales'],test_size=0.2,random_state=42)

In [111]:
# Fit the train data
model_check = final_pipeline.fit(p_train,q_train)

In [112]:
# Predict the test data and check the r2_score
q_check= model_check.predict(p_test)
print(r2_score(q_test,q_check))

0.6042946874501158


We are getting the same r2_score

### Final Test data:

In [113]:
# Read the final test data
df_test = pd.read_csv('Test.csv')

In [114]:
df_test['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    3396
Regular    1935
LF          206
reg          78
low fat      66
Name: count, dtype: int64

In [115]:
# Check the shape of the test data
df_test.shape

(5681, 11)

In [116]:
# Predict the Item_Outlet_Sales
Item_Outlet_Sales_test= final_pipeline.predict(df_test)

In [117]:
Item_Outlet_Sales_test

array([1757.9362 , 1500.9321 ,  771.27576, ..., 1977.1925 , 3387.9526 ,
       1405.1381 ], dtype=float32)

In [118]:
# Check the length of the predicted values to cross verify whether all the records are predicted or not
len(Item_Outlet_Sales_test)

5681

In [119]:
# Add the predicted Item_Outlet_Sales values to the df_test
df_test['Item_Outlet_Sales'] = Item_Outlet_Sales_test

In [120]:
df_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,1757.936157
1,FDW14,8.3,Regular,0.038428,Dairy,87.3198,OUT017,2007,Small,Tier 2,Supermarket Type1,1500.932129
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,Medium,Tier 3,Grocery Store,771.275757
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,Small,Tier 2,Supermarket Type1,2417.337891
4,FDY38,13.6,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,5810.654297


In [122]:
# Save the predicted csv
df_test.to_csv('Test_predicted.csv',index=False)

In [None]:
# Save the final pipeline
import pickle

with open('StoreSales_Prediction_model.pkl','wb') as file:
  pickle.dump(final_pipeline,file)
