<a href="https://colab.research.google.com/github/sheha919/Food-sales-predictions/blob/main/sales_prediction_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [338]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import set_config
set_config(display='diagram')

# Functions

In [339]:
#function for evaluation
def eval_model(act_train, pred_train, act_test, pred_test):
  """Summarise regression metrics for the train and test splits.

  Parameters
  ----------
  act_train, pred_train : array-like
      Actual and predicted target values for the training split.
  act_test, pred_test : array-like
      Actual and predicted target values for the test split.

  Returns
  -------
  pandas.DataFrame
      One row per metric (MAE, MSE, RMSE, R2), indexed 1 to 4, with the
      columns 'Metrics', 'Train Score' and 'Test Score'. MAE, MSE and RMSE
      are strings rounded to two decimals with a rupee-sign prefix; R2 is
      the unrounded float.
  """
  def _scores(actual, predicted):
    # One column of the result table, in the order MAE, MSE, RMSE, R2.
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, predicted)
    # NOTE: MSE is in squared currency units; the currency prefix is kept
    # only so the table format stays unchanged.
    return [f'\u20B9{round(mae,2)}', f'\u20B9{round(mse,2)}', f'\u20B9{round(rmse,2)}', r2]

  # Build the frame in a single call instead of enlarging an empty frame
  # one cell at a time with .loc.
  return pd.DataFrame(
      {'Metrics': ['MAE', 'MSE', 'RMSE', 'R2'],
       'Train Score': _scores(act_train, pred_train),
       'Test Score': _scores(act_test, pred_test)},
      index = [1, 2, 3, 4])

In [340]:
#function for final evaluation
def eval_model_final(pipe, act_train = None, act_test = None):
  """Return R2 and RMSE of a fitted pipeline on the train and test splits.

  Predictions are made on the notebook-level `X_train` and `X_test`.

  Parameters
  ----------
  pipe : fitted estimator or pipeline exposing `predict`.
  act_train, act_test : array-like, optional
      Actual target values. When omitted, the notebook-level `y_train` and
      `y_test` are looked up at call time.

  Returns
  -------
  tuple
      (train_r2, test_r2, train_rmse, test_rmse). The RMSE values are
      strings rounded to two decimals with a rupee-sign prefix.
  """
  # Defaults are resolved here rather than in the signature: this cell runs
  # before the train/test split exists, so `act_train = y_train` in the
  # signature raises NameError on a fresh kernel (and would otherwise freeze
  # whatever y_train happened to be when the cell was executed).
  if act_train is None:
    act_train = y_train
  if act_test is None:
    act_test = y_test

  # Predict once per split and reuse the result for both metrics.
  pred_train = pipe.predict(X_train)
  pred_test = pipe.predict(X_test)

  train_rmse = np.sqrt(mean_squared_error(act_train, pred_train))
  test_rmse = np.sqrt(mean_squared_error(act_test, pred_test))

  train_r2 = r2_score(act_train, pred_train)
  test_r2 = r2_score(act_test, pred_test)

  return (train_r2, test_r2, f'\u20B9{round(train_rmse,2)}', f'\u20B9{round(test_rmse,2)}')

# Load Data

In [341]:
# Location of the raw CSV (Colab upload directory); adjust when running elsewhere
DATA_PATH = '/content/sales_predictions.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [342]:
#make a copy of df
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps
ml_df = df.copy()
ml_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Explore Data set

In [343]:
# Column dtypes and non-null counts of the working copy
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [344]:
# (rows, columns) of the working copy
ml_df.shape

(8523, 12)

In [345]:
#checking for duplicates
# Number of rows that exactly repeat an earlier row
ml_df.duplicated().sum()

0

There are no duplicate rows in the data set.

In [346]:
#checking for missing data
# Missing-value count per column
ml_df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

# Data Cleaning

In [347]:
# Distribution of the non-missing 'Outlet_Size' categories
# (value_counts() excludes NaN by default, so missing entries are not listed here)
ml_df['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [348]:
# Checking for inconsistent category labels
ml_df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [349]:
# Harmonise the inconsistent fat-content labels.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selected column: the chained in-place form
# operates on an intermediate object and is not guaranteed to update ml_df
# (it is deprecated and has no effect under pandas copy-on-write).
ml_df['Item_Fat_Content'] = ml_df['Item_Fat_Content'].replace({'LF':'Low Fat','low fat':'Low Fat', 'reg':'Regular'})
ml_df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [350]:
#ordinal encoding
outlet_size_order = {'Small':0, 'Medium':1, 'High':2}
# Assign back instead of using a chained replace(inplace=True), which is not
# guaranteed to update ml_df. The explicit cast pins the float dtype (NaN
# for missing sizes) so the numeric column selector picks the column up,
# without relying on replace() silently downcasting an object column.
ml_df['Outlet_Size'] = ml_df['Outlet_Size'].replace(outlet_size_order).astype('float64')
ml_df['Outlet_Size'].value_counts()

1.0    2793
0.0    2388
2.0     932
Name: Outlet_Size, dtype: int64

In [351]:
# Re-check dtypes and non-null counts after the cleaning steps
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   float64
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(5), int64(1), object(6)
memory usage: 799.2+ KB


Here, the ordinal 'Outlet_Size' column has been converted to numeric codes. However, there are still missing values in both 'Outlet_Size' and 'Item_Weight'; these are handled by imputation in the preprocessing steps below.

In [352]:
# Summary statistics of the numeric columns
ml_df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,6113.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,0.761819,2181.288914
std,4.643456,0.051598,62.275067,8.37176,0.697463,1706.499616
min,4.555,0.0,31.29,1985.0,0.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,0.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,1.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,2.0,13086.9648


# Validation Split

In [353]:
# Target to predict, plus the columns left out of the feature matrix
target_col = 'Item_Outlet_Sales'
excluded_cols = ['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year']

y = ml_df[target_col]
X = ml_df.drop(columns = [target_col] + excluded_cols)

# Split before any fitting so the preprocessing only learns from the training rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           5285 non-null   float64
 1   Item_Fat_Content      6392 non-null   object 
 2   Item_Visibility       6392 non-null   float64
 3   Item_Type             6392 non-null   object 
 4   Item_MRP              6392 non-null   float64
 5   Outlet_Size           4580 non-null   float64
 6   Outlet_Location_Type  6392 non-null   object 
 7   Outlet_Type           6392 non-null   object 
dtypes: float64(4), object(4)
memory usage: 449.4+ KB


# Data Preprocessing

## Instantiate Column Selectors

In [354]:
# Select columns by dtype so each group can be given its own transformer
num_selector = make_column_selector(dtype_include = 'number')
cat_selector = make_column_selector(dtype_include = 'object')

## Instantiate Transformers

In [355]:
#Imputers
num_imputer = SimpleImputer(strategy='median')

#Scaler
scaler = StandardScaler()

#One Hot encoder
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was later removed. Try the current keyword first and fall back
# to the old one so the cell runs on both older and newer versions.
try:
  ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
  ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

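Imputation is only required for the numeric columns that contain missing values ('Outlet_Size' and 'Item_Weight'). The 'median' strategy is selected because 'Outlet_Size' is an ordinal code and should not be filled with a fractional value such as the mean.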

## Instantiate Numerical Pipelines

In [356]:
#Numeric pipeline
# Missing values are imputed first, then the columns are standardised
num_pipeline = make_pipeline(num_imputer, scaler)

num_pipeline

We don't need a pipeline for categorical data since there is only one transformer.

## Instantiate ColumnTransformer

In [357]:
#Tuples for Column Transformer
# Each tuple pairs a transformer with the selector of the columns it applies to
num_tuple = (num_pipeline, num_selector)
cat_tuple = (ohe, cat_selector)

#column transformer
# All eight feature columns are float64 or object, so both selectors together
# already cover every column and the remainder setting has nothing left to pass through.
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough') # remainder = 'passthrough' is not necessary

preprocessor

## Transform Data


In [358]:
# Fit the imputer, scaler and encoder on the training split only
preprocessor.fit(X_train)

In [359]:
# Apply the transformations fitted on the training split to both splits
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [360]:
# Preview the transformed training matrix
X_train_processed

array([[ 0.82748547, -0.71277507,  1.82810922, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.56664432, -1.29105225,  0.60336888, ...,  0.        ,
         1.        ,  0.        ],
       [-0.12102782,  1.81331864,  0.24454056, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.12389588, -0.92052713,  1.52302674, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.77599877, -0.2277552 , -0.38377708, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.82748547, -0.95867683, -0.73836105, ...,  1.        ,
         0.        ,  0.        ]])

## Inspect the Result

In [361]:
# Sanity checks on the transformed arrays: no NaN left, numeric dtype, expected shape
print('Number of missing values in training data: ', np.isnan(X_train_processed).sum().sum())
print('Number of missing values in testing data: ', np.isnan(X_test_processed).sum().sum())
print('\n')
print('Data type in X_train_processed: ', X_train_processed.dtype)
print('Data type in X_test_processed: ', X_test_processed.dtype)
print('\n')
print('Shape of training data: ', X_train_processed.shape)

Number of missing values in training data:  0
Number of missing values in testing data:  0


Data type in X_train_processed:  float64
Data type in X_test_processed:  float64


Shape of training data:  (6392, 29)


# Linear Regression Model

## Instantiate a linear regression model

In [362]:
# Ordinary least-squares regressor with default settings
lin_reg = LinearRegression()

## Instantiate Pipelines to combine ColumnTransformer and linear regression model

In [363]:
# Chain preprocessing and the model so raw feature frames can be passed to fit/predict
lin_reg_pipe = make_pipeline(preprocessor, lin_reg)
lin_reg_pipe

## Fit the training data to the model

In [364]:
# Fit preprocessing and regressor on the training split
lin_reg_pipe.fit(X_train, y_train)

## Make Predictions

In [365]:
# Predictions on both splits, used by eval_model below
lin_reg_train_pred = lin_reg_pipe.predict(X_train)
lin_reg_test_pred = lin_reg_pipe.predict(X_test)

## Evaluate the model

In [366]:
# MAE / MSE / RMSE / R2 of the linear regression on train and test
eval_model(y_train, lin_reg_train_pred, y_test, lin_reg_test_pred)

Unnamed: 0,Metrics,Train Score,Test Score
1,MAE,₹847.4,₹805.42
2,MSE,₹1300470.38,₹1197833.13
3,RMSE,₹1140.38,₹1094.46
4,R2,0.560571,0.565842


# Decision Tree Model

## Instantiate decision tree model

In [367]:
# Unconstrained tree (no depth limit); random_state fixed for reproducibility
dec_tree = DecisionTreeRegressor(random_state = 42)

## Initial Model

### Instantiate pipelines to combine ColumnTransformer and decision tree model

In [368]:
# Chain the shared preprocessor with the decision tree
dec_tree_pipe = make_pipeline(preprocessor, dec_tree)
dec_tree_pipe

### Fit the training data to the model

In [369]:
# Fit preprocessing and tree on the training split
dec_tree_pipe.fit(X_train, y_train)

### Make Predictions

In [370]:
# Predictions of the unconstrained tree on both splits
dec_tree_train_pred = dec_tree_pipe.predict(X_train)
dec_tree_test_pred = dec_tree_pipe.predict(X_test)

### Evaluate the Model

In [371]:
# Score the unconstrained tree on each split; a large train/test gap signals overfitting
dec_tree_train_score, dec_tree_test_score = (
    dec_tree_pipe.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test)))
print(f'Train score: {dec_tree_train_score}')
print(f'Test score: {dec_tree_test_score}')

Train score: 1.0
Test score: 0.1273297420940004


According to the train and test R2 scores, we can observe that the model is overfitting. Therefore, the hyperparameters need to be tuned.

## Tuning the model

In [372]:
# List the tunable hyperparameters and their current values
dec_tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [373]:
#max_depth value
# Depth reached by the fitted, unconstrained tree; used as the upper bound of the depth search
dec_tree.get_depth()

42

In [374]:
#optimizing the max_depth value
# NOTE: the depth is later chosen by its score on the test split, so the test
# score reported for the chosen depth is optimistic. Cross-validation on the
# training split would give an unbiased choice.

# Sweep every depth up to the one reached by the unconstrained tree fitted
# above (42 in the recorded run) instead of hard-coding the upper bound.
depths = range(1, dec_tree.get_depth() + 1)

score_rows = []
for depth in depths:
  # Loop-local names: the fitted `dec_tree` / `dec_tree_pipe` from the
  # earlier cells are no longer overwritten, so those cells stay re-runnable.
  depth_tree = DecisionTreeRegressor(random_state = 42, max_depth= depth)
  depth_pipe = make_pipeline(preprocessor, depth_tree)
  depth_pipe.fit(X_train, y_train)

  score_rows.append({'Train Score': depth_pipe.score(X_train, y_train),
                     'Test Score': depth_pipe.score(X_test, y_test)})

# Build the frame once from the collected rows; the columns are numeric
# instead of the object dtype produced by filling an empty frame cell by cell.
scores = pd.DataFrame(score_rows, index = depths)

In [375]:
# Scores for the first few depths (index = max_depth)
scores.head()

Unnamed: 0,Train Score,Test Score
1,0.237797,0.229683
2,0.431641,0.433778
3,0.524218,0.524222
4,0.582625,0.584005
5,0.603933,0.594709


In [376]:
# Sort by test score, best first; the index of the top row is the chosen max_depth
scores = scores.sort_values(by = 'Test Score', ascending = False)
scores.head()

Unnamed: 0,Train Score,Test Score
5,0.603933,0.594709
4,0.582625,0.584005
6,0.615063,0.582337
7,0.626452,0.578545
8,0.642731,0.566229


The maximum test score is obtained when max_depth = 5.

## Optimized model

In [377]:
# Depth with the highest test score in the sweep above
best_depth = 5

dec_tree_opt = DecisionTreeRegressor(max_depth = best_depth, random_state = 42)
dec_tree_opt_pipe = make_pipeline(preprocessor, dec_tree_opt)
dec_tree_opt_pipe.fit(X_train, y_train)

# Predictions of the depth-limited tree on both splits
dec_tree_opt_train_pred, dec_tree_opt_test_pred = (
    dec_tree_opt_pipe.predict(split) for split in (X_train, X_test))

### Evaluate the model

In [378]:
# MAE / MSE / RMSE / R2 of the depth-limited tree on train and test
eval_model(y_train, dec_tree_opt_train_pred, y_test, dec_tree_opt_test_pred)

Unnamed: 0,Metrics,Train Score,Test Score
1,MAE,₹762.64,₹738.36
2,MSE,₹1172142.04,₹1118187.95
3,RMSE,₹1082.66,₹1057.44
4,R2,0.603933,0.594709


# Results & Discussion

In [379]:
# Display names and the fitted pipelines they refer to, in matching order
model = ['Linear Regression', 'Decision Tree']
pipe = [lin_reg_pipe, dec_tree_opt_pipe]

# Collect one (name, train R2, test R2, train RMSE, test RMSE) row per model
# and build the frame once, instead of enlarging an empty frame with .loc and
# a four-way tuple unpack on every iteration.
final_eval_results = pd.DataFrame(
    [(name, *eval_model_final(fitted_pipe)) for name, fitted_pipe in zip(model, pipe)],
    columns = ['Model','Train R2', 'Test R2', 'Train RMSE', 'Test RMSE'])


In [380]:
# Use the model name as the row label. The result is re-assigned rather than
# mutated in place, and the step is skipped once 'Model' is already the index,
# so re-running this cell does not raise a KeyError.
if 'Model' in final_eval_results.columns:
  final_eval_results = final_eval_results.set_index('Model')

In [381]:
# Side-by-side comparison of the two fitted models
final_eval_results

Unnamed: 0_level_0,Train R2,Test R2,Train RMSE,Test RMSE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear Regression,0.560571,0.565842,₹1140.38,₹1094.46
Decision Tree,0.603933,0.594709,₹1082.66,₹1057.44


- The table above summarises the performance of the two models used in this work.
- In both models, the train and test R2 scores are approximately equal, which indicates that neither model is overfitting.
- Since all R2 scores are above 0.3, the models are not underfitting either.
- I conclude that the decision tree is the best-performing model, since it achieved:
  - the highest test R2 score
  - the lowest train and test RMSE values
  - the smallest difference between train and test RMSE