In [None]:
# Mount Google Drive at /content/drive so the CSV files stored there can be read
# by the cells below (Colab-only; this import is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the train and test CSV files from the mounted Drive folder.
# NOTE(review): both paths are hard-coded to one Drive layout; adjust them (or
# hoist the folder into a single constant) before running elsewhere.
train=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Regression analysis/Train.csv')
test=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Regression analysis/Test.csv')

In [None]:
# Tag every row with the file it came from. Without this marker the combined
# frame could not be split back into train and test after preprocessing.
train['source'], test['source'] = 'train', 'test'

# Stack the train rows on top of the test rows under a fresh 0..n-1 index.
data = pd.concat(objs=[train, test], ignore_index=True)



In [None]:
# Size of the combined frame as (rows, columns).
data.shape

(14204, 13)

In [None]:
# Preview the combined frame (the notebook renders a truncated view).
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,train
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1,,test
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,,test


In [None]:
# Dtypes and non-null counts per column; a count below the number of rows means
# the column contains missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                11765 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                10188 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          8523 non-null   float64
 12  source                     14204 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usag

In [None]:
# First rows of the combined frame (these come from the train file).
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [None]:
# Number of missing values in each column, before imputation.
data.isnull().sum()

Unnamed: 0,0
Item_Identifier,0
Item_Weight,2439
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,4016
Outlet_Location_Type,0


In [None]:
# Last rows of the combined frame. The test rows were appended after the train
# rows, so these are test rows and their Item_Outlet_Sales is empty.
data.tail()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
14199,FDB58,10.5,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test
14200,FDD47,7.6,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test
14201,NCO17,10.0,Low Fat,0.073529,Health and Hygiene,118.744,OUT045,2002,,Tier 2,Supermarket Type1,,test
14202,FDJ26,15.3,Regular,0.0,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,,test
14203,FDU37,9.5,Regular,0.10472,Canned,79.796,OUT045,2002,,Tier 2,Supermarket Type1,,test


In [None]:
# --- Handling missing values: Item_Weight ---

# Mean Item_Weight per Item_Identifier (pivot_table aggregates with the mean by
# default). An item whose weight is missing on every one of its rows does not
# get an entry in this table.
item_avg_weight= data.pivot_table(values='Item_Weight',
                                  index='Item_Identifier')

# Last-resort value for items that were never observed with a weight.
overall_avg_weight = data['Item_Weight'].mean()

def impute_weight(row):
  """Return the Item_Weight to store for one row of `data`.

  Parameters
  ----------
  row : pandas.Series
      A row providing the 'Item_Weight' and 'Item_Identifier' entries.

  Returns
  -------
  float
      The row's own weight when it is present; otherwise the mean weight of
      the rows sharing its Item_Identifier; otherwise, when that item has no
      known weight at all, the mean weight over the whole frame.
  """
  if not pd.isnull(row['Item_Weight']):
    return row['Item_Weight']

  item_id = row['Item_Identifier']
  # Guard the lookup: a bare .loc on an identifier that is absent from the
  # table raises KeyError and aborts the whole apply().
  if item_id in item_avg_weight.index:
    item_mean = item_avg_weight.loc[item_id].values[0]
    if not pd.isnull(item_mean):
      return item_mean
  return overall_avg_weight

data['Item_Weight']=data.apply(impute_weight, axis=1)

In [None]:
# --- Handling missing values: Outlet_Size ---

def _most_common_size(sizes):
    """Return the most frequent non-null value in `sizes`, or NaN if there is none."""
    modes = sizes.mode()
    # mode() ignores nulls, so it is empty when every value is missing;
    # indexing it unconditionally would raise.
    return modes.iloc[0] if len(modes) else np.nan

# Most frequent Outlet_Size for each Outlet_Type.
outlet_size_mode = data.pivot_table(values='Outlet_Size', index='Outlet_Type', aggfunc=_most_common_size)

# Last-resort value for outlet types that have no known size at all.
overall_size_mode = _most_common_size(data['Outlet_Size'])

def impute_outlet_size(row):
    """Return the Outlet_Size to store for one row of `data`.

    Parameters
    ----------
    row : pandas.Series
        A row providing the 'Outlet_Size' and 'Outlet_Type' entries.

    Returns
    -------
    object
        The row's own size when it is present; otherwise the most frequent
        size among outlets of the same Outlet_Type; otherwise, when that type
        has no known size, the most frequent size over the whole frame.
    """
    if not pd.isnull(row['Outlet_Size']):
        return row['Outlet_Size']

    outlet_type = row['Outlet_Type']
    # Guard the lookup so an outlet type missing from the table falls through
    # to the global mode instead of raising KeyError.
    if outlet_type in outlet_size_mode.index:
        type_mode = outlet_size_mode.loc[outlet_type].values[0]
        if not pd.isnull(type_mode):
            return type_mode
    return overall_size_mode

data['Outlet_Size'] = data.apply(impute_outlet_size, axis=1)


In [None]:
# Re-check missing values after imputation: Item_Weight and Outlet_Size should
# now report 0.
data.isnull().sum()

Unnamed: 0,0
Item_Identifier,0
Item_Weight,0
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,0
Outlet_Location_Type,0


Encode the categorical columns so that the ML model receives numeric values instead of text labels such as "Supermarket Type1".


In [None]:
# Dtypes after imputation; the object-dtype columns still hold text and need
# encoding before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                14204 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                14204 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          8523 non-null   float64
 12  source                     14204 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usag

In [None]:
# Names of the object-dtype (text) columns, i.e. the candidates for encoding.
data.select_dtypes('object').columns

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'source'],
      dtype='object')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes.
cols_to_encode= ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Identifier']

# One fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder would keep only the last column's mapping, and any consumer
# that has to encode new inputs the same way (or decode codes back to labels)
# needs every mapping: label_encoders[col].classes_[code] is the original label.
label_encoders = {}

# Run once on a freshly built `data`: re-running would fit the encoders on the
# already-encoded integers and lose the text labels.
# NOTE(review): in the displayed output 'Low Fat' is encoded as 1 and 'Regular'
# as 2, so at least one other label sorts before 'Low Fat'. Check
# data['Item_Fat_Content'].value_counts() for spelling variants of the same
# category before encoding.
for col in cols_to_encode:
  le = LabelEncoder()  # after the loop `le` is the encoder of the last column
  data[col] = le.fit_transform(data[col])
  label_encoders[col] = le



In [None]:
# Inspect the frame after label encoding: the encoded columns now hold integers.
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.30,1,0.016047,Dairy,249.8092,9,1999,1,0,1,3735.1380,train
1,DRC01,5.92,2,0.019278,Soft Drinks,48.2692,3,2009,1,2,2,443.4228,train
2,FDN15,17.50,1,0.016760,Meat,141.6180,9,1999,1,0,1,2097.2700,train
3,FDX07,19.20,2,0.000000,Fruits and Vegetables,182.0950,0,1998,2,2,0,732.3800,train
4,NCD19,8.93,1,0.000000,Household,53.8614,1,1987,0,2,1,994.7052,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,2,0.013496,Snack Foods,141.3154,8,1997,2,0,1,,test
14200,FDD47,7.60,2,0.142991,Starchy Foods,169.1448,3,2009,1,2,2,,test
14201,NCO17,10.00,1,0.073529,Health and Hygiene,118.7440,7,2002,2,1,1,,test
14202,FDJ26,15.30,2,0.000000,Canned,214.6218,2,2007,2,1,1,,test


In [None]:
# One-hot encode Item_Type into indicator columns named 'Item_Type_<category>'.
# drop_first=True omits the indicator of the first category, which is then
# represented by all remaining indicators being False.
# Not re-runnable on its own: Item_Type no longer exists after this cell.
data=pd.get_dummies(data, columns=['Item_Type'], drop_first=True)

In [None]:
# Inspect the frame after one-hot encoding Item_Type.
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,FDA15,9.30,1,0.016047,249.8092,9,1999,1,0,1,...,False,False,False,False,False,False,False,False,False,False
1,DRC01,5.92,2,0.019278,48.2692,3,2009,1,2,2,...,False,False,False,False,False,False,False,False,True,False
2,FDN15,17.50,1,0.016760,141.6180,9,1999,1,0,1,...,False,False,False,False,True,False,False,False,False,False
3,FDX07,19.20,2,0.000000,182.0950,0,1998,2,2,0,...,True,False,False,False,False,False,False,False,False,False
4,NCD19,8.93,1,0.000000,53.8614,1,1987,0,2,1,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,2,0.013496,141.3154,8,1997,2,0,1,...,False,False,False,False,False,False,False,True,False,False
14200,FDD47,7.60,2,0.142991,169.1448,3,2009,1,2,2,...,False,False,False,False,False,False,False,False,False,True
14201,NCO17,10.00,1,0.073529,118.7440,7,2002,2,1,1,...,False,False,True,False,False,False,False,False,False,False
14202,FDJ26,15.30,2,0.000000,214.6218,2,2007,2,1,1,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Dtypes after encoding; only Item_Identifier and source should still be object.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Item_Identifier                  14204 non-null  object 
 1   Item_Weight                      14204 non-null  float64
 2   Item_Fat_Content                 14204 non-null  int64  
 3   Item_Visibility                  14204 non-null  float64
 4   Item_MRP                         14204 non-null  float64
 5   Outlet_Identifier                14204 non-null  int64  
 6   Outlet_Establishment_Year        14204 non-null  int64  
 7   Outlet_Size                      14204 non-null  int64  
 8   Outlet_Location_Type             14204 non-null  int64  
 9   Outlet_Type                      14204 non-null  int64  
 10  Item_Outlet_Sales                8523 non-null   float64
 11  source                           14204 non-null  object 
 12  Item_Type_Breads  

In [None]:
# Item_Identifier was only needed for the weight imputation and is still text,
# so remove it before modelling. Not re-runnable on its own: a second run
# fails because the column is already gone.
data.drop(columns=['Item_Identifier'], inplace=True)

In [None]:
# Inspect the fully preprocessed frame.
data

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,9.30,1,0.016047,249.8092,9,1999,1,0,1,3735.1380,...,False,False,False,False,False,False,False,False,False,False
1,5.92,2,0.019278,48.2692,3,2009,1,2,2,443.4228,...,False,False,False,False,False,False,False,False,True,False
2,17.50,1,0.016760,141.6180,9,1999,1,0,1,2097.2700,...,False,False,False,False,True,False,False,False,False,False
3,19.20,2,0.000000,182.0950,0,1998,2,2,0,732.3800,...,True,False,False,False,False,False,False,False,False,False
4,8.93,1,0.000000,53.8614,1,1987,0,2,1,994.7052,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,10.50,2,0.013496,141.3154,8,1997,2,0,1,,...,False,False,False,False,False,False,False,True,False,False
14200,7.60,2,0.142991,169.1448,3,2009,1,2,2,,...,False,False,False,False,False,False,False,False,False,True
14201,10.00,1,0.073529,118.7440,7,2002,2,1,1,,...,False,False,True,False,False,False,False,False,False,False
14202,15.30,2,0.000000,214.6218,2,2007,2,1,1,,...,False,False,False,False,False,False,False,False,False,False


Data cleaning and encoding are done: one-hot encoding for Item_Type and label encoding for the other categorical columns. Next, split the combined data back into the train and test sets.

In [None]:
# Split the combined frame back into its two origins using the 'source' marker.
# .copy() gives each split its own data, so the in-place column drops applied
# to the splits afterwards cannot touch `data`.
train_df = data.loc[data['source'].eq('train')].copy()
test_df = data.loc[data['source'].eq('test')].copy()

In [None]:
## Remove the helper columns now that the split is done.
# 'source' has served its purpose in both splits. The test split also loses
# Item_Outlet_Sales, which is empty for every test row.

train_df.drop(columns=['source'], inplace=True)
test_df.drop(columns=['source', 'Item_Outlet_Sales'], inplace=True)

In [None]:
## Separate the target (y) from the features (X).

y = train_df['Item_Outlet_Sales']                   # value to predict
X = train_df.drop(columns=['Item_Outlet_Sales'])    # every remaining column

In [None]:
## Train-validation split
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the shuffle, and therefore the split, reproducible.

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid= train_test_split(X,y, test_size=0.2, random_state=47)

In [None]:
# Train the regression model: a random forest of 100 trees fitted on the
# training split. The fixed random_state makes the fit repeatable.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

model= RandomForestRegressor(n_estimators= 100, random_state=47)
model.fit(X_train,y_train)

In [None]:
# Predict sales for the held-out validation split.

y_pred=model.predict(X_valid)

In [None]:
# Evaluate on the validation split.
# RMSE is the square root of the mean squared error, in the units of the
# target; R² is the share of the target's variance the model explains.
validation_mse = mean_squared_error(y_valid, y_pred)
print("RMSE:", np.sqrt(validation_mse))
print("R² Score:", r2_score(y_valid, y_pred))

RMSE: 1153.2451812791999
R² Score: 0.5424498061594516


In [None]:
# Predict sales for the unlabelled test rows. test_df has had 'source' and
# 'Item_Outlet_Sales' dropped, which leaves the same feature columns as X.
test_pred=model.predict(test_df)

In [None]:
# Persist the fitted model to sales_model.pkl in the current working directory.
import pickle

with open('sales_model.pkl', 'wb') as file:
  pickle.dump(model,file)

In [49]:
# Save the names of the training feature columns, in order, so that an app
# loading the model can build its input with exactly the same columns.
feature_columns= X_train.columns
with open('feature.columns.pkl', 'wb') as file:
  pickle.dump(feature_columns, file)   ## the data was encoded before training, so the Streamlit app has to encode its inputs in the same way