In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [31]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [32]:
print(train.head())
print(test.head())

  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15         9.30          Low Fat         0.016047   
1           DRC01         5.92          Regular         0.019278   
2           FDN15        17.50          Low Fat         0.016760   
3           FDX07        19.20          Regular         0.000000   
4           NCD19         8.93          Low Fat         0.000000   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  182.0950            OUT010   
4              Household   53.8614            OUT013   

   Outlet_Establishment_Year Outlet_Size Outlet_Location_Type  \
0                       1999      Medium               Tier 1   
1                       2009      Medium               Tier 3   
2                       1999      Medium               Tier

In [33]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [34]:
def data_prep(train):
  train['Item_Weight'] = np.where(train['Item_Weight'].isna(),train['Item_Weight'].median(skipna = True),train['Item_Weight'])
  train['Outlet_Size'] = np.where(train['Outlet_Size'].isna(),train['Outlet_Size'].mode()[0], train['Outlet_Size'])
  train['Item_Fat_Content'] = train['Item_Fat_Content'].replace('low fat', 'Low Fat')
  train['Item_Fat_Content'] = train['Item_Fat_Content'].replace('LF', 'Low Fat')
  train['Item_Fat_Content'] = train['Item_Fat_Content'].replace('reg', 'Regular')
  train['YOB'] = 2023 - train['Outlet_Establishment_Year']
  return train

In [35]:
train_new = data_prep(train)

In [36]:
train_new.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
YOB                          0
dtype: int64

In [38]:
train_new.drop(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Type', 'Outlet_Location_Type', 'Outlet_Size'], inplace=True, axis=1)
train_new

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,YOB
0,9.300,0.016047,249.8092,3735.1380,24
1,5.920,0.019278,48.2692,443.4228,14
2,17.500,0.016760,141.6180,2097.2700,24
3,19.200,0.000000,182.0950,732.3800,25
4,8.930,0.000000,53.8614,994.7052,36
...,...,...,...,...,...
8518,6.865,0.056783,214.5218,2778.3834,36
8519,8.380,0.046982,108.1570,549.2850,21
8520,10.600,0.035186,85.1224,1193.1136,19
8521,7.210,0.145221,103.1332,1845.5976,14


In [39]:
y = train_new['Item_Outlet_Sales']
x = train_new.drop(['Item_Outlet_Sales'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 15)

In [40]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(6392, 4)
(2131, 4)
(6392,)
(2131,)


In [41]:
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_train = lr.predict(x_train)
lr_test = lr.predict(x_test)

In [42]:
#Lets define a function for Model Evaluation
def model_eval(actual, predicted):
  rmse = np.sqrt(mean_squared_error(actual, predicted))
  r2 = r2_score(actual, predicted)
  print('The RMSE value for the model is: ', round(rmse,2))
  print('The R2 Score for the model is: ', round(r2, 2))

In [43]:
model_eval(y_train, lr_train)

The RMSE value for the model is:  1371.54
The R2 Score for the model is:  0.33


In [44]:
model_eval(y_train, lr_train)

The RMSE value for the model is:  1371.54
The R2 Score for the model is:  0.33


In [45]:
# random forest 
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

In [46]:
rf_preds_train = rf.predict(x_train)
rf_preds_test = rf.predict(x_test)

In [47]:
model_eval(y_train, rf_preds_train)

The RMSE value for the model is:  502.33
The R2 Score for the model is:  0.91


In [48]:
model_eval(y_test, rf_preds_test)


The RMSE value for the model is:  1393.24
The R2 Score for the model is:  0.39
