Importing dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score


Data Collection and Analysis

In [2]:
# Read the sales table from 'big_mart_data.csv'; the relative path is resolved
# against the notebook's working directory.
sales_data = pd.read_csv('big_mart_data.csv')

In [3]:
sales_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
sales_data.shape

(8523, 12)

In [5]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [6]:
sales_data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [7]:
sales_data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Data Preprocessing

Handling Missing Values

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Fill the two incomplete columns: the numeric weight gets the column mean,
# the categorical outlet size gets its most frequent category.
imp1 = SimpleImputer(strategy='mean')
imp = SimpleImputer(strategy='most_frequent')

weight_filled = imp1.fit_transform(sales_data[['Item_Weight']])
size_filled = imp.fit_transform(sales_data[['Outlet_Size']])

# The imputers are fed a single-column frame and hand back a 2-D array;
# flatten it to 1-D before writing it back into the column.
sales_data['Item_Weight'] = weight_filled.ravel()
sales_data['Outlet_Size'] = size_filled.ravel()

In [10]:
# Work on an independent copy so that the cleaning and encoding steps applied
# to `sales_data1` can never write through to the imputed `sales_data` frame.
# (Passing a DataFrame to the `pd.DataFrame` constructor does not copy its data.)
sales_data1 = sales_data.copy()

In [11]:
sales_data1.head(6)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088


In [12]:
sales_data1.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [13]:
sales_data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Data handling

In [14]:
sales_data1['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [15]:
# Collapse the inconsistent spellings of the fat-content labels into the two
# canonical categories. The column is reassigned rather than edited with
# `inplace=True`, so only `sales_data1` is affected and re-running the cell is
# harmless.
sales_data1['Item_Fat_Content'] = sales_data1['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

In [16]:
sales_data1['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

Label Encoding

In [17]:
le = LabelEncoder()

In [18]:
# Integer-encode every categorical column of `sales_data1`.
# A separate encoder is fitted for each column and stored in `label_encoders`,
# so the code -> label mapping of any column can be recovered later with
# `label_encoders[column].inverse_transform(...)` or reused on new data with
# `label_encoders[column].transform(...)`.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
]

label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    sales_data1[column] = encoder.fit_transform(sales_data1[column])
    label_encoders[column] = encoder

# `le` ends up bound to the encoder fitted on the last column encoded
# (Outlet_Location_Type).
le = label_encoders['Outlet_Location_Type']

In [19]:
# NOTE(review): this inspects `sales_data`, whose categorical columns are still
# `object` dtype; the label-encoded frame is `sales_data1` — confirm which one
# was meant to be checked here.
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [20]:
sales_data1.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,3735.138
1,8,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,662,17.5,0,0.01676,10,141.618,9,1999,1,0,1,2097.27
3,1121,19.2,1,0.0,6,182.095,0,1998,1,2,0,732.38
4,1297,8.93,0,0.0,9,53.8614,1,1987,0,2,1,994.7052


Splitting the Dataset into Features and Target

In [21]:
# The sales figure is the regression target; every remaining column is a feature.
Y = sales_data1['Item_Outlet_Sales']
X = sales_data1.drop(columns='Item_Outlet_Sales')

In [22]:
print(X)

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0                 156        9.300                 0         0.016047   
1                   8        5.920                 1         0.019278   
2                 662       17.500                 0         0.016760   
3                1121       19.200                 1         0.000000   
4                1297        8.930                 0         0.000000   
...               ...          ...               ...              ...   
8518              370        6.865                 0         0.056783   
8519              897        8.380                 1         0.046982   
8520             1357       10.600                 0         0.035186   
8521              681        7.210                 1         0.145221   
8522               50       14.800                 0         0.044878   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0             4  249.8092                  9    

In [23]:
print(Y)

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64


Model Selection for the Project

Importing all the necessary models

In [24]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [25]:
# Baseline random forest with default hyperparameters; the seed is fixed so
# repeated runs on the same data build the same forest.
model = RandomForestRegressor(random_state=0)

Splitting the data into training and test sets

In [26]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=40)

Model training

In [27]:
model.fit(X_train,Y_train)

Model Evaluation

For the Training and Testing Data:

In [28]:
from sklearn.metrics import r2_score

In [29]:
# Score the fitted model on both splits. The training R² shows how closely the
# model fits data it has already seen; the testing R² is measured on the
# held-out rows. Each line is labelled with its split so the two scores cannot
# be confused when reading the output.
train_pred = model.predict(X_train)
train_data_accuracy = r2_score(Y_train, train_pred)
print(f'The R squared value of the {model.__class__.__name__} for the Training data is : {train_data_accuracy}')

test_pred = model.predict(X_test)
test_data_accuracy = r2_score(Y_test, test_pred)
print(f'The R squared value of the {model.__class__.__name__} for the Testing data is : {test_data_accuracy}')
print('-----------------------------------------------------------------------------------------------------')

The R sqaured value of the RandomForestRegressor is 0.9381650666475106
The R sqaured value of the RandomForestRegressor is 0.5496240976669604
-----------------------------------------------------------------------------------------------------


Cross validation

In [30]:
# 5-fold cross-validated R² of `model` over the full feature matrix and target.
cv_scores = cross_val_score(estimator=model, X=X, y=Y, scoring='r2', cv=5)

print('Cross-Validation R2 Scores:', cv_scores)
print('Average CV Score:', cv_scores.mean())


Cross-Validation R2 Scores: [0.56885362 0.5341655  0.52733682 0.56215635 0.56278623]
Average CV Score: 0.5510597038698876


Hyperparameter Tuning

In [31]:
# Search space: two forest sizes crossed with three depth settings
# (six candidate combinations in total).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
)

# Exhaustive search scored by R² with 3-fold cross-validation on the training
# split only; n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

print('Best Parameters:', grid_search.best_params_)
print("The Best Score for those Parameters : ",grid_search.best_score_)


Best Parameters: {'max_depth': 10, 'n_estimators': 200}
The Best Score for those Parameters :  0.5861635246392002


Model Optimization

In [37]:
# Refit using the hyperparameters selected by the grid search instead of
# re-typing them by hand, so this cell stays in sync if the search space changes.
# random_state is pinned to the same seed as the baseline model so the tuned
# scores are reproducible and comparable with the search results.
model = RandomForestRegressor(random_state=0, **grid_search.best_params_)
model.fit(X_train, Y_train)

In [39]:
# Re-score the tuned model: predict on both splits first, then compute and
# report R² for each.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

train_data_accuracy = r2_score(Y_train, train_pred)
test_data_accuracy = r2_score(Y_test, test_pred)

print(f'The R sqaured value of the {model.__class__.__name__} for the Training data  is : {train_data_accuracy}')
print(f'The R sqaured value of the {model.__class__.__name__} for the Testing data is : {test_data_accuracy}')
print('-----------------------------------------------------------------------------------------------------')

The R sqaured value of the RandomForestRegressor for the Training data  is : 0.7333915340698496
The R sqaured value of the RandomForestRegressor for the Testing data is : 0.5833732828630938
-----------------------------------------------------------------------------------------------------
