In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Load the Big Mart sales table from its CSV export.
# NOTE(review): absolute path (looks like a Colab location) - adjust when running elsewhere.
DATA_PATH = '/content/big_mart_data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check that the file parsed as expected.
data.head()

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Count missing values per column to see which features need imputation.
data.isnull().sum()

In [None]:
# Print each column's dtype and non-null count.
data.info()

Categorical Features
- Item_Identifier
- Item_Fat_Content
- Item_Type
- Outlet_Identifier
- Outlet_Size
- Outlet_Location_Type
- Outlet_Type

In [None]:
# Filling missing values

# Impute missing item weights with the column mean. The result is assigned
# back to the column rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form emits a FutureWarning on pandas 2.x and,
# under Copy-on-Write, leaves `data` unchanged.
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

# One-row lookup table (row label 'Outlet_Size', one column per Outlet_Type)
# holding the most frequent Outlet_Size observed for each outlet type.
mode_outlet_size = data.pivot_table(values='Outlet_Size', columns = 'Outlet_Type', aggfunc=(lambda x : x.mode()[0]))

In [None]:
# Show the lookup table: one column per Outlet_Type with its most common Outlet_Size.
print(mode_outlet_size)

In [None]:
# Boolean mask marking the rows whose Outlet_Size is missing.
missing_values = data['Outlet_Size'].isna()

In [None]:
# Boolean Series: True on the rows where Outlet_Size was missing.
print(missing_values)

In [None]:
def _mode_size_for(outlet_type):
    """Return the most frequent Outlet_Size recorded for the given Outlet_Type."""
    return mode_outlet_size.loc['Outlet_Size', outlet_type]


# Fill each missing Outlet_Size with the modal size of that row's Outlet_Type.
outlet_types_of_missing = data.loc[missing_values, 'Outlet_Type']
data.loc[missing_values, 'Outlet_Size'] = outlet_types_of_missing.apply(_mode_size_for)

In [None]:
# Re-count missing values per column to check the result of the imputation steps.
data.isnull().sum()

Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Numerical Features

In [None]:
# Apply seaborn's default theme to all plots that follow.
sns.set()

In [None]:
# Item_Weight distribution
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) would only leave an empty extra figure.
# The 6x6 inch size is requested through displot's height/aspect instead.
sns.displot(data['Item_Weight'], height=6, aspect=1)
plt.show()

In [None]:
# Item_Visibility distribution
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) would only leave an empty extra figure.
# The 6x6 inch size is requested through displot's height/aspect instead.
sns.displot(data['Item_Visibility'], height=6, aspect=1)
plt.show()

In [None]:
# Item_MRP distribution
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) would only leave an empty extra figure.
# The 6x6 inch size is requested through displot's height/aspect instead.
sns.displot(data['Item_MRP'], height=6, aspect=1)
plt.show()

In [None]:
# Item_Outlet_Sales distribution
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) would only leave an empty extra figure.
# The 6x6 inch size is requested through displot's height/aspect instead.
sns.displot(data['Item_Outlet_Sales'], height=6, aspect=1)
plt.show()

In [None]:
# Outlet_Establishment_Year distribution (number of rows per year)
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=data, x='Outlet_Establishment_Year', ax=ax)
plt.show()

Categorical Features

In [None]:
# Item_Fat_Content distribution (number of rows per label)
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=data, x='Item_Fat_Content', ax=ax)
plt.show()

In [None]:
# Item_Type distribution (number of rows per item type).
# The figure is extra wide so the category labels fit along the x-axis.
fig, ax = plt.subplots(figsize=(30, 6))
sns.countplot(data=data, x='Item_Type', ax=ax)
plt.show()

In [None]:
# Outlet_Size distribution (number of rows per size category)
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=data, x='Outlet_Size', ax=ax)
plt.show()

In [None]:
# Outlet_Location_Type distribution (number of rows per location tier)
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=data, x='Outlet_Location_Type', ax=ax)
plt.show()

Data Pre-Processing

In [48]:
# Preview the first five rows before pre-processing.
# NOTE(review): the recorded output shows Item_Identifier already as integers
# while the other categorical columns are still strings, which suggests an
# encoding step was executed out of order - confirm with Restart & Run All.
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,8,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,662,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,1121,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,1297,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Frequency of each Item_Fat_Content label, to spot inconsistent spellings.
data['Item_Fat_Content'].value_counts()

In [None]:
# Collapse the alternate spellings of Item_Fat_Content into 'Low Fat' / 'Regular'.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Re-check the label frequencies after normalising the spellings.
data['Item_Fat_Content'].value_counts()

Label Encoding

In [46]:
# Single LabelEncoder instance; the encoding cell re-fits it on each column in
# turn, so afterwards it only retains the classes of the last column fitted.
encoder = LabelEncoder()

In [49]:
# Replace every categorical column with integer codes. The shared encoder is
# re-fitted for each column, in the order listed here.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Identifier',
    'Outlet_Type',
]
for column_name in categorical_columns:
    data[column_name] = encoder.fit_transform(data[column_name])

In [50]:
# Preview the frame after encoding; the categorical columns now hold integer codes.
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,3735.138
1,8,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,662,17.5,0,0.01676,10,141.618,9,1999,1,0,1,2097.27
3,1121,19.2,1,0.0,6,182.095,0,1998,2,2,0,732.38
4,1297,8.93,0,0.0,9,53.8614,1,1987,0,2,1,994.7052


Splitting features and target

In [51]:
# Target is the sales figure; the feature matrix is every other column.
target_column = 'Item_Outlet_Sales'
Y = data[target_column]
X = data.drop(columns=[target_column])

In [53]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [54]:
# Shapes of the full, training and test feature matrices, printed on one line.
print(*(frame.shape for frame in (X, X_train, X_test)))

(8523, 11) (6818, 11) (1705, 11)


Machine Learning Model Training - XGBoost Regressor

In [55]:
# XGBoost regressor created with the library's default hyperparameters.
# NOTE(review): the recorded R² is 0.876 on the training data but 0.502 on the
# test data, which points to overfitting - consider tuning or regularising.
regressor = XGBRegressor()

In [56]:
# Fit the regressor on the training split only.
regressor.fit(X_train, Y_train)

Evaluation

In [57]:
# Predictions on the training data (used for the in-sample R² score).
training_data_pred = regressor.predict(X_train)

In [58]:
# R² (coefficient of determination) of the predictions on the training data
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_pred)

In [60]:
# Training-set R²; this is an in-sample score, so it is optimistic.
print(r2_train)

0.8762174618111388


In [61]:
# Predictions on the held-out test data.
test_data_pred = regressor.predict(X_test)

In [62]:
# R² (coefficient of determination) of the predictions on the held-out test data
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_pred)

In [63]:
# Test-set R²; compare with the training score to judge generalisation.
print(r2_test)

0.5017253991620692
