In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **PROBLEM STATEMENT**
# **To Build a predictive model and predict the sales of each product at a particular outlet.**

# *Dataset Description*

The dataset contains a set of **8,523 records** with **12 attributes**

* **Item_Identifier** --- Unique product ID

* **Item_Weight** --- Weight of product

* **Item_Fat_Content** --- Checks the Concentration of fat in the product

* **Item_Visibility** --- The % of total display area of all similar products in a store

* **Item_Type** --- Product Category

* **Item_MRP** --- Maximum Retail Price for a Product

* **Outlet_Identifier** --- Store ID

* **Outlet_Establishment_Year** --- The year in which store was established

* **Outlet_Size** --- The size of the store (Area Size Category)

* **Outlet_Location_Type** --- In Terms of city Tiers (Size)

* **Outlet_Type** --- Grocery store or a type of supermarket

* **Item_Outlet_Sales** --- Sales of the product In the Specific outlet

# *Libraries*

In [None]:
import os #paths to file
import numpy as np # linear algebra
import pandas as pd # data processing
import warnings# warning filter


#ploting libraries
import matplotlib.pyplot as plt 
import seaborn as sns

#feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#train test split
from sklearn.model_selection import train_test_split

#metrics
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
from sklearn.model_selection  import cross_val_score as CVS

#ML models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso


#default theme and settings
sns.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=False, rc=None)
# Show every column when a frame is displayed. The previous bare expression
# `pd.options.display.max_columns` only read the option and changed nothing.
pd.set_option('display.max_columns', None)

#warning handle
# One filter is enough: the former "always" filter was overridden immediately
# by the "ignore" filter registered right after it, so it never took effect.
warnings.filterwarnings("ignore")

In [None]:
# Locations of the BigMart CSV files inside the Kaggle input directory
tr_path, te_path = (
    "/kaggle/input/bigmart-sales-data/Train.csv",  # training set
    "/kaggle/input/bigmart-sales-data/Test.csv",   # testing set
)

# *Checking Dataset*

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows
tr_df = pd.read_csv(filepath_or_buffer=tr_path)
tr_df.head(n=5)

In [None]:
# Read the testing CSV into a DataFrame and preview its first five rows
te_df = pd.read_csv(filepath_or_buffer=te_path)
te_df.head(n=5)

In [None]:
# Report the (rows, columns) dimensions of both datasets
print("training set (row, col): {}\n\ntesting set (row, col): {}".format(tr_df.shape, te_df.shape))

# *Preprocessing Dataset*

In [None]:
#column information
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and
# removed in pandas 2.0; it prints the non-null count of every column.
tr_df.info(verbose=True, show_counts=True)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the testing dataset
te_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training dataset
tr_df.describe(percentiles=[0.25, 0.5, 0.75])

# *Nullvalue Check*

In [None]:
# For each dataset print the missing-value count per column (in descending
# order) followed by the share of missing values per column in percent.
for heading, frame in (("Train:\n", tr_df), ("Test:\n", te_df)):
    missing = frame.isnull().sum()
    print(heading)
    print(missing.sort_values(ascending=False), "\n\n", missing / frame.shape[0] * 100, "\n\n")

*Inference*

column ------------- size

Outlet_Size -------- 28%

Item_Weight -------- 17%

In [None]:
# Value counts of the two columns with missing data, to help pick a fill value for each
for column in ("Outlet_Size", "Item_Weight"):
    print(f"{column}:\n", tr_df[column].value_counts(), "\n\n")

*Inference*

1) Outlet_Size is a categorical column, so we will impute its missing values with the mode, Medium.

2) Item_Weight is a numerical column, so we need to visualize its distribution before drawing a conclusion.

Outlet_Size Missing Value Correction

In [None]:
#Outlet Size

# Most frequent Outlet_Size of each dataset. The label now matches the order of
# the values in the list: training set first, testing set second.
print("train mode, test mode\n",[tr_df['Outlet_Size'].mode().values[0], te_df['Outlet_Size'].mode().values[0]])

In [None]:
#Replacing Data in Outlet_Size

# The fill value is learned from the training set only and reused for the
# testing set, so both datasets are preprocessed with the same statistic.
# Series.mode() already ignores missing values, so no dropna() is needed.
outlet_size_mode = tr_df['Outlet_Size'].mode().values[0]

#Training Dataset
tr_df['Outlet_Size'] = tr_df['Outlet_Size'].fillna(outlet_size_mode)

#Testing Dataset
te_df['Outlet_Size'] = te_df['Outlet_Size'].fillna(outlet_size_mode)

#Checking Data: both counts should now be 0
tr_df['Outlet_Size'].isnull().sum(),te_df['Outlet_Size'].isnull().sum()

Item_Weight Missing Value Correction

In [None]:
# Vertical boxplot of Item_Weight, used to judge how the missing weights should be filled
ax = sns.boxplot(data=tr_df['Item_Weight'], orient="v", color='c')
ax.set_title("Item_Weight Boxplot")

*Inference:*

We can fill the missing data with the mean value for maximum efficiency

In [None]:
# The fill value is the mean weight of the training set only; it is reused for
# the testing set so both datasets are preprocessed with the same statistic.
# Series.mean() already skips missing values, so no dropna() is needed.
item_weight_mean = tr_df['Item_Weight'].mean()

#training Dataset
tr_df['Item_Weight'] = tr_df['Item_Weight'].fillna(item_weight_mean)

#testing Dataset
te_df['Item_Weight'] = te_df['Item_Weight'].fillna(item_weight_mean)

#checking if we filled missing values: both counts should now be 0
tr_df['Item_Weight'].isnull().sum(),te_df['Item_Weight'].isnull().sum()

# *Final Data Check*

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line after each report.
print("train:\n")
tr_df.info()
print("\n\ntest:\n")
te_df.info()

# *Exploring Dataset*

In [None]:
# Names of the numeric columns
num = tr_df.select_dtypes('number').columns.tolist()
# Names of the object-dtype (categoric) columns
cat = tr_df.select_dtypes('object').columns.tolist()

# Numeric-only and categoric-only subsets of the training data
BM_num = tr_df.loc[:, num]
BM_cat = tr_df.loc[:, cat]

# Value counts of every categoric column except the first entry of `cat`
[tr_df[name].value_counts() for name in cat[1:]]

**Inference:**

Item_Fat_Content column, has repeating values with different names.

In [None]:
#Correcting Item_Fat_Content Column

# Map every alternative spelling onto its canonical label
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}

# The result is assigned back to the column: replace(..., inplace=True) on a
# column selection is not guaranteed to modify the parent frame (it leaves the
# frame untouched under pandas Copy-on-Write).
#Training Dataset
tr_df['Item_Fat_Content'] = tr_df['Item_Fat_Content'].replace(fat_content_fixes)
#Testing Dataset
te_df['Item_Fat_Content'] = te_df['Item_Fat_Content'].replace(fat_content_fixes)

#Checking result
tr_df.Item_Fat_Content.value_counts()

# *Review Our Data*

In [None]:
# Preview the first five rows of the cleaned training data
tr_df.head(n=5)

**Inference:**

Outlet_Establishment_Year is easier to use when expressed as an age, so we derive a new column named Outlet_Age from it.

In [None]:
#Creating our new column for both datasets
# Outlet age is measured against a fixed reference year; the subtraction is
# vectorized instead of applying a lambda row by row.
REFERENCE_YEAR = 2021
tr_df['Outlet_Age'] = REFERENCE_YEAR - tr_df['Outlet_Establishment_Year']
te_df['Outlet_Age'] = REFERENCE_YEAR - te_df['Outlet_Establishment_Year']

#Checking result: head() has to be called, a bare `.head` only shows the bound method
tr_df['Outlet_Age'].head(), te_df['Outlet_Age'].head()

# *Data Visualisation*

**Univariate Plots - Countplots**

i) For Categorical Columns

In [None]:
# Categorical columns in the data:
#   Item_Identifier, Item_Fat_Content, Item_Type, Outlet_Identifier,
#   Outlet_Size, Outlet_Location_Type, Outlet_Type

# Item_Fat_Content: number of rows per fat-content label
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(data=tr_df, x='Item_Fat_Content', palette='mako', ax=ax)
ax.set_xlabel('Item_Fat_Content', fontsize=14)
plt.show()

In [None]:
# Item_Type: number of rows per product category
fig, ax = plt.subplots(figsize=(27,10))
sns.countplot(data=tr_df, x='Item_Type', palette='summer', ax=ax)
ax.set_xlabel('Item_Type', fontsize=14)
plt.show()

In [None]:
# Outlet_Identifier: number of rows per store
fig, ax = plt.subplots(figsize=(15,4))
sns.countplot(data=tr_df, x='Outlet_Identifier', palette='winter', ax=ax)
ax.set_xlabel('Outlet_Identifier', fontsize=14)
plt.show()

In [None]:
# Outlet_Size: number of rows per store size category
fig, ax = plt.subplots(figsize=(10,4))
sns.countplot(data=tr_df, x='Outlet_Size', palette='autumn', ax=ax)
ax.set_xlabel('Outlet_Size', fontsize=14)
plt.show()

In [None]:
# Outlet_Location_Type: number of rows per city tier
fig, ax = plt.subplots(figsize=(10,4))
sns.countplot(data=tr_df, x='Outlet_Location_Type', palette='twilight_shifted', ax=ax)
ax.set_xlabel('Outlet_Location_Type', fontsize=14)
plt.show()

In [None]:
# Outlet_Type: number of rows per store type
fig, ax = plt.subplots(figsize=(10,4))
sns.countplot(data=tr_df, x='Outlet_Type', palette='rocket', ax=ax)
ax.set_xlabel('Outlet_Type', fontsize=14)
plt.show()

**Inference From Univariate Plots for Categorical Columns:**

Highest Counts for each column are listed below,

Item_Fat_Content - Low fat.

Item_Type - Fruits and vegetables and snack foods.

Outlet_Identifier - All Other outlets excluding OUT010 and OUT019.

Outlet_Size - Medium sized.

Outlet_Location_Type - Tier3.

Outlet_Type - Supermarket Type1.

# *Data Visualisation*

**Univariate Plots**

ii) For Numerical Columns

In [None]:
# Recompute the numeric column names and the numeric subset from the current tr_df
num = tr_df.select_dtypes('number').columns.tolist()
BM_num = tr_df.loc[:, num]

# Histogram of Outlet_Age
fig, ax = plt.subplots()
ax.hist(tr_df['Outlet_Age'])
ax.set_title("Outlet_Age")
plt.show()

In [None]:
# Scatter each of the first three numeric columns against Item_Outlet_Sales,
# one figure per column
for feature in num[:3]:
    fig, ax = plt.subplots()
    ax.scatter(BM_num[feature], BM_num['Item_Outlet_Sales'])
    ax.set_title(feature)
    ax.set_ylabel('Item_Outlet_Sales')
    plt.show()

**Inference From Univariate Plots for Numerical Columns:**

Outlet_Age - 35 years old outlets are most common

Item_Weight - No specific pattern.

Item_Visibility - The points are widely spread, but the concentration near (0,0) indicates that items with low visibility sell poorly in some cases.

Item_MRP - Items with higher MRP tend to sell better in most cases.

# *Data Visualisation*

**Multivariate Plots**

* Sales per item type

* Sales per outlet

* Sales per outlet type

* Sales per outlet size

* Sales per location type

In [None]:
# Item_Outlet_Sales Vs Item_Type
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize=(27,10))
sns.barplot(x='Item_Type', y='Item_Outlet_Sales', data=tr_df, palette='gist_rainbow_r')
plt.xlabel('Item_Type', fontsize=14)
# No plt.legend() here: nothing in this plot carries a label, so the call only raised a warning.
plt.show()

In [None]:
# Item_Outlet_Sales Vs Outlet_Identifier
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.

plt.figure(figsize=(27,10))
sns.barplot(x='Outlet_Identifier', y='Item_Outlet_Sales', data=tr_df, palette='gist_rainbow')
plt.xlabel('Outlet_Identifier', fontsize=14)
# No plt.legend() here: nothing in this plot carries a label, so the call only raised a warning.
plt.show()

In [None]:
# Item_Outlet_Sales Vs Outlet_Type
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.

plt.figure(figsize=(10,5))
sns.barplot(x='Outlet_Type', y='Item_Outlet_Sales', data=tr_df, palette='nipy_spectral')
plt.xlabel('Outlet_Type', fontsize=14)
# No plt.legend() here: nothing in this plot carries a label, so the call only raised a warning.
plt.show()

In [None]:
# Item_Outlet_Sales Vs Outlet_Size
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.

plt.figure(figsize=(10,5))
sns.barplot(x='Outlet_Size', y='Item_Outlet_Sales', data=tr_df, palette='YlOrRd')
plt.xlabel('Outlet_Size', fontsize=14)
# No plt.legend() here: nothing in this plot carries a label, so the call only raised a warning.
plt.show()

In [None]:
# Item_Outlet_Sales Vs Outlet_Location_Type
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.

plt.figure(figsize=(10,5))
sns.barplot(x='Outlet_Location_Type', y='Item_Outlet_Sales', data=tr_df, palette='Spectral')
plt.xlabel('Outlet_Location_Type', fontsize=14)
# No plt.legend() here: nothing in this plot carries a label, so the call only raised a warning.
plt.show()

**Inference From Multivariate Plots:**

* Sales per item type: All items are performing well, the difference in item types by sales is very small.
* Sales per outlet: Outlet 27 is the most profitable and Outlet 10 and 19 are underperforming.
* Sales per outlet type: Surprisingly, supermarket type 3 is the most profitable, not type 1.
* Sales per outlet size: Medium and high outlet sizes are pretty much even in sales.
* Sales per location type: Tier 2 and 3 are almost even being the highest in sales (2 is slightly larger).

# **Correlation Matrix**

In [None]:
#plotting the correlation matrix
# Only the numeric columns are correlated: from pandas 2.0 on, DataFrame.corr()
# raises on text columns instead of silently skipping them.
sns.heatmap(tr_df.select_dtypes('number').corr() ,cmap='rocket')

**Inference:**
* We can see Item_Outlet_Sales is highly correlated with Item_MRP
* The columns Outlet_Establishment_Year, Item_Identifier and Outlet_Identifier don't have significant values so we will drop them.
* All Ordinal variables will be Label encoded.
* The columns Outlet_Type and Item_Type will be One Hot encoded.

In [None]:
#checking the number of unique values in each column
# Counted on the current tr_df rather than on BM_cat: BM_cat was copied out of
# tr_df before Item_Fat_Content was corrected, so it still holds the old labels.
tr_df[cat].nunique()

In [None]:
#label encoding

Label = ['Item_Fat_Content','Outlet_Size','Outlet_Location_Type']

for col in Label:
    # A fresh encoder per column, fitted on the training data only. The testing
    # data is transformed with that same mapping, so a category gets the same
    # integer code in both datasets (re-fitting on the testing data could
    # assign different codes). transform() raises if the testing data holds a
    # label that never occurs in the training data.
    le = LabelEncoder()
    tr_df[col] = le.fit_transform(tr_df[col])
    te_df[col] = le.transform(te_df[col])

tr_df.head()

In [None]:
#one hot encoding
cols = ['Item_Type','Outlet_Type']

# The encoder is fitted on the training data only and reused for the testing
# data, so both frames get the same columns in the same order. Categories that
# occur only in the testing data are encoded as all zeros (handle_unknown='ignore').
# The default sparse output is densified with .toarray(), which avoids the
# `sparse` / `sparse_output` constructor argument renamed across scikit-learn releases.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
tr_encoded = OH_encoder.fit_transform(tr_df[cols]).toarray()
te_encoded = OH_encoder.transform(te_df[cols]).toarray()

#get feature columns
# get_feature_names was replaced by get_feature_names_out in newer scikit-learn
# releases; use whichever the installed version provides.
if hasattr(OH_encoder, 'get_feature_names_out'):
    oh_columns = OH_encoder.get_feature_names_out(cols)
else:
    oh_columns = OH_encoder.get_feature_names(cols)

# Build the encoded frames on the original row index so they align in the concat below
tr_oh = pd.DataFrame(tr_encoded, columns=oh_columns, index=tr_df.index).astype('int64')
te_oh = pd.DataFrame(te_encoded, columns=oh_columns, index=te_df.index).astype('int64')

# Add one-hot encoded columns to our main df, new names: tr_fe, te_fe (fe = feature engineered)
tr_fe = pd.concat([tr_df, tr_oh], axis=1)
te_fe = pd.concat([te_df, te_oh], axis=1)

In [None]:
# Remove the identifier columns, the establishment year and the two columns
# that were one-hot encoded, from both feature-engineered frames
dropped_columns = ['Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year','Outlet_Type','Item_Type']

tr_fe = tr_fe.drop(columns=dropped_columns)
te_fe = te_fe.drop(columns=dropped_columns)

In [None]:
# Preview the first five rows of the feature-engineered training data
tr_fe.head(n=5)

# **Machine Learning Model**

First of all we will divide our dataset into two variables X as the features we defined earlier and y as the Item_Outlet_Sales the target value we want to predict.

Assumptions:

This is a regression problem so we will use Regression methods.

The train/test split will use an 8:2 ratio (80% training, 20% testing).

Models we will use:
* Linear Regression
* Random Forest Regressor
* Lasso Regressor

The Process of Modeling the Data:
1. Importing the model
2. Fitting the model
3. Predicting Item Outlet Sales
4. Regression metrics

In [None]:
# Separate the target column from the feature columns
y = tr_fe['Item_Outlet_Sales']
X = tr_fe.drop('Item_Outlet_Sales', axis = 1)
# Hold out 20% of the rows for evaluation (8:2 train/test split). The previous
# test_size of 0.8 trained the models on only 20% of the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
def cross_val(model_name,model,X,y,cv):
    """Cross-validate `model` on (X, y) and print the per-fold and average scores.

    Parameters:
        model_name: label used in the printed header and summary line.
        model: estimator handed to cross_val_score (imported as CVS).
        X: feature matrix.
        y: target values.
        cv: cross-validation setting forwarded to cross_val_score.

    Returns nothing; the scores are only printed (each fold rounded to 2
    decimals, the average rounded to 4).
    """
    fold_scores = CVS(model, X, y, cv=cv)
    average_score = round(fold_scores.mean(),4)

    print(f'{model_name} Scores:')
    for fold_score in fold_scores:
        print(round(fold_score,2))
    print(f'Average {model_name} score: {average_score}')

# Linear Regression

In [None]:
#model
# `normalize` was removed from LinearRegression in scikit-learn 1.2. It is
# dropped here without a replacement scaler because the fitted values of
# ordinary least squares do not depend on how the features are scaled.
LR = LinearRegression()

#fit
LR.fit(X_train, y_train)

#predict
y_predict = LR.predict(X_test)

#score variables (hold-out metrics plus the mean 5-fold cross-validation score)
LR_MAE = round(MAE(y_test, y_predict),2)
LR_MSE = round(MSE(y_test, y_predict),2)
LR_R_2 = round(R2(y_test, y_predict),4)
LR_CS  = round(CVS(LR, X, y, cv=5).mean(),4)

print(f" Mean Absolute Error: {LR_MAE}\n")
print(f" Mean Squared Error: {LR_MSE}\n")
print(f" R^2 Score: {LR_R_2}\n")
# cross_val takes a display name first and the estimator second; the estimator
# passed is the same LR configuration evaluated above.
cross_val("Linear Regression", LR, X, y, 5)

# Random Forest Regressor

In [None]:
#model
RFR= RandomForestRegressor(n_estimators=200,max_depth=5, min_samples_leaf=100,n_jobs=4,random_state=101)
#fit
RFR.fit(X_train, y_train)
#predict
y_predict = RFR.predict(X_test)

#score variables (hold-out metrics plus the mean 5-fold cross-validation score)
RFR_MAE = round(MAE(y_test, y_predict),2)
RFR_MSE = round(MSE(y_test, y_predict),2)
RFR_R_2 = round(R2(y_test, y_predict),4)
RFR_CS  = round(CVS(RFR, X, y, cv=5).mean(),4)



print(f" Mean Absolute Error: {RFR_MAE}\n")
print(f" Mean Squared Error: {RFR_MSE}\n")
print(f" R^2 Score: {RFR_R_2}\n")
# cross_val takes a display name first and the estimator second. The configured
# RFR is passed so the per-fold scores describe the same model as the metrics
# above, not a RandomForestRegressor with default settings.
cross_val("Random Forest Regressor", RFR, X, y, 5)

# Lasso Regressor

In [None]:
#model
LS = Lasso(alpha = 0.05)
#fit
LS.fit(X_train,y_train)

#predict
y_predict = LS.predict(X_test)

#score variables (hold-out metrics plus the mean 5-fold cross-validation score)
LS_MAE = round(MAE(y_test, y_predict),2)
LS_MSE = round(MSE(y_test, y_predict),2)
LS_R_2 = round(R2(y_test, y_predict),4)
LS_CS  = round(CVS(LS, X, y, cv=5).mean(),4)

print(f" Mean Absolute Error: {LS_MAE}\n")
print(f" Mean Squared Error: {LS_MSE}\n")
print(f" R^2 Score: {LS_R_2}\n")
# cross_val takes a display name first and the estimator second
cross_val("Lasso Regressor", LS, X, y, 5)

# **CONCLUSION**

In [None]:
# Metric values per model, in the order: Linear Regression, Random Forest, Lasso.
# The MAE and MSE lists get their own names so they do not overwrite the MAE /
# MSE metric functions imported at the top of the notebook; with the old names,
# re-running any model cell after this one failed.
mae_scores = [LR_MAE,RFR_MAE,LS_MAE]
mse_scores = [LR_MSE,RFR_MSE,LS_MSE]
R_2= [LR_R_2,RFR_R_2,LS_R_2]
Cross_score= [LR_CS,RFR_CS,LS_CS]

# One row per model, sorted so the lowest mean absolute error comes first
Models = pd.DataFrame({
    'models': ["Linear Regression","Random Forest Regressor","Lasso Regressor"],
    'MAE': mae_scores, 'MSE': mse_scores, 'R^2':R_2, 'Cross Validation Score':Cross_score})
Models.sort_values(by='MAE', ascending=True)

# **FINAL INFERENCE**

1. Item_MRP optimizes Maximum Outlet sales (positive correlation with the target).
2. For better performance these models need tuning, e.g. with Grid Search.
3. Linear Regression and Lasso Regressor perform better than Random Forest Regressor in most categories.