In [None]:
# Importing the Libraries
import pandas as pd                                  # For managing Data Structures
import numpy as np                                   # For mathematical functions
import matplotlib.pyplot as plt                      # For Data visualization
import seaborn as sns                                # For Data visualization
from mpl_toolkits.mplot3d import Axes3D              # For 3D graphs
from sklearn.impute import SimpleImputer             # For handeling the missing data (Categorical)
from sklearn.preprocessing import LabelEncoder       # For Label encoding
from sklearn.preprocessing import OneHotEncoder      # For One Hot Encoding
from sklearn.compose import ColumnTransformer        # Fro using OneHotEncoder to transform columns
from sklearn.linear_model import LinearRegression    # For linear regression model
from sklearn.tree import DecisionTreeRegressor       # For Decision tree regression model
from sklearn.ensemble import RandomForestRegressor   # For Random Forest Regression Model
from sklearn import metrics                          # For Evaluation of the regression models

In [None]:
# Read the training data, the test data and the submission file, in that order.
train_df, test_df, y_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/big-mart-sales-prediction/Train.csv",
        "../input/big-mart-sales-prediction/Test.csv",
        "../input/big-mart-sales-prediction/Submission.csv",
    )
)

In [None]:
# Give both frames shorter column names. The two frames share the first
# eleven names; the training frame has one extra trailing column, 'Sales'.
short_names = ['Item_ID', 'Weight', 'Fat_Content', 'Visibility', 'Item_Type',
               'MRP', 'Out_ID', 'Out_year', 'Out_Size', 'Out_Loc', 'Out_Type']
test_df.columns = short_names
train_df.columns = short_names + ['Sales']

In [None]:
# Column dtypes and non-null counts: training frame first, then test frame
for frame in (train_df, test_df):
    frame.info()

In [None]:
# List the distinct values of every categorical column of the training set
for col in ('Fat_Content', 'Item_Type', 'Out_ID', 'Out_Size', 'Out_Loc', 'Out_Type'):
    print(f"{col}\n ", train_df[col].unique())

"Fat_Content" really contains only two categories (Low Fat and Regular), but each of them appears under several different spellings. These spellings need to be merged.

In [None]:
# Merge the alternative spellings found in "Fat_Content" into the two
# canonical labels 'Low Fat' and 'Regular', for both the training and test set.
fat_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
for frame in (train_df, test_df):
    frame['Fat_Content'] = frame['Fat_Content'].replace(fat_aliases)

# Show the categories that remain after the merge
print("New Categories: Fat_Content (Training Set)\n ", train_df['Fat_Content'].unique())
print("New Categories: Fat_Content (Test Set)\n ", test_df['Fat_Content'].unique())

In [None]:
# For each frame, print a two-column table: number of missing values per
# column and the same figure as a percentage of the rows.
for label, frame in (('Missing values (Train Set)', train_df),
                     ('Missing values (Test Set)', test_df)):
    null_mask = frame.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100
    print(pd.concat([n_missing, pct_missing], axis=1, keys=[label, '%']))

There are 17% missing values in Weight which has a dtype of float, and 28% missing values in Out_Size which is of object dtype.

In [None]:
#--------------------------------- Handling the Missing Data-------------------------------------------
# Both imputers are fitted on the TRAINING set only and then applied unchanged
# to the test set. Re-fitting them on the test data would fill the test rows
# with statistics the models never saw during training.

# Weight (numeric): missing values are replaced by the training-set mean
si1 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
si1.fit(train_df[['Weight']].values)
# Out_Size (categorical): missing values are replaced by the most frequent
# training-set category
si2 = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
si2.fit(train_df[['Out_Size']].values)

for label, frame in (('Missing values (Train Set)', train_df),
                     ('Missing values (Test Set)', test_df)):
    # transform() returns an (n_rows, 1) array; take its only column
    frame['Weight'] = si1.transform(frame[['Weight']].values)[:, 0]
    frame['Out_Size'] = si2.transform(frame[['Out_Size']].values)[:, 0]

    # Checking values after imputing: count and percentage of missing values
    n_missing = frame.isnull().sum()
    print(pd.concat([n_missing, n_missing / frame.isnull().count() * 100],
                    axis = 1,
                    keys = [label, '%']))

In [None]:
#--------------- EDA -----------------
# Number of training records per item type, split by fat content
sns.set_style('darkgrid')
plt.figure(figsize=(15,10))
# Pass the frame through `data` and name the columns by keyword: recent seaborn
# releases treat the first positional argument as `data`, not as the x vector.
sns.countplot(data=train_df, x='Item_Type', hue='Fat_Content')
plt.xticks(rotation=90)
plt.legend(loc = 'upper right', bbox_to_anchor=(1.1, 1), title = 'Fat Content')

Insights:
* Mostly people buy "Household" item followed by "Snacks Food" and "Fruits and Vegetables"
* The least bought items are seafood and breakfast
* People tend to buy Low Fat items more often than Regular fat items

In [None]:
# Average Sales for each item type, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=train_df, x='Item_Type', y='Sales', ax=ax)
plt.xticks(rotation=90)

Insights:
* Even though Household items appear most often, Starchy Foods have the highest average sales per item (the bar plot shows the average sales of each item type, not the total).

In [None]:
# Average Visibility for each item type, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=train_df, x='Item_Type', y='Visibility', ax=ax)
plt.xticks(rotation=90)

Insights:
* Breakfast and Seafood are the most visible items, even though they are the least bought items.

In [None]:
# Average Sales per outlet location, with one bar per outlet size
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=train_df, x='Out_Loc', y='Sales', hue='Out_Size', ax=ax)
ax.set_xlabel("Outlet Location")
plt.legend(title='Outlet Size', loc='upper right', bbox_to_anchor=(1.1, 1))

* The Medium size outlets in all three locations have almost the same amount of sales
* The Small size outlets of Tier 2 have more sales than those of Tier 1
* Only the Tier 3 location has High size outlets

In [None]:
# Average Sales per outlet type, with one bar per outlet size
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=train_df, x='Out_Type', y='Sales', hue='Out_Size', ax=ax)
ax.set_xlabel("Outlet Type")
plt.legend(title='Outlet Size', loc='upper right', bbox_to_anchor=(1.1, 1))

In [None]:
# Scatter plot of Sales (y) against Visibility (x) for the training set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(train_df['Visibility'], train_df['Sales'], marker='.', edgecolors='Black')
ax.set_xlabel("Visibility")
ax.set_ylabel("Sales")

Clearly, there is a negative relation between visibility and sales, which means that the items which provide fewer sales are kept at a more visible location. This may have been done to improve the sales of those particular items.

In [None]:
# Relation between the price of the item (MRP) and sales
plt.figure(figsize=(15,10))
plt.scatter(train_df.MRP, train_df.Sales, marker = '.', edgecolors = 'black')
# The x axis shows the MRP column (the label previously read "MPR")
plt.xlabel("MRP")
plt.ylabel("Sales")

We can observe different brackets of item prices, and there is a positive relation between the MRP and the Sales. This does not necessarily mean that expensive products are sold more. Sales is the product of the price and the number of items sold. So, keeping the number of items sold constant, a higher price will yield more sales.

In [None]:
# 3D representation between Visibility, MRP and Sales
fig = plt.figure(figsize=(15,10))
# Create the 3D axes through the figure. Constructing Axes3D(fig) directly no
# longer adds the axes to the figure in current matplotlib releases, which
# leaves the plot empty.
ax = fig.add_subplot(projection='3d')
ax.scatter(train_df.Visibility,
           train_df.MRP,
           train_df.Sales,
           marker = 'o', edgecolors = 'black')
ax.set_xlabel('Visibility')
ax.set_ylabel('MRP')
ax.set_zlabel('Sales')
# No legend is drawn: the scatter call above has no `label`, so a legend
# would have nothing to show.
ax.grid(linestyle='-', linewidth='0.5', color='red')

We can clearly see that items with a higher Sales value have a higher MRP and a lower visibility.

In [None]:
# ------------------- Encoding the categorical variables using Label encoder

# Columns that are converted to integer codes (positions 2, 4, 6, 7, 8, 9, 10)
label_cols = ['Fat_Content', 'Item_Type', 'Out_ID', 'Out_year',
              'Out_Size', 'Out_Loc', 'Out_Type']

lencoder = LabelEncoder()
for col in label_cols:
    # Fit on the training column only and reuse the same mapping for the test
    # column, so a given category gets the same code in both sets.
    # transform() raises ValueError if the test set contains a category that
    # never occurs in the training set.
    # Assigning by column name replaces the whole column, so the result has an
    # integer dtype rather than the previous object dtype.
    train_df[col] = lencoder.fit_transform(train_df[col])
    test_df[col] = lencoder.transform(test_df[col])

Since, we will be working with machine learning models, we have to encode different categories, as the model only understands numbers and not text. Label encoder allocates different numbers (starting from 0) to different categories.

In [None]:
# List the distinct codes of every categorical column after label encoding
for col in ('Fat_Content', 'Item_Type', 'Out_ID', 'Out_Size', 'Out_Loc', 'Out_Type'):
    print(f"{col}\n ", train_df[col].unique())

In [None]:
# Plotting a heatmap to visualise the correlation between different variables
plt.figure(figsize = (15,10))
# Item_ID is a text identifier, so it is left out: since pandas 2.0, corr()
# raises on non-numeric columns instead of silently skipping them.
# astype(float) makes sure the label-encoded columns take part in the
# correlation even if they are still stored with object dtype.
sns.heatmap(train_df.drop(columns='Item_ID').astype(float).corr(), annot= True)


* MRP and Outlet Type have a strong positive correlation with Sales
* Visibility, as discussed earlier, has a weak negative correlation with Sales
* Item type, Fat content, Weight and Outlet location have a weak positive correlation with Sales

In [None]:
# Split the frames into model inputs and targets.
# Inputs: columns 1..10 of each frame (column 0, Item_ID, is left out).
X_train, X_test = (frame.iloc[:, 1:11] for frame in (train_df, test_df))
# Targets: column 11 of the training frame (Sales) and column 3 of the
# submission frame.
# NOTE(review): the layout of y_df is not visible here - confirm that column 3
# really holds the sales values to compare the predictions against.
y_train = train_df.iloc[:, 11].to_numpy()
y_test = y_df.iloc[:, 3].to_numpy()
X_train.head()

Columns like Item ID do not need to be considered as variables contributing to Sales, hence they are removed.
To train the ML models, we have separated the dependent variable, y_train (Sales), from the independent variables, X_train. For testing the ML models, a test set has been declared (X_test). y_test is considered to be the actual sales for the test set. y_test will be used to compare against the predicted sales for the test set.

In [None]:
# Encoding using One Hot Encoder
# Columns 1, 3, 5, 6, 7, 8, 9 of X_train/X_test are one-hot encoded; the
# remaining columns are passed through unchanged and placed after them.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# result does not depend on how sparse the encoded matrix happens to be.
ohe = ColumnTransformer([('onehotencoder',OneHotEncoder(),[1,3,5,6,7,8,9])],
                        remainder = 'passthrough',
                        sparse_threshold = 0)
X_train = ohe.fit_transform(X_train)
# Reuse the encoder fitted on the training set: re-fitting on the test set
# could yield a different set or order of dummy columns.
X_test = ohe.transform(X_test)

The categorical variables are encoded using the Label encoder, which allocates a whole number to each category. The machine can interpret a category with a higher number as being greater than a category with a lower number.
For example, two categories of variable Item_Type, dairy and meat, have been encoded with the number 1 and 5 respectively. Now the machine will interpret that meat is greater than dairy, which is illogical.

To rectify this, One Hot Encoder is used. This object creates new columns with binary values for every category. So if a variable has 5 different categories, the OneHotEncoder will create 5 new columns with binary values. For a specific record (row), the new column that corresponds to the row's category will have the value 1 and the rest of the new columns will have 0.

In [None]:
# Dropping dummy columns to evade dummy variable trap
# The one-hot columns sit at the left of the matrix, one group per encoded
# variable and in the order the variables were listed. The first column of
# each group therefore starts at the running total of the previous group
# sizes; for this data that is [0, 2, 18, 28, 37, 40, 43].
category_counts = [len(cats) for cats in
                   ohe.named_transformers_['onehotencoder'].categories_]
first_dummy_cols = np.cumsum([0] + category_counts[:-1])
X_train = np.delete(X_train, first_dummy_cols, axis = 1)
X_test = np.delete(X_test, first_dummy_cols, axis = 1)

One column is removed for every variable which is one-hot encoded, to tackle multicollinearity among the dummy columns. This is known as the dummy variable trap, and it can affect the model in a bad way. All the one-hot encoded variables are aligned to the left of the data set and in the same order. So, depending on the number of categories in each variable, the first dummy column of each variable is removed.

In [None]:
# ----------------------------------------- Building Regression Models -------------------------------------------

In [None]:
# Linear Regression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. Plain least squares has no penalty
# term, so dropping it is not expected to change the predictions.
LR_regressor = LinearRegression()
LR_regressor.fit(X_train, y_train)
y_pred_LR = LR_regressor.predict(X_test)

# Model Evaluation (Linear Regression): compare predictions against y_test
mse_LR = metrics.mean_squared_error(y_test, y_pred_LR)
r2_LR = metrics.r2_score(y_test, y_pred_LR)
RMSE_LR = np.sqrt(mse_LR)   # root of the mean squared error

print("---------------------- Linear Regression ----------------------\n",
     "Mean Squared Error: ", mse_LR, "\n",
     "R Squared: ", r2_LR, "\n",
     "Root Mean Squared Error: ", RMSE_LR)

In [None]:
# Decision Tree Regression: fit on the training matrix, predict the test matrix
DT_regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
y_pred_DTR = DT_regressor.predict(X_test)

# Model Evaluation: MSE, R squared and RMSE of the predictions against y_test
r2_DTR = metrics.r2_score(y_test, y_pred_DTR)
mse_DTR = metrics.mean_squared_error(y_test, y_pred_DTR)
RMSE_DTR = np.sqrt(mse_DTR)

report_DTR = (
    "---------------------- Decision Tree Regression ----------------------\n",
    "Mean Squared Error: ", mse_DTR, "\n",
    "R Squared: ", r2_DTR, "\n",
    "Root Mean Squared Error: ", RMSE_DTR,
)
print(*report_DTR)

In [None]:
# Random Forest Regression (300 trees): fit on the training matrix,
# predict the test matrix
RF_regressor = RandomForestRegressor(n_estimators=300, random_state=0).fit(X_train, y_train)
y_pred_RF = RF_regressor.predict(X_test)

# Model Evaluation: MSE, R squared and RMSE of the predictions against y_test
r2_RF = metrics.r2_score(y_test, y_pred_RF)
mse_RF = metrics.mean_squared_error(y_test, y_pred_RF)
RMSE_RF = np.sqrt(mse_RF)

report_RF = (
    "---------------------- Random Forest Regression ----------------------\n",
    "Mean Squared Error: ", mse_RF, "\n",
    "R Squared: ", r2_RF, "\n",
    "Root Mean Squared Error: ", RMSE_RF,
)
print(*report_RF)


In [None]:
# Exporting results to csv File
# By this point test_df.Out_ID has been overwritten with integer label codes
# by the encoding step, so the identifiers are read again from the raw test
# file: file columns 0 and 6 are the ones renamed to Item_ID and Out_ID.
# test_df is never filtered or re-ordered, so the rows line up with y_pred_LR.
raw_ids = pd.read_csv("../input/big-mart-sales-prediction/Test.csv", usecols=[0, 6])
results = pd.DataFrame({
    'Item_Identifier': raw_ids.iloc[:, 0],
    'Outlet_Identifier': raw_ids.iloc[:, 1],
    'Item_Outlet_Sales': y_pred_LR    # predictions of the linear regression model
})
results.to_csv('Submission_Sid.csv')