In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **To Build a predictive model and predict the sales of each product at a particular outlet.**

# *Dataset Description*

The dataset contains a set of **8,523 records** with **12 attributes**

* **Item_Identifier** --- Unique product ID

* **Item_Weight** --- Weight of product

* **Item_Fat_Content** --- Checks the Concentration of fat in the product

* **Item_Visibility** --- The % of total display area of all similar products in a store

* **Item_Type** --- Product Category

* **Item_MRP** --- Maximum Retail Price for a Product

* **Outlet_Identifier** --- Store ID

* **Outlet_Establishment_Year** --- The year in which store was established

* **Outlet_Size** --- The size of the store (Area Size Category)

* **Outlet_Location_Type** --- In Terms of city Tiers (Size)

* **Outlet_Type** --- Grocery store or a type of supermarket

* **Item_Outlet_Sales** --- Sales of the product In the Specific outlet

# *Libraries*

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# %matplotlib inline will lead to static images of your plot embedded in the notebook

from sklearn.ensemble import  RandomForestRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# *Checking Dataset*

In [None]:
# Load the training split and preview five randomly sampled rows.
train_csv_path = '../input/bigmart-sales-data/Train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.sample(n=5)

In [None]:
# Load the testing split and preview five randomly sampled rows.
test_csv_path = '../input/bigmart-sales-data/Test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.sample(n=5)

In [None]:
# Summary of the training data: column dtypes and non-null counts.
df_train.info()

In [None]:
# Summary of the testing data: column dtypes and non-null counts.
df_test.info()

In [None]:
# Number of missing values in each column of the training data.
df_train.isna().sum()

In [None]:
# Number of missing values in each column of the testing data.
df_test.isna().sum()

# *Overview of Null Values in Dataset*

In [None]:
# Overall share of missing cells in the training data.
missing_data = df_train.isnull().sum()
# Divide by the total number of cells (rows x columns). Dividing by the row
# count alone yields "missing cells per 100 rows", which is not a percentage
# of the dataset and can exceed 100%.
total_percentage = (missing_data.sum() / df_train.size) * 100
print(f'The total percentage of missing values in Training Dataset is {round(total_percentage,2)}%')

In [None]:
# Per-column missing-value count and percentage, keeping only the columns
# that actually contain missing values.
null_counts = df_train.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent_total = (null_counts / df_train.isnull().count()).sort_values(ascending=False) * 100
missing = pd.concat({"Total": total, "Percentage": percent_total}, axis=1)
missing_data = missing.loc[missing['Total'] > 0]
missing_data

In [None]:
# Bar chart of the missing-value percentage for each affected feature.
plt.figure(figsize=(6, 6))
sns.set(style="whitegrid")
missing_ax = sns.barplot(
    data=missing_data,
    x=missing_data.index,
    y=missing_data['Percentage'],
)
missing_ax.set_title('Percentage of missing data by feature')
missing_ax.set_xlabel('Features', fontsize=14)
missing_ax.set_ylabel('Percentage', fontsize=14)
plt.show()

# *Null Value Treatment*

In [None]:
# Item_Weight is treated as related to Item_Type, so each missing weight is
# filled with the mean weight of its own Item_Type.
# Training Data
Category_mean = df_train.groupby('Item_Type')['Item_Weight'].mean()
# Look the mean up by Item_Type *label*. The previous `Category_mean[i]` used an
# integer position on a string-indexed Series, which pandas has deprecated and
# will treat as a label lookup in the future.
df_train['Item_Weight'] = df_train['Item_Weight'].fillna(df_train['Item_Type'].map(Category_mean))

In [None]:
# Item_Weight is treated as related to Item_Type, so each missing weight is
# filled with the mean weight of its own Item_Type.
# Testing Data
Category_mean_test = df_test.groupby('Item_Type')['Item_Weight'].mean()
# Look the mean up by Item_Type *label*. The previous `Category_mean_test[i]`
# used an integer position on a string-indexed Series, which pandas has
# deprecated and will treat as a label lookup in the future.
df_test['Item_Weight'] = df_test['Item_Weight'].fillna(df_test['Item_Type'].map(Category_mean_test))

In [None]:
# Treatment of null values in Outlet_Size column for training data:
# fill them with the most frequent non-null category.
from statistics import mode  # no longer used here; kept in case other cells rely on the name
# Series.mode() ignores NaN, whereas statistics.mode would count NaN as a
# candidate value and could "fill" NaN with NaN.
outlet_size_modes_train = df_train['Outlet_Size'].mode()
if not outlet_size_modes_train.empty:
    # Assign the result back instead of a chained `fillna(..., inplace=True)`,
    # which does not update the frame under pandas Copy-on-Write.
    df_train['Outlet_Size'] = df_train['Outlet_Size'].fillna(outlet_size_modes_train.iloc[0])

In [None]:
# Treatment of null values in Outlet_Size column for testing data:
# fill them with the most frequent non-null category.
from statistics import mode  # no longer used here; kept in case other cells rely on the name
# Series.mode() ignores NaN, whereas statistics.mode would count NaN as a
# candidate value and could "fill" NaN with NaN.
outlet_size_modes_test = df_test['Outlet_Size'].mode()
if not outlet_size_modes_test.empty:
    # Assign the result back instead of a chained `fillna(..., inplace=True)`,
    # which does not update the frame under pandas Copy-on-Write.
    df_test['Outlet_Size'] = df_test['Outlet_Size'].fillna(outlet_size_modes_test.iloc[0])

In [None]:
## Confirm that the two imputed training columns no longer contain nulls
for imputed_col in ('Item_Weight', 'Outlet_Size'):
    remaining_nulls = df_train[imputed_col].isna().sum()
    print(f'Null values in df_train[{imputed_col}] : ', remaining_nulls)

In [None]:
## Confirm that the two imputed testing columns no longer contain nulls
for imputed_col in ('Item_Weight', 'Outlet_Size'):
    remaining_nulls = df_test[imputed_col].isna().sum()
    print(f'Null values in df_test[{imputed_col}] : ', remaining_nulls)

# *Data Visualisation*

In [None]:
# Count plots of the categorical features in the training data.
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size',
                    'Outlet_Location_Type', 'Outlet_Type']
fig, ax = plt.subplots(2, 3, figsize=(35, 8))
for panel, categorical_col in zip(ax.flat, categorical_cols):
    # Pass the raw column by keyword. Outlet_Location_Type was previously
    # plotted from `.value_counts()`, i.e. it counted the counts rather than
    # the categories, and the positional form is interpreted as `data` (not
    # `x`) by newer seaborn releases.
    sns.countplot(x=df_train[categorical_col], ax=panel)
# Only five features are plotted; hide the unused sixth panel.
ax[1, 2].set_visible(False)
plt.show()

In [None]:
# Distribution plots of the numeric features in the TRAINING data
# (the earlier header said "Countplot For Testing Data", which did not match the code).
numeric_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP',
                'Outlet_Establishment_Year', 'Item_Outlet_Sales']
fig, ax = plt.subplots(2, 3, figsize=(35, 8))
for panel, numeric_col in zip(ax.flat, numeric_cols):
    # `sns.distplot` is deprecated; a density-scaled histogram with a KDE
    # overlay is its documented replacement.
    sns.histplot(x=df_train[numeric_col], kde=True, stat='density', ax=panel)
# Only five features are plotted; hide the unused sixth panel.
ax[1, 2].set_visible(False)
plt.show()

In [None]:
# Scatter of Item_Outlet_Sales against Item_Weight (semi-transparent dots).
plt.figure(figsize=(12, 7))
weight_ax = plt.gca()
weight_ax.set_xlabel("Item_Weight")
weight_ax.set_ylabel("Item_Outlet_Sales")
weight_ax.set_title("Item_Weight and Item_Outlet_Sales Analysis")
weight_ax.plot(df_train["Item_Weight"], df_train["Item_Outlet_Sales"], '.', alpha=0.3)

In [None]:
# Scatter of Item_Outlet_Sales against Item_Visibility (semi-transparent dots).
plt.figure(figsize=(12, 7))
visibility_ax = plt.gca()
visibility_ax.set_xlabel("Item_Visibility")
visibility_ax.set_ylabel("Item_Outlet_Sales")
visibility_ax.set_title("Item_Visibility and Item_Outlet_Sales Analysis")
visibility_ax.plot(df_train["Item_Visibility"], df_train["Item_Outlet_Sales"], '.', alpha=0.3)

In [None]:
# Scatter of Item_Outlet_Sales against Item_MRP (semi-transparent dots).
plt.figure(figsize=(12, 7))
mrp_ax = plt.gca()
mrp_ax.set_xlabel("Item_MRP")
mrp_ax.set_ylabel("Item_Outlet_Sales")
mrp_ax.set_title("Item_MRP and Item_Outlet_Sales Analysis")
mrp_ax.plot(df_train["Item_MRP"], df_train["Item_Outlet_Sales"], '.', alpha=0.3)

In [None]:
# Median Item_Outlet_Sales for each Outlet_Establishment_Year, shown as a bar chart.
# The aggregation is named by string: passing the callable `np.median` is
# deprecated in recent pandas releases and emits a FutureWarning.
Outlet_Establishment_Year_pivot = df_train.pivot_table(
    index='Outlet_Establishment_Year',
    values="Item_Outlet_Sales",
    aggfunc='median',
)

Outlet_Establishment_Year_pivot.plot(kind='bar', color='blue', figsize=(12, 7))
plt.xlabel("Outlet_Establishment_Year")
plt.ylabel("Item_Outlet_Sales")
plt.title("Impact of Outlet_Establishment_Year on Item_Outlet_Sales")
plt.xticks(rotation=0)
plt.show()

In [None]:
import plotly.express as px
# Sunburst of total sales split by outlet type, then by item type.
fig = px.sunburst(df_train, path=['Outlet_Type', 'Item_Type'], values='Item_Outlet_Sales')
# The figure was previously only assigned, so the cell rendered nothing.
fig.show()

In [None]:
# Sunburst of sales nested by location type, then outlet identifier, then outlet type.
px.sunburst(df_train,path=['Outlet_Location_Type','Outlet_Identifier','Outlet_Type'],values='Item_Outlet_Sales')

# *Inference*

**Item_Weight** - No specific pattern; the data is widely spread.

**Item_Visibility** - Also appears widely spread, but some concentration around (0, 0) indicates that low-visibility items do not sell well in some cases.

**Item_MRP** - Items with average higher MRP tend to sell better in most cases.

**Outlet_Age** - No specific inference, as both old and new outlets perform well, with a few exceptions.

# *Treating Categorical Columns*

In [None]:
# Training data summary (dtypes and non-null counts), used to spot the columns that still need encoding.
df_train.info()

In [None]:
# Testing data summary (dtypes and non-null counts), used to spot the columns that still need encoding.
df_test.info()

In [None]:
# Item_Identifier is dropped from the training features.
# The former `.unique()` call was removed: it was not the last expression of the
# cell, so its result was silently discarded. `errors='ignore'` and reassignment
# (instead of `inplace=True`) keep the cell safe to re-run.
df_train = df_train.drop(columns='Item_Identifier', errors='ignore')

In [None]:
# Item_Identifier is dropped from the testing features.
# The former `.unique()` call was removed: it was not the last expression of the
# cell, so its result was silently discarded. `errors='ignore'` and reassignment
# (instead of `inplace=True`) keep the cell safe to re-run.
df_test = df_test.drop(columns='Item_Identifier', errors='ignore')

In [None]:
# Encode Item_Fat_Content in the training data as 0 (low fat) / 1 (regular).
# Labels are lower-cased first so that the long and short spellings
# ('low fat'/'lf', 'regular'/'reg') collapse onto the same code.
fat_content_codes = {'low fat': 0, 'lf': 0, 'regular': 1, 'reg': 1}
print('Values before Imputing numeric values:', df_train['Item_Fat_Content'].unique())
df_train['Item_Fat_Content'] = df_train['Item_Fat_Content'].str.lower().map(fat_content_codes)
print('Values after Imputing numeric values:', df_train['Item_Fat_Content'].unique())

In [None]:
# Encode Item_Fat_Content in the testing data as 0 (low fat) / 1 (regular).
# Labels are lower-cased first so that the long and short spellings
# ('low fat'/'lf', 'regular'/'reg') collapse onto the same code.
fat_content_codes = {'low fat': 0, 'lf': 0, 'regular': 1, 'reg': 1}
print('Values before Imputing numeric values:', df_test['Item_Fat_Content'].unique())
df_test['Item_Fat_Content'] = df_test['Item_Fat_Content'].str.lower().map(fat_content_codes)
print('Values after Imputing numeric values:', df_test['Item_Fat_Content'].unique())

In [None]:
# Replace the Outlet_Size labels in the training data with integer codes.
# NOTE(review): the codes (Medium=1, High=2, Small=3) do not follow size order.
outlet_size_codes = {'Medium': 1, 'High': 2, 'Small': 3}
print('Values before Imputing numeric values:', df_train['Outlet_Size'].unique())
df_train['Outlet_Size'] = df_train['Outlet_Size'].map(outlet_size_codes)
print('Values after Imputing numeric values:', df_train['Outlet_Size'].unique())

In [None]:
# Replace the Outlet_Size labels in the testing data with integer codes.
# NOTE(review): the codes (Medium=1, High=2, Small=3) do not follow size order.
outlet_size_codes = {'Medium': 1, 'High': 2, 'Small': 3}
print('Values before Imputing numeric values:', df_test['Outlet_Size'].unique())
df_test['Outlet_Size'] = df_test['Outlet_Size'].map(outlet_size_codes)
print('Values after Imputing numeric values:', df_test['Outlet_Size'].unique())

In [None]:
# Replace the 'Tier N' labels of Outlet_Location_Type (training data) with the integer N.
location_tier_codes = {'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3}
print('Values before Imputing :', df_train['Outlet_Location_Type'].unique())
df_train['Outlet_Location_Type'] = df_train['Outlet_Location_Type'].map(location_tier_codes)
print('Values after Imputing :', df_train['Outlet_Location_Type'].unique())

In [None]:
# Replace the 'Tier N' labels of Outlet_Location_Type (testing data) with the integer N.
location_tier_codes = {'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3}
print('Values before Imputing :', df_test['Outlet_Location_Type'].unique())
df_test['Outlet_Location_Type'] = df_test['Outlet_Location_Type'].map(location_tier_codes)
print('Values after Imputing :', df_test['Outlet_Location_Type'].unique())

In [None]:
# Replace the Outlet_Type labels in the training data with integer codes.
outlet_type_codes = {
    'Supermarket Type1': 1,
    'Supermarket Type2': 2,
    'Grocery Store': 3,
    'Supermarket Type3': 4,
}
print(df_train['Outlet_Type'].unique())
df_train['Outlet_Type'] = df_train['Outlet_Type'].map(outlet_type_codes)
print('Values after Imputing:', df_train['Outlet_Type'].unique())

In [None]:
# Replace the Outlet_Type labels in the testing data with integer codes.
outlet_type_codes = {
    'Supermarket Type1': 1,
    'Supermarket Type2': 2,
    'Grocery Store': 3,
    'Supermarket Type3': 4,
}
print(df_test['Outlet_Type'].unique())
df_test['Outlet_Type'] = df_test['Outlet_Type'].map(outlet_type_codes)
print('Values after Imputing:', df_test['Outlet_Type'].unique())

In [None]:
# One-hot encode Item_Type, Outlet_Identifier and Outlet_Type in the training data;
# drop_first=True omits the first level of each column.
df_train=pd.get_dummies(df_train, columns= ['Item_Type','Outlet_Identifier','Outlet_Type'],drop_first=True)

In [None]:
# Draw one box plot per column for the first seven columns of the training data,
# printing the column name above each plot.
leading_columns = df_train.columns[0:7]
for col in leading_columns:
    print(col)
    sns.boxplot(data=df_train, x=df_train[col])
    plt.show()

In [None]:
def boxoutlier(var, columns=None, whisker=1.5):
    """Cap outliers of the selected columns at the box-plot whisker limits.

    For every selected column the limits are ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR``; values outside that range are replaced by the
    nearest limit. Missing values are left untouched.

    Parameters
    ----------
    var : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    columns : str or iterable of str, optional
        Column label(s) to cap. When omitted, only the column at position 2
        is processed, which is what the original hard-coded
        ``iloc[:, 2:3]`` selection did.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The same ``var`` object, with the selected columns capped.
    """
    if columns is None:
        # Backward-compatible default: the single column at position 2
        # (an empty selection when the frame has fewer than three columns).
        columns = var.columns[2:3]
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    for x in columns:
        Q1 = var[x].quantile(0.25)
        Q3 = var[x].quantile(0.75)
        IQR = Q3 - Q1
        Lower = Q1 - (whisker * IQR)
        Upper = Q3 + (whisker * IQR)
        # Replace the whole column rather than writing through `.loc`, so float
        # limits can be stored even when the column started out as integers.
        var[x] = var[x].clip(lower=Lower, upper=Upper)

    return var
# Cap outliers in the training data; as called here, only the column at position 2 is processed.
df_train=boxoutlier(df_train)

In [None]:
# Redraw the box plots of the first seven columns to inspect the effect of the capping step.
leading_columns = df_train.columns[0:7]
for col in leading_columns:
    print(col)
    sns.boxplot(data=df_train, x=df_train[col])
    plt.show()

In [None]:
# One-hot encode the same three columns in the testing data (drop_first=True omits the first level of each).
# NOTE(review): encoded independently of df_train, so the resulting columns match the
# training columns only if both files contain the same category levels - confirm.
df_test=pd.get_dummies(df_test, columns= ['Item_Type','Outlet_Identifier','Outlet_Type'],drop_first=True)

In [None]:
# Preview five randomly sampled rows of the training data after encoding.
df_train.sample(5)

In [None]:
# Feature matrix: every training column except the target, followed by a
# per-column count of remaining missing values.
X = df_train.drop(columns=['Item_Outlet_Sales'])
X.isna().sum()

In [None]:
# Target vector: the Item_Outlet_Sales column of the training data.
y=df_train['Item_Outlet_Sales']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation (fixed seed), then preview
# the head of each resulting piece.
split_parts = train_test_split(X, y, test_size=0.3, random_state=4)
X_train, X_test, y_train, y_test = split_parts
for preview in (X_train.head(), y_train.head(), 'Testing Data', X_test.head(), y_test.head()):
    display(preview)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the hold-out split and to the unlabelled test set.
sc = StandardScaler()
# Remember the training column layout before X_train becomes a plain array.
feature_columns = X.columns
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# The unlabelled test set was one-hot encoded separately, so align it to the
# training columns first: same order, any missing dummy column filled with 0,
# any extra column dropped. Without this a mismatch either raises or silently
# scales the wrong columns.
df_test = sc.transform(df_test.reindex(columns=feature_columns, fill_value=0))

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

In [None]:
from sklearn.metrics import mean_squared_error

# Decision-tree *regressor*. The seed (arbitrary value) makes the fitted trees,
# and therefore the reported scores, repeatable across runs.
clf = DecisionTreeRegressor(random_state=4)
# Candidate values for the maximum tree depth explored by the grid search.
param_dist = {
            'max_depth': [2,5,10,50,25,30,40,],
             }

# 6-fold cross-validation over the grid.
dt_gs = GridSearchCV(clf, param_grid=param_dist, cv = 6)
dt_gs.fit(X_train, y_train)  # fit the grid search on the training split
print('Best parameters', dt_gs.best_params_)

# Predictions on the hold-out split and on the unlabelled test set.
predict_Xtest_gcv = dt_gs.predict(X_test)
predict_test_gcv = dt_gs.predict(df_test)

# Hold-out error of the best tree.
dt_mse = mean_squared_error(y_test, predict_Xtest_gcv)
print('mean_squared_error', dt_mse)
print('RMSE', np.sqrt(dt_mse))

In [None]:
from sklearn.metrics import mean_squared_error

# Random forest regressor with fixed hyper-parameters and seed.
rf_c = RandomForestRegressor(n_estimators=50, max_depth=15, random_state = 47, min_samples_leaf = 10)
rf_c.fit(X_train, y_train)

# Predict with the forest itself. These lines previously called `dt_gs.predict`,
# so the "Random Forest" metrics below were really the decision tree's.
predict_Xtest_rf = rf_c.predict(X_test)
predict_test_rf = rf_c.predict(df_test)

# Hold-out error of the random forest.
rf_mse = mean_squared_error(y_test, predict_Xtest_rf)
print('Random Forest ', 'mean_squared_error', rf_mse)
print('Random Forest ', 'RMSE', np.sqrt(rf_mse))
