In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IMPORT MODULES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

# LOADING THE DATASET

In [None]:
df = pd.read_csv('../input/bigmart-sales-data/Train.csv')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# unique values in dataset
df.apply(lambda x: len(x.unique()))

# PREPROCESSING THE DATASET

In [None]:
df.isnull().sum()

In [None]:
# categorical attributes
cat_col = []
for x in df.dtypes.index:
    if df.dtypes[x] == 'object':
        cat_col.append(x)

In [None]:
cat_col

In [None]:
cat_col.remove('Item_Identifier')

In [None]:
cat_col.remove('Outlet_Identifier')

In [None]:
cat_col

In [None]:
for col in cat_col:
    print(col)
    print(df[col].value_counts())
    print()

In [None]:
item_weight_mean = df.pivot_table(values = "Item_Weight", index = 'Item_Identifier')

In [None]:
item_weight_mean

In [None]:
miss_bool = df['Item_Weight'].isnull()

In [None]:
miss_bool

In [None]:
# Fill missing weights with the mean weight recorded for the same item, and
# fall back to the overall mean only when an item has no recorded weight.
#
# item_weight_mean is indexed by Item_Identifier, so the lookup has to go
# through that index: `item in item_weight_mean` tests the column labels
# instead and is never true for an identifier, which sent every row to the
# overall-mean fallback. Assigning the whole column at once also avoids the
# chained `df['Item_Weight'][i] = ...` writes, which may hit a temporary copy.
per_item_mean = df['Item_Identifier'].map(item_weight_mean['Item_Weight'])
overall_mean = df['Item_Weight'].mean()
df['Item_Weight'] = df['Item_Weight'].fillna(per_item_mean).fillna(overall_mean)

In [None]:
df['Item_Weight'].isnull().sum()

In [None]:
outlet_size_mode = df.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))

In [None]:
outlet_size_mode

In [None]:
miss_bool = df['Outlet_Size'].isnull()

In [None]:
# outlet_size_mode has a single row labelled 'Outlet_Size' and one column per
# Outlet_Type; selecting that row gives a Series mapping each outlet type to
# its modal size, so .map() yields exactly one scalar per missing row.
# (Indexing the frame as outlet_size_mode[x] returns a one-element Series
# rather than a scalar, which made the assignment depend on implicit alignment.)
size_by_outlet_type = outlet_size_mode.loc['Outlet_Size']
df.loc[miss_bool, 'Outlet_Size'] = df.loc[miss_bool, 'Outlet_Type'].map(size_by_outlet_type)

In [None]:
df['Outlet_Size'].isnull().sum()

In [None]:
sum(df['Item_Visibility'] == 0)

In [None]:
# Replace zero visibility with the column mean and assign the result back.
# Calling replace(..., inplace=True) on the object returned by df.loc[...] can
# modify a temporary instead of df, leaving the zeros in place.
# The mean is taken over the whole column, zeros included, as before.
df['Item_Visibility'] = df['Item_Visibility'].replace(0, df['Item_Visibility'].mean())

In [None]:
sum(df['Item_Visibility']==0)

In [None]:
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF':'Low Fat', 'reg':'Regular', 'low fat':'Low Fat'})
df['Item_Fat_Content'].value_counts()

# CREATION OF NEW ATTRIBUTES

In [None]:
df['New_Item_Type'] = df['Item_Identifier'].apply(lambda x: x[:2])

In [None]:
df['New_Item_Type']

In [None]:
df['New_Item_Type'] = df['New_Item_Type'].map({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})
df['New_Item_Type'].value_counts()

In [None]:
df.loc[ df ['New_Item_Type'] =='Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'
df['Item_Fat_Content'].value_counts()

In [None]:
df['Outlet_Years'] = 2013 - df['Outlet_Establishment_Year']

In [None]:
df['Outlet_Years']

In [None]:
df.head()

# EXPLORATORY DATA ANALYSIS

In [None]:
sns.distplot(df['Item_Weight'] , color = 'red')

In [None]:
sns.distplot(df['Item_Visibility'] , color = 'green')

In [None]:
sns.distplot(df['Item_MRP'] , color = 'purple')

In [None]:
sns.distplot(df['Item_Outlet_Sales'] , color = 'orange')

In [None]:
# log transformation:
df['Item_Outlet_Sales'] = np.log(1+df['Item_Outlet_Sales'])

In [None]:
sns.distplot(df['Item_Outlet_Sales'] , color = 'orange')

In [None]:
sns.countplot(df["Item_Fat_Content"] , palette="Set3")

In [None]:
l = list(df['Item_Type'].unique())
chart = sns.countplot(df["Item_Type"],palette="Set3")
chart.set_xticklabels(labels=l, rotation=90)

In [None]:
sns.countplot(df['Outlet_Establishment_Year'] , palette="Set3")

In [None]:
sns.countplot(df['Outlet_Size'], palette="Set3")

In [None]:
sns.countplot(df['Outlet_Location_Type'] , palette="Set3")

In [None]:
sns.countplot(df['Outlet_Type'] , palette = 'Set3')

# CORRELATION MATRIX

In [None]:
# Correlate the numeric columns only: df still holds object (string) columns
# at this point, and DataFrame.corr() no longer drops them silently in
# pandas >= 2.0 (it raises instead).
corr = df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix on a wide 15x5 figure.
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(corr, annot=True, ax=ax)

# LABEL ENCODING

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
df['Outlet'] = le.fit_transform(df['Outlet_Identifier'])

In [None]:
cat_col = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type']
for col in cat_col:
    df[col] = le.fit_transform(df[col])

# ONEHOT ENCODING

In [None]:
df = pd.get_dummies(df, columns=['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type'])

In [None]:
df.head()

# INPUT SPLIT

In [None]:
X = df.drop(columns=['Outlet_Establishment_Year', 'Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])

In [None]:
y = df['Item_Outlet_Sales']

# MODEL TRAINING

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
def train(model, X, y):
    """Fit ``model`` on the full data and print a short error report.

    Prints two numbers:

    * ``MSE``: mean squared error of the fitted model's predictions on the
      same ``X`` it was trained on (an in-sample figure).
    * ``CV Score``: absolute value of the mean ``neg_mean_squared_error``
      over a 5-fold ``cross_val_score`` run.

    The model is left fitted on all of ``X``/``y``. Nothing is returned.
    """
    model.fit(X, y)
    pred = model.predict(X)

    # Scores come back negated (higher is better), hence the abs() on the mean.
    fold_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    cv_score = np.abs(fold_scores.mean())

    print("Model Report")
    print("MSE:", mean_squared_error(y, pred))
    print("CV Score:", cv_score)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
# The `normalize` argument was removed from scikit-learn's linear models in
# release 1.2, so passing it raises a TypeError there. Ordinary least-squares
# predictions do not depend on feature scaling, so plain LinearRegression()
# is used; coef_ is reported in the original feature units.
model = LinearRegression()
train(model, X, y)
coef = pd.Series(model.coef_, X.columns).sort_values()
coef.plot(kind='bar', title="Model Coefficients",color = 'lightblue')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The `normalize` argument was removed from scikit-learn's linear models in
# release 1.2. Scaling is now an explicit pipeline step. normalize=True
# divided each centred column by its L2 norm (std * sqrt(n_samples)), so on
# unit-variance features the matching penalty is alpha * n_samples.
# NOTE(review): inside cross-validation each fold sees fewer rows, so the
# match with the old behaviour is approximate there.
model = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=1.0 * len(X)))
train(model, X, y)

# Divide by the scaler's per-feature scale to express the coefficients in the
# original feature units.
ridge_step = model.named_steps['ridge']
scaler_step = model.named_steps['standardscaler']
coef = pd.Series(ridge_step.coef_ / scaler_step.scale_, X.columns).sort_values()
coef.plot(kind='bar', title="Model Coefficients" , color = 'red')

In [None]:

# Lasso with default settings; bars show the fitted coefficients in ascending order.
model = Lasso()
train(model, X, y)
coef = pd.Series(model.coef_, index=X.columns).sort_values()
coef.plot.bar(title="Model Coefficients", color='lightblue')

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fix the seed so the fitted tree, the printed scores and the importance
# ranking are the same on every run.
model = DecisionTreeRegressor(random_state=42)
train(model, X, y)
coef = pd.Series(model.feature_importances_, X.columns).sort_values(ascending=False)
coef.plot(kind='bar', title="Feature Importance" , color ='red')

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fix the seed so the fitted forest, the printed scores and the importance
# ranking are the same on every run.
model = RandomForestRegressor(random_state=42)
train(model, X, y)
coef = pd.Series(model.feature_importances_, X.columns).sort_values(ascending=False)
coef.plot(kind='bar', title="Feature Importance" , color = 'lightblue')

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Fix the seed so the fitted ensemble, the printed scores and the importance
# ranking are the same on every run.
model = ExtraTreesRegressor(random_state=42)
train(model, X, y)
coef = pd.Series(model.feature_importances_, X.columns).sort_values(ascending=False)
coef.plot(kind='bar', title="Feature Importance" , color = 'red')