In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns',None)

In [None]:
# Load Train.csv of the Big Mart sales data set and preview its first rows.
df = pd.read_csv('../input/big-mart-sales-prediction/Train.csv')
df.head()

In [None]:
# Load Test.csv of the same data set and preview its first rows.
dftest = pd.read_csv('../input/big-mart-sales-prediction/Test.csv')
dftest.head()

In [None]:
# Type of data: per-column dtype and non-null count of the training frame
df.info()

In [None]:
# Number of distinct values in each column of the training frame
df.nunique()

In [None]:
# Summary statistics of the training frame, transposed so that each column becomes a row
df.describe().T

## Data Cleaning

In [None]:
# Item_Fat_Content labels are not standardized; list the raw labels present in the training data
print(df['Item_Fat_Content'].unique())

In [None]:
# Lookup table that collapses every raw fat-content spelling into 'low' or 'regular'.
item_fat = {
    'Low Fat': 'low',
    'LF': 'low',
    'low fat': 'low',
    'Regular': 'regular',
    'reg': 'regular',
}

# Apply the same normalization to the training and the test frame.
for frame in (df, dftest):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].map(item_fat)

# Only the two normalized classes should remain.
print(df['Item_Fat_Content'].unique())

## Treating Missing Values

In [None]:
# Share of missing values per column, expressed as a percentage of all rows
df.isna().sum() / len(df) * 100

### Item Pre Processing

In [None]:
# Distribution of 'Item_Fat_Content' among the rows whose 'Item_Weight' is missing
missing_weight_fat = df.loc[df['Item_Weight'].isnull(), 'Item_Fat_Content']
print(missing_weight_fat.value_counts())

# Pass the series as `x=`: recent seaborn releases no longer accept the plotted
# variable positionally, and the later count plots in this notebook use `x=` too.
sns.countplot(x=missing_weight_fat)
plt.show()

In [None]:
# Distribution of 'Item_Type' among the rows whose 'Item_Weight' is missing
missing_weight_type = df.loc[df['Item_Weight'].isnull(), 'Item_Type']
print(missing_weight_type.value_counts())

plt.figure(figsize=(15,5))
# Pass the series as `x=`: recent seaborn releases no longer accept the plotted
# variable positionally, and the later count plots in this notebook use `x=` too.
sns.countplot(x=missing_weight_type)
plt.xticks(rotation=90)  # item type labels are long; rotate them so they stay readable
plt.show()

In [None]:
# Identifiers of every item that has at least one record with a missing weight.
weight_is_missing = df['Item_Weight'].isnull()
item_having_null = df.loc[weight_is_missing, 'Item_Identifier'].unique()

# Preview all records (with and without a weight) of those items, ordered by identifier.
affected_records = df[df['Item_Identifier'].isin(item_having_null)]
affected_records.sort_values('Item_Identifier').head()

#  Item_Weight and Item_MRP are correlated within each Item_Identifier

In [None]:
# Creating a new column called Item_MRP_per_unit_weight (price per unit of weight).
# The ratio is NaN wherever Item_Weight is missing; those gaps are filled later.
for frame in (df, dftest):
    mrp_per_unit_weight = frame['Item_MRP'] / frame['Item_Weight']
    if 'Item_MRP_per_unit_weight' in frame.columns:
        # Cell was run before: refresh the values instead of letting insert() raise.
        frame['Item_MRP_per_unit_weight'] = mrp_per_unit_weight
    else:
        # Position 6 puts the new column right after the item columns, which the
        # positional item/outlet split further down relies on.
        frame.insert(6, 'Item_MRP_per_unit_weight', mrp_per_unit_weight)

df.head(5)

In [None]:
# Bifurcating all the item columns from the outlet columns.
# The first 7 columns describe the item; the rest describe the outlet. The last training
# column is excluded from the outlet frame (the test slice keeps every remaining column).
# .copy() makes each part an independent frame, so the columns added to them later do not
# trigger SettingWithCopyWarning or write through to df / dftest.
item_train = df.iloc[:,:7].copy()
outlet_train = df.iloc[:,7:-1].copy()

item_test = dftest.iloc[:,:7].copy()
outlet_test = dftest.iloc[:,7:].copy()

In [None]:
# Average Item_MRP_per_unit_weight of every Item_Identifier, indexed by the identifier.
item_train_mean_mrp = (
    item_train[['Item_Identifier', 'Item_MRP_per_unit_weight']]
    .groupby('Item_Identifier')
    .mean()
)
item_train_mean_mrp['Item_MRP_per_unit_weight'].head()

In [None]:
# Flag the records whose Item_MRP_per_unit_weight is null (i.e. Item_Weight was missing)
item_train['MRP_null'] = item_train['Item_MRP_per_unit_weight'].isnull()
item_test['MRP_null'] = item_test['Item_MRP_per_unit_weight'].isnull()

# Two separate statements: joining the prints with a comma built a (None, None)
# tuple that was echoed as the cell's output.
print(item_train['MRP_null'].value_counts())
print(item_test['MRP_null'].value_counts())

In [None]:
# Fill every missing Item_MRP_per_unit_weight with the training mean of the same
# Item_Identifier. Vectorized: no row-by-row loop, no dependence on column positions
# or on a default RangeIndex.
mean_mrp_per_weight = item_train_mean_mrp['Item_MRP_per_unit_weight']

for items in (item_train, item_test):
    # Identifiers without a training mean map to NaN instead of raising a KeyError;
    # those rows stay missing here.
    group_mean = items['Item_Identifier'].map(mean_mrp_per_weight)
    items['Item_MRP_per_unit_weight'] = items['Item_MRP_per_unit_weight'].fillna(group_mean)

In [None]:
# Spot check on one item: its rows should now carry the per-item average
# Item_MRP_per_unit_weight wherever the value was missing.
is_dri11 = item_train['Item_Identifier'].eq('DRI11')
item_train.loc[is_dri11]

In [None]:
# Recover missing weights from the identity Item_Weight = Item_MRP / Item_MRP_per_unit_weight.
for items in (item_train, item_test):
    weight_missing = items['Item_Weight'].isnull()
    recovered_weight = items['Item_MRP'] / items['Item_MRP_per_unit_weight']
    # Only the rows flagged as missing are overwritten; assignment aligns on the index.
    items.loc[weight_missing, 'Item_Weight'] = recovered_weight

In [None]:
# Some items still have a null Item_Weight: none of the records of that Item_Identifier
# carried a weight, so the group mean itself is NaN and could not fill the gap.
# For these we will use KNN imputation.
item_train.loc[ item_train['Item_Weight'].isnull() ]

In [None]:
# Numeric item features that take part in the KNN imputation.
knn_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
item_train_knn = item_train[knn_features]
item_test_knn = item_test[knn_features]

In [None]:
# Scaling of the numerical data.
# The scaler is fitted on the training rows only and then reused for the test rows.
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
# Keep the original index on the rebuilt frames: they are later written back into
# item_train / item_test with an index-aligned assignment.
item_train_knn = pd.DataFrame(
    st.fit_transform(item_train_knn),
    columns=item_train_knn.columns,
    index=item_train_knn.index,
)
item_test_knn = pd.DataFrame(
    st.transform(item_test_knn),
    columns=item_test_knn.columns,
    index=item_test_knn.index,
)

In [None]:
# KNN imputation of the data.
# The imputer is fitted on the training rows only and then applied to the test rows.
from sklearn.impute import KNNImputer
knn = KNNImputer()
# Keep the original index on the rebuilt frames: they are later written back into
# item_train / item_test with an index-aligned assignment.
item_train_knn = pd.DataFrame(
    knn.fit_transform(item_train_knn),
    columns=item_train_knn.columns,
    index=item_train_knn.index,
)
item_test_knn = pd.DataFrame(
    knn.transform(item_test_knn),
    columns=item_test_knn.columns,
    index=item_test_knn.index,
)

In [None]:
# Missing values left per column of the imputed training features
item_train_knn.isnull().sum()

In [None]:
# Drop the helper columns, then overwrite the numeric item features with their
# scaled and imputed counterparts.
helper_columns = ['Item_MRP_per_unit_weight', 'MRP_null']
item_train = item_train.drop(helper_columns, axis=1)

imputed_columns = item_train_knn.columns
item_train[imputed_columns] = item_train_knn
item_train.head()

In [None]:
# Same write-back for the test items: drop the helper columns and take over the
# scaled and imputed numeric features.
helper_columns = ['Item_MRP_per_unit_weight', 'MRP_null']
item_test = item_test.drop(helper_columns, axis=1)

imputed_columns = item_test_knn.columns
item_test[imputed_columns] = item_test_knn
item_test.head()

### Outlet Preprocessing

In [None]:
# Number of missing values in each outlet column
outlet_train.isnull().sum()

In [None]:
# Missing values per outlet column as a percentage of all rows
outlet_train.isna().sum() / len(outlet_train) * 100

In [None]:
# Value count of outlet size in data set.
# Printed explicitly: the expression is not the last line of the cell, so its result
# would otherwise be discarded without being shown.
print(outlet_train['Outlet_Size'].value_counts())

# Pass the series as `x=`: recent seaborn releases no longer accept the plotted
# variable positionally.
sns.countplot(x=outlet_train['Outlet_Size'])
plt.show()

In [None]:
# Flag the records with a missing Outlet_Size so that the grouped counts below can
# contrast the total number of records with the number that have a size.
for outlets in (outlet_train, outlet_test):
    outlets['outlet_null'] = outlets['Outlet_Size'].isna()

In [None]:
# Non-null counts per (outlet type, location tier, establishment year) combination.
# A pattern is visible in the missing Outlet_Size values:
#   - for Grocery Store, only the Tier 3 data is missing
#   - for Supermarket Type1, the Tier 2 data is missing
outlet_grouping = ['Outlet_Type', 'Outlet_Location_Type', 'Outlet_Establishment_Year']
outlet_train.drop(columns=['Outlet_Identifier']).groupby(outlet_grouping).count()

In [None]:
# Non-null counts per Outlet_Identifier.
# The missing Outlet_Size values belong to a few particular outlets:
# only 3 identifiers have missing values, i.e. OUT010, OUT017 and OUT045.
outlet_train.groupby('Outlet_Identifier').count()

In [None]:
# One row per distinct outlet description, ordered by identifier.
#
# OUT017 and OUT045 both belong to Tier 2 and Supermarket Type1. Their missing size can be
# imputed with 'Small', since OUT035 also belongs to Tier 2 / Supermarket Type1 and is 'Small'.
#
# OUT010 is imputed with 'Small' as well, since a grocery store would be smaller
# than a supermarket.
outlet_train.drop_duplicates().sort_values('Outlet_Identifier')

In [None]:
# Imputing outlet size with 'Small'.
# The test frame is imputed too: leaving its Outlet_Size missing would turn into NaN
# features once the size is mapped to a number further down.
for outlets in (outlet_train, outlet_test):
    outlets.loc[outlets['Outlet_Size'].isnull(), 'Outlet_Size'] = 'Small'

In [None]:
# The helper flag has served its purpose; remove it from both outlet frames.
outlet_train = outlet_train.drop('outlet_null', axis=1)
outlet_test = outlet_test.drop('outlet_null', axis=1)

### Joining Item /Outlet

In [None]:
# Target series: the Item_Outlet_Sales column of the training frame
y = df['Item_Outlet_Sales']

In [None]:
# Put the cleaned item and outlet columns back side by side (training features).
x = pd.concat([item_train, outlet_train], axis='columns')
x.head(3)

In [None]:
# Put the cleaned item and outlet columns back side by side (test features).
test = pd.concat([item_test, outlet_test], axis='columns')
test.head(3)

## EDA

In [None]:
# Features and target in a single frame, used for the exploratory plots.
df1 = pd.concat([x, y], axis='columns')

In [None]:
# (rows, columns) of the combined feature + target frame
df1.shape

In [None]:
# Number of sales records per outlet
plt.figure(figsize=(12, 5))
sns.countplot(data=df1, x='Outlet_Identifier')
plt.show()

In [None]:
# Number of sales records per fat-content class
plt.figure(figsize=(5, 5))
sns.countplot(data=df1, x='Item_Fat_Content')
plt.show()

In [None]:
# Checking correlation.
# Restrict to the numeric columns first: df1 still holds string columns, and
# DataFrame.corr() raises on those in pandas >= 2.0.
numeric_corr = df1.select_dtypes(include=np.number).corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

In [None]:
# Item_MRP has a mild positive correlation with Item_Outlet_Sales:
# as the MRP gets higher, the sales get higher too.
plt.figure(figsize=(15, 5))
sns.scatterplot(data=df1, x='Item_MRP', y='Item_Outlet_Sales')
plt.show()

In [None]:
# Item_Visibility against Item_Outlet_Sales
plt.figure(figsize=(15, 5))
sns.scatterplot(data=df1, x='Item_Visibility', y='Item_Outlet_Sales')
plt.show()

In [None]:
# Spread of Item_Outlet_Sales within each Item_Type
plt.figure(figsize=(15, 5))
sns.boxplot(data=df1, x='Item_Type', y='Item_Outlet_Sales')
plt.xticks(rotation=90)  # item type labels are long; rotate them so they stay readable
plt.show()

In [None]:
# Spread of Item_Outlet_Sales within each outlet
plt.figure(figsize=(15, 5))
sns.boxplot(data=df1, x='Outlet_Identifier', y='Item_Outlet_Sales')
plt.show()

## One Hot Encoding

In [None]:
# Item_Identifier is an identifier column, not a feature; remove it from both feature frames.
x = x.drop('Item_Identifier', axis=1)
test = test.drop('Item_Identifier', axis=1)

In [None]:
# Changing Outlet_Establishment_Year into a numerical feature: how long the outlet has
# been operating, measured against a fixed reference year.
# NOTE: this cell transforms the column in place, so it must be run exactly once.
REFERENCE_YEAR = 2020

# Plain vectorized subtraction; the previous `apply(lambda x: ...)` re-used the name of
# the feature frame `x` for its parameter and ran a Python call per row.
x['Outlet_Establishment_Year'] = REFERENCE_YEAR - x['Outlet_Establishment_Year']
test['Outlet_Establishment_Year'] = REFERENCE_YEAR - test['Outlet_Establishment_Year']

# Standardize the age; the scaler is fitted on the training rows only.
st = StandardScaler()
x['Outlet_Establishment_Year'] = st.fit_transform(x[['Outlet_Establishment_Year']])
test['Outlet_Establishment_Year'] = st.transform(test[['Outlet_Establishment_Year']])

In [None]:
# Outlet size is ordinal, so encode it as -1, 0 and 1 for small, medium and high.
outlet_size_codes = {'Small':-1, 'Medium':0, 'High':1}
x['Outlet_Size'] = x['Outlet_Size'].map(outlet_size_codes)
test['Outlet_Size'] = test['Outlet_Size'].map(outlet_size_codes)

In [None]:
def f1(data, columns=None):
    """One-hot encode the non-numeric columns of ``data``.

    Numeric columns are kept unchanged and placed first. Every other column is
    passed to ``pd.get_dummies`` with ``drop_first=True`` and the resulting
    indicator columns are appended to the right of the numeric ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding numeric and/or non-numeric feature columns.
    columns : sequence of str, optional
        Column layout the result is conformed to. Listed columns that the
        encoding did not produce are added and filled with 0; columns that
        are not listed are dropped. ``None`` (default) returns the encoded
        frame as is.

    Returns
    -------
    pandas.DataFrame
        The numeric columns followed by the indicator columns.
    """
    num_data = data.select_dtypes(include=np.number)
    cat_data = data.select_dtypes(exclude=np.number)

    cat_data = pd.get_dummies(cat_data, drop_first=True)
    encoded = pd.concat([num_data, cat_data], axis=1)

    if columns is not None:
        # Lets a second frame (e.g. a test set) take over the layout of the first one
        # even when some category never occurs in it.
        encoded = encoded.reindex(columns=columns, fill_value=0)
    return encoded

In [None]:
x = f1(x)
# The two frames are encoded independently, so conform the test layout to the training
# one: an indicator column the test data did not produce is added as all zeros, and the
# column order is made identical to what the models are trained on.
test = f1(test).reindex(columns=x.columns, fill_value=0)

In [None]:
# Preview the encoded training features
x.head()

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed random_state pins the split.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=16)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Baseline: ordinary least squares fitted on the training split.
lr = LinearRegression()

lr.fit(xtrain, ytrain)

ytrain_pred = lr.predict(xtrain)
ytest_pred = lr.predict(xtest)

# Labelled like every other model cell, so the four numbers can be told apart.
print('RMSE on train data: ', mean_squared_error(ytrain, ytrain_pred)**0.5)
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', mean_squared_error(ytest, ytest_pred)**0.5)
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint

In [None]:
# Randomized hyper-parameter search for a decision tree (4-fold cross-validation).
# Both the estimator and the search are seeded so the selected parameters are reproducible.
dtree = DecisionTreeRegressor(random_state=16) # estimator


param_dist = {'max_depth':sp_randint(1,20),
             'min_samples_leaf':sp_randint(1,50),
              'min_samples_split':sp_randint(2,50)}


rsearch  = RandomizedSearchCV(dtree, param_distributions = param_dist, cv=4, random_state=16)

# Search on the training split only: tuning on all of x would let the hold-out rows
# influence the chosen parameters and make the test metrics optimistic.
rsearch.fit(xtrain, ytrain)
rsearch.best_params_

In [None]:
# Decision tree with the parameters selected by the randomized search, fitted on the training split.
dtree_rand_tuned = DecisionTreeRegressor(**rsearch.best_params_).fit(xtrain, ytrain)

# Predictions for both splits.
ytrain_pred = dtree_rand_tuned.predict(xtrain)
ytest_pred = dtree_rand_tuned.predict(xtest)

# RMSE is the square root of the mean squared error.
train_rmse = mean_squared_error(ytrain, ytrain_pred)**0.5
test_rmse = mean_squared_error(ytest, ytest_pred)**0.5

print('RMSE on train data: ', train_rmse)
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))
print('RMSE on test data: ', test_rmse)
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

## Extra Tree Regressor

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Randomized hyper-parameter search for an extra-trees ensemble (4-fold cross-validation).
etr = ExtraTreesRegressor()


param_dist = { 'n_estimators':sp_randint(50,100),
              'max_features': sp_randint(1,25),
              'max_depth' : sp_randint(5,20),
             'min_samples_leaf':sp_randint(10,50),
              'min_samples_split':sp_randint(2,50)}


rsearch_etr  = RandomizedSearchCV(estimator=etr, param_distributions = param_dist, cv=4, random_state=4)

# Search on the training split only: tuning on all of x would let the hold-out rows
# influence the chosen parameters and make the test metrics optimistic.
rsearch_etr.fit(xtrain, ytrain)
rsearch_etr.best_params_

In [None]:
# Extra-trees model with the parameters selected by the randomized search.
etr_tuned = ExtraTreesRegressor(**rsearch_etr.best_params_).fit(xtrain, ytrain)

# Training-split metrics
ytrain_pred = etr_tuned.predict(xtrain)
train_mse = mean_squared_error(ytrain, ytrain_pred)
print('RMSE on train data: ', train_mse**0.5)
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

# Hold-out metrics
ytest_pred = etr_tuned.predict(xtest)
test_mse = mean_squared_error(ytest, ytest_pred)
print('RMSE on test data: ', test_mse**0.5)
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Randomized hyper-parameter search for a random forest (4-fold cross-validation).
rf = RandomForestRegressor()


param_dist = { 'n_estimators':sp_randint(100,150),
              'max_features': sp_randint(1,10),
              'max_depth' : sp_randint(5,20),
             'min_samples_leaf':sp_randint(10,50),
              'min_samples_split':sp_randint(2,50)}


rsearch_rf  = RandomizedSearchCV(estimator=rf, param_distributions = param_dist, cv=4, random_state=16)

# Search on the training split only: tuning on all of x would let the hold-out rows
# influence the chosen parameters and make the test metrics optimistic.
rsearch_rf.fit(xtrain, ytrain)
rsearch_rf.best_params_

In [None]:
# Random forest with the parameters selected by the randomized search.
# `rf` is rebound from the untuned search estimator to this tuned, fitted model.
rf = RandomForestRegressor(**rsearch_rf.best_params_)
rf.fit(xtrain, ytrain)

# Predictions for both splits.
ytrain_pred = rf.predict(xtrain)
ytest_pred = rf.predict(xtest)

# RMSE is the square root of the mean squared error.
train_rmse = mean_squared_error(ytrain, ytrain_pred)**0.5
test_rmse = mean_squared_error(ytest, ytest_pred)**0.5

print('RMSE on train data: ', train_rmse)
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))
print('RMSE on test data: ', test_rmse)
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

## Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with default settings apart from the split criterion.
# 'squared_error' is the current name of the criterion formerly spelled 'mse';
# the old spelling was removed from scikit-learn and now raises an error.
gbr = GradientBoostingRegressor(criterion='squared_error')
gbr.fit(xtrain, ytrain)

ytrain_pred = gbr.predict(xtrain)
print('RMSE on train data: ', mean_squared_error(ytrain, ytrain_pred)**0.5 )
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))


ytest_pred = gbr.predict(xtest)
print('RMSE on test data: ', mean_squared_error(ytest, ytest_pred)**0.5 )
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

## LightGBM

In [None]:
import lightgbm as lgb

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Randomized hyper-parameter search for LightGBM (50 sampled candidates, 4-fold cross-validation).
lgbmc = lgb.LGBMRegressor()

params = {
    'n_estimators': sp_randint(100, 200),
    'learning_rate': sp_uniform(0, 0.5),  # scipy uniform(loc, scale): samples from [0, 0.5]
    'max_depth': sp_randint(1, 15),
    'num_leaves': sp_randint(10, 50)}

rsearch_lg = RandomizedSearchCV(lgbmc, param_distributions=params, cv=4, n_iter=50, random_state=4)

# Search on the training split only: tuning on all of x would let the hold-out rows
# influence the chosen parameters and make the test metrics optimistic.
rsearch_lg.fit(xtrain, ytrain)

In [None]:
# LightGBM model with the parameters selected by the randomized search.
lgbr = lgb.LGBMRegressor(**rsearch_lg.best_params_, random_state=4)

lgbr.fit(xtrain, ytrain)

# Score the LightGBM predictions themselves. They used to be overwritten with
# rf.predict(...), so this cell reported the random forest's metrics.
ytrain_pred = lgbr.predict(xtrain)
print('RMSE on train data: ', mean_squared_error(ytrain, ytrain_pred)**0.5 )
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))


ytest_pred = lgbr.predict(xtest)
print('RMSE on test data: ', mean_squared_error(ytest, ytest_pred)**0.5 )
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

## Voting Ensemble (averaged predictions, not true stacking)

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Fresh, unfitted copies of the individual models; the ensemble fits them itself.
rf_tuned = RandomForestRegressor(**rsearch_rf.best_params_)
etr_tuned = ExtraTreesRegressor(**rsearch_etr.best_params_)
# Same seed as the stand-alone LightGBM model, so both use an identical configuration.
lgbr = lgb.LGBMRegressor(**rsearch_lg.best_params_, random_state=4)
# 'squared_error' is the current name of the criterion formerly spelled 'mse';
# the old spelling was removed from scikit-learn and now raises an error.
gbr = GradientBoostingRegressor(criterion='squared_error')

In [None]:
# Combine the four regressors in a VotingRegressor and fit it on the training split.
estimators = [
    ('rf_tuned', rf_tuned),
    ('etr_tuned', etr_tuned),
    ('lgbr', lgbr),
    ('gbr', gbr),
]
stack1 = VotingRegressor(estimators=estimators).fit(xtrain, ytrain)

# Predictions for both splits.
ytrain_pred = stack1.predict(xtrain)
ytest_pred = stack1.predict(xtest)

# RMSE is the square root of the mean squared error.
train_rmse = mean_squared_error(ytrain, ytrain_pred)**0.5
test_rmse = mean_squared_error(ytest, ytest_pred)**0.5

print('RMSE on train data: ', train_rmse)
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))
print('RMSE on test data: ', test_rmse)
print('R^2 on test data: ', r2_score(ytest, ytest_pred))
