In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime as dt
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the BigMart training CSV (path is relative to the notebook's working directory).
df = pd.read_csv('bigmart_train.csv')

# Data Summarization

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
df.describe()

1. As there is some gap between 75% and max, there may be outliers.
2. Since there is difference between mean and median, skewness appears.

In [None]:
# Column dtypes and non-null counts.
df.info()

1. All object-type columns need to be converted to integers or floats; a LabelEncoder can be used for this.
2. There are null values in the columns Item_Weight and Outlet_Size; they can be imputed (e.g. with SimpleImputer).
    

# Data Visualization 

In [None]:
# List the column labels of the frame.
df.columns

In [None]:
#Types of fat content in items present in the store
fat_count = df['Item_Fat_Content'].value_counts()
# Take the labels from value_counts() itself so every bar is paired with its
# own count, whatever order or spellings the data happens to contain.
fat_types = fat_count.index.tolist()

plt.figure(figsize=(5,5))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=fat_types, y=fat_count.values)
plt.title("Item's Fat Content")
plt.xlabel("Fat Content")
plt.ylabel("Count")

We can see that Low Fat and Regular are the only two fat categories, but they have been spelled in various ways, so data cleaning needs to be done.

In [None]:
#Top 10 items sold in the mart:
# value_counts() sorts descending, so the first 10 entries are the 10 most
# frequent item types.
item_type = df['Item_Type'].value_counts()[:10]
plt.figure(figsize=(16,5))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=item_type.index, y=item_type.values, alpha=0.8)
plt.title("Top 10 items sold in BigMart")
plt.xlabel("Item Types")
plt.ylabel("Count")

We can see from this graph that fruits and vegetables are bought the most, followed by snacks.

In [None]:
#BigMart Outlet Sizes
# value_counts() ignores missing values, so only the recorded sizes are counted.
outlet_size = df['Outlet_Size'].value_counts()
plt.figure(figsize=(5,4))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=outlet_size.index, y=outlet_size.values, alpha=0.8)
plt.title("BigMart Outlet Sizes",fontsize=13)
plt.xlabel("Outlet Size")
plt.ylabel("Count")

In [None]:
# Distribution plot of the Item_MRP column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot are the replacements) - confirm against the installed version.
sns.distplot(df['Item_MRP'])

We can see from this plot that the MRP of the products ranges from about 0 to 300.

In [None]:
# Distribution plot of the Item_Outlet_Sales column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot are the replacements) - confirm against the installed version.
sns.distplot(df['Item_Outlet_Sales'])

The data is right-skewed, and we can see that the sales value goes beyond 8k, which is a huge amount compared to the MRP.

In [None]:
#Null values :
# Heatmap of the boolean null mask: one row per record, one column per feature.
sns.heatmap(df.isnull())

We can see that there are null values in Item_Weight and Outlet_Size column.

# Data Manipulation

In [None]:
# Preview the first rows before any cleaning is applied.
df.head()

In [None]:
# Remove the Item_Identifier column from the frame (in place) before the
# remaining columns are cleaned and encoded.
df.drop(columns=['Item_Identifier'], inplace=True)

In [None]:
#Fill Missing values : 
def fill_na(df, outlet_size_fill=None, item_weight_fill=None):
    """Fill missing ``Outlet_Size`` and ``Item_Weight`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Outlet_Size`` and ``Item_Weight`` columns. The two
        columns are overwritten on this frame, which is also returned.
    outlet_size_fill : optional
        Replacement for missing ``Outlet_Size`` entries. When omitted, the
        first mode of the column in ``df`` is used.
    item_weight_fill : optional
        Replacement for missing ``Item_Weight`` entries. When omitted, the
        column mean of ``df`` rounded to 2 decimals is used.

    Passing the fill values explicitly lets statistics computed on one frame
    (e.g. the training data) be reused on another.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns filled.
    """
    if outlet_size_fill is None:
        modes = df['Outlet_Size'].mode()
        # mode() is empty when every value is missing; in that case there is
        # nothing sensible to fill with, so the column is left untouched
        # instead of failing with an IndexError.
        if not modes.empty:
            outlet_size_fill = modes.iloc[0]
    if outlet_size_fill is not None:
        df['Outlet_Size'] = df['Outlet_Size'].fillna(outlet_size_fill)

    if item_weight_fill is None:
        item_weight_fill = round(df['Item_Weight'].mean(), 2)
    df['Item_Weight'] = df['Item_Weight'].fillna(item_weight_fill)
    return df

# Impute the missing Outlet_Size / Item_Weight values of the training frame.
df = fill_na(df)

In [None]:
#Removing duplicates from Item_Fat_Content

def remove_duplicates(df):
    """Collapse alternate spellings in ``Item_Fat_Content`` to one label each.

    'reg' becomes 'Regular'; 'LF' and 'low fat' become 'Low Fat'. Any other
    value is left as it is. The column is overwritten on the frame passed in,
    and that same frame is returned.
    """
    canonical_labels = {
        'reg': 'Regular',
        'LF': 'Low Fat',
        'low fat': 'Low Fat',
    }
    df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(canonical_labels)
    return df

# Normalise the Item_Fat_Content spellings of the training frame.
df = remove_duplicates(df)

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode(df):
    """Label-encode every column of ``df`` that is not a known numeric column.

    The columns 'Item_Weight', 'Item_Visibility', 'Item_MRP',
    'Outlet_Establishment_Year' and 'Item_Outlet_Sales' are left untouched;
    every other column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column.

    A numeric column that is absent from ``df`` (for example a frame that has
    no 'Item_Outlet_Sales' target) is simply skipped rather than raising
    ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the non-numeric columns encoded.
    """
    numeric_columns = {
        'Item_Weight',
        'Item_Visibility',
        'Item_MRP',
        'Outlet_Establishment_Year',
        'Item_Outlet_Sales',
    }
    # Materialise the list first so the loop does not iterate over the column
    # index while columns are being reassigned.
    columns_to_encode = [name for name in df.columns if name not in numeric_columns]

    for name in columns_to_encode:
        df[name] = LabelEncoder().fit_transform(df[name])

    return df

# Encode the categorical columns of the training frame as integers.
df = label_encode(df)


In [None]:
#Correlation between columns :

# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
df_corr = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_corr, annot=True, ax=ax)
ax.set_title("Co-relation between the columns", fontsize=15)

1. We can see from the heatmap that there is a positive correlation with the target variable 'Item_Outlet_Sales' in almost all columns except 'Item_Visibility','Outlet_Establishment_Year' and 'Outlet_Size'.
2. Item_MRP is highly correlated with our target variable.

In [None]:
#Check for skewness : 

# Header line followed by the per-column skewness values.
print("Skewness value in all columns : ", df.skew(), sep="\n")

We can see  skewness in columns : ['Item_Fat_Content','Item_Visibility','Outlet_Type','Item_Outlet_Sales']

In [None]:
#Skewness graph :
# One distribution plot per column, placed in consecutive slots of a 16 x 11 grid.
col = df.columns.values
plt.figure(figsize=(17,70))
for slot, column_name in enumerate(col, start=1):
    plt.subplot(16, 11, slot)
    sns.distplot(df[column_name])
plt.show()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform applied to every column of the frame.
pt = PowerTransformer(method='yeo-johnson')
# fit_transform() returns a bare ndarray; re-attach the original column labels
# and index so later cells and plots still show the feature names.
df = pd.DataFrame(pt.fit_transform(df), columns=df.columns, index=df.index)

df.skew()

Almost all the skewness has been removed now.

In [None]:
# One vertical box plot per column, placed in consecutive slots of a 16 x 11 grid.
col = df.columns.values
plt.figure(figsize=(15,100))
for slot, column_name in enumerate(col, start=1):
    plt.subplot(16, 11, slot)
    sns.boxplot(df[column_name], palette='viridis', orient='v')
# Lay the figure out once, after every subplot exists, instead of
# recomputing the layout on each loop iteration.
plt.tight_layout()
    
    

Few outliers are present.

In [None]:
#pairplot
# pairplot() builds and sizes its own figure, so no plt.figure() call is made
# beforehand: it would only leave an extra, empty figure in the cell output.
sns.pairplot(df)
plt.tight_layout()
plt.show()
    

In [None]:
from scipy.stats import zscore

# Absolute z-score of every cell of the frame.
z =np.abs(zscore(df))
# Positions (row indices, column indices) of the cells whose |z| exceeds 3.
print(np.where(z>3))

In [None]:
# Keep only the rows in which every column has |z| < 3.
df_new = df[(z < 3).all(axis=1)]
df_new

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler 

def scale(df_new):
    """Standardise every column of ``df_new`` with a freshly fitted StandardScaler.

    The scaler is fitted on the frame that is passed in. The scaled array is
    wrapped in a new DataFrame without passing column labels or an index, so
    the result carries pandas' default integer labels.
    """
    standardised = StandardScaler().fit_transform(df_new)
    return pd.DataFrame(standardised)
# Standardise the outlier-filtered frame (all columns, as passed to scale()).
df_new = scale(df_new)

In [None]:
# Features are every column except the last; the last column is the target.
# Negative indexing avoids hard-coding the column count (10 features + 1
# target in the frame built above).
# NOTE(review): assumes the target is still the final column - confirm.
X = df_new.iloc[:, :-1]
y = df_new.iloc[:, -1]



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor




def random_state_counter(model, states=range(10, 90), test_size=.20):
    """Find the train/test split seed that gives ``model`` its highest R2 score.

    For every seed in ``states`` the notebook-level ``X`` / ``y`` are split
    with ``train_test_split``, ``model`` is refitted on the training part and
    scored with ``r2_score`` on the held-out part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every iteration.
    states : iterable of int, optional
        Candidate ``random_state`` values. Defaults to ``range(10, 90)``.
    test_size : float, optional
        Fraction of rows held out for scoring. Defaults to 0.20.

    Returns
    -------
    tuple
        ``(best_state, best_r2)``. ``best_state`` is ``None`` (and ``best_r2``
        is ``-inf``) only when ``states`` is empty.

    NOTE(review): choosing the seed by held-out score makes the reported R2
    an optimistic estimate; treat it as exploratory.
    """
    # Start below any attainable score: R2 can be negative, and starting from
    # 0 left final_state unassigned whenever no split scored above zero.
    max_r2_score = float('-inf')
    final_state = None
    for j in states:
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=j)
        model.fit(x_train, y_train)
        score = r2_score(y_test, model.predict(x_test))
        if score > max_r2_score:
            max_r2_score = score
            final_state = j
    return final_state, max_r2_score
    

In [None]:
# Candidate regressors, all with default hyper-parameters.
lr = LinearRegression()
svr = SVR()
knr = KNR()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
model = [lr, svr, knr, dtr, rfr]

# For each regressor, report the best split seed and the R2 it achieved.
for estimator in model:
    print("-------------------------------------------------------")
    print(estimator)
    seed, score = random_state_counter(estimator)
    print("-------------------------------------------------------")
    print("Random State is ", seed , "with score : ",score)
    print("-------------------------------------------------------")

    print("\n")


In [None]:
#SVR : 
# Refit the SVR on a fixed split (random_state=85) and score it on the held-out part.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=85)
final_reg = svr.fit(x_train, y_train)  # fit() returns the estimator itself
y_pred = final_reg.predict(x_test)

print("R2 Score : ", r2_score(y_test, y_pred))

Ada Boost Regressor works best of all.

In [None]:
#Boosting : 
#AdaBoost Regressor and GradientBoosting Regressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor


ada = AdaBoostRegressor(n_estimators=100,random_state=124)
reg = GradientBoostingRegressor(n_estimators=100,random_state=13)

boosting_model = [ada,reg]

# Fit each booster on the current train split and score it on the test split.
for boost in boosting_model:
    boost.fit(x_train,y_train)
    pred = boost.predict(x_test)
    # Score this model's own predictions. Using y_pred here would report the
    # score of whatever model last assigned y_pred, identically for both boosters.
    r2 = r2_score(y_test,pred)
    print("-----------------------------------------------------------")
    print(boost)
    print("-----------------------------------------------------------")
    print("R2 Score : ", r2)
    


In [None]:
# GridSearchCV and cross_val_score are not imported anywhere earlier in the
# notebook, so they are brought into scope here.
from sklearn.model_selection import GridSearchCV, cross_val_score

# Create the parameter grid searched for the AdaBoost regressor
param_dist = { 
    'n_estimators': [50, 100],
    'learning_rate': [0.01,0.05,0.1,0.3,1],
    'loss': ['linear','square','exponential']
}


jobs=-1
ada_grid = GridSearchCV(estimator=ada,
                      param_grid=param_dist,
                      scoring='r2',
                      cv=5,
                      n_jobs=jobs)

# Nested cross-validation: the grid search is re-run inside each of the 5
# outer folds and the outer-fold R2 scores are averaged.
cv_score = cross_val_score(ada_grid,X,y,cv=5,scoring='r2')
print("Cross Validation Score : ", cv_score.mean())

In [None]:
# Error metrics for y_pred against y_test.
# NOTE(review): y_pred is last assigned in the SVR cell (the boosting cell
# stores its predictions in `pred`), so these figures describe the SVR - confirm intent.
print("Mean Absolute Error : ",mean_absolute_error(y_test,y_pred))
print("Mean Squared Error  : ",mean_squared_error(y_test,y_pred))

In [None]:
# sklearn.externals.joblib was removed from scikit-learn; joblib is now
# imported as a standalone package. Fall back to the old location only for
# legacy environments that still ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted AdaBoost regressor to disk.
joblib.dump(ada,'adamodel.obj')

# Testing the Model

In [None]:
# Read the BigMart test CSV (path is relative to the notebook's working directory).
testing_data = pd.read_csv("bigmart_test.csv")

In [None]:
# Preview the first rows of the test frame.
testing_data.head()

In [None]:
testing_data = fill_na(testing_data)
testing_data = remove_duplicates(testing_data)
# Drop the identifier, as was done for the training frame: it is not listed
# among the encoded columns below, so it would reach the scaler unencoded.
# errors='ignore' keeps the cell re-runnable once the column is gone.
testing_data = testing_data.drop(columns='Item_Identifier', errors='ignore')

# label_encode() is not called here because it drops 'Item_Outlet_Sales',
# which is not assumed to exist in the test frame; the categorical columns
# are encoded explicitly instead.
col = ['Item_Fat_Content','Item_Type','Outlet_Identifier','Outlet_Size','Outlet_Location_Type','Outlet_Type']
le = LabelEncoder()
for each in col:
    testing_data[each] = le.fit_transform(testing_data[each])

# NOTE(review): the encoder and the scaler are refitted on the test frame
# rather than reusing the ones fitted on the training data, and the power
# transform applied to the training frame is not applied here - confirm that
# the resulting features are comparable to what the model was trained on.
testing_data = scale(testing_data)
testing_data

In [None]:
# Reload the persisted model and predict on the prepared test frame.
# NOTE(review): despite the variable name, 'adamodel.obj' is the file the
# AdaBoost regressor `ada` was dumped to, not the SVR.
svr_from_joblib = joblib.load('adamodel.obj')
svr_from_joblib.predict(testing_data)