                                         Importing The Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.linear_model import LinearRegression

                                          
                                          
                                          Data Collection & Analysis

In [None]:
# Read Train.csv from the working directory into a DataFrame
bmd = pd.read_csv(filepath_or_buffer="Train.csv")

# Dimensions of the frame as (number of rows, number of columns)
print(bmd.shape)

In [None]:
# Preview the first rows of the freshly loaded data
bmd.head()

In [None]:
# Summarize the dataset: column names, non-null counts and dtypes
bmd.info()

 THE DATASET CONTAINS SOME CATEGORICAL (NON-NUMERICAL) FEATURES; FOR EXAMPLE, FATCONTENT HAS TWO CATEGORIES.
 CATEGORICAL FEATURES IN OUR DATASET:
 - ProductID
 - FatContent
 - ProductType
 - OutletID
 - OutletSize
 - LocationType
 - OutletType

In [None]:
# Count the missing (NaN) entries in every column
bmd.isna().sum()

In [None]:
# Number of distinct values per column (a missing value counts as one value)
bmd.nunique(dropna=False)

                          
                          
                                 HANDLING MISSING VALUES USING IMPUTATION :
                          - MEAN --> average value    (can use for numerical data)
                          - MODE --> most repeated value  (can use for categorical data)

In [None]:
# Average of the Weight column
bmd.loc[:, 'Weight'].mean()

In [None]:
# Fill the missing values in the Weight column with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place form works on an
# intermediate object, is deprecated in pandas 2.x and stops updating bmd
# once Copy-on-Write is enabled.
bmd['Weight'] = bmd['Weight'].fillna(bmd['Weight'].mean())

In [None]:
# Fill the missing OutletSales values with the column mean.
# Assigned back to the column rather than using fillna(inplace=True) on the
# selected column, which is deprecated chained assignment in pandas 2.x.
# NOTE(review): OutletSales is the column the models are trained to predict;
# mean-imputing the target adds artificial labels - consider dropping those
# rows instead.
bmd['OutletSales'] = bmd['OutletSales'].fillna(bmd['OutletSales'].mean())

In [None]:
# Re-check the per-column missing counts after the numeric imputation
bmd.isna().sum()

                         
                         REPLACING THE MISSING VALUES IN 'OUTLETSIZE' COLUMN WITH MODE

In [None]:
# Most frequent OutletSize for each OutletType.
# The aggregation is only needed once, so it is written as an anonymous
# function (lambda) rather than a separately named def.
mode_of_outlet_size = bmd.pivot_table(
    values='OutletSize',
    columns='OutletType',
    aggfunc=lambda sizes: sizes.mode()[0],
)

In [None]:
# Show the most frequent OutletSize for each OutletType
# (original author's observation: most of the grocery stores are small)
print(mode_of_outlet_size)

In [None]:
# Boolean mask: True for every row whose OutletSize is missing
missing_values = bmd['OutletSize'].isna()
print(missing_values)

In [None]:
# Fill each missing OutletSize with the most frequent size of the row's OutletType.
# mode_of_outlet_size is a one-row table (row label 'OutletSize', one column
# per OutletType). Selecting that row first makes the lambda return a plain
# scalar, so apply() yields a Series that lines up with the rows being filled.
# Returning a one-element Series per row (the previous form) made apply()
# build a DataFrame and relied on implicit expansion during assignment.
outlet_size_by_type = mode_of_outlet_size.loc['OutletSize']
bmd.loc[missing_values, 'OutletSize'] = bmd.loc[missing_values, 'OutletType'].apply(
    lambda outlet_type: outlet_size_by_type[outlet_type]
)

In [None]:
# Confirm how many missing values remain in each column
bmd.isna().sum()

                                              DATA ANALYSIS

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
bmd.describe()

                                      PLOTTING OF NUMERICAL FEATURES

In [None]:
# Apply seaborn's default theme to every plot drawn after this cell
sns.set_theme()

In [None]:
# WEIGHT DISTRIBUTION
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure (density histogram plus KDE curve).
plt.figure(figsize=(6, 6))
sns.histplot(bmd['Weight'], kde=True, stat='density')
plt.show()
# Same column again as a plain count histogram
sns.histplot(bmd['Weight'])
plt.show()

In [None]:
# PRODUCT VISIBILITY
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure (density histogram plus KDE curve).
plt.figure(figsize=(6,6))
sns.histplot(bmd['ProductVisibility'], kde=True, stat='density')
# plt.show must be called; the bare name only references the function
plt.show()

In [None]:
# MRP
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure (density histogram plus KDE curve).
plt.figure(figsize=(6,6))
sns.histplot(bmd['MRP'], kde=True, stat='density')
plt.show()

In [None]:
# ESTABLISHMENT YEAR
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure (density histogram plus KDE curve).
plt.figure(figsize=(6,6))
sns.histplot(bmd['EstablishmentYear'], kde=True, stat='density')
plt.show()

In [None]:
# OUTLET SALES
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the same kind of figure (density histogram plus KDE curve).
plt.figure(figsize=(6,6))
sns.histplot(bmd['OutletSales'], kde=True, stat='density')
plt.show()

In [None]:
# Number of rows recorded for each EstablishmentYear
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=bmd, x='EstablishmentYear', ax=ax)
plt.show()



                                           PLOTTING OF CATEGORICAL DATA

In [None]:
# Row count for every FatContent label as currently stored in bmd
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=bmd, x='FatContent', ax=ax)
plt.show()

In [None]:
# FatContent Corrected
# On a top-to-bottom run bmd['FatContent'] has not been cleaned yet at this
# point, so plotting the column directly repeats the uncorrected plot.
# Normalize the labels on a temporary Series (same mapping the notebook
# applies to bmd further down) so this figure shows the corrected categories.
fat_content_corrected = bmd['FatContent'].replace(
    {'low fat': 'Low Fat', 'reg': 'Regular', 'LF': 'Low Fat'}
)
plt.figure(figsize=(6,6))
sns.countplot(x=fat_content_corrected)
plt.show()

In [None]:
# Row count per ProductType (wide figure because there are many categories)
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=bmd, x='ProductType', ax=ax)
plt.show()

In [None]:
# Count how often each FatContent label occurs; this lists the label
# spellings that the FatContent plot is built from
bmd['FatContent'].value_counts()

In [None]:
# Merge the alternative spellings of the fat-content labels into
# the two canonical ones, 'Low Fat' and 'Regular'
fat_content_aliases = {'low fat': 'Low Fat', 'reg': 'Regular', 'LF': 'Low Fat'}
bmd['FatContent'] = bmd['FatContent'].replace(fat_content_aliases)

In [None]:
# Row count for each OutletSize category
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=bmd, x='OutletSize', ax=ax)
plt.show()

                                              
                                              
                                              DATA PREPROCESSING
                                               [LABEL ENCODING]

In [None]:
# Preview the first rows before the categorical columns are label-encoded
bmd.head()

In [None]:
# LabelEncoder converts the labels of a column into integer codes
encoder = LabelEncoder()

In [None]:
# Label-encode every categorical column (string labels -> integer codes).
# Each column gets its own fitted LabelEncoder, kept in label_encoders, so the
# label <-> code mapping of every column stays available afterwards
# (e.g. label_encoders['OutletType'].inverse_transform(...), or .transform()
# on new data). Refitting one shared encoder would keep only the mapping of
# the last column it was fitted on.
categorical_columns = [
    'ProductID',
    'FatContent',
    'ProductType',
    'OutletID',
    'OutletSize',
    'LocationType',
    'OutletType',
]
label_encoders = {}
for column in categorical_columns:
    column_encoder = LabelEncoder()
    bmd[column] = column_encoder.fit_transform(bmd[column])
    label_encoders[column] = column_encoder

In [None]:
# Preview the first rows after label encoding
bmd.head()

In [None]:
# Data type of every column after label encoding
bmd.dtypes

In [None]:
# Preview the first rows of the frame that is split into features and target next
bmd.head()



                                               
                                               SPLITTING FEATURES AND TARGET

In [None]:
# Predictors: every column except the one being predicted
feature = bmd.drop(columns=['OutletSales'])
# Target: the OutletSales column
target = bmd.loc[:, 'OutletSales']

In [None]:
# Preview the first rows of the feature matrix; a bare expression uses the
# notebook's rich table display instead of a plain-text dump of the frame
feature.head()

In [None]:
# Preview the first values of the target instead of printing the whole Series
target.head()



                               SPLITTING THE DATA INTO TRAINING DATA & TESTING DATA

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run
feature_train, feature_test, target_train, target_test = train_test_split(
    feature,
    target,
    test_size=0.2,
    random_state=2,
)

                                             
                                             
                                             GENERIC FUNCTION

In [None]:
def modelprototype(algo,featuretrain,targettrain,featuretest,targettest):
    """Fit a regressor and print an evaluation report for train and test data.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    featuretrain, targettrain
        Features and target used to fit ``algo`` and to compute the
        training-set metrics.
    featuretest, targettest
        Held-out features and target used only for the test-set metrics.

    Returns
    -------
    None
        The report (RMSE and R^2 for both sets) is written to stdout.
    """
    # Fit the algorithm on the training data
    algo.fit(featuretrain,targettrain)

    # Predictions on the data the model was fitted on and on unseen data
    train_predictions = algo.predict(featuretrain)
    test_predictions = algo.predict(featuretest)

    # Root mean squared error, in the units of the target
    rmse_train = np.sqrt(metrics.mean_squared_error(targettrain,train_predictions))
    rmse_test = np.sqrt(metrics.mean_squared_error(targettest,test_predictions))

    # Coefficient of determination. For regressors this is the value that
    # estimator.score() reports, so it is labelled R^2 rather than "accuracy"
    # and printed once per data set.
    r2_train = metrics.r2_score(targettrain,train_predictions)
    r2_test = metrics.r2_score(targettest,test_predictions)

    # Model report
    print("MODEL REPORT :")
    print("RMSE of train:",rmse_train)
    print("RMSE of test:",rmse_test)
    print("R^2 of train data:",r2_train)
    print("R^2 of test data:",r2_test)

                                        
                                        
                                        
                                        
                                        TRAINING MACHINE LEARNING MODEL

                                              XGBoost Regressor

In [None]:
# XGBoost regressor with the library's default hyper-parameters
regressor = XGBRegressor()
modelprototype(
    algo=regressor,
    featuretrain=feature_train,
    targettrain=target_train,
    featuretest=feature_test,
    targettest=target_test,
)

                                               Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree limited to depth 15 with at least 100 samples per leaf.
# random_state is fixed because features are permuted at every split, so
# ties between equally good splits would otherwise be broken differently
# from run to run.
dt1 = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100, random_state=2)
modelprototype(dt1,feature_train,target_train,feature_test,target_test)

In [None]:
# Shallower decision tree (depth 8) with larger leaves (at least 150 samples).
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run.
dt2 = DecisionTreeRegressor(max_depth=8, min_samples_leaf=150, random_state=2)
modelprototype(dt2,feature_train,target_train,feature_test,target_test)

                                               
                                               
                                               
                                                LINEAR REGRESSION

In [None]:
# Ordinary least-squares linear regression with default settings
linear = LinearRegression()
modelprototype(
    algo=linear,
    featuretrain=feature_train,
    targettrain=target_train,
    featuretest=feature_test,
    targettest=target_test,
)

In [None]:
# Learned parameters of the fitted linear model: one weight per feature
# column, followed by the intercept
for label, value in (("weights: ", linear.coef_), ("intercept: ", linear.intercept_)):
    print(label, value)

                                                      
                                                      
                                                      RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest: 200 trees, depth at most 5, at least 100 samples per leaf,
# trained on 4 parallel jobs.
# random_state is fixed because the forest draws bootstrap samples at random;
# without a seed the reported metrics change on every run.
rf1 = RandomForestRegressor(n_estimators=200,max_depth=5, min_samples_leaf=100,n_jobs=4,random_state=2)
modelprototype(rf1,feature_train,target_train,feature_test,target_test)

In [None]:
# Larger random forest: 400 trees, depth at most 6, at least 100 samples per
# leaf, trained on 4 parallel jobs.
# random_state is fixed so the bootstrap sampling, and therefore the reported
# metrics, are reproducible.
rf2 = RandomForestRegressor(n_estimators=400,max_depth=6, min_samples_leaf=100,n_jobs=4,random_state=2)
modelprototype(rf2,feature_train,target_train,feature_test,target_test)