In [42]:
import math
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [43]:
train = pd.read_csv("./Train.csv")
test = pd.read_csv("./Test.csv")

In [44]:
train.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [45]:
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
Item_Identifier              8523 non-null object
Item_Weight                  7060 non-null float64
Item_Fat_Content             8523 non-null object
Item_Visibility              8523 non-null float64
Item_Type                    8523 non-null object
Item_MRP                     8523 non-null float64
Outlet_Identifier            8523 non-null object
Outlet_Establishment_Year    8523 non-null int64
Outlet_Size                  6113 non-null object
Outlet_Location_Type         8523 non-null object
Outlet_Type                  8523 non-null object
Item_Outlet_Sales            8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [46]:
train['Item_Fat_Content'].unique()


array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [47]:
# Correcting mislabeleld columns
train['Item_Fat_Content'].replace(to_replace='low fat', value='Low Fat', inplace=True)
train['Item_Fat_Content'].replace(to_replace='LF', value='Low Fat', inplace=True)
train['Item_Fat_Content'].replace(to_replace='reg', value='Regular', inplace=True)
test['Item_Fat_Content'].replace(to_replace='low fat', value='Low Fat', inplace=True)
test['Item_Fat_Content'].replace(to_replace='LF', value='Low Fat', inplace=True)
test['Item_Fat_Content'].replace(to_replace='reg', value='Regular', inplace=True)


In [48]:
# Factorising categorical columns in the dataset
col_enc = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Location_Type', 'Outlet_Type']
for x in col_enc:
    train[x], _ = pd.factorize(train[x])
    test[x], _ = pd.factorize(test[x])


In [49]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,0,20.75,0,0.007565,0,107.8622,0,0,Medium,0,0
1,1,8.3,1,0.038428,1,87.3198,1,1,,1,0
2,2,14.6,0,0.099575,2,241.7538,2,2,,2,1
3,3,7.315,0,0.015388,0,155.034,1,1,,1,0
4,4,,1,0.118599,1,234.23,3,3,Medium,2,2


In [50]:
test.isnull().sum()


Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [51]:
# Handling the missing values
# Use regression to fill missing values in the 'Item_Weight' column.
# Train set
train_sub = train.drop(['Outlet_Size'], axis = 1)
print(train_sub)
train_sub_test = train_sub[train_sub["Item_Weight"].isnull()]
print(train_sub_test)
train_sub = train_sub.dropna()
print(train_sub)
y_train = train_sub["Item_Weight"]
X_train = train_sub.drop("Item_Weight", axis=1)
X_test = train_sub_test.drop("Item_Weight", axis=1)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
train.loc[train.Item_Weight.isnull(), 'Item_Weight'] = y_pred


      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0                   0        9.300                 0         0.016047   
1                   1        5.920                 1         0.019278   
2                   2       17.500                 0         0.016760   
3                   3       19.200                 1         0.000000   
4                   4        8.930                 0         0.000000   
...               ...          ...               ...              ...   
8518              359        6.865                 0         0.056783   
8519             1537        8.380                 1         0.046982   
8520              354       10.600                 0         0.035186   
8521              908        7.210                 1         0.145221   
8522              462       14.800                 0         0.044878   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0             0  249.8092                  0    

In [52]:
# Test set
test_sub = test.drop(['Outlet_Size'], axis = 1)
test_sub_test = test_sub[test_sub["Item_Weight"].isnull()]
test_sub = test_sub.dropna()
y_test = test_sub["Item_Weight"]
X_test = test_sub.drop("Item_Weight", axis=1)
X_test_test = test_sub_test.drop("Item_Weight", axis=1)
lr = LinearRegression()
lr.fit(X_test, y_test)
y_pred = lr.predict(X_test_test)
test.loc[test.Item_Weight.isnull(), 'Item_Weight'] = y_pred


In [53]:
# Filling in 'Outlet_Size' column using mode replacement.
train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0], inplace=True)
test['Outlet_Size'].fillna(test['Outlet_Size'].mode()[0], inplace=True)
train['Outlet_Size'], _ = pd.factorize(train['Outlet_Size'])
test['Outlet_Size'], _ = pd.factorize(test['Outlet_Size'])


In [54]:
# Preparing training and test sets
X = train.drop(['Item_Outlet_Sales'], axis = 1)
y = train['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [55]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, predictions))
print('Root mean squared error: ', math.sqrt(mean_squared_error(y_test, predictions)))
print('Mean absolute error: ', mean_absolute_error(y_test, predictions))
print('Coefficient of determination (R2): ', r2_score(y_test, predictions))


Mean squared error:  1593302.9660163904
Root mean squared error:  1262.2610530379168
Mean absolute error:  928.8977207526835
Coefficient of determination (R2):  0.4315167309048755


In [56]:
# Gradient Boosting
reg = GradientBoostingRegressor(random_state = 42)
reg.fit(X_train, y_train)
predictions = reg.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, predictions))
print('Root mean squared error: ', math.sqrt(mean_squared_error(y_test, predictions)))
print('Mean absolute error: ', mean_absolute_error(y_test, predictions))
print('Coefficient of determination (R2): ', r2_score(y_test, predictions))


Mean squared error:  1135809.2466410724
Root mean squared error:  1065.7435182261595
Mean absolute error:  753.1713728419547
Coefficient of determination (R2):  0.5947484142244764


In [57]:
# Extreme Gradient Boosting
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, predictions))
print('Root mean squared error: ', math.sqrt(mean_squared_error(y_test, predictions)))
print('Mean absolute error: ', mean_absolute_error(y_test, predictions))
print('Coefficient of determination (R2): ', r2_score(y_test, predictions))


Mean squared error:  1328878.6941220842
Root mean squared error:  1152.7700092048215
Mean absolute error:  807.3120965056385
Coefficient of determination (R2):  0.5258621113634385


In [58]:
# Random Forest
rf = RandomForestRegressor(max_depth = 2, random_state = 42)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, predictions))
print('Root mean squared error: ', math.sqrt(mean_squared_error(y_test, predictions)))
print('Mean absolute error: ', mean_absolute_error(y_test, predictions))
print('Coefficient of determination (R2): ', r2_score(y_test, predictions))


Mean squared error:  1701378.0748748793
Root mean squared error:  1304.3688415762158
Mean absolute error:  986.4445918560846
Coefficient of determination (R2):  0.3929560224256238


In [59]:
# Decision Tree
dt = DecisionTreeRegressor(random_state = 42)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, predictions))
print('Root mean squared error: ', math.sqrt(mean_squared_error(y_test, predictions)))
print('Mean absolute error: ', mean_absolute_error(y_test, predictions))
print('Coefficient of determination (R2): ', r2_score(y_test, predictions))


Mean squared error:  2433598.059110748
Root mean squared error:  1559.9993779199876
Mean absolute error:  1082.4324565232846
Coefficient of determination (R2):  0.13170325429959928


In [60]:
# Support Vector Machine
rng = np.random.RandomState(42)
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train, y_train)
predictions = regr.predict(X_test)
print('Mean squared error: ', mean_squared_error(y_test, predictions))
print('Root mean squared error: ', math.sqrt(mean_squared_error(y_test, predictions)))
print('Mean absolute error: ', mean_absolute_error(y_test, predictions))
print('Coefficient of determination (R2): ', r2_score(y_test, predictions))


Mean squared error:  2720662.385723626
Root mean squared error:  1649.4430531920846
Mean absolute error:  1239.0536801696262
Coefficient of determination (R2):  0.02928000504055006


In [21]:
# Gradient Boosting Regressor gives the best performance with the
# lease RMSE of 1065.74.
