In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

In [2]:
train=pd.read_csv('Train_UWu5bXk.csv')
test = pd.read_csv('Test_u94Q5KV.csv')

In [3]:
#Renaming the columns
train.rename(columns={'Item_Visibility':'Item_display_area'},inplace=True)
test.rename(columns={'Item_Visibility':'Item_display_area'},inplace=True)

In [4]:
#Concatenate Training and test data
total = pd.concat([train,test])

In [5]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14204 entries, 0 to 5680
Data columns (total 12 columns):
Item_Fat_Content             14204 non-null object
Item_Identifier              14204 non-null object
Item_MRP                     14204 non-null float64
Item_Outlet_Sales            8523 non-null float64
Item_Type                    14204 non-null object
Item_Weight                  11765 non-null float64
Item_display_area            14204 non-null float64
Outlet_Establishment_Year    14204 non-null int64
Outlet_Identifier            14204 non-null object
Outlet_Location_Type         14204 non-null object
Outlet_Size                  10188 non-null object
Outlet_Type                  14204 non-null object
dtypes: float64(4), int64(1), object(7)
memory usage: 1.4+ MB


In [6]:
#Changing the required datatypes
for c in total.columns:
    if(total[c].dtype =='O'):
        total[c] = total[c].astype('category')

In [7]:
total['Item_Fat_Content'] = pd.Categorical(total['Item_Fat_Content'],categories=['LF','Reg','reg','Low Fat','Regular','low fat'])
#Correcting inconsistencies with data
total.loc[total['Item_Fat_Content']=='Low Fat','Item_Fat_Content'] ='LF'
total.loc[total['Item_Fat_Content']=='Regular','Item_Fat_Content'] ='Reg'
total.loc[total['Item_Fat_Content']=='reg','Item_Fat_Content'] ='Reg'
total.loc[total['Item_Fat_Content']=='low fat','Item_Fat_Content'] ='LF'
total['Item_Fat_Content'] = pd.Categorical(total['Item_Fat_Content'],categories=['LF','Reg'])

In [8]:
#Imputing data- #Outlet Size
#As per existing data Irrespective of Tier type1 and Grocery store are small.So using small
total.loc[total.Outlet_Size.isnull(),'Outlet_Size'] = 'Small'

In [9]:
#Item_Weight
total['Item_Weight'] = total['Item_Weight'].fillna(total.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))
total['Item_Weight'] = total['Item_Weight'].fillna(total.groupby(['Item_Fat_Content'])['Item_Weight'].transform('mean'))

In [10]:
total.loc[total['Item_display_area']== 0,'Item_display_area'] = np.nan
total['Item_display_area'] = total['Item_display_area'].fillna(total.groupby(['Item_Identifier','Outlet_Type'])['Item_display_area'].transform('mean'))
total['Item_display_area'] = total['Item_display_area'].fillna(total.groupby(['Item_Identifier'])['Item_display_area'].transform('mean'))


In [11]:
#Creating a Generated type 
def gen_type(x):
    if x[0:2] == 'FD':
        return 'Foods'
    elif x[0:2] == 'DR':
        return 'Drinks'
    else:
        return 'Non Consumable'

In [12]:
#Apply the function on Gen_Item_type
total['Gen_Item_type'] = total['Item_Identifier'].apply(gen_type)

In [13]:
#It doesnt make sense to have fatcontent for non consumables hence make it inedible
total['Item_Fat_Content'] = pd.Categorical(total['Item_Fat_Content'],categories=['LF','Reg','Inedible'])
total.loc[total['Gen_Item_type'] == 'Non Consumable','Item_Fat_Content'] = 'Inedible'

In [14]:
#New column to calculate the age of a outlet
total['Outlet_Age']= 2013 - total['Outlet_Establishment_Year']

In [15]:
train_final = total.iloc[:8523,]
test_final = total.iloc[8523:,]

test_final.to_csv('Big_mart_test.csv')


In [16]:
total.head()

Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Type,Item_Weight,Item_display_area,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type,Gen_Item_type,Outlet_Age
0,LF,DRA12,141.6154,3829.0158,Soft Drinks,11.6,0.041011,2002,OUT045,Tier 2,Small,Supermarket Type1,Drinks,11
1,LF,DRA12,141.9154,992.7078,Soft Drinks,11.6,0.041011,2004,OUT035,Tier 2,Small,Supermarket Type1,Drinks,9
2,LF,DRA12,142.3154,2552.6772,Soft Drinks,11.6,0.040912,1987,OUT013,Tier 3,High,Supermarket Type1,Drinks,26
3,LF,DRA12,142.0154,850.8924,Soft Drinks,11.6,0.041113,2009,OUT018,Tier 3,Medium,Supermarket Type2,Drinks,4
4,LF,DRA12,140.3154,2552.6772,Soft Drinks,11.6,0.041178,2007,OUT017,Tier 2,Small,Supermarket Type1,Drinks,6


In [17]:
total = pd.get_dummies(total, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type', 'Outlet_Identifier', 'Gen_Item_type'])

In [18]:
total.head()

Unnamed: 0,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Type,Item_Weight,Item_display_area,Outlet_Establishment_Year,Outlet_Age,Item_Fat_Content_LF,Item_Fat_Content_Reg,...,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Gen_Item_type_Drinks,Gen_Item_type_Foods,Gen_Item_type_Non Consumable
0,DRA12,141.6154,3829.0158,Soft Drinks,11.6,0.041011,2002,11,1,0,...,0,0,0,0,1,0,0,1,0,0
1,DRA12,141.9154,992.7078,Soft Drinks,11.6,0.041011,2004,9,1,0,...,0,0,0,1,0,0,0,1,0,0
2,DRA12,142.3154,2552.6772,Soft Drinks,11.6,0.040912,1987,26,1,0,...,0,0,0,0,0,0,0,1,0,0
3,DRA12,142.0154,850.8924,Soft Drinks,11.6,0.041113,2009,4,1,0,...,1,0,0,0,0,0,0,1,0,0
4,DRA12,140.3154,2552.6772,Soft Drinks,11.6,0.041178,2007,6,1,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score,regression

In [20]:
train_final = total.iloc[:8523,]
test_final = total.iloc[8523:,]
test_final.drop(['Item_Outlet_Sales', 'Item_Type', 'Item_Identifier', 'Outlet_Establishment_Year'], axis = 1, inplace = True)
X = train_final.drop(['Item_Outlet_Sales', 'Item_Type', 'Item_Identifier', 'Outlet_Establishment_Year'], axis = 1)
Y= train_final['Item_Outlet_Sales']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Linear regression

In [21]:

X_train,X_test,y_train,y_test = train_test_split(X, Y,test_size=0.30, random_state=42)

In [28]:
Scaler = MinMaxScaler()
#Scaler.fit(X_train)
X_train_scaled = Scaler.fit_transform(X_train)
X_test_scaled = Scaler.fit_transform(X_test)
test_final_scaled = Scaler.fit_transform(test_final)

In [29]:
lr = linear_model.LinearRegression()
lr.fit(X_train_scaled,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
predictions=lr.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,predictions))
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

R-Squared: 0.556834730041
MAE: 844.215536253
MSE: 1300539.00879
RMSE: 1140.41177159


### Ridge Regression

In [31]:
Rg = linear_model.Ridge(alpha=1, normalize=False, solver='auto')
Rg.fit(X_train_scaled,y_train)
pred2 = Rg.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,pred2))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred2)))

R-Squared: 0.557110791085
RMSE: 1140.05651778


### Decision tree regression

In [78]:
## Grid search CV for extracting best parameters

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': np.arange(3, 10), 'min_samples_leaf':np.arange(10,500,20)}

tree = GridSearchCV(DecisionTreeRegressor(), param_grid)

tree.fit(X_train_scaled, y_train)
tree_preds = tree.predict(X_test_scaled)

tree.best_score_
tree.best_params_

{'max_depth': 5, 'min_samples_leaf': 50}

In [34]:
from sklearn.tree import DecisionTreeRegressor
Dt = DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
Dt.fit(X_train_scaled,y_train)
pred3 = Dt.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,pred3))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred3)))

R-Squared: 0.589588615235
RMSE: 1097.45957571


In [79]:
Dt.predict(test_final_scaled)
pd.DataFrame(Rf.predict(test_final_scaled)).to_csv('decisiontree.csv')

## random forest regressor

In [80]:
from sklearn.ensemble import RandomForestRegressor
Rf = RandomForestRegressor(n_estimators= 500, max_depth=5,min_samples_leaf=50)
Rf.fit(X_train_scaled,y_train)
pred3 = Rf.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,pred3))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred3)))

R-Squared: 0.595028752541
RMSE: 1090.16171813


In [81]:
Rf.predict(test_final_scaled)

array([ 1620.65842422,  1446.38900549,   591.59390509, ...,  1927.35437599,
        3536.94740081,  1317.99513173])

In [82]:
pd.DataFrame(Rf.predict(test_final_scaled)).to_csv('random_forest.csv')

##  GRADIENT bOOST REGRESSOR

In [83]:
from sklearn.ensemble import GradientBoostingRegressor
gf = GradientBoostingRegressor(learning_rate=0.1, n_estimators=45, subsample=1.0,  
                          min_samples_split=2, min_samples_leaf=150, max_depth=6, random_state=None, 
                          alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False )
gf.fit(X_train_scaled,y_train)
pred3 = gf.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,pred3))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred3)))

R-Squared: 0.594756022526
RMSE: 1090.52874391


In [84]:
gf.predict(test_final_scaled)
pd.DataFrame(gf.predict(test_final_scaled)).to_csv('Gradient_boost.csv')

## Extra Tree regressor

In [88]:
from sklearn.ensemble import ExtraTreesRegressor

In [89]:
Et = ExtraTreesRegressor(n_estimators=200, criterion='mse', max_depth=6, min_samples_leaf=50, 
                          max_features='auto', 
                         bootstrap=False, random_state=None, verbose=0, warm_start=False)
Et.fit(X_train_scaled,y_train)
pred3 = Et.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,pred3))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred3)))

R-Squared: 0.597461894916
RMSE: 1086.88183726


In [90]:
Et.predict(test_final_scaled)
pd.DataFrame(Et.predict(test_final_scaled)).to_csv('Extra_tree.csv')

## Neural Networks

In [59]:
from sklearn.neural_network import MLPRegressor

In [60]:
X_train.shape

(5966, 30)

In [72]:
NN= MLPRegressor(hidden_layer_sizes=(30, 30, 30 ,30))
NN.fit(X_train_scaled,y_train)
pred3 = NN.predict(X_test_scaled)
print('R-Squared:',metrics.r2_score(y_test,pred3))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred3)))

R-Squared: 0.593850565363
RMSE: 1091.74637597


In [66]:
NN.predict(test_final_scaled)
pd.DataFrame(NN.predict(test_final_scaled)).to_csv('Neuralnet.csv')