In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import tree

In [2]:
train = pd.read_csv('Train_UWu5bXk.csv')
test = pd.read_csv('Test_u94Q5KV.csv')
random_state = 42

item = test['Item_Identifier']
outlet = test['Outlet_Identifier']

In [3]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
Item_Identifier              8523 non-null object
Item_Weight                  7060 non-null float64
Item_Fat_Content             8523 non-null object
Item_Visibility              8523 non-null float64
Item_Type                    8523 non-null object
Item_MRP                     8523 non-null float64
Outlet_Identifier            8523 non-null object
Outlet_Establishment_Year    8523 non-null int64
Outlet_Size                  6113 non-null object
Outlet_Location_Type         8523 non-null object
Outlet_Type                  8523 non-null object
Item_Outlet_Sales            8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.1+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
Item_Identifier              5681 non-null object
Item_Weight                  4705 non-null float64
Item_Fat_Content             5681 non-null object
Item_Visibility              5681 non-null float64
Item_Type                    5681 non-null object
Item_MRP                     5681 non-null float64
Outlet_Identifier            5681 non-null object
Outlet_Establishment_Year    5681 non-null int64
Outlet_Size                  4075 non-null object
Outlet_Location_Type         5681 non-null object
Outlet_Type                  5681 non-null object
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB


In [7]:
train['Item_Type_Combined'] = train['Item_Identifier'].apply(lambda x : x[0:2])
train['Item_Type_Combined'] = train['Item_Type_Combined'].map({'FD': 'Food',
                                                              'NC' : 'Non Consumable',
                                                              'DR': 'Drink'})

test['Item_Type_Combined'] = test['Item_Identifier'].apply(lambda x : x[0:2])
test['Item_Type_Combined'] = test['Item_Type_Combined'].map({'FD': 'Food',
                                                              'NC' : 'Non Consumable',
                                                              'DR': 'Drink'})

In [8]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Type_Combined
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Food
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Drink
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Food
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,Food
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Non Consumable


In [9]:
train['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [10]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace({'LF':'Low Fat',
                                                             'reg':'Regular',
                                                             'low fat':'Low Fat'})
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({'LF':'Low Fat',
                                                             'reg':'Regular',
                                                             'low fat':'Low Fat'})


train.loc[train['Item_Type_Combined']=="Non-Consumable",'Item_Fat_Content'] = "Non-Edible"
test.loc[test['Item_Type_Combined']=="Non-Consumable",'Item_Fat_Content'] = "Non-Edible"

In [11]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food
1,FDW14,8.3,Regular,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1,Food
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store,Non Consumable
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1,Food
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,Food


In [12]:
cat_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 
             'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Type_Combined']
num_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
miss_cols = ['Item_Weight', 'Outlet_Size']

In [13]:
enc = LabelEncoder()

for col in cat_cols:
    train[col] = train[col].astype('str')
    test[col] = test[col].astype('str')
    train[col] = enc.fit_transform(train[col])
    test[col] = enc.transform(test[col])

In [14]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Type_Combined
0,156,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,3735.138,1
1,8,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,443.4228,0
2,662,17.5,0,0.01676,10,141.618,9,1999,1,0,1,2097.27,1
3,1121,19.2,1,0.0,6,182.095,0,1998,3,2,0,732.38,1
4,1297,8.93,0,0.0,9,53.8614,1,1987,0,2,1,994.7052,2


In [15]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,1114,20.75,0,0.007565,13,107.8622,9,1999,1,0,1,1
1,1078,8.3,1,0.038428,4,87.3198,2,2007,3,1,1,1
2,1420,14.6,0,0.099575,11,241.7538,0,1998,3,2,0,2
3,817,7.315,0,0.015388,13,155.034,2,2007,3,1,1,1
4,1197,,1,0.118599,4,234.23,5,1985,1,2,3,1


In [16]:
train['Item_Fat_Content'].value_counts()

0    5517
1    3006
Name: Item_Fat_Content, dtype: int64

In [17]:
imp = Imputer(missing_values=0,strategy='mean', axis=0)

train['Item_Visibility'] = imp.fit_transform(train['Item_Visibility'].reshape(-1, 1))
test['Item_Visibility']= imp.transform(test['Item_Visibility'].reshape(-1, 1))

train = pd.DataFrame(train)
test = pd.DataFrame(test)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [18]:
X_train, y_train = train.iloc[:, 1:], train['Item_Outlet_Sales']
X_train = X_train.drop(['Item_Outlet_Sales'], axis=1)
X_test = test.iloc[:,1:]

In [19]:
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,1
1,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,0
2,17.5,0,0.01676,10,141.618,9,1999,1,0,1,1
3,19.2,1,0.070482,6,182.095,0,1998,3,2,0,1
4,8.93,0,0.070482,9,53.8614,1,1987,0,2,1,2


In [20]:
X_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,20.75,0,0.007565,13,107.8622,9,1999,1,0,1,1
1,8.3,1,0.038428,4,87.3198,2,2007,3,1,1,1
2,14.6,0,0.099575,11,241.7538,0,1998,3,2,0,2
3,7.315,0,0.015388,13,155.034,2,2007,3,1,1,1
4,,1,0.118599,4,234.23,5,1985,1,2,3,1


In [21]:
y_train.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

In [22]:
imp = Imputer(missing_values='NaN',strategy='median', axis=0)

X_train = imp.fit_transform(X_train)
X_test= imp.transform(X_test)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [23]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,9.3,0.0,0.016047,4.0,249.8092,9.0,1999.0,1.0,0.0,1.0,1.0
1,5.92,1.0,0.019278,14.0,48.2692,3.0,2009.0,1.0,2.0,2.0,0.0
2,17.5,0.0,0.01676,10.0,141.618,9.0,1999.0,1.0,0.0,1.0,1.0
3,19.2,1.0,0.070482,6.0,182.095,0.0,1998.0,3.0,2.0,0.0,1.0
4,8.93,0.0,0.070482,9.0,53.8614,1.0,1987.0,0.0,2.0,1.0,2.0


In [24]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,20.75,0.0,0.007565,13.0,107.8622,9.0,1999.0,1.0,0.0,1.0,1.0
1,8.3,1.0,0.038428,4.0,87.3198,2.0,2007.0,3.0,1.0,1.0,1.0
2,14.6,0.0,0.099575,11.0,241.7538,0.0,1998.0,3.0,2.0,0.0,2.0
3,7.315,0.0,0.015388,13.0,155.034,2.0,2007.0,3.0,1.0,1.0,1.0
4,12.6,1.0,0.118599,4.0,234.23,5.0,1985.0,1.0,2.0,3.0,1.0


In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
0     8523 non-null float64
1     8523 non-null float64
2     8523 non-null float64
3     8523 non-null float64
4     8523 non-null float64
5     8523 non-null float64
6     8523 non-null float64
7     8523 non-null float64
8     8523 non-null float64
9     8523 non-null float64
10    8523 non-null float64
dtypes: float64(11)
memory usage: 732.5 KB


In [26]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
0     5681 non-null float64
1     5681 non-null float64
2     5681 non-null float64
3     5681 non-null float64
4     5681 non-null float64
5     5681 non-null float64
6     5681 non-null float64
7     5681 non-null float64
8     5681 non-null float64
9     5681 non-null float64
10    5681 non-null float64
dtypes: float64(11)
memory usage: 488.3 KB


In [27]:
X_train['Outlet_Years'] = 2013 - X_train[6]
X_test['Outlet_Years'] = 2013 - X_test[6]

In [28]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,Outlet_Years
0,9.3,0.0,0.016047,4.0,249.8092,9.0,1999.0,1.0,0.0,1.0,1.0,14.0
1,5.92,1.0,0.019278,14.0,48.2692,3.0,2009.0,1.0,2.0,2.0,0.0,4.0
2,17.5,0.0,0.01676,10.0,141.618,9.0,1999.0,1.0,0.0,1.0,1.0,14.0
3,19.2,1.0,0.070482,6.0,182.095,0.0,1998.0,3.0,2.0,0.0,1.0,15.0
4,8.93,0.0,0.070482,9.0,53.8614,1.0,1987.0,0.0,2.0,1.0,2.0,26.0


In [29]:
X_train = X_train.drop([6], axis=1)
X_test = X_test.drop([6], axis=1)

In [30]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,Outlet_Years
0,9.3,0.0,0.016047,4.0,249.8092,9.0,1.0,0.0,1.0,1.0,14.0
1,5.92,1.0,0.019278,14.0,48.2692,3.0,1.0,2.0,2.0,0.0,4.0
2,17.5,0.0,0.01676,10.0,141.618,9.0,1.0,0.0,1.0,1.0,14.0
3,19.2,1.0,0.070482,6.0,182.095,0.0,3.0,2.0,0.0,1.0,15.0
4,8.93,0.0,0.070482,9.0,53.8614,1.0,0.0,2.0,1.0,2.0,26.0


In [31]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,Outlet_Years
0,20.75,0.0,0.007565,13.0,107.8622,9.0,1.0,0.0,1.0,1.0,14.0
1,8.3,1.0,0.038428,4.0,87.3198,2.0,3.0,1.0,1.0,1.0,6.0
2,14.6,0.0,0.099575,11.0,241.7538,0.0,3.0,2.0,0.0,2.0,15.0
3,7.315,0.0,0.015388,13.0,155.034,2.0,3.0,1.0,1.0,1.0,6.0
4,12.6,1.0,0.118599,4.0,234.23,5.0,1.0,2.0,3.0,1.0,28.0


In [32]:
scaler = StandardScaler()

X_train[[0, 4, 'Outlet_Years']] = scaler.fit_transform(X_train[[0, 4, 'Outlet_Years']])
X_test[[0, 4, 'Outlet_Years']] = scaler.transform(X_test[[0, 4, 'Outlet_Years']])

In [33]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10,Outlet_Years
0,-0.831187,0.0,0.016047,4.0,1.747454,9.0,1.0,0.0,1.0,1.0,-0.139541
1,-1.63081,1.0,0.019278,14.0,-1.489023,3.0,1.0,2.0,2.0,0.0,-1.334103
2,1.108727,0.0,0.01676,10.0,0.01004,9.0,1.0,0.0,1.0,1.0,-0.139541
3,1.510904,1.0,0.070482,6.0,0.66005,0.0,3.0,2.0,0.0,1.0,-0.020085
4,-0.918719,0.0,0.070482,9.0,-1.39922,1.0,0.0,2.0,1.0,2.0,1.293934


In [34]:
X_train.shape

(8523, 11)

In [35]:
# lin_reg = LinearRegression()

# lin_reg.fit(X_train, y_train)
# y_predict = lin_reg.predict(X_test)

In [36]:
# svm_reg = svm.SVR(kernel='poly')

# svm_reg.fit(X_train, y_train)
# y_predict = svm_reg.predict(X_test)

In [37]:
# tree_reg = tree.DecisionTreeRegressor()

# tree_reg.fit(X_train, y_train)
# y_predict = tree_reg.predict(X_test)

In [38]:
# elastic_net = ElasticNet(random_state=0)

# elastic_net.fit(X_train, y_train)
# y_predict = elastic_net.predict(X_test)

In [39]:
# forest_reg = RandomForestRegressor(n_estimators=100)

# forest_reg.fit(X_train, y_train)
# y_predict = forest_reg.predict(X_test)

In [40]:
# creating XGBoost model

XGB = XGBRegressor()
XGB.fit(X_train, y_train)
y_predict = XGB.predict(X_test)

In [41]:
y_predict

array([1589.4667, 1409.1659,  561.7424, ..., 1864.1749, 3678.4937,
       1294.0447], dtype=float32)

In [42]:
new = pd.DataFrame(columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
new['Item_Identifier'] = item
new['Outlet_Identifier'] = outlet
new['Item_Outlet_Sales'] = y_predict
new.to_csv('submission.csv', index=False)

Score : 1153.16 Using XGB