In [5]:
# !pip install featuretools
import featuretools as ft
import numpy as np
import pandas as pd

train = pd.read_csv("train_v9rqX0R.csv")
test = pd.read_csv("test_AbJTz2l.csv")

In [20]:
print('train size', train.shape)
print('test size', test.shape)

train size (8523, 11)
test size (5681, 11)


In [6]:
# saving identifiers
test_Item_Identifier = test['Item_Identifier']
test_Outlet_Identifier = test['Outlet_Identifier']
sales = train['Item_Outlet_Sales']
train.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

In [7]:
combi = train.append(test, ignore_index=True)

In [14]:
combi.isnull().sum()

Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
id                           0
dtype: int64

In [9]:
# imputing missing data
combi['Item_Weight'].fillna(combi['Item_Weight'].mean(), inplace = True)
combi['Outlet_Size'].fillna("missing", inplace = True)

In [13]:
combi['Item_Fat_Content'].value_counts()

0    9185
1    5019
Name: Item_Fat_Content, dtype: int64

In [11]:
# dictionary to replace the categories
fat_content_dict = {'Low Fat':0, 'Regular':1, 'LF':0, 'reg':1, 'low fat':0}

combi['Item_Fat_Content'] = combi['Item_Fat_Content'].replace(fat_content_dict, regex=True)

In [12]:
combi['id'] = combi['Item_Identifier'] + combi['Outlet_Identifier']
combi.drop(['Item_Identifier'], axis=1, inplace=True)

In [26]:
# creating and entity set 'es'
es = ft.EntitySet(id = 'sales')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = combi, index = 'id')

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [27]:
print(es)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships


In [28]:
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier', 
additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [29]:
print(es)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier


In [32]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'bigmart', 
max_depth = 2, 
verbose = 1, 
n_jobs = 1)

Built 37 features
Elapsed: 00:00 | Progress: 100%|██████████


In [33]:
feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.COUNT(bigmart)',
       'outlet.MAX(bigmart.Item_Fat_Content)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Visibility)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MEAN(bigmart.Item_Fat_Content)',
       'outlet.MEAN(bigmart.Item_MRP)', 'outlet.MEAN(bigmart.Item_Visibility)',
       'outlet.MEAN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Fat_Content)', 'outlet.MIN(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Visibility)',
       'outlet.MIN(bigmart.Item_Weight)', 'outlet.MODE(bigmart.Item_Type)',
       'outlet.NUM_UNIQUE(bigmart.Item_Type)',
       'outlet.SKEW(bigmart.Item_Fat_Content)',
       'outlet.SKEW(bigmart.Item_MRP)', 'outlet.SKEW(bigmart.Item_Visibility)',
       'outlet.SKEW(bi

In [34]:
feature_matrix.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,outlet.COUNT(bigmart),outlet.MAX(bigmart.Item_Fat_Content),outlet.MAX(bigmart.Item_MRP),outlet.MAX(bigmart.Item_Visibility),outlet.MAX(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_Weight),outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_MRP),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_Weight),outlet.MODE(bigmart.Item_Type),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.SKEW(bigmart.Item_Fat_Content),outlet.SKEW(bigmart.Item_MRP),outlet.SKEW(bigmart.Item_Visibility),outlet.SKEW(bigmart.Item_Weight),outlet.STD(bigmart.Item_Fat_Content),outlet.STD(bigmart.Item_MRP),outlet.STD(bigmart.Item_Visibility),outlet.STD(bigmart.Item_Weight),outlet.SUM(bigmart.Item_Fat_Content),outlet.SUM(bigmart.Item_MRP),outlet.SUM(bigmart.Item_Visibility),outlet.SUM(bigmart.Item_Weight)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
FDA15OUT049,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,1550,1,266.4884,0.18785,21.35,0.352903,141.163199,0.059,12.803003,0,32.4558,0.0,4.555,Fruits and Vegetables,16,0.616228,0.126294,0.790782,0.099024,0.478027,62.144594,0.043924,4.650796,547,218802.9588,91.450099,19844.655
DRC01OUT018,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,1546,1,266.3226,0.188323,21.35,0.353816,141.000899,0.059976,12.803638,0,31.89,0.0,4.555,Fruits and Vegetables,16,0.612046,0.133528,0.783017,0.102602,0.478308,62.022851,0.044489,4.650874,547,217987.3906,92.723425,19794.425
FDN15OUT049,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,1550,1,266.4884,0.18785,21.35,0.352903,141.163199,0.059,12.803003,0,32.4558,0.0,4.555,Fruits and Vegetables,16,0.616228,0.126294,0.790782,0.099024,0.478027,62.144594,0.043924,4.650796,547,218802.9588,91.450099,19844.655
FDX07OUT010,19.2,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,Grocery Store,925,1,266.6884,0.313935,21.35,0.356757,141.159742,0.101939,12.72287,0,32.6558,0.0,4.61,Fruits and Vegetables,16,0.599012,0.104693,0.776902,0.112759,0.479301,62.010835,0.073604,4.67507,330,130572.7618,94.293418,11768.655
NCD19OUT013,8.93,0,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,1553,1,266.6884,0.185913,21.35,0.353509,141.128428,0.060242,12.788139,0,31.49,0.0,4.555,Fruits and Vegetables,16,0.613449,0.130888,0.759033,0.104392,0.478213,62.140848,0.044005,4.650214,549,219172.4492,93.555174,19859.98


In [36]:
combi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                14204 non-null  float64
 1   Item_Fat_Content           14204 non-null  int64  
 2   Item_Visibility            14204 non-null  float64
 3   Item_Type                  14204 non-null  object 
 4   Item_MRP                   14204 non-null  float64
 5   Outlet_Identifier          14204 non-null  object 
 6   Outlet_Establishment_Year  14204 non-null  int64  
 7   Outlet_Size                14204 non-null  object 
 8   Outlet_Location_Type       14204 non-null  object 
 9   Outlet_Type                14204 non-null  object 
 10  id                         14204 non-null  object 
dtypes: float64(3), int64(2), object(6)
memory usage: 1.2+ MB


In [38]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.8MB 56kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [40]:
from catboost import CatBoostRegressor

categorical_features = np.where(feature_matrix.dtypes == 'object')[0]
for i in categorical_features:
    feature_matrix.iloc[:,i] = feature_matrix.iloc[:,i].astype('str')

#Let’s split feature_matrix back into train and test sets.
# feature_matrix.drop(['id'], axis=1, inplace=True)
train = feature_matrix[:8523]
test = feature_matrix[8523:]

# removing uneccesary variables
train.drop(['Outlet_Identifier'], axis=1, inplace=True)
test.drop(['Outlet_Identifier'], axis=1, inplace=True)

# identifying categorical features
categorical_features = np.where(train.dtypes == 'object')[0]

#Split the train data into training and validation set to check the model’s performance locally.
from sklearn.model_selection import train_test_split

# splitting train data into training and validation set
xtrain, xvalid, ytrain, yvalid = train_test_split(train, sales, test_size=0.25, random_state=11)

#Finally, we can now train our model. The evaluation metric we will use is RMSE (Root Mean Squared Error).
model_cat = CatBoostRegressor(iterations=100, learning_rate=0.3, depth=6, eval_metric='RMSE', random_seed=7)

# training model
model_cat.fit(xtrain, ytrain, cat_features=categorical_features, use_best_model=True)

# validation score
model_cat.score(xvalid, yvalid)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 1440.1669686	total: 52.8ms	remaining: 5.22s
1:	learn: 1291.2555633	total: 58ms	remaining: 2.84s
2:	learn: 1198.9242824	total: 63.7ms	remaining: 2.06s
3:	learn: 1144.4826511	total: 67.6ms	remaining: 1.62s
4:	learn: 1110.1295069	total: 72.8ms	remaining: 1.38s
5:	learn: 1089.8059753	total: 78.3ms	remaining: 1.23s
6:	learn: 1081.0870251	total: 83.2ms	remaining: 1.1s
7:	learn: 1073.8363039	total: 88.1ms	remaining: 1.01s
8:	learn: 1068.1980841	total: 93.3ms	remaining: 944ms
9:	learn: 1065.3736138	total: 98.5ms	remaining: 886ms
10:	learn: 1062.6591514	total: 103ms	remaining: 835ms
11:	learn: 1058.9950763	total: 108ms	remaining: 794ms
12:	learn: 1058.2651366	total: 113ms	remaining: 758ms
13:	learn: 1057.5983637	total: 117ms	remaining: 719ms
14:	learn: 1056.4337486	total: 122ms	remaining: 691ms
15:	learn: 1055.8339476	total: 127ms	remaining: 666ms
16:	learn: 1053.4063958	total: 133ms	remaining: 647ms
17:	learn: 1053.3144343	total: 138ms	remaining: 626ms
18:	learn: 1052.0205175	total: 

0.5792825973878382

In [41]:
# validation score
model_cat.score(xvalid, yvalid)

0.5792825973878382