In [2]:
import featuretools

In [3]:
import pandas

In [4]:
import numpy 

In [5]:
import featuretools as ft
import numpy as np
import pandas as pd

In [6]:
train = pd.read_csv(r'C:\Users\rdas3\Downloads\Train_UWu5bXk.csv')
test = pd.read_csv(r'C:\Users\rdas3\Downloads\Test_u94Q5KV.csv')

In [7]:
# saving identifiers
test_Item_Identifier = test['Item_Identifier']
test_Outlet_Identifier = test['Outlet_Identifier']
sales = train['Item_Outlet_Sales']
train.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

In [8]:
#we will now combine the train and test set as it saves us the trouble of performing the same step(s) twice.

combi = train.append(test, ignore_index=True)

In [9]:
#Check the missing values in the dataset.

combi.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [10]:
#a lot of missing values in the Item_Weight and Outlet_size variables. Let’s deal with them:

In [11]:
# imputing missing data
combi['Item_Weight'].fillna(combi['Item_Weight'].mean(), inplace = True)
combi['Outlet_Size'].fillna("missing", inplace = True)

In [12]:
#Data Preprocessing

In [13]:
combi['Item_Fat_Content'].value_counts()

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

In [15]:
#It seems Item_Fat_Content contains only two categories, i.e., “Low Fat” and “Regular” – the rest of them we will consider redundant. So, let’s convert it into a binary variable.


In [18]:
# dictionary to replace the categories
fat_content_dict = {'Low Fat':0, 'Regular':1, 'LF':0, 'reg':1, 'low fat':0}


In [23]:
combi['Item_Fat_Content'] = combi['Item_Fat_Content'].map(fat_content_dict, regex=True)

TypeError: map() got an unexpected keyword argument 'regex'

In [24]:
combi['Item_Fat_Content'] = combi['Item_Fat_Content'].map(fat_content_dict)

In [25]:
# It is necessary to have a unique identifier feature in the dataset (our dataset doesn’t have any right now). So, we will create one unique ID for our combined dataset. If you notice, we have two IDs in our data—one for the item and another for the outlet. So, simply concatenating both will give us a unique ID.


In [26]:
combi['id'] = combi['Item_Identifier'] + combi['Outlet_Identifier']
combi.drop(['Item_Identifier'], axis=1, inplace=True)

In [27]:
#Please note that I have dropped the feature Item_Identifier as it is no longer required. However, I have retained the feature Outlet_Identifier because I plan to use it later.

#Now before proceeding, we will have to create an EntitySet. An EntitySet is a structure that contains multiple dataframes and relationships between them. So, let’s create an EntitySet and add the dataframe combination to it.

In [28]:
# creating and entity set 'es'
es = ft.EntitySet(id = 'sales')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = combi, index = 'id')

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [29]:
#Our data contains information at two levels—item level and outlet level. Featuretools offers a functionality to split a dataset into multiple tables. We have created a new table ‘outlet’ from the BigMart table based on the outlet ID Outlet_Identifier.

In [30]:
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier', 
additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])


Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [31]:
#Summary of EntitySet.

In [32]:
print(es)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier


In [33]:
#As you can see above, it contains two entities – bigmart and outlet. There is also a relationship formed between the two tables, connected by Outlet_Identifier. This relationship will play a key role in the generation of new features.

#Now we will use Deep Feature Synthesis to create new features automatically. Recall that DFS uses Feature Primitives to create features using multiple tables present in the EntitySet.

In [34]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'bigmart', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 37 features
EntitySet scattered to workers in 4.672 seconds
Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 11/11 chunks


In [35]:
#target_entity is nothing but the entity ID for which we wish to create new features (in this case, it is the entity ‘bigmart’). The parameter max_depth controls the complexity of the features being generated by stacking the primitives. The parameter n_jobs helps in parallel feature computation by using multiple cores.


In [36]:
#newly created features.

In [37]:
feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.SUM(bigmart.Item_Weight)',
       'outlet.SUM(bigmart.Item_Fat_Content)',
       'outlet.SUM(bigmart.Item_Visibility)', 'outlet.SUM(bigmart.Item_MRP)',
       'outlet.STD(bigmart.Item_Weight)',
       'outlet.STD(bigmart.Item_Fat_Content)',
       'outlet.STD(bigmart.Item_Visibility)', 'outlet.STD(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MAX(bigmart.Item_Fat_Content)',
       'outlet.MAX(bigmart.Item_Visibility)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.SKEW(bigmart.Item_Weight)',
       'outlet.SKEW(bigmart.Item_Fat_Content)',
       'outlet.SKEW(bigmart.Item_Visibility)', 'outlet.SKEW(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Fat_Content)',
       

In [38]:
#DFS has created 29 new features in such a quick time. It is phenomenal as it would have taken much longer to do it manually. If you have datasets with multiple interrelated tables, Featuretools would still work. In that case, you wouldn’t have to normalize a table as multiple tables will already be available.

In [39]:
#Let’s print the first few rows of feature_matrix.

feature_matrix.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,...,outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DRA12OUT010,11.6,,0.068535,Soft Drinks,143.0154,OUT010,1998,missing,Tier 3,Grocery Store,...,,0.0,32.6558,12.72287,,0.101939,141.159742,925,16,Fruits and Vegetables
DRA12OUT013,11.6,,0.040912,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,...,,0.0,31.49,12.788139,,0.060242,141.128428,1553,16,Fruits and Vegetables
DRA12OUT017,11.6,,0.041178,Soft Drinks,140.3154,OUT017,2007,missing,Tier 2,Supermarket Type1,...,,0.0,32.09,12.78208,,0.061142,140.998931,1543,16,Snack Foods
DRA12OUT018,11.6,,0.041113,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,,0.0,31.89,12.803638,,0.059976,141.000899,1546,16,Fruits and Vegetables
DRA12OUT027,12.792854,,0.040748,Soft Drinks,140.0154,OUT027,1985,Medium,Tier 3,Supermarket Type3,...,,0.0,31.29,12.792854,,0.060344,141.012347,1559,16,Fruits and Vegetables


In [41]:
feature_matrix = feature_matrix.reindex(index=combi['id'])
feature_matrix = feature_matrix.reset_index()

ValueError: cannot insert id, already exists

In [43]:
from catboost import CatBoostRegressor

In [44]:
#We will use them to build a model and predict Item_Outlet_Sales. Since our final data (feature_matrix) has many categorical features, I decided to use the CatBoost algorithm. It can use categorical features directly and is scalable in nature. 

In [45]:
#CatBoost requires all the categorical variables to be in the string format. So, we will convert the categorical variables in our data to string first:

categorical_features = np.where(feature_matrix.dtypes == 'object')[0]

for i in categorical_features:
    feature_matrix.iloc[:,i] = feature_matrix.iloc[:,i].astype('str')

In [46]:
#Let’s split feature_matrix back into train and test sets.

feature_matrix.drop(['id'], axis=1, inplace=True)
train = feature_matrix[:8523]
test = feature_matrix[8523:]

In [47]:
# removing uneccesary variables
train.drop(['Outlet_Identifier'], axis=1, inplace=True)
test.drop(['Outlet_Identifier'], axis=1, inplace=True)

# identifying categorical features
categorical_features = np.where(train.dtypes == 'object')[0]

In [48]:
#Split the train data into training and validation set to check the model’s performance locally.

from sklearn.model_selection import train_test_split

# splitting train data into training and validation set
xtrain, xvalid, ytrain, yvalid = train_test_split(train, sales, test_size=0.25, random_state=11)

In [49]:
#Finally, we can now train our model. The evaluation metric we will use is RMSE (Root Mean Squared Error).

model_cat = CatBoostRegressor(iterations=100, learning_rate=0.3, depth=6, eval_metric='RMSE', random_seed=7)

# training model
model_cat.fit(xtrain, ytrain, cat_features=categorical_features, use_best_model=True)
# validation score
model_cat.score(xvalid, yvalid)

You should provide test set for use best model. use_best_model parameter swiched to false value.


0:	learn: 2297.3467173	total: 69.5ms	remaining: 6.88s
1:	learn: 2018.5649335	total: 87ms	remaining: 4.26s
2:	learn: 1866.7620425	total: 111ms	remaining: 3.59s
3:	learn: 1787.6466144	total: 130ms	remaining: 3.13s
4:	learn: 1747.5567517	total: 150ms	remaining: 2.85s
5:	learn: 1727.5650568	total: 171ms	remaining: 2.68s
6:	learn: 1717.6802046	total: 190ms	remaining: 2.53s
7:	learn: 1712.8138435	total: 208ms	remaining: 2.4s
8:	learn: 1710.4233107	total: 229ms	remaining: 2.32s
9:	learn: 1709.2502575	total: 254ms	remaining: 2.29s
10:	learn: 1708.6749362	total: 273ms	remaining: 2.21s
11:	learn: 1708.3928446	total: 299ms	remaining: 2.19s
12:	learn: 1708.2545471	total: 315ms	remaining: 2.11s
13:	learn: 1708.1867501	total: 355ms	remaining: 2.18s
14:	learn: 1708.1535151	total: 375ms	remaining: 2.12s
15:	learn: 1708.1372232	total: 394ms	remaining: 2.07s
16:	learn: 1708.1292369	total: 412ms	remaining: 2.01s
17:	learn: 1708.1253221	total: 430ms	remaining: 1.96s
18:	learn: 1708.1234030	total: 456ms	re

1701.420503735139