In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
import featuretools as ft

import gc

In [26]:
# Load the merged train and test tables from the data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_merged.csv', '../data/test_merged.csv')
)

In [27]:
# Display the (rows, columns) shapes of the train and test frames as a tuple.
train.shape, test.shape

((456548, 15), (32573, 14))

In [28]:
# Remove (in place) the training rows whose num_orders exceeds 23000.
outlier_rows = train.index[train['num_orders'] > 23000]
train.drop(index=outlier_rows, inplace=True)

# discount = list price minus the price actually charged, for both frames.
for frame in (train, test):
    frame['discount'] = frame['base_price'] - frame['checkout_price']

In [5]:
# Base columns that take part in the pairwise "interaction" features, mapped
# to the dtype each one is cast back to once the interactions are built.
base_dtypes = {
    'emailer_for_promotion': 'int64',
    'homepage_featured': 'int64',
    'category': 'category',
    'cuisine': 'category',
    'city_code': 'int64',
    'region_code': 'int64',
    'center_type': 'category',
    'op_area': 'float64',
}

# (left, right) column pairs; position i (1-based) defines interaction_i.
# 'cuisine' only ever appears as the right-hand member of a pair.
interaction_pairs = [
    ('emailer_for_promotion', 'homepage_featured'),   # interaction_1
    ('emailer_for_promotion', 'category'),            # interaction_2
    ('emailer_for_promotion', 'cuisine'),             # interaction_3
    ('emailer_for_promotion', 'city_code'),           # interaction_4
    ('emailer_for_promotion', 'region_code'),         # interaction_5
    ('emailer_for_promotion', 'center_type'),         # interaction_6
    ('emailer_for_promotion', 'op_area'),             # interaction_7
    ('homepage_featured', 'category'),                # interaction_8
    ('homepage_featured', 'cuisine'),                 # interaction_9
    ('homepage_featured', 'city_code'),               # interaction_10
    ('homepage_featured', 'region_code'),             # interaction_11
    ('homepage_featured', 'center_type'),             # interaction_12
    ('homepage_featured', 'op_area'),                 # interaction_13
    ('category', 'cuisine'),                          # interaction_14
    ('category', 'city_code'),                        # interaction_15
    ('category', 'region_code'),                      # interaction_16
    ('category', 'center_type'),                      # interaction_17
    ('category', 'op_area'),                          # interaction_18
    ('city_code', 'region_code'),                     # interaction_19
    ('city_code', 'center_type'),                     # interaction_20
    ('city_code', 'op_area'),                         # interaction_21
    ('region_code', 'center_type'),                   # interaction_22
    ('region_code', 'op_area'),                       # interaction_23
    ('center_type', 'op_area'),                       # interaction_24
]

# The same recipe is applied to both frames so train and test cannot drift apart.
for frame in (train, test):
    # Cast the base columns to str so that `+` concatenates their labels.
    for name in base_dtypes:
        frame[name] = frame[name].astype('str')

    # NOTE(review): the two labels are joined with no separator, so distinct
    # value pairs could in principle produce the same string (e.g. two numeric
    # codes of varying width) - confirm this cannot happen for these columns.
    for number, (left, right) in enumerate(interaction_pairs, start=1):
        frame['interaction_{}'.format(number)] = frame[left] + frame[right]

    # Restore the working dtypes of the base columns.
    for name, dtype in base_dtypes.items():
        frame[name] = frame[name].astype(dtype)


In [6]:
# Store every interaction_* column of both frames as a pandas categorical.
for frame in (train, test):
    interaction_cols = [name for name in frame.columns if 'interaction_' in name]
    for name in interaction_cols:
        frame[name] = frame[name].astype('category')

In [29]:
# Separate the target column, tag each frame with its origin in a 'set'
# column, and stack train on top of test into one frame.
target = train[['id', 'num_orders']]
train.drop(['num_orders'], axis=1, inplace=True)
train['set'] = 'train'
test['set'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the supported equivalent.
data = pd.concat([train, test], ignore_index=True)

<h1>Feature Engineering using Featuretools</h1>
<h2>Create an entity dataset</h2><br>
An EntitySet is a structure that holds multiple dataframes together with the relationships between them. Below, we create an EntitySet and add the combined train/test dataframe to it.

In [30]:
# Create the EntitySet that will hold the demand data.
es = ft.EntitySet(id='fooddemand')

# Register the combined train/test frame as the 'demand' entity, indexed by 'id'.
es.entity_from_dataframe(
    entity_id='demand',
    dataframe=data,
    index='id',
)

Entityset: fooddemand
  Entities:
    demand [Rows: 489120, Columns: 16]
  Relationships:
    No relationships

In [31]:
# Split the per-center attributes out of 'demand' into their own entity,
# keyed by center_id.
es.normalize_entity(
    base_entity_id='demand',
    new_entity_id='fullfilment_center',
    index='center_id',
    additional_variables=['city_code', 'region_code', 'center_type', 'op_area'],
)

# Do the same for the per-meal attributes, keyed by meal_id.
es.normalize_entity(
    base_entity_id='demand',
    new_entity_id='meal_info',
    index='meal_id',
    additional_variables=['category', 'cuisine'],
)

Entityset: fooddemand
  Entities:
    demand [Rows: 489120, Columns: 10]
    fullfilment_center [Rows: 77, Columns: 5]
    meal_info [Rows: 51, Columns: 3]
  Relationships:
    demand.center_id -> fullfilment_center.center_id
    demand.meal_id -> meal_info.meal_id

In [10]:
# Print the EntitySet summary (its entities and the relationships between them).
print(es)

Entityset: fooddemand
  Entities:
    demand [Rows: 489120, Columns: 34]
    fullfilment_center [Rows: 77, Columns: 5]
    meal_info [Rows: 51, Columns: 3]
  Relationships:
    demand.center_id -> fullfilment_center.center_id
    demand.meal_id -> meal_info.meal_id


In [32]:
# Run Deep Feature Synthesis with 'demand' as the target entity, using the
# listed aggregation primitives up to a depth of 2. features_only=False makes
# dfs compute the feature matrix as well as the feature definitions.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='demand',
    agg_primitives=['sum', 'count', 'min', 'max', 'mean', 'mode'],
    max_depth=2,
    features_only=False,
    verbose=True,
)

Built 69 features
Elapsed: 00:35 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 10/10 chunks


In [12]:
#apply one hot encoding
def one_hot_encode(df, cat_attribs):
    """Return a new frame in which ``cat_attribs`` are replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cat_attribs : list of str (or a single column label)
        Columns to one-hot encode. Every listed column is encoded regardless
        of its dtype.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cat_attribs``, followed by the indicator columns
        (named ``<column>_<value>`` when a list of columns is given).
    """
    selected = df[cat_attribs]
    if isinstance(selected, pd.DataFrame):
        # Name the columns explicitly: with columns=None get_dummies only
        # encodes object/string/category columns and passes numeric ones
        # through unchanged, which then collides with the originals in join().
        dummies = pd.get_dummies(selected, columns=list(selected.columns))
    else:
        # A single label selects a Series, which get_dummies always encodes.
        dummies = pd.get_dummies(selected)
    return df.join(dummies).drop(cat_attribs, axis=1)

In [33]:
# One-hot encode the categorical columns that DFS pulled in from the
# fullfilment_center and meal_info entities. The interaction_1 ... interaction_24
# columns are not part of this list and are therefore not encoded here.
ohe_cols = [
    'fullfilment_center.center_type',
    'meal_info.category',
    'meal_info.cuisine',
]

print('Feature matrix shape before one hot encoding is {}'.format(feature_matrix.shape))
feature_matrix = one_hot_encode(feature_matrix, ohe_cols)
print('Feature matrix shape after one hot encoding is {}'.format(feature_matrix.shape))

Feature matrix shape before one hot encoding is (489120, 69)
Feature matrix shape after one hot encoding is (489120, 87)


In [34]:
# Remove (in place) the aggregate features whose correlation with the target
# variable is NaN.
cols_to_drop = [
    'fullfilment_center.MIN(demand.week)',
    'fullfilment_center.MIN(demand.emailer_for_promotion)',
    'fullfilment_center.MIN(demand.homepage_featured)',
    'fullfilment_center.MAX(demand.week)',
    'fullfilment_center.MAX(demand.emailer_for_promotion)',
    'fullfilment_center.MAX(demand.homepage_featured)',
    'meal_info.MIN(demand.emailer_for_promotion)',
    'meal_info.MIN(demand.homepage_featured)',
    'meal_info.MAX(demand.week)',
    'meal_info.MAX(demand.homepage_featured)',
]
feature_matrix.drop(columns=cols_to_drop, inplace=True)
print('Feature matrix shape after dropping cols with nan correlations to the target variable is {}'.format(feature_matrix.shape))

Feature matrix shape after dropping cols with nan correlations to the target variable is (489120, 77)


In [18]:
# Keep a reference to the feature-matrix column labels.
cols = feature_matrix.columns

In [19]:
# Persist the feature names, one per line.
with open('../data/columns.txt', 'w') as f:
    f.writelines("%s\n" % item for item in cols)

In [38]:
# Preview the first five rows of the columns at positions 50-69.
feature_matrix.iloc[:5,50:70]

Unnamed: 0_level_0,meal_info.MEAN(demand.base_price),meal_info.MEAN(demand.emailer_for_promotion),meal_info.MEAN(demand.homepage_featured),meal_info.MEAN(demand.discount),meal_info.MODE(demand.center_id),meal_info.MODE(demand.set),fullfilment_center.center_type_TYPE_A,fullfilment_center.center_type_TYPE_B,fullfilment_center.center_type_TYPE_C,meal_info.category_Beverages,meal_info.category_Biryani,meal_info.category_Desert,meal_info.category_Extras,meal_info.category_Fish,meal_info.category_Other Snacks,meal_info.category_Pasta,meal_info.category_Pizza,meal_info.category_Rice Bowl,meal_info.category_Salad,meal_info.category_Sandwich
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1000000,239.501669,0.0,0.069682,11.598132,10,train,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1000001,599.132903,0.014419,0.032844,15.265829,34,train,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1000002,305.912397,0.178012,0.160741,20.862923,10,train,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1000003,131.432133,0.03086,0.163486,3.31366,10,train,0,1,0,1,0,0,0,0,0,0,0,0,0,0
1000004,131.432133,0.03086,0.163486,3.31366,10,train,1,0,0,1,0,0,0,0,0,0,0,0,0,0


In [34]:
# Split the combined feature matrix back into its train and test parts using
# the 'set' marker column. reset_index() turns the 'id' index into an ordinary
# column and gives each part a fresh 0..n-1 index, so the merge below joins on
# a real column in both frames and no leftover 'index' column has to be
# cleaned up afterwards.
train = feature_matrix[feature_matrix['set'] == 'train'].reset_index()
test = feature_matrix[feature_matrix['set'] == 'test'].reset_index()

# Re-attach num_orders to the training rows by id; a left join keeps every
# training row and its order.
train = train.merge(target, on='id', how='left')

print('Train shape is {}'.format(train.shape))
print('Test shape is {}'.format(test.shape))

Train shape is (456547, 2082)
Test shape is (32573, 2081)


In [35]:
# Drop the row identifier, the train/test marker, and the two MODE(demand.set)
# aggregates derived from that marker.
cols_drop = [
    'id',
    'set',
    'meal_info.MODE(demand.set)',
    'fullfilment_center.MODE(demand.set)',
]
for frame in (train, test):
    frame.drop(columns=cols_drop, inplace=True)

print('Train shape after dropping unwanted columns is {}'.format(train.shape))
print('Test shape after dropping unwanted columns is {}'.format(test.shape))

Train shape after dropping unwanted columns is (456547, 2078)
Test shape after dropping unwanted columns is (32573, 2077)


In [36]:
# Function to calculate correlations with the target for a dataframe
def target_corrs(df, target='num_orders'):
    """Rank the numeric columns of ``df`` by their correlation with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the candidate features.
    target : str, default 'num_orders'
        Name of the target column.

    Returns
    -------
    list of (str, float)
        ``(column, correlation)`` tuples ordered by decreasing absolute
        correlation. Columns whose correlation is NaN are placed at the end,
        in their original column order.

    Raises
    ------
    KeyError
        If there is at least one candidate column but ``target`` is not among
        the selected numeric columns.
    """
    # Only these dtypes are considered; columns of any other dtype
    # (e.g. int8, uint8, bool, object) are skipped.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    newdf = df.select_dtypes(include=numerics)

    feature_cols = [col for col in newdf.columns if col != target]
    if not feature_cols:
        return []

    target_values = newdf[target]
    corrs = [(col, target_values.corr(newdf[col])) for col in feature_cols]

    def _rank(item):
        # NaN compares False against everything, which makes a plain sort on
        # abs(corr) order-dependent; rank NaN explicitly after all real values.
        corr = item[1]
        missing = bool(pd.isna(corr))
        return (missing, 0.0 if missing else -abs(corr))

    # list.sort is stable, so ties keep their original column order.
    corrs.sort(key=_rank)
    return corrs

In [37]:
# Correlation of every selected numeric training column with num_orders.
correlations=target_corrs(train)

In [38]:
# Display the list of (column, correlation) tuples.
correlations

[('homepage_featured', 0.29530754224247674),
 ('checkout_price', -0.2831033139067199),
 ('emailer_for_promotion', 0.2778266151993247),
 ('meal_info.MAX(demand.base_price)', -0.268315366166357),
 ('meal_info.MAX(demand.discount)', -0.25946298197539136),
 ('meal_info.MAX(demand.checkout_price)', -0.25750379547358576),
 ('meal_info.SUM(demand.week)', 0.25240830717989393),
 ('meal_info.COUNT(demand)', 0.24194855519601333),
 ('meal_info.MEAN(demand.base_price)', -0.2307404424033613),
 ('meal_info.MEAN(demand.checkout_price)', -0.23032979181661073),
 ('base_price', -0.2231735189298536),
 ('meal_info.MIN(demand.base_price)', -0.21290264215850846),
 ('fullfilment_center.SUM(demand.week)', 0.18685518821322022),
 ('fullfilment_center.SUM(demand.checkout_price)', 0.18151682668359637),
 ('fullfilment_center.COUNT(demand)', 0.18140828972220485),
 ('fullfilment_center.SUM(demand.base_price)', 0.18076575893411698),
 ('fullfilment_center.op_area', 0.17757090332045494),
 ('fullfilment_center.MEAN(deman

In [39]:
# Write the engineered train and test frames to disk without the index column.
for frame, csv_path in ((train, '../data/2.fe_train2.csv'), (test, '../data/2.fe_test2.csv')):
    frame.to_csv(csv_path, index=False)