<a href="https://colab.research.google.com/github/sadullahmath/Exercise/blob/master/Creating_New_Features_Using_Deep_Feature_Synthesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating New Features Using Deep Feature Synthesis

In [0]:
# Raw GitHub URL of the bank-full.csv dataset that the next cell loads
file_url = (
    'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop'
    '/master/Chapter17/Datasets/bank-full.csv'
)

In [2]:
# Reading the ';'-separated CSV at file_url into a DataFrame and previewing
# its first rows
import pandas as pd
bankData = pd.read_csv(file_url, sep=';')
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [0]:
# Separating the target: keep column 'y' in Y, then delete it from bankData
# so only the predictor columns remain in the frame
Y = bankData['y']
del bankData['y']

In [0]:
# Building the customer key for the Demographic entity: the row label of each
# record rendered as text and prefixed with 'cust' (cust0, cust1, ...)
row_labels = bankData.index.to_series().astype(str)
bankData['custID'] = 'cust' + row_labels

In [0]:
# Creating AssetId: 1 where the housing column is 'yes', 0 everywhere else
has_housing = bankData['housing'] == 'yes'
bankData['AssetId'] = has_housing.astype('int64')

In [0]:
# Creating LoanId: 1 where the loan column is 'yes', 0 everywhere else
has_loan = bankData['loan'] == 'yes'
bankData['LoanId'] = has_loan.astype('int64')

In [0]:
# Creating the financial-behaviour id: 1 where the default column is 'yes',
# 0 everywhere else
has_default = bankData['default'] == 'yes'
bankData['FinbehId'] = has_default.astype('int64')

In [8]:
# Previewing the first rows of bankData to confirm that the custID, AssetId,
# LoanId and FinbehId columns were added
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,custID,AssetId,LoanId,FinbehId
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,cust0,1,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,cust1,1,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,cust2,1,1,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,cust3,1,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,cust4,0,0,0


In [0]:
# Importing necessary libraries
import featuretools as ft
import numpy as np

In [0]:
# Instantiating the EntitySet (id 'Bank') in which the entities and their
# relationships are registered by the following cells
Bankentities = ft.EntitySet(id='Bank')

In [12]:
# Registering bankData as the 'Demographic Data' entity, keyed by custID.
# This is the base entity that the other entities are normalized out of.
Bankentities.entity_from_dataframe(
    entity_id='Demographic Data',
    dataframe=bankData,
    index='custID',
)

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 20]
  Relationships:
    No relationships

In [13]:
# Splitting an 'Assets' entity out of 'Demographic Data': AssetId becomes its
# index, the housing column moves into it, and the two entities stay linked
# through AssetId
Bankentities.normalize_entity(
    base_entity_id='Demographic Data',
    new_entity_id='Assets',
    index='AssetId',
    additional_variables=['housing'],
)

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 19]
    Assets [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetId -> Assets.AssetId

In [14]:
# Splitting out the 'Liability' entity (index LoanId, carrying the loan column)
Bankentities.normalize_entity(
    base_entity_id='Demographic Data',
    new_entity_id='Liability',
    index='LoanId',
    additional_variables=['loan'],
)

# Splitting out the 'FinBehaviour' entity (index FinbehId, carrying the default
# column); as the last expression, its return value is what the cell displays
Bankentities.normalize_entity(
    base_entity_id='Demographic Data',
    new_entity_id='FinBehaviour',
    index='FinbehId',
    additional_variables=['default'],
)

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 17]
    Assets [Rows: 2, Columns: 2]
    Liability [Rows: 2, Columns: 2]
    FinBehaviour [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetId -> Assets.AssetId
    Demographic Data.LoanId -> Liability.LoanId
    Demographic Data.FinbehId -> FinBehaviour.FinbehId

In [15]:
# Manual aggregation example: mean balance within each housing group
agg = bankData.groupby('housing')['balance'].mean()
print(agg)

housing
no     1596.501270
yes    1175.103064
Name: balance, dtype: float64


In [16]:
# Left-joining the per-housing mean back onto every row of bankData.
# Both sides carry a column named 'balance', so pandas suffixes them:
# balance_x is the row's own balance, balance_y is the group mean.
bankNew = bankData.merge(
    agg,
    how='left',
    left_on='housing',
    right_index=True,
)
bankNew.head(10)

Unnamed: 0,age,job,marital,education,default,balance_x,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,custID,AssetId,LoanId,FinbehId,balance_y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,cust0,1,0,0,1175.103064
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,cust1,1,0,0,1175.103064
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,cust2,1,1,0,1175.103064
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,cust3,1,0,0,1175.103064
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,cust4,0,0,0,1596.50127
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,cust5,1,0,0,1175.103064
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,cust6,1,1,0,1175.103064
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,cust7,1,0,1,1175.103064
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,cust8,1,0,0,1175.103064
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,cust9,1,0,0,1175.103064


In [17]:
# Transformation operation: natural log of the balance.
# The balance column contains non-positive values (the DFS output reports a
# minimum balance of -4057), for which the logarithm is undefined. Calling
# np.log on them directly emits RuntimeWarnings and yields NaN for negatives
# and -inf for zeros. Masking those rows first makes them plain NaN (missing)
# while leaving the result for every positive balance unchanged.
import numpy as np
positive_balance = bankData['balance'].where(bankData['balance'] > 0)
bankData['Tranbalance'] = np.log(positive_balance)
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,custID,AssetId,LoanId,FinbehId,Tranbalance
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,cust0,1,0,0,7.669962
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,cust1,1,0,0,3.367296
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,cust2,1,1,0,0.693147
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,cust3,1,0,0,7.317212
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,cust4,0,0,0,0.0


In [18]:
# Running Deep Feature Synthesis with featuretools' default primitives,
# targeting 'Demographic Data' and stacking primitives up to depth 2.
# Returns the computed feature matrix and the list of feature definitions.
feature_set, feature_names = ft.dfs(
    entityset=Bankentities,
    target_entity='Demographic Data',
    max_depth=2,
    verbose=1,
    n_jobs=1,
)

Built 196 features
Elapsed: 00:12 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [0]:
# Putting the feature rows in the same order as bankData's custID column,
# then turning the custID index back into an ordinary column
feature_set = (
    feature_set
    .reindex(index=bankData['custID'])
    .reset_index()
)

In [20]:
# Comparing the dimensions of the generated feature set with those of the
# original bank data (one shape per line)
print(feature_set.shape, bankData.shape, sep='\n')

(45211, 197)
(45211, 21)


In [21]:
# Previewing the first rows of the generated feature set: the original
# columns followed by the aggregations computed over the related entities
feature_set.head()

Unnamed: 0,custID,age,job,marital,education,balance,contact,day,month,duration,campaign,pdays,previous,poutcome,AssetId,LoanId,FinbehId,Assets.housing,Liability.loan,FinBehaviour.default,Assets.SUM(Demographic Data.age),Assets.SUM(Demographic Data.balance),Assets.SUM(Demographic Data.day),Assets.SUM(Demographic Data.duration),Assets.SUM(Demographic Data.campaign),Assets.SUM(Demographic Data.pdays),Assets.SUM(Demographic Data.previous),Assets.STD(Demographic Data.age),Assets.STD(Demographic Data.balance),Assets.STD(Demographic Data.day),Assets.STD(Demographic Data.duration),Assets.STD(Demographic Data.campaign),Assets.STD(Demographic Data.pdays),Assets.STD(Demographic Data.previous),Assets.MAX(Demographic Data.age),Assets.MAX(Demographic Data.balance),Assets.MAX(Demographic Data.day),Assets.MAX(Demographic Data.duration),Assets.MAX(Demographic Data.campaign),Assets.MAX(Demographic Data.pdays),...,FinBehaviour.MAX(Demographic Data.pdays),FinBehaviour.MAX(Demographic Data.previous),FinBehaviour.SKEW(Demographic Data.age),FinBehaviour.SKEW(Demographic Data.balance),FinBehaviour.SKEW(Demographic Data.day),FinBehaviour.SKEW(Demographic Data.duration),FinBehaviour.SKEW(Demographic Data.campaign),FinBehaviour.SKEW(Demographic Data.pdays),FinBehaviour.SKEW(Demographic Data.previous),FinBehaviour.MIN(Demographic Data.age),FinBehaviour.MIN(Demographic Data.balance),FinBehaviour.MIN(Demographic Data.day),FinBehaviour.MIN(Demographic Data.duration),FinBehaviour.MIN(Demographic Data.campaign),FinBehaviour.MIN(Demographic Data.pdays),FinBehaviour.MIN(Demographic Data.previous),FinBehaviour.MEAN(Demographic Data.age),FinBehaviour.MEAN(Demographic Data.balance),FinBehaviour.MEAN(Demographic Data.day),FinBehaviour.MEAN(Demographic Data.duration),FinBehaviour.MEAN(Demographic Data.campaign),FinBehaviour.MEAN(Demographic Data.pdays),FinBehaviour.MEAN(Demographic Data.previous),FinBehaviour.COUNT(Demographic Data),FinBehaviour.NUM_UNIQUE(Demographic 
Data.job),FinBehaviour.NUM_UNIQUE(Demographic Data.marital),FinBehaviour.NUM_UNIQUE(Demographic Data.education),FinBehaviour.NUM_UNIQUE(Demographic Data.contact),FinBehaviour.NUM_UNIQUE(Demographic Data.month),FinBehaviour.NUM_UNIQUE(Demographic Data.poutcome),FinBehaviour.NUM_UNIQUE(Demographic Data.AssetId),FinBehaviour.NUM_UNIQUE(Demographic Data.LoanId),FinBehaviour.MODE(Demographic Data.job),FinBehaviour.MODE(Demographic Data.marital),FinBehaviour.MODE(Demographic Data.education),FinBehaviour.MODE(Demographic Data.contact),FinBehaviour.MODE(Demographic Data.month),FinBehaviour.MODE(Demographic Data.poutcome),FinBehaviour.MODE(Demographic Data.AssetId),FinBehaviour.MODE(Demographic Data.LoanId)
0,cust0,58,management,married,tertiary,2143,unknown,5,may,261,1,-1,0,unknown,1,0,0,yes,no,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
1,cust1,44,technician,single,secondary,29,unknown,5,may,151,1,-1,0,unknown,1,0,0,yes,no,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
2,cust2,33,entrepreneur,married,secondary,2,unknown,5,may,76,1,-1,0,unknown,1,1,0,yes,yes,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
3,cust3,47,blue-collar,married,unknown,1506,unknown,5,may,92,1,-1,0,unknown,1,0,0,yes,no,no,984475,29530340,391984,6517000,67813,1289483,16502,8.926807,2483.285761,8.026836,258.321907,3.140979,113.862848,2.707428,78,58544,31,4918,63,854,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0
4,cust4,33,unknown,single,unknown,1,unknown,5,may,198,1,-1,0,unknown,0,0,0,no,no,no,866292,32059342,322640,5154811,57143,527901,9735,12.058696,3613.405339,8.671437,256.529524,3.041508,77.461032,1.660227,95,102127,31,3881,41,871,...,871,275,0.687031,8.334719,0.094405,3.152469,4.926324,2.600138,41.986484,18,-4057,1,0,1,-1,0,40.961934,1389.806424,15.795792,258.512749,2.75678,40.604536,0.586044,44396,12,3,4,3,12,4,2,2,blue-collar,married,secondary,cellular,may,unknown,1,0


In [22]:
# Cross-checking Assets.SUM(Demographic Data.balance) by hand: the balance
# summed per AssetId should match the values shown in the feature set
bankData.groupby('AssetId')['balance'].sum()

AssetId
0    32059342
1    29530340
Name: balance, dtype: int64

In [23]:
# Displaying the list of feature definitions returned by ft.dfs
feature_names

[<Feature: age>,
 <Feature: job>,
 <Feature: marital>,
 <Feature: education>,
 <Feature: balance>,
 <Feature: contact>,
 <Feature: day>,
 <Feature: month>,
 <Feature: duration>,
 <Feature: campaign>,
 <Feature: pdays>,
 <Feature: previous>,
 <Feature: poutcome>,
 <Feature: AssetId>,
 <Feature: LoanId>,
 <Feature: FinbehId>,
 <Feature: Assets.housing>,
 <Feature: Liability.loan>,
 <Feature: FinBehaviour.default>,
 <Feature: Assets.SUM(Demographic Data.age)>,
 <Feature: Assets.SUM(Demographic Data.balance)>,
 <Feature: Assets.SUM(Demographic Data.day)>,
 <Feature: Assets.SUM(Demographic Data.duration)>,
 <Feature: Assets.SUM(Demographic Data.campaign)>,
 <Feature: Assets.SUM(Demographic Data.pdays)>,
 <Feature: Assets.SUM(Demographic Data.previous)>,
 <Feature: Assets.STD(Demographic Data.age)>,
 <Feature: Assets.STD(Demographic Data.balance)>,
 <Feature: Assets.STD(Demographic Data.day)>,
 <Feature: Assets.STD(Demographic Data.duration)>,
 <Feature: Assets.STD(Demographic Data.campaign)

In [0]:
# Primitive names handed to ft.dfs in place of its defaults.
# Aggregation primitives:
aggPrimitives = [
    'std',
    'min',
    'max',
    'mean',
    'last',
    'count',
]
# Transformation primitives:
tranPrimitives = [
    'percentile',
    'subtract',
    'divide',
]

In [25]:
# Re-running Deep Feature Synthesis with the hand-picked aggregation and
# transformation primitives; this rebinds feature_set and feature_names
feature_set, feature_names = ft.dfs(
    entityset=Bankentities,
    target_entity='Demographic Data',
    agg_primitives=aggPrimitives,
    trans_primitives=tranPrimitives,
    max_depth=2,
    verbose=1,
    n_jobs=1,
)

Built 3420 features
Elapsed: 01:46 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [0]:
# Previewing the first rows of the feature set built with the custom
# aggregation and transformation primitives
feature_set.head()