In this section we perform automated feature engineering with Featuretools, an open-source Python library that implements Deep Feature Synthesis.

The concept of Deep Feature Synthesis is to use basic building blocks known as feature primitives (like the transformations and aggregations done above) that can be stacked on top of each other to form new features. The depth of a "deep feature" is equal to the number of stacked primitives.

Step 1: Defining entities (tables) in an EntitySet

In [None]:
# Uncomment the last line to install Featuretools into the kernel's environment.
# NOTE(review): the cells below use `es.entity_from_dataframe` and
# `ft.Relationship(parent_variable, child_variable)`, which look like the
# pre-1.0 Featuretools API -- presumably a `featuretools<1.0` pin is needed; confirm.
#! pip install featuretools

In [1]:
import pandas as pd
import numpy as np

import featuretools as ft
import warnings 

In [2]:
# Read the three raw tables, parsing every date column as a datetime.
clients = pd.read_csv('clients.csv', parse_dates=['joined'])
loans = pd.read_csv('loans.csv', parse_dates=['loan_start', 'loan_end'])
payments = pd.read_csv('payments.csv', parse_dates=['payment_date'])

In [3]:
# Show the column names of each raw table, one Index per line.
print(clients.columns, loans.columns, payments.columns, sep='\n')

Index(['client_id', 'joined', 'income', 'credit_score'], dtype='object')
Index(['client_id', 'loan_type', 'loan_amount', 'repaid', 'loan_id',
       'loan_start', 'loan_end', 'rate'],
      dtype='object')
Index(['loan_id', 'payment_amount', 'payment_date', 'missed'], dtype='object')


In [4]:
# Manual aggregation: mean / max / min loan_amount per client, with each
# statistic column renamed to '<stat>_loan_amount'.
stats_table = (
    loans
    .groupby('client_id')['loan_amount']
    .agg(['mean', 'max', 'min'])
    .add_suffix('_loan_amount')
)

In [5]:
# Preview the first rows of the per-client loan_amount statistics.
stats_table.head()

Unnamed: 0_level_0,mean_loan_amount,max_loan_amount,min_loan_amount
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003,5010.666667,10314,2267
2004,11882.0,12651,10464
2011,12006.0,13339,9837
2031,7311.3,13975,772
2041,7054.142857,14024,1049


In [6]:
# Printed one per line: distinct client ids in `clients`, rows in
# `stats_table`, and distinct client ids in `loans`.
print(
    len(clients['client_id'].unique()),
    len(stats_table),
    len(loans['client_id'].unique()),
    sep='\n',
)

241
232
232


In [7]:
# Left-join the per-client loan statistics (indexed by client_id) onto the
# clients table; clients without a row in stats_table get NaN statistics.
clients_manual = pd.merge(
    clients,
    stats_table,
    how='left',
    left_on='client_id',
    right_index=True,
)
clients_manual.head(10)

Unnamed: 0,client_id,joined,income,credit_score,mean_loan_amount,max_loan_amount,min_loan_amount
0,2697,2013-02-14,45903,507,7016.6,14927.0,1262.0
1,4647,2005-06-23,47037,628,7374.7,13708.0,807.0
2,3230,2002-05-06,35246,555,7531.352941,14406.0,705.0
3,3214,2006-08-26,42311,669,6077.105263,13298.0,852.0
4,2265,2000-08-28,40387,679,7562.055556,14804.0,587.0
5,2483,2013-05-19,49188,511,5911.052632,13875.0,934.0
6,3092,2005-05-15,48432,766,7485.947368,14769.0,726.0
7,3441,2001-06-10,45021,746,8079.15,13985.0,630.0
8,2489,2002-04-17,42915,655,7740.0,14306.0,1231.0
9,2909,2011-07-03,49407,503,7828.666667,14574.0,1428.0


In [8]:
# Select the columns holding at least one missing value, then count the
# missing entries in each of them.
null_columns = clients_manual.columns[clients_manual.isna().any()]
clients_manual[null_columns].isna().sum()

mean_loan_amount    9
max_loan_amount     9
min_loan_amount     9
dtype: int64

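Creating an entity, which is just a table. When adding an entity we can specify:

        1. index: the column that uniquely identifies each row
        2. time_index: the column recording when the data in each row became known
        3. make_index: True/False - set to True to have Featuretools create the index column when the table has no unique identifier
If the data also has a uniquely identifying time index, we can pass that in as the time_index parameter.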

In [9]:
# Start an empty EntitySet and register `clients` as its first entity,
# indexed by client_id with `joined` as the time index.
es = ft.EntitySet(id='clients')
es = es.entity_from_dataframe(
    entity_id='clients',
    dataframe=clients,
    index='client_id',
    time_index='joined',
)

In [10]:
# Inspect the payments columns to see which of them could serve as an index.
payments.columns

Index(['loan_id', 'payment_amount', 'payment_date', 'missed'], dtype='object')

In [11]:
# Register `loans` as an entity, indexed by loan_id with `loan_start` as
# the time index.
es = es.entity_from_dataframe(
    entity_id='loans',
    dataframe=loans,
    index='loan_id',
    time_index='loan_start',
)

In [12]:
# Register `payments` as an entity. make_index=True asks Featuretools to
# create the `payment_id` index column instead of using an existing one.
# NOTE(review): the later columns printout shows `payment_id` on `payments`
# itself, so the dataframe seems to be modified in place -- re-running this
# cell on the same kernel may fail; confirm.
es = es.entity_from_dataframe(
    entity_id='payments',
    dataframe=payments,
    make_index=True,
    index='payment_id',
)

In [13]:
# Display the EntitySet summary: its entities and its relationships.
es

Entityset: clients
  Entities:
    clients [Rows: 241, Columns: 4]
    loans [Rows: 1811, Columns: 8]
    payments [Rows: 33708, Columns: 5]
  Relationships:
    No relationships

In [14]:
# Display the `loans` entity: its variables, their inferred types, and its shape.
es['loans']

Entity: loans
  Variables:
    loan_id (dtype: index)
    client_id (dtype: numeric)
    loan_type (dtype: categorical)
    loan_amount (dtype: numeric)
    repaid (dtype: numeric)
    loan_start (dtype: datetime_time_index)
    loan_end (dtype: datetime)
    rate (dtype: numeric)
  Shape:
    (Rows: 1811, Columns: 8)

In [15]:
# Display the `clients` entity: its variables, their inferred types, and its shape.
es['clients']

Entity: clients
  Variables:
    client_id (dtype: index)
    joined (dtype: datetime_time_index)
    income (dtype: numeric)
    credit_score (dtype: numeric)
  Shape:
    (Rows: 241, Columns: 4)

In [16]:
# Define the relationship between clients and loans: clients.client_id is
# passed first (parent side) and loans.client_id second (child side).
client_previous = ft.Relationship(
    es['clients']['client_id'],
    es['loans']['client_id'],
)

print(client_previous)

<Relationship: loans.client_id -> clients.client_id>


In [17]:
# Add the clients/loans relationship to the entity set; `es` is rebound to
# the value returned by add_relationship.
es = es.add_relationship(client_previous)

In [18]:
# Display the EntitySet again to check that the relationship is now listed.
es

Entityset: clients
  Entities:
    clients [Rows: 241, Columns: 4]
    loans [Rows: 1811, Columns: 8]
    payments [Rows: 33708, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id

In [19]:
# Define the relationship between loans and payments (loans.loan_id passed
# first, payments.loan_id second), add it to the entity set, and display
# the updated summary.
payments_relation = ft.Relationship(
    es['loans']['loan_id'],
    es['payments']['loan_id'],
)
es = es.add_relationship(payments_relation)
es


Entityset: clients
  Entities:
    clients [Rows: 241, Columns: 4]
    loans [Rows: 1811, Columns: 8]
    payments [Rows: 33708, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id

In [22]:
# Print the column names of the three dataframes again, one Index per line,
# to see how they look after the entities were created.
print(clients.columns, loans.columns, payments.columns, sep='\n')

Index(['client_id', 'joined', 'income', 'credit_score'], dtype='object')
Index(['client_id', 'loan_type', 'loan_amount', 'repaid', 'loan_id',
       'loan_start', 'loan_end', 'rate'],
      dtype='object')
Index(['payment_id', 'loan_id', 'payment_amount', 'payment_date', 'missed'], dtype='object')
