In [81]:
import featuretools as ft
import pandas as pd

# Read in the data from the .pkl
df = pd.read_csv("./data/ingest_data.csv")
#df = pd.read_pickle("./data/baseline_fe_data.pkl")

# Now list out the final numerical and categorical columns
num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
num_cols.remove('is_bad')
target = ['is_bad']

df[cat_cols] = df[cat_cols].astype(str)
df[num_cols] = df[num_cols].astype(float)



In [82]:
#!pip install --upgrade featuretools

In [83]:
df['delinq_2yrs'] = df.delinq_2yrs.fillna(0)

def make_bin(x):
    if int(x) > 0:
        return '1'
    else:
        return '0'

In [84]:
df['inq_last_6mths'].fillna(0, inplace=True)

In [85]:
df['mths_since_last_delinq'].fillna(0, inplace=True)

In [86]:
df['mths_since_last_record'].max()
df['mths_since_last_record'].fillna(df.mths_since_last_record.max(), inplace=True)
df['mths_since_last_record'] = df['mths_since_last_record'].astype(int)

In [87]:
df['pub_rec'].fillna(0, inplace=True)
df['pub_rec'] = df['pub_rec'].astype(int)

In [88]:
df.drop('collections_12_mths_ex_med', axis=1, inplace=True)

In [89]:
df['annual_inc'].fillna(df.annual_inc.median(), inplace=True)

In [90]:
df['open_acc'].fillna(df.open_acc.median(), inplace=True)

In [91]:
df['revol_util'].fillna(df.revol_util.median(), inplace=True)

In [92]:
df['total_acc'].fillna(df.total_acc.median(), inplace=True)

In [93]:
# Need to convert 'emp_length' to a numerical column
# Any values missing or 'na' will be replaced by 0 since it makes sense that they may not have been employed a long time
def convert_num(x):
    try:
        x = int(x)
        if x > 10:
            x = 10
    except:
        x = 0  
    
    return x

df['emp_length'] = df.emp_length.apply(convert_num)

In [95]:
es = ft.EntitySet(id="lending_data")
es = es.entity_from_dataframe(entity_id="data",
                               dataframe=df,
                               #make_index=True,
                               index='index',
                               time_index="earliest_cr_line",
                               variable_types={"zip_code": ft.variable_types.ZIPCode,
                                               "emp_length": ft.variable_types.Numeric,
                                                "is_bad" : ft.variable_types.Categorical})


  "integer column".format(index))


In [96]:
es['data']


Entity: data
  Variables:
    index (dtype: index)
    home_ownership (dtype: categorical)
    annual_inc (dtype: numeric)
    verification_status (dtype: categorical)
    pymnt_plan (dtype: categorical)
    purpose_cat (dtype: categorical)
    addr_state (dtype: categorical)
    debt_to_income (dtype: numeric)
    delinq_2yrs (dtype: numeric)
    earliest_cr_line (dtype: datetime_time_index)
    inq_last_6mths (dtype: numeric)
    mths_since_last_delinq (dtype: numeric)
    mths_since_last_record (dtype: numeric)
    open_acc (dtype: numeric)
    pub_rec (dtype: numeric)
    revol_bal (dtype: numeric)
    revol_util (dtype: numeric)
    total_acc (dtype: numeric)
    initial_list_status (dtype: categorical)
    mths_since_last_major_derog (dtype: numeric)
    policy_code (dtype: categorical)
    zip_code (dtype: zip_code)
    emp_length (dtype: numeric)
    is_bad (dtype: categorical)
  Shape:
    (Rows: 10000, Columns: 24)

In [None]:
es

In [None]:
!pip list | grep featuretools

In [97]:
feature_matrix, feature_defs = ft.dfs(entityset=es, 
                                      target_entity='data',
                                      agg_primitives = [],
                                      trans_primitives = ['add_numeric', 'multiply_numeric', 'month', 'year', 'percentile', 'time_since'],
                                      max_depth=2)

In [98]:
# Drop out the target values
target = feature_matrix['is_bad']
feature_matrix.drop('is_bad', axis=1, inplace=True)
#for colname in feature_matrix.columns:
#    if 'is_bad' in colname.split(' '):
#        feature_matrix.drop(colname, axis=1, inplace=True)

In [99]:
feature_matrix.columns

Index(['home_ownership', 'annual_inc', 'verification_status', 'pymnt_plan',
       'purpose_cat', 'addr_state', 'debt_to_income', 'delinq_2yrs',
       'inq_last_6mths', 'mths_since_last_delinq',
       ...
       'PERCENTILE(mths_since_last_delinq)',
       'PERCENTILE(mths_since_last_major_derog)',
       'PERCENTILE(mths_since_last_record)', 'PERCENTILE(open_acc)',
       'PERCENTILE(pub_rec)', 'PERCENTILE(revol_bal)',
       'PERCENTILE(revol_util)', 'PERCENTILE(total_acc)',
       'TIME_SINCE(earliest_cr_line)', 'YEAR(earliest_cr_line)'],
      dtype='object', length=193)

In [77]:
#list(zip(ft.list_primitives().type.tolist(), ft.list_primitives().name.tolist()))

In [78]:
#feature_matrix = pd.get_dummies(feature_matrix)
#print(feature_matrix.shape)

In [100]:
corr = feature_matrix.join(target).corr()['is_bad'].sort_values()

# Display correlations
print('Most Positive Correlations with "is_bad"')
print(20*"-")
print(corr.tail(10))
print()
print('Most Negative Correlations with "is_bad"')
print(20*"-")
print(corr.head(10))

Most Positive Correlations with "is_bad"
--------------------
open_acc + revol_util                       0.084543
debt_to_income + revol_util                 0.087016
PERCENTILE(revol_util)                      0.087940
revol_util                                  0.087998
inq_last_6mths + revol_util                 0.088010
pub_rec + revol_util                        0.088239
mths_since_last_major_derog + revol_util    0.088425
delinq_2yrs + revol_util                    0.088478
mths_since_last_record * revol_util         0.089093
is_bad                                      1.000000
Name: is_bad, dtype: float64

Most Negative Correlations with "is_bad"
--------------------
PERCENTILE(annual_inc)                    -0.072556
PERCENTILE(total_acc)                     -0.063777
emp_length * total_acc                    -0.061058
emp_length + total_acc                    -0.059651
annual_inc * total_acc                    -0.059215
total_acc                                 -0.055916
pub_

In [101]:
from sklearn.model_selection import train_test_split
seed = 20

# Try just using a train/test split at first without sorting the values by time (I don't know how truly time-based this model will be)
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, target, test_size=0.20, random_state=seed)

In [102]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

sc = StandardScaler()
pt = PowerTransformer(method='box-cox', standardize=False)

X_train_st = sc.fit_transform(X_train)
X_test_st = sc.transform(X_test)

X_train_pt = pt.fit_transform(X_train)
X_test_pt = pt.fit_transform(X_test)

ValueError: could not convert string to float: 'RENT'

In [None]:
from catboost import CatBoostClassifier

