In [1]:
from datetime import datetime, timezone
from pathlib import Path

import joblib
import pandas as pd
from feast import FeatureStore
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# The feature repo and every artifact written by this notebook live in the user's home directory
home_dir = str(Path.home())
repo_path = home_dir

# Where the trained classifier and the fitted encoder are persisted
model_filename = home_dir + "/model.bin"
encoder_filename = home_dir + "/encoder.bin"

In [19]:
# Set up model building
# random_state is fixed so that repeated runs on the same data produce the same tree:
# without it the estimator may pick different splits between runs when several
# candidate splits are equally good, which makes the saved model non-reproducible.
classifier = tree.DecisionTreeClassifier(random_state=42)

In [20]:
# Set up feature store
# The store is opened from repo_path, which this notebook sets to the user's home directory.
# NOTE(review): presumably the Feast repo configuration is expected to live there - confirm.
fs = FeatureStore(repo_path=repo_path)

In [28]:
# Load the loan table; it is the base (entity dataframe) for the train dataset
loans_path = f"{home_dir}/loan_table.parquet"
loans = pd.read_parquet(loans_path)

In [29]:
# Preview the first rows of the loan table that was just loaded
loans.head()

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp
0,10000,19530219_5179,76104,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,2021-08-25 20:34:41.361000+00:00,2021-08-25 20:34:41.361000+00:00
1,10001,19520816_8737,70380,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,2021-08-25 20:16:20.128000+00:00,2021-08-25 20:16:20.128000+00:00
2,10002,19860413_2537,97039,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,2021-08-25 19:57:58.896000+00:00,2021-08-25 19:57:58.896000+00:00
3,10003,19760701_8090,63785,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,2021-08-25 19:39:37.663000+00:00,2021-08-25 19:39:37.663000+00:00
4,10004,19830125_8297,82223,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,2021-08-25 19:21:16.430000+00:00,2021-08-25 19:21:16.430000+00:00


In [35]:
# Feature references requested from the feature store, grouped by feature view
zipcode_feature_refs = [
    "zipcode_features:city",
    "zipcode_features:state",
    "zipcode_features:location_type",
    "zipcode_features:tax_returns_filed",
    "zipcode_features:population",
    "zipcode_features:total_wages",
]
credit_history_feature_refs = [
    "credit_history:credit_card_due",
    "credit_history:mortgage_due",
    "credit_history:student_loan_due",
    "credit_history:vehicle_loan_due",
    "credit_history:hard_pulls",
    "credit_history:missed_payments_2y",
    "credit_history:missed_payments_1y",
    "credit_history:missed_payments_6m",
    "credit_history:bankruptcies",
]
encoded_feature_refs = [
    "person_home_ownership:person_home_ownership_enc",
    "loan_intent:loan_intent_enc",
]

# Retrieve historical feature values for the rows of the loan table
historical_features_job = fs.get_historical_features(
    entity_df=loans,
    features=zipcode_feature_refs + credit_history_feature_refs + encoded_feature_refs,
)

# The raw categorical columns are dropped; their *_enc counterparts were requested above
training_df = historical_features_job.to_df().drop(
    columns=["person_home_ownership", "loan_intent"]
)

In [36]:
# Preview the training dataframe returned by the feature store
training_df.head()

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,event_timestamp,...,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies,loan_intent_enc,person_home_ownership_enc
0,10005,19620424_2674,49861,21,9900,2.0,2500,7.14,1,2021-08-25 19:02:55.198,...,709259,28836,17248,10,4,2,0,1,3,1
1,10011,19880808_5747,55307,21,10000,2.0,4500,8.63,1,2021-08-25 17:12:47.802,...,1389857,35196,8926,9,1,1,1,1,4,1
2,10019,19710322_7014,48827,24,10800,8.0,1750,10.99,1,2021-08-25 14:45:57.940,...,1035854,27263,29469,9,1,1,1,0,1,3
3,10023,19980613_9023,17508,24,10980,0.0,1500,7.29,0,2021-08-25 13:32:33.010,...,337244,36531,4103,6,4,0,0,0,0,1
4,10029,19771012_6788,16932,21,11389,5.0,4000,12.84,1,2021-08-25 11:42:25.614,...,1940769,24301,18906,7,3,2,0,1,1,3


In [37]:
# Transform some categorical features from the original dataset to numerical ones.
# This is necessary for correct model training (some models do not understand categorical features).
# TODO: It is better to make such transformations through Feature Store
# TODO: city, state, location_type are already in FS, just need to encode them before writing to FS

categorical_features = [
    "city",
    "state",
    "location_type",
]

# This cell overwrites the categorical columns of training_df in place, so it is not
# idempotent: running it a second time would fit the encoder on the numeric codes
# produced by the first run and overwrite the saved encoder with one that cannot
# encode the raw category values. Fail loudly instead of silently corrupting it.
already_encoded = [
    column
    for column in categorical_features
    if pd.api.types.is_numeric_dtype(training_df[column])
]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; re-run the cell that builds "
        "training_df before encoding again."
    )

# Categories not seen during fit are mapped to -1 at transform time
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# fit encoder
encoder.fit(training_df[categorical_features])

# apply encoding
training_df[categorical_features] = encoder.transform(training_df[categorical_features])

# save encoder
joblib.dump(encoder, encoder_filename)

['/Users/sotnich/encoder.bin']

In [38]:
# Preview training_df again after the categorical columns were replaced by their encoded values
training_df.head()

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,event_timestamp,...,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies,loan_intent_enc,person_home_ownership_enc
0,10005,19620424_2674,49861,21,9900,2.0,2500,7.14,1,2021-08-25 19:02:55.198,...,709259,28836,17248,10,4,2,0,1,3,1
1,10011,19880808_5747,55307,21,10000,2.0,4500,8.63,1,2021-08-25 17:12:47.802,...,1389857,35196,8926,9,1,1,1,1,4,1
2,10019,19710322_7014,48827,24,10800,8.0,1750,10.99,1,2021-08-25 14:45:57.940,...,1035854,27263,29469,9,1,1,1,0,1,3
3,10023,19980613_9023,17508,24,10980,0.0,1500,7.29,0,2021-08-25 13:32:33.010,...,337244,36531,4103,6,4,0,0,0,0,1
4,10029,19771012_6788,16932,21,11389,5.0,4000,12.84,1,2021-08-25 11:42:25.614,...,1940769,24301,18906,7,3,2,0,1,1,3


In [79]:
# Train and save model

target = "loan_status"

# Columns that are present in training_df but are excluded from the model inputs
non_feature_columns = [
    "event_timestamp",
    "created_timestamp",
    "loan_id",
    "zipcode",
    "dob_ssn",
]

# Everything except the target and the excluded columns is a model input
train_x = training_df.drop(columns=[target, *non_feature_columns])

# Sort the columns alphabetically once so the feature order the model is fitted on is deterministic.
# NOTE(review): presumably the prediction code orders its features the same way - confirm.
train_x = train_x.reindex(sorted(train_x.columns), axis=1)
train_y = training_df.loc[:, target]

# train_x is already sorted, so it is passed to fit() as is
classifier.fit(train_x, train_y)
joblib.dump(classifier, model_filename)

['/Users/sotnich/model.bin']

In [6]:
# This is not part of the model training!
# But this step is necessary for online prediction:
# it moves features from the batch store to the online store.
#
# The materialization log labels the naive start date as +00:00, i.e. naive datetimes
# are interpreted as UTC. A naive local datetime.now() would therefore shift the end of
# the window by the local UTC offset, so timezone-aware UTC datetimes are passed explicitly.

fs.materialize(
    start_date=datetime(year=2013, month=1, day=1, tzinfo=timezone.utc),
    end_date=datetime.now(tz=timezone.utc),
)

Materializing [1m[32m2[0m feature views from [1m[32m2013-01-01 00:00:00+00:00[0m to [1m[32m2023-01-09 16:39:35+00:00[0m into the [1m[32mdynamodb[0m online store.

[1m[32mcredit_history[0m:


100%|████████████████████████████████████████████████████████| 28633/28633 [00:49<00:00, 581.46it/s]


[1m[32mzipcode_features[0m:


100%|████████████████████████████████████████████████████████| 28844/28844 [00:48<00:00, 589.25it/s]
