In [69]:
import joblib
import pandas as pd
from feast import FeatureStore
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder

In [70]:
# All inputs and outputs of this notebook are addressed relative to the user's home directory.
home_dir = str(Path.home())

# Root of the feature repository handed to FeatureStore.
repo_path = home_dir

# Destinations of the two artifacts persisted with joblib (trained model, fitted encoder).
model_filename, encoder_filename = (
    f"{home_dir}/{artifact_name}" for artifact_name in ("model.bin", "encoder.bin")
)

In [71]:
# Set up model building.
# The seed is fixed so that re-running the notebook yields the same tree:
# scikit-learn permutes the candidate features at every split, so an unseeded
# DecisionTreeClassifier may break ties differently from run to run.
classifier = tree.DecisionTreeClassifier(random_state=42)


In [72]:
# Set up feature store: open the feature repository located at `repo_path`.
# `fs` is used below to retrieve the historical features for the training set.
fs = FeatureStore(repo_path=repo_path)

In [73]:
# Load the loan table, the base of the training dataset.
# It is passed to the feature store as the entity dataframe in the retrieval step below.
loans = pd.read_parquet(f"{home_dir}/loan_table.parquet")

In [74]:
# Preview the first rows of the raw loan table.
loans.head()

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,event_timestamp,created_timestamp
0,10000,19530219_5179,76104,22,59000,RENT,123.0,PERSONAL,35000,16.02,1,2021-08-25 20:34:41.361000+00:00,2021-08-25 20:34:41.361000+00:00
1,10001,19520816_8737,70380,21,9600,OWN,5.0,EDUCATION,1000,11.14,0,2021-08-25 20:16:20.128000+00:00,2021-08-25 20:16:20.128000+00:00
2,10002,19860413_2537,97039,25,9600,MORTGAGE,1.0,MEDICAL,5500,12.87,1,2021-08-25 19:57:58.896000+00:00,2021-08-25 19:57:58.896000+00:00
3,10003,19760701_8090,63785,23,65500,RENT,4.0,MEDICAL,35000,15.23,1,2021-08-25 19:39:37.663000+00:00,2021-08-25 19:39:37.663000+00:00
4,10004,19830125_8297,82223,24,54400,RENT,8.0,MEDICAL,35000,14.27,1,2021-08-25 19:21:16.430000+00:00,2021-08-25 19:21:16.430000+00:00


In [75]:
# Feature references in "<feature view>:<feature name>" form.
feature_refs = [
    # features from the zipcode_features view
    "zipcode_features:city",
    "zipcode_features:state",
    "zipcode_features:location_type",
    "zipcode_features:tax_returns_filed",
    "zipcode_features:population",
    "zipcode_features:total_wages",
    # features from the credit_history view
    "credit_history:credit_card_due",
    "credit_history:mortgage_due",
    "credit_history:student_loan_due",
    "credit_history:vehicle_loan_due",
    "credit_history:hard_pulls",
    "credit_history:missed_payments_2y",
    "credit_history:missed_payments_1y",
    "credit_history:missed_payments_6m",
    "credit_history:bankruptcies",
]

# Request the historical values of those features, using `loans` as the entity
# dataframe, and materialise the result as a pandas DataFrame.
retrieval_job = fs.get_historical_features(entity_df=loans, features=feature_refs)
training_df = retrieval_job.to_df()

In [76]:
# Preview the frame returned by the feature store: loan columns plus the requested features.
training_df.head()

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,...,total_wages,credit_card_due,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies
0,10001,19520816_8737,70380,21,9600,OWN,5.0,EDUCATION,1000,11.14,...,295677025,3308,1181555,39015,19073,6,5,1,1,0
1,10013,19561221_5856,1867,26,108160,RENT,4.0,EDUCATION,35000,18.39,...,808066167,286,680710,3495,4760,5,6,1,0,0
2,10025,19641023_7448,26155,24,67746,RENT,8.0,HOMEIMPROVEMENT,33000,12.68,...,113574080,2738,461567,27164,14933,10,4,1,0,0
3,10035,19810418_9269,6450,21,12000,OWN,5.0,EDUCATION,2500,7.51,...,718014481,4270,197495,8370,25828,8,7,0,0,0
4,10043,19500128_8421,14878,26,300000,OWN,9.0,HOMEIMPROVEMENT,10000,10.38,...,8882208,5564,1263354,5304,25239,0,0,0,0,0


In [81]:
# Encode the categorical columns as numbers; some models (including the
# decision tree trained below) cannot consume string categories directly.
# TODO: such transformations would be better done through the Feature Store.
# TODO: city, state, location_type are already in the FS; they only need to be encoded before being written to it.
# TODO: person_home_ownership, loan_intent are NOT in the FS; should the transformation be added there?

categorical_features = [
    "person_home_ownership",
    "loan_intent",
    "city",
    "state",
    "location_type",
]

# This cell overwrites the categorical columns of `training_df` in place, so it
# is not safe to run twice: a second run would fit the encoder on the numeric
# codes instead of the original categories and persist an encoder that maps
# every real category to the "unknown" value (-1). Fail loudly instead.
# NOTE(review): assumes the raw categorical columns are non-numeric (strings),
# as the previews show for person_home_ownership and loan_intent - confirm for
# city, state and location_type.
already_encoded = [
    column
    for column in categorical_features
    if pd.api.types.is_numeric_dtype(training_df[column])
]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; re-run the feature "
        "retrieval cell to rebuild training_df before fitting the encoder again."
    )

# Categories not seen during fitting are encoded as -1 at transform time.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# fit the encoder and apply the encoding in one step
training_df[categorical_features] = encoder.fit_transform(training_df[categorical_features])

# save encoder
joblib.dump(encoder, encoder_filename)

['/Users/sotnich/encoder.bin']

In [78]:
# Preview the frame after encoding: the categorical columns now hold numeric codes.
training_df.head()

Unnamed: 0,loan_id,dob_ssn,zipcode,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,...,total_wages,credit_card_due,mortgage_due,student_loan_due,vehicle_loan_due,hard_pulls,missed_payments_2y,missed_payments_1y,missed_payments_6m,bankruptcies
0,10001,19520816_8737,70380,21,9600,2.0,5.0,1.0,1000,11.14,...,295677025,3308,1181555,39015,19073,6,5,1,1,0
1,10013,19561221_5856,1867,26,108160,3.0,4.0,1.0,35000,18.39,...,808066167,286,680710,3495,4760,5,6,1,0,0
2,10025,19641023_7448,26155,24,67746,3.0,8.0,2.0,33000,12.68,...,113574080,2738,461567,27164,14933,10,4,1,0,0
3,10035,19810418_9269,6450,21,12000,2.0,5.0,1.0,2500,7.51,...,718014481,4270,197495,8370,25828,8,7,0,0,0
4,10043,19500128_8421,14878,26,300000,2.0,9.0,2.0,10000,10.38,...,8882208,5564,1263354,5304,25239,0,0,0,0,0


In [79]:
# Train and save model

target = "loan_status"

# The label, the timestamps and the identifier/key columns are not model inputs.
non_feature_columns = [
    target,
    "event_timestamp",
    "created_timestamp",
    "loan_id",
    "zipcode",
    "dob_ssn",
]

# Features are laid out in alphabetical order of their column names.
feature_columns = sorted(training_df.columns.drop(non_feature_columns))
train_x = training_df[feature_columns]
train_y = training_df[target]

# Fit the classifier and persist it next to the encoder.
classifier.fit(train_x, train_y)
joblib.dump(classifier, model_filename)

['/Users/sotnich/model.bin']