# Model training

## Dependency management

In [1]:
import numpy as np
import os

In [2]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             precision_score, recall_score, matthews_corrcoef)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## Minio Bucket setup

In [3]:
MINIO_ENDPOINT = 'http://minio.idoml.precision.uni.lux'

FEATURE_FILENAME = 'lcld/feature_data'

In [4]:
bucket_name = 'datasets'
filepath = f'{bucket_name}/{FEATURE_FILENAME}.csv'

## Load pandas dataframe from S3

In [5]:
df = pd.read_csv(
    f's3://{filepath}',
    storage_options={
        'client_kwargs': {'endpoint_url': MINIO_ENDPOINT}
    }
)

In [6]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,...,pub_rec_bankruptcies,charged_off,fico_score,month_of_year,ratio_loan_amnt_annual_inc,ratio_open_acc_total_acc,month_since_earliest_cr_line,ratio_pub_rec_month_since_earliest_cr_line,ratio_pub_rec_bankruptcies_month_since_earliest_cr_line,ratio_pub_rec_bankruptcies_pub_rec
0,4000.0,36,9.76,128.62,5,10,2,90000.0,2,2012-03-01,...,0,0,752.0,2,0.044444,0.6,432,0.0,0.0,-1.0
1,28000.0,36,8.9,889.09,4,7,0,110000.0,1,2012-03-01,...,0,1,802.0,2,0.254545,0.238095,226,0.0,0.0,-1.0
2,7100.0,36,6.03,216.1,0,5,1,65000.0,0,2012-03-01,...,0,0,752.0,2,0.109231,0.3,183,0.0,0.0,-1.0
3,24000.0,36,6.62,736.89,1,10,0,85000.0,2,2012-03-01,...,0,0,772.0,2,0.282353,0.315789,215,0.0,0.0,-1.0
4,6100.0,60,15.81,147.73,13,4,1,55000.0,2,2012-03-01,...,0,0,712.0,2,0.110909,0.52,143,0.0,0.0,-1.0


## transform data frame type

In [7]:
df = df.drop('issue_d', axis=1)

In [8]:
feature = df.columns.drop('charged_off')

cat_feature = [
    "initial_list_status",
    "application_type",
    "home_ownership",
    "verification_status",
    "purpose",
]

# Convert np.arange to list
cat_range = [
    list(np.arange(int(df[f].min()), int(df[f].max()) + 1))
    for f in cat_feature
]

for f in cat_feature:
    df[f] = df[f].astype('category')

num_feature = list(set(feature) - set(cat_feature))

## Split train test

In [9]:
y = pd.factorize(df['charged_off'])[0]
X = df.drop(columns=['charged_off'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Define model parameters

In [10]:
rf_parameters = {
    "n_estimators": 125,
    "min_samples_split": 6,
    "min_samples_leaf": 2,
    "max_depth": 10,
    "bootstrap": True,
    "class_weight": "balanced",
}

## Instantiate pipeline, model and transformer

In [11]:
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_feature),
        (
            "cat",
            OneHotEncoder(
                sparse_output=False,
                handle_unknown="ignore",
                drop="if_binary",
                categories=cat_range,
            ),
            cat_feature,
        ),
    ],
    sparse_threshold=0,
    remainder="passthrough",
    n_jobs=-1,
)


model = RandomForestClassifier(
    **rf_parameters,
    random_state=42,
    n_jobs=3,
)


pipe = Pipeline(steps=[
    ('preprocessor', transformer),
    ('classifier', model)
])


## Training

In [12]:
pipe.fit(X_train, y_train)

## Evaluation

In [13]:
# make predictions
yhat = pipe.predict(X_test)

mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
accuracy = accuracy_score(y_test, yhat)
print('accuracy: %.3f' % accuracy)
precision = precision_score(y_test, yhat)
print('precision: %.3f' % precision)
recall = recall_score(y_test, yhat)
print('recall: %.3f' % recall)
matthews = matthews_corrcoef(y_test, yhat)
print('matthews: %.3f' % matthews)

MAE: 0.358
accuracy: 0.642
precision: 0.313
recall: 0.677
matthews: 0.249


In [14]:
mlflow.set_experiment('credit_scoring')
with mlflow.start_run(run_name='RandomForest'):
    for key, value in rf_parameters.items():
        mlflow.log_param(key, value)

    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('MAE', mae)
    mlflow.log_metric('matthews', matthews)
    
    mlflow.sklearn.log_model(pipe, artifact_path='model')