# Model training

## Dependency management

In [None]:
import numpy as np
import os

In [None]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             precision_score, recall_score, matthews_corrcoef)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## MinIO bucket and MLflow experiment setup

In [None]:
# Name of the MLflow experiment; the MLFLOW_EXP_NAME environment variable
# overrides the 'credit_scoring' default.
MLFLOW_EXP_NAME = os.getenv('MLFLOW_EXP_NAME', 'credit_scoring')

# Path of the feature table, without its file extension.
FEATURE_FILENAME = 'lcld/feature_data'

In [None]:
# The feature CSV lives at "<bucket>/<FEATURE_FILENAME>.csv".
bucket_name = 'datasets'
filepath = '/'.join([bucket_name, f'{FEATURE_FILENAME}.csv'])

## Load pandas dataframe from S3

In [None]:
# Read the feature CSV straight from object storage through its s3:// URI.
df = pd.read_csv(f's3://{filepath}')

In [None]:
# Preview the first rows of the loaded table as a quick sanity check.
df.head()

## Transform DataFrame column types

In [None]:
# Remove the issue_d column so it is not used as a model input.
df = df.drop(columns=['issue_d'])

In [None]:
# Every column except the target is a model input.
feature = df.columns.drop('charged_off')

# Columns cast to the pandas 'category' dtype below and one-hot encoded
# by the preprocessing step.
cat_feature = [
    "initial_list_status",
    "application_type",
    "home_ownership",
    "verification_status",
    "purpose",
]

# For each categorical column, list every integer code between the observed
# minimum and maximum (inclusive) so the encoder gets a fixed category set.
# np.arange is converted to a plain list for the encoder's `categories` arg.
cat_range = [
    list(np.arange(int(df[f].min()), int(df[f].max()) + 1))
    for f in cat_feature
]

for f in cat_feature:
    df[f] = df[f].astype('category')

# Keep the DataFrame's own column order for the numeric features.
# A set difference would yield a hash-dependent order that can change between
# interpreter sessions, which in turn changes the feature order seen by the
# model and breaks run-to-run reproducibility.
num_feature = [f for f in feature if f not in cat_feature]

## Train/test split

In [None]:
# Encode the target as integer codes. sort=True assigns codes in sorted order
# of the unique values (e.g. 0 -> 0, 1 -> 1) instead of order of first
# appearance, so the positive class cannot silently flip depending on which
# label happens to occur in the first row.
y = pd.factorize(df['charged_off'], sort=True)[0]
X = df.drop(columns=['charged_off'])

# Fixed seed makes the split reproducible (same seed as the classifier);
# stratifying on y keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

## Define model parameters

In [None]:
# Hyperparameters forwarded to RandomForestClassifier and logged to MLflow.
rf_parameters = dict(
    n_estimators=125,
    min_samples_split=6,
    min_samples_leaf=2,
    max_depth=10,
    bootstrap=True,
    class_weight="balanced",
)

## Instantiate pipeline, model and transformer

In [None]:
# Random forest built from rf_parameters, with a fixed seed and n_jobs=-1.
model = RandomForestClassifier(random_state=42, n_jobs=-1, **rf_parameters)

# One-hot encoder for the categorical columns, using the category lists
# pre-computed in cat_range; unknown codes are ignored at transform time.
one_hot = OneHotEncoder(
    categories=cat_range,
    drop="if_binary",
    handle_unknown="ignore",
    sparse_output=False,
)

# Numeric columns are standardised, categorical columns are one-hot encoded,
# and any column in neither list is passed through unchanged.
transformer = ColumnTransformer(
    [
        ("num", StandardScaler(), num_feature),
        ("cat", one_hot, cat_feature),
    ],
    remainder="passthrough",
    sparse_threshold=0,
    n_jobs=-1,
)

# Full pipeline: preprocessing followed by the classifier.
pipe = Pipeline([
    ("preprocessor", transformer),
    ("classifier", model),
])


## Training

In [None]:
# Fit the preprocessing steps and the classifier together on the training split.
pipe.fit(X_train, y_train)

## Evaluation

In [None]:
# make predictions
yhat = pipe.predict(X_test)

mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
accuracy = accuracy_score(y_test, yhat)
print('accuracy: %.3f' % accuracy)
precision = precision_score(y_test, yhat)
print('precision: %.3f' % precision)
recall = recall_score(y_test, yhat)
print('recall: %.3f' % recall)
matthews = matthews_corrcoef(y_test, yhat)
print('matthews: %.3f' % matthews)

In [None]:
# Record the run under the configured experiment: hyperparameters, the
# evaluation metrics computed above, and the fitted pipeline as an artifact.
mlflow.set_experiment(MLFLOW_EXP_NAME)
with mlflow.start_run(run_name='RandomForest'):
    # Batched calls replace the per-key loop and the five separate
    # metric calls.
    mlflow.log_params(rf_parameters)
    mlflow.log_metrics({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'MAE': mae,
        'matthews': matthews,
    })

    mlflow.sklearn.log_model(pipe, artifact_path='model')