# Model training

## Dependency management

In [16]:
import numpy as np

In [39]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             precision_score, recall_score)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

## MinIO bucket setup

In [None]:
# `os` is imported in this cell because no earlier cell imports it; without
# it the lookups below raise NameError on a fresh kernel.
import os

# Credentials come from the environment so that no secret is stored in the
# notebook. A missing variable raises KeyError right here.
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
# Endpoint URL passed to the S3 client when the dataset is read.
MINIO_ENDPOINT = 'http://minio.idoml.uni.lux'

# Key of the feature dataset; presumably combined with the bucket name and a
# `.csv` suffix to form the object path — confirm against the upload step.
FEATURE_FILENAME = 'lcld/feature_data'

In [None]:
bucket_name = 'datasets'
# The object path is built from FEATURE_FILENAME, the only dataset key this
# notebook defines. CLEANED_FILENAME is never assigned anywhere in the
# notebook, so referencing it raised NameError.
filepath = f'{bucket_name}/{FEATURE_FILENAME}.csv'

## Load pandas dataframe from S3

In [None]:
# Read the CSV at `filepath` through the S3 protocol, authenticating with the
# configured key pair and pointing the client at the MinIO endpoint.
df = pd.read_csv(
    's3://' + filepath,
    storage_options=dict(
        key=AWS_ACCESS_KEY_ID,
        secret=AWS_SECRET_ACCESS_KEY,
        token=None,
        client_kwargs=dict(endpoint_url=MINIO_ENDPOINT),
    ),
)

In [55]:
# Load the feature table from a CSV given by a relative path, using the first
# column as the index.
# NOTE(review): this rebinds `df`, replacing the frame loaded from S3 by the
# preceding cell — confirm which of the two sources is intended.
df = pd.read_csv(filepath_or_buffer='feature.csv', index_col=0)

In [56]:
# Remove the `issue_d` column. The drop re-binds `df` and ignores a missing
# column, so re-running this cell does not fail with KeyError once the column
# is already gone.
df = df.drop(columns='issue_d', errors='ignore')

In [65]:
# Display the per-feature category ranges next to the categorical feature
# names they belong to.
# NOTE(review): both names are assigned in a later cell, so this cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
cat_range, cat_feature

([array([0, 1]),
  array([0, 1]),
  array([0, 1, 2, 3]),
  array([0, 1, 2]),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])],
 ['initial_list_status',
  'application_type',
  'home_ownership',
  'verification_status',
  'purpose'])

In [74]:
# Rebind `df` to an independent copy of itself (deep=True is the default of
# DataFrame.copy, spelled out here for clarity).
df = df.copy(deep=True)

In [77]:

    
# Every column except the target is a model input. The target is excluded by
# name rather than by position, so it cannot end up among the features when it
# is not the last column; a missing target raises KeyError.
feature = df.columns.drop('charged_off')

cat_feature = [
    "initial_list_status",
    "application_type",
    "home_ownership",
    "verification_status",
    "purpose",
]

# One list of consecutive integer codes (min..max inclusive) per categorical
# feature. The column is viewed as float before taking min/max: once the
# columns have been cast to 'category' below, min()/max() on the unordered
# categorical raises TypeError, which made this cell fail when re-run.
cat_range = [
    list(np.arange(int(col.min()), int(col.max()) + 1))
    for col in (df[f].astype(float) for f in cat_feature)
]

for f in cat_feature:
    df[f] = df[f].astype('category')

# Keep the numeric columns in the order they have in `feature`. A set
# difference orders them arbitrarily from one kernel session to the next,
# which changes the column order of the transformed matrix between runs.
num_feature = [name for name in feature if name not in cat_feature]

# Scale the numeric columns and one-hot encode the categorical ones against
# the fixed category lists computed above (binary features keep one column).
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_feature),
        (
            "cat",
            OneHotEncoder(
                sparse_output=False,
                handle_unknown="ignore",
                drop="if_binary",
                categories=cat_range,
            ),
            cat_feature,
        ),
    ],
    sparse_threshold=0,
    remainder="passthrough",
    n_jobs=-1,
)

# NOTE(review): the transformer is fitted on every row, including the rows
# that are later held out for testing — consider fitting on the training
# split only.
transformer.fit(df[feature])

In [78]:

# Hyper-parameters unpacked into RandomForestClassifier by the training cells.
rf_parameters = dict(
    n_estimators=125,
    min_samples_split=6,
    min_samples_leaf=2,
    max_depth=10,
    bootstrap=True,
    class_weight="balanced",
)


In [82]:
# Train and evaluate the random forest locally, without MLflow tracking.

# Features are all columns except the target, selected by name so that the
# target can never be part of X regardless of where the column sits.
# NOTE(review): the run recorded for this cell scored 1.000 on every metric,
# which usually points to target leakage — inspect the feature columns.
feature = df.columns.drop('charged_off')

# NOTE(review): factorize numbers the labels by order of first appearance, so
# which class is encoded as 1 (the class precision/recall report on) depends
# on the value in the first row — confirm this matches the intended positive
# class.
y = pd.factorize(df['charged_off'])[0]
X = df[feature]

# The split is seeded so the reported metrics are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

model = RandomForestClassifier(
    **rf_parameters,
    random_state=42,
    n_jobs=1,
)

model.fit(transformer.transform(X_train), y_train)

# make predictions
yhat = model.predict(transformer.transform(X_test))

# With 0/1 labels the mean absolute error equals the misclassification rate.
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
accuracy = accuracy_score(y_test, yhat)
print('accuracy: %.3f' % accuracy)
precision = precision_score(y_test, yhat)
print('precision: %.3f' % precision)
recall = recall_score(y_test, yhat)
print('recall: %.3f' % recall)



MAE: 0.000
accuracy: 1.000
precision: 1.000
recall: 1.000


In [None]:
# Train and evaluate the random forest inside an MLflow run.

# Same values as the constants assigned in the pipeline cell at the end of the
# notebook; they are set here as well so this cell does not raise NameError
# when the notebook is executed top to bottom.
RANDOM_SEED = 42
NB_CORES = -1

# `issue_d` has already been removed by an earlier cell; ignoring its absence
# avoids the KeyError the in-place drop raised here.
df = df.drop(columns='issue_d', errors='ignore')

# Features are all columns except the target, selected by name so that the
# target can never be part of X regardless of where the column sits.
feature = df.columns.drop('charged_off')

# NOTE(review): factorize numbers the labels by order of first appearance, so
# which class is encoded as 1 depends on the value in the first row — confirm.
y = pd.factorize(df['charged_off'])[0]
X = df[feature]

# The split is seeded so the logged metrics are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_SEED
)

# NOTE(review): the tracking URI and the experiment are configured in a later
# cell; until that cell has run, this run is recorded under MLflow's default
# tracking configuration — confirm the intended execution order.
with mlflow.start_run(run_name='RandomForest'):
    # Log all hyper-parameters in a single call.
    mlflow.log_params(rf_parameters)

    model = RandomForestClassifier(
        **rf_parameters,
        random_state=RANDOM_SEED,
        n_jobs=NB_CORES,
    )

    model.fit(transformer.transform(X_train), y_train)

    # make predictions
    yhat = model.predict(transformer.transform(X_test))

    mae = mean_absolute_error(y_test, yhat)
    print('MAE: %.3f' % mae)
    accuracy = accuracy_score(y_test, yhat)
    print('accuracy: %.3f' % accuracy)
    precision = precision_score(y_test, yhat)
    print('precision: %.3f' % precision)
    recall = recall_score(y_test, yhat)
    print('recall: %.3f' % recall)

    # MAE is printed for inspection only; the run stores the three
    # classification metrics below.
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)

In [None]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, mean_absolute_error,
                             precision_score, recall_score)
from sklearn.model_selection import train_test_split

# Seed handed to the estimator as `random_state`, and the value passed as its
# `n_jobs` argument.
RANDOM_SEED = 42
NB_CORES = -1

mlflow.set_tracking_uri('http://mlflow:5000')

# set_experiment activates the named experiment and creates it first when it
# does not exist yet, so no separate create_experiment call is needed. The
# previous bare `except: pass` around that call also hid unrelated failures
# such as an unreachable tracking server.
mlflow.set_experiment('credit_scoring')


def _model(data_task, transformer_task, **kwargs):
    """Train a random forest on the upstream feature frame and log it to MLflow.

    The data frame and the fitted transformer are pulled from XCom, the frame
    is split into train and test parts, a RandomForestClassifier is fitted on
    the transformed training part, and accuracy, precision and recall on the
    test part are printed and logged to an MLflow run named 'RandomForest'.

    Args:
        data_task: task id whose XCom value is the feature DataFrame. The
            frame must contain the columns 'issue_d' and 'charged_off'.
        transformer_task: task id whose XCom value is an already fitted
            transformer exposing ``transform``.
        **kwargs: must contain 'ti', an object providing ``xcom_pull``
            (presumably the Airflow task instance — confirm against the DAG).

    Returns:
        None. Results are reported through stdout and MLflow only.

    Raises:
        KeyError: if 'ti' is missing from ``kwargs`` or a required column is
            missing from the frame.
    """
    df = kwargs['ti'].xcom_pull(task_ids=data_task)
    transformer = kwargs['ti'].xcom_pull(task_ids=transformer_task)

    rf_parameters = {
        "n_estimators": 125,
        "min_samples_split": 6,
        "min_samples_leaf": 2,
        "max_depth": 10,
        "bootstrap": True,
        "class_weight": "balanced",
    }

    # Work on a new frame instead of mutating the pulled object in place.
    df = df.drop(columns='issue_d')

    # Features are all columns except the target, selected by name so that the
    # target can never be part of X regardless of where the column sits.
    feature = df.columns.drop('charged_off')

    # NOTE(review): factorize numbers the labels by order of first appearance,
    # so which class is encoded as 1 depends on the value in the first row —
    # confirm this matches the intended positive class.
    y = pd.factorize(df['charged_off'])[0]
    X = df[feature]

    # The split is seeded so the logged metrics are reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=RANDOM_SEED
    )

    with mlflow.start_run(run_name='RandomForest'):
        # Log all hyper-parameters in a single call.
        mlflow.log_params(rf_parameters)

        model = RandomForestClassifier(
            **rf_parameters,
            random_state=RANDOM_SEED,
            n_jobs=NB_CORES,
        )

        model.fit(transformer.transform(X_train), y_train)

        # make predictions
        yhat = model.predict(transformer.transform(X_test))

        mae = mean_absolute_error(y_test, yhat)
        print('MAE: %.3f' % mae)
        accuracy = accuracy_score(y_test, yhat)
        print('accuracy: %.3f' % accuracy)
        precision = precision_score(y_test, yhat)
        print('precision: %.3f' % precision)
        recall = recall_score(y_test, yhat)
        print('recall: %.3f' % recall)

        # MAE is printed for inspection only; the run stores the three
        # classification metrics below.
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metric('precision', precision)
        mlflow.log_metric('recall', recall)