# MLflow Training

In [1]:
def train(in_n_estimators=None, in_max_depth=None):
    """Train a random-forest credit-score classifier and log the run to MLflow.

    Reads ``../data/train.csv``, passes it through
    ``creditscore.preprocess.preprocess_pipeline``, drops rows with missing
    values, one-hot encodes the features, makes a stratified 70/30
    train/test split, fits a ``RandomForestClassifier`` and records the
    hyper-parameters, weighted metrics and fitted model in a new MLflow run.

    Parameters
    ----------
    in_n_estimators : int, float, str or None
        Number of trees; converted with ``int()``. ``None`` selects the
        default of 100.
    in_max_depth : int, float, str or None
        Maximum tree depth; converted with ``int()``. ``None`` selects the
        default of 3.

    Returns
    -------
    None
        Metrics are printed and logged to MLflow rather than returned.

    Raises
    ------
    Exception
        Any error raised by ``pandas.read_csv`` is logged and re-raised, so
        the caller sees the real cause instead of a later ``NameError``.
    """
    # Imports are kept function-local, as in the original cell, so the
    # function stays self-contained.
    import warnings
    import sys

    import pandas as pd
    import numpy as np
    from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    import mlflow
    import mlflow.sklearn

    import logging

    # Make the project-local package importable; guard so that repeated
    # calls do not keep prepending duplicate entries to sys.path.
    package_dir = '../credit-score'
    if package_dir not in sys.path:
        sys.path.insert(0, package_dir)
    from creditscore import preprocess as preproc

    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        """Return (precision, recall, accuracy, f1); all but accuracy are class-weighted."""
        precision = precision_score(actual, pred, average='weighted')
        recall = recall_score(actual, pred, average='weighted')
        accuracy = accuracy_score(actual, pred)
        f1 = f1_score(actual, pred, average='weighted')
        return precision, recall, accuracy, f1

    # NOTE: both of these change process-wide state (warning filters and
    # NumPy's global RNG), not just this function's.
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the csv file
    csv_path = "../data/train.csv"
    try:
        data = pd.read_csv(csv_path)
    except Exception:
        # logger.exception already attaches the traceback; re-raise because
        # nothing below can work without the data.
        logger.exception("Unable to read training CSV at %s", csv_path)
        raise

    data = preproc.preprocess_pipeline(data).dropna()

    data_x = data.drop(["Credit_Score"], axis=1)
    # Use a 1-D Series as the target: estimators expect y of shape
    # (n_samples,), and a one-column DataFrame triggers a conversion warning.
    data_y = data["Credit_Score"]

    data_x = pd.get_dummies(data_x)

    # Split the data into training and test sets. (0.7, 0.3) split.
    train_x, test_x, train_y, test_y = train_test_split(data_x,
                                                        data_y,
                                                        test_size=0.3,
                                                        stratify=data_y,
                                                        random_state=40,
                                                        shuffle=True)

    # Fall back to the defaults when a hyper-parameter is not provided.
    # (The previous `float(x) is None` test could never be true and raised
    # TypeError for None.)
    n_estimators = 100 if in_n_estimators is None else int(in_n_estimators)
    max_depth = 3 if in_max_depth is None else int(in_max_depth)

    with mlflow.start_run():
        # Fit the random forest
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        rf.fit(train_x, train_y)

        # Evaluate Metrics
        predicted = rf.predict(test_x)
        (precision, recall, accuracy, f1) = eval_metrics(test_y, predicted)

        # Print out metrics
        print("Random Forest model (n_estimators=%f, max_depth=%f):" % (n_estimators, max_depth))
        print("  Precision: %s" % precision)
        print("  Recall: %s" % recall)
        print("  Accuracy: %s" % accuracy)
        print("  F1: %s" % f1)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("accuracy", accuracy)
        # Previously logged `accuracy` under the "f1" key.
        mlflow.log_metric("f1", f1)

        mlflow.sklearn.log_model(rf, "model")

In [2]:
# Baseline run: 100 trees, max depth 2.
train(100, 2)

Random Forest model (n_estimators=100.000000, max_depth=2.000000):
  Precision: 0.2784853560178507
  Recall: 0.5277171174197883
  Accuracy: 0.5277171174197883
  F1: 0.3645771233985958


In [3]:
# Same forest size at depth 3; the recorded metrics match the depth-2 run exactly.
train(100, 3)

Random Forest model (n_estimators=100.000000, max_depth=3.000000):
  Precision: 0.2784853560178507
  Recall: 0.5277171174197883
  Accuracy: 0.5277171174197883
  F1: 0.3645771233985958


In [4]:
# 200 trees, depth 5: recorded metrics are still identical to the shallower runs.
# NOTE(review): weighted precision (0.278) equals accuracy squared (0.528**2), which is
# what a model predicting a single class for every row would produce - TODO confirm.
train(200, 5)

Random Forest model (n_estimators=200.000000, max_depth=5.000000):
  Precision: 0.2784853560178507
  Recall: 0.5277171174197883
  Accuracy: 0.5277171174197883
  F1: 0.3645771233985958


In [5]:
# 500 trees, depth 10: first recorded run whose metrics differ from the shallow-tree runs.
train(500, 10)

Random Forest model (n_estimators=500.000000, max_depth=10.000000):
  Precision: 0.49326780493541095
  Recall: 0.5717285402318159
  Accuracy: 0.5717285402318159
  F1: 0.4948155687479652


In [6]:
# Same 500 trees with the depth doubled to 20.
train(500, 20)

Random Forest model (n_estimators=500.000000, max_depth=20.000000):
  Precision: 0.6404960785021463
  Recall: 0.6175877708718294
  Accuracy: 0.6175877708718294
  F1: 0.5840993372773314


In [7]:
# Fewer trees (200) but deeper (25); recorded accuracy is higher than the 500-tree depth-20
# run, so depth appears to matter more than forest size here - TODO confirm.
train(200, 25)

Random Forest model (n_estimators=200.000000, max_depth=25.000000):
  Precision: 0.6469750192748265
  Recall: 0.6391735259533009
  Accuracy: 0.6391735259533009
  F1: 0.6210750595180935


In [8]:
# 200 trees, depth 30: the best of the recorded runs on every printed metric.
train(200, 30)

Random Forest model (n_estimators=200.000000, max_depth=30.000000):
  Precision: 0.658352435311378
  Recall: 0.659751385855871
  Accuracy: 0.659751385855871
  F1: 0.6579153951870572
