In [1]:
import itertools

import numpy as np
import pandas as pd
from tqdm import tqdm

# Import sklearn
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score,
                             f1_score,
                             precision_score,
                             recall_score,
                             roc_auc_score)
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Import mlflow
from mlflow import (
    end_run,
    log_figure,
    log_metric,
    log_param,
    set_experiment,
    set_tags,
    set_tracking_uri,
    start_run)

from mlflow.sklearn import log_model

In [None]:
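# Optional: uncomment to send runs to a local SQLite tracking store instead of MLflow's default location.
#set_tracking_uri('sqlite:///mlflow.db')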

# Importing the dataset

In [2]:
# Load the training CSV (the path is relative to the notebook's working directory),
# then show its dimensions and first rows as a quick sanity check.
train = pd.read_csv('./train.csv')
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Creating the training and testing pipeline

In [3]:
# Features used for modelling; 'Survived' is the target and is not among them.
cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
y = train['Survived']
X = train[cols]

# Stratified 70/30 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print("#"*30)
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (623, 5)
y_train shape: (623,)
##############################
X_test shape: (268, 5)
y_test shape: (268,)


In [4]:
# Column groups routed to each preprocessing branch.
num_vars = ['Age', 'Fare']
cat_vars = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the mode, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown="ignore", an unseen category
# should encode as all zeros, i.e. the same row as the dropped level — confirm
# this is acceptable for the scikit-learn version in use.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(handle_unknown="ignore", drop='first')),
])

# Apply each branch to its own columns; columns not listed are not passed through.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_vars),
    ("cat", categorical_transformer, cat_vars),
])
preprocessor

## Here I build a custom version of grid search: at each iteration it fits a model and records the result in MLflow, so the runs can be compared later


ref.: https://mlflow.org/docs/1.8.0/index.html

In [5]:
def metrics(y_true, y_pred, y_proba):
    """Compute the classification scores that are logged for every run.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    y_proba : 2-D array of predicted probabilities; column 1 is used as the
        score for ROC AUC (presumably the positive class — confirm for
        non-binary targets).

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall, roc_auc)`` in that order.
    """
    # Label-based scores, evaluated in the order they are returned.
    label_scorers = (accuracy_score, f1_score, precision_score, recall_score)
    label_scores = tuple(scorer(y_true, y_pred) for scorer in label_scorers)

    # ROC AUC is the only score computed from probabilities rather than labels.
    auc = roc_auc_score(y_true, y_proba[:, 1])
    return (*label_scores, auc)

In [6]:
def register_mlflow(exp_name,
                    pipe,
                    hyperparams,
                    y_test,
                    y_pred,
                    y_proba):
    """Record one fitted pipeline as an MLflow run.

    Opens a run named ``exp_name``, logs every entry of ``hyperparams`` as a
    parameter, logs the scores returned by ``metrics`` and stores ``pipe``
    under the artifact path ``"model"``.

    Parameters
    ----------
    exp_name : name given to the MLflow run.
    pipe : fitted pipeline to store with the run.
    hyperparams : mapping of parameter name to value.
    y_test, y_pred, y_proba : true labels, predicted labels and predicted
        probabilities, forwarded to ``metrics``.
    """
    with start_run(run_name=exp_name):
        # Scores are computed inside the run so a failure is attached to it.
        acc, f1, precision, recall, roc_auc = metrics(y_test, y_pred, y_proba)
        scores = {
            "acc": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "roc_auc": roc_auc,
        }

        # Parameters first, then metrics, then the model artifact.
        for param_name, param_value in hyperparams.items():
            log_param(param_name, param_value)

        for metric_name, metric_value in scores.items():
            log_metric(metric_name, metric_value)

        log_model(pipe, "model")

In [7]:
def mlflow_grid_search(exp_name,
                     preprocessor,
                     clf,
                     X_train,
                     y_train,
                     X_test,
                     y_test,
                     param_grid,
                     fixed_hyperparams=None):
    """Fit one pipeline per hyperparameter combination and log each as an MLflow run.

    For every combination produced by ``ParameterGrid(param_grid)`` a
    ``preprocessor -> clf`` pipeline is fitted on the training data, used to
    predict on the test data, and handed to ``register_mlflow``.

    Parameters
    ----------
    exp_name : name of the MLflow experiment (also used as each run's name).
    preprocessor : scikit-learn transformer placed before the classifier.
        It is cloned for every combination, so the object passed in is never
        fitted or modified by this function.
    clf : classifier class (not an instance); instantiated with the grid
        values plus ``fixed_hyperparams``. It must provide ``predict_proba``.
    X_train, y_train : data used to fit each pipeline.
    X_test, y_test : data used to compute the logged metrics.
    param_grid : grid definition accepted by ``ParameterGrid``.
    fixed_hyperparams : optional mapping of keyword arguments passed unchanged
        to every classifier. Defaults to no extra arguments.

    Notes
    -----
    The logged metrics are computed on ``X_test``/``y_test``. If the best run
    is picked using those metrics, its score is an optimistic estimate.
    """
    # Use None as the default to avoid a mutable dict shared across calls;
    # copy so the caller's mapping cannot change under us mid-search.
    fixed_hyperparams = dict(fixed_hyperparams or {})

    set_experiment(exp_name)
    list_param = list(ParameterGrid(param_grid))
    for hyperparams in tqdm(list_param):
        c = clf(**hyperparams, **fixed_hyperparams)

        # Give each pipeline its own unfitted copy of the preprocessor rather
        # than refitting the caller's single instance on every iteration.
        pipe = Pipeline([('preprocessor', clone(preprocessor)),
                         ('clf', c)])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        y_proba = pipe.predict_proba(X_test)

        # Log the full set of arguments the classifier was built with, so a
        # run can be reproduced from its recorded parameters alone.
        logged_params = {**hyperparams, **fixed_hyperparams}

        # register in MLFLOW
        register_mlflow(exp_name,
                        pipe,
                        logged_params,
                        y_test,
                        y_pred,
                        y_proba)

### Using KNN

In [8]:
# Search space for KNN: odd neighbour counts from 3 to 19, both weighting
# schemes, and Minkowski power p=1 and p=2.
param_grid = {
    'n_neighbors': np.arange(3, 21, 2),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Passed unchanged to every classifier in the search.
fixed_hyperparams = {'n_jobs': -1}

# Expand the grid and preview the first five combinations.
list_param = list(ParameterGrid(param_grid))
list_param[:5]

[{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'},
 {'n_neighbors': 3, 'p': 1, 'weights': 'distance'},
 {'n_neighbors': 3, 'p': 2, 'weights': 'uniform'},
 {'n_neighbors': 3, 'p': 2, 'weights': 'distance'},
 {'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}]

In [9]:
# Run the KNN search: one pipeline is fitted per combination in param_grid and
# each is logged as a run under the 'KNN' MLflow experiment.
mlflow_grid_search(exp_name='KNN',
                 preprocessor=preprocessor,
                 clf=KNeighborsClassifier, 
                 X_train=X_train, 
                 y_train=y_train, 
                 X_test=X_test, 
                 y_test=y_test, 
                 param_grid=param_grid, 
                 fixed_hyperparams=fixed_hyperparams)

2022/11/04 08:22:27 INFO mlflow.tracking.fluent: Experiment with name 'KNN' does not exist. Creating a new experiment.
100%|█████████████████████████████████████████████████| 36/36 [01:18<00:00,  2.17s/it]


### Using Random Forest

In [10]:
# Search space for Random Forest: 2-4 trees, three split criteria, depth 2-4
# and three class-weighting options.
# NOTE(review): 'log_loss' is only accepted by newer scikit-learn releases —
# confirm against the installed version.
param_grid = {
    'n_estimators': np.arange(2, 5),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': np.arange(2, 5),
    'class_weight': ['balanced', 'balanced_subsample', None],
}

# Passed unchanged to every classifier in the search.
fixed_hyperparams = {'n_jobs': -1}

# Expand the grid and preview the first five combinations.
list_param = list(ParameterGrid(param_grid))
list_param[:5]

[{'class_weight': 'balanced',
  'criterion': 'gini',
  'max_depth': 2,
  'n_estimators': 2},
 {'class_weight': 'balanced',
  'criterion': 'gini',
  'max_depth': 2,
  'n_estimators': 3},
 {'class_weight': 'balanced',
  'criterion': 'gini',
  'max_depth': 2,
  'n_estimators': 4},
 {'class_weight': 'balanced',
  'criterion': 'gini',
  'max_depth': 3,
  'n_estimators': 2},
 {'class_weight': 'balanced',
  'criterion': 'gini',
  'max_depth': 3,
  'n_estimators': 3}]

In [11]:
# Run the Random Forest search: one pipeline is fitted per combination in
# param_grid and each is logged as a run under the 'Random Forest' MLflow experiment.
mlflow_grid_search(exp_name='Random Forest',
                 preprocessor=preprocessor,
                 clf=RandomForestClassifier, 
                 X_train=X_train, 
                 y_train=y_train, 
                 X_test=X_test, 
                 y_test=y_test, 
                 param_grid=param_grid, 
                 fixed_hyperparams=fixed_hyperparams)

2022/11/04 08:31:15 INFO mlflow.tracking.fluent: Experiment with name 'Random Forest' does not exist. Creating a new experiment.
100%|█████████████████████████████████████████████████| 81/81 [02:43<00:00,  2.01s/it]
