# Read results datasets

## Imports

In [30]:
import itertools
import os.path
from typing import List, Dict

import numpy as np
import pandas as pd

from src.models.config import param_layers, param_grid_mlp
from src.utils.const import MODEL_RESULTS_CSV
from typing import Tuple

### Useful path to data

In [2]:
RESULTS_FOLDER = os.path.join('..', MODEL_RESULTS_CSV)

## Read output csv

In [3]:
mlp_all = pd.read_csv(os.path.join(RESULTS_FOLDER, 'out_mlp_all.csv'))
svm_res = pd.read_csv(os.path.join(RESULTS_FOLDER, 'out_svm.csv'))
naive_res = pd.read_csv(os.path.join(RESULTS_FOLDER, 'best_out_naive_bayes.csv'))
tree_res = pd.read_csv(os.path.join(RESULTS_FOLDER, 'best_out_tree_based.csv'))

## Find best configuration of MLP

### Utility function to explore results DataFrame

Find best configuration for each fold

In [4]:
def find_max_f1_cfg(df: pd.DataFrame) -> List:
    cfg = []
    for fold in df['fold'].unique():
        idx = df[df['fold'] == fold]['f1_test'].idxmax()
        cfg.append(df.iloc[idx]['cfg'])
    cfgs = np.unique(np.array(cfg))
    return cfgs

In [5]:
best_cfg = find_max_f1_cfg(mlp_all)
print(f'Best configuration ID: {best_cfg}')

Best configuration ID: [ 3. 17. 18. 35.]


Function that calculate the mean and the confidence interval with an accuracy of 90%

In [6]:
def mu_confidence_interval(data: np.ndarray) -> {}:
    t = 1.64
    mu = np.mean(data)
    standard_deviation = np.std(data)
    M = data.shape[0]
    t_student = t * standard_deviation / np.sqrt(M)
    first_interval = mu - t_student
    second_interval = mu + t_student
    return {
        'mu': mu,
        't_student': t_student,
        'first_interval': first_interval,
        'second_interval': second_interval
    }

In order to find the best configuration between the indexes that we found previously, this function calculate the mean on each configuration between the folds, and select the one which has the higher mean.

In [27]:
def find_best_conf(lst_conf, df: pd.DataFrame) -> dict:
    conf = []
    for idx, cfg in enumerate(lst_conf):
        conf.append(
            {
                'f1': mu_confidence_interval(df[df['cfg'] == cfg]['f1_test']),
                'loss': mu_confidence_interval(df[df['cfg'] == cfg]['f1_test']),
                'acc': mu_confidence_interval(df[df['cfg'] == cfg]['f1_test'])
            }
        )
        conf[idx]['conf'] = cfg
    max = conf[0]

    for elm in conf:
        if max['f1']['mu'] < elm['f1']['mu']:
            max = elm
    return max

Find hyperparam configuration from the index.

In [33]:
def get_best_configuration_mlp(cfg:int)->Tuple:
    hyper_parameters_model_all = itertools.product(
        param_layers['input_act'],
        param_layers['hidden_act'],
        param_layers['hidden_size'],
        param_layers['num_hidden_layers'],
        param_layers['dropout'],
        param_layers['batch_norm'],
        param_layers['output_fn'],
        param_grid_mlp['num_epochs'],
        param_grid_mlp['starting_lr'],
        param_grid_mlp['batch_size'],
        param_grid_mlp['optim'],
        param_grid_mlp['momentum'],
        param_grid_mlp['weight_decay'],
    )
    return list(hyper_parameters_model_all)[cfg]

Utility function to print the calculated statistics and the relative configuration

In [39]:
def print_statistics_model(dictionary: Dict, model: str):
    print(
        f"Best configuration {model} mean metrics:\n"
        f"f1_score: {dictionary['f1']['mu']} ±{dictionary['f1']['t_student']}\n"
        f"loss: {dictionary['loss']['mu']} ±{dictionary['loss']['t_student']}\n"
        f"acc: {dictionary['acc']['mu']} ±{dictionary['acc']['t_student']}\n\n"
        f"Best hyperparams configuration:"
        )
    if model== "MLP":
        best_cfg_mlp_all=get_best_configuration_mlp(int(dictionary['conf']))
        for idx, key in enumerate(param_layers.keys()):
            print(f"{key}: {best_cfg_mlp_all[idx]}")
        for idx, key in enumerate(param_grid_mlp.keys(), 7):
            print(f"{key}: {best_cfg_mlp_all[idx]}")
    else:
        print(f"{dictionary['conf']}")

### Results best cfg mlp

In [40]:
res_mlp = find_best_conf(best_cfg, mlp_all)
print_statistics_model(res_mlp, "MLP")

Best configuration MLP mean metrics:
f1_score: 0.8610175579840659 ±0.007531035084099725
loss: 0.8610175579840659 ±0.007531035084099725
acc: 0.8610175579840659 ±0.007531035084099725

Best hyperparams configuration:
input_act: LeakyReLU(negative_slope=0.01)
hidden_act: LeakyReLU(negative_slope=0.01)
hidden_size: 512
num_hidden_layers: 3
dropout: 0.2
batch_norm: True
output_fn: None
num_epochs: 200
starting_lr: 0.001
batch_size: 128
optim: <class 'torch.optim.adam.Adam'>
momentum: 0.9
weight_decay: 1e-05


## Scikit learn best cfg

In [11]:
def calculate_statistics_sklearn(df: pd.DataFrame, model: str) -> Dict:
    res = {'f1': mu_confidence_interval(df[df['model'] == model]['f1_test']),
           'loss': mu_confidence_interval(df[df['model'] == model]['loss_test']),
           'acc': mu_confidence_interval(df[df['model'] == model]['acc_test']),
           'conf': df[df['model'] == model]['cfg'].unique()}
    return res

### Tree based

#### Random forest

In [13]:
res_random_forest = calculate_statistics_sklearn(tree_res, 'random_forest_classifier')
print_statistics_model(res_random_forest, 'random forest classifier')

Best configuration random forest classifier mean metrics:
f1_score: 0.609756989767005 ±0.005027550044208888
loss: 0.39225757744490736 ±0.005489355263284579
acc: 0.6077424225550925 ±0.00548935526328459

Best hyperparams configuration:
["{'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 700}"]


#### Decision tree

In [14]:
res_decision_tree = calculate_statistics_sklearn(tree_res, 'decision_tree_classifier')
print_statistics_model(res_decision_tree, 'decision tree classifier')

Best configuration decision tree classifier mean metrics:
f1_score: 0.6402913493313054 ±0.003246723679220664
loss: 0.36008162828469237 ±0.003119961961986912
acc: 0.6399183717153076 ±0.0031199619619869134

Best hyperparams configuration:
["{'criterion': 'entropy', 'max_depth': 15}"]


## Naive bayes

#### Gaussian naive bayes

In [15]:
res_gaussian_nb = calculate_statistics_sklearn(naive_res, 'gaussian_nb')
print_statistics_model(res_gaussian_nb, 'gaussianNB')

Best configuration gaussianNB mean metrics:
f1_score: 0.4526802764969494 ±0.008227757785451987
loss: 0.5481864029029817 ±0.008014625115326288
acc: 0.4518135970970182 ±0.008014625115326274

Best hyperparams configuration:
["{'var_smoothing': 8.111308307896872e-07}"]


#### QDA

In [16]:
res_qda = calculate_statistics_sklearn(naive_res, 'qda')
print_statistics_model(res_qda, 'QDA')

Best configuration QDA mean metrics:
f1_score: 0.5217235201683785 ±0.006131748988284651
loss: 0.4649740608914607 ±0.0063746149205477646
acc: 0.5350259391085392 ±0.00637461492054775

Best hyperparams configuration:
["{'reg_param': 0.001, 'tol': 0.0001}"]


## SVM

In [20]:
res_svm = calculate_statistics_sklearn(svm_res, 'svc')
print_statistics_model(res_svm, 'SVM')

Best configuration SVM mean metrics:
f1_score: 0.8286206857647119 ±0.003255026164399803
loss: 0.17076133850717423 ±0.0031192905594531074
acc: 0.8292386614928257 ±0.0031192905594531065

Best hyperparams configuration:
["{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}"]
