### Import

In [None]:
import pandas as pd
import numpy as np
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import pickle

sys.path.append(os.path.abspath(os.path.join(os.path.dirname("notebooks"), '..')))

from utils.func_preprocessing import *
from utils.func_classification import *
from utils.func_training import *
from utils.utils import *

import mlflow
from mlflow.models import infer_signature

from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from config.config import (
    DATASET_NAME,
    YEARS_TRAINING,
    LIST_TYPE,
    PREPROCESSING_TRANSFORMATION,
    CLASS_UNDER_SAMPLE,
    MODEL,
    MODEL_NAME,
    PARAMS,
    SAVE_PREDICT_TO_XLSX,
    SAVE_PKL_MODEL,
    TRAINING_INFO,
    N_CLASS
)

if PREPROCESSING_TRANSFORMATION:
    from sklearn.preprocessing import MinMaxScaler
    from imblearn.under_sampling import RandomUnderSampler
    from collections import Counter
    
if CLASS_UNDER_SAMPLE:
    from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Load the dataset named in the configuration.
dataset_path = f"../dataset/{DATASET_NAME}.csv"
df = pd.read_csv(dataset_path)

# Remove the 'Unnamed: 0' column when present; errors="ignore" makes the
# drop a no-op when the column is absent.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Cast the `intero_diagnosi_princip` column to int.
df["intero_diagnosi_princip"] = df["intero_diagnosi_princip"].astype(int)

In [None]:
# Colonna target con classificazione macropat
# df['y'] = df['intero_diagnosi_princip'].apply(classificazione_macropat)
# df['y'] = df['intero_diagnosi_princip'].apply(classificazione_poche_macropat)
# df['y'] = df.apply(lambda row: classificazione_tre_macropat(row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']), axis=1)
# df['y'] = df.apply(lambda row: classificazione_sei_macropat(row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']), axis=1)
# df['y'] = df.apply(lambda row: classificazione_sei_macropat_v2(row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']), axis=1)
# df['y'] = df.apply(lambda row: classificazione_sette_macropat(row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']), axis=1)
# df['y'] = df.apply(lambda row: classificazione_18_macropat(row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']), axis=1)
# df['y'] = df.apply(lambda row: class_macropat_letteremappate(row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']), axis=1)
# df['y'] = df.y.apply(reduce_6class_letteremappate)

### Preprocessing

Define target column

In [None]:
# Build the target column `y` from the diagnosis letter and integer columns.
if N_CLASS == 6:
    # First map each (letter, integer) pair through
    # `class_macropat_letteremappate`, then collapse the result with
    # `reduce_6class_letteremappate`.
    df['y'] = df.apply(
        lambda row: class_macropat_letteremappate(
            row['lettera_diagnosi_princip'], row['intero_diagnosi_princip']
        ),
        axis=1,
    )
    df['y'] = df['y'].apply(reduce_6class_letteremappate)
elif 'y' not in df.columns:
    # Every later cell reads df['y']; fail here with a clear message rather
    # than with an opaque KeyError at the train/test split.
    raise ValueError(
        f"No target column 'y' available: N_CLASS={N_CLASS!r} is not handled "
        "by this cell (only N_CLASS == 6 builds the target)."
    )

In [None]:
# Drop the remaining diagnosis columns now that `y` has been built, so they
# are not passed on as features.
col_to_drop = [
    "COD_DIAGNOSI_PRINCIPALE",
    "lettera_diagnosi_princip",
    "intero_diagnosi_princip",
    "decimali_diagnosi_princip",
]
df = df.drop(col_to_drop, axis=1)

In [None]:
# Feature selection da KDE:
### Escludere:  'AG_K', 'GLU','LAC', 'PO2', 'HCO3', 'PCO2_T', 'PCO2', 

### Feature eng

In [None]:
# Parse the DATA column so the datetime accessor can be used on it.
df['DATA'] = pd.to_datetime(df['DATA'])

# Day of the year, taken with the vectorised `.dt` accessor instead of a
# per-row Python lambda (faster, and it does not raise on missing dates).
df['day_of_year'] = df['DATA'].dt.dayofyear

# Encode the day of the year as a point on the unit circle ("sin_day",
# "cos_day") using a 365-day period.
df['sin_day'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
df['cos_day'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

# Drop the identifier/date/administrative columns and the helper
# `day_of_year`; 'SESSO', 'ETA' and 'TIPO' are deliberately not in this list.
df = df.drop(columns=['ID_ANAGRAFICA', 'DTN', 'DATA_INGRESSO', 'DATA',
                      'DATA_USCITA', 'NCAMPIONE', 'NACCESSO', 'STATO', 'REPARTO_PZ', 'day_of_year'])

In [None]:
# Report the shape before and after removing unusable rows.
print(df.shape)

# Remove rows with any missing value, then rows whose CBASE holds the
# '.....' placeholder string.
df = df.dropna()
is_placeholder = df["CBASE"] == '.....'
df = df.loc[~is_placeholder]

print(df.shape)

In [None]:
# Keep only rows whose TIPO is "Arterioso" or "Venoso". The explicit copy
# detaches the result from the original frame so that the dtype changes
# below cannot trigger pandas' SettingWithCopyWarning.
df = df.loc[df["TIPO"].isin(["Arterioso", "Venoso"])].copy()

# Target dtype for each column, applied in a single `astype` call.
column_dtypes = {
    # Categorical columns
    "TIPO": "category",
    "SESSO": "category",
    "class_symptom": "category",
    # Integer column
    "ETA": int,
    # Float columns
    "THB2": float,
    "MOSM": float,
    "CBASE": float,
    "METHB": float,
    "O2HB": float,
    "COHB": float,
    "RHB": float,
    "PF": float,
}
df = df.astype(column_dtypes)

In [None]:
# Feature selection: drop columns with high correlation (about 1).
# "PO21" is left out of the first list.
first_batch = ["HCT", "PCO2", "PHT"]

# Second batch (labelled "V2" in the notebook); SO2 was previously SO21.
second_batch = ["THB", "RHB", "SO2"]

df = df.drop(columns=first_batch + second_batch)

### Seleziona arterioso/venoso (optional)

In [None]:
# Keep only the rows whose TIPO is listed in LIST_TYPE.
type_mask = df["TIPO"].isin(LIST_TYPE)
df = df.loc[type_mask]

# When a single type is selected the TIPO column is removed from the frame.
if len(LIST_TYPE) == 1:
    df = df.drop("TIPO", axis=1)

# Train-test split 80-20

In [None]:
# Split the frame into the target (y) and the features (X).
y = df['y']
X = df.drop(columns=['y'])

# 80% training / 20% test, stratified on the target with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Preprocessing trasformazioni (optional)

In [None]:
if PREPROCESSING_TRANSFORMATION:
    # Work on a copy of the training features.
    X_train_transformed = X_train.copy()

    # log(1 + x) on O2HB.
    X_train_transformed['O2HB'] = np.log1p(X_train_transformed['O2HB'])

    # Cast each of these columns to float and apply log(1 + x).
    # NOTE(review): the original comments mention taking the log of the
    # negated value, but the code applies log1p to the value as-is.
    for col in ['COHB', 'GLU', 'LAC', 'P50_ACT', 'PF', 'PO2_T']:
        X_train_transformed[col] = np.log1p(X_train_transformed[col].astype(float))

    # Min-max scaling, fitted on the training data only. `scaler` and
    # `cols_to_scale` are reused by the test-set preprocessing cell.
    cols_to_scale = ['B', 'CBASE', 'METHB', 'CL', 'NA', 'KP', 'HCO3', 'PCO2_T', 'MOSM', 'THB2', 'TO2', 'ETA']
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(X_train_transformed[cols_to_scale])
    X_train_transformed[cols_to_scale] = scaled_values

    # FIO2 is treated as a categorical feature.
    X_train_transformed['FIO2'] = X_train_transformed['FIO2'].astype('category')

    # Round TC to the nearest 0.5, then treat it as categorical too.
    tc_rounded = X_train_transformed['TC'].div(0.5).round().mul(0.5)
    X_train_transformed['TC'] = tc_rounded
    X_train_transformed['TC'] = X_train_transformed['TC'].astype('category')

    X_train = X_train_transformed.copy()


### Bilanciamento Classi

In [None]:
if CLASS_UNDER_SAMPLE:
    # Randomly under-sample the training set ('auto' strategy, fixed seed).
    undersample = RandomUnderSampler(
        sampling_strategy='auto',
        random_state=42,
    )
    X_train_res, y_train_res = undersample.fit_resample(X_train, y_train)

    # Continue with the resampled data as the training set.
    X_train = X_train_res.copy()
    y_train = y_train_res.copy()

# Train model

In [None]:
if MODEL == "Pycaret":
    # `train_pycaret` returns the fitted model together with the artefacts
    # that are plotted and logged to MLflow in later cells.
    (
        model,
        feature_importances_df,
        pycaret_model_name,
        setup_config,
        metrics_df,
    ) = train_pycaret(X_train, y_train, PARAMS, N_CLASS)

### Plot feature importance

In [None]:
if MODEL == "Pycaret":
    # Horizontal bar chart of the feature importances returned by training.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importances_df, ax=ax)
    ax.set_title('Importanza delle Feature')
    fig.tight_layout()
    plt.show()

# Test model

In [None]:
if PREPROCESSING_TRANSFORMATION:
    # Work on a copy of the test features and apply the same steps that the
    # training-set preprocessing cell applies.
    X_test_transformed = X_test.copy()

    # log(1 + x) on O2HB.
    X_test_transformed['O2HB'] = np.log1p(X_test_transformed['O2HB'])

    # Cast to float before log(1 + x), exactly as done for the training set;
    # without the cast a non-numeric dtype would only fail at test time.
    for col in ['COHB', 'GLU', 'LAC', 'P50_ACT', 'PF', 'PO2_T']:
        X_test_transformed[col] = X_test_transformed[col].astype(float)
        X_test_transformed[col] = np.log1p(X_test_transformed[col])

    # Reuse the scaler fitted on the training set (transform only, no refit).
    X_test_transformed[cols_to_scale] = scaler.transform(X_test_transformed[cols_to_scale])

    # FIO2 is treated as a categorical feature.
    # NOTE(review): categories are derived from the test values here, so they
    # may differ from the training categories - confirm the model tolerates it.
    X_test_transformed['FIO2'] = X_test_transformed['FIO2'].astype('category')

    # Round TC to the nearest 0.5 and convert it to categorical.
    X_test_transformed['TC'] = (X_test_transformed['TC'] / 0.5).round() * 0.5
    X_test_transformed['TC'] = X_test_transformed['TC'].astype('category')

    X_test = X_test_transformed.copy()

In [None]:
# Predict on the test set.
y_pred = model.predict(X_test)

# Evaluate: overall accuracy plus the per-class report, both as printable
# text and as a dictionary (the dictionary is reused for MLflow logging).
accuracy = accuracy_score(y_test, y_pred)
D_classification_rep = classification_report(y_test, y_pred, output_dict=True)
classification_rep = classification_report(y_test, y_pred)

print(
    f'Accuracy: {accuracy:.4f}',
    'Classification Report:',
    classification_rep,
    sep='\n',
)

Confusion Matrix

In [None]:
# Confusion matrix on the test set, labelled with the model's classes.
labels = model.classes_
# Pass `labels` explicitly so the matrix always has one row/column per model
# class, in the same order as the DataFrame index and columns below, even if
# a class is absent from both y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
conf_matrix_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)
conf_matrix_df

In [None]:
if SAVE_PREDICT_TO_XLSX:
    # Export the test features together with the predicted class, the true
    # class and one probability column per class.
    test_temp = X_test.copy()
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # One "Prob_<i>" column per column of predict_proba, instead of a
    # hard-coded list of six names that breaks for any other class count.
    prob_columns = [f"Prob_{i}" for i in range(y_pred_proba.shape[1])]
    test_temp[prob_columns] = y_pred_proba

    test_temp["y_pred"] = y_pred
    # Assigned before the index reset so that y_test aligns on the original index.
    test_temp["y_true"] = y_test
    test_temp = test_temp.reset_index(drop=True)

    today = datetime.now().strftime('%y%m%d')
    test_temp.to_excel(f"./notes/{today}_pred_with_prob_{N_CLASS}_{TRAINING_INFO}.xlsx")

### Performance sul training set

In [None]:
# Predict on the training set.
y_pred_train = model.predict(X_train)

# Evaluate: overall accuracy plus the per-class report, both as printable
# text and as a dictionary (the dictionary is reused for MLflow logging).
accuracy_train = accuracy_score(y_train, y_pred_train)
D_classification_rep_train = classification_report(y_train, y_pred_train, output_dict=True)
classification_rep_train = classification_report(y_train, y_pred_train)

print(
    f'Accuracy: {accuracy_train:.4f}',
    'Classification Report:',
    classification_rep_train,
    sep='\n',
)

Confusion matrix sul training set

In [None]:
# Confusion matrix on the training set, labelled with the model's classes.
labels = model.classes_
# Pass `labels` explicitly so the matrix always has one row/column per model
# class, in the same order as the DataFrame index and columns below, even if
# a class is absent from both y_train and y_pred_train.
conf_matrix_train = confusion_matrix(y_train, y_pred_train, labels=labels)
conf_matrix_df_train = pd.DataFrame(conf_matrix_train, index=labels, columns=labels)
conf_matrix_df_train

# Save model

In [None]:
if SAVE_PKL_MODEL:
    # Serialise the fitted model under ../models using the configured name.
    model_path = f"../models/{MODEL_NAME}.pkl"
    with open(model_path, "wb") as pkl_file:
        pickle.dump(model, pkl_file)

# Log to MLFlow

In [None]:
# Flatten the test-set classification report into a single-level
# "<class>_<metric>" dictionary that can be logged to MLflow.
D_metric = {}
for classe, class_report in D_classification_rep.items():
    # Only nested dictionaries hold per-metric values; scalar entries are
    # skipped here and the accuracy is added explicitly below.
    if isinstance(class_report, dict):
        for metric, value in class_report.items():
            D_metric[f"{classe}_{metric}"] = value
D_metric["accuracy"] = accuracy

In [None]:
# Flatten the training-set classification report into a single-level
# "<class>_<metric>_train" dictionary that can be logged to MLflow.
D_metric_train = {}
for classe_train, class_report_train in D_classification_rep_train.items():
    # Only nested dictionaries hold per-metric values; scalar entries are
    # skipped here and the accuracy is added explicitly below.
    if isinstance(class_report_train, dict):
        for metric_train, value_train in class_report_train.items():
            D_metric_train[f"{classe_train}_{metric_train}_train"] = value_train
D_metric_train["accuracy_train"] = accuracy_train

In [None]:
# Value of the "TIPO" tag: "Tutti" when two types are selected, otherwise
# the first entry of LIST_TYPE.
type_mlflow = "Tutti" if len(LIST_TYPE) == 2 else LIST_TYPE[0]

In [None]:
# Run metadata that is attached to the MLflow run as tags; edit this
# dictionary to record every parameter of the run.
n_distinct_targets = df['y'].nunique(dropna=False)
feature_names = str(list(X_train.columns))
n_train_rows = len(X_train)
n_test_rows = len(X_test)

D_to_set = {
    "Training_info": TRAINING_INFO,
    "func_classification": f"Funzione per {N_CLASS}",
    "n_class": n_distinct_targets,
    "columns": feature_names,
    "func_training": "train_pycaret",
    "model_name": MODEL_NAME,
    "dataset_name": DATASET_NAME,
    "years_training": YEARS_TRAINING,
    "data_ora": datetime.now(),
    "TIPO": type_mlflow,
    "len_train": n_train_rows,
    "len_test": n_test_rows,
    "class_under_sample": CLASS_UNDER_SAMPLE,
    # Statistics of the target on each split, computed by project helpers.
    "coeff_of_variation_train": calculate_coefficient_of_variation(y_train),
    "coeff_of_variation_test": calculate_coefficient_of_variation(y_test),
    "skewness_train": calculate_skewness(y_train),
    "skewness_test": calculate_skewness(y_test),
}

# The model name reported by training is only available for Pycaret runs.
if MODEL == "Pycaret":
    D_to_set.update(pycaret_model_name=pycaret_model_name)

In [None]:
import tempfile

# Number of training rows stored with the model as its input example.
INPUT_EXAMPLE_ROWS = 5


def _log_dataframe_as_csv(frame, file_name, artifact_path):
    """Attach *frame* to the active MLflow run as a CSV artifact.

    The CSV is written (without the index) inside a temporary directory, so
    no scratch file is left in the working directory even when logging fails.
    The artifact keeps *file_name* as its name under *artifact_path*.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_path = os.path.join(tmp_dir, file_name)
        frame.to_csv(local_path, index=False)
        mlflow.log_artifact(local_path, artifact_path=artifact_path)


# Set our tracking server URI for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select (or create) the MLflow experiment
mlflow.set_experiment("MLClinicalProb_exten")

# Start an MLflow run
with mlflow.start_run():

    # Log parameters to MLflow
    mlflow.log_params(PARAMS)

    # Log test and training metrics, each dictionary in one batch call
    mlflow.log_metrics(D_metric)
    mlflow.log_metrics(D_metric_train)

    # Set the run tags in one batch call
    mlflow.set_tags(D_to_set)

    # Infer the model signature from the training features and predictions
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model. Only a few rows are stored as the input example: an
    # example is meant to be a small sample, and passing the whole training
    # set would copy all of the training data into the run artifacts.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model_mlclinicalprob",
        signature=signature,
        input_example=X_train.head(INPUT_EXAMPLE_ROWS),
        registered_model_name=MODEL_NAME,
    )

    if MODEL == "Pycaret":
        # Artefacts returned by the Pycaret training cell
        _log_dataframe_as_csv(feature_importances_df, "feature_importance.csv", "pycaret")
        _log_dataframe_as_csv(setup_config, "setup_config.csv", "pycaret")
        _log_dataframe_as_csv(metrics_df, "metrics_df.csv", "pycaret")

    # Confusion matrices on the test and training sets
    _log_dataframe_as_csv(conf_matrix_df, "conf_matrix_df_test.csv", "conf_matrix")
    _log_dataframe_as_csv(conf_matrix_df_train, "conf_matrix_df_train.csv", "conf_matrix")
