In [None]:
import pandas as pd
import os
import sys

# Make the parent of the current working directory importable.
# NOTE(review): presumably this is what lets the project-local `utils` and
# `config` packages resolve; it assumes the kernel runs one level below the
# project root (e.g. inside `notebooks/`) — confirm.
# `os.path.dirname("notebooks")` always evaluates to "", so the path is simply
# the absolute form of "..", written directly here.
_project_root = os.path.abspath(os.pardir)
# Guard so that re-running the cell does not pile up duplicate entries.
if _project_root not in sys.path:
    sys.path.append(_project_root)

from utils.func_preprocessing import *
from utils.func_classification import *
from utils.utils import plot_feature_distribution, map_diagn_out


import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from config.config import DATASET_NAME, N_CLASS

import ipywidgets as widgets
from ipywidgets import Output

from IPython.display import display, clear_output

In [None]:
# Read the CSV selected by DATASET_NAME from the sibling `dataset` directory.
dataset_path = f"../dataset/{DATASET_NAME}.csv"
df = pd.read_csv(dataset_path)

# Discard the 'Unnamed: 0' column when present; errors="ignore" turns the
# drop into a no-op when the column is absent.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
if N_CLASS == 6:
    # Step 1: per-row class computed from the letter / integer parts of the
    # principal diagnosis columns.
    macro_class = df.apply(
        lambda rec: class_macropat_letteremappate(
            rec['lettera_diagnosi_princip'],
            rec['intero_diagnosi_princip'],
        ),
        axis=1,
    )
    df['y'] = macro_class
    # Step 2: pass each label through the 6-class reduction helper.
    df['y'] = df['y'].apply(reduce_6class_letteremappate)
    # Step 3: map each resulting label through `map_diagn_out` into `y_desc`.
    df['y_desc'] = df['y'].apply(map_diagn_out)

In [None]:
# Show the (rows, columns) shape of the DataFrame as the cell output.
df.shape

### Correlation Matrix

In [None]:
# --- Correlation matrix ------------------------------------------------------
# Numeric columns that enter the correlation analysis.
corr_columns = [
    'PF', 'PO2_T', 'P50_ACT', 'TO2', 'HCT', 'AG_K',
    'THB', 'THB2', 'GLU', 'LAC', 'PO2', 'HCO3', 'PCO2_T', 'PCO2', 'MOSM',
    'KP', 'NA', 'CL', 'CBASE', 'METHB', 'PHT', 'PH', 'O2HB', 'COHB', 'RHB',
    'B', 'TC', 'FIO2', 'ETA',
]
# Placeholder string; any row containing it in a selected column is discarded.
missing_token = '.....'

# Take an explicit copy (plus 'DATA', needed only for the seasonal encoding):
# assigning new columns to a plain slice of `df` triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
df_corr = df[corr_columns + ['DATA']].copy()

# Encode the day of the year as a point on the unit circle (sin/cos pair).
day_of_year = pd.to_datetime(df_corr['DATA']).dt.dayofyear
df_corr['sin_day'] = np.sin(2 * np.pi * day_of_year / 365)
df_corr['cos_day'] = np.cos(2 * np.pi * day_of_year / 365)
df_corr = df_corr.drop(columns=['DATA'])

# Drop every row holding the placeholder in any column, then cast the whole
# frame to float in one step.
has_placeholder = df_corr.eq(missing_token).any(axis=1)
df_corr = df_corr.loc[~has_placeholder].astype(float)

corr_matrix = df_corr.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale centred at 0.
plt.figure(figsize=(16, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Matrice di Correlazione')
plt.show()

## Distribuzioni rispetto alle classi

In [None]:
# Features offered in the dropdown for the per-class distribution plot.
features = [
    'ETA', 'PF', 'PO2_T', 'P50_ACT', 'TO2', 'SO2', 'HCT', 'AG_K',
    'THB', 'THB2', 'GLU', 'LAC', 'PO2', 'HCO3', 'PCO2_T', 'PCO2',
    'MOSM', 'KP', 'NA', 'CL', 'CBASE', 'METHB', 'PHT', 'PH',
    'O2HB', 'COHB', 'RHB', 'B', 'TC', 'FIO2'
]

feature_dropdown = widgets.Dropdown(
    options=features,
    value=features[0],
    description='Feature:',
    disabled=False,
)

# Dedicated output area for the plot. Output produced inside a widget callback
# is not reliably attached to the cell that created the widget, so the figure
# is rendered into this Output widget instead of the bare cell output.
feature_output = Output()


def on_feature_change(change):
    """Redraw the distribution plot for the newly selected feature.

    Parameters
    ----------
    change : dict-like
        Change notification with at least the keys ``'type'``, ``'name'`` and
        ``'new'``; ``'new'`` is the selected feature name.

    Notifications that are not a change of the ``value`` trait are ignored.
    """
    if change.get('type') != 'change' or change.get('name') != 'value':
        return
    with feature_output:
        # wait=True swaps the old figure for the new one without flicker.
        clear_output(wait=True)
        fig = plot_feature_distribution(df.sort_values(by=["y_desc"]), change['new'])
        display(fig)


feature_dropdown.observe(on_feature_change, names='value')

# Show the control together with its output area, then draw the initial plot.
display(feature_dropdown, feature_output)
on_feature_change({'type': 'change', 'name': 'value', 'new': features[0]})

In [None]:
# Reduced feature list: the full candidate set with a handful of features
# explicitly excluded (original ordering is preserved).
_all_features = [
    'ETA', 'PF', 'PO2_T', 'P50_ACT', 'TO2', 'SO2', 'HCT', 'AG_K',
    'THB', 'THB2', 'GLU', 'LAC', 'PO2', 'HCO3', 'PCO2_T', 'PCO2',
    'MOSM', 'KP', 'NA', 'CL', 'CBASE', 'METHB', 'PHT', 'PH',
    'O2HB', 'COHB', 'RHB', 'B', 'TC', 'FIO2',
]
_excluded_features = {'AG_K', 'GLU', 'HCO3', 'PCO2_T', 'PCO2'}

features = [name for name in _all_features if name not in _excluded_features]

### Trasformazioni di variabile

In [None]:
# NOTE: this entire cell is disabled (commented-out) draft code; nothing here runs.
# # Copy of the DataFrame
# df_transformed = df.copy()

# # Right-skewed variables (log transform)
# df_transformed['O2HB'] = np.log1p(df_transformed['O2HB'])  # np.log1p is defined at zero (log1p(0) == 0); NaN inputs stay NaN

# # Left-skewed variables (log of the negated value)
# NOTE(review): log1p(-x) == log(1 - x) is only defined for x < 1, so this would
# yield NaN / -inf for values >= 1 — verify the intended transform before enabling.
# for col in ['COHB', 'GLU', 'LAC', 'P50_ACT', 'PF', 'PO2_T']:
#     df_transformed[col] = np.log1p(-df_transformed[col])  # -df[col] intended to make the distribution positive

# # Min-Max normalisation for the listed variables
# scaler = MinMaxScaler()
# cols_to_scale = ['B', 'CBASE', 'METHB', 'PHT', 'CL', 'NA', 'KP', 'HCO3', 'PCO2_T', 'MOSM', 'THB2', 'TO2', 'ETA']
# df_transformed[cols_to_scale] = scaler.fit_transform(df_transformed[cols_to_scale])

# # Categorical conversion
# df_transformed['FIO2'] = df_transformed['FIO2'].astype('category')  # Converts to a categorical variable

# # Round 'TC' to the nearest 0.5 and convert to categorical
# df_transformed['TC'] = (df_transformed['TC'] / 0.5).round() * 0.5
# df_transformed['TC'] = df_transformed['TC'].astype('category')