In [33]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/Big data/Fold_esperimenti')

!pwd

import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Big data/Fold_esperimenti


In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
def print_average_metrics(report_dict_0, report_dict_1, report_dict_2, report_dict_3, report_dict_4):
  macro_metrics_0 = report_dict_0.get("macro avg", {})
  macro_metrics_1 = report_dict_1.get("macro avg", {})
  macro_metrics_2 = report_dict_2.get("macro avg", {})
  macro_metrics_3 = report_dict_3.get("macro avg", {})
  macro_metrics_4 = report_dict_4.get("macro avg", {})

  # Average precision score
  average_precision_score = (macro_metrics_0.get("precision", {}) +
                              macro_metrics_1.get("precision", {}) +
                              macro_metrics_2.get("precision", {}) +
                              macro_metrics_3.get("precision", {}) +
                              macro_metrics_4.get("precision", {})
                              ) / 5

  # Average Reacall score
  average_recall_score = (macro_metrics_0.get("recall", {}) +
                        macro_metrics_1.get("recall", {}) +
                        macro_metrics_2.get("recall", {}) +
                        macro_metrics_3.get("recall", {}) +
                        macro_metrics_4.get("recall", {})
                        ) / 5
  # Average F1 Score
  average_f1_score = (macro_metrics_0.get("f1-score", {}) +
                    macro_metrics_1.get("f1-score", {}) +
                    macro_metrics_2.get("f1-score", {}) +
                    macro_metrics_3.get("f1-score", {}) +
                    macro_metrics_4.get("f1-score", {})
                   ) / 5

  print("Average Precision Score:", round(average_precision_score, 2))
  print("Average Recall Score:", round(average_recall_score, 2))
  print("Average F1 Score:", round(average_f1_score, 2))






#Preprocessing dataset


## Mirna Preprocessing

In [36]:
def process_dataframe(df: pd.DataFrame):

  # Calculate the missing rate for each column
  missing_rate = df.isnull().mean()

  # Sort in discending order
  missing_rate_sorted = missing_rate.sort_values(ascending= False)

  # Filter columns that have a missing rate <= 5 %
  threshold = 0.05
  df = df.loc[:, missing_rate <= threshold]

  # Substitute null with zero
  df.fillna(0, inplace=True)

  df = df.drop(['disease', 'country',  'Unnamed: 0', 'age', 'sex', 'apoe4'], axis=1)

  # Substitute null with zero
  df.fillna(0, inplace=True)

  return df

## Metadata Preprocessing

In [37]:
from sklearn.preprocessing import MinMaxScaler
def preprocess_metadata(df: pd.DataFrame):

  columns_to_encode = [ 'age', 'sex', 'apoe4']
  df_meta_data = df[columns_to_encode]
  scaler = MinMaxScaler(feature_range=(1, 2))
  df_meta_data['age'] = scaler.fit_transform(df[['age']])
  df_meta_data['sex'] = df_meta_data['sex'].map({'female': 0, 'male': 1}).fillna(3)
  df_meta_data.fillna(0, inplace=True)

  return df_meta_data

# Autoencoder

In [38]:
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

REG_COEFF = 0.03
DROPOUT_RATE = 0.6

def generate_intermediate_feature(X_train, X_test,
                                  hidden_dim1=256,
                                  hidden_dim2=128,
                                  epochs=100,
                                  batch_size=3):
    """
    Costruisce e addestra un autoencoder a due livelli (256 e 128) con
    regolarizzazione L2 e dropout, e restituisce le feature intermedie (128
    dimensioni) per train e test.

    Parametri:
    -----------
    X_train : np.array
        Matrice delle feature di training, shape (num_samples, input_dim)
    X_test  : np.array
        Matrice delle feature di test, shape (num_samples, input_dim)
    hidden_dim1 : int
        Dimensione del primo livello nascosto (es. 256)
    hidden_dim2 : int
        Dimensione del secondo livello nascosto (es. 128, il bottleneck)
    epochs : int
        Numero di epoche per l'addestramento dell'autoencoder
    batch_size : int
        Dimensione del batch per l'addestramento

    Restituisce:
    ------------
    features_intermediate_train : np.array
        Codifica di dimensione (num_samples_train, hidden_dim2)
    features_intermediate_test : np.array
        Codifica di dimensione (num_samples_test, hidden_dim2)
    """

    # Numero di feature in ingresso
    input_dim = X_train.shape[1]

    # 1. Definizione dei layer
    input_layer = Input(shape=(input_dim,))

    # Encoder
    encoded = Dense(hidden_dim1,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(input_layer)
    encoded = Dropout(DROPOUT_RATE)(encoded)

    encoded = Dense(hidden_dim2,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(encoded)
    encoded = Dropout(DROPOUT_RATE)(encoded)

    # Decoder
    decoded = Dense(hidden_dim1,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(encoded)
    decoded = Dropout(DROPOUT_RATE)(decoded)

    # Output layer
    decoded = Dense(input_dim,
                    activation='sigmoid',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(decoded)

    # 2. Costruzione del modello Autoencoder
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # 3. Addestramento
    autoencoder.fit(X_train, X_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(X_test, X_test))

    # 4. Creazione del modello encoder
    # (l'output finale del nostro encoder è 'encoded' dopo il secondo Dropout)
    encoder = Model(inputs=input_layer, outputs=encoded)

    # 5. Estrazione delle feature intermedie
    features_intermediate_train = encoder.predict(X_train)
    features_intermediate_test = encoder.predict(X_test)

    # 6. Ritorno delle feature
    return features_intermediate_train, features_intermediate_test



# Random Forest Classifier

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def evaluate_model_rf( X_train, X_test, y_train, y_test ) :

  y_train = y_train.replace({'AD': 0, 'NC': 1, 'MCI': 2})
  y_test = y_test.replace({'AD': 0, 'NC': 1, 'MCI': 2})

  # Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
  rf_classifier.fit(X_train, y_train)

  y_pred = rf_classifier.predict(X_test)

  print(classification_report(y_test, y_pred))
  return classification_report(y_test, y_pred, output_dict=True)


# MLP Classifier

In [40]:
from sklearn.neural_network import MLPClassifier
def evaluate_model_mlp( X_train, X_test, y_train, y_test ) :

  y_train = y_train.replace({'AD': 0, 'NC': 1, 'MCI': 2})
  y_test = y_test.replace({'AD': 0, 'NC': 1, 'MCI': 2})

  # MLPClassifier  Classifier
  mlp_classifier = MLPClassifier(max_iter=100, random_state=42)
  mlp_classifier.fit(X_train, y_train)

  y_pred = mlp_classifier.predict(X_test)



  print(classification_report(y_test, y_pred, ))
  return classification_report(y_test, y_pred, output_dict=True)


In [41]:
def concatenate_features(features1, features2):
    return np.concatenate((features1, features2), axis=1)

# Fold 0 - Results

In [42]:
import pandas as pd

df0_train = pd.read_csv('train_fold_0.csv')
df0_test = pd.read_csv('test_fold_0.csv')

df_mirna_preprocessed_train_0 = process_dataframe(df0_train)
df_mirna_preprocessed_test_0 = process_dataframe(df0_test)


mirna_features_intermediate_train_0, mirna_features_intermediate_test_0 = generate_intermediate_feature(df_mirna_preprocessed_train_0, df_mirna_preprocessed_test_0)

Epoch 1/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - loss: 2224.1875 - val_loss: 1629.4685
Epoch 2/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - loss: 2546.8071 - val_loss: 1629.4495
Epoch 3/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - loss: 2563.7808 - val_loss: 1629.4409
Epoch 4/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - loss: 2514.3000 - val_loss: 1629.4585
Epoch 5/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - loss: 2526.1948 - val_loss: 1629.4458
Epoch 6/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - loss: 2651.0513 - val_loss: 1629.4386
Epoch 7/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 29ms/step - loss: 3816.7192 - val_loss: 1629.4384
Epoch 8/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - loss: 22

In [None]:
df_meta_preprocessed_train_0 = preprocess_metadata(df0_train)
df_meta_preprocessed_test_0 = preprocess_metadata(df0_test)


meta_features_intermediate_train_0, meta_features_intermediate_test_0 = generate_intermediate_feature(df_meta_preprocessed_train_0, df_meta_preprocessed_test_0, 15, 15)

Epoch 1/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 1.3961 - val_loss: 0.6658
Epoch 2/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.6173 - val_loss: 0.4685
Epoch 3/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4611 - val_loss: 0.4153
Epoch 4/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4309 - val_loss: 0.3935
Epoch 5/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4171 - val_loss: 0.3823
Epoch 6/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3845 - val_loss: 0.3755
Epoch 7/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3635 - val_loss: 0.3705
Epoch 8/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3867 - val_loss: 0.3664
Epoch 9/100
[1m388/388[0m [32

In [None]:
features_intermediate_train_0 = concatenate_features(mirna_features_intermediate_train_0, meta_features_intermediate_train_0)
features_intermediate_test_0 = concatenate_features(mirna_features_intermediate_test_0, meta_features_intermediate_test_0)

In [None]:
y_train_0 = df0_train['disease']
y_test_0 = df0_test['disease']

report_dict_0 = evaluate_model_rf(features_intermediate_train_0, features_intermediate_test_0, y_train_0, y_test_0)

# Fold 1 - Results

In [None]:
df1_train = pd.read_csv('train_fold_1.csv')
df1_test = pd.read_csv('test_fold_1.csv')

df_mirna_preprocessed_train_1 = process_dataframe(df1_train)
df_mirna_preprocessed_test_1 = process_dataframe(df1_test)


mirna_features_intermediate_train_1, mirna_features_intermediate_test_1 = generate_intermediate_feature(df_mirna_preprocessed_train_1, df_mirna_preprocessed_test_1)

In [None]:
df_meta_preprocessed_train_1 = preprocess_metadata(df1_train)
df_meta_preprocessed_test_1 = preprocess_metadata(df1_test)


meta_features_intermediate_train_1, meta_features_intermediate_test_1 = generate_intermediate_feature(df_meta_preprocessed_train_1, df_meta_preprocessed_test_1, 15, 15)

In [None]:
features_intermediate_train_1 = concatenate_features(mirna_features_intermediate_train_1, meta_features_intermediate_train_1)
features_intermediate_test_1 = concatenate_features(mirna_features_intermediate_test_1, meta_features_intermediate_test_1)

In [None]:
y_train_1 = df1_train['disease']
y_test_1 = df1_test['disease']

report_dict_1 = evaluate_model_rf(features_intermediate_train_1, features_intermediate_test_1, y_train_1, y_test_1)

# Fold 2 - Results

In [None]:
df2_train = pd.read_csv('train_fold_2.csv')
df2_test = pd.read_csv('test_fold_2.csv')

df_mirna_preprocessed_train_2 = process_dataframe(df2_train)
df_mirna_preprocessed_test_2 = process_dataframe(df2_test)


mirna_features_intermediate_train_2, mirna_features_intermediate_test_2 = generate_intermediate_feature(df_mirna_preprocessed_train_2, df_mirna_preprocessed_test_2)

In [None]:
df_meta_preprocessed_train_2 = preprocess_metadata(df2_train)
df_meta_preprocessed_test_2 = preprocess_metadata(df2_test)


meta_features_intermediate_train_2, meta_features_intermediate_test_2 = generate_intermediate_feature(df_meta_preprocessed_train_2, df_meta_preprocessed_test_2, 15, 15)

In [None]:
features_intermediate_train_2 = concatenate_features(mirna_features_intermediate_train_2, meta_features_intermediate_train_2)
features_intermediate_test_2 = concatenate_features(mirna_features_intermediate_test_2, meta_features_intermediate_test_2)

In [None]:
y_train_2 = df2_train['disease']
y_test_2 = df2_test['disease']

report_dict_2 = evaluate_model_rf(features_intermediate_train_2, features_intermediate_test_2, y_train_2, y_test_2)

# Fold 3 - Results

In [None]:
df3_train = pd.read_csv('train_fold_3.csv')
df3_test = pd.read_csv('test_fold_3.csv')

df_mirna_preprocessed_train_3 = process_dataframe(df3_train)
df_mirna_preprocessed_test_3 = process_dataframe(df3_test)


mirna_features_intermediate_train_3, mirna_features_intermediate_test_3 = generate_intermediate_feature(df_mirna_preprocessed_train_3, df_mirna_preprocessed_test_3)

In [None]:
df_meta_preprocessed_train_3 = preprocess_metadata(df3_train)
df_meta_preprocessed_test_3 = preprocess_metadata(df3_test)


meta_features_intermediate_train_3, meta_features_intermediate_test_3 = generate_intermediate_feature(df_meta_preprocessed_train_3, df_meta_preprocessed_test_3, 15, 15)

In [None]:
features_intermediate_train_3 = concatenate_features(mirna_features_intermediate_train_3, meta_features_intermediate_train_3)
features_intermediate_test_3 = concatenate_features(mirna_features_intermediate_test_3, meta_features_intermediate_test_3)

In [None]:
y_train_3 = df3_train['disease']
y_test_3 = df3_test['disease']

report_dict_3 = evaluate_model_rf(features_intermediate_train_3, features_intermediate_test_3, y_train_3, y_test_3)

# Fold 4 - Results

In [26]:
df4_train = pd.read_csv('train_fold_4.csv')
df4_test = pd.read_csv('test_fold_4.csv')

df_mirna_preprocessed_train_4 = process_dataframe(df4_train)
df_mirna_preprocessed_test_4 = process_dataframe(df4_test)


mirna_features_intermediate_train_4, mirna_features_intermediate_test_4 = generate_intermediate_feature(df_mirna_preprocessed_train_4, df_mirna_preprocessed_test_4)


Epoch 1/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 25ms/step - loss: 3067.2561 - val_loss: 1523.4227
Epoch 2/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - loss: 2298.2163 - val_loss: 1523.4171
Epoch 3/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - loss: 3210.8730 - val_loss: 1523.4150
Epoch 4/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 36ms/step - loss: 3004.2803 - val_loss: 1523.4221
Epoch 5/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - loss: 2194.9131 - val_loss: 1523.4218
Epoch 6/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - loss: 2314.9749 - val_loss: 1523.4104
Epoch 7/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - loss: 3714.1504 - val_loss: 1523.4033
Epoch 8/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - loss: 1962

In [27]:
df_meta_preprocessed_train_4 = preprocess_metadata(df4_train)
df_meta_preprocessed_test_4 = preprocess_metadata(df4_test)


meta_features_intermediate_train_4, meta_features_intermediate_test_4 = generate_intermediate_feature(df_meta_preprocessed_train_4, df_meta_preprocessed_test_4, 15, 15)

Epoch 1/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 1.3396 - val_loss: 0.6141
Epoch 2/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.5507 - val_loss: 0.4422
Epoch 3/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.4306 - val_loss: 0.4063
Epoch 4/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.4090 - val_loss: 0.3916
Epoch 5/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.3932 - val_loss: 0.3827
Epoch 6/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.3708 - val_loss: 0.3767
Epoch 7/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.3630 - val_loss: 0.3717
Epoch 8/100
[1m388/388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.3686 - val_loss: 0.3688
Epoch 9/100
[1m388/388[0m [32

In [28]:
features_intermediate_train_4 = concatenate_features(mirna_features_intermediate_train_4, meta_features_intermediate_train_4)
features_intermediate_test_4 = concatenate_features(mirna_features_intermediate_test_4, meta_features_intermediate_test_4)

In [29]:
y_train_4 = df4_train['disease']
y_test_4 = df4_test['disease']

report_dict_4 = evaluate_model_rf(features_intermediate_train_4, features_intermediate_test_4, y_train_4, y_test_4)

              precision    recall  f1-score   support

           0       0.78      0.97      0.87       207
           1       0.64      0.27      0.38        60
           2       1.00      0.43      0.61        23

    accuracy                           0.78       290
   macro avg       0.81      0.56      0.62       290
weighted avg       0.77      0.78      0.74       290



In [30]:
print(" Results for MLP Classifier for each fold")
report_MLP_dict_0 = evaluate_model_mlp(features_intermediate_train_0, features_intermediate_test_0, y_train_0, y_test_0)
report_MLP_dict_1 = evaluate_model_mlp(features_intermediate_train_1, features_intermediate_test_1, y_train_1, y_test_1)
report_MLP_dict_2 = evaluate_model_mlp(features_intermediate_train_2, features_intermediate_test_2, y_train_2, y_test_2)
report_MLP_dict_3 = evaluate_model_mlp(features_intermediate_train_3, features_intermediate_test_3, y_train_3, y_test_3)
report_MLP_dict_4 = evaluate_model_mlp(features_intermediate_train_4, features_intermediate_test_4, y_train_4, y_test_4)

 Results for MLP Classifier for each fold
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       208
           1       0.00      0.00      0.00        60
           2       0.00      0.00      0.00        23

    accuracy                           0.71       291
   macro avg       0.24      0.33      0.28       291
weighted avg       0.51      0.71      0.60       291

              precision    recall  f1-score   support

           0       0.71      1.00      0.83       208
           1       0.00      0.00      0.00        60
           2       0.00      0.00      0.00        23

    accuracy                           0.71       291
   macro avg       0.24      0.33      0.28       291
weighted avg       0.51      0.71      0.60       291

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       190

    accuracy                           1.00       190
   macro avg       1.00      1.0

# Average Results

In [31]:
print("\n \nResults Random Forest Classifier")
print("---------------------------------------- ")
print_average_metrics(report_dict_0, report_dict_1, report_dict_2, report_dict_3, report_dict_4)


 
Results Random Forest Classifier
---------------------------------------- 
Average Precision Score: 0.73
Average Recall Score: 0.54
Average F1 Score: 0.59


In [32]:
print("\n \n  Average Results MLP  Classifier ")
print("---------------------------------------- ")
print_average_metrics(report_MLP_dict_0, report_MLP_dict_1, report_MLP_dict_2, report_MLP_dict_3, report_MLP_dict_4)


 
  Average Results MLP  Classifier 
---------------------------------------- 
Average Precision Score: 0.39
Average Recall Score: 0.47
Average F1 Score: 0.42
