In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/Big data/Fold_esperimenti')

!pwd

import warnings
warnings.filterwarnings("ignore")

Mounted at /content/drive
/content/drive/My Drive/Big data/Fold_esperimenti


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def print_average_metrics(report_dict_0, report_dict_1, report_dict_2, report_dict_3, report_dict_4):
  macro_metrics_0 = report_dict_0.get("macro avg", {})
  macro_metrics_1 = report_dict_1.get("macro avg", {})
  macro_metrics_2 = report_dict_2.get("macro avg", {})
  macro_metrics_3 = report_dict_3.get("macro avg", {})
  macro_metrics_4 = report_dict_4.get("macro avg", {})

  # Average precision score
  average_precision_score = (macro_metrics_0.get("precision", {}) +
                              macro_metrics_1.get("precision", {}) +
                              macro_metrics_2.get("precision", {}) +
                              macro_metrics_3.get("precision", {}) +
                              macro_metrics_4.get("precision", {})
                              ) / 5

  # Average Reacall score
  average_recall_score = (macro_metrics_0.get("recall", {}) +
                        macro_metrics_1.get("recall", {}) +
                        macro_metrics_2.get("recall", {}) +
                        macro_metrics_3.get("recall", {}) +
                        macro_metrics_4.get("recall", {})
                        ) / 5
  # Average F1 Score
  average_f1_score = (macro_metrics_0.get("f1-score", {}) +
                    macro_metrics_1.get("f1-score", {}) +
                    macro_metrics_2.get("f1-score", {}) +
                    macro_metrics_3.get("f1-score", {}) +
                    macro_metrics_4.get("f1-score", {})
                   ) / 5

  print("Average Precision Score:", round(average_precision_score, 2))
  print("Average Recall Score:", round(average_recall_score, 2))
  print("Average F1 Score:", round(average_f1_score, 2))






#Preprocessing dataset


## Mirna Preprocessing

In [5]:
def process_dataframe(df: pd.DataFrame):

  # Calculate the missing rate for each column
  missing_rate = df.isnull().mean()

  # Sort in discending order
  missing_rate_sorted = missing_rate.sort_values(ascending= False)

  # Filter columns that have a missing rate <= 5 %
  threshold = 0.05
  df = df.loc[:, missing_rate <= threshold]

  # Substitute null with zero
  df.fillna(0, inplace=True)

  df = df.drop(['disease', 'country',  'Unnamed: 0', 'age', 'sex', 'apoe4'], axis=1)

  # Substitute null with zero
  df.fillna(0, inplace=True)

  return df

## Metadata Preprocessing

In [6]:
from sklearn.preprocessing import MinMaxScaler
def preprocess_metadata(df: pd.DataFrame):

  columns_to_encode = [ 'age', 'sex', 'apoe4']
  df_meta_data = df[columns_to_encode]
  scaler = MinMaxScaler(feature_range=(1, 2))
  df_meta_data['age'] = scaler.fit_transform(df[['age']])
  df_meta_data['sex'] = df_meta_data['sex'].map({'female': 0, 'male': 1}).fillna(3)
  df_meta_data.fillna(0, inplace=True)

  return df_meta_data

# Autoencoder

In [7]:
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

REG_COEFF = 0.03
DROPOUT_RATE = 0.6

def generate_intermediate_feature(X_train, X_test,
                                  hidden_dim1=256,
                                  hidden_dim2=128,
                                  epochs=100,
                                  batch_size=256):
    """
    Costruisce e addestra un autoencoder a due livelli (256 e 128) con
    regolarizzazione L2 e dropout, e restituisce le feature intermedie (128
    dimensioni) per train e test.

    Parametri:
    -----------
    X_train : np.array
        Matrice delle feature di training, shape (num_samples, input_dim)
    X_test  : np.array
        Matrice delle feature di test, shape (num_samples, input_dim)
    hidden_dim1 : int
        Dimensione del primo livello nascosto (es. 256)
    hidden_dim2 : int
        Dimensione del secondo livello nascosto (es. 128, il bottleneck)
    epochs : int
        Numero di epoche per l'addestramento dell'autoencoder
    batch_size : int
        Dimensione del batch per l'addestramento

    Restituisce:
    ------------
    features_intermediate_train : np.array
        Codifica di dimensione (num_samples_train, hidden_dim2)
    features_intermediate_test : np.array
        Codifica di dimensione (num_samples_test, hidden_dim2)
    """

    # Numero di feature in ingresso
    input_dim = X_train.shape[1]

    # 1. Definizione dei layer
    input_layer = Input(shape=(input_dim,))

    # Encoder
    encoded = Dense(hidden_dim1,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(input_layer)
    encoded = Dropout(DROPOUT_RATE)(encoded)

    encoded = Dense(hidden_dim2,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(encoded)
    encoded = Dropout(DROPOUT_RATE)(encoded)

    # Decoder
    decoded = Dense(hidden_dim1,
                    activation='relu',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(encoded)
    decoded = Dropout(DROPOUT_RATE)(decoded)

    # Output layer
    decoded = Dense(input_dim,
                    activation='sigmoid',
                    kernel_regularizer=regularizers.l2(REG_COEFF)
                   )(decoded)

    # 2. Costruzione del modello Autoencoder
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # 3. Addestramento
    autoencoder.fit(X_train, X_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(X_test, X_test))

    # 4. Creazione del modello encoder
    # (l'output finale del nostro encoder è 'encoded' dopo il secondo Dropout)
    encoder = Model(inputs=input_layer, outputs=encoded)

    # 5. Estrazione delle feature intermedie
    features_intermediate_train = encoder.predict(X_train)
    features_intermediate_test = encoder.predict(X_test)

    # 6. Ritorno delle feature
    return features_intermediate_train, features_intermediate_test



# Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def evaluate_model_rf( X_train, X_test, y_train, y_test ) :

  y_train = y_train.replace({'AD': 0, 'NC': 1, 'MCI': 2})
  y_test = y_test.replace({'AD': 0, 'NC': 1, 'MCI': 2})

  # Random Forest Classifier
  rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
  rf_classifier.fit(X_train, y_train)

  y_pred = rf_classifier.predict(X_test)

  print(classification_report(y_test, y_pred))
  return classification_report(y_test, y_pred, output_dict=True)


# MLP Classifier

In [9]:
from sklearn.neural_network import MLPClassifier
def evaluate_model_mlp( X_train, X_test, y_train, y_test ) :

  y_train = y_train.replace({'AD': 0, 'NC': 1, 'MCI': 2})
  y_test = y_test.replace({'AD': 0, 'NC': 1, 'MCI': 2})

  # MLPClassifier  Classifier
  mlp_classifier = MLPClassifier(max_iter=100, random_state=42)
  mlp_classifier.fit(X_train, y_train)

  y_pred = mlp_classifier.predict(X_test)



  print(classification_report(y_test, y_pred, ))
  return classification_report(y_test, y_pred, output_dict=True)


In [10]:
def concatenate_features(features1, features2):
    return np.concatenate((features1, features2), axis=1)

# Fold 0 - Results

In [11]:
import pandas as pd

df0_train = pd.read_csv('train_fold_0.csv')
df0_test = pd.read_csv('test_fold_0.csv')

df_mirna_preprocessed_train_0 = process_dataframe(df0_train)
df_mirna_preprocessed_test_0 = process_dataframe(df0_test)


mirna_features_intermediate_train_0, mirna_features_intermediate_test_0 = generate_intermediate_feature(df_mirna_preprocessed_train_0, df_mirna_preprocessed_test_0)

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 128ms/step - loss: 2418.9363 - val_loss: 1660.1184
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step - loss: 2426.2441 - val_loss: 1652.0940
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step - loss: 2743.8884 - val_loss: 1646.5133
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - loss: 2592.6553 - val_loss: 1642.2794
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - loss: 2648.4802 - val_loss: 1639.0652
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - loss: 2369.8984 - val_loss: 1636.7317
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - loss: 2707.5762 - val_loss: 1635.0894
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - loss: 2300.4944 - val_loss: 1633.9380
Epoch 9

In [12]:
y_train_0 = df0_train['disease']
y_test_0 = df0_test['disease']

report_dict_0 = evaluate_model_rf(mirna_features_intermediate_train_0, mirna_features_intermediate_test_0, y_train_0, y_test_0)

              precision    recall  f1-score   support

           0       0.75      0.80      0.77       208
           1       0.29      0.25      0.27        60
           2       0.39      0.30      0.34        23

    accuracy                           0.65       291
   macro avg       0.48      0.45      0.46       291
weighted avg       0.63      0.65      0.64       291



# Fold 1 - Results

In [13]:
df1_train = pd.read_csv('train_fold_1.csv')
df1_test = pd.read_csv('test_fold_1.csv')

df_mirna_preprocessed_train_1 = process_dataframe(df1_train)
df_mirna_preprocessed_test_1 = process_dataframe(df1_test)


mirna_features_intermediate_train_1, mirna_features_intermediate_test_1 = generate_intermediate_feature(df_mirna_preprocessed_train_1, df_mirna_preprocessed_test_1)

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 127ms/step - loss: 2387.2705 - val_loss: 2471.3718
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 2650.2141 - val_loss: 2463.4463
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 2471.1724 - val_loss: 2457.8877
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - loss: 2155.1631 - val_loss: 2453.6621
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - loss: 2361.4937 - val_loss: 2450.4519
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step - loss: 2771.0742 - val_loss: 2448.1145
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step - loss: 1965.7628 - val_loss: 2446.4697
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 135ms/step - loss: 2440.0503 - val_loss: 2445.3213
Epoc

In [14]:
y_train_1 = df1_train['disease']
y_test_1 = df1_test['disease']

report_dict_1 = evaluate_model_rf(mirna_features_intermediate_train_1, mirna_features_intermediate_test_1, y_train_1, y_test_1)

              precision    recall  f1-score   support

           0       0.74      0.77      0.75       208
           1       0.19      0.17      0.18        60
           2       0.72      0.57      0.63        23

    accuracy                           0.63       291
   macro avg       0.55      0.50      0.52       291
weighted avg       0.62      0.63      0.63       291



# Fold 2 - Results

In [15]:
df2_train = pd.read_csv('train_fold_2.csv')
df2_test = pd.read_csv('test_fold_2.csv')

df_mirna_preprocessed_train_2 = process_dataframe(df2_train)
df_mirna_preprocessed_test_2 = process_dataframe(df2_test)


mirna_features_intermediate_train_2, mirna_features_intermediate_test_2 = generate_intermediate_feature(df_mirna_preprocessed_train_2, df_mirna_preprocessed_test_2)

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 125ms/step - loss: 2082.6584 - val_loss: 42.5303
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 2197.5496 - val_loss: 34.5478
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step - loss: 1903.5693 - val_loss: 28.9814
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - loss: 2027.7169 - val_loss: 24.7533
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step - loss: 2256.3389 - val_loss: 21.5405
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 2214.5366 - val_loss: 19.2057
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - loss: 1600.6093 - val_loss: 17.5651
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - loss: 1859.3234 - val_loss: 16.4180
Epoch 9/100
[1m5/5[0m

In [16]:
y_train_2 = df2_train['disease']
y_test_2 = df2_test['disease']

report_dict_2 = evaluate_model_rf(mirna_features_intermediate_train_2, mirna_features_intermediate_test_2, y_train_2, y_test_2)

              precision    recall  f1-score   support

           0       1.00      0.81      0.89       190
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.81       190
   macro avg       0.33      0.27      0.30       190
weighted avg       1.00      0.81      0.89       190



# Fold 3 - Results

In [17]:
df3_train = pd.read_csv('train_fold_3.csv')
df3_test = pd.read_csv('test_fold_3.csv')

df_mirna_preprocessed_train_3 = process_dataframe(df3_train)
df_mirna_preprocessed_test_3 = process_dataframe(df3_test)


mirna_features_intermediate_train_3, mirna_features_intermediate_test_3 = generate_intermediate_feature(df_mirna_preprocessed_train_3, df_mirna_preprocessed_test_3)

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 124ms/step - loss: 2296.5437 - val_loss: 2711.4924
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 2023.7222 - val_loss: 2703.6882
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - loss: 2049.7219 - val_loss: 2698.1648
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step - loss: 1990.4137 - val_loss: 2693.9526
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - loss: 2211.2859 - val_loss: 2690.7627
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 136ms/step - loss: 1976.2162 - val_loss: 2688.4478
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - loss: 2481.9282 - val_loss: 2686.8208
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 132ms/step - loss: 2056.8779 - val_loss: 2685.6777
Epoc

In [18]:
y_train_3 = df3_train['disease']
y_test_3 = df3_test['disease']

report_dict_3 = evaluate_model_rf(mirna_features_intermediate_train_3, mirna_features_intermediate_test_3, y_train_3, y_test_3)

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       207
           1       0.22      0.18      0.20        60
           2       0.75      0.52      0.62        23

    accuracy                           0.66       290
   macro avg       0.57      0.50      0.53       290
weighted avg       0.64      0.66      0.64       290



# Fold 4 - Results

In [19]:
df4_train = pd.read_csv('train_fold_4.csv')
df4_test = pd.read_csv('test_fold_4.csv')

df_mirna_preprocessed_train_4 = process_dataframe(df4_train)
df_mirna_preprocessed_test_4 = process_dataframe(df4_test)


mirna_features_intermediate_train_4, mirna_features_intermediate_test_4 = generate_intermediate_feature(df_mirna_preprocessed_train_4, df_mirna_preprocessed_test_4)


Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 127ms/step - loss: 2489.2410 - val_loss: 1554.4005
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 2206.0264 - val_loss: 1546.3662
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - loss: 2166.4685 - val_loss: 1540.8131
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - loss: 2641.8350 - val_loss: 1536.6217
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - loss: 2568.0898 - val_loss: 1533.4307
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - loss: 2623.6538 - val_loss: 1531.1072
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 2765.2224 - val_loss: 1529.4764
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - loss: 2477.7583 - val_loss: 1528.3384
Epoch 9

In [20]:
y_train_4 = df4_train['disease']
y_test_4 = df4_test['disease']

report_dict_4 = evaluate_model_rf(mirna_features_intermediate_train_4, mirna_features_intermediate_test_4, y_train_4, y_test_4)

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       207
           1       0.32      0.23      0.27        60
           2       0.67      0.35      0.46        23

    accuracy                           0.68       290
   macro avg       0.58      0.48      0.51       290
weighted avg       0.66      0.68      0.66       290



In [21]:
print(" Results for MLP Classifier for each fold")
report_MLP_dict_0 = evaluate_model_mlp(mirna_features_intermediate_train_0, mirna_features_intermediate_test_0, y_train_0, y_test_0)
report_MLP_dict_1 = evaluate_model_mlp(mirna_features_intermediate_train_1, mirna_features_intermediate_test_1, y_train_1, y_test_1)
report_MLP_dict_2 = evaluate_model_mlp(mirna_features_intermediate_train_2, mirna_features_intermediate_test_2, y_train_2, y_test_2)
report_MLP_dict_3 = evaluate_model_mlp(mirna_features_intermediate_train_3, mirna_features_intermediate_test_3, y_train_3, y_test_3)
report_MLP_dict_4 = evaluate_model_mlp(mirna_features_intermediate_train_4, mirna_features_intermediate_test_4, y_train_4, y_test_4)






 Results for MLP Classifier for each fold
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       208
           1       0.00      0.00      0.00        60
           2       0.00      0.00      0.00        23

    accuracy                           0.71       291
   macro avg       0.24      0.33      0.28       291
weighted avg       0.51      0.71      0.60       291

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       208
           1       0.00      0.00      0.00        60
           2       1.00      0.48      0.65        23

    accuracy                           0.75       291
   macro avg       0.58      0.49      0.50       291
weighted avg       0.61      0.75      0.66       291

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       190

    accuracy                           1.00       190
   macro avg       1.00      1.0

# Average Results

In [22]:
print("\n \nResults Random Forest Classifier")
print("---------------------------------------- ")
print_average_metrics(report_dict_0, report_dict_1, report_dict_2, report_dict_3, report_dict_4)


 
Results Random Forest Classifier
---------------------------------------- 
Average Precision Score: 0.5
Average Recall Score: 0.44
Average F1 Score: 0.46


In [23]:
print("\n \n  Average Results MLP  Classifier ")
print("---------------------------------------- ")
print_average_metrics(report_MLP_dict_0, report_MLP_dict_1, report_MLP_dict_2, report_MLP_dict_3, report_MLP_dict_4)


 
  Average Results MLP  Classifier 
---------------------------------------- 
Average Precision Score: 0.45
Average Recall Score: 0.51
Average F1 Score: 0.39
