In [None]:
# Mount Google Drive inside the Colab runtime so the experiment folder is
# reachable from the notebook's file system.
from google.colab import drive
drive.mount('/content/drive')

# Make the experiment folder the working directory: the CSV files read in
# later cells are referenced by bare file name, relative to this folder.
import os
os.chdir('/content/drive/My Drive/Big data/Fold_esperimenti')

!pwd

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used in later cells are hidden as well.
import warnings
warnings.filterwarnings("ignore")

Mounted at /content/drive
/content/drive/My Drive/Big data/Fold_esperimenti


## Preprocessing dataset


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def process_dataframe(df: pd.DataFrame, threshold: float = 0.05):
  """
  Clean a raw dataset table and return a new, model-ready DataFrame.

  Steps, in order:
    1. Drop the index/metadata columns 'Unnamed: 0', 'country', 'age', 'sex'
       and 'apoe4' (columns that are absent are simply skipped).
    2. Encode the 'disease' label as integers: AD -> 0, NC -> 1, MCI -> 2.
       Values outside that mapping are left unchanged.
    3. Keep only the columns whose fraction of missing values is
       <= `threshold`. The filter applies to every remaining column.
    4. Replace the remaining missing values with 0.

  Parameters
  ----------
  df : pd.DataFrame
      Raw table; must contain a 'disease' column. It is not modified.
  threshold : float, default 0.05
      Maximum tolerated fraction of missing values per column.

  Returns
  -------
  pd.DataFrame
      The cleaned copy of `df`.

  Raises
  ------
  KeyError
      If `df` has no 'disease' column.
  """
  metadata_cols = ['Unnamed: 0', 'country', 'age', 'sex', 'apoe4']
  label_codes = {'AD': 0, 'NC': 1, 'MCI': 2}

  # errors='ignore' lets datasets that lack some metadata columns through;
  # drop() returns a new frame, so the caller's DataFrame is never touched.
  df = df.drop(columns=metadata_cols, errors='ignore')

  # Explicit lookup with fall-through: known labels become ints, anything else
  # (including NaN) is kept as-is. This avoids relying on the implicit dtype
  # downcasting of Series.replace.
  df['disease'] = df['disease'].map(lambda label: label_codes.get(label, label))

  # Compute the missing rate on the frame that is actually being filtered so
  # the boolean mask and the columns share the same index.
  missing_rate = df.isnull().mean()
  df = df.loc[:, missing_rate <= threshold]

  # Non-inplace fill: never mutates a slice of another frame.
  return df.fillna(0)

# AUTO ENCODERS

In [None]:
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

# L2 penalty coefficient passed as kernel_regularizer to every Dense layer of
# the autoencoder defined below.
REG_COEFF = 0.03
# Dropout rate applied after each hidden Dense layer of the autoencoder.
DROPOUT_RATE = 0.6

def generate_intermediate_feature(X_train, X_test,
                                  hidden_dim1=256,
                                  hidden_dim2=128,
                                  epochs=100,
                                  batch_size=256):
    """
    Train a two-level autoencoder (L2-regularised, with dropout) on X_train
    and return the bottleneck encodings of X_train and X_test.

    Inputs are min-max scaled with statistics computed on X_train only. The
    output layer uses a sigmoid, which can only emit values in [0, 1], so
    without scaling the MSE reconstruction loss cannot be driven down.

    Parameters
    ----------
    X_train : array-like, shape (n_train, input_dim)
        Training feature matrix (ndarray or DataFrame of numeric values).
    X_test : array-like, shape (n_test, input_dim)
        Test feature matrix with the same columns as X_train.
    hidden_dim1 : int
        Width of the first hidden layer (mirrored in the decoder).
    hidden_dim2 : int
        Width of the second hidden layer, i.e. the bottleneck.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size used during training.

    Returns
    -------
    features_intermediate_train : np.ndarray, shape (n_train, hidden_dim2)
    features_intermediate_test : np.ndarray, shape (n_test, hidden_dim2)
    """
    # Work on plain float arrays so ndarrays and DataFrames behave the same.
    X_train = np.asarray(X_train, dtype="float32")
    X_test = np.asarray(X_test, dtype="float32")

    # Min-max scaling fitted on the training set only (no test statistics
    # leak into the model). Constant columns get a unit range to avoid a
    # division by zero.
    feature_min = X_train.min(axis=0)
    feature_range = X_train.max(axis=0) - feature_min
    feature_range[feature_range == 0] = 1.0
    X_train = (X_train - feature_min) / feature_range
    X_test = (X_test - feature_min) / feature_range

    input_dim = X_train.shape[1]

    def _regularised_dense(units, activation):
        # Every Dense layer shares the same L2 kernel penalty.
        return Dense(units,
                     activation=activation,
                     kernel_regularizer=regularizers.l2(REG_COEFF))

    # 1. Layer definitions
    input_layer = Input(shape=(input_dim,))

    # Encoder: input -> hidden_dim1 -> hidden_dim2 (bottleneck)
    encoded = _regularised_dense(hidden_dim1, 'relu')(input_layer)
    encoded = Dropout(DROPOUT_RATE)(encoded)
    encoded = _regularised_dense(hidden_dim2, 'relu')(encoded)
    encoded = Dropout(DROPOUT_RATE)(encoded)

    # Decoder: bottleneck -> hidden_dim1 -> input_dim
    decoded = _regularised_dense(hidden_dim1, 'relu')(encoded)
    decoded = Dropout(DROPOUT_RATE)(decoded)
    decoded = _regularised_dense(input_dim, 'sigmoid')(decoded)

    # 2. Autoencoder model: reconstruct the (scaled) input from itself
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # 3. Training. X_test is passed only so that val_loss is reported each
    #    epoch; no callback here uses it to select or stop the model.
    autoencoder.fit(X_train, X_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(X_test, X_test))

    # 4. Encoder model sharing the trained weights. Its output is the tensor
    #    after the second Dropout; dropout is inactive in predict(), so this
    #    is the bottleneck activation itself.
    encoder = Model(inputs=input_layer, outputs=encoded)

    # 5. Extract the bottleneck features for both splits
    features_intermediate_train = encoder.predict(X_train)
    features_intermediate_test = encoder.predict(X_test)

    return features_intermediate_train, features_intermediate_test



# Random Forest Classifier


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def evaluate_model(X_train, X_test, y_train, y_test):
  """
  Fit a Random Forest on the training split and report its test performance.

  The text classification report is printed and the same report is returned
  as a dictionary. Classes appear under their encoded values.
  Encoding used by this notebook: 'AD': 0, 'NC': 1, 'MCI': 2.

  Parameters
  ----------
  X_train, X_test : array-like
      Feature matrices for fitting and for evaluation.
  y_train, y_test : array-like
      Class labels matching the rows of X_train / X_test.

  Returns
  -------
  dict
      Output of classification_report(..., output_dict=True).
  """
  # Fixed random_state keeps the forest reproducible across runs.
  rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
  rf_classifier.fit(X_train, y_train)

  y_pred = rf_classifier.predict(X_test)

  print(classification_report(y_test, y_pred))
  return classification_report(y_test, y_pred, output_dict=True)




# MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier
def evaluate_model_mlp(X_train, X_test, y_train, y_test):
  """
  Fit an MLP classifier on the training split and report its test performance.

  Prints the text classification report and returns the same report as a
  dictionary (classification_report with output_dict=True).
  """
  # fit() returns the estimator itself, so training and prediction chain.
  predictions = (
      MLPClassifier(max_iter=100, random_state=42)
      .fit(X_train, y_train)
      .predict(X_test)
  )

  print(classification_report(y_test, predictions))
  return classification_report(y_test, predictions, output_dict=True)


# Preprocessing and Autoencoders

In [None]:
def checks_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    """
    Tell whether two DataFrames expose the same set of column labels.

    Column order and repeated labels are irrelevant: only the set of labels
    is compared.

    Args:
        df1 (pd.DataFrame): first DataFrame
        df2 (pd.DataFrame): second DataFrame

    Returns:
        bool: True when both frames have exactly the same column labels,
        False otherwise.
    """
    # Labels present in exactly one of the two frames; empty means "same".
    only_in_one = set(df1.columns).symmetric_difference(df2.columns)
    return not only_in_one

In [None]:
def remove_columns(df_train, df_test, label_col='disease'):
    """
    Split the label off both frames and keep only the feature columns that
    the two frames have in common.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Frames that each contain `label_col` plus feature columns. Neither
        frame is modified.
    label_col : str, default 'disease'
        Name of the label column to extract from both frames.

    Returns
    -------
    df_train_common, df_test_common : pd.DataFrame
        Feature frames restricted to the shared columns, in the same order.
    y_train, y_test : pd.Series
        The label columns of the two input frames.

    Raises
    ------
    KeyError
        If `label_col` is missing from either frame.
    """
    # Honour the label_col argument instead of a hard-coded column name.
    y_train = df_train[label_col]
    y_test = df_test[label_col]

    features_train = df_train.drop(columns=label_col)
    features_test = df_test.drop(columns=label_col)

    # Columns present in both frames; indexing both frames with the same
    # Index also gives them the same column order.
    shared_cols = features_train.columns.intersection(features_test.columns)

    return features_train[shared_cols], features_test[shared_cols], y_train, y_test


In [None]:
import pandas as pd

def preprocess_data(df_train, df_test):
    """
    Run the full preprocessing pipeline on a train/test pair.

    Each frame is cleaned independently with process_dataframe. The two are
    then aligned on their common feature columns with remove_columns, and
    finally encoded by the autoencoder (generate_intermediate_feature).

    Two booleans are printed along the way: whether the cleaned frames had
    the same columns before alignment, and whether the feature frames have
    the same columns after alignment.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Raw training and test tables.

    Returns
    -------
    features_final_train, features_final_test
        Autoencoder bottleneck features for the two splits.
    y_train, y_test
        Label columns extracted from the cleaned frames.
    """
    cleaned_train = process_dataframe(df_train)
    cleaned_test = process_dataframe(df_test)

    # Cleaning filters columns per frame, so the column sets may differ here.
    print(checks_columns(cleaned_train, cleaned_test))

    aligned_train, aligned_test, y_train, y_test = remove_columns(cleaned_train, cleaned_test)

    # Check the ALIGNED frames (the ones fed to the autoencoder), not the
    # pre-alignment ones again.
    print(checks_columns(aligned_train, aligned_test))

    features_final_train, features_final_test = generate_intermediate_feature(
        aligned_train, aligned_test)

    return features_final_train, features_final_test, y_train, y_test

def evaluate_models(features_train, features_test, y_train, y_test):
    """
    Evaluate both classifiers on the same train/test features.

    Returns a pair of report dictionaries: first the one produced by
    evaluate_model, then the one produced by evaluate_model_mlp.
    """
    split = (features_train, features_test, y_train, y_test)
    # Tuple elements are evaluated left to right: evaluate_model runs first.
    return evaluate_model(*split), evaluate_model_mlp(*split)


# Cross-dataset Results
Train: 'df_84_93_23.csv'
Test: 'df_89.csv'


In [None]:
# Cross-dataset experiment: train and test come from two different CSV files.
# NOTE: despite the `_path` suffix, both variables hold the loaded DataFrames.
train_path = pd.read_csv('df_84_93_23.csv')
test_path = pd.read_csv('df_89.csv')
# Clean, align and autoencode the two frames; also returns the label columns.
features_train, features_test, y_train, y_test = preprocess_data(train_path, test_path)

False
False
Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 124ms/step - loss: 2596.8057 - val_loss: 34.6579
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 2505.9324 - val_loss: 28.1420
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 2346.7766 - val_loss: 22.9202
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 2305.8662 - val_loss: 18.7279
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - loss: 2526.5103 - val_loss: 15.4605
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 2542.4148 - val_loss: 13.0450
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - loss: 2462.7422 - val_loss: 11.3253
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 2419.5518 - val_loss: 10.1109
Epoch 9/100

In [None]:
# Evaluate both classifiers on the encoded cross-dataset features:
# report_dict_0 comes from evaluate_model, report_dict_1 from evaluate_model_mlp.
report_dict_0, report_dict_1 = evaluate_models(features_train, features_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.26      1.00      0.41        21
           2       0.00      0.00      0.00        32

    accuracy                           0.26        81
   macro avg       0.09      0.33      0.14        81
weighted avg       0.07      0.26      0.11        81

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.26      1.00      0.41        21
           2       0.00      0.00      0.00        32

    accuracy                           0.26        81
   macro avg       0.09      0.33      0.14        81
weighted avg       0.07      0.26      0.11        81



# Intra-dataset Results

In [None]:
# Intra-dataset experiment: train and test are two halves of the same CSV.
df = pd.read_csv('df_89.csv')
# 50/50 split with a fixed seed so the partition is reproducible.
train_3, test_3 = train_test_split(df, test_size=0.5, random_state=42)
# Same pipeline as the cross-dataset run, applied to the two halves.
features_train_3, features_test_3, y_train_3, y_test_3 = preprocess_data(train_3, test_3)
report_dict_3_0, report_dict_3_1 = evaluate_models(features_train_3, features_test_3, y_train_3, y_test_3)

True
True
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 42.9588 - val_loss: 40.9786
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - loss: 40.9913 - val_loss: 39.1318
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - loss: 39.1151 - val_loss: 37.3497
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - loss: 37.3209 - val_loss: 35.6243
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step - loss: 35.5900 - val_loss: 33.9541
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step - loss: 33.9335 - val_loss: 32.3367
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 32.3195 - val_loss: 30.7700
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - loss: 30.7764 - val_loss: 29.2530
Epoch 9/100
[1m1/1[0m 