In [64]:
# Mount Google Drive (Colab-specific) so files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the experiment folder; the CSV files read
# later in the notebook are referenced by bare file name.
# NOTE(review): hard-coded absolute path, only valid for this Drive layout.
import os
os.chdir('/content/drive/My Drive/Big data/Fold_esperimenti')

# Print the current working directory to confirm the chdir took effect.
!pwd

# NOTE(review): this silences every warning for the rest of the session,
# including pandas/sklearn deprecation and zero-division warnings.
import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Big data/Fold_esperimenti


## Preprocessing dataset


In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def process_dataframe(df: pd.DataFrame, threshold: float = 0.05):
  """
  Drop the non-feature columns and keep only the columns with few nulls.

  Parameters:
  -----------
  df : pd.DataFrame
  Input table. It must contain the columns 'Unnamed: 0', 'country',
  'age', 'sex' and 'apoe4' (DataFrame.drop raises KeyError otherwise).
  The input frame itself is not modified.
  threshold : float
  Maximum fraction of null values a column may have to be kept
  (default 0.05, i.e. 5 %).

  Returns:
  ------------
  pd.DataFrame
  New frame holding the retained columns, with remaining nulls set to 0.
  """
  # Drop the listed columns; every other column (label included) is kept
  features = df.drop(['Unnamed: 0', 'country', 'age', 'sex', 'apoe4'], axis=1)

  # Fraction of null values in each remaining column
  missing_rate = features.isnull().mean()

  # Keep only the columns whose missing rate is within the threshold
  features = features.loc[:, missing_rate <= threshold]

  # Substitute nulls with zero. fillna returns a new frame, so nothing is
  # written in place into an object derived from a .loc selection.
  return features.fillna(0)

In [66]:
from sklearn.preprocessing import MinMaxScaler
def preprocess_metadata(df: pd.DataFrame):
  """
  Build a numeric metadata table from the 'age', 'sex' and 'apoe4' columns.

  Parameters:
  -----------
  df : pd.DataFrame
  Input table containing the columns 'age', 'sex' and 'apoe4'.
  It is not modified.

  Returns:
  ------------
  pd.DataFrame
  Three-column frame: 'age' min-max scaled to the range [1, 2],
  'sex' encoded as female=0 / male=1 / anything else=3, and 'apoe4'
  unchanged; every remaining null is replaced by 0.
  """
  columns_to_encode = ['age', 'sex', 'apoe4']

  # Work on an explicit copy: assigning columns on a frame derived from `df`
  # is chained assignment and triggers SettingWithCopyWarning.
  df_meta_data = df[columns_to_encode].copy()

  # NOTE(review): the scaler is fitted on whatever frame is passed in, so
  # train and test are scaled with different min/max when this function is
  # called once per split.
  scaler = MinMaxScaler(feature_range=(1, 2))
  df_meta_data['age'] = scaler.fit_transform(df[['age']])

  # Values other than 'female'/'male' (nulls included) map to NaN, then to 3
  df_meta_data['sex'] = df_meta_data['sex'].map({'female': 0, 'male': 1}).fillna(3)

  # Remaining nulls become 0; fillna returns a new frame
  return df_meta_data.fillna(0)

# AUTO ENCODERS

In [67]:
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

# L2 penalty coefficient applied to the kernel of every Dense layer below
REG_COEFF = 0.03
# Dropout rate used after every hidden Dense layer below
DROPOUT_RATE = 0.6

def generate_intermediate_feature(X_train, X_test,
                                  hidden_dim1=256,
                                  hidden_dim2=128,
                                  epochs=100,
                                  batch_size=3,
                                  output_activation='sigmoid'):

  """
  Builds and trains an autoencoder (input -> hidden_dim1 -> hidden_dim2 ->
  hidden_dim1 -> input) with L2 regularization and dropout, and returns the
  hidden_dim2-dimensional encodings for train and test.

  Parameters:
  -----------
  X_train : array-like
  Training feature matrix, shape (num_samples, input_dim)
  X_test : array-like
  Test feature matrix, shape (num_samples, input_dim). It is also passed
  to fit() as validation data (reconstruction loss monitoring only).
  hidden_dim1 : int
  Size of the first encoder layer and of the decoder hidden layer
  (default 256)
  hidden_dim2 : int
  Size of the bottleneck layer (default 128)
  epochs : int
  Number of epochs to train the autoencoder
  batch_size : int
  Size of the training batch
  output_activation : str
  Activation of the reconstruction layer (default 'sigmoid', the
  original behaviour). A sigmoid output is bounded to (0, 1), so it
  cannot reconstruct inputs outside that range; pass e.g. 'linear'
  when the inputs are not scaled to [0, 1].

  Returns:
  ------------
  features_intermediate_train : np.array
  Encoding of X_train, shape (num_samples_train, hidden_dim2)
  features_intermediate_test : np.array
  Encoding of X_test, shape (num_samples_test, hidden_dim2)
  """

  # Number of input features
  input_dim = X_train.shape[1]

  # 1. Network definition
  input_layer = Input(shape=(input_dim,))

  # Encoder
  encoded = Dense(hidden_dim1,
                  activation='relu',
                  kernel_regularizer=regularizers.l2(REG_COEFF)
                  )(input_layer)
  encoded = Dropout(DROPOUT_RATE)(encoded)

  encoded = Dense(hidden_dim2,
                  activation='relu',
                  kernel_regularizer=regularizers.l2(REG_COEFF)
                  )(encoded)
  encoded = Dropout(DROPOUT_RATE)(encoded)

  # Decoder
  decoded = Dense(hidden_dim1,
                  activation='relu',
                  kernel_regularizer=regularizers.l2(REG_COEFF)
                  )(encoded)
  decoded = Dropout(DROPOUT_RATE)(decoded)

  # Output (reconstruction) layer
  decoded = Dense(input_dim,
                  activation=output_activation,
                  kernel_regularizer=regularizers.l2(REG_COEFF)
                  )(decoded)

  # 2. Autoencoder model construction
  autoencoder = Model(inputs=input_layer, outputs=decoded)
  autoencoder.compile(optimizer='adam', loss='mse')

  # 3. Training: the network learns to reproduce its own input
  autoencoder.fit(X_train, X_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  shuffle=True,
                  validation_data=(X_test, X_test))

  # 4. Encoder model: shares the trained layers up to the bottleneck
  encoder = Model(inputs=input_layer, outputs=encoded)

  # 5. Extraction of intermediate features
  features_intermediate_train = encoder.predict(X_train)
  features_intermediate_test = encoder.predict(X_test)

  return features_intermediate_train, features_intermediate_test



# Random Forest Classifier


In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def evaluate_model(X_train, X_test, y_train, y_test):
  """
  Train a Random Forest on the training split and report test metrics.

  Parameters:
  -----------
  X_train, X_test : array-like
  Feature matrices for the training and the test split.
  y_train, y_test : pd.Series
  Labels for the two splits. 'AD', 'NC' and 'MCI' are encoded as
  0, 1 and 2; any other value is passed through unchanged.

  Returns:
  ------------
  dict
  classification_report(..., output_dict=True) for the test split.
  The text version of the report is printed as a side effect.
  """
  # Encode labels with an explicit element-wise map. Series.replace on an
  # object column relied on implicit downcasting to int, which pandas 2.2
  # deprecates; without it the labels would stay object-typed.
  label_codes = {'AD': 0, 'NC': 1, 'MCI': 2}
  y_train = y_train.map(lambda label: label_codes.get(label, label))
  y_test = y_test.map(lambda label: label_codes.get(label, label))

  # Random Forest Classifier (fixed seed for reproducibility)
  rf_classifier = RandomForestClassifier(n_estimators=31, random_state=42)
  rf_classifier.fit(X_train, y_train)

  y_pred = rf_classifier.predict(X_test)

  print('Results with Random Forest Classifier! \n')
  print(classification_report(y_test, y_pred))

  return classification_report(y_test, y_pred, output_dict=True)




# MLP Classifier

In [69]:
from sklearn.neural_network import MLPClassifier
def evaluate_model_mlp(X_train, X_test, y_train, y_test):
  """
  Train an MLP classifier on the training split and report test metrics.

  Parameters:
  -----------
  X_train, X_test : array-like
  Feature matrices for the training and the test split.
  y_train, y_test : pd.Series
  Labels for the two splits. 'AD', 'NC' and 'MCI' are encoded as
  0, 1 and 2; any other value is passed through unchanged.

  Returns:
  ------------
  dict
  classification_report(..., output_dict=True) for the test split.
  The text version of the report is printed as a side effect.
  """
  # Encode labels with an explicit element-wise map. Series.replace on an
  # object column relied on implicit downcasting to int, which pandas 2.2
  # deprecates; without it the labels would stay object-typed.
  label_codes = {'AD': 0, 'NC': 1, 'MCI': 2}
  y_train = y_train.map(lambda label: label_codes.get(label, label))
  y_test = y_test.map(lambda label: label_codes.get(label, label))

  # MLP classifier (fixed seed for reproducibility)
  mlp_classifier = MLPClassifier(max_iter=100, random_state=42)
  mlp_classifier.fit(X_train, y_train)

  y_pred = mlp_classifier.predict(X_test)

  print('Results with MLPClassifier! \n')

  print(classification_report(y_test, y_pred))
  return classification_report(y_test, y_pred, output_dict=True)


# Preprocessing and Autoencoders

In [70]:
def checks_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    """
    Check whether two DataFrames have the same columns, regardless of order.

    Parameters:
        df1 (pd.DataFrame): first DataFrame
        df2 (pd.DataFrame): second DataFrame

    Returns:
        bool: True if both DataFrames have the same set of column names,
        False otherwise.
    """
    names_first = set(df1.columns)
    names_second = set(df2.columns)
    # The sets are equal exactly when no name belongs to only one of them
    return not (names_first ^ names_second)

In [71]:
def remove_columns(df_train, df_test, label_col='disease'):
    """
    Split off the label column and keep only the features shared by both frames.

    Parameters:
        df_train (pd.DataFrame): training table, must contain `label_col`
        df_test (pd.DataFrame): test table, must contain `label_col`
        label_col (str): name of the label column (default 'disease')

    Returns:
        tuple: (df_train_common, df_test_common, y_train, y_test), where the
        two frames hold the columns present in both inputs (in the order
        they appear in df_train) and the two Series hold the labels.
    """
    # Honour the label_col argument instead of a hard-coded 'disease'
    y_train = df_train[label_col]
    y_test = df_test[label_col]

    df_train = df_train.drop(label_col, axis=1)
    df_test = df_test.drop(label_col, axis=1)

    # Find the columns common to df_train and df_test
    common_columns = df_train.columns.intersection(df_test.columns)

    # Keep only the common columns, in the same order for both frames
    df_train_common = df_train[common_columns]
    df_test_common = df_test[common_columns]

    return df_train_common, df_test_common, y_train, y_test

In [72]:
import pandas as pd

def preprocess_data(df_train, df_test):
    """
    Turn raw train/test tables into final feature matrices and label Series.

    Steps: filter the non-metadata columns with process_dataframe, align the
    two splits on their common columns with remove_columns, encode them with
    the autoencoder from generate_intermediate_feature, and concatenate the
    preprocessed metadata with those encodings.

    Parameters:
        df_train (pd.DataFrame): raw training table
        df_test (pd.DataFrame): raw test table

    Returns:
        tuple: (features_final_train, features_final_test, y_train, y_test)
    """
    # Diagnostic: do the raw splits expose the same columns?
    print(checks_columns(df_train, df_test))

    features_mirna_train = process_dataframe(df_train)
    features_mirna_test = process_dataframe(df_test)

    # Diagnostic: the missing-rate filter runs per split, so the surviving
    # columns can differ; remove_columns realigns them below.
    print(checks_columns(features_mirna_train, features_mirna_test))

    features_mirna_train, features_mirna_test, y_train, y_test = remove_columns(
        features_mirna_train, features_mirna_test)

    intermediate_features_mirna_train, intermediate_features_mirna_test = generate_intermediate_feature(
        features_mirna_train, features_mirna_test)

    features_meta_train = preprocess_metadata(df_train)
    features_meta_test = preprocess_metadata(df_test)

    # NOTE(review): a second autoencoder used to be trained here on the
    # metadata (hidden sizes 15, 15), but its encodings were never used: the
    # final matrices have always been built from the preprocessed metadata
    # itself. The dead training run was removed; the returned features are
    # built from the same inputs as before. Confirm whether the encoded
    # metadata was meant to be used instead.
    features_final_train = concatenate_features(features_meta_train, intermediate_features_mirna_train)
    features_final_test = concatenate_features(features_meta_test, intermediate_features_mirna_test)

    return features_final_train, features_final_test, y_train, y_test

def evaluate_models(features_train, features_test, y_train, y_test):
    """
    Run both classifiers on the same split and return their reports.

    Returns:
        tuple: (report from evaluate_model, report from evaluate_model_mlp),
        evaluated in that order.
    """
    split = (features_train, features_test, y_train, y_test)
    forest_report = evaluate_model(*split)
    mlp_report = evaluate_model_mlp(*split)
    return forest_report, mlp_report


In [73]:
def concatenate_features(features1, features2):
    """Join two feature matrices side by side (column-wise, along axis 1)."""
    blocks = [features1, features2]
    combined = np.concatenate(blocks, axis=1)
    return combined

# Cross-dataset Result
- Train: 'df_84_93_23.csv'
- Test: 'df_89.csv'


In [74]:
# Cross-dataset setting: train and test come from two different CSV files.
# NOTE(review): despite their names, train_path / test_path hold the loaded
# DataFrames, not file paths.
train_path = pd.read_csv('df_84_93_23.csv')
test_path = pd.read_csv('df_89.csv')
# Prints the two column-consistency checks and the autoencoder training log.
features_train, features_test, y_train, y_test = preprocess_data(train_path, test_path)

True
False
Epoch 1/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - loss: 2859.8679 - val_loss: 5.1700
Epoch 2/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - loss: 2239.0596 - val_loss: 5.2662
Epoch 3/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 25ms/step - loss: 3190.2937 - val_loss: 5.4546
Epoch 4/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - loss: 3356.4944 - val_loss: 5.5658
Epoch 5/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 26ms/step - loss: 2194.6609 - val_loss: 5.6746
Epoch 6/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - loss: 2540.4829 - val_loss: 5.9049
Epoch 7/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - loss: 2207.5889 - val_loss: 5.9963
Epoch 8/100
[1m419/419[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - loss: 2623.7241 - 

In [75]:
# report_dict_0: Random Forest report, report_dict_1: MLP report (cross-dataset split).
report_dict_0, report_dict_1 = evaluate_models(features_train, features_test, y_train, y_test)

Results with Random Forest Classifier! 

              precision    recall  f1-score   support

           0       0.35      1.00      0.51        28
           1       0.00      0.00      0.00        21
           2       0.00      0.00      0.00        32

    accuracy                           0.35        81
   macro avg       0.12      0.33      0.17        81
weighted avg       0.12      0.35      0.18        81

Results with MLPClassifier! 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.26      1.00      0.41        21
           2       0.00      0.00      0.00        32

    accuracy                           0.26        81
   macro avg       0.09      0.33      0.14        81
weighted avg       0.07      0.26      0.11        81



# Intra-dataset Results

In [76]:
# Intra-dataset setting: train and test are two halves of the same CSV file.
df = pd.read_csv('df_89.csv')
# 50/50 split with a fixed seed; no `stratify` argument is passed, so class
# proportions are not forced to match between the two halves.
train_3, test_3 = train_test_split(df, test_size=0.5, random_state=42)
features_train_3, features_test_3, y_train_3, y_test_3 = preprocess_data(train_3, test_3)
# report_dict_3_0: Random Forest report, report_dict_3_1: MLP report.
report_dict_3_0, report_dict_3_1 = evaluate_models(features_train_3, features_test_3, y_train_3, y_test_3)

True
True
Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - loss: 37.3326 - val_loss: 22.4408
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - loss: 19.4651 - val_loss: 12.3076
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 11.0565 - val_loss: 7.9779
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 7.4292 - val_loss: 6.1213
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 5.8368 - val_loss: 5.1830
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 5.0212 - val_loss: 4.5915
Epoch 7/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 4.5082 - val_loss: 4.2114
Epoch 8/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 4.1570 - val_loss: 3.9676
Epoch 9/100
[1m14/14[0m