In [1]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout
from imblearn.over_sampling import SMOTE

In [2]:
# 1. Obtener un set de datos.
df = pd.read_csv("alzheimers_disease_data.csv")

# 2. Buscar valores vacios
print("Missing values:")
print(df.isna().sum())

# 3. Eliminar columnas que no agrega valor al entrenamiento
df = df.drop('PatientID', axis=1)
df = df.drop('DoctorInCharge', axis=1)

print("\nDistribución original de clases:")
print(df['Diagnosis'].value_counts(normalize=True) * 100)

Missing values:
PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTas

In [3]:
# Guardar las columnas por normalizar
numerical_cols = ["Age", "BMI", "AlcoholConsumption", "PhysicalActivity", "DietQuality", "SleepQuality", "SystolicBP", "DiastolicBP", "CholesterolTotal", "CholesterolLDL", "CholesterolHDL", "CholesterolTriglycerides", "MMSE", "FunctionalAssessment", "ADL"]

# Guardar las columnas numéricas
categorical_cols = ["Gender", "Ethnicity", "EducationLevel", "Smoking", "FamilyHistoryAlzheimers", "CardiovascularDisease", "Diabetes", "Depression", "HeadInjury", "Hypertension", "MemoryComplaints", "BehavioralProblems", "Confusion", "Disorientation", "PersonalityChanges", "DifficultyCompletingTasks", "Forgetfulness"]

In [4]:
# 4. Separar features y target
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

In [5]:
# 5. Dividir en train y test con un split 80 - 20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# 6. Aplicar las técnicas de escalamiento y transformación de datos
# Transformar datos categóricos mediante One Hot Encoder
encoder = OneHotEncoder(sparse_output=False)
X_train_cat_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_cat_encoded = encoder.transform(X_test[categorical_cols])

# Escalamiento de datos numéricos mediante StandarScaler
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_num_scaled = scaler.transform(X_test[numerical_cols])

In [7]:
# 6. Aplicar las técnicas de escalamiento y transformación de datos
# Transformar datos categóricos mediante One Hot Encoder
encoder = OneHotEncoder(sparse_output=False)
X_train_cat_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_cat_encoded = encoder.transform(X_test[categorical_cols])

# Escalamiento de datos numéricos mediante StandarScaler
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_num_scaled = scaler.transform(X_test[numerical_cols])

In [8]:
# 7. Obtener nombres de features
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Crear DataFrames para datos categóricos transformados
X_train_cat_df = pd.DataFrame(X_train_cat_encoded, columns=encoded_feature_names)
X_test_cat_df = pd.DataFrame(X_test_cat_encoded, columns=encoded_feature_names)

# Crear DataFrames para datos numéricos transformados
X_train_num_df = pd.DataFrame(X_train_num_scaled, columns=numerical_cols)
X_test_num_df = pd.DataFrame(X_test_num_scaled, columns=numerical_cols)

In [9]:
# 8. Combinar datos categóricos y numéricos
X_train_processed = pd.concat([X_train_num_df, X_train_cat_df], axis=1)
X_test_processed = pd.concat([X_test_num_df, X_test_cat_df], axis=1)

In [10]:
# 9. Transformar target a valor numerico mediante LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [11]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train_encoded)

In [12]:
def prepare_data_for_gru(X):
    return X.values.reshape(X.shape[0], X.shape[1], 1)

X_train_gru = prepare_data_for_gru(X_train_resampled)
X_test_gru = prepare_data_for_gru(X_test_processed)

def l2_svm_loss(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    y_true = 2.0 * y_true - 1.0
    
    return tf.reduce_mean(tf.square(tf.maximum(0., 1. - y_true * y_pred)))

In [13]:
model = Sequential()
model.add(GRU(64, input_shape=(X_train_resampled.shape[1], 1), return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear')) 

  super().__init__(**kwargs)


In [14]:
model.summary()

In [15]:
model.compile(
    optimizer="adam", 
    loss=l2_svm_loss,
    metrics=['accuracy']
)

In [16]:
history = model.fit(
    X_train_gru,
    y_train_resampled,
    epochs=200,
    batch_size=32,
    validation_split=0.15,
    verbose=1
)

Epoch 1/200
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.5722 - loss: 0.9831 - val_accuracy: 0.0000e+00 - val_loss: 1.4666
Epoch 2/200
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5867 - loss: 0.9715 - val_accuracy: 0.0000e+00 - val_loss: 1.4070
Epoch 3/200
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5805 - loss: 0.9764 - val_accuracy: 0.0000e+00 - val_loss: 1.4686
Epoch 4/200
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6016 - loss: 0.9581 - val_accuracy: 0.0000e+00 - val_loss: 1.3365
Epoch 5/200
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6035 - loss: 0.9613 - val_accuracy: 0.0000e+00 - val_loss: 1.3022
Epoch 6/200
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5761 - loss: 0.9788 - val_accuracy: 0.0000e+00 - val_loss: 1.5231
Epoc

In [17]:
loss, accuracy = model.evaluate(X_test_gru, y_test_encoded)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7912 - loss: 0.8541 
Test Loss: 0.8833
Test Accuracy: 0.7814


In [18]:
def predict_classes(model, X):
    X_reshaped = prepare_data_for_gru(X)
    raw_predictions = model.predict(X_reshaped)
    predictions = np.sign(raw_predictions)
    return (predictions + 1) / 2

y_pred = predict_classes(model, X_test_processed)

classes_x2 = y_pred.flatten().astype(int)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [19]:
TP = 0
TN = 0
FP = 0
FN = 0

for i in range(len(classes_x2)):
  if classes_x2[i] == 1:
    if y_test_encoded[i] == 1:
      TP = TP + 1
    else :
      FP = FP + 1
  else:
    if y_test_encoded[i] == 0:
      TN = TN + 1
    else :
      FN = FN + 1


print('         ', 'label neg ', ' label pos')
print('pred neg    ', TN, "        ", FN)
print('pred pos    ', FP, "        ", TP)

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2*precision*recall / (precision + recall)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

          label neg   label pos
pred neg     235          48
pred pos     42          105
Precision: 0.7142857142857143
Recall: 0.6862745098039216
F1: 0.7000000000000001
