In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import ttest_rel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model, Sequential

In [14]:
df = pd.read_csv(r"C:\Users\Amr essam\Downloads\archive (1)\cardio_train.csv", sep=";")
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
num_cols = ['age','height','weight','ap_hi','ap_lo']
cat_cols = ['gender','cholesterol','gluc','smoke','alco','active']
target_col = 'cardio'

In [4]:
X_num = df[num_cols].fillna(df[num_cols].median())
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

In [6]:
X_cat = df[cat_cols].astype(str)
encoders = {}
for c in cat_cols:
    le = LabelEncoder()
    X_cat[c] = le.fit_transform(X_cat[c])
    encoders[c] = le

X_all = np.concatenate([X_num, X_cat.values], axis=1)
y = df[target_col].values

In [7]:
ae_input_dim = X_all.shape[1]
ae_input = Input(shape=(ae_input_dim,))
e1 = Dense(8, activation="relu")(ae_input)
b = Dense(4, activation="relu")(e1)
d1 = Dense(8, activation="relu")(b)
ae_output = Dense(ae_input_dim, activation="linear")(d1)
autoencoder = Model(ae_input, ae_output)
encoder = Model(ae_input, b)
autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.fit(X_all, X_all, epochs=20, batch_size=32, verbose=0)
X_encoded = encoder.predict(X_all)

[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 957us/step


In [8]:
X_final = np.concatenate([X_all, X_encoded], axis=1)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)


In [10]:
model = Sequential()
model.add(Dense(256, activation="relu", input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(128, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(32, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(16, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.6338 - loss: 0.6612 - val_accuracy: 0.6947 - val_loss: 0.5816
Epoch 2/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7083 - loss: 0.5816 - val_accuracy: 0.7289 - val_loss: 0.5611
Epoch 3/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7164 - loss: 0.5732 - val_accuracy: 0.7274 - val_loss: 0.5603
Epoch 4/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7184 - loss: 0.5691 - val_accuracy: 0.7293 - val_loss: 0.5548
Epoch 5/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7199 - loss: 0.5670 - val_accuracy: 0.7346 - val_loss: 0.5541
Epoch 6/50
[1m1400/1400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.7189 - loss: 0.5681 - val_accuracy: 0.7295 - val_loss: 0.5557
Epoch 7/50
[1m1

<keras.src.callbacks.history.History at 0x267a2394690>

In [11]:
pred_dnn = model.predict(X_test).flatten()
auc_dnn = roc_auc_score(y_test, pred_dnn)

[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [12]:
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict_proba(X_test)[:,1]
auc_rf = roc_auc_score(y_test, pred_rf)


In [13]:
t_stat, p_val = ttest_rel(pred_dnn, pred_rf)

print("AUC DNN:", auc_dnn)
print("AUC RF:", auc_rf)
print("t-test:", t_stat, p_val)

AUC DNN: 0.801511641177068
AUC RF: 0.7768229563776677
t-test: 13.595685426210943 7.828325674746107e-42
