In [17]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# 1) Read the Dataset
df = pd.read_csv("alzheimers_prediction_dataset.csv")
print("Loaded dataset with shape:", df.shape)
print(df.head())

TARGET_COL = "Alzheimer’s Diagnosis"

print("\n===== Basic Info =====")
print(df.info())

print("\n===== Value counts of Target =====")
print(df[TARGET_COL].value_counts())

print("\n===== Numeric Describe =====")
print(df.describe())

Loaded dataset with shape: (74283, 25)
        Country  Age  Gender  Education Level   BMI Physical Activity Level  \
0         Spain   90    Male                1  33.0                  Medium   
1     Argentina   72    Male                7  29.9                  Medium   
2  South Africa   86  Female               19  22.9                    High   
3         China   53    Male               17  31.2                     Low   
4        Sweden   58  Female                3  30.0                    High   

  Smoking Status Alcohol Consumption Diabetes Hypertension  ...  \
0          Never        Occasionally       No           No  ...   
1         Former               Never       No           No  ...   
2        Current        Occasionally       No          Yes  ...   
3          Never           Regularly      Yes           No  ...   
4         Former               Never      Yes           No  ...   

  Dietary Habits Air Pollution Exposure  Employment Status Marital Status  \
0     

In [19]:
if "Country" in df.columns:
    df = df.drop(columns=["Country"], errors="ignore")

X = df.drop(columns=[TARGET_COL], errors="ignore")
y = df[TARGET_COL]

print("\nShapes BEFORE train/test split:")
print("X shape:", X.shape, "y shape:", y.shape)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("\nShapes AFTER train/test split:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test: ", X_test.shape,  "y_test: ", y_test.shape)

# Oversample
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# Reset indices to avoid out-of-bounds errors
X_train_res = X_train_res.reset_index(drop=True)
y_train_res = y_train_res.reset_index(drop=True)

print("\nShapes AFTER Oversampling (train only):")
print("X_train_res:", X_train_res.shape, "y_train_res:", y_train_res.shape)


Shapes BEFORE train/test split:
X shape: (74283, 23) y shape: (74283,)

Shapes AFTER train/test split:
X_train: (59426, 23) y_train: (59426,)
X_test:  (14857, 23) y_test:  (14857,)

Shapes AFTER Oversampling (train only):
X_train_res: (69712, 23) y_train_res: (69712,)


In [20]:
# 3) Numeric & Categorical Processing
numeric_cols = ["Age", "Education Level", "BMI", "Cognitive Test Score"]
categorical_cols = list(set(X_train_res.columns) - set(numeric_cols))
for col in numeric_cols:
    if col not in X_train_res.columns:
        print(f"Warning: numeric column '{col}' not found in X_train_res.")

# Scale numeric features
scaler = StandardScaler()
X_train_res[numeric_cols] = scaler.fit_transform(X_train_res[numeric_cols])

# Transform test set
if set(numeric_cols).issubset(X_test.columns):
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# One-Hot encode categorical
print("\n===== One-Hot Encoding =====")
print("Categorical columns:", categorical_cols)
X_train_res = pd.get_dummies(X_train_res, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
X_test = X_test.reindex(columns=X_train_res.columns, fill_value=0)

# Convert target to 0/1
y_train_res_bin = (y_train_res == "Yes").astype(int)
y_test_bin = (y_test == "Yes").astype(int)

print("\nFinal training data shape:", X_train_res.shape, y_train_res_bin.shape)
print("Final testing data shape: ", X_test.shape,  y_test_bin.shape)





===== One-Hot Encoding =====
Categorical columns: ['Smoking Status', 'Hypertension', 'Depression Level', 'Diabetes', 'Air Pollution Exposure', 'Stress Levels', 'Physical Activity Level', 'Urban vs Rural Living', 'Marital Status', 'Gender', 'Genetic Risk Factor (APOE-ε4 allele)', 'Income Level', 'Sleep Quality', 'Family History of Alzheimer’s', 'Social Engagement Level', 'Employment Status', 'Cholesterol Level', 'Alcohol Consumption', 'Dietary Habits']

Final training data shape: (69712, 35) (69712,)
Final testing data shape:  (14857, 35) (14857,)


In [21]:
# 4) Convert to float32 & Build Dataset / DataLoader
print("\nConverting DataFrames to float32...")
X_train_res = X_train_res.astype("float32")
X_test = X_test.astype("float32")
print("Conversion done.")

class BinaryDataset(Dataset):
    """Custom PyTorch Dataset for binary classification."""
    def __init__(self, data_df, labels):
        # Convert DataFrame -> torch tensors
        self.data = torch.tensor(data_df.values, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

print("Creating train_dataset...")
train_dataset = BinaryDataset(X_train_res, y_train_res_bin)
print("Creating test_dataset...")
test_dataset  = BinaryDataset(X_test,      y_test_bin)

print("Train dataset length:", len(train_dataset))
print("Test dataset length:", len(test_dataset))

def manual_iteration_check(ds, n=5):
    print(f"\nChecking first {n} samples in dataset ...")
    for i in range(n):
        features, label = ds[i]
        print(f"Index={i}, features shape={features.shape}, label={label}")

manual_iteration_check(train_dataset, n=5)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)




Converting DataFrames to float32...
Conversion done.
Creating train_dataset...
Creating test_dataset...
Train dataset length: 69712
Test dataset length: 14857

Checking first 5 samples in dataset ...
Index=0, features shape=torch.Size([35]), label=0.0
Index=1, features shape=torch.Size([35]), label=1.0
Index=2, features shape=torch.Size([35]), label=0.0
Index=3, features shape=torch.Size([35]), label=1.0
Index=4, features shape=torch.Size([35]), label=0.0


In [22]:
# 5) Define a Large MLP with BatchNorm
class BigBinaryMLP(nn.Module):
    """
    Larger feed-forward network: 512 -> 256 -> 128 hidden layers,
    each followed by BatchNorm + ReLU,
    final Sigmoid for binary classification.
    """
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.relu1 = nn.ReLU()

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU()

        self.fc4 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)

        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)

        x = self.fc4(x)
        x = self.sigmoid(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BigBinaryMLP(input_size=X_train_res.shape[1]).to(device)
print("\nModel Architecture:")
print(model)


Model Architecture:
BigBinaryMLP(
  (fc1): Linear(in_features=35, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [23]:
# 6) Train Function
def train_model(model, train_loader, epochs=20, lr=0.001, threshold=0.5):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCELoss()

    for epoch in range(epochs):
        running_loss = 0.0
        total_samples = 0
        correct = 0

        for i, (xb, yb) in enumerate(train_loader):
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            out = model(xb).squeeze()
            loss = loss_fn(out, yb)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Accuracy in-batch
            preds = (out >= threshold).float()
            correct += (preds == yb).sum().item()
            total_samples += yb.size(0)

        avg_loss = running_loss / len(train_loader)
        acc = correct / total_samples
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f} - Acc: {acc:.4f}")



In [24]:
# 7) Evaluation Function
def evaluate_model(model, X_data, y_data, threshold=0.5):
    model.eval()
    inputs = torch.tensor(X_data.values, dtype=torch.float32).to(device)
    labels = torch.tensor(y_data.values, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(inputs).squeeze()
        preds = (outputs >= threshold).float()

    # Move back to CPU for scoring
    preds_np = preds.cpu().numpy()
    labels_np = labels.cpu().numpy()

    acc  = accuracy_score(labels_np, preds_np)
    prec = precision_score(labels_np, preds_np, zero_division=0)
    rec  = recall_score(labels_np, preds_np, zero_division=0)
    f1   = f1_score(labels_np, preds_np, zero_division=0)
    auc  = roc_auc_score(labels_np, outputs.cpu().numpy())

    print("\n--- Evaluation Results ---")
    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print(f"AUC:       {auc:.3f}")



In [25]:
# 8) Training & Evaluation
print("\n===== Starting Training with a Large MLP =====")
train_model(model, train_loader, epochs=100, lr=1e-3, threshold=0.5)

print("\n===== Evaluating on Test Set =====")
evaluate_model(model, X_test, y_test_bin, threshold=0.5)


===== Starting Training with a Large MLP =====
Epoch [1/100] - Loss: 0.5658 - Acc: 0.7136
Epoch [2/100] - Loss: 0.5549 - Acc: 0.7199
Epoch [3/100] - Loss: 0.5513 - Acc: 0.7224
Epoch [4/100] - Loss: 0.5478 - Acc: 0.7228
Epoch [5/100] - Loss: 0.5428 - Acc: 0.7255
Epoch [6/100] - Loss: 0.5363 - Acc: 0.7287
Epoch [7/100] - Loss: 0.5277 - Acc: 0.7344
Epoch [8/100] - Loss: 0.5180 - Acc: 0.7413
Epoch [9/100] - Loss: 0.5052 - Acc: 0.7502
Epoch [10/100] - Loss: 0.4884 - Acc: 0.7610
Epoch [11/100] - Loss: 0.4706 - Acc: 0.7722
Epoch [12/100] - Loss: 0.4511 - Acc: 0.7837
Epoch [13/100] - Loss: 0.4333 - Acc: 0.7961
Epoch [14/100] - Loss: 0.4105 - Acc: 0.8100
Epoch [15/100] - Loss: 0.3915 - Acc: 0.8211
Epoch [16/100] - Loss: 0.3730 - Acc: 0.8313
Epoch [17/100] - Loss: 0.3570 - Acc: 0.8399
Epoch [18/100] - Loss: 0.3364 - Acc: 0.8513
Epoch [19/100] - Loss: 0.3209 - Acc: 0.8582
Epoch [20/100] - Loss: 0.3030 - Acc: 0.8686
Epoch [21/100] - Loss: 0.2887 - Acc: 0.8742
Epoch [22/100] - Loss: 0.2747 - Acc: 