In [14]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
df = pd.read_csv("data/alzheimers_prediction_dataset.csv")
print("Loaded partial dataset with shape:", df.shape)
print(df.head())
TARGET_COL = "Alzheimer’s Diagnosis"

Loaded partial dataset with shape: (74283, 25)
        Country  Age  Gender  Education Level   BMI Physical Activity Level  \
0         Spain   90    Male                1  33.0                  Medium   
1     Argentina   72    Male                7  29.9                  Medium   
2  South Africa   86  Female               19  22.9                    High   
3         China   53    Male               17  31.2                     Low   
4        Sweden   58  Female                3  30.0                    High   

  Smoking Status Alcohol Consumption Diabetes Hypertension  ...  \
0          Never        Occasionally       No           No  ...   
1         Former               Never       No           No  ...   
2        Current        Occasionally       No          Yes  ...   
3          Never           Regularly      Yes           No  ...   
4         Former               Never      Yes           No  ...   

  Dietary Habits Air Pollution Exposure  Employment Status Marital Status  

In [16]:
# 2) EDA Prints
print("\n===== Basic Info =====")
print(df.info())

print("\n===== Value counts of Target =====")
print(df[TARGET_COL].value_counts())

# Check numeric summary
print("\n===== Numeric Describe =====")
print(df.describe())

# 3) Train/Test Split + Oversample
# Drop 'Country'
if "Country" in df.columns:
    df = df.drop(columns=["Country"])

X = df.drop(columns=[TARGET_COL], errors="ignore")
y = df[TARGET_COL]

print("\nShapes BEFORE train/test split:")
print("X shape:", X.shape, "y shape:", y.shape)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nShapes AFTER train/test split:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test: ", X_test.shape,  "y_test: ", y_test.shape)

# Use RandomOverSampler on the training set
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# Reset index after oversampling
X_train_res = X_train_res.reset_index(drop=True)
y_train_res = y_train_res.reset_index(drop=True)

print("\nShapes AFTER Oversampling (train only):")
print("X_train_res:", X_train_res.shape, "y_train_res:", y_train_res.shape)


===== Basic Info =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               74283 non-null  object 
 1   Age                                   74283 non-null  int64  
 2   Gender                                74283 non-null  object 
 3   Education Level                       74283 non-null  int64  
 4   BMI                                   74283 non-null  float64
 5   Physical Activity Level               74283 non-null  object 
 6   Smoking Status                        74283 non-null  object 
 7   Alcohol Consumption                   74283 non-null  object 
 8   Diabetes                              74283 non-null  object 
 9   Hypertension                          74283 non-null  object 
 10  Cholesterol Level                     74283 non-null  obje

In [17]:
# 4) Scaling & One-Hot Encoding
numeric_cols = ["Age", "Education Level", "BMI", "Cognitive Test Score"]
categorical_cols = list(set(X_train_res.columns) - set(numeric_cols))

for col in numeric_cols:
    if col not in X_train_res.columns:
        print(f"Warning: numeric column '{col}' not found in X_train_res columns")

# Scale numeric columns
scaler = StandardScaler()

# Fit on train and transform
X_train_res[numeric_cols] = scaler.fit_transform(X_train_res[numeric_cols])
# Transform test
if set(numeric_cols).issubset(X_test.columns):
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# One-Hot encode categorical columns
print("\n===== One-Hot Encoding =====")
print("Categorical columns:", categorical_cols)
X_train_res = pd.get_dummies(X_train_res, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

# Ensure X_test has same columns as X_train_res
X_test = X_test.reindex(columns=X_train_res.columns, fill_value=0)

# Convert target to binary 0/1
y_train_res_bin = (y_train_res == "Yes").astype(int)
y_test_bin = (y_test == "Yes").astype(int)

print("\nFinal training data shape:", X_train_res.shape, y_train_res_bin.shape)
print("Final testing data shape: ", X_test.shape,  y_test_bin.shape)

# Print columns for debug
print("\nX_train_res columns:\n", X_train_res.columns)
print("Number of columns = ", len(X_train_res.columns))


===== One-Hot Encoding =====
Categorical columns: ['Cholesterol Level', 'Smoking Status', 'Dietary Habits', 'Employment Status', 'Alcohol Consumption', 'Stress Levels', 'Physical Activity Level', 'Urban vs Rural Living', 'Marital Status', 'Depression Level', 'Diabetes', 'Sleep Quality', 'Income Level', 'Family History of Alzheimer’s', 'Social Engagement Level', 'Hypertension', 'Air Pollution Exposure', 'Gender', 'Genetic Risk Factor (APOE-ε4 allele)']

Final training data shape: (69712, 35) (69712,)
Final testing data shape:  (14857, 35) (14857,)

X_train_res columns:
 Index(['Age', 'Education Level', 'BMI', 'Cognitive Test Score',
       'Cholesterol Level_Normal', 'Smoking Status_Former',
       'Smoking Status_Never', 'Dietary Habits_Healthy',
       'Dietary Habits_Unhealthy', 'Employment Status_Retired',
       'Employment Status_Unemployed', 'Alcohol Consumption_Occasionally',
       'Alcohol Consumption_Regularly', 'Stress Levels_Low',
       'Stress Levels_Medium', 'Physical A

In [18]:
# 5) PyTorch Dataset & DataLoader
# Convert all columns to float32
print("\nConverting DataFrames to float32 ...")
X_train_res = X_train_res.astype("float32")
X_test = X_test.astype("float32")
print("Conversion done.")

# Custom Dataset
class BinaryDataset(Dataset):
    """Custom PyTorch Dataset for binary classification."""
    def __init__(self, data_df, labels):
        # Convert DataFrame -> torch tensors
        self.data = torch.tensor(data_df.values, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create dataset
print("Creating train_dataset ...")
train_dataset = BinaryDataset(X_train_res, y_train_res_bin)
print("Creating test_dataset ...")
test_dataset  = BinaryDataset(X_test,      y_test_bin)

print("Train dataset length:", len(train_dataset))
print("Test dataset length:", len(test_dataset))

# Iterate a few samples
def manual_iteration_check(ds, n=5):
    print(f"\nChecking first {n} samples in dataset ...")
    for i in range(n):
        features, label = ds[i]
        print(f"Index={i}, features shape={features.shape}, label={label}")

manual_iteration_check(train_dataset, n=5)

# DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)


Converting DataFrames to float32 ...
Conversion done.
Creating train_dataset ...
Creating test_dataset ...
Train dataset length: 69712
Test dataset length: 14857

Checking first 5 samples in dataset ...
Index=0, features shape=torch.Size([35]), label=0.0
Index=1, features shape=torch.Size([35]), label=1.0
Index=2, features shape=torch.Size([35]), label=0.0
Index=3, features shape=torch.Size([35]), label=1.0
Index=4, features shape=torch.Size([35]), label=0.0


In [19]:
# 6) Define a Simple MLP
class SimpleMLP(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 16)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(16, 8)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(8, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleMLP(input_size=X_train_res.shape[1]).to(device)
print("\nModel Architecture:")
print(model)


Model Architecture:
SimpleMLP(
  (fc1): Linear(in_features=35, out_features=16, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=16, out_features=8, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=8, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [20]:
# 7) Training Loop
def train_model(model, train_loader, epochs=5, lr=0.001):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCELoss()

    for epoch in range(epochs):
        running_loss = 0.0
        for i, (xb, yb) in enumerate(train_loader):
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            out = model(xb).squeeze()
            loss = loss_fn(out, yb)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        avg_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}")

In [21]:
# 8) Evaluation
def evaluate_model(model, X_data, y_data, threshold=0.5):
    model.eval()
    inputs = torch.tensor(X_data.values, dtype=torch.float32).to(device)
    labels = torch.tensor(y_data.values, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(inputs).squeeze()
        preds = (outputs >= threshold).float()

    # Move back to CPU for scoring
    preds_np = preds.cpu().numpy()
    labels_np = labels.cpu().numpy()
    acc = accuracy_score(labels_np, preds_np)
    prec = precision_score(labels_np, preds_np, zero_division=0)
    rec = recall_score(labels_np, preds_np, zero_division=0)
    f1 = f1_score(labels_np, preds_np, zero_division=0)
    auc = roc_auc_score(labels_np, outputs.cpu().numpy())

    print("\n--- Evaluation Results ---")
    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")
    print(f"AUC:       {auc:.3f}")


In [22]:
# Train & Evaluate
print("\n===== Starting Training =====")
train_model(model, train_loader, epochs=100, lr=1e-3)


===== Starting Training =====
Epoch [1/100] - Loss: 0.5767
Epoch [2/100] - Loss: 0.5602
Epoch [3/100] - Loss: 0.5583
Epoch [4/100] - Loss: 0.5579
Epoch [5/100] - Loss: 0.5568
Epoch [6/100] - Loss: 0.5560
Epoch [7/100] - Loss: 0.5557
Epoch [8/100] - Loss: 0.5554
Epoch [9/100] - Loss: 0.5545
Epoch [10/100] - Loss: 0.5537
Epoch [11/100] - Loss: 0.5530
Epoch [12/100] - Loss: 0.5525
Epoch [13/100] - Loss: 0.5516
Epoch [14/100] - Loss: 0.5510
Epoch [15/100] - Loss: 0.5506
Epoch [16/100] - Loss: 0.5501
Epoch [17/100] - Loss: 0.5498
Epoch [18/100] - Loss: 0.5496
Epoch [19/100] - Loss: 0.5496
Epoch [20/100] - Loss: 0.5492
Epoch [21/100] - Loss: 0.5491
Epoch [22/100] - Loss: 0.5488
Epoch [23/100] - Loss: 0.5487
Epoch [24/100] - Loss: 0.5485
Epoch [25/100] - Loss: 0.5485
Epoch [26/100] - Loss: 0.5483
Epoch [27/100] - Loss: 0.5479
Epoch [28/100] - Loss: 0.5482
Epoch [29/100] - Loss: 0.5476
Epoch [30/100] - Loss: 0.5478
Epoch [31/100] - Loss: 0.5475
Epoch [32/100] - Loss: 0.5474
Epoch [33/100] - L

In [23]:
print("\n===== Evaluating on Test Set =====")
evaluate_model(model, X_test, y_test_bin, threshold=0.5)


===== Evaluating on Test Set =====

--- Evaluation Results ---
Accuracy:  0.718
Precision: 0.627
Recall:    0.783
F1 Score:  0.697
AUC:       0.789
