In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("~/dip_project/ADNI1_Final_With_Biomarkers.csv")

In [4]:
df.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,...,VISCODE_y.1,HMSCORE,VISCODE_x.2,NPISCORE,VISCODE_y.2,GDTOTAL,VISCODE2,ABETA42,TAU,PTAU
0,I97327,941_S_1311,MCI,M,69,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/02/2007,...,sc,1.0,,,sc,1.0,,,,
1,I112538,941_S_1311,MCI,M,70,m12,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,6/01/2008,...,,,m12,4.0,m12,3.0,,,,
2,I97341,941_S_1311,MCI,M,70,m06,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,9/27/2007,...,,,m06,3.0,,,,,,
3,I63874,941_S_1202,CN,M,78,sc,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,1/30/2007,...,sc,0.0,,,sc,0.0,,,,
4,I75150,941_S_1202,CN,M,78,m06,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,8/24/2007,...,,,m06,2.0,,,,,,


In [5]:
df.columns

Index(['Image Data ID', 'Subject', 'Group', 'Sex', 'Age', 'Visit', 'Modality',
       'Description', 'Type', 'Acq Date', 'Format', 'Downloaded', 'GENOTYPE',
       'VISCODE_x', 'CDGLOBAL', 'CDRSB', 'VISCODE_y', 'MMSCORE', 'VISCODE_x.1',
       'TOTAL11', 'TOTALMOD', 'VISCODE_y.1', 'HMSCORE', 'VISCODE_x.2',
       'NPISCORE', 'VISCODE_y.2', 'GDTOTAL', 'VISCODE2', 'ABETA42', 'TAU',
       'PTAU'],
      dtype='object')

In [10]:
from sklearn.model_selection import train_test_split

train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

train_data, temp_data = train_test_split(df, test_size=(1 - train_ratio), random_state=42, stratify=df["Group"])
val_data, test_data = train_test_split(temp_data, test_size=(test_ratio / (test_ratio + val_ratio)), random_state=42, stratify=temp_data["Group"])

In [11]:
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print(f"Data split completed:")
print(f"Training set: {len(train_data)} samples")
print(f"Validation set: {len(val_data)} samples")
print(f"Test set: {len(test_data)} samples")

Data split completed:
Training set: 1605 samples
Validation set: 344 samples
Test set: 345 samples


In [12]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

In [13]:
features = ["Age", "GENOTYPE", "CDGLOBAL", "CDRSB", "MMSCORE", "HMSCORE", "NPISCORE", "GDTOTAL"]
label = "Group"

In [14]:
train_data = train_data[features + [label]]
val_data   = val_data[features + [label]]
test_data  = test_data[features + [label]]

In [15]:
cols_with_missing = ["CDRSB", "MMSCORE", "HMSCORE", "NPISCORE", "GDTOTAL"]
for col in cols_with_missing:
    for df in [train_data, val_data, test_data]:
        # Optionally create a missingness indicator (if needed)
        df[col + "_is_missing"] = df[col].isnull().astype(int)
        # Replace NaNs with a sentinel value (-999)
        df[col] = df[col].fillna(-999)

In [16]:
##############################
# 4. Encode GENOTYPE using one-hot encoding and align columns
##############################
# Process training data first as reference.
train_data["GENOTYPE"] = train_data["GENOTYPE"].astype(str)
train_data = pd.get_dummies(train_data, columns=["GENOTYPE"], prefix="geno")
# Save the training DataFrame columns (this includes feature and label columns)
train_cols = train_data.columns

# Process validation data and reindex to match training columns
val_data["GENOTYPE"] = val_data["GENOTYPE"].astype(str)
val_data = pd.get_dummies(val_data, columns=["GENOTYPE"], prefix="geno")
val_data = val_data.reindex(columns=train_cols, fill_value=0)

# Process test data similarly
test_data["GENOTYPE"] = test_data["GENOTYPE"].astype(str)
test_data = pd.get_dummies(test_data, columns=["GENOTYPE"], prefix="geno")
test_data = test_data.reindex(columns=train_cols, fill_value=0)

In [19]:
##############################
# 5. Separate features (X) and label (y)
##############################
# Now all splits have the same set of columns.
X_train = train_data.drop(columns=[label])
y_train = train_data[label].values

X_val = val_data.drop(columns=[label])
y_val = val_data[label].values

X_test = test_data.drop(columns=[label])
y_test = test_data[label].values

# Encode target labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val   = label_encoder.transform(y_val)
y_test  = label_encoder.transform(y_test)

In [20]:
##############################
# 6. (Optional) Print dataset sizes
##############################
print("Train size:", X_train.shape)
print("Val size:", X_val.shape)
print("Test size:", X_test.shape)

Train size: (1605, 18)
Val size: (344, 18)
Test size: (345, 18)


In [21]:
##############################
# 7. Define the TabNet Classifier
##############################
model = TabNetClassifier(
    n_d=8,
    n_a=8,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-3),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    scheduler_params={"mode": "min", "patience": 3, "factor": 0.5, "verbose": True},
    mask_type="sparsemax"
)



In [22]:
##############################
# 8. Convert data to NumPy arrays and force numeric type
##############################
# Convert DataFrames to NumPy arrays (only feature columns) as float32.
X_train_np = X_train.astype(np.float32).values
X_val_np   = X_val.astype(np.float32).values
X_test_np  = X_test.astype(np.float32).values

In [23]:
##############################
# 9. Train with Early Stopping
##############################
model.fit(
    X_train=X_train_np, y_train=y_train,
    eval_set=[(X_val_np, y_val)],
    eval_name=["val"],
    eval_metric=["logloss"],
    max_epochs=100,
    patience=5,
    batch_size=32,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)



epoch 0  | loss: 1.19343 | val_logloss: 1.15953 |  0:00:01s
epoch 1  | loss: 0.98    | val_logloss: 1.03577 |  0:00:02s
epoch 2  | loss: 0.87469 | val_logloss: 0.97525 |  0:00:03s
epoch 3  | loss: 0.74449 | val_logloss: 0.92245 |  0:00:04s
epoch 4  | loss: 0.65337 | val_logloss: 0.83739 |  0:00:05s
epoch 5  | loss: 0.58243 | val_logloss: 0.74631 |  0:00:06s
epoch 6  | loss: 0.52134 | val_logloss: 0.74027 |  0:00:07s
epoch 7  | loss: 0.47344 | val_logloss: 0.74848 |  0:00:08s
epoch 8  | loss: 0.46673 | val_logloss: 0.68043 |  0:00:09s
epoch 9  | loss: 0.4512  | val_logloss: 0.6004  |  0:00:10s
epoch 10 | loss: 0.42788 | val_logloss: 0.5807  |  0:00:11s
epoch 11 | loss: 0.46408 | val_logloss: 0.55405 |  0:00:12s
epoch 12 | loss: 0.37881 | val_logloss: 0.52739 |  0:00:14s
epoch 13 | loss: 0.40812 | val_logloss: 0.52026 |  0:00:15s
epoch 14 | loss: 0.40162 | val_logloss: 0.48909 |  0:00:16s
epoch 15 | loss: 0.39168 | val_logloss: 0.50959 |  0:00:17s
epoch 16 | loss: 0.38038 | val_logloss: 



In [24]:
##############################
# 10. Save the best model
##############################
save_model_path = "best_tabnet_model_1"
model.save_model(save_model_path)

Successfully saved model at best_tabnet_model_1.zip


'best_tabnet_model_1.zip'

In [27]:
##############################
# 11. Load the saved model (optional)
##############################
model.load_model(save_model_path + ".zip")

In [25]:
##############################
# 12. Evaluate on Test Set
##############################
y_pred = model.predict(X_test_np)
test_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_acc)
print("Classification Report (Test):")
print(classification_report(y_test, y_pred, target_names=[str(c) for c in label_encoder.classes_]))

Test Accuracy: 0.8
Classification Report (Test):
              precision    recall  f1-score   support

          AD       1.00      0.14      0.24        72
          CN       0.98      0.95      0.97       106
         MCI       0.71      0.99      0.83       167

    accuracy                           0.80       345
   macro avg       0.90      0.69      0.68       345
weighted avg       0.85      0.80      0.75       345



In [28]:
def extract_embeddings(model, X_np):
    """
    Extract intermediate embeddings from a trained TabNetClassifier.
    
    This function registers a forward hook on the final mapping of the underlying 
    TabNet (stored in model.network.tabnet.final_mapping) to capture its input, which
    is considered the learned embedding, during a forward pass.
    
    Parameters
    ----------
    model : TabNetClassifier
        A trained TabNet model.
    X_np : np.array
        Input data as a NumPy array.
    
    Returns
    -------
    embeddings_array : np.array
        Extracted embeddings for all samples.
    """
    # Set the underlying network to evaluation mode.
    model.network.eval()
    
    embeddings_list = []
    
    # Define a hook function that captures the input to the final mapping.
    def hook_fn(module, input, output):
        # input[0] is the tensor entering the final mapping layer.
        embeddings_list.append(input[0].detach().cpu().numpy())
    
    # Register the hook on the final mapping layer.
    hook_handle = model.network.tabnet.final_mapping.register_forward_hook(hook_fn)
    
    # Run a forward pass over X_np.
    _ = model.predict(X_np)
    
    # Remove the hook.
    hook_handle.remove()
    
    # Concatenate the embeddings from all batches.
    embeddings_array = np.concatenate(embeddings_list, axis=0)
    return embeddings_array

In [38]:
train_embeddings = extract_embeddings(model, X_train_np)
val_embeddings   = extract_embeddings(model, X_val_np)
test_embeddings  = extract_embeddings(model, X_test_np)

print("Train embeddings shape:", train_embeddings_3.shape)
print("Validation embeddings shape:", val_embeddings_3.shape)
print("Test embeddings shape:", test_embeddings_3.shape)

Train embeddings shape: (1605, 8)
Validation embeddings shape: (344, 8)
Test embeddings shape: (345, 8)


In [39]:
np.save("train_embeddings.npy", train_embeddings)
np.save("val_embeddings.npy", val_embeddings)
np.save("test_embeddings.npy", test_embeddings)

In [40]:
from torch.utils.data import DataLoader, TensorDataset

In [41]:
train_emb_tensor = torch.tensor(train_embeddings_3, dtype=torch.float32)
val_emb_tensor   = torch.tensor(val_embeddings_3, dtype=torch.float32)
test_emb_tensor  = torch.tensor(test_embeddings_3, dtype=torch.float32)

In [42]:
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor   = torch.tensor(y_val, dtype=torch.long)
y_test_tensor  = torch.tensor(y_test, dtype=torch.long)

In [43]:
# Create TensorDatasets and DataLoaders.
batch_size = 64
train_dataset = TensorDataset(train_emb_tensor, y_train_tensor)
val_dataset   = TensorDataset(val_emb_tensor, y_val_tensor)
test_dataset  = TensorDataset(test_emb_tensor, y_test_tensor)

train_loader_mlp = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader_mlp   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader_mlp  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
import torch.nn as nn
import torch.optim as optim

# Define a simple MLP model.
class EmbeddingMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(EmbeddingMLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )
        
    def forward(self, x):
        return self.net(x)

In [45]:
input_dim = train_embeddings.shape[1]  # For example, 32
hidden_dim = 64
num_classes = len(np.unique(y_train))
mlp_model = EmbeddingMLP(input_dim, hidden_dim, num_classes)

In [46]:
# Move model to device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mlp_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=1e-3)

In [47]:
# Training loop with early stopping.
num_epochs = 50
best_val_loss = float('inf')
patience = 10
patience_counter = 0

for epoch in range(num_epochs):
    mlp_model.train()
    running_loss = 0.0
    for emb, labels in train_loader_mlp:
        emb, labels = emb.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = mlp_model(emb)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * emb.size(0)
    train_loss = running_loss / len(train_loader_mlp.dataset)
    
    mlp_model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for emb, labels in val_loader_mlp:
            emb, labels = emb.to(device), labels.to(device)
            outputs = mlp_model(emb)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * emb.size(0)
            _, preds = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()
    val_loss = val_loss / len(val_loader_mlp.dataset)
    val_acc = correct / total
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(mlp_model.state_dict(), "best_mlp_from_emb.pth")
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= patience:
        print("Early stopping")
        break


Epoch 1/50 - Train Loss: 0.9406, Val Loss: 0.8163, Val Acc: 0.7762
Epoch 2/50 - Train Loss: 0.7663, Val Loss: 0.6759, Val Acc: 0.7762
Epoch 3/50 - Train Loss: 0.6526, Val Loss: 0.5752, Val Acc: 0.7762
Epoch 4/50 - Train Loss: 0.5771, Val Loss: 0.5105, Val Acc: 0.8052
Epoch 5/50 - Train Loss: 0.5277, Val Loss: 0.4741, Val Acc: 0.8169
Epoch 6/50 - Train Loss: 0.4985, Val Loss: 0.4493, Val Acc: 0.8459
Epoch 7/50 - Train Loss: 0.4772, Val Loss: 0.4360, Val Acc: 0.8459
Epoch 8/50 - Train Loss: 0.4605, Val Loss: 0.4259, Val Acc: 0.8488
Epoch 9/50 - Train Loss: 0.4489, Val Loss: 0.4206, Val Acc: 0.8459
Epoch 10/50 - Train Loss: 0.4385, Val Loss: 0.4154, Val Acc: 0.8488
Epoch 11/50 - Train Loss: 0.4300, Val Loss: 0.4151, Val Acc: 0.8459
Epoch 12/50 - Train Loss: 0.4210, Val Loss: 0.4118, Val Acc: 0.8430
Epoch 13/50 - Train Loss: 0.4166, Val Loss: 0.4097, Val Acc: 0.8488
Epoch 14/50 - Train Loss: 0.4109, Val Loss: 0.4081, Val Acc: 0.8459
Epoch 15/50 - Train Loss: 0.4096, Val Loss: 0.4077, Val A

In [48]:
# Load the best MLP model and evaluate on the test set.
mlp_model.load_state_dict(torch.load("best_mlp_from_emb.pth", map_location=device))
mlp_model.eval()
all_preds = []
with torch.no_grad():
    for emb, labels in test_loader_mlp:
        emb = emb.to(device)
        outputs = mlp_model(emb)
        _, preds = torch.max(outputs, 1)
        all_preds.append(preds.cpu().numpy())
test_preds_mlp = np.concatenate(all_preds, axis=0)

print("MLP Test Accuracy:", accuracy_score(y_test, test_preds_mlp))
print("MLP Classification Report (Test):")
print(classification_report(y_test, test_preds_mlp, target_names=[str(c) for c in label_encoder.classes_]))

MLP Test Accuracy: 0.8927536231884058
MLP Classification Report (Test):
              precision    recall  f1-score   support

          AD       0.89      0.67      0.76        72
          CN       0.98      0.95      0.97       106
         MCI       0.85      0.95      0.90       167

    accuracy                           0.89       345
   macro avg       0.91      0.86      0.87       345
weighted avg       0.90      0.89      0.89       345

