In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# ############################### Configuration ###################################
TRAIN_CSV = r"C:\Shivangi\college\Sem 5\Deep Learning\DL project\zips\train_merged.csv"
TEST_CSV = r"C:\Shivangi\college\Sem 5\Deep Learning\DL project\zips\test_merged.csv"

TRAIN_FEATURES_NPY = r"C:\Shivangi\college\Sem 5\Deep Learning\DL project\train_features_all.npy"
TEST_FEATURES_NPY = r"C:\Shivangi\college\Sem 5\Deep Learning\DL project\test_features_all.npy"

TARGET_COL = 'DIAGNOSIS'
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001

# ############################### Loading Data ####################################

# Load the CSV files
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print(f"Train DF shape: {train_df.shape}")
print(f"Test DF shape: {test_df.shape}")

# Load features
train_features = np.load(TRAIN_FEATURES_NPY)
test_features = np.load(TEST_FEATURES_NPY)
print("Train features shape:", train_features.shape)
print("Test features shape:", test_features.shape)

# Ensure `row_id` exists
if 'row_id' not in train_df.columns or 'row_id' not in test_df.columns:
    raise ValueError("row_id column not found in train or test CSVs.")

# Extract targets
# Assuming DIAGNOSIS is categorical and needs encoding
if train_df[TARGET_COL].dtype == 'object' or str(train_df[TARGET_COL].dtype) == 'category':
    label_enc = LabelEncoder()
    # Fit on combined classes from train and test to have consistent encoding
    combined_classes = pd.concat([train_df[TARGET_COL], test_df[TARGET_COL]], axis=0)
    label_enc.fit(combined_classes.dropna().astype(str))  # dropna in case of missing values
    
    train_df[TARGET_COL] = label_enc.transform(train_df[TARGET_COL].astype(str))
    test_df[TARGET_COL] = label_enc.transform(test_df[TARGET_COL].astype(str))

# Extract features from npy using row_id
X_train = train_features[train_df['row_id'].values]
X_test = test_features[test_df['row_id'].values]

y_train = train_df[TARGET_COL].values
y_test = test_df[TARGET_COL].values

# Optional: Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ############################### Model Definition ################################

class SimpleMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = len(np.unique(np.concatenate([y_train, y_test])))  # number of classes

model = SimpleMLP(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# ############################### Training Loop ####################################
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")

# ############################### Evaluation ######################################
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        outputs = model(batch_x)
        _, preds = torch.max(outputs, dim=1)
        total += batch_y.size(0)
        correct += (preds == batch_y).sum().item()

acc = correct / total * 100
print(f"Test Accuracy: {acc:.2f}%")

# ############################### Saliency Maps ###################################
# Example: Compute saliency maps for one test sample

def compute_saliency(model, input_tensor, target_class):
    # input_tensor: shape (1, input_dim)
    input_tensor = input_tensor.clone().requires_grad_(True)
    model.zero_grad()
    output = model(input_tensor)
    loss = output[0, target_class]
    loss.backward()
    saliency = input_tensor.grad.data.cpu().numpy()
    return saliency

# Pick the first test sample
sample_x = X_test_tensor[0].unsqueeze(0)  # shape (1, input_dim)
with torch.no_grad():
    pred_output = model(sample_x)
    predicted_class = pred_output.argmax(dim=1).item()

saliency_map = compute_saliency(model, sample_x, predicted_class)
print("Saliency map for first test sample:", saliency_map)

# Grad-CAM isn't directly applicable to MLP, but we have saliency maps for interpretability.

Train DF shape: (783419, 18)
Test DF shape: (77779, 18)
Train features shape: (52365, 2048)
Test features shape: (10661, 2048)


IndexError: Target -9223372036854775808 is out of bounds.