In [52]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Callback
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
import os
import sys

# Add the src directory to the path to import from common
sys.path.append('../..')
from common.datasets import CubeDataset, AFAContextDataset, MNISTDataset, DiabetesDataset
from common.registry import AFA_DATASET_REGISTRY

# Set random seed for reproducibility
SEED = 42
pl.seed_everything(SEED)

# Define a callback to track metrics
class MetricsCallback(Callback):
    """Collect loss/accuracy values once per epoch so they can be plotted later.

    The values are read from ``trainer.callback_metrics`` under the keys
    ``'train_loss'``, ``'val_loss'`` and ``'val_acc'``; a key that has not been
    logged yet is silently skipped.

    Attributes are plain Python lists of floats:
        train_losses: one entry per finished training epoch.
        val_losses: one entry per finished validation epoch.
        val_accuracies: one entry per finished validation epoch.
    """

    def __init__(self):
        super().__init__()
        self.train_losses = []
        self.val_losses = []
        self.val_accuracies = []

    @staticmethod
    def _record(trainer, metric_name, history):
        """Append the current value of ``metric_name`` to ``history`` if it was logged."""
        value = trainer.callback_metrics.get(metric_name)
        if value is not None:
            history.append(value.item())

    def on_train_epoch_end(self, trainer, pl_module):
        # Stores whatever value is currently held under 'train_loss'.
        self._record(trainer, 'train_loss', self.train_losses)

    def on_validation_epoch_end(self, trainer, pl_module):
        # Lightning runs a short validation pass before training starts (the
        # "sanity check"). Recording it would add a spurious leading entry and
        # shift the validation curves by one epoch relative to train_losses.
        if trainer.sanity_checking:
            return
        self._record(trainer, 'val_loss', self.val_losses)
        self._record(trainer, 'val_acc', self.val_accuracies)

# Define the neural network model
class FCNModel(pl.LightningModule):
    """Fully connected classifier trained with (optionally class-weighted) cross-entropy.

    Every hidden layer is ``Linear -> ReLU -> BatchNorm1d -> Dropout(0.2)``,
    followed by a final ``Linear`` layer that produces one logit per class.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable with the width of each hidden layer, in order.
        output_dim: Number of classes (size of the logit vector).
        class_weights: Optional per-class weights (length ``output_dim``) passed
            to ``F.cross_entropy``. ``None`` means unweighted loss; with the
            default mean reduction this gives the same loss as uniform weights
            such as ``[0.5, 0.5]``.
        learning_rate: Learning rate for the Adam optimizer.

    Logged metrics: ``train_loss``, ``val_loss``, ``val_acc``, ``test_loss``,
    ``test_acc``.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, class_weights=None, learning_rate=0.001):
        super().__init__()
        # Must run before any local is rebound: it captures the __init__ arguments.
        self.save_hyperparameters()

        # Keep the weights as a buffer so they follow the module across
        # devices/dtypes (.to(), .cuda(), accelerator placement). A plain tensor
        # attribute stays on the CPU and breaks the loss on an accelerator.
        # persistent=False keeps the state_dict layout unchanged.
        if class_weights is not None:
            class_weights = torch.as_tensor(class_weights, dtype=torch.float32).detach().clone()
        self.register_buffer('class_weights', class_weights, persistent=False)

        # Build the layers
        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.Dropout(0.2))
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class logits for a batch of feature vectors."""
        return self.model(x)

    def _loss_and_logits(self, batch):
        """Run the forward pass on ``(x, y)`` and return ``(loss, logits, y)``."""
        x, y = batch
        logits = self(x)
        # Labels are cast to long because cross_entropy expects class indices.
        loss = F.cross_entropy(logits, y.long(), weight=self.class_weights)
        return loss, logits, y

    def _evaluation_step(self, batch, prefix):
        """Shared validation/test logic: log ``<prefix>_loss`` and ``<prefix>_acc``."""
        loss, logits, y = self._loss_and_logits(batch)
        self.log(f'{prefix}_loss', loss)

        # Calculate accuracy
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log(f'{prefix}_acc', acc)
        return loss

    def training_step(self, batch, batch_idx):
        loss, _, _ = self._loss_and_logits(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        return self._evaluation_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        return self._evaluation_step(batch, 'test')

    def configure_optimizers(self):
        """Optimize all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

# Function to load a dataset from the data folder
def load_dataset_from_file(dataset_name, split_idx=0, data_dir="data"):
    """Load the saved train/val/test splits of a registered dataset.

    The files are expected at
    ``<data_dir>/<dataset_name>/{train,val,test}_split_<split_idx + 1>.pt``
    (``split_idx`` is 0-based, the file names are 1-based).

    Args:
        dataset_name: Key into ``AFA_DATASET_REGISTRY``.
        split_idx: 0-based index of the split to load.
        data_dir: Root directory that contains one sub-folder per dataset.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, input_dim, output_dim)``
        where ``input_dim`` is ``features.shape[1]`` of the training data and
        ``output_dim`` is the number of distinct training labels.

    Raises:
        ValueError: If ``dataset_name`` is not in the registry.
        FileNotFoundError: If any of the three split files is missing; the
            message lists the missing paths.
    """
    if dataset_name not in AFA_DATASET_REGISTRY:
        raise ValueError(f"Dataset {dataset_name} not found in registry. Available datasets: {list(AFA_DATASET_REGISTRY.keys())}")

    dataset_class = AFA_DATASET_REGISTRY[dataset_name]

    # Construct paths to the saved dataset splits
    split_paths = {
        split: os.path.join(data_dir, dataset_name, f"{split}_split_{split_idx+1}.pt")
        for split in ("train", "val", "test")
    }

    # Report exactly which files are missing instead of a generic message.
    missing_paths = [path for path in split_paths.values() if not os.path.exists(path)]
    if missing_paths:
        raise FileNotFoundError(
            f"Dataset splits not found: {missing_paths}. Please run generate_datasets.py first."
        )

    # Load the datasets
    train_dataset = dataset_class.load(split_paths["train"])
    val_dataset = dataset_class.load(split_paths["val"])
    test_dataset = dataset_class.load(split_paths["test"])

    # Get input and output dimensions from the training dataset
    features, labels = train_dataset.get_all_data()
    input_dim = features.shape[1]
    # NOTE(review): this counts distinct label values seen in the training
    # split; it assumes labels are class indices covering 0..K-1 — confirm.
    output_dim = len(torch.unique(labels))

    print(f"Dataset: {dataset_name} (split {split_idx+1})")
    print(f"Input dimension: {input_dim}")
    print(f"Output dimension: {output_dim}")
    print(f"Train size: {len(train_dataset)}")
    print(f"Val size: {len(val_dataset)}")
    print(f"Test size: {len(test_dataset)}")
    print(f"Class distribution (train): {torch.bincount(labels.long(), minlength=output_dim)}")

    return train_dataset, val_dataset, test_dataset, input_dim, output_dim

# Function to create data loaders
def create_data_loaders(train_dataset, val_dataset, test_dataset, batch_size=64):
    """Wrap the three dataset splits in ``DataLoader`` objects.

    Only the training loader shuffles; the validation and test loaders iterate
    in dataset order. All three use the same ``batch_size``.

    Returns:
        ``(train_loader, val_loader, test_loader)``
    """
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    # Evaluation splits share the same (non-shuffling) configuration.
    val_loader, test_loader = (
        DataLoader(split, batch_size=batch_size)
        for split in (val_dataset, test_dataset)
    )
    return train_loader, val_loader, test_loader

# Function to train the model
def train_model(model, train_loader, val_loader, max_epochs=50):
    """Fit ``model`` with early stopping, checkpointing and metric tracking.

    Training stops early when ``val_loss`` has not improved for 5 validation
    epochs, and the 3 checkpoints with the lowest ``val_loss`` are kept in the
    ``checkpoints`` directory.

    Returns:
        ``(trainer, metrics_callback)`` — the fitted ``pl.Trainer`` and the
        ``MetricsCallback`` holding the recorded per-epoch metrics.
    """
    metrics_callback = MetricsCallback()

    # Order matters only for readability here; all three monitor/record
    # the metrics logged by the model.
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, mode='min'),
        ModelCheckpoint(
            monitor='val_loss',
            dirpath='checkpoints',
            filename='{epoch:02d}-{val_loss:.2f}',
            save_top_k=3,
            mode='min',
        ),
        metrics_callback,
    ]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        callbacks=callbacks,
        accelerator='auto',
        devices=1,
        log_every_n_steps=10,
    )
    trainer.fit(model, train_loader, val_loader)

    return trainer, metrics_callback

# Function to evaluate the model
def evaluate_model(model, test_loader):
    """Run Lightning's test loop on ``test_loader`` and return its results.

    A fresh single-device ``pl.Trainer`` is created for the evaluation.
    """
    test_trainer = pl.Trainer(devices=1, accelerator='auto')
    return test_trainer.test(model, test_loader)

# Function to make predictions
def make_predictions(model, data_loader):
    """Predict class indices for every batch produced by ``data_loader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled while predicting.

    Args:
        model: A ``torch.nn.Module`` mapping a feature batch to class logits.
        data_loader: Iterable yielding ``(x, y)`` batches.

    Returns:
        ``(all_preds, all_labels)`` — two CPU tensors holding the concatenated
        arg-max predictions and the corresponding labels. Both are empty
        tensors when ``data_loader`` yields no batches.
    """
    model.eval()

    # The batches may live on a different device than the model (e.g. a model
    # that was trained on an accelerator), so inputs are moved to the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x, y in data_loader:
            if device is not None:
                x = x.to(device)
            y_hat = model(x)
            # Bring results back to the CPU so they can be compared with the
            # labels and handed to NumPy/scikit-learn by the caller.
            all_preds.append(torch.argmax(y_hat, dim=1).cpu())
            all_labels.append(y.cpu())

    # torch.cat raises on an empty list; return empty results instead.
    if not all_preds:
        return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

    return torch.cat(all_preds), torch.cat(all_labels)

# Function to plot training history
def plot_training_history(metrics_callback):
    """Show the recorded training history as two side-by-side panels.

    Left panel: training and validation loss. Right panel: validation
    accuracy. The curves are read from ``metrics_callback.train_losses``,
    ``metrics_callback.val_losses`` and ``metrics_callback.val_accuracies``.
    """
    # (title, y-axis label, ((values, legend label), ...)) for each panel.
    panels = (
        ('Loss Curves', 'Loss', (
            (metrics_callback.train_losses, 'Train Loss'),
            (metrics_callback.val_losses, 'Val Loss'),
        )),
        ('Validation Accuracy', 'Accuracy', (
            (metrics_callback.val_accuracies, 'Val Accuracy'),
        )),
    )

    plt.figure(figsize=(12, 4))

    for position, (title, y_label, curves) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        for values, legend_label in curves:
            plt.plot(values, label=legend_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.show()


Seed set to 42


In [55]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

def _standardize_splits(train_dataset, val_dataset, test_dataset):
    """Standardize the ``features`` of all splits with training-set statistics."""
    # NOTE(review): assumes each dataset serves samples from a writable
    # `features` tensor whose first dimension indexes samples — confirm
    # against the dataset classes.
    train_features = train_dataset.features.float()
    mean = train_features.mean(dim=0, keepdim=True)
    std = train_features.std(dim=0, unbiased=False, keepdim=True)
    # Constant features have zero std; leave them centred but unscaled.
    std = torch.where(std > 0, std, torch.ones_like(std))

    for dataset in (train_dataset, val_dataset, test_dataset):
        dataset.features = (dataset.features.float() - mean) / std


def _inverse_frequency_weights(labels, num_classes, num_samples):
    """Return normalized inverse-frequency class weights (they sum to 1)."""
    counts = torch.bincount(labels.long(), minlength=num_classes)
    # A class index with zero samples would give 1/0 = inf and turn every
    # weight into NaN after normalization, so treat it as seen once.
    frequencies = counts.clamp(min=1) / num_samples
    # Invert the weights to give more importance to minority classes
    weights = 1.0 / frequencies
    # Normalize the weights
    return weights / weights.sum()


def run_dataset(dataset_name, split_idx, data_dir, *, normalize=False):
    """Train and evaluate the FCN baseline on one split of one dataset.

    Args:
        dataset_name: Key into the dataset registry.
        split_idx: 0-based index of the split to use.
        data_dir: Directory where the dataset splits are saved.
        normalize: When true, standardize the features of all splits using
            the mean/std of the training split. Defaults to ``False``, i.e.
            the raw features are used.

    Returns:
        ``{"accuracy": float, "f1": float | None}`` for the test split; ``f1``
        is only computed for binary problems.
    """
    print(f"Running {dataset_name} split {split_idx}")
    # Load dataset splits
    train_dataset, val_dataset, test_dataset, input_dim, output_dim = load_dataset_from_file(
        dataset_name, split_idx, data_dir
    )

    # Only claim the data is normalized when it actually was.
    if normalize:
        _standardize_splits(train_dataset, val_dataset, test_dataset)
        print("Data normalized. Feature statistics after normalization:")
    else:
        print("Feature statistics (no normalization applied):")
    print(f"  Train - Mean: {train_dataset.features.mean():.4f}, Std: {train_dataset.features.std():.4f}")
    print(f"  Val - Mean: {val_dataset.features.mean():.4f}, Std: {val_dataset.features.std():.4f}")
    print(f"  Test - Mean: {test_dataset.features.mean():.4f}, Std: {test_dataset.features.std():.4f}")

    # Create data loaders
    train_loader, val_loader, test_loader = create_data_loaders(
        train_dataset, val_dataset, test_dataset, batch_size=64
    )

    # Define model architecture
    hidden_dims = [128, 64, 32]
    learning_rate = 0.001

    # Calculate class weights based on class distribution
    class_weights = _inverse_frequency_weights(
        train_dataset.labels, output_dim, len(train_dataset)
    )
    print(f"Class weights: {class_weights}")

    # Create model
    model = FCNModel(
        input_dim=input_dim,
        hidden_dims=hidden_dims,
        output_dim=output_dim,
        learning_rate=learning_rate,
        class_weights=class_weights
    )

    # Train model
    trainer, metrics_callback = train_model(model, train_loader, val_loader, max_epochs=100)

    # Make predictions
    predictions, labels = make_predictions(model, test_loader)

    # Calculate accuracy
    accuracy = (predictions == labels).float().mean()
    print(f"Test accuracy: {accuracy:.4f}")

    f1score = None
    if output_dim == 2:
        f1score = f1_score(labels.cpu().numpy(), predictions.cpu().numpy())
        print(f"Test F1 score: {f1score:.4f}")

    return {
        "accuracy": accuracy.item(),
        "f1": None if f1score is None else float(f1score),
    }

In [56]:
# Configuration for a single training run.
# Supported names: "cube", "AFAContext", "MNIST", "diabetes", "physionet"
dataset_name = "physionet"
# 0-based index of the split to use
split_idx = 0
# Directory where the dataset splits are saved
data_dir = "../../data"

run_dataset(dataset_name=dataset_name, split_idx=split_idx, data_dir=data_dir)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Github_Projects\AFA-Benchmark\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory C:\Github_Projects\AFA-Benchmark\notebooks\datasets\checkpoints exists and is not empty.

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 16.2 K | train
---------------------------------------------
16.2 K    Trainable params
0         Non-trainable params
16.2 K    Total params
0.065     Total estimated model params size (MB)
14        Modules in train mode
0         Modules in eval mode


Running physionet split 0
Dataset: physionet (split 1)
Input dimension: 41
Output dimension: 2
Train size: 8400
Val size: 1800
Test size: 1800
Class distribution (train): tensor([7207, 1193])
Data normalized. Feature statistics after normalization:
  Train - Mean: 65.4028, Std: 134.6499
  Val - Mean: 65.1504, Std: 128.9722
  Test - Mean: 65.1915, Std: 127.6785
Class weights: tensor([0.1420, 0.8580])
                                                                            

c:\Github_Projects\AFA-Benchmark\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
c:\Github_Projects\AFA-Benchmark\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Epoch 18: 100%|██████████| 132/132 [00:00<00:00, 150.40it/s, v_num=51]
Test accuracy: 0.7711
Test F1 score: 0.4901


In [None]:
# Train and evaluate the baseline on every dataset in the registry,
# always using the same split.
split_idx = 0  # 0-based index of the split to use
data_dir = "../../data"  # Directory where the dataset splits are saved

for dataset_name in AFA_DATASET_REGISTRY:
    run_dataset(dataset_name, split_idx, data_dir)