# CHOWDER

## Imports

In [1]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/805.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/805.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/805.2 kB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.9.0 torchmetrics-1.2.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, ConcatDataset
import torch.nn as nn
import torch.optim as optim
import torchmetrics

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [4]:
processed_data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Processed_Data/'

X_dev = np.load(processed_data_path + 'X_dev.npy')
y_dev = np.load(processed_data_path + 'y_dev.npy')

X_test = np.load(processed_data_path + 'X_test.npy')

## Split Data

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=0.15, stratify=y_dev, random_state=42)

## Standardize Data

In [6]:
def X_standardize(X_train, X_val, X_test):

    feature_mean = np.mean(X_train)
    feature_std = np.std(X_train)

    X_train_scaled = (X_train - feature_mean) / feature_std
    X_val_scaled = (X_val - feature_mean) / feature_std
    X_test_scaled = (X_test - feature_mean) / feature_std

    return X_train_scaled, X_val_scaled, X_test_scaled

In [7]:
X_train_scaled, X_val_scaled, X_test_scaled = X_standardize(X_train, X_val, X_test)

## Convert to Tensor

In [8]:
X_train_tensor = torch.Tensor(X_train_scaled)
X_val_tensor = torch.Tensor(X_val_scaled)
X_test_tensor = torch.Tensor(X_test_scaled)

In [9]:
# Delete redundant variables to free up memory
del X_train_scaled
del X_val_scaled
del X_test_scaled
del X_train
del X_val
del X_test
del X_dev

## CHOWDER Model

In [10]:
class CHOWDER(nn.Module):

    def __init__(self):
        super(CHOWDER, self).__init__()

        # Convolutional layer
        self.conv = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2048, stride=2048)

        self.fc1 = nn.Linear(4, 1)

        self.init_weights()

    def init_weights(self):

        # Set std to the square root of the number of edges
        std_conv = 2048**(-0.5)
        # Initialize weights with random normal values for the convolutional layer
        nn.init.normal_(self.conv.weight, mean=0.0, std=std_conv)
        nn.init.constant_(self.conv.bias, 0)


    def forward(self, x):

        ## CONVOLUTION LAYER
        conv_output = self.conv(x)

        # Calculate L2-norm on weights in the convolutional layer
        l2_reg = 0.0
        for param in self.conv.parameters():
            l2_reg += torch.sum(param ** 2)

        ## MINMAX LAYER

        # Sort each row of the conv layer (each row is a sample)
        sorted_output, _ = torch.sort(conv_output, dim=2)

        # Number of top instances and negative evidence
        R = 2

        # Select the first two and last two sorted outputs
        selected_output = sorted_output[:, :, :R]
        a0 = torch.cat((selected_output, sorted_output[:, :, -R:]), dim=2)

        a0 = a0.squeeze()

        z1 = self.fc1(a0)

        output = torch.sigmoid(z1)

        return output, l2_reg

In [11]:
demo_model = CHOWDER()

demo_model

CHOWDER(
  (conv): Conv1d(1, 1, kernel_size=(2048,), stride=(2048,))
  (fc1): Linear(in_features=4, out_features=1, bias=True)
)

## Setup Hyperparameters and Data Loader

In [12]:
# Define hyperparameters
BATCH_SIZE = 10
LR = 0.001
EPOCHS = 20
NUM_ENSEMBLE_MODELS = 10

In [13]:
class CustomDataset(Dataset):

    def __init__(self, data_tensor, target_tensor):
        self.data = data_tensor
        self.target = target_tensor

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        return x, y

    def __len__(self):
        return len(self.data)

# Create custom datasets
train_dataset = CustomDataset(X_train_tensor, torch.Tensor(y_train))
val_dataset = CustomDataset(X_val_tensor, torch.Tensor(y_val))

# Create DataLoader
train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_dl = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [14]:
# Free Up Memory
del X_train_tensor
del X_val_tensor
del y_train
del y_val

## Train Model and Validate

In [15]:
# Define loss function and optimizer
loss_function = nn.BCELoss()

# Create an ensemble of models
ensemble_models = []

for _ in range(NUM_ENSEMBLE_MODELS):

    # Initialize the model
    model = CHOWDER()

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=LR)
    ensemble_models.append((model, optimizer))

# Setup model counter
model_counter = 1

# Training and validation loops for each model in the ensemble
for model, optimizer in ensemble_models:

    print(f'\n ----- Model {model_counter} -----')
    model_counter += 1

    # Loop thorugh each epoch
    for epoch in range(EPOCHS):

        ## Training loop

        # Put model in train mode
        model.train()

        # Initialize loss, AUC and count
        total_loss = 0.0
        auroc_hist_train = 0.0
        total_count = 0.0

        # Train in batches
        for batch_x, batch_y in train_dl:

            batch_x_transformed = batch_x.unsqueeze(1)

            # Zero gradients
            optimizer.zero_grad()

            # Make predictions and get L2-norm from conv layer
            pred, weight_decay = model(batch_x_transformed)

            # Calculate loss
            batch_y = batch_y.view(-1, 1) # Reshape batch_y from (10) to (10,1)
            loss = loss_function(pred, batch_y) + (0.1*weight_decay)

            # Calculate gradients
            loss.backward()

            # Make step in gradient descent
            optimizer.step()

            # Add to loss counter for this epoch
            total_loss += loss.item() * len(batch_y)
            total_count += len(batch_y)

            # Calculate AUC for batch
            auroc_hist_train += torchmetrics.AUROC(task="binary")(pred, batch_y).item() * len(batch_y)

        # Calculate loss and AUC per sample
        train_average_loss = total_loss / total_count
        train_average_auc = auroc_hist_train/ total_count


        ## Validation loop

        # Put model in evaluation mode
        model.eval()

        # Fix gradients (only using model to predict)
        with torch.no_grad():

            # Initialize loss, AUC and count
            total_loss = 0.0
            auroc_hist_val = 0.0
            total_count = 0.0

            # Validate in batches
            for batch_x, batch_y in val_dl:

                batch_x_transformed = batch_x.unsqueeze(1)

                # Make predictions
                val_pred, _ = model(batch_x_transformed)

                # Calculate loss
                batch_y = batch_y.view(-1, 1) # Reshape batch_y from (10) to (10,1)
                loss = loss_function(val_pred, batch_y)

                # Add to loss for this epoch
                total_loss += loss.item() * len(batch_y)
                total_count += len(batch_y)

                # Calculate AUC for batch
                auroc_hist_val += torchmetrics.AUROC(task="binary")(val_pred, batch_y).item() * len(batch_y)

        # Calculate loss and AUC per sample
        val_average_loss = total_loss / total_count
        val_average_auc =  auroc_hist_val / total_count

        # Print results from each epoch
        print(f"Epoch [{epoch+1}/{EPOCHS}] - Train Loss: {train_average_loss:.4f} Train AUC Score: {train_average_auc:.4f} \
        Val Loss: {val_average_loss:.4f} Val AUC Score: {val_average_auc:.4f}")


 ----- Model 1 -----
Epoch [1/20] - Train Loss: 0.7720 Train AUC Score: 0.5092         Val Loss: 0.6780 Val AUC Score: 0.5018
Epoch [2/20] - Train Loss: 0.7286 Train AUC Score: 0.6464         Val Loss: 0.6797 Val AUC Score: 0.5982
Epoch [3/20] - Train Loss: 0.7018 Train AUC Score: 0.6774         Val Loss: 0.7030 Val AUC Score: 0.5179
Epoch [4/20] - Train Loss: 0.6791 Train AUC Score: 0.7325         Val Loss: 0.6510 Val AUC Score: 0.6025




Epoch [5/20] - Train Loss: 0.6701 Train AUC Score: 0.7145         Val Loss: 0.6691 Val AUC Score: 0.4653
Epoch [6/20] - Train Loss: 0.6432 Train AUC Score: 0.8026         Val Loss: 0.6573 Val AUC Score: 0.5115
Epoch [7/20] - Train Loss: 0.6385 Train AUC Score: 0.8190         Val Loss: 0.6492 Val AUC Score: 0.6042
Epoch [8/20] - Train Loss: 0.6282 Train AUC Score: 0.7891         Val Loss: 0.7100 Val AUC Score: 0.5536
Epoch [9/20] - Train Loss: 0.6080 Train AUC Score: 0.8286         Val Loss: 0.6558 Val AUC Score: 0.4757
Epoch [10/20] - Train Loss: 0.5971 Train AUC Score: 0.8558         Val Loss: 0.7126 Val AUC Score: 0.5233
Epoch [11/20] - Train Loss: 0.5801 Train AUC Score: 0.8420         Val Loss: 0.7018 Val AUC Score: 0.4339
Epoch [12/20] - Train Loss: 0.5776 Train AUC Score: 0.8680         Val Loss: 0.6700 Val AUC Score: 0.6169
Epoch [13/20] - Train Loss: 0.5608 Train AUC Score: 0.9216         Val Loss: 0.7174 Val AUC Score: 0.5037
Epoch [14/20] - Train Loss: 0.5519 Train AUC Score:

In [16]:
# After training all models, calculate the ensemble AUC

# Store predictions and labels
ensemble_predictions = []
val_labels = []
flag = 1

# Do not change gradients
with torch.no_grad():

    # Loop through each model
    for model, _ in ensemble_models:

        # Put model in evaluation mode
        model.eval()

        # Get predictions on the validation dataset
        predictions = []
        batch_labels = []  # Create a list to store labels for each batch

        for batch_x, batch_y in val_dl:

            batch_x_transformed = batch_x.unsqueeze(1)

            # Make predictions
            pred, _ = model(batch_x_transformed)
            # Save predictions
            predictions.append(pred)

            # Store val labels for this batch
            batch_labels.append(batch_y)

        # Concatenate the labels for this model
        if flag == 1:
            batch_labels = torch.cat(batch_labels, dim=0)
            val_labels.append(batch_labels)
            flag = 2

        predictions = torch.cat(predictions)
        ensemble_predictions.append(predictions)

# Concatenate all the validation labels
val_labels = torch.cat(val_labels, dim=0)

# Average the predictions from all models
ensemble_predictions = torch.stack(ensemble_predictions)
average_predictions = torch.mean(ensemble_predictions, dim=0)

# Calculate the AUC score based on the averaged predictions
average_auc = torchmetrics.AUROC(task="binary")(average_predictions, val_labels).item()
print(f"Ensemble AUC Score: {average_auc:.4f}")


Ensemble AUC Score: 0.5893


In [18]:
modified_chowder_decisive = pd.DataFrame({'predictions': average_predictions.squeeze(), 'true': val_labels})

decisive_path = '/content/drive/My Drive/Breast_Cancer_Detection/Decisive/'
modified_chowder_decisive.to_csv(decisive_path + 'modified_CHOWDER.csv', index=False)

In [None]:
sys.exit()

## Test Model

In [None]:
## Setup DataLoaders

# Create a dummy y for test set
y_test_dummy = torch.zeros(len(X_test_tensor),)

# Create custom datasets
test_dataset = CustomDataset(X_test_tensor, y_test_dummy)

# Create DataLoader
test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Merge them using ConcatDataset
dev_dataset = ConcatDataset([train_dataset, val_dataset])

# Create a DataLoader for the merged dataset
dev_dl = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [None]:
# Free Up Memory
del X_test_tensor
del train_dataset
del val_dataset

In [None]:
# Define loss function and optimizer
loss_function = nn.BCELoss()

# Create an ensemble of models
ensemble_models = []

for _ in range(NUM_ENSEMBLE_MODELS):

    # Initialize the model
    model = CHOWDER()

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=LR)
    ensemble_models.append((model, optimizer))

# Setup model counter
model_counter = 1

# Training loops for each model in the ensemble
for model, optimizer in ensemble_models:

    print(f'\n ----- Model {model_counter} -----')
    model_counter += 1

    # Loop thorugh each epoch
    for epoch in range(EPOCHS):

        ## Training loop

        # Put model in train mode
        model.train()

        # Initialize loss, AUC and count
        total_loss = 0.0
        auroc_hist_train = 0.0
        total_count = 0.0

        # Train in batches
        for batch_x, batch_y in dev_dl:

            # Add a dimension for channel numbers
            batch_x_transformed = batch_x.unsqueeze(1)

            # Zero gradients
            optimizer.zero_grad()

            # Make predictions and get L2-norm from conv layer
            pred, L2_term = model(batch_x_transformed)

            # Calculate loss
            batch_y = batch_y.view(-1, 1) # Reshape batch_y from (10) to (10,1)
            loss = loss_function(pred, batch_y) + (0.5*L2_term)

            # Calculate gradients
            loss.backward()

            # Make step in gradient descent
            optimizer.step()

            # Add to loss counter for this epoch
            total_loss += loss.item() * len(batch_y)
            total_count += len(batch_y)

            # Calculate AUC for batch
            auroc_hist_train += torchmetrics.AUROC(task="binary")(pred, batch_y).item() * len(batch_y)

        # Calculate loss and AUC per sample
        train_average_loss = total_loss / total_count
        train_average_auc = auroc_hist_train/ total_count

        # Print results from each epoch
        print(f"Epoch [{epoch+1}/{EPOCHS}] - Train Loss: {train_average_loss:.4f} Train AUC Score: {train_average_auc:.4f}")

In [None]:
# After training all models, calculate the ensemble AUC

# Store predictions and labels
ensemble_predictions = np.zeros((len(test_dataset), len(ensemble_models)))

# Do not change gradients
with torch.no_grad():

    model_counter = 0

    # Loop through each model
    for model, _ in ensemble_models:

        # Put model in evaluation mode
        model.eval()

        # Get predictions on the validation dataset
        predictions = np.empty((0, 1))

        for batch_x, _ in test_dl:

            extra = 10 - len(batch_x)

            # If batch size is smaller than 10, pad rows in batch_x with 0s
            if extra > 0:
                pad_tensor = torch.zeros((extra,) + batch_x.shape[1:], dtype=batch_x.dtype)
                batch_x = torch.cat((batch_x, pad_tensor), dim=0)

            # Add a dimension for channel numbers
            batch_x_transformed = batch_x.unsqueeze(1)

            # Make predictions
            pred, _ = model(batch_x_transformed)

            # Save predictions
            pred_numpy = pred.numpy()
            predictions = np.concatenate((predictions, pred_numpy), axis=0)

            if extra > 0:
              predictions = predictions[:-extra]

        ensemble_predictions[:, model_counter] = predictions.squeeze()
        model_counter += 1

# Average the predictions from all models
average_prediction = np.mean(ensemble_predictions, axis=1)

In [None]:
# Load metadata about each sample
data_path = '/content/drive/My Drive/Breast_Cancer_Detection/Data/'
df_test = pd.read_csv(data_path + "test_metadata.csv")

# Join sample ID metadata with probability prediction
CHOWDER_submission = pd.DataFrame( {"Sample ID": df_test["Sample ID"].values, "Target": average_prediction}).sort_values("Sample ID")

In [None]:
def sanity_checks(submission):
    assert all(submission["Target"].between(0, 1)), "`Target` values must be in [0, 1]"
    assert submission.shape == (149, 2), "Your submission file must be of shape (149, 2)"
    assert list(submission.columns) == ["Sample ID", "Target",], "Your submission file must have columns `Sample ID` and `Target`"

sanity_checks(CHOWDER_submission)

In [None]:
submission_path = '/content/drive/My Drive/Breast_Cancer_Detection/Predictions/'

CHOWDER_submission.to_csv(submission_path + "CHOWDER_submission_weights.csv", index=None)