In [1]:
# Imports, feature-loading helpers, dataset and model definitions
import numpy as np
import torch
from torch import nn
from glob import glob
import pandas as pd
from pathlib import Path
from tensorboardX import SummaryWriter
import sklearn
import sklearn.metrics
import sklearn.model_selection
import copy


# Root directory of the dataset; the CSV paths below are resolved relative to it.
data_dir = Path("data/")
# Table read from train_input/train_tile_annotations.csv.
# NOTE(review): not referenced by any other code visible in this notebook --
# presumably tile-level labels kept for later experiments; confirm or remove.
train_tile_annotations = pd.read_csv(data_dir / "train_input/train_tile_annotations.csv")

def get_features(path, ntiles=1000):
    """Load one slide's tile features and pad/crop them to exactly ``ntiles`` rows.

    The array stored at ``path`` is expected to be 2-D with one row per tile.
    Its first three columns are dropped (presumably tile metadata rather than
    features -- TODO confirm against the data description); the remaining
    columns are the feature vector of each tile.

    Slides with fewer than ``ntiles`` tiles are padded by cycling through their
    tiles in order (tile 0, 1, ..., n-1, 0, 1, ...); slides with more are
    truncated to their first ``ntiles`` tiles.

    Args:
        path: path of the ``.npy`` file to load.
        ntiles: number of rows in the returned array.

    Returns:
        ``numpy.ndarray`` of shape ``(ntiles, n_features)`` with the dtype of
        the stored array.

    Raises:
        ValueError: if the file contains no tiles, since there is nothing to
            repeat (previously this surfaced as a ZeroDivisionError).
    """
    tiles = np.load(path)[:, 3:]

    n_available = tiles.shape[0]
    if n_available == 0:
        raise ValueError(f"Feature file contains no tiles: {path}")

    # Row k of the output is tile (k mod n_available): this both repeats short
    # slides cyclically and truncates long ones, in a single indexing step.
    row_ids = np.arange(ntiles) % n_available
    return tiles[row_ids]

class Dataset():
    """Map-style dataset of per-slide tile features with their slide-level target.

    Rows come from ``training_output.csv`` under ``data_dir``; the code reads its
    ``ID`` and ``Target`` columns. Item ``i`` is the pair
    ``(features, target)`` where ``features`` is a float tensor of shape
    ``(ntiles, n_features)`` produced by ``get_features``.

    Args:
        ntiles: number of tiles every slide is padded/cropped to.
    """

    def __init__(self, ntiles=1000):
        self.training_output = pd.read_csv(data_dir / "training_output.csv")
        self.ntiles = ntiles

    def __getitem__(self, i):
        """Return ``(features, target)`` for row ``i`` of the label table.

        Raises:
            FileNotFoundError: if no feature file matches the slide ID.
        """
        row = self.training_output.iloc[i]
        # Selecting a row from a frame with mixed int/float columns can upcast
        # the ID to float, which the zero-padded integer format below rejects.
        ID, target = int(row['ID']), row['Target']

        # load the pre-computed resnet features
        # Built from data_dir so that the feature files follow the same root as
        # the label CSV instead of a second hard-coded "data/" prefix.
        pattern = str(data_dir / "train_input" / "resnet_features" / f"ID_{ID:03d}*.npy")
        # glob order is filesystem-dependent; sort so the choice is deterministic.
        matches = sorted(glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No feature file matches {pattern}")
        feat_path = matches[0]

        x = get_features(feat_path, ntiles=self.ntiles)
        resnet_features = torch.from_numpy(x).float()

        return resnet_features, target

    def __len__(self):
        """Number of slides listed in the label table."""
        return len(self.training_output)

class Model(nn.Module):
    """Min-max tile-scoring classifier for slide-level prediction.

    Each tile's feature vector is projected to one scalar score by a bias-free
    1x1 convolution (the same linear map for every tile). The ``R`` lowest and
    ``R`` highest scores of a slide are concatenated and passed to a small MLP
    that outputs one logit per slide.

    Args:
        fSz: dimension of the per-tile feature vectors.
        R: number of lowest and of highest tile scores kept per slide.
    """

    def __init__(self, fSz=2048, R=5):
        super().__init__()
        self.conv1x1 = nn.Conv1d(fSz, 1, kernel_size=1, bias=False)

        self.R = R
        self.mlp = nn.Sequential(
            nn.Linear(2*self.R, 200),
            nn.Sigmoid(),
            nn.Dropout(p=0.5),
            nn.Linear(200, 100),
            nn.Sigmoid(),
            nn.Dropout(p=0.5),
            nn.Linear(100, 1),
        )
        # TODO: Replace Sigmoid with ReLU

        self.BCELoss = nn.BCEWithLogitsLoss()

    def forward(self, feats, targets=None):
        """Score a batch of slides.

        Args:
            feats: float tensor of tile features, shape ``(bSz, ntiles, fSz)``.
            targets: optional tensor of shape ``(bSz,)`` with binary labels.

        Returns:
            dict with ``'logits'`` and ``'probs'`` (both ``(bSz, 1)``) and, when
            ``targets`` is given, ``'loss'``: binary cross-entropy on the logits
            plus ``0.5 * sum(w**2)`` over the 1x1 convolution weights.

        Raises:
            ValueError: if a slide has fewer than ``R`` tiles, because the MLP
                input would not have ``2 * R`` entries.
        """
        # bSz, ntiles, fSz -> bSz, fSz, ntiles (channels-first layout for Conv1d)
        feats = feats.transpose(1, 2)
        # bSz, ntiles: one scalar score per tile
        scores = self.conv1x1(feats)[:, 0]

        if scores.shape[1] < self.R:
            raise ValueError(
                f"Need at least R={self.R} tiles per slide, got {scores.shape[1]}"
            )

        # min-max selection: the sorted values already are the selected scores,
        # so no gather through the sort indices is needed.
        sorted_scores, _ = scores.sort(dim=1)
        minmax_feats = torch.cat(
            [sorted_scores[:, :self.R], sorted_scores[:, -self.R:]], dim=1
        )

        # bSz, 1
        logits = self.mlp(minmax_feats)
        probs = torch.sigmoid(logits)

        out = {
            'logits': logits, 'probs': probs,
        }
        if targets is not None:
            bce = self.BCELoss(logits, targets[:, None].float())
            # L2 penalty on the tile-scoring weights only (not on the MLP).
            l2_penalty = 0.5 * (self.conv1x1.weight ** 2).sum()
            out['loss'] = bce + l2_penalty

        return out



In [2]:
# Full training set; the cross-validation cell carves train/validation folds
# out of it as index subsets.
dset = Dataset()
# Template model: fit_and_score() trains a deep copy of it, so every fit starts
# from the same initial weights.
model_ = Model()


# TODO: data augmentation?
# Global counter, incremented by fit_and_score(), that gives each fit its own
# TensorBoard log directory (runs/run_<run_i>).
run_i = 0

def fit_and_score(train_dset, val_dset, nepochs=30, batch_size=10, lr=1e-3, device=None):
    """Train a fresh copy of the template model and return its best validation AUC.

    A deep copy of the module-level ``model_`` is trained with Adam on
    ``train_dset``; after every epoch the ROC AUC on ``val_dset`` is computed.
    Training loss and validation AUC are logged to ``runs/run_<run_i>`` and the
    module-level counter ``run_i`` is incremented.

    Note that the returned value is the maximum AUC over epochs measured on the
    validation set itself, so it is an optimistic estimate; no model checkpoint
    is kept.

    Args:
        train_dset: dataset yielding ``(features, target)`` pairs for training.
        val_dset: dataset yielding ``(features, target)`` pairs for validation.
        nepochs: number of passes over ``train_dset``.
        batch_size: batch size of both loaders (incomplete training batches are
            dropped).
        lr: Adam learning rate.
        device: torch device to train on; defaults to CUDA when available,
            otherwise CPU.

    Returns:
        The highest validation ROC AUC observed over the epochs (0 if
        ``nepochs`` is 0).
    """
    global run_i

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_loader = torch.utils.data.DataLoader(train_dset, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dset, batch_size=batch_size, shuffle=False, drop_last=False)

    model = copy.deepcopy(model_).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def validate():
        """Return the ROC AUC of the current model on the validation loader."""
        model.eval()

        all_preds = []
        all_targets = []
        with torch.no_grad():
            for feats, targets in val_loader:
                all_targets.append(targets.numpy())
                out = model(feats.to(device))
                all_preds.append(out['probs'].cpu().numpy())
        all_targets = np.concatenate(all_targets)
        # probs are (N, 1); flatten so the metric receives one score per sample
        all_preds = np.concatenate(all_preds).ravel()

        return sklearn.metrics.roc_auc_score(all_targets, all_preds)

    writer = SummaryWriter(f'runs/run_{run_i}')
    run_i += 1

    iteration = 0
    best_auc = 0
    try:
        for epoch in range(nepochs):
            model.train()
            for feats, targets in train_loader:
                feats = feats.to(device)
                targets = targets.to(device)
                out = model(feats, targets=targets)

                optimizer.zero_grad()
                out['loss'].backward()
                optimizer.step()

                # log a plain float rather than a tensor still attached to the graph
                writer.add_scalar("Train/loss", out['loss'].item(), iteration)
                iteration += 1

            # validate after each epoch and track the best score seen so far
            auc = validate()
            writer.add_scalar("Val/AUC", auc, iteration)

            if auc > best_auc:
                best_auc = auc
            print(f"Epoch {epoch}: AUC {auc:0.2f}, best AUC {best_auc:0.2f}")
    finally:
        # flush and release the event file even if training is interrupted
        writer.close()

    return best_auc

In [3]:

# Repeated stratified k-fold cross-validation of the model.
num_runs = 1
num_splits = 5

aucs = []
for seed in range(num_runs):
    # Seed torch as well, so that batch shuffling and dropout are repeatable
    # for a given run (up to backend non-determinism).
    torch.manual_seed(seed)

    # New stratified splitter per run; the seed controls how slides are
    # assigned to folds.
    cv = sklearn.model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    targets = dset.training_output['Target'].tolist()

    cv_aucs = []
    # Stratification only depends on y, so a placeholder X of the right length
    # is passed instead of the custom Dataset object.
    for i, (train_inds, val_inds) in enumerate(cv.split(np.zeros(len(targets)), y=targets)):
        train_dset = torch.utils.data.Subset(dset, train_inds)
        val_dset = torch.utils.data.Subset(dset, val_inds)

        # train model on train_dset; evaluate on val_dset
        auc = fit_and_score(train_dset, val_dset)
        # 1-based fold counter so the last fold reads num_splits/num_splits
        print(f"CV [{i + 1}/{num_splits}] -  AUC: {auc:0.2f}")

        cv_aucs.append(auc)

    # one number per run: mean AUC over its folds
    cv_auc = np.array(cv_aucs).mean()
    aucs.append(cv_auc)

aucs = np.array(aucs)

# mean/std are taken over runs (std is 0 when num_runs == 1)
print("Predicting weak labels using Chowder")
print("AUC: mean {}, std {}".format(aucs.mean(), aucs.std()))

Epoch 0: AUC 0.5240641711229946, best AUC 0.5240641711229946
Epoch 1: AUC 0.5655080213903744, best AUC 0.5655080213903744
Epoch 2: AUC 0.5053475935828877, best AUC 0.5655080213903744
Epoch 3: AUC 0.5574866310160427, best AUC 0.5655080213903744
Epoch 4: AUC 0.5762032085561497, best AUC 0.5762032085561497
Epoch 5: AUC 0.4358288770053476, best AUC 0.5762032085561497
Epoch 6: AUC 0.5106951871657754, best AUC 0.5762032085561497
Epoch 7: AUC 0.5280748663101604, best AUC 0.5762032085561497
Epoch 8: AUC 0.6323529411764706, best AUC 0.6323529411764706
Epoch 9: AUC 0.5120320855614973, best AUC 0.6323529411764706
Epoch 10: AUC 0.5521390374331551, best AUC 0.6323529411764706
Epoch 11: AUC 0.7954545454545454, best AUC 0.7954545454545454
Epoch 12: AUC 0.7593582887700534, best AUC 0.7954545454545454
Epoch 13: AUC 0.9064171122994652, best AUC 0.9064171122994652
Epoch 14: AUC 0.9037433155080213, best AUC 0.9064171122994652
Epoch 15: AUC 0.8061497326203209, best AUC 0.9064171122994652
Epoch 16: AUC 0.83

KeyboardInterrupt: 