In [1]:
import umap
import random, os
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from copy import deepcopy
from random import sample

import sys
sys.path.insert(0, "../")


In [None]:
from data_iq.dataiq_class import *
from src.utils.utils import *
from src.models.neuralnets import *
from src.utils.data_loader import *
from src.utils.group_dro_helpers import *

In [60]:
dataset = 'covid'
train_loader, train_data, X_train, y_train, X_test, y_test, X_train_pd, y_train_pd, X_test_pd, y_test_pd, nlabels, corr_vals, column_ids, df = load_dataset(dataset)

try:
  X_test = X_test.to_numpy()
except:
  pass

try:
  y_test = y_test.values
except:
  pass

# TRAIN BASELINE MODEL

In [61]:
LEARNING_RATE = 0.001
EPOCHS=10
BATCH_SIZE=128

latent_test=True
nlabels = len(np.unique(y_train))

if dataset=='fetal':
  EPOCHS_FETAL=20
  EPOCHS=EPOCHS_FETAL

n_feats = X_train.shape[1]
train_data = TrainData(torch.FloatTensor(X_train), 
                      torch.FloatTensor(y_train))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

loss_list=[]
loss_list_test = []
checkpoint_list = []
dataiq_list=[]

for i in range(1):
  from aum import DatasetWithIndex
  train_loader = DataLoader(dataset=DatasetWithIndex(train_data), batch_size=BATCH_SIZE, shuffle=True)
  ckpt_nets = []
  net = Net1(input_size=n_feats,nlabels=nlabels)
  net.to(device)
  criterion = torch.nn.NLLLoss()

  optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
  dataiq = DataIQ_Torch(X=X_train , y=y_train, sparse_labels=True)

  for e in range(1, EPOCHS+1):
      net.train()
      epoch_loss = 0
      epoch_acc = 0
      for X_batch, y_batch, sample_ids in train_loader:
          X_batch, y_batch = X_batch.to(device), y_batch.to(device)
          optimizer.zero_grad()
          sf = nn.LogSoftmax()
          y_pred = net(X_batch)

          _, predicted = torch.max(y_pred.data, 1)

          y_batch=y_batch.to(torch.int64)
          
          loss = criterion(sf(y_pred), y_batch)

          loss.backward()
          optimizer.step()
          
          epoch_loss += loss.item()
          epoch_acc += (predicted == y_batch).sum().item()/len(y_batch)

      
      dataiq.on_epoch_end(net, device=device)
      loss_list.append(epoch_loss/len(train_loader))
      print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
      ckpt_nets.append(deepcopy(net))

  checkpoint_list.append(ckpt_nets)
  dataiq_list.append(dataiq)

  X = F.softmax(self.output(X))


Epoch 001: | Loss: 0.68592 | Acc: 0.553
Epoch 002: | Loss: 0.67793 | Acc: 0.544
Epoch 003: | Loss: 0.65535 | Acc: 0.650
Epoch 004: | Loss: 0.59964 | Acc: 0.698
Epoch 005: | Loss: 0.57595 | Acc: 0.726
Epoch 006: | Loss: 0.57389 | Acc: 0.723
Epoch 007: | Loss: 0.56123 | Acc: 0.738
Epoch 008: | Loss: 0.56353 | Acc: 0.735
Epoch 009: | Loss: 0.56534 | Acc: 0.733
Epoch 010: | Loss: 0.56427 | Acc: 0.739


# GET INTERMEDIATE (LATENT) REPRESENTATIONS WHICH WE WILL PROJECT WITH UMAP

In [62]:
mymodel=checkpoint_list[0][9]

activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook
mymodel.dense3.register_forward_hook(get_activation('dense3'))
output = mymodel(torch.tensor(X_train,device=device))
intermediate_train = activation['dense3'].cpu().numpy()

activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook
mymodel.dense3.register_forward_hook(get_activation('dense3'))
output = mymodel(torch.tensor(X_test,device=device).float())
intermediate_test = activation['dense3'].cpu().numpy()

# UMAP TRANSFORMATION - LOWER DIMENSIONAL REPRESENTATION

In [63]:
mapper = umap.UMAP().fit(intermediate_train, y=y_train)
embedding_train = mapper.transform(intermediate_train)
embedding_test = mapper.transform(intermediate_test)

# GET DATA-IQ subgroups

In [64]:
d_idx=0
aleatoric_train = dataiq_list[d_idx].aleatoric
confidence_train = dataiq_list[d_idx].confidence

percentile_thresh = 50
conf_thresh = 0.5
conf_thresh_low = 0.25
conf_thresh_high = 0.75

hard_train = np.where((confidence_train <= conf_thresh_low) & (aleatoric_train <= np.percentile(aleatoric_train,   percentile_thresh)))[0]
easy_train = np.where((confidence_train >= conf_thresh_high) & (aleatoric_train <= np.percentile(aleatoric_train,   percentile_thresh)))[0]

hard_easy = np.concatenate((hard_train,easy_train))
ambig_train = []
for id in range(len(confidence_train)):
  if id not in hard_easy:
    ambig_train.append(id)
ambig_train= np.array(ambig_train)

print('Train :', len(hard_train),  len(ambig_train),  len(easy_train))

Train : 341 2064 1724


# Now use neighbors in the UMAP to get potential test group labels

In [65]:
from sklearn.neighbors import KNeighborsClassifier

y_train_groups = []
for i in range(len(y_train)):
  if i in easy_train:
    y_train_groups.append(0)
  elif i in ambig_train:
    y_train_groups.append(1)
  elif i in hard_train:
    y_train_groups.append(2)
y_train_groups = np.array(y_train_groups)

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X= embedding_train, y=y_train_groups)
test_groups = neigh.predict(embedding_test)

if latent_test:
  easy_test = np.where(test_groups==0)[0]
  ambig_test = np.where(test_groups==1)[0]
  hard_test = np.where(test_groups==2)[0]
print('Test :', len(hard_test),  len(ambig_test),  len(easy_test))

Test : 92 1424 1237


# GROUP-DRO: subgroups are clusters within the Data-IQ subgroups

### Get clusters within the Data-IQ subgroups

In [66]:
from sklearn.mixture import GaussianMixture
from sklearn import metrics

# Superclass: Data-IQ easy group
partition0 = easy_train
sil_score = []
for i in range(10):
  gm = GaussianMixture(n_components=i+2, random_state=0).fit(embedding_train[partition0,:])
  clusters = gm.predict(embedding_train[partition0,:])
  sil_score.append(metrics.silhouette_score(embedding_train[partition0,:], clusters, metric='euclidean'))
best = np.argmax(sil_score)+2
gm = GaussianMixture(n_components=best, random_state=0).fit(embedding_train[partition0,:])
clusters0 = gm.predict(embedding_train[partition0,:])

# Superclass: Data-IQ ambig group
partition1 = ambig_train
sil_score = []
for i in range(10):
  gm = GaussianMixture(n_components=i+2, random_state=0).fit(embedding_train[partition1,:])
  clusters = gm.predict(embedding_train[partition1,:])
  sil_score.append(metrics.silhouette_score(embedding_train[partition1,:], clusters, metric='euclidean'))
best = np.argmax(sil_score)+2
gm = GaussianMixture(n_components=best, random_state=0).fit(embedding_train[partition1,:])
clusters1 = gm.predict(embedding_train[partition1,:])

# Superclass: Data-IQ hard group
partition2 = hard_train
sil_score = []
for i in range(10):
  gm = GaussianMixture(n_components=i+2, random_state=0).fit(embedding_train[partition2,:])
  clusters = gm.predict(embedding_train[partition2,:])
  sil_score.append(metrics.silhouette_score(embedding_train[partition2,:], clusters, metric='euclidean'))
best = np.argmax(sil_score)+2
gm = GaussianMixture(n_components=best, random_state=0).fit(embedding_train[partition2,:])
clusters2 = gm.predict(embedding_train[partition2,:])

### Assign the subclass labels

In [68]:
X_trainG = np.concatenate((X_train[partition0,:],X_train[partition1,:],X_train[partition2,:]))
y_trainG = np.concatenate((y_train[partition0],y_train[partition1],y_train[partition2]))
superclass_labels = y_trainG

max0 = np.max(np.unique(clusters0))
max1 = np.max(np.unique(clusters1))
clusters1_xp = clusters1+(max0+1)
clusters2_xp = clusters2+(max0+max1+1+1)
subclass_labels = np.concatenate((clusters0, clusters1_xp, clusters2_xp))

class_map = {}
superclass_set = sorted(set(np.array(superclass_labels)))
for superclass in superclass_set:
    class_map[superclass] = sorted(
        np.unique(np.array(subclass_labels[superclass_labels == superclass])))
sup_sub_map = class_map

class_map = torch.tensor(subclass_labels) == torch.arange(len(np.unique(subclass_labels))).unsqueeze(1).long()
subclass_counts = class_map.sum(1).float()

### Train w/ Group-DRO

In [69]:
LEARNING_RATE = 0.001
EPOCHS=10
BATCH_SIZE=16
num_subclasses = len(subclass_counts)

if dataset=='fetal':
  EPOCHS=EPOCHS_FETAL

dro_netC = Net1(input_size=X_trainG.shape[1],nlabels=nlabels)
dro_netC.to(device)

train_data = TrainDataDRO(torch.FloatTensor(X_trainG), 
                       torch.FloatTensor(y_trainG), 
                       torch.tensor(subclass_labels))
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

optimizer = optim.Adam(dro_netC.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss(reduction='none')
robust=True

if robust:
    size_adjustment=1
    size_adjustments = [size_adjustment] * num_subclasses
else:
    size_adjustments = None
robust_lr=0.001
criterion = LossComputer(criterion, robust, num_subclasses, subclass_counts,
                          robust_lr, stable=True,
                          size_adjustments=size_adjustments,
                          auroc_version=False,
                          class_map=sup_sub_map, use_cuda=True)

dro_netC = train_loop(net=dro_netC, criterion=criterion, EPOCHS=EPOCHS, train_loader=train_loader,optimizer=optimizer, device=device)

  X = F.softmax(self.output(X))
  co = criterion(sf(y_pred), y_batch, y_subclass)


Epoch 001: | Loss: 0.60756 | Acc: 0.623
Epoch 002: | Loss: 0.58245 | Acc: 0.671
Epoch 003: | Loss: 0.57454 | Acc: 0.705
Epoch 004: | Loss: 0.56736 | Acc: 0.718
Epoch 005: | Loss: 0.57973 | Acc: 0.718
Epoch 006: | Loss: 0.57834 | Acc: 0.725
Epoch 007: | Loss: 0.58031 | Acc: 0.727
Epoch 008: | Loss: 0.58269 | Acc: 0.719
Epoch 009: | Loss: 0.58639 | Acc: 0.725
Epoch 010: | Loss: 0.57613 | Acc: 0.722


# GROUP-DRO: subgroups are clusters on the superclass (e.g George)

### Get clusters within the superclass labels

In [70]:
# Superclass: label=0
partition0 = np.where(y_train==0)[0]
sil_score = []
for i in range(10):
  gm = GaussianMixture(n_components=i+2, random_state=0).fit(embedding_train[partition0,:])
  clusters = gm.predict(embedding_train[partition0,:])
  sil_score.append(metrics.silhouette_score(embedding_train[partition0,:], clusters, metric='euclidean'))
best = np.argmax(sil_score)+2
gm = GaussianMixture(n_components=best, random_state=0).fit(embedding_train[partition0,:])
clusters0 = gm.predict(embedding_train[partition0,:])

# Superclass: label=1
partition1 = np.where(y_train==1)[0]
sil_score = []
for i in range(10):
  gm = GaussianMixture(n_components=i+2, random_state=0).fit(embedding_train[partition1,:])
  clusters = gm.predict(embedding_train[partition1,:])
  sil_score.append(metrics.silhouette_score(embedding_train[partition1,:], clusters, metric='euclidean'))
best = np.argmax(sil_score)+2
gm = GaussianMixture(n_components=best, random_state=0).fit(embedding_train[partition1,:])
clusters1 = gm.predict(embedding_train[partition1,:])

if dataset=='fetal':
  # Superclass: label=2
  partition2 = np.where(y_train==2)[0]
  sil_score = []
  for i in range(10):
    gm = GaussianMixture(n_components=i+2, random_state=0).fit(embedding_train[partition2,:])
    clusters = gm.predict(embedding_train[partition2,:])
    sil_score.append(metrics.silhouette_score(embedding_train[partition2,:], clusters, metric='euclidean'))
  best = np.argmax(sil_score)+2
  gm = GaussianMixture(n_components=best, random_state=0).fit(embedding_train[partition2,:])
  clusters2 = gm.predict(embedding_train[partition2,:])

### Assign the subclass labels

In [72]:
X_trainG = np.concatenate((X_train[partition0,:],X_train[partition1,:],X_train[partition2,:]))
y_trainG = np.concatenate((y_train[partition0],y_train[partition1],y_train[partition2]))
superclass_labels = y_trainG
max0 = np.max(np.unique(clusters0))
max1 = np.max(np.unique(clusters1))
clusters1_xp = clusters1+(max0+1)
clusters2_xp = clusters2+(max0+max1+1+1)
subclass_labels = np.concatenate((clusters0, clusters1_xp, clusters2_xp))

class_map = {}
superclass_set = sorted(set(np.array(superclass_labels)))
for superclass in superclass_set:
    class_map[superclass] = sorted(
        np.unique(np.array(subclass_labels[superclass_labels == superclass])))
sup_sub_map = class_map

class_map = torch.tensor(subclass_labels) == torch.arange(len(np.unique(subclass_labels))).unsqueeze(1).long()
subclass_counts = class_map.sum(1).float()

### Train w/ Group-DRO

In [73]:
LEARNING_RATE = 0.001
EPOCHS=10
BATCH_SIZE=16
num_subclasses = len(subclass_counts)
if dataset=='fetal':
  EPOCHS=EPOCHS_FETAL

dro_netG = Net1(input_size=X_trainG.shape[1],nlabels=nlabels)
dro_netG.to(device)

train_data = TrainDataDRO(torch.FloatTensor(X_trainG), 
                       torch.FloatTensor(y_trainG), 
                       torch.tensor(subclass_labels))
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

optimizer = optim.Adam(dro_netG.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss(reduction='none')
robust=True

if robust:
    size_adjustment=1
    size_adjustments = [size_adjustment] * num_subclasses
else:
    size_adjustments = None
robust_lr=0.001
criterion = LossComputer(criterion, robust, num_subclasses, subclass_counts,
                          robust_lr, stable=True,
                          size_adjustments=size_adjustments,
                          auroc_version=False,
                          class_map=sup_sub_map, use_cuda=True)

dro_netG = train_loop(net=dro_netG, criterion=criterion, EPOCHS=EPOCHS, train_loader=train_loader,optimizer=optimizer, device=device)


  X = F.softmax(self.output(X))
  co = criterion(sf(y_pred), y_batch, y_subclass)


Epoch 001: | Loss: 0.60536 | Acc: 0.504
Epoch 002: | Loss: 0.59503 | Acc: 0.617
Epoch 003: | Loss: 0.59392 | Acc: 0.632
Epoch 004: | Loss: 0.59942 | Acc: 0.604
Epoch 005: | Loss: 0.58561 | Acc: 0.643
Epoch 006: | Loss: 0.59201 | Acc: 0.642
Epoch 007: | Loss: 0.59323 | Acc: 0.652
Epoch 008: | Loss: 0.60020 | Acc: 0.654
Epoch 009: | Loss: 0.60596 | Acc: 0.652
Epoch 010: | Loss: 0.60269 | Acc: 0.654


# Group-DRO: on Data-IQ subgroups

### Assign subclasses as Data-IQ subgroups

In [74]:
superclass_labels = y_train
subclass_labels = []
for i in range(len(y_train)):
  if i in easy_train:
    subclass_labels.append(0)
  elif i in ambig_train:
    subclass_labels.append(1)
  else:
    subclass_labels.append(2)
subclass_labels = np.array(subclass_labels)

class_map = {}
superclass_set = sorted(set(np.array(superclass_labels)))
for superclass in superclass_set:
    class_map[superclass] = sorted(
        np.unique(np.array(subclass_labels[superclass_labels == superclass])))
sup_sub_map = class_map

class_map = torch.tensor(subclass_labels) == torch.arange(3).unsqueeze(1).long()
subclass_counts = class_map.sum(1).float()

### Train w/ Group-DRO

In [75]:
LEARNING_RATE = 0.001
EPOCHS=10
BATCH_SIZE=16
num_subclasses = 3
if dataset=='fetal':
  EPOCHS=EPOCHS_FETAL

ids = np.concatenate((easy_train, ambig_train))

train_data = TrainDataDRO(torch.FloatTensor(X_train[ids,:]), 
                       torch.FloatTensor(y_train[ids]), 
                       torch.tensor(subclass_labels[ids]))
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

dro_net = Net1(X_train[ids,:].shape[1],nlabels=nlabels)
dro_net.to(device)

optimizer = optim.Adam(dro_net.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss(reduction='none')
robust=True

if robust:
    size_adjustment=0
    size_adjustments = [size_adjustment] * num_subclasses
else:
    size_adjustments = None
robust_lr=0.001
criterion = LossComputer(criterion=criterion, is_robust=robust, n_groups = num_subclasses, group_counts=subclass_counts,
                          robust_step_size= robust_lr, stable=True,
                          size_adjustments=size_adjustments,
                          auroc_version=False,
                          class_map=sup_sub_map, use_cuda=True)

dro_net = train_loop(net=dro_net, criterion=criterion, EPOCHS=EPOCHS, train_loader=train_loader,optimizer=optimizer, device=device)


Epoch 001: | Loss: 0.46626 | Acc: 0.548
Epoch 002: | Loss: 0.37773 | Acc: 0.762
Epoch 003: | Loss: 0.37450 | Acc: 0.787
Epoch 004: | Loss: 0.38506 | Acc: 0.795
Epoch 005: | Loss: 0.40209 | Acc: 0.793
Epoch 006: | Loss: 0.41648 | Acc: 0.796
Epoch 007: | Loss: 0.43462 | Acc: 0.793
Epoch 008: | Loss: 0.44221 | Acc: 0.795
Epoch 009: | Loss: 0.45634 | Acc: 0.798
Epoch 010: | Loss: 0.46809 | Acc: 0.798


# JTT based on errors

In [76]:
n_reps=5

# Get errors on base model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net_test = checkpoint_list[0][9] 
net_test.eval()
with torch.no_grad():
    X_batch = torch.tensor(X_train)
    X_batch = X_batch.to(device)
    y_test_pred = net_test(X_batch)
threshold=0.5
preds = y_test_pred.data[:,1].cpu().numpy()
y_pred = preds>threshold

# Augment with the errors
aug_ids = np.where(np.not_equal(y_pred, y_train))[0]
aug_feats = np.repeat(X_train[aug_ids,:], n_reps, axis=0)
aug_labels = np.repeat(y_train[aug_ids], n_reps, axis=0)

# Update the training set
X_train_jtt = np.vstack((X_train, aug_feats))
y_train_jtt = np.hstack((y_train, aug_labels))


# RE-TRAIN
LEARNING_RATE = 0.001
EPOCHS=10
BATCH_SIZE=128

if dataset=='fetal':
  EPOCHS=EPOCHS_FETAL

jtt_net = Net1(X_train.shape[1],nlabels=nlabels)
jtt_net.to(device)

train_data = TrainData(torch.FloatTensor(X_train_jtt), 
                       torch.FloatTensor(y_train_jtt))

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

criterion = torch.nn.NLLLoss()
optimizer = optim.Adam(jtt_net.parameters(), lr=LEARNING_RATE)
jtt_net = train_loop(net=jtt_net, criterion=criterion, EPOCHS=EPOCHS, train_loader=train_loader,optimizer=optimizer, device=device, subclass=False)


  loss = criterion(sf(y_pred), y_batch)


Epoch 001: | Loss: 0.68446 | Acc: 0.574
Epoch 002: | Loss: 0.68309 | Acc: 0.575
Epoch 003: | Loss: 0.68064 | Acc: 0.574
Epoch 004: | Loss: 0.66707 | Acc: 0.602
Epoch 005: | Loss: 0.64115 | Acc: 0.644
Epoch 006: | Loss: 0.62301 | Acc: 0.671
Epoch 007: | Loss: 0.61618 | Acc: 0.677
Epoch 008: | Loss: 0.60513 | Acc: 0.701
Epoch 009: | Loss: 0.60129 | Acc: 0.701
Epoch 010: | Loss: 0.59852 | Acc: 0.708


In [77]:
results={}
results['jtt'] = evaluate_model(net_test=jtt_net, X_test=X_test, y_test=y_test, easy_test=easy_test, incons_test=ambig_test, hard_test=hard_test)

results['dro_iq'] = evaluate_model(net_test=dro_net, X_test=X_test, y_test=y_test, easy_test=easy_test, incons_test=ambig_test, hard_test=hard_test)
                                   
results['dro-george'] = evaluate_model(net_test=dro_netG, X_test=X_test, y_test=y_test, easy_test=easy_test, incons_test=ambig_test, hard_test=hard_test)

results['baseline'] = evaluate_model(net_test=checkpoint_list[0][9], X_test=X_test, y_test=y_test, easy_test=easy_test, incons_test=ambig_test, hard_test=hard_test)

# ADDITIONAL METHOD - CAN UNCOMMENT --> this is group-dro on the clustered space of Data-IQ subgroups
#results['dro-cluster'] = evaluate_model(net_test= dro_netC, X_test=X_test, y_test=y_test, easy_test=easy_test, incons_test=ambig_test, hard_test=hard_test)

  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))
  X = F.softmax(self.output(X))


In [78]:
results

{'jtt': {'overall': 0.40864511442063206,
  'rest': 0.44543249797898143,
  'ambig': 0.3806179775280899},
 'dro_iq': {'overall': 0.7232110424990918,
  'rest': 0.8302344381568311,
  'ambig': 0.6306179775280899},
 'dro-george': {'overall': 0.6785325099891028,
  'rest': 0.7865804365400162,
  'ambig': 0.5884831460674157},
 'baseline': {'overall': 0.7221213221939702,
  'rest': 0.8302344381568311,
  'ambig': 0.6285112359550562}}