# Notebook to replicate the results of the experiment conducted on the Offensive Language and Hate Speech dataset by Davidson et al. (2017)

# Preparations

In [1]:
import random
import math
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from tabulate import tabulate

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

download and unzip glove.6B.100d.txt from Stanford University

In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-01-14 15:42:24--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-01-14 15:42:24--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-01-14 15:42:24--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

download labelled_data.npy from Github Repo of Towards Unbiased and Accurate Deferral to Multiple Experts by Keswani et al. (2021)

In [3]:
#download from https://github.com/vijaykeswani/Deferral-To-Multiple-Experts/blob/3aa1f621991cb9d7823757d68c178870b686da62/output/labelled_data.npy

#Hyperparameter Definition

In [4]:
NUM_CLASSES = 2
DROPOUT = 0.00
NUM_HIDDEN_UNITS = 50

PATH_OLHS_DATA = 'labelled_data.npy'
PATH_GLOVE_MODEL = 'glove.6B.100d.txt'

NUM_EXPERTS = 20
USE_LR_SCHEDULER = True

TRAIN_BATCH_SIZE = 512
TEST_BATCH_SIZE = 512
LR = 5e-3
EPOCHS = 20

#Definition of Classes and Functions

Classes for Dataset and Dataloader

In [5]:
print("Loading Glove Model")
f = open(PATH_GLOVE_MODEL, 'r')
GLOVE_MODEL = {}
for line in f:
    split_lines = line.split()
    word = split_lines[0]
    word_embedding = np.array([float(value) for value in split_lines[1:]])
    GLOVE_MODEL[word] = word_embedding


class OLHS_Dataset(Dataset):
    def __init__(self, data: pd.DataFrame) -> None:
        self.post_ids = data["PostId"].values
        self.targets = data["Label"].values
        self.groups = data["Group"].values

        self.posts_features = []
        for features in data["Feature"].values:
            features = torch.from_numpy(features).float()
            features = features.to(device)
            self.posts_features.append(features)

    def __getitem__(self, index: int):
        post_id, target, group = self.post_ids[index], self.targets[index], self.groups[index]
        post_features = self.posts_features[index]
        return post_features, target, group, post_id

    def __len__(self) -> int:
        return len(self.posts_features)

class OLHS_3_Split_Dataloader:
    def __init__(self, train_batch_size=128, test_batch_size=128, seed=42):
        self.train_batch_size = train_batch_size
        self.test_batch_size = test_batch_size
        self.seed = seed

        vocab = GLOVE_MODEL.keys()

        data = np.load(PATH_OLHS_DATA, allow_pickle=True)[()]
        posts = data['posts']
        labels = data['labels']
        labels = [1 if int(l) < 2 else 0 for l in labels]
        groups = data['groups']
        groups = [int(g) for g in groups]

        postsToFeatures = {}
        for i, p in enumerate(posts):
            feat = []
            for w in p:
                if w in vocab:
                    feat.append(GLOVE_MODEL[w])
            if len(feat) == 0:
                continue

            feat = np.mean(feat, axis=0)
            postsToFeatures[i] = feat

        all_indices = list(postsToFeatures.keys())
        train_len = int(len(posts) * 0.8)

        np.random.seed(self.seed)
        train_indices = np.random.choice(all_indices, train_len, replace=False)
        val_test_indices = np.setdiff1d(all_indices, train_indices)

        val_len = int(len(posts) * 0.1)
        val_indices = np.random.choice(val_test_indices, val_len, replace=False)

        test_indices = np.setdiff1d(val_test_indices, val_indices)

        train_features = [postsToFeatures[i] for i in train_indices]
        train_labels = [labels[i] for i in train_indices]
        train_groups = [groups[i] for i in train_indices]

        val_features = [postsToFeatures[i] for i in val_indices]
        val_labels = [labels[i] for i in val_indices]
        val_groups = [groups[i] for i in val_indices]

        test_features = [postsToFeatures[i] for i in test_indices]
        test_labels = [labels[i] for i in test_indices]
        test_groups = [groups[i] for i in test_indices]

        train_df = pd.DataFrame({"PostId": train_indices, "Label": train_labels, "Group": train_groups, "Feature": train_features})
        val_df = pd.DataFrame({"PostId": val_indices, "Label": val_labels, "Group": val_groups, "Feature": val_features})
        test_df = pd.DataFrame({"PostId": test_indices, "Label": test_labels, "Group": test_groups, "Feature": test_features})

        self.trainset = OLHS_Dataset(train_df)
        self.valset = OLHS_Dataset(val_df)
        self.testset = OLHS_Dataset(test_df)

    def get_data_loader(self):
        train_loader = self._get_data_loader(dataset=self.trainset, batch_size=self.train_batch_size, drop_last=True)
        val_loader = self._get_data_loader(dataset=self.valset, batch_size=self.test_batch_size, drop_last=False)
        test_loader = self._get_data_loader(dataset=self.testset, batch_size=self.test_batch_size, drop_last=False)
        return train_loader, val_loader, test_loader

    def _get_data_loader(self, dataset, batch_size, drop_last, shuffle=True):
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0, drop_last=drop_last)

Loading Glove Model


Functions for our loss and JSF loss

In [6]:
def joint_sparse_framework_loss(epoch, classifier_output, allocation_system_output, expert_preds, targets):
    # Input:
    #   epoch: int = current epoch (used for epoch-dependent weighting of allocation system loss)
    #   classifier_output: softmax probabilities as class probabilities,  nxm matrix with n=batch size, m=number of classes
    #   allocation_system_output: sigmoid outputs as expert weights,  nx(m+1) matrix with n=batch size, m=number of experts + 1 for machine
    #   expert_preds: nxm matrix with expert predictions with n=number of experts, m=number of classes
    #   targets: targets as 1-dim vector with n length with n=batch_size

    # loss for allocation system 

    # set up zero-initialized tensor to store weighted team predictions
    batch_size = len(targets)
    weighted_team_preds = torch.zeros((batch_size, NUM_CLASSES)).to(classifier_output.device)

    # for each team member add the weighted prediction to the team prediction
    # start with machine
    weighted_team_preds = weighted_team_preds + allocation_system_output[:, 0].reshape(-1, 1) * classifier_output
    # continue with human experts
    for idx in range(NUM_EXPERTS):
        one_hot_expert_preds = torch.tensor(np.eye(NUM_CLASSES)[expert_preds[idx].astype(int)]).to(classifier_output.device)
        weighted_team_preds = weighted_team_preds + allocation_system_output[:, idx + 1].reshape(-1, 1) * one_hot_expert_preds

    # calculate team probabilities using softmax
    team_probs = nn.Softmax(dim=1)(weighted_team_preds)

    # alpha2 is 1-epoch^0.5 (0.5 taken from code of preprint paper) <--- used for experiments
    alpha2 = 1 - (epoch ** -0.5)
    alpha2 = torch.tensor(alpha2).to(classifier_output.device)

    # weight the negative log likelihood loss with alpha2 to get team loss
    log_team_probs = torch.log(team_probs + 1e-7)
    allocation_system_loss = nn.NLLLoss(reduction="none")(log_team_probs, targets.long())
    allocation_system_loss = torch.mean(alpha2 * allocation_system_loss)

    # loss for classifier

    alpha1 = 1
    log_classifier_output = torch.log(classifier_output + 1e-7)
    classifier_loss = nn.NLLLoss(reduction="none")(log_classifier_output, targets.long())
    classifier_loss = alpha1 * torch.mean(classifier_loss)

    # combine both losses
    system_loss = classifier_loss + allocation_system_loss

    return system_loss

def our_loss(epoch, classifier_output, allocation_system_output, expert_preds, targets):
    # Input:
    #   epoch: int = current epoch (not used)
    #   classifier_output: softmax probabilities as class probabilities,  nxm matrix with n=batch size, m=number of classes
    #   allocation_system_output: softmax outputs as weights,  nx(m+1) matrix with n=batch size, m=number of experts + 1 for machine
    #   expert_preds: nxm matrix with expert predictions with n=number of experts, m=number of classes
    #   targets: targets as 1-dim vector with n length with n=batch_size

    batch_size = len(targets)
    team_probs = torch.zeros((batch_size, NUM_CLASSES)).to(classifier_output.device) # set up zero-initialized tensor to store team predictions
    team_probs = team_probs + allocation_system_output[:, 0].reshape(-1, 1) * classifier_output # add the weighted classifier prediction to the team prediction
    for idx in range(NUM_EXPERTS): # continue with human experts
        one_hot_expert_preds = torch.tensor(np.eye(NUM_CLASSES)[expert_preds[idx].astype(int)]).to(classifier_output.device)
        team_probs = team_probs + allocation_system_output[:, idx + 1].reshape(-1, 1) * one_hot_expert_preds

    log_output = torch.log(team_probs + 1e-7)
    system_loss = nn.NLLLoss()(log_output, targets)

    return system_loss

def mixture_of_ai_experts_loss(allocation_system_output, classifiers_outputs, targets):
    batch_size = len(targets)
    team_probs = torch.zeros((batch_size, NUM_CLASSES)).to(allocation_system_output.device)
    classifiers_outputs = classifiers_outputs.to(allocation_system_output.device)

    for idx in range(NUM_EXPERTS+1):
        team_probs = team_probs + allocation_system_output[:, idx].reshape(-1, 1) * classifiers_outputs[idx]

    log_output = torch.log(team_probs + 1e-7)
    moae_loss = nn.NLLLoss()(log_output, targets)

    return moae_loss

def mixture_of_human_experts_loss(allocation_system_output, human_expert_preds, targets):
    batch_size = len(targets)
    team_probs = torch.zeros((batch_size, NUM_CLASSES)).to(allocation_system_output.device)

    # human experts
    for idx in range(NUM_EXPERTS):
        one_hot_expert_preds = torch.tensor(np.eye(NUM_CLASSES)[human_expert_preds[idx].astype(int)]).to(allocation_system_output.device)
        team_probs = team_probs + allocation_system_output[:, idx].reshape(-1, 1) * one_hot_expert_preds

    log_output = torch.log(team_probs + 1e-7)
    mohe_loss = nn.NLLLoss()(log_output, targets)

    return mohe_loss

Class for classifier and allocation system network

In [7]:
class Network(nn.Module):
    def __init__(self, output_size, softmax_sigmoid="softmax"):
        super().__init__()
        self.softmax_sigmoid = softmax_sigmoid

        self.classifier = nn.Sequential(
            nn.Dropout(DROPOUT),
            nn.Linear(100, NUM_HIDDEN_UNITS),
            nn.ReLU(),
            nn.Linear(NUM_HIDDEN_UNITS, output_size)
        )

    def forward(self, features):
        output = self.classifier(features)
        if self.softmax_sigmoid == "softmax":
            output = nn.Softmax(dim=1)(output)
        elif self.softmax_sigmoid == "sigmoid":
            output = nn.Sigmoid()(output)
        return output

Classes and Functions for Experts

In [8]:
def flip(p):
    return 1 if random.random() < p else 0

class Expert:
    def __init__(self, pq, post_indices, labels, groups):
        self.pq = pq

        self.preds = {}
        for group, label, post_idx in zip(groups, labels, post_indices):
            toss = flip(self.pq[int(group)])
            pred = int(label) if toss else 1 - int(label)
            self.preds[post_idx] = pred

    def predict(self, post_indices):
        predictions = [self.preds[post_idx] for post_idx in post_indices]
        return predictions

    def __str__(self):
        return "Expert accuracies:" + str(round(self.p[0], 2)) + ", " + str(round(self.p[0], 2))

class AverageExpert:
    def __init__(self, expert_fns=[]):
        self.expert_fns = expert_fns
        self.num_experts = len(self.expert_fns)

    def predict(self, post_indices):
        random.shuffle(self.expert_fns)
        all_experts_predictions = [expert_fn(post_indices) for expert_fn in self.expert_fns]
        predictions = [None] * len(post_indices)

        for idx, expert_predictions in enumerate(all_experts_predictions):
            predictions[idx::self.num_experts] = expert_predictions[idx::self.num_experts]

        return predictions

def get_uniform_experts(num, min_prob=0.5, toggle=0):
    data = np.load(PATH_OLHS_DATA, allow_pickle=True)[()]
    labels = data['labels']
    labels = [1 if int(l) < 2 else 0 for l in labels]
    groups = data['groups']
    groups = [int(g) for g in groups]
    post_indices = list(range(0,len(labels)))

    experts = []
    for i in range(num):
        p = np.random.uniform(min_prob, 1)
        q = np.random.uniform(min_prob, p)
        if toggle:
            p, q = q, p

        experts.append(Expert((p, q), post_indices, labels, groups))
    return experts

Functions for Metric Calculation

In [9]:
def get_accuracy(preds, targets):
    if len(targets) > 0:
        acc = accuracy_score(targets, preds)
    else:
        acc = 0

    return acc

def get_coverage(task_subset_targets, targets):
    num_images = len(targets)
    num_images_in_task_subset = len(task_subset_targets)
    coverage = num_images_in_task_subset / num_images

    return coverage

def get_classifier_metrics(classifier_preds, allocation_system_decisions, targets):
    # classifier performance on all tasks
    classifier_accuracy = get_accuracy(classifier_preds, targets)

    # filter for subset of tasks that are allocated to the classifier
    task_subset = (allocation_system_decisions == 0)

    # classifier performance on those tasks
    task_subset_classifier_preds = classifier_preds[task_subset]
    task_subset_targets = targets[task_subset]
    classifier_task_subset_accuracy = get_accuracy(task_subset_classifier_preds, task_subset_targets)

    # coverage
    classifier_coverage = get_coverage(task_subset_targets, targets)

    return classifier_accuracy, classifier_task_subset_accuracy, classifier_coverage

def get_experts_metrics(expert_preds, allocation_system_decisions, targets):
    expert_accuracies = []
    expert_task_subset_accuracies = []
    expert_coverages = []

    # calculate metrics for each expert
    for expert_idx in range(NUM_EXPERTS):

        # expert performance on all tasks
        preds = expert_preds[expert_idx]
        expert_accuracy = get_accuracy(preds, targets)

        # filter for subset of tasks that are allocated to the expert with number "idx"
        task_subset = (allocation_system_decisions == expert_idx+1)

        # expert performance on tasks assigned by allocation system
        task_subset_expert_preds = preds[task_subset]
        task_subset_targets = targets[task_subset]
        expert_task_subset_accuracy = get_accuracy(task_subset_expert_preds, task_subset_targets)

        # coverage
        expert_coverage = get_coverage(task_subset_targets, targets)

        expert_accuracies.append(expert_accuracy)
        expert_task_subset_accuracies.append(expert_task_subset_accuracy)
        expert_coverages.append(expert_coverage)

    return expert_accuracies, expert_task_subset_accuracies, expert_coverages

def get_metrics(epoch, allocation_system_outputs, classifier_outputs, expert_preds, targets, loss_fn):
    metrics = {}

    # Metrics for system
    allocation_system_decisions = np.argmax(allocation_system_outputs, 1)
    classifier_preds = np.argmax(classifier_outputs, 1)
    preds = np.vstack((classifier_preds, expert_preds)).T
    system_preds = preds[range(len(preds)), allocation_system_decisions.astype(int)]
    system_accuracy = get_accuracy(system_preds, targets)

    system_loss = loss_fn(epoch=epoch,
                          classifier_output=torch.tensor(classifier_outputs).float(),
                          allocation_system_output=torch.tensor(allocation_system_outputs).float(),
                          expert_preds=expert_preds,
                          targets=torch.tensor(targets).long())

    metrics["System Accuracy"] = system_accuracy
    metrics["System Loss"] = system_loss

    # Metrics for classifier
    classifier_accuracy, classifier_task_subset_accuracy, classifier_coverage = get_classifier_metrics(classifier_preds, allocation_system_decisions, targets)
    metrics["Classifier Accuracy"] = classifier_accuracy
    metrics["Classifier Task Subset Accuracy"] = classifier_task_subset_accuracy
    metrics["Classifier Coverage"] = classifier_coverage

    # Metrics for experts 
    """expert_accuracies, experts_task_subset_accuracies, experts_coverages = get_experts_metrics(expert_preds, allocation_system_decisions, targets)

    for expert_idx, (expert_accuracy, expert_task_subset_accuracy, expert_coverage) in enumerate(zip(expert_accuracies, experts_task_subset_accuracies, experts_coverages)):
        metrics[f'Expert {expert_idx+1} Accuracy'] = expert_accuracy
        metrics[f'Expert {expert_idx+1} Task Subset Accuracy'] = expert_task_subset_accuracy
        metrics[f'Expert {expert_idx+1} Coverage'] = expert_coverage"""

    return system_accuracy, system_loss, metrics

Functions for Training and Evaluation of Our Approach and JSF

In [10]:
def train_one_epoch(epoch, classifier, allocation_system, train_loader, optimizer, scheduler, expert_fns, loss_fn):
    classifier.train()
    allocation_system.train()

    for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(train_loader):
        batch_targets = batch_targets.to(device)

        expert_batch_preds = np.empty((NUM_EXPERTS, len(batch_targets)))
        for idx, expert_fn in enumerate(expert_fns):
            expert_batch_preds[idx] = np.array(expert_fn(batch_post_ids.numpy()))

        batch_outputs_classifier = classifier(batch_input)
        batch_outputs_allocation_system = allocation_system(batch_input)

        batch_loss = loss_fn(epoch=epoch, classifier_output=batch_outputs_classifier, allocation_system_output=batch_outputs_allocation_system,
                                expert_preds=expert_batch_preds, targets=batch_targets)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if USE_LR_SCHEDULER:
            scheduler.step()

def evaluate_one_epoch(epoch, classifier, allocation_system, data_loader, expert_fns, loss_fn):
    classifier.eval()
    allocation_system.eval()

    classifier_outputs = torch.tensor([]).to(device)
    allocation_system_outputs = torch.tensor([]).to(device)
    targets = torch.tensor([]).long().to(device)
    post_ids = []

    with torch.no_grad():
        for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(data_loader):
            batch_targets = batch_targets.to(device)
            targets = torch.cat((targets, batch_targets))

            batch_classifier_outputs = classifier(batch_input)
            classifier_outputs = torch.cat((classifier_outputs, batch_classifier_outputs))

            batch_allocation_system_outputs = allocation_system(batch_input)
            allocation_system_outputs = torch.cat((allocation_system_outputs, batch_allocation_system_outputs))

            post_ids.extend(batch_post_ids.numpy())

    expert_preds = np.empty((NUM_EXPERTS, len(targets)))
    for idx, expert_fn in enumerate(expert_fns):
        expert_preds[idx] = np.array(expert_fn(post_ids))

    classifier_outputs = classifier_outputs.cpu().numpy()
    allocation_system_outputs = allocation_system_outputs.cpu().numpy()
    targets = targets.cpu().numpy()

    system_accuracy, system_loss, metrics = get_metrics(epoch, allocation_system_outputs, classifier_outputs, expert_preds, targets, loss_fn)

    return system_accuracy, system_loss, metrics

def run_team_performance_optimization(method, seed, expert_fns):
    print(f'Training multi expert deferral with {method}')

    if method == "Joint Sparse Framework":
        loss_fn = joint_sparse_framework_loss
        allocation_system_activation_function = "sigmoid"


    elif method == "Our Approach":
        loss_fn = our_loss
        allocation_system_activation_function = "softmax"

    classifier = Network(output_size=NUM_CLASSES,
                            softmax_sigmoid="softmax").to(device)

    allocation_system = Network(output_size=NUM_EXPERTS + 1,
                                 softmax_sigmoid=allocation_system_activation_function).to(device)

    ohs_dl = OLHS_3_Split_Dataloader(train_batch_size=TRAIN_BATCH_SIZE, test_batch_size=TEST_BATCH_SIZE, seed=seed)
    train_loader, val_loader, test_loader = ohs_dl.get_data_loader()

    parameters = list(classifier.parameters()) + list(allocation_system.parameters())
    optimizer = torch.optim.Adam(parameters, lr=LR, betas=(0.9, 0.999), weight_decay=0)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS * len(train_loader))

    best_val_system_accuracy = 0
    best_val_system_loss = 100
    best_metrics = None

    for epoch in tqdm(range(1, EPOCHS + 1)):
        train_one_epoch(epoch, classifier, allocation_system, train_loader, optimizer, scheduler, expert_fns, loss_fn)

        val_system_accuracy, val_system_loss, _ = evaluate_one_epoch(epoch, classifier, allocation_system, val_loader, expert_fns, loss_fn)
        _, _, test_metrics = evaluate_one_epoch(epoch, classifier, allocation_system, test_loader, expert_fns, loss_fn)

        if method == "Joint Sparse Framework":
            if val_system_accuracy > best_val_system_accuracy:
                best_val_system_accuracy = val_system_accuracy
                best_metrics = test_metrics

        elif method == "Our Approach":
            if val_system_loss < best_val_system_loss:
                best_val_system_loss = val_system_loss
                best_metrics = test_metrics

    print(f'\n Earlystopping Results for {method}:')
    system_metrics_keys = [key for key in best_metrics.keys() if "System" in key]
    for k in system_metrics_keys:
        print(f'\t {k}: {best_metrics[k]}')
    print()

    classifier_metrics_keys = [key for key in best_metrics.keys() if "Classifier" in key]
    for k in classifier_metrics_keys:
        print(f'\t {k}: {best_metrics[k]}')
    print()

    """for exp_idx in range(NUM_EXPERTS):
      expert_metrics_keys = [key for key in best_metrics.keys() if f'Expert {exp_idx+1} ' in key]
      for k in expert_metrics_keys:
          print(f'\t {k}: {best_metrics[k]}')
    print()"""

    return best_metrics["System Accuracy"], best_metrics["Classifier Coverage"]

Functions for Evaluation of Baselines

In [11]:
def get_accuracy_of_best_expert(seed, expert_fns):
    ohs_dl = OLHS_3_Split_Dataloader(train_batch_size=TRAIN_BATCH_SIZE, test_batch_size=TEST_BATCH_SIZE, seed=seed)
    _, _, test_loader = ohs_dl.get_data_loader()

    targets = torch.tensor([]).long()
    post_ids = []

    with torch.no_grad():
        for i, (_, batch_targets, batch_groups, batch_post_ids) in enumerate(test_loader):
            batch_targets = batch_targets
            targets = torch.cat((targets, batch_targets))

            post_ids.extend(batch_post_ids.numpy())

    expert_preds = np.empty((NUM_EXPERTS, len(targets)))
    for idx, expert_fn in enumerate(expert_fns):
        expert_preds[idx] = np.array(expert_fn(post_ids))

    expert_accuracies = []
    for idx in range(NUM_EXPERTS):
        preds = expert_preds[idx]
        acc = accuracy_score(targets, preds)
        expert_accuracies.append(acc)

    print(f'Best Expert Accuracy: {max(expert_accuracies)}\n')

    return max(expert_accuracies)

def get_accuracy_of_average_expert(seed, expert_fns):
    ohs_dl = OLHS_3_Split_Dataloader(train_batch_size=TRAIN_BATCH_SIZE, test_batch_size=TEST_BATCH_SIZE, seed=seed)
    _, _, test_loader = ohs_dl.get_data_loader()

    targets = torch.tensor([]).long()
    post_ids = []

    with torch.no_grad():
        for i, (_, batch_targets, batch_groups, batch_post_ids) in enumerate(test_loader):
            batch_targets = batch_targets
            targets = torch.cat((targets, batch_targets))

            post_ids.extend(batch_post_ids.numpy())


    avg_expert = AverageExpert(expert_fns)
    avg_expert_preds = avg_expert.predict(post_ids)
    avg_expert_acc = accuracy_score(targets, avg_expert_preds)
    print(f'Average Expert Accuracy: {avg_expert_acc}\n')

    return avg_expert_acc

Functions for Training and Evaluation of Full Automation Baseline

In [12]:
def train_full_automation_one_epoch(classifier, train_loader, optimizer, scheduler):
    # switch to train mode
    classifier.train()

    for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(train_loader):
        batch_targets = batch_targets.to(device)
        batch_outputs_classifier = classifier(batch_input)
        log_output = torch.log(batch_outputs_classifier + 1e-7)
        batch_loss = nn.NLLLoss()(log_output, batch_targets)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if USE_LR_SCHEDULER:
            scheduler.step()


def evaluate_full_automation_one_epoch(classifier, data_loader):
    classifier.eval()

    classifier_outputs = torch.tensor([]).to(device)
    targets = torch.tensor([]).long().to(device)

    with torch.no_grad():
        for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(data_loader):
            batch_targets = batch_targets.to(device)
            targets = torch.cat((targets, batch_targets))

            batch_classifier_outputs = classifier(batch_input)
            classifier_outputs = torch.cat((classifier_outputs, batch_classifier_outputs))

    log_output = torch.log(classifier_outputs + 1e-7)
    full_automation_loss = nn.NLLLoss()(log_output, targets)

    classifier_outputs = classifier_outputs.cpu().numpy()
    targets = targets.cpu().numpy()

    classifier_preds = np.argmax(classifier_outputs, 1)
    full_automation_accuracy = get_accuracy(classifier_preds, targets)

    return full_automation_accuracy, full_automation_loss

def run_full_automation(seed):
    print(f'Training full automation baseline')

    classifier = Network(output_size=NUM_CLASSES,
                            softmax_sigmoid="softmax").to(device)

    ohs_dl = OLHS_3_Split_Dataloader(train_batch_size=TRAIN_BATCH_SIZE, test_batch_size=TEST_BATCH_SIZE, seed=seed)
    train_loader, val_loader, test_loader = ohs_dl.get_data_loader()

    optimizer = torch.optim.Adam(classifier.parameters(), lr=LR, betas=(0.9, 0.999), weight_decay=0)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS * len(train_loader))

    best_val_system_loss = 100
    best_test_system_accuracy = None

    for epoch in tqdm(range(1, EPOCHS + 1)):
        train_full_automation_one_epoch(classifier, train_loader, optimizer, scheduler)

        val_system_accuracy, val_system_loss = evaluate_full_automation_one_epoch(classifier, val_loader)
        test_system_accuracy, test_system_loss, = evaluate_full_automation_one_epoch(classifier, test_loader)

        if val_system_loss < best_val_system_loss:
            best_val_system_loss = val_system_loss
            best_test_system_accuracy = test_system_accuracy

    print(f'Full Automation Accuracy: {best_test_system_accuracy}\n')
    return best_test_system_accuracy


Functions for Training and Evaluation of Mixture of Artificial Experts Baseline

In [18]:
def train_moae_one_epoch(classifiers, allocation_system, train_loader, optimizer, scheduler):
    # switch to train mode
    allocation_system.train()
    for classifier in classifiers:
        classifier.train()

    for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(train_loader):
        batch_targets = batch_targets.to(device)

        batch_outputs_allocation_system = allocation_system(batch_input)
        batch_outputs_classifiers = torch.empty((NUM_EXPERTS+1, len(batch_targets), NUM_CLASSES))
        for idx, classifier in enumerate(classifiers):
            batch_outputs_classifiers[idx] = classifier(batch_input)

        # compute and record loss
        batch_loss = mixture_of_ai_experts_loss(allocation_system_output=batch_outputs_allocation_system,
                                                   classifiers_outputs=batch_outputs_classifiers, targets=batch_targets)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if USE_LR_SCHEDULER:
            scheduler.step()

def evaluate_moae_one_epoch(classifiers, allocation_system, data_loader):
    allocation_system.eval()
    for classifier in classifiers:
        classifier.eval()

    classifiers_outputs = torch.tensor([]).to(device)
    allocation_system_outputs = torch.tensor([]).to(device)
    targets = torch.tensor([]).long().to(device)

    with torch.no_grad():
        for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(data_loader):
            batch_targets = batch_targets.to(device)

            batch_allocation_system_outputs = allocation_system(batch_input)
            batch_outputs_classifiers = torch.empty((NUM_EXPERTS+1, len(batch_targets), NUM_CLASSES)).to(device)
            for idx, classifier in enumerate(classifiers):
                batch_outputs_classifiers[idx] = classifier(batch_input)

            classifiers_outputs = torch.cat((classifiers_outputs, batch_outputs_classifiers), dim=1)
            allocation_system_outputs = torch.cat((allocation_system_outputs, batch_allocation_system_outputs))
            targets = torch.cat((targets, batch_targets.float()))

    moae_loss = mixture_of_ai_experts_loss(allocation_system_output=allocation_system_outputs,
                                                   classifiers_outputs=classifiers_outputs, targets=targets.long())

    classifiers_outputs = classifiers_outputs.cpu().numpy()
    allocation_system_outputs = allocation_system_outputs.cpu().numpy()
    targets = targets.cpu().numpy()

    allocation_system_decisions = np.argmax(allocation_system_outputs, 1)
    classifiers_preds = np.argmax(classifiers_outputs, 2).T
    team_preds = classifiers_preds[range(len(classifiers_preds)), allocation_system_decisions.astype(int)]
    moae_accuracy = get_accuracy(team_preds, targets)

    return moae_accuracy, moae_loss

def run_moae(seed):
    print(f'Training Mixture of artificial experts baseline')

    allocation_system = Network(output_size=NUM_EXPERTS + 1,
                                 softmax_sigmoid="softmax").to(device)

    classifiers = []
    for _ in range(NUM_EXPERTS+1):
        classifier = Network(output_size=NUM_CLASSES,
                            softmax_sigmoid="softmax").to(device)
        classifiers.append(classifier)

    ohs_dl = OLHS_3_Split_Dataloader(train_batch_size=TRAIN_BATCH_SIZE, test_batch_size=TEST_BATCH_SIZE, seed=seed)
    train_loader, val_loader, test_loader = ohs_dl.get_data_loader()

    parameters = list(allocation_system.parameters())
    for classifier in classifiers:
        parameters += list(classifier.parameters())

    optimizer = torch.optim.Adam(parameters, lr=LR, betas=(0.9, 0.999), weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS * len(train_loader))

    best_val_system_loss = 100
    best_test_system_accuracy = None

    for epoch in tqdm(range(1, EPOCHS + 1)):
        train_moae_one_epoch(classifiers, allocation_system, train_loader, optimizer, scheduler)
        val_moae_accuracy, val_moae_loss = evaluate_moae_one_epoch(classifiers, allocation_system, val_loader)
        test_moae_accuracy, test_moae_loss = evaluate_moae_one_epoch(classifiers, allocation_system, test_loader)

        if val_moae_loss < best_val_system_loss:
            best_val_system_loss = val_moae_loss
            best_test_system_accuracy = test_moae_accuracy

    print(f'Mixture of Artificial Experts Accuracy: {best_test_system_accuracy}\n')
    return best_test_system_accuracy


Functions for Training and Evaluation of Mixture of Human Experts Baseline

In [19]:
def train_mohe_one_epoch(allocation_system, train_loader, optimizer, scheduler, expert_fns):
    # switch to train mode
    allocation_system.train()

    for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(train_loader):
        batch_targets = batch_targets.to(device)

        expert_batch_preds = np.empty((NUM_EXPERTS, len(batch_targets)))
        for idx, expert_fn in enumerate(expert_fns):
            expert_batch_preds[idx] = np.array(expert_fn(batch_post_ids.numpy()))

        batch_outputs_allocation_system = allocation_system(batch_input)

        # compute and record loss
        batch_loss = mixture_of_human_experts_loss(allocation_system_output=batch_outputs_allocation_system,
                                                   human_expert_preds=expert_batch_preds, targets=batch_targets)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if USE_LR_SCHEDULER:
            scheduler.step()


def evaluate_mohe_one_epoch(allocation_system, data_loader, expert_fns):
    allocation_system.eval()

    allocation_system_outputs = torch.tensor([]).to(device)
    targets = torch.tensor([]).to(device)
    post_ids = []

    with torch.no_grad():
        for i, (batch_input, batch_targets, batch_groups, batch_post_ids) in enumerate(data_loader):
            batch_targets = batch_targets.to(device)
            batch_allocation_system_outputs = allocation_system(batch_input)

            allocation_system_outputs = torch.cat((allocation_system_outputs, batch_allocation_system_outputs))
            targets = torch.cat((targets, batch_targets))
            post_ids.extend(batch_post_ids.numpy())


    expert_preds = np.empty((NUM_EXPERTS, len(targets)))
    for idx, expert_fn in enumerate(expert_fns):
        expert_preds[idx] = np.array(expert_fn(post_ids))

    # compute and record loss
    mohe_loss = mixture_of_human_experts_loss(allocation_system_output=allocation_system_outputs,
                                                   human_expert_preds=expert_preds, targets=targets.long())

    allocation_system_outputs = allocation_system_outputs.cpu().numpy()
    targets = targets.cpu().numpy()

    expert_preds = expert_preds.T
    allocation_system_decisions = np.argmax(allocation_system_outputs, 1)
    team_preds = expert_preds[range(len(expert_preds)), allocation_system_decisions.astype(int)]
    mohe_accuracy = get_accuracy(team_preds, targets)

    return mohe_accuracy, mohe_loss


def run_mohe(seed, expert_fns):
    print(f'Training Mixture of human experts baseline')

    allocation_system = Network(output_size=NUM_EXPERTS,
                                 softmax_sigmoid="softmax").to(device)

    ohs_dl = OLHS_3_Split_Dataloader(train_batch_size=TRAIN_BATCH_SIZE, test_batch_size=TEST_BATCH_SIZE, seed=seed)
    train_loader, val_loader, test_loader = ohs_dl.get_data_loader()

    parameters = allocation_system.parameters()
    optimizer = torch.optim.Adam(parameters, lr=LR, betas=(0.9, 0.999), weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS * len(train_loader))

    best_val_system_loss = 100
    best_test_system_accuracy = None

    for epoch in tqdm(range(1, EPOCHS + 1)):
        train_mohe_one_epoch(allocation_system, train_loader, optimizer, scheduler, expert_fns)
        val_mohe_accuracy, val_mohe_loss = evaluate_mohe_one_epoch(allocation_system, val_loader, expert_fns)
        test_mohe_accuracy, test_mohe_loss = evaluate_mohe_one_epoch(allocation_system, test_loader, expert_fns)

        if val_mohe_loss < best_val_system_loss:
            best_val_system_loss = val_mohe_loss
            best_test_system_accuracy = test_mohe_accuracy

    print(f'Mixture of Human Experts Accuracy: {best_test_system_accuracy}\n')
    return best_test_system_accuracy



#Run Experiment

In [20]:
best_expert_accuracies = []
avg_expert_accuracies = []
our_approach_accuracies = []
our_approach_coverages = []
jsf_accuracies = []
jsf_coverages = []
full_automation_accuracies = []
mohe_accuracies = []
moae_accuracies = []

for seed in range(20):
    print(f'Seed: {seed}')
    print("-"*40)
    np.random.seed(seed)
    random.seed(seed)

    min_prob = 0.6
    num_experts_aae = math.floor(NUM_EXPERTS * 3 / 4)
    num_experts_non_aae = NUM_EXPERTS - num_experts_aae
    aae_experts = get_uniform_experts(num_experts_aae, min_prob)
    non_aae_experts = get_uniform_experts(num_experts_non_aae, min_prob, toggle=1)

    experts = []
    experts.extend(aae_experts)
    experts.extend(non_aae_experts)
    expert_fns = [expert.predict for expert in experts]
    print(f'Num AAE Experts: {num_experts_aae}, Num Non-AAE Experts: {num_experts_non_aae}\n')

    best_expert_accuracy = get_accuracy_of_best_expert(seed, expert_fns)
    best_expert_accuracies.append(best_expert_accuracy)
    
    avg_expert_accuracy = get_accuracy_of_average_expert(seed, expert_fns)
    avg_expert_accuracies.append(avg_expert_accuracy)

    our_approach_accuracy, our_approach_coverage = run_team_performance_optimization("Our Approach", seed, expert_fns)
    our_approach_accuracies.append(our_approach_accuracy)
    our_approach_coverages.append(our_approach_coverage)
    
    jsf_accuracy, jsf_coverage = run_team_performance_optimization("Joint Sparse Framework", seed, expert_fns)
    jsf_accuracies.append(jsf_accuracy)
    jsf_coverages.append(jsf_coverage)
    
    mohe_accuracy = run_mohe(seed, expert_fns)
    mohe_accuracies.append(mohe_accuracy)

    full_automation_accuracy = run_full_automation(seed)
    full_automation_accuracies.append(full_automation_accuracy)
    
    moae_accuracy = run_moae(seed)
    moae_accuracies.append(moae_accuracy)
    print("-"*40)

Seed: 0
----------------------------------------
Num AAE Experts: 15, Num Non-AAE Experts: 5

Best Expert Accuracy: 0.9646053702196908

Average Expert Accuracy: 0.7628152969894223

Training multi expert deferral with Our Approach


100%|██████████| 20/20 [00:11<00:00,  1.72it/s]



 Earlystopping Results for Our Approach:
	 System Accuracy: 0.96826688364524
	 System Loss: 0.061494549465643686

	 Classifier Accuracy: 0.838893409275834
	 Classifier Task Subset Accuracy: 0
	 Classifier Coverage: 0.0

Training multi expert deferral with Joint Sparse Framework


100%|██████████| 20/20 [00:12<00:00,  1.66it/s]



 Earlystopping Results for Joint Sparse Framework:
	 System Accuracy: 0.9174125305126118
	 System Loss: 0.25251489671342237

	 Classifier Accuracy: 0.8909682668836453
	 Classifier Task Subset Accuracy: 1.0
	 Classifier Coverage: 0.0028478437754271765

Training Mixture of human experts baseline


100%|██████████| 20/20 [00:10<00:00,  1.83it/s]


Mixture of Human Experts Accuracy: 0.9707078925956062

Training full automation baseline


100%|██████████| 20/20 [00:02<00:00,  7.13it/s]


Full Automation Accuracy: 0.8966639544344996

Training Mixture of artificial experts baseline


100%|██████████| 20/20 [00:17<00:00,  1.13it/s]


Mixture of Artificial Experts Accuracy: 0.8938161106590724

----------------------------------------
Seed: 1
----------------------------------------
Num AAE Experts: 15, Num Non-AAE Experts: 5

Best Expert Accuracy: 0.9320585842148088

Average Expert Accuracy: 0.7310821806346623

Training multi expert deferral with Our Approach


100%|██████████| 20/20 [00:11<00:00,  1.71it/s]



 Earlystopping Results for Our Approach:
	 System Accuracy: 0.9694873881204231
	 System Loss: 0.0584044598287273

	 Classifier Accuracy: 0.8287225386493083
	 Classifier Task Subset Accuracy: 0.9894736842105263
	 Classifier Coverage: 0.5410903173311635

Training multi expert deferral with Joint Sparse Framework


100%|██████████| 20/20 [00:11<00:00,  1.67it/s]



 Earlystopping Results for Joint Sparse Framework:
	 System Accuracy: 0.9328722538649309
	 System Loss: 0.287274813367474

	 Classifier Accuracy: 0.8816110659072417
	 Classifier Task Subset Accuracy: 1.0
	 Classifier Coverage: 0.0020341741253051262

Training Mixture of human experts baseline


100%|██████████| 20/20 [00:11<00:00,  1.81it/s]


Mixture of Human Experts Accuracy: 0.934092758340114

Training full automation baseline


100%|██████████| 20/20 [00:02<00:00,  7.57it/s]


Full Automation Accuracy: 0.8954434499593165

Training Mixture of artificial experts baseline


100%|██████████| 20/20 [00:17<00:00,  1.15it/s]

Mixture of Artificial Experts Accuracy: 0.8995117982099268

----------------------------------------





In [21]:
mean_best_expert_accuracy = np.mean(best_expert_accuracies)
mean_best_expert_coverage = 0.00

mean_avg_expert_accuracy = np.mean(avg_expert_accuracies)
mean_avg_expert_coverage = 0.00

mean_our_approach_accuracy = np.mean(our_approach_accuracies)
mean_our_approach_coverage = np.mean(our_approach_coverages)

mean_jsf_accuracy = np.mean(jsf_accuracies)
mean_jsf_coverage = np.mean(jsf_coverages)

mean_full_automation_accuracy = np.mean(full_automation_accuracies)
mean_full_automation_coverage = 100.00

mean_moae_accuracy = np.mean(moae_accuracies)
mean_moae_coverage = 100.00

mean_mohe_accuracy = np.mean(mohe_accuracies)
mean_mohe_coverage = 0.00



In [22]:
print(tabulate([['Our Approach', mean_our_approach_accuracy, mean_our_approach_coverage],
                ['JSF', mean_jsf_accuracy, mean_jsf_coverage],
                ['--------', '--------', '--------'],
                ['Full Automation', mean_full_automation_accuracy, mean_full_automation_coverage],
                ['Random Expert', mean_avg_expert_accuracy, mean_avg_expert_coverage],
                ['Best Expert', mean_best_expert_accuracy, mean_best_expert_coverage], 
                ['MOHE', mean_mohe_accuracy, mean_mohe_coverage],
                ['MOAE', mean_moae_accuracy, mean_moae_coverage]],
               headers=['Method', 'Accuracy', 'Coverage']))

Method           Accuracy            Coverage
---------------  ------------------  ---------------------
Our Approach     0.9688771358828316  0.27054515866558176
JSF              0.9251423921887714  0.0024410089503661514
--------         --------            --------
Full Automation  0.896053702196908   100.0
Random Expert    0.7469487388120424  0.0
Best Expert      0.9483319772172498  0.0
MOHE             0.9524003254678601  0.0
MOAE             0.8966639544344996  100.0
