# Set up / imports

In [1]:
#@title Setup (imports and downloads)

!pip install transformers
!pip install gdown

import torch
import gdown
import os
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data.sampler import SubsetRandomSampler

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, AdamW

from tqdm import tqdm

import pickle
import matplotlib.pyplot as plt

path_original_dataset_model = '6_new_models/DC_original_dataset.pt'
path_rerelease_dataset_model = '6_new_models/DC_rerelease_dataset.pt'

# Set model
model_name = 'roberta-large' #roberta-large, roberta-base, bert-base-uncased
tokenizer = AutoTokenizer.from_pretrained(model_name)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 13.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 48.9MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |██████

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




# Setting

**Set path to root directory in the following cell.**

In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

# change to directory containing relevant files
%cd '/content/drive/MyDrive/Uni/NLP_CW2/github/'

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1g3JCgH-awK6a0hTXlBoIT3NgyxvgLv_M/NLP_CW2/github


In [3]:
#@title Run to check we're in the correct directory

try:
    f = open("check_directory.txt")
    print('Success :)')
except IOError:
    print("Wrong directory, please try again")

Success :)


**Select which actions to run in the notebook**

In [4]:
# Training settings
run_training = False # If you want to run training, set to True
training_dataset = 'original' # either 'original' or 'rerelease'
training_save_path = 'PATH.pt'

# Evaluation setting
run_evaluation = True
evaluation_dataset = 'rerelease'  # either 'original' or 'rerelease'

# Uncertainty estimates setting
run_uncertainty_estimates = False
uncertainty_dataset = 'original' # either 'original' or 'rerelease'

# Data loading

## Data loading functions

In [5]:
def load_util_sentences_aspairs(path="util_train"):
    df = pd.read_csv("{}.csv".format(path), header=None)
    first_sentences = []
    second_sentences = []
    labels = []
    for i in range(df.shape[0]):

        swap = np.random.binomial(n=1, p=0.5)

        if swap:
            second_sentences.append(df.iloc[i, 0])
            first_sentences.append(df.iloc[i, 1])
            labels.append(1)
        else:
            first_sentences.append(df.iloc[i, 0])
            second_sentences.append(df.iloc[i, 1])
            labels.append(0)

    first_sentences = ["[CLS] " + s for s in first_sentences]
    second_sentences = ["[SEP] " + s for s in second_sentences]

    return first_sentences, second_sentences, labels

In [6]:
class UTILdata(torch.utils.data.Dataset):
    '''
    Utilitarianims dataset
    '''
    def __init__(self, first_sentences, second_sentences, labels, max_length=50):
        super(UTILdata, self).__init__()

        self.first = tokenizer(first_sentences)['input_ids']
        self.second = tokenizer(second_sentences)['input_ids']
        self.labels = labels

        self.instances = []
        
        for idx, _ in enumerate(self.first):

            if len(self.first[idx]) <= max_length and len(self.second[idx]) <= max_length:

                self.instances += [{'sentence_A': self.first[idx],
                                    'sentence_B': self.second[idx],
                                    'label': self.labels[idx],
                                    'original_A': first_sentences[idx][6:],
                                    'original_B': second_sentences[idx][6:]}]
        

    def __getitem__(self, index):
        return self.instances[index]

    def __len__(self):
        return len(self.instances)

In [7]:
def collate(batch):
    batch_size = len(batch)

    max_length = 50

    sentences_A, sentences_B = [], []
    original_A, original_B = [], []
    masks_A, masks_B = [], []

    # Pad all instances to match longest
    for i, instance in enumerate(batch):

        length_A, length_B = len(instance['sentence_A']), len(instance['sentence_B'])

        if length_A > max_length or length_B > max_length:  # If either of the sentences exceeds the max, skip
            continue
        original_A.append(instance['original_A'])
        original_B.append(instance['original_B'])

        mask_A, mask_B = torch.ones(max_length), torch.ones(max_length)
        mask_A[length_A:], mask_B[length_B:] = 0, 0

        # Pad and add sentence A
        if length_A < max_length:
            pad_length = max_length - length_A
            padding = torch.zeros((pad_length), dtype=torch.int)
            #print(instance['sentence_A'])
            a = torch.IntTensor(instance['sentence_A'])
            #print(f'\n {a}')
            padded_A = torch.cat((torch.IntTensor(instance['sentence_A']), padding), dim=0)
            
            sentences_A.append(padded_A)
            masks_A.append(mask_A)
        else:
            sentences_A.append(torch.IntTensor(instance['sentence_A']))
            masks_A.append(mask_A)
        # Pad and add sentence B
        if length_B < max_length:
            pad_length = max_length - length_B
            padding = torch.zeros((pad_length), dtype=torch.int)
            padded_B = torch.cat((torch.IntTensor(instance['sentence_B']), padding), dim=0)
            
            sentences_B.append(padded_B)
            masks_B.append(mask_B)
        else:
            sentences_B.append(torch.IntTensor(instance['sentence_B']))
            masks_B.append(mask_B)

    labels = [instance['label'] for instance in batch]
    batch_labels = torch.LongTensor(labels)

    batch_sentences_A, batch_sentences_B = torch.stack(sentences_A, dim=0), torch.stack(sentences_B, dim=0)

    batch_sentences = torch.cat([batch_sentences_A, batch_sentences_B], dim=1)
    #print(a[0])
    batch_masks_A, batch_masks_B = torch.stack(masks_A, dim=0), torch.stack(masks_B, dim=0)
    batch_masks = torch.cat([batch_masks_A, batch_masks_B], dim=1)


    batch = {
        'sentences': batch_sentences,
        'masks': batch_masks,
        'label': batch_labels,
        'original_A': original_A,
        'original_B': original_B
    }
    

    return batch

## Load data

Choose whether to load the original or new dataset

### Load original dataset
Creates dataloaders.

**For training:**
- `original_train_loader`
- `original_val_loader`

**For testing:**
- `original_easy_test_loader`
- `original_hard_test_loader`

In [8]:
#############
# Loads the training set, and splits it into training and validation
##########
path_data_orig_train = '1_original_study_datasets/util_train'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_orig_train)
train_set = UTILdata(first_sent, sec_sent, labels)

# Creating data indices for training and validation splits:
validation_split = 0.2
dataset_size = len(train_set)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
np.random.seed(0)
np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)


# Final training and validation loaders
original_train_loader = DataLoader(dataset = train_set,
                    batch_size = 20,
                    collate_fn=collate,
                    sampler = train_sampler)

original_val_loader = DataLoader(dataset = train_set,
                    batch_size = 20,
                    collate_fn=collate,
                    sampler = valid_sampler)

In [9]:
#############
# Loads the easy and hard test sets into dataloaders
##########
path_data_orig_test_easy = '1_original_study_datasets/util_test'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_orig_test_easy)
test_set = UTILdata(first_sent, sec_sent, labels)

original_easy_test_loader = DataLoader(dataset = test_set,
                    batch_size = 20,
                    shuffle = False,
                    collate_fn=collate)

path_data_orig_test_hard = '1_original_study_datasets/util_test_hard'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_orig_test_hard)
hard_test_set = UTILdata(first_sent, sec_sent, labels)

original_hard_test_loader = DataLoader(dataset = hard_test_set,
                    batch_size = 20,
                    shuffle = False,
                    collate_fn=collate)

### Load New Datasets
Creates dataloaders.

**For training:**
- `new_train_loader`
- `new_val_loader`

**For testing:**
- `new_easy_test_loader`
- `new_hard_test_loader`
- `new_unmatched_test_loader`

In [10]:
#############
# Loads the training set, and splits it into training and validation
##########
path_data_new_train = '4_reformulated_datasets/util_train_no_test_overlap'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_new_train)
train_set = UTILdata(first_sent, sec_sent, labels)

# Creating data indices for training and validation splits:
validation_split = 0.2
dataset_size = len(train_set)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
np.random.seed(0)
np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

new_train_loader = DataLoader(dataset = train_set,
                    batch_size = 20,
                    collate_fn=collate,
                    sampler = train_sampler)
new_val_loader = DataLoader(dataset = train_set,
                    batch_size = 20,
                    collate_fn=collate,
                    sampler = valid_sampler)

In [11]:
# Re-release test sets

# Matched easy
path_data_new_test_easy = '4_reformulated_datasets/util_test_easy_matched'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_new_test_easy)
test_easy_matched = UTILdata(first_sent, sec_sent, labels)

new_easy_test_loader = DataLoader(dataset = test_easy_matched,
                    batch_size = 20,
                    shuffle = False,
                    collate_fn=collate)

# Matched hard
path_data_new_test_hard = '4_reformulated_datasets/util_test_hard_matched'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_new_test_hard)
test_hard_matched = UTILdata(first_sent, sec_sent, labels)

new_hard_test_loader = DataLoader(dataset = test_hard_matched,
                    batch_size = 20,
                    shuffle = False,
                    collate_fn=collate)
# Unmatched
path_data_new_test_unmatched = '4_reformulated_datasets/test_combined_unmatched'
first_sent, sec_sent, labels = load_util_sentences_aspairs(path=path_data_new_test_unmatched)
test_unmatched = UTILdata(first_sent, sec_sent, labels)

new_unmatched_test_loader = DataLoader(dataset = test_unmatched,
                    batch_size = 20,
                    shuffle = False,
                    collate_fn=collate)

# Model training

## Training/evaluation functions

In [12]:
def train(model, optimizer, train_dataloader, validation_loader, epochs, save_path):
    # Set model to training mode
    device = 'cuda'
    model.train()
    model.to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    sig = nn.Sigmoid()

    loss_fun = nn.CrossEntropyLoss()
    #loss_fun = torch.nn.BCEWithLogitsLoss()

    best_loss = 10000

    for epoch in range(epochs):

        tot_correct = 0
        total = 0
        tot_loss = 0

        with tqdm(train_dataloader, unit="batch") as tepoch:
            # Loop over each batch from the training set
            for batch in tepoch:

                # Unpack the inputs from our dataloader
                inputs = batch['sentences']
                targets = batch['label']
                masks = batch['masks']

                #print("\n")
                #print(inputs.shape)

                # Send to device
                inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)

                # Zero gradient buffers
                optimizer.zero_grad()

                ######### ASSUMING 2 OUTPUTS ######## -> cross entropy # Need to set targets to be a LongTensor
                # Forward pass
                output = model(inputs, attention_mask = masks)
                softmax_output = F.softmax(output.logits, dim=1)

                #print(len(output.logits.squeeze()))
                #print(len(targets))
                # Loss
                loss = loss_fun(output.logits.squeeze(), targets)
                loss.backward()
                optimizer.step()

                tot_loss += loss.item()

                # Calculate accuracy
                y = torch.argmax(softmax_output, dim=1) # Prediction (max probability)
                correct = (y == targets).sum() # Correct predictions
                # print(f'Softmax output: {softmax_output}')
                # print(f'y: {y}')
                # print(f'Confidence: {torch.gather(softmax_output, 1, y.view(-1,1))}')
                

                # del loss, output, softmax_output, inputs, targets, masks
                # return

                tot_correct += correct.item() # Accumulate correct predictions
                total += len(targets) # Accumulate number of examples seen

                del loss, output, softmax_output, inputs, targets, masks

                tepoch.set_postfix(loss=tot_loss/total, accuracy=100*tot_correct/total)

        val_loss, val_correct = train_eval(model, validation_loader)

        if val_loss < best_loss:
            best_loss = val_loss

            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': val_loss,
                }, save_path)

        print(f'\r \nValidation loss: {val_loss}, validation accuracy: {val_correct}')

    return model

In [13]:
def train_eval(model, test_dataloader):
    device = 'cuda'
    model.eval()
    #model.to(device)
    tot_correct = 0
    total = 0
    tot_loss = 0

    loss_fun = nn.CrossEntropyLoss()

    for i, batch in enumerate(test_dataloader):
        # Unpack the inputs from our dataloader
        inputs = batch['sentences']
        targets = batch['label']
        masks = batch['masks']

        # Send to device
        inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)

        # Forward pass
        output = model(inputs, attention_mask = masks)
        softmax_output = F.softmax(output.logits, dim=1)

        # Loss
        loss = loss_fun(output.logits.squeeze(), targets)
        tot_loss += loss.item()

        # Calculate accuracy
        y = torch.argmax(softmax_output, dim=1) # Prediction (max probability)
        correct = (y == targets).sum() # Correct predictions

        tot_correct += correct.item() # Accumulate correct predictions
        total += len(targets)

        del loss, output, softmax_output, inputs, targets, masks

    return tot_loss/total, tot_correct/total

In [14]:
def eval(model, test_dataloader):
    device = 'cuda'
    model.eval()
    model.to(device)
    tot_correct = 0
    total = 0
    tot_loss = 0

    input_sentences_A = []
    input_sentences_B = []
    predictions = []
    prediction_results = [] # True = correct, False = incorrect

    loss_fun = nn.CrossEntropyLoss()

    with tqdm(test_dataloader, unit="batch") as tepoch:
        # Loop over each batch from the training set
        for batch in tepoch:

            # Unpack the inputs from our dataloader
            inputs = batch['sentences']
            targets = batch['label']
            masks = batch['masks']

            # For comparison between model results
            sentences_A = batch['original_A']
            sentences_B = batch['original_B']
            input_sentences_A.extend(sentences_A)
            input_sentences_B.extend(sentences_B)

            # Send to device
            inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)

            # Forward pass
            output = model(inputs, attention_mask = masks)
            softmax_output = F.softmax(output.logits, dim=1)

            # Loss
            loss = loss_fun(output.logits.squeeze(), targets)
            tot_loss += loss.item()

            # Calculate accuracy
            y = torch.argmax(softmax_output, dim=1) # Prediction (max probability)
            correct = (y == targets).sum() # Correct predictions

            # For model comparisons
            pred_correct = (y==targets).tolist()
            prediction_results.extend(pred_correct)
            predictions.extend(y.tolist())

            tot_correct += correct.item() # Accumulate correct predictions
            total += len(targets)

    return tot_loss/total, tot_correct/total, input_sentences_A, input_sentences_B, predictions, prediction_results

## Training

In [15]:
if training_dataset == 'original':
    train_loader = original_train_loader
    val_loader = original_val_loader
elif training_dataset == 'rerelease':
    train_loader = new_train_loader
    val_loader = new_val_loader    

if run_training:    
    # Create new model to train
    config = AutoConfig.from_pretrained(model_name, num_labels=2)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # Select optimizer
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [16]:
if run_training:
    # Trains model, and saves the best weights to PATH (according to validation accuracy)
    model = train(model, optimizer, train_loader, val_loader, epochs=4, save_path=training_save_path)

In [17]:
a = torch.Tensor([[1,2], [3,2], [-2, 3], [7, 4], [4, 5]])
b = torch.tensor([0, 0, 1, 0, 1], dtype=torch.long)
print(a)
print(b)
print(torch.gather(a, 1, b.view(-1,1)))

tensor([[ 1.,  2.],
        [ 3.,  2.],
        [-2.,  3.],
        [ 7.,  4.],
        [ 4.,  5.]])
tensor([0, 0, 1, 0, 1])
tensor([[1.],
        [3.],
        [3.],
        [7.],
        [5.]])


# Evaluation

## Evaluation function

In [18]:
def eval(model, test_dataloader):
    device = 'cuda'
    model.eval()
    model.to(device)
    tot_correct = 0
    total = 0
    tot_loss = 0

    input_sentences_A = []
    input_sentences_B = []
    predictions = []
    prediction_results = [] # True = correct, False = incorrect
    prediction_confidence = []

    loss_fun = nn.CrossEntropyLoss()

    with tqdm(test_dataloader, unit="batch") as tepoch:
        # Loop over each batch from the training set
        for batch in tepoch:

            # Unpack the inputs from our dataloader
            inputs = batch['sentences']
            targets = batch['label']
            masks = batch['masks']

            # For comparison between model results
            sentences_A = batch['original_A']
            sentences_B = batch['original_B']
            input_sentences_A.extend(sentences_A)
            input_sentences_B.extend(sentences_B)

            # Send to device
            inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)

            # Forward pass
            output = model(inputs, attention_mask = masks)
            softmax_output = F.softmax(output.logits, dim=1)

            # Loss
            loss = loss_fun(output.logits.squeeze(), targets)
            tot_loss += loss.item()

            # Calculate accuracy
            y = torch.argmax(softmax_output, dim=1) # Prediction (max probability)
            correct = (y == targets).sum() # Correct predictions

            # print(f'Softmax output: {softmax_output}')
            # print(f'y: {y}')
            confidence = torch.gather(softmax_output, 1, y.view(-1,1))
            confidence = confidence.squeeze()
            # print(f'Confidence: {confidence}')
            # print(f'Correct pred: {y==targets}')
            
            # return

            # For model comparisons
            pred_correct = (y==targets).tolist()
            prediction_results.extend(pred_correct)
            predictions.extend(y.tolist())
            prediction_confidence.extend(confidence.tolist())

            tot_correct += correct.item() # Accumulate correct predictions
            total += len(targets)
        
    prediction_confidence = torch.FloatTensor(prediction_confidence)

    prediction_confidence = np.array(prediction_confidence)
    prediction_results = np.array(prediction_results)

    return tot_loss/total, tot_correct/total, input_sentences_A, input_sentences_B, predictions, prediction_results, prediction_confidence

## Load and evaluate trained model

In [19]:
if run_evaluation:
    if evaluation_dataset == 'original':
        trained_model = torch.load(path_original_dataset_model)
        easy_test_loader = original_easy_test_loader
        hard_test_loader = original_hard_test_loader
    elif evaluation_dataset == 'rerelease':
        trained_model = torch.load(path_rerelease_dataset_model)
        easy_test_loader = new_easy_test_loader
        hard_test_loader = new_hard_test_loader
        # new_unmatched_test_loader also available to evaluate

    config = AutoConfig.from_pretrained('roberta-large', num_labels=2)
    DC_model = AutoModelForSequenceClassification.from_pretrained('roberta-large', config=config)
    DC_model.load_state_dict(trained_model)
    del trained_model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

In [20]:
if run_evaluation:
    loss, acc, _, _, _, pred_easy, conf_easy = eval(DC_model, easy_test_loader)
    print("\n")
    print('Easy test set results')
    print(f'Loss: {loss}')
    print(f'Accuracy: {acc}')

100%|██████████| 151/151 [00:17<00:00,  8.48batch/s]



Easy test set results
Loss: 0.00520090780721264
Accuracy: 0.9611037234042553





In [21]:
if run_evaluation:
    loss, acc, _, _, _, pred_hard, conf_hard  = eval(DC_model, hard_test_loader)
    print("\n")
    print('Hard test set results')
    print(f'Loss: {loss}')
    print(f'Accuracy: {acc}')

100%|██████████| 142/142 [00:16<00:00,  8.49batch/s]



Hard test set results
Loss: 0.041839520815199455
Accuracy: 0.648141592920354





In [22]:
if run_evaluation and evaluation_dataset == 'rerelease':
    loss, acc, _, _, _, pred_unm, conf_unm  = eval(DC_model, new_unmatched_test_loader)
    print("\n")
    print('Unmatched test set results')
    print(f'Loss: {loss}')
    print(f'Accuracy: {acc}')

100%|██████████| 159/159 [00:18<00:00,  8.46batch/s]



Unmatched test set results
Loss: 0.05313535392265596
Accuracy: 0.5531378114159571





# Uncertainty estimates (MC)

In [23]:
if run_uncertainty_estimates:
    if uncertainty_dataset == 'original':
        trained_model = torch.load(path_original_dataset_model)
        easy_test_loader = original_easy_test_loader
        hard_test_loader = original_hard_test_loader
    elif uncertainty_dataset == 'rerelease':
        trained_model = torch.load(path_rerelease_dataset_model)
        easy_test_loader = new_easy_test_loader
        hard_test_loader = new_hard_test_loader
        # new_unmatched_test_loader also available to evaluate

    config = AutoConfig.from_pretrained('roberta-large', num_labels=2)
    DC_model = AutoModelForSequenceClassification.from_pretrained('roberta-large', config=config)
    DC_model.load_state_dict(trained_model)
    del trained_model

## Functions

In [24]:
def enable_dropout(model):
    """ Function to enable the dropout layers during test-time """
    for m in model.modules():
        if m.__class__.__name__.startswith('Dropout'):
            m.train()

In [25]:
def MC_dropout(model, dataloader, forward_passes):
    '''
    Inputs:
    - model
    - dataloader
    - forward_passes: int

    Ouputs:
    - predictions: numpy.ndarray, (num_samples,)
    - confidence: numpy.ndarray, (num_samples,)
    - mean, variance: numpy.ndarray
    - avg_correct_variance, avg_wrong_variance: numpy.float64
    '''

    device = 'cuda'
    model.eval()
    enable_dropout(model)
    model.to(device)
    tot_correct = 0
    total = 0
    tot_loss = 0

    input_sentences_A = []
    input_sentences_B = []
    predictions = []
    prediction_results = [] # True = correct, False = incorrect

    drop_pred = []

    loss_fun = nn.CrossEntropyLoss()

    with tqdm(dataloader, unit="batch") as tepoch:
        # Loop over each batch from the training set
        for batch in tepoch:
            #print(f'A batch!')

            # Unpack the inputs from our dataloader
            inputs = batch['sentences']
            targets = batch['label']
            masks = batch['masks']

            # For comparison between model results
            sentences_A = batch['original_A']
            sentences_B = batch['original_B']
            input_sentences_A.extend(sentences_A)
            input_sentences_B.extend(sentences_B)

            # Send to device
            inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)

            num_samples = len(targets)

            # Forward pass
            dropout_predictions = np.zeros((forward_passes, num_samples))

            for i in range(forward_passes):
                #print(f'Pass: {i}')
                output = model(inputs, attention_mask = masks)
                softmax_output = F.softmax(output.logits, dim=1)

                # Calculate accuracy
                y = torch.argmax(softmax_output, dim=1) # Prediction (max probability)
                #print(f'Predictions: {y}')
                #print(f'Target: {targets}')
                correct = y == targets # Correct predictions
                #print(f'Correct: {correct}')
                correct = correct.to('cpu')
                #print(f'Correct: {correct}')

                dropout_predictions[i,:] = correct

                del output, softmax_output, y, correct
            drop_pred.append(dropout_predictions)

            del inputs, targets, masks
    drop_pred = np.concatenate(drop_pred, axis=1)
    mean = np.mean(drop_pred, axis=0)
    
    # Get binary predictions
    predictions = np.copy(mean)
    predictions[mean>=0.5] = 1
    predictions[mean<0.5] = 0


    # Calculate the confidence in each prediction  
    confidence = np.copy(mean)
    confidence[mean<0.5] = 1 - confidence[mean<0.5] # For the instances where we're classifying as 0, the confidence is the opposite

    return predictions, confidence

In [26]:
def split_in_bins(predictions, confidence):
    num_bins = 5
    l = np.linspace(0.5,1,num_bins+1)
    bins = np.linspace(0.5,.9,num_bins)+.05

    conf = []
    acc = []
    num_in_bins = []

    for ind, (lower,upper) in enumerate(zip(l[:-1], l[1:])):
        indxs = np.where((confidence<=upper) & (confidence>lower)) # B_m

        this_bin_pred = predictions[indxs]
        this_bin_conf = confidence[indxs]

        #print(this_bin_conf.size)
        #print(type(this_bin_conf))
        # Get average confidence
        if this_bin_conf.size != 0:
            avg_conf = np.mean(this_bin_conf)
        else:
            avg_conf = None

        # Get average accuracy
        if this_bin_pred.size != 0:
            avg_acc = np.mean(this_bin_pred)
        else:
            avg_acc = None
        conf.append(avg_conf)
        acc.append(avg_acc)
        num_in_bins.append(len(this_bin_pred))
    
    return conf, acc, bins, num_in_bins

In [27]:
def get_ECE(confidence, accuracy, num_in_bins):
  '''
  condifence: list of conf(B_m)
  accuracy: list of acc(B_m)

  num_in_bins: number of samples in each bin
  '''
  assert len(confidence) == len(accuracy)

  num_in_bins = np.asarray(num_in_bins)
  n = num_in_bins.sum() # Tot number of samples
  ECE = 0
  for i in range(len(confidence)):
    if accuracy[i] and confidence[i]:
        ECE += (num_in_bins[i]/(n)) * np.abs(accuracy[i] - confidence[i])

  return ECE

In [28]:
def plot_reliability_diagram(accuracy, bins):
    for i in range(len(accuracy)):
        if accuracy[i] == None:
            accuracy[i] = 0

    width=0.1
    fig, ax = plt.subplots(figsize=(5,5))
    ax.bar(bins, accuracy, width=width, edgecolor='black', color='black', label="Model uncert.")
    ax.plot(np.linspace(0.5,1,6),np.linspace(0.5,1,6),linestyle='--',label='Perfect uncert.', color='red')
    ax.set_ylabel("Accuracy")
    ax.set_xlabel("Model certainty")
    plt.legend(loc='lower left')
    plt.savefig(f"dc_newdataset_unmatched_v2", dpi=250)
    plt.show()

## Running

## Original

In [29]:
# Original easy
if run_uncertainty_estimates and uncertainty_dataset == 'original':
    pred, conf = MC_dropout(DC_model, easy_test_loader, forward_passes=20)
    print(f'\nMC dropout accuracy: {pred.mean()}')
    easy_conf, easy_acc, bins, num_in_bins = split_in_bins(pred,conf)

    print(f'ECE: {get_ECE(easy_conf, easy_acc, num_in_bins)}')
    plot_reliability_diagram(easy_acc, bins)

In [30]:
# Original hard
if run_uncertainty_estimates and uncertainty_dataset == 'original':
    pred, conf = MC_dropout(DC_model, hard_test_loader, forward_passes=20)
    print(f'\nMC dropout accuracy: {pred.mean()}')
    hard_conf, hard_acc, bins, num_in_bins = split_in_bins(pred,conf)

    print(f'ECE: {get_ECE(hard_conf, hard_acc, num_in_bins)}')
    plot_reliability_diagram(hard_acc, bins)

## Rerelease

In [31]:
# Easy matched
if run_uncertainty_estimates and uncertainty_dataset == 'rerelease':
    pred, conf = MC_dropout(DC_model, easy_test_loader, forward_passes=30)
    print(f'\nMC dropout accuracy: {pred.mean()}')
    conf, acc, bins, num_in_bins = split_in_bins(pred,conf)

    print(f'ECE: {get_ECE(conf, acc, num_in_bins)}')
    plot_reliability_diagram(acc, bins)

In [32]:
# Hard matched
if run_uncertainty_estimates and uncertainty_dataset == 'rerelease':
    pred, conf = MC_dropout(DC_model, hard_test_loader, forward_passes=20)
    print(f'\nMC dropout accuracy: {pred.mean()}')
    conf, acc, bins, num_in_bins = split_in_bins(pred,conf)

    print(f'ECE: {get_ECE(conf, acc, num_in_bins)}')
    plot_reliability_diagram(acc, bins)

In [33]:
# Hard matched
if run_uncertainty_estimates and uncertainty_dataset == 'rerelease':
    pred, conf = MC_dropout(DC_model, new_unmatched_test_loader, forward_passes=20)
    print(f'\nMC dropout accuracy: {pred.mean()}')
    conf, acc, bins, num_in_bins = split_in_bins(pred,conf)

    print(f'ECE: {get_ECE(conf, acc, num_in_bins)}')
    plot_reliability_diagram(acc, bins)