In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer
from torch.utils.data import WeightedRandomSampler  ## TODO come back here and implement with WeightedRandomSampler
import torch.nn as nn
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import torch.nn.functional as F
import time

import calibration_fns as cal

from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score, f1_score

In [5]:
cwd = os.getcwd()
data_folder = cwd + '/Raw_labeled_data/'

### Step 1. Set up dataset for pytorch

In [6]:
#train, val, test= du.getData(dataDir=dataDir, holdoutDir=holdoutDir,ValCutoff=ValCutoff)
data = pd.read_excel(data_folder + 'sources.xlsx')
data = data.sample(frac=1).reset_index(drop=True)

data["Response"] = data["Response"].astype(str)
data["code"] = data["code"].astype(str)

train_size = int(0.7*len(data))
val_size = int(0.15*len(data))
train_data = data[:train_size]
base_data = data[train_size:train_size + val_size]
target_data = data[train_size + val_size:]

In [7]:
len(train_data)

2029

In [8]:
def get_max_len(tokenizer, train):
    all_sent = np.array(train.Response.values)
    # Encode data
    encoded_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_sent]
    # Find the maximum length
    max_len = max([len(sent) for sent in encoded_sentences])
    return max_len

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_LEN = get_max_len(tokenizer, train_data) 

In [10]:
print(MAX_LEN)

101


In [11]:
def text_preprocessing_simple(text):
    try:
        text = re.sub(r'\s+', ' ', text).strip()
    except:
        pass
    return text

In [12]:
def preprocessing_for_bert(data, tokenizer, max_len):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing_simple(str(sent)),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [13]:
class SourcesDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.data, self.masks = preprocessing_for_bert(df["Response"], tokenizer, MAX_LEN)
        self.text = df["Response"].reset_index(drop=True)
        labels_enc, unique_labels = pd.factorize(df["code"], sort = True)
        self.labels = torch.tensor(labels_enc, dtype = torch.long)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        sample = self.data[idx]
        mask = self.masks[idx]
        label = self.labels[idx]
        return sample, mask, label

In [14]:
labels_enc, unique_labels = pd.factorize(train_data["code"], sort = True)
target_to_class = {i:unique_labels[i] for i in range(len(unique_labels))}

In [15]:
target_to_class

{0: 'L', 1: 'O', 2: 'P', 3: 'S'}

In [16]:
train_dataset = SourcesDataset(train_data, tokenizer, MAX_LEN)
base_dataset = SourcesDataset(base_data, tokenizer, MAX_LEN)
target_dataset = SourcesDataset(target_data, tokenizer, MAX_LEN)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
len(train_dataset)

2029

In [18]:
train_dataset[2]

(tensor([  101,  2493,  9854, 19721,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]),
 tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 

In [19]:
for data, mask, label in train_dataset:
    break

In [20]:
class_counts = train_data.code.value_counts()

In [21]:
#sample_weights = [1.0 / class_counts[i] for i in train_data.code.values]
class_weights = 1.0 / class_counts
class_weights[1] = class_weights[1] / 4 # underweight "other"
sample_weights = [class_weights[i] for i in train_data.code.values]
class_weights = torch.tensor(class_weights, dtype=torch.float)

train_sampler=WeightedRandomSampler(sample_weights,len(train_dataset), replacement=True)

train_dataloader = DataLoader(train_dataset, batch_size = 16, sampler = train_sampler)
base_dataloader = DataLoader(base_dataset, batch_size = 16) 
target_dataloader = DataLoader(target_dataset, batch_size = 16) 

  class_weights[1] = class_weights[1] / 4 # underweight "other"
  class_weights[1] = class_weights[1] / 4 # underweight "other"
  class_weights = torch.tensor(class_weights, dtype=torch.float)


In [22]:
class_counts

code
L    1564
O     292
P     109
S      64
Name: count, dtype: int64

In [23]:
for sents, masks, labels in train_dataloader:
    break

In [24]:
sents.shape
masks.shape
labels

tensor([2, 0, 2, 3, 0, 2, 2, 2, 0, 0, 3, 3, 3, 0, 3, 2])

### Step 2. Set up Pytorch Model

In [25]:
class SourcesClassifier(nn.Module):
    def __init__(self, num_classes=4, freeze_bert = False):
        #Where we define all the parts of the model
        super(SourcesClassifier, self).__init__()  # initialize object with everything from the parent class
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 200, num_classes
        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    def forward(self, input_ids, attention_mask):
        # Connect these parts and return the output
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]
        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)
        return logits

In [26]:
model = SourcesClassifier(num_classes = 4)
print(model)

SourcesClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [27]:
example_out = model(sents, masks)

In [28]:
example_out.shape #batch size, num_classes

torch.Size([16, 4])

In [29]:
example_out

tensor([[-0.0012,  0.0348,  0.1257,  0.0572],
        [-0.1234,  0.0328,  0.1113,  0.0786],
        [-0.0341, -0.0613,  0.2232,  0.1021],
        [ 0.0527, -0.0694,  0.1471,  0.0087],
        [-0.0214,  0.0036,  0.1372,  0.0850],
        [-0.0883,  0.0122,  0.0492,  0.1432],
        [-0.0512, -0.0277,  0.0803,  0.1230],
        [ 0.0236, -0.0752,  0.1647,  0.1728],
        [-0.0175, -0.0021,  0.0675,  0.0321],
        [-0.0996,  0.0564,  0.1316,  0.0434],
        [-0.1605, -0.0508,  0.0461,  0.2130],
        [-0.0141, -0.0624,  0.0920,  0.0654],
        [-0.0199,  0.0047,  0.0725,  0.1541],
        [-0.0512, -0.0008,  0.1165,  0.1073],
        [-0.0891, -0.0759,  0.0570,  0.1501],
        [-0.0675,  0.0302,  0.1174, -0.0403]], grad_fn=<AddmmBackward0>)

### Step 3. The training loop

In [30]:
device = torch.device("cuda:0" if torch.cuda.is_available()  else "cpu")
print(device)

cpu


In [31]:
# Loss function
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [32]:
criterion(example_out, labels) # check that the loss function calculates. It won't be any good yet because we haven't done training

tensor(1.3056, grad_fn=<NllLossBackward0>)

In [33]:
labels

tensor([2, 0, 2, 3, 0, 2, 2, 2, 0, 0, 3, 3, 3, 0, 3, 2])

In [34]:
for param in model.bert.pooler.parameters():
    param.requires_grad = True

In [35]:
for module in model.modules():
    if isinstance(module, torch.nn.LayerNorm):
        module.eval()  # Set LayerNorm to eval mode

In [36]:
num_epochs = 7
train_losses, val_losses = [], []


model = SourcesClassifier(num_classes=4)
model.to(device)

total_steps = len(train_dataloader) * num_epochs # Total number of training steps

optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay = 1e-5)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(device))


for epoch in range(num_epochs):
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*70)
    running_loss = 0.0
    t0_epoch, t0_batch = time.time(), time.time()
    total_loss, batch_loss, batch_counts = 0, 0, 0 # Reset tracking variables at the beginning of each epoch
    model.train() # Set the model to train
    for batch_idx, (sents, masks, labels) in enumerate(train_dataloader): 
        batch_counts +=1
        sents, masks, labels = sents.to(device), masks.to(device), labels.to(device)
        optimizer.zero_grad() # set our optimizer to zero_grad
        outputs = model(sents, masks) # get our outputs by calling the forward method on the images
        loss = loss_fn(outputs, labels)
        loss.backward() # this is where we do backpropagation on the model to update the model weights
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 
        optimizer.step()
        scheduler.step()
        #print(f"Batch {batch_idx}: Loss = {loss.item()}")
        batch_loss += loss.item()
        running_loss += loss.item() * sents.size(0)
        # Print the loss values and time elapsed for every 20 batches
        if (batch_idx % 20 == 0 and batch_idx != 0) or (batch_idx == len(train_dataloader) - 1):
            # Calculate time elapsed for 20 batches
            time_elapsed = time.time() - t0_batch
            # Print training results
            print(f"{epoch + 1:^7} | {batch_idx:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
    batch_loss, batch_counts = 0, 0
    t0_batch = time.time()
    train_loss = running_loss / len(train_dataloader.dataset)
    train_losses.append(train_loss)
    print("-"*70)
    
    # Validation phase
    model.eval()
    running_loss = 0.0
    val_accuracy = []
    with torch.no_grad(): #to make sure the model weights are not touched
        for sents, masks, labels in base_dataloader:
            sents, masks, labels = sents.to(device), masks.to(device), labels.to(device)
            outputs = model(sents, masks)
            loss = loss_fn(outputs, labels)
            running_loss += loss.item() * sents.size(0)
            # Get the predictions
            preds = torch.argmax(outputs, dim=1).flatten()
            # Calculate the accuracy rate
            accuracy = (preds == labels).cpu().numpy().mean() * 100
            val_accuracy.append(accuracy)
            time_elapsed = time.time() - t0_epoch
    val_loss = running_loss / len(base_dataloader.dataset)
    val_losses.append(val_loss)
    val_accuracy = np.mean(val_accuracy)

    #Print epoch stats
    print(f"{epoch+1:^7} | {'-':^7} | {train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
plt.plot(train_losses, label='Training loss')
plt.plot(val_losses, label='Validation loss')
plt.legend()
plt.title("Loss over epochs")
plt.show()

In [None]:
model.eval()
model=model.to(device)
all_logits = []
all_labels = []
# For each batch in our test set...
for sents, masks, labels in base_dataloader:
    sents, masks = sents.to(device), masks.to(device)
    # Compute logits
    with torch.no_grad():
        logits = model(sents, masks)
        all_logits.append(logits)
        all_labels.append(labels)
# Concatenate logits from each batch
all_logits = torch.cat(all_logits, dim=0)
all_labels = torch.cat(all_labels, dim = 0).cpu().numpy()
# Apply softmax to calculate probabilities
probs = F.softmax(all_logits, dim=1).cpu().numpy()
preds = torch.argmax(all_logits, dim=1).flatten().cpu().numpy()
all_logits = all_logits.cpu().numpy()

In [None]:
preds_LOPS = list(pd.Series(preds).map(target_to_class))
print("L: " + str(preds_LOPS.count("L")))
print("P: " + str(preds_LOPS.count("P")))
print("S: " + str(preds_LOPS.count("S")))
print("O: " + str(preds_LOPS.count("O")))

In [None]:
# calculate metrics

acc = accuracy_score(all_labels, preds)
p = precision_score(all_labels, preds, average = None)
r = recall_score(all_labels, preds, average = None)
bal = balanced_accuracy_score(all_labels,preds)
f1 = f1_score(all_labels,preds, average = None)

print(acc)
print(p)
print(r)
print(bal)
print(f1)

In [None]:
#calibration phase: calculate a calibration curve. along with a base classifier density, we can get the base joint density and estimate the base prevalence

print("Base dataset: number of samples in each category")
base_labels =list(pd.Series(base_dataset.labels.numpy()).map(target_to_class))
print("L: " + str(base_labels.count("L")))
print("P: " + str(base_labels.count("P")))
print("S: " + str(base_labels.count("S")))
print("O: " + str(base_labels.count("O")))

In [None]:
all_logits
probs = F.softmax(all_logits, dim=1).cpu().numpy()

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(base_dataset.text.values)})

In [None]:
df

In [None]:
df["probs_L"]

In [None]:
fig, axs = plt.subplots(1, 1, sharey=True, tight_layout=True)
axs.hist(df["probs_L"], bins = 20)

In [None]:
binned_cal_curve = cal.generate_calibration_curve_binned(df, num_bin = 10, code = "L", other_codes = ["O","P","S"])
binned_cal_curve.plot(show_diagonal=True)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(base_dataset.text.values)})

In [None]:
platt_cal_curve = cal.generate_calibration_curve_platt(df, code = "L", other_codes = ["O","P","S"])
platt_cal_curve.plot(show_diagonal=True, error_score = "raise")

In [None]:
# extrapolation phase: run probabilistic estimator

In [None]:
model.eval()
model=model.to(device)
all_logits = []
all_labels = []
# For each batch in our test set...
for sents, masks, labels in target_dataloader:
    sents, masks = sents.to(device), masks.to(device)
    # Compute logits
    with torch.no_grad():
        logits = model(sents, masks)
        all_logits.append(logits)
        all_labels.append(labels)
# Concatenate logits from each batch
all_logits = torch.cat(all_logits, dim=0)
all_labels = torch.cat(all_labels, dim = 0)
# Apply softmax to calculate probabilities
probs = F.softmax(all_logits, dim=1).cpu().numpy()
preds = torch.argmax(all_logits, dim=1).flatten()


In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})

## Calibration curves for L

In [None]:
fig, axs = plt.subplots(1, 1, sharey=True, tight_layout=True)
axs.hist(df["probs_L"], bins = 20)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})
binned_cal_curve = cal.generate_calibration_curve_binned(df, num_bin = 10, code = "L", other_codes = ["O","P","S"])
binned_cal_curve.plot(show_diagonal=True)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})
platt_cal_curve = cal.generate_calibration_curve_platt(df, code = "L", other_codes = ["O","P","S"])
platt_cal_curve.plot(show_diagonal=True)

In [None]:
est_prevalence = cal.extrinsic_estimate(df, platt_cal_curve, code = "L")
print(f'Assuming stable calibration curve -- estimated prevalence: {est_prevalence:.3f}')

In [None]:
print("Target dataset: number of samples in each category")
target_labels =list(pd.Series(target_dataset.labels.numpy()).map(target_to_class))
print("L: " + str(base_labels.count("L")))
print("P: " + str(base_labels.count("P")))
print("S: " + str(base_labels.count("S")))
print("O: " + str(base_labels.count("O")))

In [None]:
328/(328+27+11+68)

In [None]:
list(preds.cpu().numpy()).count(0)/len(preds)

In [None]:
display_data = pd.DataFrame(data = {"preds":list(pd.Series(preds.cpu().numpy()).map(target_to_class)), "labels":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})

In [None]:
pd.set_option('display.max_colwidth', None)
display_data[20:30]

## Calibration curves for P

In [None]:
fig, axs = plt.subplots(1, 1, sharey=True, tight_layout=True)
axs.hist(df["probs_P"], bins = 20)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})
binned_cal_curve = cal.generate_calibration_curve_binned(df, num_bin = 10, code = "P", other_codes = ["O","L","S"])
binned_cal_curve.plot(show_diagonal=True)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})
platt_cal_curve = cal.generate_calibration_curve_platt(df, code = "P", other_codes = ["O","L","S"])
platt_cal_curve.plot(show_diagonal=True)

In [None]:
est_prevalence = cal.extrinsic_estimate(df, platt_cal_curve, code = "P")
print(f'Assuming stable calibration curve -- estimated prevalence: {est_prevalence:.3f}')

## Calibration curves for S

In [None]:
fig, axs = plt.subplots(1, 1, sharey=True, tight_layout=True)
axs.hist(df["probs_S"], bins = 20)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})
binned_cal_curve = cal.generate_calibration_curve_binned(df, num_bin = 10, code = "S", other_codes = ["O","P","L"])
binned_cal_curve.plot(show_diagonal=True)

In [None]:
df = pd.DataFrame(data = {"probs_L": probs[:,0], "probs_O": probs[:,1], "probs_P": probs[:,2], "probs_S": probs[:,3], "gt_label":list(pd.Series(all_labels.cpu().numpy()).map(target_to_class)), "text":list(target_dataset.text.values)})
platt_cal_curve = cal.generate_calibration_curve_platt(df, code = "S", other_codes = ["O","P","L"])
platt_cal_curve.plot(show_diagonal=True)

In [None]:
est_prevalence = cal.extrinsic_estimate(df, platt_cal_curve, code = "S")
print(f'Assuming stable calibration curve -- estimated prevalence: {est_prevalence:.3f}')