# Script Setup

In [None]:
# Mount Google Drive at /content/drive so the files referenced by later
# cells (requirements file, TSV data) are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install necessary packages
!pip install -r drive/MyDrive/nlp_sp/env/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import block
import torch
import torch.nn as nn
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List
import random
from tqdm import tqdm
import numpy as np
from numpy import logical_and, sum as t_sum
import pandas as pd
from typing import Dict, List
from sklearn.model_selection import train_test_split


In [None]:
# Device setup for CUDA
#
# Important: every tensor, layer, and model needs to be sent to the same
# device using to(), e.g.:
#   ten = torch.ones(4, 5).to(device)

# Default to the CPU and upgrade to the GPU only when torch can see one
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)


cuda


# Hyperparameters

## Define hyperparameters near the top to make changing them easier

In [None]:
# Number of training loops
epochs = 20

# Learning rate - should be very small when using Adam
LR = .0001

# Dropout probability
dropout_prob = 0.2

# Batch size
batch_size = 128

# Random seed: weight initialisation, dropout, and batch shuffling are all
# stochastic, so seed every generator used below to make runs repeatable.
# The value matches the random_state used for the train/val split.
SEED = 4
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Data Preprocessing

## Data Format

**Data:** `arguments-training/validation/testing.tsv`
(5220 arguments)
- Argument ID
- Conclusion 
- Stance (e.g., in favor, against)
- Premise (justification for conclusion)

**Labels:** `labels-training/validation/testing.tsv` 
(20 binary value labels per argument)
- Argument ID
- Self-direction: thought
- Self-direction: action
- Stimulation
- Hedonism
- Achievement
- Power: dominance
- Power: resources
- Face
- Security: personal
- Security: societal
- Tradition
- Conformity: rules
- Conformity: interpersonal
- Humility
- Benevolence: caring
- Benevolence: dependability
- Universalism: concern
- Universalism: nature
- Universalism: tolerance
- Universalism: objectivity

**Access:** https://doi.org/10.5281/zenodo.6814563

## Load Data

In [None]:
# training arguments: tab-separated file with one argument per row
# (Argument ID, Conclusion, Stance, Premise)
train_args_df = pd.read_csv('/content/drive/MyDrive/nlp_sp/data/arguments-training.tsv', sep='\t')
# view structure (column names, non-null counts, dtypes)
train_args_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5220 entries, 0 to 5219
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Argument ID  5220 non-null   object
 1   Conclusion   5220 non-null   object
 2   Stance       5220 non-null   object
 3   Premise      5220 non-null   object
dtypes: object(4)
memory usage: 163.2+ KB


In [None]:
# training labels: tab-separated file keyed by Argument ID, followed by one
# integer indicator column per value category
train_labs_df = pd.read_csv('/content/drive/MyDrive/nlp_sp/data/labels-training.tsv', sep='\t')
# view structure (column names, non-null counts, dtypes)
train_labs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5220 entries, 0 to 5219
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Argument ID                 5220 non-null   object
 1   Self-direction: thought     5220 non-null   int64 
 2   Self-direction: action      5220 non-null   int64 
 3   Stimulation                 5220 non-null   int64 
 4   Hedonism                    5220 non-null   int64 
 5   Achievement                 5220 non-null   int64 
 6   Power: dominance            5220 non-null   int64 
 7   Power: resources            5220 non-null   int64 
 8   Face                        5220 non-null   int64 
 9   Security: personal          5220 non-null   int64 
 10  Security: societal          5220 non-null   int64 
 11  Tradition                   5220 non-null   int64 
 12  Conformity: rules           5220 non-null   int64 
 13  Conformity: interpersonal   5220 non-null   int6

## Data Prep

In [None]:
# convert multiple label columns to one label list column
# .loc label slices are inclusive on both ends, so this takes every column from
# 'Self-direction: thought' through 'Universalism: objectivity' and stores each
# row's values as a plain Python list in the new 'labels' column
train_labs_df['labels'] = train_labs_df.loc[:, 'Self-direction: thought':'Universalism: objectivity'].values.tolist()

In [None]:
# label distribution for full training data
# Iterate over the same inclusive column slice that is used to build the
# 'labels' list column, instead of repeating one print per value category.
value_columns = train_labs_df.loc[:, 'Self-direction: thought':'Universalism: objectivity'].columns
for column in value_columns:
    # each column holds 0/1 indicators, so its sum is the positive count
    print(f'{column} =', train_labs_df[column].sum())

print('\nTotal number of samples = ', len(train_labs_df))

In [None]:
# combine dfs to add label list to data dictionary
# Once 'labels' carries the per-row list, the individual indicator columns are
# redundant, so they are removed from the merged frame.
indicator_columns = [
    'Self-direction: thought', 'Self-direction: action',
    'Stimulation', 'Hedonism', 'Achievement',
    'Power: dominance', 'Power: resources',
    'Face',
    'Security: personal', 'Security: societal',
    'Tradition',
    'Conformity: rules', 'Conformity: interpersonal',
    'Humility',
    'Benevolence: caring', 'Benevolence: dependability',
    'Universalism: concern', 'Universalism: nature',
    'Universalism: tolerance', 'Universalism: objectivity',
]
train_merged_df = (
    pd.merge(train_args_df, train_labs_df, on='Argument ID')
    .drop(columns=indicator_columns)
)

In [None]:
# view structure of the merged frame (argument text columns plus 'labels')
train_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5220 entries, 0 to 5219
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Argument ID  5220 non-null   object
 1   Conclusion   5220 non-null   object
 2   Stance       5220 non-null   object
 3   Premise      5220 non-null   object
 4   labels       5220 non-null   object
dtypes: object(5)
memory usage: 244.7+ KB


## Train/Val Split

In [None]:
# split train data into 80/20 train/val
# test_size=0.2 holds out 20% of the rows for validation; random_state=4 fixes
# the shuffle so the same split is produced on every run
train_data, val_data = train_test_split(train_merged_df, test_size=0.2, random_state=4)

In [None]:
# convert each row to a dictionary -> List[Dict]
# (keys are the column names, in column order)
train_data, val_data = (
    frame.to_dict(orient='records') for frame in (train_data, val_data)
)
# print examples
for heading, records in (('training example:\n', train_data),
                         ('validation example:\n', val_data)):
    print(heading, records[0])

## Tokenization

In [None]:
# wrapper around the HuggingFace tokenizer that tokenizes, pads, and encodes
# batches of sentence pairs

class BatchTokenizer:
    """Tokenizes and pads a batch of paired input sentences.

    HuggingFace docs: https://huggingface.co/transformers/v3.0.2/preprocessing.html
    """

    def __init__(self):
        """Load the pretrained ``prajjwal1/bert-small`` tokenizer."""
        self.hf_tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")

    def get_sep_token(self,):
        """Return the separator token string of the wrapped tokenizer."""
        return self.hf_tokenizer.sep_token

    # The HuggingFace tokenizer call only accepts a pair of inputs, but each
    # argument has three parts (conclusion, stance, premise). As a workaround
    # the caller joins conclusion and stance into one string with a literal
    # "[SEP]" between them and passes that as the first element of the pair.
    def __call__(self, con_stan_batch: List[str], prem_batch: List[str]) -> Dict:
        """Uses the huggingface tokenizer to tokenize and pad a batch of pairs.

        Args:
            con_stan_batch (List[str]): One "conclusion [SEP] stance" string
                per example.
            prem_batch (List[str]): One premise string per example, aligned
                with ``con_stan_batch``.

        Returns:
            Dict: The dict-like encoding produced by the HuggingFace tokenizer.
                Token type ids are not requested, so only the remaining
                entries (e.g. ``input_ids`` and ``attention_mask``) are
                returned, as PyTorch tensors.
        """
        # The HF tokenizer pads for us and joins the two inputs of each pair
        # with the [SEP] token. Token type ids are switched off because the
        # first input already contains two logical segments.
        enc = self.hf_tokenizer(
            con_stan_batch,
            prem_batch,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return enc

In [None]:
# define tokenizer (instantiating it loads the pretrained tokenizer files)
tokenizer = BatchTokenizer()

Downloading:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# example of use case for batch tokenizer without triplet hack (only two input types acceptable)
example_conclusions = ['this is the conclusion with more words', 'this is also a conclusion']
example_premises = ['this is the premise', 'this is the second premise']
token_ex = tokenizer(example_conclusions, example_premises)
print(f"{token_ex}\n")
# decode the ids back to text to show where [CLS]/[SEP]/[PAD] were inserted
tokenizer.hf_tokenizer.batch_decode(token_ex['input_ids'])

{'input_ids': tensor([[  101,  2023,  2003,  1996,  7091,  2007,  2062,  2616,   102,  2023,
          2003,  1996, 18458,   102],
        [  101,  2023,  2003,  2036,  1037,  7091,   102,  2023,  2003,  1996,
          2117, 18458,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}



['[CLS] this is the conclusion with more words [SEP] this is the premise [SEP]',
 '[CLS] this is also a conclusion [SEP] this is the second premise [SEP] [PAD]']

In [None]:
# example of use case for batch tokenizer with triplet hack
# conclusion and stance share the first slot, separated by a literal [SEP]
example_conclusions_stances = [
    'this is the conclusion with more words [SEP] and a stance against',
    'this is also a conclusion [SEP] with another stance that is in favor of',
]
example_hack_premises = ['this is the premise', 'this is the second premise']
token_ex2 = tokenizer(example_conclusions_stances, example_hack_premises)
print(f"{token_ex2}\n")
tokenizer.hf_tokenizer.batch_decode(token_ex2['input_ids'])

{'input_ids': tensor([[  101,  2023,  2003,  1996,  7091,  2007,  2062,  2616,   102,  1998,
          1037, 11032,  2114,   102,  2023,  2003,  1996, 18458,   102,     0,
             0,     0],
        [  101,  2023,  2003,  2036,  1037,  7091,   102,  2007,  2178, 11032,
          2008,  2003,  1999,  5684,  1997,   102,  2023,  2003,  1996,  2117,
         18458,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}



['[CLS] this is the conclusion with more words [SEP] and a stance against [SEP] this is the premise [SEP] [PAD] [PAD] [PAD]',
 '[CLS] this is also a conclusion [SEP] with another stance that is in favor of [SEP] this is the second premise [SEP]']

## Batch

In [None]:
# function to generate triple-wise inputs

def generate_triplewise_input(dataset: List[Dict]) -> (List[str], List[str], List[str], List[List[int]]):
    """Split a list of record dicts into parallel per-field lists.

    Fields are read by position within each record's values, not by key:
    position 1 is the conclusion, 2 the stance, 3 the premise, and 4 the
    label list (position 0 is not used). Each stance is prefixed with
    ' [SEP] ' so it can be appended directly to its conclusion.

    Returns:
        A tuple ``(conclusions, stances, premises, labels)`` whose lists are
        aligned index-by-index with ``dataset``.
    """
    # materialise each record's values once so fields can be picked by position
    rows = [list(record.values()) for record in dataset]

    conclusion_lst = [row[1] for row in rows]
    # add [SEP] token before every stance
    stance_lst = [' [SEP] ' + row[2] for row in rows]
    premise_lst = [row[3] for row in rows]
    # one list of labels per record
    label_lst = [row[4] for row in rows]

    return conclusion_lst, stance_lst, premise_lst, label_lst

In [None]:
# apply function to generate triple-wise inputs and labels for batching
# each call returns four index-aligned lists; the stance strings come back
# already prefixed with ' [SEP] '

# training data
train_conclusions, train_stances, train_premises, train_labels = generate_triplewise_input(train_data)

# validation data
val_conclusions, val_stances, val_premises, val_labels = generate_triplewise_input(val_data)

In [None]:
# temporarily combine conclusions and stances separated with [SEP]
# the tokenizer only accepts sentence pairs, so conclusion + stance travel
# together as the first element and the premise is the second

# training data
train_conclusions_stances = [
    conclusion + stance
    for conclusion, stance in zip(train_conclusions, train_stances)
]

# validation data
val_conclusions_stances = [
    conclusion + stance
    for conclusion, stance in zip(val_conclusions, val_stances)
]

In [None]:
# define functions to chunk data for batches

# for train labels
def chunk(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# for train features
def chunk_multi(lst1, lst2, n):
    """Lazily yield aligned ``(lst1_slice, lst2_slice)`` pairs of at most ``n`` items.

    Slice boundaries are driven by the length of ``lst1``; ``lst2`` is sliced
    with the same start/stop positions.
    """
    for start in range(0, len(lst1), n):
        stop = start + n
        yield lst1[start:stop], lst2[start:stop]

In [None]:
# apply function to batch input data
# tokenize and encode simultaneously since we are using HuggingFace

# Each chunk is a (conclusion+stance list, premise list) pair that is unpacked
# straight into the tokenizer; the resulting encoding is moved to `device`.
train_input_batches = [
    tokenizer(*batch).to(device)
    for batch in chunk_multi(train_conclusions_stances, train_premises, batch_size)
]
# Validation inputs must use batch_size as well (the previous `val_size` was
# never defined) so they stay aligned with the validation label batches,
# which are chunked with batch_size.
val_input_batches = [
    tokenizer(*batch).to(device)
    for batch in chunk_multi(val_conclusions_stances, val_premises, batch_size)
]

In [None]:
# check training data example
first_train_batch = train_input_batches[0]
print(first_train_batch)
# decode the ids of the first batch back to text and show its first example
encoded_tst = tokenizer.hf_tokenizer.batch_decode(first_train_batch['input_ids'])
encoded_tst[0]

{'input_ids': tensor([[ 101, 4372, 6494,  ...,    0,    0,    0],
        [ 101, 2057, 2323,  ...,    0,    0,    0],
        [ 101, 2057, 2323,  ...,    0,    0,    0],
        ...,
        [ 101, 2057, 2323,  ...,    0,    0,    0],
        [ 101, 2057, 2323,  ...,    0,    0,    0],
        [ 101, 2057, 2323,  ...,    0,    0,    0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}


"[CLS] entrapment should be legalized [SEP] in favor of [SEP] if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [None]:
# define function to batch class labels
# a single observation's label is a list of 20 labels

def encode_labels(labels: List[List[int]]) -> torch.LongTensor:
    """Turns the batch of labels into an integer tensor.

    Args:
        labels (List[List[int]]): One list of labels per example in the batch.

    Returns:
        torch.LongTensor: int64 tensor with one row per example. Callers that
            need floating-point targets must cast it themselves.
    """
    # int64 is kept deliberately: this is the dtype the function has always
    # returned, only the annotation and docstring used to claim FloatTensor.
    return torch.tensor(labels, dtype=torch.long)


In [None]:
# apply function to batch labels in same order as inputs
# chunk the label lists with the same batch size, turn each chunk into a
# tensor, and move it to the device in a single pass
train_label_batches = [
    encode_labels(batch).to(device) for batch in chunk(train_labels, batch_size)
]
val_label_batches = [
    encode_labels(batch).to(device) for batch in chunk(val_labels, batch_size)
]

# Model

Below is the code to define our model as well as the training loop.

## Functions to Make Predictions

In [None]:
def make_prediction(logits: torch.Tensor) -> torch.Tensor:
    """Round each score to the nearest integer.

    For scores in [0, 1] this acts as a 0.5 threshold. torch.round rounds
    exact halves to the nearest even value, so a score of exactly 0.5 maps
    to 0.
    """
    return torch.round(logits)

def predict(model: torch.nn.Module, sents: torch.Tensor) -> torch.Tensor:
    """Run ``model`` on ``sents`` for inference and return rounded predictions.

    The model is switched to eval mode for the forward pass so that dropout is
    disabled, and gradient tracking is turned off. The model's previous
    train/eval mode is restored before returning, even if the forward pass
    raises.

    Args:
        model: Module whose call returns per-label scores.
        sents: Input accepted by ``model``.

    Returns:
        torch.Tensor: ``make_prediction`` applied to the model output.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(sents)
    finally:
        # hand the model back in the mode the caller left it in
        model.train(was_training)
    return make_prediction(logits)

## Model Definition

In [None]:

# Weight initializer intended for use with Module.apply()
def init_weights(layer):
    """Re-initialize the weight of ``nn.Linear`` layers with Xavier-normal values.

    Any other layer type is left untouched, and so is the bias of a linear
    layer.
    """
    if not isinstance(layer, nn.Linear):
        return
    torch.nn.init.xavier_normal_(layer.weight)

class NLIClassifier(torch.nn.Module):
    """Multi-label classifier: frozen BERT encoder + hidden layer + classifier chain.

    We are doing multi-label classification using a classifier chain.
    For details, see: https://en.wikipedia.org/wiki/Multi-label_classification

    One binary classifier is created per label. Classifier ``i`` receives the
    hidden representation concatenated with the rounded predictions of
    classifiers ``0 .. i-1``. The labels are interpreted in the same order as
    the training data:

      Self-direction: thought
      Self-direction: action
      Stimulation
      Hedonism
      Achievement
      Power: dominance
      Power: resources
      Face
      Security: personal
      Security: societal
      Tradition
      Conformity: rules
      Conformity: interpersonal
      Humility
      Benevolence: caring
      Benevolence: dependability
      Universalism: concern
      Universalism: nature
      Universalism: tolerance
      Universalism: objectivity
    """

    def __init__(self, output_size: int, hidden_size: int, dropout_prob: float):
        """Build the encoder, the projection layer, and the classifier chain.

        Args:
            output_size (int): Number of labels, i.e. number of chain classifiers.
            hidden_size (int): Width of the layer between BERT and the chain.
            dropout_prob (float): Dropout probability inside every chain classifier.
        """
        # Basic initialization
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size

        # Additional args
        self.dropout_prob = dropout_prob

        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained("prajjwal1/bert-small").to(device)

        # Comment out these lines to unfreeze BERT params
        for param in self.bert.parameters():
            param.requires_grad = False

        # Get BERT's hidden dim
        self.bert_hidden_dimension = self.bert.config.hidden_size

        # Single linear layer to project to hidden size
        self.hidden_layer = torch.nn.Linear(self.bert_hidden_dimension, self.hidden_size).to(device)

        # ReLU activation after the projection
        # TODO: Could try others
        self.relu = torch.nn.ReLU()

        # The chain MUST be an nn.ModuleList. Modules kept in a plain Python
        # list are not registered, so their weights would be missing from
        # model.parameters() (the optimizer would never update them) and
        # train()/eval()/to()/state_dict() would silently skip them.
        # Classifier i takes hidden_size + i features because the i earlier
        # predictions are appended to its input.
        self.chain = nn.ModuleList([
            nn.Sequential(
                nn.Dropout(p=self.dropout_prob),
                nn.Linear(in_features=self.hidden_size + i, out_features=1),
                nn.Sigmoid()
            )
            for i in range(self.output_size)
        ]).to(device)

        # Initialize the weights once, after the whole chain has been built
        self.chain.apply(init_weights)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Use BERT to create contextualized embeddings and return the output
            of its pooling layer (i.e. the embedding for [CLS])

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: Pooled encoding with a singleton axis inserted at
                dim 1, i.e. batch_size x 1 x bert_hidden_dimension
        """
        # Run through BERT for contextualized embeddings
        encoded_sequence = self.bert(**symbols)

        # Pooler output is initially (batch_size, bert_hidden_dimension)
        pool_out = torch.unsqueeze(encoded_sequence['pooler_output'], dim=1)
        return pool_out

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """Score every label for each example in the batch.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: batch_size x output_size tensor of sigmoid outputs
                (probabilities, despite the local name ``logits``), one
                column per label in chain order.
        """
        encoded_sents = self.encode_text(symbols)
        output = self.hidden_layer(encoded_sents)
        output = self.relu(output)

        # output is of size (batch_size, 1, hidden_size)

        # Run through the classifier chain
        cur_input = output
        logits = []

        for classifier in self.chain:

            # Get output of next in chain: (batch_size, 1, 1)
            o = classifier(cur_input)

            # Save the score for training
            logits.append(o)

            # Make a prediction so we can append it to the next input
            # TODO: Could also append raw scores, potentially
            pred = make_prediction(o)

            # Append the previous prediction to the input for the next classifier
            cur_input = torch.cat([cur_input, pred], dim=2)

        # logits holds output_size tensors, each batch_size x 1 x 1.
        # Join them on the last axis and drop the singleton middle axis to
        # get a single batch_size x output_size tensor.
        return torch.cat(logits, dim=2).squeeze(dim=1)

## Evaluation

### Metric Functions

In [None]:
def precision(predicted_labels, true_labels, which_label=1):
    """
    Precision = true positives / all positive predictions, where "positive"
    means equal to ``which_label``. Returns 0. when nothing was predicted
    as ``which_label``.
    """
    predicted_hits = np.array([p == which_label for p in predicted_labels])
    actual_hits = np.array([t == which_label for t in true_labels])
    n_predicted = t_sum(predicted_hits)
    if not n_predicted:
        # no positive predictions: avoid dividing by zero
        return 0.
    n_correct = t_sum(logical_and(predicted_hits, actual_hits))
    return n_correct/n_predicted


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall = true positives / all positive labels, where "positive" means
    equal to ``which_label``. Returns 0. when no true label equals
    ``which_label``.
    """
    predicted_hits = np.array([p == which_label for p in predicted_labels])
    actual_hits = np.array([t == which_label for t in true_labels])
    n_actual = t_sum(actual_hits)
    if not n_actual:
        # label never occurs in the ground truth: avoid dividing by zero
        return 0.
    n_correct = t_sum(logical_and(predicted_hits, actual_hits))
    return n_correct/n_actual


def f1_score(
    predicted_labels: List[int],
    true_labels: List[int],
    which_label: int
):
    """
    Harmonic mean of precision and recall for ``which_label``.
    Returns 0. when either precision or recall is zero.
    """
    p_val = precision(predicted_labels, true_labels, which_label=which_label)
    r_val = recall(predicted_labels, true_labels, which_label=which_label)
    if not (p_val and r_val):
        # harmonic mean is undefined/zero when either component is zero
        return 0.
    return 2*p_val*r_val/(p_val+r_val)


def macro_f1(
    predicted_labels: List[int],
    true_labels: List[int],
    possible_labels: List[int]
):
    """
    Unweighted mean of the per-label F1 scores over ``possible_labels``.

    Each entry of ``possible_labels`` is passed to ``f1_score`` as the label
    value to score. Returns 0. when ``possible_labels`` is empty instead of
    dividing by zero.
    """
    scores = [f1_score(predicted_labels, true_labels, l) for l in possible_labels]
    if not scores:
        return 0.
    # Macro, so we take the uniform avg.
    return sum(scores) / len(scores)

## Training Loop

In [None]:
def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model,
    possible_labels
):
    """Train ``model`` with binary cross-entropy and report dev macro-F1 per epoch.

    Args:
        num_epochs: Number of passes over the training batches.
        train_features: Sequence of pre-batched model inputs.
        train_labels: Sequence of label tensors aligned with ``train_features``.
        dev_features: Sequence of pre-batched model inputs for evaluation.
        dev_labels: Sequence of label tensors aligned with ``dev_features``.
        optimizer: Optimizer built over the model's parameters.
        model: Module whose output is a batch x n_labels tensor of
            probabilities (BCELoss requires values in [0, 1]).
        possible_labels: Label column indices to score; F1 of the positive
            class is computed per column and averaged uniformly.

    Returns:
        The trained model (the same object that was passed in).
    """
    print("Training...")
    dev_f1_scores = []
    # BCELoss rather than BCEWithLogitsLoss: the model output is expected to
    # have been through a sigmoid already.
    loss_func = torch.nn.BCELoss()

    batches = list(zip(train_features, train_labels))
    for i in range(num_epochs):
        # Evaluation below switches to eval mode, so re-enable training
        # behaviour (dropout) at the start of every epoch.
        model.train()
        # Visit the batches in a new order each epoch
        random.shuffle(batches)
        losses = []
        for features, labels in tqdm(batches):
            # Clear gradients accumulated by the previous step
            optimizer.zero_grad()
            preds = model(features)
            loss = loss_func(preds, labels.float())

            # Backpropagate the loss through our model
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        mean_loss = sum(losses)/len(losses) if losses else float("nan")
        print(f"epoch {i}, loss: {mean_loss}")

        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        model.eval()
        with torch.no_grad():
            for sents, labels in tqdm(zip(dev_features, dev_labels), total=len(dev_features)):
                pred = predict(model, sents)
                # Tensors may live on the GPU; NumPy conversion needs a host
                # copy (calling .numpy() on a CUDA tensor raises TypeError).
                all_preds.append(pred.detach().cpu().numpy())
                all_labels.append(labels.detach().cpu().numpy())

        if all_preds:
            # (n_examples, n_labels) matrices of 0/1 values
            pred_matrix = np.concatenate(all_preds, axis=0)
            label_matrix = np.concatenate(all_labels, axis=0)
            # Multi-label macro-F1: score the positive class (1) of each label
            # column separately, then average. Comparing the 0/1 values with
            # the label indices themselves would score only "0" and "1" and
            # contribute a zero for every other index.
            per_label_f1 = [
                f1_score(pred_matrix[:, j], label_matrix[:, j], which_label=1)
                for j in possible_labels
            ]
            dev_f1 = sum(per_label_f1)/len(per_label_f1) if per_label_f1 else 0.
        else:
            dev_f1 = 0.
        print(f"Dev F1 {dev_f1}")
        dev_f1_scores.append(dev_f1)

    # Print the best dev_f1 score for result reporting
    if dev_f1_scores:
        print(f"Best dev F1 score: {np.max(dev_f1_scores)}")
    # Return the trained model
    return model

# Training Phase

## Setup

In [None]:
# Number of labels (should be 20)
possible_labels = len(train_labels[0])
if possible_labels != 20:
  raise RuntimeError(f"Instead of 20 possible labels, we found {possible_labels}.")

# Initialize model
model = NLIClassifier(output_size=possible_labels, hidden_size = 512, dropout_prob=dropout_prob)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), LR)

# Setup the validation set from the held-out split. Aliasing the training
# batches here would make the reported dev F1 a training-set score.
# Inputs and labels are chunked with the same batch_size so they stay aligned.
validation_input_batches = [
    tokenizer(*batch).to(device)
    for batch in chunk_multi(val_conclusions_stances, val_premises, batch_size)
]
validation_label_batches = [
    encode_labels(batch).to(device) for batch in chunk(val_labels, batch_size)
]


Downloading:   0%|          | 0.00/116M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Train the Model

In [None]:
# Start the training
# Arguments are positional and follow training_loop's parameter order:
# epochs, train inputs/labels, dev inputs/labels, optimizer, model, and the
# list of integers 0 .. possible_labels-1 passed as possible_labels.
trained_model = training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    validation_input_batches,
    validation_label_batches,
    optimizer,
    model,
    list(range(possible_labels))
)


Training...


100%|██████████| 41/41 [00:05<00:00,  7.74it/s]


epoch 0, loss: 0.49792732407407064
Evaluating dev...


  0%|          | 0/41 [00:00<?, ?it/s]


TypeError: ignored