<a href="https://colab.research.google.com/github/srpauliscu/nlp-shared-task/blob/main/semeval_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script Setup

In [2]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install necessary packages
!pip install -r drive/MyDrive/nlp_sp/env/requirements.txt

In [6]:
# Import block
import torch
import torch.nn as nn
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List
import random
from tqdm import tqdm
import numpy as np
from numpy import logical_and, sum as t_sum
import pandas as pd
from typing import Dict, List
from sklearn.model_selection import train_test_split


In [7]:
# Device setup for CUDA

'''

Important: Every tensor, layer, and model needs to be sent to the same device using to()
Ex: 
  ten = torch.ones(4,5).to(device)

'''

# Get the best device to run on
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)


cuda


# Hyperparameters

## Define hyparameters near top to make changing them easier

In [8]:
# Number of training loops
epochs = 100

# Learning rate - should be very small when using Adam
LR = .001

# Dropout probability
dropout_prob = 0.0

# Batch size
batch_size = 32

# Size to project to after BERT
hidden_size = 1024

# Data Preprocessing

## Data Format

**Data:** `arguments-training/validation/testing.tsv`
(5220 arguments)
- Argument ID
- Conclusion 
- Stance (e.g., in favor, against)
- Premise (justification for conclusion)

**Labels:** `labels-training/validation/testing.tsv` 
(20 binary value labels per argument)
- Argument ID
- Self-direction: thought
- Self-direction: action
- Stimulation
- Hedonism
- Achievement
- Power: dominance
- Power: resources
- Face
- Security: personal
- Security: societal
- Tradition
- Conformity: rules
- Conformity: interpersonal
- Humility
- Benevolence: caring
- Benevolence: dependability
- Universalism: concern
- Universalism: nature
- Universalism: tolerance
- Universalism: objectivity

**Access:** https://doi.org/10.5281/zenodo.6814563

## Load Data

In [9]:
# training arguments
#train_args_df = pd.read_csv('/content/drive/MyDrive/nlp_sp/data/arguments-training.tsv', sep='\t')         # Spencer
train_args_df = pd.read_csv('/content/drive/MyDrive/csci5832_project/data/arguments-training.tsv', sep='\t') # Caroline
# view structure
train_args_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5220 entries, 0 to 5219
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Argument ID  5220 non-null   object
 1   Conclusion   5220 non-null   object
 2   Stance       5220 non-null   object
 3   Premise      5220 non-null   object
dtypes: object(4)
memory usage: 163.2+ KB


In [10]:
# training labels
#train_labs_df = pd.read_csv('/content/drive/MyDrive/nlp_sp/data/labels-training.tsv', sep='\t')         # Spencer
train_labs_df = pd.read_csv('/content/drive/MyDrive/csci5832_project/data/labels-training.tsv', sep='\t') # Caroline
# view structure
train_labs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5220 entries, 0 to 5219
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Argument ID                 5220 non-null   object
 1   Self-direction: thought     5220 non-null   int64 
 2   Self-direction: action      5220 non-null   int64 
 3   Stimulation                 5220 non-null   int64 
 4   Hedonism                    5220 non-null   int64 
 5   Achievement                 5220 non-null   int64 
 6   Power: dominance            5220 non-null   int64 
 7   Power: resources            5220 non-null   int64 
 8   Face                        5220 non-null   int64 
 9   Security: personal          5220 non-null   int64 
 10  Security: societal          5220 non-null   int64 
 11  Tradition                   5220 non-null   int64 
 12  Conformity: rules           5220 non-null   int64 
 13  Conformity: interpersonal   5220 non-null   int6

## Data Prep

In [11]:
# convert multiple label columns to one label list column
train_labs_df['labels'] = train_labs_df.loc[:, 'Self-direction: thought':'Universalism: objectivity'].values.tolist()

In [12]:
# label distribution for full training data
print('Self-direction: thought =', sum(train_labs_df['Self-direction: thought']))
print('Self-direction: action =', sum(train_labs_df['Self-direction: action']))
print('Stimulation =', sum(train_labs_df['Stimulation']))
print('Hedonism =', sum(train_labs_df['Hedonism']))
print('Achievement = ', sum(train_labs_df['Achievement']))
print('Power: dominance =', sum(train_labs_df['Power: dominance']))
print('Power: resources =', sum(train_labs_df['Power: resources']))
print('Face =', sum(train_labs_df['Face']))
print('Security: personal =', sum(train_labs_df['Security: personal']))
print('Security: societal =', sum(train_labs_df['Security: societal']))
print('Tradition =', sum(train_labs_df['Tradition']))
print('Conformity: rules =', sum(train_labs_df['Conformity: rules']))
print('Conformity: interpersonal =', sum(train_labs_df['Conformity: interpersonal']))
print('Humility =', sum(train_labs_df['Humility']))
print('Benevolence: caring =', sum(train_labs_df['Benevolence: caring']))
print('Benevolence: dependability =', sum(train_labs_df['Benevolence: dependability']))
print('Universalism: concern =', sum(train_labs_df['Universalism: concern']))
print('Universalism: nature =', sum(train_labs_df['Universalism: nature']))
print('Universalism: tolerance =', sum(train_labs_df['Universalism: tolerance']))
print('Universalism: objectivity =', sum(train_labs_df['Universalism: objectivity']))

print('\nTotal number of samples = ', len(train_labs_df))

Self-direction: thought = 913
Self-direction: action = 1332
Stimulation = 312
Hedonism = 202
Achievement =  1400
Power: dominance = 461
Power: resources = 566
Face = 374
Security: personal = 1961
Security: societal = 1627
Tradition = 598
Conformity: rules = 1222
Conformity: interpersonal = 217
Humility = 438
Benevolence: caring = 1500
Benevolence: dependability = 766
Universalism: concern = 1992
Universalism: nature = 358
Universalism: tolerance = 709
Universalism: objectivity = 937

Total number of samples =  5220


In [13]:
# combine dfs to add label list to data dictionary
train_merged_df = pd.merge(train_args_df, train_labs_df, on='Argument ID')
train_merged_df = train_merged_df.drop(columns=['Self-direction: thought',
                                                'Self-direction: action',
                                                'Stimulation',
                                                'Hedonism',
                                                'Achievement',
                                                'Power: dominance',
                                                'Power: resources',
                                                'Face',
                                                'Security: personal',
                                                'Security: societal',
                                                'Tradition',
                                                'Conformity: rules',
                                                'Conformity: interpersonal',
                                                'Humility',
                                                'Benevolence: caring',
                                                'Benevolence: dependability',
                                                'Universalism: concern',
                                                'Universalism: nature',
                                                'Universalism: tolerance',
                                                'Universalism: objectivity'])

In [14]:
# view structure
train_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5220 entries, 0 to 5219
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Argument ID  5220 non-null   object
 1   Conclusion   5220 non-null   object
 2   Stance       5220 non-null   object
 3   Premise      5220 non-null   object
 4   labels       5220 non-null   object
dtypes: object(5)
memory usage: 244.7+ KB


## Train/Val Split

In [15]:
# split train data into 80/20 train/val
train_data, val_data = train_test_split(train_merged_df, test_size=0.2, random_state=4)

In [16]:
# convert each row to a dictionary -> List[Dict]
train_data = train_data.to_dict(orient='records')
val_data = val_data.to_dict(orient='records')
full_data = train_merged_df.to_dict(orient='records')
# print examples
print('training example:\n', train_data[0])
print('validation example:\n', val_data[0])
print('full example:\n', full_data[0])

training example:
 {'Argument ID': 'A07017', 'Conclusion': 'Homeopathy brings more harm than good', 'Stance': 'against', 'Premise': 'homeopathy uses natural remedies that have little to no side affects on the body.', 'labels': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]}
validation example:
 {'Argument ID': 'A18174', 'Conclusion': 'The vow of celibacy should be abandoned', 'Stance': 'against', 'Premise': "the vow of celibacy should be promoted as it brings the sense of self control and purity to a person's soul.", 'labels': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0]}
full example:
 {'Argument ID': 'A01001', 'Conclusion': 'Entrapment should be legalized', 'Stance': 'in favor of', 'Premise': "if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?", 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [17]:
# Calculate the label density
numerator = 0
for sample in full_data:
  numerator += sum(sample['labels'])

density = numerator / (20 * len(full_data))

# Set the threshold
threshold = 2*density
#threshold = 0.5

print(threshold)

0.34262452107279695


## Tokenization

In [18]:
# function to load samples from HuggingFace dataset to be batched and encoded

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""
    """HuggingFace docs: https://huggingface.co/transformers/v3.0.2/preprocessing.html"""

    def __init__(self):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.hf_tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")
    
    # HuggingFace tokenizer will join data with sentence separator token
    # and match batches of tokenized and encoded sentences
    def get_sep_token(self,):
        return self.hf_tokenizer.sep_token

    # call method can only take a pair of inputs, but we have three
    # conclusion batch, stance batch, and premise batch
    # so we create a hack
    #def __call__(self, con_batch: List[str], stan_batch: List[str], prem_batch: List[str]) -> List[List[str]]:

    def __call__(self, con_stan_batch: List[str], prem_batch: List[str]) -> List[List[str]]:  
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # the two sentences deimited by the [SEP] token.
        enc = self.hf_tokenizer(
            con_stan_batch,
            prem_batch,
            #stan_batch,
            #prem_batch,
            padding=True,
            return_token_type_ids=False, # ignore with hack
            return_tensors='pt'
        )

        return enc

In [19]:
# define tokenizer
tokenizer = BatchTokenizer()

Downloading:   0%|          | 0.00/286 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [20]:
# example of use case for batch tokenizer without triplet hack (only two input types acceptable)
token_ex = tokenizer(*[['this is the conclusion with more words', 'this is also a conclusion'], ['this is the premise', 'this is the second premise']])
print(f"{token_ex}\n")
tokenizer.hf_tokenizer.batch_decode(token_ex['input_ids'])

{'input_ids': tensor([[  101,  2023,  2003,  1996,  7091,  2007,  2062,  2616,   102,  2023,
          2003,  1996, 18458,   102],
        [  101,  2023,  2003,  2036,  1037,  7091,   102,  2023,  2003,  1996,
          2117, 18458,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}



['[CLS] this is the conclusion with more words [SEP] this is the premise [SEP]',
 '[CLS] this is also a conclusion [SEP] this is the second premise [SEP] [PAD]']

In [21]:
# example of use case for batch tokenizer with triplet hack
token_ex2 = tokenizer(*[['this is the conclusion with more words [SEP] and a stance against', 'this is also a conclusion [SEP] with another stance that is in favor of'], ['this is the premise', 'this is the second premise']])
print(f"{token_ex2}\n")
tokenizer.hf_tokenizer.batch_decode(token_ex2['input_ids'])

{'input_ids': tensor([[  101,  2023,  2003,  1996,  7091,  2007,  2062,  2616,   102,  1998,
          1037, 11032,  2114,   102,  2023,  2003,  1996, 18458,   102,     0,
             0,     0],
        [  101,  2023,  2003,  2036,  1037,  7091,   102,  2007,  2178, 11032,
          2008,  2003,  1999,  5684,  1997,   102,  2023,  2003,  1996,  2117,
         18458,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}



['[CLS] this is the conclusion with more words [SEP] and a stance against [SEP] this is the premise [SEP] [PAD] [PAD] [PAD]',
 '[CLS] this is also a conclusion [SEP] with another stance that is in favor of [SEP] this is the second premise [SEP]']

## Batch

In [22]:
# function to generate triple-wise inputs

def generate_triplewise_input(dataset: List[Dict]) -> (List[str], List[str], List[str], List[str], List[List[int]]):
    """
    group all argument components and corresponding labels of the datapoints
    a datapoint is now a dictionary of 
    argument id, conclusion, stance, premise, and label list
    """

    # extract each observation from dictionary; save to list
    d_vals = []
    for i in range(len(dataset)):
        d_vals.append(list(dataset[i].values()))

    # store data items in lists by three categories by id
    id_lst = []    
    conclusion_lst = []
    stance_lst = []
    premise_lst = []

    # store labels in list of lists of 20 labels
    label_lst = []

    # generate separate lists from each observation
    for i in range(len(d_vals)):
        id_lst.append(d_vals[i][0])
        conclusion_lst.append(d_vals[i][1])
        stance_lst.append(d_vals[i][2])
        premise_lst.append(d_vals[i][3])
        label_lst.append(d_vals[i][4])

    # add [SEP] token before every stance in list
    stance_lst = [' [SEP] ' + s for s in stance_lst]

    return id_lst, conclusion_lst, stance_lst, premise_lst, label_lst

In [23]:
# apply function to generate triple-wise inputs and labels for batching

# training data
train_ids, train_conclusions, train_stances, train_premises, train_labels = generate_triplewise_input(train_data)

# validation data
val_ids, val_conclusions, val_stances, val_premises, val_labels = generate_triplewise_input(val_data)

# full data
full_ids, full_conclusions, full_stances, full_premises, full_labels = generate_triplewise_input(full_data)

In [24]:
# temporarily combine conclusions and stances separate with [SEP]
# use hack to merge tokenized conclusion batch, stance batch, and premise batch

# training data
train_conclusions_stances = []
for i in range(len(train_conclusions)):
  train_conclusions_stances.append(train_conclusions[i] + train_stances[i])

# validation data
val_conclusions_stances = []
for i in range(len(val_conclusions)):
  val_conclusions_stances.append(val_conclusions[i] + val_stances[i])

# full data
full_conclusions_stances = []
for i in range(len(full_conclusions)):
  full_conclusions_stances.append(full_conclusions[i] + full_stances[i])

In [25]:
# define functions to chunk data for batches

# for train labels
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i: i+n]

# for train features
def chunk_multi(lst1, lst2, n):
    for i in range(0, len(lst1), n):
        yield lst1[i: i+n], lst2[i: i+n]

In [26]:
# apply function to batch input data 
# tokenize and encode simultaneously since we are using HuggingFace

# batch
train_input_batches = [b for b in chunk_multi(train_conclusions_stances, train_premises, batch_size)]
val_size = 1
full_size = 1
val_input_batches = [b for b in chunk_multi(val_conclusions_stances, val_premises, val_size)]
full_input_batches = [b for b in chunk_multi(full_conclusions_stances, full_premises, full_size)]

# tokenize + encode
train_input_batches = [tokenizer(*batch).to(device) for batch in train_input_batches]
val_input_batches = [tokenizer(*batch).to(device) for batch in val_input_batches]
full_input_batches = [tokenizer(*batch).to(device) for batch in full_input_batches]

In [27]:
# check training data example
print(train_input_batches[0])
encoded_tst = tokenizer.hf_tokenizer.batch_decode(train_input_batches[0]['input_ids'])
encoded_tst[0]

{'input_ids': tensor([[  101,  2188, 29477,  ...,     0,     0,     0],
        [  101,  2057,  2323,  ...,     0,     0,     0],
        [  101,  2057,  2323,  ...,     0,     0,     0],
        ...,
        [  101,  2057,  2323,  ...,     0,     0,     0],
        [  101,  2057,  2323,  ...,     0,     0,     0],
        [  101,  2057,  2323,  ...,     0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}


'[CLS] homeopathy brings more harm than good [SEP] against [SEP] homeopathy uses natural remedies that have little to no side affects on the body. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [28]:
# define function to batch class labels
# a single observation's label is a list of 20 labels

def encode_labels(labels: List[List[int]]) -> torch.FloatTensor:
    """Turns the batch of labels into a tensor

    Args:
        labels (List[List[int]]): List of all lists of labels in batch

    Returns:
        torch.FloatTensor: Tensor of all lists of labels in batch
    """
    
    return torch.LongTensor(labels)

In [29]:
# apply function to batch labels in same order as inputs
# batch
train_label_batches = [b for b in chunk(train_labels, batch_size)]
val_label_batches = [b for b in chunk(val_labels, val_size)]
full_label_batches = [b for b in chunk(full_labels, full_size)]
# tokenize + encode
train_label_batches = [encode_labels(batch).to(device) for batch in train_label_batches]
val_label_batches = [encode_labels(batch).to(device) for batch in val_label_batches]
full_label_batches = [encode_labels(batch).to(device) for batch in full_label_batches]

# Model

Below is the code to define our model as well as the training loop.

## Functions to Make Predictions

In [30]:
def make_prediction(logits: torch.Tensor) -> torch.Tensor:
  # Use boolean logic to handle the predictions
  return (logits>threshold).float()

def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    logits = model(sents)
    return make_prediction(logits.cpu())

## Model Definition

In [64]:

# Function to initialize weights for the chain classifiers
def init_weights(layer):
    if isinstance(layer, nn.Linear):
        torch.nn.init.xavier_normal_(layer.weight)

class NLIClassifier(torch.nn.Module):
    def __init__(self, output_size: int, hidden_size: int, dropout_prob: float):
      
      # Basic initialization
      super().__init__()
      self.output_size = output_size
      self.hidden_size = hidden_size

      # Additional args
      self.dropout_prob = dropout_prob

      # Initialize BERT, which we use instead of a single embedding layer.
      self.bert = BertModel.from_pretrained("prajjwal1/bert-small").to(device)
      
      # Comment out these lines to unfreeze BERT params
      for param in self.bert.parameters():
          param.requires_grad = False
          
      # Get BERT's hiddem dim
      self.bert_hidden_dimension = self.bert.config.hidden_size
      
      
      # Single linear layer to project to hidden size
      self.hidden_layer = torch.nn.Linear(self.bert_hidden_dimension, self.hidden_size * 2).to(device)
      self.hidden_layer_2 = torch.nn.Linear(self.hidden_size * 2, self.hidden_size).to(device)
      #self.hidden_layer_3 = torch.nn.Linear(self.hidden_size * 2, self.hidden_size).to(device)   # remove third layer to re-run best model on test set
      
      # Use RELU regularization
      # TODO: Could try others
      self.relu = torch.nn.ReLU()

      '''

      We are doing multi-label classification using a chain classifier.
      For details, see: https://en.wikipedia.org/wiki/Multi-label_classification

      Setup a classifier chain for the 20 labels.
      To simplify code, just store them in a list and run through them sequentially.
      They will be interpreted in the same order as the training data:

      Self-direction: thought
      Self-direction: action
      Stimulation
      Hedonism
      Achievement
      Power: dominance
      Power: resources
      Face
      Security: personal
      Security: societal
      Tradition
      Conformity: rules
      Conformity: interpersonal
      Humility
      Benevolence: caring
      Benevolence: dependability
      Universalism: concern
      Universalism: nature
      Universalism: tolerance
      Universalism: objectivity

      '''

      self.chain = []
      for i in range(self.output_size):

        # To make it a chain, the prediction from the previous classifier is 
        # appended to the input and used as the input for the next classifier

        # Initialize each chain classifier
        # TODO: Try more layers per classifier
        # TODO: Could try bigger BERT model, but that would require more changes
        # TODO: Could unfreeze BERT weights
        
        # TODO: Hyperparameter tunings

        # TODO: Could also play with the threshold for prediction
        # and base it on label cardinality (i.e. average number of labels per sample)

        t = nn.Sequential(
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(in_features=self.hidden_size + i, out_features = 1),
            nn.Sigmoid()
        )
        self.chain.append(t.to(device))
        # Initialize the weights
        for c in self.chain:
          c.apply(init_weights)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Use BERT to create contextulized embeddings and get the output 
            from the pooling layer (i.e. embedding for CLR)

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: Encoding of CLR for the given input
        """

        # Run through BERT for contextualized embeddings
        encoded_sequence = self.bert(**symbols)
        # TODO: Get the [CLS] token using the `pooler_output` from 
        #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
        #      and check the returns for the forward method.
        # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
        
        # Pooler output is initially (batch_size, bert_hidden_dimension)
        pool_out = torch.unsqueeze(encoded_sequence['pooler_output'], dim=1)
        return pool_out

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """_summary_

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: _description_
        """
        encoded_sents = self.encode_text(symbols)
        output = self.hidden_layer(encoded_sents)
        output = self.relu(output)
        output = self.hidden_layer_2(output)
        output = self.relu(output)
        #output = self.hidden_layer_3(output)   # remove third layer to re-run best model on test set
        #output = self.relu(output)
        
        # output is of size (batch_size, hidden_layer)

        # Run through the classifier chain

        cur_input = output
        logits = []

        for classifier in self.chain:

          # Get output of next in chain
          o = classifier(cur_input)

          # Save the logits for training
          logits.append(o)

          # Make a prediction so we can append it to the next input
          # TODO: Could also append raw logits, potentially
          pred = make_prediction(o)

          # Append the previous prediction to the input for the next classifier
          cur_input = torch.cat([cur_input, pred], dim=2)

        # Preds contains 20 tensors, each batch_size x 1 x 1
        # We need to return one tensor that is 128 x 20
        stack = logits[0].squeeze(dim=1)
        for logit in logits[1:]:
          stack = torch.cat([stack, logit.squeeze(dim=1)], dim=-1)
        
        return stack

## Evaluation

### Metric Functions

In [60]:
def precision(predicted_labels, true_labels):
    """
    Precision is True Positives / All Positives Predictions
    """

    # Each pred/true pair is a list of 20 values, so need to go one level deeper

    all_pos = 0
    true_pos = 0
    for i in range(len(predicted_labels)):
      cur_pred = predicted_labels[i]
      cur_true = true_labels[i]

      # Count both true_pos and false_pos
      all_pos += sum(cur_pred)

      # Get true_pos only
      for j in range(len(cur_pred)):
        if (cur_pred[j] == 1 and cur_pred[j] == cur_true[j]):
          true_pos += 1

    if all_pos:
        return true_pos/all_pos   
    else:
        return 0.


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall is True Positives / All Positive Labels
    """

    false_neg = 0
    true_pos = 0
    for i in range(len(predicted_labels)):
      cur_pred = predicted_labels[i]
      cur_true = true_labels[i]
    
      for j in range(len(cur_pred)):
        # Get true_pos
        if (cur_pred[j] == 1 and cur_pred[j] == cur_true[j]):
          true_pos += 1

        # Get false_neg
        if (cur_pred[j] == 0 and cur_true[j] == 1):
          false_neg += 1
      
    denom = false_neg + true_pos
    if denom:
        return true_pos/denom
    else:
        return 0.

def f1_score(
    predicted_labels: List[int],
    true_labels: List[int]
):
    """
    F1 score is the harmonic mean of precision and recall
    """
    P = precision(predicted_labels, true_labels)
    R = recall(predicted_labels, true_labels)
    if P and R:
        return 2*P*R/(P+R)
    else:
        return 0.


## Training Loop

In [65]:
def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model,
    possible_labels
):
    print("Training...")
    dev_f1_scores = []
    #loss_func = torch.nn.BCEWithLogitsLoss()
    loss_func = torch.nn.BCELoss()

    # Send the data to the device first
    #train_features = train_features.to(device)
    #train_labels = train_labels.to(device)
    #dev_features = dev_features.to(device)
    #dev_labels = dev_labels.to(device)

    batches = list(zip(train_features, train_labels))
    random.shuffle(batches)
    for i in range(num_epochs):
        losses = []
        for features, labels in tqdm(batches):
            # Empty the dynamic computation graph
            optimizer.zero_grad()
            preds = model(features)
            loss = loss_func(preds, labels.float())
            
            # Backpropogate the loss through our model
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
        print(f"epoch {i}, loss: {sum(losses)/len(losses)}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        for sents, labels in tqdm(zip(dev_features, dev_labels), total=len(dev_features)):
            pred = predict(model, sents)
            all_preds.extend(pred.cpu().detach().numpy())
            all_labels.extend(list(labels.cpu().numpy()))
        dev_f1 = f1_score(all_preds, all_labels)
        print(f"Dev F1 {dev_f1}")
        dev_f1_scores.append(dev_f1)

    # Print the best dev_f1 score for result reporting
    print(f"Best dev F1 score: {np.max(dev_f1_scores)}")
    print(f"Best iteration: {np.argmax(dev_f1_scores)}")
    
    # Return the trained model
    return model

# Training Phase

## Setup

In [66]:
# Number of labels (should be 20)
possible_labels = len(train_labels[0])
if possible_labels != 20:
  raise RuntimeError(f"Instead of 20 possible labels, we found {possible_labels}.")

# Intialize model
model = NLIClassifier(output_size=possible_labels, hidden_size=hidden_size, dropout_prob=dropout_prob)
model.train()

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Train the Model

In [67]:
# Start the training
trained_model = training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    val_input_batches,
    val_label_batches,
    optimizer,
    model,
    list(range(possible_labels))
)


Training...


100%|██████████| 131/131 [00:02<00:00, 49.16it/s]


epoch 0, loss: 0.40560710452895127
Evaluating dev...


100%|██████████| 1044/1044 [00:09<00:00, 105.55it/s]


Dev F1 0.4495399858457183


100%|██████████| 131/131 [00:02<00:00, 49.00it/s]


epoch 1, loss: 0.37753565757329227
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.01it/s]


Dev F1 0.4765911249830371


100%|██████████| 131/131 [00:02<00:00, 49.29it/s]


epoch 2, loss: 0.368096781368474
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.26it/s]


Dev F1 0.4800114629603095


100%|██████████| 131/131 [00:02<00:00, 48.56it/s]


epoch 3, loss: 0.3624258926351562
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.92it/s]


Dev F1 0.48458023986293547


100%|██████████| 131/131 [00:02<00:00, 48.58it/s]


epoch 4, loss: 0.35787565812809774
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.44it/s]


Dev F1 0.4801161103047895


100%|██████████| 131/131 [00:02<00:00, 49.63it/s]


epoch 5, loss: 0.35143334492472295
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.08it/s]


Dev F1 0.4883124717237219


100%|██████████| 131/131 [00:02<00:00, 49.12it/s]


epoch 6, loss: 0.3474357933488511
Evaluating dev...


100%|██████████| 1044/1044 [00:09<00:00, 108.08it/s]


Dev F1 0.5044545056229005


100%|██████████| 131/131 [00:02<00:00, 49.29it/s]


epoch 7, loss: 0.34419249760285586
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 140.31it/s]


Dev F1 0.5019897669130189


100%|██████████| 131/131 [00:02<00:00, 49.65it/s]


epoch 8, loss: 0.3403978855100297
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.63it/s]


Dev F1 0.50787230969233


100%|██████████| 131/131 [00:02<00:00, 49.61it/s]


epoch 9, loss: 0.33803339109165975
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 139.73it/s]


Dev F1 0.5110437418796017


100%|██████████| 131/131 [00:02<00:00, 49.09it/s]


epoch 10, loss: 0.33496619203618466
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.82it/s]


Dev F1 0.5015553251370167


100%|██████████| 131/131 [00:02<00:00, 48.96it/s]


epoch 11, loss: 0.33268041624367695
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.01it/s]


Dev F1 0.5204383886255924


100%|██████████| 131/131 [00:02<00:00, 49.09it/s]


epoch 12, loss: 0.33031144738197327
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.52it/s]


Dev F1 0.5156845499853414


100%|██████████| 131/131 [00:02<00:00, 49.50it/s]


epoch 13, loss: 0.32800990001845903
Evaluating dev...


100%|██████████| 1044/1044 [00:14<00:00, 73.74it/s]


Dev F1 0.5172870383992938


100%|██████████| 131/131 [00:02<00:00, 44.74it/s]


epoch 14, loss: 0.32540931683460267
Evaluating dev...


100%|██████████| 1044/1044 [00:12<00:00, 83.26it/s]


Dev F1 0.5280480824270177


100%|██████████| 131/131 [00:02<00:00, 47.77it/s]


epoch 15, loss: 0.32349497557596396
Evaluating dev...


100%|██████████| 1044/1044 [00:08<00:00, 128.97it/s]


Dev F1 0.5275966970882224


100%|██████████| 131/131 [00:02<00:00, 46.19it/s]


epoch 16, loss: 0.32075449687834007
Evaluating dev...


100%|██████████| 1044/1044 [00:12<00:00, 85.48it/s]


Dev F1 0.5264394829612221


100%|██████████| 131/131 [00:02<00:00, 47.08it/s]


epoch 17, loss: 0.31929632452608064
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 139.23it/s]


Dev F1 0.5159512761020881


100%|██████████| 131/131 [00:02<00:00, 44.58it/s]


epoch 18, loss: 0.31806326117224365
Evaluating dev...


100%|██████████| 1044/1044 [00:08<00:00, 121.70it/s]


Dev F1 0.5231560891938252


100%|██████████| 131/131 [00:02<00:00, 49.54it/s]


epoch 19, loss: 0.3154679716543387
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.77it/s]


Dev F1 0.525392029923752


100%|██████████| 131/131 [00:02<00:00, 49.39it/s]


epoch 20, loss: 0.31268262226162974
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.99it/s]


Dev F1 0.5279837185637447


100%|██████████| 131/131 [00:02<00:00, 49.14it/s]


epoch 21, loss: 0.31102725544958626
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.58it/s]


Dev F1 0.5282310469314079


100%|██████████| 131/131 [00:02<00:00, 49.69it/s]


epoch 22, loss: 0.3093476504769944
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.94it/s]


Dev F1 0.5290790427846266


100%|██████████| 131/131 [00:02<00:00, 49.02it/s]


epoch 23, loss: 0.3080750533187662
Evaluating dev...


100%|██████████| 1044/1044 [00:08<00:00, 128.49it/s]


Dev F1 0.5295051219160294


100%|██████████| 131/131 [00:02<00:00, 49.15it/s]


epoch 24, loss: 0.30607493619882425
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.35it/s]


Dev F1 0.5331212033555106


100%|██████████| 131/131 [00:02<00:00, 49.28it/s]


epoch 25, loss: 0.30439757083663505
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.81it/s]


Dev F1 0.5257657134562346


100%|██████████| 131/131 [00:02<00:00, 49.43it/s]


epoch 26, loss: 0.3042689475394387
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.47it/s]


Dev F1 0.5340876124165943


100%|██████████| 131/131 [00:02<00:00, 49.28it/s]


epoch 27, loss: 0.3025742640704599
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.33it/s]


Dev F1 0.527721335268505


100%|██████████| 131/131 [00:02<00:00, 49.13it/s]


epoch 28, loss: 0.3019847388713414
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.55it/s]


Dev F1 0.5384833050367854


100%|██████████| 131/131 [00:02<00:00, 48.83it/s]


epoch 29, loss: 0.2989337486392669
Evaluating dev...


100%|██████████| 1044/1044 [00:09<00:00, 104.61it/s]


Dev F1 0.5287950596007468


100%|██████████| 131/131 [00:02<00:00, 49.52it/s]


epoch 30, loss: 0.29796922423002375
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.72it/s]


Dev F1 0.5225881667152433


100%|██████████| 131/131 [00:02<00:00, 49.36it/s]


epoch 31, loss: 0.2973361198456233
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.13it/s]


Dev F1 0.5378421900161031


100%|██████████| 131/131 [00:02<00:00, 49.10it/s]


epoch 32, loss: 0.29502782225608826
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.17it/s]


Dev F1 0.5361907465228498


100%|██████████| 131/131 [00:02<00:00, 49.57it/s]


epoch 33, loss: 0.2936614874665064
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.07it/s]


Dev F1 0.5269972046491098


100%|██████████| 131/131 [00:02<00:00, 49.48it/s]


epoch 34, loss: 0.2946456729910756
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.94it/s]


Dev F1 0.5367421475529585


100%|██████████| 131/131 [00:02<00:00, 49.64it/s]


epoch 35, loss: 0.29255409265747506
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.35it/s]


Dev F1 0.5138316656856975


100%|██████████| 131/131 [00:02<00:00, 49.22it/s]


epoch 36, loss: 0.29058370471910666
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.85it/s]


Dev F1 0.5234741784037559


100%|██████████| 131/131 [00:02<00:00, 49.40it/s]


epoch 37, loss: 0.29061904981846115
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.98it/s]


Dev F1 0.535791444620481


100%|██████████| 131/131 [00:02<00:00, 48.93it/s]


epoch 38, loss: 0.28791099638884304
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.27it/s]


Dev F1 0.5369798559035436


100%|██████████| 131/131 [00:02<00:00, 49.19it/s]


epoch 39, loss: 0.2864843521636861
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.57it/s]


Dev F1 0.5341023792357605


100%|██████████| 131/131 [00:02<00:00, 49.37it/s]


epoch 40, loss: 0.2864495485111047
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.89it/s]


Dev F1 0.5283342873497424


100%|██████████| 131/131 [00:02<00:00, 49.48it/s]


epoch 41, loss: 0.2842479317470361
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.60it/s]


Dev F1 0.5270496374790854


100%|██████████| 131/131 [00:02<00:00, 49.10it/s]


epoch 42, loss: 0.2831202111853898
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.09it/s]


Dev F1 0.5260942760942761


100%|██████████| 131/131 [00:02<00:00, 49.50it/s]


epoch 43, loss: 0.2829751547511297
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.68it/s]


Dev F1 0.5328218243819267


100%|██████████| 131/131 [00:02<00:00, 49.49it/s]


epoch 44, loss: 0.2796725266762362
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.30it/s]


Dev F1 0.5462937062937063


100%|██████████| 131/131 [00:02<00:00, 49.37it/s]


epoch 45, loss: 0.28010323529935066
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.10it/s]


Dev F1 0.5440890125173853


100%|██████████| 131/131 [00:02<00:00, 48.98it/s]


epoch 46, loss: 0.27737996334793
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.32it/s]


Dev F1 0.5411344595522594


100%|██████████| 131/131 [00:02<00:00, 48.90it/s]


epoch 47, loss: 0.2784158362232092
Evaluating dev...


100%|██████████| 1044/1044 [00:08<00:00, 117.55it/s]


Dev F1 0.5280835450183461


100%|██████████| 131/131 [00:02<00:00, 49.68it/s]


epoch 48, loss: 0.2746510966361024
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.68it/s]


Dev F1 0.5344779839379673


100%|██████████| 131/131 [00:02<00:00, 49.16it/s]


epoch 49, loss: 0.2752235860542487
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.82it/s]


Dev F1 0.5328729281767955


100%|██████████| 131/131 [00:02<00:00, 49.06it/s]


epoch 50, loss: 0.2738554916764034
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.00it/s]


Dev F1 0.5303974518764715


100%|██████████| 131/131 [00:02<00:00, 49.73it/s]


epoch 51, loss: 0.2726365141286195
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.89it/s]


Dev F1 0.5278206026629292


100%|██████████| 131/131 [00:02<00:00, 49.34it/s]


epoch 52, loss: 0.2726887456560863
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.19it/s]


Dev F1 0.5306974801614924


100%|██████████| 131/131 [00:02<00:00, 49.42it/s]


epoch 53, loss: 0.2719779138574163
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.26it/s]


Dev F1 0.5289814293753516


100%|██████████| 131/131 [00:02<00:00, 49.44it/s]


epoch 54, loss: 0.27010609471160946
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.38it/s]


Dev F1 0.5405405405405407


100%|██████████| 131/131 [00:02<00:00, 49.53it/s]


epoch 55, loss: 0.26864902766151283
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.19it/s]


Dev F1 0.5331521739130435


100%|██████████| 131/131 [00:02<00:00, 49.94it/s]


epoch 56, loss: 0.26915672011957825
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.47it/s]


Dev F1 0.5404442007129148


100%|██████████| 131/131 [00:02<00:00, 50.61it/s]


epoch 57, loss: 0.2676643316299861
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.36it/s]


Dev F1 0.5455547898001378


100%|██████████| 131/131 [00:02<00:00, 49.80it/s]


epoch 58, loss: 0.26754349686262263
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.38it/s]


Dev F1 0.5340183282421549


100%|██████████| 131/131 [00:02<00:00, 49.76it/s]


epoch 59, loss: 0.26475247388122647
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.59it/s]


Dev F1 0.5321792545023034


100%|██████████| 131/131 [00:02<00:00, 50.67it/s]


epoch 60, loss: 0.2640502269713933
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.13it/s]


Dev F1 0.5464359861591697


100%|██████████| 131/131 [00:02<00:00, 50.98it/s]


epoch 61, loss: 0.2646400406387926
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.27it/s]


Dev F1 0.532372105919867


100%|██████████| 131/131 [00:02<00:00, 51.85it/s]


epoch 62, loss: 0.26316665385970633
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.00it/s]


Dev F1 0.5368796270907595


100%|██████████| 131/131 [00:02<00:00, 51.26it/s]


epoch 63, loss: 0.2620168335110177
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.28it/s]


Dev F1 0.5363738108561835


100%|██████████| 131/131 [00:02<00:00, 50.97it/s]


epoch 64, loss: 0.2604828050345865
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 133.68it/s]


Dev F1 0.5362863908622371


100%|██████████| 131/131 [00:02<00:00, 48.76it/s]


epoch 65, loss: 0.2603610217116261
Evaluating dev...


100%|██████████| 1044/1044 [00:08<00:00, 128.91it/s]


Dev F1 0.5345911949685535


100%|██████████| 131/131 [00:02<00:00, 50.91it/s]


epoch 66, loss: 0.2585371191265019
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.15it/s]


Dev F1 0.5344052323204797


100%|██████████| 131/131 [00:02<00:00, 50.66it/s]


epoch 67, loss: 0.2585865525343946
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.25it/s]


Dev F1 0.5280244173140956


100%|██████████| 131/131 [00:02<00:00, 49.71it/s]


epoch 68, loss: 0.25665913908991195
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.19it/s]


Dev F1 0.5360998177484929


100%|██████████| 131/131 [00:02<00:00, 49.70it/s]


epoch 69, loss: 0.25637710265075886
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.27it/s]


Dev F1 0.5359477124183005


100%|██████████| 131/131 [00:02<00:00, 48.93it/s]


epoch 70, loss: 0.2547734011220568
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.74it/s]


Dev F1 0.5391186440677965


100%|██████████| 131/131 [00:02<00:00, 49.56it/s]


epoch 71, loss: 0.2536613512357683
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.36it/s]


Dev F1 0.5398585886593651


100%|██████████| 131/131 [00:02<00:00, 49.65it/s]


epoch 72, loss: 0.2524921887248527
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 139.12it/s]


Dev F1 0.538288920056101


100%|██████████| 131/131 [00:02<00:00, 49.89it/s]


epoch 73, loss: 0.25216862775442256
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.38it/s]


Dev F1 0.5286222952615722


100%|██████████| 131/131 [00:02<00:00, 49.36it/s]


epoch 74, loss: 0.25111634176196035
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 139.13it/s]


Dev F1 0.5353424657534246


100%|██████████| 131/131 [00:02<00:00, 49.06it/s]


epoch 75, loss: 0.25010286840318724
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.25it/s]


Dev F1 0.5276193126478363


100%|██████████| 131/131 [00:02<00:00, 49.40it/s]


epoch 76, loss: 0.2504102051940583
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.12it/s]


Dev F1 0.5378966329499792


100%|██████████| 131/131 [00:02<00:00, 49.20it/s]


epoch 77, loss: 0.24967444511770293
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.20it/s]


Dev F1 0.5267943242870918


100%|██████████| 131/131 [00:02<00:00, 48.67it/s]


epoch 78, loss: 0.24657730030194494
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.54it/s]


Dev F1 0.5327631397434129


100%|██████████| 131/131 [00:02<00:00, 48.68it/s]


epoch 79, loss: 0.24646254759708433
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.88it/s]


Dev F1 0.5390991986736667


100%|██████████| 131/131 [00:02<00:00, 50.25it/s]


epoch 80, loss: 0.24684777389500887
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.48it/s]


Dev F1 0.5361690219672589


100%|██████████| 131/131 [00:02<00:00, 51.08it/s]


epoch 81, loss: 0.24400115832117678
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.83it/s]


Dev F1 0.5320495185694635


100%|██████████| 131/131 [00:02<00:00, 51.09it/s]


epoch 82, loss: 0.2449614379469675
Evaluating dev...


100%|██████████| 1044/1044 [00:08<00:00, 127.86it/s]


Dev F1 0.5370629370629371


100%|██████████| 131/131 [00:02<00:00, 48.53it/s]


epoch 83, loss: 0.24503195058298474
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.70it/s]


Dev F1 0.5312327252625759


100%|██████████| 131/131 [00:02<00:00, 50.69it/s]


epoch 84, loss: 0.24366267446343226
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.16it/s]


Dev F1 0.5249366018596787


100%|██████████| 131/131 [00:02<00:00, 50.13it/s]


epoch 85, loss: 0.24044403342800286
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 133.71it/s]


Dev F1 0.5250484362026017


100%|██████████| 131/131 [00:02<00:00, 50.23it/s]


epoch 86, loss: 0.24333297319084635
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.01it/s]


Dev F1 0.5291970802919708


100%|██████████| 131/131 [00:02<00:00, 49.88it/s]


epoch 87, loss: 0.24105954340851035
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.90it/s]


Dev F1 0.5255813953488372


100%|██████████| 131/131 [00:02<00:00, 49.96it/s]


epoch 88, loss: 0.2405805630993297
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.25it/s]


Dev F1 0.5293220573963676


100%|██████████| 131/131 [00:02<00:00, 50.16it/s]


epoch 89, loss: 0.23888123024055977
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.14it/s]


Dev F1 0.5313714602998334


100%|██████████| 131/131 [00:02<00:00, 49.81it/s]


epoch 90, loss: 0.23713482302108793
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.73it/s]


Dev F1 0.5285441136301557


100%|██████████| 131/131 [00:02<00:00, 49.60it/s]


epoch 91, loss: 0.23840881645224477
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 138.16it/s]


Dev F1 0.5277324632952692


100%|██████████| 131/131 [00:02<00:00, 48.98it/s]


epoch 92, loss: 0.23730165528432104
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 136.80it/s]


Dev F1 0.5251916757940854


100%|██████████| 131/131 [00:02<00:00, 48.78it/s]


epoch 93, loss: 0.2354684129698586
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.30it/s]


Dev F1 0.5229615745079662


100%|██████████| 131/131 [00:02<00:00, 48.94it/s]


epoch 94, loss: 0.2356778213768515
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 135.19it/s]


Dev F1 0.5236373748609566


100%|██████████| 131/131 [00:02<00:00, 48.93it/s]


epoch 95, loss: 0.2349689028417791
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.21it/s]


Dev F1 0.5245407183986838


100%|██████████| 131/131 [00:02<00:00, 49.89it/s]


epoch 96, loss: 0.23544567846159897
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.51it/s]


Dev F1 0.5220628792057362


100%|██████████| 131/131 [00:02<00:00, 50.32it/s]


epoch 97, loss: 0.23479333155937776
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 137.03it/s]


Dev F1 0.5301401551231459


100%|██████████| 131/131 [00:02<00:00, 49.89it/s]


epoch 98, loss: 0.23353693662708952
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.57it/s]


Dev F1 0.5195132186319765


100%|██████████| 131/131 [00:02<00:00, 49.66it/s]


epoch 99, loss: 0.23118160721909908
Evaluating dev...


100%|██████████| 1044/1044 [00:07<00:00, 134.88it/s]


Dev F1 0.5362673186634068
Best dev F1 score: 0.5464359861591697
Best iteration: 60


## Run Model on Entire Dataset for Evaluation

In [68]:
# Set up our output DataFrame
cols = [c for c in train_labs_df.columns if c != 'labels']
out_df = pd.DataFrame(columns=cols)

# Set the Arg ID as the index for easy access
out_df['Argument ID'] = full_ids
out_df.set_index('Argument ID', inplace = True)

# Put the model in evaluation mode
model.eval()

for sents, id in tqdm(zip(full_input_batches, full_ids), total=len(full_input_batches)):
  # Get our prediction
  pred = predict(model, sents).cpu().detach().numpy()[0]

  # Add it to the output DataFrame
  out_df.loc[id] = pred

# Print out for error checking
out_df.info()

100%|██████████| 5220/5220 [01:13<00:00, 71.09it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 5220 entries, A01001 to D27100
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Self-direction: thought     5220 non-null   object
 1   Self-direction: action      5220 non-null   object
 2   Stimulation                 5220 non-null   object
 3   Hedonism                    5220 non-null   object
 4   Achievement                 5220 non-null   object
 5   Power: dominance            5220 non-null   object
 6   Power: resources            5220 non-null   object
 7   Face                        5220 non-null   object
 8   Security: personal          5220 non-null   object
 9   Security: societal          5220 non-null   object
 10  Tradition                   5220 non-null   object
 11  Conformity: rules           5220 non-null   object
 12  Conformity: interpersonal   5220 non-null   object
 13  Humility                    5220 non-null   ob




In [69]:
# Write the output to a TSV
out_df = out_df.astype(int)
#out_df.to_csv('/content/drive/MyDrive/nlp_sp/data/model-preds.tsv', sep="\t")               # Spencer
out_df.to_csv('/content/drive/MyDrive/csci5832_project/results/model-preds-best-train-v2.tsv', sep="\t")   # Caroline
print("Finished run!")


Finished run!


# Test phase

## Test set data preprocessing

### Load data

In [70]:
# test arguments
#train_args_df = pd.read_csv('/content/drive/MyDrive/nlp_sp/data/arguments-test.tsv', sep='\t')         # Spencer
test_args_df = pd.read_csv('/content/drive/MyDrive/csci5832_project/data/arguments-test.tsv', sep='\t') # Caroline
# view structure
test_args_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576 entries, 0 to 1575
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Argument ID  1576 non-null   object
 1   Conclusion   1576 non-null   object
 2   Stance       1576 non-null   object
 3   Premise      1576 non-null   object
dtypes: object(4)
memory usage: 49.4+ KB


### Data prep

In [71]:
# convert each row to a dictionary -> List[Dict]
test_data = test_args_df.to_dict(orient='records')
# print examples
print('test example:\n', test_data[0])

test example:
 {'Argument ID': 'A26004', 'Conclusion': 'We should end affirmative action', 'Stance': 'against', 'Premise': 'affirmative action helps with employment equity.'}


### Tokenization

In [72]:
# function to load samples from HuggingFace dataset to be batched and encoded
# identically defined for train/dev section but here again for ease of use

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""
    """HuggingFace docs: https://huggingface.co/transformers/v3.0.2/preprocessing.html"""

    def __init__(self):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.hf_tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")
    
    # HuggingFace tokenizer will join data with sentence separator token
    # and match batches of tokenized and encoded sentences
    def get_sep_token(self,):
        return self.hf_tokenizer.sep_token

    # call method can only take a pair of inputs, but we have three
    # conclusion batch, stance batch, and premise batch
    # so we create a hack
    #def __call__(self, con_batch: List[str], stan_batch: List[str], prem_batch: List[str]) -> List[List[str]]:

    def __call__(self, con_stan_batch: List[str], prem_batch: List[str]) -> List[List[str]]:  
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # the two sentences deimited by the [SEP] token.
        enc = self.hf_tokenizer(
            con_stan_batch,
            prem_batch,
            #stan_batch,
            #prem_batch,
            padding=True,
            return_token_type_ids=False, # ignore with hack
            return_tensors='pt'
        )

        return enc

In [73]:
# define tokenizer
tokenizer = BatchTokenizer()

### "Batch"

In [74]:
# redefine another function to generate triple-wise inputs (test data w/o labels)

def generate_triplewise_input_test(dataset: List[Dict]) -> (List[str], List[str], List[str], List[str]):
    """
    group all argument components
    a datapoint is now a dictionary of 
    argument id, conclusion, stance, premise
    """

    # extract each observation from dictionary; save to list
    d_vals = []
    for i in range(len(dataset)):
        d_vals.append(list(dataset[i].values()))

    # store data items in lists by three categories by id
    id_lst = []    
    conclusion_lst = []
    stance_lst = []
    premise_lst = []

    # generate separate lists from each observation
    for i in range(len(d_vals)):
        id_lst.append(d_vals[i][0])
        conclusion_lst.append(d_vals[i][1])
        stance_lst.append(d_vals[i][2])
        premise_lst.append(d_vals[i][3])

    # add [SEP] token before every stance in list
    stance_lst = [' [SEP] ' + s for s in stance_lst]

    return id_lst, conclusion_lst, stance_lst, premise_lst

In [75]:
# apply function to generate triple-wise inputs and labels for batching

# test data
test_ids, test_conclusions, test_stances, test_premises = generate_triplewise_input_test(test_data)

In [76]:
# temporarily combine conclusions and stances separate with [SEP]
# use hack to merge tokenized conclusion batch, stance batch, and premise batch

# test data
test_conclusions_stances = []
for i in range(len(test_conclusions)):
  test_conclusions_stances.append(test_conclusions[i] + test_stances[i])

In [77]:
# define functions to chunk data for batches 
# identically defined for train/dev section but here again for ease of use

# for train labels
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i: i+n]

# for train features
def chunk_multi(lst1, lst2, n):
    for i in range(0, len(lst1), n):
        yield lst1[i: i+n], lst2[i: i+n]

In [78]:
# apply function to batch input data 
# tokenize and encode simultaneously since we are using HuggingFace

# single "batch"
test_size = 1
test_input_batches = [b for b in chunk_multi(test_conclusions_stances, test_premises, test_size)]

# tokenize + encode
test_input_batches = [tokenizer(*batch).to(device) for batch in test_input_batches]

In [79]:
# check test data example
print(test_input_batches[0])
encoded_test_tst = tokenizer.hf_tokenizer.batch_decode(test_input_batches[0]['input_ids'])
encoded_test_tst[0]

{'input_ids': tensor([[  101,  2057,  2323,  2203, 27352,  2895,   102,  2114,   102, 27352,
          2895,  7126,  2007,  6107, 10067,  1012,   102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


'[CLS] we should end affirmative action [SEP] against [SEP] affirmative action helps with employment equity. [SEP]'

## Test set predictions 
Obtain the predictions on the test set using the trained model 

In [80]:
# Set up our output DataFrame
cols = [c for c in train_labs_df.columns if c != 'labels']
out_df = pd.DataFrame(columns=cols)

# Set the Arg ID as the index for easy access
out_df['Argument ID'] = test_ids
out_df.set_index('Argument ID', inplace = True)

# Put the model in evaluation mode
model.eval()

for sents, id in tqdm(zip(test_input_batches, test_ids), total=len(test_input_batches)):
  # Get our prediction
  pred = predict(model, sents).cpu().detach().numpy()[0]

  # Add it to the output DataFrame
  out_df.loc[id] = pred

# Print out for error checking
out_df.info()

100%|██████████| 1576/1576 [00:16<00:00, 97.56it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 1576 entries, A26004 to E08023
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Self-direction: thought     1576 non-null   object
 1   Self-direction: action      1576 non-null   object
 2   Stimulation                 1576 non-null   object
 3   Hedonism                    1576 non-null   object
 4   Achievement                 1576 non-null   object
 5   Power: dominance            1576 non-null   object
 6   Power: resources            1576 non-null   object
 7   Face                        1576 non-null   object
 8   Security: personal          1576 non-null   object
 9   Security: societal          1576 non-null   object
 10  Tradition                   1576 non-null   object
 11  Conformity: rules           1576 non-null   object
 12  Conformity: interpersonal   1576 non-null   object
 13  Humility                    1576 non-null   ob




In [81]:
# Write the output to a TSV
out_df = out_df.astype(int)
#out_df.to_csv('/content/drive/MyDrive/nlp_sp/data/model-preds-test.tsv', sep="\t")               # Spencer
out_df.to_csv('/content/drive/MyDrive/csci5832_project/results/model-preds-test.tsv', sep="\t")   # Caroline
print("Finished run!")

Finished run!
