---
# **Dataset Preprocessing**
---

In [1]:
# Preprocessing Imports

import pandas as pd
import os

In [2]:
!nvidia-smi

Thu May  4 19:09:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
BASE_DIR = '/content/drive/MyDrive/'
PROJECT_DIR = os.path.join(BASE_DIR, 'project')


In [4]:
# Load Datasets and rename columns

dataset_df = pd.read_csv(os.path.join(PROJECT_DIR, 'vuamc_corpus_train.csv'))
verb_pos_df = pd.read_csv(os.path.join(PROJECT_DIR, 'verb_tokens.csv'), header=None)
verb_pos_df = verb_pos_df.rename({0:'TextID_SentID_Index', 1:'Label'}, axis = 1)
dataset_df.head()

Unnamed: 0,txt_id,sentence_id,sentence_txt
0,a1e-fragment01,1,Latest corporate unbundler M_reveals laid-back...
1,a1e-fragment01,2,By FRANK KANE
2,a1e-fragment01,3,"IT SEEMS that Roland Franklin , the latest unb..."
3,a1e-fragment01,4,He has not properly investigated the M_target ...
4,a1e-fragment01,5,The 63-year-old M_head of Pembridge Investment...


In [5]:
# Process verb dataframe

verb_pos_df['split_data'] = verb_pos_df.apply(lambda row: row['TextID_SentID_Index'].split('_'), axis = 1)
verb_pos_df[['TextID','SentID','Index']] = pd.DataFrame(verb_pos_df['split_data'].tolist(), index= verb_pos_df.index)
verb_pos_df.head()

Unnamed: 0,TextID_SentID_Index,Label,split_data,TextID,SentID,Index
0,a1h-fragment06_117_32,1,"[a1h-fragment06, 117, 32]",a1h-fragment06,117,32
1,a1h-fragment06_118_4,0,"[a1h-fragment06, 118, 4]",a1h-fragment06,118,4
2,a1h-fragment06_118_10,1,"[a1h-fragment06, 118, 10]",a1h-fragment06,118,10
3,a1h-fragment06_118_18,1,"[a1h-fragment06, 118, 18]",a1h-fragment06,118,18
4,a1h-fragment06_118_22,1,"[a1h-fragment06, 118, 22]",a1h-fragment06,118,22


In [6]:
# Function that tokenizes the sentences and create labels for the sequence labelling task

def tokenize_and_label(row):

    txt_id, sent_id, sent = row['txt_id'], row['sentence_id'], row['sentence_txt']

    try:
        tokenized_txt = str(sent).lower().split(' ')
    except:
        print(txt_id, sent_id, sent)


    m_idx_label = verb_pos_df[(verb_pos_df['TextID'] == txt_id) & (verb_pos_df['SentID'] == sent_id)].reset_index(drop = True)

    labels = [0] * len(tokenized_txt)

    for idx, r in m_idx_label.iterrows():
        labels[int(r['Index'])-1] = int(r['Label'])
    
    row['tokenized_txt'] = tokenized_txt
    row['labels'] = labels
    return row

In [7]:
# Apply dataset preprocessing function to Dataset

tokenize_df = dataset_df.apply(tokenize_and_label, axis = 1)
tokenize_df.head()

Unnamed: 0,txt_id,sentence_id,sentence_txt,tokenized_txt,labels
0,a1e-fragment01,1,Latest corporate unbundler M_reveals laid-back...,"[latest, corporate, unbundler, m_reveals, laid...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,a1e-fragment01,2,By FRANK KANE,"[by, frank, kane]","[0, 0, 0]"
2,a1e-fragment01,3,"IT SEEMS that Roland Franklin , the latest unb...","[it, seems, that, roland, franklin, ,, the, la...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,a1e-fragment01,4,He has not properly investigated the M_target ...,"[he, has, not, properly, investigated, the, m_...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,a1e-fragment01,5,The 63-year-old M_head of Pembridge Investment...,"[the, 63-year-old, m_head, of, pembridge, inve...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [8]:
# Drop Nulls

tokenize_df = tokenize_df.dropna(how = 'any')

In [9]:
# Convert to CSV for future use

tokenize_df.to_csv('vua_corpus_sequence.csv')

In [10]:
# Extract necessary columns

final_dataset = tokenize_df[['tokenized_txt','labels']]
final_dataset.head()

Unnamed: 0,tokenized_txt,labels
0,"[latest, corporate, unbundler, m_reveals, laid...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,"[by, frank, kane]","[0, 0, 0]"
2,"[it, seems, that, roland, franklin, ,, the, la...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[he, has, not, properly, investigated, the, m_...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[the, 63-year-old, m_head, of, pembridge, inve...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [11]:
# Generate dataset statistics

all_tokens = []
def all_tokens_fn(row):
    tokens = list(row['tokenized_txt'])
    all_tokens.extend(tokens)
    return row

temp1 = tokenize_df.apply(all_tokens_fn, axis = 1)

temp = tokenize_df.apply(lambda row: len(row['tokenized_txt']), axis = 1)
print(f"Total tokens => {temp.sum()}")
print(f"Total sentences => {len(tokenize_df)}")
print(f"Unique tokens => {len(set(all_tokens))}")

Total tokens => 181488
Total sentences => 12109
Unique tokens => 17874


---
# **Sequence Labelling Setup**
---

In [12]:
# Install required packages to colab instance

!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-h

In [13]:
# Sequence labelling task imports

from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
import evaluate
import numpy as np
import torch
from torch import nn

In [14]:
# Split dataset into train and test splits
train_df, test_df = train_test_split(final_dataset, test_size = 0.15, shuffle = True)

In [15]:
# Convert dataframes to huggingface datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Further split train dataset into validation dataset
train_dataset = train_dataset.train_test_split(test_size=0.15, shuffle = True)

In [16]:
# Initialize tokenizers

distilroberta_tknzer = AutoTokenizer.from_pretrained("distilroberta-base", add_prefix_space=True)
distilbert_tknzr = AutoTokenizer.from_pretrained("distilbert-base-uncased")
electra_tknzr = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# Tokenize into subwords using BERT tokenizer and perform alignment
# token-label realignment is achieved by:
# 1. Mapping all tokens to their corresponding word with the word_ids method.
# 2. Assigning the label -100 to the special tokens [CLS] and [SEP] so they’re ignored by the PyTorch loss function.
# 3. Only labeling the first token of a given word. Assign -100 to other subtokens from the same word.

def tokenize_and_align_labels(sample, tknzer):
    bert_tokens = tknzer(sample['tokenized_txt'], truncation=True, is_split_into_words = True)

    labels = []
    for i, label in enumerate(sample['labels']):
        word_ids = bert_tokens.word_ids(batch_index = i)
        prev_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx == None:
                label_ids.append(-100)
            elif word_idx != prev_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            prev_word_idx = word_idx
        labels.append(label_ids)

    bert_tokens['new_labels'] = labels
    return bert_tokens

In [18]:
# Apply the tokenize_and_align_labels function using datasets map method

tokenized_train_dataset_bert = train_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilbert_tknzr})
tokenized_test_dataset_bert = test_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilbert_tknzr})

tokenized_train_dataset_roberta = train_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilroberta_tknzer})
tokenized_test_dataset_roberta = test_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilroberta_tknzer})

tokenized_train_dataset_electra = train_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": electra_tknzr})
tokenized_test_dataset_electra = test_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": electra_tknzr})

Map:   0%|          | 0/8748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/8748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/8748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

In [19]:
# Dynamically pad sequences and batch samples

data_collator_bert = DataCollatorForTokenClassification(tokenizer=distilbert_tknzr)
data_collator_roberta = DataCollatorForTokenClassification(tokenizer=distilroberta_tknzer)
data_collator_electra = DataCollatorForTokenClassification(tokenizer=electra_tknzr)

In [20]:
# Setup evaluation metrics

poseval = evaluate.load('poseval')

Downloading builder script:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

In [21]:
# Function to package predictions and labels for the evaluation function and Initialize metrics

label_list = ["Literal", "Metaphor"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = poseval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["macro avg"]["precision"],
        "recall": results["macro avg"]["recall"],
        "f1": results["macro avg"]["f1-score"],
        "accuracy": results["accuracy"],
        "Literal precision": results["Literal"]["precision"],
        "Literal recall": results["Literal"]["recall"],
        "Literal f1": results["Literal"]["f1-score"],
        "Metaphor precision": results["Metaphor"]["precision"],
        "Metaphor recall": results["Metaphor"]["recall"],
        "Metaphor f1": results["Metaphor"]["f1-score"],
    }


In [22]:
# Dictionary of labels and theirs corresponding indexes
id2label = {
    0:'Literal',
    1:'Metaphor'
}

label2id = {
    'Literal':0,
    'Metaphor':1
}

In [23]:
# Load distilBERT model
model_distilbert = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

# Load distilRoBERTa model
model_distilroberta= AutoModelForTokenClassification.from_pretrained(
    "distilroberta-base", num_labels=2, id2label=id2label, label2id=label2id
)

# Load ELECTRA-Base model
model_electra= AutoModelForTokenClassification.from_pretrained(
    "google/electra-base-discriminator", num_labels=2, id2label=id2label, label2id=label2id
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Downloading pytorch_model.bin:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream tas

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForTokenClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.weight', 'cla

---
# DistilBERT Training and Evaluation
---

In [24]:
# Define custom trainer to allow for weighted loss since our classes are unbalanced

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.1,0.9], device=device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [25]:
# Prepare training hyper parameters

training_args = TrainingArguments(
    output_dir="distilbert_metaphor_sequence",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = CustomTrainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_train_dataset_bert["train"],
    eval_dataset=tokenized_train_dataset_bert["test"],
    tokenizer=distilbert_tknzr,
    data_collator=data_collator_bert,
    compute_metrics=compute_metrics
)

# Begin training
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Literal precision,Literal recall,Literal f1,Metaphor precision,Metaphor recall,Metaphor f1
1,No log,0.212147,0.601947,0.843256,0.643735,0.919725,0.993203,0.92391,0.957304,0.210692,0.762602,0.330165
2,0.251800,0.186207,0.609072,0.895064,0.654769,0.917363,0.996289,0.918583,0.955859,0.221854,0.871545,0.353679
3,0.251800,0.165981,0.628676,0.896213,0.682696,0.933477,0.995943,0.935516,0.964784,0.261409,0.856911,0.400608
4,0.149600,0.16009,0.638063,0.912645,0.695937,0.937737,0.996782,0.93911,0.967087,0.279344,0.886179,0.424786
5,0.149600,0.167037,0.663251,0.890044,0.723871,0.952291,0.99513,0.955697,0.975015,0.331373,0.82439,0.472727
6,0.117200,0.178053,0.662265,0.882965,0.721888,0.952375,0.994729,0.956173,0.97507,0.329801,0.809756,0.468706
7,0.117200,0.180329,0.66926,0.887364,0.729988,0.954779,0.99492,0.958469,0.976354,0.3436,0.81626,0.483622
8,0.096500,0.186058,0.684622,0.877583,0.743999,0.96039,0.994244,0.964921,0.979363,0.375,0.790244,0.508634
9,0.096500,0.187495,0.687828,0.878829,0.747316,0.961276,0.994293,0.965788,0.979833,0.381363,0.79187,0.514799
10,0.083600,0.191559,0.688506,0.880476,0.748269,0.961402,0.994382,0.965831,0.979899,0.382629,0.795122,0.51664


TrainOutput(global_step=2740, training_loss=0.13422215663603623, metrics={'train_runtime': 627.2651, 'train_samples_per_second': 139.463, 'train_steps_per_second': 4.368, 'total_flos': 1670658127927152.0, 'train_loss': 0.13422215663603623, 'epoch': 10.0})

In [26]:
# Save model

trainer.save_model()

In [27]:
# Evaluating to display training accuracy

trainer.evaluate(tokenized_train_dataset_bert["train"])

{'eval_loss': 0.10829337686300278,
 'eval_precision': 0.6617841114814921,
 'eval_recall': 0.9625199033170029,
 'eval_f1': 0.7289791884323197,
 'eval_accuracy': 0.9444736301135196,
 'eval_Literal precision': 0.9994612470886621,
 'eval_Literal recall': 0.9434481895283698,
 'eval_Literal f1': 0.9706473048084004,
 'eval_Metaphor precision': 0.324106975874322,
 'eval_Metaphor recall': 0.9815916171056358,
 'eval_Metaphor f1': 0.48731107205623897,
 'eval_runtime': 23.0053,
 'eval_samples_per_second': 380.26,
 'eval_steps_per_second': 11.91,
 'epoch': 10.0}

In [28]:
# Running on test
trainer.predict(tokenized_test_dataset_bert)

PredictionOutput(predictions=array([[[   4.4517484,   -4.4128814],
        [   4.5658603,   -4.6013403],
        [   4.5854034,   -4.5563574],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   4.387273 ,   -4.2355804],
        [   4.47385  ,   -4.2324853],
        [   4.624173 ,   -4.568493 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   4.3951893,   -4.2378416],
        [   4.359786 ,   -4.292859 ],
        [   4.183528 ,   -4.070912 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       ...,

       [[   4.454409 ,   -4.466613 ],
        [   4.229215 ,   -4.1377883],
        [   4.177176 ,   -4.4818344],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -10

# DistilRoBERTa Training and Evaluation

In [29]:
# Define custom trainer to allow for weighted loss since our classes are unbalanced

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.7,0.3], device=device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [30]:
# Prepare training hyper parameters

training_args = TrainingArguments(
    output_dir="distilroberta_metaphor_sequence",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = CustomTrainer(
    model=model_distilroberta,
    args=training_args,
    train_dataset=tokenized_train_dataset_roberta["train"],
    eval_dataset=tokenized_train_dataset_roberta["test"],
    tokenizer=distilroberta_tknzer,
    data_collator=data_collator_roberta,
    compute_metrics=compute_metrics
)

# Begin training
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Literal precision,Literal recall,Literal f1,Metaphor precision,Metaphor recall,Metaphor f1
1,No log,0.044556,0.972535,0.525995,0.543139,0.975365,0.975373,0.999957,0.987512,0.969697,0.052033,0.098765
2,0.057100,0.037323,0.882143,0.588781,0.639962,0.977432,0.978571,0.998701,0.988533,0.785714,0.178862,0.291391
3,0.057100,0.032634,0.928626,0.615827,0.67853,0.979246,0.979952,0.999134,0.98945,0.877301,0.23252,0.367609
4,0.034300,0.03152,0.93377,0.627209,0.693106,0.979836,0.980535,0.999134,0.989747,0.887006,0.255285,0.396465
5,0.034300,0.030273,0.922557,0.667578,0.737369,0.981397,0.982613,0.998571,0.990528,0.8625,0.336585,0.484211
6,0.029200,0.028357,0.932574,0.675795,0.747446,0.981988,0.983035,0.998744,0.990827,0.882114,0.352846,0.504065
7,0.029200,0.027547,0.935805,0.686364,0.75871,0.982536,0.98358,0.998744,0.991104,0.888031,0.373984,0.526316
8,0.025500,0.027858,0.928839,0.716229,0.78574,0.983675,0.985128,0.998311,0.991676,0.872549,0.434146,0.579805
9,0.025500,0.028436,0.906253,0.731221,0.793419,0.983591,0.985916,0.997402,0.991626,0.82659,0.465041,0.595213
10,0.023100,0.028333,0.910743,0.726451,0.79054,0.983548,0.985666,0.997618,0.991606,0.835821,0.455285,0.589474


TrainOutput(global_step=2740, training_loss=0.03277052057920581, metrics={'train_runtime': 677.1702, 'train_samples_per_second': 129.185, 'train_steps_per_second': 4.046, 'total_flos': 1752189745256496.0, 'train_loss': 0.03277052057920581, 'epoch': 10.0})

In [31]:
# Save model

trainer.save_model()

In [32]:
# Evaluating to display training accuracy

trainer.evaluate(tokenized_train_dataset_roberta["train"])

{'eval_loss': 0.020668944343924522,
 'eval_precision': 0.9774914446304592,
 'eval_recall': 0.7208544218820118,
 'eval_f1': 0.7997787513218522,
 'eval_accuracy': 0.9846356486451505,
 'eval_Literal precision': 0.984815083170179,
 'eval_Literal recall': 0.9996244484086001,
 'eval_Literal f1': 0.992164506534755,
 'eval_Metaphor precision': 0.9701678060907396,
 'eval_Metaphor recall': 0.4420843953554234,
 'eval_Metaphor f1': 0.6073929961089495,
 'eval_runtime': 22.3147,
 'eval_samples_per_second': 392.028,
 'eval_steps_per_second': 12.279,
 'epoch': 10.0}

In [33]:
# Running on test

trainer.predict(tokenized_test_dataset_roberta)

PredictionOutput(predictions=array([[[   6.4218106,   -5.770071 ],
        [   6.2305474,   -5.4626546],
        [   6.1753907,   -5.442373 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   6.453245 ,   -5.867971 ],
        [   6.3890204,   -5.6180315],
        [   6.115483 ,   -5.430262 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   6.3117642,   -5.6527495],
        [   6.1794357,   -5.429175 ],
        [   6.059191 ,   -5.2416053],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       ...,

       [[   6.03209  ,   -5.7759166],
        [   5.8681817,   -5.605896 ],
        [   5.8122935,   -5.4955354],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -10

# ELECTRA Training and Evaluation

In [34]:
# Prepare training hyper parameters

training_args = TrainingArguments(
    output_dir="electra_metaphor_sequence",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = CustomTrainer(
    model=model_electra,
    args=training_args,
    train_dataset=tokenized_train_dataset_electra["train"],
    eval_dataset=tokenized_train_dataset_electra["test"],
    tokenizer=electra_tknzr,
    data_collator=data_collator_electra,
    compute_metrics=compute_metrics
)

# Begin training
trainer.train()

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Literal precision,Literal recall,Literal f1,Metaphor precision,Metaphor recall,Metaphor f1
1,No log,0.04319,0.943751,0.56568,0.609255,0.977137,0.977389,0.999654,0.988396,0.910112,0.131707,0.230114
2,0.053800,0.033964,0.925537,0.614992,0.677223,0.979161,0.979909,0.999091,0.989407,0.871166,0.230894,0.365039
3,0.053800,0.029896,0.952997,0.630613,0.699048,0.9803,0.980707,0.999437,0.989983,0.925287,0.261789,0.408112
4,0.031100,0.027397,0.938347,0.683155,0.7559,0.982452,0.983414,0.998831,0.991062,0.893281,0.36748,0.520737
5,0.031100,0.025876,0.934405,0.715502,0.786264,0.983802,0.985089,0.998484,0.991741,0.883721,0.43252,0.580786
6,0.024200,0.026567,0.9089,0.739351,0.80076,0.984012,0.986338,0.997402,0.991839,0.831461,0.481301,0.609681
7,0.024200,0.024846,0.919909,0.757346,0.818071,0.985151,0.987271,0.997618,0.992418,0.852547,0.517073,0.643725
8,0.019900,0.024861,0.901581,0.757791,0.813132,0.984477,0.987304,0.996882,0.99207,0.815857,0.518699,0.634195
9,0.019900,0.0256,0.906719,0.774864,0.827321,0.985362,0.988194,0.996882,0.992519,0.825243,0.552846,0.662123
10,0.016800,0.026402,0.898428,0.779525,0.827951,0.985194,0.988444,0.996449,0.99243,0.808411,0.562602,0.663471


TrainOutput(global_step=2740, training_loss=0.027902469930857637, metrics={'train_runtime': 1234.4903, 'train_samples_per_second': 70.863, 'train_steps_per_second': 2.22, 'total_flos': 3341195504240496.0, 'train_loss': 0.027902469930857637, 'epoch': 10.0})

In [35]:
# Save model

trainer.save_model()

In [36]:
# Evaluating to display training accuracy

trainer.evaluate(tokenized_train_dataset_electra["train"])

{'eval_loss': 0.016245858743786812,
 'eval_precision': 0.9730526461605102,
 'eval_recall': 0.7948783254323649,
 'eval_f1': 0.8622183904519622,
 'eval_accuracy': 0.9882825883374067,
 'eval_Literal precision': 0.988805016838927,
 'eval_Literal recall': 0.9992723687916627,
 'eval_Literal f1': 0.9940111371834832,
 'eval_Metaphor precision': 0.9573002754820936,
 'eval_Metaphor recall': 0.5904842820730671,
 'eval_Metaphor f1': 0.7304256437204414,
 'eval_runtime': 42.4865,
 'eval_samples_per_second': 205.901,
 'eval_steps_per_second': 6.449,
 'epoch': 10.0}

In [37]:
# Running on test

trainer.predict(tokenized_test_dataset_electra)

PredictionOutput(predictions=array([[[   4.633411 ,   -5.130016 ],
        [   4.6997666,   -5.6300635],
        [   4.728642 ,   -5.645139 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   4.6478815,   -5.187212 ],
        [   4.654143 ,   -5.749933 ],
        [   4.6812997,   -5.8832097],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       [[   4.693275 ,   -5.282495 ],
        [   4.74501  ,   -5.5910916],
        [   4.5623755,   -5.5580277],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -100.       ]],

       ...,

       [[   4.758428 ,   -5.654511 ],
        [   4.830103 ,   -5.609847 ],
        [   4.7960663,   -5.597907 ],
        ...,
        [-100.       , -100.       ],
        [-100.       , -100.       ],
        [-100.       , -10

# **Model Building (Classification)**




In [38]:
# Load dataset 

class_df = pd.read_csv(os.path.join(PROJECT_DIR, 'classification_dataset.csv'))

def one_hot_encode_indexes(row):
    tokenized_txt = str(row['sentence']).lower().split(' ')
    pos_encoding = [0] * len(tokenized_txt)
    pos_encoding[row['verb index']-1] = 1
    row['position encoding'] = pos_encoding
    row['tokenized txt'] = tokenized_txt
    return row

class_df_encoded = class_df.apply(one_hot_encode_indexes, axis = 1).drop(columns=['verb index'])
class_df_encoded.head()

Unnamed: 0,sentence,label,position encoding,tokenized txt
0,Most athletes first encountered him as a voice...,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[most, athletes, first, encountered, him, as, ..."
1,Most athletes first encountered him as a voice...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[most, athletes, first, encountered, him, as, ..."
2,Most athletes first encountered him as a voice...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[most, athletes, first, encountered, him, as, ..."
3,Most athletes first encountered him as a voice...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[most, athletes, first, encountered, him, as, ..."
4,Most athletes first encountered him as a voice...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[most, athletes, first, encountered, him, as, ..."


In [39]:
train_df, test_df = train_test_split(class_df_encoded, test_size = 0.15, shuffle = True)

# Convert dataframes to huggingface datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Further split train dataset into validation dataset
train_dataset = train_dataset.train_test_split(test_size=0.15, shuffle = True)

In [40]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base", add_prefix_space=True)


def tokenize_align_pos_ids(row):
    bert_tokens = tokenizer(row['tokenized txt'], truncation=True, is_split_into_words = True)

    labels = []
    for i, label in enumerate(row['position encoding']):
        word_ids = bert_tokens.word_ids(batch_index = i)
        prev_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx == None:
                label_ids.append(-100)
            elif word_idx != prev_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[prev_word_idx])
            prev_word_idx = word_idx
        labels.append(label_ids)

    bert_tokens['pos_encodings'] = labels
    return bert_tokens

train_dataset = train_dataset.map(tokenize_align_pos_ids, batched = True)


Map:   0%|          | 0/12455 [00:00<?, ? examples/s]

Map:   0%|          | 0/2198 [00:00<?, ? examples/s]

In [41]:
train_dataset['train'][0]['pos_encodings']

[-100,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -100]

In [42]:
from transformers.models.bert.modeling_bert import BertEncoder, BertPooler, BertEmbeddings, BaseModelOutputWithPoolingAndCrossAttentions
from transformers import BertConfig, BertModel
import torch
from torch import nn


class BertEmbeddingsV2(BertEmbeddings):
    def __init__(self, config):
        super().__init__(config)
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        max_number_of_pos_tags = 2
        self.pos_tag_embeddings = nn.Embedding(max_number_of_pos_tags, config.hidden_size)
        
        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(self, input_ids=None, pos_tag_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        pos_tag_embeddings = self.pos_tag_embeddings(pos_tag_ids)

        embeddings = inputs_embeds + token_type_embeddings + pos_tag_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

class BertModelV2(BertModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
    input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddingsV2(config)
        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.init_weights()
    
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        pos_tag_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            pos_tag_ids=pos_tag_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


# if __name__ == "__main__":
#     from transformers import BertTokenizer

#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     text = "She sells"
#     # if we tokenize it, this becomes:
#     encoding = tokenizer(text, return_tensors="pt") # this creates a dictionary with keys 'input_ids' etc.
#     print(encoding)
#     # we add the pos_tag_ids to the dictionary
#     # pos_tags = [NNP, VNP]
#     encoding['pos_tag_ids'] = torch.tensor([[0, 1, 1, 0]])

#     # next, we can provide this to our modified BertModel:
#     config = BertConfig()
#     model = BertModelV2.from_pretrained("bert-base-uncased", config=config)
    

#     outputs = model(**encoding)