In [6]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Sat May  6 20:56:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

---
# **Dataset Preprocessing**
---

In [7]:
# Preprocessing Imports

import pandas as pd
from collections import Counter
import os

In [8]:
# Root of the Google Drive folder as seen from the Colab file system.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount call is visible in this notebook — confirm.
BASE_DIR = '/content/drive/MyDrive/'
# Folder that holds the CSV inputs read by the data-loading cell.
PROJECT_DIR = os.path.join(BASE_DIR, 'project')


In [None]:
# Read the corpus and the token-level label file, naming the label file's two columns

dataset_df = pd.read_csv(os.path.join(PROJECT_DIR, 'vuamc_corpus_train.csv'))
# The label file has no header row, so its columns arrive as 0 and 1 and are renamed here.
verb_pos_df = (
    pd.read_csv(os.path.join(PROJECT_DIR, 'all_pos_tokens.csv'), header=None)
    .rename(columns={0: 'TextID_SentID_Index', 1: 'Label'})
)
dataset_df.head()

In [None]:
# Process verb dataframe: break the composite "TextID_SentID_Index" key into its parts

# Raw '_'-separated pieces, kept so the 'split_data' column still exists for any later use.
verb_pos_df['split_data'] = verb_pos_df['TextID_SentID_Index'].str.split('_')
# Split from the right, at most twice: the last two fields are always SentID and Index,
# even if a text id itself contains an underscore. The parts stay strings.
verb_pos_df[['TextID','SentID','Index']] = verb_pos_df['TextID_SentID_Index'].str.rsplit('_', n=2, expand=True)
verb_pos_df.head()

In [11]:
# Function that tokenizes the sentences and create labels for the sequence labelling task

def tokenize_and_label(row, pos_df=None):
    """Tokenize one corpus row and build its per-token label list.

    The sentence is lower-cased and split on single spaces. Every token starts
    with label 2; tokens listed for this (text id, sentence id) in the label
    table get that table's ``Label`` value instead.

    Args:
        row: a row with 'txt_id', 'sentence_id' and 'sentence_txt' entries.
        pos_df: label table with string-valued 'TextID', 'SentID' and 'Index'
            columns plus a 'Label' column. 'Index' is a 1-based token position.
            Defaults to the notebook-level ``verb_pos_df``.

    Returns:
        The same ``row`` with 'tokenized_txt' (list of str) and 'labels'
        (list of int) added.

    Raises:
        IndexError: if a label's token position lies outside the sentence.
    """
    if pos_df is None:
        pos_df = verb_pos_df

    txt_id, sent_id, sent = row['txt_id'], row['sentence_id'], row['sentence_txt']

    # str() never fails here; a missing sentence (NaN) simply becomes the single
    # token 'nan'. The row still carries the NaN, so a later dropna removes it.
    tokenized_txt = str(sent).lower().split(' ')

    # The label table's ids are strings (they come from splitting the composite
    # key), so normalise the row's ids to str before comparing. Without this an
    # integer sentence_id silently matches nothing.
    sentence_labels = pos_df[(pos_df['TextID'] == str(txt_id)) & (pos_df['SentID'] == str(sent_id))]

    labels = [2] * len(tokenized_txt)

    for token_index, label in zip(sentence_labels['Index'], sentence_labels['Label']):
        position = int(token_index) - 1
        # Reject out-of-range positions explicitly; in particular Index 0 would
        # otherwise wrap around and overwrite the last token's label.
        if not 0 <= position < len(labels):
            raise IndexError(
                f"token index {token_index} out of range for sentence "
                f"{txt_id}/{sent_id} with {len(labels)} tokens"
            )
        labels[position] = int(label)

    row['tokenized_txt'] = tokenized_txt
    row['labels'] = labels
    return row

In [None]:
# Apply dataset preprocessing function to Dataset
# Row-wise apply: every row gains a 'tokenized_txt' and a 'labels' column.
# NOTE(review): tokenize_and_label filters the whole label table once per row, so this cell can be slow on the full corpus.

tokenize_df = dataset_df.apply(tokenize_and_label, axis = 1)
tokenize_df.head()

In [13]:
# Drop Nulls
# Removes every row that has a missing value in any column.
# NOTE(review): tokenize_df is rebound in place, so the pre-filter frame is no longer reachable after this cell.

tokenize_df = tokenize_df.dropna(how = 'any')

In [14]:
# Convert to CSV for future use
# NOTE(review): the path is relative to the current working directory, not PROJECT_DIR — confirm the file ends up somewhere persistent.
# NOTE(review): 'tokenized_txt' and 'labels' hold Python lists; presumably they are written as their string form and need parsing when read back — verify.

tokenize_df.to_csv('vua_corpus_sequence.csv')

In [15]:
# Extract necessary columns
# Keep only the token lists and their per-token labels; these two columns feed the train/test split.

final_dataset = tokenize_df[['tokenized_txt','labels']]
final_dataset.head()

Unnamed: 0,tokenized_txt,labels
0,"[latest, corporate, unbundler, m_reveals, laid...","[0, 0, 0, 1, 0, 1, 2, 0, 0, 2, 2, 2, 1, 2, 2, ..."
1,"[by, frank, kane]","[2, 0, 0]"
2,"[it, seems, that, roland, franklin, ,, the, la...","[2, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 2, 2, 0, 2, ..."
3,"[he, has, not, properly, investigated, the, m_...","[2, 2, 2, 0, 0, 2, 1, 2, 0, 0, 2]"
4,"[the, 63-year-old, m_head, of, pembridge, inve...","[2, 2, 1, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1, 0, ..."


In [16]:
# Generate dataset statistics

# Running tally of label ids over all tokens, and a flat list of every token in the corpus.
token_cnts = Counter()
all_tokens = []

def all_tokens_fn(row):
    """Add one row's labels to ``token_cnts`` and its tokens to ``all_tokens``.

    Used only for its side effects on those two module-level accumulators;
    the row is returned unchanged.
    """
    tokens = list(row['tokenized_txt'])
    token_cnts.update(row['labels'])
    all_tokens.extend(tokens)
    return row

# The apply is run for its side effects; temp1 itself is not used again in this cell.
# NOTE(review): re-running only the line below (without resetting the accumulators above) would double-count.
temp1 = tokenize_df.apply(all_tokens_fn, axis = 1)

# Number of tokens in each sentence.
temp = tokenize_df.apply(lambda row: len(row['tokenized_txt']), axis = 1)
print(f"Total tokens: {temp.sum()}")
print(f"Total sentences: {len(tokenize_df)}")
print(f"Unique tokens: {len(set(all_tokens))}")

# Label ids: 0 is reported as Literal, 1 as Metaphor, 2 as Other.
sum_cnt = token_cnts[0] + token_cnts[1] + token_cnts[2]
print(f"Literal tokens count : {token_cnts[0]} ({(token_cnts[0]/sum_cnt)*100:.2f}%)")
print(f"Metaphor tokens count: {token_cnts[1]} ({(token_cnts[1]/sum_cnt)*100:.2f}%)")
print(f"Other tokens count: {token_cnts[2]} ({(token_cnts[2]/sum_cnt)*100:.2f}%)")

Total tokens: 181488
Total sentences: 12109
Unique tokens: 17874
Literal tokens count : 61567 (33.92%)
Metaphor tokens count: 11044 (6.09%)
Other tokens count: 108877 (59.99%)


---
# **Sequence Labelling Setup**
---

In [None]:
# Install required packages to colab instance

!pip install transformers datasets evaluate

In [18]:
# Sequence labelling task imports

from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
import evaluate
import numpy as np
import torch
from torch import nn

In [19]:
# Split dataset into train and test splits
# A fixed random_state keeps the held-out test set identical across kernel restarts,
# so test metrics of different runs and models stay comparable.
train_df, test_df = train_test_split(final_dataset, test_size = 0.15, shuffle = True, random_state = 42)

In [20]:
# Convert dataframes to huggingface datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Further split train dataset into validation dataset
# After this line train_dataset is a dict of splits: "train" for fitting and "test" for validation.
# A fixed seed makes the validation split reproducible across runs.
train_dataset = train_dataset.train_test_split(test_size=0.15, shuffle = True, seed = 42)

In [None]:
# Initialize tokenizers
# One tokenizer per checkpoint; each is later paired with the model loaded from the same checkpoint name.
# NOTE(review): add_prefix_space=True is presumably needed because this tokenizer is later fed pre-split words (is_split_into_words=True) — confirm against the transformers docs.
# NOTE(review): the corpus text is lower-cased during preprocessing while "distilroberta-base" is not an "uncased" checkpoint — confirm this is intended.

distilroberta_tknzer = AutoTokenizer.from_pretrained("distilroberta-base", add_prefix_space=True)
distilbert_tknzr = AutoTokenizer.from_pretrained("distilbert-base-uncased")
electra_tknzr = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

In [22]:
# Tokenize into subwords using BERT tokenizer and perform alignment
# token-label realignment is achieved by:
# 1. Mapping all tokens to their corresponding word with the word_ids method.
# 2. Assigning the label -100 to the special tokens [CLS] and [SEP] so they’re ignored by the PyTorch loss function.
# 3. Only labeling the first token of a given word. Assign -100 to other subtokens from the same word.

def tokenize_and_align_labels(sample, tknzer):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        sample: batch with 'tokenized_txt' (list of word lists) and 'labels'
            (list of per-word label lists, one label per word).
        tknzer: tokenizer called with ``is_split_into_words=True`` whose result
            exposes ``word_ids(batch_index=...)``.

    Returns:
        The tokenizer output with two extra entries holding the aligned labels:
        'labels' and 'new_labels' (same content). Positions without a word id
        and every non-first sub-token of a word carry -100; the first sub-token
        of a word carries that word's label.

    Note:
        The aligned labels are written to 'labels' on purpose. The input already
        has a word-level 'labels' column; if only 'new_labels' were written,
        anything that reads the 'labels' column would see the unaligned
        word-level labels. 'new_labels' is kept for backward compatibility.
        Because 'labels' is overwritten, apply this function to the original
        word-level dataset only, not to its own output.
    """
    bert_tokens = tknzer(sample['tokenized_txt'], truncation=True, is_split_into_words = True)

    labels = []
    for i, label in enumerate(sample['labels']):
        word_ids = bert_tokens.word_ids(batch_index = i)
        prev_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == prev_word_idx:
                # Special token, or a continuation sub-token of the previous word.
                label_ids.append(-100)
            else:
                # First sub-token of a new word: it carries the word's label.
                label_ids.append(label[word_idx])
            prev_word_idx = word_idx
        labels.append(label_ids)

    bert_tokens['labels'] = labels
    bert_tokens['new_labels'] = labels
    return bert_tokens

In [23]:
# Apply the tokenize_and_align_labels function using datasets map method
# Each tokenizer gets its own tokenized copy of the data, because sub-word splits differ per tokenizer.
# train_dataset holds the "train" and validation ("test") splits, so each *_train_* result keeps both;
# test_dataset is the separate held-out test set.

tokenized_train_dataset_bert = train_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilbert_tknzr})
tokenized_test_dataset_bert = test_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilbert_tknzr})

tokenized_train_dataset_roberta = train_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilroberta_tknzer})
tokenized_test_dataset_roberta = test_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": distilroberta_tknzer})

tokenized_train_dataset_electra = train_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": electra_tknzr})
tokenized_test_dataset_electra = test_dataset.map(tokenize_and_align_labels, batched = True, fn_kwargs = {"tknzer": electra_tknzr})

Map:   0%|          | 0/8748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/8748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/8748 [00:00<?, ? examples/s]

Map:   0%|          | 0/1544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

In [24]:
# Dynamically pad sequences and batch samples
# One collator per tokenizer, so each model's batches are padded by its own tokenizer.

data_collator_bert = DataCollatorForTokenClassification(tokenizer=distilbert_tknzr)
data_collator_roberta = DataCollatorForTokenClassification(tokenizer=distilroberta_tknzer)
data_collator_electra = DataCollatorForTokenClassification(tokenizer=electra_tknzr)

In [None]:
# Setup evaluation metrics
# 'poseval' is called in compute_metrics; the per-label and "macro avg" entries of its result are reported there.

poseval = evaluate.load('poseval')

In [26]:
# Function to package predictions and labels for the evaluation function and Initialize metrics

label_list = ["Literal", "Metaphor","Other"]

def compute_metrics(p):
    """Turn raw model outputs into the metrics reported during evaluation.

    Args:
        p: a (predictions, labels) pair. Predictions are reduced with argmax
            over axis 2, so they are assumed to be (batch, sequence, classes)
            scores — TODO confirm. Labels are label ids, with -100 marking
            positions to ignore.

    Returns:
        dict with macro-averaged precision/recall/f1, overall accuracy, and
        precision/recall/f1 for the "Literal" and "Metaphor" labels.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only positions that carry a real label (anything but -100).
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = poseval.compute(predictions=true_predictions, references=true_labels)

    macro = results["macro avg"]
    literal = results["Literal"]
    metaphor = results["Metaphor"]
    return {
        "precision": macro["precision"],
        "recall": macro["recall"],
        "f1": macro["f1-score"],
        "accuracy": results["accuracy"],
        "Literal precision": literal["precision"],
        "Literal recall": literal["recall"],
        "Literal f1": literal["f1-score"],
        "Metaphor precision": metaphor["precision"],
        "Metaphor recall": metaphor["recall"],
        "Metaphor f1": metaphor["f1-score"],
    }


In [27]:
# Label names and their corresponding class indexes, in both directions
id2label = dict(enumerate(('Literal', 'Metaphor', 'Other')))

# Inverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load distilBERT model
# All three models are loaded for 3-way token classification; num_labels=3 matches the
# three entries of id2label / label2id, which also name the classes in the model config.
model_distilbert = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

# Load distilRoBERTa model
model_distilroberta= AutoModelForTokenClassification.from_pretrained(
    "distilroberta-base", num_labels=3, id2label=id2label, label2id=label2id
)

# Load ELECTRA-Base model
model_electra= AutoModelForTokenClassification.from_pretrained(
    "google/electra-base-discriminator", num_labels=3, id2label=id2label, label2id=label2id
)

---
# DistilBERT Training and Evaluation
---

In [49]:
# Define custom trainer to allow for weighted loss since our classes are unbalanced

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over token logits.

    Attributes:
        class_weights: one loss weight per label id. The default of equal
            weights gives an unweighted cross-entropy; override it on a
            subclass or on the instance to re-weight classes.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (1.0, 1.0, 1.0)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and score its logits with the weighted cross-entropy.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping passed to the model; must contain "labels".
            return_outputs: when True, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: accepted and ignored, so that Trainer versions which pass
                extra keyword arguments (e.g. ``num_items_in_batch``) still work.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Put the weights on the same device as the logits instead of guessing
        # from torch.cuda.is_available(); the guess is wrong whenever the model
        # does not live on the default CUDA device.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        # Flatten to (tokens, classes) vs (tokens,).
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Prepare training hyper parameters
# Evaluation and checkpointing both run once per epoch, and load_best_model_at_end=True is set.
# NOTE(review): no metric_for_best_model is given, so "best" presumably falls back to the Trainer default — confirm that is the intended selection criterion.

training_args = TrainingArguments(
    output_dir="distilbert_metaphor_sequence",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# The "test" split of the tokenized training data is the validation set used for per-epoch evaluation;
# the held-out test set is not passed to the trainer here.
# NOTE(review): CustomTrainer is defined in several cells of this notebook; whichever definition ran last in the kernel is the one used here.
trainer = CustomTrainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_train_dataset_bert["train"],
    eval_dataset=tokenized_train_dataset_bert["test"],
    tokenizer=distilbert_tknzr,
    data_collator=data_collator_bert,
    compute_metrics=compute_metrics
)

# Begin training
trainer.train()

In [None]:
# Save model
# NOTE(review): no path is given, so the model is presumably written under the trainer's output_dir ("distilbert_metaphor_sequence") — confirm.

trainer.save_model()

In [None]:
# Evaluating to display training accuracy
# Scores the model on the data it was trained on; the reported metrics come from compute_metrics.

trainer.evaluate(tokenized_train_dataset_bert["train"])

In [None]:
# Running on test
# Uses the held-out test set tokenized with the DistilBERT tokenizer.
trainer.predict(tokenized_test_dataset_bert)

# DistilRoBERTa Training and Evaluation

In [None]:
# Define custom trainer to allow for weighted loss since our classes are unbalanced

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over token logits.

    Attributes:
        class_weights: one loss weight per label id. Override it on a subclass
            or on the instance to use different weights.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (1.0, 2.0, 3.0)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and score its logits with the weighted cross-entropy.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping passed to the model; must contain "labels".
            return_outputs: when True, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: accepted and ignored, so that Trainer versions which pass
                extra keyword arguments (e.g. ``num_items_in_batch``) still work.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Put the weights on the same device as the logits instead of guessing
        # from torch.cuda.is_available(); the guess is wrong whenever the model
        # does not live on the default CUDA device.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        # Flatten to (tokens, classes) vs (tokens,).
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Prepare training hyper parameters
# Evaluation and checkpointing both run once per epoch, and load_best_model_at_end=True is set.
# NOTE(review): no metric_for_best_model is given, so "best" presumably falls back to the Trainer default — confirm that is the intended selection criterion.

training_args = TrainingArguments(
    output_dir="distilroberta_metaphor_sequence",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# The "test" split of the tokenized training data is the validation set used for per-epoch evaluation;
# the held-out test set is not passed to the trainer here.
# NOTE(review): CustomTrainer is defined in several cells of this notebook; whichever definition ran last in the kernel is the one used here.
trainer = CustomTrainer(
    model=model_distilroberta,
    args=training_args,
    train_dataset=tokenized_train_dataset_roberta["train"],
    eval_dataset=tokenized_train_dataset_roberta["test"],
    tokenizer=distilroberta_tknzer,
    data_collator=data_collator_roberta,
    compute_metrics=compute_metrics
)

# Begin training
trainer.train()

In [None]:
# Save model
# NOTE(review): no path is given, so the model is presumably written under the trainer's output_dir ("distilroberta_metaphor_sequence") — confirm.

trainer.save_model()

In [None]:
# Evaluating to display training accuracy
# Scores the model on the data it was trained on; the reported metrics come from compute_metrics.

trainer.evaluate(tokenized_train_dataset_roberta["train"])

In [None]:
# Running on test
# Uses the held-out test set tokenized with the DistilRoBERTa tokenizer.

trainer.predict(tokenized_test_dataset_roberta)

# ELECTRA Training and Evaluation

In [44]:
# Define custom trainer to allow for weighted loss since our classes are unbalanced

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over token logits.

    Attributes:
        class_weights: one loss weight per label id. Override it on a subclass
            or on the instance to use different weights.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (1.1, 1.3, 0.5)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and score its logits with the weighted cross-entropy.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping passed to the model; must contain "labels".
            return_outputs: when True, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: accepted and ignored, so that Trainer versions which pass
                extra keyword arguments (e.g. ``num_items_in_batch``) still work.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Put the weights on the same device as the logits instead of guessing
        # from torch.cuda.is_available(); the guess is wrong whenever the model
        # does not live on the default CUDA device.
        weight = torch.tensor(self.class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        # Flatten to (tokens, classes) vs (tokens,).
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Prepare training hyper parameters
# Evaluation and checkpointing both run once per epoch, and load_best_model_at_end=True is set.
# NOTE(review): no metric_for_best_model is given, so "best" presumably falls back to the Trainer default — confirm that is the intended selection criterion.

training_args = TrainingArguments(
    output_dir="electra_metaphor_sequence",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# The "test" split of the tokenized training data is the validation set used for per-epoch evaluation;
# the held-out test set is not passed to the trainer here.
# NOTE(review): CustomTrainer is defined in several cells of this notebook; whichever definition ran last in the kernel is the one used here.
trainer = CustomTrainer(
    model=model_electra,
    args=training_args,
    train_dataset=tokenized_train_dataset_electra["train"],
    eval_dataset=tokenized_train_dataset_electra["test"],
    tokenizer=electra_tknzr,
    data_collator=data_collator_electra,
    compute_metrics=compute_metrics
)

# Begin training
trainer.train()

In [46]:
# Save model
# NOTE(review): no path is given, so the model is presumably written under the trainer's output_dir ("electra_metaphor_sequence") — confirm.

trainer.save_model()

In [None]:
# Evaluating to display training accuracy
# Scores the model on the data it was trained on; the reported metrics come from compute_metrics.

trainer.evaluate(tokenized_train_dataset_electra["train"])

In [None]:
# Running on test
# Uses the held-out test set tokenized with the ELECTRA tokenizer.

trainer.predict(tokenized_test_dataset_electra)