Code adapted from: https://github.com/shahafp/TRC-Hebrew

## Installs


In [None]:
#!pip install datasets

In [None]:
#!pip install 'transformers[torch]' -U

In [None]:
import accelerate
print(accelerate.__version__)

## Imports

In [None]:
import os
import sys

In [None]:
import datasets
from datasets import load_dataset
from sklearn.metrics import classification_report
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer
from collections import Counter

In [None]:
import torch

In [None]:
import pandas as pd

## Import Data

In [None]:
# Load the train and test splits from CSV. Later cells read the columns
# `id`, `context`, `eventA`, `eventB` and `label_temp` from both frames.
train = pd.read_csv('data/processed/train.csv')
test = pd.read_csv('data/processed/test.csv')

## Process Data

In [None]:
# Integer class id for each relation name found in the `label_temp` column.
# The id order (0..3) matches the `target_names` order used in compute_metrics.
label_mapping = {
    'BEFORE': 0,
    'AFTER': 1,
    'EQUAL': 2,
    'VAGUE': 3
}

In [None]:
# Add the integer `label` column to both splits by looking up each row's
# `label_temp` name in `label_mapping`.
for split_frame in (train, test):
    split_frame['label'] = split_frame['label_temp'].map(label_mapping)

In [None]:
import re
def annotate_text(row):
    """Wrap both events of a row in marker tags inside its context.

    Every literal occurrence of ``row['eventA']`` in ``row['context']`` is
    wrapped as ``[a1]...[/a1]`` and every occurrence of ``row['eventB']`` as
    ``[a2]...[/a2]``.

    Args:
        row: Mapping-like object (for example a DataFrame row) holding string
            values under the keys ``'context'``, ``'eventA'`` and ``'eventB'``.

    Returns:
        str: The context with the marker tags inserted.
    """
    event_a = row['eventA']
    event_b = row['eventB']
    tag_a = (event_a, f"[a1]{event_a}[/a1]")
    tag_b = (event_b, f"[a2]{event_b}[/a2]")

    # Tag the longer event first: if the shorter event is contained in the
    # longer one, tagging the shorter first would split the longer event's
    # text and it could no longer be found.
    if len(event_a) > len(event_b):
        tagging_order = (tag_a, tag_b)
    else:
        tagging_order = (tag_b, tag_a)

    context = row['context']
    for event, tagged_event in tagging_order:
        # str.replace treats both arguments literally, so regex
        # metacharacters and backslashes in the event text are safe (a
        # backslash in a re.sub replacement template would be interpreted).
        context = context.replace(event, tagged_event)
    return context

# Build the tagged `annotated_context` column row by row for both splits.
for split_frame in (train, test):
    split_frame['annotated_context'] = split_frame.apply(annotate_text, axis=1)

In [None]:
tokens = ["[a1]", "[/a1]", "[a2]", "[/a2]"]


def count_token_occurrences(text, token):
    """Return how many non-overlapping times ``token`` occurs in ``text``."""
    occurrences = text.count(token)
    return occurrences
# Per-row occurrence count of every marker token in the annotated training
# contexts: one column per token, one row per training example.
results = {
    token: train['annotated_context'].apply(count_token_occurrences, token=token)
    for token in tokens
}

results_df = pd.DataFrame(results)
# Vectorised comparison instead of DataFrame.applymap, which recent pandas
# releases deprecate.
more_than_once = results_df > 1
# NOTE(review): `.all()` is True for a token only when EVERY row contains it
# more than once. If the goal is to detect whether any row has a duplicated
# marker, `.any()` is probably what is wanted - confirm the intended check.
all_multiple = more_than_once.all()

print("Tokens appearing more than once for each token:")
print(all_multiple)

In [None]:
tokens = ["[a1]", "[/a1]", "[a2]", "[/a2]"]

# A row passes when none of the four markers is missing from its annotated text.
token_presence = train['annotated_context'].apply(
    lambda text: not any(marker not in text for marker in tokens)
)
all_have_tokens = token_presence.all()

print("Do all instances contain the tokens [a1], [/a1], [a2], [/a2]? ", all_have_tokens)

In [None]:
# Keep only the columns used from here on; the raw `context` column is dropped
# in favour of `annotated_context`.
kept_columns = ['id', 'annotated_context', 'eventA', 'eventB', 'label', 'label_temp']
train = train[kept_columns]
test = test[kept_columns]

## Initialising Data

In [None]:
from datasets import load_dataset, DatasetDict, Dataset

# Convert each pandas frame to a Hugging Face Dataset and bundle both under
# their split names ("test" first, then "train", as before).
test_dataset = Dataset.from_pandas(test)
train_dataset = Dataset.from_pandas(train)
dataset = DatasetDict({"test": test_dataset, "train": train_dataset})


In [None]:
def calculate_class_weights(dataset, num_labels=None):
    """Compute one weight per class id as ``1 - relative frequency``.

    Args:
        dataset: Mapping with a ``'train'`` entry whose ``'label'`` field is a
            sequence of non-negative integer class ids.
        num_labels: Optional total number of classes. When omitted it is
            inferred as ``largest label id + 1`` so that a class id missing
            from the training split cannot cause an out-of-range index.

    Returns:
        list[float]: ``weights[i]`` is the weight of class id ``i``. A class
        with no training examples gets ``1.0``. The list is empty when there
        are no labels and ``num_labels`` is not given.

    Raises:
        ValueError: If a label id is not smaller than ``num_labels``.
    """
    labels = dataset['train']['label']
    total = len(labels)
    labels_count = Counter(labels)

    if num_labels is None:
        # Size by the largest id, not by the number of distinct ids: the two
        # differ whenever some class id never occurs in the training split.
        num_labels = max(labels_count) + 1 if labels_count else 0
    elif labels_count and max(labels_count) >= num_labels:
        raise ValueError(
            f"label id {max(labels_count)} is out of range for num_labels={num_labels}"
        )

    if total == 0:
        # No examples at all: every class has frequency 0.
        return [1.0] * num_labels

    # Counter returns 0 for ids that were never seen, giving them weight 1.0.
    return [1 - (labels_count[class_id] / total) for class_id in range(num_labels)]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of annotated contexts, truncating over-long inputs.

    Reads the ``annotated_context`` column, the text column that the dataset
    built above actually contains (it has no ``text`` column).

    NOTE: a later cell of this notebook redefines ``preprocess_function`` with
    an explicit ``max_length``; that later definition is the one passed to
    ``dataset.map``.

    Args:
        examples: Batch mapping that contains an ``"annotated_context"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples["annotated_context"], truncation=True)

In [None]:
eval_mode = False


def compute_metrics(eval_preds):
    """Compute weighted-average and per-class F1 metrics for the Trainer.

    When the module-level ``eval_mode`` flag is true, two text reports are
    also printed and written to ``results/evaluation_report-bert-base-uncased``:
    a strict one, and a relaxed one in which every gold VAGUE label is
    replaced by the model's prediction for that example.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds one
            score per class along axis 1 (the argmax over that axis is the
            predicted class id); ``labels`` is a NumPy array of gold class
            ids. The caller's ``labels`` array is never modified.

    Returns:
        dict: The ``weighted avg`` precision, recall and f1-score, plus one
        ``<CLASS>-f1`` entry for each of BEFORE, AFTER, EQUAL and VAGUE.
    """
    class_names = ['BEFORE', 'AFTER', 'EQUAL', 'VAGUE']
    # Pass the full id list explicitly: without `labels=`, the report infers
    # the classes from the data and raises when fewer than four are present
    # (which happens after the VAGUE relaxation if VAGUE is never predicted).
    class_ids = list(range(len(class_names)))

    predictions, labels = eval_preds
    predictions = np.argmax(predictions, axis=1)

    if eval_mode:
        report = classification_report(y_true=labels, y_pred=predictions,
                                       labels=class_ids, target_names=class_names)

        # Relax VAGUE: a gold VAGUE label counts as whatever was predicted.
        # np.where builds a new array, so the caller's labels stay untouched.
        vague_id = class_names.index('VAGUE')
        labels = np.where(labels == vague_id, predictions, labels)
        report_no_vague = classification_report(y_true=labels, y_pred=predictions,
                                                labels=class_ids, target_names=class_names)

        report_path = 'results/evaluation_report-bert-base-uncased'
        # Create the output directory so a missing folder does not abort the
        # evaluation after all the work has been done.
        os.makedirs(os.path.dirname(report_path), exist_ok=True)
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)
            f.write('\n')
            f.write(report_no_vague)
        print(report)
        print(report_no_vague)

    # NOTE(review): in eval mode `labels` is the relaxed copy at this point, so
    # the returned summary reflects the relaxed scoring (unchanged from the
    # previous behaviour) - confirm that this is intended.
    results = classification_report(y_true=labels, y_pred=predictions,
                                    labels=class_ids, target_names=class_names,
                                    output_dict=True)
    final_results = results['weighted avg']
    final_results.pop('support')
    for class_name in class_names:
        final_results[f'{class_name}-f1'] = results[class_name]['f1-score']
    return final_results

In [None]:
# Derive both label lookups from the (id, name) pairs seen in the training split.
label_pairs = list(zip(dataset['train']['label'], dataset['train']['label_temp']))
label2id = {name: class_id for class_id, name in label_pairs}
id2label = {class_id: name for class_id, name in label_pairs}

# Per-class weights; the bare expression shows them as the cell output.
class_weights = calculate_class_weights(dataset)
class_weights

## Initialise Model

In [None]:
import sys
sys.path.append('model/code-bert')
#sys.path.append('model/code-roberta')
from temporal_relation_classification import TemporalRelationClassification
from temporal_relation_classification_config import TemporalRelationClassificationConfig

In [None]:
# Candidate language-model checkpoints and head architectures, followed by the
# pair selected for this run. `arc` and `checkpoint` are passed to the model
# config further down.
lm_checkpoints = ['bert-base-uncased', 'bert-large-uncased', 'roberta-large', 'roberta-base']
architectures = ['ESS', 'SEQ-CLS', 'EMP']
arc = 'ESS'
checkpoint = 'bert-base-uncased'

In [None]:
# Display name for the run, "<architecture>-<checkpoint>".
# NOTE(review): these duplicate `checkpoint` / `arc` from the previous cell, and
# `model_final_name` is overwritten in the evaluation cell - confirm they are
# still needed.
model_base_name = 'bert-base-uncased'
model_base_architecture = 'ESS'
model_final_name = f'{model_base_architecture}-{model_base_name}'

## Initialise Tokeniser

In [None]:
from transformers import BertTokenizer, BertModel
# Load the pretrained bert-base-uncased encoder.
# NOTE(review): `model` is rebound to a TemporalRelationClassification instance
# further down and no cell in between reads it, so this load looks unused -
# confirm before removing.
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the bert-base-uncased checkpoint; the event marker tokens are
# registered on it in the next cell.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Register the four event markers as additional special tokens, then print
# the tokenizer length after the addition.
marker_tokens = ['[a1]', '[/a1]', '[a2]', '[/a2]']
tokenizer.add_special_tokens({'additional_special_tokens': marker_tokens})
print(len(tokenizer))

In [None]:
# Vocabulary ids of the start/end markers of event 1 and event 2, looked up in
# a single call and unpacked in marker order.
E1_start, E1_end, E2_start, E2_end = tokenizer.convert_tokens_to_ids(
    ['[a1]', '[/a1]', '[a2]', '[/a2]']
)

## Process Tokeniser

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of annotated contexts with truncation and max_length=508."""
    token_limit = 508
    texts = examples["annotated_context"]
    return tokenizer(texts, truncation=True, max_length=token_limit)


# Tokenize every split in batches and drop the string label column.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['label_temp'])

In [None]:
# Collator that pads the tokenized examples of each batch using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Bare class name of the tokenizer, stored in the model config below. Read it
# from the type directly instead of parsing the text of `str(type(...))`.
tokenizer_class = type(tokenizer).__name__

## Fine-Tuning

In [None]:
# Build the model configuration: EMS1/EME1 and EMS2/EME2 receive the ids of the
# start/end markers of event 1 and event 2, followed by the class weights, the
# selected architecture, the label maps and the tokenizer/vocabulary details.
# NOTE(review): hidden_size=768 and num_attention_heads=4 are hard-coded;
# presumably they have to agree with the `checkpoint` backbone and the custom
# head - confirm against TemporalRelationClassificationConfig.
# NOTE(review): num_labels comes from the labels seen in the training split, so
# it shrinks if a class is absent there - verify that is acceptable.
config = TemporalRelationClassificationConfig(EMS1=E1_start,
                                              EMS2=E2_start,
                                              EME1=E1_end,
                                              EME2=E2_end,
                                              class_weights=class_weights,
                                              architecture=arc,
                                              num_labels=len(label2id),
                                              id2label=id2label,
                                              label2id=label2id,
                                              name_or_path=checkpoint,
                                              tokenizer_class=tokenizer_class,
                                              vocab_size=len(tokenizer),
                                              hidden_size = 768,
                                              num_attention_heads =4)


In [None]:
# Instantiate the classifier from the config. This rebinds `model`, replacing
# the BertModel assigned to the same name in an earlier cell.
model = TemporalRelationClassification(config=config)

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any later cell in this notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Training configuration: 10 epochs, batch size 8 per device, evaluation after
# every epoch, and no intermediate checkpoints (save_strategy="no"); the model
# is saved explicitly in a later cell. `report_to=[]` passes an empty list of
# reporting integrations.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and deprecate `use_mps_device` - verify both against the
# installed version (the install cell upgrades transformers with -U).
training_args = TrainingArguments(
            output_dir= 'saved_models/bert-base-uncased',
            learning_rate=3e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            weight_decay=0.01,
            num_train_epochs=10,
            evaluation_strategy="epoch",
            save_strategy="no",
            optim='adamw_torch',
            report_to=[],
            use_mps_device=False
        )

In [None]:
# Wire model, data, collator and metric function into the Trainer.
# NOTE(review): eval_dataset is the test split, so the per-epoch evaluation
# metrics are computed on the test set - confirm there is no separate
# validation split that should be used here.
# NOTE(review): newer transformers releases prefer `processing_class` over the
# `tokenizer` argument - verify against the installed version.
trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

In [None]:
# Run fine-tuning. compute_metrics is invoked at each evaluation with
# eval_mode still False, so no text report is printed or written.
trainer.train()

## Saving model

In [None]:
# Save the model and tokenizer
# Persist the fine-tuned model and the tokenizer (with the added marker tokens)
# into the same directory; it is the same path as TrainingArguments.output_dir.
model_path = "saved_models/bert-base-uncased"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

## Evaluating Testset

In [None]:
# Final test-set evaluation. Setting eval_mode makes compute_metrics print the
# text reports and write them to the results file.
# NOTE(review): eval_mode is only reset in the following cell, so it stays True
# if this call raises or that cell is skipped.
model_final_name = 'saved_models/bert-base-uncased'
eval_mode = True
print('Evaluate:', model_final_name)
trainer.evaluate(tokenized_datasets['test'])

In [None]:
# Restore the default so later compute_metrics calls skip the text reports and
# the file write.
eval_mode = False