In [1]:
import os
import sys

# Make the parent of the current working directory importable (presumably
# where the local `trc_model` package imported in the next cell lives --
# confirm against the repository layout).
# The original joined "." onto the directory, which abspath() normalises away,
# so the resulting path is unchanged. The membership guard keeps sys.path free
# of duplicate entries when this cell is re-run in the same kernel.
_project_root = os.path.abspath(os.path.dirname(os.getcwd()))
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
from datasets import load_dataset
from sklearn.metrics import classification_report
import numpy as np
from trc_model.temporal_relation_classification import TemporalRelationClassification
from trc_model.temporal_relation_classification_config import TemporalRelationClassificationConfig
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, \
    AutoModelForSequenceClassification

In [3]:
# Fetch the Hebrew temporal-relation dataset (served from the local
# Hugging Face cache when already downloaded). Later cells index the result
# by split name: raw_datasets['train'] and raw_datasets['test'].
raw_datasets = load_dataset(path="guyyanko/trc-hebrew")

Found cached dataset csv (/Users/guy.yanko/.cache/huggingface/datasets/guyyanko___csv/guyyanko--trc-hebrew-9dd0114a08465c4a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Derive both label lookup tables from the training split: every row carries
# a `label` value and its `named_label` string, so pairing the two columns
# yields the mapping in each direction (later rows overwrite earlier ones,
# exactly as a row-by-row loop would).
train_split = raw_datasets['train']
label_ids = train_split['label']
label_names = train_split['named_label']

label2id = dict(zip(label_names, label_ids))
id2label = dict(zip(label_ids, label_names))

In [5]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Relies on the module-level ``tokenizer`` that the training loop assigns
    before calling ``raw_datasets.map``; this function must not be invoked
    before that assignment has happened.

    Args:
        examples: Mapping with a ``"text"`` entry holding the input text(s).

    Returns:
        Whatever ``tokenizer`` returns for those texts with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [6]:
# Module-level switch read by compute_metrics(). While it is True, each call
# also prints the full per-class report and writes it to disk. The training
# loop flips it on only around the final evaluation.
eval_mode = False


def compute_metrics(eval_preds):
    """Convert raw model outputs into weighted-average classification metrics.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. ``predictions`` is
            arg-maxed over axis 1, so it must be a 2-D score array
            (presumably logits of shape (n_examples, n_classes) -- confirm).

    Returns:
        dict: The ``'weighted avg'`` entry of scikit-learn's classification
        report with its ``'support'`` key removed.

    Side effects:
        When the global ``eval_mode`` is truthy, the textual report is printed
        and written to ``{model_final_name}/evaluation_report.txt``.
        ``model_final_name`` is a global assigned by the training loop; the
        directory is created here if it does not exist yet.
    """
    scores, labels = eval_preds
    predictions = np.argmax(scores, axis=1)

    # NOTE(review): the name order is assumed to match label ids 0..3 --
    # confirm against the dataset's id2label mapping.
    target_names = ['BEFORE', 'AFTER', 'EQUAL', 'VAGUE']
    report_kwargs = dict(
        y_true=labels,
        y_pred=predictions,
        # Pin the label set explicitly: without it scikit-learn infers the
        # classes from the data and raises ValueError whenever fewer than
        # len(target_names) classes occur in labels/predictions.
        labels=list(range(len(target_names))),
        target_names=target_names,
    )

    if eval_mode:
        report = classification_report(**report_kwargs)
        # The output directory is not guaranteed to exist at this point
        # (nothing is saved during training), so create it before writing.
        os.makedirs(model_final_name, exist_ok=True)
        with open(f'{model_final_name}/evaluation_report.txt', 'w', encoding='utf-8') as f:
            f.write(report)
        print(report)

    results = classification_report(output_dict=True, **report_kwargs)['weighted avg']
    results.pop('support')
    return results

In [7]:
# Pretrained language-model checkpoints to fine-tune; each id is passed to
# AutoTokenizer.from_pretrained and to the model config as name_or_path.
lm_checkpoints = [
    'onlplab/alephbert-base',
    'avichr/heBERT',
    'imvladikon/alephbertgimmel-base-512',
]

# Architecture identifiers handed to TemporalRelationClassificationConfig;
# one model is trained per (checkpoint, architecture) pair.
architectures = [
    'SEQ_CLS',
    'ESS',
    'EMP',
    'EF',
]

In [8]:
# Fine-tune, evaluate and save one model per (checkpoint, architecture) pair.
for checkpoint in lm_checkpoints:
    # Everything tokenizer-related depends only on the checkpoint, so it is
    # prepared once here rather than once per architecture.
    # `tokenizer` must stay a module-level name: preprocess_function reads it.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Register the four marker tokens (open/close for each of the two marked
    # spans) so they are kept as single tokens.
    tokenizer.add_special_tokens({'additional_special_tokens': ['[א1]', '[/א1]', '[א2]', '[/א2]']})
    # Ids of the two opening markers; passed to the model config below.
    E1_start = tokenizer.convert_tokens_to_ids('[א1]')
    E2_start = tokenizer.convert_tokens_to_ids('[א2]')
    tokenized_datasets = raw_datasets.map(preprocess_function, remove_columns=['named_label'], batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Bare class name of the tokenizer (e.g. "BertTokenizerFast").
    tokenizer_class = type(tokenizer).__name__

    for arc in architectures:
        # Also used as the output directory and read by compute_metrics()
        # when it writes the evaluation report.
        model_final_name = f'hebrew-trc-{checkpoint.split("/")[1]}-{arc}'

        config = TemporalRelationClassificationConfig(EMS1=E1_start, EMS2=E2_start, architecture=arc,
                                                      num_labels=len(label2id),
                                                      id2label=id2label,
                                                      label2id=label2id, name_or_path=checkpoint,
                                                      tokenizer_class=tokenizer_class)

        # Disabled baseline alternative:
        # model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=checkpoint)
        model = TemporalRelationClassification(config=config)
        # The tokenizer grew by the marker tokens added above, so the
        # embedding matrix has to be resized to match.
        model.bert.resize_token_embeddings(len(tokenizer))

        training_args = TrainingArguments(
            output_dir=model_final_name,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            weight_decay=0.01,
            num_train_epochs=20,
            evaluation_strategy="epoch",
            save_strategy="no",
            report_to=[],
            # NOTE(review): ties the run to Apple's MPS backend -- confirm
            # before running on other hardware.
            use_mps_device=True
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.train()

        # Switch compute_metrics() into report-writing mode for the final
        # evaluation only. The finally clause guarantees the flag is reset
        # even if evaluation fails or is interrupted, so the next training
        # run does not write a report after every epoch.
        eval_mode = True
        try:
            print('Evaluate:', model_final_name)
            trainer.evaluate(tokenized_datasets['test'])
        finally:
            eval_mode = False

        config.register_for_auto_class()
        model.register_for_auto_class('AutoModelForSequenceClassification')
        # Optional: publish instead of (or in addition to) saving locally.
        # trainer.push_to_hub()
        trainer.save_model(model_final_name)

Loading cached processed dataset at /Users/guy.yanko/.cache/huggingface/datasets/guyyanko___csv/guyyanko--trc-hebrew-9dd0114a08465c4a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-32975abda20aba62.arrow
Loading cached processed dataset at /Users/guy.yanko/.cache/huggingface/datasets/guyyanko___csv/guyyanko--trc-hebrew-9dd0114a08465c4a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-47b584f6b5edfcd2.arrow
The following columns in the training set don't have a corresponding argument in `TemporalRelationClassification.forward` and have been ignored: text. If text are not expected by `TemporalRelationClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5826
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7300
  Number of trainable 

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 