https://www.notion.so/msalnikov/b0b68b3db11b4c40a4bada127bfde310?v=635216a0f3d646d58fde31f60cc9e4c9&p=1dc75fe61b104fc0aba5ae00c01b9a29&pm=c

In [1]:
import datasets
import os
import pandas as pd
import evaluate
import numpy as np

import torch
import transformers
from transformers import TrainingArguments, Trainer

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
dataset = datasets.load_dataset('AmazonScience/mintaka')
dataset

No config specified, defaulting to: mintaka/en
Found cached dataset mintaka (/root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'lang', 'question', 'answerText', 'category', 'complexityType', 'questionEntity', 'answerEntity'],
        num_rows: 14000
    })
    validation: Dataset({
        features: ['id', 'lang', 'question', 'answerText', 'category', 'complexityType', 'questionEntity', 'answerEntity'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['id', 'lang', 'question', 'answerText', 'category', 'complexityType', 'questionEntity', 'answerEntity'],
        num_rows: 4000
    })
})

In [4]:
train_df = dataset['train'].to_pandas()

complexity_types = train_df['complexityType'].unique().tolist()
num_labels = len(complexity_types)
print('Number of labels: ', num_labels, '\n\nLabels:')
for idx, label in enumerate(complexity_types):
    print(f"\t{idx:3}: {label}")


Number of labels:  9 

Labels:
	  0: ordinal
	  1: intersection
	  2: generic
	  3: superlative
	  4: yesno
	  5: comparative
	  6: multihop
	  7: difference
	  8: count


In [5]:
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-large-uncased")
model = transformers.BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=num_labels)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [6]:
def convert_to_features(
    example_batch,
    tokenizer,
    question_feature_name: str = "question",
):
    """convert_to_features function for HF dataset for applying tokenizer

    Args:
        example_batch (Dict): HF Dataset batch
        tokenizer (PreTrainedTokenizer): HF Tokenizer
        question_feature_name (str): Name of column with quesions

    Returns:
        Dict: HF Dataset tokenized batch
    """
    input_encodings = tokenizer(
        example_batch[question_feature_name],
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    labels = torch.tensor(
        [complexity_types.index(t) for t in example_batch['complexityType']],
        dtype=torch.long,
    )

    encodings = {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels,
    }

    return encodings


dataset = dataset.map(
    lambda examples: convert_to_features(examples, tokenizer),
    batched=True,
)
columns = [
    "input_ids",
    "labels",
    "attention_mask",
]
dataset.set_format(type="torch", columns=columns)

Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-67fa482011cbfed8.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-3a6dff0fa9cdb368.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-f5b3c57c6cf3d86a.arrow


In [7]:
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='weighted')

In [8]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    torch_compile=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=18,
    evaluation_strategy='steps',
    eval_steps=2000,
    log_level='info',
    save_steps=2000,
    save_total_limit=1,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)

In [9]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: answerText, questionEntity, category, lang, answerEntity, id, question, complexityType. If answerText, questionEntity, category, lang, answerEntity, id, question, complexityType are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 2.3379404544830322,
 'eval_f1': 0.01821493624772313,
 'eval_runtime': 4.7269,
 'eval_samples_per_second': 423.11,
 'eval_steps_per_second': 26.444}

In [10]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: answerText, questionEntity, category, lang, answerEntity, id, question, complexityType. If answerText, questionEntity, category, lang, answerEntity, id, question, complexityType are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,000
  Num Epochs = 18
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 15,750
  Number of trainable parameters = 335,151,113


Step,Training Loss,Validation Loss


TrainOutput(global_step=15750, training_loss=2.1741561879960316, metrics={'train_runtime': 2469.6316, 'train_samples_per_second': 102.04, 'train_steps_per_second': 6.377, 'total_flos': 2.9356532255232e+16, 'train_loss': 2.1741561879960316, 'epoch': 18.0})

In [None]:
trainer.evaluate()