In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from torch.utils.data.sampler import WeightedRandomSampler
import torch
import datasets
import random
import numpy as np
import evaluate
import os

2023-07-27 17:21:06.692699: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-27 17:21:06.914617: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-27 17:21:07.594295: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-27 17:21:07.594370: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [2]:
# Expose only GPU 0 to this process.
# NOTE(review): this only takes effect if it runs before CUDA is first initialised — keep it ahead of any GPU work.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# Seed every random number generator used in this notebook with the same value
# so that weight initialisation and sampling are repeatable between runs.
SEED = 8
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


# Metric bundle used by compute_metrics; it currently holds a single metric,
# balanced accuracy, loaded by its hub identifier.
METRIC_CLASSIFIER = evaluate.combine(["hyperml/balanced_accuracy"])

In [4]:
# Label vocabulary of the classifier. Any complexity type that is not listed
# here is folded into the catch-all 'other' class by convert_to_features.
complexityTypeToId = {
    'count': 0,
    'yesno': 1,
    'other': 2,
}


def convert_to_features(
    example_batch,
    tokenizer,
    question_feature_name: str = "question",
    label_feature_name: str = "complexityType",
    max_length: int = 128,
):
    """Tokenize a batch of questions and encode their complexity-type labels.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        example_batch (Dict): HF Dataset batch (column name -> list of values)
        tokenizer (PreTrainedTokenizer): HF Tokenizer (any callable with the
            same call signature works)
        question_feature_name (str): Name of column with questions
        label_feature_name (str): Name of column with labels
        max_length (int): Length every question is padded / truncated to

    Returns:
        Dict: HF Dataset tokenized batch with the keys ``input_ids``,
        ``attention_mask`` and ``labels`` (a ``torch.LongTensor`` of class ids)
    """
    input_encodings = tokenizer(
        example_batch[question_feature_name],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Labels missing from the vocabulary fall back to the 'other' class.
    fallback_id = complexityTypeToId['other']
    labels = torch.LongTensor(
        [
            complexityTypeToId.get(label, fallback_id)
            for label in example_batch[label_feature_name]
        ]
    )

    encodings = {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels,
    }

    return encodings

In [5]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint. The
# classification head is sized to the label vocabulary; the load log reports
# its weights as newly initialised, so it has to be trained before use.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModelForSequenceClassification.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2", num_labels=len(complexityTypeToId)
)

Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a

In [6]:
# Load the English configuration of Mintaka and tokenize every split in
# batches, then expose only the model inputs and labels as torch tensors.
ds = datasets.load_dataset("AmazonScience/mintaka", "en").map(
    lambda examples: convert_to_features(examples, tokenizer),
    batched=True,
)
columns = ["input_ids", "labels", "attention_mask"]
ds.set_format(type="torch", columns=columns)

Found cached dataset mintaka (/root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-8cc068011e65f709.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-33301831a0c26da8.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d/cache-34c372dde80482b4.arrow


In [7]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with METRIC_CLASSIFIER.

    Args:
        eval_pred: Pair that unpacks into (model outputs, reference labels).
            The outputs are reduced with an argmax over axis 1, so they are
            assumed to be per-class scores of shape (n_examples, n_classes)
            — TODO confirm against the Trainer in use.

    Returns:
        Whatever ``METRIC_CLASSIFIER.compute`` returns for the predicted
        class ids and the reference labels.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return METRIC_CLASSIFIER.compute(predictions=predicted_ids, references=references)

In [8]:
def create_sampler(target):
    """Build a sampler that draws every class with equal total probability.

    Each example is weighted by the inverse of its class frequency, so rare
    classes are over-sampled and frequent classes under-sampled.

    Args:
        target: Array-like of class labels, one per training example. Labels
            do not have to be contiguous or start at zero.

    Returns:
        WeightedRandomSampler: Sampler that draws ``len(target)`` indices per
        pass using the per-example weights.
    """
    target = np.asarray(target).ravel()

    # `class_index` maps every example to the position of its label in the
    # sorted unique labels, which is also how `class_sample_count` is ordered.
    # Indexing the weights by the raw label value instead is only correct when
    # the labels happen to be exactly 0..k-1.
    _, class_index, class_sample_count = np.unique(
        target, return_inverse=True, return_counts=True
    )
    weight = 1.0 / class_sample_count
    samples_weight = torch.from_numpy(weight[class_index.ravel()])

    return WeightedRandomSampler(samples_weight, len(samples_weight))


In [9]:
class CustomTrainer(Trainer):
    """Trainer whose training batches are drawn with class-balanced sampling."""

    def get_train_dataloader(self) -> torch.utils.data.DataLoader:
        """Return a training DataLoader backed by a class-balancing sampler.

        The dataset and batch size come from the trainer itself (the
        ``train_dataset`` it was constructed with and its TrainingArguments)
        instead of notebook globals, so the configuration passed to the
        trainer is what is actually used.

        Returns:
            torch.utils.data.DataLoader: Loader over ``self.train_dataset``
            using the sampler built by ``create_sampler``.

        Raises:
            ValueError: If the trainer was created without a train dataset.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # Assumes the dataset is torch-formatted so that the 'labels' column
        # is returned as a tensor — TODO confirm if the format changes.
        train_labels = self.train_dataset['labels'].numpy().astype(int).ravel()
        train_sampler = create_sampler(train_labels)

        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
        )

In [10]:
training_args = TrainingArguments(
    # Where checkpoints are written; only a limited number is kept on disk.
    output_dir="/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/",
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Step-based evaluation, with logging and checkpointing on the same interval.
    evaluation_strategy="steps",
    logging_steps=250,
    save_steps=250,
    # Model selection: reload the checkpoint with the highest balanced accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="balanced_accuracy",
    greater_is_better=True,
    report_to="wandb",
)

# The validation split is used for all evaluation passes during training.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
)

In [11]:
# Evaluate once on the validation split before any training step, then run
# fine-tuning. train() is the last expression, so its TrainOutput is what the
# cell displays.
trainer.evaluate()
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmsalnikov[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Balanced Accuracy
250,0.587,0.222922,0.971667
500,0.1069,0.153056,0.979167
750,0.0715,0.134845,0.982083
1000,0.0633,0.130211,0.976042
1250,0.0475,0.126757,0.982917
1500,0.046,0.117246,0.981667
1750,0.0284,0.110915,0.979792
2000,0.0238,0.101008,0.974167


TrainOutput(global_step=2190, training_loss=0.11345933953376665, metrics={'train_runtime': 408.3032, 'train_samples_per_second': 171.441, 'train_steps_per_second': 5.364, 'total_flos': 4604484810240000.0, 'train_loss': 0.11345933953376665, 'epoch': 5.0})

In [12]:
# Re-evaluate on the validation split after training.
# NOTE(review): load_best_model_at_end=True was requested, so this presumably
# scores the best checkpoint rather than the last training step — confirm.
trainer.evaluate()

{'eval_loss': 0.12675663828849792,
 'eval_balanced_accuracy': 0.9829166666666667,
 'eval_runtime': 2.9897,
 'eval_samples_per_second': 668.958,
 'eval_steps_per_second': 10.703,
 'epoch': 5.0}

In [13]:
# Persist the model and its tokenizer side by side so the directory can be
# loaded back as a single self-contained checkpoint.
checkpoint_best_path = "/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best"
model.save_pretrained(save_directory=checkpoint_best_path)
tokenizer.save_pretrained(save_directory=checkpoint_best_path)

('/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/tokenizer_config.json',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/special_tokens_map.json',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/vocab.txt',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/added_tokens.json',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/tokenizer.json')

In [None]:
# Publish the fine-tuned model and its tokenizer under the same Hub repository id.
# NOTE(review): presumably needs Hub credentials with write access to this
# repository — confirm before running.
model.push_to_hub("s-nlp/mintaka_question_complexity_type_classifier")
tokenizer.push_to_hub("s-nlp/mintaka_question_complexity_type_classifier")