In [1]:
import os
import sys

import torch

# Add the project checkout to the import path.
sys.path.append("/workspace/kbqa/")  # go to parent dir

# CUDA reads CUDA_VISIBLE_DEVICES when it is first initialised in the process,
# so the restriction to GPU 0 has to be in place *before* the availability
# check below touches CUDA; setting it afterwards is silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from torch.utils.data.sampler import WeightedRandomSampler
import torch
import datasets
import random
import numpy as np
import evaluate
import os

2023-08-18 08:20:57.526467: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-18 08:20:57.705657: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-08-18 08:20:58.396854: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-08-18 08:20:58.396969: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [3]:
# Expose only GPU index 0 to CUDA for this process.
# NOTE(review): the first cell already calls torch.cuda.is_available(); CUDA
# normally reads CUDA_VISIBLE_DEVICES only when it is first initialised, so an
# assignment made this late may have no effect — confirm, and make sure the
# variable is set before the first CUDA query.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
# Fix the seed (8) of every random number generator used in this notebook:
# NumPy's global generator, Python's `random` module and PyTorch.
np.random.seed(8)
random.seed(8)
torch.manual_seed(8)


# Metric bundle used by `compute_metrics`; currently it only contains the
# "hyperml/balanced_accuracy" metric loaded through `evaluate.combine`.
METRIC_CLASSIFIER = evaluate.combine(["hyperml/balanced_accuracy"])

In [5]:
# Class ids for the question-type classifier. Every complexity type that is
# not listed here is folded into "other" by `convert_to_features`.
complexityTypeToId = {
    "count": 0,
    "yesno": 1,
    "other": 2,
}


def convert_to_features(
    example_batch,
    tokenizer,
    question_feature_name: str = "question",
    label_feature_name: str = "complexityType",
    max_length: int = 128,
):
    """Tokenize a batch of questions and attach integer class labels.

    Intended to be used with ``datasets.Dataset.map(..., batched=True)``.

    Args:
        example_batch (Dict): HF Dataset batch (column name -> list of values)
        tokenizer (PreTrainedTokenizer): HF Tokenizer; called with the list of
            questions and ``padding="max_length"``, ``truncation=True``
        question_feature_name (str): Name of column with questions
        label_feature_name (str): Name of column with labels; values missing
            from ``complexityTypeToId`` are mapped to the "other" class
        max_length (int): Length every sequence is padded / truncated to

    Returns:
        Dict: HF Dataset tokenized batch with the keys ``input_ids``,
        ``attention_mask`` and ``labels`` (a ``torch.LongTensor``)
    """
    input_encodings = tokenizer(
        example_batch[question_feature_name],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Unknown complexity types collapse into the catch-all "other" class.
    fallback_id = complexityTypeToId["other"]
    labels = torch.LongTensor(
        [
            complexityTypeToId.get(label, fallback_id)
            for label in example_batch[label_feature_name]
        ]
    )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels,
    }

In [6]:
# Load the pretrained sentence encoder and put a freshly initialised
# classification head with one output per question type on top of it.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModelForSequenceClassification.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2",
    num_labels=len(complexityTypeToId),
    # Store the human-readable class names in the model config so that the
    # saved / published checkpoint reports "count" / "yesno" / "other" instead
    # of generic LABEL_0 / LABEL_1 / LABEL_2.
    id2label={class_id: name for name, class_id in complexityTypeToId.items()},
    label2id=dict(complexityTypeToId),
)

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Fetch the English configuration of Mintaka and tokenize every split in
# batches; the tokenizer is handed to `convert_to_features` as a keyword.
ds = datasets.load_dataset("AmazonScience/mintaka", "en").map(
    convert_to_features,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
)

# Only these columns are returned (as torch tensors) when the dataset is indexed.
columns = ["input_ids", "labels", "attention_mask"]
ds.set_format(type="torch", columns=columns)

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [8]:
def compute_metrics(eval_pred):
    """Score model outputs with ``METRIC_CLASSIFIER``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are
            presumably per-class scores of shape (n_examples, n_classes), as
            the arg-max is taken along axis 1.

    Returns:
        Whatever ``METRIC_CLASSIFIER.compute`` returns for the predicted class
        ids and the reference labels.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return METRIC_CLASSIFIER.compute(
        predictions=predicted_ids,
        references=references,
    )

In [11]:
from torch.utils.data.sampler import WeightedRandomSampler
import numpy as np


class CustomTrainer(Trainer):
    """Trainer whose training sampler draws every class with equal probability.

    Each training example is weighted by the inverse frequency of its class
    and examples are drawn with a ``WeightedRandomSampler``, so rare question
    types are seen as often as frequent ones.
    """

    def get_labels(self, dataset=None):
        """Collect the integer class label of every example.

        Args:
            dataset: Dataset to read the "labels" field from; defaults to
                ``self.train_dataset``.

        Returns:
            List[int]: One label per example, in dataset order.
        """
        if dataset is None:
            dataset = self.train_dataset
        # int() accepts 0-d tensors as well as plain Python / NumPy integers,
        # so no explicit tensor -> numpy round-trip is needed.
        return [int(example["labels"]) for example in dataset]

    def _get_train_sampler(self, train_dataset=None) -> torch.utils.data.Sampler:
        """Return the class-balanced sampler used for the training dataloader.

        Args:
            train_dataset: Optional dataset to sample from. Newer transformers
                releases pass the dataset as an argument, older ones call this
                hook without arguments; both are supported.
        """
        return self.create_sampler(self.get_labels(train_dataset))

    def create_sampler(self, target):
        """Build a sampler that weights examples by inverse class frequency.

        Args:
            target: Sequence of class labels, one per training example.

        Returns:
            WeightedRandomSampler: Draws ``len(target)`` indices per epoch
            (with replacement), each with probability proportional to
            ``1 / count(class of the example)``.

        Raises:
            ValueError: If ``target`` is empty.
        """
        target = np.asarray(target)
        if target.size == 0:
            raise ValueError("Cannot build a weighted sampler from an empty label list")

        # `class_index` maps every example to the position of its class in the
        # sorted unique labels, so the lookup stays correct even when the
        # label ids present in `target` are not exactly 0..k-1.
        _, class_index, class_count = np.unique(
            target, return_inverse=True, return_counts=True
        )
        samples_weight = torch.from_numpy(1.0 / class_count[class_index]).double()

        return WeightedRandomSampler(samples_weight, len(samples_weight))

In [13]:
# Fine-tuning configuration. Keyword arguments are grouped by purpose.
# NOTE(review): no `seed` is passed here; per the transformers documentation
# TrainingArguments has its own default seed which the Trainer applies, and
# that would supersede the manual seeding with 8 done earlier — confirm and
# pass `seed=8` if that value was meant to govern training.
training_args = TrainingArguments(
    # Checkpointing: where to write and how many checkpoints to keep.
    output_dir="/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/",
    save_total_limit=1,
    save_steps=250,
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluation and best-model selection on balanced accuracy.
    evaluation_strategy="steps",
    per_device_eval_batch_size=64,
    metric_for_best_model="balanced_accuracy",
    greater_is_better=True,
    load_best_model_at_end=True,
    # Logging.
    logging_steps=250,
    report_to="wandb",
)

# Train on the "train" split and evaluate on "validation", using the
# class-balanced sampler implemented by CustomTrainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=ds["validation"],
    train_dataset=ds["train"],
)

In [14]:
# Run fine-tuning with the configuration held by `trainer`.
# NOTE(review): the output saved with this cell ends in KeyboardInterrupt
# after the first evaluation step — re-run to completion before relying on
# the artefacts produced further down.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mhle2000[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Balanced Accuracy
250,0.5877,0.191927,0.969167


KeyboardInterrupt: 

In [12]:
# Evaluate the current model on the trainer's eval_dataset (the validation split).
# NOTE(review): the saved execution counts are out of order (this cell is
# In [12] while the training cell above is In [14]), so the metrics shown
# here may stem from a different run than the interrupted one — re-run the
# notebook top to bottom to confirm.
trainer.evaluate()

{'eval_loss': 0.12675663828849792,
 'eval_balanced_accuracy': 0.9829166666666667,
 'eval_runtime': 2.9897,
 'eval_samples_per_second': 668.958,
 'eval_steps_per_second': 10.703,
 'epoch': 5.0}

In [13]:
# Persist the in-memory model weights and the tokenizer files side by side in
# a "checkpoint-best" folder under the training output directory.
# NOTE(review): this saves whatever `model` currently holds; it is the best
# checkpoint only if training finished with load_best_model_at_end — confirm.
checkpoint_best_path = "/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best"
model.save_pretrained(checkpoint_best_path)
tokenizer.save_pretrained(checkpoint_best_path)

('/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/tokenizer_config.json',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/special_tokens_map.json',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/vocab.txt',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/added_tokens.json',
 '/mnt/storage/QA_System_Project/mintaka_question_type_classifier_runs/sbert/checkpoint-best/tokenizer.json')

In [None]:
# Publish the model and its tokenizer to the same Hugging Face Hub repository.
# NOTE(review): presumably requires being logged in with write access to the
# "s-nlp" organisation — verify before running.
model.push_to_hub("s-nlp/mintaka_question_complexity_type_classifier")
tokenizer.push_to_hub("s-nlp/mintaka_question_complexity_type_classifier")