Labels preparation

In [14]:
import pandas as pd
from datasets import Dataset

from multitask_training.BertForHierarchicalClassification import BertForHierarchicalClassification

df = pd.read_csv('./data_cleaned_manual_combined.csv')

labels_gs = df['Global Subject'].unique()
labels_qi = df['Question Intent'].unique()

id2label_gs = {i: label for i, label in enumerate(labels_gs)}
label2id_gs = {label: i for i, label in enumerate(labels_gs)}

id2label_qi = {i: label for i, label in enumerate(labels_qi)}
label2id_qi = {label: i for i, label in enumerate(labels_qi)}

Now we'll preprocess all the data by encoding it

In [15]:
from transformers import AutoTokenizer
import numpy as np

# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained
# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/bert#transformers.BertTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def preprocess_data(example):
    question = example['Question']

    # https://huggingface.co/docs/transformers/v4.46.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    encodings = tokenizer(question, padding="max_length", truncation=True, max_length=128)
    label_gs = label2id_gs[example['Global Subject']]
    label_qi = label2id_qi[example['Question Intent']]

    encodings.update({'labels_main': label_gs, 'labels_sub': label_qi})

    return encodings

In [16]:
dataset = Dataset.from_pandas(df)

tokenized_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names)
tokenized_dataset.set_format("torch")

tokenized_dataset

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels_main', 'labels_sub'],
    num_rows: 855
})

In [17]:
train_testvalid = tokenized_dataset.train_test_split(test_size=0.8, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
eval_dataset = test_valid['train']
test_dataset = test_valid['test']

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    # Unpack predictions and labels
    predictions, labels = eval_pred

    # If predictions and labels are tuples, unpack them
    if isinstance(predictions, tuple):
        logits_main, logits_sub = predictions
    else:
        # If not a tuple, handle accordingly (unlikely in this case)
        logits_main, logits_sub = predictions

    if isinstance(labels, tuple):
        labels_main, labels_sub = labels
    else:
        # If labels are not a tuple, they might be a dict or array
        labels_main = labels['labels_main']
        labels_sub = labels['labels_sub']

    # Convert logits to predicted class indices
    preds_main = np.argmax(logits_main, axis=1)
    preds_sub = np.argmax(logits_sub, axis=1)

    # Compute metrics for main topic
    acc_main = accuracy_score(labels_main, preds_main)
    precision_main, recall_main, f1_main, _ = precision_recall_fscore_support(
        labels_main, preds_main, average='weighted', zero_division=0
    )

    # Compute metrics for subtopic
    acc_sub = accuracy_score(labels_sub, preds_sub)
    precision_sub, recall_sub, f1_sub, _ = precision_recall_fscore_support(
        labels_sub, preds_sub, average='weighted', zero_division=0
    )

    # Combine metrics into a dictionary
    metrics = {
        'accuracy_main': acc_main,
        'precision_main': precision_main,
        'recall_main': recall_main,
        'f1_main': f1_main,
        'accuracy_sub': acc_sub,
        'precision_sub': precision_sub,
        'recall_sub': recall_sub,
        'f1_sub': f1_sub,
    }

    return metrics


In [19]:
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"Current CUDA device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")


CUDA available: True
Number of GPUs: 1
Current CUDA device: 0
Device name: NVIDIA GeForce RTX 3070 Ti


In [20]:
import wandb

wandb.init(project='tesi')

0,1
eval/accuracy_main,▁▄▅▆▆▇▇▇██▇▇██▇██████
eval/accuracy_sub,▁▁▃▄▅▅▆▆▆▇▆▇▇▇▇██████
eval/f1_main,▁▅▆▆▇▇▇▇███▇██▇██████
eval/f1_sub,▁▁▃▄▅▅▆▆▆▇▆▇▇▇███████
eval/loss,█▆▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
eval/precision_main,▁▅▆▇▇▇▇▇███▇██▇██████
eval/precision_sub,▁▂▃▄▅▅▆▆▆▇▆▆▇████████
eval/recall_main,▁▄▅▆▆▇▇▇██▇▇██▇██████
eval/recall_sub,▁▁▃▄▅▅▆▆▆▇▆▇▇▇▇██████
eval/runtime,▄▂▃▃▃▅▃▂▁▁▇▆▃▂▂▄▃▃▂▅█

0,1
eval/accuracy_main,0.85882
eval/accuracy_sub,0.67059
eval/f1_main,0.85909
eval/f1_sub,0.62389
eval/loss,1.90889
eval/precision_main,0.8623
eval/precision_sub,0.63333
eval/recall_main,0.85882
eval/recall_sub,0.67059
eval/runtime,0.2623


In [21]:
from transformers import TrainingArguments, Trainer

model = BertForHierarchicalClassification.from_pretrained(
    # 'bert-base-uncased',
    'distilbert-base-uncased',
    num_main_topics=len(labels_gs),
    num_subtopics=len(labels_qi)
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=2e-5,
    warmup_ratio=0.1,  # Warmup for the first 10% of steps
    lr_scheduler_type='linear',  # Linear scheduler
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy='epoch',
    logging_strategy='epoch',
    eval_strategy='epoch',
    logging_dir='./logs',
    load_best_model_at_end=True,  # Load the best model at the end based on evaluation metric
    metric_for_best_model='f1_sub',  # Use subtopic F1-score to determine the best model
    greater_is_better=True,  # Higher metric indicates a better model,
    report_to='wandb'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print(f"Trainer is using device: {trainer.args.device}")

trainer.train()

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForHierarchicalClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier_main.bias', 'classifier_main.weight', 'classifier_sub.bias', 'classifier_sub.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.va

Trainer is using device: cuda:0


Epoch,Training Loss,Validation Loss,Accuracy Main,Precision Main,Recall Main,F1 Main,Accuracy Sub,Precision Sub,Recall Sub,F1 Sub
1,5.5608,5.324038,0.263158,0.069252,0.263158,0.109649,0.154971,0.024016,0.154971,0.041587
2,5.2886,5.14952,0.374269,0.348022,0.374269,0.29611,0.154971,0.024016,0.154971,0.041587
3,5.1061,5.090141,0.365497,0.162433,0.365497,0.218671,0.192982,0.043004,0.192982,0.070041
4,4.8109,4.728669,0.438596,0.450157,0.438596,0.371537,0.210526,0.095824,0.210526,0.114372
5,4.3946,4.426216,0.482456,0.424773,0.482456,0.429709,0.280702,0.180224,0.280702,0.177347
6,4.0821,4.175373,0.573099,0.598022,0.573099,0.526788,0.280702,0.115591,0.280702,0.158887
7,3.7188,3.890641,0.643275,0.655725,0.643275,0.635411,0.318713,0.166497,0.318713,0.201344
8,3.4291,3.684939,0.716374,0.729106,0.716374,0.716613,0.347953,0.198069,0.347953,0.235117
9,3.2152,3.571958,0.71345,0.741998,0.71345,0.709942,0.365497,0.239887,0.365497,0.252559
10,2.9813,3.442618,0.74269,0.753891,0.74269,0.743847,0.380117,0.253652,0.380117,0.270817


TrainOutput(global_step=220, training_loss=3.3088507305492056, metrics={'train_runtime': 78.2865, 'train_samples_per_second': 43.686, 'train_steps_per_second': 2.81, 'total_flos': 225050844441600.0, 'train_loss': 3.3088507305492056, 'epoch': 20.0})

In [22]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 3.078139305114746, 'eval_accuracy_main': 0.7514619883040936, 'eval_precision_main': 0.7590364719806276, 'eval_recall_main': 0.7514619883040936, 'eval_f1_main': 0.7516241836796272, 'eval_accuracy_sub': 0.4590643274853801, 'eval_precision_sub': 0.3496699982690117, 'eval_recall_sub': 0.4590643274853801, 'eval_f1_sub': 0.3597683150431605, 'eval_runtime': 0.8713, 'eval_samples_per_second': 392.511, 'eval_steps_per_second': 25.249, 'epoch': 20.0}


In [23]:
trainer.save_model(f'./models/{wandb.run.name}')

In [24]:
predictions = trainer.predict(test_dataset)

# Access the predictions and true labels
# logits_main = predictions.predictions['logits_main']
# logits_sub = predictions.predictions['logits_sub']
# labels_main = predictions.label_ids['labels_main']
# labels_sub = predictions.label_ids['labels_sub']
# 
# # Convert logits to predicted class labels
# preds_main = np.argmax(logits_main, axis=1)
# preds_sub = np.argmax(logits_sub, axis=1)
predictions.metrics

{'test_loss': 3.283393383026123,
 'test_accuracy_main': 0.7280701754385965,
 'test_precision_main': 0.740716548903624,
 'test_recall_main': 0.7280701754385965,
 'test_f1_main': 0.7304996561138977,
 'test_accuracy_sub': 0.39473684210526316,
 'test_precision_sub': 0.25404933512683364,
 'test_recall_sub': 0.39473684210526316,
 'test_f1_sub': 0.27758337527090454,
 'test_runtime': 0.8877,
 'test_samples_per_second': 385.266,
 'test_steps_per_second': 24.783}