In [7]:
import pandas as pd
from datasets import Dataset, concatenate_datasets

df = pd.read_csv('./data_cleaned_manual_combined.csv')

labels_gs = df['Global Subject'].unique()
labels_qi = df['Question Intent'].unique()

id2label_gs = {i: label for i, label in enumerate(labels_gs)}
label2id_gs = {label: i for i, label in enumerate(labels_gs)}

id2label_qi = {i: label for i, label in enumerate(labels_qi)}
label2id_qi = {label: i for i, label in enumerate(labels_qi)}

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained
# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/bert#transformers.BertTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def preprocess_data(example):
    question = example['Question']

    # https://huggingface.co/docs/transformers/v4.46.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    encodings = tokenizer(question, padding="max_length", truncation=True, max_length=128)
    label_gs = label2id_gs[example['Global Subject']]
    label_qi = label2id_qi[example['Question Intent']]

    encodings.update({'labels': label_gs, 'labels_sub': label_qi})

    return encodings

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
from datasets import Dataset

# Assuming df is your DataFrame and it has a column 'Global Subject' for stratification
df = pd.read_csv('./data_cleaned_manual_combined.csv')

# Create the stratified split
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df['Global Subject']):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

# Reset index of strat_test_set
strat_test_set = strat_test_set.reset_index(drop=True)

# Further split the test set into validation and test sets
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for val_index, test_index in split.split(strat_test_set, strat_test_set['Global Subject']):
    strat_val_set = strat_test_set.loc[val_index]
    strat_test_set = strat_test_set.loc[test_index]

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(strat_train_set)
eval_dataset = Dataset.from_pandas(strat_val_set)
test_dataset = Dataset.from_pandas(strat_test_set)

# Tokenize the datasets
train_dataset = train_dataset.map(preprocess_data, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_data, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, remove_columns=test_dataset.column_names)

train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

Map:   0%|          | 0/683 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    metrics = {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
    return metrics

In [11]:
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"Current CUDA device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

CUDA available: True
Number of GPUs: 1
Current CUDA device: 0
Device name: NVIDIA GeForce RTX 3070 Ti


In [12]:
import wandb

wandb.init(project='tesi', tags=['hierarchical-pipeline'], notes='Model only for the main subject classification')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mstefano-porta800[0m ([33mstefano-porta800-universit-degli-studi-di-torino[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
from transformers import TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_gs))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=2e-5,
    warmup_ratio=0.1,  # Warmup for the first 10% of steps
    lr_scheduler_type='linear',  # Linear scheduler
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy='epoch',
    logging_strategy='epoch',
    eval_strategy='epoch',
    logging_dir='./logs',
    load_best_model_at_end=True,  # Load the best model at the end based on evaluation metric
    metric_for_best_model='f1',  # Use subtopic F1-score to determine the best model
    greater_is_better=True,  # Higher metric indicates a better model,
    report_to='wandb',
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print(f"Trainer is using device: {trainer.args.device}")

trainer.train()

wandb.finish()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer is using device: cuda:0


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9152,1.663243,0.411765,0.46092,0.411765,0.291785
2,1.4755,1.164157,0.647059,0.682899,0.647059,0.623515
3,0.9144,0.675352,0.870588,0.87166,0.870588,0.868165
4,0.4793,0.501129,0.858824,0.869447,0.858824,0.858726
5,0.2404,0.404944,0.882353,0.883806,0.882353,0.878151
6,0.1352,0.361241,0.894118,0.895722,0.894118,0.891858
7,0.0799,0.420352,0.905882,0.908959,0.905882,0.905457
8,0.0587,0.594881,0.870588,0.88551,0.870588,0.860954
9,0.0507,0.504575,0.905882,0.911045,0.905882,0.905057
10,0.0395,0.445815,0.905882,0.908959,0.905882,0.905457


VBox(children=(Label(value='0.025 MB of 0.025 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▄█▇██████▇█████████
eval/f1,▁▅█▇███▇██▇█████████
eval/loss,█▅▃▂▁▁▁▂▂▁▂▂▂▂▂▂▂▂▂▂
eval/precision,▁▄▇▇██████▇█████████
eval/recall,▁▄█▇██████▇█████████
eval/runtime,▆▁▅▅▄▄▄▅▅▇▆▅▄▃█▂▄▅▃▄
eval/samples_per_second,▂█▄▄▄▅▄▄▃▂▃▃▅▆▁▇▅▄▅▅
eval/steps_per_second,▂█▄▄▄▅▄▄▃▂▃▃▅▆▁▇▅▄▅▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.90588
eval/f1,0.90546
eval/loss,0.54788
eval/precision,0.90896
eval/recall,0.90588
eval/runtime,0.2227
eval/samples_per_second,381.651
eval/steps_per_second,26.94
total_flos,898564591488000.0
train/epoch,20.0


### Fine-tuning the model for the subtopic classification

In [18]:
import wandb


# Function to train a model for a specific global subject
def train_model_for_subject(global_subject, train_data, val_data):
    # Initialize a new wandb run for each model
    wandb.init(project='tesi', tags=['hierarchical-pipeline', global_subject],
               notes=f'Model for {global_subject} sub-topic classification')

    # Define the model and training arguments
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_qi))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    training_args = TrainingArguments(
        output_dir=f'./results_{global_subject}',
        num_train_epochs=20,
        # https://datascience.stackexchange.com/questions/64583/what-are-the-good-parameter-ranges-for-bert-hyperparameters-while-finetuning-it
        learning_rate=4e-5,
        lr_scheduler_type='linear',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_strategy='epoch',
        logging_strategy='epoch',
        eval_strategy='epoch',
        logging_dir=f'./logs_{global_subject}',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True,
        report_to='wandb',
        save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Finish the wandb run
    wandb.finish()


# Example usage
for global_subject in labels_gs:
    gs_id = label2id_gs[global_subject]
    print(f"Training model for global subject: {global_subject}")


    # Filter the dataset for the current global subject

    def filter_fn(x):
        return x['labels'] == gs_id


    # merge the two resulting datasets
    complete_data = concatenate_datasets([train_dataset.filter(filter_fn), eval_dataset.filter(filter_fn)])
    
    print(f"Complete dataset size: {len(complete_data)}")
    
    # if the complete data has only one class, skip the training
    if len(complete_data.unique('labels_sub')) == 1:
        print(f"Skipping training for {global_subject} as it has only one subtopic")
        continue
    
    complete_data = complete_data.class_encode_column('labels_sub')
    complete_data = complete_data.train_test_split(test_size=0.2, shuffle=True, seed=42,
                                                             stratify_by_column='labels_sub')

    subject_train_data = complete_data['train']
    subject_val_data = complete_data['test']

    print(f"Train dataset size: {len(subject_train_data)}, Validation dataset size: {len(subject_val_data)}")
    
    # remove the labels column and map the labels_sub column to labels
    subject_train_data = subject_train_data.remove_columns('labels').rename_column('labels_sub', 'labels')
    subject_val_data = subject_val_data.remove_columns('labels').rename_column('labels_sub', 'labels')
    
    # Train the model for the current global subject
    train_model_for_subject(global_subject, subject_train_data, subject_val_data)

Training model for global subject: start


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 17


Flattening the indices:   0%|          | 0/17 [00:00<?, ? examples/s]

Skipping training for start as it has only one subtopic
Training model for global subject: automaton


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 127


Flattening the indices:   0%|          | 0/127 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/127 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/127 [00:00<?, ? examples/s]

Train dataset size: 101, Validation dataset size: 26


VBox(children=(Label(value='0.028 MB of 0.028 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▃▂▇▆▆████
eval/f1,▁▂▂▆▆▆████
eval/loss,█▇▅▄▃▃▂▂▁▁
eval/precision,▁▂▃▅▅▅▇▇▇█
eval/recall,▁▃▂▇▆▆████
eval/runtime,▂▄▂▃▇▁▂▂█▃
eval/samples_per_second,▆▅▆▆▂█▇▇▁▆
eval/steps_per_second,▆▅▆▆▂█▇▇▁▆
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██

0,1
eval/accuracy,0.68293
eval/f1,0.61933
eval/loss,1.06647
eval/precision,0.57382
eval/recall,0.68293
eval/runtime,0.1101
eval/samples_per_second,372.246
eval/steps_per_second,27.238
train/epoch,10.0
train/global_step,110.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1471,2.749605,0.5,0.25,0.5,0.333333
2,2.5742,2.27955,0.5,0.25,0.5,0.333333
3,2.1045,1.980132,0.5,0.25,0.5,0.333333
4,1.9118,1.758918,0.5,0.25,0.5,0.333333
5,1.6461,1.584418,0.538462,0.375385,0.538462,0.399798
6,1.4726,1.413909,0.576923,0.386218,0.576923,0.443659
7,1.3002,1.318507,0.730769,0.555769,0.730769,0.624709
8,1.1617,1.192267,0.730769,0.544028,0.730769,0.620536
9,0.9805,1.083906,0.807692,0.70625,0.807692,0.742507
10,0.8015,1.00818,0.807692,0.70625,0.807692,0.742507


0,1
eval/accuracy,▁▁▁▁▂▃▆▆▇▇██████████
eval/f1,▁▁▁▁▂▃▆▅▇▇██████████
eval/loss,█▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁
eval/precision,▁▁▁▁▃▃▅▅████████████
eval/recall,▁▁▁▁▂▃▆▆▇▇██████████
eval/runtime,▃▁▁▂▆▅▂▁▅▃▄▂▂▄▃▂▁█▆▇
eval/samples_per_second,▅██▇▂▃▆█▄▆▄▇▇▅▆▇█▁▃▂
eval/steps_per_second,▅██▇▂▃▆█▄▆▄▇▇▅▆▇█▁▃▂
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.84615
eval/f1,0.78159
eval/loss,0.79095
eval/precision,0.73333
eval/recall,0.84615
eval/runtime,0.0751
eval/samples_per_second,346.18
eval/steps_per_second,26.629
total_flos,132915223802880.0
train/epoch,20.0


Training model for global subject: transition


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 202


Flattening the indices:   0%|          | 0/202 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/202 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/202 [00:00<?, ? examples/s]

Train dataset size: 161, Validation dataset size: 41


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112032888882418, max=1.0…

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3515,3.022951,0.243902,0.154839,0.243902,0.166468
2,2.8804,2.633957,0.390244,0.21144,0.390244,0.257445
3,2.4828,2.295789,0.317073,0.266814,0.317073,0.229804
4,2.0571,1.985682,0.609756,0.421138,0.609756,0.489976
5,1.7119,1.721367,0.585366,0.39789,0.585366,0.469472
6,1.4959,1.508317,0.585366,0.40331,0.585366,0.474475
7,1.1462,1.303472,0.658537,0.543219,0.658537,0.590548
8,0.9984,1.243428,0.658537,0.537644,0.658537,0.587212
9,0.781,1.12026,0.658537,0.537644,0.658537,0.587212
10,0.6556,1.066471,0.682927,0.57382,0.682927,0.619333


0,1
eval/accuracy,▁▃▂▆▆▆▇▇▇▇▇▇▇▇██▇▇█▇
eval/f1,▁▂▂▅▅▅▆▆▆▇▇▆▇▇▇█▇▇█▇
eval/loss,█▇▆▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁
eval/precision,▁▂▂▄▄▄▆▅▅▆▆▆▇▇▇█▇▇█▇
eval/recall,▁▃▂▆▆▆▇▇▇▇▇▇▇▇██▇▇█▇
eval/runtime,▆▄▇▁▆▅▂▄█▅▆▆▇█▂▆▃▇█▄
eval/samples_per_second,▂▅▂█▃▃▇▅▁▄▃▃▂▁▇▂▆▂▁▄
eval/steps_per_second,▂▅▂█▃▃▇▅▁▄▃▃▂▁▇▂▆▂▁▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.70732
eval/f1,0.66866
eval/loss,0.93258
eval/precision,0.64943
eval/recall,0.70732
eval/runtime,0.118
eval/samples_per_second,347.429
eval/steps_per_second,25.422
total_flos,211874762695680.0
train/epoch,20.0


Training model for global subject: grammar


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 129


Flattening the indices:   0%|          | 0/129 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/129 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/129 [00:00<?, ? examples/s]

Train dataset size: 103, Validation dataset size: 26


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3816,3.058219,0.423077,0.186154,0.423077,0.258547
2,2.799,2.491895,0.423077,0.178994,0.423077,0.251559
3,2.3802,2.180712,0.423077,0.186154,0.423077,0.258547
4,2.0576,1.85651,0.423077,0.186154,0.423077,0.258547
5,1.7659,1.662922,0.538462,0.326923,0.538462,0.397436
6,1.4834,1.498599,0.692308,0.612393,0.692308,0.611553
7,1.3396,1.336496,0.692308,0.629555,0.692308,0.625275
8,1.2484,1.145809,0.769231,0.720513,0.769231,0.715497
9,1.0731,1.076996,0.769231,0.644712,0.769231,0.68661
10,0.9403,0.933683,0.807692,0.753593,0.807692,0.756119


0,1
eval/accuracy,▁▁▁▁▃▅▅▆▆▆▇▇▇▇▇▇▇▇██
eval/f1,▁▁▁▁▃▅▅▆▆▆▇▇▇▇▇▇▇▇██
eval/loss,█▆▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁
eval/precision,▁▁▁▁▂▅▅▆▅▆▇▇▇▇▇▇▇▇██
eval/recall,▁▁▁▁▃▅▅▆▆▆▇▇▇▇▇▇▇▇██
eval/runtime,███▄▆▅▆▅▅▂▁▅▂▅▃▁▂▁▁▄
eval/samples_per_second,▁▁▁▄▃▄▃▃▄▆▇▃▇▄▆█▇▇█▄
eval/steps_per_second,▁▁▁▄▃▄▃▃▄▆▇▃▇▄▆█▇▇█▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.92308
eval/f1,0.9186
eval/loss,0.56814
eval/precision,0.9391
eval/recall,0.92308
eval/runtime,0.0732
eval/samples_per_second,354.989
eval/steps_per_second,27.307
total_flos,135547208432640.0
train/epoch,20.0


Training model for global subject: state


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 94


Flattening the indices:   0%|          | 0/94 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/94 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/94 [00:00<?, ? examples/s]

Train dataset size: 75, Validation dataset size: 19


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5024,3.225173,0.368421,0.193182,0.368421,0.247542
2,3.109,2.868631,0.263158,0.069252,0.263158,0.109649
3,2.8031,2.621925,0.263158,0.069252,0.263158,0.109649
4,2.5827,2.433649,0.315789,0.230994,0.315789,0.193364
5,2.3738,2.302485,0.368421,0.235294,0.368421,0.245933
6,2.1297,2.251,0.368421,0.180162,0.368421,0.236424
7,2.0045,2.101247,0.473684,0.277512,0.473684,0.344925
8,1.9248,2.076839,0.526316,0.356459,0.526316,0.412594
9,1.7559,1.939108,0.526316,0.356459,0.526316,0.412594
10,1.6538,1.920463,0.526316,0.356459,0.526316,0.412594


0,1
eval/accuracy,▃▁▁▂▃▃▅▆▆▆▆▇████████
eval/f1,▃▁▁▂▃▃▅▆▆▆▆▇████████
eval/loss,█▇▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
eval/precision,▃▁▁▃▃▃▄▅▅▅▅▆████████
eval/recall,▃▁▁▂▃▃▅▆▆▆▆▇████████
eval/runtime,▁█▄▁▁▅▅▅▂▆▄▂▄▂▄▃▂▂▃▄
eval/samples_per_second,█▁▅██▃▄▃▇▂▄▇▅▇▄▅▇▇▆▅
eval/steps_per_second,█▁▅██▃▄▃▇▂▄▇▅▇▄▅▇▇▆▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.63158
eval/f1,0.5614
eval/loss,1.55989
eval/precision,0.55263
eval/recall,0.63158
eval/runtime,0.0554
eval/samples_per_second,343.01
eval/steps_per_second,36.106
total_flos,98699423616000.0
train/epoch,20.0


Training model for global subject: theory


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 103


Flattening the indices:   0%|          | 0/103 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/103 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/103 [00:00<?, ? examples/s]

Train dataset size: 82, Validation dataset size: 21


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4106,3.182578,0.333333,0.111111,0.333333,0.166667
2,2.9633,2.776573,0.333333,0.111111,0.333333,0.166667
3,2.6049,2.484577,0.333333,0.111111,0.333333,0.166667
4,2.3745,2.239974,0.333333,0.111111,0.333333,0.166667
5,2.0165,2.083162,0.333333,0.111111,0.333333,0.166667
6,1.8955,1.915145,0.333333,0.111111,0.333333,0.166667
7,1.6262,1.756606,0.52381,0.37535,0.52381,0.406085
8,1.4746,1.566254,0.571429,0.353968,0.571429,0.428571
9,1.3425,1.424828,0.619048,0.460317,0.619048,0.526674
10,1.1746,1.317999,0.714286,0.622222,0.714286,0.652865


0,1
eval/accuracy,▁▁▁▁▁▁▄▅▆▇▇▆▆▇█▇▇███
eval/f1,▁▁▁▁▁▁▄▄▅▇▇▇▆▇█▇▇███
eval/loss,█▇▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁
eval/precision,▁▁▁▁▁▁▄▄▅▆▆▇▆▇██▇███
eval/recall,▁▁▁▁▁▁▄▅▆▇▇▆▆▇█▇▇███
eval/runtime,▅▄▄▁█▃▂▃▇▃▃▄▄▂▃▅▇▂▃▄
eval/samples_per_second,▄▅▅█▁▆▆▆▂▆▅▅▅▇▆▄▂▇▅▅
eval/steps_per_second,▄▅▅█▁▆▆▆▂▆▅▅▅▇▆▄▂▇▅▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.7619
eval/f1,0.75238
eval/loss,1.0147
eval/precision,0.75
eval/recall,0.7619
eval/runtime,0.0573
eval/samples_per_second,366.664
eval/steps_per_second,34.92
total_flos,107911369820160.0
train/epoch,20.0


Training model for global subject: off_topic


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 96


Flattening the indices:   0%|          | 0/96 [00:00<?, ? examples/s]

Skipping training for off_topic as it has only one subtopic
