In [107]:
import pandas as pd
from datasets import Dataset, concatenate_datasets

# Load the manually cleaned question set; both label columns come from this frame.
df = pd.read_csv('./data_cleaned_manual_combined.csv')

# Distinct label values of each column, in the order `Series.unique` returns them.
labels_gs = df['Global Subject'].unique()
labels_qi = df['Question Intent'].unique()

# Integer id <-> label lookup tables for the global subject ...
id2label_gs = dict(enumerate(labels_gs))
label2id_gs = {label: idx for idx, label in id2label_gs.items()}

# ... and for the question intent (sub-topic).
id2label_qi = dict(enumerate(labels_qi))
label2id_qi = {label: idx for idx, label in id2label_qi.items()}

In [108]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Use the tokenizer that belongs to the checkpoint fine-tuned below
# ('distilbert-base-uncased'). The BERT tokenizer shares the same vocabulary but
# also emits `token_type_ids`, which DistilBERT's forward pass does not accept,
# and it would be the tokenizer saved next to every DistilBERT checkpoint.
# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained
# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/distilbert#transformers.DistilBertTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def preprocess_data(example):
    """Tokenize one record and attach its two integer class ids.

    Args:
        example: a single record exposing the 'Question', 'Global Subject'
            and 'Question Intent' fields.

    Returns:
        The tokenizer output for the question (padded / truncated to 128
        tokens) extended with 'labels' (global-subject id) and 'labels_sub'
        (question-intent id).

    Raises:
        KeyError: if a label value is missing from its label2id table.
    """
    # https://huggingface.co/docs/transformers/v4.46.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    encoded = tokenizer(
        example['Question'],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Resolve both ids before touching the encoding.
    subject_id = label2id_gs[example['Global Subject']]
    intent_id = label2id_qi[example['Question Intent']]

    encoded['labels'] = subject_id
    encoded['labels_sub'] = intent_id
    return encoded

In [106]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
from datasets import Dataset

# Reload the raw frame; it must contain a 'Global Subject' column, which drives
# the stratification of both splits below.
df = pd.read_csv('./data_cleaned_manual_combined.csv')

# 80/20 stratified split. StratifiedShuffleSplit yields *positional* row numbers,
# so rows are selected with `.iloc` (correct whatever the frame's index is).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, holdout_index = next(split.split(df, df['Global Subject']))
strat_train_set = df.iloc[train_index]
# Renumber the hold-out rows so the second split works on a clean 0..n-1 index.
strat_holdout_set = df.iloc[holdout_index].reset_index(drop=True)

# Halve the hold-out into validation and test sets, again stratified.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(split.split(strat_holdout_set, strat_holdout_set['Global Subject']))
strat_val_set = strat_holdout_set.iloc[val_index]
strat_test_set = strat_holdout_set.iloc[test_index]

# The three subsets must partition the original frame.
assert len(strat_train_set) + len(strat_val_set) + len(strat_test_set) == len(df)

# Convert to Hugging Face Datasets; the pandas index carries no information
# here, so it is not exported as an extra column.
train_dataset = Dataset.from_pandas(strat_train_set, preserve_index=False)
eval_dataset = Dataset.from_pandas(strat_val_set, preserve_index=False)
test_dataset = Dataset.from_pandas(strat_test_set, preserve_index=False)

# Tokenize the datasets; only the columns produced by preprocess_data are kept.
train_dataset = train_dataset.map(preprocess_data, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_data, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, remove_columns=test_dataset.column_names)

# Return torch tensors when the datasets are indexed.
train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

Map:   0%|          | 0/683 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

In [47]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    """Score a batch of predictions for the Trainer.

    Args:
        eval_pred: a `(predictions, labels)` pair; `predictions` holds one row
            of class scores per example, `labels` the reference class ids.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 are weighted averages over the classes, and undefined
        ratios are reported as 0 (`zero_division=0`).
    """
    class_scores, references = eval_pred
    # The predicted class is the highest-scoring column of each row.
    predicted_ids = np.argmax(class_scores, axis=1)

    accuracy = accuracy_score(references, predicted_ids)
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        references, predicted_ids, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': weighted_precision,
        'recall': weighted_recall,
        'f1': weighted_f1,
    }

In [48]:
import torch

# Report the CUDA setup. The per-device queries raise when no CUDA device is
# usable, so they only run when CUDA is actually available; the training cells
# fall back to the CPU in that case.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
if cuda_available:
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    print(f"Device name: {torch.cuda.get_device_name(current_device)}")
else:
    print("Current CUDA device: none (running on CPU)")

CUDA available: True
Number of GPUs: 1
Current CUDA device: 0
Device name: NVIDIA GeForce RTX 3070 Ti


In [49]:
import wandb

# `notes` is the wandb.init argument for a free-text run description (the
# per-subject runs further down use it as well); `description` is not one of
# its documented parameters.
wandb.init(project='tesi', tags=['hierarchical-pipeline'], notes='Model only for the main subject classification')

VBox(children=(Label(value='0.087 MB of 0.087 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅▇▇█▇██▇███████████
eval/f1,▁▆▇▇█▇██████████████
eval/loss,█▅▃▁▁▁▁▂▂▁▂▂▂▂▂▂▂▂▂▂
eval/precision,▁▆▇█████████████████
eval/recall,▁▅▇▇█▇██▇███████████
eval/runtime,▆▃▁▄▃▂▂▂▃▂▁▅▄▂▂█▂▇▃▅
eval/samples_per_second,▂▆▇▄▆▆▆▆▆▇█▃▄▆▇▁▇▂▆▄
eval/steps_per_second,▂▆▇▄▆▆▆▆▆▇█▃▄▆▇▁▇▂▆▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.87059
eval/f1,0.86863
eval/loss,0.6567
eval/precision,0.87895
eval/recall,0.87059
eval/runtime,0.1307
eval/samples_per_second,650.303
eval/steps_per_second,45.904
total_flos,453078899896320.0
train/epoch,20.0


In [50]:
from transformers import TrainingArguments, Trainer

# Classification head with one output per global subject. The label tables are
# stored in the model config so saved checkpoints can map class ids to names.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(labels_gs),
    id2label=id2label_gs,
    label2id=label2id_gs,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=2e-5,
    warmup_ratio=0.1,  # Warmup for the first 10% of steps
    lr_scheduler_type='linear',  # Linear scheduler
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Save, log and evaluate once per epoch; save and eval strategies must
    # match for load_best_model_at_end to work.
    save_strategy='epoch',
    logging_strategy='epoch',
    eval_strategy='epoch',
    logging_dir='./logs',
    load_best_model_at_end=True,  # Load the best model at the end based on evaluation metric
    metric_for_best_model='f1',  # Weighted F1 on the global subject (see compute_metrics) picks the best model
    greater_is_better=True,  # Higher metric indicates a better model
    report_to='wandb'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print(f"Trainer is using device: {trainer.args.device}")

# Close the wandb run even if training fails or is interrupted, so the next
# wandb.init() does not inherit a half-open run.
try:
    trainer.train()
finally:
    wandb.finish()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer is using device: cuda:0


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8837,1.776035,0.258824,0.067787,0.258824,0.107436
2,1.5806,1.29791,0.647059,0.700503,0.647059,0.613204
3,0.9959,0.84216,0.764706,0.767712,0.764706,0.740746
4,0.5356,0.598027,0.823529,0.834118,0.823529,0.813242
5,0.2911,0.523984,0.858824,0.863298,0.858824,0.853581
6,0.1755,0.562138,0.823529,0.836602,0.823529,0.81808
7,0.1135,0.556099,0.858824,0.85702,0.858824,0.855603
8,0.0728,0.617508,0.858824,0.86454,0.858824,0.855683
9,0.0495,0.63998,0.835294,0.845289,0.835294,0.831728
10,0.0354,0.575449,0.870588,0.877843,0.870588,0.8691


0,1
eval/accuracy,▁▅▇▇█▇██▇███████████
eval/f1,▁▆▇▇█▇██████████████
eval/loss,█▅▃▁▁▁▁▂▂▁▂▂▂▂▂▂▂▂▂▂
eval/precision,▁▆▇█████████████████
eval/recall,▁▅▇▇█▇██▇███████████
eval/runtime,▂▁▂▂▁▁█▃▆▆▃▃▂▁▄▁▁▁▁▃
eval/samples_per_second,▇█▇▆▇█▁▅▃▂▆▆▇█▄███▇▆
eval/steps_per_second,▇█▇▆▇█▁▅▃▂▆▆▇█▄███▇▆
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.87059
eval/f1,0.86863
eval/loss,0.6567
eval/precision,0.87895
eval/recall,0.87059
eval/runtime,0.1166
eval/samples_per_second,729.228
eval/steps_per_second,51.475
total_flos,453078899896320.0
train/epoch,20.0


### Fine-tuning the model for the subtopic classification

In [114]:
import wandb


# Function to train a model for a specific global subject
def train_model_for_subject(global_subject, train_data, val_data, num_labels=None):
    """Fine-tune a DistilBERT sub-topic classifier for one global subject.

    Each call opens its own wandb run (tagged with the subject) and always
    closes it, even when training raises.

    Args:
        global_subject: name of the global subject; used in the wandb tags and
            notes and in the output / logging directory names.
        train_data: training dataset whose 'labels' column holds the
            sub-topic class ids.
        val_data: validation dataset with the same layout, evaluated after
            every epoch.
        num_labels: size of the classification head. Defaults to
            ``len(labels_qi)`` (the number of question intents over the whole
            corpus), which is what this function always used.

    Returns:
        The ``Trainer`` after training. It is configured with
        ``load_best_model_at_end=True`` and ``metric_for_best_model='f1'``,
        so ``trainer.model`` is the best-F1 checkpoint.
    """
    if num_labels is None:
        num_labels = len(labels_qi)

    # Initialize a new wandb run for each model
    wandb.init(project='tesi', tags=['hierarchical-pipeline', global_subject],
               notes=f'Model for {global_subject} sub-topic classification')

    try:
        # Define the model and training arguments
        model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        training_args = TrainingArguments(
            output_dir=f'./results_{global_subject}',
            num_train_epochs=20,
            # https://datascience.stackexchange.com/questions/64583/what-are-the-good-parameter-ranges-for-bert-hyperparameters-while-finetuning-it
            learning_rate=4e-5,
            lr_scheduler_type='linear',
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            save_strategy='epoch',
            logging_strategy='epoch',
            eval_strategy='epoch',
            logging_dir=f'./logs_{global_subject}',
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            report_to='wandb',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=val_data,
            processing_class=tokenizer,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()
    finally:
        # Finish the wandb run, whether or not training succeeded
        wandb.finish()

    return trainer


# Train one sub-topic (question intent) classifier per global subject.
for global_subject in labels_gs:
    gs_id = label2id_gs[global_subject]
    print(f"Training model for global subject: {global_subject}")

    def filter_fn(x):
        """Keep the rows whose global-subject id is the current subject's."""
        return x['labels'] == gs_id

    # Pool the subject's rows from the train and validation splits; they are
    # re-split per subject below. The test split is not touched.
    complete_data = concatenate_datasets([train_dataset.filter(filter_fn), eval_dataset.filter(filter_fn)])

    print(f"Complete dataset size: {len(complete_data)}")

    # A classifier needs at least two sub-topics. Zero means the subject has no
    # pooled rows at all, which would otherwise crash in the split below.
    num_subtopics = len(complete_data.unique('labels_sub'))
    if num_subtopics < 2:
        if num_subtopics == 0:
            print(f"Skipping training for {global_subject} as it has no samples")
        else:
            print(f"Skipping training for {global_subject} as it has only one subtopic")
        continue

    # stratify_by_column needs a ClassLabel column, hence the encoding step.
    # NOTE(review): class_encode_column rebuilds the class ids from the values
    # present in this subject, so they presumably no longer line up with
    # label2id_qi - confirm before mapping predictions back to intents.
    complete_data = complete_data.class_encode_column('labels_sub')
    complete_data = complete_data.train_test_split(
        test_size=0.2,
        shuffle=True,
        seed=42,
        stratify_by_column='labels_sub',
    )

    subject_train_data = complete_data['train']
    subject_val_data = complete_data['test']

    print(f"Train dataset size: {len(subject_train_data)}, Validation dataset size: {len(subject_val_data)}")

    # Drop the global-subject ids and expose the sub-topic ids as 'labels',
    # the column name the model is trained on.
    subject_train_data = subject_train_data.remove_columns('labels').rename_column('labels_sub', 'labels')
    subject_val_data = subject_val_data.remove_columns('labels').rename_column('labels_sub', 'labels')

    # Train the model for the current global subject
    train_model_for_subject(global_subject, subject_train_data, subject_val_data)

Training model for global subject: start


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 17


Flattening the indices:   0%|          | 0/17 [00:00<?, ? examples/s]

Skipping training for start as it has only one subtopic
Training model for global subject: automaton


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 127


Flattening the indices:   0%|          | 0/127 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/127 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/127 [00:00<?, ? examples/s]

Train dataset size: 101, Validation dataset size: 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3611,2.963141,0.5,0.25,0.5,0.333333
2,2.7346,2.42394,0.5,0.25,0.5,0.333333
3,2.2514,2.150762,0.5,0.25,0.5,0.333333
4,2.0845,1.950944,0.5,0.25,0.5,0.333333
5,1.824,1.783001,0.5,0.25,0.5,0.333333
6,1.6565,1.607529,0.538462,0.375385,0.538462,0.399798
7,1.4057,1.424928,0.692308,0.540293,0.692308,0.590045
8,1.2724,1.26576,0.692308,0.540293,0.692308,0.590045
9,1.0125,1.16404,0.730769,0.659413,0.730769,0.655151
10,0.8466,1.043507,0.769231,0.71688,0.769231,0.706717


0,1
eval/accuracy,▁▁▁▁▁▂▅▅▆▆▇▇█▇██████
eval/f1,▁▁▁▁▁▂▅▅▆▆▇▇█▇██████
eval/loss,█▆▅▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁
eval/precision,▁▁▁▁▁▃▅▅▆▇██████████
eval/recall,▁▁▁▁▁▂▅▅▆▆▇▇█▇██████
eval/runtime,▂▂▂▂▁▆▆▆▂█▆▆▇▆▇▆▅▅▆▅
eval/samples_per_second,▇█▇▇█▃▂▂▇▁▃▃▂▃▂▂▃▃▂▄
eval/steps_per_second,▇█▇▇█▃▂▂▇▁▃▃▂▃▂▂▃▃▂▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.84615
eval/f1,0.81026
eval/loss,0.77582
eval/precision,0.80543
eval/recall,0.84615
eval/runtime,0.0368
eval/samples_per_second,707.243
eval/steps_per_second,54.403
total_flos,66940177167360.0
train/epoch,20.0


Training model for global subject: transition


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 202


Flattening the indices:   0%|          | 0/202 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/202 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/202 [00:00<?, ? examples/s]

Train dataset size: 161, Validation dataset size: 41


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4332,3.153058,0.195122,0.038073,0.195122,0.063713
2,3.0152,2.767759,0.195122,0.038073,0.195122,0.063713
3,2.6867,2.474637,0.365854,0.205691,0.365854,0.246489
4,2.2641,2.2174,0.512195,0.414634,0.512195,0.419106
5,1.9565,1.943664,0.536585,0.46748,0.536585,0.465447
6,1.6839,1.676257,0.609756,0.501626,0.609756,0.538282
7,1.3335,1.465874,0.658537,0.567886,0.658537,0.598022
8,1.1464,1.284125,0.658537,0.554878,0.658537,0.592581
9,0.9009,1.160161,0.658537,0.555063,0.658537,0.594645
10,0.7453,1.058549,0.634146,0.52783,0.634146,0.567135


0,1
eval/accuracy,▁▁▃▅▅▆▆▆▆▆▇▇▇███████
eval/f1,▁▁▃▄▅▅▆▆▆▆▇▇▇█▇█████
eval/loss,█▇▆▅▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁
eval/precision,▁▁▂▄▅▅▆▆▆▅▇▇▇█▇█████
eval/recall,▁▁▃▅▅▆▆▆▆▆▇▇▇███████
eval/runtime,▁█▇▇▇▆▆█▂▆▇█▃▆▅▇▃▆▃▃
eval/samples_per_second,█▁▂▂▂▃▂▁▇▃▂▁▆▃▃▂▅▃▅▆
eval/steps_per_second,█▁▂▂▂▃▂▁▇▃▂▁▆▃▃▂▅▃▅▆
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.82927
eval/f1,0.79356
eval/loss,0.7177
eval/precision,0.78502
eval/recall,0.82927
eval/runtime,0.0556
eval/samples_per_second,737.515
eval/steps_per_second,53.965
total_flos,106706619048960.0
train/epoch,20.0


Training model for global subject: grammar


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 129


Flattening the indices:   0%|          | 0/129 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/129 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/129 [00:00<?, ? examples/s]

Train dataset size: 103, Validation dataset size: 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4201,3.001454,0.423077,0.178994,0.423077,0.251559
2,2.7552,2.493597,0.423077,0.178994,0.423077,0.251559
3,2.3796,2.198117,0.423077,0.178994,0.423077,0.251559
4,2.1064,1.92584,0.461538,0.34,0.461538,0.320085
5,1.8369,1.681715,0.576923,0.519231,0.576923,0.487179
6,1.5472,1.445492,0.692308,0.51991,0.692308,0.588828
7,1.3766,1.26089,0.692308,0.541026,0.692308,0.604142
8,1.199,1.106341,0.807692,0.732347,0.807692,0.757051
9,0.9735,0.981223,0.807692,0.732347,0.807692,0.757051
10,0.8108,0.860752,0.884615,0.90927,0.884615,0.876099


0,1
eval/accuracy,▁▁▁▂▃▅▅▇▇███████████
eval/f1,▁▁▁▂▄▅▅▇▇███████████
eval/loss,█▇▆▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁
eval/precision,▁▁▁▃▄▄▄▆▆███████████
eval/recall,▁▁▁▂▃▅▅▇▇███████████
eval/runtime,▂▃▁▃▂▃▁▃▂▁▄▁▁▅▂▂▁█▁▆
eval/samples_per_second,▇▆█▆▇▅█▆▆█▅▇█▃▇▇█▁█▃
eval/steps_per_second,▇▆█▆▇▅█▆▆█▅▇█▃▇▇█▁█▃
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.88462
eval/f1,0.88828
eval/loss,0.57676
eval/precision,0.92308
eval/recall,0.88462
eval/runtime,0.0386
eval/samples_per_second,673.83
eval/steps_per_second,51.833
total_flos,68265725230080.0
train/epoch,20.0


Training model for global subject: state


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 94


Flattening the indices:   0%|          | 0/94 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/94 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/94 [00:00<?, ? examples/s]

Train dataset size: 75, Validation dataset size: 19


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112190533377643, max=1.0…

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5936,3.376437,0.263158,0.069252,0.263158,0.109649
2,3.2494,3.057192,0.263158,0.069252,0.263158,0.109649
3,2.9608,2.797653,0.263158,0.069252,0.263158,0.109649
4,2.7414,2.606042,0.368421,0.1875,0.368421,0.230576
5,2.5264,2.434962,0.421053,0.350877,0.421053,0.315789
6,2.3314,2.283538,0.421053,0.330827,0.421053,0.307677
7,2.142,2.141778,0.526316,0.356459,0.526316,0.412594
8,2.0394,2.042019,0.526316,0.356459,0.526316,0.412594
9,1.8581,1.937664,0.578947,0.488038,0.578947,0.497807
10,1.7312,1.838263,0.578947,0.514354,0.578947,0.503822


0,1
eval/accuracy,▁▁▁▃▄▄▅▅▆▆▆▆▆▆▆▇████
eval/f1,▁▁▁▃▄▄▅▅▆▆▆▇▇▇▇▇████
eval/loss,█▇▆▅▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁
eval/precision,▁▁▁▃▅▄▅▅▆▇▆▇▇▇▇█████
eval/recall,▁▁▁▃▄▄▅▅▆▆▆▆▆▆▆▇████
eval/runtime,▆▁▄▃▂▁▅▃▄▂▃▃▃▃▃▃▂▂▃█
eval/samples_per_second,▃█▅▆▇█▄▆▅▇▆▅▅▆▆▅▇▇▅▁
eval/steps_per_second,▃█▅▆▇█▄▆▅▇▆▅▅▆▆▅▇▇▅▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.68421
eval/f1,0.61404
eval/loss,1.50065
eval/precision,0.60526
eval/recall,0.68421
eval/runtime,0.0319
eval/samples_per_second,595.159
eval/steps_per_second,62.648
total_flos,49708052352000.0
train/epoch,20.0


Training model for global subject: theory


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 103


Flattening the indices:   0%|          | 0/103 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/103 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/103 [00:00<?, ? examples/s]

Train dataset size: 82, Validation dataset size: 21


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4187,3.073453,0.333333,0.111111,0.333333,0.166667
2,2.8697,2.688065,0.333333,0.111111,0.333333,0.166667
3,2.5527,2.435899,0.333333,0.111111,0.333333,0.166667
4,2.3782,2.252214,0.333333,0.111111,0.333333,0.166667
5,2.0355,2.074178,0.333333,0.111111,0.333333,0.166667
6,1.8169,1.842865,0.571429,0.465079,0.571429,0.480904
7,1.5337,1.619268,0.571429,0.47619,0.571429,0.515806
8,1.3947,1.478573,0.619048,0.491342,0.619048,0.54195
9,1.2877,1.352166,0.666667,0.563889,0.666667,0.602958
10,1.158,1.226017,0.666667,0.569841,0.666667,0.609307


0,1
eval/accuracy,▁▁▁▁▁▅▅▆▆▆▇▇▇▇▇██▇██
eval/f1,▁▁▁▁▁▅▅▆▇▇██████████
eval/loss,█▇▆▅▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁
eval/precision,▁▁▁▁▁▅▅▆▆▇██████████
eval/recall,▁▁▁▁▁▅▅▆▆▆▇▇▇▇▇██▇██
eval/runtime,▂▃▃▁▂▂▃▄▃▂▅▂▆█▃▇▅█▁▆
eval/samples_per_second,▇▆▆█▆▇▆▅▆▇▄▇▃▁▆▂▄▁█▂
eval/steps_per_second,▇▆▆█▆▇▆▅▆▇▄▇▃▁▆▂▄▁█▂
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.7619
eval/f1,0.71667
eval/loss,0.93841
eval/precision,0.69286
eval/recall,0.7619
eval/runtime,0.033
eval/samples_per_second,637.289
eval/steps_per_second,60.694
total_flos,54347470571520.0
train/epoch,20.0


Training model for global subject: off_topic


Filter:   0%|          | 0/683 [00:00<?, ? examples/s]

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Complete dataset size: 96


Flattening the indices:   0%|          | 0/96 [00:00<?, ? examples/s]

Skipping training for off_topic as it has only one subtopic
