In [None]:
import pandas as pd
from datasets import ClassLabel, Dataset, concatenate_datasets

# Load the manually cleaned question dataset
df = pd.read_csv('./data_cleaned_manual_combined.csv')

# Distinct class names for the two levels of the hierarchy:
# "Global Subject" (top level) and "Question Intent" (sub-topic level)
labels_gs = df['Global Subject'].unique()
labels_qi = df['Question Intent'].unique()

# Integer id <-> class name lookup tables for the global subjects
id2label_gs = dict(enumerate(labels_gs))
label2id_gs = {name: idx for idx, name in id2label_gs.items()}

# Integer id <-> class name lookup tables for the question intents
id2label_qi = dict(enumerate(labels_qi))
label2id_qi = {name: idx for idx, name in id2label_qi.items()}

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Shared tokenizer for every model in this notebook (all are fine-tuned from bert-base-uncased).
# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained
# https://huggingface.co/docs/transformers/v4.46.2/en/model_doc/bert#transformers.BertTokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')


def preprocess_data(example):
    """Tokenize one dataset row and attach the integer ids of both label levels.

    Args:
        example: A row with the keys 'Question', 'Global Subject' and
            'Question Intent'.

    Returns:
        The tokenizer output extended with 'labels' (global subject id) and
        'labels_sub' (question intent id).
    """
    # https://huggingface.co/docs/transformers/v4.46.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    # Every question is padded/truncated to exactly 128 tokens.
    encodings = tokenizer(
        example['Question'],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Translate the textual class names into the ids defined by the lookup tables
    encodings['labels'] = label2id_gs[example['Global Subject']]
    encodings['labels_sub'] = label2id_qi[example['Question Intent']]

    return encodings

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
from datasets import Dataset

# Reload the data; the 'Global Subject' column drives the stratification
df = pd.read_csv('./data_cleaned_manual_combined.csv')

# First split: 80% train / 20% temporary hold-out, stratified by global subject.
# split() yields *positional* indices, so rows are selected with .iloc rather
# than the label-based .loc (which is only equivalent for a default RangeIndex).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(df, df['Global Subject']))
strat_train_set = df.iloc[train_index]
strat_holdout_set = df.iloc[test_index].reset_index(drop=True)

# Second split: halve the hold-out into validation and test sets.
# A separate name is used for the hold-out so the frame being split is never
# overwritten while it is still the source of the split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(split.split(strat_holdout_set, strat_holdout_set['Global Subject']))
strat_val_set = strat_holdout_set.iloc[val_index]
strat_test_set = strat_holdout_set.iloc[test_index]

# The three splits must partition the original data
assert len(strat_train_set) + len(strat_val_set) + len(strat_test_set) == len(df), \
    "train/val/test splits do not add up to the full dataset"

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(strat_train_set)
eval_dataset = Dataset.from_pandas(strat_val_set)
test_dataset = Dataset.from_pandas(strat_test_set)

# Tokenize the datasets; the original columns are dropped so that only the
# tokenizer output plus 'labels' / 'labels_sub' remain
train_dataset = train_dataset.map(preprocess_data, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_data, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, remove_columns=test_dataset.column_names)

# Return torch tensors when the datasets are indexed
train_dataset.set_format("torch")
eval_dataset.set_format("torch")
test_dataset.set_format("torch")

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A (predictions, labels) pair; the predicted class is the
            argmax of the predictions along axis 1.

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, references = eval_pred
    predicted = np.argmax(logits, axis=1)

    # Weighted averaging; zero_division=0 scores undefined metrics as 0
    precision, recall, f1, _ = precision_recall_fscore_support(
        references, predicted, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy_score(references, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
import torch

# Report the available accelerator. The per-device queries are only made when
# CUDA is present: torch.cuda.current_device() raises on a CPU-only machine,
# which would abort a "Run All" even though training can fall back to the CPU.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
if cuda_available:
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("Current CUDA device: none (running on CPU)")

In [None]:
import wandb

# Start the W&B run that tracks the top-level (main subject) classifier
wandb.init(
    project='tesi',
    tags=['hierarchical-pipeline'],
    notes='Model only for the main subject classification',
)

In [None]:
from transformers import TrainingArguments, Trainer

# Global-subject classifier. The id <-> name mappings are stored in the model
# config so that saved checkpoints report class names instead of LABEL_n.
# Names are converted to str so the config stays JSON-serializable.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels_gs),
    id2label={idx: str(name) for idx, name in id2label_gs.items()},
    label2id={str(name): idx for name, idx in label2id_gs.items()},
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=2e-5,
    warmup_ratio=0.1,  # Warmup for the first 10% of steps
    lr_scheduler_type='linear',  # Linear scheduler
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy='epoch',
    logging_strategy='epoch',
    eval_strategy='epoch',
    logging_dir='./logs',
    load_best_model_at_end=True,  # Load the best model at the end based on evaluation metric
    metric_for_best_model='f1',  # Weighted F1 on the global subject (see compute_metrics)
    greater_is_better=True,  # Higher metric indicates a better model
    report_to='wandb',
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print(f"Trainer is using device: {trainer.args.device}")

# Close the W&B run even when training fails or is interrupted, so that no
# stale run stays open in the kernel; failures are flagged via the exit code.
try:
    trainer.train()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

### Fine-tuning one model per global subject for subtopic classification

In [None]:
import wandb


# Function to train a model for a specific global subject
def train_model_for_subject(global_subject, train_data, val_data):
    """Fine-tune a BERT sub-topic classifier for a single global subject.

    A dedicated W&B run is opened for the model and is always closed again,
    including when model loading or training raises or is interrupted.

    Args:
        global_subject: Name of the global subject; used in the W&B tags and
            notes and in the names of the output and logging directories.
        train_data: Training dataset handed to the Trainer.
        val_data: Validation dataset, evaluated at the end of every epoch.

    Returns:
        The Trainer after training; with load_best_model_at_end=True it holds
        the checkpoint with the best validation F1.
    """
    # Initialize a new wandb run for each model
    wandb.init(project='tesi', tags=['hierarchical-pipeline', global_subject],
               notes=f'Model for {global_subject} sub-topic classification')

    try:
        # The classification head is sized for all question intents
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_qi))
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        training_args = TrainingArguments(
            output_dir=f'./results_{global_subject}',
            num_train_epochs=20,
            # https://datascience.stackexchange.com/questions/64583/what-are-the-good-parameter-ranges-for-bert-hyperparameters-while-finetuning-it
            learning_rate=4e-5,
            lr_scheduler_type='linear',
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            save_strategy='epoch',
            logging_strategy='epoch',
            eval_strategy='epoch',
            logging_dir=f'./logs_{global_subject}',
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            report_to='wandb',
            save_total_limit=1
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=val_data,
            processing_class=tokenizer,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()
    except BaseException:
        # Mark the run as failed instead of leaving it open in the kernel
        wandb.finish(exit_code=1)
        raise

    # Finish the wandb run
    wandb.finish()

    return trainer


# Train one sub-topic classifier per global subject
for global_subject in labels_gs:
    gs_id = label2id_gs[global_subject]
    print(f"Training model for global subject: {global_subject}")


    # Keep only the rows of the current global subject. gs_id is bound as a
    # default argument and the comparison is coerced to a plain bool, because
    # the datasets are torch-formatted and '==' would otherwise yield a tensor.
    def filter_fn(x, gs_id=gs_id):
        return bool(x['labels'] == gs_id)


    # Merge the train and validation rows of this subject; test_dataset is not used here
    complete_data = concatenate_datasets([train_dataset.filter(filter_fn), eval_dataset.filter(filter_fn)])

    print(f"Complete dataset size: {len(complete_data)}")

    # Nothing to train on
    if len(complete_data) == 0:
        print(f"Skipping training for {global_subject} as it has no samples")
        continue

    # if the complete data has only one class, skip the training
    if len(complete_data.unique('labels_sub')) < 2:
        print(f"Skipping training for {global_subject} as it has only one subtopic")
        continue

    # Stratified splitting needs a ClassLabel column. The column is cast rather
    # than re-encoded with class_encode_column, which would renumber the ids per
    # subject and break their correspondence with id2label_qi and with the
    # len(labels_qi)-sized classification head.
    # NOTE(review): stratification presumably needs at least two samples per
    # sub-topic - confirm for rare classes.
    complete_data = complete_data.cast_column('labels_sub', ClassLabel(num_classes=len(labels_qi)))
    complete_data = complete_data.train_test_split(test_size=0.2, shuffle=True, seed=42,
                                                   stratify_by_column='labels_sub')

    subject_train_data = complete_data['train']
    subject_val_data = complete_data['test']

    print(f"Train dataset size: {len(subject_train_data)}, Validation dataset size: {len(subject_val_data)}")

    # remove the labels column and map the labels_sub column to labels
    subject_train_data = subject_train_data.remove_columns('labels').rename_column('labels_sub', 'labels')
    subject_val_data = subject_val_data.remove_columns('labels').rename_column('labels_sub', 'labels')

    # Train the model for the current global subject
    train_model_for_subject(global_subject, subject_train_data, subject_val_data)