# Fine-tuning distilbert/distilbert-base-multilingual-cased

After the original attempt at hybrid matching failed spectacularly, matching not a single author out of more than 25,000 records, I decided to try fine-tuning `distilbert/distilbert-base-multilingual-cased` to match authors with author IDs.

## Prepare the Data for Fine-Tuning

In [44]:
!pip install dropbox
import sam_dropbox as sdpbx
import os

# Establish file directories: each local directory has a Dropbox counterpart.
output_dir = './distilbert-finetuned'  # Local directory to save the model
dropbox_dir = '/SamsColab/distilbert-finetuned'  # Dropbox directory where you want to save the checkpoint
log_dir = './logs'  # Local directory for the training logs
dropbox_log_dir = '/SamsColab/distilbert-finetuned/logs'  # Dropbox directory for the training logs

# Create the local directories from the variables above (no duplicated
# literals); exist_ok=True makes the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# Define Dropbox file path and local file path
dropbox_file_path = '/SamsColab/distilbert-finetuned/distilbert-data.csv'  # Path to the CSV in Dropbox
local_file_path = './data.csv'  # Path to save it locally in Colab

# Download the CSV from Dropbox
sdpbx.download_from_dropbox(dropbox_file_path, local_file_path)



In [45]:
import pandas as pd

# Load the dataset from the path the download cell wrote to, so the two
# cells cannot drift apart if the local file name ever changes.
data = pd.read_csv(local_file_path, encoding='utf-8', quotechar='"')

In [46]:
!pip install transformers
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer that belongs to the checkpoint being fine-tuned
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')

# Encode every value of the 'variant' column (special tokens included) and
# take the 95th percentile of the token counts as the padding/truncation length
tokens = data['variant'].apply(
    lambda variant: tokenizer.encode(variant, add_special_tokens=True)
)
token_counts = tokens.map(len)
max_length = int(token_counts.quantile(0.95))
print("Determined max_length for padding/truncation:", max_length)

Determined max_length for padding/truncation: 21


In [47]:
!pip install datasets
from datasets import Dataset, ClassLabel, Features

# Wrap the DataFrame in a Hugging Face dataset
hf_dataset = Dataset.from_pandas(data)

# Build a ClassLabel whose class names are the distinct 'dll_author_id' values
author_ids = hf_dataset.unique('dll_author_id')
num_labels = len(author_ids)
label_feature = ClassLabel(num_classes=num_labels, names=author_ids)

# Update dataset to use ClassLabel
def convert_labels(example):
    """Add an integer ``labels`` entry to ``example`` and return it.

    The integer is looked up from the example's ``dll_author_id`` through
    the module-level ``label_feature``. ``example`` is modified in place.
    """
    author_id = example['dll_author_id']
    example['labels'] = label_feature.str2int(author_id)
    return example

# Run convert_labels over every example so the dataset gains a 'labels' column
hf_dataset = hf_dataset.map(convert_labels)



Map:   0%|          | 0/27290 [00:00<?, ? examples/s]

In [48]:
# Split the dataset into training (70%), validation (10%), and testing (20%) sets.
# A fixed seed keeps the membership of each split identical across re-runs, so
# validation and test metrics from different runs can be compared.
SPLIT_SEED = 42
train_test_split = hf_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_val_split = train_test_split['train'].train_test_split(test_size=0.125, seed=SPLIT_SEED)  # 0.125 of 0.8 = 0.1 of original

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']
test_dataset = train_test_split['test']

In [49]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``variant`` texts of a batch.

    Every sequence is truncated or padded to the module-level
    ``max_length`` using the module-level ``tokenizer``; the tokenizer's
    output is returned unchanged.
    """
    return tokenizer(
        examples['variant'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

# Tokenize the three splits in batches, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/19103 [00:00<?, ? examples/s]

Map:   0%|          | 0/2729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5458 [00:00<?, ? examples/s]

In [50]:
# Set the format for PyTorch
def format_dataset(dataset):
    """Switch ``dataset`` to PyTorch output for the model columns.

    Calls ``dataset.set_format`` so that ``input_ids``, ``attention_mask``
    and ``labels`` are the formatted columns, then returns the same
    ``dataset`` object.
    """
    model_columns = ['input_ids', 'attention_mask', 'labels']
    dataset.set_format(type='torch', columns=model_columns)
    return dataset

# Apply the PyTorch format to each split, in train / validation / test order
train_dataset, val_dataset, test_dataset = map(
    format_dataset, (train_dataset, val_dataset, test_dataset)
)

# Show each split's summary to confirm the expected columns and row counts
print("Training Dataset:", train_dataset)
print("Validation Dataset:", val_dataset)
print("Testing Dataset:", test_dataset)

Training Dataset: Dataset({
    features: ['variant', 'dll_author_id', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 19103
})
Validation Dataset: Dataset({
    features: ['variant', 'dll_author_id', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2729
})
Testing Dataset: Dataset({
    features: ['variant', 'dll_author_id', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5458
})


In [51]:
from transformers import DistilBertForSequenceClassification

# Take the classes from label_feature (the same object convert_labels uses)
# so the classifier head's output indices line up with the integer labels.
id2label = dict(enumerate(label_feature.names))
label2id = {author_id: idx for idx, author_id in id2label.items()}
num_labels = len(id2label)

# Passing id2label/label2id records the mapping in the model config, so a
# checkpoint saved from this model carries its own index -> author ID table.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
from sklearn.metrics import f1_score, accuracy_score
from transformers import EvalPrediction

def compute_metrics(pred: EvalPrediction):
    """Return weighted F1 and accuracy for one evaluation pass.

    Args:
        pred: Evaluation output; ``pred.predictions`` is reduced with
            ``argmax`` over its last axis and compared against
            ``pred.label_ids``.

    Returns:
        A dict with the keys ``'f1'`` (weighted average) and ``'accuracy'``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    true_ids = pred.label_ids
    return {
        'f1': f1_score(true_ids, predicted_ids, average='weighted'),
        'accuracy': accuracy_score(true_ids, predicted_ids),
    }

In [53]:
from transformers import TrainingArguments

# eval_steps / save_steps are intentionally not set: they are only read when
# the corresponding strategy is "steps", and both strategies here are "epoch".
training_args = TrainingArguments(
    output_dir=output_dir,              # Directory for saving checkpoints and the model
    eval_strategy="epoch",              # Evaluate at the end of every epoch
    logging_dir=log_dir,                # Directory for storing logs
    logging_strategy="steps",           # Log training metrics on a step schedule...
    logging_steps=100,                  # ...every 100 steps
    save_strategy="epoch",              # Save a checkpoint at the end of every epoch (same schedule as evaluation)
    save_total_limit=1,                 # Cap the number of checkpoints kept on disk
    per_device_train_batch_size=16,     # Batch size for training
    per_device_eval_batch_size=64,      # Batch size for evaluation
    num_train_epochs=20,                # Upper bound on the number of training epochs
    load_best_model_at_end=True,        # Load the best model at the end of training
    metric_for_best_model='f1',         # Use F1 score to find the best model
    greater_is_better=True,             # Higher F1 score is better
    report_to=["tensorboard"]           # Send logged metrics to TensorBoard
)

In [56]:
from transformers import DataCollatorWithPadding
# Initialize the data collator that is handed to the Trainer for batching.
# NOTE(review): tokenize_function already pads every example to max_length,
# so this collator presumably has nothing left to pad - confirm before relying on it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [57]:
from transformers import Trainer, EarlyStoppingCallback

# Assemble the Trainer: the model and its arguments, the train/validation
# splits, batching, metrics, and early stopping after 3 evaluations
# without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [58]:
# Fine-tune the model; EarlyStoppingCallback (patience 3) can end the run
# before num_train_epochs is reached.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0936,0.823846,0.869763,0.877611
2,0.0546,0.727672,0.890875,0.896299
3,0.0312,0.740022,0.894289,0.898864
4,0.0308,0.731722,0.902925,0.907292
5,0.0116,0.727024,0.905063,0.910224
6,0.0071,0.739388,0.903406,0.908391
7,0.0137,0.715717,0.906157,0.911323
8,0.0129,0.752853,0.904059,0.909124
9,0.0094,0.723428,0.912427,0.917186
10,0.0082,0.725607,0.915011,0.920117


TrainOutput(global_step=15522, training_loss=0.018644785950234673, metrics={'train_runtime': 472.7392, 'train_samples_per_second': 808.183, 'train_steps_per_second': 50.514, 'total_flos': 1424719809413082.0, 'train_loss': 0.018644785950234673, 'epoch': 13.0})

In [None]:
# Write the model and the tokenizer files into the local output directory
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Upload the whole output directory to Dropbox
sdpbx.upload_directory_to_dropbox(output_dir, dropbox_dir)

# Upload the log directory to its own Dropbox folder
sdpbx.upload_directory_to_dropbox(log_dir, dropbox_log_dir)

File ./distilbert-finetuned/vocab.txt is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/tokenizer.json is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/tokenizer_config.json is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/model.safetensors exceeds 150 MB, using chunked upload.
Uploading ./distilbert-finetuned/model.safetensors in chunks (size: 550960660 bytes)
Finishing upload...
File ./distilbert-finetuned/special_tokens_map.json is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/config.json is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/checkpoint-11940/vocab.txt is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/checkpoint-11940/tokenizer.json is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/checkpoint-11940/scheduler.pt is smaller than 150 MB, using standard upload.
File ./distilbert-finetuned/checkpoint-

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./logs

In [None]:
# Score the trained model on the held-out test split and show the metrics
test_results = trainer.evaluate(eval_dataset=test_dataset)

print("Test Results:", test_results)

In [None]:
import torch

# Reload the fine-tuned model and its tokenizer from the local output directory
model = DistilBertForSequenceClassification.from_pretrained('./distilbert-finetuned')
tokenizer = DistilBertTokenizerFast.from_pretrained('./distilbert-finetuned')

# Example texts for inference
texts = [
    "Cicero, Marcus Tullius",
    "Caesar, Julius",
    "Vergil",
    "Ovid",
    "Ovidius",
    "Tacitus, Cornelius",
]

# Encode the batch as PyTorch tensors, padded to the longest text and
# truncated at max_length
inputs = tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=max_length,
)

# Forward pass without gradient tracking; the largest logit is the predicted class
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)

# Translate class indices back to their label strings
predicted_labels = list(map(label_feature.int2str, predictions.tolist()))

for text, label in zip(texts, predicted_labels):
    print(f"Text: {text} - Predicted Label: {label}")
