# Fine-tune a BERT model

In [32]:
from functions_variables import *
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [33]:
#Load the dataset
path = '../data/preprocessed/'
files = {name: f'{path}{name}.csv' for name in set_names}
dataset = load_dataset('csv', data_files=files)
limit = 1000  # 25000 for the full dataset
train = dataset["train"].shuffle(seed=42).select(range(limit))
test = dataset["test"].shuffle(seed=42).select(range(limit))
unsupervised = dataset["unsupervised"].shuffle(seed=42).select(range(limit))
# labels = train['label']

In [34]:
# Define your compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

In [38]:
dataset = dataset.map(preprocess_function, batched=True)
train = dataset["train"].shuffle(seed=42).select(range(limit))
test = dataset["test"].shuffle(seed=42).select(range(limit))
unsupervised = dataset["unsupervised"].shuffle(seed=42).select(range(limit))

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [39]:
model_name = 'bert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
training_args = TrainingArguments(
    output_dir="../data/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="../data/logs",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.458201,0.82,0.811741,0.821721,0.816701
2,No log,0.363058,0.849,0.816135,0.891393,0.852106
3,No log,0.45016,0.831,0.889976,0.745902,0.811594
4,No log,0.38791,0.848,0.826848,0.870902,0.848303


TrainOutput(global_step=252, training_loss=0.36506104847741505, metrics={'train_runtime': 112.0114, 'train_samples_per_second': 35.711, 'train_steps_per_second': 2.25, 'total_flos': 263111055360000.0, 'train_loss': 0.36506104847741505, 'epoch': 4.0})