In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np
import torch

In [2]:
import os

# Switch off parallelism in the tokenizers backend via its environment flag.
# NOTE(review): presumably set to avoid fork-related warnings once the
# Trainer starts worker processes — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [3]:
# Load the train and test CSV splits into pandas DataFrames
TRAIN_CSV = '../../data/Article-Bias-Prediction/article-bias-detection_train.csv'
TEST_CSV  = '../../data/Article-Bias-Prediction/article-bias-detection_test.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [4]:
# Preprocess (in place, on both frames): replace missing title/content values
# with empty strings, then build `text` as "<title> <content>".
for df in (train_df, test_df):
    for col in ('title', 'content'):
        df[col] = df[col].fillna('')
    df['text'] = df['title'] + ' ' + df['content']

In [5]:
# Convert to HuggingFace Datasets, keeping only the combined text and the
# `bias` target column of each split.
MODEL_COLUMNS = ['text', 'bias']
train_ds, test_ds = (
    Dataset.from_pandas(frame[MODEL_COLUMNS]) for frame in (train_df, test_df)
)

In [6]:
# Load tokenizer & tokenize
# The tokenizer is loaded from the same checkpoint name as the model.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)

def tokenize(batch):
    """Tokenize the `text` entries of a batch.

    Sequences are truncated to at most 512 tokens; no padding is requested
    here. Returns whatever the module-level `tokenizer` produces for the batch.
    """
    texts = batch['text']
    return tokenizer(texts, max_length=512, truncation=True)

# Apply `tokenize` batch-wise to the train split, then to the test split
train_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, test_ds)
)

Map:   0%|          | 0/30041 [00:00<?, ? examples/s]

Map:   0%|          | 0/7513 [00:00<?, ? examples/s]

In [7]:
# Set format for PyTorch
def _as_torch_split(split):
    """Rename `bias` to `labels` and format the model columns as torch.

    Only `input_ids`, `attention_mask` and `labels` are included in the
    torch format; the renamed dataset is returned.
    """
    renamed = split.rename_column('bias', 'labels')
    renamed.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return renamed

train_ds = _as_torch_split(train_ds)
test_ds  = _as_torch_split(test_ds)

In [8]:
# Load model
# The 3-way classification head is not part of the `bert-base-uncased`
# checkpoint and is randomly initialised at load time (see the warning this
# cell prints). Seed the torch RNG first so that initialisation is repeatable
# across kernel restarts; otherwise every fresh run starts from different
# head weights.
# 42 mirrors the documented default `seed` of TrainingArguments — TODO confirm
# if a different seed is ever configured there.
SEED = 42
torch.manual_seed(SEED)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Prepare Trainer
# Training configuration, grouped by concern and unpacked into TrainingArguments.
training_config = dict(
    # where checkpoints are written
    output_dir='./bert-bias-checkpoint',
    # evaluate and checkpoint once per epoch; log every 100 steps
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=100,
    # batch sizes and schedule
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    # optimisation hyper-parameters
    learning_rate=2e-5,
    weight_decay=0.01,
    # reload the best checkpoint at the end, ranked by the `accuracy` metric
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)
args = TrainingArguments(**training_config)

# Metric objects used by compute_metrics
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    `eval_pred` is unpacked into `(logits, labels)`. The predicted class is
    the argmax over the last axis of `logits`. Returns a dict with the keys
    "accuracy" and "f1", computed by the module-level `accuracy` and `f1`
    metric objects.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(
            predictions=predicted, references=labels
        )["accuracy"],
        "f1": f1.compute(
            predictions=predicted, references=labels, average="weighted"
        )["f1"],
    }

# Padding is left to the collator: the tokenization step only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the test split is also the evaluation set, and `args` asks for
# the best checkpoint to be reloaded based on evaluation accuracy — confirm
# that selecting on the test split is intended.
trainer_kwargs = {
    "model": model,
    "args": args,
    "train_dataset": train_ds,
    "eval_dataset": test_ds,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
# Train & Evaluate
# Fine-tune, then run a final evaluation; the metrics dict is the cell's output.
train_output = trainer.train()
eval_metrics = trainer.evaluate()
eval_metrics

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4979,0.526773,0.789032,0.788463


{'eval_loss': 0.5267727375030518,
 'eval_accuracy': 0.7890323439371756,
 'eval_f1': 0.7884630055110862,
 'eval_runtime': 68.1634,
 'eval_samples_per_second': 110.22,
 'eval_steps_per_second': 6.895,
 'epoch': 1.0}