In [None]:
import json
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Source of the labelled training articles (JSON). Override it by setting the
# TRAINING_ARTICLES_SOURCE environment variable to a local path or another URL.
#
# No access token is embedded in the URL: raw.githubusercontent.com "?token=..."
# links are short-lived credentials that must not be committed to a notebook.
# The token previously hard-coded in this cell no longer worked (the request
# failed with HTTP 404), presumably because it had expired.
DATA_SOURCE = os.environ.get(
    "TRAINING_ARTICLES_SOURCE",
    "https://raw.githubusercontent.com/chutki26/dissertation-newscrawler/refs/heads/main/training_articles.json",
)
df = pd.read_json(DATA_SOURCE)

HTTPError: HTTP Error 404: Not Found

In [None]:
# Build one `full_text` field per article from its three textual columns

# Replace missing values with empty strings before concatenating
for column in ('description', 'title', 'text'):
    df[column] = df[column].fillna("")

# title, description and body text, separated by single spaces
df['full_text'] = df['title'].add(" ").add(df['description']).add(" ").add(df['text'])

In [None]:
# convert "true" and "false" to 0 or 1

def convert_to_binary(value):
    """Map a relevance flag to an integer class label.

    Args:
        value: The raw flag. Strings are compared case-insensitively after
            stripping surrounding whitespace; real booleans (Python ``bool``
            or ``numpy.bool_``) are accepted as well, so the mapping still
            works if the JSON stores ``true``/``false`` as booleans rather
            than as strings.

    Returns:
        ``1`` when the flag means "true", ``0`` for everything else
        (including ``"false"``, ``None`` and NaN).
    """
    if isinstance(value, str):
        return 1 if value.strip().lower() == "true" else 0
    if isinstance(value, (bool, np.bool_)):
        return int(value)
    # Any other type (None, NaN, numbers, ...) is treated as "not relevant",
    # matching the original fall-through behaviour.
    return 0

# Derive the integer class label from the `relevant` flag, element by element
df['label'] = df['relevant'].apply(convert_to_binary)

In [None]:
# Cross-validation configuration.
# StratifiedKFold keeps the class (im)balance of the labels in every split;
# shuffling with a fixed random_state makes the fold assignment repeatable.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [None]:
# Accumulators filled by the cross-validation loop:
#   all_predictions - predicted class for every validation example, all folds
#   all_labels      - matching true labels, in the same order
#   fold_metrics    - one metrics dict per fold
all_predictions, all_labels, fold_metrics = [], [], []


In [None]:
def get_tokenizer(model_name="bert-base-uncased"):
    """Load the pretrained tokenizer for a checkpoint.

    Args:
        model_name: Checkpoint identifier handed to
            ``AutoTokenizer.from_pretrained``. Defaults to
            ``"bert-base-uncased"``, so existing zero-argument calls behave
            exactly as before.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained(model_name)``.
    """
    return AutoTokenizer.from_pretrained(model_name)

def get_model(model_name="bert-base-uncased", num_labels=2):
    """Load a pretrained checkpoint with a sequence-classification head.

    Each call returns a new model object, which lets every cross-validation
    fold start from the same pretrained weights.

    Args:
        model_name: Checkpoint identifier handed to
            ``AutoModelForSequenceClassification.from_pretrained``. Defaults
            to ``"bert-base-uncased"``.
        num_labels: Number of output classes for the classification head.
            Defaults to ``2`` (binary relevant / not relevant).

    Returns:
        The object returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )

In [None]:
# create dataset class
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing article texts with integer class labels.

    Texts are tokenized lazily: the tokenizer is called once per
    ``__getitem__`` access rather than up front for the whole corpus.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of class labels, same length as ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")``; its result must expose ``.items()``
            yielding tensors with a leading batch dimension of size 1.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 512, the value that was previously hard-coded.

    Raises:
        AssertionError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        assert len(texts) == len(labels), "Texts and labels must have the same length"
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a ``labels`` entry."""
        encodings = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Remove the batch dimension added by the tokenizer
        item = {key: val[0] for key, val in encodings.items()}
        # Force an integer (long) label tensor: with dtype inference a bool
        # label would become a bool tensor, which is not a valid class index.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from raw model outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the classes along axis 1 (the predicted
            class is the argmax over that axis). If ``predictions`` is a
            tuple, its first element is used as the scores.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``. Precision/recall/F1 use ``average='binary'``.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple of outputs; assumed here that the class
    # scores come first - TODO confirm if a non-BERT model is plugged in.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)

    # zero_division=0 keeps the reported value at 0.0 when a metric is
    # undefined (e.g. precision when no positive class is predicted, as in
    # early epochs) but silences the UndefinedMetricWarning for that case.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predicted_classes)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# Cross-validation loop
#
# For every stratified fold: fine-tune a fresh model on the training split,
# score it on the held-out split, and collect the fold's metrics and
# predictions for the aggregate reports that follow.

# Reset the accumulators here so that re-running this cell starts from a clean
# slate instead of appending a second copy of every fold's results.
all_predictions = []
all_labels = []
fold_metrics = []

# The tokenizer is the same for every fold, so load it once rather than
# once per iteration.
tokenizer = get_tokenizer()

for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(df, df['label']), total=n_splits)):
    print(f"\nFold {fold+1}/{n_splits}")

    # Split data
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Fresh model per fold so nothing learned on one fold leaks into the next.
    # Seed torch first: the classification head is newly (randomly)
    # initialised when the checkpoint is loaded, which happens before the
    # Trainer below is constructed and applies its own seed.
    torch.manual_seed(42)
    model = get_model()

    # Create datasets
    train_dataset = NewsDataset(
        train_df['full_text'].tolist(),
        train_df['label'].tolist(),
        tokenizer
    )

    val_dataset = NewsDataset(
        val_df['full_text'].tolist(),
        val_df['label'].tolist(),
        tokenizer
    )

    # Training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; switch the keyword if TrainingArguments rejects it.
    # NOTE(review): the best checkpoint is chosen by F1 on this same
    # validation fold (load_best_model_at_end + metric_for_best_model), so the
    # fold metrics reported below are optimistically biased.
    training_args = TrainingArguments(
        output_dir=f'./results/fold-{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f'./logs/fold-{fold}',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train model
    trainer.train()

    # Score the validation fold once: a single predict() pass returns both the
    # raw predictions and the metrics. metric_key_prefix="eval" keeps the
    # `eval_*` key names that the averaging code after the loop reads.
    predictions = trainer.predict(val_dataset, metric_key_prefix="eval")
    eval_results = predictions.metrics
    fold_metrics.append(eval_results)
    preds = np.argmax(predictions.predictions, axis=1)

    # Store predictions and labels for aggregate metrics
    all_predictions.extend(preds.tolist())
    all_labels.extend(val_df['label'].tolist())

    # Print fold results
    print(f"Fold {fold+1} results:")
    print(eval_results)


# Per-class report computed over the pooled validation predictions of all folds
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions))

# Unweighted mean of each per-fold metric
avg_metrics = {
    name: np.mean([fold_result[f'eval_{name}'] for fold_result in fold_metrics])
    for name in ('accuracy', 'precision', 'recall', 'f1')
}

print("\nAverage Metrics Across Folds:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")






  0%|          | 0/5 [00:00<?, ?it/s]


Fold 1/5


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6308,0.583223,0.7,0.0,0.0,0.0
2,0.4232,0.299278,0.95,1.0,0.833333,0.909091
3,0.2938,0.127057,1.0,1.0,1.0,1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 results:
{'eval_loss': 0.12705715000629425, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 63.5096, 'eval_samples_per_second': 0.63, 'eval_steps_per_second': 0.079, 'epoch': 3.0}

Fold 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6232,0.604421,0.7,0.0,0.0,0.0
2,0.5447,0.401099,0.875,0.769231,0.833333,0.8
3,0.2722,0.432212,0.825,1.0,0.416667,0.588235


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 results:
{'eval_loss': 0.4010985493659973, 'eval_accuracy': 0.875, 'eval_precision': 0.7692307692307693, 'eval_recall': 0.8333333333333334, 'eval_f1': 0.8, 'eval_runtime': 64.8861, 'eval_samples_per_second': 0.616, 'eval_steps_per_second': 0.077, 'epoch': 3.0}

Fold 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6253,0.606819,0.7,0.0,0.0,0.0
2,0.4556,0.466725,0.75,0.583333,0.583333,0.583333
3,0.3248,0.450363,0.775,0.578947,0.916667,0.709677


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 3 results:
{'eval_loss': 0.45036250352859497, 'eval_accuracy': 0.775, 'eval_precision': 0.5789473684210527, 'eval_recall': 0.9166666666666666, 'eval_f1': 0.7096774193548387, 'eval_runtime': 64.1318, 'eval_samples_per_second': 0.624, 'eval_steps_per_second': 0.078, 'epoch': 3.0}

Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6257,0.601333,0.7,0.0,0.0,0.0
2,0.5358,0.35926,0.95,1.0,0.833333,0.909091
3,0.2523,0.151432,0.95,1.0,0.833333,0.909091


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 4 results:
{'eval_loss': 0.35925984382629395, 'eval_accuracy': 0.95, 'eval_precision': 1.0, 'eval_recall': 0.8333333333333334, 'eval_f1': 0.9090909090909091, 'eval_runtime': 66.7427, 'eval_samples_per_second': 0.599, 'eval_steps_per_second': 0.075, 'epoch': 3.0}

Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6234,0.60591,0.7,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6234,0.60591,0.7,0.0,0.0,0.0
2,0.5687,0.467593,0.85,0.666667,1.0,0.8
3,0.3834,0.254525,0.9,1.0,0.666667,0.8


Fold 5 results:
{'eval_loss': 0.4675934910774231, 'eval_accuracy': 0.85, 'eval_precision': 0.6666666666666666, 'eval_recall': 1.0, 'eval_f1': 0.8, 'eval_runtime': 65.5119, 'eval_samples_per_second': 0.611, 'eval_steps_per_second': 0.076, 'epoch': 3.0}

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       140
           1       0.76      0.92      0.83        60

    accuracy                           0.89       200
   macro avg       0.86      0.90      0.88       200
weighted avg       0.90      0.89      0.89       200


Average Metrics Across Folds:
accuracy: 0.8900
precision: 0.8030
recall: 0.9167
f1: 0.8438


In [None]:
# Pooled metrics: score the validation predictions of all folds as one set
print("\n===== Overall Cross-Validation Results =====")
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels,
    all_predictions,
    average='binary',
)
accuracy = accuracy_score(all_labels, all_predictions)

for heading, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{heading}: {score:.4f}")



===== Overall Cross-Validation Results =====
Accuracy: 0.8900
Precision: 0.7639
Recall: 0.9167
F1 Score: 0.8333
