In [None]:
!free -mh

               total        used        free      shared  buff/cache   available
Mem:            12Gi       821Mi       6.7Gi       1.0Mi       5.2Gi        11Gi
Swap:             0B          0B          0B


In [None]:
# Authenticate this session with the Hugging Face Hub.
# login() is interactive: called without arguments it prompts for an access token.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import json
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Location of the labelled training articles (JSON) in the project repository.
DATA_URL = "https://raw.githubusercontent.com/chutki26/dissertation-newscrawler/refs/heads/main/training_articles.json"

# Access tokens must not be stored in the notebook (they leak through version
# control and shared copies). If the file needs a token, export it as
# GITHUB_RAW_TOKEN before starting the kernel; without it the plain URL is used.
github_token = os.environ.get("GITHUB_RAW_TOKEN")
df = pd.read_json(f"{DATA_URL}?token={github_token}" if github_token else DATA_URL)

In [None]:
# Build one combined text field per article.
# Missing values in the source columns are replaced with empty strings first,
# so the concatenation below never yields NaN for a row.
for text_column in ('description', 'title', 'text'):
    df[text_column] = df[text_column].fillna("")

# Order of the parts: title, then description, then article body.
df['full_text'] = df['title'] + " " + df['description'] + " " + df['text']

In [None]:
# convert "true" and "false" to 0 or 1

def convert_to_binary(value):
    """Map a relevance flag to an integer class label.

    Args:
        value: The raw flag. The text "true" (any letter case, surrounding
            whitespace ignored) and the boolean ``True`` count as positive.

    Returns:
        int: 1 for a positive flag, 0 for anything else (e.g. "false",
        ``False``, ``None``, numbers, empty strings).
    """
    # Comparing the normalised string form covers "true"/"True"/" true " as
    # well as boolean values, whose str() is "True". An exact == "true" check
    # would silently turn every one of those into the negative class.
    return int(str(value).strip().lower() == "true")

# Integer class label derived element-wise from the 'relevant' column.
df['label'] = df['relevant'].map(convert_to_binary)

In [None]:
# set up cross validation
# stratified k fold ensures that class (im)balance is kept when splitting data
# shuffle=True together with a fixed random_state keeps the fold assignment
# the same every time the splitter is iterated
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
## check how many of each class is in each fold
for i, (train_index, test_index) in enumerate(skf.split(df, df['label'])):
    # Per-class row counts on each side of this fold's split.
    train_class_counts = df.iloc[train_index]['label'].value_counts()
    test_class_counts = df.iloc[test_index]['label'].value_counts()

    print(f"Fold {i}:")
    print(f"  Train Classes: {train_class_counts}")
    print(f"  Test Classes: {test_class_counts}")

Fold 0:
  Train Classes: label
0    112
1     48
Name: count, dtype: int64
  Test Classes: label
0    28
1    12
Name: count, dtype: int64
Fold 1:
  Train Classes: label
0    112
1     48
Name: count, dtype: int64
  Test Classes: label
0    28
1    12
Name: count, dtype: int64
Fold 2:
  Train Classes: label
0    112
1     48
Name: count, dtype: int64
  Test Classes: label
0    28
1    12
Name: count, dtype: int64
Fold 3:
  Train Classes: label
0    112
1     48
Name: count, dtype: int64
  Test Classes: label
0    28
1    12
Name: count, dtype: int64
Fold 4:
  Train Classes: label
0    112
1     48
Name: count, dtype: int64
  Test Classes: label
0    28
1    12
Name: count, dtype: int64


In [None]:
# Initialize metrics tracking
# These containers are filled by the cross-validation loop, one fold at a time.
# Re-running this cell empties them.
all_predictions = []  # predicted class per validation row, across folds
all_labels = []       # true class per validation row, same order as all_predictions
fold_metrics = []     # one metrics dict (from trainer.evaluate()) per fold

In [None]:
def get_tokenizer(model_name="roberta-base"):
    """Load the pretrained tokenizer for a checkpoint.

    Args:
        model_name: Checkpoint identifier handed to
            ``AutoTokenizer.from_pretrained``. Defaults to "roberta-base",
            so calling without arguments behaves as before.

    Returns:
        The tokenizer object returned by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(model_name)

def get_model(model_name="roberta-base", num_labels=2):
    """Load a pretrained checkpoint with a sequence-classification head.

    Args:
        model_name: Checkpoint identifier handed to
            ``AutoModelForSequenceClassification.from_pretrained``.
            Defaults to "roberta-base".
        num_labels: Number of output classes for the classification head.
            Defaults to 2 (binary classification).

    Returns:
        The model object returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )

In [None]:
# create dataset class
class NewsDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item at access time.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of class labels, same length as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that returns
            a mapping of tensors carrying a leading batch dimension.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 512.

    Raises:
        AssertionError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        assert len(texts) == len(labels), "Texts and labels must have the same length"
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` plus its label under 'labels'."""
        encodings = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Remove the batch dimension added by the tokenizer
        item = {key: val[0] for key, val in encodings.items()}

        # Force an integer (long) label tensor. Letting torch infer the dtype
        # yields float or bool tensors when the labels are stored as floats or
        # booleans, which is not a valid class-index target.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with argmax over axis 1, so it is assumed to hold one row of class
            scores per example (TODO confirm for other model types).

    Returns:
        dict: Keys 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 use ``average='binary'``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # zero_division=0 states the fallback explicitly: when no example is
    # predicted positive the undefined scores are reported as 0 (the same
    # value as the default) without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# Cross-validation loop
#
# For every stratified fold: fine-tune a fresh model on the training split,
# evaluate it on the held-out split, and collect the held-out predictions so
# that pooled and per-fold-averaged metrics can be reported afterwards.

# Start each run of this cell from empty accumulators. They are module-level
# lists, so without this reset a re-run (or a run after an interrupted attempt)
# mixes folds from earlier executions into the report and the averages.
all_predictions = []
all_labels = []
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(df, df['label']), total=n_splits)):
    print(f"\nFold {fold+1}/{n_splits}")

    # Split data
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Initialize tokenizer and model for this fold
    # (a new model per fold, so no weights carry over between folds)
    tokenizer = get_tokenizer()
    model = get_model()

    # Create datasets
    train_dataset = NewsDataset(
        train_df['full_text'].tolist(),
        train_df['label'].tolist(),
        tokenizer
    )

    val_dataset = NewsDataset(
        val_df['full_text'].tolist(),
        val_df['label'].tolist(),
        tokenizer
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f'./results/fold-{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # Warm up over the first 10% of the optimizer steps. A fixed 100-step
        # warmup is longer than the entire run when a fold has few batches
        # (e.g. 20 steps per epoch x 3 epochs = 60 steps), in which case the
        # learning rate never reaches its configured peak.
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir=f'./logs/fold-{fold}',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Keep the checkpoint with the best validation F1 rather than the last one.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train model
    trainer.train()

    # Evaluate
    eval_results = trainer.evaluate()
    fold_metrics.append(eval_results)

    # Get predictions for this fold
    predictions = trainer.predict(val_dataset)
    preds = np.argmax(predictions.predictions, axis=1)

    # Store predictions and labels for aggregate metrics
    all_predictions.extend(preds)
    all_labels.extend(val_df['label'].tolist())

    # Print fold results
    print(f"Fold {fold+1} results:")
    print(eval_results)


# Detailed classification report (pooled over the held-out rows of all folds)
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions))

# Averages across folds (unweighted mean of the per-fold scores)
avg_metrics = {
    'accuracy': np.mean([m['eval_accuracy'] for m in fold_metrics]),
    'precision': np.mean([m['eval_precision'] for m in fold_metrics]),
    'recall': np.mean([m['eval_recall'] for m in fold_metrics]),
    'f1': np.mean([m['eval_f1'] for m in fold_metrics]),
}

print("\nAverage Metrics Across Folds:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")






  0%|          | 0/5 [00:00<?, ?it/s]


Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6655,0.649007,0.7,0.0,0.0,0.0
2,0.6681,0.624105,0.7,0.0,0.0,0.0
3,0.5025,0.247465,0.925,0.846154,0.916667,0.88


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 results:
{'eval_loss': 0.24746493995189667, 'eval_accuracy': 0.925, 'eval_precision': 0.8461538461538461, 'eval_recall': 0.9166666666666666, 'eval_f1': 0.88, 'eval_runtime': 1.2948, 'eval_samples_per_second': 30.893, 'eval_steps_per_second': 3.862, 'epoch': 3.0}

Fold 2/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6682,0.647619,0.7,0.0,0.0,0.0
2,0.6682,0.551493,0.7,0.0,0.0,0.0
3,0.3365,0.590779,0.75,1.0,0.166667,0.285714


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 results:
{'eval_loss': 0.5907794237136841, 'eval_accuracy': 0.75, 'eval_precision': 1.0, 'eval_recall': 0.16666666666666666, 'eval_f1': 0.2857142857142857, 'eval_runtime': 1.3248, 'eval_samples_per_second': 30.192, 'eval_steps_per_second': 3.774, 'epoch': 3.0}

Fold 3/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6574,0.652114,0.7,0.0,0.0,0.0
2,0.541,0.642756,0.7,0.0,0.0,0.0
3,0.3237,0.618262,0.8,0.7,0.583333,0.636364


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 3 results:
{'eval_loss': 0.6182623505592346, 'eval_accuracy': 0.8, 'eval_precision': 0.7, 'eval_recall': 0.5833333333333334, 'eval_f1': 0.6363636363636364, 'eval_runtime': 1.2796, 'eval_samples_per_second': 31.26, 'eval_steps_per_second': 3.907, 'epoch': 3.0}

Fold 4/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6732,0.647548,0.7,0.0,0.0,0.0
2,0.635,0.536445,0.7,0.0,0.0,0.0
3,0.3127,0.123149,0.925,0.909091,0.833333,0.869565


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 4 results:
{'eval_loss': 0.12314943969249725, 'eval_accuracy': 0.925, 'eval_precision': 0.9090909090909091, 'eval_recall': 0.8333333333333334, 'eval_f1': 0.8695652173913043, 'eval_runtime': 1.3487, 'eval_samples_per_second': 29.658, 'eval_steps_per_second': 3.707, 'epoch': 3.0}

Fold 5/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6758,0.645909,0.7,0.0,0.0,0.0
2,0.6822,0.556608,0.7,0.0,0.0,0.0
3,0.4381,0.593477,0.8,1.0,0.333333,0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 5 results:
{'eval_loss': 0.5934766530990601, 'eval_accuracy': 0.8, 'eval_precision': 1.0, 'eval_recall': 0.3333333333333333, 'eval_f1': 0.5, 'eval_runtime': 1.376, 'eval_samples_per_second': 29.07, 'eval_steps_per_second': 3.634, 'epoch': 3.0}

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       168
           1       0.85      0.47      0.61        72

    accuracy                           0.82       240
   macro avg       0.83      0.72      0.74       240
weighted avg       0.82      0.82      0.80       240


Average Metrics Across Folds:
accuracy: 0.8167
precision: 0.7425
recall: 0.4722
f1: 0.5286


In [None]:
# Calculate and display overall metrics
# Scores here are computed once over the predictions pooled from every fold,
# rather than averaged per fold.
print("\n===== Overall Cross-Validation Results =====")

pooled_scores = precision_recall_fscore_support(
    all_labels, all_predictions, average='binary'
)
precision, recall, f1 = pooled_scores[0], pooled_scores[1], pooled_scores[2]
accuracy = accuracy_score(all_labels, all_predictions)

summary_rows = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for heading, score in summary_rows:
    print(f"{heading}: {score:.4f}")



===== Overall Cross-Validation Results =====
Accuracy: 0.8167
Precision: 0.8500
Recall: 0.4722
F1 Score: 0.6071
