# Fine-Tuning a Transformer for Fake News Detection

This notebook contains the complete pipeline for fine-tuning a transformer model (like BERT or DistilBERT) for text classification, based on the `train.py` script. The process includes:

1.  **Configuration**: Setting up all experiment parameters.
2.  **Helper Functions**: Defining logic for layer freezing and metrics computation.
3.  **Data Loading & Preparation**: Loading the processed dataset and creating a validation split.
4.  **Tokenization**: Preparing the text data for the model.
5.  **Model Setup**: Loading the model and applying the layer freezing strategy.
6.  **Training**: Running the fine-tuning process using the Hugging Face `Trainer`.
7.  **Saving & Pushing to Hub**: Saving the final model and optionally pushing it to the Hugging Face Hub.

## 1. Imports and Setup

In [1]:
import pandas as pd
import transformers
import torch
import accelerate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from huggingface_hub import HfFolder, notebook_login
from datasets import Dataset, DatasetDict
import os
import logging

# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import TrainingArguments
import inspect
sig = inspect.signature(TrainingArguments.__init__)
print([param for param in sig.parameters.keys() if 'eval' in param.lower()])

['do_eval', 'eval_strategy', 'per_device_eval_batch_size', 'per_gpu_eval_batch_size', 'eval_accumulation_steps', 'eval_delay', 'jit_mode_eval', 'bf16_full_eval', 'fp16_full_eval', 'eval_steps', 'eval_do_concat_batches', 'batch_eval_metrics', 'eval_on_start', 'eval_use_gather_object']


## 2. Helper Functions

### Layer Freezing Logic

This function applies the chosen freezing strategy ('all', 'half', or 'none') to the model's encoder layers.

In [3]:
def freeze_layers(model, model_type, freeze_mode, num_total_layers_bert=12, num_total_layers_distilbert=6):
    logger.info(f"Applying freeze mode: {freeze_mode} for model type: {model_type}")
    if freeze_mode == "none": # Full fine-tuning
        for param in model.parameters():
            param.requires_grad = True
        logger.info("All layers are trainable (full fine-tuning).")
        return

    if model_type == 'bert':
        encoder_layers = model.bert.encoder.layer
        embeddings = model.bert.embeddings
        num_total_layers = num_total_layers_bert
    elif model_type == 'distilbert':
        encoder_layers = model.distilbert.transformer.layer
        embeddings = model.distilbert.embeddings
        num_total_layers = num_total_layers_distilbert
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Freeze embeddings by default when freezing encoder layers
    logger.info("Freezing embedding layers.")
    for param in embeddings.parameters():
        param.requires_grad = False

    if freeze_mode == "all":
        num_layers_to_freeze = num_total_layers
    elif freeze_mode == "half":
        num_layers_to_freeze = num_total_layers // 2
    else:
        raise ValueError(f"Unsupported freeze mode: {freeze_mode}. Choose 'all', 'half', or 'none'.")

    logger.info(f"Total encoder layers: {num_total_layers}. Layers to freeze: {num_layers_to_freeze}.")

    for i, layer in enumerate(encoder_layers):
        if i < num_layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False
        else:
            for param in layer.parameters(): # Ensure subsequent layers are trainable
                param.requires_grad = True

    # Ensure the classifier head is always trainable
    trainable_classifier = False
    if hasattr(model, 'classifier') and model.classifier is not None:
        for param in model.classifier.parameters():
            param.requires_grad = True
        trainable_classifier = True
    if hasattr(model, 'pre_classifier') and model.pre_classifier is not None: # For DistilBERT
         for param in model.pre_classifier.parameters():
            param.requires_grad = True
         trainable_classifier = True

    if trainable_classifier:
        logger.info("Classifier head parameters are set to trainable.")
    else:
        logger.warning("Could not find a standard classifier head to ensure it's trainable.")

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    logger.info(f"Number of trainable parameters: {trainable_params} / {total_params} ({100 * trainable_params / total_params:.2f}%)")

### Metrics Computation

This function calculates accuracy, F1-score, precision, and recall. It will be passed to the `Trainer` to evaluate the model on the validation set during training.

In [4]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

## 3. Configuration

Instead of command-line arguments, we define all parameters in this configuration class. **This is the main place to change settings for different experiments.**

In [5]:
class TrainingConfig:
    # --- Model arguments ---
    # Change for different experiments: 'distilbert-base-uncased' or 'bert-base-uncased'
    model_name_or_path = "distilbert-base-uncased"
    # Must match model_name_or_path: 'distilbert' or 'bert'
    model_type = "distilbert"
    # Change for different experiments: 'all', 'half', or 'none'
    freeze_mode = "half"

    # --- Data arguments ---
    # Note: Paths are relative to the `notebooks` directory
    train_file = "../data/processed/train.csv"
    eval_file = None # Set to "../data/processed/test.csv" or other if you have a separate eval file
    text_column = "text"
    label_column = "label"
    validation_split_size = 0.1 # Used if eval_file is None

    # --- Training arguments ---
    # Note: Paths are relative to the `notebooks` directory
    output_dir = f"../results/{model_type}_{freeze_mode}_notebook"
    num_train_epochs = 3
    per_device_train_batch_size = 8
    per_device_eval_batch_size = 16
    learning_rate = 5e-5
    weight_decay = 0.01
    warmup_steps = 0
    logging_steps = 100
    save_steps = 500 # Used if save_strategy is 'steps'
    eval_strategy = "epoch"
    save_strategy = "epoch"
    load_best_model_at_end = True
    metric_for_best_model = "f1"
    fp16 = torch.cuda.is_available() # Enable if you have a compatible GPU

    # --- Hugging Face Hub arguments ---
    push_to_hub = True # Set to True to push model to the Hub
    # IMPORTANT: Change this to your username and a descriptive model name
    hub_model_id = f"stegostegosaur/{model_type}-{freeze_mode}-fakern"
    hub_token = None # Will use token from `notebook_login` or saved token

# Instantiate the config
args = TrainingConfig()

## 4. Hugging Face Hub Login

If `push_to_hub` is set to `True` in the config, this cell will prompt you to log in to the Hugging Face Hub. You'll need to provide an access token with `write` permissions.

In [6]:
if args.push_to_hub:
    logger.info("Attempting to log into Hugging Face Hub...")
    hub_token_to_use = args.hub_token or HfFolder.get_token()
    if hub_token_to_use:
        logger.info("Token found, will use it for pushing to Hub.")
    else:
        logger.info("Hub token not found. Running notebook_login().")
        notebook_login()

2025-06-10 15:21:05,645 - INFO - Attempting to log into Hugging Face Hub...
2025-06-10 15:21:05,651 - INFO - Token found, will use it for pushing to Hub.


## 5. Load and Prepare Data

This section loads the training data from the specified CSV file. If no evaluation file is provided, it will automatically split the training data to create a validation set.

In [7]:
logger.info(f"Loading training data from: {args.train_file}")
train_df = pd.read_csv(args.train_file)

if args.eval_file:
    logger.info(f"Loading evaluation data from: {args.eval_file}")
    eval_df = pd.read_csv(args.eval_file)
    raw_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'validation': Dataset.from_pandas(eval_df)
    })
else:
    logger.info(f"No evaluation file provided. Splitting training data for validation (split: {args.validation_split_size}).")
    train_pandas_df, eval_pandas_df = train_test_split(
        train_df,
        test_size=args.validation_split_size,
        random_state=42,
        stratify=train_df[args.label_column] if args.label_column in train_df.columns else None
    )
    raw_datasets = DatasetDict({
        'train': Dataset.from_pandas(train_pandas_df),
        'validation': Dataset.from_pandas(eval_pandas_df)
    })

logger.info(f"Raw datasets loaded: {raw_datasets}")

2025-06-10 15:21:08,427 - INFO - Loading training data from: ../data/processed/train.csv


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/train.csv'

### Process Labels

The model requires integer labels. We find all unique labels in our data and create `label2id` and `id2label` mappings, which are essential for both training and inference.

In [104]:
unique_labels = train_df[args.label_column].unique()
label2id = {int(label): i for i, label in enumerate(sorted(unique_labels))}
id2label = {i: int(label) for label, i in label2id.items()}
num_labels = len(unique_labels)

logger.info(f"Found {num_labels} unique labels: {unique_labels}. Label mapping: {label2id}")

def map_labels(example):
    example[args.label_column] = label2id[example[args.label_column]]
    return example

raw_datasets = raw_datasets.map(map_labels, batched=False)

2025-06-10 14:58:40,350 - INFO - Found 2 unique labels: [0 1]. Label mapping: {0: 0, 1: 1}
Map: 100%|██████████| 42242/42242 [00:02<00:00, 15067.06 examples/s]
Map: 100%|██████████| 4694/4694 [00:00<00:00, 29985.19 examples/s]


## 6. Tokenization

We load the tokenizer that corresponds to our chosen model and apply it to our datasets. This converts the text into a format the model can understand (token IDs).

In [105]:
logger.info(f"Loading tokenizer for: {args.model_name_or_path}")
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

def tokenize_function(examples):
    return tokenizer(examples[args.text_column], padding="max_length", truncation=True, max_length=512)

logger.info("Tokenizing datasets...")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=[args.text_column] if args.text_column in raw_datasets['train'].column_names else None)

# The Trainer expects the label column to be named 'labels'
if args.label_column != 'labels':
     tokenized_datasets = tokenized_datasets.rename_column(args.label_column, "labels")

logger.info(f"Tokenized datasets ready: {tokenized_datasets}")

2025-06-10 14:58:43,366 - INFO - Loading tokenizer for: distilbert-base-uncased
2025-06-10 14:58:43,828 - INFO - Tokenizing datasets...
Map: 100%|██████████| 42242/42242 [00:20<00:00, 2093.68 examples/s]
Map: 100%|██████████| 4694/4694 [00:02<00:00, 2031.02 examples/s]
2025-06-10 14:59:06,332 - INFO - Tokenized datasets ready: DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 42242
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 4694
    })
})


## 7. Model Setup

Now we load the pre-trained model. We provide `num_labels`, `id2label`, and `label2id` so it initializes with a classification head tailored to our specific task. Afterwards, we apply our chosen layer freezing strategy.

In [106]:
logger.info(f"Loading model: {args.model_name_or_path} for {num_labels}-class classification.")
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name_or_path,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# --- Apply Layer Freezing ---
if args.freeze_mode != 'none':
    freeze_layers(model, args.model_type, args.freeze_mode)
else:
    logger.info("No layer freezing applied (full fine-tuning). All model parameters are trainable.")

2025-06-10 14:59:06,339 - INFO - Loading model: distilbert-base-uncased for 2-class classification.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-06-10 14:59:06,553 - INFO - Applying freeze mode: half for model type: distilbert
2025-06-10 14:59:06,553 - INFO - Freezing embedding layers.
2025-06-10 14:59:06,553 - INFO - Total encoder layers: 6. Layers to freeze: 3.
2025-06-10 14:59:06,553 - INFO - Classifier head parameters are set to trainable.
2025-06-10 14:59:06,554 - INFO - Number of trainable parameters: 21855746 / 66955010 (32.64%)


## 8. Training

Finally, we define the `TrainingArguments` and instantiate the `Trainer`. The `trainer.train()` call will start the fine-tuning process.

In [114]:
os.makedirs(args.output_dir, exist_ok=True)

training_args_dict = {
    "output_dir": args.output_dir,
    "num_train_epochs": args.num_train_epochs,
    "per_device_train_batch_size": args.per_device_train_batch_size,
    "per_device_eval_batch_size": args.per_device_eval_batch_size,
    "learning_rate": args.learning_rate,
    "weight_decay": args.weight_decay,
    "warmup_steps": args.warmup_steps,
    "logging_dir": os.path.join(args.output_dir, 'logs'),
    "logging_steps": args.logging_steps,
    "eval_strategy": args.eval_strategy if 'validation' in tokenized_datasets else "no",
    "save_strategy": args.save_strategy,
    "save_steps": args.save_steps,
    "load_best_model_at_end": args.load_best_model_at_end if 'validation' in tokenized_datasets else False,
    "metric_for_best_model": args.metric_for_best_model if 'validation' in tokenized_datasets else None,
    "greater_is_better": True if args.metric_for_best_model in ["accuracy", "f1"] else None,
    "fp16": args.fp16,
    "report_to": "tensorboard",
}
if args.push_to_hub:
    training_args_dict["push_to_hub"] = True
    training_args_dict["hub_model_id"] = args.hub_model_id
    if args.hub_token:
         training_args_dict["hub_token"] = args.hub_token

training_arguments = TrainingArguments(**training_args_dict)

# --- Instantiate Trainer ---
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets.get("validation"),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics if tokenized_datasets.get("validation") else None,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
logger.info("Starting model training...")

try:
    train_result = trainer.train()
    trainer.save_model()  # Saves the tokenizer too
    logger.info("Training finished successfully.")

    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    if args.push_to_hub:
        logger.info(f"Pushing model and tokenizer to Hugging Face Hub: {args.hub_model_id}")
        trainer.push_to_hub(commit_message="End of training from notebook")
        logger.info("Model pushed to Hub successfully.")

except Exception as e:
    logger.exception(f"An error occurred during training: {e}")