In [1]:
# Install required libraries
!pip install transformers datasets evaluate nltk rouge_score pytorch-lightning>=2.0.0 torch>=2.0.0


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install contractions

import pandas as pd
import numpy as np
import re
import contractions
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    T5ForConditionalGeneration,
    AutoModelForSeq2SeqLM,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
import evaluate
import os
import random
import matplotlib.pyplot as plt
import json
from tqdm.auto import tqdm

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [5]:
# Preprocessing functions
def expand_abbreviations(text):
    if not isinstance(text, str):
        return ""

    # Common abbreviations dictionary
    abbr_dict = {
        "gov.": "governor",
        "feb.": "february",
        "jan.": "january",
        "mar.": "march",
        "apr.": "april",
        "jun.": "june",
        "jul.": "july",
        "aug.": "august",
        "sep.": "september",
        "oct.": "october",
        "nov.": "november",
        "dec.": "december",
        "vp": "vice president",
        "eta": "estimated time of arrival",
        "aka": "also known as",
        "vs.": "versus",
        "etc.": "etcetera",
        "dr.": "doctor",
        "mr.": "mister",
        "mrs.": "missus",
        "ms.": "miss",
        "prof.": "professor",
        "rep.": "representative",
        "sen.": "senator",
        "co.": "company",
        "corp.": "corporation",
        "inc.": "incorporated",
    }

    # Convert text to lowercase for easier matching
    text_lower = text.lower()

    # Replace abbreviations
    for abbr, full_form in abbr_dict.items():
        pattern = r'\b' + re.escape(abbr.lower()) + r'\b'
        text_lower = re.sub(pattern, full_form, text_lower)

    return text_lower

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Expand abbreviations
    text = expand_abbreviations(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove special characters except punctuation needed for readability
    text = re.sub(r'[^\w\s.,!?]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def load_and_preprocess_clan_data(file_path='CLAN_data.csv'):
    try:
        # Load the dataset
        print(f"Loading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")

        # Display sample of the raw data
        print("\nSample of raw data:")
        print(df.head(2))

        # Apply preprocessing to social media posts
        print("\nPreprocessing social media posts...")

        # Identify column names based on common patterns
        post_col = next((col for col in df.columns if 'post' in col.lower() or 'text' in col.lower() or 'social media post' in col.lower()), None)
        claim_col = next((col for col in df.columns if 'claim' in col.lower() or 'norm' in col.lower()), None)

        if not post_col:
            raise ValueError("Could not identify social media post column in the dataset.")
        if not claim_col:
            raise ValueError("Could not identify normalized claim column in the dataset.")

        print(f"Using '{post_col}' as post content and '{claim_col}' as claim content")

        # Preprocess posts and claims
        df['preprocessed_post'] = df[post_col].apply(preprocess_text)
        df['preprocessed_claim'] = df[claim_col].apply(preprocess_text)

        # Display sample of preprocessed data
        print("\nSample of preprocessed data:")
        print(df[['preprocessed_post', 'preprocessed_claim']].head(2))

        # Check for missing values
        missing_posts = df['preprocessed_post'].isna().sum()
        missing_claims = df['preprocessed_claim'].isna().sum()
        if missing_posts > 0 or missing_claims > 0:
            print(f"\nWarning: Found {missing_posts} missing posts and {missing_claims} missing claims.")
            # Fill missing values with empty strings
            df['preprocessed_post'] = df['preprocessed_post'].fillna("")
            df['preprocessed_claim'] = df['preprocessed_claim'].fillna("")

        return df

    except Exception as e:
        print(f"Error during data loading or preprocessing: {str(e)}")
        return None


def split_data(df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=42):
    if not np.isclose(train_ratio + val_ratio + test_ratio, 1.0):
        raise ValueError("The sum of ratios must be 1.0")

    # Shuffle the data
    df_shuffled = df.sample(frac=1, random_state=random_state)

    # Calculate split indices
    train_end = int(len(df_shuffled) * train_ratio)
    val_end = int(len(df_shuffled) * (train_ratio + val_ratio))

    # Split the data
    train_df = df_shuffled.iloc[:train_end].copy().reset_index(drop=True)
    val_df = df_shuffled.iloc[train_end:val_end].copy().reset_index(drop=True)
    test_df = df_shuffled.iloc[val_end:].copy().reset_index(drop=True)

    print(f"\nData split complete:")
    print(f"Training set: {len(train_df)} samples ({train_ratio*100:.1f}%)")
    print(f"Validation set: {len(val_df)} samples ({val_ratio*100:.1f}%)")
    print(f"Test set: {len(test_df)} samples ({test_ratio*100:.1f}%)")

    return train_df, val_df, test_df



In [6]:
# Dataset class for claim normalization
class ClaimNormalizationDataset(Dataset):
    def __init__(self, texts, claims, tokenizer, max_input_length=512, max_target_length=128):
        self.texts = texts
        self.claims = claims
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        claim = str(self.claims[idx])

        # Tokenize inputs
        input_encoding = self.tokenizer(
            text,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize targets
        target_encoding = self.tokenizer(
            claim,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Replace padding token id with -100 for loss calculation
        target_ids = target_encoding["input_ids"].squeeze()
        target_ids[target_ids == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_encoding["input_ids"].squeeze(),
            "attention_mask": input_encoding["attention_mask"].squeeze(),
            "labels": target_ids
        }



In [7]:
# Lightning module for claim normalization
class ClaimNormalizationModel(pl.LightningModule):
    def __init__(
        self,
        model_name,
        tokenizer,
        learning_rate=3e-5,
        weight_decay=0.01,
        warmup_steps=500,
        total_steps=None
    ):
        super().__init__()
        self.save_hyperparameters(ignore=['tokenizer'])

        # Load model based on type
        if "bart" in model_name.lower():
            self.model = BartForConditionalGeneration.from_pretrained(model_name)
        elif "t5" in model_name.lower():
            self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model type: {model_name}")

        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps

        # Load evaluation metrics
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")
        self.bertscore = evaluate.load("bertscore")

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )

        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )

        loss = outputs.loss
        self.log("val_loss", loss, prog_bar=True, logger=True)

        # Generate predictions for metric calculation
        generated_ids = self.model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

        pred_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        label_texts = self.tokenizer.batch_decode(
            batch["labels"].masked_fill(batch["labels"] == -100, self.tokenizer.pad_token_id),
            skip_special_tokens=True
        )

        # Filter out empty predictions and corresponding references
        valid_pairs = [(pred, ref) for pred, ref in zip(pred_texts, label_texts) if pred.strip() and ref.strip()]

        # If no valid pairs, use default zero values
        if not valid_pairs:
            self.log("val_rouge1", 0.0, prog_bar=True, logger=True)
            self.log("val_rouge2", 0.0, prog_bar=True, logger=True)
            self.log("val_rouge_l", 0.0, prog_bar=True, logger=True)
            self.log("val_bleu4", 0.0, prog_bar=True, logger=True)
            self.log("val_bertscore", 0.0, prog_bar=True, logger=True)
            return loss

        # Unzip the valid pairs
        valid_preds, valid_refs = zip(*valid_pairs)

        # Calculate ROUGE scores
        rouge_output = self.rouge.compute(
            predictions=valid_preds,
            references=valid_refs,
            use_stemmer=True
        )

        # Extract scores
        rouge1 = rouge_output["rouge1"]
        rouge2 = rouge_output["rouge2"]
        rougeL = rouge_output["rougeL"]

        # Calculate BLEU scores
        bleu_output = self.bleu.compute(
            predictions=valid_preds,
            references=[[ref] for ref in valid_refs],
        )
        bleu4 = bleu_output["bleu"]

        # Calculate BERTScore with error handling
        try:
            bertscore_output = self.bertscore.compute(
                predictions=valid_preds,
                references=valid_refs,
                lang="en",
                model_type="microsoft/deberta-xlarge-mnli"
            )
            bertscore = sum(bertscore_output["f1"]) / len(bertscore_output["f1"]) if bertscore_output["f1"] else 0.0
        except Exception as e:
            print(f"BERTScore calculation error: {e}")
            bertscore = 0.0

        # Log metrics
        self.log("val_rouge1", rouge1, prog_bar=True, logger=True)
        self.log("val_rouge2", rouge2, prog_bar=True, logger=True)
        self.log("val_rouge_l", rougeL, prog_bar=True, logger=True)
        self.log("val_bleu4", bleu4, prog_bar=True, logger=True)
        self.log("val_bertscore", bertscore, prog_bar=True, logger=True)

        return loss

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)

    def configure_optimizers(self):
        # Create optimizer with weight decay
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate)

        # Create scheduler with warmup
        if self.total_steps is None:
            self.total_steps = 1000  # Default value

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_steps
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
            },
        }

    def generate_normalized_claims(self, texts, max_length=128):
        """Generate normalized claims from input texts"""
        self.model.eval()
        tokenized_inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        generated_ids = self.model.generate(
            input_ids=tokenized_inputs["input_ids"],
            attention_mask=tokenized_inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    def calculate_metrics(self, references, predictions):
        """Calculate all metrics for a list of references and predictions"""
        # Filter out empty predictions and corresponding references
        valid_pairs = [(pred, ref) for pred, ref in zip(predictions, references) if pred.strip() and ref.strip()]

        if not valid_pairs:
            return {"rouge_l": 0.0, "bleu4": 0.0, "bertscore": 0.0}

        valid_preds, valid_refs = zip(*valid_pairs)

        # Calculate ROUGE scores
        rouge_output = self.rouge.compute(
            predictions=valid_preds,
            references=valid_refs,
            use_stemmer=True
        )

        # Calculate BLEU score
        bleu_output = self.bleu.compute(
            predictions=valid_preds,
            references=[[ref] for ref in valid_refs],
        )

        # Calculate BERTScore
        try:
            bertscore_output = self.bertscore.compute(
                predictions=valid_preds,
                references=valid_refs,
                lang="en",
                model_type="microsoft/deberta-xlarge-mnli"
            )
            bertscore = sum(bertscore_output["f1"]) / len(bertscore_output["f1"]) if bertscore_output["f1"] else 0.0
        except Exception as e:
            print(f"BERTScore calculation error: {e}")
            bertscore = 0.0

        return {
            "rouge_l": rouge_output["rougeL"],
            "bleu4": bleu_output["bleu"],
            "bertscore": bertscore
        }



In [8]:
def plot_training_history(logs_dir, model_name, save_path=None):
    """
    Plot training and validation loss curves from TensorBoard logs.

    Args:
        logs_dir (str): Directory containing TensorBoard logs
        model_name (str): Name of the model directory in logs
        save_path (str, optional): Path to save the plot. If None, will save to model_name_loss.png

    Returns:
        str: Path to the saved plot
    """
    from tensorboard.backend.event_processing import event_accumulator
    import matplotlib.pyplot as plt
    import os

    # Default save path if not provided
    if save_path is None:
        save_path = f"{model_name}_loss.png"

    # Find the latest log directory
    model_log_dir = os.path.join(logs_dir, model_name)
    version_dirs = [d for d in os.listdir(model_log_dir) if d.startswith('version_')]
    if not version_dirs:
        print(f"No version directories found in {model_log_dir}")
        return None

    # Sort to get the latest version
    version_dirs.sort(key=lambda x: int(x.split('_')[1]))
    latest_version = version_dirs[-1]
    log_path = os.path.join(model_log_dir, latest_version)

    # Load the event file
    event_file = [f for f in os.listdir(log_path) if f.startswith('events.out.tfevents')]
    if not event_file:
        print(f"No event file found in {log_path}")
        return None

    event_path = os.path.join(log_path, event_file[0])
    ea = event_accumulator.EventAccumulator(event_path)
    ea.Reload()

    # Check available tags
    tags = ea.Tags()['scalars']

    # Extract training and validation loss if available
    train_loss = []
    val_loss = []
    steps = []

    if 'train_loss' in tags:
        train_events = ea.Scalars('train_loss')
        train_loss = [event.value for event in train_events]
        steps = [event.step for event in train_events]

    if 'val_loss' in tags:
        val_events = ea.Scalars('val_loss')
        val_loss = [event.value for event in val_events]

    # Create the plot
    plt.figure(figsize=(10, 6))

    if train_loss:
        plt.plot(steps, train_loss, label='Training Loss', color='blue')

    if val_loss:
        # Validation loss will have fewer points (usually 1 per epoch)
        # We need to align it with the training steps
        val_steps = []
        for i, event in enumerate(ea.Scalars('val_loss')):
            val_steps.append(event.step)

        plt.plot(val_steps, val_loss, label='Validation Loss', color='red', marker='o')

    plt.title(f'{model_name} Training History')
    plt.xlabel('Steps')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    # Save the figure
    plt.savefig(save_path)
    print(f"Training history plot saved to {save_path}")
    plt.close()

    return save_path


def plot_all_metrics(results, logs_dir, save_dir="plots"):
    """
    Create comprehensive plots for all models including:
    1. Training and validation loss
    2. Evaluation metrics comparison

    Args:
        results (dict): Dictionary containing model results
        logs_dir (str): Directory containing TensorBoard logs
        save_dir (str): Directory to save plots
    """
    import os
    import matplotlib.pyplot as plt
    import numpy as np

    # Create save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Plot training and validation loss for each model
    loss_plots = {}
    for model_name in results.keys():
        loss_plot_path = plot_training_history(
            logs_dir,
            model_name,
            save_path=os.path.join(save_dir, f"{model_name}_loss.png")
        )
        loss_plots[model_name] = loss_plot_path

    # Plot evaluation metrics comparison
    metrics_plot_path = os.path.join(save_dir, "model_comparison.png")
    plot_metrics_comparison(results, save_path=metrics_plot_path)

    # Create a plot for all validation metrics over time for each model
    for model_name in results.keys():
        plot_validation_metrics(logs_dir, model_name, save_dir=save_dir)

    return {
        "loss_plots": loss_plots,
        "metrics_comparison": metrics_plot_path
    }


def plot_validation_metrics(logs_dir, model_name, save_dir="plots"):
    """
    Plot all validation metrics (ROUGE, BLEU, BERTScore) over time.

    Args:
        logs_dir (str): Directory containing TensorBoard logs
        model_name (str): Name of the model directory in logs
        save_dir (str): Directory to save the plot

    Returns:
        str: Path to the saved plot
    """
    from tensorboard.backend.event_processing import event_accumulator
    import matplotlib.pyplot as plt
    import os

    save_path = os.path.join(save_dir, f"{model_name}_metrics.png")

    # Find the latest log directory
    model_log_dir = os.path.join(logs_dir, model_name)
    version_dirs = [d for d in os.listdir(model_log_dir) if d.startswith('version_')]
    if not version_dirs:
        print(f"No version directories found in {model_log_dir}")
        return None

    # Sort to get the latest version
    version_dirs.sort(key=lambda x: int(x.split('_')[1]))
    latest_version = version_dirs[-1]
    log_path = os.path.join(model_log_dir, latest_version)

    # Load the event file
    event_file = [f for f in os.listdir(log_path) if f.startswith('events.out.tfevents')]
    if not event_file:
        print(f"No event file found in {log_path}")
        return None

    event_path = os.path.join(log_path, event_file[0])
    ea = event_accumulator.EventAccumulator(event_path)
    ea.Reload()

    # Check available tags
    tags = ea.Tags()['scalars']

    # Metrics to extract
    metrics = {
        'val_rouge1': ('ROUGE-1', 'blue'),
        'val_rouge2': ('ROUGE-2', 'green'),
        'val_rouge_l': ('ROUGE-L', 'red'),
        'val_bleu4': ('BLEU-4', 'purple'),
        'val_bertscore': ('BERTScore', 'orange'),
    }

    # Create the plot
    plt.figure(figsize=(12, 7))

    # Extract and plot each metric
    for metric_tag, (metric_label, color) in metrics.items():
        if metric_tag in tags:
            events = ea.Scalars(metric_tag)
            values = [event.value for event in events]
            steps = [event.step for event in events]
            plt.plot(steps, values, label=metric_label, color=color, marker='o')

    plt.title(f'{model_name} Validation Metrics')
    plt.xlabel('Steps')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    # Save the figure
    plt.savefig(save_path)
    print(f"Validation metrics plot saved to {save_path}")
    plt.close()

    return save_path

In [9]:
def train_models(train_df, val_df, test_df, models_config, batch_size=16, max_epochs=5):
    """Train and evaluate multiple models"""
    results = {}
    os.makedirs("plots", exist_ok=True)

    for model_name, model_config in models_config.items():
        print(f"\n{'='*50}")
        print(f"Training {model_name} model: {model_config['pretrained_model']}")
        print(f"{'='*50}")

        # Initialize tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_config['pretrained_model'])

        # Check if the tokenizer has padding token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Create datasets
        train_dataset = ClaimNormalizationDataset(
            train_df['preprocessed_post'].tolist(),
            train_df['preprocessed_claim'].tolist(),
            tokenizer
        )

        val_dataset = ClaimNormalizationDataset(
            val_df['preprocessed_post'].tolist(),
            val_df['preprocessed_claim'].tolist(),
            tokenizer
        )

        test_dataset = ClaimNormalizationDataset(
            test_df['preprocessed_post'].tolist(),
            test_df['preprocessed_claim'].tolist(),
            tokenizer
        )

        # Create data loaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            num_workers=2
        )

        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            num_workers=2
        )

        # Calculate total steps for learning rate scheduler
        total_steps = len(train_loader) * max_epochs

        # Initialize model
        model = ClaimNormalizationModel(
            model_config['pretrained_model'],
            tokenizer,
            learning_rate=model_config.get('learning_rate', 3e-5),
            total_steps=total_steps,
        )

        # Define callbacks
        checkpoint_callback = ModelCheckpoint(
            dirpath=f"checkpoints/{model_name}",
            filename="{epoch}-{val_rouge_l:.4f}",
            monitor="val_rouge_l",
            mode="max",
            save_top_k=1
        )

        early_stop_callback = EarlyStopping(
            monitor="val_loss",
            patience=3,
            mode="min"
        )

        # Define logger
        logger = TensorBoardLogger("logs", name=model_name)

        # Initialize trainer
        trainer = pl.Trainer(
            max_epochs=max_epochs,
            accelerator="auto",  # Uses GPU if available, otherwise CPU
            devices=1 if torch.cuda.is_available() else None,
            logger=logger,
            callbacks=[checkpoint_callback, early_stop_callback],
            gradient_clip_val=1.0,
            log_every_n_steps=50
        )

        # Train the model
        trainer.fit(model, train_loader, val_loader)

        # Test the model
        test_results = trainer.test(model, test_loader, ckpt_path="best")

        # Plot training history
        plot_training_history("logs", model_name, save_path=f"plots/{model_name}_loss.png")

        # Plot validation metrics
        plot_validation_metrics("logs", model_name, save_dir="plots")

        # Generate predictions for a few examples
        sample_texts = test_df['preprocessed_post'].iloc[:5].tolist()
        sample_claims = test_df['preprocessed_claim'].iloc[:5].tolist()

        # Load the best model for prediction
        best_model_path = checkpoint_callback.best_model_path
        best_model = None

        if best_model_path:
            best_model = ClaimNormalizationModel.load_from_checkpoint(
                best_model_path,
                model_name=model_config['pretrained_model'],
                tokenizer=tokenizer,
            )

            predicted_claims = best_model.generate_normalized_claims(sample_texts)

            print("\nSample predictions:")
            for i, (text, true_claim, pred_claim) in enumerate(zip(sample_texts, sample_claims, predicted_claims)):
                print(f"Example {i+1}:")
                print(f"  Post: {text[:100]}...")
                print(f"  True claim: {true_claim}")
                print(f"  Predicted: {pred_claim}")
                print()

            # Calculate metrics for the sample predictions
            metrics = best_model.calculate_metrics(sample_claims, predicted_claims)
            print("\nEvaluation metrics on sample predictions:")
            print(f"  ROUGE-L: {metrics['rouge_l']:.4f}")
            print(f"  BLEU-4: {metrics['bleu4']:.4f}")
            print(f"  BERTScore: {metrics['bertscore']:.4f}")
            print()

            # Save the model
            model_save_path = f"final_models/{model_name}"
            best_model.model.save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)
            print(f"Model saved to {model_save_path}")

            # Save results
            results[model_name] = {
                "test_results": {
                    "rouge_l": test_results[0].get("val_rouge_l", 0.0),
                    "bleu4": test_results[0].get("val_bleu4", 0.0),
                    "bertscore": test_results[0].get("val_bertscore", 0.0)
                },
                "best_model_path": model_save_path,
                "sample_metrics": metrics,
                "plots": {
                    "loss": f"plots/{model_name}_loss.png",
                    "metrics": f"plots/{model_name}_metrics.png"
                }
            }
        else:
            print(f"Warning: No best model path found for {model_name}")

    return results


In [11]:
def plot_metrics_comparison(results, save_path="model_comparison.png"):
    """Create a bar chart comparing model performance"""
    model_names = list(results.keys())
    metrics = ["rouge_l", "bleu4", "bertscore"]
    metric_labels = ["ROUGE-L", "BLEU-4", "BERTScore"]

    metric_values = {
        metric: [results[model]["test_results"][metric] for model in model_names]
        for metric in metrics
    }

    # Set up the figure and axes
    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(model_names))
    width = 0.25

    # Plot bars for each metric
    for i, (metric, label) in enumerate(zip(metrics, metric_labels)):
        ax.bar(x + (i-1)*width, metric_values[metric], width, label=label)

    # Customize the plot
    ax.set_ylabel('Score')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names)
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Ensure all values are shown properly
    plt.tight_layout()

    # Save the figure
    plt.savefig(save_path)
    print(f"Metrics comparison saved to {save_path}")
    plt.close()



In [12]:
!pip install bert-score



Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:

# Set random seed for reproducibility
set_seed(42)

# Create directories
os.makedirs("checkpoints", exist_ok=True)
os.makedirs("final_models", exist_ok=True)
os.makedirs("logs", exist_ok=True)
os.makedirs("plots", exist_ok=True)

# Load and preprocess data
df = load_and_preprocess_clan_data('CLAN_data.csv')

if df is None:
        print("Failed to load dataset. Exiting.")
        exit(1)

# Split data into train, validation, and test sets
train_df, val_df, test_df = split_data(df)
test_df.to_csv("test.csv", index=False)
print("Test dataset saved to test.csv")

# Define model configurations
models_config = {
        "bart": {
            "pretrained_model": "facebook/bart-base",
            "learning_rate": 3e-5
        }
        # "t5": {
        #     "pretrained_model": "t5-small",
        #     "learning_rate": 3e-5
        # }
    }

# Train models
results = train_models(
        train_df,
        val_df,
        test_df,
        models_config,
        batch_size=8,
        max_epochs=3
    )

# Save results to JSON
with open("evaluation_results.json", "w") as f:
        json.dump(results, f, indent=2)
print("\nResults saved to evaluation_results.json")

# Plot comparison of model performance
plot_metrics_comparison(results, save_path="plots/model_comparison.png")

# Plot a comprehensive comparison of all models
plot_all_metrics(results, "logs", save_dir="plots")

# Find the best model based on ROUGE-L score
best_model = max(results.items(), key=lambda x: x[1]["test_results"]["rouge_l"])[0]
best_model_path = results[best_model]["best_model_path"]
print(f"\nBest performing model: {best_model} (path: {best_model_path})")
print(f"\nAll plots are saved in the 'plots' directory")

Loading dataset from CLAN_data.csv...
Dataset loaded successfully with 2811 rows and 3 columns.

Sample of raw data:
   PID                                  Social Media Post  \
0    1  President \n @realDonaldTrump\n : "Biden's pla...   
1    2  IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...   

                                    Normalized Claim  
0  Biden’s energy plan would get rid of seniors’ ...  
1  If someone with the new coronavirus sneezes, i...  

Preprocessing social media posts...
Using 'Social Media Post' as post content and 'Normalized Claim' as claim content

Sample of preprocessed data:
                                   preprocessed_post  \
0  president realdonaldtrump biden s plan would m...   
1  important announcement coronavirus last evenin...   

                                  preprocessed_claim  
0  biden s energy plan would get rid of seniors a...  
1  if someone with the new coronavirus sneezes, i...  

Data split complete:
Training set: 1967 samples (7

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params | Mode
--------------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M  | eval
--------------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)
0         Modules in train mode
182       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/checkpoints/bart/epoch=2-val_rouge_l=0.3825.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/checkpoints/bart/epoch=2-val_rouge_l=0.3825.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Training history plot saved to plots/bart_loss.png
Validation metrics plot saved to plots/bart_metrics.png

Sample predictions:
Example 1:
  Post: i declare covid 19 over world health organisation boss says coronavirus is no longer a global emerge...
  True claim: the world health organisation has declared the covid 19 pandemic over.
  Predicted: the coronavirus is no longer a global emergency.

Example 2:
  Post: ajike media update, donald trump love for biafra...
  True claim: false donald trump did not call kenya a very corrupt country
  Predicted: ajike media update, donald trump love for biafra

Example 3:
  Post: nobody making under 400,000 will have their taxes raised. period, says joebiden ....
  True claim: biden s tax rate on a family making 75,000 dollars would go from 12 to 25 .
  Predicted: under 400,000 people will have their taxes raised. period, says joe biden.

Example 4:
  Post: winner of 1.28 billion lottery gets 433.7 million after tax. congratulations to the irs on



Model saved to final_models/bart

Results saved to evaluation_results.json
Metrics comparison saved to plots/model_comparison.png
Training history plot saved to plots/bart_loss.png
Metrics comparison saved to plots/model_comparison.png
Validation metrics plot saved to plots/bart_metrics.png

Best performing model: bart (path: final_models/bart)

All plots are saved in the 'plots' directory


In [16]:
best_model_path = 'final_models/bart'

test_metrics = inference_on_test_file(
        "test.csv",
        best_model_path,
        "test_predictions.csv"
    )


ModuleNotFoundError: No module named 'preprocessing'