In [1]:
# Install required libraries
!pip install transformers datasets evaluate nltk rouge_score pytorch-lightning>=2.0.0 torch>=2.0.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0m

In [2]:
!pip install contractions

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# Transformers imports
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BartForConditionalGeneration,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

# Use PyTorch's AdamW instead of transformers' version
from torch.optim import AdamW

# Replace datasets.load_metric with evaluate
import evaluate

import nltk
import os
import random
import re
import contractions
from tqdm.auto import tqdm

# Download NLTK data for tokenization
nltk.download('punkt')

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator (default 42).
    """
    # Same seed for each library-level generator, in a fixed order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Explicitly seed every CUDA device when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:


# Create a simple abbreviation expansion function
# Lower-case abbreviation -> expansion table used by expand_abbreviations.
_ABBREVIATION_MAP = {
    "gov.": "governor",
    "feb.": "february",
    "jan.": "january",
    "mar.": "march",
    "apr.": "april",
    "jun.": "june",
    "jul.": "july",
    "aug.": "august",
    "sep.": "september",
    "oct.": "october",
    "nov.": "november",
    "dec.": "december",
    "vp": "vice president",
    "eta": "estimated time of arrival",
    "aka": "also known as",
    "vs.": "versus",
    "etc.": "etcetera",
    "dr.": "doctor",
    "mr.": "mister",
    "mrs.": "missus",
    "ms.": "miss",
    "prof.": "professor",
    "rep.": "representative",
    "sen.": "senator",
    "co.": "company",
    "corp.": "corporation",
    "inc.": "incorporated",
}

# Single alternation over every abbreviation (longest first), compiled once.
# Look-arounds are used instead of `\b`: most keys end in ".", and `\b` after a
# "." only matches when a word character follows, so "gov. smith" was never
# expanded while "co.uk" was. `(?<!\w)` / `(?!\w)` require that the abbreviation
# is not glued to a word character on either side.
_ABBREVIATION_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(abbr) for abbr in sorted(_ABBREVIATION_MAP, key=len, reverse=True))
    + r')(?!\w)'
)


def expand_abbreviations(text):
    """Lower-case ``text`` and replace known abbreviations with their full form.

    Args:
        text: Input string.

    Returns:
        The lower-cased string with every stand-alone abbreviation from the
        table replaced by its expansion. For dotted abbreviations the dot is
        part of the match and is therefore removed with it.
    """
    # Convert text to lowercase for easier matching
    text_lower = text.lower()

    # Replace all abbreviations in one pass
    return _ABBREVIATION_PATTERN.sub(
        lambda match: _ABBREVIATION_MAP[match.group(0)], text_lower
    )

def preprocess_text(text):
    """Normalize a raw post or claim string for model input.

    Steps, in order: lower-case, strip URLs, expand contractions
    (``contractions.fix``), expand abbreviations (``expand_abbreviations``),
    replace characters other than word characters, whitespace and ``. , ! ?``
    with a space, and collapse runs of whitespace.

    Args:
        text: Raw text. Any non-string value (for example a NaN from pandas)
            yields an empty string.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs before any expansion step. Expanding first can insert spaces
    # inside a URL (e.g. ".../vp" -> ".../vice president"), after which the
    # `\S+` match stops early and leaves part of the URL in the text.
    # `http\S+` already covers https links.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Expand contractions
    text = contractions.fix(text)

    # Expand abbreviations
    text = expand_abbreviations(text)

    # Remove special characters except punctuation needed for readability
    text = re.sub(r'[^\w\s.,!?]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def load_and_preprocess_clan_data(file_path='CLAN_data.csv'):
    """Load the CSV dataset and add preprocessed post/claim columns.

    The post column is the first column whose name contains "post" or "text";
    the claim column is the first whose name contains "claim" or "norm"
    (both case-insensitive).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with two extra columns, ``preprocessed_post`` and
        ``preprocessed_claim``, or ``None`` if loading or preprocessing failed
        (the error is printed).
    """
    try:
        # Load the dataset
        print(f"Loading dataset from {file_path}...")
        df = pd.read_csv(file_path)
        print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")

        # Display sample of the raw data
        print("\nSample of raw data:")
        print(df.head(2))

        # Apply preprocessing to social media posts
        print("\nPreprocessing social media posts...")
        # Identify column names based on common patterns
        post_col = next((col for col in df.columns if 'post' in col.lower() or 'text' in col.lower()), None)
        claim_col = next((col for col in df.columns if 'claim' in col.lower() or 'norm' in col.lower()), None)

        if not post_col:
            raise ValueError("Could not identify social media post column in the dataset.")
        if not claim_col:
            raise ValueError("Could not identify normalized claim column in the dataset.")
        if post_col == claim_col:
            # Using one column as both input and target would train an identity mapping.
            raise ValueError(
                f"Column '{post_col}' matched both the post and the claim patterns; "
                "cannot tell input and target apart."
            )

        print(f"Using '{post_col}' as post content and '{claim_col}' as claim content")

        # Count missing values on the RAW columns: preprocess_text turns NaN and
        # other non-strings into "", so a check on the preprocessed columns can
        # never find anything.
        missing_posts = df[post_col].isna().sum()
        missing_claims = df[claim_col].isna().sum()
        if missing_posts > 0 or missing_claims > 0:
            print(f"\nWarning: Found {missing_posts} missing posts and {missing_claims} missing claims.")

        # Preprocess posts and claims (missing values become empty strings)
        df['preprocessed_post'] = df[post_col].apply(preprocess_text)
        df['preprocessed_claim'] = df[claim_col].apply(preprocess_text)

        # Display sample of preprocessed data
        print("\nSample of preprocessed data:")
        print(df[['preprocessed_post', 'preprocessed_claim']].head(2))

        return df

    except Exception as e:
        # Best-effort loader: callers check for None instead of catching.
        print(f"Error during data loading or preprocessing: {str(e)}")
        return None

def split_data(df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=42):
    """Shuffle ``df`` and cut it into train / validation / test frames.

    Args:
        df: DataFrame to split.
        train_ratio: Fraction of rows for the training set.
        val_ratio: Fraction of rows for the validation set.
        test_ratio: Fraction of rows for the test set.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; each is a copy with a fresh
        0-based index.

    Raises:
        ValueError: If the three ratios do not sum to 1.0.
    """
    ratio_total = train_ratio + val_ratio + test_ratio
    if not np.isclose(ratio_total, 1.0):
        raise ValueError("The sum of ratios must be 1.0")

    # Full-frame sample == deterministic shuffle
    shuffled = df.sample(frac=1, random_state=random_state)
    row_count = len(shuffled)

    # Row positions where train ends and where validation ends
    first_cut = int(row_count * train_ratio)
    second_cut = int(row_count * (train_ratio + val_ratio))

    partitions = (
        slice(None, first_cut),
        slice(first_cut, second_cut),
        slice(second_cut, None),
    )
    train_df, val_df, test_df = (
        shuffled.iloc[part].copy().reset_index(drop=True) for part in partitions
    )

    print(f"\nData split complete:")
    print(f"Training set: {len(train_df)} samples ({train_ratio*100:.1f}%)")
    print(f"Validation set: {len(val_df)} samples ({val_ratio*100:.1f}%)")
    print(f"Test set: {len(test_df)} samples ({test_ratio*100:.1f}%)")

    return train_df, val_df, test_df



In [4]:
# Create a custom dataset class
class ClaimNormalizationDataset(Dataset):
    """Dataset of (post, claim) pairs tokenized on access for seq2seq training.

    Args:
        texts: Indexable collection of source posts.
        claims: Indexable collection of target claims, aligned with ``texts``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_input_length: Pad/truncate length for the source text.
        max_target_length: Pad/truncate length for the target claim.
    """

    def __init__(self, texts, claims, tokenizer, max_input_length=512, max_target_length=128):
        self.texts = texts
        self.claims = claims
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of examples, taken from the source texts."""
        return len(self.texts)

    def _encode(self, value, limit):
        """Tokenize one string, padded and truncated to ``limit`` tokens, as tensors."""
        return self.tokenizer(
            value,
            max_length=limit,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        post, claim = str(self.texts[idx]), str(self.claims[idx])

        source = self._encode(post, self.max_input_length)
        target = self._encode(claim, self.max_target_length)

        # Padding positions in the target become -100 for the loss calculation
        labels = target["input_ids"].squeeze()
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": source["input_ids"].squeeze(),
            "attention_mask": source["attention_mask"].squeeze(),
            "labels": labels
        }

# Define PyTorch Lightning module for seq2seq fine-tuning
class ClaimNormalizationModel(pl.LightningModule):
    """Lightning module that fine-tunes a BART or T5 model to turn posts into claims.

    Args:
        model_name: Hugging Face checkpoint name. It must contain "bart" or
            "t5" (case-insensitive); that substring selects the model class.
        tokenizer: Tokenizer used to decode generations and to encode inputs
            in ``generate_normalized_claims``. It is excluded from the saved
            hyperparameters.
        learning_rate: Learning rate passed to AdamW.
        weight_decay: Weight decay for every parameter except biases and
            layer-norm parameters.
        warmup_steps: Requested number of linear warm-up steps.
        total_steps: Total number of scheduler steps; 1000 is used when None.

    Raises:
        ValueError: If ``model_name`` contains neither "bart" nor "t5".
    """

    # Backbone passed to the BERTScore metric.
    BERTSCORE_MODEL_TYPE = "microsoft/deberta-xlarge-mnli"

    def __init__(
        self,
        model_name,
        tokenizer,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=500,
        total_steps=None
    ):
        super().__init__()
        self.save_hyperparameters(ignore=['tokenizer'])

        # Load model and tokenizer
        if "bart" in model_name.lower():
            self.model = BartForConditionalGeneration.from_pretrained(model_name)
        elif "t5" in model_name.lower():
            self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        else:
            raise ValueError(f"Unsupported model type: {model_name}")

        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps

        # Define metrics
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")
        self.bertscore = evaluate.load("bertscore")

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped seq2seq model and return its output object."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        outputs = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )

        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def _shared_eval_step(self, batch, stage):
        """Compute loss and generation metrics for one batch and log them under ``stage``.

        Args:
            batch: Dict with ``input_ids``, ``attention_mask`` and ``labels``.
            stage: Metric-name prefix, ``"val"`` or ``"test"``.

        Returns:
            Dict with ``"<stage>_loss"`` plus ``rouge1``, ``rouge2``,
            ``rouge_l``, ``bleu4`` and ``bertscore`` for this batch.
        """
        outputs = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )

        loss = outputs.loss
        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)

        # Generate predictions for the text-overlap metrics
        generated_ids = self.model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

        pred_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # -100 marks ignored label positions; restore the pad id so they can be decoded.
        label_texts = self.tokenizer.batch_decode(
            batch["labels"].masked_fill(batch["labels"] == -100, self.tokenizer.pad_token_id),
            skip_special_tokens=True
        )

        valid_preds, valid_refs = self._filter_pairs(label_texts, pred_texts)

        if valid_preds:
            rouge_output = self.rouge.compute(
                predictions=valid_preds,
                references=valid_refs,
                use_stemmer=True
            )
            scores = {
                "rouge1": rouge_output["rouge1"],
                "rouge2": rouge_output["rouge2"],
                "rouge_l": rouge_output["rougeL"],
                "bleu4": self.calculate_bleu4(valid_refs, valid_preds),
                "bertscore": self.calculate_bertscore(valid_refs, valid_preds),
            }
        else:
            # No usable prediction/reference pair in this batch
            scores = {"rouge1": 0.0, "rouge2": 0.0, "rouge_l": 0.0, "bleu4": 0.0, "bertscore": 0.0}

        for metric_name, value in scores.items():
            # Validation keeps its existing names (e.g. "val_bleu4"). For the
            # test stage BLEU is logged as "test_bleu_4", the key train_models()
            # reads from the trainer.test() results.
            if stage == "test" and metric_name == "bleu4":
                log_name = "test_bleu_4"
            else:
                log_name = f"{stage}_{metric_name}"
            self.log(log_name, value, prog_bar=True, logger=True)

        return {f"{stage}_loss": loss, **scores}

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch; metrics are logged with a ``val_`` prefix."""
        return self._shared_eval_step(batch, "val")

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch; metrics are logged with a ``test_`` prefix.

        Previously this delegated to ``validation_step`` and therefore logged
        ``val_*`` names during testing, so no ``test_*`` metric ever existed.
        """
        return self._shared_eval_step(batch, "test")

    @staticmethod
    def _filter_pairs(references, predictions):
        """Drop pairs where either side is blank; return ``(predictions, references)`` lists."""
        kept = [(pred, ref) for pred, ref in zip(predictions, references) if pred.strip() and ref.strip()]
        return [pred for pred, _ in kept], [ref for _, ref in kept]

    def calculate_rouge_l(self, references, predictions):
        """Calculate ROUGE-L score for a list of references and predictions.

        Returns 0.0 when no non-empty pair remains after filtering.
        """
        valid_preds, valid_refs = self._filter_pairs(references, predictions)
        if not valid_preds:
            return 0.0

        rouge_output = self.rouge.compute(
            predictions=valid_preds,
            references=valid_refs,
            use_stemmer=True
        )
        return rouge_output["rougeL"]

    def calculate_bleu4(self, references, predictions):
        """Calculate BLEU-4 score for a list of references and predictions.

        Returns 0.0 when no non-empty pair remains after filtering.
        """
        valid_preds, valid_refs = self._filter_pairs(references, predictions)
        if not valid_preds:
            return 0.0

        bleu_output = self.bleu.compute(
            predictions=valid_preds,
            references=[[ref] for ref in valid_refs],  # BLEU expects list of references for each prediction
        )
        return bleu_output["bleu"]

    def calculate_bertscore(self, references, predictions):
        """Calculate the mean BERTScore F1 for a list of references and predictions.

        Returns 0.0 when no non-empty pair remains or when the metric raises
        (the error is printed rather than propagated).
        """
        valid_preds, valid_refs = self._filter_pairs(references, predictions)
        if not valid_preds:
            return 0.0

        try:
            bertscore_output = self.bertscore.compute(
                predictions=valid_preds,
                references=valid_refs,
                lang="en",
                model_type=self.BERTSCORE_MODEL_TYPE
            )
            f1_scores = bertscore_output["f1"]
            return sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
        except Exception as e:
            print(f"BERTScore calculation error: {e}")
            return 0.0

    def configure_optimizers(self):
        """Build AdamW with decay/no-decay groups and a per-step linear warm-up schedule."""

        def exempt_from_decay(param_name):
            # The original filter "LayerNorm.weight" is a BERT-style name. BART
            # and T5 call these modules e.g. "self_attn_layer_norm",
            # "layernorm_embedding" or "layer_norm", so match case-insensitively.
            lowered = param_name.lower()
            return "bias" in param_name or "layer_norm" in lowered or "layernorm" in lowered

        decay_params, no_decay_params = [], []
        for name, param in self.model.named_parameters():
            (no_decay_params if exempt_from_decay(name) else decay_params).append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate)

        # Create scheduler
        if self.total_steps is None:
            self.total_steps = 1000  # Default value

        # A warm-up at least as long as the whole run means the learning rate
        # never reaches its peak and never decays; fall back to 10% of the run.
        warmup_steps = self.warmup_steps
        if warmup_steps >= self.total_steps:
            warmup_steps = max(1, self.total_steps // 10)
            print(
                f"Warning: warmup_steps={self.warmup_steps} is not smaller than "
                f"total_steps={self.total_steps}; using {warmup_steps} warm-up steps instead."
            )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=self.total_steps
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
            },
        }

    def generate_normalized_claims(self, texts, max_length=128, max_input_length=512):
        """Generate a normalized claim for each input text with beam search.

        Puts the wrapped model in eval mode before generating.

        Args:
            texts: A string or a sequence of strings.
            max_length: Maximum length of each generated sequence.
            max_input_length: Inputs are truncated to this many tokens.

        Returns:
            List of decoded claims, one per input text (empty list for no input).
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)
        if not texts:
            # Nothing to encode; the tokenizer cannot batch an empty list.
            return []

        self.model.eval()
        tokenized_inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_input_length,
            return_tensors="pt"
        ).to(self.device)

        generated_ids = self.model.generate(
            input_ids=tokenized_inputs["input_ids"],
            attention_mask=tokenized_inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Function to train and evaluate both models
def train_models(train_df, val_df, test_df, models_config, batch_size, max_epochs):
    """Fine-tune, test and export every model listed in ``models_config``.

    Args:
        train_df, val_df, test_df: DataFrames containing the columns
            ``preprocessed_post`` and ``preprocessed_claim``.
        models_config: Mapping ``name -> config`` where each config has a
            ``pretrained_model`` entry and optionally ``learning_rate`` and
            ``warmup_steps``.
        batch_size: Batch size for all three data loaders.
        max_epochs: Maximum number of training epochs per model.

    Returns:
        Dict ``name -> {"test_results": {...}, "best_model_path": ...}`` for
        every model for which a best checkpoint was saved.
    """

    def logged_value(metrics, *candidate_keys):
        """Return the first of ``candidate_keys`` found in ``metrics`` as a float, else 0.0."""
        for key in candidate_keys:
            if key in metrics:
                return float(metrics[key])
        return 0.0

    results = {}

    for model_name, model_config in models_config.items():
        print(f"\n{'='*50}")
        print(f"Training {model_name} model: {model_config['pretrained_model']}")
        print(f"{'='*50}")

        # Initialize tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_config['pretrained_model'])

        # Check if the tokenizer has padding token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        def make_dataset(frame):
            """Wrap one split's preprocessed columns in a ClaimNormalizationDataset."""
            return ClaimNormalizationDataset(
                frame['preprocessed_post'].tolist(),
                frame['preprocessed_claim'].tolist(),
                tokenizer
            )

        # Create datasets
        train_dataset = make_dataset(train_df)
        val_dataset = make_dataset(val_df)
        test_dataset = make_dataset(test_df)

        # Create data loaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            num_workers=2
        )

        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            num_workers=2
        )

        # Calculate total steps for learning rate scheduler
        total_steps = len(train_loader) * max_epochs

        # Initialize model. The warm-up defaults to 10% of the run: the model's
        # own default of 500 steps can exceed the total number of steps on a
        # small dataset, in which case the learning rate never reaches its peak.
        model = ClaimNormalizationModel(
            model_config['pretrained_model'],
            tokenizer,
            learning_rate=model_config.get('learning_rate',3e-5),
            warmup_steps=model_config.get('warmup_steps', max(1, total_steps // 10)),
            total_steps=total_steps,
        )

        # Define callbacks - rouge_l is the main monitoring metric
        checkpoint_callback = ModelCheckpoint(
            dirpath=f"checkpoints/{model_name}",
            filename="{epoch}-{val_rouge_l:.4f}",
            monitor="val_rouge_l",
            mode="max",
            save_top_k=1
        )

        early_stop_callback = EarlyStopping(
            monitor="val_loss",
            patience=3,
            mode="min"
        )

        # Define logger
        logger = TensorBoardLogger("logs", name=model_name)

        # Initialize trainer
        trainer = pl.Trainer(
            max_epochs=max_epochs,
            accelerator="auto",  # Uses GPU if available, otherwise CPU
            # Always one device: `devices=None` is rejected on CPU by Lightning 2.x.
            devices=1,
            logger=logger,
            callbacks=[checkpoint_callback, early_stop_callback],
            gradient_clip_val=1.0,
            log_every_n_steps=50
        )

        # Train the model
        trainer.fit(model, train_loader, val_loader)

        # Test the model
        test_results = trainer.test(model, test_loader, ckpt_path="best")
        test_metrics = test_results[0] if test_results else {}

        # Generate predictions for a few examples
        sample_texts = test_df['preprocessed_post'].iloc[:5].tolist()
        sample_claims = test_df['preprocessed_claim'].iloc[:5].tolist()

        # Load the best model for prediction
        best_model = None
        best_model_path = checkpoint_callback.best_model_path
        if best_model_path:
            best_model = ClaimNormalizationModel.load_from_checkpoint(
                best_model_path,
                model_name=model_config['pretrained_model'],
                tokenizer=tokenizer,
            )

            predicted_claims = best_model.generate_normalized_claims(sample_texts)

            print("\nSample predictions:")
            for i, (text, true_claim, pred_claim) in enumerate(zip(sample_texts, sample_claims, predicted_claims)):
                print(f"Example {i+1}:")
                print(f"  Post: {text[:100]}...")
                print(f"  True claim: {true_claim}")
                print(f"  Predicted: {pred_claim}")
                print()

            # Calculate and display all metrics for the sample predictions
            print("\nEvaluation metrics on sample predictions:")
            rouge_scores = best_model.calculate_rouge_l(sample_claims, predicted_claims)
            bleu_scores = best_model.calculate_bleu4(sample_claims, predicted_claims)
            bert_scores = best_model.calculate_bertscore(sample_claims, predicted_claims)

            print(f"  ROUGE-L: {rouge_scores:.4f}")
            print(f"  BLEU-4: {bleu_scores:.4f}")
            print(f"  BERTScore: {bert_scores:.4f}")
            print()

            # Save the model
            best_model.model.save_pretrained(f"final_models/{model_name}")
            tokenizer.save_pretrained(f"final_models/{model_name}")
            print(f"Model saved to final_models/{model_name}")

            # Save results. Several key spellings are accepted because the
            # metric names logged during testing depend on how the model's
            # test_step logs them ("test_*" or, when it reuses the validation
            # step, "val_*"); reading a single fixed key silently yields 0.0.
            results[model_name] = {
                "test_results": {
                    "rouge_l": logged_value(test_metrics, "test_rouge_l", "val_rouge_l"),
                    "bleu_4": logged_value(test_metrics, "test_bleu_4", "test_bleu4", "val_bleu4"),
                    "bertscore": logged_value(test_metrics, "test_bertscore", "val_bertscore")
                },
                "best_model_path": f"final_models/{model_name}"
            }

        # Drop references to this model before the next one is loaded so its
        # GPU memory can be reclaimed.
        del model, best_model, trainer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return results

In [None]:
# Main execution
if __name__ == "__main__":
    # Create directories for checkpoints and models
    os.makedirs("checkpoints", exist_ok=True)
    os.makedirs("final_models", exist_ok=True)
    os.makedirs("logs", exist_ok=True)

    # Install required libraries if in Colab
    try:
        import google.colab
        print("Running in Google Colab. Installing required packages...")
        !pip install contractions nltk sacrebleu bert-score transformers

        # Download required NLTK data for tokenization used in metrics
        import nltk
        nltk.download('punkt')
    except:
        pass

    # Load and preprocess data
    df = load_and_preprocess_clan_data('CLAN_data.csv')

    if df is not None:
        # Split the data
        train_df, val_df, test_df = split_data(df)

        # Define models to train
        models_config = {
            "bart_base": {
                "pretrained_model": "facebook/bart-base",
                "learning_rate": 3e-5
            },
            "t5_base": {
                "pretrained_model": "t5-base",
                "learning_rate": 3e-5
            }
        }

        # Train the models
        results = train_models(
            train_df,
            val_df,
            test_df,
            models_config,
            batch_size=32,
            max_epochs=5
        )

        # Print final results with the new metrics
        print("\nFinal Results:")
        for model_name, model_results in results.items():
            print(f"\n{model_name} Model Evaluation:")
            print(f"  ROUGE-L: {model_results['test_results']['rouge_l']:.4f}")
            print(f"  BLEU-4: {model_results['test_results']['bleu_4']:.4f}")
            print(f"  BERTScore: {model_results['test_results']['bertscore']:.4f}")
            print(f"  Model saved at: {model_results['best_model_path']}")

        # Save results to JSON for future reference
        import json
        with open("evaluation_results.json", "w") as f:
            json.dump(results, f, indent=2)
        print("\nEvaluation results saved to evaluation_results.json")

        # Optional: generate a simple visualization of results
        try:
            import matplotlib.pyplot as plt
            import numpy as np

            model_names = list(results.keys())
            metrics = ["rouge_l", "bleu_4", "bertscore"]
            metric_values = {
                metric: [results[model]["test_results"][metric] for model in model_names]
                for metric in metrics
            }

            x = np.arange(len(model_names))
            width = 0.25

            fig, ax = plt.subplots(figsize=(10, 6))

            # Plot bars for each metric
            bars1 = ax.bar(x - width, metric_values["rouge_l"], width, label="ROUGE-L")
            bars2 = ax.bar(x, metric_values["bleu_4"], width, label="BLEU-4")
            bars3 = ax.bar(x + width, metric_values["bertscore"], width, label="BERTScore")

            ax.set_ylabel("Score")
            ax.set_title("Evaluation Metrics by Model")
            ax.set_xticks(x)
            ax.set_xticklabels(model_names)
            ax.legend()

            plt.tight_layout()
            plt.savefig("evaluation_metrics.png")
            print("Evaluation metrics visualization saved to evaluation_metrics.png")
        except Exception as e:
            print(f"Could not generate visualization: {str(e)}")

Running in Google Colab. Installing required packages...
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portal

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Sample of preprocessed data:
                                   preprocessed_post  \
0  president realdonaldtrump biden s plan would m...   
1  important announcement coronavirus last evenin...   

                                  preprocessed_claim  
0  biden s energy plan would get rid of seniors a...  
1  if someone with the new coronavirus sneezes, i...  

Data split complete:
Training set: 1967 samples (70.0%)
Validation set: 422 samples (15.0%)
Test set: 422 samples (15.0%)

Training bart_base model: facebook/bart-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                         | Params | Mode
--------------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M  | eval
--------------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)
0         Modules in train mode
182       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]