# Rouge Metrics Comparison Table Generator

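This notebook generates a comparison table of Rouge-1, Rouge-2, and Rouge-L F1 scores for three models (FlanT5, Mistral, DistilBART) before and after fine-tuning on the XSUM dataset. FlanT5 scores are read from the precomputed fields in its JSON file when they are present; scores for the other models are recomputed in this notebook from the stored summaries using lowercased, whitespace-split tokens, so they may differ from scores produced by standard ROUGE packages.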

## Instructions:
1. Upload the three JSON files from the `Newset/XSUM/` directory to your Colab environment
2. Run all cells below
3. The table will be generated automatically

## Expected Files:
- `before_after_all_metrics_200XSUM_flant5.json` (FlanT5 data)
- `rescore_mistral_XSUM_before_after_JSON_file.json` (Mistral data)
- `results_DISTILBART_XSUM_Dataset_200_rescore.json` (DistilBART data)

In [None]:
# Install the third-party packages this notebook imports (pandas, numpy).
# The leading "!" hands the line to the system shell, so this cell only runs
# inside an IPython/Jupyter/Colab kernel, not as a plain Python script.
!pip install pandas numpy --quiet

# NOTE: printed unconditionally; it does not check whether pip succeeded.
print("✅ Required packages installed successfully!")

In [None]:
import json
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Any
import os
from collections import Counter

print("📚 Libraries imported successfully!")

In [None]:
def compute_rouge_n(reference: str, candidate: str, n: int = 1) -> float:
    """
    Score a candidate text against a reference with ROUGE-N (F1 variant).

    Both texts are lowercased and split on whitespace. Matches are clipped:
    an n-gram counts at most as often as it occurs in both texts.

    Args:
        reference: Reference/target text
        candidate: Generated/candidate text
        n: N-gram size (1 for ROUGE-1, 2 for ROUGE-2)

    Returns:
        ROUGE-N F1 score; 0.0 when either text is empty, has fewer than
        n tokens, or shares no n-gram with the other
    """
    if not (reference and candidate):
        return 0.0

    def ngram_counts(text):
        # Frequency table of the text's n-grams, or None if it is too short
        words = text.lower().split()
        if len(words) < n:
            return None
        return Counter(tuple(words[k:k + n]) for k in range(len(words) - n + 1))

    ref_counts = ngram_counts(reference)
    cand_counts = ngram_counts(candidate)
    if ref_counts is None or cand_counts is None:
        return 0.0

    # Clipped overlap: per n-gram, the smaller of the two occurrence counts
    matched = sum(min(count, cand_counts[gram]) for gram, count in ref_counts.items())
    if matched == 0:
        return 0.0

    precision = matched / sum(cand_counts.values())
    recall = matched / sum(ref_counts.values())
    return 2 * precision * recall / (precision + recall)

def compute_rouge_l(reference: str, candidate: str) -> float:
    """
    Compute ROUGE-L score using Longest Common Subsequence.

    Both texts are lowercased and split on whitespace before comparison.

    Args:
        reference: Reference/target text
        candidate: Generated/candidate text

    Returns:
        ROUGE-L F1 score; 0.0 when either text is empty (or only
        whitespace) or the two texts share no token
    """
    if not reference or not candidate:
        return 0.0

    ref_tokens = reference.lower().split()
    cand_tokens = candidate.lower().split()

    if not ref_tokens or not cand_tokens:
        return 0.0

    def lcs_length(seq1, seq2):
        # LCS length is symmetric, so keep the DP row over the shorter sequence
        if len(seq2) > len(seq1):
            seq1, seq2 = seq2, seq1

        # Each DP row depends only on the row above it, so one rolling row
        # replaces the full (m + 1) x (n + 1) table
        prev_row = [0] * (len(seq2) + 1)
        for item1 in seq1:
            curr_row = [0]
            for j, item2 in enumerate(seq2, start=1):
                if item1 == item2:
                    curr_row.append(prev_row[j - 1] + 1)
                else:
                    curr_row.append(max(prev_row[j], curr_row[j - 1]))
            prev_row = curr_row

        return prev_row[-1]

    lcs_len = lcs_length(ref_tokens, cand_tokens)

    if lcs_len == 0:
        return 0.0

    # lcs_len > 0 here, so precision and recall are both strictly positive
    precision = lcs_len / len(cand_tokens)
    recall = lcs_len / len(ref_tokens)

    return 2 * precision * recall / (precision + recall)

print("🔧 Rouge computation functions defined!")

In [None]:
def extract_flant5_rouge_metrics(data: List[Dict]) -> Dict[str, float]:
    """
    Extract Rouge metrics from FlanT5 data which already contains computed Rouge scores.

    Args:
        data: Records carrying the per-example fields ROUGE1_before,
            ROUGE1_after, ROUGE2_before, ROUGE2_after, ROUGEL_before and
            ROUGEL_after. A field missing from a record counts as 0.0.

    Returns:
        Mean of each field over all records, keyed 'rouge1_before',
        'rouge1_after', 'rouge2_before', 'rouge2_after', 'rougel_before',
        'rougel_after'. Every value is 0.0 when data is empty.
    """
    # (output key, record field) pairs, in the order the result is reported
    fields = (
        ('rouge1_before', 'ROUGE1_before'),
        ('rouge1_after', 'ROUGE1_after'),
        ('rouge2_before', 'ROUGE2_before'),
        ('rouge2_after', 'ROUGE2_after'),
        ('rougel_before', 'ROUGEL_before'),
        ('rougel_after', 'ROUGEL_after'),
    )

    scores = {out_key: [] for out_key, _ in fields}
    for record in data:
        for out_key, field in fields:
            scores[out_key].append(record.get(field, 0.0))

    # np.mean([]) yields nan plus a RuntimeWarning; report 0.0 for empty
    # input instead, matching compute_rouge_metrics_from_text
    return {
        out_key: (np.mean(values) if values else 0.0)
        for out_key, values in scores.items()
    }

def compute_rouge_metrics_from_text(data: List[Dict]) -> Dict[str, float]:
    """
    Score stored summaries against their references and average the results.

    Used for models whose records carry no precomputed Rouge fields. The
    reference is read from 'reference_summary' (falling back to
    'reference'); the summaries from 'gen_before' / 'gen_after' (falling
    back to 'generatedsummary_before' / 'generatedsummary_after'). A record
    contributes to a stage only when both its reference and that stage's
    summary are non-empty.

    Returns:
        Mean Rouge-1, Rouge-2 and Rouge-L per stage, keyed 'rouge1_before',
        'rouge1_after', 'rouge2_before', 'rouge2_after', 'rougel_before',
        'rougel_after'; 0.0 for any metric with no scored records.
    """
    collected = {
        'rouge1_before': [],
        'rouge1_after': [],
        'rouge2_before': [],
        'rouge2_after': [],
        'rougel_before': [],
        'rougel_after': [],
    }

    for entry in data:
        target = entry.get('reference_summary', entry.get('reference', ''))
        summaries = {
            'before': entry.get('gen_before', entry.get('generatedsummary_before', '')),
            'after': entry.get('gen_after', entry.get('generatedsummary_after', '')),
        }

        for stage, summary in summaries.items():
            # Nothing to score without both a reference and a summary
            if not (target and summary):
                continue
            collected[f'rouge1_{stage}'].append(compute_rouge_n(target, summary, 1))
            collected[f'rouge2_{stage}'].append(compute_rouge_n(target, summary, 2))
            collected[f'rougel_{stage}'].append(compute_rouge_l(target, summary))

    return {
        name: (np.mean(values) if values else 0.0)
        for name, values in collected.items()
    }

print("🛠️ Data processing functions defined!")

In [None]:
# Expected results file for each model, looked up in the working directory
file_paths = dict(
    FlanT5='before_after_all_metrics_200XSUM_flant5.json',
    Mistral='rescore_mistral_XSUM_before_after_JSON_file.json',
    DistilBART='results_DISTILBART_XSUM_Dataset_200_rescore.json',
)

# Report which of the expected files are present
print("📁 Checking for JSON files...")
for model, filename in file_paths.items():
    if not os.path.exists(filename):
        print(f"❌ Missing {model} data: {filename}")
        print("   Please upload this file to your Colab environment")
        continue
    print(f"✅ Found {model} data: {filename}")

print("\n💡 If files are missing, please upload them using the Files panel on the left.")

In [None]:
# Load every available results file and derive its Rouge metrics
print("🔄 Processing JSON files and computing Rouge metrics...")
print("=" * 60)

results = {}

for model_name, file_path in file_paths.items():
    if os.path.exists(file_path):
        print(f"\n📊 Processing {model_name} data...")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            print(f"   ✅ Loaded {len(data)} records")

            # Only FlanT5 records that carry a 'ROUGE1_before' field use the
            # precomputed scores; everything else is rescored from the text
            if model_name.lower() != 'flant5' or 'ROUGE1_before' not in data[0]:
                print("   🧮 Computing Rouge metrics from text")
                metrics = compute_rouge_metrics_from_text(data)
            else:
                print("   🎯 Using precomputed Rouge metrics")
                metrics = extract_flant5_rouge_metrics(data)

            results[model_name] = metrics
            print(f"   ✅ Completed processing {model_name}")

        except Exception as e:
            print(f"   ❌ Error processing {model_name}: {e}")
            # Zero placeholders keep the failed model's row in the table
            results[model_name] = dict.fromkeys(
                (
                    'rouge1_before', 'rouge1_after',
                    'rouge2_before', 'rouge2_after',
                    'rougel_before', 'rougel_after',
                ),
                0.0,
            )
    else:
        print(f"⚠️ Skipping {model_name} - file not found: {file_path}")

print("\n✅ Data processing complete!")

In [None]:
# Build, display and save the comparison table
print("📋 Creating Rouge Metrics Comparison Table...")
print("=" * 80)

# Table column -> key in each model's metrics dict
score_columns = (
    ('Rouge-1 Before', 'rouge1_before'),
    ('Rouge-1 After', 'rouge1_after'),
    ('Rouge-2 Before', 'rouge2_before'),
    ('Rouge-2 After', 'rouge2_after'),
    ('RougeL Before', 'rougel_before'),
    ('RougeL After', 'rougel_after'),
)

# One row per model; scores are rendered as 4-decimal strings
df_data = [
    {
        'Model': name,
        **{column: f"{scores[key]:.4f}" for column, key in score_columns},
    }
    for name, scores in results.items()
]

df = pd.DataFrame(df_data)

# Plain-text rendering
print("📊 ROUGE METRICS COMPARISON TABLE")
print("=" * 80)
print(df.to_string(index=False))

# Markdown-style rendering; widths line each cell up with the header row
column_widths = {
    'Model': 9,
    'Rouge-1 Before': 14,
    'Rouge-1 After': 13,
    'Rouge-2 Before': 14,
    'Rouge-2 After': 13,
    'RougeL Before': 13,
    'RougeL After': 12,
}

print("\n📋 Formatted Table (Markdown Style):")
print("-" * 90)
print("| Model     | Rouge-1 Before | Rouge-1 After | Rouge-2 Before | Rouge-2 After | RougeL Before | RougeL After |")
print("|-----------|----------------|---------------|----------------|---------------|---------------|--------------|")
for _, row in df.iterrows():
    cells = [f"{row[column]:<{width}}" for column, width in column_widths.items()]
    print("| " + " | ".join(cells) + " |")

# Persist the table next to the notebook
csv_filename = 'rouge_metrics_comparison.csv'
df.to_csv(csv_filename, index=False)
print(f"\n💾 Results saved to: {csv_filename}")

print("\n🎉 Analysis Complete!")

In [None]:
# Summarise how each metric moved after fine-tuning
print("🔍 Analysis Insights:")
print("=" * 50)


def format_change(change):
    """Render a score delta to 4 decimals with a direction label."""
    if change > 0:
        return f"+{change:.4f} (improved)"
    if change < 0:
        return f"{change:.4f} (decreased)"
    return "0.0000 (no change)"


for model_name, metrics in results.items():
    print(f"\n📈 {model_name}:")

    # After-minus-before delta for each metric
    rouge1_change, rouge2_change, rougel_change = (
        metrics[f'{prefix}_after'] - metrics[f'{prefix}_before']
        for prefix in ('rouge1', 'rouge2', 'rougel')
    )

    for label, delta in (
        ('Rouge-1', rouge1_change),
        ('Rouge-2', rouge2_change),
        ('Rouge-L', rougel_change),
    ):
        print(f"   {label} Change: {format_change(delta)}")

print("\n💡 Notes:")
for note in (
    "   - Higher Rouge scores indicate better overlap with reference summaries",
    "   - Rouge-1: Unigram overlap (individual word matches)",
    "   - Rouge-2: Bigram overlap (two consecutive word matches)",
    "   - Rouge-L: Longest common subsequence overlap",
    "   - Positive changes indicate improvement after fine-tuning",
):
    print(note)