# Notebook 3: Text Summarization

This notebook implements:
- Using a pretrained mBART model for multilingual summarization
- Generating summaries at three lengths (small, medium, large)
- Support for both Nepali and English text
- Evaluation using ROUGE metrics

In [None]:
# Import required libraries
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Transformers and PyTorch
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    pipeline
)

# ROUGE metrics
from rouge_score import rouge_scorer

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

print("‚úì Libraries imported successfully")

## 1. Configuration

In [None]:
# Project layout: every input and output lives under BASE_DIR.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
BASE_DIR = Path(r'c:\Users\sagun\Desktop\news_project')
DATA_DIR = BASE_DIR.joinpath('data', 'processed')
MODEL_DIR = BASE_DIR.joinpath('models', 'summarizer')
RESULTS_DIR = BASE_DIR.joinpath('results', 'summaries')

# Make sure the output folders exist before anything is written into them
MODEL_DIR.mkdir(exist_ok=True, parents=True)
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Pretrained mBART-50 checkpoint used by the notebook
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

# Length bounds per summary size, expressed in TOKENS: the values are passed
# as min_length / max_length to model.generate(), so they are not word counts.
SUMMARY_CONFIGS = {
    name: {'min_length': lower, 'max_length': upper}
    for name, lower, upper in (
        ('small', 30, 50),
        ('medium', 80, 150),
        ('large', 150, 300),
    )
}

# Echo the active configuration
print(f"Pretrained Model: {MODEL_NAME}")
print(f"\nSummary configurations:")
for size, config in SUMMARY_CONFIGS.items():
    print(f"  {size}: {config['min_length']}-{config['max_length']} tokens")

## 2. Load Pretrained Model

In [None]:
# Fetch the pretrained mBART tokenizer and weights, then place the model
# on the selected device
print(f"Loading pretrained model: {MODEL_NAME}...")
print("This may take a few minutes...\n")

tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

print("‚úì Model and tokenizer loaded successfully")
print(f"Model parameters: {model.num_parameters():,}")

## 3. Load Data

In [None]:
# Read the test articles from disk and wrap them in a DataFrame
test_data = json.loads((DATA_DIR / 'test_data.json').read_text(encoding='utf-8'))
test_df = pd.DataFrame(test_data)
print(f"‚úì Loaded {len(test_df)} articles for summarization")

# Work on at most 50 randomly chosen articles (fixed seed) to keep the run short
sample_size = min(len(test_df), 50)
sample_df = test_df.sample(random_state=42, n=sample_size)
print(f"‚úì Using {sample_size} articles for demonstration")

## 4. Summarization Functions

In [None]:
def generate_summary(text, size='medium', src_lang='ne_NP', tgt_lang='ne_NP'):
    """
    Generate a summary using the pretrained mBART model.

    Args:
        text: Input text to summarize.
        size: 'small', 'medium', or 'large'. Any other value falls back to
            the 'medium' configuration.
        src_lang: Source language code (ne_NP for Nepali).
        tgt_lang: Target language code; generation is forced to start with
            this language token.

    Returns:
        The decoded summary string, or "" when ``text`` is None, empty, or
        contains only whitespace.
    """
    # Nothing to summarize: skip the model entirely. Otherwise beam search
    # would be forced to emit at least ``min_length`` tokens for empty input.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""

    # Set source language
    tokenizer.src_lang = src_lang

    # Get configuration for summary size (token bounds for generation)
    config = SUMMARY_CONFIGS.get(size, SUMMARY_CONFIGS['medium'])

    # Tokenize input; anything beyond 1024 tokens is truncated
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=1024,
        truncation=True
    ).to(device)

    # Generate summary
    model.eval()
    with torch.no_grad():
        summary_ids = model.generate(
            # Forward the whole encoding so the attention mask produced by
            # the tokenizer is passed along with the input ids.
            **inputs,
            num_beams=4,
            min_length=config['min_length'],
            max_length=config['max_length'],
            early_stopping=True,
            forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang]
        )

    # Decode the best beam, dropping language and other special tokens
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Smoke test: summarize the first sampled article at every size
sample_text = sample_df['text'].iloc[0]
print("Testing summarization function...\n")
print(f"Original text (first 300 chars):\n{sample_text[:300]}...\n")

divider = "-" * 80
for size in ('small', 'medium', 'large'):
    summary = generate_summary(sample_text, size=size)
    print(f"{size.upper()} summary:\n{summary}\n")
    print(divider)

## 5. Generate Summaries for All Sizes

In [None]:
# Generate summaries for all articles in all sizes.
# Produces `summaries_data`: one dict per article holding the source text,
# its category and size, plus `<size>_summary`, `<size>_length` (characters)
# and `<size>_words` for each of the three summary sizes.
from tqdm import tqdm

summaries_data = []

print("Generating summaries for all articles...\n")

for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Summarizing"):
    text = row['text']
    category = row['category']

    article_summaries = {
        'original_text': text,
        'category': category,
        'text_length': len(text),
        'word_count': len(text.split())
    }

    # Generate summaries for each size
    for size in ['small', 'medium', 'large']:
        try:
            summary = generate_summary(text, size=size)
        except Exception as e:
            # Best effort: report the failure and keep going with an empty
            # summary so one bad article does not abort the whole run.
            print(f"Error generating {size} summary for article {idx}: {e}")
            summary = ""

        # Always record all three fields, including on failure, so the
        # resulting DataFrame has complete numeric columns (no NaN or
        # missing `<size>_length` / `<size>_words`).
        article_summaries[f'{size}_summary'] = summary
        article_summaries[f'{size}_length'] = len(summary)
        article_summaries[f'{size}_words'] = len(summary.split())

    summaries_data.append(article_summaries)

print("\n‚úì Summary generation complete!")

In [None]:
# Collect the per-article records into a DataFrame and report averages
summaries_df = pd.DataFrame(summaries_data)

print("Summary Statistics:\n")
print(f"Total articles summarized: {len(summaries_df)}")
print(f"\nAverage summary lengths (characters):")
for size in ('small', 'medium', 'large'):
    length_column = summaries_df[f'{size}_length']
    words_column = summaries_df[f'{size}_words']
    avg_length = length_column.mean()
    avg_words = words_column.mean()
    print(f"  {size}: {avg_length:.0f} chars, {avg_words:.0f} words")

# Last expression of the cell: the notebook renders the first rows
summaries_df.head()

## 6. Visualize Summary Statistics

In [None]:
# 2x2 overview figure: source length histogram, summary length box plot,
# compression ratio box plot, and mean summary length per size
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_text, ax_lengths), (ax_ratio, ax_mean) = axes

# Top-left: distribution of the original article lengths
ax_text.hist(summaries_df['text_length'], bins=30, color='steelblue', edgecolor='black')
ax_text.set_title('Original Text Length Distribution', fontweight='bold')
ax_text.set_xlabel('Length (characters)')
ax_text.set_ylabel('Frequency')

# Top-right: character lengths of the three summary sizes side by side
summary_lengths = summaries_df[['small_length', 'medium_length', 'large_length']].rename(
    columns={'small_length': 'Small', 'medium_length': 'Medium', 'large_length': 'Large'}
)
summary_lengths.boxplot(ax=ax_lengths)
ax_lengths.set_title('Summary Length Comparison', fontweight='bold')
ax_lengths.set_ylabel('Length (characters)')

# Summary length as a percentage of the source length; stored on
# summaries_df because later cells read these columns
for size in ('small', 'medium', 'large'):
    ratio = summaries_df[f'{size}_length'] / summaries_df['text_length']
    summaries_df[f'{size}_compression'] = ratio * 100

# Bottom-left: compression ratios
compression_data = summaries_df[['small_compression', 'medium_compression', 'large_compression']].rename(
    columns={
        'small_compression': 'Small',
        'medium_compression': 'Medium',
        'large_compression': 'Large',
    }
)
compression_data.boxplot(ax=ax_ratio)
ax_ratio.set_title('Compression Ratio (%)', fontweight='bold')
ax_ratio.set_ylabel('Percentage of Original')

# Bottom-right: mean summary length for each size
avg_lengths = []
for size in ('small', 'medium', 'large'):
    avg_lengths.append(summaries_df[f'{size}_length'].mean())
ax_mean.bar(['Small', 'Medium', 'Large'], avg_lengths, color=['lightcoral', 'skyblue', 'lightgreen'])
ax_mean.set_title('Average Summary Length by Size', fontweight='bold')
ax_mean.set_ylabel('Length (characters)')

# Write the figure to disk, then display it
stats_plot_path = RESULTS_DIR / 'summary_statistics.png'
plt.tight_layout()
plt.savefig(stats_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úì Visualization saved to {stats_plot_path}")

## 7. ROUGE Score Evaluation

In [None]:
# ROUGE setup. No gold summaries are available, so the scores computed
# below compare each generated summary against the source article itself
# (its first 200 words), not against another summary.
import unicodedata


class _UnicodeWordTokenizer:
    """Whitespace tokenizer that keeps non-ASCII words.

    The default rouge_score tokenizer keeps only the characters [a-z0-9],
    which discards Devanagari text entirely and makes every ROUGE score 0
    for Nepali input. This tokenizer lowercases the text, replaces Unicode
    punctuation (including the danda) with spaces and splits on whitespace.
    """

    def tokenize(self, text):
        """Return the list of word tokens found in ``text``."""
        spaced = ''.join(
            ' ' if unicodedata.category(ch).startswith('P') else ch
            for ch in text.lower()
        )
        return spaced.split()


# NOTE: the `tokenizer` argument requires a rouge-score release that
# supports custom tokenizers (0.1.x); confirm the installed version.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=False,
    tokenizer=_UnicodeWordTokenizer()
)

def calculate_rouge_scores(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` with the module-level scorer.

    Returns a dict mapping 'rouge1', 'rouge2' and 'rougeL' to the F-measure
    of the corresponding ROUGE variant.
    """
    result = scorer.score(reference, hypothesis)
    return {
        metric: result[metric].fmeasure
        for metric in ('rouge1', 'rouge2', 'rougeL')
    }

# Score the first 20 summarized articles
rouge_results = []

for idx, row in summaries_df.head(20).iterrows():
    # Reference text: the first 200 whitespace-separated words of the article
    source_words = row['original_text'].split()
    reference = ' '.join(source_words[:200])

    for size in ('small', 'medium', 'large'):
        hypothesis = row[f'{size}_summary']
        # Empty summaries (failed generations) are left out of the scores
        if not hypothesis:
            continue
        scores = calculate_rouge_scores(reference, hypothesis)
        scores['size'] = size
        rouge_results.append(scores)

rouge_df = pd.DataFrame(rouge_results)

print("ROUGE Scores by Summary Size:\n")
print(rouge_df.groupby('size')[['rouge1', 'rouge2', 'rougeL']].mean())

In [None]:
# Grouped bar chart of the mean ROUGE F1 per summary size
rouge_avg = rouge_df.groupby('size')[['rouge1', 'rouge2', 'rougeL']].mean()

fig, ax = plt.subplots(figsize=(10, 6))
rouge_avg.plot(ax=ax, kind='bar', color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
ax.set_title('ROUGE Scores by Summary Size', fontsize=14, fontweight='bold')
ax.set_xlabel('Summary Size')
ax.set_ylabel('F1 Score')
# Keep the size labels horizontal
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
ax.legend(title='ROUGE Metric')
ax.grid(axis='y', alpha=0.3)

# Write the chart to disk, then display it
rouge_plot_path = RESULTS_DIR / 'rouge_scores.png'
plt.tight_layout()
plt.savefig(rouge_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úì ROUGE scores saved to {rouge_plot_path}")

## 8. Save Results

In [None]:
# Persist every article record (source text, summaries and statistics)
summaries_path = RESULTS_DIR / 'all_summaries.json'
summaries_output = summaries_df.to_dict('records')

with summaries_path.open('w', encoding='utf-8') as f:
    json.dump(summaries_output, f, ensure_ascii=False, indent=2)

print(f"‚úì All summaries saved to {summaries_path}")

# Persist the mean ROUGE scores per summary size
rouge_path = RESULTS_DIR / 'rouge_scores.json'
rouge_means = rouge_df.groupby('size')[['rouge1', 'rouge2', 'rougeL']].mean()
rouge_summary = rouge_means.to_dict()

with rouge_path.open('w', encoding='utf-8') as f:
    json.dump(rouge_summary, f, ensure_ascii=False, indent=2)

print(f"‚úì ROUGE scores saved to {rouge_path}")

## 9. Example Summaries

In [None]:
# Print up to three articles together with their three summaries
separator = "=" * 80
print(separator)
print("EXAMPLE SUMMARIES")
print(separator)

example_count = min(3, len(summaries_df))
for i in range(example_count):
    row = summaries_df.iloc[i]

    # Header and a 300-character preview of the source article
    print(f"\nüì∞ Article {i+1} - Category: {row['category']}")
    print("-" * 80)
    print(f"\nOriginal Text ({row['word_count']} words):\n{row['original_text'][:300]}...\n")

    # One paragraph per summary size
    for size in ('small', 'medium', 'large'):
        summary, words = row[f'{size}_summary'], row[f'{size}_words']
        print(f"\n{size.upper()} Summary ({words} words):\n{summary}")

    print("\n" + separator)

## 10. Summary

In [None]:
# Closing recap: model, article count, average sizes, ROUGE and output files
rule = "=" * 80
print(rule)
print("TEXT SUMMARIZATION SUMMARY")
print(rule)
print(f"\nü§ñ Pretrained Model: {MODEL_NAME}")
print(f"üìä Articles Summarized: {len(summaries_df)}")

# Average word count and share of the source length for each summary size
print(f"\nüìù Summary Sizes:")
for size in ('small', 'medium', 'large'):
    avg_words = summaries_df[f'{size}_words'].mean()
    avg_compression = summaries_df[f'{size}_compression'].mean()
    print(f"  ‚Ä¢ {size.capitalize()}: ~{avg_words:.0f} words ({avg_compression:.1f}% of original)")

# Mean ROUGE F1 of the medium-size summaries
print(f"\nüìà ROUGE Scores (Medium Summary):")
medium_scores = rouge_df.loc[rouge_df['size'] == 'medium', ['rouge1', 'rouge2', 'rougeL']].mean()
for label, metric in (('ROUGE-1', 'rouge1'), ('ROUGE-2', 'rouge2'), ('ROUGE-L', 'rougeL')):
    print(f"  ‚Ä¢ {label}: {medium_scores[metric]:.4f}")

# Where the artifacts of this notebook were written
print(f"\nüíæ Saved Files:")
for label, filename in (
    ('All summaries', 'all_summaries.json'),
    ('ROUGE scores', 'rouge_scores.json'),
    ('Statistics plot', 'summary_statistics.png'),
    ('ROUGE plot', 'rouge_scores.png'),
):
    print(f"  ‚Ä¢ {label}: {RESULTS_DIR / filename}")
print("\n‚úÖ Text summarization completed successfully!")
print(rule)