In [None]:
# Installing necessary packages
%pip install uv
!uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!uv pip install nltk datasets spacy evaluate transformers rouge-score matplotlib sympy scipy accelerate scikit-learn wandb bert-score

Note: you may need to restart the kernel to use updated packages.
[2mUsing Python 3.9.18 environment at: /home/parmar.sa/.venv[0m
[2mAudited [1m3 packages[0m [2min 17ms[0m[0m
[2mUsing Python 3.9.18 environment at: /home/parmar.sa/.venv[0m
[2mAudited [1m13 packages[0m [2min 42ms[0m[0m


In [7]:
# week 9: Cleaning and preprocessing the WikiSum dataset

# importing necessary libraries
from datasets import load_dataset
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import concurrent.futures
from typing import List, Tuple
import math
nltk.download('all') 

# Checking for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Fetch the NLTK resources used by this cell, then verify that the two the
# pipeline cannot run without (punkt, stopwords) are really present.
try:
    import os

    # Ensuring the download directory exists
    nltk_data_dir = os.path.expanduser('~/nltk_data')
    os.makedirs(nltk_data_dir, exist_ok=True)

    # Downloading required NLTK resources
    resources = ['punkt', 'stopwords', 'averaged_perceptron_tagger', 'punkt_tab']
    for resource in resources:
        try:
            # nltk.download() signals failure through its boolean return
            # value rather than by raising, so the result must be checked
            # before reporting success.
            if nltk.download(resource, quiet=True):
                print(f"Successfully downloaded {resource}")
            else:
                print(f"Error downloading {resource}: download reported failure")
        except Exception as e:
            print(f"Error downloading {resource}: {str(e)}")

    # Verifying the downloads (raises LookupError when a resource is missing)
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    print("NLTK resources successfully verified")
except Exception as e:
    print(f"Error setting up NLTK resources: {str(e)}")
    raise

# Initializing BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

class WikiSumDataset(Dataset):
    """Expose a list of raw text strings through the ``Dataset`` protocol.

    Items are returned untouched, so a ``DataLoader`` built on top of this
    class yields batches of plain strings.
    """

    def __init__(self, texts: List[str]):
        # The list is stored by reference; no copy is made.
        self.texts = texts

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.texts[idx]

    def __len__(self):
        """Return how many texts are available."""
        return len(self.texts)

def batch_tokenize(texts: List[str], batch_size: int = 32) -> List[dict]:
    """Tokenize ``texts`` batch by batch and move each batch to ``device``.

    Args:
        texts: Raw strings to tokenize.
        batch_size: Number of texts tokenized together; padding is applied
            per batch, so tensors from different batches may differ in width.

    Returns:
        One dict per batch, mapping each tokenizer output name to a tensor
        located on the module-level ``device``. An empty ``texts`` yields
        an empty list.
    """
    dataset = WikiSumDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_tokens = []
    # Tokenization involves no autograd work, so no ``torch.no_grad()``
    # guard is needed here.
    for batch in dataloader:
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        all_tokens.append({k: v.to(device) for k, v in tokens.items()})

    return all_tokens

def load_and_prepare_wikisum():
    """Fetch the ``d0rj/wikisum`` dataset and return its splits as DataFrames.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple built from the ``train``,
        ``validation`` and ``test`` splits, in that order.
    """
    splits = load_dataset("d0rj/wikisum")

    # DataFrames are easier to manipulate column-wise than the raw splits.
    return tuple(
        pd.DataFrame(splits[split_name])
        for split_name in ('train', 'validation', 'test')
    )

def clean_text(text: str) -> str:
    """Normalize ``text`` for downstream sentence splitting.

    The text is lowercased, every character that is not a word character
    (letters, digits, underscore), whitespace or a period is dropped, and
    runs of whitespace collapse to a single space. Leading and trailing
    whitespace is trimmed from the result.
    """
    lowered = text.lower()

    # Periods are kept so sentence boundaries survive the cleanup.
    symbols_removed = re.sub(r'[^\w\s.]', '', lowered)

    single_spaced = re.sub(r'\s+', ' ', symbols_removed)
    return single_spaced.strip()

def calculate_info_density(tokens: List[str], stop_words: set) -> float:
    """Return the fraction of ``tokens`` that are not stop words.

    Args:
        tokens: Word tokens of one sentence.
        stop_words: Lowercase stop words; tokens are lowercased before the
            membership test.

    Returns:
        A value in ``[0, 1]``. An empty ``tokens`` list yields ``0.0``
        (the mean of an empty tensor would otherwise be NaN).
    """
    if not tokens:
        return 0.0

    # 1.0 marks a content word, 0.0 a stop word.
    content_flags = [0 if token.lower() in stop_words else 1 for token in tokens]
    token_tensor = torch.tensor(content_flags, dtype=torch.float32, device=device)

    return torch.mean(token_tensor).item()

def process_sentences_batch(sentences: List[str], stop_words: set) -> List[str]:
    """Keep the sentences that are informative enough, preserving order.

    A sentence is kept when it has between 5 and 100 word tokens
    (inclusive) and more than half of those tokens are not stop words.

    Args:
        sentences: Candidate sentences.
        stop_words: Stop words forwarded to ``calculate_info_density``.

    Returns:
        The retained sentences, in their original order.
    """
    if not sentences:
        return []

    def process_single_sentence(sent: str) -> Tuple[bool, str]:
        words = word_tokenize(sent)

        # Skipping very short or very long sentences
        if len(words) < 5 or len(words) > 100:
            return False, sent

        info_density = calculate_info_density(words, stop_words)
        return info_density > 0.5, sent

    # executor.map yields results in input order, so sentence order is kept.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(process_single_sentence, sentences))

    return [sent for keep, sent in results if keep]

def preprocess_document(doc: str) -> str:
    """Clean one document and keep only its informative sentences.

    The document goes through ``clean_text``, is split into sentences, and
    those sentences are filtered by ``process_sentences_batch`` using the
    English stop-word list. The survivors are joined with single spaces.
    """
    sentences = sent_tokenize(clean_text(doc))

    english_stop_words = set(stopwords.words('english'))
    kept_sentences = process_sentences_batch(sentences, english_stop_words)

    return ' '.join(kept_sentences)

def process_dataset(df: pd.DataFrame, batch_size: int = 32) -> pd.DataFrame:
    """Add cleaned document/summary columns and drop rows left empty.

    The ``article`` column is run through ``preprocess_document`` and the
    ``summary`` column through ``clean_text``; results are stored in
    ``processed_document`` and ``processed_summary``. Work is done on a
    copy, so the caller's DataFrame is not modified.

    Args:
        df: Frame with ``article`` and ``summary`` text columns.
        batch_size: Rows handled per progress-bar step.

    Returns:
        A new DataFrame containing only the rows whose processed document
        and processed summary are both non-empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Work on a copy so the new columns do not leak into the caller's frame.
    df = df.copy()
    n_batches = math.ceil(len(df) / batch_size)

    def iter_batches(column: str):
        """Yield ``column`` as consecutive lists of at most ``batch_size`` items."""
        for i in tqdm(range(n_batches)):
            start_idx = i * batch_size
            yield df[column].iloc[start_idx:start_idx + batch_size].tolist()

    print("Processing documents...")
    processed_docs = []
    for batch_texts in iter_batches('article'):
        processed_docs.extend(preprocess_document(doc) for doc in batch_texts)
    df['processed_document'] = processed_docs

    # Summaries only get the light-weight text cleanup.
    print("Processing summaries...")
    processed_summaries = []
    for batch_texts in iter_batches('summary'):
        processed_summaries.extend(clean_text(text) for text in batch_texts)
    df['processed_summary'] = processed_summaries

    # Removing empty entries
    non_empty = (df['processed_document'].str.len() > 0) & \
                (df['processed_summary'].str.len() > 0)
    return df[non_empty]

def main():
    """Run the preprocessing pipeline end to end.

    Loads the three WikiSum splits, processes each with ``process_dataset``,
    writes one CSV per split to the working directory and prints the number
    of rows that survived.
    """
    print("Loading dataset...")
    train_df, val_df, test_df = load_and_prepare_wikisum()

    # (name used in the progress message, CSV suffix, statistics label, frame)
    splits = [
        ('training', 'train', 'Training', train_df),
        ('validation', 'val', 'Validation', val_df),
        ('test', 'test', 'Test', test_df),
    ]

    # Process every split before anything is written to disk.
    processed = []
    for message_name, _, _, frame in splits:
        print(f"\nProcessing {message_name} set...")
        processed.append(process_dataset(frame))

    for (_, suffix, _, _), result in zip(splits, processed):
        result.to_csv(f'processed_wikisum_{suffix}.csv', index=False)

    print("\nPreprocessing complete! Processed files saved as CSV.")

    print("\nDataset statistics:")
    for (_, _, label, _), result in zip(splits, processed):
        print(f"{label} samples: {len(result)}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/parmar.sa/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/parmar.sa/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/parmar.sa/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/parmar.sa/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/parmar.sa/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_da

Using device: cuda
Successfully downloaded punkt
Successfully downloaded stopwords
Successfully downloaded averaged_perceptron_tagger
Successfully downloaded punkt_tab
NLTK resources successfully verified
Loading dataset...

Processing training set...
Processing documents...


100%|██████████| 1118/1118 [15:27<00:00,  1.21it/s]


Processing summaries...


100%|██████████| 1118/1118 [00:01<00:00, 989.41it/s] 



Processing validation set...
Processing documents...


100%|██████████| 63/63 [00:53<00:00,  1.19it/s]


Processing summaries...


100%|██████████| 63/63 [00:00<00:00, 1008.73it/s]



Processing test set...
Processing documents...


100%|██████████| 63/63 [00:52<00:00,  1.20it/s]


Processing summaries...


100%|██████████| 63/63 [00:00<00:00, 1030.98it/s]



Preprocessing complete! Processed files saved as CSV.

Dataset statistics:
Training samples: 35773
Validation samples: 2000
Test samples: 2000


In [3]:
# week 10: Implementing the TextRank algorithm for extractive summarization and evaluating it

# Importing necessary libraries
from typing import List
from torch.nn import functional as F
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import numpy as np
import networkx as nx
from rouge_score import rouge_scorer
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# Report whether CUDA is usable and, if so, which device is currently selected.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # These queries are only made when a CUDA device is available.
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name())

class OptimizedTextRankSummarizer:
    """Extractive summarizer: personalized PageRank over BERT sentence vectors.

    Sentences are embedded with BERT, linked by thresholded cosine
    similarity weighted by position and length, and ranked with PageRank.
    The top-ranked sentences are returned in document order.
    """

    def __init__(self, bert_model_name: str = 'bert-base-uncased'):
        """
        Initialize the optimized TextRank summarizer

        Args:
            bert_model_name: Name or path of the BERT checkpoint to load.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.model = BertModel.from_pretrained(bert_model_name).to(self.device)
        self.model.eval()
        self.stop_words = set(stopwords.words('english'))

    def sentence_position_scores(self, num_sentences: int) -> np.ndarray:
        """
        Calculate position-based importance scores

        Returns an array of length ``num_sentences`` that decays
        exponentially from 1.0 for the first sentence.
        """
        positions = np.arange(num_sentences)
        # Exponential decay for position importance
        return np.exp(-positions / num_sentences)

    def calculate_sentence_lengths(self, sentences: List[str]) -> np.ndarray:
        """
        Calculate normalized sentence lengths

        Word counts are min-max scaled to [0, 1]; the small epsilon keeps
        the division defined when every sentence has the same length.
        """
        lengths = np.array([len(word_tokenize(sent)) for sent in sentences])
        return (lengths - lengths.min()) / (lengths.max() - lengths.min() + 1e-8)

    def get_bert_embeddings(self, sentences: List[str]) -> torch.Tensor:
        """
        Embed each sentence with BERT

        Each sentence vector is the average of the [CLS] vector and the
        attention-masked mean of all token vectors.

        Returns:
            A CPU tensor with one row per sentence.
        """
        embeddings = []

        # Moving model to GPU explicitly
        self.model = self.model.to(self.device)

        with torch.no_grad():
            for sentence in sentences:
                inputs = self.tokenizer(sentence, return_tensors='pt',
                                        padding=True, truncation=True,
                                        max_length=512)
                # Moving inputs to GPU
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                outputs = self.model(**inputs)
                token_embeddings = outputs.last_hidden_state
                cls_embedding = token_embeddings[:, 0, :]

                # Mean over real tokens only; the clamp avoids dividing by 0.
                mask = inputs['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
                mean_embedding = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

                final_embedding = (cls_embedding + mean_embedding) / 2
                # Moving result back to CPU before appending
                embeddings.append(final_embedding.cpu())

        return torch.cat(embeddings, dim=0)

    def build_similarity_matrix(self, embeddings: torch.Tensor,
                              position_scores: np.ndarray,
                              length_scores: np.ndarray,
                              threshold: float = 0.3) -> np.ndarray:
        """
        Build enhanced similarity matrix

        Args:
            embeddings: CPU tensor with one embedding row per sentence.
            position_scores: Per-sentence position importance.
            length_scores: Per-sentence length importance.
            threshold: Cosine similarities below this value are set to 0.

        Returns:
            A row-normalized square matrix. Rows whose weights sum to 0 are
            returned as all zeros.
        """
        # Calculating cosine similarity
        similarity_matrix = F.cosine_similarity(embeddings.unsqueeze(1),
                                              embeddings.unsqueeze(0), dim=2)
        similarity_matrix = similarity_matrix.numpy()

        # Applying threshold
        similarity_matrix[similarity_matrix < threshold] = 0

        # Incorporating position and length importance
        importance_scores = (position_scores + length_scores) / 2
        importance_matrix = np.outer(importance_scores, importance_scores)
        similarity_matrix = similarity_matrix * importance_matrix

        # Normalizing. ``out`` must be supplied: without it, entries skipped
        # by ``where`` are left uninitialized instead of being zero.
        row_sums = similarity_matrix.sum(axis=1, keepdims=True)
        similarity_matrix = np.divide(similarity_matrix, row_sums,
                                      out=np.zeros_like(similarity_matrix),
                                      where=row_sums != 0)

        return similarity_matrix

    def get_optimal_summary_length(self, text_length: int, num_sentences: int) -> int:
        """
        Dynamic summary length based on text properties

        Args:
            text_length: Length of the text in characters.
            num_sentences: Number of sentences in the text.

        Returns:
            A sentence count between 3 and 6.
        """
        # Base length based on text length
        if text_length < 500:
            base_length = 3
        elif text_length < 1000:
            base_length = 4
        elif text_length < 2000:
            base_length = 5
        else:
            base_length = 6

        # Adjusting based on number of sentences
        return min(base_length, max(3, num_sentences // 4))

    def summarize(self, text: str, num_sentences: int = None) -> str:
        """
        Generate extractive summary using optimized TextRank

        Args:
            text: Document to summarize.
            num_sentences: Sentences to keep; when ``None`` the count comes
                from ``get_optimal_summary_length``. The value is clamped to
                the range 1..number of sentences.

        Returns:
            The selected sentences joined by spaces, in document order.
            Texts with fewer than 3 sentences are returned unchanged.
        """
        # Splitting and preprocess
        sentences = sent_tokenize(text)
        if len(sentences) < 3:
            return text

        # Calculating optimal summary length
        if num_sentences is None:
            num_sentences = self.get_optimal_summary_length(len(text), len(sentences))
        # A count <= 0 would make the ``[-n:]`` slice below select the wrong
        # sentences (``[-0:]`` is the whole array), so clamp it.
        num_sentences = max(1, min(num_sentences, len(sentences)))

        # Calculating importance scores
        position_scores = self.sentence_position_scores(len(sentences))
        length_scores = self.calculate_sentence_lengths(sentences)

        # Generating embeddings and similarity matrix
        embeddings = self.get_bert_embeddings(sentences)
        similarity_matrix = self.build_similarity_matrix(embeddings,
                                                       position_scores,
                                                       length_scores)

        # Running PageRank with personalization
        nx_graph = nx.from_numpy_array(similarity_matrix)
        personalization = dict(enumerate(position_scores))
        scores = nx.pagerank(nx_graph,
                           alpha=0.85,
                           personalization=personalization,
                           max_iter=100,
                           tol=1e-6)

        # Selecting sentences: best-scored ones, restored to document order
        sentence_scores = np.array([scores[i] for i in range(len(scores))])
        top_idx = sorted(np.argsort(sentence_scores)[-num_sentences:])

        # Combining sentences
        return ' '.join(sentences[i] for i in top_idx)

def evaluate_with_cross_validation(val_path: str = 'processed_wikisum_val.csv',
                                 n_folds: int = 3,
                                 samples_per_fold: int = 100):
    """
    Score the TextRank summarizer with ROUGE on consecutive data slices.

    Fold ``k`` covers rows ``k * samples_per_fold`` up to (but excluding)
    ``(k + 1) * samples_per_fold`` of the CSV at ``val_path``. Per-fold mean
    F-measures are printed, and the mean over the folds is printed and
    returned as a pandas Series.
    """
    print("Loading data...")
    validation_frame = pd.read_csv(val_path)

    # One summarizer and one scorer are shared by every fold.
    summarizer = OptimizedTextRankSummarizer()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                    use_stemmer=True)

    fold_means = []

    for fold in range(n_folds):
        print(f"\nProcessing fold {fold + 1}/{n_folds}")
        first_row = fold * samples_per_fold
        fold_frame = validation_frame.iloc[first_row:first_row + samples_per_fold]

        per_sample = []
        for _, record in tqdm(fold_frame.iterrows(), total=len(fold_frame)):
            candidate = summarizer.summarize(record['processed_document'])
            rouge = scorer.score(record['processed_summary'], candidate)
            per_sample.append({
                'rouge1_f': rouge['rouge1'].fmeasure,
                'rouge2_f': rouge['rouge2'].fmeasure,
                'rougeL_f': rouge['rougeL'].fmeasure
            })

        fold_mean = pd.DataFrame(per_sample).mean()
        fold_means.append(fold_mean)
        print(f"Fold {fold + 1} scores:")
        print(fold_mean)

    # Average of the per-fold averages.
    final_scores = pd.DataFrame(fold_means).mean()
    print("\nFinal Cross-validation Results:")
    for metric, score in final_scores.items():
        print(f"{metric}: {score:.4f}")

    return final_scores


if __name__ == "__main__":
    evaluate_with_cross_validation()


  backends.update(_get_backends("networkx.backends"))


CUDA available: True
Current device: 0
Device name: NVIDIA A100-SXM4-40GB
Loading data...

Processing fold 1/3


100%|██████████| 100/100 [00:37<00:00,  2.66it/s]


Fold 1 scores:
rouge1_f    0.380939
rouge2_f    0.094451
rougeL_f    0.193433
dtype: float64

Processing fold 2/3


100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Fold 2 scores:
rouge1_f    0.381517
rouge2_f    0.098991
rougeL_f    0.199095
dtype: float64

Processing fold 3/3


100%|██████████| 100/100 [00:34<00:00,  2.89it/s]

Fold 3 scores:
rouge1_f    0.377944
rouge2_f    0.100161
rougeL_f    0.196346
dtype: float64

Final Cross-validation Results:
rouge1_f: 0.3801
rouge2_f: 0.0979
rougeL_f: 0.1963





In [4]:
# week 11: Fine-tuning the BART model for abstractive summarization
import torch
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import load_dataset
import evaluate
from typing import Dict
import gc
import os
from torch.utils.data import DataLoader
import torch.distributed as dist

# Loading the dataset in streaming mode. This module-level handle is read by
# the ``__main__`` section below to fetch a validation example;
# ``train_bart_model`` opens its own handle.
dataset = load_dataset("d0rj/wikisum", streaming=True)


# Function for Memory management
def clear_memory():
    """Empty the CUDA cache when a GPU is available, then run the garbage collector."""
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        torch.cuda.empty_cache()
    gc.collect()

class WikiSumDataset(Dataset):
    """Map-style dataset of tokenized (article, summary) pairs for BART.

    Args:
        dataset: Iterable of dicts with ``article`` and ``summary`` keys.
            When ``max_samples`` is not ``None`` it must offer ``take(n)``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Token length every article is padded/truncated to.
        max_samples: Number of leading examples to materialize in memory
            (``None`` loads everything the iterable yields).
        max_target_length: Token length every summary is padded/truncated to.
    """

    def __init__(self, dataset, tokenizer, max_length=384,
                 max_samples=1000, max_target_length=128):
        if max_samples is None:
            self.dataset = list(dataset)
        else:
            self.dataset = list(dataset.take(max_samples))
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example.

        Padding positions in ``labels`` are set to -100 so the loss skips
        them; code that decodes labels must map -100 back to the pad id.
        """
        item = self.dataset[idx]

        inputs = self.tokenizer(
            item['article'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        targets = self.tokenizer(
            item['summary'],
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # No per-item cache clearing / gc here: doing that for every sample
        # slows data loading and frees nothing the tokenizer allocated.
        labels = targets['input_ids'].squeeze(0).clone()
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels
        }

def compute_metrics(eval_preds, tokenizer):
    """Compute ROUGE (scaled to 0-100, two decimals) for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits, or a tuple whose first element is one of
            those; logits are reduced to ids with an argmax over the last
            axis. -100 entries are replaced by the pad id before decoding.
        tokenizer: Tokenizer used to decode ids back to text.

    Returns:
        Dict mapping each ROUGE metric name to its rounded percentage.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    rouge = evaluate.load('rouge')
    predictions, labels = eval_preds

    # A plain ``Trainer`` hands over model outputs (logits, possibly
    # wrapped in a tuple with other tensors) rather than generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 marks positions ignored by the loss; it is not a decodable id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    return {k: round(v * 100, 2) for k, v in result.items()}

def train_bart_model():
    """Build a ``Trainer`` for fine-tuning ``facebook/bart-base`` on WikiSum.

    Streams the dataset, wraps the train/validation streams in
    ``WikiSumDataset``, saves the tokenizer to ``./bart_wikisum`` and
    configures training (fp16, gradient checkpointing, early stopping).
    Training itself is started by the caller.

    Returns:
        ``(trainer, tokenizer)``.
    """
    # Loading data
    dataset = load_dataset("d0rj/wikisum", streaming=True)
    # NOTE(review): WikiSumDataset limits how many examples it materializes,
    # so these upper bounds may never be reached - confirm the intended size.
    train_data = dataset['train'].take(100000)
    val_data = dataset['validation'].take(10000)

    clear_memory()

    # Initializing tokenizer and saving it immediately
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    os.makedirs("./bart_wikisum", exist_ok=True)
    tokenizer.save_pretrained("./bart_wikisum")

    train_dataset = WikiSumDataset(train_data, tokenizer)
    val_dataset = WikiSumDataset(val_data, tokenizer)

    # Model initialization
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-base',
        gradient_checkpointing=True,
        use_cache=False
    )

    clear_memory()
    if torch.cuda.is_available():
        model = model.cuda()

    # NOTE(review): evaluation and checkpointing happen every 2000 optimizer
    # steps; on a small training set that point may never be reached, in
    # which case early stopping and best-model loading never apply - confirm.
    training_args = TrainingArguments(
        output_dir="./bart_wikisum",
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        evaluation_strategy="steps",
        eval_steps=2000,
        save_steps=2000,
        fp16=True,
        fp16_full_eval=True,
        fp16_backend="auto",
        optim="adamw_torch",
        max_grad_norm=1.0,
        gradient_checkpointing=True,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="rouge1",
        dataloader_num_workers=0,
        ddp_find_unused_parameters=False,
        remove_unused_columns=True,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.02,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda x: compute_metrics(x, tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    return trainer, tokenizer

def generate_summary(text, model_path):
    """Summarize ``text`` with the BART checkpoint stored at ``model_path``.

    The tokenizer and model are loaded from ``model_path`` on every call and
    run on the GPU when one is available. Decoding combines beam search
    with sampling, so repeated calls may return different summaries.

    Returns:
        The decoded summary, or ``None`` when any step raises (the error is
        printed).
    """
    generation_options = dict(
        num_beams=4,
        length_penalty=0.8,
        no_repeat_ngram_size=3,
        min_length=30,
        max_length=150,
        early_stopping=True,
        repetition_penalty=1.2,
        do_sample=True,  # Adding some randomness
        temperature=0.7  # Controlling randomness
    )

    try:
        # Tokenizer and model come from the same directory.
        tokenizer = BartTokenizer.from_pretrained(model_path)
        model = BartForConditionalGeneration.from_pretrained(model_path)
        if torch.cuda.is_available():
            model = model.cuda()

        encoded = tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Only the input ids are fed to ``generate``.
        input_ids = encoded["input_ids"]
        if torch.cuda.is_available():
            input_ids = input_ids.cuda()

        summary_ids = model.generate(input_ids, **generation_options)
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during summary generation: {str(e)}")
        return None

# Script entry point: fine-tune BART, save the result, then print a summary
# of one validation article from the base and from the fine-tuned checkpoint.
if __name__ == "__main__":
    # Clearing GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Training model
    trainer, tokenizer = train_bart_model()
    trainer.train()

    # Saving both model and tokenizer
    # (same directory, so generate_summary can load both from one path)
    trainer.save_model("./bart_wikisum")
    tokenizer.save_pretrained("./bart_wikisum")

    # Getting first example from the validation set
    # (``dataset`` is the module-level streaming handle created above)
    example = next(iter(dataset['validation']))
    text = example['article']

    print("Summary with base model:")
    summary_base = generate_summary(text, "facebook/bart-base")
    print(f"Generated Summary: {summary_base}")

    print("\nSummary with fine-tuned model:")
    summary_finetuned = generate_summary(text, "./bart_wikisum")
    print(f"Generated Summary: {summary_finetuned}")

    # Printing directory contents for debugging
    print("\nContents of ./bart_wikisum directory:")
    print(os.listdir("./bart_wikisum"))



Step,Training Loss,Validation Loss




Summary with base model:
Generated Summary: Figure out how you typically spend your time. Before you can figure out how to optimize your time, get a good sense for how you already manage day-to-day affairs. If you have to attend school or work, these hours are already managed for you. In your free time, you have a much greater degree of flexibility. Spend a few days keeping track of how you do every day. Pay close attention to how you spend free time. Do you play video games, or do you clean the house? Make a list of these activities and how long you spend on them. Estimate how long in your daily schedule. It's likely that you spend a relatively large amount of some of your days simply traveling to or from work

Summary with fine-tuned model:
Generated Summary: To optimize your time, start by making a list of how long you spend commuting for school, work, and errands. Then, determine how much time you'll spend on these activities. For example, if you have to go to school or work, make 

In [21]:
# week 12: Implementing the hybrid approach and evaluating its performance using ROUGE, BERTScore, and METEOR with analysis and visualization of the results
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from typing import List, Dict, Union
import numpy as np
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
import gc
import evaluate
from nltk.translate.meteor_score import meteor_score
import nltk

nltk.download('wordnet')

class HybridSummarizer:
    """
    A hybrid summarization system that combines extractive (TextRank) and
    abstractive (BART) approaches with adaptive selection and comprehensive metrics
    """

    def __init__(self, bart_model_path: str = "./bart_wikisum"):
        """
        Initialize the hybrid summarizer with both extractive and abstractive components
        and evaluation metrics

        Args:
            bart_model_path: Directory or hub name of the BART checkpoint.
                If loading fails, ``facebook/bart-base`` is used instead.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initializing BART components
        try:
            self.tokenizer = BartTokenizer.from_pretrained(bart_model_path)
            self.bart_model = BartForConditionalGeneration.from_pretrained(bart_model_path).to(self.device)
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed by the fallback.
            print(f"Could not load BART from {bart_model_path}: {exc}")
            print("Using base BART model as fallback...")
            self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
            self.bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base').to(self.device)

        # Initializing TextRank component
        self.textrank_summarizer = OptimizedTextRankSummarizer()

        # Initializing all evaluation metrics
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.bertscore = evaluate.load('bertscore')

    def clear_memory(self):
        """Memory management utility"""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    def calculate_bertscore(self, reference: str, candidate: str) -> float:
        """Calculate BERTScore F1 of ``candidate`` against ``reference``"""
        results = self.bertscore.compute(
            predictions=[candidate],
            references=[reference],
            lang="en",
            model_type="microsoft/deberta-xlarge-mnli"
        )
        return sum(results['f1']) / len(results['f1'])

    def calculate_meteor(self, reference: str, candidate: str) -> float:
        """Calculate METEOR score of ``candidate`` against ``reference``"""
        reference_tokens = nltk.word_tokenize(reference)
        candidate_tokens = nltk.word_tokenize(candidate)
        return meteor_score([reference_tokens], candidate_tokens)

    def generate_abstractive_summary(self, text: str) -> str:
        """Generate abstractive summary using fine-tuned BART

        Input is truncated to 512 tokens. Decoding samples, so repeated
        calls can produce different summaries.
        """
        inputs = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        summary_ids = self.bart_model.generate(
            inputs["input_ids"],
            num_beams=4,
            length_penalty=0.8,
            no_repeat_ngram_size=3,
            min_length=30,
            max_length=150,
            early_stopping=True,
            repetition_penalty=1.2,
            do_sample=True,
            temperature=0.7
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def _proxy_metrics(self, reference: str, summary: str) -> Dict[str, float]:
        """Score ``summary`` against ``reference`` with ROUGE-1, BERTScore and METEOR."""
        return {
            'rouge1': self.rouge_scorer.score(reference, summary)['rouge1'].fmeasure,
            'bertscore': self.calculate_bertscore(reference, summary),
            'meteor': self.calculate_meteor(reference, summary)
        }

    def adaptive_summarize(self, text: str, strategy: str = 'hybrid') -> Dict[str, Union[str, Dict[str, float]]]:
        """Generate summary using specified strategy with adaptive selection

        Summaries are scored against the first 200 characters of ``text``,
        which serve as a reference-free proxy.

        Args:
            text: Document to summarize.
            strategy: ``'extractive'`` uses TextRank only. ``'hybrid'``
                builds both summaries and returns the one whose mean proxy
                score is more than 10% higher, or a sentence-wise random
                mix when they are close. Any other value uses BART only.

        Returns:
            Dict with ``summary``, ``method_used`` and ``metrics`` (proxy
            ``rouge1``/``bertscore``/``meteor`` of the returned summary).
        """
        reference = text[:200]

        # Single-method strategies only pay for the one model they need.
        if strategy == 'extractive':
            summary = self.textrank_summarizer.summarize(text)
            return {
                'summary': summary,
                'method_used': 'extractive',
                'metrics': self._proxy_metrics(reference, summary)
            }
        if strategy != 'hybrid':
            summary = self.generate_abstractive_summary(text)
            return {
                'summary': summary,
                'method_used': strategy,
                'metrics': self._proxy_metrics(reference, summary)
            }

        # Generating summaries using both methods
        extractive_summary = self.textrank_summarizer.summarize(text)
        abstractive_summary = self.generate_abstractive_summary(text)
        ext_metrics = self._proxy_metrics(reference, extractive_summary)
        abs_metrics = self._proxy_metrics(reference, abstractive_summary)

        # Adaptive selection using all metrics
        ext_score = (ext_metrics['rouge1'] + ext_metrics['bertscore'] + ext_metrics['meteor']) / 3
        abs_score = (abs_metrics['rouge1'] + abs_metrics['bertscore'] + abs_metrics['meteor']) / 3

        if abs_score > ext_score * 1.1:
            return {'summary': abstractive_summary, 'method_used': 'abstractive',
                    'metrics': abs_metrics}
        if ext_score > abs_score * 1.1:
            return {'summary': extractive_summary, 'method_used': 'extractive',
                    'metrics': ext_metrics}

        # Scores are close: mix sentence by sentence, favouring the better
        # method. Equal odds when both scores are 0 (avoids dividing by 0).
        total_score = ext_score + abs_score
        ext_weight = ext_score / total_score if total_score > 0 else 0.5

        ext_sents = sent_tokenize(extractive_summary)
        abs_sents = sent_tokenize(abstractive_summary)

        # zip stops at the shorter summary.
        combined_sents = [
            ext_sent if np.random.random() < ext_weight else abs_sent
            for ext_sent, abs_sent in zip(ext_sents, abs_sents)
        ]
        final_summary = ' '.join(combined_sents)

        # Report the metrics of the summary actually returned.
        return {
            'summary': final_summary,
            'method_used': 'hybrid',
            'metrics': self._proxy_metrics(reference, final_summary)
        }

def evaluate_hybrid_system(model_path: str = "./bart_wikisum", num_samples: int = 100,
                           csv_path: str = "hybrid_summarization_results.csv",
                           plot_path: str = "summarization_results.png") -> pd.DataFrame:
    """Evaluate the hybrid summarization system on the WikiSum dataset with comprehensive metrics

    Each of the three strategies ('hybrid', 'extractive', 'abstractive') is run on
    every selected validation article and scored against the reference summary
    with ROUGE-1 F-measure, BERTScore and METEOR. Per-sample scores are written
    to a CSV file, the mean scores are printed and drawn as a grouped bar chart.

    Args:
        model_path: Path handed to ``HybridSummarizer``.
        num_samples: Number of validation samples to evaluate, taken from the
            start of the split. Values larger than the split are clamped to
            the split size.
        csv_path: Destination of the per-sample results CSV.
        plot_path: Destination of the bar-chart image.

    Returns:
        A DataFrame with one row per successfully processed sample. Columns are
        ``<method>_<metric>`` for every method/metric pair plus ``method_used``
        (the method the hybrid strategy ended up selecting).

    Raises:
        RuntimeError: If no sample could be processed, so there is nothing to
            aggregate, save or plot.
    """
    # Imported up front so a missing plotting dependency fails before the
    # long-running evaluation loop rather than after it.
    import matplotlib.pyplot as plt

    methods = ['hybrid', 'extractive', 'abstractive']
    metric_names = ['rouge1', 'bertscore', 'meteor']

    dataset = load_dataset("d0rj/wikisum")
    validation = dataset['validation']
    # Clamp the request: selecting indices past the end of the split raises.
    sample_count = min(num_samples, len(validation))
    eval_data = validation.select(range(sample_count))

    summarizer = HybridSummarizer(model_path)
    all_metrics = []

    for item in tqdm(eval_data):
        try:
            reference = item['summary']

            # Generating summaries using different strategies
            # (all three are produced before any scoring, hybrid first)
            summaries = {
                method: summarizer.adaptive_summarize(item['article'], strategy=method)
                for method in methods
            }

            # Calculating all metrics for each method
            sample_metrics = {}
            for method in methods:
                candidate = summaries[method]['summary']
                sample_metrics[f'{method}_rouge1'] = summarizer.rouge_scorer.score(
                    reference, candidate)['rouge1'].fmeasure
                sample_metrics[f'{method}_bertscore'] = summarizer.calculate_bertscore(
                    reference, candidate)
                sample_metrics[f'{method}_meteor'] = summarizer.calculate_meteor(
                    reference, candidate)

            sample_metrics['method_used'] = summaries['hybrid']['method_used']
            all_metrics.append(sample_metrics)

        except Exception as e:
            # Best-effort evaluation: report the failure and move on to the next sample
            print(f"Error processing sample: {str(e)}")
            continue

        # Only reached after a successful sample, i.e. once per ten successes
        if len(all_metrics) % 10 == 0:
            summarizer.clear_memory()

    if not all_metrics:
        # Without this guard the column lookups below fail with an opaque KeyError
        raise RuntimeError(
            "No samples were processed successfully; nothing to evaluate."
        )

    # Calculating and displaying final results
    results_df = pd.DataFrame(all_metrics)
    # Means are computed once and shared by the printed report and the chart
    mean_scores = {
        method: [results_df[f'{method}_{metric}'].mean() for metric in metric_names]
        for method in methods
    }

    print("\nFinal Evaluation Results:")

    # Printing average scores for each method and metric
    for method in methods:
        print(f"\n{method.capitalize()} Method:")
        for metric, score in zip(metric_names, mean_scores[method]):
            print(f"{metric}: {score:.4f}")

    print("\nMethod Selection Distribution:")
    print(results_df['method_used'].value_counts(normalize=True))

    # Saving detailed results
    results_df.to_csv(csv_path, index=False)

    # Creating visualization of results: one group per metric, one bar per method
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(metric_names))
    width = 0.25

    for i, method in enumerate(methods):
        ax.bar(x + i*width, mean_scores[method], width, label=method.capitalize())

    ax.set_ylabel('Score')
    ax.set_title('Comparison of Summarization Methods')
    # Offset by one bar width so each tick sits under the middle bar of its group
    ax.set_xticks(x + width)
    ax.set_xticklabels(['ROUGE-1', 'BERTScore', 'METEOR'])
    ax.legend()

    fig.tight_layout()
    fig.savefig(plot_path)
    plt.close(fig)

    return results_df

# Script entry point: runs the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    print("Initializing evaluation...")
    # Evaluate 50 validation samples instead of the function's default of 100
    results = evaluate_hybrid_system(num_samples=50)
    # The file names below are the ones evaluate_hybrid_system writes by default
    print("\nResults saved to hybrid_summarization_results.csv and summarization_results.png")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/parmar.sa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initializing evaluation...


Downloading builder script: 100%|██████████| 7.95k/7.95k [00:00<00:00, 5.86MB/s]
100%|██████████| 50/50 [06:07<00:00,  7.35s/it]


Final Evaluation Results:

Hybrid Method:
rouge1: 0.3952
bertscore: 0.6026
meteor: 0.2890

Extractive Method:
rouge1: 0.3964
bertscore: 0.5999
meteor: 0.3028

Abstractive Method:
rouge1: 0.4242
bertscore: 0.6144
meteor: 0.3003

Method Selection Distribution:
method_used
extractive     0.64
hybrid         0.28
abstractive    0.08
Name: proportion, dtype: float64

Results saved to hybrid_summarization_results.csv and summarization_results.png



