### Install Dependencies and Import Libraries

In [None]:
# Install the notebook's third-party dependencies with pip (notebook shell command)
!pip install transformers datasets peft bitsandbytes accelerate sentencepiece rouge-score chromadb langchain sentence-transformers prettytable torch numpy pandas tqdm

In [None]:
# Install the langchain-community package (distributed separately from langchain)
!pip install langchain-community

In [None]:
# Attach Google Drive to the Colab filesystem so files stored there can be
# read and written through regular paths
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [2]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from rouge_score import rouge_scorer
from prettytable import PrettyTable
import numpy as np
from tqdm import tqdm
import bitsandbytes as bnb
import chromadb
import torch
import pandas as pd
import time
import sys


In [None]:
# Report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(cuda_available)

### Load Dataset

In [None]:
# Load CNN/DailyMail (config 3.0.0) and shuffle every split with a fixed seed
# so the subsets below are reproducible
dataset = load_dataset("cnn_dailymail", "3.0.0").shuffle(seed=42)

# Number of leading (post-shuffle) examples kept from each split
SPLIT_SIZES = {"train": 8000, "validation": 1000, "test": 1000}

train_dataset, val_dataset, test_dataset = (
    dataset[split_name].select(range(num_examples))
    for split_name, num_examples in SPLIT_SIZES.items()
)

### Data Preprocessing

In [None]:
# Inspect the shape of the 8,000-example training subset selected above
train_dataset.shape

In [None]:
# Load the tokenizer published with the FLAN-T5 base checkpoint
TOKENIZER_CHECKPOINT = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# Function to preprocess text data
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        examples: Batch mapping with an "article" list and a "highlights"
            list, as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized prompts (truncated/padded to 512 tokens) with an added
        "labels" entry holding the summary token ids (truncated/padded to
        128 tokens), where every padding position is replaced by -100.
    """
    # Format inputs as "Summarize: {article}"
    inputs = ["Summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the summaries
    labels = tokenizer(
        examples["highlights"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # The labels are padded to a fixed length. Replace the padding ids with
    # -100 (the index ignored by the cross-entropy loss) so the model is not
    # trained to predict padding tokens.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the training and validation subsets
def _tokenize_split(split):
    """Run preprocess_function over *split* in batches and drop its original columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )


processed_train_dataset = _tokenize_split(train_dataset)
processed_val_dataset = _tokenize_split(val_dataset)

### Load Base Model with Quantization

In [9]:
# Quantization settings: request 8-bit loading (load_in_8bit=True); this config
# is passed to from_pretrained when the base model is loaded below
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the pre-trained FLAN-T5 base checkpoint using the quantization config
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base",
    device_map="auto",
    quantization_config=quantization_config,
)

# LoRA settings for parameter-efficient fine-tuning
peft_config = LoraConfig(
    r=16,                          # Rank of the update matrices
    lora_alpha=32,                 # Scale parameter
    lora_dropout=0.1,              # Dropout probability
    target_modules=["q", "v"],     # Modules named "q" and "v" receive adapters
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Prepare the quantized model for k-bit training, then attach the LoRA adapters
model = get_peft_model(prepare_model_for_kbit_training(base_model), peft_config)

# Print how many parameters remain trainable after wrapping
model.print_trainable_parameters()


### Fine-Tuning Setup

In [None]:
# Directory on Google Drive that receives the final model and tokenizer
FINAL_MODEL_DIR = "/content/drive/MyDrive/flan-t5-summarization-final"

# Optimisation hyper-parameters
optimization_options = dict(
    learning_rate=5e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # 4 batches of 4 are accumulated per optimizer step
    fp16=True,                       # Mixed precision training
)

# Evaluation and generation settings
evaluation_options = dict(
    evaluation_strategy="steps",     # Evaluate at regular step intervals
    eval_steps=500,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=128,
)

# Checkpointing and logging settings
bookkeeping_options = dict(
    output_dir="./flan-t5-summarization",
    save_total_limit=3,              # Cap on the number of checkpoints kept on disk
    report_to="tensorboard",
)

training_args = Seq2SeqTrainingArguments(
    **bookkeeping_options,
    **evaluation_options,
    **optimization_options,
)

# Build the trainer around the LoRA-wrapped model and the tokenized datasets
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model, then save the model and tokenizer to Google Drive
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_MODEL_DIR)

### Set Up Retrieval-Augmented Summarization (RAG)

In [None]:
# Embedding model used to vectorize text for the knowledge base (all-MiniLM-L6-v2)
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# ChromaDB client stored under ./chroma_db, wrapped as a LangChain vector store
# for retrieval-augmented generation (RAG)
chroma_client = chromadb.PersistentClient("./chroma_db")
vector_store_options = {
    "client": chroma_client,
    "collection_name": "summarization_knowledge_base",
    "embedding_function": embedding_model,
}
vector_db = Chroma(**vector_store_options)

# Function to ingest a knowledge-base text file into the vector DB
def process_knowledge_documents(documents_path):
    """Load a text file, split it into overlapping chunks and index them.

    Args:
        documents_path: Path of the text file handed to ``TextLoader``.

    The file is split into chunks of up to 1000 characters with a
    200-character overlap, and the chunks are added to the module-level
    ``vector_db``. Nothing is returned.
    """
    loaded_documents = TextLoader(documents_path).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    vector_db.add_documents(chunker.split_documents(loaded_documents))

# Summarization pipelines already built for a model name/path. Building a
# pipeline from a name/path loads the model weights, so each one is built once
# and reused across calls.
_RAG_SUMMARIZER_CACHE = {}


def _get_rag_summarizer(fine_tuned_model):
    """Return a summarization pipeline for *fine_tuned_model*, reusing a cached one for names/paths."""
    device = 0 if torch.cuda.is_available() else -1

    if not isinstance(fine_tuned_model, str):
        # An already-loaded model object: wrap it directly; nothing is cached
        # because no weights have to be read from disk.
        return pipeline(
            "summarization",
            model=fine_tuned_model,
            tokenizer=tokenizer,
            device=device
        )

    if fine_tuned_model not in _RAG_SUMMARIZER_CACHE:
        _RAG_SUMMARIZER_CACHE[fine_tuned_model] = pipeline(
            "summarization",
            model=fine_tuned_model,
            tokenizer=tokenizer,
            device=device
        )
    return _RAG_SUMMARIZER_CACHE[fine_tuned_model]


# Function to generate summaries using RAG-enhanced approach
def rag_enhanced_summarization(article_text, fine_tuned_model, top_k=3):
    """Summarize an article with context retrieved from the vector database.

    Args:
        article_text: Full text of the article to summarize.
        fine_tuned_model: Model name/path (or loaded model object) passed to
            the "summarization" pipeline. Pipelines created from a name/path
            are cached, so a checkpoint that changes on disk under the same
            path is not picked up until the cache entry is removed.
        top_k: Number of knowledge-base chunks to retrieve.

    Returns:
        The generated summary text.
    """
    # 1. Retrieve relevant context from knowledge base
    retrieved_docs = vector_db.similarity_search(article_text, k=top_k)
    relevant_context = "\n".join([doc.page_content for doc in retrieved_docs])

    # 2. Enhance the input prompt with retrieved context
    enhanced_prompt = f"""Summarize this article using the provided context where relevant.

    Context: {relevant_context}

    Article: {article_text}
    """

    # 3. Generate summary using fine-tuned model (pipeline reused across calls)
    summarizer = _get_rag_summarizer(fine_tuned_model)

    summary = summarizer(
        enhanced_prompt,
        max_length=128,
        min_length=30,
        do_sample=False
    )[0]["summary_text"]

    return summary

### Compute ROUGE Scores

In [None]:
# ROUGE variants used to measure summarization quality (stemming enabled)
ROUGE_METRICS = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(ROUGE_METRICS, use_stemmer=True)

# Function to evaluate summarization models
def evaluate_model(model, test_dataset, model_name, use_rag=False, max_examples=100):
    """Score a summarization model on a sample of test examples with ROUGE.

    Args:
        model: Model name/path (or loaded model) handed to the
            "summarization" pipeline.
        test_dataset: Iterable of examples with "article" and "highlights"
            entries.
        model_name: Label used in the printed report and the returned metrics.
        use_rag: When True, summaries are produced by
            ``rag_enhanced_summarization`` instead of the plain
            "Summarize: " prompt.
        max_examples: Maximum number of examples to evaluate (default 100);
            ``None`` evaluates every example in ``test_dataset``.

    Returns:
        A tuple ``(per_example_df, metrics)``: a DataFrame with one row per
        evaluated example, and a dict with the model label, the RAG flag, the
        mean ROUGE-1/2/L F-measures and the mean inference time in seconds.
    """
    # Load the model once; the same weights serve both the plain and RAG paths
    summarizer = pipeline(
        "summarization",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )

    results = []

    # Iterate through test dataset and evaluate model performance
    for i, example in enumerate(tqdm(test_dataset)):
        # Only process a subset for efficiency
        if max_examples is not None and i >= max_examples:
            break

        article = example["article"]
        reference_summary = example["highlights"]

        # Measure inference time
        start_time = time.time()

        if use_rag:
            # Hand over the already-loaded model object rather than the
            # name/path, so the weights are not reloaded for every article
            # inside the timed region.
            generated_summary = rag_enhanced_summarization(article, summarizer.model)
        else:
            input_text = "Summarize: " + article
            generated_summary = summarizer(
                input_text,
                max_length=128,
                min_length=30,
                do_sample=False
            )[0]["summary_text"]

        inference_time = time.time() - start_time

        # Calculate ROUGE scores
        scores = scorer.score(reference_summary, generated_summary)

        # Save evaluation results
        results.append({
            'article': article[:200] + "...",  # Truncate for display
            'reference': reference_summary,
            'generated': generated_summary,
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure,
            'inference_time': inference_time
        })

    # Calculate average scores
    avg_rouge1 = np.mean([row['rouge1'] for row in results])
    avg_rouge2 = np.mean([row['rouge2'] for row in results])
    avg_rougeL = np.mean([row['rougeL'] for row in results])
    avg_inference_time = np.mean([row['inference_time'] for row in results])

    print(f"--- Evaluation Results for {model_name} {'with RAG' if use_rag else ''} ---")
    print(f"ROUGE-1: {avg_rouge1:.4f}")
    print(f"ROUGE-2: {avg_rouge2:.4f}")
    print(f"ROUGE-L: {avg_rougeL:.4f}")
    print(f"Average inference time: {avg_inference_time:.4f} seconds")

    return pd.DataFrame(results), {
        'model': model_name,
        'rag': use_rag,
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL,
        'inference_time': avg_inference_time
    }

# Run comprehensive evaluation
eval_results = []


# 1. Evaluate base model
base_results, base_metrics = evaluate_model(
    "google/flan-t5-base",
    test_dataset,
    "FLAN-T5-Base"
)
eval_results.append(base_metrics)

# 2. Evaluate fine-tuned model
ft_results, ft_metrics = evaluate_model(
    "/content/drive/MyDrive/flan-t5-summarization-final",
    test_dataset,
    "Fine-tuned FLAN-T5"
)
eval_results.append(ft_metrics)

# 3. Evaluate RAG-enhanced model
rag_results, rag_metrics = evaluate_model(
    "/content/drive/MyDrive/flan-t5-summarization-final",
    test_dataset,
    "Fine-tuned FLAN-T5",
    use_rag=True
)
eval_results.append(rag_metrics)

# Create comparison table and save it to CSV
comparison_df = pd.DataFrame(eval_results)
comparison_df.to_csv("evaluation_results.csv", index=False)

# Keep the table as the cell's last expression so the notebook renders it
# (a bare expression in the middle of a cell is not displayed)
comparison_df


In [None]:
# Summarization pipelines used by display_summary_comparison, keyed by model
# name/path, so the models are loaded once rather than on every call.
_COMPARISON_SUMMARIZERS = {}


def _comparison_summarizer(model_name):
    """Return a cached "summarization" pipeline for *model_name*, building it on first use."""
    if model_name not in _COMPARISON_SUMMARIZERS:
        _COMPARISON_SUMMARIZERS[model_name] = pipeline(
            "summarization",
            model=model_name,
            # Same device selection as the evaluation code
            device=0 if torch.cuda.is_available() else -1
        )
    return _COMPARISON_SUMMARIZERS[model_name]


def display_summary_comparison(example_id=0, output_file=sys.stdout):
    """Write a side-by-side comparison of the three approaches for one test example.

    Args:
        example_id: Index into ``test_dataset`` of the example to summarize.
        output_file: Object with a ``write`` method that receives the report
            (defaults to standard output).

    The report contains an excerpt of the article, the reference summary,
    the summaries of the base model, the fine-tuned model and the
    RAG-enhanced approach, and a table of their ROUGE-1/2/L F-measures.
    """
    example = test_dataset[example_id]
    article = example["article"]
    reference = example["highlights"]

    # Get summaries from different approaches
    summarizer_base = _comparison_summarizer("google/flan-t5-base")
    summarizer_ft = _comparison_summarizer("/content/drive/MyDrive/flan-t5-summarization-final")

    base_summary = summarizer_base("Summarize: " + article, max_length=128)[0]["summary_text"]
    ft_summary = summarizer_ft("Summarize: " + article, max_length=128)[0]["summary_text"]
    rag_summary = rag_enhanced_summarization(article, "/content/drive/MyDrive/flan-t5-summarization-final")

    output_file.write("="*80 + "\n")
    output_file.write("ARTICLE EXCERPT (first 300 chars):\n")
    output_file.write(article[:300] + "...\n")
    text_sections = (
        ("REFERENCE SUMMARY", reference),
        ("BASE MODEL SUMMARY", base_summary),
        ("FINE-TUNED MODEL SUMMARY", ft_summary),
        ("RAG-ENHANCED SUMMARY", rag_summary),
    )
    for heading, text in text_sections:
        output_file.write(f"\n{heading}:\n")
        output_file.write(text + "\n")
    output_file.write("="*80 + "\n")

    # Calculate ROUGE scores for each approach and tabulate them
    scores_table = PrettyTable()
    scores_table.field_names = ["Model", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    labelled_summaries = (
        ("Base FLAN-T5", base_summary),
        ("Fine-tuned FLAN-T5", ft_summary),
        ("RAG-enhanced", rag_summary),
    )
    for label, summary in labelled_summaries:
        rouge = scorer.score(reference, summary)
        scores_table.add_row([label,
                              f"{rouge['rouge1'].fmeasure:.4f}",
                              f"{rouge['rouge2'].fmeasure:.4f}",
                              f"{rouge['rougeL'].fmeasure:.4f}"])

    output_file.write(str(scores_table) + "\n")


# Write the comparison report to Google Drive. The encoding is set explicitly
# so non-ASCII characters in articles and summaries are written the same way
# regardless of the platform's default encoding.
output_path = "/content/drive/MyDrive/flan-t5-summarization-final/comparison_results.txt"
with open(output_path, "w", encoding="utf-8") as output_file:
    # Display comparisons for 3 different examples
    for example_id in range(3):
        display_summary_comparison(example_id, output_file)
        output_file.write("\n")
