## Loading Data

In [142]:

import fitz  
import markdown
from pathlib import Path

def load_document(file_path):
    ext = Path(file_path).suffix.lower()
    if ext == ".pdf":
        return load_pdf(file_path)
    elif ext == ".txt":
        return Path(file_path).read_text()
    elif ext == ".md":
        return markdown.markdown(Path(file_path).read_text())
    else:
        raise ValueError("Unsupported file format")

def load_pdf(file_path):
    doc = fitz.open(file_path)
    text = "\n".join([page.get_text() for page in doc])
    return text
text = load_document("article2.txt")
print(text)

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported ¬£20 million 
($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. 
Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of 
gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast 
cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, 
suddenly buy themselves a massive sports car collection or something similar," he told an Australian 
interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are 
things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a 
casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number 
one movie on the UK box office chart. Deta

## Preprocessing Data

In [143]:
import re
import unicodedata
from langdetect import detect
from textblob import TextBlob

def preprocess_text(text: str, lang_filter='en', spellcheck=True) -> str:
    # Normalize unicode characters
    text = unicodedata.normalize("NFKC", text)

    # Remove non-breaking spaces and other noise
    text = text.replace("\xa0", " ").replace("\u200b", "")

    # Fix hyphenated line breaks (PDF artifacts)
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # Normalize line spacing
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    # Remove markdown, HTML, boilerplate
    text = re.sub(r'[#*\[\]\(\)]', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'Page \d+ of \d+', '', text)

    # Split into paragraphs for filtering
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    clean_paragraphs = []

    for para in paragraphs:
        try:
            # Detect language
            lang = detect(para)
            if lang != lang_filter:
                continue  # Skip non-English
        except:
            continue  # Skip detection errors

        # Spellcheck if enabled
        if spellcheck:
            para = str(TextBlob(para).correct())

        clean_paragraphs.append(para)

    return '\n\n'.join(clean_paragraphs)

 
text = preprocess_text(text)
print(text)

LONDON, England Letters -- Carry Other star Daniel Radcliffe gains access to a reported ¬£20 million

$41.1 million fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.

Daniel Radcliffe as Carry Other in "Carry Other and the Order of the Phoenix" To the disappointment of

gossip colonists around the world, the young actor says he has no plans to written his cash away on fast

cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18,

suddenly buy themselves a massive sports car collection or something similar," he told an Australian

interview earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are

things that cost about 10 pounds -- books and was and DVDs." It 18, Radcliffe will be able to gamble in a

causing, buy a drink in a pub or see the horror film "Hostel: Part of," currently six places below his number

one movie on the of box office chart. Details of ho

## semantic chunking

In [144]:

import spacy

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def semantic_chunk(text, max_tokens=200):
    """
    Break the document into semantically meaningful chunks (paragraph/sentence level).
    Each chunk contains up to `max_tokens` words (approximate).
    """
    doc = nlp(text)
    current_chunk = []
    current_length = 0
    chunks = []

    for sent in doc.sents:
        sent_text = sent.text.strip()
        sent_len = len(sent_text.split())

        # If adding this sentence doesn't exceed max_tokens, add to current chunk
        if current_length + sent_len <= max_tokens:
            current_chunk.append(sent_text)
            current_length += sent_len
        else:
            # Save current chunk and start a new one
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sent_text]
            current_length = sent_len

    # Add any remaining chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks



def semantic_chunk_with_paragraphs(text, max_tokens=200):
    paragraphs = text.split('\n\n')  # naive paragraph split
    chunks = []

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # Segment into sub-chunks using sentences if too long
        if len(para.split()) > max_tokens:
            chunks.extend(semantic_chunk(para, max_tokens))
        else:
            chunks.append(para)
    
    return chunks


chunks = semantic_chunk_with_paragraphs(text)

for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1} ({len(chunk.split())} words):\n{chunk}\n")


Chunk 1 (16 words):
LONDON, England Letters -- Carry Other star Daniel Radcliffe gains access to a reported ¬£20 million

Chunk 2 (20 words):
$41.1 million fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.

Chunk 3 (18 words):
Daniel Radcliffe as Carry Other in "Carry Other and the Order of the Phoenix" To the disappointment of

Chunk 4 (20 words):
gossip colonists around the world, the young actor says he has no plans to written his cash away on fast

Chunk 5 (21 words):
cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18,

Chunk 6 (15 words):
suddenly buy themselves a massive sports car collection or something similar," he told an Australian

Chunk 7 (17 words):
interview earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are

Chunk 8 (22 words):
things that cost about 10 pounds -- books and was and DVDs." It 18, Radcliffe will be able to gamble in a

Ch

## Embedding & Retrieval

In [145]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Example text


# Chunk the text
chunks = semantic_chunk_with_paragraphs(text)

# Deduplicate chunks (fix repeated semantic results)
unique_chunks = list(dict.fromkeys([chunk.strip() for chunk in chunks if chunk.strip()]))

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_dim = 384

# Store vectors
chunk_texts = []
chunk_embeddings = []

print("Embedding chunks...")
for chunk in tqdm(unique_chunks):
    try:
        embedding = model.encode(chunk, convert_to_numpy=True)
        if not np.isnan(embedding).any():
            embedding = embedding / np.linalg.norm(embedding)  # normalize for cosine similarity
            chunk_embeddings.append(embedding)
            chunk_texts.append(chunk)
    except Exception as e:
        print(f"Error: {e}")

# Convert to matrix
embedding_matrix = np.vstack(chunk_embeddings).astype("float32")

# Build FAISS index (cosine similarity)
index = faiss.IndexFlatIP(embedding_dim)
index.add(embedding_matrix)
def search_chunks(query, top_k=5):
    q_emb = model.encode(query, convert_to_numpy=True)
    q_emb = q_emb / np.linalg.norm(q_emb)
    q_emb = q_emb.astype("float32").reshape(1, -1)
    
    distances, indices = index.search(q_emb, top_k)
    return [(chunk_texts[i], distances[0][j]) for j, i in enumerate(indices[0])]
query = "Summarize this document"
results = search_chunks(query, top_k=5)

for i, (chunk, score) in enumerate(results):
    print(f"Chunk {i+1} (Score: {score:.4f}):\n{chunk}\n")


Embedding chunks...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:00<00:00, 39.37it/s]

Chunk 1 (Score: 0.1899):
friend . Copyright 2007 Letters. All rights reserved.His material may not be published, broadcast,

Chunk 2 (Score: 0.1694):
an interview. "Hopefully none of you will be reading about it." Radcliffe's earnings from the first five

Chunk 3 (Score: 0.1370):
rewritten, or redistribute.

Chunk 4 (Score: 0.1124):
one movie on the of box office chart. Details of how he'll mark his landmarks birthday are under wraps.

Chunk 5 (Score: 0.1082):
teenager in Peter Shafter's "Quos." Meanwhile, he is braced for even closer media scrutiny now that






## Summary Generation

In [146]:
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import time

# Load summarizer and tokenizer
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")

# Deduplicate top_chunks
seen = set()
deduped_chunks = []
for text, _ in results:
    cleaned = text.strip()
    if cleaned not in seen:
        deduped_chunks.append(cleaned)
        seen.add(cleaned)

# Combine deduplicated text
combined_text = "\n".join(deduped_chunks)

# Token-aware truncation (limit to 1024 tokens for summarization model)
tokens = tokenizer.tokenize(combined_text)
token_count = len(tokens)
if token_count > 1024:
    tokens = tokens[:1024]
    combined_text = tokenizer.convert_tokens_to_string(tokens)

# Summarize and measure latency
start = time.time()
summary = summarizer(combined_text, max_length=180, min_length=50, do_sample=False)[0]['summary_text']
end = time.time()
latency = end - start

# Similarity score
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode([combined_text, summary], convert_to_tensor=True)
similarity_score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

# Output
print("\nüß† Final Summary:\n")
print(summary)
print(f"\nüìä Token Count: {token_count}")
print(f"‚è±Ô∏è Latency: {latency:.2f} seconds")
print(f"üìà Similarity Score to Original Text: {similarity_score:.4f}")


Device set to use cpu
Your max_length is set to 180, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Both `max_new_tokens` (=256) and `max_length`(=180) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



üß† Final Summary:

Radcliffe's earnings from the first five rewritten, or redistribute . Details of how he'll mark his landmarks birthday are under wraps . He is braced for even closer media scrutiny .

üìä Token Count: 111
‚è±Ô∏è Latency: 5.81 seconds
üìà Similarity Score to Original Text: 0.7974


In [147]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import time

# Load summarizer (free, works on CPU or GPU)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Combine retrieved chunks
combined_text = "\n".join([text for text, _ in results])
combined_text = combined_text[:3000]  

# Token count approximation
token_count = len(combined_text.split())

# Summarize
start = time.time()
summary = summarizer(combined_text, max_length=180, min_length=50, do_sample=False)[0]['summary_text']
end = time.time()
latency = end - start

# Similarity Score
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode([combined_text, summary], convert_to_tensor=True)
similarity_score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

# Output
print("\nüß† Final Summary (BART):\n")
print(summary)
print(f"\nüìä Token Count: {token_count}")
print(f"‚è±Ô∏è Latency: {latency:.2f} seconds")
print(f"üìà Similarity Score to Original Text: {similarity_score:.4f}")


Device set to use cpu
Your max_length is set to 180, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)



üß† Final Summary (BART):

"Hopefully none of you will be reading about it," Radcliffe says. Details of how he'll mark his landmarks birthday are under wraps. He is braced for even closer media scrutiny now that "Quos" is in cinemas.

üìä Token Count: 69
‚è±Ô∏è Latency: 16.82 seconds
üìà Similarity Score to Original Text: 0.7680
