✅ Extracts text from a PDF.
✅ Splits it into overlapping chunks.
✅ Uses a Hugging Face QA model.
✅ Asks the question over each chunk.
✅ Returns the best answer based on the model's confidence score.

In [19]:
import pdfplumber
from transformers import pipeline
import math

In [20]:
import logging
# Raise the "pdfminer" logger's threshold to ERROR so that lower-severity
# records from that logger are not emitted.
# NOTE(review): presumably this quiets warnings produced while parsing PDFs
# further down — confirm against the PDF library in use.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

In [21]:
# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are skipped.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The page texts joined by a single newline character; an empty string
        when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once. The previous version called
        # extract_text() twice per page (once to filter, once for the value),
        # doubling the cost of the extraction step.
        page_texts = (page.extract_text() for page in pdf.pages)
        # The join consumes the generator while the PDF is still open.
        return "\n".join(text for text in page_texts if text)


In [22]:
# Step 1: Load text from .txt file
def load_text_from_file(txt_path):
    """Read a UTF-8 encoded text file and return its whole content as a string.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The complete file content.
    """
    with open(txt_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents


In [23]:
# ---- Run configuration -------------------------------------------------

# Input locations: the PDF to query, and an (currently empty) text-file path.
pdf_path = "CIV_1798.160.pdf"
txt_path = ""

# Question put to the QA model for every chunk.
question = "What are the main topics discussed in this document?"

# Chunk size used when splitting the document text.
max_tokens = 450

# Question-answering checkpoint and the tokenizer that goes with it.
model_name = "deepset/roberta-base-squad2"
model_tokenizer = "deepset/roberta-base-squad2"
# Alternative checkpoint that was tried (kept disabled):
#   model_name = "allenai/longformer-base-4096"
#   model_tokenizer = "allenai/longformer-base-4096"

In [24]:
# Step 2: Chunk text into overlapping pieces
def chunk_text(text, max_tokens=max_tokens, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Despite the parameter name, sizes are counted in words produced by
    ``str.split()``, not in model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The
        list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    # A stride of zero or less would never advance through the text; the
    # original loop spun forever in that case.
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got {overlap}"
        )

    words = text.split()
    stride = max_tokens - overlap
    chunks = []
    for start in range(0, len(words), stride):
        end = start + max_tokens
        chunks.append(" ".join(words[start:end]))
        # Stop once a chunk reaches the last word; continuing would only emit
        # a tail chunk that is entirely contained in this one.
        if end >= len(words):
            break
    return chunks


In [25]:
# Step 3: Run QA model on each chunk
def get_best_answer(chunks, question, model_name=model_name, tokenizer=model_tokenizer):
    """Ask the question against every chunk and return the top-scoring answer.

    A Hugging Face ``question-answering`` pipeline is built once and then run
    on each chunk; per-chunk results are printed as they arrive. A chunk whose
    processing raises is reported and skipped rather than aborting the run.

    Args:
        chunks: Iterable of context strings to search.
        question: The question to ask over each context.
        model_name: Model identifier passed to ``pipeline(model=...)``.
        tokenizer: Tokenizer identifier passed to ``pipeline(tokenizer=...)``.

    Returns:
        The answer text with the highest score across all chunks, or ``None``
        when no chunk produced a result.
    """
    # Use the ``tokenizer`` argument: the previous version always passed the
    # module-level ``model_tokenizer`` here, silently ignoring the caller's
    # choice and risking a tokenizer that does not match ``model_name``.
    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=tokenizer)
    best_answer = None
    best_score = -math.inf  # any real score beats this

    for i, context in enumerate(chunks):
        try:
            result = qa_pipeline(question=question, context=context)
            print(f"‚úÖ Chunk {i+1} ‚Üí Answer: {result['answer']} (score: {result['score']:.4f})")
            if result["score"] > best_score:
                best_score = result["score"]
                best_answer = result["answer"]
        except Exception as e:
            # Deliberate best-effort: report the failing chunk and carry on.
            print(f"‚ö†Ô∏è Skipping chunk {i+1}: {e}")

    return best_answer


In [26]:
# Step 4: Run the whole flow
def answer_question_from_pdf(pdf_path, question):
    """Answer a question about a PDF: extract, chunk, query, report.

    Progress messages are printed before each stage and the winning answer is
    printed at the end.

    Args:
        pdf_path: Path of the PDF handed to ``extract_text_from_pdf``.
        question: Question forwarded to ``get_best_answer``.

    Returns:
        Whatever ``get_best_answer`` returns for the document's chunks.
    """
    # Stage 1: pull the raw text out of the PDF.
    print("üîç Extracting text...")
    document_text = extract_text_from_pdf(pdf_path)

    # Stage 2: cut it into overlapping pieces (50 words shared between
    # neighbours, size taken from the module-level ``max_tokens``).
    print("üß± Splitting into chunks...")
    pieces = chunk_text(document_text, max_tokens=max_tokens, overlap=50)

    # Stage 3: query every piece and keep the best-scoring answer.
    print(f"üí¨ Asking: {question}")
    answer = get_best_answer(pieces, question)

    print(f"\nüéØ Best Answer: {answer}")
    return answer


In [27]:
# Usage: run the full pipeline on the configured PDF and question.
# As the last expression of the cell, the returned answer is also displayed.
answer_question_from_pdf(pdf_path, question)


üîç Extracting text...
üß± Splitting into chunks...
üí¨ Asking: What are the main topics discussed in this document?


Device set to use mps:0


‚úÖ Chunk 1 ‚Üí Answer: promote and protect consumer privacy, educate children in the area of online privacy (score: 0.0003)
‚úÖ Chunk 2 ‚Üí Answer: grant program (score: 0.0009)

üéØ Best Answer: grant program


'grant program'

In [28]:
#!pip install evaluate

In [30]:

import evaluate

# SQuAD-style metric from the `evaluate` library (reports exact match and F1).
squad_metric = evaluate.load("squad")

# One hand-written example pair: the expected answer and a candidate answer.
references = ["Contractor may terminate the agreement with 30 days' written notice."]
predictions = ["Either party may terminate with 30 days' notice."]

# Reshape both lists into the record layout passed to the metric below.
# Each record's id is its list position as a string, so pairs line up.
formatted_predictions = [
    {"prediction_text": predicted, "id": str(position)}
    for position, predicted in enumerate(predictions)
]
formatted_references = [
    {"answers": {"text": [expected], "answer_start": [0]}, "id": str(position)}
    for position, expected in enumerate(references)
]

# Score the predictions against the references.
results = squad_metric.compute(
    predictions=formatted_predictions,
    references=formatted_references,
)

print("‚úÖ Exact Match (EM):", results["exact_match"])
print("‚úÖ F1 Score:", results["f1"])


‚úÖ Exact Match (EM): 0.0
‚úÖ F1 Score: 70.58823529411765
