✅ Extracts text from a PDF. 
✅ Splits it into overlapping chunks. 
✅ Uses a Hugging Face question-answering (QA) model. 
✅ Asks a question over each chunk. 
✅ Returns the best answer based on confidence. 

In [20]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import os
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer, BertConfig, pipeline
#from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
import pdfplumber
import textwrap

In [21]:
import logging

# Raise the threshold of the "pdfminer" logger so that only records at
# ERROR level or above are emitted (presumably to silence warnings produced
# while PDFs are parsed -- confirm this is the logger behind the noise).
_pdfminer_logger = logging.getLogger("pdfminer")
_pdfminer_logger.setLevel(logging.ERROR)

In [22]:
# Load and save PDF text to a .txt file (LlamaIndex can read from files)
def save_pdf_as_txt(pdf_path, txt_path):
    """Extract the text of every page of a PDF and write it to a text file.

    The text of each page is joined with a single newline. Pages for which
    ``extract_text()`` returns nothing (``None`` or an empty string) are
    skipped.

    Args:
        pdf_path: Path of the PDF to read; opened with ``pdfplumber.open``.
        txt_path: Path of the UTF-8 text file to write. An existing file is
            overwritten, and missing parent directories are created.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Call extract_text() once per page and reuse the result for both the
        # emptiness check and the join. The generator is consumed inside the
        # ``with`` block, while the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)

    # Make sure the destination directory exists so the write below does not
    # fail with FileNotFoundError on a fresh checkout.
    parent_dir = os.path.dirname(txt_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)

# Extract the text of CIV_1798.160.pdf into ./doc/pdf_text_1.txt; the QA cell
# further down reads that same file back as its context.
save_pdf_as_txt("CIV_1798.160.pdf", "./doc/pdf_text_1.txt")

In [23]:

# --- 1. Load the source documents with LlamaIndex -------------------------
# NOTE(review): this reads the "./docs" directory, whereas the context file
# opened further down lives under "doc/" -- confirm which directory is
# intended. `documents` is not used again within this cell.
documents = SimpleDirectoryReader("./docs").load_data()

# --- 2. Build the extractive question-answering pipeline ------------------
from transformers import pipeline

# The model weights and the tokenizer come from the same checkpoint, so the
# identifier is spelled out once.
# (Alternative checkpoint tried previously: "allenai/longformer-base-4096".)
LEGAL_QA_CHECKPOINT = "atharvamundada99/bert-large-question-answering-finetuned-legal"

qa_pipeline = pipeline(
    "question-answering",
    model=LEGAL_QA_CHECKPOINT,
    tokenizer=LEGAL_QA_CHECKPOINT,
)

# --- 3. Read the whole extracted text to use as the QA context ------------
with open("doc/pdf_text_1.txt", "r", encoding="utf-8") as f:
    context = f.read()

# --- 4. Ask a single question over the full context -----------------------
question = "What are the main topics discussed in the document?"

result = qa_pipeline(question=question, context=context)
print("💡 Answer:", result["answer"])

Device set to use mps:0


💡 Answer: begin administering the
grant program
