### Setup: imports for the PDF question-answering pipeline

In [1]:
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from langchain.llms import Ollama
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Extract text from PDF
def load_pdf(file_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` result of each page, in page order, joined
        with a single space.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through. The join runs
    # inside the `with` body, so every page is read before the document closes.
    with fitz.open(file_path) as doc:
        return " ".join(page.get_text() for page in doc)


In [4]:
# Step 2: Chunk text
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split a long string into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 100.

    Returns:
        Whatever ``splitter.split_text(text)`` returns (the chunks).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    print("🔪 Splitting into chunks...")
    chunks = splitter.split_text(text)
    # Cosmetic only: the splitting is already finished at this point. The loop
    # does no work and exists purely to show a bar whose total is the number
    # of chunks produced.
    for _ in tqdm(range(len(chunks)), desc="Processing chunks"):
        pass
    return chunks




In [5]:
# Step 3: Create vector DB with local embedding model
def create_vector_db(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed text chunks with a local model and index them in FAISS.

    Args:
        chunks: Non-empty sequence of text chunks to index.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``"all-MiniLM-L6-v2"``.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # A PDF without extractable text (e.g. a scanned image) yields no chunks.
    # Fail early with a clear message, before the embedding model is loaded.
    if not chunks:
        raise ValueError("Cannot build a vector DB from an empty list of chunks.")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunks, embedding=embeddings)


In [6]:
# Step 4: Run local LLM with Ollama (LLaMA 3)
def ask_question(vector_db, query, model="llama3"):
    """Answer a question using retrieval over the vector store and a local LLM.

    Args:
        vector_db: Vector store exposing ``as_retriever()``.
        query: The question text handed to the QA chain.
        model: Name of the Ollama model to use; it must already be pulled
            with Ollama. Defaults to ``"llama3"``.

    Returns:
        The result of running the ``RetrievalQA`` chain on ``query``.
    """
    llm = Ollama(model=model)
    retriever = vector_db.as_retriever()
    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return qa.run(query)

In [16]:
# Main pipeline
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path; point this at your own
    # PDF before running on another machine.
    file_path = "C:/Users/viru_/OneDrive/Desktop/PDF_INTERAX/Resume.pdf"

    print("📄 Loading PDF...")
    raw_text = load_pdf(file_path)

    print("🔪 Chunking text...")
    chunks = chunk_text(raw_text)

    print("📦 Creating vector DB with free embeddings...")
    vector_db = create_vector_db(chunks)

    print("💬 Ask questions about your document (type 'exit' to stop):")
    while True:
        try:
            # Strip so that inputs such as " exit " still end the session.
            query = input("Q: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C / kernel interrupt at the prompt: leave the
            # loop quietly instead of surfacing a traceback.
            print()
            break
        if not query:
            # Nothing was typed; prompt again rather than querying the LLM.
            continue
        if query.lower() in ("exit", "quit"):
            break
        answer = ask_question(vector_db, query)
        print("A:", answer)


📄 Loading PDF...
🔪 Chunking text...
🔪 Splitting into chunks...


Processing chunks: 100%|██████████| 9/9 [00:00<00:00, 77512.80it/s]

📦 Creating vector DB with free embeddings...





KeyboardInterrupt: 