In [1]:
import os
import uuid
import numpy as np
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List,Any
import chromadb



  from .autonotebook import tqdm as notebook_tqdm





In [2]:
import fitz  # PyMuPDF
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
from PIL import Image
import io
from langchain_core.documents import Document

def process_pdf_hybrid(pdf_path: str, text_threshold: int = 50):
    """Extract one ``Document`` per PDF page, falling back to OCR for sparse pages.

    The embedded text layer of each page is read first. When the stripped text
    is shorter than ``text_threshold`` characters, the page is rendered to an
    image and run through Tesseract instead.

    Args:
        pdf_path: Path to the PDF file. Backslash and forward-slash separators
            are both accepted when deriving the ``source_file`` metadata.
        text_threshold: Minimum length of the stripped text layer below which
            the page is OCR'd.

    Returns:
        A list of ``Document`` objects in page order. Each carries the stripped
        page text and the metadata keys ``source_file``, ``page_number``
        (1-based) and ``file_type``.
    """
    # Normalise separators so the bare file name is extracted on any platform.
    source_file = pdf_path.replace("\\", "/").rsplit("/", 1)[-1]
    docs = []

    # The context manager releases the file handle even if OCR fails midway.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            text = page.get_text()

            if len(text.strip()) < text_threshold:
                # Too little embedded text: render the page and OCR the image.
                pix = page.get_pixmap()
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text = pytesseract.image_to_string(img)

            docs.append(
                Document(
                    page_content=text.strip(),
                    metadata={
                        "source_file": source_file,
                        "page_number": i + 1,
                        "file_type": "pdf"
                    }
                )
            )

    return docs

# Quick check that pytesseract can reach the Tesseract binary configured above:
# print the version it reports.
print(pytesseract.get_tesseract_version())



5.5.0.20241111


In [3]:
# from langchain_community.document_loaders import WebBaseLoader

# loader = WebBaseLoader("https://docling.ai")
# docs = loader.load()

In [4]:
from pathlib import Path

# Folder scanned (non-recursively) for input PDFs.
pdf_files_path = "C:\\Users\\Pranav Bansal\\Documents\\LLM_POWERED_API_AGENT\\pdf_files"

pdf_dir = Path(pdf_files_path)
# glob() yields files in filesystem-dependent order; sorting keeps the page
# order (and everything derived from it) identical across runs and machines.
pdf_files = sorted(pdf_dir.glob("*.pdf"))

# One Document per page, across all PDFs, in file-then-page order.
docs = []
for pdf in pdf_files:
    docs.extend(process_pdf_hybrid(str(pdf)))

print("Loaded pages:", len(docs))

Loaded pages: 1495


In [5]:
# Preview only the first few pages; printing every loaded page floods the
# notebook output and bloats the saved file.
print(docs[:3])



In [6]:
import re

# Patterns are compiled once at definition time instead of on every line.
_PAGE_NUMBER_RE = re.compile(r"\d{1,4}")
_PAGE_LABEL_RE = re.compile(r"(page|p\.)\s*\d+(\s*(/|of)\s*\d+)?", flags=re.IGNORECASE)
_BLANK_RUN_RE = re.compile(r"\n{3,}")


def clean_page_text(text: str) -> str:
    """Remove page-number lines and tiny non-text fragments from page text.

    Lines are examined after stripping surrounding whitespace:

    * blank lines are kept (as empty strings) and collapsed afterwards;
    * bare page numbers such as ``"3"`` are dropped;
    * page labels such as ``"Page 12"``, ``"Page 12 of 123"`` or ``"p. 3/10"``
      are dropped (case-insensitive);
    * lines of at most four characters that contain no letter at all
      (``"---"``, ``"•"``, ``"1/3"``) are dropped.

    Surviving lines are kept exactly as they were (not stripped).

    Args:
        text: Raw text of a single page.

    Returns:
        The cleaned text with runs of three or more newlines collapsed to two
        and leading/trailing whitespace removed.
    """
    kept = []

    for line in text.splitlines():
        s = line.strip()

        # Keep empty lines for now; runs of them are compressed below.
        if not s:
            kept.append("")
            continue

        # Plain page numbers: "3"
        if _PAGE_NUMBER_RE.fullmatch(s):
            continue

        # "Page 12", "Page 12 of 123", "p. 3/10"
        if _PAGE_LABEL_RE.fullmatch(s):
            continue

        # Tiny non-text junk like "---", "•", "1/3". str.isalpha() is used
        # rather than an ASCII-only class so short lines written in non-Latin
        # scripts are not mistaken for junk.
        if len(s) <= 4 and not any(ch.isalpha() for ch in s):
            continue

        kept.append(line)

    # Collapse runs of blank lines into a single blank line.
    joined = "\n".join(kept)
    joined = _BLANK_RUN_RE.sub("\n\n", joined)
    return joined.strip()


In [None]:
from semantic_text_splitter import TextSplitter
# from langchain.text_splitter import TextSplitter
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

def split_docs(documents, chunk_size, chunk_overlap):
    """Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed straight to ``split_documents``.
        chunk_size: Target chunk length. Length is measured with ``len``, i.e.
            in characters.
        chunk_overlap: Overlap between chunks, in the same unit.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    # Separator candidates: code fences / code constructs first, then markdown
    # headings, paragraph and list breaks, sentences, lines, words, characters.
    separator_priority = [
        "\n```",
        "\n#include",
        "\ndef ",
        "\nclass ",
        "\n## ",
        "\n### ",
        "\n\n",
        "\n- ",
        "\n* ",
        ". ",
        "\n",
        " ",
        "",
    ]

    splitter = RecursiveCharacterTextSplitter(
        separators=separator_priority,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(documents)

# Chunking parameters, in characters (split_docs measures length with len()).
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

# Build cleaned copies instead of overwriting `docs` in place: the raw pages
# stay available for inspection and re-running this cell cannot change them.
docs_clean = [
    Document(
        page_content=clean_page_text(d.page_content),
        metadata=dict(d.metadata),
    )
    for d in docs
]
chunks = split_docs(docs_clean, CHUNK_SIZE, CHUNK_OVERLAP)

# Show a short preview rather than every chunk.
chunks[:3]

[Document(metadata={'source_file': 'Applied-Machine-Learning-and-AI-for-Engineers.pdf', 'page_number': 1, 'file_type': 'pdf'}, page_content="O'REILLY*\n\nApplied Machine\nLearning and\nAl for Engineers\n\nSolve Business Problems\nThat Can't Be Solved\nAlgorithmically\n\nJeff Prosise\nForewor: rd by Adam Prosise"),
 Document(metadata={'source_file': 'Applied-Machine-Learning-and-AI-for-Engineers.pdf', 'page_number': 2, 'file_type': 'pdf'}, page_content='Praise for Applied Machine Learning and AI for Engineers\nThis book is a fantastic guide to machine learning and AI\nalgorithms. It’s succinct while being comprehensive, and\nthe concrete examples with working code show how to take\nthe theory into practice.\nMark Russinovich, Azure CTO and Technical Fellow,\nMicrosoft\nWhen Jeff Prosise is passionate about something (whether\nit be technology, his pet yellow-nape Amazon “Hawkeye,”\nor his hobby of building and flying radio-controlled\njets), you definitely want to listen in. He combines

In [None]:
class EmbeddingManager:
    """Wrapper around a SentenceTransformer model used to embed text."""

    def __init__(self, model_name = "all-MiniLM-L6-v2"):
        """Load the SentenceTransformer model identified by ``model_name``.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
        """
        # Kept so callers can see which model produced the embeddings.
        self.model_name = model_name
        # Loaded eagerly; construction fails here if the model cannot be loaded.
        self.model = SentenceTransformer(self.model_name)

    def generate_embeddings(
        self,
        texts: List[str],
        batch_size: int = 64,
        show_progress_bar: bool = True,
    ) -> np.ndarray:
        """Encode ``texts`` with the loaded model, requesting normalized vectors.

        Args:
            texts: Strings to embed.
            batch_size: Batch size forwarded to ``model.encode``.
            show_progress_bar: Forwarded to ``model.encode``; pass ``False`` to
                silence the progress bar, e.g. when embedding a single query.

        Returns:
            The result of ``model.encode`` for ``texts`` (one row per input
            text in the notebook's usage).
        """
        return self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress_bar,
            normalize_embeddings=True,
        )

In [19]:
# Embed the text of every chunk with the default embedding model.
embedding_manager = EmbeddingManager()
texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_manager.generate_embeddings(texts)
embeddings

Batches: 100%|██████████| 25/25 [00:37<00:00,  1.50s/it]


array([[-8.05333033e-02,  1.09690423e-04,  2.93825623e-02, ...,
         4.13207747e-02,  4.56853807e-02,  3.81768458e-02],
       [-6.22593462e-02, -8.87983814e-02,  2.81385034e-02, ...,
         7.17375353e-02, -2.90876701e-02,  8.02726310e-04],
       [ 9.67695564e-03, -7.73420334e-02,  3.24316509e-02, ...,
         7.15736970e-02, -5.13982661e-02, -1.48361865e-02],
       ...,
       [-1.49592347e-02,  6.91460725e-03,  3.49586420e-02, ...,
         1.73235778e-02, -7.55195618e-02,  9.03654844e-03],
       [-7.77352080e-02,  3.32668126e-02,  1.26612103e-02, ...,
         1.90901458e-02, -8.73328652e-03,  5.48958294e-02],
       [-1.11541584e-01,  1.78629197e-02, -1.24846250e-01, ...,
         4.16832492e-02, -1.31553458e-02, -3.84143344e-03]],
      shape=(1548, 384), dtype=float32)

In [20]:
import os
import uuid
from typing import List, Any
import numpy as np
import chromadb


class VectorStore:
    """Persistent Chroma collection holding chunk texts, metadata and embeddings."""

    def __init__(
        self,
        collection_name = "pdf_documents",
        persist_directory = r"C:\Users\Pranav Bansal\Documents\LLM_POWERED_API_AGENT\chroma_store"
    ):
        """Open (or create) the collection inside ``persist_directory``.

        Args:
            collection_name: Name of the Chroma collection to get or create.
            persist_directory: Directory used by the persistent Chroma client;
                created if it does not exist.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        # Both are populated by _initialize_store().
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the storage directory, the client and the collection."""
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"description": "Text document embeddings for RAG"}
        )

    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 1000):
        """Store documents and their embeddings in the collection.

        IDs are derived from each document's metadata, position and text, and
        records are written with ``upsert``. Re-running the notebook therefore
        overwrites the same records instead of appending a duplicate copy of
        every chunk to the persistent store.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order; each
                item must provide ``tolist()``.
            batch_size: Maximum number of records sent per ``upsert`` call.

        Raises:
            ValueError: If ``documents`` and ``embeddings`` differ in length,
                or if ``batch_size`` is smaller than 1.
        """
        if len(documents) != len(embeddings):
            # zip() would silently drop the surplus items otherwise.
            raise ValueError(
                "documents and embeddings differ in length: "
                f"{len(documents)} != {len(embeddings)}"
            )
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if len(documents) == 0:
            # Nothing to store; avoid sending an empty request.
            return

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Deterministic ID: identical input always maps to the same record.
            id_key = f"{sorted(doc.metadata.items())!r}|{i}|{doc.page_content}"
            digest = uuid.uuid5(uuid.NAMESPACE_OID, id_key).hex[:16]
            ids.append(f"doc_{digest}_{i}")

            # Copy so the caller's metadata dict is not modified.
            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        # Write in slices so large inputs do not go out as one huge request.
        for start in range(0, len(ids), batch_size):
            stop = start + batch_size
            self.collection.upsert(
                ids=ids[start:stop],
                embeddings=embeddings_list[start:stop],
                metadatas=metadatas[start:stop],
                documents=documents_text[start:stop]
            )
        

# Open the persistent store with its default settings and index every chunk.
vectorstore = VectorStore()
vectorstore.add_documents(documents=chunks, embeddings=embeddings)


In [21]:



def retrieve_top_docs(query: str, top_k: int = 4):
    """Return the ``top_k`` stored chunks closest to ``query``.

    Uses the notebook-level ``embedding_manager`` to embed the query and the
    notebook-level ``vectorstore`` collection to search.

    Args:
        query: Natural-language question.
        top_k: Number of results requested from the collection.

    Returns:
        A list of ``(document_text, metadata, distance)`` tuples in the order
        returned by the collection. ``distance`` is ``None`` when the result
        carries no distance for that hit.
    """
    q_emb = embedding_manager.generate_embeddings([query])[0].tolist()
    results = vectorstore.collection.query(query_embeddings=[q_emb], n_results=top_k)

    docs = results['documents'][0]
    metas = results['metadatas'][0]

    # A missing or None 'distances' entry must not discard the hits: zip()
    # stops at the shortest input, so pad the distances with None.
    distance_rows = results.get('distances') or [[]]
    dists = list(distance_rows[0])
    dists.extend([None] * (len(docs) - len(dists)))

    return list(zip(docs, metas, dists))

# Example query: run retrieval once and print the (text, metadata, distance)
# tuples that retrieve_top_docs returns.
query = "What is the intuitive difference between algorithmic problem-solving and machine learning?"
top_docs = retrieve_top_docs(query)
print(top_docs)


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.81it/s]

[('. In the real world, engineers\nsometimes spend the bulk of their time generating these\ndatasets. One of the more popular repositories for public\ndatasets is Kaggle.com, which makes lots of useful datasets\navailable and holds competitions allowing budding ML\npractitioners to test their skills.\nMachine Learning Versus Artificial\nIntelligence\nThe terms machine learning and artificial intelligence (AI)\nare used almost interchangeably today, but in fact, each term\nhas a specific meaning, as shown in Figure\xa01-5.\nTechnically speaking, machine learning is a subset of AI,\nwhich encompasses not only machine learning models but also\nother types of models such as expert systems (systems that\nmake decisions based on rules that you define) and\nreinforcement learning systems, which learn behaviors by\nrewarding positive outcomes while penalizing negative ones.\nAn example of a reinforcement learning system is AlphaGo,\nwhich was the first computer program to beat a professional\n




In [22]:
# def build_context(top_docs):
    
#     context = "\n\n".join([doc for doc, meta, dist in top_docs])
#     return context


In [None]:
from langchain_groq import ChatGroq

# Chat model used to generate answers. The API key is read from the
# GROQ_API_KEY environment variable rather than being hard-coded.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    api_key=os.getenv("GROQ_API_KEY"),
    max_tokens=400,
    temperature=0.7,
)
print("✅ Groq LLM initialized")
    
def build_context(top_docs):
    """Format retrieved chunks into a single context string for the prompt.

    Args:
        top_docs: Iterable of ``(document_text, metadata, distance)`` tuples.
            ``metadata`` may be ``None``; the distance is not used.

    Returns:
        The chunks joined by blank lines, each preceded by a
        ``[Source N | Page P]`` header. ``P`` is ``"N/A"`` when the metadata
        carries no page. Returns ``""`` for empty input.
    """
    context_parts = []
    for i, (doc, meta, _dist) in enumerate(top_docs, start=1):
        meta = meta or {}
        # Pages are stored under "page_number"; "page" is kept as a fallback
        # for metadata written with that key.
        page = meta.get('page_number', meta.get('page', 'N/A'))
        context_parts.append(f"[Source {i} | Page {page}]\n{doc}")
    return "\n\n".join(context_parts)



✅ Groq LLM initialized


In [24]:
from langchain_core.prompts import PromptTemplate

# Prompt template with two placeholders: {context} (the retrieved passages)
# and {question} (the user's query). The indentation inside the triple-quoted
# string is part of the text that is produced when the template is formatted.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        """
        You are a helpful assistant.
        Use the following extracted parts of a document to answer the question.
        If the answer spans multiple parts, combine them logically.
        If the answer is not fully contained, say so clearly.

        Context:
        {context}

        Question:
        {question}

        Answer:
    """
    )
)

In [25]:
def answer_query(query):
    """Answer ``query`` from the indexed documents.

    Retrieves the closest chunks, formats them into the prompt together with
    the question, sends the prompt to ``llm`` and returns the reply text with
    surrounding whitespace stripped.
    """
    retrieved = retrieve_top_docs(query)
    llm_input = prompt.format(
        context=build_context(retrieved),
        question=query,
    )
    reply = llm.invoke(llm_input)
    return reply.content.strip()



In [26]:
# End-to-end example: send one question through answer_query (retrieve ->
# build context -> prompt -> LLM) and print the reply.
query = "Give me the code to reverse a linked list and to delete a node from a linked list"
answer = answer_query(query)
print("\n🧠 Answer:\n", answer)


Batches: 100%|██████████| 1/1 [00:00<00:00, 63.96it/s]



🧠 Answer:
 To provide the code for reversing and deleting a node from a linked list, we'll need to combine the provided information with standard linked list node and linked list class structures. However, since there's no explicit code provided in the given extracts, I'll create an example implementation based on the given information.

### Singly Linked List Node Structure

```python
class Node:
    def __init__(self, data=None):
        self.data = data
        self.next = None
```

### Singly Linked List Class Structure

```python
class LinkedList:
    def __init__(self):
        self.head = None
        self.tail = None
```

### Reversing the Linked List

To reverse the linked list, we'll create two pointers, `prev` and `curr`, and traverse the list. Once we find the node to be deleted, we'll update the `prev` node's `next` pointer.

```python
def reverse_linked_list(self):
    prev = None
    curr = self.head
    while curr:
        next_node = curr.next  # Store the next node
 