In [112]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [113]:
def load_pdf_documents(pdf_directory):
  all_docs=[]
  pdf_dir = Path(pdf_directory)
  pdf_files = list(pdf_dir.glob("**/*.pdf"))
  print(f"Found {len(pdf_files)} PDF files")
  for pdf_file in pdf_files:
    print(f"Loading {pdf_file.name}")
    try:
      loader = PyPDFLoader(pdf_file)
      document = loader.load()
      for doc in document:
        doc.metadata['source_file'] = pdf_file.name
        doc.metadata['file_type'] = 'pdf'
      
      all_docs.extend(document)
      print(f"Loaded {len(document)} pages from {pdf_file.name}")
    except Exception as e:
      print(f"Error loading {pdf_file.name}: {e}")
    
  return all_docs

pdf_documents = load_pdf_documents("../data/pdf")
print(f"Total documents loaded: {len(pdf_documents)}")


Found 2 PDF files
Loading 2509.18094v3.pdf
Loaded 23 pages from 2509.18094v3.pdf
Loading 2410.17725v1.pdf
Loaded 9 pages from 2410.17725v1.pdf
Total documents loaded: 32


In [114]:
pdf_documents

[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:e76afa9)', 'creationdate': '', 'author': 'Ye Liu; Zongyang Ma; Junfu Pu; Zhongang Qi; Yang Wu; Ying Shan; Chang Wen Chen', 'doi': 'https://doi.org/10.48550/arXiv.2509.18094', 'license': 'http://creativecommons.org/licenses/by-nc-sa/4.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'UniPixel: Unified Object Referring and Segmentation for Pixel-Level Visual Reasoning', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2509.18094v3', 'source': '../data/pdf/2509.18094v3.pdf', 'total_pages': 23, 'page': 0, 'page_label': '1', 'source_file': '2509.18094v3.pdf', 'file_type': 'pdf'}, page_content='UniPixel: Unified Object Referring and\nSegmentation for Pixel-Level Visual Reasoning\nYe Liu1,2, Zongyang Ma2,3, Junfu Pu2, Zhongang Qi4, Yang Wu5,\nYing Shan2, Chang Wen Chen1∗\n1 The Hong Kong Polytechnic University 2 ARC Lab, Tencent PC

embedding vectorDB

In [115]:
def text_split_documents(documents,chunk_size=1000,chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Total documents after splitting: {len(split_docs)}")
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    return split_docs

In [116]:
chunks = text_split_documents(pdf_documents)
chunks

Total documents after splitting: 147

Example chunk:
Content: UniPixel: Unified Object Referring and
Segmentation for Pixel-Level Visual Reasoning
Ye Liu1,2, Zongyang Ma2,3, Junfu Pu2, Zhongang Qi4, Yang Wu5,
Ying Shan2, Chang Wen Chen1∗
1 The Hong Kong Polytech...
Metadata: {'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:e76afa9)', 'creationdate': '', 'author': 'Ye Liu; Zongyang Ma; Junfu Pu; Zhongang Qi; Yang Wu; Ying Shan; Chang Wen Chen', 'doi': 'https://doi.org/10.48550/arXiv.2509.18094', 'license': 'http://creativecommons.org/licenses/by-nc-sa/4.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'UniPixel: Unified Object Referring and Segmentation for Pixel-Level Visual Reasoning', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2509.18094v3', 'source': '../data/pdf/2509.18094v3.pdf', 'total_pages': 23, 'page': 0, 'page_label': '1', 'source_file': '2509.18094v3.pdf', 'file_type': 'pdf'

[Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:e76afa9)', 'creationdate': '', 'author': 'Ye Liu; Zongyang Ma; Junfu Pu; Zhongang Qi; Yang Wu; Ying Shan; Chang Wen Chen', 'doi': 'https://doi.org/10.48550/arXiv.2509.18094', 'license': 'http://creativecommons.org/licenses/by-nc-sa/4.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1', 'title': 'UniPixel: Unified Object Referring and Segmentation for Pixel-Level Visual Reasoning', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2509.18094v3', 'source': '../data/pdf/2509.18094v3.pdf', 'total_pages': 23, 'page': 0, 'page_label': '1', 'source_file': '2509.18094v3.pdf', 'file_type': 'pdf'}, page_content='UniPixel: Unified Object Referring and\nSegmentation for Pixel-Level Visual Reasoning\nYe Liu1,2, Zongyang Ma2,3, Junfu Pu2, Zhongang Qi4, Yang Wu5,\nYing Shan2, Chang Wen Chen1∗\n1 The Hong Kong Polytechnic University 2 ARC Lab, Tencent PC

Embedding and VectorDb


In [117]:
import numpy as np
from sentence_transformers import SentenceTransformer
import uuid
import chromadb
from chromadb.config import Settings
from typing import List, Dict, Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [118]:
class EmbeddingVectorDB:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
      try:
        print(f"Loading embedding model: {self.model_name}")
        self.model = SentenceTransformer(self.model_name)
        print("Model loaded successfully.")
      except Exception as e:
        print(f"Error loading model {self.model_name}: {e}")

    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        print(f"Generating embeddings for {len(texts)} texts.")
        embeddings = self.model.encode(texts, convert_to_numpy=True,show_progress_bar=True)
        return embeddings
    
embedding_manager = EmbeddingVectorDB()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully.


<__main__.EmbeddingVectorDB at 0x738d288f7c50>

Vector Db

In [119]:
class VectorStore:
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_db"):

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
       
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            # shutil.rmtree(self.persist_directory,ignore_errors=True)  
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 2058


<__main__.VectorStore at 0x738d2a4a4ec0>

In [120]:
text = [doc.page_content for doc in chunks]
text

['UniPixel: Unified Object Referring and\nSegmentation for Pixel-Level Visual Reasoning\nYe Liu1,2, Zongyang Ma2,3, Junfu Pu2, Zhongang Qi4, Yang Wu5,\nYing Shan2, Chang Wen Chen1∗\n1 The Hong Kong Polytechnic University 2 ARC Lab, Tencent PCG\n3 Institute of Automation, Chinese Academy of Sciences 4 vivo Mobile Communication Co.\n5 MindWingman Technology (Shenzhen) Co., Ltd.\ncoco.ye.liu@connect.polyu.hk\nhttps://polyu-chenlab.github.io/unipixel/\nReasoning Segmentation (ReasonSeg)\nReferring Expression Segmentation (RES)\nInteractive Segmentation (IS)\nReasoning Video Object Segmentation (ReVOS)\nReferring Video Object Segmentation (RVOS)\nMotion-Grounded Video Reasoning\nReferred Video Description\nReferred Video Question-Answering',
 'Reasoning Video Object Segmentation (ReVOS)\nReferring Video Object Segmentation (RVOS)\nMotion-Grounded Video Reasoning\nReferred Video Description\nReferred Video Question-Answering\nFind the empty chair that is to the left of the main sitting down.

In [121]:
embeddings = embedding_manager.generate_embeddings(text)
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 147 texts.


Batches: 100%|██████████| 5/5 [00:00<00:00, 13.58it/s]


Adding 147 documents to vector store...
Successfully added 147 documents to vector store
Total documents in collection: 2205


Retreival Pipeline

In [122]:
class RetreivalPipeline:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingVectorDB):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self,query:str,top_k:int=5,score_threshold:float=0.0)->List[Dict[str,Any]]:
        # Generate embedding for the query
        print(f"Generating embedding for query: {query}")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()], 
                n_results=top_k
            )

            retreive_docs = []  
            if results['documents'] and results['documents'][0]:
                 documents = results['documents'][0]
                 metadatas = results['metadatas'][0]
                 distances = results['distances'][0]
                 ids = results['ids'][0]

                 for i, (doc_id, document, metadata, distance) in enumerate(zip(ids,documents,metadatas,distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retreive_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                    print(f"Retrieved {len(retreive_docs)} documents above the score threshold.")
            else:
                print("No documents found")
            
            return retreive_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []
        
retreival_pipeline = RetreivalPipeline(vectorstore, embedding_manager)

In [123]:
retreival_pipeline

<__main__.RetreivalPipeline at 0x738d2a4a4500>

In [124]:
retreival_pipeline.retrieve("What is yolov11 ?")

Generating embedding for query: What is yolov11 ?
Top K: 5, Score Threshold: 0.0
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 174.46it/s]

Retrieved 1 documents above the score threshold.
Retrieved 2 documents above the score threshold.
Retrieved 3 documents above the score threshold.
Retrieved 4 documents above the score threshold.
Retrieved 5 documents above the score threshold.





[{'id': 'doc_5d894304_116',
  'content': 'its improved backbone and neck structures, and its performance across various computer vision tasks such as object\ndetection, instance segmentation, and pose estimation.\n3 What is YOLOv11?\nThe evolution of the YOLO algorithm reaches new heights with the introduction of YOLOv11 [ 16], representing a\nsignificant advancement in real-time object detection technology. This latest iteration builds upon the strengths of its\npredecessors while introducing novel capabilities that expand its utility across diverse CV applications.\nYOLOv11 distinguishes itself through its enhanced adaptability, supporting an expanded range of CV tasks beyond\ntraditional object detection. Notable among these are posture estimation and instance segmentation, broadening the\n2',
  'metadata': {'source': '../data/pdf/2410.17725v1.pdf',
   'doc_index': 116,
   'moddate': '2024-10-24T00:37:53+00:00',
   'keywords': '',
   'page_label': '2',
   'content_length': 755,
   '

In [125]:
retreival_pipeline.retrieve("What is Visual Referring and Segmentation ?")

Generating embedding for query: What is Visual Referring and Segmentation ?
Top K: 5, Score Threshold: 0.0
Generating embeddings for 1 texts.


Batches: 100%|██████████| 1/1 [00:00<00:00, 134.68it/s]

Retrieved 1 documents above the score threshold.
Retrieved 2 documents above the score threshold.
Retrieved 3 documents above the score threshold.
Retrieved 4 documents above the score threshold.
Retrieved 5 documents above the score threshold.





[{'id': 'doc_671c3c9b_51',
  'content': 'referring and segmentation can be jointly enhanced. Extensive experiments on diverse pixel-level\nunderstanding tasks, including thePixelQAtask, demonstrate the significance of the proposed\nmethod. We hope this work inspires future advancements in pixel-level visual understanding.\nAcknowledgements\nThis study was supported by The Hong Kong RGC Grant (15229423) and a financial support from\nARC Lab, Tencent PCG (ZGG9). We also acknowledge The University Research Facility in Big Data\nAnalytics (UBDA) at The Hong Kong Polytechnic University for providing computing resources that\nhave contributed to the research results reported within this paper.\n10',
  'metadata': {'title': 'UniPixel: Unified Object Referring and Segmentation for Pixel-Level Visual Reasoning',
   'trapped': '/False',
   'doi': 'https://doi.org/10.48550/arXiv.2509.18094',
   'total_pages': 23,
   'creationdate': '',
   'license': 'http://creativecommons.org/licenses/by-nc-sa/4

Adding LLM with rag

In [126]:
import os
from dotenv import load_dotenv
load_dotenv()

os.getenv("groq_secret_key")
print(os.getenv("groq_secret_key"))


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


In [127]:
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage


In [128]:
class GroqAILLM:
    def __init__(self, model_name: str = "llama-3.1-70b-versatile", api_key: str = None):
        self.model_name = model_name
        self.api_key = api_key or os.environ.get("groq_secret_key")

        if not self.api_key:
            raise ValueError(
                "OpenAI API key not found. Please set the 'groq_secret_key' environment variable."
            )

        self.llm = ChatGroq(
            groq_api_key=self.api_key,
            model_name=self.model_name,
            temperature=0.1,
            max_tokens=1024
        )

        print(f"Initialized ChatGroq with model: {self.model_name}")

    def generate_response(self, query: str, context: str) -> str:
        prompt_template = PromptTemplate(
            input_variables=["context", "query"],
            template="""
You are a helpful AI assistant. Use the following context to answer the question accurately.

Context:
{context}

Question: {query}

Answer:"""
        )

        formatted_prompt = prompt_template.format(
            context=context,
            query=query
        )

        try:
            messages = [HumanMessage(content=formatted_prompt)]
            response = self.llm.invoke(messages)
            return response.content
        except Exception as e:
            print(f"Error generating response: {e}")
            return "I'm sorry, I couldn't generate a response."

    def generate_simple_response(self, query: str, context: str) -> str:
        simple_prompt = f"""Context: {context}

Question: {query}

Answer:"""

        try:
            messages = [HumanMessage(content=simple_prompt)]
            response = self.llm.invoke(messages)
            return response.content
        except Exception as e:
            print(f"Error generating response: {e}")
            return "Error occurred."



In [129]:
try:
    chat_model = GroqAILLM(api_key=os.getenv("groq_secret_key"))
    print("Successfully initialized ChatGroq model.")
except Exception as e:
    print(f"Error initializing ChatOpenAI: {e}")
    chat_model = None

Initialized ChatGroq with model: llama-3.1-70b-versatile
Successfully initialized ChatGroq model.


Integrating VectorDb context pipeline with llm output

In [130]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv("groq_secret_key")
# Initialize LLM
llm=ChatGroq(groq_api_key=groq_api_key,model_name="llama-3.1-70b-versatile",temperature=0.1,max_tokens=1024)

def rag_simple(query, retriever, llm, top_k=3):
    # Get relevant documents
    results = retriever.retrieve(
        query,
        top_k=top_k
    )

    # Build context
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "No relevant context found to answer the question."

    # Prompt
    prompt = f"""
     Use the following context to answer the question concisely.

    Context:
    {context}

    Question: {query}

    Answer:
    """

    # Call LLM
    response = llm.invoke([prompt.format(context=context,query=query)])
    return response.content
