In [1]:
from unstructured.partition.pdf import partition_pdf
from langchain_openai import OpenAIEmbeddings

from typing import Literal

class PDFProcessor:
    def __init__(self, file_path: str, chunking_strategy: Literal["by_title", "basic"], maximum_chunk_characters: int,  combine_text_under_n_chars: int, new_after_n_chars: int):
        self.file_path = file_path
        self.chunking_strategy = chunking_strategy
        self.maximum_chunk_characters = maximum_chunk_characters
        self.combine_text_under_n_chars = combine_text_under_n_chars
        self.new_after_n_chars = new_after_n_chars

    def unstructured_chunks(self):
        return partition_pdf(
            filename=self.file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy=self.chunking_strategy,
            max_characters=self.maximum_chunk_characters,              
            combine_text_under_n_chars=self.combine_text_under_n_chars,
            new_after_n_chars=self.new_after_n_chars,

        )

    def split_elements_from_chunks(self, chunk):
        """
        Extract text/table content from a CompositeElement chunk 
        and combine into one string.
        """
        if "CompositeElement" not in str(type(chunk)):
            return ""

        chunk_elements = chunk.metadata.orig_elements
        combined_content = ""

        for element in chunk_elements:
            if "Text" in str(type(element)):
                combined_content += element.text + "\n"
            elif "Table" in str(type(element)):
                combined_content += element.metadata.text_as_html + "\n"
            # elif "Image" in str(type(element)):
            #     combined_content += "[IMAGE]\n"

        return combined_content


  from .autonotebook import tqdm as notebook_tqdm


In [56]:
from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone as LC_Pinecone
from langchain_openai import OpenAIEmbeddings


class VectorStore:
    def __init__(self, pc_api_key:str, openai_api_key:str, index_name:str, dimension:int, model:str, metric:int, region="us-east-1"):
        self.pc_api_key = pc_api_key
        self.openai_api_key = openai_api_key
        #self.pc = Pinecone(api_key=self.pc_api_key)
        self.index_name = index_name
        #self.embeddings = OpenAIEmbeddings(model=self.model, api_key=self.openai_api_key)
        self.model = model
        self.dimension = dimension 
        self.metric = metric
        self.region = region

    def create_vector_db(self):
        print(self.pc_api_key)
        pc = Pinecone(api_key=self.pc_api_key)
        if self.index_name not in pc.list_indexes().names():
            pc.create_index(
                name=self.index_name,
                dimension=self.dimension,
                metric=self.metric,
                spec=ServerlessSpec(cloud="aws", region=self.region)
            )
            print(f"Index '{self.index_name}' created!")

        else:
            # Connect to existing index
            index = pc.Index(self.index_name)
            print(f"Index '{self.index_name}' already exists and is ready.")

            
    def upsert_chunk(self, chunk_id: str, content: str):
        embeddings = OpenAIEmbeddings(model=self.model, api_key=self.openai_api_key)
        """
        Embed the given chunk content and upsert into Pinecone.
        """
        embedded_chunk = embeddings.embed_documents([content])[0]
        pc = Pinecone(api_key=self.pc_api_key)
        index = pc.Index(self.index_name)
        index.upsert([{
            "id": chunk_id,
            "values": embedded_chunk,
            "metadata": {
                "chunk_id": chunk_id,
                "type": "composite",
                "content": content,
                "text": content
            }
        }])

    def query_pinecone(self, query_text, top_k:int):
        """ 
        Query Pinecone index with a text query and return results.

        Args:
        index: Pinecone index object
        embeddings: Embedding model object
        query_text (str): Input query text
        top_k (int): Number of top results to fetch

        Returns:
            list: List of result dictionaries with id, content, score, and metadata
        """
    # Step 1: Convert query text to vector
        embeddings = OpenAIEmbeddings(model=self.model, api_key=self.openai_api_key)
        query_vector = embeddings.embed_query(query_text)
        pc = Pinecone(api_key=self.pc_api_key)
        index = pc.Index(self.index_name)
        # Step 2: Query Pinecone index
        results = index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,
            include_values=True
        )

        # Step 3: Format results
        formatted_results = []
        for match in results['matches']:
            formatted_results.append({
                "id": match['id'],
                "content": match['metadata'].get('content', ''),
                "score": match['score'],
                "metadata": match['metadata']
            })

        return formatted_results



In [40]:
processor = PDFProcessor(file_path = "/Users/rohan/Documents/RAG Pipeline/RAG-Pipeline/artifacts/research_paper_yolo.pdf", 
                         chunking_strategy = "by_title", 
                         maximum_chunk_characters = 5000,  
                         combine_text_under_n_chars = 2000, 
                         new_after_n_chars = 4000)

In [14]:
chunks = processor.unstructured_chunks()



In [41]:
import os
from dotenv import load_dotenv

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
load_dotenv()

True

In [57]:
pc = VectorStore(pc_api_key=PINECONE_API_KEY, 
                 openai_api_key=OPENAI_API_KEY, 
                 index_name='newindex', 
                 #embeddings = OpenAIEmbeddings(model="text-embedding-3-small",api_key=OPENAI_API_KEY
                 dimension=1536, 
                 model="text-embedding-3-small", 
                 metric="cosine", 
                 region="us-east-1")

In [43]:
pc.create_vector_db()

pcsk_2VkJxA_EjAnGwoD7aDLXUxksRjfxCw5MegC4TLBd294aN7dZpdYJRn3u2sTk1Zx7TqFNRQ
Index 'newindex' already exists and is ready.


In [44]:
chunks = processor.unstructured_chunks()



In [51]:


for i, chunk in enumerate(chunks):
    combined_content = processor.split_elements_from_chunks(chunk)
    if combined_content:
        chunk_id = f"chunk-{i}"
        pc.upsert_chunk(chunk_id, combined_content)
        

In [58]:
pc.query_pinecone("what is Yolo", top_k=2)

[{'id': 'chunk-12',
  'content': 'This paper gives us a review of the YOLO versions. Here we draw the following remarks. First, the YOLO version has a lot of differences. However, they still have some features in common. Hence, they are still similar. Second. The YOLO versions are still very new, have a lot of room for future research. Especially for scenario implementations.\nThere is still room for future improvement. This paper can focus more on the implementations comparing, such as scenario analysis. Further, the research for YOLO V1 is very limited in this paper. For example, in the trend subsection, both the figure and tabular have ignored YOLO V1. Future research can do better on this point.\nThis research has been partially supported by grants from the National Natural Science Foundation of China (Nos. 71774134, U1811462). This research is also supported by the Fundamental Research Funds for the Central Universities,Southwest Minzu University(Grant Number 2020NGD04,and 2018NZD

In [69]:
from langchain_pinecone import PineconeVectorStore

class Retriever:
    def __init__(self, pc_api_key, index_name, model, openai_api_key, search_type: str = "similarity", k: int = 5):
        self.pc_api_key = pc_api_key
        self.index_name=index_name
        self.model = model
        self.openai_api_key = openai_api_key
        self.search_type = search_type
        self.k = k

    def create_retriver(self):
        pc = Pinecone(api_key=self.pc_api_key)
        index = pc.Index(self.index_name)
        embeddings = OpenAIEmbeddings(model=self.model,api_key=self.openai_api_key)
        vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

        retriever = vectorstore.as_retriever(
            search_type=self.search_type,
            search_kwargs={"k": self.k}
        )
        return retriever

    def query(self, query_text: str, retriever):
        pc = Pinecone(api_key=self.pc_api_key)
        index = pc.Index(self.index_name)
        embeddings = OpenAIEmbeddings(model=self.model,api_key=self.openai_api_key)
        vectorstore = PineconeVectorStore(index=index, embedding=embeddings)
        return retriever.get_relevant_documents(query_text)

In [70]:
r= Retriever(pc_api_key = PINECONE_API_KEY, index_name = "newindex", model = 'text-embedding-3-small' , openai_api_key = OPENAI_API_KEY, search_type= "similarity", k = 5)

In [71]:
r1 = r.create_retriver()

In [73]:
docs=r.query("What is Yolo",r1)

In [74]:
for doc in docs:
    print(doc.page_content)

This paper gives us a review of the YOLO versions. Here we draw the following remarks. First, the YOLO version has a lot of differences. However, they still have some features in common. Hence, they are still similar. Second. The YOLO versions are still very new, have a lot of room for future research. Especially for scenario implementations.
There is still room for future improvement. This paper can focus more on the implementations comparing, such as scenario analysis. Further, the research for YOLO V1 is very limited in this paper. For example, in the trend subsection, both the figure and tabular have ignored YOLO V1. Future research can do better on this point.
This research has been partially supported by grants from the National Natural Science Foundation of China (Nos. 71774134, U1811462). This research is also supported by the Fundamental Research Funds for the Central Universities,Southwest Minzu University(Grant Number 2020NGD04,and 2018NZD02).
Author name / Procedia Computer

In [None]:
def __init__(self, pc_api_key, index_name, model, openai_api_key, search_type: str = "similarity", k: int = 5):
        self.pc_api_key = pc_api_key
        self.index_name=index_name
        self.model = model
        self.openai_api_key = openai_api_key
        self.search_type = search_type
        self.k = k

In [82]:
from langchain.prompts import PromptTemplate
from langchain.chat_models.base import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda, RunnableMap
## LLM
from langchain.chat_models.base import init_chat_model
from langchain_core.output_parsers import StrOutputParser
# -----------------------------
# RAG Chain Class
# -----------------------------
class RAGPipeline:
    def __init__(self, retriever,
                model:str,
                pc_api_key:str, 
                index_name:str, 
                openai_api_key
                ):
        self.model = model
        self.pc_api_key = pc_api_key
        self.openai_api_key = openai_api_key
        self.index_name=index_name
        self.retriever = retriever

        self.llm = init_chat_model(model)
        self.prompt_template = PromptTemplate.from_template(
        """Answer the question based on the following context:
            {context}

            Question: {question}
        """
        )
        self.chain = (
            RunnableMap(
                {
                    "context": lambda x: self.retriever.get_relevant_docs(x["question"]),
                    "question": lambda x: x["question"]
                }
            )
            | self.prompt_template
            | self.llm
            | StrOutputParser()
        )
        
    
    def run_query(self, question: str):
        query = {"question": question}
        return self.chain.invoke(query)

In [83]:
rag = RAGPipeline(retriever = r1,
                model = "openai:gpt-3.5-turbo",
                pc_api_key = PINECONE_API_KEY, 
                index_name = OPENAI_API_KEY, 
                openai_api_key = OPENAI_API_KEY)

In [84]:
rag.run_query("What is Yolo")

AttributeError: 'VectorStoreRetriever' object has no attribute 'get_relevant_docs'