In [1]:
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_together import ChatTogether
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from langchain.retrievers import MultiQueryRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.messages import HumanMessage, SystemMessage

# Load variables from a local .env file into the process environment.
# PINECONE_API_KEY is read from the environment further down; the Google
# client presumably picks up its key the same way (TODO confirm variable name).
load_dotenv()

True

In [2]:
# Folder holding the source PDFs. The default is written as a raw string so the
# backslashes in the Windows path are not parsed as escape sequences
# ("\L", "\M" and "\d" are invalid escapes and trigger a SyntaxWarning on
# recent Python versions). Set MEDIBOT_DATA_DIR to run on another machine
# without editing the notebook.
DATA_DIR = os.environ.get(
    "MEDIBOT_DATA_DIR",
    r'D:\LangChain\Langchain_Models\MediBot\data',
)

# Load every PDF directly inside DATA_DIR using PyPDFLoader.
dir_loader = DirectoryLoader(
    path=DATA_DIR,
    glob='*.pdf',
    loader_cls=PyPDFLoader
)
# NOTE: the PDF parser may log warnings for malformed files (see this cell's
# recorded output); loading still completed in the recorded run.
docs = dir_loader.load()

invalid pdf header: b'\xef\xbb\xbf%P'
EOF marker seems truncated
incorrect startxref pointer(1)
parsing for Object Streams
PdfReadError("Invalid Elementary Object starting with b'e' @27066: b' R/MediaBox[-2e-05 0 560.159973 722.880005]/Annots 21 0 R/CropBox[-2e-05 1.44 56'")
PdfReadError("Invalid Elementary Object starting with b'P' @33017702: b'5 0 obj<</Universal PDF(The process that creates this PDF constitutes a trade se'")
PdfReadError("Invalid Elementary Object starting with b'P' @33017702: b'5 0 obj<</Universal PDF(The process that creates this PDF constitutes a trade se'")
PdfReadError("Invalid Elementary Object starting with b'e' @27066: b' R/MediaBox[-2e-05 0 560.159973 722.880005]/Annots 21 0 R/CropBox[-2e-05 1.44 56'")


In [3]:
# Break the loaded documents into chunks of at most 1000 characters, with
# 200 characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1_000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

In [4]:
# Embedding model shared by the indexing and retrieval steps.
embedding = GoogleGenerativeAIEmbeddings(model='models/text-embedding-004')

# Embed a throwaway string once so the vector size is measured rather than
# hard-coded; the Pinecone index is created with this dimension.
query_result = embedding.embed_query('hi')
embedding_dimension = len(query_result)

In [5]:
# Pinecone client, authenticated with the key taken from the environment
# (the value is None when PINECONE_API_KEY is not set).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [13]:
# Ensure the "medibot" Pinecone index exists and is populated, and expose it
# as `vector_store` for the retrieval cells below.
#
# Documents are uploaded when the index was just created OR when it already
# exists but holds no vectors (e.g. an earlier run created the index and then
# failed before uploading). Any failure is reported and re-raised so the
# notebook stops here instead of failing later with an undefined
# `vector_store`.
index_name = "medibot"

try:
    # 1. Create index if not exists, sized to the embedding model's output
    index_created = not pc.has_index(index_name)
    if index_created:
        pc.create_index(
            name=index_name,
            dimension=embedding_dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        print("Created new database index")

    # 2. Initialize index object
    index = pc.Index(index_name)

    # 3. Decide whether the chunks still have to be uploaded
    if index_created:
        needs_upload = True
    else:
        stats = index.describe_index_stats()
        needs_upload = stats.total_vector_count == 0
        if needs_upload:
            print("Existing index is empty; uploading documents")

    if needs_upload:
        try:
            vector_store = PineconeVectorStore.from_documents(
                documents=chunks,
                index_name=index_name,
                embedding=embedding
            )
            print("Data inserted using from_documents()")
        except Exception as e:
            print(f"from_documents failed: {e}")
            print("Trying batch insert...")

            # NOTE(review): if from_documents uploaded part of the data before
            # failing, this retry may store those chunks twice unless vector
            # IDs are deterministic -- confirm before relying on the counts.
            vector_store = PineconeVectorStore(index_name=index_name, embedding=embedding)
            batch_size = 200
            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i + batch_size]
                vector_store.add_documents(documents=batch)
            print("Data inserted using batch upload")
    else:
        # 4. Load existing vector store only
        vector_store = PineconeVectorStore.from_existing_index(
            index_name=index_name,
            embedding=embedding
        )
        print("Existing index stats:", stats)

except Exception as e:
    print(f"Failed to connect to Pinecone index: {e}")
    # Re-raise: swallowing the error would leave `vector_store` undefined and
    # surface as an unrelated NameError in a later cell.
    raise


Existing index stats: {'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 46900}},
 'total_vector_count': 46900,
 'vector_type': 'dense'}


In [14]:
# Chat model reused for query expansion, context compression and the final answer.
llm = ChatGoogleGenerativeAI(model='models/gemini-2.5-pro')

# Document compressor driven by the same chat model.
compressor = LLMChainExtractor.from_llm(llm)

In [15]:
# Plain vector-store lookup returning the top 2 matches per query.
similarity_retriever = vector_store.as_retriever(search_kwargs={'k': 2})

# Layer 1: multi-query retriever built from the LLM on top of the lookup.
mq_retriever = MultiQueryRetriever.from_llm(retriever=similarity_retriever, llm=llm)

# Layer 2: pass the retrieved documents through the LLM-based compressor.
retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=mq_retriever,
)

In [17]:
# Question sent to the assistant. NOTE(review): this one is not a medical
# question, so it presumably serves as an out-of-domain check -- confirm.
query = "india vs england test score today"

In [18]:
# Run the retrieval pipeline, then merge the text of every returned document
# into one context string, separating documents with a blank line.
retrieved_docs = retriever.invoke(query)
context = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [20]:
# System prompt: role and style instructions, then the retrieved context,
# then the instruction to admit when the context lacks the answer.
system_prompt = "".join([
    "You are a helpful and knowledgeable medical assistant. ",
    "Answer the user's following medical question in simple and accurate terms. ",
    "It would be preferred if you add some bullet points when answering the question.\n\n",
    "Use the following retrieved context to answer the question:\n",
    f"{context}\n\n",
    "If the answer is not in the context, say you don't know.",
])

# Conversation sent to the model: system instructions followed by the user's question.
messages = [SystemMessage(content=system_prompt), HumanMessage(content=query)]

In [21]:
# Ask the model, then record its reply at the end of the conversation.
# Note: this mutates `messages`, so re-running the cell sends the previous
# reply back to the model as part of the history.
result = llm.invoke(messages)
messages += [result]

In [22]:
# Show the full response object, including response and usage metadata.
result

AIMessage(content="I'm sorry, I don't know the answer to your question.\n\nAs a medical assistant, my purpose is to provide information on health-related topics. I do not have access to live sports scores.", additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--9728ea1b-5d9f-4fe3-a644-6a2234758d52-0', usage_metadata={'input_tokens': 73, 'output_tokens': 44, 'total_tokens': 799, 'input_token_details': {'cache_read': 0}})

In [23]:
# Show only the answer text.
result.content

"I'm sorry, I don't know the answer to your question.\n\nAs a medical assistant, my purpose is to provide information on health-related topics. I do not have access to live sports scores."