In [6]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

def load_pdf_files(
    data_dir: str = r"E:\Ravi\GenAI_projects\MedicalBot\data",
    glob: str = "*.pdf",
):
    """
    Load every file matching ``glob`` inside ``data_dir`` with ``PyPDFLoader``.

    Args:
        data_dir: Directory to scan. The default is the original project
            folder, written as a raw string so the Windows backslashes can
            never be interpreted as escape sequences.
        glob: Filename pattern handed to ``DirectoryLoader``.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``. In the
        run recorded in this notebook that was one document per PDF page.
    """
    loader = DirectoryLoader(
        data_dir,
        glob=glob,
        loader_cls=PyPDFLoader,
    )
    return loader.load()



  "E:\Ravi\GenAI_projects\MedicalBot\data",


In [7]:
# Run the PDF loader and keep the resulting documents for the following cells.
extracted_data = load_pdf_files()

In [8]:
# Inspect the first loaded document (its metadata and page_content).
extracted_data[0]

Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'E:\\Ravi\\GenAI_projects\\MedicalBot\\data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content='')

In [9]:
# How many documents the loader returned.
len(extracted_data)

637

In [10]:
from typing import List
from langchain_core.documents import Document



def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build stripped-down copies of the given documents.

    Every returned Document carries the original ``page_content`` and a
    metadata dict with a single key, ``'source'`` (``None`` when the input
    metadata has no such key). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]


In [11]:
# Drop every metadata key except 'source' from the loaded documents.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 20, the value previously hard-coded here.

    Returns:
        The list of chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)


In [13]:
# Chunk the trimmed documents and report how many chunks were produced.
text_chunk = text_split(minimal_docs)
print(len(text_chunk))

5860


In [25]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """
    Instantiate the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` object configured for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Build the embedding model once; the vector-store cells below reuse this object.
embedding = download_embeddings()

In [15]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in the next cell can find the API keys.
load_dotenv()

True

In [16]:
def _require_env(name: str) -> str:
    """Return the value of environment variable ``name`` or fail with a clear message.

    The previous ``os.environ[name] = os.getenv(name)`` pattern was a no-op
    when the variable existed and raised an opaque ``TypeError`` when it did
    not, because ``os.environ`` only accepts string values.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; add it to your .env file "
            "or export it before running this notebook."
        )
    return value


# Required credentials: stop here, with an actionable error, if one is missing.
PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
GROQ_API_KEY = _require_env("GROQ_API_KEY")
# Optional, as before: stays None when the variable is not defined.
HF_TOKEN = os.getenv("HF_TOKEN")

In [17]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment above.
pc = Pinecone(api_key= PINECONE_API_KEY)

In [20]:
index_name = "medical-chatbot"

# Create the serverless index only when it does not exist yet; on later runs
# the existing index is reused as-is.
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # Must equal the embedding model's vector length (presumably 384 for
        # all-MiniLM-L6-v2; confirm if the model is ever changed).
        dimension=384,
        metric="dotproduct",
        spec=serverless_spec,
    )

# Handle for working with the index directly.
index = pc.Index(index_name)

In [None]:
"""🟢 Normal Vector Flow
First time:
from_documents() → Embed docs → Store in Pinecone

Later:
from_existing_index() → Just connect → Start searching

🟣 Hybrid Flow
First time:
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index
)
retriever.add_texts(texts_chunk)

Dense embedding created
Sparse BM25 vector created
Both stored in Pinecone

Later:
Create retriever only
(No add_texts) """

'ðŸŸ¢ Normal Vector Flow\nFirst time:\nfrom_documents()\n\nLater:\nfrom_existing_index()\n\nðŸŸ£ Hybrid Flow\nFirst time:\nretriever.add_texts()\n\nLater:\nCreate retriever only\n(No add_texts) '

In [27]:
from langchain_pinecone import PineconeVectorStore

# Embedding and uploading every chunk is the expensive step of this notebook,
# and running it again would upload the whole corpus a second time (no explicit
# ids are passed, so presumably each upload is stored under fresh ids; confirm).
# Ingest only while the index is still empty; otherwise just connect to the
# vectors that are already stored.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunk,
        embedding=embedding,
        index_name=index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        embedding=embedding,
        index_name=index_name,
    )

In [28]:
# Connect to the index that already holds the vectors; this call is given no
# documents, only the embedding model and the index name.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name
)

In [29]:
# Wrap the vector store as a retriever using similarity search with k=3
# (the sample query below returns three documents).
retriever = docsearch.as_retriever(
    search_type = 'similarity',
    search_kwargs = {'k':3}
)

In [30]:
# Smoke-test retrieval with a sample question before wiring up the LLM.
retriever.invoke("What is Acne?")

[Document(id='40c2df93-e915-46ad-a23d-854d3172c4f7', metadata={'source': 'E:\\Ravi\\GenAI_projects\\MedicalBot\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='cb2606ad-c80a-45a7-9844-245bfcefe2d4', metadata={'source': 'E:\\Ravi\\GenAI_projects\\MedicalBot\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='377594af-d97a-456a-9e5f-e350de84d20f', metadata={'source': 'E:\\Ravi\\GenAI_projects\\MedicalBot\\data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin dis

In [34]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq   # or Groq, etc.

# Chat model served by Groq; the key comes from the environment, never from the notebook.
groq_api_key = os.getenv('GROQ_API_KEY')
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant"
)

# Prompt with two slots: the retrieved context and the user's question.
prompt = ChatPromptTemplate.from_template("""
Answer the question using the context below:

Context:
{context}

Question:
{question}
""")


def format_docs(docs):
    """Join the ``page_content`` of the retrieved documents into one text block.

    Without this step the prompt's ``{context}`` slot receives the list of
    Document objects itself, so the model sees their repr (ids, metadata,
    escaped newlines) instead of the passage text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The input question is sent both to the retriever (whose hits are flattened
# to plain text) and, unchanged, to the prompt's {question} slot.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
)



In [35]:

# Run the whole chain on a sample question and print the `content` attribute
# of the chain's result.
result = rag_chain.invoke("What is acne")
print(result.content)

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.


In [None]:
#Test