In [2]:
%pwd

'/Users/srishanthreddy/medical-assistance-chatbot/research'

In [3]:
#change the working directory to the parent directory so that i can access the data folder
import os
os.chdir("../")
%pwd

'/Users/srishanthreddy/medical-assistance-chatbot'

In [4]:
#for loading pdfs
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
#for splitting the text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter



In [5]:
#load and extract text from pdf files
def load_pdf_file(data):
    loader=DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader)
    
    documents=loader.load()
    return documents

In [6]:
extracted_data=load_pdf_file("data")
extracted_data[0]

Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data/Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content='')

In [7]:
len(extracted_data)

637

In [8]:
from typing import List
from langchain.schema import Document

def filter_to_minimalDocs(docs: List[Document]) -> List[Document]:
    minimal_docs : List[Document] = []
    for doc in docs:
        src=doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content, 
                metadata={"source": src}
            )
        )
    return minimal_docs

In [9]:
minimal_docs=filter_to_minimalDocs(extracted_data)
minimal_docs[5]

Document(metadata={'source': 'data/Medical_book.pdf'}, page_content='The Gale Encyclopedia of Medicine 2is a medical ref-\nerence product designed to inform and educate readers\nabout a wide variety of disorders, conditions, treatments,\nand diagnostic tests. The Gale Group believes the product\nto be comprehensive, but not necessarily definitive. It is\nintended to supplement, not replace, consultation with a\nphysician or other healthcare practitioner. While the Gale\nGroup has made substantial efforts to provide information\nthat is accurate, comprehensive, and up-to-date, the Gale\nGroup makes no representations or warranties of any\nkind, including without limitation, warranties of mer-\nchantability or fitness for a particular purpose, nor does it\nguarantee the accuracy, comprehensiveness, or timeliness\nof the information contained in this product. Readers\nshould be aware that the universe of medical knowledge\nis constantly growing and changing, and that differences\nof medic

In [10]:
#divide the text into chunks
def text_split(docs):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
    )
    text_chunks=text_splitter.split_documents(docs)
    return text_chunks

text_chunk=text_split(minimal_docs)
print(len(text_chunk))

5859


In [11]:
#download embeeding model
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings(model_name):
    embeddings=HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

embedding=download_embeddings("sentence-transformers/all-MiniLM-L6-v2")

  embeddings=HuggingFaceEmbeddings(model_name=model_name)


In [12]:
vector=embedding.embed_query("Hello world")
print("vector dimension:",len(vector))

vector dimension: 384


In [13]:
from dotenv import load_dotenv
load_dotenv()
import os

In [14]:
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY=os.getenv("GOOGLE_API_KEY")



In [15]:
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)


In [16]:
#creating pinecone index
from pinecone import ServerlessSpec
index_name="medical-assistance-chatbot"
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index=pc.Index(index_name)

In [17]:
#store the text chunks in pinecone vector database
from langchain_pinecone import PineconeVectorStore
vector_store=PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embedding,
    index_name=index_name
    )

In [18]:
retriver=vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [19]:
retrived=retriver.invoke("What is acne?")
retrived

[Document(id='fabf5e5f-c017-4d5e-a652-e2b3a9e4d48b', metadata={'source': 'data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='f19dce64-82b9-4086-a71d-be34b0acafd7', metadata={'source': 'data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='fa80d9b8-6f6e-48d3-8335-5e1dfe8a7198', metadata={'source': 'data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26')]

In [30]:
from langchain_google_genai import ChatGoogleGenerativeAI
chatModel=ChatGoogleGenerativeAI(model="gemini-2.5-pro", google_api_key=GOOGLE_API_KEY)

E0000 00:00:1759672666.301434 2014773 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [31]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [32]:
system_prompt=(
    "You are a helpful medical assistant for question answering tasks."
    "Use the following retrieved context to answer the question."
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Keep the answer as concise as possible. "
    "Context: {context} "
)
prompt=ChatPromptTemplate.from_messages(
    ["system",system_prompt,
    "human","{input}"]
    )

In [33]:
question_answer_chain=create_stuff_documents_chain(chatModel,prompt)
rag_chain=create_retrieval_chain(retriver,question_answer_chain)

In [36]:
response=rag_chain.invoke({"input":"What is acromegaly and gigantism?"})
response["answer"]

'Acromegaly is a disorder where the abnormal release of a chemical from the pituitary gland causes increased growth in bone and soft tissue, along with other disturbances in the body. The provided text does not define gigantism.'