In [4]:
%pwd

'e:\\coding\\Data Science\\projects\\Medical Chatbot with LLMs, LangChain, Pinecone, Flask & AWS\\Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS\\research'

In [5]:
import os
os.chdir("../")

In [6]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter



In [7]:
#extract text from pdfs in a directory
def load_pdf_files(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [8]:
exctracted_documents = load_pdf_files("data")

In [9]:
len(exctracted_documents)

637

In [10]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []  # ✅ Correct type hint syntax
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(page_content=doc.page_content, metadata={"source": src})
        )
    return minimal_docs


In [11]:
minimal_docs = filter_to_minimal_docs(exctracted_documents)
minimal_docs[0]

Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='')

# split the doc in chunks


In [12]:
def text__split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,

    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk 

In [13]:
texts_chunk = text__split(minimal_docs)
print(len(texts_chunk))

5859


In [14]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
    )
    return embeddings
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [15]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [16]:
vector = embeddings.embed_query("Hello world")
vector

[-0.03447727486491203,
 0.03102312609553337,
 0.006734980270266533,
 0.026108933612704277,
 -0.03936205804347992,
 -0.16030246019363403,
 0.06692394614219666,
 -0.006441438104957342,
 -0.047450482845306396,
 0.014758863486349583,
 0.07087534666061401,
 0.05552757531404495,
 0.019193356856703758,
 -0.02625126577913761,
 -0.01010954286903143,
 -0.026940442621707916,
 0.022307462990283966,
 -0.02222665585577488,
 -0.14969263970851898,
 -0.017493024468421936,
 0.007676282897591591,
 0.054352231323719025,
 0.0032544038258492947,
 0.03172588348388672,
 -0.08462139964103699,
 -0.029405992478132248,
 0.051595550030469894,
 0.048124078661203384,
 -0.003314835485070944,
 -0.05827915295958519,
 0.04196925833821297,
 0.022210702300071716,
 0.1281888633966446,
 -0.022338951006531715,
 -0.011656239628791809,
 0.06292837113142014,
 -0.03287634998559952,
 -0.09122604131698608,
 -0.03117534890770912,
 0.052699536085128784,
 0.04703483358025551,
 -0.08420310169458389,
 -0.030056182295084,
 -0.0207448396

In [17]:
print(len(vector))

384


In [18]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [19]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [20]:
from pinecone import Pinecone
pinecone_api_key=PINECONE_API_KEY
pc=Pinecone(api_key=pinecone_api_key)

In [21]:
pc

<pinecone.control.pinecone.Pinecone at 0x2d5840d3970>

In [25]:
from pinecone import Pinecone, ServerlessSpec
# Define index name
index_name = "medical-chatbot"

# Check if index exists
existing_indexes = [index.name for index in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,  # must match your embedding model output dimension
        metric="cosine",  # correct spelling
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Get reference to the index
index = pc.Index(index_name)



In [26]:
from langchain_pinecone import PineconeVectorStore
docset=PineconeVectorStore.from_documents(documents=texts_chunk, embedding=embeddings, index_name=index_name)

In [27]:
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=index_name)

# Add more data in exixting index

In [28]:
dswith=Document(page_content="New medical information to add to the existing index.", metadata={"source": "new_medical_doc.pdf"})
docsearch.add_documents([dswith])

['e7bf1c1d-556f-4d1d-9e71-d1766de6b275']

In [29]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3}) 

In [30]:
retrived_docs = retriever.invoke("What is hypertension?")
retrived_docs

[Document(id='3b489ec0-07e8-404c-a6d3-053956028f8d', metadata={'source': 'data\\Medical_book.pdf'}, page_content='(BPH), a condition that affects men and is characterized\nby an enlarged prostate gland.\nHigh blood pressure\nHigh blood pressure puts a strain on the heart and\nthe arteries. Over time, hypertension can damage the\nblood vessels to the point of causing stroke, heart fail-\nure or kidney failure. People with high blood pressure\nmay also be at higher risk for heart attacks. Controlling\nhigh blood pressure makes these problems less likely.\nAlpha blockers help lower blood pressure by causing'),
 Document(id='2b486c60-747f-4deb-8c9f-8c1164df9986', metadata={'source': 'data\\Medical_book.pdf'}, page_content='heart and lungs.\n• Seek treatment for hypertension—High blood pressure\ncan be controlled through lifestyle changes—reducing\nsodium and fat, exercising, managing stress, quitting\nKEY TERMS\nArteriosclerosis —Hardening of the arteries. It\nincludes atherosclerosis, but

In [31]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import os

# 1️⃣ Load environment variables from .env file
load_dotenv()

# 2️⃣ Initialize Gemini model using API key
chatModel = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=os.getenv("GEMINI_API_KEY")   # 👈 this line is crucial
)




In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [33]:
system_template = ("You are a helpful medical assistant for answering questions based on the provided context."
"Use ht following pieces of retrieved documents to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."
"Use three sentences maximum to answer the question."
"answer concisely and accurately."
"\n\n"
"{context}"
)
prompt=ChatPromptTemplate.from_messages(
    [   ("system",system_template),
        ("human","{input}")
    ]
)


In [34]:
question_answer_chain = create_stuff_documents_chain(
    chatModel,
    prompt          # the ChatPromptTemplate we created earlier
)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)