In [1]:
import os 
os.chdir("../")

## Extract text from PDF

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

In [5]:
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [6]:
extracted_data = load_pdf_files("data")

In [8]:
extracted_data[:5]

[Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 1, 'page_label': '2'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data\\Medical_book.pdf', 'total_pages': 637, 'page': 2, 'page_label': '3'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 

In [9]:
len(extracted_data)

637

## Remove unnecessary metadata

In [10]:
from typing import List
from langchain.schema import Document

def extract_minimal_documents(documents: List[Document]) -> List[Document]:
    """
    Accepts a list of Document objects and returns a new list containing
    only the original content and its 'source' metadata.
    """
    result: List[Document] = []
    for doc in documents:
        source = doc.metadata.get("source")
        simplified = Document(
            page_content=doc.page_content,
            metadata={"source": source}
        )
        result.append(simplified)
    return result

In [11]:
final_docs = extract_minimal_documents(extracted_data)

In [12]:
final_docs[:5]

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content=''),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimed

## Split documents into smaller chunks

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text_for_contextual_integrity(docs, chunk_size=500, chunk_overlap=50):
    """
    Split documents into chunks that preserve natural and semantic boundaries.

    Parameters:
        docs (List[Document]): Input documents to chunk.
        chunk_size (int): Approximate maximum characters per chunk.
        chunk_overlap (int): Desired overlap between adjacent chunks.

    Returns:
        List[Document]: Chunked documents maintaining context and coherence.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " "],  # Prefer paragraphs, then sentences
        length_function=len
    )
    return splitter.split_documents(docs)

texts_chunk = split_text_for_contextual_integrity(final_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 5961


In [14]:
texts_chunk[:5]

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='Dean Dauph

## Convert text into numbers

In [16]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

In [17]:
embedding

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [18]:
# lets test the embedding
text_number = embedding.embed_query("I am a boy")
text_number

[-0.06482984125614166,
 0.11154395341873169,
 -0.00091736507602036,
 -0.03589130565524101,
 -0.06904588639736176,
 -0.045476965606212616,
 0.08817790448665619,
 0.026484442874789238,
 0.023921702057123184,
 -0.025976283475756645,
 -0.005206645466387272,
 -0.07646573334932327,
 -0.057570118457078934,
 0.016241298988461494,
 0.08806014060974121,
 0.006659925449639559,
 0.023945655673742294,
 -0.021001791581511497,
 -0.07895506918430328,
 -0.03616916388273239,
 -0.10024937242269516,
 0.052485521882772446,
 -0.0008634036057628691,
 -0.0078034778125584126,
 -0.0715513527393341,
 0.022869523614645004,
 -0.020361434668302536,
 -0.05919976159930229,
 -0.00746839540079236,
 0.09555705636739731,
 -0.016858434304594994,
 0.003539272816851735,
 0.06853163242340088,
 -0.09603849798440933,
 -0.028074707835912704,
 0.01716211624443531,
 -0.0027027626056224108,
 -0.04143024608492851,
 0.053926486521959305,
 0.04891061782836914,
 -0.03499503433704376,
 -0.036885879933834076,
 -0.004890583921223879,
 -0

In [19]:
print("Length of the vector : ",len(text_number))

Length of the vector :  384


## Create VectorDB (Pinecone)

In [36]:
from dotenv import load_dotenv
load_dotenv()

True

In [37]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [23]:
from pinecone import Pinecone
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("Environment variable 'PINECONE_API_KEY' is not set")

# 2. Initialize Pinecone client using the environment variable
pc = Pinecone(api_key=pinecone_api_key)

In [24]:
pc

<pinecone.pinecone.Pinecone at 0x287673992d0>

In [25]:
from pinecone import (
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    Metric,
    VectorType,
    DeletionProtection
)

index_name = "medical-assistant"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric=Metric.COSINE,
        vector_type=VectorType.DENSE,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1
        ),
        deletion_protection=DeletionProtection.DISABLED,
        tags={
            "purpose": "medical-assistant"
        }
    )
index = pc.Index(index_name)

In [26]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [28]:
# Load Existing index 
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

## Add more data to the existing Pinecone index

In [29]:
new_data = Document(
    page_content="Pinecone is used in this Medical assistant system...",
    metadata={"source": "external"}
)

In [30]:
docsearch.add_documents(documents=[new_data])

['a4dcfa6b-ed1f-448a-b39a-d2f7dfccf5b2']

In [31]:
# create retriver
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [32]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='ae840f08-caa7-4186-b2fb-599f7702ccb3', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Nancy J. Nordenson\nAcid reflux see Heartburn\nAcidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'),
 Document(id='0226255e-835a-4477-8a55-56d98786c111', metadata={'source': 'data\\Medical_book.pdf'}, page_content='used to clear up mild to moderately severe acne.\nIsotretinoin (Accutane) is prescribed only for very\nsevere, disfiguring acne.\nAcne is a skin condition that occurs when pores or\nhair follicles become blocked. This allows a waxy\nmaterial, sebum, to collec

In [38]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",  # or "gemini-2.5-pro", or other model IDs
    temperature=0.7,
    max_tokens=None,
    timeout=None,
    max_retries=2
)


## Create RAG Chain

In [39]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [40]:
system_prompt = (
    "You are a medical assistant designed to answer questions accurately and concisely. "
    "Use the provided context to generate your response. "
    "If the answer is not contained within the context, say you don't know. "
    "Limit your response to a maximum of three sentences."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [41]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [42]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder where the pituitary gland in the brain abnormally releases a chemical, leading to increased growth in bone and soft tissue and other bodily disturbances. The provided text defines acromegaly but does not define gigantism.


In [43]:
response = rag_chain.invoke({"input": "what is used in the system?"})
print(response["answer"])

Pinecone is used in this Medical assistant system.


In [44]:
response = rag_chain.invoke({"input": "what is the Treatment of Acne?"})
print(response["answer"])

Treatment for mild noninflammatory acne involves reducing new comedones with topical drugs like tretinoin, benzoyl peroxide, adapalene, or salicylic acid. For inflammatory acne, topical antibiotics may be added, and intralesional corticosteroid injections can treat inflamed pimples. Alternative treatments focus on proper cleansing, a well-balanced diet, and avoiding certain foods and substances.
