In [1]:
print("If you see this, my venv is activated")

If you see this, my venv is activated


In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
%pwd

'f:\\AI\\GenAI\\RAG-based-Medical-Chatbot-Langchain-LLM\\medi_chat\\notebook'

In [5]:
# Extract text from PDF files
def load_pdf_samples(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [6]:
extracted_data = load_pdf_samples("../data")

In [7]:
extracted_data

[Document(metadata={'producer': 'iLovePDF', 'creator': 'Elsevier', 'creationdate': '2018-03-08T13:31:07+08:00', 'crossmarkdomainexclusive': 'true', 'elsevierbookpdfspecifications': '1.32', 'author': 'J. Alastair Innes,Anna R Dover,Karen Fairhurst', 'robots': 'noindex', 'subject': "Macleod's Clinical Examination, Fourteenth Edition (2019) 402pp. 978-0-7020-6993-2", 'crossmarkdomains': '[1]', 'moddate': '2021-02-26T18:24:44+00:00', 'source': '..\\data\\macleods_clinical_examination_14_ed.pdf', 'total_pages': 402, 'page': 0, 'page_label': 'cover'}, page_content=''),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'Elsevier', 'creationdate': '2018-03-08T13:31:07+08:00', 'crossmarkdomainexclusive': 'true', 'elsevierbookpdfspecifications': '1.32', 'author': 'J. Alastair Innes,Anna R Dover,Karen Fairhurst', 'robots': 'noindex', 'subject': "Macleod's Clinical Examination, Fourteenth Edition (2019) 402pp. 978-0-7020-6993-2", 'crossmarkdomains': '[1]', 'moddate': '2021-02-26T18:24:44+00:0

In [8]:
len(extracted_data)

402

In [20]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Filters out:
      - The first page (index 0)
      - Documents with empty or very short content
      - Documents containing irrelevant boilerplate text (e.g., "Student Consult")
    
    Returns minimal Document objects with only 'source' metadata and cleaned content.
    """
    minimal_docs: List[Document] = []
    
    # Define keywords or patterns to filter out "helpless" content
    blacklist_phrases = [
        "studentconsult.inkling.com",  # eBook instructions
        "Redeem your eBook",           # Marketing lines
        "technical assistance",        # Support details
        "For technical assistance",    # Support details
        "This page intentionally left blank"  # Placeholder text
    ]
    
    for idx, doc in enumerate(docs):
        content = doc.page_content.strip()

        # Skip empty or very short content
        if not content or len(content) < 20:
            continue

        # Skip content containing blacklisted phrases
        if any(phrase.lower() in content.lower() for phrase in blacklist_phrases):
            continue

        # Add cleaned document
        minimal_docs.append(
            Document(
                page_content=content,
                metadata={"source": doc.metadata.get("source")}
            )
        )

    return minimal_docs


In [21]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [22]:
minimal_docs[3:]

[Document(metadata={'source': '..\\data\\macleods_clinical_examination_14_ed.pdf'}, page_content='© 2018 Elsevier Ltd. All rights reserved.\nNo part of this publication may be reproduced or transmitted in any form or by any means, electronic or mechanical, \nincluding photocopying, recording, or any information storage and retrieval system, without permission in writing from \nthe publisher. Details on how to seek permission, further information about the publisher’s permissions policies and \nour arrangements with organizations such as the Copyright Clearance Center and the Copyright Licensing Agency, \ncan be found at our website: www.elsevier.com/permissions.\nThis book and the individual contributions contained in it are protected under copyright by the publisher (other than \nas may be noted herein).\nThe  \npublisher’s \npolicy is to use\npaper manufactured \nfrom sustainable forests\nFirst edition 1964\nSecond edition 1967\nThird edition 1973\nFourth edition 1976\nFifth edition 

In [23]:
# Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [24]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 3031


In [25]:
texts_chunk

[Document(metadata={'source': '..\\data\\macleods_clinical_examination_14_ed.pdf'}, page_content='Macleod’s \nClinical Examination'),
 Document(metadata={'source': '..\\data\\macleods_clinical_examination_14_ed.pdf'}, page_content='Content Strategist: Laurence Hunter\nContent Development Specialist: Helen Leng\nProject Manager: Anne Collett\nDesigner: Miles Hitchen\nIllustration Manager: Karen Giacomucci\nJohn Macleod (1915–2006)\nJohn Macleod was appointed consultant physician at the Western General Hospital, \nEdinburgh, in 1950. He had major interests in rheumatology and medical education. \nMedical students who attended his clinical teaching sessions remember him as'),
 Document(metadata={'source': '..\\data\\macleods_clinical_examination_14_ed.pdf'}, page_content='an inspirational teacher with the ability to present complex problems with great \nclarity. He was invariably courteous to his patients and students alike. He had an \nuncanny knack of involving all students equally in c

In [26]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [27]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [28]:
vector = embedding.embed_query("Testing the embeddings model")
vector

  return forward_call(*args, **kwargs)


[0.00843075942248106,
 -0.09506786614656448,
 0.029017718508839607,
 0.010260512121021748,
 0.05960341915488243,
 0.05782154202461243,
 -0.05258115381002426,
 -0.026991233229637146,
 -0.010253285989165306,
 -0.029971586540341377,
 0.04070683568716049,
 -0.051947738975286484,
 0.0682876855134964,
 0.045341745018959045,
 -0.08807878196239471,
 -0.019839581102132797,
 0.10807260125875473,
 0.05695078894495964,
 -0.04499265179038048,
 -0.008350764401257038,
 -0.055704034864902496,
 -0.009849965572357178,
 0.03084627352654934,
 -0.05422833561897278,
 -0.0020322061609476805,
 -0.0381985604763031,
 -0.011850891634821892,
 0.03373527526855469,
 0.042387548834085464,
 -0.07003949582576752,
 0.1282544583082199,
 -0.03325416147708893,
 -0.0595649890601635,
 0.08209779858589172,
 0.07104679942131042,
 0.00959720928221941,
 0.045826032757759094,
 -0.04572641849517822,
 -0.03194420784711838,
 0.037650059908628464,
 0.017561981454491615,
 -0.012382633984088898,
 0.0180332213640213,
 0.026519065722823

In [29]:
print( "Vector length:", len(vector))

Vector length: 384


In [30]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [31]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [32]:
from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY

pinc = Pinecone(api_key=pinecone_api_key)

In [35]:
print(pinc)

<pinecone.pinecone.Pinecone object at 0x000001E35CF48C20>


In [40]:
from pinecone import ServerlessSpec 
index_name = "medi-chat"
if not pinc.has_index(index_name):
    pinc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embeddings
        metric="cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pinc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

  return forward_call(*args, **kwargs)


In [None]:
# Load Existing index
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

# Add more data to the existing Pinecone index

In [None]:
dswith = Document(
    page_content="Diabetes is a chronic condition that occurs when the body cannot properly process food for use as energy. This leads to high blood sugar levels, which can cause serious health problems over time.",      metadata={"source": "Google Health"}
)

In [30]:
docsearch.add_documents(documents=[dswith])

['48ace028-8de4-4429-9060-8282e3a47d6f']

In [31]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [32]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='c00728ce-9039-438f-9468-58b260607964', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='26e60290-18d0-4eda-9507-0ca317137315', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='4cc37eb3-2ba4-429f-9971-d7a40dfb931d', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged wi

In [33]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model="gpt-4o")

In [34]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [35]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [36]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [37]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of chemicals from the pituitary gland, leading to increased growth in bones and soft tissues. It occurs after bone growth has stopped. When this abnormality happens before growth stops, it results in gigantism, characterized by unusual height.


In [38]:
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the skin's pores become clogged with oil, dead skin cells, and bacteria. The medical term for common acne is acne vulgaris.


In [39]:
response = rag_chain.invoke({"input": "what is the Treatment of Acne?"})
print(response["answer"])

The treatment of acne depends on its severity. For mild noninflammatory acne, topical treatments such as tretinoin, benzoyl peroxide, adapalene, or salicylic acid are recommended. For inflammatory acne, additional methods like topical antibiotics may be used, while severe cases may require treatments like isotretinoin.
