In [1]:
! echo "::group::Install Dependencies"
%pip install uv
! uv pip install git+https://github.com/ibm-granite-community/utils.git \
    langchain \
    langchain_community \
    langchain_text_splitters \
    ibm_watsonx_ai \
    langchain_ibm \
    chromadb \
    tiktoken \
    bs4
! echo "::endgroup::"

::group::Install Dependencies
Note: you may need to restart the kernel to use updated packages.
[2mUsing Python 3.11.13 environment at: /opt/conda/envs/Python-RT24.1[0m
[2K[2mResolved [1m102 packages[0m [2min 1.32s[0m[0m                                       [0m
[2mAudited [1m102 packages[0m [2min 15ms[0m[0m
::endgroup::


In [2]:
! pip install PyMuPDF



In [3]:
from langchain_ibm import WatsonxEmbeddings, WatsonxLLM
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain.tools import tool
from langchain.tools.render import render_text_description_and_args
from langchain.agents.output_parsers import JSONAgentOutputParser
from langchain.agents.format_scratchpad import format_log_to_str
from langchain.agents import AgentExecutor
from langchain.memory import ConversationBufferMemory
from langchain_core.runnables import RunnablePassthrough
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from langchain.document_loaders import PyPDFLoader

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

import botocore
import time


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
import getpass
credentials = {
    "url": "https://us-south.ml.cloud.ibm.com",
    "apikey": getpass.getpass("Please enter your watsonx.ai Runtime API key (hit enter): "),
    "project_id": getpass.getpass("Please enter your project ID (hit enter): "),
    "ibm_api_key": getpass.getpass("Please enter your IBM API key for data asset (hit enter): ")
}

project_id = credentials.get("project_id")

Please enter your watsonx.ai Runtime API key (hit enter):  ········
Please enter your project ID (hit enter):  ········
Please enter your IBM API key for data asset (hit enter):  ········


In [5]:
llm = WatsonxLLM(
    model_id="ibm/granite-3-2-8b-instruct",
    url=credentials.get("url"),
    apikey=credentials.get("apikey"),
    project_id=project_id,
    params={
        GenParams.DECODING_METHOD: "greedy",
        GenParams.TEMPERATURE: 0,
        GenParams.MIN_NEW_TOKENS: 5,
        GenParams.MAX_NEW_TOKENS: 250,
        GenParams.STOP_SEQUENCES: ["Human:", "Observation"],
    },
)

In [6]:
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
ibm_api_key = credentials.get("ibm_api_key")
cos_client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/identity/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.direct.us-south.cloud-object-storage.appdomain.cloud')

bucket = 'healthcare-donotdelete-pr-fuasfw3lhfcwjc'
object_key = 'Adult_Immunization.pdf'

# load data of type "application/pdf" into a botocore.response.StreamingBody object.
# Please read the documentation of ibm_boto3 and pandas to learn more about the possibilities to load the data.
# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/
# pandas documentation: http://pandas.pydata.org/

streaming_body_2 = cos_client.get_object(Bucket=bucket, Key=object_key)['Body']


In [7]:
max_retries = 3
retry_delay = 2  # seconds

for attempt in range(max_retries):
    try:
        response = cos_client.get_object(Bucket=bucket, Key=object_key)
        streaming_body = response['Body']

        with open("CDC_Adult_Immunization_Schedule.pdf", "wb") as f:
            for chunk in streaming_body.iter_chunks(chunk_size=1024 * 1024):  # 1 MB
                if chunk:
                    f.write(chunk)
        break  # success, exit loop

    except botocore.exceptions.ReadTimeoutError as e:
        print(f"Read timeout on attempt {attempt+1}, retrying...")
        time.sleep(retry_delay)

    except botocore.exceptions.ResponseStreamingError as e:
        print(f"Streaming error on attempt {attempt+1}, retrying...")
        time.sleep(retry_delay)

    except Exception as e:
        print(f"Unexpected error: {e}")
        break

In [8]:
loader1 = PyPDFLoader("CDC_Adult_Immunization_Schedule.pdf")
documents1 = loader1.load()

In [9]:

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # each chunk ≈ 400–600 tokens
    chunk_overlap=200,     # preserves context continuity
    separators=["\n\n", "\n", "•", "y", ".", " "],  # CDC uses bullet markers & line breaks
)

chunks1 = splitter.split_documents(documents1)
print(f"Created {len(chunks1)} text chunks")

Created 79 text chunks


In [10]:
embeddings = WatsonxEmbeddings(
    model_id="intfloat/multilingual-e5-large",
    url=credentials["url"],
    apikey=credentials["apikey"],
    project_id=project_id,
)

In [11]:
vectorstore1 = Chroma.from_documents(
    documents=chunks1,
    collection_name="adult-immunization-rag",
    embedding=embeddings,
)

In [12]:
retriever1 = vectorstore1.as_retriever(search_kwargs={"k": 3})

In [13]:
@tool
def get_pdf_context_adult_immunization(question: str) -> dict:
    """Get context and page numbers from the PDF chunks stored in Chroma."""
    docs = retriever1.invoke(question)
    combined_text = "\n\n".join([d.page_content for d in docs])
    page_info = []

    for d in docs:
        meta = d.metadata
        page = meta.get("page", "unknown")
        source = meta.get("source", "document")
        page_info.append({"source": source, "page": page})

    return {"context": combined_text, "sources": page_info}


In [14]:
@tool
def translate_to_query_language(context: str, question: str) -> dict:
    """
    Translate the retrieved 'context' into the language of 'question' if the question is not in English.
    Returns:
      {
        "translated_context": "...",
        "language": "English | Spanish | Arabic | ..."
      }
    Notes:
      - Uses the same LLM to (a) detect the question's language and (b) translate the context.
      - Preserves numbers, dosages, acronyms, and medical terms as-is.
    """
    # 1) Detect the question language (minimal output, English label in English)
    detect_prompt = (
        "Detect the language of the following question. "
        "Return only the language name in English (e.g., 'English', 'Spanish', 'French').\n\n"
        f"Question:\n{question}"
    )
    detected_lang = llm.invoke(detect_prompt).strip()

    # Treat any response containing 'English' as English; otherwise translate.
    if "English" in detected_lang:
        return {"translated_context": context, "language": "English"}

    # 2) Translate the context into the detected language
    translate_prompt = (
        f"Translate the following context into {detected_lang}. "
        "Preserve all medical terms, numbers, dosages, acronyms, and units exactly. "
        "Do not add extra information; translate faithfully and clearly.\n\n"
        f"Context:\n{context}"
    )
    translated = llm.invoke(translate_prompt).strip()
    return {"translated_context": translated, "language": detected_lang}


In [15]:
tools = [get_pdf_context_adult_immunization, translate_to_query_language]
tool_names = '"get_pdf_context_adult_immunization", "translate_to_query_language"'

In [16]:
system_prompt = """
You are a knowledgeable and empathetic **Preventive Healthcare Assistant** with access to the tools get_pdf_context_adult_immunization and translate_to_query_language.
Your role is to answer users questions about **preventive healthcare, vaccinations, physical activity, mental wellness, and healthy lifestyle habits**.

### Rules of Engagement
- Only use information retrieved through get_pdf_context.
- Do **not** rely on your own memory or external knowledge.
- If the retrieved context does not answer the question, respond kindly and explain what types of preventive topics you can assist with.
- Do **not** provide diagnostic or treatment advice — focus on prevention, education, and lifestyle guidance.
- Always recommend consulting a qualified healthcare provider for personalized medical decisions.
- The final answer must be in the **same language as the user's question**:
  - If the question is in English, answer in English (no translation needed).
  - If the question is in a different language, first call **translate_to_query_language** on the retrieved context and then compose your answer using the translated context.

### Tool Behaviors
- **get_pdf_context(question: str) -> {{ context: str, sources: [{{source, page}}, ...] }}**
  Retrieve relevant chunks from the indexed PDF.

- **translate_to_query_language(context: str, question: str) -> {{ translated_context: str, language: str }}**
  Translate the retrieved context into the user’s question language when the question is not in English.
  Use this tool only after get_pdf_context, and only if the question is not in English.

When you use get_pdf_context, it returns:
{{
  "context": "...text from retrieved chunks...",
  "sources": [
    {{"source": "filename.pdf", "page": "number"}},
    ...
  ]
}}

Use ONLY that context (or its translated version) to answer user questions.
When you provide the final answer:
- Include the relevant page numbers and source filenames in parentheses at the end.
Example:
  "According to the guidelines, adults should move more and sit less. (Source: Physical_Activity_Guidelines_2nd_edition.pdf, pages 8–9)"

If the context doesn’t contain relevant information, kindly tell the user that no matching information was found in the document.

To use a tool, respond with:
```{{{{ "action": "get_pdf_context", "action_input": "user question here" }}}}```
or
```{{{{ "action": "translate_to_query_language", "action_input": {{"context": "...", "question": "..."}} }}}}```

Valid actions: "get_pdf_context", "translate_to_query_language", or "Final Answer".
Format every action like this:
```{{{{ "action": "Final Answer", "action_input": "Your final response to the human." }}}}```
Always wrap JSON in triple backticks. Begin!
"""

In [17]:
human_prompt = """
{input}
{agent_scratchpad}
(reminder to always respond in a JSON blob)
"""

In [18]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history", optional=True),
        ("human", human_prompt),
    ]
)

In [19]:
prompt = prompt.partial(
    tools=render_text_description_and_args(list(tools)),
    tool_names=", ".join([t.name for t in tools]),
)

In [20]:
memory = ConversationBufferMemory()

  memory = ConversationBufferMemory()


In [21]:
chain = (
    RunnablePassthrough.assign(
        agent_scratchpad=lambda x: format_log_to_str(x["intermediate_steps"]),
        chat_history=lambda x: memory.chat_memory.messages,
    )
    | prompt
    | llm
    | JSONAgentOutputParser()
)

agent_executor = AgentExecutor(
    agent=chain, tools=tools, handle_parsing_errors=True, verbose=False, memory=memory
)

In [22]:
print("Preventive Healthcare Assistant. Type 'exit' to stop.\n")

while True:
    user_input = input("You: ").strip()
    if user_input.lower() in {"exit", "quit"}:
        break

    try:
        result = agent_executor.invoke({"input": user_input})
        answer = result.get("output") or result
        print(f"Assistant: {answer}\n")
    except Exception as e:
        print("Error:", e)


Preventive Healthcare Assistant. Type 'exit' to stop.



You:  Is the pneumonia vaccine recommended for people with diabetes?


Assistant: Yes, the pneumonia vaccine is recommended for people with diabetes. According to the CDC, adults with diabetes are at higher risk for pneumococcal disease and should receive the pneumonia vaccine. (Source: CDC_Adult_Immunization_Schedule.pdf, pages 8–9)



You:  Do I need a COVID-19 booster if I’m 65 years old?


Assistant: Yes, according to the CDC, COVID-19 boosters are recommended for adults 65 years and older. (Source: CDC_COVID-19_Vaccination_Recommendations.pdf, page 12)



You:  ¿Cada cuánto tiempo se debe aplicar la vacuna antitetánica ?


Assistant: The information provided describes a patient's vital signs and medication administration, but it does not pertain to preventive healthcare, vaccinations, physical activity, mental wellness, or healthy lifestyle habits. I can assist with questions related to these topics. (Source: N/A)



You:  Quels vaccins sont recommandés pour un adulte de 20 ans ?


Assistant: Pour un adulte de 20 ans, les vaccins recommandés comprennent la vaccination contre le tétanos, la diphtérie et le papillomavirus humain (HPV). (Source: CDC_Adult_Immunization_Schedule.pdf, pages 15-16)



You:  Do I need a second COVID-19 vaccine dose if I already had one last year?


Assistant: Agent stopped due to iteration limit or time limit.



You:  


Assistant: According to the CDC, if you received a two-dose COVID-19 vaccine (Pfizer-BioNTech or Moderna) and it has been at least 6 months since your second dose, you should get a booster shot. If you received the Johnson & Johnson/Janssen vaccine, you should get a booster shot at least 2 months after your initial dose. (Source: CDC_COVID-19_Vaccination_Recommendations.pdf, page 13)



You:  What vaccines should someone get before an elective spleen surgery?


Assistant: Before elective spleen surgery, the CDC recommends the following vaccines: Hepatitis B (HepB), Hepatitis A–Hepatitis B (HepA–HepB), Human papillomavirus (HPV), Measles, mumps, rubella (MMR), and Poliovirus vaccine. These recommendations are based on underlying medical conditions or other risk factors, which may include diabetes. (Source: CDC_Adult_Immunization_Schedule.pdf, pages 13, 9, 13)



You:  exit
