### Environment setup

conda config --add channels conda-forge

conda create -n rag_chatbot2 python pandas nltk numpy jq "langchain<1.0" langchain-community chromadb langchain-chroma langchain-ollama langchain-openai streamlit

conda activate rag_chatbot2

pip install grobid_tei_xml rank_bm25

In [19]:
from langchain_community.document_loaders import JSONLoader, CSVLoader
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

import pandas as pd
from langchain.chains import RetrievalQA

from langchain_ollama import OllamaLLM
import os
from langchain_core.documents import Document
import streamlit as st
import grobid_tei_xml
import re
from pprint import pprint
from tqdm import tqdm


In [20]:
# Please read https://python.langchain.com/docs/how_to/document_loader_json/
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["source"] = record.get("source")
    metadata["title"] = record.get("title")
    metadata["url"] = record.get("url")
    metadata["OPID"] = record.get("OPID")
    metadata["doc_id"] = record.get("doc_id")
    metadata["word_count"] = record.get("word_count")

    return metadata

In [21]:
def load_jsonl_data(data_path):
    loader  = JSONLoader(
        file_path= data_path,
        jq_schema= ".",
        content_key="guid_text",
        json_lines=True,
        metadata_func=metadata_func
    )

    guidelines_data = loader.load()

    return guidelines_data

In [22]:
geriatric_care_dir = "../research-papers/Keyi/results/"
print("Loading the clinical guidelines...")
#guidelines_doc_data = load_jsonl_data(data_path="../results/relevant_clin_guid_data_Oct28.jsonl")
guidelines_doc_data= []
# Directory of adult care guidelines

OPID_counter = 50000

for filename in os.listdir(geriatric_care_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(geriatric_care_dir, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            doc_extract = grobid_tei_xml.parse_document_xml(file.read())
            text_parts = []
            try:
                if doc_extract.abstract:
                    text_parts.append(doc_extract.abstract)
            except AttributeError:
                print("Astract missing")
            try:
                if doc_extract.body:
                    text_parts.append(doc_extract.body) # This might include all text within the body
            except AttributeError:
                print("Body missing", filename)               

            whole_text= "\n\n".join(text_parts)

            doc = Document(page_content=whole_text, metadata={"source": filename, "OPID": OPID_counter})
            OPID_counter = OPID_counter + 1
            guidelines_doc_data.append(doc)
    
    if filename.endswith(".txt"):
        file_path = os.path.join(geriatric_care_dir, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
            text1 = re.sub(r'={2,}', '\n\n', text)
            doc = Document(page_content=text1, metadata={"source": filename, "OPID": OPID_counter})
            OPID_counter = OPID_counter + 1
            guidelines_doc_data.append(doc)

Loading the clinical guidelines...


### Reading the guidelines data complete. Prompting expts

In [23]:
print(len(guidelines_doc_data), "documents loaded.")

12 documents loaded.


In [24]:
print(len(guidelines_doc_data[0].page_content))  # Display the first 500 characters of the first document

72744


In [25]:
target_pop_extract_prompt1 = """Extract and return only the exact text from the guideline that defines the target population / intended users / applicable patients.

What counts as “target population” text
Include sentences/clauses that describe:

disease/condition and clinical setting (e.g., adjuvant, metastatic, primary care)
patient characteristics (age, sex, risk group, stage/grade, operability/resectability, performance status)
inclusion/exclusion statements (e.g., “does not apply to…”, “outside the scope…”)
“intended for”, “applies to”, “scope”, “population”, “patients with…”
What NOT to include

recommendations, interventions, outcomes, evidence grading, or implementation details unless they are part of the population definition.

Output rules

Return verbatim excerpts copied from the guideline.
If the guideline has multiple relevant excerpts, return all of them.
Preserve original wording; do not paraphrase.
If no explicit target population is stated anywhere, output exactly: Not specified

I have provided some examples of TARGET POPULATION below:

1. Adult patients with cutaneous or mucosal melanoma with high risk of recurrence who are rendered disease-free following resection (including resection of all locoregional or distant metastases, if present). Patients with unresected primary disease or metastases fall outside the scope of this document.'
2.These recommendations apply to patients with stage I, II, III, or resectable IV melanoma who are clinically disease-free after treatment with curative intent. Pathological staging is according to the 8 th edition AJCC staging system (Appendix 1).'

Clinical Guideline

Source: {source}

Guideline Text: {guideline_text}
"""

In [29]:
incl_exc_criteria_prompt = """
Task: Extract the patient selection criteria (inclusion and exclusion) stated in the provided perioperative care clinical guideline.

Definitions
- Inclusion criteria: explicit characteristics required for the guideline to apply (e.g., age/sex, diagnosis, disease severity/stage, perioperative setting, procedure type, risk group, timing, geography/care setting).
- Exclusion criteria: explicit characteristics or situations where the guideline does NOT apply (e.g., “not intended for…”, “exclude…”, “does not apply to…”, contraindications, special populations outside scope).

What to extract
- Copy ONLY guideline text that clearly states inclusion and/or exclusion criteria (verbatim excerpts).
- Include scope statements that function as inclusion/exclusion (e.g., “applies to…”, “intended for…”, “this guideline excludes…”).
- Do NOT include recommendations, treatment steps, outcomes, rationale, evidence grading, or background unless they are part of the criteria statement.

Output format

Return a bulleted list exactly in this schema:
Inclusion criteria: 
"<verbatim excerpt 1>"
"<verbatim excerpt 2>"
, ...] 
OR "Not specified"

Exclusion criteria: 
"<verbatim excerpt 1>"
"<verbatim excerpt 2>"
, ...] 
OR "Not specified"

Rules
- Use verbatim text from the guideline; do not paraphrase.
- If multiple relevant excerpts exist, include all of them as separate list items.
- If the guideline provides inclusion but not exclusion (or vice versa), set the missing field to "Not specified".
- If neither is explicitly stated, set both fields to "Not specified".

Source: {source}

Guideline Text:
{guideline_text}
"""

In [30]:
# Invoke both prompts with the local Ollama model and store responses
def _extract_text(resp):
    # resilient extraction for different return types
    try:
        if resp is None:
            return ""
        if isinstance(resp, str):
            return resp
        if isinstance(resp, dict):
            return resp.get("result") or resp.get("text") or str(resp)
        # langchain LLM result objects
        if hasattr(resp, "generations"):
            gens = resp.generations
            if gens and gens[0] and hasattr(gens[0][0], "text"):
                return gens[0][0].text
        if hasattr(resp, "text"):
            return resp.text
    except Exception:
        pass
    return str(resp)

In [31]:
target_pop_prompt_response=  []

for doc in tqdm(guidelines_doc_data):
    doc_source = doc.metadata.get("source", "Unknown Source")
    doc_opid = doc.metadata.get("OPID", "Unknown Id")
    doc_text = doc.page_content

    target_prompt = target_pop_extract_prompt1.format(
        source= doc_source,
        guideline_text= doc_text
    )

    incl_exc_prompt = incl_exc_criteria_prompt.format(
        source= doc_source,
        guideline_text= doc_text
    )

    llm_local = OllamaLLM(model="gemma3n:e4b")
    resp = llm_local.invoke(input=target_prompt)
    target_ans = _extract_text(resp).strip()

    resp2 = llm_local.invoke(input=incl_exc_prompt)
    incl_exc_ans = _extract_text(resp2).strip()

    # store structured responses
    target_pop_prompt_response.append({
        "source": doc_source,
        "OPID": doc_opid,
        "target_population": target_ans,
        "inclusion_exclusion_criteria": incl_exc_ans
    })

100%|██████████| 12/12 [24:11<00:00, 120.93s/it]


In [32]:
# save to a CSV file
df_responses = pd.DataFrame(target_pop_prompt_response)
df_responses.to_csv("geriatric_periop_care_guidelines_extracted_target_pop_incl_exc_criteria_Dec19.csv", index=False)

In [5]:
import json
from datetime import datetime, date
from langchain_core.documents import Document  # or: from langchain.schema import Document

def json_safe(obj):
    # Recursively make metadata JSON-serializable
    if isinstance(obj, (str, int, float, bool)) or obj is None:
        return obj
    if isinstance(obj, (list, tuple)):
        return [json_safe(x) for x in obj]
    if isinstance(obj, dict):
        return {str(k): json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    # Fallback: string representation
    return str(obj)

def save_docs_json(docs, ques_id):
    serializable = []
    for rank, d in enumerate(docs, start=1):
        # Optionally keep retrieval info
        md = dict(d.metadata or {})
        md.setdefault("retrieval_rank", rank)
        md.setdefault("ques_id", ques_id)
        serializable.append({
        "page_content": d.page_content,
        "metadata": json_safe(md),
        })
    return serializable

In [None]:
if __name__ == "__main__":
    retrieval_qa = create_deepseek_rag_qa()

    qa_bank= pd.read_csv("../data/perioperative_questions_Nov2025.csv")

    resp_id_list, resp_str_list, source_trace_list= [], [], []

    for row_id in tqdm(range(qa_bank.shape[0])):
        ques_txt = qa_bank.iloc[row_id, 3] + " Please explain your answer."
        source=qa_bank.iloc[row_id, 1]
        answer = retrieval_qa.invoke(input= ques_txt)
        
        resp_id_list.append(qa_bank.iloc[row_id, 0])
        resp_str_list.append(answer['result'])
        source_trace_list.append(save_docs_json(answer['source_documents'], ques_id=qa_bank.iloc[row_id, 0]))
        #print(answer['result'])
        #print(len(answer['source_documents']))
        

  embeddings = OllamaEmbeddings(model="qwen3-embedding:latest")


Using the precomputed embedding vector store.


100%|██████████| 18/18 [05:29<00:00, 18.29s/it]


In [11]:
ger_rag_resp_df= pd.DataFrame({'ques_id': resp_id_list,  'rag_response': resp_str_list, 'rag_source_trace_top10': source_trace_list})
ger_rag_resp_df.head()

Unnamed: 0,ques_id,rag_response,rag_source_trace_top10
0,1,"Based on the provided context, the recommended...",[{'page_content': 'Intravenous fluids A small ...
1,2,"Based on the provided context, cardiac output ...",[{'page_content': 'Cardiac output monitoring O...
2,3,"Based on the provided context, the NICE guidel...",[{'page_content': 'Blood glucose control 1.4.6...
3,4,"Based on the provided context, the primary mod...","[{'page_content': 'In general, the administrat..."
4,5,"Yes, the WHO surgical safety checklist should ...",[{'page_content': 'Surgical safety checklists ...


In [12]:
qa_bank.head()

Unnamed: 0,ques_id,source,creator,question,creator_response,explanation_for_clinicians
0,1,NICE guideline NG180,gpt5,"For an adult undergoing elective surgery, whic...",Intravenous crystalloid should be considered f...,The guideline recommends using intravenous cry...
1,2,NICE guideline NG180,gpt5,"In adults having major or complex surgery, whe...",Consider cardiac output monitoring in people h...,Cardiac output monitoring can help guide intra...
2,3,NICE guideline NG180,gpt5,For a person with type 2 diabetes undergoing s...,No. Do not use glucose-lowering medicines to a...,Very tight perioperative glucose targets incre...
3,4,NICE guideline NG180,gpt5,What modality can be used intraoperatively to ...,Cardiac output monitoring.,"In major, complex, or high-risk surgery, consi..."
4,5,NICE guideline NG180,gpt5,Should the WHO surgical safety checklist be co...,Yes. Ensure the WHO surgical safety checklist ...,The checklist is intended for universal use ac...


In [13]:
qa_bank_with_response= pd.merge(qa_bank, ger_rag_resp_df, on='ques_id')
qa_bank_with_response.head()

Unnamed: 0,ques_id,source,creator,question,creator_response,explanation_for_clinicians,rag_response,rag_source_trace_top10
0,1,NICE guideline NG180,gpt5,"For an adult undergoing elective surgery, whic...",Intravenous crystalloid should be considered f...,The guideline recommends using intravenous cry...,"Based on the provided context, the recommended...",[{'page_content': 'Intravenous fluids A small ...
1,2,NICE guideline NG180,gpt5,"In adults having major or complex surgery, whe...",Consider cardiac output monitoring in people h...,Cardiac output monitoring can help guide intra...,"Based on the provided context, cardiac output ...",[{'page_content': 'Cardiac output monitoring O...
2,3,NICE guideline NG180,gpt5,For a person with type 2 diabetes undergoing s...,No. Do not use glucose-lowering medicines to a...,Very tight perioperative glucose targets incre...,"Based on the provided context, the NICE guidel...",[{'page_content': 'Blood glucose control 1.4.6...
3,4,NICE guideline NG180,gpt5,What modality can be used intraoperatively to ...,Cardiac output monitoring.,"In major, complex, or high-risk surgery, consi...","Based on the provided context, the primary mod...","[{'page_content': 'In general, the administrat..."
4,5,NICE guideline NG180,gpt5,Should the WHO surgical safety checklist be co...,Yes. Ensure the WHO surgical safety checklist ...,The checklist is intended for universal use ac...,"Yes, the WHO surgical safety checklist should ...",[{'page_content': 'Surgical safety checklists ...


In [14]:
qa_bank_with_response.to_csv("../results/ger_rag_response_perioperative_care_Nov26.csv", index=False)

In [15]:
with open("../results/source_docs_perioperative_questions_Nov26.jsonl", "w", encoding="utf-8") as f:
    for rec in source_trace_list:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

In [16]:
pprint(resp_str_list)

['Based on the provided context, the recommended type of intravenous fluid for '
 'intraoperative maintenance in adults undergoing elective surgery is '
 '**crystalloid**.\n'
 '\n'
 'The evidence and guidelines within the context indicate:\n'
 '\n'
 '1.  A small amount of evidence suggests a possible reduction in mortality '
 'with intravenous crystalloids compared to colloids.\n'
 '2.  While crystalloids increase the risk of nausea and vomiting, the '
 'committee concluded that crystalloids should be considered for '
 'intraoperative fluid maintenance.\n'
 '3.  Recommendation 1.4.3 specifically states: "Consider using intravenous '
 'crystalloid for intraoperative fluid maintenance."\n'
 '\n'
 'Therefore, the guideline advises considering the use of intravenous '
 'crystalloids for this purpose.',
 'Based on the provided context, cardiac output monitoring should be '
 'considered in adults having **major or complex surgery or high-risk '
 'surgery** (as stated in section 1.4.5 of the 

In [17]:
from pprint import pprint

pprint(source_trace_list)

[[{'metadata': {'OPID': 50006,
                'ques_id': '1',
                'retrieval_rank': 1,
                'source': 'perioperative-care-in-adults-pdf-66142014963397.grobid.tei.xml'},
   'page_content': 'Intravenous fluids A small amount of evidence suggested a '
                   'possible reduction in mortality when intravenous '
                   'crystalloid, rather than colloid, is used for '
                   'intraoperative fluid management. However, there was also '
                   'evidence showing that crystalloids resulted in a '
                   'clinically important increase in nausea and vomiting. The '
                   'committee were aware that crystalloid use has become more '
                   'common after reports of increased risks of acute kidney '
                   'injury, coagulopathy and mortality with colloid. They also '
                   'noted that crystalloid is less expensive than colloid. '
                   'They concluded that cr