In [1]:
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from typing import List, Any, Dict, Tuple
import uuid

In [2]:
import subprocess
import socket

# Get Windows host IP from WSL
def get_windows_host_ip():
    # Method 1: Try WSL2 bridge interface (default route)
    try:
        result = subprocess.run(
            ["ip", "route", "show", "default"],
            capture_output=True,
            text=True
        )
        for line in result.stdout.split('\n'):
            if 'default' in line and 'via' in line:
                parts = line.split()
                via_index = parts.index('via')
                ip = parts[via_index + 1]
                # Test if Ollama is accessible at this IP
                try:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    sock.settimeout(1)
                    sock.connect((ip, 11434))
                    sock.close()
                    print(f"✓ Found Ollama at {ip} (WSL default gateway)")
                    return ip
                except:
                    print(f"✗ Port 11434 not accessible at {ip}")
    except Exception as e:
        print(f"✗ Method 1 failed: {e}")
    
    # Method 2: Try /etc/resolv.conf nameserver
    try:
        result = subprocess.run(
            ["cat", "/etc/resolv.conf"],
            capture_output=True,
            text=True
        )
        for line in result.stdout.split('\n'):
            if 'nameserver' in line:
                ip = line.split()[1]
                # Test if Ollama is accessible at this IP
                try:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    sock.settimeout(1)
                    sock.connect((ip, 11434))
                    sock.close()
                    print(f"✓ Found Ollama at {ip} (resolv.conf)")
                    return ip
                except:
                    print(f"✗ Port 11434 not accessible at {ip}")
    except Exception as e:
        print(f"✗ Method 2 failed: {e}")
    
    # Fallback to localhost
    print("⚠ Falling back to localhost")
    return "127.0.0.1"
    
WINDOWS_HOST_IP = get_windows_host_ip()

✓ Found Ollama at 172.17.96.1 (WSL default gateway)


In [3]:
EMBEDDING = OllamaEmbeddings(base_url=f"http://{WINDOWS_HOST_IP}:11434",
                             model="embeddinggemma:300m")
vector_store = Chroma(
    collection_name="eda-hospital",
    embedding_function=EMBEDDING,
    persist_directory="vectordb",
    create_collection_if_not_exists=True,
)

In [4]:
def markdown_to_document(file_path: str, chunk_size: int, chunk_overlap: int) -> List[Document]:
    # Load metadata
    metadata_loader = UnstructuredMarkdownLoader(file_path, mode="elements")
    metadata_docs = metadata_loader.load()
    
    # Load content
    with open(file_path) as f:
        file = f.read()
    text_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.MARKDOWN, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.create_documents([file], [metadata_docs[0].metadata])

    
    return documents

def combine_list_metadata(document: Document, key: str):
    langs = document.metadata.get(key)
    if langs is None:
        document.metadata[key] = ""                      # or None if you prefer
    elif isinstance(langs, str):
        # already a string, leave as-is
        pass
    else:
        # convert any iterable to comma-separated string, ensure elements are strings
        try:
            document.metadata[key] = ",".join(map(str, langs))
        except TypeError:
            # fallback: stringify the value
            import json
            document.metadata[key] = json.dumps(langs)

In [5]:
documents = []

for folder in os.listdir("./assets/output"):
    for file in os.listdir(f"./assets/output/{folder}"):
        if file.endswith("-ENG.md"):
            file_path = f"./assets/output/{folder}/{file}"
            document = markdown_to_document(file_path=file_path,
                                 chunk_size=2048,
                                 chunk_overlap=200)
            documents.extend(document)

In [29]:
for folder in os.listdir("./assets/output"):
    for file in os.listdir(f"./assets/output/{folder}"):
        if file.endswith("-ENG.md"):
            print(file)

HA-4-0009(1) Understanding Epidural Patient-Controlled Analgesia_0-ENG.md
HA-4-0010(1) Understanding Patient-Controlled Intravenous Analgesia_0-ENG.md
HA-4-0011 Invasive Examination and Treatment Anesthesia Precautions_0-ENG.md
HA-4-0005(7) Outpatient General Surgery Post-Anesthesia Care Instructions_0-ENG.md
HA-7-0033(1)_Constipation Precautions_0-ENG.md
HA-7-0041(1)_Indwelling Catheter Home Care Note_0-ENG.md
HA-7-0042(1)_Chest Contusion Precautions_0-ENG.md
HA-7-0043(1)_Chest Pain Instructions[1]_0-ENG.md
HA-7-0018(1)_Precautions for Sprains or Muscle Strains_0-ENG.md
HA-7-0012(1)_Self Eye Drop Instructions_0-ENG.md
HA-7-0017(1)_Urinary Tract Stone_0-ENG.md
HA-7-0031(1)_Seizure Precautions_0-ENG.md
HA-7-0021(1)_Children's Cough Precautions_0-ENG.md
HA-7-0036(1)_Precautions for Acute Gastroenteritis_0-ENG.md
HA-7-0014(2) Hypoglycemia Care_0-ENG.md
HA-7-0037(1)_Nosebleed Precautions_0-ENG.md
HA-7-0016(1)_Sciatica Precautions_0-ENG.md
HA-7-0030(1)_Sexual Assault Victim Discharge Care-0

In [6]:
for document in documents:
    combine_list_metadata(document=document, key="languages")

In [7]:
ids = [str(uuid.uuid4()) for _ in documents]
vector_store.add_documents(documents=documents, ids=ids)

['f562c063-c9ef-453f-b282-2ea02e80bfa9',
 'f1063983-7d2e-42e5-a84c-d84fd516716d',
 '7021bc8f-926d-41f2-9a8d-52b6b2482fa7',
 'a8723956-5e72-4e32-a5a3-1ad664ae6ab0',
 '5fdd317c-f623-4170-86d6-21e8cc6f71b5',
 'e23ff96b-51f8-4d06-8d74-a817b4c82c53',
 'f00a19d3-b425-461c-b16a-8f6de0e4d4cd',
 'be9c002b-f842-44d9-99de-18dd6e1a1c80',
 'd460c262-aafb-4818-8034-b8cbaf879e63',
 '2cc5b383-98b5-4cc5-8792-c7500dbb7a4c',
 'f615a860-2c77-4e9b-bd39-5cea0be94b30',
 'bcc7fbb0-d6fa-4f18-80f8-a4050621b0e0',
 '628073ad-e502-4699-9598-a0403b4d01c4',
 '3f78a206-d2a6-429b-a59f-8115c58f4e20',
 'e1d73500-3c02-447d-ae48-022d8ce1c612',
 '50dc9946-f034-48ca-89df-cb2d9a4d64f3',
 '430138a7-be89-4d73-aa72-b2c41b3de2e8',
 '178ca9ca-a95d-4b21-81f0-29c5a783f8c6',
 'd3bc8e71-091f-45e9-ab73-d2a1b7f0cfcc',
 '32002362-1264-47f6-82f4-00965175383e',
 '8ba5b961-030a-4e4d-b00a-9a3d5213edd0',
 'e8f14efd-f250-4dd4-867a-641d0b44552e',
 '34772ff2-1469-4f43-999b-d9f84b2a20d2',
 'ef567def-4c8d-4072-90ba-b47c23c4dadd',
 '251c3e04-ebab-

In [15]:
vector_store.get().keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])

In [25]:
len(vector_store.get()["documents"])

81

# Search Database

In [28]:
retriever = vector_store.as_retriever(search_type="mmr", k=5)
docs = retriever.invoke("Kidney transplant")
for doc in docs:
    print(doc.metadata["filename"])

HA-9-0015(2) Understanding Kidney Transplant-ENG.md
HA-9-0104(2) Post-Extracorporeal Shock Wave Lithotripsy Precautions_0-ENG.md
HA-9-0129(1) Peripheral Artery Occlusive Disease-ENG.md
HA-9-0061(3)健康檢查前注意事項-ENG.md
