In [1]:
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from typing import List, Any, Dict, Tuple
import uuid

In [2]:
import subprocess
import socket

# Get Windows host IP from WSL
def get_windows_host_ip():
    # Method 1: Try WSL2 bridge interface (default route)
    try:
        result = subprocess.run(
            ["ip", "route", "show", "default"],
            capture_output=True,
            text=True
        )
        for line in result.stdout.split('\n'):
            if 'default' in line and 'via' in line:
                parts = line.split()
                via_index = parts.index('via')
                ip = parts[via_index + 1]
                # Test if Ollama is accessible at this IP
                try:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    sock.settimeout(1)
                    sock.connect((ip, 11434))
                    sock.close()
                    print(f"✓ Found Ollama at {ip} (WSL default gateway)")
                    return ip
                except:
                    print(f"✗ Port 11434 not accessible at {ip}")
    except Exception as e:
        print(f"✗ Method 1 failed: {e}")
    
    # Method 2: Try /etc/resolv.conf nameserver
    try:
        result = subprocess.run(
            ["cat", "/etc/resolv.conf"],
            capture_output=True,
            text=True
        )
        for line in result.stdout.split('\n'):
            if 'nameserver' in line:
                ip = line.split()[1]
                # Test if Ollama is accessible at this IP
                try:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    sock.settimeout(1)
                    sock.connect((ip, 11434))
                    sock.close()
                    print(f"✓ Found Ollama at {ip} (resolv.conf)")
                    return ip
                except:
                    print(f"✗ Port 11434 not accessible at {ip}")
    except Exception as e:
        print(f"✗ Method 2 failed: {e}")
    
    # Fallback to localhost
    print("⚠ Falling back to localhost")
    return "127.0.0.1"
    
WINDOWS_HOST_IP = get_windows_host_ip()

✓ Found Ollama at 172.17.96.1 (WSL default gateway)


In [3]:
EMBEDDING = OllamaEmbeddings(base_url=f"http://{WINDOWS_HOST_IP}:11434",
                             model="embeddinggemma:300m")
vector_store = Chroma(
    collection_name="eda-hospital-zh",
    embedding_function=EMBEDDING,
    persist_directory="./vectordb/eda-hospital-zh",
    create_collection_if_not_exists=True,
)

In [4]:
def markdown_to_document(file_path: str, chunk_size: int, chunk_overlap: int) -> List[Document]:
    # Load metadata
    metadata_loader = UnstructuredMarkdownLoader(file_path, mode="elements")
    metadata_docs = metadata_loader.load()
    
    # Load content
    with open(file_path) as f:
        file = f.read()
    text_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.MARKDOWN, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.create_documents([file], [metadata_docs[0].metadata])

    
    return documents

def combine_list_metadata(document: Document, key: str):
    langs = document.metadata.get(key)
    if langs is None:
        document.metadata[key] = ""
    elif isinstance(langs, str):
        pass
    else:
        try:
            document.metadata[key] = ",".join(map(str, langs))
        except TypeError:
            import json
            document.metadata[key] = json.dumps(langs)

In [5]:
documents = []

for folder in os.listdir("./assets/output-zh"):
    for file in os.listdir(f"./assets/output-zh/{folder}"):
        if file.endswith(".md"):
            file_path = f"./assets/output-zh/{folder}/{file}"
            document = markdown_to_document(file_path=file_path,
                                 chunk_size=2048,
                                 chunk_overlap=200)
            documents.extend(document)

In [6]:
for document in documents:
    combine_list_metadata(document=document, key="languages")

In [7]:
ids = [str(uuid.uuid4()) for _ in documents]
vector_store.add_documents(documents=documents, ids=ids)

['f3fe1be7-b211-45f3-81e1-ca6bac9fa0f5',
 '8ac3c2c5-e680-47b9-84af-ebcb9d65235f',
 'bfa3f82e-193f-4ab1-972d-8400b8e855f2',
 'fe1da42e-acaa-43b3-bc94-ec52ccf0d4f9',
 'e86ca32f-0539-457c-8854-018cebd24931',
 'fdc29b9a-cfbe-4d70-921b-5fc52920000a',
 '704a98aa-a8f3-424b-9c93-c64fa1006e4f',
 '1dc54bd1-107f-4943-b61b-dcdc50309f78',
 '335dbab1-3387-4988-bbae-79a1531818cc',
 '1987e4ef-c315-4bd8-8e4f-996eb822ed91',
 '56653fb4-806a-4dd8-960d-05f02dac2254',
 'b20ad73c-6ba2-4b9c-a8b2-1687360d885f',
 'e5fea983-1f24-45ea-a881-871a028e1e41',
 '7367395c-edc4-486b-8414-137244be2797',
 '2c408df1-317d-4c0f-b2e5-1a2421070562',
 '6de36821-6680-4604-8371-27a5c03fdf46',
 'ac3064c9-6fb8-4c6c-abbf-ddfd6bb69b41',
 'bc6ec2a9-083d-46c9-aeda-5f98f928dae7',
 'd741e81f-0208-499d-a3c2-153d1b43294c',
 'd69e7313-aa53-4797-ad04-5b5cd5058661',
 '5c975ec3-1a94-4e65-b748-b65eab037c54',
 '2719722a-f688-42d6-9216-3f13d7bc68c2',
 '053914e3-ec9c-402c-94b2-0062a36eeb28',
 '742dbda3-2273-496a-a835-3e4805bf7f80',
 '62f716a7-aed8-

In [8]:
vector_store.get().keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])

In [9]:
len(vector_store.get()["documents"])

51

# Search Database

In [10]:
retriever = vector_store.as_retriever(search_type="mmr", k=5)
docs = retriever.invoke("Kidney transplant")
for doc in docs:
    print(doc.metadata["filename"])

HA-7-0035(1)_急性腎盂腎炎注意事項_0.md
HA-7-0013(1)_低血鉀注意事項_0.md
HA-7-0014(2)低血糖照護_0.md
HA-7-0003(1)_八字肩帶使用注意事項_0.md
