In [1]:
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from typing import List
import uuid
import subprocess
import socket
import json
import re

In [2]:
# Get Windows host IP from WSL
def _tcp_port_open(host: str, port: int, timeout: float) -> bool:
    """Return True when a TCP connection to ``host:port`` succeeds within ``timeout`` seconds."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(timeout)
        sock.connect((host, port))
        return True
    except OSError:
        # Refused, timed out, unreachable or not a usable IPv4 address:
        # all of these simply mean "not reachable here".
        return False
    finally:
        # Close on every path so a failed probe does not leak the descriptor.
        sock.close()


def _default_gateway_candidates() -> list:
    """Return the gateway addresses reported by ``ip route show default``."""
    result = subprocess.run(
        ["ip", "route", "show", "default"],
        capture_output=True,
        text=True,
    )
    candidates = []
    for line in result.stdout.splitlines():
        parts = line.split()
        if "default" in parts and "via" in parts:
            via_index = parts.index("via")
            # Ignore a truncated line that ends right after "via".
            if via_index + 1 < len(parts):
                candidates.append(parts[via_index + 1])
    return candidates


def _resolv_conf_candidates(path: str) -> list:
    """Return the addresses found on ``nameserver`` lines of the file at ``path``."""
    with open(path, encoding="utf-8", errors="replace") as handle:
        lines = handle.read().splitlines()
    candidates = []
    for line in lines:
        parts = line.split()
        # Require the directive itself followed by an address, so comments and
        # incomplete lines are skipped instead of aborting the whole scan.
        if len(parts) >= 2 and parts[0] == "nameserver":
            candidates.append(parts[1])
    return candidates


def get_windows_host_ip(port: int = 11434, timeout: float = 1.0,
                        resolv_conf_path: str = "/etc/resolv.conf") -> str:
    """Find an address at which the Ollama port accepts TCP connections.

    Candidates are tried in order: first the default-route gateway(s), then the
    ``nameserver`` entries of ``resolv_conf_path``. The first candidate whose
    ``port`` accepts a TCP connection is returned. Only reachability of the
    port is checked; nothing verifies that the listener is actually Ollama.

    Args:
        port: TCP port to probe on each candidate.
        timeout: Per-candidate connection timeout in seconds.
        resolv_conf_path: Resolver configuration file scanned by Method 2.

    Returns:
        The first reachable candidate address, or ``"127.0.0.1"`` when no
        candidate is reachable or both discovery methods fail.
    """
    methods = (
        ("Method 1", "WSL default gateway", _default_gateway_candidates),
        ("Method 2", "resolv.conf", lambda: _resolv_conf_candidates(resolv_conf_path)),
    )
    for label, source, find_candidates in methods:
        try:
            candidates = find_candidates()
        except Exception as e:
            # Best effort: a missing `ip` binary or unreadable file must not
            # prevent the next method (or the fallback) from being tried.
            print(f"✗ {label} failed: {e}")
            continue
        for ip in candidates:
            if _tcp_port_open(ip, port, timeout):
                print(f"✓ Found Ollama at {ip} ({source})")
                return ip
            print(f"✗ Port {port} not accessible at {ip}")

    # Fallback to localhost
    print("⚠ Falling back to localhost")
    return "127.0.0.1"

def markdown_to_document(file_path: str, chunk_size: int, chunk_overlap: int, is_execute: bool = False) -> List[Document]:
    """Split one Markdown file into chunked ``Document`` objects.

    The file is parsed once with ``UnstructuredMarkdownLoader`` (elements mode)
    only to obtain the metadata of its first element; that metadata is attached
    to every chunk. The chunk text comes from the raw file contents, split with
    a Markdown-aware ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Path of the Markdown file to read.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Chunk overlap passed to the text splitter.
        is_execute: When False the file is not touched and nothing is produced.

    Returns:
        The list of chunk documents; an empty list when ``is_execute`` is False.
    """
    if not is_execute:
        # Return an empty list rather than an implicit None so the result
        # always matches the annotated type and is safe to extend() with.
        return []

    # Load metadata
    metadata_docs = UnstructuredMarkdownLoader(file_path, mode="elements").load()
    # A file that yields no elements would otherwise raise IndexError here.
    base_metadata = metadata_docs[0].metadata if metadata_docs else {}

    # Load content; decode explicitly instead of relying on the locale default.
    with open(file_path, encoding="utf-8") as f:
        markdown_text = f.read()

    text_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.MARKDOWN,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.create_documents([markdown_text], [base_metadata])

def combine_list_metadata(document: Document, key_to_combine: str, is_execute: bool = False):
    """Flatten the metadata value stored under ``key_to_combine`` into a string, in place.

    * missing / ``None``  -> ``""``
    * already a string    -> left untouched
    * any iterable        -> its items converted with ``str`` and comma-joined
    * non-iterable value  -> its JSON encoding

    Nothing happens when ``is_execute`` is False.
    # NOTE(review): presumably needed because the vector store only accepts
    # scalar metadata values — confirm.
    """
    if not is_execute:
        return

    value = document.metadata.get(key_to_combine)
    if isinstance(value, str):
        return
    if value is None:
        document.metadata[key_to_combine] = ""
        return

    try:
        flattened = ",".join(str(item) for item in value)
    except TypeError:
        # Raised when the value cannot be iterated; fall back to JSON.
        flattened = json.dumps(value)
    document.metadata[key_to_combine] = flattened

def add_documents(vector_store: Chroma, documents: List[Document], is_execute: bool = False):
    """Add ``documents`` to ``vector_store`` under freshly generated UUID4 ids.

    Args:
        vector_store: Target Chroma store.
        documents: Documents to add; an empty list is a no-op.
        is_execute: When False nothing is written.

    Note:
        Ids are random, so calling this twice with the same documents stores
        them twice.
    """
    # An empty batch would reach the store with empty id/document lists;
    # skip it instead of forwarding a request that cannot add anything.
    if not is_execute or not documents:
        return
    ids = [str(uuid.uuid4()) for _ in documents]
    vector_store.add_documents(documents=documents, ids=ids)

In [3]:
# Resolve the Ollama host once; the embedding client below talks to this address.
WINDOWS_HOST_IP = get_windows_host_ip()

# Embedding model shared by both vector stores.
EMBEDDING = OllamaEmbeddings(
    model="embeddinggemma:300m",
    base_url=f"http://{WINDOWS_HOST_IP}:11434",
)

# Options common to both collections.
_STORE_OPTIONS = {
    "embedding_function": EMBEDDING,
    "create_collection_if_not_exists": True,
}

# One collection per language, each with its own persist directory.
vector_store_zh = Chroma(
    collection_name="eda-hospital-zh",
    persist_directory="./vectordb/eda-hospital-zh",
    **_STORE_OPTIONS,
)
vector_store_en = Chroma(
    collection_name="eda-hospital-en",
    persist_directory="./vectordb/eda-hospital-en",
    **_STORE_OPTIONS,
)

✓ Found Ollama at 172.17.96.1 (WSL default gateway)


In [4]:
# Ingest every Markdown file found one level below the English asset root.
# NOTE(review): add_documents assigns fresh random ids, so re-running this cell
# against the persisted store adds the same chunks again.
EN_MARKDOWN_ROOT = "./assets/output-en-combined"
documents = []

# sorted() keeps the ingestion order reproducible; os.listdir order is arbitrary.
for folder in sorted(os.listdir(EN_MARKDOWN_ROOT)):
    folder_path = f"{EN_MARKDOWN_ROOT}/{folder}"
    # Skip stray files at the top level: listing them would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".md"):
            continue
        chunks = markdown_to_document(file_path=f"{folder_path}/{file_name}",
                                      chunk_size=2048,
                                      chunk_overlap=200,
                                      is_execute=True)
        # Truthiness covers both "no chunks" and a None result.
        if chunks:
            documents.extend(chunks)

# Flatten the "languages" metadata of every chunk into a plain string.
for chunk in documents:
    combine_list_metadata(document=chunk, key_to_combine="languages", is_execute=True)

add_documents(vector_store=vector_store_en, documents=documents, is_execute=True)

In [5]:
# Show which top-level fields a full `get()` on the English store returns.
vector_store_en.get().keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])

In [6]:
# Count the documents currently held in the Chinese collection.
len(vector_store_zh.get()["documents"])

0

# Search Database

In [None]:
# Filename mapping used by the search cells below.
# NOTE(review): presumably keyed by Chinese filename with English filenames as
# values — confirm. Decode as UTF-8 explicitly so non-ASCII names load
# correctly regardless of the locale's default encoding.
with open("./filename_translation.json", "r", encoding="utf-8") as f:
    filename_translation = json.load(f)

In [None]:
# Query the English store and print each hit with the matching Chinese name.
docs = vector_store_en.similarity_search_with_relevance_scores(query="Nosebleed", k=5, score_threshold=0)
for hit, score in docs:
    # Only hits with a strictly positive relevance score are reported.
    if not score > 0:
        continue
    print("--Original retrieved files in English")
    filename = hit.metadata["filename"]
    print(f'{filename} | {score}')
    print("--Chinese")
    # strip trailing _<digits>.md (case-insensitive)
    base_filename = re.sub(r'(?i)(?:_\d+)?\.md$', '', filename)
    # Reverse lookup: first key whose value equals the base name, else the base name itself.
    translated = next(
        (key for key, value in filename_translation.items() if value == base_filename),
        base_filename,
    )
    print(translated)
    print("\n")

In [None]:
# Query the Chinese store and print each hit with its translated filename.
# NOTE(review): the count cell above reported 0 documents in vector_store_zh and
# no ingestion into it is visible in this notebook, so this presumably prints
# nothing until that store is populated — confirm.
docs = vector_store_zh.similarity_search_with_relevance_scores(query="流鼻血", k=5, score_threshold=0)
for hit, score in docs:
    # Only hits with a strictly positive relevance score are reported.
    if not score > 0:
        continue
    print("--Original retrieved files in Chinese")
    filename = hit.metadata["filename"]
    print(f'{filename} | {score}')
    print("--English")
    # strip trailing _<digits>.md (case-insensitive)
    base_filename = re.sub(r'(?i)(?:_\d+)?\.md$', '', filename)
    # Direct lookup by base name; falls back to the base name when unmapped.
    translated = filename_translation.get(base_filename, base_filename)
    print(translated)
    print("\n")