In [1]:
from pathlib import Path

import pandas as pd
import chromadb
from tqdm import tqdm
from transformers import pipeline

# Initialize the client

In [2]:
# Open the on-disk Chroma store located under idx/icd11.
INDEX_DIR = "idx/icd11"
client = chromadb.PersistentClient(path=INDEX_DIR)

# Load the data

In [3]:
# Read the cleaned ICD-11 MMS table. Missing cells are replaced by empty
# strings so that the string operations on "Title" below never see NaN.
ICD_CSV = "data/ICD-11-MMS-en-cleaned.csv.zip"
icd = pd.read_csv(ICD_CSV).fillna('')
icd.head()

Unnamed: 0,Code,Title,ClassKind,DepthInKind,IsResidual,BrowserLink,isLeaf
0,,Certain infectious or parasitic diseases,chapter,1,False,https://icd.who.int/browse11/l-m/en#/http%3A%2...,False
1,,- Gastroenteritis or colitis of infectious origin,block,1,False,https://icd.who.int/browse11/l-m/en#/http%3A%2...,False
2,,- - Bacterial intestinal infections,block,2,False,https://icd.who.int/browse11/l-m/en#/http%3A%2...,False
3,1A00,- - - Cholera,category,1,False,https://icd.who.int/browse11/l-m/en#/http%3A%2...,True
4,1A01,- - - Intestinal infection due to other Vibrio,category,1,False,https://icd.who.int/browse11/l-m/en#/http%3A%2...,True


In [4]:
# Keep the non-empty titles and remove the leading "- " hierarchy markers.
# Note: lstrip('- ') strips any leading run of '-' and ' ' characters, not
# just the exact "- " prefix.
titles = [title.lstrip('- ') for title in filter(None, icd["Title"].tolist())]
titles[:5]

['Certain infectious or parasitic diseases',
 'Gastroenteritis or colitis of infectious origin',
 'Bacterial intestinal infections',
 'Cholera',
 'Intestinal infection due to other Vibrio']

In [5]:
# Size of the longest title, in characters and in whitespace-separated words.
word_counts = [len(title.split()) for title in titles]
max_len_chars = max(map(len, titles))
max_len_words = max(word_counts)
max_len_chars, max_len_words

(212, 30)

In [6]:
# Store the titles without their leading '-' / ' ' hierarchy markers in a
# new "cleaned_title" column (the original "Title" column is left as is).
stripped_titles = icd["Title"].str.lstrip('- ')
icd["cleaned_title"] = stripped_titles

# Load the vectorizer
Sentence embeddings are computed with the Hugging Face feature-extraction pipeline, following the approach discussed in https://stackoverflow.com/questions/64685243/getting-sentence-embedding-from-huggingface-feature-extraction-pipeline

In [7]:
# Feature-extraction pipeline around the ICD-10 code-prediction model; calling
# it on a text returns one feature vector per token.
# NOTE(review): presumably the same model that was used to build the index —
# confirm, otherwise query and index embeddings are not comparable.
extractor = pipeline(
    task="feature-extraction",
    model="AkshatSurolia/ICD-10-Code-Prediction",
)

In [8]:
# Run the extractor on one sample title and look at the tensor shape it returns.
sample_title = "Bacterial intestinal infections"
result = extractor(sample_title, return_tensors=True)
result.shape

torch.Size([1, 7, 768])

In [9]:
# Sentence embedding, option 1: average the per-token vectors of the single
# input (result[0]) along the token axis.
token_vectors = result[0]
sentence_embedding = token_vectors.mean(dim=0)
sentence_embedding.shape

torch.Size([768])

In [10]:
# Sentence embedding, option 2: take the vector of the first token only
# (presumably the [CLS] token — confirm against the tokenizer).
sentence_embedding2 = result[0, 0]
sentence_embedding2.shape

torch.Size([768])

# Load the index

In [11]:
# Fetch the existing "icd11_titles" collection from the persistent store.
# NOTE(review): the distance metric is whatever the collection was created
# with (an "hnsw:space": "cosine" hint used to be commented out here) —
# confirm the metric before interpreting query distances.
collection = client.get_collection(name="icd11_titles")

# Process the test document

In [12]:
# Read the example policy document and split it into individual lines.
policy_path = Path("data/policy-example.txt")
test_doc = policy_path.read_text(encoding="utf-8")
lines = test_doc.splitlines()
print(len(lines))

206


In [13]:
N_RESULTS = 5        # number of nearest index entries requested per line
MAX_DISTANCE = 1000  # a hit is reported only when its distance is below this


def embed_line(text):
    """Return the mean-pooled token embedding of ``text`` as a list of floats.

    The extractor output for the single input is averaged over its token
    axis, the same pooling used for ``sentence_embedding`` earlier on.
    """
    # truncation=True asks the tokenizer to cut overly long lines down to its
    # maximum length instead of passing them to the model unshortened.
    # NOTE(review): only effective if the tokenizer defines a maximum length — confirm.
    token_vectors = extractor(text, return_tensors=True, truncation=True)[0]
    return token_vectors.mean(dim=0).numpy().tolist()


def select_codes(ids, distances, max_distance=MAX_DISTANCE):
    """Return the ids whose paired distance is strictly below ``max_distance``."""
    return [code for code, dist in zip(ids, distances) if dist < max_distance]


# For every line of the document: embed it, query the index for the nearest
# entries, then print the line followed by the ids that pass the distance cut.
for n, line in enumerate(lines, 1):
    embedding = embed_line(line)
    results = collection.query(query_embeddings=embedding, n_results=N_RESULTS)
    print(f"{n:3d} {line}")
    # A single query was sent, so the hits are in the first (only) result row.
    codes = select_codes(results["ids"][0], results["distances"][0])
    print(f"   >{codes}")

  1 Avsola® (infliximab‐axxq)
   >['XM3MX3', 'XM1CT9', 'XM78P5']
  2 Pharmacy Coverage Policy
   >[]
  3 Effective Date:  June 17, 2020
   >[]
  4 Revision Date:  January 25, 2023
   >[]
  5 Review Date:  January 18, 2023
   >[]
  6 Line of Business:  Medicaid ‐ South Carolina, Medicaid ‐ Ohio
   >[]
  7 Policy Type:  Prior AuthorizationPage: 1 of 8
   >[]
  8 Humana's documents are updated regularly online. When printed, the version of this document becomes uncontrolled. Do not rely on printed copies for the most up‐to date version.
   >[]
  9 Refer to http://apps.humana.com/tad/tad_new/home.aspx  to verify that this is the current version before utilizing.
   >[]
 10 Disclaimer Background
   >[]
 11 Description Medical Terms
   >[]
 12 Coverage Determination References
   >[]
 13 Disclaimer State and federal law, as well as contract language, including definitions and specific inclusions/exclusions, take precedence over clinical policy and must be considered first in determining elig