# Preprocessing ICD-10 Codes before using in LLM Report Generator

In [1]:
import simple_icd_10_cm as cm
from icd10cm_utils import find_main_ancestor

In [3]:
# Retrieve all ICD-10 codes
all_codes = cm.get_all_codes()

In [4]:
# In all_codes, find all codes that contain a specific word and the code has no children
def find_specific_codes(word):
    return [code for code in all_codes if word.lower() in cm.get_description(code).lower() and not cm.get_children(code)]

In [5]:
# Conditions assumed to be related to bone marrow pathology (leukemia, lymphoma, myeloma, etc.)
diseases = ["Acute Myeloid Leukemia",
            "Acute Lymphoblastic Leukemia",
            "Chronic Myeloid Leukemia",
            "Chronic Lymphocytic Leukemia",
            "Diffuse Large B-Cell Lymphoma",
            "Follicular Lymphoma",
            "Mantle Cell Lymphoma",
            "Burkitt Lymphoma",
            "Hodgkin Lymphoma",
            "Multiple Myeloma",
            "Plasmacytoma",
            "Monoclonal Gammopathy",
            "Chronic Myeloid Leukemia",
            "Polycythemia Vera",
            "Thrombocythemia",
            "Myelofibrosis",
            "Histiocytic Sarcoma",
            "Langerhans Cell Histiocytosis",
            "Systemic Mastocytosis"]
 
relevant_codes = []
for disease in diseases:
    relevant_codes += find_specific_codes(disease)

# Total number of codes containing the diseases
print(len(relevant_codes))

247


In [6]:
# Find the unique ancestors for the relevant codes
unique_ancestors = []
for code in relevant_codes:
    if find_main_ancestor(code) not in unique_ancestors:
        unique_ancestors.append(find_main_ancestor(code))
        print(f'Code: {find_main_ancestor(code)} - {cm.get_description(find_main_ancestor(code))}')


Code: 2 - Neoplasms (C00-D49)
Code: 21 - Factors influencing health status and contact with health services (Z00-Z99)
Code: 3 - Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism (D50-D89)
Code: 10 - Diseases of the respiratory system (J00-J99)


In [7]:
# Count the number of codes per each main ancestor
code_2_count = 0
code_3_count = 0
code_10_count = 0
code_21_count = 0
for code in relevant_codes:
    if find_main_ancestor(code) == '2':
        code_2_count += 1
    if find_main_ancestor(code) == '3': 
        code_3_count += 1
    if find_main_ancestor(code) == '10':
        code_10_count += 1
    if find_main_ancestor(code) == '21':
        code_21_count += 1
print(f'Code 2: {code_2_count}')
print(f'Code 3: {code_3_count}')
print(f'Code 10: {code_10_count}')
print(f'Code 21: {code_21_count}')

relevant_codes_trimmed = []
for code in relevant_codes:
    if find_main_ancestor(code) == '2':
        relevant_codes_trimmed.append(code)

Code 2: 243
Code 3: 1
Code 10: 1
Code 21: 2


In [8]:
# save relevant_codes one line at a time in a text file
with open('relevant_codes.txt', 'w') as f:
    for code in relevant_codes_trimmed:
        f.write(f'{code}\n')