In [1]:
!pip install transformers torch accelerate bitsandbytes



In [2]:
import pandas as pd
import numpy as np
from queue import Queue
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# If you are running via google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# For moving the dataset into local space - via google drive

import zipfile

zip_ref = zipfile.ZipFile("/content/drive/MyDrive/colab.zip", 'r')
zip_ref.extractall("/content/dataset")
zip_ref.close()

In [4]:
dataFilename = './dataset/data.csv'
top50Filename = './dataset/top50.csv'
output_file = "./drive/MyDrive/colab/output_log.txt"

maxPromptAttempts = 3
defaultPromptResponse = "yes" # Used when LLM prompted maximum number of times and only returns inconclusive answers

#model_id = "meta-llama/Llama-3.1-8B-Instruct" # test 1
model_id = "meta-llama/Llama-3.2-3B-Instruct" # test 2

# Quantization configuration - reduce memory usage & speed up model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # 4bit precision
    bnb_4bit_use_double_quant=True, #n nested quantization
    bnb_4bit_quant_type="nf4", # normalfloat4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16 # compute in bfloat16 for performance benefits
)

# Padding left side using tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# Load model and automatically allocate system resources
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

pipe = pipeline(
    "text-generation",
    model=model,
    max_new_tokens=2,
    tokenizer=tokenizer,
    temperature=0.05 # 0.1 for test1, 0.05 for test2
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [5]:
systemPrompt = """You are a clinical coder, read the brief hospital course (BHC) summary below.\n"""

In [6]:
questions = {
    # Splitting into different chapters
    "question1": "Is the primary reason for hospitalization primarly due to any of: a mental health condition or a neurological condition?",
    "question5": "Is the primary reason for hospitalization primarly due to any of: alcohol abuse, alcohol dependence, alcohol withdrawal or alcoholic cirrhosis?",
    "question3": "Is the primary reason for hospitalization primarly due to any of: sepsis, a viral intestinal infection? Or specifically an infection following a recent procedure? If it is explicitly a bacterial skin infection (cellulitis) respond no.",
    "question4": "Is the primary reason for hospitalization primarly due to any of: pneumonia, COPD exacerbation, asthma, or acute respiratory failure with hypoxia?",
    "question2": "Is the any mention of the following conditions: a cardiovascular issue, myocardial infarction (heart attack), hypertensive heart, heart failure, atherosclerotic heart disease, Nonrheumatic aortic stenosis, or orthostatic hypotension (causing dizziness when standing up)?",

    # Mental & Neurological
    "question6": "Does the patient primarily have psychiatric symptoms (mood changes, depression, hallucinations, suicidal thoughts)?",
    "question7": "Has the patient experienced hallucinations, delusions, or disorganized thinking or speech?",
    "question8": "Has the patient experienced suicidal ideation without any depressive episodes mentioned?",
    "question9": "Has the patient had previous depressive episodes?",
    "question10": "Has brain imaging revealed a nonruptured cerebral aneurysm?",
    "question11": "Does the patient exhibit confusion, memory loss, altered mental status, or neurological impairment, due to toxins, rather than a brief loss of consciousness with quick recovery?",

    # Alcohol
    "question12": "Has the patient been diagnosed with alcoholic liver cirrhosis or hepatitis, liver disease, alcoholic pancreatitis directly due to current misuse, or dependence on alcohol?",
    "question13": "Is the patient described as being currently intoxicated (showing signs like slurred speech, impaired coordination, or altered judgment) and therefore, not showing withdrawal symptoms?",
    "question14": "Does the patient have a history of compulsive alcohol use with tolerance or withdrawal symptoms when not drinking?",

    # Infectious
    "question15": "Is the primary/only reasoning for hospsitalization an infection directly due to a recent medical procedure?",
    "question16": "Does the patient have sepsis (fever, rapid heart rate, low blood pressure, altered mental status, organ dysfunction)?",
    "question17": "Has lab testing confirmed Escherichia coli (E. coli) as the cause of sepsis?",

    # Respiratory
    "question18": "Does the patient have acute respiratory failure with hypoxia (severe difficulty breathing, low oxygen levels requiring oxygen support or ventilation)?",
    "question19": "Has the patient been diagnosed with pneumonia (e.g., fever, cough with sputum, abnormal chest X-ray)?",
    "question20": "Does the patient have a history of chronic obstructive pulmonary disease (COPD) with an acute worsening (e.g., increased cough, sputum production, wheezing, shortness of breath) or an upper respiratory tract infection (URTI) with chronic obstructive pulmonary disease (COPD)?",
    "question21": "Did the patient recently inhale food, vomit, or another foreign substance before experiencing breathing difficulties, cough, or lung inflammation? Respond no if the patient has unspecified asthma with acute exacerbation and no chronic obstructive pulmonary disease.",

    # Cardiovascular
    "question22": "Does the patient have heart failure symptoms (e.g., shortness of breath, fluid retention, fatigue)?",
    "question23": "Is there acute heart failure (rapid onset of symptoms such as worsening shortness of breath, chest discomfort), no indications of renal disease and no clear link stated between hypertension and heart disease?",
    "question24": "Is the patient experiencing symptoms primarily due to an issue with heart relaxation (e.g., stiff heart muscle, difficulty filling during diastole)?",
    "question25": "Does the patient have a history of hypertension contributing to heart and kidney dysfunction (e.g., fluid retention, elevated blood pressure, kidney problems)?",
    "question26": "Has the patient experienced an acute myocardial infarction recently (e.g., chest pain, elevated cardiac biomarkers), AND not have a subsequent one within 4 weeks? Alternatively, was the patient admitted and treated for coronary artery disease during the same provider spell?",
    "question27": "Does the patient have atherosclerosis but without significant narrowing in the coronary arteries (no major blockage or symptoms of unstable angina)? Alternatively, has the patient had a myocardial infarction with other forms of ischemic heart disease, and been admitted for an intervention to treat coronary artery disease?",
    "question28": "Is the patient experiencing unstable angina (severe chest pain at rest or with minimal exertion, risk of heart attack)?",
    "question29": "Does the patient have a documented history of dizziness or fainting upon standing, without associated chest pain or shortness of breath during exertion?",

    # Other Conditions
    "question30": "Does the patient have episodes of irregular, rapid heartbeats (palpitations) with no clear structural heart abnormalities or fluid overload present at the time of diagnosis?",
    "question31": "Does the patient experience episodes of atrial fibrillation that start and stop on their own, with a return to normal rhythm within 7 days?",
    "question32": "Is the primary cause for hospitalization kidney (renal) disease? Symptoms could be patient has a sudden decrease in urine output accompanied by confusison or fatigue?",
    "question33": "Does the patient have a history of recent severe dehydration, a drop in blood pressure, or signs of shock (e.g., rapid heartbeat, low urine output) in addition to urinary symptoms?",
    "question34": "Does the patient have symptoms of acute abdominal pain (especially in the lower right abdomen, upper right abdomen, or presence of black, tarry stools) with no clear history of trauma, chest pain, or significant vascular issues (e.g., pulmonary embolism, thrombosis) in the clinical notes?",
    "question35": "Does the patient have localized abdominal pain in the lower right abdomen with fever, nausea, or vomiting, with no history of jaundice or digestive issues?",
    "question36": "Has the patient experienced black, tarry stools (melena) without being a symptom of another diagnosed disease or without any associated abdominal pain or gastrointestinal trauma?",
    "question37": "Is the primary reason for hospitalization a pregnancy lasting between 40-42 weeks, or referred to as post-term or post-date?",
    "question38": "Is the primary reason for hospitalization chest pain where the underlying cause is unknown and there is no other relevant diagnosis mentioned?",
    "question39": "Is the cause for the chest pain known?",
    "question40": "Is the primary reason for hospitalization an encounter for antineoplastic chemotherapy?",
    "question41": "Is the primary reason for hospitalization a secondary neoplasm of (explicitly) the brain (cancer spread from another part of the body to the brain)?",
    "question42": "Is the primary reason for hospitalization cellulitis of a lower limb (right or left)?",
    "question43": "Is the cellulitis present in the right lower limb and not the left lower limb? If it is present in both, choose the one it originated in (or otherwise is more prominent in).",
    "question44": "Did the patient experience a traumatic subdural hemorrhage (head injury or trauma with symptoms like headache, dizziness, or confusion) but without loss of consciousness?",
    "question45": "Is the primary reason for hospitalization pulmonary embolism without acute cor pulmonale or thrombosis due to vascular prosthetic devices, implants and grafts?",
    "question46": "Does the patient have a pulmonary embolism without acute cor pulmonale?",
    "question47": "Is the primary reason for hospitalization unspecified abdominal pain with no underlying cause mentioned?",
    "question48": "Is the primary reason for hospitalization one of: type 2 diabetes mellitus (with hypoglycemia), or morbid obesity due to excess calories?",
    "question49": "Does the patient have type 2 diabetes mellitus with hypoglycemia?"
}


In [7]:
# Used for the BDT

class Node:
    def __init__(self, question, yes_branch=None, no_branch=None, outcome=None):
        self.question = question
        self.yes_branch = yes_branch
        self.no_branch = no_branch
        self.outcome = outcome

    def is_leaf(self):
        return self.outcome is not None

In [8]:
top50_df = pd.read_csv('./dataset/top50.csv', header=None)

leaf_nodes = {f"leaf{i}": Node(None, outcome=code) for i, code in enumerate(top50_df[0], start=1)}

In [9]:
# Generates nodes for each question and acts as mapping for the BDT

question49 = Node(questions['question49'], leaf_nodes['leaf40'], leaf_nodes['leaf50'])
question48 = Node(questions['question48'], question49, leaf_nodes['leaf44'])
question47 = Node(questions['question47'], leaf_nodes['leaf38'], question48)
question46 = Node(questions['question46'], leaf_nodes['leaf19'], leaf_nodes['leaf41'])
question45 = Node(questions['question45'], question46, question47)
question44 = Node(questions['question44'], leaf_nodes['leaf37'], question45)
question43 = Node(questions['question43'], leaf_nodes['leaf39'], leaf_nodes['leaf33'])
question42 = Node(questions['question42'], question43, question44)
question41 = Node(questions['question41'], leaf_nodes['leaf47'], question42)
question40 = Node(questions['question40'], leaf_nodes['leaf4'], question41)
question39 = Node(questions['question39'], leaf_nodes['leaf11'], leaf_nodes['leaf6'])
question38 = Node(questions['question38'], question39, question40)
question37 = Node(questions['question37'], leaf_nodes['leaf31'], question38)
question36 = Node(questions['question36'], leaf_nodes['leaf32'], leaf_nodes['leaf48'])
question35 = Node(questions['question35'], leaf_nodes['leaf30'], question36)
question34 = Node(questions['question34'], question35, question37)
question33 = Node(questions['question33'], leaf_nodes['leaf8'], leaf_nodes['leaf10'])
question32 = Node(questions['question32'], question33, question34)
question31 = Node(questions['question31'], leaf_nodes['leaf21'], leaf_nodes['leaf28'])
question25 = Node(questions['question25'], leaf_nodes['leaf7'], leaf_nodes['leaf12'])
question24 = Node(questions['question24'], leaf_nodes['leaf24'], leaf_nodes['leaf26'])
question23 = Node(questions['question23'], question24, question25)
question30 = Node(questions['question30'], question31, question23)
question29 = Node(questions['question29'], leaf_nodes['leaf49'], leaf_nodes['leaf16'])
question28 = Node(questions['question28'], leaf_nodes['leaf13'], question29)
question27 = Node(questions['question27'], leaf_nodes['leaf15'], question28)
question26 = Node(questions['question26'], leaf_nodes['leaf5'], question27)
question22 = Node(questions['question22'], question30, question26)
question21 = Node(questions['question21'], leaf_nodes['leaf36'], leaf_nodes['leaf45'])
question20 = Node(questions['question20'], leaf_nodes['leaf18'], question21)
question19 = Node(questions['question19'], leaf_nodes['leaf9'], question20)
question18 = Node(questions['question18'], leaf_nodes['leaf43'], question19)
question17 = Node(questions['question17'], leaf_nodes['leaf23'], leaf_nodes['leaf1'])
question16 = Node(questions['question16'], question17, leaf_nodes['leaf46'])
question15 = Node(questions['question15'], leaf_nodes['leaf17'], question16)
question14 = Node(questions['question14'], leaf_nodes['leaf29'], leaf_nodes['leaf3'])
question13 = Node(questions['question13'], question14, leaf_nodes['leaf25'])
question12 = Node(questions['question12'], leaf_nodes['leaf35'], question13)
question11 = Node(questions['question11'], leaf_nodes['leaf42'], leaf_nodes['leaf14'])
question10 = Node(questions['question10'], leaf_nodes['leaf20'], question11)
question9 = Node(questions['question9'], leaf_nodes['leaf34'], leaf_nodes['leaf2'])
question8 = Node(questions['question8'], leaf_nodes['leaf26'], question9)
question7 = Node(questions['question7'], leaf_nodes['leaf22'], question8)
question6 = Node(questions['question6'], question7, question10)
question5 = Node(questions['question5'], question12, question32)
question4 = Node(questions['question4'], question18, question5)
question3 = Node(questions['question3'], question15, question4)
question2 = Node(questions['question2'], question22, question3)
question1 = Node(questions['question1'], question6, question2)

In [10]:
# Used to get maximum number of prompts needed to reach a leaf node
def get_tree_height(node):
    if node is None:
        return 0
    left_height = get_tree_height(node.yes_branch)
    right_height = get_tree_height(node.no_branch)
    return 1 + max(left_height, right_height)

# Identifies if LLM response indicates affirmative or negative
def isPositiveResponse(response):
    if response.lower().find('no') != -1:
        return "no"
    if response.lower().find('yes') != -1:
        return "yes"

    print("Error, neither yes nor no was found inside response from model: " + response)
    return "inconclusive"

# Used for prompting model and only returning response without additional data structures wrapping it
def promptModel(prompt):
    rawResponse = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)[0]['generated_text']
    trimmedResponse = rawResponse[len(prompt):]
    return trimmedResponse

def isCodeTop50(code):
    return code in top50_df.values

In [11]:
df = pd.read_csv(dataFilename)
top50_df = pd.read_csv(top50Filename, header=None)

bdt_height = get_tree_height(question1) # Maximum number of prompts needed to reach a leaf node
num_entries = df.shape[0]
assignedCodes = np.full(num_entries, '', dtype=object)
correctCodes = np.full(num_entries, '', dtype=object)
writingBuffer = []

with open(output_file, "w") as f:

  # Iterate for each BHC summary
  for index, row in enumerate(df.itertuples(index=False)):

      bhc_summary = row[2]
      correctCodes[index] = row[7]
      node = question1 # root node
      
      # Creating prompt outside loop to improve efficiency
      promptStart = f"{systemPrompt}\nBHC summary: {bhc_summary} \n\nRules: Answer the following question with a 'yes' or 'no' only. Do not include an explanation.\nQuestion: "

      # iterate for up to a maximum of BDT height
      for _ in range(bdt_height):
          
          # cancel iteration early if leaf reached
          if node.is_leaf():
              assignedCodes[index] = node.outcome
              writingBuffer += [node.outcome]

              # Writing to output file every 500 entries in case execution stops
              if ((index % 500 == 0) or (index+1==num_entries)):
                print(f"Analysing entry {index}/{num_entries}")
                f.write("\n".join(writingBuffer) + "\n")
                f.flush()
                writingBuffer = []

              break

          prompt = promptStart + f"{node.question}\nAnswer:"
          promptAttempts = 0
          definitiveAnswer = False

          while not definitiveAnswer:
            response = promptModel(prompt)

            # If model returns an inconclusive response, repeat the prompt up to a defined maximum times
            promptAnswer = isPositiveResponse(response)
            if promptAnswer == "inconclusive":
              promptAttempts += 1

              # After too many unsuccessful prompts
              if promptAttempts == maxPromptAttempts:
                  
                promptAnswer = defaultPromptResponse
                print(f"Error: attempted {promptAttempts} prompts and all inconclusive")
                definitiveAnswer = True

            else:
              definitiveAnswer = True

          # set current node based on response
          node = node.yes_branch if isPositiveResponse(response)=="yes" else node.no_branch


Analysing entry 0/500


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found inside response from model:  
1
Error: attempted 3 prompts and all inconclusive
Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found inside response from model:  
(
Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found inside response from model:  
(
Error: attempted 3 prompts and all inconclusive
Error, neither yes nor no was found inside response from model:  
(
Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found inside response from model:  
1
Error: attempted 3 prompts and all inconclusive
Error, neither yes nor no was found inside response from model:  
1
Error, neither yes nor no was found insi

KeyboardInterrupt: 