In [1]:
!pip install transformers torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.m

In [2]:
import pandas as pd
import numpy as np
from queue import Queue
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# Google Colab only: mount Google Drive at /content/drive so that files kept
# on Drive can be read from (and written to) the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Move the dataset into local space by unpacking the archive stored on Google Drive

import zipfile

# The context manager closes the archive even if extraction fails part-way through,
# so the file handle is never leaked.
with zipfile.ZipFile("/content/drive/MyDrive/colab.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/dataset")

In [4]:
# Input files (relative paths).
# NOTE(review): the archive is extracted to /content/dataset, so these presumably
# rely on the working directory being /content - confirm.
dataFilename = './dataset/data.csv'
top50Filename = './dataset/top50.csv'
top50_desc_filename = './dataset/top50_d.csv'
# Log file receiving the predicted codes
output_file = "./drive/MyDrive/colab/baseline_log.txt"

# Maximum number of times the model is prompted for a single entry
maxPromptAttempts = 3
# NOTE(review): defaultPromptResponse is not referenced in the visible cells - confirm whether it is still needed.
defaultPromptResponse = "yes" # Used when LLM prompted maximum number of times and only returns inconclusive answers

model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Quantization configuration - 4-bit weights to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # 4bit precision
    bnb_4bit_use_double_quant=True, # nested quantization
    bnb_4bit_quant_type="nf4", # normalfloat4 quantization type
    bnb_4bit_compute_dtype=torch.bfloat16 # compute in bfloat16
)

# Tokenizer pads on the left; the EOS token is reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# Load the model with the quantization config; device_map="auto" leaves device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Text-generation pipeline shared by all prompts. Generation is capped at 5 new
# tokens - presumably because only a short ICD code is expected back (confirm).
pipe = pipeline(
    "text-generation",
    model=model,
    max_new_tokens=5,
    tokenizer=tokenizer,
    temperature=0.1
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
# Load the table of ICD codes and their descriptions that the model chooses from
top50_desc_df = pd.read_csv(top50_desc_filename)

# Formatting ICD codes and descriptions: one "(code) description" line per row
formatted_codes = top50_desc_df.apply(lambda row: f"({row['icd_code']}) {row['long_title']}", axis=1)
codes = "\n".join(formatted_codes)

# Instruction text placed at the start of every prompt, followed by the candidate codes
systemPrompt = f"You are a clinical coder, here is a brief hospital course (BHC) summary. What code best describes the PRIMARY reason for hospitalization? Select one from these codes. \nCodes:\n{codes}\n"
# Example of the expected answer format. The original text read "outputz"; the typo
# is corrected because this string is sent to the model verbatim in every prompt.
examplePrompt = "Here is an example of an output\nCode: I110"

In [6]:
# Sends a prompt to the model and hands back only the newly generated text
def promptModel(prompt):
    """Run ``prompt`` through the text-generation pipeline.

    The first result's 'generated_text' is taken and its first ``len(prompt)``
    characters are dropped, so only the text that follows the prompt is returned.
    """
    outputs = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
    generatedText = outputs[0]['generated_text']
    promptLength = len(prompt)
    return generatedText[promptLength:]

# Extracts the ICD code from the model response, only searches for the top50
def getICDCode(text, icd_codes=None):
    """Return the first known ICD code contained in ``text``, or "none".

    Full stops are ignored on both sides of the comparison, so a response such
    as "428.22" matches the candidate code "42822".

    Args:
        text: Raw model response to search.
        icd_codes: Optional iterable of candidate codes. When omitted, the
            first column of the module-level ``top50_df`` is used.

    Returns:
        The matching candidate (as a string, exactly as listed), or the string
        "none" when no candidate occurs in the response.
    """
    if icd_codes is None:
        icd_codes = top50_df.iloc[:, 0].astype(str).tolist()

    # str.replace returns a new string; the original code discarded the result,
    # so dotted answers were never matched.
    normalizedText = text.replace('.', '')

    # Candidates are tried in list order; the first one contained in the text wins
    for code in icd_codes:
        code = str(code)
        bareCode = code.replace('.', '')
        # An empty candidate would be "contained" in every string, so skip it
        if bareCode and bareCode in normalizedText:
            return code

    return "none"


In [7]:
# Load the BHC summaries and the list of candidate (top 50) ICD codes
df = pd.read_csv(dataFilename)
top50_df = pd.read_csv(top50Filename, header=None)

# One slot per entry for the predicted and the reference code
num_entries = df.shape[0]
assignedCodes = np.full(num_entries, '', dtype=object)
correctCodes = np.full(num_entries, '', dtype=object)

# Number of entries between writes to the output file
flushInterval = 500

writingBuffer = []
with open(output_file, "w") as f:

    # Iterate for each BHC summary
    for index, row in enumerate(df.itertuples(index=False)):

        # Progress report every `flushInterval` entries and on the final entry
        if (index % flushInterval == 0) or (index + 1 == num_entries):
            print(f"Analysing entry {index}/{num_entries}")

        # NOTE(review): columns are addressed by position - presumably column 2 holds
        # the BHC summary and column 7 the reference code; confirm against data.csv.
        bhc_summary = row[2]
        correctCodes[index] = row[7]

        # The prompt is built once per entry and reused for every retry.
        # NOTE(review): the literal " + " inside the prompt looks like a leftover from
        # string concatenation; it is kept unchanged so the model input stays the same.
        prompt = f"{systemPrompt} BHC Summary: {bhc_summary} + {examplePrompt} \nPredicted code: "

        # Prompt up to maxPromptAttempts times; "N/A" is recorded when no attempt
        # yields one of the candidate codes.
        responseCode = "N/A"
        for _ in range(maxPromptAttempts):
            response = promptModel(prompt)
            candidateCode = getICDCode(response)
            if candidateCode != "none":
                responseCode = candidateCode
                break
            # Occurs if an inconclusive answer is given from LLM
            print("Error, no top-50 ICD code was found inside response from model: " + response)

        assignedCodes[index] = responseCode
        writingBuffer.append(responseCode)

        # Write to the output file every `flushInterval` entries in case execution stops.
        # Flushing AFTER the entry has been processed ensures the final entry is written
        # too and that no empty line is emitted before the first result.
        if ((index + 1) % flushInterval == 0) or (index + 1 == num_entries):
            f.write("\n".join(writingBuffer) + "\n")
            f.flush()
            writingBuffer = []


Analysing entry 0/25142


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Analysing entry 500/25142
Analysing entry 1000/25142
Analysing entry 1500/25142
Analysing entry 2000/25142
Analysing entry 2500/25142
Error, neither yes nor no was found inside response from model: 428.22
Explanation
Error, neither yes nor no was found inside response from model: 428.22
Explanation
Error, neither yes nor no was found inside response from model: 428.22
Explanation
Analysing entry 3000/25142
Error, neither yes nor no was found inside response from model:  S55
Explanation:
Error, neither yes nor no was found inside response from model:  S55
The best
Error, neither yes nor no was found inside response from model:  S55
The best
Analysing entry 3500/25142
Analysing entry 4000/25142
Analysing entry 4500/25142
Analysing entry 5000/25142
Analysing entry 5500/25142
Analysing entry 6000/25142
Analysing entry 6500/25142
Analysing entry 7000/25142
Analysing entry 7500/25142
Analysing entry 8000/25142
Analysing entry 8500/25142
Analysing entry 9000/25142
Analysing entry 9500/25142
A