In [1]:
# Import packages
import json
from huggingface_hub import login
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers
import random
import torch
import time
import re
from tqdm import tqdm
import pandas as pd

# Set GPUs
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,4"

print("Available GPUs:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# Set seeds
random.seed(0)
torch.manual_seed(0)


  from .autonotebook import tqdm as notebook_tqdm


Available GPUs: 2
GPU 0: NVIDIA A100-SXM4-40GB
GPU 1: NVIDIA A100-SXM4-40GB


<torch._C.Generator at 0x7f475bf6e6f0>

In [2]:
# Log into Huggingface
with open("../../huggingface_token.txt", "r") as file:
    access_token = file.read().strip()
login(access_token)

# Load Huggingface Model
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token, low_cpu_mem_usage=True,
                    torch_dtype=torch.float16, device_map='auto')

Downloading shards: 100%|█████████████████████████████████████████████████████████████████████| 2/2 [01:10<00:00, 35.40s/it]
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.81s/it]


In [3]:
def get_message(note):
    system = 'You are a medical assistant. Your tasks are to generate a clinical note and create an Assessment and Plan section. Additionally, you will also determine the fate of the patient. The patient either survives for the next few years or succumbs to either sudden cardiac death or pump failure death. Only suggest death if there is strong evidence for it. Provide your confidence in survival, sudden cardiac death, and pump failure death such that the confidence percentages add up to 100 percent and format these results in a list. Please output a clinical note that has a section for demographics, medical history, lab results, LVEF, medication, and ECG impressions. In the end, put the Assessment and Plan section along with a prediction. Provide reasoning for the prediction.'

    prompt = f"Here is the patient data: \n{note}"

    messages = [
		{"role": "system", "content": system},
		{"role": "user", "content": prompt}
	]

    return messages

def extract_assistant_response(response):
    parts = response.split("assistant\n\n", 1)
    return parts[1].strip() if len(parts) > 1 else response

In [4]:
# Load in csv file with prompts
df = pd.read_csv("../Data/subject-info-cleaned-with-prompts.csv")
df['Prompts'][0]

'Generate a structured clinical note based on the following data:\n\nAge: 58\nGender: Male \nWeight: 83 kg\nHeight: 163 cm\nNYHA Class: III\nBlood Pressure: 110/75 mmHg\nPast Medical History: Idiopathic dilated cardiomyopathy\nAlbumin (g/L): 424.0\nALT or GPT (IU/L): 10\nAST or GOT (IU/L): 20\nTotal Cholesterol (mmol/L): 54\nCreatinine (mmol/L): 106\nGamma-glutamil transpeptidase (IU/L): 20.0\nGlucose (mmol/L): 57.0\nHemoglobin (g/L): 132.0\nHDL (mmol/L): 1,29\nPotassium (mEq/L): 46.0\nLDL (mmol/L): 3,36\nSodium (mEq/L): 141.0\nPro-BNP (ng/L): 1834.0\nProtein (g/L): 69.0\nT3 (pg/dL): 0,05\nT4 (ng/L): 15.0\nTroponin (ng/mL): 0,01\nTSH (mIU/L): 3,02\nUrea (mg/dL): 712\nLVEF (%): 35\nMedications: Beta Blockers, Digoxin, Loop Diuretics, ACE Inhibitor\nHolter rhythm: permanent atrial fibrillation \nECG Impression:\n        - Ventricular Extrasystole: Polymorphic\n        - Ventricular Tachycardia: Non-sustained VT\n        - Non-sustained ventricular tachycardia (CH>10): Yes\n        - Paro

In [5]:
message = get_message(df['Prompts'][0])
message

[{'role': 'system',
  'content': 'You are a medical assistant. Your tasks are to generate a clinical note and create an Assessment and Plan section. Additionally, you will also determine the fate of the patient. The patient either survives for the next few years or succumbs to either sudden cardiac death or pump failure death. Only suggest death if there is strong evidence for it. Provide your confidence in survival, sudden cardiac death, and pump failure death such that the confidence percentages add up to 100 percent and format these results in a list. Please output a clinical note that has a section for demographics, medical history, lab results, LVEF, medication, and ECG impressions. In the end, put the Assessment and Plan section along with a prediction. Provide reasoning for the prediction.'},
 {'role': 'user',
  'content': 'Here is the patient data: \nGenerate a structured clinical note based on the following data:\n\nAge: 58\nGender: Male \nWeight: 83 kg\nHeight: 163 cm\nNYHA C

In [7]:
# Put message into LLM
input_text = tokenizer.apply_chat_template(message, tokenize = False, add_generation_prompt = True)
inputs = tokenizer(input_text, return_tensors = "pt").to(model.device)
output = model.generate(**inputs, max_new_tokens = 1000)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [8]:
# Get result
result = tokenizer.decode(output[0], skip_special_tokens = True)
result = result.replace("**", "")
result = extract_assistant_response(result)
result

"Clinical Note\n\nPatient Demographics\n\n* Age: 58 years\n* Gender: Male\n* Weight: 83 kg\n* Height: 163 cm\n\nMedical History\n\n* Past Medical History: Idiopathic dilated cardiomyopathy\n\nVital Signs\n\n* Blood Pressure: 110/75 mmHg\n* Heart Rate: Not available\n\nLab Results\n\n* Albumin (g/L): 424.0\n* ALT or GPT (IU/L): 10\n* AST or GOT (IU/L): 20\n* Total Cholesterol (mmol/L): 54\n* Creatinine (mmol/L): 106\n* Gamma-glutamil transpeptidase (IU/L): 20.0\n* Glucose (mmol/L): 57.0\n* Hemoglobin (g/L): 132.0\n* HDL (mmol/L): 1,29\n* Potassium (mEq/L): 46.0\n* LDL (mmol/L): 3,36\n* Sodium (mEq/L): 141.0\n* Pro-BNP (ng/L): 1834.0\n* Protein (g/L): 69.0\n* T3 (pg/dL): 0.05\n* T4 (ng/L): 15.0\n* Troponin (ng/mL): 0.01\n* TSH (mIU/L): 3.02\n* Urea (mg/dL): 712\n\nEchocardiogram Results\n\n* LVEF (%): 35%\n\nMedications\n\n* Beta Blockers\n* Digoxin\n* Loop Diuretics\n* ACE Inhibitor\n\nECG Impression\n\n* Ventricular Extrasystole: Polymorphic\n* Ventricular Tachycardia: Non-sustained VT

In [9]:
test_result = result.replace("**", "")
print(extract_assistant_response(test_result))

Clinical Note

Patient Demographics

* Age: 58 years
* Gender: Male
* Weight: 83 kg
* Height: 163 cm

Medical History

* Past Medical History: Idiopathic dilated cardiomyopathy

Vital Signs

* Blood Pressure: 110/75 mmHg
* Heart Rate: Not available

Lab Results

* Albumin (g/L): 424.0
* ALT or GPT (IU/L): 10
* AST or GOT (IU/L): 20
* Total Cholesterol (mmol/L): 54
* Creatinine (mmol/L): 106
* Gamma-glutamil transpeptidase (IU/L): 20.0
* Glucose (mmol/L): 57.0
* Hemoglobin (g/L): 132.0
* HDL (mmol/L): 1,29
* Potassium (mEq/L): 46.0
* LDL (mmol/L): 3,36
* Sodium (mEq/L): 141.0
* Pro-BNP (ng/L): 1834.0
* Protein (g/L): 69.0
* T3 (pg/dL): 0.05
* T4 (ng/L): 15.0
* Troponin (ng/mL): 0.01
* TSH (mIU/L): 3.02
* Urea (mg/dL): 712

Echocardiogram Results

* LVEF (%): 35%

Medications

* Beta Blockers
* Digoxin
* Loop Diuretics
* ACE Inhibitor

ECG Impression

* Ventricular Extrasystole: Polymorphic
* Ventricular Tachycardia: Non-sustained VT
* Non-sustained ventricular tachycardia (CH>10): Yes
*

In [10]:
# Test
df_test = df[[df.columns[1]]]
df_test['Reports'] = None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['Reports'] = None


In [11]:
for i in tqdm(range(5), desc = "Test"):
    # Get message
    message = get_message(df['Prompts'][i])

    # Put message into LLM
    input_text = tokenizer.apply_chat_template(message, tokenize = False, add_generation_prompt = True)
    inputs = tokenizer(input_text, return_tensors = "pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens = 1000)

    # Get result
    result = tokenizer.decode(output[0], skip_special_tokens = True)
    result = result.replace("**", "")
    result = extract_assistant_response(result)

    # Store result
    df_test.loc[i, 'Reports'] = result
    

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                              | 0/5 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                      | 1/5 [00:22<01:31, 22.89s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                      | 2/5 [00:54<01:23, 27.87s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                      | 3/5 [01:18<00:52, 26.05s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.█████▍                | 4/5 [01:47<00:27, 27.39s/it]
Test: 100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [02:14<00:00, 26.94s/it]


In [18]:
print(df_test['Reports'][4])

Clinical Note

Patient Information:

* Name: [Not provided]
* Age: 70
* Gender: Male
* Weight: 97 kg
* Height: 183 cm
* NYHA Class: II

Demographics:

* Date of Birth: [Not provided]
* Social History: [Not provided]

Medical History:

* Ischemic dilated cardiomyopathy
* Diabetes
* Dyslipemia
* Hypertension
* Myocardial Infarction

Vital Signs:

* Blood Pressure: 125/85 mmHg
* Pulse: 100 beats per minute
* Respiratory Rate: 20 breaths per minute
* Oxygen Saturation: 98% on room air

Medications:

* Diabetes Medication
* Beta Blockers
* Loop Diuretics
* Statins
* ACE Inhibitor
* Nitrovasodilator

Lab Results:

* Albumin: 453.0 g/L
* ALT or GPT: 24 IU/L
* AST or GOT: 23 IU/L
* Total Cholesterol: 572 mmol/L
* Creatinine: 88 mmol/L
* Gamma-glutamil transpeptidase: 26.0 IU/L
* Glucose: 78.0 mmol/L
* Hemoglobin: 159.0 g/L
* HDL: 1.06 mmol/L
* Potassium: 49.0 mEq/L
* LDL: 3.28 mmol/L
* Sodium: 140.0 mEq/L
* Pro-BNP: 456.0 ng/L
* Protein: 82.0 g/L
* T3: 0.05 pg/dL
* T4: 16.0 ng/L
* Troponin: 0.