In [1]:
# Import packages
import json
from huggingface_hub import login
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers
import random
import torch
import time
import re
from tqdm import tqdm
import pandas as pd

# Set GPUs
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

print("Available GPUs:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# Set seeds
random.seed(0)
torch.manual_seed(0)


  from .autonotebook import tqdm as notebook_tqdm


Available GPUs: 2
GPU 0: NVIDIA A100-SXM4-40GB
GPU 1: NVIDIA A100-SXM4-40GB


<torch._C.Generator at 0x7f9f1bf5a650>

In [4]:
# Log into Huggingface
with open("../../huggingface_token.txt", "r") as file:
    access_token = file.read().strip()
login(access_token)

# # Load Huggingface Model
# model_name = "meta-llama/Llama-3.1-8B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token, low_cpu_mem_usage=True,
#                     torch_dtype=torch.float16, device_map='auto')

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("medalpaca/medalpaca-7b")
model = AutoModelForCausalLM.from_pretrained("medalpaca/medalpaca-7b")

ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [29]:
def get_message(note):
    system = 'You are a medical assistant. Your tasks are to generate a clinical note and create an Assessment and Plan section. Additionally, you will also determine the fate of the patient. The patient either survives for the next few years or succumbs to either sudden cardiac death or pump failure death. Only suggest death if there is strong evidence for it. Provide your confidence in survival, sudden cardiac death, and pump failure death such that the confidence percentages add up to 100 percent and format these results in a list. Please output a clinical note that has a section for demographics, medical history, lab results, LVEF, medication, and ECG impressions. In the end, put the Assessment and Plan section along with a prediction. Provide reasoning for the prediction.'

    prompt = f"Here is the patient data: \n{note}"

    messages = [
		{"role": "system", "content": system},
		{"role": "user", "content": prompt}
	]

    return messages

def extract_assistant_response(response):
    parts = response.split("assistant\n\n", 1)
    return parts[1].strip() if len(parts) > 1 else response

In [30]:
# Load in csv file with prompts
df = pd.read_csv("../Data/subject-info-cleaned-with-prompts.csv")
df['Prompts'][0]

'Generate a structured clinical note based on the following data:\n\nAge: 58\nGender: Male \nWeight: 83 kg\nHeight: 163 cm\nNYHA Class: III\nBlood Pressure: 110/75 mmHg\nPast Medical History: Idiopathic dilated cardiomyopathy\nAlbumin (g/L): 424.0\nALT or GPT (IU/L): 10\nAST or GOT (IU/L): 20\nTotal Cholesterol (mmol/L): 54\nCreatinine (mmol/L): 106\nGamma-glutamil transpeptidase (IU/L): 20.0\nGlucose (mmol/L): 57.0\nHemoglobin (g/L): 132.0\nHDL (mmol/L): 1,29\nPotassium (mEq/L): 46.0\nLDL (mmol/L): 3,36\nSodium (mEq/L): 141.0\nPro-BNP (ng/L): 1834.0\nProtein (g/L): 69.0\nT3 (pg/dL): 0,05\nT4 (ng/L): 15.0\nTroponin (ng/mL): 0,01\nTSH (mIU/L): 3,02\nUrea (mg/dL): 712\nLVEF (%): 35\nMedications: Beta Blockers, Digoxin, Loop Diuretics, ACE Inhibitor\nHolter rhythm: permanent atrial fibrillation \nECG Impression:\n        - Ventricular Extrasystole: Polymorphic\n        - Ventricular Tachycardia: Non-sustained VT\n        - Non-sustained ventricular tachycardia (CH>10): Yes\n        - Paro

In [32]:
message = get_message(df['Prompts'][0])
message

[{'role': 'system',
  'content': 'You are a medical assistant. Your tasks are to generate a clinical note and create an Assessment and Plan section. Additionally, you will also determine the fate of the patient. The patient either survives for the next few years or succumbs to either sudden cardiac death or pump failure death. Only suggest death if there is strong evidence for it. Provide your confidence in survival, sudden cardiac death, and pump failure death such that the confidence percentages add up to 100 percent and format these results in a list. Please output a clinical note that has a section for demographics, medical history, lab results, LVEF, medication, and ECG impressions. In the end, put the Assessment and Plan section along with a prediction. Provide reasoning for the prediction.'},
 {'role': 'user',
  'content': 'Here is the patient data: \nGenerate a structured clinical note based on the following data:\n\nAge: 58\nGender: Male \nWeight: 83 kg\nHeight: 163 cm\nNYHA C

In [33]:
# Put message into LLM
input_text = tokenizer.apply_chat_template(message, tokenize = False, add_generation_prompt = True)
inputs = tokenizer(input_text, return_tensors = "pt").to(model.device)
output = model.generate(**inputs, max_new_tokens = 1000)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [34]:
# Get result
result = tokenizer.decode(output[0], skip_special_tokens = True)
result = result.replace("**", "")
result = extract_assistant_response(result)
result

"Clinical Note\n\nDemographics:\n\n- Patient Name: [Insert Patient Name]\n- Age: 58 years\n- Gender: Male\n- Weight: 83 kg\n- Height: 163 cm\n\nMedical History:\n\n- Past Medical History: Idiopathic dilated cardiomyopathy\n- Current Medications:\n  - Beta Blockers\n  - Digoxin\n  - Loop Diuretics\n  - ACE Inhibitor\n\nLab Results:\n\n- Albumin: 424.0 g/L\n- ALT or GPT: 10 IU/L\n- AST or GOT: 20 IU/L\n- Total Cholesterol: 54 mmol/L\n- Creatinine: 106 mmol/L\n- Gamma-glutamil transpeptidase: 20.0 IU/L\n- Glucose: 57.0 mmol/L\n- Hemoglobin: 132.0 g/L\n- HDL: 1.29 mmol/L\n- Potassium: 4.6 mEq/L\n- LDL: 3.36 mmol/L\n- Sodium: 141.0 mEq/L\n- Pro-BNP: 1834.0 ng/L\n- Protein: 69.0 g/L\n- T3: 0.05 pg/dL\n- T4: 15.0 ng/L\n- Troponin: 0.01 ng/mL\n- TSH: 3.02 mIU/L\n- Urea: 7.12 mg/dL\n\nLVEF:\n\n- Left Ventricular Ejection Fraction (LVEF): 35%\n\nECG Impression:\n\n- Ventricular Extrasystole: Polymorphic\n- Ventricular Tachycardia: Non-sustained VT\n- Non-sustained ventricular tachycardia (CH>10)

In [35]:
test_result = result.replace("**", "")
print(extract_assistant_response(test_result))

Clinical Note

Demographics:

- Patient Name: [Insert Patient Name]
- Age: 58 years
- Gender: Male
- Weight: 83 kg
- Height: 163 cm

Medical History:

- Past Medical History: Idiopathic dilated cardiomyopathy
- Current Medications:
  - Beta Blockers
  - Digoxin
  - Loop Diuretics
  - ACE Inhibitor

Lab Results:

- Albumin: 424.0 g/L
- ALT or GPT: 10 IU/L
- AST or GOT: 20 IU/L
- Total Cholesterol: 54 mmol/L
- Creatinine: 106 mmol/L
- Gamma-glutamil transpeptidase: 20.0 IU/L
- Glucose: 57.0 mmol/L
- Hemoglobin: 132.0 g/L
- HDL: 1.29 mmol/L
- Potassium: 4.6 mEq/L
- LDL: 3.36 mmol/L
- Sodium: 141.0 mEq/L
- Pro-BNP: 1834.0 ng/L
- Protein: 69.0 g/L
- T3: 0.05 pg/dL
- T4: 15.0 ng/L
- Troponin: 0.01 ng/mL
- TSH: 3.02 mIU/L
- Urea: 7.12 mg/dL

LVEF:

- Left Ventricular Ejection Fraction (LVEF): 35%

ECG Impression:

- Ventricular Extrasystole: Polymorphic
- Ventricular Tachycardia: Non-sustained VT
- Non-sustained ventricular tachycardia (CH>10): Yes
- Paroxysmal supraventricular tachyarrhythmi

In [36]:
# Test
df_test = df[[df.columns[1]]]
df_test['Reports'] = None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['Reports'] = None


In [37]:
for i in tqdm(range(5), desc = "Test"):
    # Get message
    message = get_message(df['Prompts'][i])

    # Put message into LLM
    input_text = tokenizer.apply_chat_template(message, tokenize = False, add_generation_prompt = True)
    inputs = tokenizer(input_text, return_tensors = "pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens = 1000)

    # Get result
    result = tokenizer.decode(output[0], skip_special_tokens = True)
    result = result.replace("**", "")
    result = extract_assistant_response(result)

    # Store result
    df_test.loc[i, 'Reports'] = result
    

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                                                                  | 0/5 [00:00<?, ?it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                                                          | 1/5 [00:39<02:37, 39.41s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                                                                          | 2/5 [01:11<01:44, 34.89s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.████████████████████                                                      | 3/5 [01:40<01:05, 32.53s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.███████████████████████████████████████████████                           | 4/5 [02:15<00:33, 33.32s/it]
Test: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████

In [40]:
print(df_test['Reports'][1])

Clinical Note

Demographics

- Patient Name: [Not provided]
- Date of Birth: [Not provided]
- Age: 58
- Gender: Male
- Weight: 74 kg
- Height: 160 cm

Medical History

- Ischemic dilated cardiomyopathy
- Dyslipemia
- Myocardial Infarction
- NYHA Class II

Lab Results

- Albumin: 404.0 g/L
- ALT: 20 IU/L
- AST: 20 IU/L
- Total Cholesterol: 618 mmol/L
- Creatinine: 121 mmol/L
- Gamma-glutamil transpeptidase: 44.0 IU/L
- Glucose: 56.0 mmol/L
- Hemoglobin: 126.0 g/L
- HDL: 0.98 mmol/L
- Potassium: 4.6 mEq/L
- LDL: 4.06 mmol/L
- Sodium: 140.0 mEq/L
- Pro-BNP: 570.0 ng/L
- Protein: 75.0 g/L
- T3: 0.04 pg/dL
- T4: 12.0 ng/L
- Troponin: 0.01 ng/mL
- TSH: 3.27 mIU/L
- Urea: 104.7 mg/dL

Medications

- Angiotensin II Receptor Blocker
- Beta Blockers
- Statins

ECG Impressions

- Ventricular Extrasystole: Monomorphic
- Ventricular Tachycardia: No
- Non-sustained ventricular tachycardia (CH>10): No
- Paroxysmal supraventricular tachyarrhythmia: No
- Bradycardia: No

LVEF

- Left Ventricular Ejecti