In [1]:
# Import packages
import json
from huggingface_hub import login
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline
import transformers
import random
import torch
import time
import re
from tqdm import tqdm
import pandas as pd
import numpy as np



In [2]:
# Read in csv file
df = pd.read_csv("../Data/subject-info-cleaned-with-ECGReport.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Patient ID,Follow-up period from enrollment (days),days_4years,Exit of the study,Cause of death,Age,Gender (male=1),Weight (kg),Height (cm),...,Anticoagulants/antitrombotics (yes=1),Betablockers (yes=1),Digoxin (yes=1),Loop diuretics (yes=1),Spironolactone (yes=1),Statins (yes=1),Hidralazina (yes=1),ACE inhibitor (yes=1),Nitrovasodilator (yes=1),ECG_impressions
0,1,P0002,2045,1460,0,0,58,1,74,160,...,1,1,0,0,0,1,0,0,0,\n ECG Impression:\n - V...
1,2,P0004,2044,1460,0,0,56,0,84,165,...,1,1,0,1,1,0,0,0,0,\n ECG Impression:\n - V...
2,3,P0006,2043,1460,0,0,70,1,83,165,...,1,1,0,1,1,0,0,1,1,\n ECG Impression:\n - V...
3,4,P0007,2042,1460,0,0,52,1,71,162,...,1,1,0,0,0,1,0,0,0,\n ECG Impression:\n - V...
4,5,P0009,2039,1460,0,0,64,0,68,155,...,1,1,0,1,0,1,0,1,1,\n ECG Impression:\n - V...


In [3]:
# Test dictionary 
def generate_dictionary(row):
    # Create a dictionary to store non-missing values
    patient_data = {col: row[col] for col in df.columns if pd.notna(row[col])}
    return patient_data
generate_dictionary(df.iloc[0])

{'Unnamed: 0': np.int64(1),
 'Patient ID': 'P0002',
 'Follow-up period from enrollment (days)': np.int64(2045),
 'days_4years': np.int64(1460),
 'Exit of the study': np.int64(0),
 'Cause of death': np.int64(0),
 'Age': np.int64(58),
 'Gender (male=1)': np.int64(1),
 'Weight (kg)': np.int64(74),
 'Height (cm)': np.int64(160),
 'Body Mass Index (Kg/m2)': np.int64(289),
 'NYHA class': np.int64(2),
 'Diastolic blood  pressure (mmHg)': np.int64(80),
 'Systolic blood pressure (mmHg)': np.int64(130),
 'HF etiology - Diagnosis': np.int64(2),
 'Diabetes (yes=1)': np.int64(0),
 'History of dyslipemia (yes=1)': np.int64(1),
 'Peripheral vascular disease (yes=1)': np.int64(0),
 'History of hypertension (yes=1)': np.int64(0),
 'Prior Myocardial Infarction (yes=1)': np.int64(1),
 'Prior implantable device': np.int64(0),
 'Prior Revascularization': np.int64(1),
 'Syncope': np.int64(0),
 'daily smoking (cigarretes/day)': np.int64(20),
 'smoke-free time (years)': np.int64(1),
 'cigarettes /year': np.in

In [4]:
# Test prompt 
def generate_prompt(row):
    # Create a dictionary to store non-missing values
    patient_data = {col: row[col] for col in df.columns if pd.notna(row[col])}

    # Start the prompt
    prompt = "You are a medical AI assistant. Generate a structured clinical note based on the following data:\n\n"

    # Add demographic information 
    if "Age" in patient_data:
        prompt += f"Age: {patient_data['Age']}\n"
    if "Gender (male=1)" in patient_data:
        if patient_data['Gender (male=1)'] == 1:
            prompt += f"Gender: Male \n"
        elif patient_data['Gender (male=1)'] == 0:
            prompt += f"Gender: Female \n"
    if "Weight (kg)" in patient_data:
        prompt += f"Weight: {patient_data['Weight (kg)']} kg\n"
    if "Height (cm)" in patient_data:
        prompt += f"Height: {patient_data['Height (cm)']} cm\n"

    # Add clinical features
    if "NYHA class" in patient_data:
        if patient_data['NYHA class'] == 2:
            prompt += f"NYHA Class: II\n"
        elif patient_data['NYHA class'] == 3:
            prompt += f"NYHA Class: III\n"
    if ("Systolic blood pressure (mmHg)" in patient_data) and ("Diastolic blood  pressure (mmHg)" in patient_data):
        prompt += f"Blood Pressure: {patient_data['Systolic blood pressure (mmHg)']}/{patient_data['Diastolic blood  pressure (mmHg)']} mmHg\n"

    # Past medical history
    past_medical_conditions = []
    for condition in ["HF etiology - Diagnosis", "Diabetes (yes=1)", "History of dyslipemia (yes=1)", "Peripheral vascular disease (yes=1)",
                     "History of hypertension (yes=1)", "Prior Myocardial Infarction (yes=1)"]:
        if (condition in patient_data) and (condition == "HF etiology - Diagnosis"):
            if patient_data['HF etiology - Diagnosis'] == 1:
                past_medical_conditions.append("Idiopathic dilated cardiomyopathy")
                # prompt += f"HF Etiology: Idiopathic dilated cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 2:
                past_medical_conditions.append("Ischemic dilated cardiomyopathy")
                # prompt += f"HF Etiology: Ischemic dilated cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 3:
                past_medical_conditions.append("Enolic dilated cardiomyopathy")
                # prompt += f"HF Etiology: Enolic dilated cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 4:
                past_medical_conditions.append("Valvular cardiomyopathy")
                # prompt += f"HF Etiology: Valvular cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 5:
                past_medical_conditions.append("Toxic dilated cardiomyopathy")
                #prompt += f"HF Etiology: Toxic dilated cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 6:
                past_medical_conditions.append("Post-myocardial dilated cardiomyopathy")
                # prompt += f"HF Etiology: Post-myocardial dilated cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 7:
                past_medical_conditions.append("Hypertropic cardiomyopathy")
                # prompt += f"HF Etiology: Hypertropic cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 8:
                past_medical_conditions.append("Hypertensive cardiomyopathy")
                # prompt += f"HF Etiology: Hypertensive cardiomyopathy\n"
            if patient_data['HF etiology - Diagnosis'] == 9:
                past_medical_conditions.append("Other HF etiology")
                # prompt += f"HF Etiology: Other\n"
        elif (condition in patient_data) and (condition == "Diabetes (yes=1)"):
            if patient_data['Diabetes (yes=1)'] == 1:
                past_medical_conditions.append("Diabetes")
        elif (condition in patient_data) and (condition == "History of dyslipemia (yes=1)"):
            if patient_data['History of dyslipemia (yes=1)'] == 1:
                past_medical_conditions.append("Dyslipemia")
        elif (condition in patient_data) and (condition == "Peripheral vascular disease (yes=1)"):
            if patient_data['Peripheral vascular disease (yes=1)'] == 1:
                past_medical_conditions.append("Peripheral vascular disease")
        elif (condition in patient_data) and (condition == "History of hypertension (yes=1)"):
            if patient_data['History of hypertension (yes=1)'] == 1:
                past_medical_conditions.append("Hypertension")
        elif (condition in patient_data) and (condition == "Prior Myocardial Infarction (yes=1)"):
            if patient_data['Prior Myocardial Infarction (yes=1)'] == 1:
                past_medical_conditions.append("Myocardial Infarction")
    if past_medical_conditions:
        prompt += "Past Medical History: " + ", ".join(past_medical_conditions) + "\n"
    else:
        prompt += "Past Medical History: None\n"

    # Lab results
    
    
            
    # for condition in ["Diabetes", "Dyslipidemia", "Hypertension", "Prior_MI"]:
    #     if condition in patient_data and patient_data[condition] == "Yes":
    #         past_medical_conditions.append(condition.replace("_", " "))  # Formatting
    # if past_medical_conditions:
    #     prompt += "Past Medical History: " + ", ".join(past_medical_conditions) + "\n"
    
    return prompt
    
print(generate_prompt(df.iloc[0]))


You are a medical AI assistant. Generate a structured clinical note based on the following data:

Age: 58
Gender: Male 
Weight: 74 kg
Height: 160 cm
NYHA Class: II
Blood Pressure: 130/80 mmHg
Past Medical History: Ischemic dilated cardiomyopathy, Dyslipemia, Myocardial Infarction



In [5]:
# Test prompt for first patient
def generate_prompt(row):
    # Create a dictionary to store non-missing values
    patient_data = {col: row[col] for col in df.columns if pd.notna(row[col])}

    # Start the prompt
    prompt = "You are a medical AI assistant. Generate a structured clinical note based on the following data:\n\n"

    # Add demographic and vital signs information
    if "Age" in patient_data:
        prompt += f"Age: {patient_data['Age']}\n"
    if "Gender" in patient_data:
        prompt += f"Gender: {patient_data['Gender']}\n"
    if "Height" in patient_data:
        prompt += f"Height: {patient_data['Height']} cm\n"
    if "Weight" in patient_data:
        prompt += f"Weight: {patient_data['Weight']} kg\n"
    if "NYHA_Class" in patient_data:
        prompt += f"NYHA Class: {patient_data['NYHA_Class']}\n"
    if "SBP" in patient_data and "DBP" in patient_data:
        prompt += f"Blood Pressure: {patient_data['SBP']}/{patient_data['DBP']} mmHg\n"

    # Add past medical history
    past_medical_conditions = []
    for condition in ["Diabetes", "Dyslipidemia", "Hypertension", "Prior_MI"]:
        if condition in patient_data and patient_data[condition] == "Yes":
            past_medical_conditions.append(condition.replace("_", " "))  # Formatting
    if past_medical_conditions:
        prompt += "Past Medical History: " + ", ".join(past_medical_conditions) + "\n"

    # Add lab results only if available
    lab_tests = ["Creatinine", "Glucose", "Hemoglobin", "LVEF", "proBNP"]
    for test in lab_tests:
        if test in patient_data:
            prompt += f"{test}: {patient_data[test]}\n"

    # Add ECG findings only if available
    ecg_findings = []
    for ecg_feature in ["Ventricular_Extrasystoles", "Ventricular_Tachycardia", 
                        "Nonsustained_VT", "Paroxysmal_SVT", "Bradycardia"]:
        if ecg_feature in patient_data and patient_data[ecg_feature] == "Yes":
            ecg_findings.append(ecg_feature.replace("_", " "))  # Formatting
    if ecg_findings:
        prompt += "ECG Findings: " + ", ".join(ecg_findings) + "\n"

    # Final instruction to the LLM
    prompt += "\nGenerate a structured clinical note using this information, including an Assessment & Plan section."

    return prompt

# Example usage for one patient
sample_prompt = generate_prompt(df.iloc[0])
print(sample_prompt)  # Send this to the LLM

You are a medical AI assistant. Generate a structured clinical note based on the following data:

Age: 58

Generate a structured clinical note using this information, including an Assessment & Plan section.
