## Import


In [1]:
import json

## Config


In [2]:
# TODO: adjust this according to your input data.
mode_decoding = "not greedy"
max_tokens = 700
seed = 42
frequency_penalty = 0
presence_penalty = 0

if mode_decoding == "greedy":
    temperature = 0
    top_p = 0
else:
    temperature = 0.1
    top_p = 0.9

## Batch query


### Client initialization


In [3]:
# TODO: Use the code below to generate 2 batch JSONL files for your task and send the generated JSONL file to me
# model_name_batch = "gpt-35-turbo-batch"
model_name_batch = "gpt-4o-batch"

### Data preparation


In [1]:
import pandas as pd
from ast import literal_eval
import json

def load_prompt_templates(prompt_file_path, template_key_prefix):
    """Load prompt templates from a JSON file, filtering by prefix."""
    with open(prompt_file_path, 'r') as file:
        prompt_templates_json = json.load(file)
    
    # Filter keys that start with the specified prefix
    prompt_templates_with_keys = [
        (key, prompt_templates_json[key])
        for key in prompt_templates_json
        if key.startswith(template_key_prefix)
    ]
    
    if not prompt_templates_with_keys:
        raise KeyError(f"No templates found for the key prefix: {template_key_prefix}")
    
    print(f'Loaded {len(prompt_templates_with_keys)} prompt templates for prefix "{template_key_prefix}"')
    return prompt_templates_with_keys

def get_input_prompt(data_path, prompt_path):
    if 'PPV' in data_path:
        dataset_name = "Internal Data"
    else:
        dataset_name = "External Data"

    input_df = pd.read_csv(data_path)
    input_df.loc[:, 'active_medications'] = input_df['active_medications'].apply(lambda x: literal_eval(x)).apply(lambda x: [med.lower() for med in x])
    input_df.loc[:, 'discontinued_medications'] = input_df['discontinued_medications'].apply(lambda x: literal_eval(x)).apply(lambda x: [med.lower() for med in x])
    if dataset_name != "Internal Data":
        input_df.loc[:, 'neither_medications'] = input_df['neither_medications'].apply(lambda x: literal_eval(x)).apply(lambda x: [med.lower() for med in x])

    col_list = ['active_medications', 'discontinued_medications', 'neither_medications'] if dataset_name != "Internal Data" else ['active_medications', 'discontinued_medications']
    true_set = input_df[col_list].apply(lambda x: set([med for meds in x for med in meds]), axis=1)

    snippets = input_df['snippet'].values.tolist()

    # load prompt.json
    prompt_prefix = 'Other' if dataset_name != "Internal Data" else 'Internal Data'
    prompts = load_prompt_templates(prompt_path, prompt_prefix)
    
    input_list = []
    prompt_key = []
    for prompt_key, prompt_template in prompts:
        input_list.extend([prompt_template.format(snippet) for snippet in snippets])
        
        ### add prompt for classification with true set
        p_list = prompt_template.split("\nOutput:") 
        # insert to the second last element
        insert = "Hint: Here is a complete list of medications included in this note: {}. Assign a status for each of them.\n"
        # rejoin the prompt template to have the option to insert the hint
        prompt_template = '\nOutput:'.join(p_list[:-1]) + insert + "\nOutput:" + p_list[-1]
        input_list.extend([prompt_template.format(snippet, ground_truth) for snippet, ground_truth in zip(snippets, true_set)])

    return input_list

data_folder = '/PHShome/cs1839/capstone_data/'
input_for_MIT = get_input_prompt(data_folder + 'medication_status_test.csv', 'prompts.json')
input_for_MIMIV = get_input_prompt(data_folder + 'mimic_iv_snippets_list_new.csv', 'prompts.json')
input_for_internal_data = get_input_prompt(data_folder + 'PPV_snippet_medications.csv', 'prompts.json')

# print the length of the prompts for each dataset
print('MIT:', len(input_for_MIT))
print('MIMIV:', len(input_for_MIMIV))
print('Internal Data:', len(input_for_internal_data))

Loaded 14 prompt templates for prefix "Other"
Loaded 14 prompt templates for prefix "Other"
Loaded 14 prompt templates for prefix "Internal Data"
MIT: 2800
MIMIV: 8372
Internal Data: 6748


In [5]:
# TODO: Put your input here
# there are 5 simulations for each prompt
list_data = input_for_MIT*5 + input_for_MIMIV*5 + input_for_internal_data*5   

print("Total number of prompts:", len(list_data))

Total number of prompts: 89600


In [6]:
system_message = '''You are a clinical assistant AI specialized in processing medical notes. Your primary tasks are:
1. Accurately extracting all mentioned medications as it is in the input notes.
2. Determining the status of each medication (active, discontinued, or neither) based on the context provided in the medical notes.

Your goal is to provide clear, accurate, and concise results, avoiding irrelevant information or hallucinations. Follow the specific task instructions provided in the user prompts to guide your outputs.
''' # TODO: Put your system message here (if applicable)

list_dict_data_batch = []
for idx, data in enumerate(list_data):
    dict_data_batch = {
        "custom_id": f"{idx}",
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": model_name_batch,
            "messages": [
                {"role": "system", "content": system_message}, # TODO: If no system message, then comment this line out
                {"role": "user", "content": data},
            ],
            "temperature": temperature,
            "top_p": top_p,
            "frequency_penalty": frequency_penalty,
            "presence_penalty": presence_penalty,
            "max_tokens": max_tokens,
        },
    }
    list_dict_data_batch.append(dict_data_batch)

In [7]:
# save into jsonl
path_file_data_batch = f"data_batch_{model_name_batch}.jsonl"
with open(path_file_data_batch, "w", encoding="utf-8") as f:
    for dict_data_batch in list_dict_data_batch:
        f.write(json.dumps(dict_data_batch, ensure_ascii=False) + "\n")
print(f"Saved {len(list_dict_data_batch)} data into {path_file_data_batch}")

Saved 89600 data into data_batch_gpt-4o-batch.jsonl


# After you send the 2 JSONL files to me, I will run the batch jobs for you using the 2 models and give you the "result_batch_{model_name_batch}.jsonl" files. Please use the code below to fetch the model output.

In [2]:
import json
path_file_result_batch = "/PHShome/cs1839/capstone_data/jsonl/data_batch_gpt-4o-batch_result.jsonl"
path_input_filtered = "/PHShome/cs1839/capstone_data/jsonl/data_batch_gpt-4o-batch_input_filter_result.jsonl"
path_output_filtered = "/PHShome/cs1839/capstone_data/jsonl/data_batch_gpt-4o-batch_output_filter_result.jsonl"

info_dict = {}
result_list = []
token_usage_dict_list = []
with open(path_file_result_batch, "r", encoding="utf-8") as f:
    list_dict_result = sorted([json.loads(line) for line in f.readlines()], key=lambda x: x["custom_id"])
    for model_response_dict in list_dict_result:
        id = int(model_response_dict["custom_id"])
        try:
            model_response_str = model_response_dict['response']['body']['choices'][0]['message']['content']
        except: # The result is filtered by the safety filter, return empty string
            model_response_str = ""
        token_usage = model_response_dict['response']['body']['usage'] # Token (input, output, and total) usage for each sample
        info_dict[id] = (model_response_str, token_usage)

not_returned = []
filtered = []
for i in range(0, 89600):
    if i in info_dict:
        model_response_str, token_usage = info_dict[i]
        if model_response_str == "":
            filtered.append(i)
        result_list.append(model_response_str)
        token_usage_dict_list.append(token_usage)
    else:
        result_list.append("")
        token_usage_dict_list.append({})
        not_returned.append(i)
        
# update the output content filtered by the safety filter
with open(path_output_filtered, "r", encoding="utf-8") as f:
    output_filtered_list_dict_result = sorted([json.loads(line) for line in f.readlines()], key=lambda x: int(x["custom_id"]))
    for model_response_dict in output_filtered_list_dict_result:
        id = int(model_response_dict["custom_id"])
        try:
            model_response_str = model_response_dict['response']['body']['choices'][0]['message']['content']
        except: # The result is filtered by the safety filter, return empty string
            print('id:', id)
            model_response_str = ""
        token_usage = model_response_dict['response']['body']['usage'] # Token (input, output, and total) usage for each sample
        info_dict[id] = (model_response_str, token_usage)

        # update the result_list and token_usage_dict_list
        result_list[id] = model_response_str
        info_dict[id] = (model_response_str, token_usage)

# update the input content filtered by the safety filter
with open(path_input_filtered, "r", encoding="utf-8") as f:
    input_filtered_list_dict_result = sorted([json.loads(line) for line in f.readlines()], key=lambda x: int(x["custom_id"]))
    for model_response_dict in input_filtered_list_dict_result:
        id = int(model_response_dict["custom_id"])
        try:
            model_response_str = model_response_dict['response']['body']['choices'][0]['message']['content']
        except: # The result is filtered by the safety filter, return empty string
            print('id:', id)
            model_response_str = ""
        token_usage = model_response_dict['response']['body']['usage'] # Token (input, output, and total) usage for each sample
        info_dict[id] = (model_response_str, token_usage)

        # update the result_list and token_usage_dict_list
        result_list[id] = model_response_str
        info_dict[id] = (model_response_str, token_usage)



In [41]:
# check any item in the result_list is empty
empty = [i for i, x in enumerate(result_list) if x == ""]
print('Empty:', empty)

Empty: []


In [9]:
print(len(result_list))
print(len(token_usage_dict_list))
print(f"Number of samples that are not returned: {len(not_returned)}")
print(f"Custom IDs of the not returned samples that are not returned: {not_returned}")
print(f"Number of samples that are filtered: {len(filtered)}")
print(f"Custom IDs of the not returned samples that are filtered: {filtered}")

89600
89600
Number of samples that are not returned: 10
Custom IDs of the not returned samples that are not returned: [2088, 2188, 4888, 4988, 7688, 7788, 10488, 10588, 13288, 13388]
Number of samples that are filtered: 4130
Custom IDs of the not returned samples that are filtered: [200, 225, 240, 300, 325, 340, 400, 440, 500, 525, 540, 600, 625, 640, 671, 700, 725, 740, 800, 825, 840, 900, 940, 1000, 1020, 1025, 1040, 1100, 1125, 1140, 1595, 1695, 1795, 1808, 1840, 1908, 1940, 1995, 2005, 2008, 2040, 2140, 2195, 2208, 2240, 2274, 2308, 2340, 2374, 2395, 2400, 2425, 2440, 2500, 2525, 2540, 2795, 3000, 3025, 3040, 3125, 3140, 3200, 3325, 3340, 3400, 3425, 3440, 3471, 3500, 3525, 3540, 3600, 3640, 3700, 3725, 3740, 3800, 3820, 3840, 3900, 3920, 3925, 3940, 4408, 4495, 4540, 4595, 4608, 4640, 4708, 4740, 4795, 4800, 4808, 4840, 4854, 4940, 4995, 5008, 5040, 5108, 5140, 5195, 5200, 5225, 5240, 5300, 5325, 5340, 5495, 5508, 5540, 5595, 5800, 5825, 5840, 5871, 5900, 5925, 5940, 6000, 6025, 6

In [44]:
print(result_list[0])
print(token_usage_dict_list[0])

- dobutamine (active)
- Nipride (active)
- Esmolol (discontinued)
{'completion_tokens': 21, 'prompt_tokens': 254, 'total_tokens': 275}


In [45]:
total_input = 0
total_output = 0
total_all = 0
for element in token_usage_dict_list:
    if len(element) > 0:
        total_input += element["prompt_tokens"]
        total_output += element["completion_tokens"]
        total_all += element["total_tokens"]

print(f"Total input tokens: {total_input}\nTotal output tokens: {total_output}\nTotal tokens: {total_all}")

Total input tokens: 55097725
Total output tokens: 6479691
Total tokens: 61577416


The *token_usage_dict_list* will look like [{'completion_tokens': 28, 'prompt_tokens': 21, 'total_tokens': 49}, {'completion_tokens': 208, 'prompt_tokens': 16, 'total_tokens': 224}], where *completion_tokens* is the **output** token, *prompt_tokens* is the **input** token

# process result

In [4]:
from vllm_inference import *


# 3. Update run_pipeline to use LLMEngine and align with the original logic
def run_llm_pipeline(input_df, response_list, dataset_name):
    """
    Main function to run the text generation pipeline using LLMEngine and compute metrics.
    
    Parameters:
    ----------
    llm_model : str
        The initilized llm model.
    input_df : pd.DataFrame
        The data to be inferred.
    prompt_template : str
        Template for constructing the prompts.
    dataset_name : str
        Name of the dataset. Used for special cases like Internal Data exclusion.
    batch_size : int
        Number of examples per batch.
    max_token_output : int
        Maximum number of tokens to generate.
    use_sampling : bool
        Whether to use sampling (or greedy decoding).
    
    Returns:
    -------
    result_df : pd.DataFrame
        DataFrame with the processed outputs and calculated metrics.
    """
    # Generate responses
    response_list_wo_groundtruth, response_list_with_groundtruth = response_list

    # Process the responses to categorize medications
    df_w_classifications = process_output(input_df, response_list_wo_groundtruth, dataset_name)  
    # process the responses with ground truth to categorize medications
    df_w_classifications_with_groundtruth = process_output(input_df, response_list_with_groundtruth, dataset_name)

    # Calculate row-level metrics
    extraction_precision, extraction_recall, extraction_f1, joint_accuracy, joint_macro_f1, joint_macro_precision, joint_macro_recall = calculate_metrics_by_dataset(df_w_classifications, dataset_name)
    # calculate the classification metrics with responses with ground truth
    accuracy, macro_f1, macro_precision, macro_recall = calculate_classification_metrics(df_w_classifications_with_groundtruth, dataset_name)

    # append the active_medications_pred, discontinued_medications_pred, neither_medications_pred to the df_w_classifications with extension of _with_groundtruth
    df_w_classifications_with_groundtruth = df_w_classifications_with_groundtruth[['model_response', 'active_medications_pred', 'discontinued_medications_pred', 'neither_medications_pred']] if dataset_name != "Internal Data" else df_w_classifications_with_groundtruth[['model_response', 'active_medications_pred', 'discontinued_medications_pred']]
    df_w_classifications_with_groundtruth.columns = [f'{col}_with_groundtruth' for col in df_w_classifications_with_groundtruth.columns]
    df_w_classifications = pd.concat([df_w_classifications, df_w_classifications_with_groundtruth], axis=1)

    # Return the final DataFrame with metrics
    return df_w_classifications, extraction_precision, extraction_recall, extraction_f1, joint_accuracy, joint_macro_f1, joint_macro_precision, joint_macro_recall, accuracy, macro_f1, macro_precision, macro_recall


def json_metric_calculation(dataset_name, result_list, input_df, data_folder, result_df_path):

    model_name = 'gpt-4o'
    
    # Check if input_df.active_medications[0] is a list, if not, apply eval and lower to medications
    if not isinstance(input_df.active_medications[0], list):
        input_df.loc[:, 'active_medications'] = input_df['active_medications'].apply(lambda x: literal_eval(x)).apply(lambda x: [med.lower() for med in x])
        input_df.loc[:, 'discontinued_medications'] = input_df['discontinued_medications'].apply(lambda x: literal_eval(x)).apply(lambda x: [med.lower() for med in x])
        if dataset_name != "Internal Data":
            input_df.loc[:, 'neither_medications'] = input_df['neither_medications'].apply(lambda x: literal_eval(x)).apply(lambda x: [med.lower() for med in x])
    
    col_list = ['active_medications', 'discontinued_medications', 'neither_medications'] if dataset_name != "Internal Data" else ['active_medications', 'discontinued_medications']
    input_df['true_set'] = input_df[col_list].apply(lambda x: set([med for meds in x for med in meds]), axis=1)

    prompt_prefix = 'Other' if dataset_name != "Internal Data" else 'Internal Data'
    prompt_templates_with_keys = load_prompt_templates('prompts.json', prompt_prefix)
    length_of_content_per_prompt = len(result_list)//len(prompt_templates_with_keys)

    count_per_sim = len(result_list)//5

    for sim in range(5):

        for idx, prompt in enumerate(prompt_templates_with_keys):
            prompt_key, prompt_template = prompt

            # get the content for simulation sim
            results_sim = result_list[sim*count_per_sim:(sim+1)*count_per_sim]
            length_of_content_per_prompt = len(results_sim)//len(prompt_templates_with_keys)
            
            # get the content for prompt idx
            results_prompt = results_sim[idx*length_of_content_per_prompt:(idx+1)*length_of_content_per_prompt]
            # get the content without the hint
            results_prompt_without_hint = results_prompt[:length_of_content_per_prompt//2]
            # get the content with the hint
            results_prompt_with_hint = results_prompt[length_of_content_per_prompt//2:]

            # Run the LLMEngine pipeline
            df_w_classifications, extraction_precision, extraction_recall, extraction_f1, joint_accuracy, \
            joint_macro_f1, joint_macro_precision, joint_macro_recall, accuracy, macro_f1, macro_precision, \
            macro_recall = run_llm_pipeline(
                input_df = input_df, 
                response_list = (results_prompt_without_hint, results_prompt_with_hint),
                dataset_name = dataset_name
            )

            # Save the row metrics DataFrame to a CSV
            output_filename = f'{dataset_name}_{model_name}_sim_{sim}_{prompt_key}.csv'
            df_w_classifications.to_csv(data_folder + f'base_pred_data/{output_filename}', index=False)

            # Read the results CSV
            result_df = pd.read_csv(result_df_path)

            # Define your result row
            new_row = {
                'Dataset': dataset_name,
                'Model': model_name,
                'Prompt': prompt_template,
                'Simulation': sim,

                'extraction_precision': extraction_precision,
                'extraction_recall': extraction_recall,
                'extraction_f1': extraction_f1,

                'accuracy_w_gt': accuracy,
                'macro_f1_w_gt': macro_f1,
                'macro_precision_w_gt': macro_precision,
                'macro_recall_w_gt': macro_recall,

                'joint_accuracy': joint_accuracy,
                'joint_macro_f1': joint_macro_f1,
                'joint_macro_precision': joint_macro_precision,
                'joint_macro_recall': joint_macro_recall,
            }

            new_row = pd.DataFrame([new_row])

            # Append the new row to the results DataFrame and save
            result_df = pd.concat([result_df, new_row], axis=0, ignore_index=True)
            result_df.to_csv(result_df_path, index=False)

# split the result into 3 datasets
result_MIT = result_list[:len(input_for_MIT)*5]
result_MIMIV = result_list[len(input_for_MIT)*5:len(input_for_MIT)*5+len(input_for_MIMIV)*5]
result_internal_data = result_list[-len(input_for_internal_data)*5:]


data_folder = '/PHShome/cs1839/capstone_data/'
MIT = pd.read_csv(data_folder + 'medication_status_test.csv')
MIMIV = pd.read_csv(data_folder + 'mimic_iv_snippets_list_new.csv')
internal_data = pd.read_csv(data_folder + 'PPV_snippet_medications.csv')

json_metric_calculation(dataset_name='MIT',
                        result_list=result_MIT,
                        input_df=MIT,
                        data_folder=data_folder,
                        result_df_path=data_folder + 'results_gpt4o.csv')

json_metric_calculation(dataset_name="MIMIC-IV",
                        result_list=result_MIMIV,
                        input_df=MIMIV,
                        data_folder=data_folder,
                        result_df_path=data_folder + 'results_gpt4o.csv')

json_metric_calculation(dataset_name='Internal Data',
                        result_list=result_internal_data,
                        input_df=internal_data,
                        data_folder=data_folder,
                        result_df_path=data_folder + 'results_gpt4o.csv')

Loaded 14 prompt templates for prefix "Other"


  result_df = pd.concat([result_df, new_row], axis=0, ignore_index=True)


Loaded 14 prompt templates for prefix "Other"
Loaded 14 prompt templates for prefix "Internal Data"
