In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import json
from huggingface_hub import login

from tqdm import tqdm
import torch
from transformers import AutoTokenizer, pipeline

import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import re


# Read the JSON config file
with open('config.json', 'r') as f:
    config = json.load(f)

# Get the token from the JSON file
hg_token = config['HuggingFace']['token']
# Login using the token
login(token=hg_token)

# LLM folder
llm_folder = "/PHShome/jn180/llm_public_host"
# Data folder
data_folder = "/PHShome/cs1839/capstone_data/"
# results table path
results_df_path = data_folder + "results.csv"

# data to inference 
medication_status_test = pd.read_csv(data_folder + "medication_status_test.csv")

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /PHShome/cs1839/.cache/huggingface/token
Login successful


# Draft


In [4]:
model_path = "/netapp3/raw_data3/share/llm_public_host/Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(model_path)
    
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos_token (common for autoregressive models)
tokenizer.padding_side = "left"  # Set padding to left for autoregressive models

# Initialize the pipeline for text generation
generator = pipeline(
    task="text-generation",
    model=model_path,
    tokenizer=tokenizer,  # Pass the tokenizer with left padding settings
    device=0,  # '0' for GPU, '-1' for CPU
    model_kwargs={"torch_dtype": torch.bfloat16} # Use torch.bfloat16 for faster generation
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.05s/it]


In [52]:
sub_df = medication_status_test['snippet'].values.tolist()

batch_size = 16
num_step = len(sub_df) // batch_size + 1 if len(sub_df) % batch_size != 0 else len(sub_df) // batch_size
max_token_output = 80
response_list = []

# Settings for text generation
use_sampling = True  # Set to True if you want to use sampling; False for greedy search
temperature = 0.1 if use_sampling else None  # Set temperature for sampling; None for greedy
top_p = 0.9 if use_sampling else None  # Use top-p sampling only when sampling is enabled

# Iterate through batches
for i in tqdm(range(num_step)):
    input_texts = sub_df[i*batch_size:(i+1)*batch_size]

    prompt = """
Identify and categorize the medications mentioned in the following medical note. Extract all medications the patient has taken before, is currently taking, and any other medications mentioned.
Note: Adjust the number of medications in each category based on the input. Write None if no other medication mentioned. Strictly follow the output format.
Expected Output Format:
"
- Current Medications (Active): [Medication 1], [Medication 2]
- Discontinued Medications: [Medication 3], [Medication 4]
- Other Mentioned Medications (neither active nor discontinued): [Medication 5], [Medication 6]
END"

Input Medical Note:
"""
    output = """
    \nOutput:\n
    """

    input_texts = [prompt + text + output for text in input_texts]

    # Generate responses for each batch
    responses = generator(
        input_texts,  # Concatenate the prompt and input texts
        max_new_tokens=max_token_output,   # Limit the number of new tokens in the output
        pad_token_id=generator.tokenizer.eos_token_id,  # Set the pad_token_id
        eos_token_id=generator.tokenizer.eos_token_id,  # Set the eos_token_id
        
        truncation=True,          # Truncate the input if it's longer than max_token_input
        do_sample=use_sampling,   # Sampling or greedy search
        temperature=temperature,  # Only set if sampling is enabled
        top_p=top_p,              # Only set if sampling is enabled
    )

    # Loop through each input and its corresponding response
    for response in responses:
        # Each `response` is a list with one dictionary, so we need to extract the first item
        for generated in response:  # Loop through the list in case of multiple generations
            # only save the generated output
            response_list.append(generated['generated_text'].split("\nOutput:\n")[1].split("END")[0])


100%|██████████| 7/7 [02:15<00:00, 19.30s/it]


In [77]:
def process_output(input_df, response_list):
    """
    Processes a list of LLM responses to extract medication information and adds it to the input DataFrame.

    This function takes an input DataFrame (`input_df`) and a list of responses (`response_list`),
    where each response contains categorized medication data. The function extracts three categories
    of medications (active, discontinued, and neither), formats them into lists, and creates a new
    DataFrame with three columns:
    
    - `active_medications`: Medications that the patient is currently taking.
    - `discontinued_medications`: Medications that the patient has taken but has since discontinued.
    - `neither_medications`: Medications that are mentioned but are neither currently taken nor discontinued.

    The new DataFrame with these three columns is concatenated with the `input_df` and returned.

    Parameters:
    ----------
    input_df : pd.DataFrame
        The original input DataFrame, which will be concatenated with the extracted medication data.
    
    response_list : list of str
        A list of strings containing the LLM responses. Each response includes a categorized list of medications
        (active, discontinued, and neither).

    Returns:
    -------
    pd.DataFrame
        A new DataFrame that concatenates the `input_df` with the extracted medication data.
        The resulting DataFrame will have the original columns from `input_df`, plus three new columns:
        `active_medications`, `discontinued_medications`, and `neither_medications`, each containing a list of medications.

    Example:
    --------
    >>> input_df = pd.DataFrame({'notes': ["Note 1", "Note 2"]})
    >>> response_list = [
    >>>     'Current Medications (Active): Aspirin\nDiscontinued Medications: Atenolol\nOther Mentioned Medications: Ibuprofen',
    >>>     'Current Medications (Active): None\nDiscontinued Medications: Metoprolol\nOther Mentioned Medications: Acetaminophen'
    >>> ]
    >>> final_df = process_output(input_df, response_list)
    >>> print(final_df)
    
    Output:
    -------
        notes    active_medications     discontinued_medications    neither_medications
        Note 1   [Aspirin]              [Atenolol]                  [Ibuprofen]
        Note 2   []                     [Metoprolol]                [Acetaminophen]
    """
    # Initialize lists to store the medications for each category
    active_medications_list = []
    discontinued_medications_list = []
    neither_medications_list = []

    # Loop through each response in the response_list
    for response in response_list:
        # Extract the active, discontinued, and neither medications using regular expressions
        active_medications = re.search(r'Current Medications \(Active\):\s*(.*)', response)
        discontinued_medications = re.search(r'Discontinued Medications:\s*(.*)', response)
        neither_medications = re.search(r'Other Mentioned Medications.*:\s*(.*)', response)

        # Convert to lists and handle None cases
        active_medications = active_medications.group(1).split(', ') if active_medications and active_medications.group(1) != "None" else []
        discontinued_medications = discontinued_medications.group(1).split(', ') if discontinued_medications and discontinued_medications.group(1) != "None" else []
        neither_medications = neither_medications.group(1).split(', ') if neither_medications and neither_medications.group(1) != "None" else []

        # Append each category list to their respective main lists
        active_medications_list.append(active_medications)
        discontinued_medications_list.append(discontinued_medications)
        neither_medications_list.append(neither_medications)

    # Create a new DataFrame from the lists
    output_df = pd.DataFrame({
        'active_medications_pred': active_medications_list,
        'discontinued_medications_pred': discontinued_medications_list,
        'neither_medications_pred': neither_medications_list
    })

    # Concatenate the input_df with the output_df by rows
    result_df = pd.concat([input_df, output_df], axis=1)

    return result_df

def calculate_row_metrics(df):
    columns = df.columns.tolist()
    # Iterate over the three categories
    for category in ['active_medications', 'discontinued_medications', 'neither_medications']:
        true_col = category
        pred_col = category + '_pred'

        # Check the type of true_col, if not list, use eval to convert back
        if not isinstance(df[true_col][0], list):
            df[true_col] = df[true_col].apply(lambda x: eval(x))

        # Initialize columns to store row-wise metrics
        df.loc[:, 'avg_precision'] = np.nan
        df.loc[:, 'avg_recall'] = np.nan
        df.loc[:, 'avg_f1'] = np.nan
        df.loc[:, 'avg_accuracy'] = np.nan

        # For each row, compute metrics
        for index, row in df.iterrows():
            # Convert lists to sets for easier comparison
            true_set = set(row[true_col])
            pred_set = set(row[pred_col])
            
            # Check if both sets are empty
            if not true_set and not pred_set:
                precision, recall, f1, accuracy = 1.0, 1.0, 1.0, 1.0  # perfect scores when both are empty
            else:
                # Create binary lists: 1 if medication is present, 0 otherwise
                all_medications = list(true_set.union(pred_set))
                true_binary = [1 if med in true_set else 0 for med in all_medications]
                pred_binary = [1 if med in pred_set else 0 for med in all_medications]

                # Calculate precision, recall, F1, accuracy
                precision = precision_score(true_binary, pred_binary, zero_division=1)
                recall = recall_score(true_binary, pred_binary, zero_division=1)
                f1 = f1_score(true_binary, pred_binary, zero_division=1)
                accuracy = accuracy_score(true_binary, pred_binary)

            # Append the metrics to the DataFrame
            df.loc[index, f'{category}_precision'] = precision
            df.loc[index, f'{category}_recall'] = recall
            df.loc[index, f'{category}_f1'] = f1
            df.loc[index, f'{category}_accuracy'] = accuracy
        
    # get the average of each metric and append to a column as avg_precision, avg_recall, avg_f1, avg_accuracy
    df['avg_precision'] = df[['active_medications_precision', 'discontinued_medications_precision', 'neither_medications_precision']].mean(axis=1)
    df['avg_recall'] = df[['active_medications_recall', 'discontinued_medications_recall', 'neither_medications_recall']].mean(axis=1)
    df['avg_f1'] = df[['active_medications_f1', 'discontinued_medications_f1', 'neither_medications_f1']].mean(axis=1)
    df['avg_accuracy'] = df[['active_medications_accuracy', 'discontinued_medications_accuracy', 'neither_medications_accuracy']].mean(axis=1)

    return df[columns+['avg_precision', 'avg_recall', 'avg_f1', 'avg_accuracy']]


df_w_classifications = process_output(medication_status_test, response_list)
df_w_row_metrics = calculate_row_metrics(df_w_classifications)

result_df = pd.read_csv(data_folder+'results.csv')
metrics_mean = df_w_row_metrics[['avg_precision', 'avg_recall', 'avg_f1', 'avg_accuracy']].mean(axis=0)

# Define your result row
new_row = {
    'Dataset': 'MIT',
    'Model': model_path.split('/')[-1],
    'Prompt': prompt,
    'Accuracy': metrics_mean.get('avg_accuracy', np.nan),
    'Precision': metrics_mean.get('avg_precision', np.nan),
    'Recall': metrics_mean.get('avg_recall', np.nan),
    'F1': metrics_mean.get('avg_f1', np.nan)
}

result_df._append(new_row, ignore_index=True)

  result_df._append(new_row, ignore_index=True)


Unnamed: 0,Dataset,Model,Prompt,Accuracy,Precision,Recall,F1
0,MIT,meta-llama/Meta-Llama-3.1-8B-Instruct,\nIdentify and categorize the medications ment...,0.602556,0.688087,0.745139,0.629691


# Pipeline

In [29]:
import re
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm
import gc

# 1. Function to initialize model and tokenizer
def initialize_model(model_path, device=0, use_fp16=True):
    """
    Initializes the model and tokenizer for text generation.
    
    Parameters:
    ----------
    model_path : str
        Path of the model to be loaded.
    device : int
        Device to use, 0 for GPU and -1 for CPU.
    use_fp16 : bool
        Whether to use FP16 for inference.
    
    Returns:
    -------
    generator : pipeline
        A HuggingFace pipeline ready for text generation.
    """
    # Load tokenizer and set padding side to left
    tokenizer = AutoTokenizer.from_pretrained(model_path)
        
    tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos_token (common for autoregressive models)
    tokenizer.padding_side = "left"  # Set padding to left for autoregressive models

    # Initialize the pipeline for text generation
    generator = pipeline(
        task="text-generation",
        model=model_path,
        tokenizer=tokenizer,  # Pass the tokenizer with left padding settings
        device=device,  # '0' for GPU, '-1' for CPU
        model_kwargs={"torch_dtype": torch.bfloat16} if use_fp16 else {}
    )
    return generator

# 2. Function to generate batch responses using the model
    return response_list

# 2. Function to generate batch responses using the model
def generate_responses(input_df, batch_size, generator, prompt_template, max_token_output=80, use_sampling=True):
    """
    Generate text responses in batches using the generator.
    
    Parameters:
    ----------
    input_df : list of str
        List of input texts to run inference on.
    batch_size : int
        Size of each batch for inference.
    generator : pipeline
        HuggingFace pipeline initialized for text generation.
    prompt_template : str
        The template for the prompt to be used.
    max_token_output : int
        Maximum number of tokens to generate.
    use_sampling : bool
        Whether to use sampling or greedy decoding.
    
    Returns:
    -------
    response_list : list of str
        List of generated responses.
    """
    sub_df = input_df['snippet'].values.tolist()

    response_list = []
    num_step = len(sub_df) // batch_size + (1 if len(sub_df) % batch_size != 0 else 0)
    temperature = 0.1 if use_sampling else None
    top_p = 0.9 if use_sampling else None

    for i in tqdm(range(num_step)):
        input_texts = sub_df[i*batch_size:(i+1)*batch_size]
        input_texts = [prompt_template.format(text) for text in input_texts]

        # Generate the responses
        responses = generator(
            input_texts,  
            max_new_tokens=max_token_output,  
            pad_token_id=generator.tokenizer.eos_token_id,
            eos_token_id=generator.tokenizer.eos_token_id,
            truncation=True,
            do_sample=use_sampling,
            temperature=temperature,
            top_p=top_p
        )

        # Process the output
        for response in responses:
            for generated in response:
                # Extract relevant part of the response and append to list
                response_list.append(generated['generated_text'].split("\nOutput:\n")[1].split("END")[0])
    
    return response_list

# 3. Function to process the LLM output
def process_output(input_df, response_list):
    """
    Processes a list of LLM responses to extract medication information and adds it to the input DataFrame.

    This function takes an input DataFrame (`input_df`) and a list of responses (`response_list`),
    where each response contains categorized medication data. The function extracts three categories
    of medications (active, discontinued, and neither), formats them into lists, and creates a new
    DataFrame with three columns:
    
    - `active_medications`: Medications that the patient is currently taking.
    - `discontinued_medications`: Medications that the patient has taken but has since discontinued.
    - `neither_medications`: Medications that are mentioned but are neither currently taken nor discontinued.

    The new DataFrame with these three columns is concatenated with the `input_df` and returned.

    Parameters:
    ----------
    input_df : pd.DataFrame
        The original input DataFrame, which will be concatenated with the extracted medication data.
    
    response_list : list of str
        A list of strings containing the LLM responses. Each response includes a categorized list of medications
        (active, discontinued, and neither).

    Returns:
    -------
    pd.DataFrame
        A new DataFrame that concatenates the `input_df` with the extracted medication data.
        The resulting DataFrame will have the original columns from `input_df`, plus three new columns:
        `active_medications`, `discontinued_medications`, and `neither_medications`, each containing a list of medications.

    Example:
    --------
    >>> input_df = pd.DataFrame({'notes': ["Note 1", "Note 2"]})
    >>> response_list = [
    >>>     'Current Medications (Active): Aspirin\nDiscontinued Medications: Atenolol\nOther Mentioned Medications: Ibuprofen',
    >>>     'Current Medications (Active): None\nDiscontinued Medications: Metoprolol\nOther Mentioned Medications: Acetaminophen'
    >>> ]
    >>> final_df = process_output(input_df, response_list)
    >>> print(final_df)
    
    Output:
    -------
        notes    active_medications     discontinued_medications    neither_medications
        Note 1   [Aspirin]              [Atenolol]                  [Ibuprofen]
        Note 2   []                     [Metoprolol]                [Acetaminophen]
    """
    # Initialize lists to store the medications for each category
    active_medications_list = []
    discontinued_medications_list = []
    neither_medications_list = []

    # Loop through each response in the response_list
    for response in response_list:
        # Extract the active, discontinued, and neither medications using regular expressions
        active_medications = re.search(r'Current Medications \(Active\):\s*(.*)', response)
        discontinued_medications = re.search(r'Discontinued Medications:\s*(.*)', response)
        neither_medications = re.search(r'Other Mentioned Medications.*:\s*(.*)', response)

        # Convert to lists and handle None cases
        active_medications = active_medications.group(1).split(', ') if active_medications and active_medications.group(1) != "None" else []
        discontinued_medications = discontinued_medications.group(1).split(', ') if discontinued_medications and discontinued_medications.group(1) != "None" else []
        neither_medications = neither_medications.group(1).split(', ') if neither_medications and neither_medications.group(1) != "None" else []

        # Append each category list to their respective main lists
        active_medications_list.append(active_medications)
        discontinued_medications_list.append(discontinued_medications)
        neither_medications_list.append(neither_medications)

    # Create a new DataFrame from the lists
    output_df = pd.DataFrame({
        'active_medications_pred': active_medications_list,
        'discontinued_medications_pred': discontinued_medications_list,
        'neither_medications_pred': neither_medications_list
    })

    # Concatenate the input_df with the output_df by rows
    result_df = pd.concat([input_df, output_df], axis=1)

    return result_df

# 4. Function to calculate metrics (Precision, Recall, F1, Accuracy)
def calculate_row_metrics(df):
    columns = df.columns.tolist()

    # Iterate over the three categories
    for category in ['active_medications', 'discontinued_medications', 'neither_medications']:
        true_col = category
        pred_col = category + '_pred'

        # Check the type of true_col, if not list, use eval to convert back
        # Ensure that the column contains lists (if it's a string, evaluate it)
        df[true_col] = df[true_col].apply(lambda x: eval(x) if isinstance(x, str) else x)

        # Initialize columns to store row-wise metrics
        df.loc[:, 'avg_precision'] = np.nan
        df.loc[:, 'avg_recall'] = np.nan
        df.loc[:, 'avg_f1'] = np.nan
        df.loc[:, 'avg_accuracy'] = np.nan

        # For each row, compute metrics
        for index, row in df.iterrows():
            # Convert lists to sets for easier comparison
            true_set = set(row[true_col])
            pred_set = set(row[pred_col])
            
            # Check if both sets are empty
            if not true_set and not pred_set:
                precision, recall, f1, accuracy = 1.0, 1.0, 1.0, 1.0  # perfect scores when both are empty
            else:
                # Create binary lists: 1 if medication is present, 0 otherwise
                all_medications = list(true_set.union(pred_set))
                true_binary = [1 if med in true_set else 0 for med in all_medications]
                pred_binary = [1 if med in pred_set else 0 for med in all_medications]

                # Calculate precision, recall, F1, accuracy
                precision = precision_score(true_binary, pred_binary, zero_division=1)
                recall = recall_score(true_binary, pred_binary, zero_division=1)
                f1 = f1_score(true_binary, pred_binary, zero_division=1)
                accuracy = accuracy_score(true_binary, pred_binary)

            # Append the metrics to the DataFrame
            df.loc[index, f'{category}_precision'] = precision
            df.loc[index, f'{category}_recall'] = recall
            df.loc[index, f'{category}_f1'] = f1
            df.loc[index, f'{category}_accuracy'] = accuracy
        
    # get the average of each metric and append to a column as avg_precision, avg_recall, avg_f1, avg_accuracy
    df['avg_precision'] = df[['active_medications_precision', 'discontinued_medications_precision', 'neither_medications_precision']].mean(axis=1)
    df['avg_recall'] = df[['active_medications_recall', 'discontinued_medications_recall', 'neither_medications_recall']].mean(axis=1)
    df['avg_f1'] = df[['active_medications_f1', 'discontinued_medications_f1', 'neither_medications_f1']].mean(axis=1)
    df['avg_accuracy'] = df[['active_medications_accuracy', 'discontinued_medications_accuracy', 'neither_medications_accuracy']].mean(axis=1)

    return df[columns+['avg_precision', 'avg_recall', 'avg_f1', 'avg_accuracy']]

# 5. Main function to tie everything together
def run_pipeline(model_path, input_df, prompt_template, batch_size=16, max_token_output=80, use_sampling=True):
    """
    Main function to run the text generation pipeline and compute metrics.
    
    Parameters:
    ----------
    model_path : str
        The path of the model to be used.
    input_df : pd.DataFrame
        The data to be inferred.
    prompt_template : str
        Template for constructing the prompts.
    batch_size : int
        Number of examples per batch.
    max_token_output : int
        Maximum number of tokens to generate.
    use_sampling : bool
        Whether to use sampling (or greedy decoding).
    
    Returns:
    -------
    result_df : pd.DataFrame
        DataFrame with the processed outputs and calculated metrics.
    """
    # Initialize the model
    generator = initialize_model(model_path, device=0)

    # Generate responses
    response_list = generate_responses(input_df, batch_size, generator, prompt_template, max_token_output, use_sampling)

    # Process the responses to categorize medications
    df_w_classifications = process_output(input_df, response_list)

    # Calculate row-level metrics
    df_w_metrics = calculate_row_metrics(df_w_classifications)

    # Return the final DataFrame with metrics
    return df_w_metrics

# 6. Function to benchmark the model
def benchmark_model(name_dataset, model_path, prompt_template, input_df, data_folder, result_df_path, use_sampling=True, batch_size=16, max_token_output=80):
    # Run the pipeline
    df_w_row_metrics = run_pipeline(model_path=model_path, 
                                    input_df=input_df, 
                                    prompt_template=prompt_template, 
                                    use_sampling=use_sampling,
                                    batch_size=batch_size, 
                                    max_token_output=max_token_output)

    result_df = pd.read_csv(data_folder+'results.csv')
    metrics_mean = df_w_row_metrics[['avg_precision', 'avg_recall', 'avg_f1', 'avg_accuracy']].mean(axis=0)

    # Define your result row
    new_row = {
        'Dataset': name_dataset,
        'Model': model_path.split('/')[-1],
        'Prompt': prompt_template,
        'Accuracy': metrics_mean.get('avg_accuracy', np.nan),
        'Precision': metrics_mean.get('avg_precision', np.nan),
        'Recall': metrics_mean.get('avg_recall', np.nan),
        'F1': metrics_mean.get('avg_f1', np.nan)
    }

    result_df = result_df._append(new_row, ignore_index=True).round(3)
    result_df.to_csv(result_df_path, index=False)


def clear_cuda_memory():
    # Clear the cache
    torch.cuda.empty_cache()
    
    # Run garbage collection
    gc.collect()


In [36]:
name_model_paths ={   
    "Bio_ClinicalBERT": "/PHShome/jn180/llm_public_host/Bio_ClinicalBERT",

    "Llama-3.1-8B": "/netapp3/raw_data3/share/llm_public_host/Llama-3.1-8B",
    "Llama-3.1-8B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Llama-3.1-8B-Instruct",

    "Llama-3.2-1B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Llama-3.2-3B-Instruct",

    "Qwen2-7B-Instruct": "/PHShome/jn180/llm_public_host/Qwen2-7B-Instruct",
    "Qwen2.5-14B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Qwen2.5-14B-Instruct",

    "meditron-7b": "/PHShome/jn180/llm_public_host/meditron-7b",

}

name_dataset = "MIT"
data_folder = "/PHShome/cs1839/capstone_data/"
results_df_path = data_folder + "results.csv"
medication_status_test = pd.read_csv(data_folder + "medication_status_test.csv")

prompt_template = """
Identify and categorize the medications mentioned in the following medical note. Extract all medications the patient has taken before, is currently taking, and any other medications mentioned.
Note: Adjust the number of medications in each category based on the input. Write None if no other medication mentioned. Strictly follow the output format.
Expected Output Format:
"
- Current Medications (Active): [Medication 1], [Medication 2]
- Discontinued Medications: [Medication 3], [Medication 4]
- Other Mentioned Medications (neither active nor discontinued): [Medication 5], [Medication 6]
END"

Input Medical Note:
{}

Output:
"""

for model_name, model_path in name_model_paths.items():
    benchmark_model(name_dataset = name_dataset,
                    model_path = model_path,
                    prompt_template = prompt_template,
                    input_df = medication_status_test,
                    data_folder = data_folder,
                    result_df_path = results_df_path,
                    use_sampling = False,
                    batch_size = 16,
                    max_token_output = 80)


# name_dataset = "MIMIC-IV"
# mimic_iv = pd.read_csv(data_folder + "mimic_iv_snippets.csv")
# # convert the active, discontinued, and neither medications to contained in a list
# mimic_iv['active_medications'] = mimic_iv['active_medications'].apply(lambda x: [med.strip() for med in x.split(',')] if x is not np.nan else [])
# mimic_iv['discontinued_medications'] = mimic_iv['discontinued_medications'].apply(lambda x: [med.strip() for med in x.split(',')] if x is not np.nan else [])
# mimic_iv['neither_medications'] = mimic_iv['neither_medications'].apply(lambda x: [med.strip() for med in x.split(',')] if x is not np.nan else [])
# for model_name, model_path in name_model_paths.items():
#     clear_cuda_memory()
#     benchmark_model(name_dataset = name_dataset,
#                     model_path = model_path,
#                     prompt_template = prompt_template,
#                     input_df = medication_status_test,
#                     data_folder = data_folder,
#                     result_df_path = results_df_path,
#                     use_sampling = False,
#                     batch_size = 16,
#                     max_token_output = 80)

In [40]:
result_df = pd.read_csv(results_df_path).sort_values(by=['Dataset', 'F1'], ascending=[True, False])
result_df.columns = ['Dataset', 'Model', 'Prompt', 'Accuracy_avg', 'Precision_avg', 'Recall_avg', 'F1_avg']
result_df

Unnamed: 0,Dataset,Model,Prompt,Accuracy_avg,Precision_avg,Recall_avg,F1_avg
10,MIMIC-IV,Llama-3.1-8B-Instruct,\nIdentify and categorize the medications ment...,0.649,0.735,0.775,0.675
14,MIMIC-IV,Qwen2.5-14B-Instruct,\nIdentify and categorize the medications ment...,0.611,0.716,0.756,0.636
12,MIMIC-IV,Llama-3.2-3B-Instruct,\nIdentify and categorize the medications ment...,0.589,0.766,0.719,0.625
9,MIMIC-IV,Llama-3.1-8B,\nIdentify and categorize the medications ment...,0.414,0.53,0.581,0.433
13,MIMIC-IV,Qwen2-7B-Instruct,\nIdentify and categorize the medications ment...,0.384,0.453,0.629,0.402
11,MIMIC-IV,Llama-3.2-1B-Instruct,\nIdentify and categorize the medications ment...,0.377,0.762,0.487,0.39
8,MIMIC-IV,Bio_ClinicalBERT,\nIdentify and categorize the medications ment...,0.367,1.0,0.367,0.367
15,MIMIC-IV,meditron-7b,\nIdentify and categorize the medications ment...,0.038,0.061,0.381,0.042
2,MIT,Llama-3.1-8B-Instruct,\nIdentify and categorize the medications ment...,0.679,0.767,0.79,0.704
6,MIT,Qwen2.5-14B-Instruct,\nIdentify and categorize the medications ment...,0.601,0.706,0.751,0.626
