In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import json
from huggingface_hub import login

from tqdm import tqdm
import torch
from transformers import AutoTokenizer, pipeline

import pandas as pd
import numpy as np
import re

from baseline import *

# Read the JSON config file
with open('config.json', 'r') as f:
    config = json.load(f)

# Get the token from the JSON file
hg_token = config['HuggingFace']['token']
# Login using the token
login(token=hg_token)

# LLM folder
llm_folder = "/PHShome/jn180/llm_public_host"
# Data folder
data_folder = "/PHShome/cs1839/capstone_data/"
# results table path
results_df_path = data_folder + "results.csv"

# data to inference 
medication_status_test = pd.read_csv(data_folder + "medication_status_test.csv")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /PHShome/cs1839/.cache/huggingface/token
Login successful


# Pipeline

In [29]:
name_model_paths ={   
    # "Bio_ClinicalBERT": "/PHShome/jn180/llm_public_host/Bio_ClinicalBERT",

    # "Llama-3.1-8B": "/netapp3/raw_data3/share/llm_public_host/Llama-3.1-8B",
    "Llama-3.1-8B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Llama-3.1-8B-Instruct",

    "Llama-3.2-1B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Llama-3.2-3B-Instruct",

    "Qwen2-7B-Instruct": "/PHShome/jn180/llm_public_host/Qwen2-7B-Instruct",
    "Qwen2.5-14B-Instruct": "/netapp3/raw_data3/share/llm_public_host/Qwen2.5-14B-Instruct",

    "meditron-7b": "/PHShome/jn180/llm_public_host/meditron-7b",

    # "Mistral-7B-Instruct-v0.3": "/netapp3/raw_data3/share/llm_public_host/Mistral-7B-Instruct-v0.3"

}

import os
# Set the environment variable to specify the GPUs

os.environ["CUDA_VISIBLE_DEVICES"] = "2"


name_dataset = "MIT"
data_folder = "/PHShome/cs1839/capstone_data/"
results_df_path = data_folder + "results.csv"
medication_status_test = pd.read_csv(data_folder + "medication_status_test.csv")

# prompt_template = """
# Identify and categorize the medications mentioned in the following medical note. Extract all medications the patient has taken before, is currently taking, and any other medications mentioned.
# Note: Adjust the number of medications in each category based on the input. Write None if no other medication mentioned. Strictly follow the output format.
# Expected Output Format:
# "
# - Current Medications (Active): Medication_1, Medication_2
# - Discontinued Medications: Medication_3, Medication_4
# - Other Mentioned Medications (neither active nor discontinued): Medication_5, Medication_6
# END"

# Input Medical Note:
# {}

# Output:
# """

prompt_template = """
Input Medical Note:
{}

Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.

Expected Output Format:
"
- Current Medications (Active): Medication_1, Medication_2
- Discontinued Medications: Medication_3, Medication_4
- Other Mentioned Medications (neither active nor discontinued): Medication_5, Medication_6
END"

Output:
"""

for model_name, model_path in name_model_paths.items():
    df = run_pipeline(model_path=model_path,
                        input_df=medication_status_test[medication_status_test['index']==96],
                        prompt_template=prompt_template,
                        batch_size=16,
                        max_token_output=80,
                        use_sampling=False)
df

NameError: name 'run_pipeline' is not defined

# Metrics


## Task 1: Medication Extraction

- **Precision**: Measures the proportion of correctly predicted medications out of all predicted medications.

$$
  \text{Precision} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Positives (FP)}}
  $$

- **Recall**: Measures the proportion of correctly predicted medications out of all actual medications.

 $$
  \text{Recall} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Negatives (FN)}}
  $$

## Task 2: Status Classification

- **Conditional Accuracy**: Measures the proportion of correct status predictions out of all correctly extracted medications from Task 1.
  $$
  \text{Conditional Accuracy} = \frac{\text{Correct Predictions for the Classes}}{\text{Total Correctly Extracted Medications from Task 1}}
  $$

- **Conditional Macro F1**: Combines precision and recall for each status class, calculates the F1-score for each, then averages them across classes.
  $$
  \text{F1\text{-}score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
  $$

---

# Example
## Task 1

| Active Medication | Discontinued Medication | Active Medication (Predicted) | Discontinued Medication (Predicted) |
|-------------------|-------------------------|-------------------------------|-------------------------------------|
| A                 | B                       | A                             | C                                   |


True Set: A, B

Pred Set: A, C



Precision = 1/2

Recall = 1/2


## Task 2
conditional metrics will only consider: A (C is not correctly extracted, removed)
| Active Medication | Discontinued Medication | Active Medication (Predicted) | Discontinued Medication (Predicted) |
|-------------------|-------------------------|-------------------------------|-------------------------------------|
| A                 | B                       | A                             |                                     |

conditional_accuracy = 1/2 

conditional_precision:
- Active: 1
- **Discountinued: 1**

conditional_recall:
- Active: 1
- Discountinued: 0



| Active Medication | Discontinued Medication | Active Medication (Predicted) | Discontinued Medication (Predicted) |
|-------------------|-------------------------|-------------------------------|-------------------------------------|
| A, C              |                         | A                             |C                                    |
| A                 |B, C                     | A                             |C                                    |
| A, B              |                         |                               |                                     |

conditional_acc =（A+A+C）/ (A+C+A+C) = 3/4 

conditional_precision_active = (A+A)/(A+A) = 1

conditional_precision_discountinued = C / (C+C) = 1/2

conditional_recall_active = (A+A)/ (A+A+A+B+c) = 2/5

conditional_recall_discountinued = (C)/ (B+C) = 1/2


In [111]:
import pandas as pd
from baseline import calculate_metrics_by_dataset

# Example usage
data = {
    'active_medications': [['A', 'C'], ['A'], ['A', 'B']],
    'discontinued_medications': [['E'], ['B', 'C'], []],
    'neither_medications': [['D'], [], []],
    'active_medications_pred': [['A','E'], ['A'], ['A','B']],
    'discontinued_medications_pred': [['C','D'], ['C'], []],
    'neither_medications_pred': [[], [], ['E']]
}

# Create the DataFrame
mimic_iv = pd.DataFrame(data)

# Run the function on the dataset
extraction_precision, extraction_recall, extraction_f1, conditional_accuracy, conditional_macro_f1, conditional_macro_precision, conditional_macro_recall = calculate_metrics_by_dataset(mimic_iv, 'MIMIC')

# Print the results
print(f"Extraction Precision: {extraction_precision:.3f}")
print(f"Extraction Recall: {extraction_recall:.3f}")
print(f"Extraction F1: {extraction_f1:.3f}")
print(f"Conditional Accuracy: {conditional_accuracy:.3f}")
print(f"Conditional Macro Precision: {conditional_macro_precision:.3f}")
print(f"Conditional Macro Recall: {conditional_macro_recall:.3f}")
print(f"Conditional Macro F1: {conditional_macro_f1:.3f}")

mimic_iv[['active_medications', 'discontinued_medications', 'neither_medications', 'active_medications_pred', 'discontinued_medications_pred', 'neither_medications_pred']]

Extraction Precision: 0.889
Extraction Recall: 0.889
Extraction F1: 0.889
Conditional Accuracy: 0.556
Conditional Macro Precision: 0.378
Conditional Macro Recall: 0.378
Conditional Macro F1: 0.378


Unnamed: 0,active_medications,discontinued_medications,neither_medications,active_medications_pred,discontinued_medications_pred,neither_medications_pred
0,"[A, C]",[E],[D],"[A, E]","[C, D]",[]
1,[A],"[B, C]",[],[A],[C],[]
2,"[A, B]",[],[],"[A, B]",[],[E]


# Result

In [18]:
import pandas as pd
import json

# Data folder
data_folder = "/PHShome/cs1839/capstone_data/"
# Results table path
results_df_path = data_folder + "results.csv"

# Load the results data
result_df = pd.read_csv(results_df_path).round(3)

# read the prompts.json file
with open('prompts.json') as f:
    prompts_json = json.load(f)

# Get the list of one-shot prompt values for only 1_shot included in the keys
one_shot_prompts = [prompts_json[prompt_key] for prompt_key in prompts_json.keys() if '1_shot' in prompt_key]


# Add one_shot column based on existence in prompts.json values
result_df.loc[:,'one_shot'] = result_df.loc[:,'Prompt'].apply(lambda prompt: 1 if prompt in one_shot_prompts else 0)


# Add one_shot column
result_df.loc[:,'cot'] = result_df.apply(
    lambda row: 1 if "Let's think step by step" in row['Prompt'] else 0, axis=1
)

# Filter one-shot results and calculate averages for one-shot prompts
one_shot_df = result_df[result_df['one_shot'] == 1].copy()
one_shot_df.loc[:, 'Method'] = ['1-Shot' if row['one_shot'] == 1 and row['cot'] == 0 else '1-Shot & COT' for _, row in one_shot_df.iterrows()]
one_shot_avg = one_shot_df.groupby(['Dataset', 'Model', 'Method']).agg({
    'extraction_precision': 'mean',
    'extraction_recall': 'mean',
    'extraction_f1': 'mean',
    'conditional_accuracy': 'mean',
    'conditional_macro_f1': 'mean',
    'conditional_macro_precision': 'mean',
    'conditional_macro_recall': 'mean',
    'accuracy': 'mean',
    'macro_f1': 'mean',
    'macro_precision': 'mean',
    'macro_recall': 'mean'
}).reset_index()

baseline_df = result_df[result_df['one_shot'] == 0].copy()
baseline_df.loc[:,'Method'] = ['CoT' if 'Let\'s think step by step' in prompt else '0-Shot' for prompt in baseline_df['Prompt']]

# Append new GPT results
gpt_results = pd.DataFrame([
    {
        # 'Prompt': 'Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.',
        'Dataset': 'MIT',
        'Model_Method': 'GPT-3 + R(32 LOC)(0-Shot)',
        'extraction_precision': 0.87,
        'extraction_recall': 0.83,
        'extraction_f1': round(2 * 0.87 * 0.83 / (0.87 + 0.83), 3),
        'conditional_accuracy': 0.85,
        'conditional_macro_f1': 0.69,
        'conditional_macro_precision': '--',
        'conditional_macro_recall': '--',
        'accuracy': '--',
        'macro_f1': '--',
        'macro_precision': '--',
        'macro_recall': '--',
    },
    {
        # 'Prompt': 'Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.',
        'Dataset': 'MIT',
        'Model_Method': 'GPT-3 + R(8 LOC)(1-Shot)',
        'extraction_precision': 0.90,
        'extraction_recall': 0.92,
        'extraction_f1': round(2 * 0.90 * 0.92 / (0.90 + 0.92), 3),
        'conditional_accuracy': 0.89,
        'conditional_macro_f1': 0.62,
        'conditional_macro_precision': '--',
        'conditional_macro_recall': '--',
        'accuracy': '--',
        'macro_f1': '--',
        'macro_precision': '--',
        'macro_recall': '--',
    }
])

# Concatenate the results
combined_df = pd.concat([baseline_df, one_shot_avg], ignore_index=True)

# Select columns to show the final results by Method
final_results = combined_df[['Dataset', 'Model', 'Method', 
                             'extraction_f1', 'conditional_accuracy', 'conditional_macro_f1',  
                             'accuracy', 'macro_f1', 
                             'extraction_precision', 'extraction_recall',
                             'conditional_macro_precision', 'conditional_macro_recall', 'macro_precision', 'macro_recall']].copy()

final_results.loc[:,'Model_Method'] = final_results['Model'] + ' (' + final_results['Method'] + ')'

final_results.drop(columns=['Model', 'Method'], inplace=True)
final_results = pd.concat([final_results, gpt_results], ignore_index=True)
final_results.columns = ['Dataset', 
                         'extraction_f1', 'conditional_accuracy', 'conditional_macro_f1', 'accuracy_w_gt', 'macro_f1_w_gt', 
                         'extraction_precision', 'extraction_recall',
                         'conditional_macro_precision', 'conditional_macro_recall', 'macro_precision_w_gt', 'macro_recall_w_gt', 'Model_Method']
pd.set_option('display.max_rows', None)


# Sort the final results by Method, Dataset, and extraction_f1
col_to_show = ['Dataset', 'Model_Method', 'extraction_f1', 'conditional_accuracy', 'conditional_macro_f1', 'accuracy_w_gt', 'macro_f1_w_gt']
final_results[col_to_show].sort_values(
    by=['Dataset', 'extraction_f1', 'conditional_accuracy', 'conditional_macro_f1', 'accuracy_w_gt', 'macro_f1_w_gt'],
    ascending=[False, False, False, False, False, False]
).set_index(['Dataset', 'Model_Method']).round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,extraction_f1,conditional_accuracy,conditional_macro_f1,accuracy_w_gt,macro_f1_w_gt
Dataset,Model_Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MIT,Llama-3.1-70B-Instruct (1-Shot),0.954,0.871,0.838,0.9222,0.8764
MIT,Llama-3.1-70B-Instruct (1-Shot & COT),0.948,0.848,0.771,0.8936,0.8346
MIT,Qwen2.5-32B-Instruct (0-Shot),0.941,0.835,0.729,0.863,0.778
MIT,Llama-3.1-70B-Instruct (0-Shot),0.937,0.845,0.832,0.926,0.862
MIT,Qwen2.5-32B-Instruct (CoT),0.936,0.887,0.832,0.89,0.811
MIT,Qwen2-72B-Instruct (1-Shot),0.936,0.855,0.837,0.9358,0.892
MIT,Qwen2.5-32B-Instruct (1-Shot & COT),0.935,0.88,0.773,0.8966,0.8324
MIT,Qwen2.5-14B-Instruct (1-Shot & COT),0.934,0.839,0.743,0.8588,0.8044
MIT,Llama-3.1-70B-Instruct (CoT),0.933,0.85,0.807,0.887,0.793
MIT,Qwen2.5-14B-Instruct (CoT),0.933,0.821,0.711,0.887,0.868


In [19]:
final_results.to_csv(data_folder + "final_results.csv", index=False)