In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import json
from huggingface_hub import login

# Read the JSON config file
with open('config.json', 'r') as f:
    config = json.load(f)

# Get the token from the JSON file
hg_token = config['HuggingFace']['token']
# Login using the token
login(token=hg_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /PHShome/cs1839/.cache/huggingface/token
Login successful


# Metrics


## Task 1: Medication Extraction

- **Precision**: Measures the proportion of correctly predicted medications out of all predicted medications.

$$
  \text{Precision} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Positives (FP)}}
  $$

- **Recall**: Measures the proportion of correctly predicted medications out of all actual medications.

 $$
  \text{Recall} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Negatives (FN)}}
  $$

## Task 2: Status Classification

- **Conditional Accuracy**: Measures the proportion of correct status predictions out of all correctly extracted medications from Task 1.
  $$
  \text{Conditional Accuracy} = \frac{\text{Correct Predictions for the Classes}}{\text{Total Correctly Extracted Medications from Task 1}}
  $$

- **Conditional Macro F1**: Combines precision and recall for each status class, calculates the F1-score for each, then averages them across classes.
  $$
  \text{F1\text{-}score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
  $$

---

# Example
## Task 1

| Active Medication | Discontinued Medication | Active Medication (Predicted) | Discontinued Medication (Predicted) |
|-------------------|-------------------------|-------------------------------|-------------------------------------|
| A                 | B                       | A                             | C                                   |


True Set: A, B

Pred Set: A, C



Precision = 1/2

Recall = 1/2


## Task 2
conditional metrics will only consider: A (C is not correctly extracted, removed)
| Active Medication | Discontinued Medication | Active Medication (Predicted) | Discontinued Medication (Predicted) |
|-------------------|-------------------------|-------------------------------|-------------------------------------|
| A                 | B                       | A                             |                                     |

joint_accuracy = 1/2 

joint_precision:
- Active: 1
- **Discountinued: 1**

conditional_recall:
- Active: 1
- Discountinued: 0



| Active Medication | Discontinued Medication | Active Medication (Predicted) | Discontinued Medication (Predicted) |
|-------------------|-------------------------|-------------------------------|-------------------------------------|
| A, C              |                         | A                             |C                                    |
| A                 |B, C                     | A                             |C                                    |
| A, B              |                         |                               |                                     |

conditional_acc =（A+A+C）/ (A+C+A+C) = 3/4 

conditional_precision_active = (A+A)/(A+A) = 1

conditional_precision_discountinued = C / (C+C) = 1/2

conditional_recall_active = (A+A)/ (A+A+A+B+c) = 2/5

conditional_recall_discountinued = (C)/ (B+C) = 1/2


In [111]:
import pandas as pd
from baseline import calculate_metrics_by_dataset

# Example usage
data = {
    'active_medications': [['A', 'C'], ['A'], ['A', 'B']],
    'discontinued_medications': [['E'], ['B', 'C'], []],
    'neither_medications': [['D'], [], []],
    'active_medications_pred': [['A','E'], ['A'], ['A','B']],
    'discontinued_medications_pred': [['C','D'], ['C'], []],
    'neither_medications_pred': [[], [], ['E']]
}

# Create the DataFrame
mimic_iv = pd.DataFrame(data)

# Run the function on the dataset
extraction_precision, extraction_recall, extraction_f1, conditional_accuracy, conditional_macro_f1, conditional_macro_precision, conditional_macro_recall = calculate_metrics_by_dataset(mimic_iv, 'MIMIC')

# Print the results
print(f"Extraction Precision: {extraction_precision:.3f}")
print(f"Extraction Recall: {extraction_recall:.3f}")
print(f"Extraction F1: {extraction_f1:.3f}")
print(f"Conditional Accuracy: {conditional_accuracy:.3f}")
print(f"Conditional Macro Precision: {conditional_macro_precision:.3f}")
print(f"Conditional Macro Recall: {conditional_macro_recall:.3f}")
print(f"Conditional Macro F1: {conditional_macro_f1:.3f}")

mimic_iv[['active_medications', 'discontinued_medications', 'neither_medications', 'active_medications_pred', 'discontinued_medications_pred', 'neither_medications_pred']]

Extraction Precision: 0.889
Extraction Recall: 0.889
Extraction F1: 0.889
Conditional Accuracy: 0.556
Conditional Macro Precision: 0.378
Conditional Macro Recall: 0.378
Conditional Macro F1: 0.378


Unnamed: 0,active_medications,discontinued_medications,neither_medications,active_medications_pred,discontinued_medications_pred,neither_medications_pred
0,"[A, C]",[E],[D],"[A, E]","[C, D]",[]
1,[A],"[B, C]",[],[A],[C],[]
2,"[A, B]",[],[],"[A, B]",[],[E]


# Result

In [26]:
import pandas as pd
import json
import re
import numpy as np

# Data folder
data_folder = "/PHShome/cs1839/capstone_data/"
# Results table path
results_df_path = data_folder + "results.csv"

# Load the results data
result_df = pd.read_csv(results_df_path).round(3)
result_df = result_df[result_df['Dataset'] != 'MIMIC-IV']
# load gpt-4o result
gpt_results = pd.read_csv(data_folder + "results_gpt4o.csv").round(3)
# load mimic-iv result
mimic_iv_results = pd.read_csv(data_folder + "results_mimic_iv_new.csv").round(3)
# load_internal_data_results
internal_data_results = pd.read_csv(data_folder + "internal_data_results_updated.csv").round(3)
# concat the gpt-4o results
result_df = pd.concat([result_df, gpt_results, mimic_iv_results, internal_data_results], ignore_index=True)

# Read the prompts.json file
with open('prompts.json') as f:
    prompts_json = json.load(f)

# Create a mapping from prompt to its key name
prompt_to_key_map = {value: key for key, value in prompts_json.items()}

# Define a function to extract the method from the prompt key
def get_method_from_prompt(prompt):
    key = prompt_to_key_map.get(prompt, None)
    if key:
        if len(key.split('_')) > 1:
            return '_'.join(key.split('_')[1:])  # Get the method from the second position
        else:
            return '0-Shot'
    return 'Unknown'  # Assign 'Unknown' if the key is not found

# Assign the 'Method' column based on the prompt key
result_df['Method'] = result_df['Prompt'].apply(get_method_from_prompt)

# replace 1_shot_1, 1_shot_2, 1_shot_3, 1_shot_4, 1_shot_5 with `1-Shot`, there is existence of 1_shot_CoT, which is not a one-shot prompt
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'1_shot_([1-5])(?!_CoT)', '1-Shot', x))
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'1_shot_([1-5])_CoT', '1-Shot & CoT', x))
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'5_shots(?!_CoT)', '5-Shot', x))
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'5_shots_CoT', '5-Shot & CoT', x))

# Filter one-shot results and calculate averages for one-shot prompts
result_df = result_df.groupby(['Dataset', 'Model', 'Method']).agg({
    # calcualte the mean and standard deviation for the following columns
    'extraction_precision': ['mean','std'],
    'extraction_recall': ['mean','std'],
    'extraction_f1': ['mean','std'],
    
    'accuracy_w_gt':['mean','std'],
    'macro_f1_w_gt': ['mean','std'],
    'macro_precision_w_gt': ['mean','std'],
    'macro_recall_w_gt': ['mean','std'],

    'joint_accuracy': ['mean','std'],
    'joint_macro_f1':['mean','std'],
    'joint_macro_precision': ['mean','std'],
    'joint_macro_recall':['mean','std'],
}).reset_index().round(3)

# Append new GPT results
gpt_results = pd.DataFrame([
    {
        # 'Prompt': 'Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.',
        'Dataset': 'MIT',
        'Model': 'GPT-3 + R (32 LOC)',
        'Method': '0-Shot',
        'joint_macro_f1': '--',
        'joint_accuracy': '--',
        # 'extraction_precision': 0.87,
        # 'extraction_recall': 0.83,
        'extraction_f1': round(2 * 0.87 * 0.83 / (0.87 + 0.83), 3),
        # 'conditional_accuracy': 0.85,
        # 'conditional_macro_f1': 0.69,
        # 'conditional_macro_precision': '--',
        # 'conditional_macro_recall': '--',
        'macro_f1_w_gt': '--',
        'accuracy_w_gt': '--',
        'extraction_f1_mean': np.nan,
        'accuracy_w_gt_mean': np.nan,
        'macro_f1_w_gt_mean': np.nan,
        'joint_accuracy_mean': np.nan,
        'joint_macro_f1_mean': np.nan,
    },
    {
        # 'Prompt': 'Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.',
        'Dataset': 'MIT',
        'Model': 'GPT-3 + R (8 LOC)',
        'Method': '1-Shot',
        'joint_macro_f1': '--',
        'joint_accuracy': '--',
        # 'extraction_precision': 0.90,
        # 'extraction_recall': 0.92,
        'extraction_f1': round(2 * 0.90 * 0.92 / (0.90 + 0.92), 3),
        # 'conditional_accuracy': 0.89,
        # 'conditional_macro_f1': 0.62,
        # 'conditional_macro_precision': '--',
        # 'conditional_macro_recall': '--',
        'macro_f1_w_gt': '--',
        'accuracy_w_gt': '--',
        'extraction_f1_mean': np.nan,
        'accuracy_w_gt_mean': np.nan,
        'macro_f1_w_gt_mean': np.nan,
        'joint_accuracy_mean': np.nan,
        'joint_macro_f1_mean': np.nan,
    }
])


pd.set_option('display.max_rows', None)
# reset the multi-level index with appending the column names, ignore Dataset, Model, Method
result_df.columns = ['_'.join(col).strip() if col[0] not in ['Dataset', 'Model', 'Method'] else col[0] for col in result_df.columns.values]
# combine the mean and std by having new columns of mean ± std
for col in ['extraction_f1', 'accuracy_w_gt', 'macro_f1_w_gt', 'joint_accuracy', 'joint_macro_f1']:
    result_df[col] = result_df[col + '_mean'].round(3).astype(str) + ' ± ' + result_df[col + '_std'].astype(str)
# drop all columns with  _std
result_df = result_df.drop(columns=[col for col in result_df.columns if '_std' in col])

# Sort the final results by Method, Dataset, and extraction_f1
col_to_include = ['Dataset', 'Model', 'Method', 'joint_macro_f1', 'joint_accuracy', 'extraction_f1', 'macro_f1_w_gt', 'accuracy_w_gt', 'extraction_f1_mean', 'accuracy_w_gt_mean', 'macro_f1_w_gt_mean', 'joint_accuracy_mean', 'joint_macro_f1_mean']
col_to_show = ['Dataset', 'Model', 'Method', 'joint_macro_f1', 'joint_accuracy', 'extraction_f1', 'macro_f1_w_gt', 'accuracy_w_gt']

merged_df = pd.concat([result_df[col_to_include],gpt_results])
# filter out model name starting with Llama-3.2, Qwen2-
merged_df = merged_df[~merged_df['Model'].str.contains(r'Llama-3.2|Qwen2-')]
merged_df.sort_values(
    # sort by the means
    by=['Dataset', 'joint_macro_f1_mean','extraction_f1_mean', 'macro_f1_w_gt_mean', 'joint_accuracy_mean', 'accuracy_w_gt_mean', ],
    ascending=[False, False, False, False, False, False]
).set_index(['Dataset', 'Model', 'Method']).round(3)
# .to_csv(data_folder + "result_w_gpt.csv", index=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,joint_macro_f1,joint_accuracy,extraction_f1,macro_f1_w_gt,accuracy_w_gt,extraction_f1_mean,accuracy_w_gt_mean,macro_f1_w_gt_mean,joint_accuracy_mean,joint_macro_f1_mean
Dataset,Model,Method,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
MIT,Llama-3.1-70B-Instruct,1-Shot,0.806 ± 0.024,0.879 ± 0.012,0.956 ± 0.005,0.86 ± 0.023,0.917 ± 0.012,0.956,0.917,0.86,0.879,0.806
MIT,gpt-4o,5-Shot,0.804 ± 0.013,0.904 ± 0.005,0.961 ± 0.003,0.866 ± 0.006,0.919 ± 0.005,0.961,0.919,0.866,0.904,0.804
MIT,Llama-3.1-70B-Instruct,5-Shot,0.794 ± 0.003,0.897 ± 0.004,0.957 ± 0.003,0.854 ± 0.01,0.919 ± 0.004,0.957,0.919,0.854,0.897,0.794
MIT,gpt-4o,1-Shot,0.781 ± 0.031,0.888 ± 0.016,0.958 ± 0.006,0.852 ± 0.015,0.905 ± 0.011,0.958,0.905,0.852,0.888,0.781
MIT,Qwen2.5-72B-Instruct,5-Shot,0.778 ± 0.001,0.873 ± 0.001,0.953 ± 0.001,0.847 ± 0.001,0.907 ± 0.001,0.953,0.907,0.847,0.873,0.778
MIT,Llama-3.1-70B-Instruct,CoT,0.776 ± 0.024,0.848 ± 0.01,0.94 ± 0.003,0.79 ± 0.014,0.883 ± 0.008,0.94,0.883,0.79,0.848,0.776
MIT,Qwen2.5-32B-Instruct,CoT,0.769 ± 0.009,0.867 ± 0.01,0.945 ± 0.009,0.836 ± 0.01,0.911 ± 0.004,0.945,0.911,0.836,0.867,0.769
MIT,Llama-3.1-70B-Instruct,0-Shot,0.762 ± 0.006,0.847 ± 0.003,0.935 ± 0.002,0.811 ± 0.017,0.908 ± 0.011,0.935,0.908,0.811,0.847,0.762
MIT,gpt-4o,1-Shot & CoT,0.758 ± 0.016,0.905 ± 0.013,0.942 ± 0.01,0.865 ± 0.012,0.904 ± 0.013,0.942,0.904,0.865,0.905,0.758
MIT,Llama-3.1-70B-Instruct,1-Shot & CoT,0.757 ± 0.02,0.851 ± 0.016,0.949 ± 0.005,0.835 ± 0.021,0.898 ± 0.012,0.949,0.898,0.835,0.851,0.757


In [31]:
import pandas as pd
import json
import re
import numpy as np

# Data folder
data_folder = "/PHShome/cs1839/capstone_data/"
# Results table path
results_df_path = data_folder + "results.csv"

# Load the results data
result_df = pd.read_csv(results_df_path).round(3)
result_df = result_df[result_df['Dataset'] != 'MIMIC-IV']
# load gpt-4o result
gpt_results = pd.read_csv(data_folder + "results_gpt4o.csv").round(3)
# load mimic-iv result
mimic_iv_results = pd.read_csv(data_folder + "results_mimic_iv_new.csv").round(3)
# load_internal_data_results
internal_data_results = pd.read_csv(data_folder + "internal_data_results_updated.csv").round(3)
# concat the gpt-4o results
result_df = pd.concat([result_df, gpt_results, mimic_iv_results, internal_data_results], ignore_index=True)

# Read the prompts.json file
with open('prompts.json') as f:
    prompts_json = json.load(f)

# Create a mapping from prompt to its key name
prompt_to_key_map = {value: key for key, value in prompts_json.items()}

# Define a function to extract the method from the prompt key
def get_method_from_prompt(prompt):
    key = prompt_to_key_map.get(prompt, None)
    if key:
        if len(key.split('_')) > 1:
            return '_'.join(key.split('_')[1:])  # Get the method from the second position
        else:
            return '0-Shot'
    return 'Unknown'  # Assign 'Unknown' if the key is not found

# Assign the 'Method' column based on the prompt key
result_df['Method'] = result_df['Prompt'].apply(get_method_from_prompt)

# replace 1_shot_1, 1_shot_2, 1_shot_3, 1_shot_4, 1_shot_5 with `1-Shot`, there is existence of 1_shot_CoT, which is not a one-shot prompt
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'1_shot_([1-5])(?!_CoT)', '1-Shot', x))
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'1_shot_([1-5])_CoT', '1-Shot & CoT', x))
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'5_shots(?!_CoT)', '5-Shot', x))
result_df['Method'] = result_df['Method'].apply(lambda x: re.sub(r'5_shots_CoT', '5-Shot & CoT', x))

# Filter one-shot results and calculate averages for one-shot prompts
result_df = result_df.groupby(['Dataset', 'Model', 'Method']).agg({
    # calcualte the mean and standard deviation for the following columns
    'extraction_precision': ['mean','std'],
    'extraction_recall': ['mean','std'],
    'extraction_f1': ['mean','std'],
    
    'accuracy_w_gt':['mean','std'],
    'macro_f1_w_gt': ['mean','std'],
    'macro_precision_w_gt': ['mean','std'],
    'macro_recall_w_gt': ['mean','std'],

    'joint_accuracy': ['mean','std'],
    'joint_macro_f1':['mean','std'],
    'joint_macro_precision': ['mean','std'],
    'joint_macro_recall':['mean','std'],
}).reset_index().round(3)

# Append new GPT results
gpt_results = pd.DataFrame([
    {
        # 'Prompt': 'Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.',
        'Dataset': 'MIT',
        'Model': 'GPT-3 + R (32 LOC)',
        'Method': '0-Shot',
        'joint_macro_f1': '--',
        'joint_accuracy': '--',
        # 'extraction_precision': 0.87,
        # 'extraction_recall': 0.83,
        'extraction_f1': round(2 * 0.87 * 0.83 / (0.87 + 0.83), 3),
        # 'conditional_accuracy': 0.85,
        # 'conditional_macro_f1': 0.69,
        # 'conditional_macro_precision': '--',
        # 'conditional_macro_recall': '--',
        'macro_f1_w_gt': '--',
        'accuracy_w_gt': '--',
        'extraction_f1_mean': np.nan,
        'accuracy_w_gt_mean': np.nan,
        'macro_f1_w_gt_mean': np.nan,
        'joint_accuracy_mean': np.nan,
        'joint_macro_f1_mean': np.nan,
    },
    {
        # 'Prompt': 'Create a bulleted list of which medications are mentioned and whether they are active, discontinued, or neither.',
        'Dataset': 'MIT',
        'Model': 'GPT-3 + R (8 LOC)',
        'Method': '1-Shot',
        'joint_macro_f1': '--',
        'joint_accuracy': '--',
        # 'extraction_precision': 0.90,
        # 'extraction_recall': 0.92,
        'extraction_f1': round(2 * 0.90 * 0.92 / (0.90 + 0.92), 3),
        # 'conditional_accuracy': 0.89,
        # 'conditional_macro_f1': 0.62,
        # 'conditional_macro_precision': '--',
        # 'conditional_macro_recall': '--',
        'macro_f1_w_gt': '--',
        'accuracy_w_gt': '--',
        'extraction_f1_mean': np.nan,
        'accuracy_w_gt_mean': np.nan,
        'macro_f1_w_gt_mean': np.nan,
        'joint_accuracy_mean': np.nan,
        'joint_macro_f1_mean': np.nan,
    }
])


pd.set_option('display.max_rows', None)
# reset the multi-level index with appending the column names, ignore Dataset, Model, Method
result_df.columns = ['_'.join(col).strip() if col[0] not in ['Dataset', 'Model', 'Method'] else col[0] for col in result_df.columns.values]
# combine the mean and std by having new columns of mean ± std
for col in ['extraction_f1', 'accuracy_w_gt', 'macro_f1_w_gt', 'joint_accuracy', 'joint_macro_f1']:
    result_df[col] = result_df[col + '_mean'].round(3).astype(str) + ' ± ' + result_df[col + '_std'].astype(str)
# drop all columns with  _std
result_df = result_df.drop(columns=[col for col in result_df.columns if '_std' in col])

# Sort the final results by Method, Dataset, and extraction_f1
col_to_include = ['Dataset', 'Model', 'Method', 'joint_macro_f1', 'joint_accuracy', 'extraction_f1', 'macro_f1_w_gt', 'accuracy_w_gt', 'extraction_f1_mean', 'accuracy_w_gt_mean', 'macro_f1_w_gt_mean', 'joint_accuracy_mean', 'joint_macro_f1_mean']
col_to_show = ['Dataset', 'Model', 'Method', 'joint_macro_f1', 'joint_accuracy', 'extraction_f1', 'macro_f1_w_gt', 'accuracy_w_gt']

merged_df = pd.concat([result_df[col_to_include],gpt_results])
# filter out model name starting with Llama-3.2, Qwen2-
merged_df = merged_df[~merged_df['Model'].str.contains(r'Llama-3.2|Qwen2-')]
merged_df.sort_values(
    # sort by the means
    by=['Dataset', 'joint_macro_f1_mean','extraction_f1_mean', 'macro_f1_w_gt_mean', 'joint_accuracy_mean', 'accuracy_w_gt_mean', ],
    ascending=[False, False, False, False, False, False]
).set_index(['Dataset', 'Model', 'Method']).round(3)
# .to_csv(data_folder + "result_w_gpt.csv", index=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,joint_macro_f1,joint_accuracy,extraction_f1,macro_f1_w_gt,accuracy_w_gt,extraction_f1_mean,accuracy_w_gt_mean,macro_f1_w_gt_mean,joint_accuracy_mean,joint_macro_f1_mean
Dataset,Model,Method,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
MIT,Llama-3.1-70B-Instruct,1-Shot,0.806 ± 0.024,0.879 ± 0.012,0.956 ± 0.005,0.86 ± 0.023,0.917 ± 0.012,0.956,0.917,0.86,0.879,0.806
MIT,gpt-4o,5-Shot,0.804 ± 0.013,0.904 ± 0.005,0.961 ± 0.003,0.866 ± 0.006,0.919 ± 0.005,0.961,0.919,0.866,0.904,0.804
MIT,Llama-3.1-70B-Instruct,5-Shot,0.794 ± 0.003,0.897 ± 0.004,0.957 ± 0.003,0.854 ± 0.01,0.919 ± 0.004,0.957,0.919,0.854,0.897,0.794
MIT,gpt-4o,1-Shot,0.781 ± 0.031,0.888 ± 0.016,0.958 ± 0.006,0.852 ± 0.015,0.905 ± 0.011,0.958,0.905,0.852,0.888,0.781
MIT,Qwen2.5-72B-Instruct,5-Shot,0.778 ± 0.001,0.873 ± 0.001,0.953 ± 0.001,0.847 ± 0.001,0.907 ± 0.001,0.953,0.907,0.847,0.873,0.778
MIT,Llama-3.1-70B-Instruct,CoT,0.776 ± 0.024,0.848 ± 0.01,0.94 ± 0.003,0.79 ± 0.014,0.883 ± 0.008,0.94,0.883,0.79,0.848,0.776
MIT,Qwen2.5-32B-Instruct,CoT,0.769 ± 0.009,0.867 ± 0.01,0.945 ± 0.009,0.836 ± 0.01,0.911 ± 0.004,0.945,0.911,0.836,0.867,0.769
MIT,Llama-3.1-70B-Instruct,0-Shot,0.762 ± 0.006,0.847 ± 0.003,0.935 ± 0.002,0.811 ± 0.017,0.908 ± 0.011,0.935,0.908,0.811,0.847,0.762
MIT,gpt-4o,1-Shot & CoT,0.758 ± 0.016,0.905 ± 0.013,0.942 ± 0.01,0.865 ± 0.012,0.904 ± 0.013,0.942,0.904,0.865,0.905,0.758
MIT,Llama-3.1-70B-Instruct,1-Shot & CoT,0.757 ± 0.02,0.851 ± 0.016,0.949 ± 0.005,0.835 ± 0.021,0.898 ± 0.012,0.949,0.898,0.835,0.851,0.757


# Confusion matrix

In [27]:
import pandas as pd
import numpy as np

# confusion matrix for extraction task
# focus on column true_set and pred_set
def calculate_extraction_confusion_matrix(df):
    """
    Calculate and aggregate TP, FP, FN, and TN counts across the dataset.

    Parameters:
        df (pd.DataFrame): The DataFrame containing 'true_set' and 'pred_set' as string representations of sets.

    Returns:
        dict: Aggregated counts for TP, FP, FN, and TN.
    """
    from collections import Counter
    # Initialize counts
    aggregated_counts = Counter({'TP': 0, 'FP': 0, 'FN': 0, 'TN': '-'})

    # Iterate through each row to calculate TP, FP, and FN
    for _, row in df.iterrows():
        true_labels = row['true_set']
        pred_labels = row['pred_set']

        # True positives (correct predictions)
        tp = len(true_labels & pred_labels)

        # False positives (incorrect predictions not in ground truth)
        fp = len(pred_labels - true_labels)

        # False negatives (missed predictions in ground truth)
        fn = len(true_labels - pred_labels)

        # Update aggregated counts
        aggregated_counts['TP'] += tp
        aggregated_counts['FP'] += fp
        aggregated_counts['FN'] += fn

    # Create a DataFrame for the confusion matrix
    confusion_matrix = pd.DataFrame({
        "Predicted Positive": [aggregated_counts['TP'], aggregated_counts['FP']],
        "Predicted Negative": [aggregated_counts['FN'], aggregated_counts['TN']]
    }, index=["True Positive", "True Negative"])
    # Return aggregated counts
    return confusion_matrix

from sklearn.metrics import confusion_matrix
# def calculate_pure_classification_confusion_matrix(df, class_name, mode = 'pure_classification'):
#     """
#     Calculate and aggregate TP, FP, FN, and TN counts across the dataset.

#     Parameters:
#         df (pd.DataFrame): The DataFrame containing 'true_set' and 'pred_set' as string representations of sets.

#     Returns:
#         dict: Aggregated counts for TP, FP, FN, and TN.
#     """
#     from collections import Counter
#     # Initialize counts
#     aggregated_counts = Counter({'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0})

#     true_col = class_name
#     pred_col = class_name + '_pred_with_groundtruth' if mode == 'pure_classification' else class_name + '_pred'

#     # Iterate through each row to calculate TP, FP, and FN
#     for _, row in df.iterrows():
#         true_labels = row[true_col]
#         pred_labels = row[pred_col]

#         all_possible_labels = row['true_set']

#         # conver both true and pred to binary list with index of all_possible_labels
#         true_labels = [1 if label in true_labels else 0 for label in all_possible_labels]
#         pred_labels = [1 if label in pred_labels else 0 for label in all_possible_labels]

#         # calculate TN, FP, FN, TP using confusion matrix
#         tn, fp, fn, tp = confusion_matrix(true_labels, pred_labels, labels=[0,1]).ravel()

#         # Update aggregated counts
#         aggregated_counts['TP'] += tp
#         aggregated_counts['FP'] += fp
#         aggregated_counts['FN'] += fn
#         aggregated_counts['TN'] += tn

#     # Return aggregated counts
#     return dict(aggregated_counts)
def calculate_classification_confusion_matrix(df, dataset_name, mode = 'pure_classification'):
    """
    Calculate and aggregate TP, FP, FN, and TN counts across the dataset.

    Parameters:
        df (pd.DataFrame): The DataFrame containing 'true_set' and 'pred_set' as string representations of sets.

    Returns:
        dict: Aggregated counts for TP, FP, FN, and TN.
    """
    # Initialize confusion matrix with 'Missing' and 'Extra' rows/columns
    all_classes = ['Active', 'Discontinued', 'Neither'] if dataset_name == 'external' else ['Active', 'Discontinued']
    extended_classes = all_classes + ["Extra Model Prediction"]
    cols = all_classes + ["Missing"]
    confusion_data = pd.DataFrame(0, index=extended_classes, columns=cols)

    class_names = ['active_medications', 'discontinued_medications', 'neither_medications'] if dataset_name == 'external' else ['active_medications', 'discontinued_medications']

    # Iterate over each row
    for _, row in df.iterrows():
        # Reconstruct the true and predicted sets
        true_labels = {}
        pred_labels = {}

        # Populate true labels
        for class_name, class_label in zip(class_names, all_classes):
            for med in row[class_name]:  # Medications in this class
                true_labels[med] = class_label

        # Populate predicted labels
        for class_name, class_label in zip([f"{c}_pred" if mode == 'pure_classification' else f"{c}_pred_with_groundtruth" for c in class_names], all_classes):
            for med in row[class_name]:  # Medications in this predicted class
                pred_labels[med] = class_label

        # Get all unique medication names
        all_meds = row['true_set']
        for med in all_meds:
            true_class = true_labels.get(med, "Extra Model Prediction")  
            pred_class = pred_labels.get(med, "Missing")   

            # Increment the corresponding cell in the confusion matrix
            confusion_data.loc[true_class, pred_class] += 1
        
    confusion_data.columns.name = "Pred"
    confusion_data.index.name = "True"
    return confusion_data


# def generate_confusion_matrix(df, not_internal=False):
#     # apply eval to all columns
#     for col in df.columns:
#         df[col] = df[col].apply(eval)

#     # Calculate the confusion matrix
#     extraction_confusion_matrix = calculate_extraction_confusion_matrix(df)

#     # Calculate the confusion matrix
#     pure_classification_confusion_matrix_active = calculate_pure_classification_confusion_matrix(df, 'active_medications')
#     pure_classification_confusion_matrix_discontinued = calculate_pure_classification_confusion_matrix(df, 'discontinued_medications')
#     if not_internal:
#         pure_classification_confusion_matrix_neither = calculate_pure_classification_confusion_matrix(df, 'neither_medications')
#         pure_classification_confusion_matrix = {key: pure_classification_confusion_matrix_active[key] + pure_classification_confusion_matrix_discontinued[key] + pure_classification_confusion_matrix_neither[key] for key in extraction_confusion_matrix.keys()}
#     else:
#         pure_classification_confusion_matrix = {key: pure_classification_confusion_matrix_active[key] + pure_classification_confusion_matrix_discontinued[key] for key in extraction_confusion_matrix.keys()}

#     joint_classification_confusion_matrix_active = calculate_pure_classification_confusion_matrix(df, 'active_medications', mode='joint_classification')
#     joint_classification_confusion_matrix_discontinued = calculate_pure_classification_confusion_matrix(df, 'discontinued_medications', mode='joint_classification')
#     if not_internal:
#         joint_classification_confusion_matrix_neither = calculate_pure_classification_confusion_matrix(df, 'neither_medications', mode='joint_classification')
#         joint_classification_confusion_matrix = {key: joint_classification_confusion_matrix_active[key] + joint_classification_confusion_matrix_discontinued[key] + joint_classification_confusion_matrix_neither[key] for key in extraction_confusion_matrix.keys()}
#     else:
#         joint_classification_confusion_matrix = {key: joint_classification_confusion_matrix_active[key] + joint_classification_confusion_matrix_discontinued[key] for key in extraction_confusion_matrix.keys()}
    
#     return extraction_confusion_matrix, pure_classification_confusion_matrix, joint_classification_confusion_matrix

def generate_confusion_matrix(df, not_internal=False):
    # apply eval to all columns
    for col in df.columns:
        df[col] = df[col].apply(eval)

    # Calculate the confusion matrix
    extraction_confusion_matrix = calculate_extraction_confusion_matrix(df)

    # Calculate the confusion matrix
    if not_internal:
        pure_classification_confusion_matrix = calculate_classification_confusion_matrix(df, dataset_name = 'external', mode='pure_classification')
        joint_classification_confusion_matrix = calculate_classification_confusion_matrix(df, dataset_name = 'external', mode='joint')
    else:
        pure_classification_confusion_matrix = calculate_classification_confusion_matrix(df, dataset_name = 'internal', mode='pure_classification')
        joint_classification_confusion_matrix = calculate_classification_confusion_matrix(df, dataset_name = 'internal', mode='joint')


    print('\nExtraction Confusion Matrix:')
    print(extraction_confusion_matrix)

    print('\nPure Classification Confusion Matrix:')
    print(pure_classification_confusion_matrix)

    print('\nJoint Classification Confusion Matrix:')
    print(joint_classification_confusion_matrix)

    return extraction_confusion_matrix, pure_classification_confusion_matrix, joint_classification_confusion_matrix


# models to evaluate: GPT-4o, Llama-3.1-70B-Instruct,  Qwen2.5-72B-Instruct
# method: 0-shot


columns = ['active_medications', 'discontinued_medications', 'active_medications_pred',
           'discontinued_medications_pred', 'true_set', 'pred_set', 'active_medications_pred_with_groundtruth',
            'discontinued_medications_pred_with_groundtruth']

file_path = '/PHShome/cs1839/capstone_data/base_pred_data/{}_{}_sim_0_{}.csv'

# , 'Llama-3.1-70B-Instruct',
for model in ['gpt-4o', 'Llama-3.1-70B-Instruct', 'Qwen2.5-72B-Instruct']: 
    print('==================================')
    print('==================================')
    print('==================================')
    print(f'\n\nModel: {model}')
    mit_df = pd.read_csv(file_path.format('MIT',model,'Other'))[columns+['neither_medications', 'neither_medications_pred', 'neither_medications_pred_with_groundtruth']]
    mimic_iv_df = pd.read_csv(file_path.format('MIMIC-IV',model,'Other'))[columns+['neither_medications', 'neither_medications_pred', 'neither_medications_pred_with_groundtruth']]
    internal_df = pd.read_csv(file_path.format('Internal Data',model,'Internal Data'))[columns]

    print('\nMIT:')
    mit_extraction_confusion_matrix, mit_pure_classification_confusion_matrix, mit_joint_classification_confusion_matrix = generate_confusion_matrix(mit_df, not_internal=True)
    print('==================================')
    print('\nMIMIC-IV:')
    mimic_iv_extraction_confusion_matrix, mimic_iv_pure_classification_confusion_matrix, mimic_iv_joint_classification_confusion_matrix = generate_confusion_matrix(mimic_iv_df, not_internal=True)
    print('==================================')
    print('\nInternal Data:')
    internal_extraction_confusion_matrix, internal_pure_classification_confusion_matrix, internal_joint_classification_confusion_matrix = generate_confusion_matrix(internal_df, not_internal=False)



Model: gpt-4o

MIT:

Extraction Confusion Matrix:
               Predicted Positive Predicted Negative
True Positive                 313                 23
True Negative                  11                  -

Pure Classification Confusion Matrix:
Pred                    Active  Discontinued  Neither  Missing
True                                                          
Active                     166             4       14        6
Discontinued                 3           100        5        4
Neither                      0             3       18       13
Extra Model Prediction       0             0        0        0

Joint Classification Confusion Matrix:
Pred                    Active  Discontinued  Neither  Missing
True                                                          
Active                     167             4       16        3
Discontinued                 3           104        5        0
Neither                      0             3       31        0
Extra Model Predi

In [15]:
pd.read_csv(file_path.format('Internal Data',model,'Internal Data'))

Unnamed: 0,snippet,active_medications,discontinued_medications,true_set,model_response,active_medications_pred,discontinued_medications_pred,pred_set,intersection,true_count,...,joint_active_pred_count,joint_active_true_count,joint_active_intersection_count,joint_discontinued_pred_count,joint_discontinued_true_count,joint_discontinued_intersection_count,correct_pred_count,model_response_with_groundtruth,active_medications_pred_with_groundtruth,discontinued_medications_with_groundtruth
0,"At time of aspiration event and RR on 3/12 PM,...","['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,2,2,2,0,0,0,2,- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[]
1,Last time this was given: 5/15/2018 7:50 AM ...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,2,2,2,0,0,0,2,- compazine (active)\n- prochlorperazine (active),"['compazine', 'prochlorperazine']",[]
2,Take 600mg in am and 900mg nightly Last time...,['navane'],[],{'navane'},- navane (active)\n- thiothixene (active),"['navane', 'thiothixene']",[],"{'navane', 'thiothixene'}",{'navane'},1,...,2,1,1,0,0,0,1,- navane (active),['navane'],[]
3,oxyCODONE 5 MG immediate release tablet Tak...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,2,2,2,0,0,0,2,- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[]
4,ondansetron (ZOFRAN) 8 MG tablet Take 1 tablet...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- ondansetron (discontinued)\n- zofran (discon...,"['compazine', 'prochlorperazine']","['zofran', 'ondansetron']","{'compazine', 'ondansetron', 'zofran', 'prochl...","{'compazine', 'prochlorperazine'}",2,...,2,2,2,2,0,0,2,- compazine (active)\n- prochlorperazine (active),"['compazine', 'prochlorperazine']",[]
5,predniSONE 20 MG tablet Commonly known as: DE...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,2,2,2,0,0,0,2,- compazine (active)\n- prochlorperazine (active),"['compazine', 'prochlorperazine']",[]
6,Last time this was given: 1/6/2017 9:12 PM ...,['navane'],[],{'navane'},- navane (active),['navane'],[],{'navane'},{'navane'},1,...,1,1,1,0,0,0,1,- navane (active),['navane'],[]
7,Remove & Discard patch within 12 hours or as d...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (discontinued)\n- compazine...,[],"['compazine', 'prochlorperazine']","{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,0,2,0,2,0,0,0,- prochlorperazine (discontinued)\n- compazine...,[],[]
8,Dose: 20 MG; Form: Take 1 TABLET DR; Route: ...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,2,2,2,0,0,0,2,- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[]
9,oxyCODONE HCl 10 mg Tab Take 1 tablet (10 mg t...,"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}",- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[],"{'compazine', 'prochlorperazine'}","{'compazine', 'prochlorperazine'}",2,...,2,2,2,0,0,0,2,- prochlorperazine (active)\n- compazine (active),"['compazine', 'prochlorperazine']",[]


In [None]:
Internal Data_Llama-3.1-70B-Instruct_sim_0_Internal Data

Unnamed: 0,Predicted Positive,Predicted Negative
True Positive,763,73
True Negative,99,-


In [127]:
file_path.format('Internal Data',model,'Internal Data')

'/PHShome/cs1839/capstone_data/base_pred_data/Internal Data_Qwen2.5-72B-Instruct_sim_0_Internal Data.csv'

In [122]:
print(mimic_iv_joint_classification_confusion_matrix)

Pred                    Active  Discontinued  Missing
True                                                 
Active                     511            22       89
Discontinued                18           134       19
Extra Model Prediction       3            16       24
