In [2]:
#added by parvati 
#wilcoxsons
import pandas as pd
from scipy.stats import wilcoxon
from itertools import combinations

# Your data
data = {
    "task_name": ["primary_diagnaois", "ICD9_code", "readmission_risk"],
    "llama": [85, 42.67, 41.33],
    "chatgpt": [84.9, 40.67, 40.67],
    "gemini": [79, 14.67, 33],
    "o3": [90, 45.33, 70.7],
    "R1": [85, 40.33, 72.67]
}
df = pd.DataFrame(data)
df.set_index("task_name", inplace=True)

# Wilcoxon pairwise
results = []
for m1, m2 in combinations(df.columns, 2):
    try:
        stat, p = wilcoxon(df[m1], df[m2])
        results.append({
            "Model_1": m1,
            "Model_2": m2,
            "Wilcoxon_stat": stat,
            "p_value": p
        })
    except ValueError as e:
        results.append({
            "Model_1": m1,
            "Model_2": m2,
            "Wilcoxon_stat": None,
            "p_value": None,
            "Error": str(e)
        })

results_df = pd.DataFrame(results)
results_df["p_value_bonferroni"] = results_df["p_value"] * len(results_df)
results_df["Significant"] = results_df["p_value_bonferroni"] < 0.05

print(results_df.sort_values("p_value_bonferroni"))



   Model_1  Model_2  Wilcoxon_stat   p_value  p_value_bonferroni  Significant
0    llama  chatgpt            0.0  0.250000            2.500000        False
1    llama   gemini            0.0  0.250000            2.500000        False
2    llama       o3            0.0  0.250000            2.500000        False
4  chatgpt   gemini            0.0  0.250000            2.500000        False
5  chatgpt       o3            0.0  0.250000            2.500000        False
7   gemini       o3            0.0  0.250000            2.500000        False
8   gemini       R1            0.0  0.250000            2.500000        False
9       o3       R1            1.0  0.500000            5.000000        False
3    llama       R1            1.0  0.654721            6.547208        False
6  chatgpt       R1            2.0  0.750000            7.500000        False




In [3]:
import pandas as pd
import numpy as np

# Your accuracy table (converted to decimal form)
data = {
    "task_name": ["primary_diagnaois", "ICD9_code", "readmission_risk"],
    "llama": [85, 42.67, 41.33],
    "chatgpt": [84.9, 40.67, 40.67],
    "gemini": [79, 14.67, 33],
    "o3": [90, 45.33, 70.7],
    "R1": [85, 40.33, 72.67]
}
df = pd.DataFrame(data)
df.set_index("task_name", inplace=True)

# Bootstrap function
def bootstrap_ci(data, n_boot=10000, ci=95):
    means = []
    for _ in range(n_boot):
        sample = data.sample(frac=1, replace=True)
        means.append(sample.mean())
    lower = np.percentile(means, (100 - ci) / 2)
    upper = np.percentile(means, 100 - (100 - ci) / 2)
    return data.mean(), lower, upper

# Apply bootstrap for each model
results = []
for col in df.columns:
    mean, low, high = bootstrap_ci(df[col])
    results.append({
        "Model": col,
        "Mean_Accuracy": round(mean, 2),
        "95% CI Lower": round(low, 2),
        "95% CI Upper": round(high, 2)
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values("Mean_Accuracy", ascending=False))


     Model  Mean_Accuracy  95% CI Lower  95% CI Upper
3       o3          68.68         45.33          90.0
4       R1          66.00         40.33          85.0
0    llama          56.33         41.33          85.0
1  chatgpt          55.41         40.67          84.9
2   gemini          42.22         14.67          79.0


In [4]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations

# Your accuracy data
data = {
    "task_name": ["primary_diagnaois", "ICD9_code", "readmission_risk"],
    "llama": [85, 42.67, 41.33],
    "chatgpt": [84.9, 40.67, 40.67],
    "gemini": [79, 14.67, 33],
    "o3": [90, 45.33, 70.7],
    "R1": [85, 40.33, 72.67]
}

df = pd.DataFrame(data)
df.set_index("task_name", inplace=True)

# Mann-Whitney U test pairwise
results = []
for m1, m2 in combinations(df.columns, 2):
    stat, p = mannwhitneyu(df[m1], df[m2], alternative='two-sided')  # two-sided test
    results.append({
        "Model_1": m1,
        "Model_2": m2,
        "U_stat": stat,
        "p_value": p
    })

results_df = pd.DataFrame(results)
results_df["p_value_bonferroni"] = results_df["p_value"] * len(results_df)
results_df["Significant"] = results_df["p_value_bonferroni"] < 0.05

print(results_df.sort_values("p_value_bonferroni"))


   Model_1  Model_2  U_stat   p_value  p_value_bonferroni  Significant
0    llama  chatgpt     7.0  0.375825            3.758251        False
4  chatgpt   gemini     7.0  0.375825            3.758251        False
5  chatgpt       o3     2.0  0.375825            3.758251        False
1    llama   gemini     7.0  0.400000            4.000000        False
2    llama       o3     2.0  0.400000            4.000000        False
7   gemini       o3     2.0  0.400000            4.000000        False
8   gemini       R1     2.0  0.400000            4.000000        False
3    llama       R1     4.5  1.000000           10.000000        False
6  chatgpt       R1     4.0  1.000000           10.000000        False
9       o3       R1     5.0  1.000000           10.000000        False


In [None]:
### code we used to get f1 scores we used in paper - just for your infor if you need this 

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

# Load the dataset
df = pd.read_csv('C:/Users/Parvati/Desktop/mimic/llama_final_with_diagnosis_readmisison_sick_comparison.csv')

# Step 1: Split the comma-separated strings into lists
df['icd9_codes'] = df['icd9_codes'].apply(lambda x: [i.strip() for i in str(x).split(',')])
df['converted_icd_code'] = df['converted_icd_code'].apply(lambda x: [i.strip() for i in str(x).split(',')])

# Step 2: Get all unique ICD codes in the ground truth (only use the true labels for comparison)
all_labels = set().union(*df['converted_icd_code'])

# Step 3: Create binary indicator matrices for ground truths and predictions
label_to_index = {label: i for i, label in enumerate(all_labels)}
y_true = np.zeros((len(df), len(all_labels)), dtype=int)
y_pred = np.zeros((len(df), len(all_labels)), dtype=int)

# Fill in y_true and y_pred matrices based on ground_truth_icd and llama_predicted_icd
for i, (true_labels, pred_labels) in enumerate(zip(df['converted_icd_code'], df['icd9_codes'])):
    for label in true_labels:
        y_true[i, label_to_index[label]] = 1
    for label in pred_labels:
        # Only mark as predicted if the label is in the ground truth set (for comparison)
        if label in label_to_index:
            y_pred[i, label_to_index[label]] = 1

# Step 4: Calculate multilabel confusion matrices
conf_matrix = multilabel_confusion_matrix(y_true, y_pred)

# Initialize lists to store metrics
Diagnosis_list = []
Precision_list = []
Recall_list = []
F1_list = []
Support_list = []
TTP, TTN, TFP, TFN = [], [], [], []

# Step 5: Calculate per-class metrics
for label, conf in zip(all_labels, conf_matrix):
    TP = conf[1, 1]
    TN = conf[0, 0]
    FP = conf[0, 1]
    FN = conf[1, 0]
    TTP.append(TP)
    TTN.append(TN)
    TFP.append(FP)
    TFN.append(FN)

    # Avoid division by zero
    Precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    Recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    F1 = (2 * Precision * Recall / (Precision + Recall)) if (Precision + Recall) > 0 else 0

    Diagnosis_list.append(label)
    Precision_list.append(Precision)
    Recall_list.append(Recall)
    F1_list.append(F1)
    Support_list.append(np.sum(y_true[:, label_to_index[label]]))

# Step 6: Create a DataFrame for per-class metrics
per_class_metrics = pd.DataFrame({
    'Diagnosis': Diagnosis_list,
    'Precision': Precision_list,
    'Recall': Recall_list,
    'F1 Score': F1_list,
    'Support': Support_list
})
print("Per-Class Metrics:")
print(per_class_metrics)

# Step 7: Calculate aggregate metrics
Precision_list, Recall_list, F1_list, Support_list = [], [], [], []
list_section = ['micro avg', 'macro avg', 'weighted avg']

for section in list_section:
    if section == 'micro avg':
        TP = sum(TTP)
        FP = sum(TFP)
        FN = sum(TFN)
        Precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        Recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = (2 * Precision * Recall / (Precision + Recall)) if (Precision + Recall) > 0 else 0
        Support = np.sum(Support_list)

    elif section == 'macro avg':
        Precision = np.mean(Precision_list)
        Recall = np.mean(Recall_list)
        F1 = np.mean(F1_list)
        Support = np.sum(Support_list)

    elif section == 'weighted avg':
        total_support = np.sum(Support_list)
        if total_support > 0:
            Precision = np.sum(np.array(Precision_list) * np.array(Support_list)) / total_support
            Recall = np.sum(np.array(Recall_list) * np.array(Support_list)) / total_support
            F1 = np.sum(np.array(F1_list) * np.array(Support_list)) / total_support
        else:
            Precision, Recall, F1 = 0, 0, 0  # Handle division by zero case
        Support = np.sum(Support_list)

    Precision_list.append(Precision)
    Recall_list.append(Recall)
    F1_list.append(F1)
    Support_list.append(Support)

# Step 8: Create a DataFrame for aggregate metrics
aggregate_metrics = pd.DataFrame({
    'Section': list_section,
    'Precision': Precision_list,
    'Recall': Recall_list,
    'F1 Score': F1_list,
    'Support': Support_list
})
print("Aggregate Metrics:")
print(aggregate_metrics)

In [None]:
### this code below is boot strap confidence interval calculation. You have to tweak it

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Your function to compute F1 from the df (no changes here)
def compute_micro_f1_from_df(df, prediction_col='llama_predicted_icd'):
    df['llama_predicted_icd'] = df[prediction_col].apply(lambda x: [i.strip() for i in str(x).split(',')])
    df['ground_truth_icd'] = df['ground_truth_icd'].apply(lambda x: [i.strip() for i in str(x).split(',')])

    all_labels = set().union(*df['ground_truth_icd'])
    label_to_index = {label: i for i, label in enumerate(all_labels)}
    y_true = np.zeros((len(df), len(all_labels)), dtype=int)
    y_pred = np.zeros((len(df), len(all_labels)), dtype=int)

    for i, (true_labels, pred_labels) in enumerate(zip(df['ground_truth_icd'], df['llama_predicted_icd'])):
        for label in true_labels:
            y_true[i, label_to_index[label]] = 1
        for label in pred_labels:
            if label in label_to_index:
                y_pred[i, label_to_index[label]] = 1

    conf_matrix = multilabel_confusion_matrix(y_true, y_pred)

    TTP, TFP, TFN = 0, 0, 0
    for conf in conf_matrix:
        TTP += conf[1, 1]
        TFP += conf[0, 1]
        TFN += conf[1, 0]

    Precision = TTP / (TTP + TFP) if (TTP + TFP) > 0 else 0
    Recall = TTP / (TTP + TFN) if (TTP + TFN) > 0 else 0
    F1 = (2 * Precision * Recall / (Precision + Recall)) if (Precision + Recall) > 0 else 0

    return F1


# Function to generate the null distribution by shuffling predictions
def null_distribution_f1(df, prediction_col='llama_predicted_icd', n_iterations=1000):
    null_scores = []

    for _ in range(n_iterations):
        shuffled_df = df.copy()
        shuffled_df[prediction_col] = np.random.permutation(df[prediction_col].values)
        f1 = compute_micro_f1_from_df(shuffled_df, prediction_col=prediction_col)
        null_scores.append(f1)

    return np.array(null_scores)


# Function to perform bootstrap resampling to compute confidence interval (CI) for F1 score
def bootstrap_f1(df, prediction_col='llama_predicted_icd', n_iterations=1000):
    bootstrap_scores = []

    # Resampling with replacement
    for _ in range(n_iterations):
        bootstrap_sample = df.sample(n=len(df), replace=True)  # Bootstrap sampling
        f1 = compute_micro_f1_from_df(bootstrap_sample, prediction_col=prediction_col)
        bootstrap_scores.append(f1)

    return np.array(bootstrap_scores)


# Load your data
df = pd.read_csv('test_f1_score_for_llama.csv', sep='\t')

# Step 1: Compute observed F1 using your existing method
observed_f1 = compute_micro_f1_from_df(df)

# Step 2: Generate the null distribution by shuffling predictions and calculating F1
null_f1s = null_distribution_f1(df)

# Step 3: Generate the bootstrap distribution for F1 and calculate CI
bootstrap_f1s = bootstrap_f1(df)

# Calculate 95% CI for the F1 score
ci_lower = np.percentile(bootstrap_f1s, 2.5)
ci_upper = np.percentile(bootstrap_f1s, 97.5)

# Step 4: Calculate the p-value (How often are null F1s greater than or equal to observed F1)
p_value = np.mean(null_f1s >= observed_f1)

# Print Results
print(f"Observed F1: {observed_f1}")
print(f"Bootstrap 95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
print(f"p-value (H0: F1 <= random): {p_value:.4f}")


In [None]:
#this is another version of the same that chatgpt gave

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

# Define a function to calculate bootstrap CI for F1 score
def bootstrap_f1_ci(df, n_iterations=1000, ci_percentile=95):
    # Create a list to store F1 scores for each bootstrap iteration
    bootstrap_f1_scores = []
    
    # Iterate for n_iterations to generate bootstrap samples
    for _ in range(n_iterations):
        # Resample the data with replacement
        resample_data = df.sample(frac=1, replace=True)
        
        # Recalculate F1 score for the bootstrap sample
        y_true = resample_data['ground_truth_icd']
        y_pred = resample_data['llama_predicted_icd']
        
        # Convert lists of ICD codes into binary matrices for F1 calculation
        all_labels = set().union(*y_true)
        label_to_index = {label: i for i, label in enumerate(all_labels)}
        
        # Initialize matrices
        y_true_matrix = np.zeros((len(resample_data), len(all_labels)), dtype=int)
        y_pred_matrix = np.zeros((len(resample_data), len(all_labels)), dtype=int)
        
        for i, (true_labels, pred_labels) in enumerate(zip(y_true, y_pred)):
            for label in true_labels:
                y_true_matrix[i, label_to_index[label]] = 1
            for label in pred_labels:
                if label in label_to_index:
                    y_pred_matrix[i, label_to_index[label]] = 1
        
        # Calculate F1 score for this resample
        f1 = f1_score(y_true_matrix, y_pred_matrix, average='micro')
        bootstrap_f1_scores.append(f1)
    
    # Calculate the confidence intervals
    lower_bound = np.percentile(bootstrap_f1_scores, (100 - ci_percentile) / 2)
    upper_bound = np.percentile(bootstrap_f1_scores, 100 - (100 - ci_percentile) / 2)
    
    return lower_bound, upper_bound

# Example usage
lower_ci, upper_ci = bootstrap_f1_ci(df)

# Output the Bootstrap Confidence Interval for the F1 score
print(f"Bootstrap 95% Confidence Interval for F1 Score: ({lower_ci:.4f}, {upper_ci:.4f})")
