In [1]:
import sys
sys.path.insert(0,'..')

from src.models import *
from src.loss_functions import *
from src.noise import *
from src.metrics import *
from src.plotting import *
from src.generate_data import *
from src.real_data import *

import sklearn
import pandas as pd

from scipy.stats import bernoulli

from operator import xor

import os


In [17]:
# Share of rates strictly above 25, expressed as a percentage
def threshold(x):
    """Return the percentage of entries in ``x`` that are strictly greater than 25.

    ``x`` must support elementwise comparison and ``.mean()`` on the result
    (in this notebook it is used as a groupby aggregation function).
    """
    above_cutoff = x > 25
    fraction_above = above_cutoff.mean()
    return fraction_above * 100

# Share of rates that are exactly zero, expressed as a percentage
def no_regret(x):
    """Return the percentage of entries in ``x`` that are exactly equal to 0.

    ``x`` must support elementwise comparison and ``.mean()`` on the result
    (in this notebook it is used as a groupby aggregation function).
    """
    is_zero = x == 0
    fraction_zero = is_zero.mean()
    return fraction_zero * 100

# Define model types and datasets
model_types = ["LR", "NN"]
datasets = ["cshock_eicu", "cshock_mimic", "saps", "support", "lungcancer"]
population_metrics = ['noisy_train_loss', 'clean_test_loss', 'clean_test_acc']
individual_metrics = ['regret_test', 'disagreement_test']
our_metrics = ["train_loss", "train_acc", 'test_loss', "test_acc"]

uncertainty_type = "forward"
noise_type = "class_independent"  # NOTE: rebound by the noise-type loop below
epsilon = 0.1
fixed_class = 0
fixed_noise = 0

# Column names as returned by load_metrics -> column names used in the combined table
_COLUMN_RENAMES = {
    'Rate (%)': 'value',
    'Noise Level (%)': "noise",
    "Loss Function": "method_name",
    "Metric": "metric",
}

# Metric names of the "backward" runs -> names shared with the "forward" runs
_OUR_METRIC_ALIASES = {
    'test_loss': 'clean_test_loss',
    'test_acc': 'clean_test_acc',
}


def _tidy_draws(raw_df, metric, dataset, model_type, noise_type, method_name=None):
    """Return a renamed, annotated copy of one per-draw metrics frame.

    The input frame is left untouched (no in-place rename), so the object
    returned by ``load_metrics`` is never mutated.

    Parameters
    ----------
    raw_df : DataFrame as returned by ``load_metrics``; must contain a
        'Noise Level (%)' column and, when ``metric`` contains "acc", a
        'Rate (%)' column.
    metric : name of the loaded metric; values are divided by 100 when the
        name contains "acc".
    dataset, model_type, noise_type : written to the 'dataset',
        'model_class' and 'noise_type' columns.
    method_name : if given, overrides/creates the 'method_name' column.
    """
    tidy = raw_df.rename(columns=_COLUMN_RENAMES)

    # Keep the original row index as the draw identifier
    tidy['draw_id'] = tidy.index

    if method_name is not None:
        tidy['method_name'] = method_name
    tidy['dataset'] = dataset
    tidy['model_class'] = model_type
    tidy['noise_type'] = noise_type
    tidy['noise'] = tidy['noise'] / 100  # percent -> fraction

    if "acc" in metric:
        tidy['value'] = tidy['value'] / 100  # percent -> fraction

    return tidy


dfs = []

# Iterate over datasets, model types and noise types.
# Each load is best-effort: a failing combination is reported and skipped.
for dataset in tqdm(datasets):
    for model_type in model_types:
        for noise_type in ["class_independent", "class_conditional"]:

            # Per-draw metrics of the "forward" runs (method name = loss function)
            for metric in population_metrics:
                try:
                    metrics_df = load_metrics(model_type, noise_type, "forward", metric=metric, dataset=dataset, fixed_class=fixed_class, fixed_noise=fixed_noise, epsilon=epsilon)
                    metrics_df = _tidy_draws(metrics_df, metric, dataset, model_type, noise_type)
                    dfs.append(metrics_df)

                except Exception as e:
                    print(f"Error processing {dataset} - {noise_type} - {model_type} - {metric}: {e!r}")

            # Per-draw metrics of the "backward" runs, labelled "ours"
            for m in our_metrics:
                try:
                    metrics_df = load_metrics(model_type, noise_type, "backward", metric=m, dataset=dataset, fixed_class=fixed_class, fixed_noise=fixed_noise, epsilon=epsilon)
                    metrics_df = _tidy_draws(metrics_df, m, dataset, model_type, noise_type, method_name="ours")

                    # Align metric names with the "forward" runs without rebinding the loop variable
                    metrics_df['metric'] = _OUR_METRIC_ALIASES.get(m, m)
                    dfs.append(metrics_df)

                except Exception as e:
                    print(f"Error processing {dataset} - {noise_type} - {model_type} - {m}: {e!r}")

            # Individual-level metrics are summarised per (method, noise level)
            for m in individual_metrics:
                for uncertainty in ["forward", "backward"]:
                    try:
                        metrics_df = load_metrics(model_type, noise_type, uncertainty, metric=m, dataset=dataset, fixed_class=fixed_class, fixed_noise=fixed_noise, epsilon=epsilon)

                        # Group by loss function and noise level and compute summary statistics
                        summary_stats = metrics_df.groupby(['Loss Function', 'Noise Level (%)'])['Rate (%)'].agg([
                            'min', 'max', 'mean', 'std',
                            ('Percentage Over 25%', threshold),  # custom aggregation
                            ('Stable', no_regret),               # custom aggregation
                        ]).reset_index()

                        # Positional rename: order must match the aggregation list above
                        summary_stats.columns = ['method_name', 'noise', m+'_min', m+'_max', m+'_mean', m+'_std', m+'_25', m+'_stable']

                        # Wide -> long: one row per (method, noise, statistic)
                        df_pivoted = summary_stats.melt(id_vars=['method_name', 'noise'], var_name='metric', value_name='value')

                        if uncertainty == "backward":
                            df_pivoted["method_name"] = "ours"

                        df_pivoted["dataset"] = dataset
                        df_pivoted["noise"] = df_pivoted["noise"]/100
                        df_pivoted["model_class"] = model_type
                        df_pivoted["noise_type"] = noise_type
                        df_pivoted["value"] = df_pivoted["value"]/100

                        dfs.append(df_pivoted)

                    except Exception as e:
                        # Include the uncertainty direction so forward/backward failures are distinguishable
                        print(f"Error processing {dataset} - {noise_type} - {model_type} - {uncertainty} - {m}: {e!r}")

# Concatenate all DataFrames into a single DataFrame
if not dfs:
    raise ValueError(
        "No metrics could be loaded for any dataset/model/noise combination; "
        "see the errors printed above."
    )
final_df = pd.concat(dfs, ignore_index=True)




  0%|          | 0/5 [00:00<?, ?it/s]

Error processing cshock_eicu - LR - noisy_train_loss: 'noise'
Error processing cshock_eicu - LR - clean_test_loss: 'noise'
Error processing cshock_eicu - LR - clean_test_acc: 'noise'
Error processing cshock_eicu - LR - train_loss: 'noise'
Error processing cshock_eicu - LR - train_acc: 'noise'
Error processing cshock_eicu - LR - test_loss: 'noise'
Error processing cshock_eicu - LR - test_acc: 'noise'
Error processing cshock_eicu - LR - regret_test: 'Loss Function'
Error processing cshock_eicu - LR - regret_test: 'Loss Function'
Error processing cshock_eicu - LR - disagreement_test: 'Loss Function'
Error processing cshock_eicu - LR - disagreement_test: 'Loss Function'
Error processing cshock_eicu - NN - noisy_train_loss: 'noise'
Error processing cshock_eicu - NN - clean_test_loss: 'noise'
Error processing cshock_eicu - NN - clean_test_acc: 'noise'
Error processing cshock_eicu - NN - train_loss: 'noise'
Error processing cshock_eicu - NN - train_acc: 'noise'
Error processing cshock_eicu - 

In [18]:
# Display the combined metrics table (the notebook shows a truncated preview)
final_df

Unnamed: 0,metric,noise,value,method_name,Index,draw_id,dataset,model_class,noise_type
0,noisy_train_loss,0.00,0.479459,BCE,0.0,0.0,cshock_eicu,LR,class_independent
1,noisy_train_loss,0.00,0.479459,BCE,1.0,1.0,cshock_eicu,LR,class_independent
2,noisy_train_loss,0.00,0.479459,BCE,2.0,2.0,cshock_eicu,LR,class_independent
3,noisy_train_loss,0.00,0.479459,BCE,3.0,3.0,cshock_eicu,LR,class_independent
4,noisy_train_loss,0.00,0.479459,BCE,4.0,4.0,cshock_eicu,LR,class_independent
...,...,...,...,...,...,...,...,...,...
544555,disagreement_test_stable,0.01,0.989828,ours,,,lungcancer,NN,class_independent
544556,disagreement_test_stable,0.05,0.968055,ours,,,lungcancer,NN,class_independent
544557,disagreement_test_stable,0.10,0.931739,ours,,,lungcancer,NN,class_independent
544558,disagreement_test_stable,0.20,0.794978,ours,,,lungcancer,NN,class_independent


In [14]:
# def rename_metric(metric):
#     parts = metric.split('_')
    
#     if len(parts)==4:
#         return f"{parts[0]}_{parts[2]}_{parts[1]}_{parts[3]}"
        
#     if len(parts)==3 and ("disagreement" not in metric and "regret" not in metric):
#         return f"{parts[1]}_{parts[0]}_{parts[2]}"
#     else:
#         return metric
        
    
# test_df = final_df.copy()
# # Apply the rename_metric function to the metric column
# test_df['metric'] = test_df['metric'].apply(lambda x: rename_metric(x))
# test_df

In [15]:
# Export the combined metrics table to CSV (row index omitted)
final_df.to_csv("big_table.csv", index=False)

In [16]:
# List the distinct metric names present in the combined table
final_df["metric"].unique()

array(['noisy_train_loss', 'clean_test_loss', 'clean_test_acc',
       'train_loss', 'train_acc', 'regret_test_min', 'regret_test_max',
       'regret_test_mean', 'regret_test_std', 'regret_test_25',
       'regret_test_stable', 'disagreement_test_min',
       'disagreement_test_max', 'disagreement_test_mean',
       'disagreement_test_std', 'disagreement_test_25',
       'disagreement_test_stable'], dtype=object)