In [1]:
import sys
sys.path.insert(0,'..')

from src.models import *
from src.loss_functions import *
from src.noise import *
from src.metrics import *
from src.plotting import *
from src.generate_data import *
from src.real_data import *

import sklearn
import pandas as pd

from scipy.stats import bernoulli

from operator import xor

import os


In [3]:
 # Function to calculate the percentage of rates greater than 25
def threshold(x, cutoff=25):
    """Return the percentage of entries in ``x`` strictly greater than ``cutoff``.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Values to test; must support element-wise ``>`` and ``.mean()``.
    cutoff : float, default 25
        Entries must exceed this value to be counted. The default keeps the
        function usable as a single-argument aggregator in ``groupby().agg``.

    Returns
    -------
    float
        Share of entries above ``cutoff``, expressed on a 0-100 scale.
    """
    return (x > cutoff).mean() * 100

 # Function to calculate the percentage of rates = 0
def no_regret(x):
    """Return the percentage (0-100) of entries in ``x`` that are exactly zero.

    ``x`` must support element-wise ``==`` and ``.mean()`` (for example a
    pandas Series or a NumPy array).
    """
    is_zero = x == 0
    return is_zero.mean() * 100

# --- Experiment grid -------------------------------------------------------
# Every (dataset, model type) pair below is summarised by the loop that follows.
model_types = ["LR", "NN"]
datasets = [
    "cshock_eicu",
    "cshock_mimic",
    "saps",
    "support",
    "lungcancer",
]

# Metrics summarised with min / max / mean / std only.
population_metrics = [
    'clean_test_acc',
    'flip_frequency',
    'noisy_train_loss',
    'clean_test_loss',
]
# Per-instance metrics (listed here for reference).
individual_metrics = ['regret_test', 'disagreement_test']

# --- Settings forwarded unchanged to every load_metrics call ----------------
uncertainty_type = "forward"
noise_type = "class_independent"
epsilon = 0.1
fixed_class = fixed_noise = 0

# Accumulator: one merged summary frame per (dataset, model type) pair.
dfs = []


def _summarize_metric(metric_name, dataset_name, model_name, extra_aggs=()):
    """Load one metric and summarise 'Rate (%)' per loss function and noise level.

    Parameters
    ----------
    metric_name : str
        Metric identifier forwarded to ``load_metrics``; also appended as the
        suffix of every statistic column in the result.
    dataset_name : str
        Dataset identifier forwarded to ``load_metrics``.
    model_name : str
        Model type forwarded to ``load_metrics``.
    extra_aggs : sequence of (column_prefix, callable), optional
        Extra aggregations computed after min / max / mean / std. Each
        callable receives the grouped 'Rate (%)' values; the prefix becomes
        the leading part of the resulting column name.

    Returns
    -------
    pandas.DataFrame
        One row per ('Loss Function', 'Noise Level (%)') group, with columns
        ``loss``, ``noise`` and ``<prefix>_<metric_name>`` for every statistic.
    """
    metrics_df = load_metrics(
        model_name, noise_type, uncertainty_type,
        metric=metric_name, dataset=dataset_name,
        fixed_class=fixed_class, fixed_noise=fixed_noise, epsilon=epsilon,
    )

    base_stats = ['min', 'max', 'mean', 'std']
    summary = (
        metrics_df
        .groupby(['Loss Function', 'Noise Level (%)'])['Rate (%)']
        .agg(base_stats + list(extra_aggs))
        .reset_index()
    )

    # Flatten to short, metric-suffixed names so frames for different metrics
    # can be merged on ('loss', 'noise') without column clashes.
    stat_prefixes = base_stats + [prefix for prefix, _ in extra_aggs]
    summary.columns = ['loss', 'noise'] + [f'{prefix}_{metric_name}' for prefix in stat_prefixes]
    return summary


# Iterate over datasets and model types.
# NOTE: the loop variables `dataset` and `model_type` stay bound after the loop
# and are read by a later cell, so their names are kept as-is.
for dataset in datasets:
    for model_type in model_types:
        local_dfs = []

        # Population metrics are best-effort: a metric that cannot be loaded or
        # summarised is skipped. Only Exception is caught (so Ctrl-C still
        # interrupts the run) and the skip is reported instead of being silent.
        for metric in population_metrics:
            try:
                local_dfs.append(_summarize_metric(metric, dataset, model_type))
            except Exception as exc:
                print(f"Skipping {metric} for {dataset}/{model_type}: {exc!r}")

        # Individual metrics additionally get the share of rates above 25 and
        # the share of rates equal to 0. This literal order (disagreement first,
        # then regret) determines the column order of the merged frame.
        for m in ["disagreement_test", "regret_test"]:
            local_dfs.append(
                _summarize_metric(
                    m, dataset, model_type,
                    extra_aggs=[('%>25', threshold), ('%stable', no_regret)],
                )
            )

        if local_dfs:
            # Outer-merge all per-metric frames on (loss, noise) so that a
            # group missing from one metric is kept, with NaNs in that
            # metric's columns.
            result = local_dfs[0]
            for stats_df in local_dfs[1:]:
                result = pd.merge(result, stats_df, on=['loss', 'noise'], how='outer')

            result["dataset"] = dataset
            result["model_type"] = model_type
            dfs.append(result)

In [4]:
# Stack the per-(dataset, model type) summaries into one table.
# ignore_index=True gives every row a unique label; without it each frame keeps
# its own 0..N labels, so label-based lookups such as final_df.loc[0] would
# return one row per concatenated frame.
final_df = pd.concat(dfs, ignore_index=True)
final_df

Unnamed: 0,loss,noise,min_clean_test_acc,max_clean_test_acc,mean_clean_test_acc,std_clean_test_acc,min_flip_frequency,max_flip_frequency,mean_flip_frequency,std_flip_frequency,...,%>25_disagreement_test,%stable_disagreement_test,min_regret_test,max_regret_test,mean_regret_test,std_regret_test,%>25_regret_test,%stable_regret_test,dataset,model_type
0,BCE,0,75.144509,75.144509,75.144509,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,100.000000,0.0,100.0,24.855491,43.248776,24.855491,75.144509,cshock_eicu,LR
1,BCE,1,73.410405,76.156069,75.058671,0.405579,0.009045,0.010854,0.009924,0.000609,...,5.924855,86.416185,0.0,100.0,24.941329,42.093540,26.011561,67.919075,cshock_eicu,LR
2,BCE,5,72.687861,77.167630,74.926879,0.705578,0.045224,0.054993,0.050003,0.002648,...,14.739884,70.953757,0.0,100.0,25.073121,40.144260,27.890173,59.537572,cshock_eicu,LR
3,BCE,10,71.676301,77.312139,74.757081,0.907978,0.090087,0.109986,0.100043,0.004823,...,22.687861,56.358382,0.0,100.0,25.242919,38.352361,30.635838,49.855491,cshock_eicu,LR
4,BCE,20,70.375723,77.601156,74.078757,1.227359,0.180535,0.219609,0.199749,0.007323,...,36.705202,28.901734,0.0,100.0,25.921243,34.937617,34.682081,26.734104,cshock_eicu,LR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,forward,1,67.768595,69.794978,69.104895,0.376069,0.009020,0.010868,0.009981,0.000400,...,23.235855,62.126510,0.0,100.0,30.895105,41.436086,35.441831,47.917991,lungcancer,NN
14,forward,5,67.975207,69.866497,69.005960,0.397611,0.047703,0.052174,0.049894,0.000898,...,24.920534,55.538779,0.0,100.0,30.994040,40.816083,36.053719,43.928798,lungcancer,NN
15,forward,10,68.070566,69.882390,68.974452,0.408496,0.096817,0.103632,0.099849,0.001333,...,26.716465,52.129688,0.0,100.0,31.025548,40.346693,37.190083,41.775270,lungcancer,NN
16,forward,20,67.244120,69.977750,68.763032,0.463061,0.195164,0.204244,0.199936,0.001894,...,31.849968,48.585505,0.0,100.0,31.236968,39.394509,38.016529,39.200572,lungcancer,NN


In [5]:
# Step 3: write the combined summary next to the notebook, in a file named
# after the noise type. The row index is not written.
summary_csv_path = f"{noise_type}.csv"
final_df.to_csv(summary_csv_path, index=False)

In [3]:
# Load clean test accuracy for a single (dataset, model type) pair.
# The pair is pinned explicitly instead of reusing the leftover loop variables
# `dataset` / `model_type`: those hold whatever the last loop iteration left
# behind, whereas the summary recorded below this cell matches the
# cshock_eicu / LR rows of the combined table.
df = load_metrics(
    "LR", noise_type, uncertainty_type,
    metric="clean_test_acc", dataset="cshock_eicu",
    fixed_class=fixed_class, fixed_noise=fixed_noise, epsilon=epsilon,
)

In [4]:
# Summarise 'Rate (%)' for every (loss function, noise level) pair using the
# built-in min / max / mean / std reducers, then shorten the two group-key
# column names; the statistic columns keep the reducer names.
summary_stats1 = (
    df
    .groupby(['Loss Function', 'Noise Level (%)'])['Rate (%)']
    .agg(['min', 'max', 'mean', 'std'])
    .reset_index()
    .rename(columns={'Loss Function': 'loss', 'Noise Level (%)': 'noise'})
)

summary_stats1

Unnamed: 0,loss,noise,min,max,mean,std
0,BCE,0,75.144509,75.144509,75.144509,0.0
1,BCE,1,73.410405,76.156069,75.058671,0.405579
2,BCE,5,72.687861,77.16763,74.926879,0.705578
3,BCE,10,71.676301,77.312139,74.757081,0.907978
4,BCE,20,70.375723,77.601156,74.078757,1.227359
5,BCE,40,54.046243,72.398844,65.458526,2.722588
6,backward,0,75.144509,75.144509,75.144509,0.0
7,backward,1,73.410405,76.300578,75.053035,0.413115
8,backward,5,72.83237,77.023121,74.96185,0.719373
9,backward,10,71.676301,77.456647,74.868497,0.896494
