ELAPSE Traces 

Dataset Properties

In [None]:
import csv


# Sensitive attributes recorded for each dataset, as (dataset, attributes) pairs.
rows = [
    ("Adult", "(gender, age, race)"),
    ("KDD", "(gender, age, race)"),
    ("DC", "(gender, age)"),
    ("MobiAct", "(gender, age)"),
    ("ARS", "(gender)"),
]

# Write the header line followed by one line per dataset.
with open("../traces/DatasetProperties.csv", mode="w", newline="") as out_file:
    csv.writer(out_file).writerows([["Dataset", "SensitiveAttributes"], *rows])


Experiment Configurations

In [None]:
import csv

# Experimental grid: datasets x models x selection methods x selection ratios.
datasets = ["Adult", "KDD", "DC", "MobiAct", "ARS"]
models_all = ["SVM", "MLP", "LR"]
models_mobiact = ["MLP"]  # the only model paired with MobiAct
selection_methods = ["Full", "Craig", "Glister", "GradMatch", "Random"]
selection_ratios_partial = [0.05, 0.1, 0.2, 0.3]
selection_frequency = 20
num_runs = 5


def _configuration_grid():
    """Yield every (dataset, model, method, ratio) combination in grid order."""
    for dataset in datasets:
        for model in (models_mobiact if dataset == "MobiAct" else models_all):
            for method in selection_methods:
                # "Full" uses the whole dataset, hence the single ratio 1.0.
                for ratio in ([1.0] if method == "Full" else selection_ratios_partial):
                    yield dataset, model, method, ratio


# EC_IDs are assigned sequentially, starting at 1, in grid order.
configurations = [
    [ec_id, dataset, model, method, ratio, selection_frequency, num_runs]
    for ec_id, (dataset, model, method, ratio) in enumerate(_configuration_grid(), start=1)
]
ec_id = len(configurations) + 1  # next unused identifier

with open("../traces/ExperimentConfigurations.csv", mode="w", newline="") as out_file:
    config_writer = csv.writer(out_file)
    config_writer.writerow(
        ["EC_ID", "Dataset", "Model", "Selection method", "Selection ratio", "Selection frequency", "#Runs"]
    )
    config_writer.writerows(configurations)



Experiment Measurements

In [8]:
import os
import re
import pandas as pd


# ARS result locations: one directory per selection system, one sub-directory per ratio.
systems_path = [
    '../results/ars-selection/CRAIGPB',
    '../results/ars-selection/GLISTERPB',
    '../results/ars-selection/GradMatchPB',
    '../results/ars-selection/Random',
]
ratio_path = [
    '/ars_0.05',
    '/ars_0.1',
    '/ars_0.2',
    '/ars_0.3',
    '/ars_0.5',
]
directory_full = '../results/ars-selection/Full/ars_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # fairness columns to drop before scaling (none)

all_epochs = []  # one trace frame per (model, system, ratio, run)

def load_epochs(fair_path, cost_path, model):
    """Load rows 0-48 of one run's fairness metrics and attach its accuracy.

    The first CSV column of both files is skipped. Fairness values are returned
    as absolute values multiplied by 100, after dropping ``excluded_columns``.
    The ``accuracy`` column comes from the cost file (``None`` when the file has
    no such column). ``model`` is accepted but not used.
    """
    rows = slice(0, 49)
    fairness = pd.read_csv(fair_path).iloc[rows, 1:]
    costs = pd.read_csv(cost_path).iloc[rows, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_epochs_full(fair_path, cost_path, model):
    """Load rows 20-68 of a full-data run's fairness metrics and attach its accuracy.

    Same processing as ``load_epochs`` (first column skipped, ``excluded_columns``
    dropped, absolute values multiplied by 100, ``accuracy`` taken from the cost
    file or ``None``), on a different row window. ``model`` is accepted but not used.
    """
    rows = slice(20, 69)
    fairness = pd.read_csv(fair_path).iloc[rows, 1:]
    costs = pd.read_csv(cost_path).iloc[rows, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_first_training_time(cost_path):
    """Return ``Full_training_time`` from the first row of a cost-metrics CSV.

    Returns ``None`` when the column is missing or the file has no data rows.
    """
    costs = pd.read_csv(cost_path)
    if 'Full_training_time' not in costs.columns or len(costs) == 0:
        return None
    return costs['Full_training_time'].iloc[0]

def extract_run_id(filename):
    """Return the integer that follows the last underscore in ``..._<digits>.csv``.

    Returns ``None`` when the name does not end with that pattern.
    """
    found = re.search(r'_(\d+)\.csv$', filename)
    if found is None:
        return None
    return int(found.group(1))

# === Full traces ===
# The directory is listed once, in sorted order: the output no longer depends on
# the arbitrary order of os.listdir, and the cost-file lookup does not rescan
# the directory for every fairness file.
full_entries = sorted(os.listdir(directory_full))

for model in models:
    for filename in full_entries:
        if f"fair_metrics_{model}_" not in filename:
            continue
        run_id = extract_run_id(filename)
        if run_id is None:
            continue

        # Pair the fairness file with the cost file of the same model and run.
        # Parsed run IDs are compared instead of the raw file suffix, so run 1
        # can no longer be paired with the cost file of run 11 or 21.
        cost_name = next(
            (f for f in full_entries
             if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
            None,
        )
        if cost_name is None:
            continue

        fair_file = os.path.join(directory_full, filename)
        cost_file = os.path.join(directory_full, cost_name)

        df_metrics = load_epochs_full(fair_file, cost_file, model)
        full_time = load_first_training_time(cost_file)

        # Tag every row with the configuration and run it belongs to.
        df_metrics['dataset'] = 'ars'
        df_metrics['model'] = model
        df_metrics['system'] = 'Full'
        df_metrics['ratio'] = 1.0
        df_metrics['Full_training_time'] = full_time
        df_metrics['runID'] = run_id
        df_metrics['epochID'] = list(range(len(df_metrics)))

        all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            system = directory_path_1.split('/')[-1]        # last path component, e.g. 'CRAIGPB'
            ratio = float(directory_path_2.split('_')[-1])  # '/ars_0.05' -> 0.05

            # One sorted listing per directory: deterministic order, and no
            # rescan of the directory for every fairness file.
            entries = sorted(os.listdir(directory))
            for filename in entries:
                if f"fair_metrics_{model}_" not in filename:
                    continue
                run_id = extract_run_id(filename)
                if run_id is None:
                    continue

                # Compare parsed run IDs rather than the raw suffix, so run 1
                # cannot be paired with the cost file of run 11 or 21.
                cost_name = next(
                    (f for f in entries
                     if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
                    None,
                )
                if cost_name is None:
                    continue

                fair_file = os.path.join(directory, filename)
                cost_file = os.path.join(directory, cost_name)

                df_metrics = load_epochs(fair_file, cost_file, model)
                full_time = load_first_training_time(cost_file)

                # Tag every row with the configuration and run it belongs to.
                df_metrics['dataset'] = 'ars'
                df_metrics['model'] = model
                df_metrics['system'] = system
                df_metrics['ratio'] = ratio
                df_metrics['Full_training_time'] = full_time
                df_metrics['runID'] = run_id
                df_metrics['epochID'] = list(range(len(df_metrics)))

                all_epochs.append(df_metrics)


# Fail loudly instead of writing an empty trace file.
if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)
os.makedirs('../results/test', exist_ok=True)
df_all_epochs.to_csv('../results/test/ars_epoch_traces.csv', index=False)


In [9]:
import os
import re
import pandas as pd

# Census result locations: one directory per selection system, one sub-directory per ratio.
systems_path = [
    '../results/census-selection/CRAIGPB',
    '../results/census-selection/GLISTERPB',
    '../results/census-selection/GradMatchPB',
    '../results/census-selection/Random',
]
ratio_path = [
    '/census_0.05',
    '/census_0.1',
    '/census_0.2',
    '/census_0.3',
    '/census_0.5',
]
directory_full = '../results/census-selection/Full/census_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # fairness columns to drop before scaling (none)
all_epochs = []        # one trace frame per (model, system, ratio, run)


def extract_run_id(filename):
    """Return the integer that follows the last underscore in ``..._<digits>.csv``.

    Returns ``None`` when the name does not end with that pattern.
    """
    found = re.search(r'_(\d+)\.csv$', filename)
    if found is None:
        return None
    return int(found.group(1))

def load_epochs_full(fair_path, cost_path, model):
    """Load a full-data run's fairness metrics and attach its accuracy.

    Rows 20-198 are kept for ``model == 'MLP'`` and rows 20-98 for any other
    model; the first CSV column of both files is skipped. Fairness values are
    returned as absolute values multiplied by 100, after dropping
    ``excluded_columns``. ``accuracy`` comes from the cost file (``None`` when
    the file has no such column).
    """
    stop = 199 if model == 'MLP' else 99
    fairness = pd.read_csv(fair_path).iloc[20:stop, 1:]
    costs = pd.read_csv(cost_path).iloc[20:stop, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_epochs(fair_path, cost_path, model):
    """Load a selection run's fairness metrics and attach its accuracy.

    Rows 0-178 are kept for ``model == 'MLP'`` and rows 0-78 for any other
    model; the first CSV column of both files is skipped. Fairness values are
    returned as absolute values multiplied by 100, after dropping
    ``excluded_columns``. ``accuracy`` comes from the cost file (``None`` when
    the file has no such column).
    """
    stop = 179 if model == 'MLP' else 79
    fairness = pd.read_csv(fair_path).iloc[0:stop, 1:]
    costs = pd.read_csv(cost_path).iloc[0:stop, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_first_training_time(cost_path, model):
    """Return ``Full_training_time`` at the start of a model-specific row window.

    The window is rows 89-198 for ``model == 'MLP'`` and rows 54-98 otherwise
    (first CSV column skipped). Returns ``None`` when the column is missing or
    the file has no row in that window.
    """
    start, stop = (89, 199) if model == 'MLP' else (54, 99)
    window = pd.read_csv(cost_path).iloc[start:stop, 1:]
    if 'Full_training_time' in window.columns and len(window) > 0:
        return window['Full_training_time'].iloc[0]
    return None

# === Full traces ===
# The directory is listed once, in sorted order: the output no longer depends on
# the arbitrary order of os.listdir, and the cost-file lookup does not rescan
# the directory for every fairness file.
full_entries = sorted(os.listdir(directory_full))

for model in models:
    for filename in full_entries:
        if f"fair_metrics_{model}_" not in filename:
            continue
        run_id = extract_run_id(filename)
        if run_id is None:
            continue

        # Pair the fairness file with the cost file of the same model and run.
        # Parsed run IDs are compared instead of the raw file suffix, so run 1
        # can no longer be paired with the cost file of run 11 or 21.
        cost_name = next(
            (f for f in full_entries
             if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
            None,
        )
        if cost_name is None:
            continue

        fair_file = os.path.join(directory_full, filename)
        cost_file = os.path.join(directory_full, cost_name)

        df_metrics = load_epochs_full(fair_file, cost_file, model)
        full_time = load_first_training_time(cost_file, model)

        # Tag every row with the configuration and run it belongs to.
        df_metrics['dataset'] = 'census'
        df_metrics['model'] = model
        df_metrics['system'] = 'Full'
        df_metrics['ratio'] = 1.0
        df_metrics['Full_training_time'] = full_time
        df_metrics['runID'] = run_id
        df_metrics['epochID'] = list(range(len(df_metrics)))

        all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            system = directory_path_1.split('/')[-1]        # last path component, e.g. 'CRAIGPB'
            ratio = float(directory_path_2.split('_')[-1])  # '/census_0.05' -> 0.05

            # One sorted listing per directory: deterministic order, and no
            # rescan of the directory for every fairness file.
            entries = sorted(os.listdir(directory))
            for filename in entries:
                if f"fair_metrics_{model}_" not in filename:
                    continue
                run_id = extract_run_id(filename)
                if run_id is None:
                    continue

                # Compare parsed run IDs rather than the raw suffix, so run 1
                # cannot be paired with the cost file of run 11 or 21.
                cost_name = next(
                    (f for f in entries
                     if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
                    None,
                )
                if cost_name is None:
                    continue

                fair_file = os.path.join(directory, filename)
                cost_file = os.path.join(directory, cost_name)

                df_metrics = load_epochs(fair_file, cost_file, model)
                full_time = load_first_training_time(cost_file, model)

                # Tag every row with the configuration and run it belongs to.
                df_metrics['dataset'] = 'census'
                df_metrics['model'] = model
                df_metrics['system'] = system
                df_metrics['ratio'] = ratio
                df_metrics['Full_training_time'] = full_time
                df_metrics['runID'] = run_id
                df_metrics['epochID'] = list(range(len(df_metrics)))

                all_epochs.append(df_metrics)


# Fail loudly instead of writing an empty trace file.
if not all_epochs:
    raise ValueError("Aucune donnée chargée : vérifie les chemins ou noms des fichiers.")
df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../results/test', exist_ok=True)
df_all_epochs.to_csv('../results/test/census_epoch_traces.csv', index=False)


In [10]:
import os
import re
import pandas as pd

# KDD result locations: one directory per selection system, one sub-directory per ratio.
systems_path = [
    '../results/kdd-selection/CRAIGPB',
    '../results/kdd-selection/GLISTERPB',
    '../results/kdd-selection/GradMatchPB',
    '../results/kdd-selection/Random',
]
ratio_path = [
    '/kdd_0.05',
    '/kdd_0.1',
    '/kdd_0.2',
    '/kdd_0.3',
    '/kdd_0.5',
]
directory_full = '../results/kdd-selection/Full/kdd_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # fairness columns to drop before scaling (none)
all_epochs = []        # one trace frame per (model, system, ratio, run)


def extract_run_id(filename):
    """Return the integer that follows the last underscore in ``..._<digits>.csv``.

    Returns ``None`` when the name does not end with that pattern.
    """
    found = re.search(r'_(\d+)\.csv$', filename)
    if found is None:
        return None
    return int(found.group(1))

def load_epochs(fair_path, cost_path, model):
    """Load rows 0-58 of one run's fairness metrics and attach its accuracy.

    The first CSV column of both files is skipped. Fairness values are returned
    as absolute values multiplied by 100, after dropping ``excluded_columns``.
    The ``accuracy`` column comes from the cost file (``None`` when the file has
    no such column). ``model`` is accepted but not used.
    """
    rows = slice(0, 59)
    fairness = pd.read_csv(fair_path).iloc[rows, 1:]
    costs = pd.read_csv(cost_path).iloc[rows, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_epochs_full(fair_path, cost_path, model):
    """Load rows 20-78 of a full-data run's fairness metrics and attach its accuracy.

    Same processing as ``load_epochs`` (first column skipped, ``excluded_columns``
    dropped, absolute values multiplied by 100, ``accuracy`` taken from the cost
    file or ``None``), on a different row window. ``model`` is accepted but not used.
    """
    rows = slice(20, 79)
    fairness = pd.read_csv(fair_path).iloc[rows, 1:]
    costs = pd.read_csv(cost_path).iloc[rows, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_first_training_time(cost_path, model):
    """Return ``Full_training_time`` at the start of a model-specific row window.

    The window is rows 34-78 for ``model == 'MLP'`` and rows 24-78 otherwise
    (first CSV column skipped). Returns ``None`` when the column is missing or
    the file has no row in that window.
    """
    start = 34 if model == 'MLP' else 24
    window = pd.read_csv(cost_path).iloc[start:79, 1:]
    if 'Full_training_time' in window.columns and len(window) > 0:
        return window['Full_training_time'].iloc[0]
    return None

# === Full traces ===
# The existence check does not depend on the model, so it is done once.
if not os.path.exists(directory_full):
    raise FileNotFoundError(f" Le dossier {directory_full} est introuvable. Vérifie son nom ou son chemin.")

# Single sorted listing: deterministic output order, and the cost-file lookup
# does not rescan the directory for every fairness file.
full_entries = sorted(os.listdir(directory_full))

for model in models:
    for filename in full_entries:
        if f"fair_metrics_{model}_" not in filename:
            continue
        run_id = extract_run_id(filename)
        if run_id is None:
            continue

        # Compare parsed run IDs rather than the raw suffix, so run 1 cannot be
        # paired with the cost file of run 11 or 21.
        cost_name = next(
            (f for f in full_entries
             if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
            None,
        )
        if cost_name is None:
            continue

        fair_file = os.path.join(directory_full, filename)
        cost_file = os.path.join(directory_full, cost_name)

        # Full-data runs go through load_epochs_full, the loader this cell
        # defines for them; the selection loader load_epochs was called here
        # by mistake, leaving load_epochs_full unused.
        df_metrics = load_epochs_full(fair_file, cost_file, model)
        full_time = load_first_training_time(cost_file, model)

        # Tag every row with the configuration and run it belongs to.
        df_metrics['dataset'] = 'kdd'
        df_metrics['model'] = model
        df_metrics['system'] = 'Full'
        df_metrics['ratio'] = 1.0
        df_metrics['Full_training_time'] = full_time
        df_metrics['runID'] = run_id
        df_metrics['epochID'] = list(range(len(df_metrics)))

        all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            system = directory_path_1.split('/')[-1]        # last path component, e.g. 'CRAIGPB'
            ratio = float(directory_path_2.split('_')[-1])  # '/kdd_0.05' -> 0.05

            # Combinations without a result directory are skipped.
            if not os.path.exists(directory):
                continue

            # One sorted listing per directory: deterministic order, and no
            # rescan of the directory for every fairness file.
            entries = sorted(os.listdir(directory))
            for filename in entries:
                if f"fair_metrics_{model}_" not in filename:
                    continue
                run_id = extract_run_id(filename)
                if run_id is None:
                    continue

                # Compare parsed run IDs rather than the raw suffix, so run 1
                # cannot be paired with the cost file of run 11 or 21.
                cost_name = next(
                    (f for f in entries
                     if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
                    None,
                )
                if cost_name is None:
                    continue

                fair_file = os.path.join(directory, filename)
                cost_file = os.path.join(directory, cost_name)

                df_metrics = load_epochs(fair_file, cost_file, model)
                full_time = load_first_training_time(cost_file, model)

                # Tag every row with the configuration and run it belongs to.
                df_metrics['dataset'] = 'kdd'
                df_metrics['model'] = model
                df_metrics['system'] = system
                df_metrics['ratio'] = ratio
                df_metrics['Full_training_time'] = full_time
                df_metrics['runID'] = run_id
                df_metrics['epochID'] = list(range(len(df_metrics)))

                all_epochs.append(df_metrics)


# Fail loudly instead of writing an empty trace file.
if not all_epochs:
    raise ValueError(" Aucun fichier n’a été traité. Vérifie les chemins et formats des fichiers.")
df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../results/test', exist_ok=True)
df_all_epochs.to_csv('../results/test/kdd_epoch_traces.csv', index=False)


In [None]:
import os
import re
import pandas as pd

# DC result locations: one directory per selection system, one sub-directory per ratio.
systems_path = [
    '../results/dc-selection/CRAIGPB',
    '../results/dc-selection/GLISTERPB',
    '../results/dc-selection/GradMatchPB',
    '../results/dc-selection/Random',
]
ratio_path = [
    '/dc_0.05',
    '/dc_0.1',
    '/dc_0.2',
    '/dc_0.3',
    '/dc_0.5',
]
directory_full = '../results/dc-selection/Full/dc_1'
models = ['LogReg', 'MLP', 'SVM']
excluded_columns = []  # fairness columns to drop before scaling (none)
all_epochs = []        # one trace frame per (model, system, ratio, run)


def extract_run_id(filename):
    """Return the integer that follows the last underscore in ``..._<digits>.csv``.

    Returns ``None`` when the name does not end with that pattern.
    """
    found = re.search(r'_(\d+)\.csv$', filename)
    if found is None:
        return None
    return int(found.group(1))

def load_epochs(fair_path, cost_path, model):
    """Load one run's fairness metrics and attach its accuracy.

    Rows 0-98 are kept for the 'MLP' and 'SVM' models and rows 0-248 for any
    other model; the first CSV column of both files is skipped. Fairness values
    are returned as absolute values multiplied by 100, after dropping
    ``excluded_columns``. ``accuracy`` comes from the cost file (``None`` when
    the file has no such column).
    """
    stop = 99 if (model == 'MLP' or model == 'SVM') else 249
    fairness = pd.read_csv(fair_path).iloc[0:stop, 1:]
    costs = pd.read_csv(cost_path).iloc[0:stop, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_first_training_time(cost_path, model):
    """Return ``Full_training_time`` at the start of a model-specific row window.

    The window is rows 44-98 for 'MLP', rows 29-98 for 'SVM' and rows 124-268
    for any other model (first CSV column skipped). Returns ``None`` when the
    column is missing or the file has no row in that window.
    """
    windows = {'MLP': (44, 99), 'SVM': (29, 99)}
    start, stop = windows.get(model, (124, 269))
    window = pd.read_csv(cost_path).iloc[start:stop, 1:]
    if 'Full_training_time' in window.columns and len(window) > 0:
        return window['Full_training_time'].iloc[0]
    return None

# === Full traces ===
# The existence check does not depend on the model, so it is done once.
if not os.path.exists(directory_full):
    raise FileNotFoundError(f" Dossier introuvable : {directory_full}")

# Single sorted listing: deterministic output order, and the cost-file lookup
# does not rescan the directory for every fairness file.
full_entries = sorted(os.listdir(directory_full))

for model in models:
    for filename in full_entries:
        if f"fair_metrics_{model}_" not in filename:
            continue
        run_id = extract_run_id(filename)
        if run_id is None:
            continue

        # Compare parsed run IDs rather than the raw suffix, so run 1 cannot be
        # paired with the cost file of run 11 or 21.
        cost_name = next(
            (f for f in full_entries
             if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
            None,
        )
        if cost_name is None:
            continue

        fair_file = os.path.join(directory_full, filename)
        cost_file = os.path.join(directory_full, cost_name)

        df_metrics = load_epochs(fair_file, cost_file, model)
        full_time = load_first_training_time(cost_file, model)

        # Tag every row with the configuration and run it belongs to.
        df_metrics['dataset'] = 'dc'
        df_metrics['model'] = model
        df_metrics['system'] = 'Full'
        df_metrics['ratio'] = 1.0
        df_metrics['Full_training_time'] = full_time
        df_metrics['runID'] = run_id
        df_metrics['epochID'] = list(range(len(df_metrics)))

        all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            system = directory_path_1.split('/')[-1]        # last path component, e.g. 'CRAIGPB'
            ratio = float(directory_path_2.split('_')[-1])  # '/dc_0.05' -> 0.05

            # Combinations without a result directory are skipped.
            if not os.path.exists(directory):
                continue

            # One sorted listing per directory: deterministic order, and no
            # rescan of the directory for every fairness file.
            entries = sorted(os.listdir(directory))
            for filename in entries:
                if f"fair_metrics_{model}_" not in filename:
                    continue
                run_id = extract_run_id(filename)
                if run_id is None:
                    continue

                # Compare parsed run IDs rather than the raw suffix, so run 1
                # cannot be paired with the cost file of run 11 or 21.
                cost_name = next(
                    (f for f in entries
                     if f.startswith(f"cost_metrics_{model}_") and extract_run_id(f) == run_id),
                    None,
                )
                if cost_name is None:
                    continue

                fair_file = os.path.join(directory, filename)
                cost_file = os.path.join(directory, cost_name)

                df_metrics = load_epochs(fair_file, cost_file, model)
                full_time = load_first_training_time(cost_file, model)

                # Tag every row with the configuration and run it belongs to.
                df_metrics['dataset'] = 'dc'
                df_metrics['model'] = model
                df_metrics['system'] = system
                df_metrics['ratio'] = ratio
                df_metrics['Full_training_time'] = full_time
                df_metrics['runID'] = run_id
                df_metrics['epochID'] = list(range(len(df_metrics)))

                all_epochs.append(df_metrics)


# Fail loudly instead of writing an empty trace file.
if not all_epochs:
    raise ValueError(" Aucun fichier traité. Vérifie les chemins, noms de fichiers ou extensions.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)
os.makedirs('../results/test', exist_ok=True)
df_all_epochs.to_csv('../results/test/dc_epoch_traces.csv', index=False)

In [12]:
import os
import re
import pandas as pd


# MobiAct result locations: one directory per selection system, one sub-directory per ratio.
systems_path = [
    '../results/mobiact-selection/CRAIGPB',
    '../results/mobiact-selection/GLISTERPB',
    '../results/mobiact-selection/GradMatchPB',
    '../results/mobiact-selection/Random',
]
ratio_path = [
    '/mobiact_0.05',
    '/mobiact_0.1',
    '/mobiact_0.2',
    '/mobiact_0.3',
    '/mobiact_0.5',
]
directory_full = '../results/mobiact-selection/Full/mobiact_1'
models = ['MLP']
excluded_columns = []  # fairness columns to drop before scaling (none)
all_epochs = []        # one trace frame per (model, system, ratio, run)


def extract_run_id(filename):
    """Return the integer that follows the last underscore in ``..._<digits>.csv``.

    Returns ``None`` when the name does not end with that pattern.
    """
    found = re.search(r'_(\d+)\.csv$', filename)
    if found is None:
        return None
    return int(found.group(1))

def load_epochs(fair_path, cost_path, system):
    """Load one run's fairness metrics and attach its accuracy.

    Rows 0-88 are kept when ``system == 'GradMatchPB'`` and rows 10-98 for any
    other system; the first CSV column of both files is skipped. Fairness values
    are returned as absolute values multiplied by 100, after dropping
    ``excluded_columns``. ``accuracy`` comes from the cost file (``None`` when
    the file has no such column).
    """
    rows = slice(0, 89) if system == 'GradMatchPB' else slice(10, 99)
    fairness = pd.read_csv(fair_path).iloc[rows, 1:]
    costs = pd.read_csv(cost_path).iloc[rows, 1:]

    scaled = fairness.drop(columns=excluded_columns, errors='ignore').abs() * 100
    if 'accuracy' in costs.columns:
        scaled['accuracy'] = costs['accuracy']
    else:
        scaled['accuracy'] = None
    return scaled

def load_first_training_time(cost_path):
    """Return ``Full_training_time`` at row 19 of a cost-metrics CSV.

    Only rows 19-78 are considered (first CSV column skipped). Returns ``None``
    when the column is missing or the file has no row in that window.
    """
    window = pd.read_csv(cost_path).iloc[19:79, 1:]
    if 'Full_training_time' in window.columns and len(window) > 0:
        return window['Full_training_time'].iloc[0]
    return None

# === Full traces ===
# The directory is listed once, in sorted order: the output no longer depends on
# the arbitrary order of os.listdir, and the cost-file lookup does not rescan
# the directory for every fairness file.
full_entries = sorted(os.listdir(directory_full))

for model in models:
    for filename in full_entries:
        if not filename.startswith("train_mobiact_fair_metrics_"):
            continue
        run_id = extract_run_id(filename)
        if run_id is None:
            continue

        # Compare parsed run IDs rather than the raw suffix, so run 1 cannot be
        # paired with the cost file of run 11 or 21.
        cost_name = next(
            (f for f in full_entries
             if f.startswith("train_mobiact_cost_metrics_") and extract_run_id(f) == run_id),
            None,
        )
        if cost_name is None:
            continue

        fair_file = os.path.join(directory_full, filename)
        cost_file = os.path.join(directory_full, cost_name)

        df_metrics = load_epochs(fair_file, cost_file, 'Full')
        full_time = load_first_training_time(cost_file)

        # Tag every row with the configuration and run it belongs to.
        df_metrics['dataset'] = 'mobiact'
        df_metrics['model'] = model
        df_metrics['system'] = 'Full'
        df_metrics['ratio'] = 1.0
        df_metrics['Full_training_time'] = full_time
        df_metrics['runID'] = run_id
        df_metrics['epochID'] = list(range(len(df_metrics)))

        all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            system = directory_path_1.split('/')[-1]        # last path component, e.g. 'CRAIGPB'
            ratio = float(directory_path_2.split('_')[-1])  # '/mobiact_0.05' -> 0.05

            # One sorted listing per directory: deterministic order, and no
            # rescan of the directory for every fairness file.
            entries = sorted(os.listdir(directory))
            for filename in entries:
                if not filename.startswith("train_mobiact_fair_metrics_"):
                    continue
                run_id = extract_run_id(filename)
                if run_id is None:
                    continue

                # Compare parsed run IDs rather than the raw suffix, so run 1
                # cannot be paired with the cost file of run 11 or 21.
                cost_name = next(
                    (f for f in entries
                     if f.startswith("train_mobiact_cost_metrics_") and extract_run_id(f) == run_id),
                    None,
                )
                if cost_name is None:
                    continue

                fair_file = os.path.join(directory, filename)
                cost_file = os.path.join(directory, cost_name)

                df_metrics = load_epochs(fair_file, cost_file, system)
                full_time = load_first_training_time(cost_file)

                # Tag every row with the configuration and run it belongs to.
                df_metrics['dataset'] = 'mobiact'
                df_metrics['model'] = model
                df_metrics['system'] = system
                df_metrics['ratio'] = ratio
                df_metrics['Full_training_time'] = full_time
                df_metrics['runID'] = run_id
                df_metrics['epochID'] = list(range(len(df_metrics)))

                all_epochs.append(df_metrics)


# Fail loudly instead of writing an empty trace file.
if not all_epochs:
    raise ValueError(" Aucun fichier traité pour Mobiact.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)
os.makedirs('../results/test', exist_ok=True)
df_all_epochs.to_csv('../results/test/mobiact_epoch_traces.csv', index=False)

In [13]:
import pandas as pd

def concatenate_csv_files(file_paths):
    """Read several trace CSVs and stack them into one frame with a fixed layout.

    Only the columns named in the target layout are read from each file; columns
    a file lacks are added and filled with ``pd.NA``. Every frame is put in the
    same column order before being concatenated under a fresh integer index.
    """
    target_columns = [
       'SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender',
       'SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race', 'SPD_age',
       'EOD_age', 'AOD_age', 'DI_age', 'DcI_age', 'F1_score', 'Precision',
       'Recall', 'accuracy', 'dataset', 'model', 'system', 'ratio',
       'Full_training_time', 'runID', 'epochID'
    ]

    def _load_aligned(path):
        """Read one file restricted to, and ordered like, ``target_columns``."""
        frame = pd.read_csv(path, usecols=lambda name: name in target_columns, engine='python')
        for name in target_columns:
            if name not in frame.columns:
                frame[name] = pd.NA
        return frame.reindex(columns=target_columns)

    return pd.concat([_load_aligned(path) for path in file_paths], ignore_index=True)

In [14]:
# Per-dataset trace files written by the cells above, merged into one table.
file_paths = [
    '../results/test/ars_epoch_traces.csv',
    '../results/test/dc_epoch_traces.csv',
    '../results/test//mobiact_epoch_traces.csv',
    '../results/test/census_epoch_traces.csv',
    '../results/test//kdd_epoch_traces.csv',
]
result_df = concatenate_csv_files(file_paths)
result_df.to_csv('../results/test/epoch_traces.csv', index=False)

# Total number of epoch rows across all datasets.
print(len(result_df))

125595


In [17]:
import pandas as pd


# Merged epoch traces, and the configuration table used to tag them with an EC_ID.
df_resultats = pd.read_csv("../results/test/epoch_traces.csv")
df_config = pd.read_csv("../results/test/ExperimentConfigurations.csv")


def clean_text(val):
    """Map a raw model/dataset/system label to its canonical spelling.

    The value is converted to ``str`` and stripped, then looked up
    case-insensitively. Labels without an entry are returned stripped but
    otherwise unchanged.
    """
    val = str(val).strip()
    # Keys must be lower-case: the lookup below is done on val.lower().
    replacements = {
        'logreg': 'LR',
        'mlp': 'MLP',
        'svm': 'SVM',
        'dc': 'DC',
        'census': 'Adult',
        'ars': 'ARS',
        'kdd': 'KDD',
        'mobiact': 'MobiAct',
        'craig': 'Craig',
        'glister': 'Glister',
        'gradmatch': 'GradMatch',
        # System labels taken from the result directory names end in "PB"
        # (CRAIGPB, GLISTERPB, GradMatchPB); without these entries they were
        # left as-is and could not match a config labelled Craig/Glister/GradMatch.
        'craigpb': 'Craig',
        'glisterpb': 'Glister',
        'gradmatchpb': 'GradMatch',
        'full': 'Full',
        'random': 'Random',
    }
    return replacements.get(val.lower(), val)


# The configuration table written by the "Experiment Configurations" cell uses
# descriptive headers; accept them as aliases of the short names used below.
# Headers that are already short are left untouched.
df_config = df_config.rename(columns={
    "Dataset": "dataset",
    "Model": "model",
    "Selection method": "system",
    "Selection ratio": "ratio",
})

# Bring the labels of both tables to the same canonical spelling.
for label_column in ["model", "dataset", "system"]:
    df_resultats[label_column] = df_resultats[label_column].apply(clean_text)
    df_config[label_column] = df_config[label_column].apply(clean_text)

df_resultats["ratio"] = df_resultats["ratio"].astype(float)
df_config["ratio"] = df_config["ratio"].astype(float)

# (dataset, model, system, ratio) -> EC_ID
key_columns = ["dataset", "model", "system", "ratio"]
config_dict = dict(zip(
    df_config[key_columns].itertuples(index=False, name=None),
    df_config["EC_ID"],
))

# Plain tuple iteration replaces the row-by-row iterrows lookup, which is very
# slow on the full trace table. Rows without a matching configuration get None.
ec_ids = [
    config_dict.get(key)
    for key in df_resultats[key_columns].itertuples(index=False, name=None)
]

df_resultats["EC_ID"] = ec_ids


df_resultats.to_csv("../results/test/epoch_traces_EC_ID.csv", index=False)


In [None]:
import pandas as pd

file_path = '../results/test/epoch_traces_EC_ID.csv'

# Column layout of the published measurements file.
desired_order = [
    'EC_ID', 'Run ID', 'Epoch ID', 'Time', 'Accuracy',
    'F1_score', 'Precision', 'Recall',
    'SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender',
    'SPD_age', 'EOD_age', 'AOD_age', 'DI_age', 'DcI_age',
    'SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race'
]

# Load the tagged traces and switch to the published column names.
df = pd.read_csv(file_path).rename(columns={
    'runID': 'Run ID',
    'epochID': 'Epoch ID',
    'accuracy': 'Accuracy',
    'Full_training_time': 'Time'
})

# Keep only the columns of the layout that are actually present, in layout order.
columns_to_keep = [name for name in desired_order if name in df.columns]
df = df.loc[:, columns_to_keep]

df.to_csv('../traces/ExperimentMeasurements.csv', index=False)


Experiment Statistics

In [1]:
import pandas as pd

path_var = "../results/test/std_avg_vc_ttest_results_with_vc_5c-w-random.csv"
path_sel = "../results/test/ttest_5-5c-w-random.csv"
join_cols = ['dataset', 'model', 'system', 'ratio']

# Suffix the test_* columns of the first table with "_var" so they stay
# distinguishable from the columns of the second table after the merge.
df_var = pd.read_csv(path_var).rename(
    columns=lambda col: col + "_var" if col.startswith("test_") else col
)
df_sel = pd.read_csv(path_sel)

# Keep only the configurations present in both tables.
df_merged = df_sel.merge(df_var, on=join_cols, how='inner')

df_merged.to_csv("../results/test/t-tests.csv", index=False)


In [2]:
import pandas as pd


file_path = "../results/test/t-tests.csv"

# Align the abbreviated test_* column names with the metric names used elsewhere.
rename_map = {
    'test_acc': 'test_accuracy',
    'test_acc_var': 'test_accuracy_var',
    'test_f1': 'test_F1_score',
    'test_f1_var': 'test_F1_score_var',
    'test_recall': 'test_Recall',
    'test_recall_var': 'test_Recall_var',
    'test_precision': 'test_Precision',
    'test_precision_var': 'test_Precision_var'
}

# The file is rewritten in place with the new column names.
df = pd.read_csv(file_path).rename(columns=rename_map)
df.to_csv(file_path, index=False)


In [3]:
import pandas as pd


df = pd.read_csv("../results/test/t-tests.csv")

# Columns identifying a configuration; copied unchanged to every output row.
id_vars = ['dataset', 'model', 'system', 'ratio']

metrics = [
    'Full_training_time', 'accuracy', 'Precision', 'Recall', 'F1_score',
    'SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender',
    'SPD_age', 'EOD_age', 'AOD_age', 'DI_age', 'DcI_age',
    'SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race'
]

# Output column -> pattern of the source column. A pattern containing '{}' is
# filled with the metric name; any other pattern is appended to the metric name.
suffixes = {
    'Mean': '_avg_avg',
    'Standard deviation': '_std_avg',
    'Variability coefficient': '_vc_avg',
    'Selection impact': 'test_{}',
    'Variability impact': 'test_{}_var'
}

# Wide -> long: one output row per (configuration, metric).
rows = []

for _, row in df.iterrows():
    for metric in metrics:
        new_row = {key: row[key] for key in id_vars}
        new_row["Evaluation metric"] = metric

        for col_name, suffix in suffixes.items():
            if '{}' in suffix:
                column_name = suffix.format(metric)
            else:
                column_name = metric + suffix

            # The selection-impact test of the training time may be stored as
            # "test_time". This special case used to compare against "SelImpact",
            # which is not a key of `suffixes`, so it never applied. The generic
            # name is kept as a fallback when "test_time" is absent.
            # NOTE(review): a matching "test_time_var" alias for 'Variability
            # impact' may be needed too - confirm against the input file.
            if (metric == "Full_training_time"
                    and col_name == "Selection impact"
                    and "test_time" in df.columns):
                column_name = "test_time"

            # Statistics without a source column are recorded as None.
            new_row[col_name] = row[column_name] if column_name in df.columns else None

        rows.append(new_row)

df_reshaped = pd.DataFrame(rows)

def should_remove(row):
    """Tell whether a (dataset, metric) row must be dropped.

    Rows are dropped for age and race metrics on the ``ars`` dataset, and for
    race metrics on datasets whose name starts with ``mobiact`` or ``dc``.
    The dataset name is compared case-insensitively.
    """
    metric = row["Evaluation metric"]
    dataset = row["dataset"].lower()

    if dataset == "ars":
        return metric.endswith(("_age", "_race"))
    if dataset.startswith(("mobiact", "dc")):
        return metric.endswith("_race")
    return False

# Drop the (dataset, metric) rows flagged by should_remove.
rows_to_drop = df_reshaped.apply(should_remove, axis=1)
df_filtered = df_reshaped[~rows_to_drop]

# Save to a new CSV file
df_filtered.to_csv("../results/test/t-tests-metrics.csv", index=False)


In [1]:
import pandas as pd


# Long-format statistics, and the configuration table used to tag them with an EC_ID.
df_resultats = pd.read_csv("../results/test/t-tests-metrics.csv")
df_config = pd.read_csv("../results/test/ExperimentConfigurations.csv")


def clean_text(val):
    """Map a raw model/dataset/system label to its canonical spelling.

    The value is converted to ``str`` and stripped, then looked up
    case-insensitively. Labels without an entry are returned stripped but
    otherwise unchanged.
    """
    val = str(val).strip()
    # Keys must be lower-case: the lookup below is done on val.lower().
    replacements = {
        'logreg': 'LR',
        'mlp': 'MLP',
        'svm': 'SVM',
        'dc': 'DC',
        'census': 'Adult',
        'ars': 'ARS',
        'kdd': 'KDD',
        'mobiact': 'MobiAct',
        # Both the plain method names and the "PB"-suffixed directory names are
        # accepted, so any casing of either form ends up canonical.
        'craig': 'Craig',
        'glister': 'Glister',
        'gradmatch': 'GradMatch',
        'gradmatchpb': 'GradMatch',
        'craigpb': 'Craig',
        'glisterpb': 'Glister',
        'full': 'Full',
        'random': 'Random',
    }
    return replacements.get(val.lower(), val)


# The configuration table written by the "Experiment Configurations" cell uses
# descriptive headers; accept them as aliases of the short names used below.
# Headers that are already short are left untouched.
df_config = df_config.rename(columns={
    "Dataset": "dataset",
    "Model": "model",
    "Selection method": "system",
    "Selection ratio": "ratio",
})

# Bring the labels of both tables to the same canonical spelling.
for label_column in ["model", "dataset", "system"]:
    df_resultats[label_column] = df_resultats[label_column].apply(clean_text)
    df_config[label_column] = df_config[label_column].apply(clean_text)

df_resultats["ratio"] = df_resultats["ratio"].astype(float)
df_config["ratio"] = df_config["ratio"].astype(float)

# (dataset, model, system, ratio) -> EC_ID
key_columns = ["dataset", "model", "system", "ratio"]
config_dict = dict(zip(
    df_config[key_columns].itertuples(index=False, name=None),
    df_config["EC_ID"],
))

# Plain tuple iteration instead of a row-by-row iterrows lookup. Rows without a
# matching configuration get None.
ec_ids = [
    config_dict.get(key)
    for key in df_resultats[key_columns].itertuples(index=False, name=None)
]

df_resultats["EC_ID"] = ec_ids


df_resultats.to_csv("../results/test/t-tests-metrics_ECID.csv", index=False)


In [None]:
import pandas as pd

file_path = "../results/test/t-tests-metrics_ECID.csv"

# Training-time rows are left out of the published statistics, and the
# configuration columns are dropped in favour of EC_ID.
df = pd.read_csv(file_path)
df = (
    df.loc[df["Evaluation metric"] != "Full_training_time"]
    .drop(columns=["dataset", "model", "system", "ratio"], errors="ignore")
)

# Move EC_ID to the front when it is present; other columns keep their order.
cols = df.columns.tolist()
if "EC_ID" in cols:
    cols = ["EC_ID"] + [name for name in cols if name != "EC_ID"]
    df = df[cols]

df.to_csv("../traces/ExperimentStatistics.csv", index=False)
