Implementation of attack models

## Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as spio
from os.path import join as osj
import pandas as pd
import seaborn as sns
import random
import pickle
import os

import logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


This notebook trains and tests multiple attacker models for membership and attribute inference.

The best- and worst-performing DP setups are used here to further analyse the level of privacy they provide.

## Setup for Attribute Inference

### Functions

In [3]:
def get_dp_signals(m, e):
    """
    Unpickle and return the content of ``../data_dp/{m}_{e}.pkl``.

    Parameters:
        m: first part of the file name (this notebook passes the mechanism name)
        e: second part of the file name (this notebook passes the file-level epsilon)

    Returns:
        The object stored in the pickle file.
    """
    dp_path = osj("..", "data_dp", f"{m}_{e}.pkl")
    with open(dp_path, "rb") as handle:
        signals = pickle.load(handle)
    return signals

def get_real_data():
    """
    Load the signals stored in ``../data/s2s_mitbih_aami.mat``.

    Returns:
        The ``seg_values`` field of the first entry of the ``s2s_mitbih`` variable.
    """
    mat_contents = spio.loadmat('../data/s2s_mitbih_aami.mat')
    return mat_contents['s2s_mitbih'][0]['seg_values']

def get_patient_infos():
    """
    Unpickle and return the content of ``../data/patient_infos.pkl``.

    Returns:
        The object stored in the pickle file.
    """
    infos_path = osj("..", "data", "patient_infos.pkl")
    with open(infos_path, "rb") as handle:
        patient_infos = pickle.load(handle)
    return patient_infos

# def get_ids():
#     with open(osj("..", "data", "all_patients.pkl"), "rb") as f:
#         return pickle.load(f)

def get_patient_attribute(attribute, data):
    """
    Collect one attribute for every patient.

    Parameters:
        attribute: key that is read from every patient entry
        data: mapping of patient id -> mapping of attribute values

    Returns:
        list with the attribute values in the iteration order of ``data``
        (entries are keyed by the string form of the id, so ids with the
        same string form share a single entry)
    """
    by_patient = {}
    for pid, info in data.items():
        by_patient[str(pid)] = info[attribute]
    return list(by_patient.values())

def read_intra_attack_setups():
    """Read ``../results_dp/attack_setup_intra.csv`` and return it as a DataFrame."""
    return pd.read_csv('../results_dp/attack_setup_intra.csv')

def read_inter_attack_setups():
    """Read ``../results_dp/attack_setup_inter.csv`` and return it as a DataFrame."""
    return pd.read_csv('../results_dp/attack_setup_inter.csv')

def get_attack_setups():
    """
    Collects the attack setups of the intra and inter patient attacks.

    Both setup tables are read, the "Intra-" / "Inter-" prefix is removed from
    the model names, the tables are combined and fully identical rows are dropped.

    Returns:
        dict_all_setups: dict with structure {model: {epsilon: [delta]}}
        n_setups: int, number of rows of the combined, de-duplicated table
    """
    sort_columns = ["Model", "Epsilon", "Delta"]
    sources = (
        (read_intra_attack_setups, "Intra-"),
        (read_inter_attack_setups, "Inter-"),
    )

    # per source: read, sort descending, strip the attack-type prefix from the model name
    frames = []
    for reader, prefix in sources:
        setups = reader().sort_values(by=sort_columns, ascending=False)
        setups["Model"] = setups["Model"].str.replace(prefix, "", regex=False)
        frames.append(setups)

    # combine both sources, drop identical rows and sort descending again
    all_setups = pd.concat(frames, axis=0)
    all_setups = all_setups.drop_duplicates(ignore_index=True)
    all_setups = all_setups.sort_values(by=sort_columns, ascending=False)

    # group the deltas by model and epsilon, keeping the sorted row order
    dict_all_setups = {}
    rows = zip(all_setups["Model"], all_setups["Epsilon"], all_setups["Delta"])
    for model, epsilon, delta in rows:
        dict_all_setups.setdefault(model, {}).setdefault(epsilon, []).append(delta)

    return dict_all_setups, len(all_setups)


def flatten_data(signals, attr_array, n_beats=1500, beat_length=280):
    """
    Flatten the signals to have the leading beats of a patient in one array.

    For every patient the first ``n_beats`` beats are taken and the first
    ``beat_length`` values of each of them are written one after another.

    Parameters:
        signals: ndarray with structure [patient][segment][1][beat (280,)]
        attr_array: List or ndarray, target attribute
        n_beats: int, number of beats used per patient (default: 1500)
        beat_length: int, number of values used per beat (default: 280)

    Returns:
        X: ndarray in format [n_patients, n_beats * beat_length]
        y: ndarray in format [n_patients]

    Raises:
        IndexError: if a patient has fewer than ``n_beats`` beats or a beat
            has fewer than ``beat_length`` values
    """
    # PREPARE X
    rows = []
    for patient_idx in range(len(signals)):
        patient_beats = signals[patient_idx]

        values = []
        for beat_idx in range(n_beats):
            # convert a whole beat at once instead of calling .item() per value
            beat = np.asarray(patient_beats[beat_idx][0]).ravel()
            if beat.size < beat_length:
                raise IndexError(
                    f"beat {beat_idx} of patient {patient_idx} has {beat.size} values, "
                    f"expected at least {beat_length}"
                )
            # tolist() yields plain Python scalars, exactly like .item() did
            values.extend(beat[:beat_length].tolist())

        rows.append(values)

    X = np.array(rows)

    # PREPARE Y
    y = np.array(attr_array)

    return X, y

def train_inference(X, y, binary):
    """
    Trains an attribute inference model using Random Forest Classifier and
    evaluates it on a stratified hold-out split (20 % of the samples).

    Parameters:
        X: ndarray with shape [n_patients, n_values]
        y: ndarray with shape [n_patients], class labels
        binary: bool, if truthy precision/recall/f1 are computed with
                average="binary", otherwise with average="macro"

    Returns:
        metrics: dict with accuracy, precision, recall, f1-score, mae, mse, r2
    """

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        random_state=42
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)

    # A single if/else guarantees pre/rec/f1 are always bound; the former
    # `== True` / `== False` chain left them undefined for any other value.
    average = "binary" if binary else "macro"
    # zero_division=0 returns the same value as the default ("warn" also
    # yields 0) but without emitting a warning when a class is never predicted.
    pre, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average=average, zero_division=0
    )

    # error metrics are computed directly on the class labels
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # save the metrics
    metrics = {
        "acc": acc,
        "rec": rec,
        "pre": pre,
        "f1": f1,
        "mae": mae,
        "mse": mse,
        "r2": r2
    }

    return metrics

def save_attack_performance(attack, metrics):
    """
    Pickle the metrics to ``../results_attacks/{attack}_performance.pkl``.

    Parameters:
        attack: str, attack name used as file name prefix
        metrics: object to store (this notebook passes nested dicts of metrics)
    """
    results_dir = osj("..", "results_attacks")
    # the very first save must not fail only because the folder is missing
    os.makedirs(results_dir, exist_ok=True)
    with open(osj(results_dir, f"{attack}_performance.pkl"), "wb") as f:
        pickle.dump(metrics, f)

def load_attack_performance(attack):
    """
    Unpickle and return the content of ``../results_attacks/{attack}_performance.pkl``.

    Parameters:
        attack: str, attack name used as file name prefix

    Returns:
        The object stored in the pickle file.
    """
    performance_path = osj("..", "results_attacks", f"{attack}_performance.pkl")
    with open(performance_path, "rb") as handle:
        return pickle.load(handle)

### Attack on real data

In [5]:
# Attribute inference on the real (unprotected) signals.
# The results are stored under the key "no_dp" of the AIA performance file;
# attributes that already have a stored result are skipped.
attack = "AIA" # Attribute Inference Attack
attributes = ["gender", "age"]
patient_infos = get_patient_infos()
real_signals = get_real_data()

# Resume from previously stored results; start empty on the very first run
# instead of failing because the performance file does not exist yet.
if os.path.exists(osj("..", "results_attacks", f"{attack}_performance.pkl")):
    aia_metrics = load_attack_performance(attack)
else:
    aia_metrics = {}

mechanism = "no_dp"
baseline_metrics = aia_metrics.setdefault(mechanism, {})

### ITERATION OVER ATTRIBUTES ###
for attribute in attributes:

    if attribute in baseline_metrics:
        logger.info(f"Skipping existing {attribute} inference")
        continue

    attr_array = get_patient_attribute(attribute, patient_infos)
    X, y = flatten_data(real_signals, attr_array)
    logger.info(f"Training {attribute} inference")

    if attribute == "gender":
        # 'M' is encoded as 0, every other value as 1
        y = np.where(y == 'M', 0, 1)
        metrics = train_inference(X, y, binary=True)

    elif attribute == "age":
        # y has two entries with age "-1" which need to be removed
        valid_patients = y >= 0
        X = X[valid_patients]
        y = y[valid_patients]
        # age groups (0, 40], (40, 70], (70, 100] become the classes 0, 1, 2
        y_grouped = pd.cut(y, bins=[0, 40, 70, 100], labels=["0", "1", "2"]).astype(int)
        metrics = train_inference(X, y_grouped, binary=False)

    else:
        # never store metrics of a previous attribute under an unsupported one
        logger.warning(f"No inference defined for attribute {attribute}, skipping.")
        continue

    # the result is only written once training succeeded (no empty placeholder)
    baseline_metrics[attribute] = metrics

    # save the metrics
    save_attack_performance(attack, aia_metrics)
    logger.info(f"Saved attack performance for {attack}.")


2025-05-14 17:20:35 - INFO - Skipping existing gender inference
2025-05-14 17:20:35 - INFO - Skipping existing age inference


### Attack on private data

In [4]:
# Load the attack setups ({model: {epsilon: [delta]}}) together with the number of setups
dict_setups, len_setups = get_attack_setups()

In [6]:
# Inspect the epsilon values that are configured for the "laplace" model
dict_setups["laplace"].keys()


dict_keys([0.91, 0.81, 0.71, 0.61, 0.51, 0.081, 0.071, 0.061, 0.051, 0.041, 0.031, 0.021, 0.01, 0.001])

In [10]:
# Attribute inference on the differentially private signals.
# For every attribute all (mechanism, epsilon, delta) setups are visited;
# setups that already have a stored result are skipped, the others are trained.
attack = "AIA" # Attribute Inference Attack
attributes = ["gender", "age"]
patient_infos = get_patient_infos()
dict_all_setups, len_setups = get_attack_setups()

# File-level epsilons in ascending order: a setup is read from the first file
# whose epsilon is not smaller than the setup's epsilon.
file_epsilons = (0.091, 0.91, 2.01)

# Load the attack performance metrics if exists; missing mechanism / epsilon /
# delta entries are created while iterating below.
if os.path.exists(osj("..", "results_attacks", f"{attack}_performance.pkl")):
    aia_metrics = load_attack_performance(attack)
else:
    aia_metrics = {}

for attribute in attributes:
    counter = 0  # position of the current setup, used for progress logging

    attr_array = get_patient_attribute(attribute, patient_infos)

    ###### PREPARE ITERATIONS ######
    for mechanism in dict_all_setups:
        mechanism_metrics = aia_metrics.setdefault(mechanism, {})

        # Remember which file is actually in memory. Tracking the epsilon of
        # the last *visited* setup instead could reuse a stale file after a
        # block of skipped setups, or reload the same file for every delta.
        ekg_signals_dp = None
        loaded_file_epsilon = None

        for epsilon, deltas in dict_all_setups[mechanism].items():
            epsilon_metrics = mechanism_metrics.setdefault(epsilon, {})

            file_epsilon = next((bound for bound in file_epsilons if epsilon <= bound), None)
            if file_epsilon is None:
                # no file covers this epsilon; do not fall back to a stale one
                counter += len(deltas)
                logger.warning(f"{attribute}: No data file covers epsilon {epsilon} for {mechanism}, skipping.")
                continue

            for delta in deltas:
                counter += 1
                delta_metrics = epsilon_metrics.setdefault(delta, {name: None for name in attributes})

                if delta_metrics.get(attribute) is not None:  # attribute was trained already
                    logger.info(f"{attribute}: Skipping existing inference for {mechanism}, {epsilon}, {delta} - ({counter}/{len_setups})")
                    continue

                if loaded_file_epsilon != file_epsilon:
                    logger.info(f"Loading {mechanism} data until epsilon {file_epsilon} (1-2 minutes) ...")
                    ekg_signals_dp = get_dp_signals(mechanism, file_epsilon)
                    loaded_file_epsilon = file_epsilon

                # Only the lookup is guarded, so a KeyError raised while
                # training is no longer reported as missing data.
                try:
                    dp_signals = ekg_signals_dp[epsilon][delta]
                except KeyError:
                    logger.info(f"{attribute}: No data available for {mechanism} with epsilon {epsilon} and delta {delta}.")
                    continue

                X, y = flatten_data(dp_signals, attr_array)
                logger.info(f"Training {attribute} inference ({counter}/{len_setups})")

                if attribute == "gender":
                    # 'M' is encoded as 0, every other value as 1
                    y = np.where(y == 'M', 0, 1)
                    metrics = train_inference(X, y, binary=True)

                elif attribute == "age":
                    # y has two entries with age "-1" which need to be removed
                    valid_patients = y >= 0
                    X = X[valid_patients]
                    y = y[valid_patients]
                    # age groups (0, 40], (40, 70], (70, 100] become the classes 0, 1, 2
                    y_grouped = pd.cut(y, bins=[0, 40, 70, 100], labels=["0", "1", "2"]).astype(int)
                    metrics = train_inference(X, y_grouped, binary=False)

                else:
                    logger.warning(f"No inference defined for attribute {attribute}, skipping.")
                    continue

                delta_metrics[attribute] = metrics

    # save the metrics
    save_attack_performance(attack, aia_metrics)
    logger.info(f"Saved attack performance for {attack}.")

2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.91, 0.6 - (1/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.91, 0.0 - (2/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.81, 1.0 - (3/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.81, 0.0 - (4/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.71, 0.0 - (5/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.61, 0.6 - (6/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.61, 0.2 - (7/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.61, 0.0 - (8/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.51, 0.0 - (9/54)
2025-05-13 22:12:08 - INFO - gender: Skipping existing inference for laplace, 0.081, 1.0 - (10/54)
2025-05-13 22:12:08 - INFO -

2025-05-13 22:12:15 - INFO - gender: No data available for bounded_n with epsilon 0.01 and delta 0.1.
2025-05-13 22:12:16 - INFO - Saved attack performance for AIA.
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.91, 0.6 - (1/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.91, 0.0 - (2/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.81, 1.0 - (3/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.81, 0.0 - (4/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.71, 0.0 - (5/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.61, 0.6 - (6/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.61, 0.2 - (7/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.61, 0.0 - (8/54)
2025-05-13 22:12:16 - INFO - age: Skipping existing inference for laplace, 0.51, 0.

In [29]:
# Reload the stored attribute inference results and display them
aia_metrics = load_attack_performance("AIA")
aia_metrics

{'laplace': {0.91: {0.6: {'gender': {'acc': 0.6,
     'rec': 0.6,
     'pre': 0.6,
     'f1': 0.6,
     'mae': 0.4,
     'mse': 0.4,
     'r2': -0.6000000000000001},
    'age': {'acc': 0.5,
     'rec': 0.3333333333333333,
     'pre': 0.16666666666666666,
     'f1': 0.2222222222222222,
     'mae': 0.5,
     'mse': 0.5,
     'r2': -0.21951219512195141}},
   0.0: {'gender': {'acc': 0.6,
     'rec': 0.6,
     'pre': 0.6,
     'f1': 0.6,
     'mae': 0.4,
     'mse': 0.4,
     'r2': -0.6000000000000001},
    'age': {'acc': 0.4,
     'rec': 0.26666666666666666,
     'pre': 0.14814814814814814,
     'f1': 0.19047619047619047,
     'mae': 0.6,
     'mse': 0.6,
     'r2': -0.46341463414634165}}},
  0.81: {1.0: {'gender': {'acc': 0.7,
     'rec': 0.8,
     'pre': 0.6666666666666666,
     'f1': 0.7272727272727273,
     'mae': 0.3,
     'mse': 0.3,
     'r2': -0.19999999999999996},
    'age': {'acc': 0.2,
     'rec': 0.13333333333333333,
     'pre': 0.09523809523809523,
     'f1': 0.111111111111111

# Archive: Manual Attack Testing

In [None]:
# def get_dp_signals(m, e):
#     with open(osj("..", "data_dp", f"{m}_{e}.pkl"), "rb") as f:
#         return pickle.load(f)

# def get_real_data():
#     dict_samples = spio.loadmat('../data/s2s_mitbih_aami.mat')
#     samples = dict_samples['s2s_mitbih']
#     values = samples[0]['seg_values']
#     return values

# def get_patient_infos():
#     with open(osj("..", "data", "patient_infos.pkl"), "rb") as f:
#         return pickle.load(f)

# def get_ids():
#     with open(osj("..", "data", "all_patients.pkl"), "rb") as f:
#         return pickle.load(f)

# def get_patient_attribute(attribute, data):
#     attr_dict = {str(pid): data[attribute] for pid, data in data.items()}
#     attr_array = list(attr_dict.values())
#     return attr_array

# def prepare_inference_data(signals, attr_array, beats_per_patient=1000, beats_separated=True):
#     """
#     Prepare the data for attribute inference attack.

#     Parameters:
#         signals: ndarray with structure [patient][segment][1][beat (280,)]
#         attr_array: List or ndarray, target attribute
#         beats_per_patient: int, Count of beats per patient (default: 1000)
#         beats_separated: bool, if every beat is separated (default: True)

#     Returns:
#         X: ndarray in format [n_patients, beats_per_patient, beat_length] (if beats_separated=False)
#            or [total_beats, beat_length] (if beats_separated=True)
#         y: ndarray in format [n_patients] (if beats_separated=False)
#            or [total_beats] (if beats_separated=True)
#     """
#     X = []
#     y = []

#     for patient_idx in range(len(signals)):
#         attr = attr_array[patient_idx]
#         segments = signals[patient_idx] 

#         patient_beats = np.concatenate([segment[0] for segment in segments], axis=0)
            
#         # Sampling
#         n_beats = min(beats_per_patient, len(patient_beats))
#         sampled_beats = patient_beats[np.random.choice(len(patient_beats), n_beats, replace=False)]

#         if beats_separated:
#             X.append(sampled_beats)
#             y.append(np.full(n_beats, attr))
#         else:
#             if len(sampled_beats) < beats_per_patient:
#                 # Padding with zeros if less than beats_per_patient
#                 padding = np.zeros((beats_per_patient - len(sampled_beats), sampled_beats.shape[1]))
#                 sampled_beats = np.vstack((sampled_beats, padding))
#             X.append(sampled_beats)
#             y.append(attr)

#     if beats_separated:
#         X = np.concatenate(X, axis=0)  # shape: [n_beats, 280]
#         y = np.concatenate(y, axis=0)  # shape: [n_beats]
#     else:
#         X = np.array(X) # shape: [n_patients, beats_per_patient, 280]
#         y = np.array(y) # shape: [n_patients]
    
#     return X, y

# def train_attribute_inference(signals, attr_array, attr_task="classification"):
#     """
#     signals: ndarray with structure [patient][segment][1][beat (280,)]
#     patient_metadata: dict with structure {patient_index: {"gender": 0/1, ...}}
#     attr_task: str, type of model to use for inference (default: "randomForest")

#     Returns:
#         y_test: real labels
#         y_pred: predicted labels
#         pred_classes: predicted classes (for classification tasks)
#     """

#     if attr_task == "classification":

#         X, y = prepare_inference_data(signals, attr_array, beats_per_patient=1000, beats_separated=True)

#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
#         clf = RandomForestClassifier(
#             n_estimators=100,
#             max_depth=20,
#             n_jobs=-1,
#             random_state=42
#         )

#         clf.fit(X_train, y_train)
#         y_pred = clf.predict(X_test)
#         pred_classes = list(clf.classes_) 

#         return y_test, y_pred, pred_classes
    
#     elif attr_task == "regression":

#         X, y = prepare_inference_data(signals, attr_array, beats_per_patient=1000, beats_separated=True)
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#         # xgb_model = XGBRegressor(
#         #     tree_method="gpu_hist",
#         #     n_estimators=100,
#         #     max_depth=10,
#         #     learning_rate=0.1,
#         #     random_state=42,
#         #     n_jobs=-1
#         # )
#         # xgb_model = XGBRegressor(tree_method="gpu_hist")
#         # xgb_model.fit(X_train, y_train)
#         # y_pred = xgb_model.predict(X_test)

#         # model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
#         # model.fit(X_train, y_train)
#         # y_pred = model.predict(X_test)

#         # model = LinearRegression()
#         # model.fit(X_train, y_train)
#         # y_pred = model.predict(X_test)

#         model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)

#         return y_test, y_pred
    
#     else:
#         raise ValueError(f"Unsupported task type: {attr_task}")



In [None]:
# def get_dp_signals(m, e):
#     with open(osj("..", "data_dp", f"{m}_{e}.pkl"), "rb") as f:
#         return pickle.load(f)

# def get_patient_infos():
#     with open(osj("..", "data", "patient_infos.pkl"), "rb") as f:
#         return pickle.load(f)

# def get_ids():
#     with open(osj("..", "data", "all_patients.pkl"), "rb") as f:
#         return pickle.load(f)

# def get_patient_attribute(attribute, data):
#     attr_dict = {str(pid): data[attribute] for pid, data in data.items()}
#     attr_array = list(attr_dict.values())
#     return attr_array


# def prepare_signal_for_inference(signal, attr_array, n_beats=1000, flat="patient"):
#     """
#     Prepare the signal for attribute inference attack.

#     Parameters:
#         signal: ndarray with structure [patient][segment][1][beat (280,)]
#         attr_array: List or ndarray, target attribute
#         n_beats: int, Count of beats per patient (default: 1000)
#         flat: bool, if True, flatten the signal to [n_patients, n_values] (default: True)
#               if False, flatten only by one dimension to [n_patients, n_segments, n_beats]

#     Returns:
#         X: ndarray in format [n_segments, 1, beat_length]
#     """

#     # returns the signal in the format [n_patients, n_values]
#     if flat == "patient":
#         new_signal = {}
#         for patient_idx in range(0, len(signal)): # 48
#             all_values = []
#             for segment_idx in range(min(n_beats, len(signal[patient_idx]))):
#                 for value_idx in range(0, 280): #280 values per beat
#                     all_values.append(signal[patient_idx][segment_idx][0][value_idx].item())
                
#             new_signal[patient_idx] = all_values
        
#         # requires no change to the attribute array, as it is already in the format [n_patients, attr] (48, 1)
        

#     # returns the signal in the format [n_patients, n_beats, n_values]
#     elif flat == "one_beat":
#         new_signal = {}
#         for patient_idx in range(0, len(signal)): # 48
#             all_values = [i for i in range(min(n_beats, len(signal[patient_idx])))]
#             for segment_idx in range(min(n_beats, len(signal[patient_idx]))): 
#                 all_values[segment_idx] = signal[patient_idx][segment_idx][0]
#             new_signal[patient_idx] = all_values
        
#         # reproduce attribute array to match the number of segments
#         attr_array = np.array(attr_array)
#         attr_segmented = [[attr[0]] * 1500 for attr in attr_array]

#     # returns the signal in the format [n_beats, n_values] (200, 50 * 280)
#     elif flat == "50_beats":
#         new_signal = {}
#         for patient_idx in range(0, len(signal)): # 48
#             all_values = [i for i in range(min(n_beats, len(signal[patient_idx])))]
#             for segment_idx in range(min(n_beats, len(signal[patient_idx]))): 
#                 all_values[segment_idx] = signal[patient_idx][segment_idx][0]
#             new_signal[patient_idx] = all_values
        
#         # reproduce attribute array to match the number of segments
#         attr_array = np.array(attr_array)
#         attr_segmented = [[attr[0]] * 1500 for attr in attr_array]


#     return new_signal


# def prepare_inference_data(signals, attr_array, beats_per_patient=1000, beats_separated=True):
#     """
#     Prepare the data for attribute inference attack.

#     Parameters:
#         signals: ndarray with structure [patient][segment][1][beat (280,)]
#         attr_array: List or ndarray, target attribute
#         beats_per_patient: int, Count of beats per patient (default: 1000)
#         beats_separated: bool, if every beat is separated (default: True)

#     Returns:
#         X: ndarray in format [n_patients, beats_per_patient, beat_length] (if beats_separated=False)
#            or [total_beats, beat_length] (if beats_separated=True)
#         y: ndarray in format [n_patients] (if beats_separated=False)
#            or [total_beats] (if beats_separated=True)
#     """
#     X = []
#     y = []

#     for patient_idx in range(len(signals)):
#         attr = attr_array[patient_idx]
#         segments = signals[patient_idx] 

#         patient_beats = np.concatenate([segment[0] for segment in segments], axis=0)
            
#         # Sampling
#         n_beats = min(beats_per_patient, len(patient_beats))
#         sampled_beats = patient_beats[np.random.choice(len(patient_beats), n_beats, replace=False)]

#         if beats_separated:
#             X.append(sampled_beats)
#             y.append(np.full(n_beats, attr))
#         else:
#             if len(sampled_beats) < beats_per_patient:
#                 # Padding with zeros if less than beats_per_patient
#                 padding = np.zeros((beats_per_patient - len(sampled_beats), sampled_beats.shape[1]))
#                 sampled_beats = np.vstack((sampled_beats, padding))
#             X.append(sampled_beats)
#             y.append(attr)

#     if beats_separated:
#         X = np.concatenate(X, axis=0)  # shape: [n_beats, 280]
#         y = np.concatenate(y, axis=0)  # shape: [n_beats]
#     else:
#         X = np.array(X) # shape: [n_patients, beats_per_patient, 280]
#         y = np.array(y) # shape: [n_patients]
    
#     return X, y

# def train_attribute_inference(signals, attr_array, attr_task="classification"):
#     """
#     signals: ndarray with structure [patient][segment][1][beat (280,)]
#     patient_metadata: dict with structure {patient_index: {"gender": 0/1, ...}}
#     attr_task: str, type of model to use for inference (default: "randomForest")

#     Returns:
#         y_test: real labels
#         y_pred: predicted labels
#         pred_classes: predicted classes (for classification tasks)
#     """

#     if attr_task == "classification":

#         X, y = prepare_inference_data(signals, attr_array, beats_per_patient=1000, beats_separated=True)

#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        
#         clf = RandomForestClassifier(
#             n_estimators=100,
#             max_depth=20,
#             n_jobs=-1,
#             random_state=42
#         )

#         clf.fit(X_train, y_train)
#         y_pred = clf.predict(X_test)
#         pred_classes = list(clf.classes_) 

#         return y_test, y_pred, pred_classes
    
#     elif attr_task == "regression":

#         X, y = prepare_inference_data(signals, attr_array, beats_per_patient=1000, beats_separated=True)
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#         # xgb_model = XGBRegressor(
#         #     tree_method="gpu_hist",
#         #     n_estimators=100,
#         #     max_depth=10,
#         #     learning_rate=0.1,
#         #     random_state=42,
#         #     n_jobs=-1
#         # )
#         # xgb_model = XGBRegressor(tree_method="gpu_hist")
#         # xgb_model.fit(X_train, y_train)
#         # y_pred = xgb_model.predict(X_test)

#         # model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
#         # model.fit(X_train, y_train)
#         # y_pred = model.predict(X_test)

#         # model = LinearRegression()
#         # model.fit(X_train, y_train)
#         # y_pred = model.predict(X_test)

#         model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)

#         return y_test, y_pred
    
#     else:
#         raise ValueError(f"Unsupported task type: {attr_task}")



In [None]:
# # TEST GENDER INFERENCE
# patient_infos = get_patient_infos()
# attr_array = get_patient_attribute("gender", patient_infos)

# y_test, y_pred, pred_classes = train_attribute_inference(real_signals, attr_array, task="randomforest")

# print(classification_report(y_test, y_pred, target_names=pred_classes))

In [None]:
# # TEST AGE INFERENCE - CLASSIFICATION
# patient_infos = get_patient_infos()
# attr_array = get_patient_attribute("age", patient_infos)

# # patient 4 (idx = 3) and 38 (idx = 37, will be 36) have no age and therefore will be removed
# del attr_array[3]
# real_signals_age = np.delete(real_signals, 3, axis=0) 
# del attr_array[36]
# real_signals_age = np.delete(real_signals_age, 36, axis=0) 

# print(attr_array.value_counts())

# attr_array = pd.cut(attr_array, bins=[0, 60, 70, 100], labels=["adult", "older", "senior"])

# y_test, y_pred, pred_classes = train_attribute_inference(real_signals_age, attr_array, task="randomforest")

# print(classification_report(y_test, y_pred, target_names=pred_classes))

In [None]:
# # TEST AGE INFERENCE - REGRESSION
# patient_infos = get_patient_infos()
# attr_array = get_patient_attribute("age", patient_infos)

# # patient 4 (idx = 3) and 38 (idx = 37, will be 36) have no age and therefore will be removed
# del attr_array[3]
# real_signals_age = np.delete(real_signals, 3, axis=0) 
# del attr_array[36]
# real_signals_age = np.delete(real_signals_age, 36, axis=0) 

# y_test, y_pred = train_attribute_inference(real_signals_age, attr_array, task="regression")

# print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
# print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
# print("R² Score:", r2_score(y_test, y_pred))

# # Results GradientBoostingRegressor:
# # Mean Absolute Error (MAE): 13.795175370909545
# # Mean Squared Error (MSE): 311.87486641343287
# # R² Score: -0.0013360272321962796

# # Results LinearRegression:
# # Mean Absolute Error (MAE): 13.746101115734675
# # Mean Squared Error (MSE): 311.4475515455648
# # R² Score: 3.595178326243342e-05

# # Results MLPRegressor:
# # Mean Absolute Error (MAE): 13.765186316133146
# # Mean Squared Error (MSE): 310.87597922384793
# # R² Score: 0.0018710979253216964

In [None]:
# mechanism = "laplace"
# file_epsilon = 0.091
# epsilon = 0.01
# delta = 0.5

# inference_attributes = ["gender", "age", "patient_id"]

In [None]:
# # GET DP X
# ekg_signals_dp = get_dp_signals(mechanism, file_epsilon)
# dp_signals = ekg_signals_dp[epsilon][delta]
# del ekg_signals_dp

In [None]:
# # GET Y
# patient_infos = get_patient_infos()
# attr_array = get_patient_attribute("gender", patient_infos)

In [None]:
# def flatten_data(signals, attr_array):
#     """
#     Flatten the signals to have all beats of a patient in one array.

#     Parameters:
#         signals: ndarray with structure [patient][segment][1][beat (280,)]
#         attr_array: List or ndarray, target attribute

#     Returns:
#         X: ndarray in format [n_patients, n_values]
#         y: ndarray in format [n_patients]
#     """
#     # PREPARE X
#     n_beats = 1500
#     beat_length = 280

#     new_signal = {}
#     for patient_idx in range(0, len(signals)): # 48

#         all_values = []
#         for beat_idx in range(n_beats):
#             for value_idx in range(0, beat_length): #280 values per beat
#                 all_values.append(signals[patient_idx][beat_idx][0][value_idx].item())
            
#         new_signal[patient_idx] = all_values
    
#     signal_array = np.array(list(new_signal.values()))
#     signal_array.shape
#     X = signal_array
    
#     # PREPARE Y
#     attr_array = np.array(attr_array)
#     attr_array.shape
#     y = attr_array

#     return X, y   

Transforming both X and y into NumPy arrays (X is 2-dimensional, y is 1-dimensional)

In [None]:
# attr_array = np.array(attr_array)
# attr_array.shape

(48,)

In [None]:
# signal_array = np.array(list(new_signal.values()))
# signal_array.shape

(48, 420000)

In [None]:
# X = signal_array
# y = attr_array
# y = np.where(y == 'M', 0, 1)

Train Random Forest Classifier

In [None]:
# def train_gender_inference(X, y):
#     """
#     Trains gender inference model using Random Forest Classifier.

#     Parameters:
#         X: ndarray with shape [n_patients, n_values]
#         y: ndarray with shape [n_patients]

#     Returns:
#         metrics: dict with accuracy, precision, recall, f1-score, mae, mse, r2
#     """

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#     clf = RandomForestClassifier(
#         n_estimators=100,
#         max_depth=None,
#         random_state=42
#     )
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)

#     acc = accuracy_score(y_test, y_pred)
#     pre, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

#     mae = mean_absolute_error(y_test, y_pred)
#     mse = mean_squared_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)

#     # save the metrics
#     metrics = {
#         "acc": acc,
#         "rec": rec,
#         "pre": pre,
#         "f1": f1,
#         "mae": mae,
#         "mse": mse,
#         "r2": r2
#     }

#     return metrics

In [None]:
# metrics

{'acc': 0.4,
 'rec': 0.2,
 'pre': 0.3333333333333333,
 'f1': 0.25,
 'mae': 0.6,
 'mse': 0.6,
 'r2': -1.4}

Old tests

In [None]:
# labels_segmented = np.repeat(attr_array, 1500, axis=0)
# labels_segmented

In [None]:
# test_signals = dp_signals.copy()

In [None]:
# beat_signal = {}
# for patient in range(0, len(test_signals)): # 48
#     all_values = [i for i in range(len(test_signals[patient]))]
#     for segment in range(len(test_signals[patient])): # segment count differs per patient, but every patient has at least 1500
#         all_values[segment] = test_signals[patient][segment][0]
#     beat_signal[patient] = all_values

# Archive: Setup for Membership Inference

## Functions

In [None]:
# def load_real_train_test_data():
#     dict_samples = spio.loadmat('../data/s2s_mitbih_aami_DS1DS2.mat')

#     DS1_samples = dict_samples['s2s_mitbih_DS1']
#     DS1_values = DS1_samples[0]['seg_values']
#     DS1_labels = DS1_samples[0]['seg_labels']

#     DS2_samples = dict_samples['s2s_mitbih_DS2']
#     DS2_values = DS2_samples[0]['seg_values']
#     DS2_labels = DS2_samples[0]['seg_labels']

#     DS1_values, DS1_labels = prepare_data(DS1_values, DS1_labels)
#     DS2_values, DS2_labels = prepare_data(DS2_values, DS2_labels)

#     return DS1_values, DS1_labels, DS2_values, DS2_labels

# def load_train_test_data(m, e):
#     with open(osj("..", "dp_models", "train_test_data", m, f"{e}_data.pkl"), "rb") as f:
#         data = pickle.load(f)
#     return data 

# def prepare_data(values, labels, max_time=100, classes=['N', 'S', 'V'], max_label=100):

#     # calculate the number of annotations and sequences
#     num_annots = sum([item.shape[0] for item in temp_values]) 
#     n_seqs = num_annots / max_time

#     # add all beats together
#     count_b = 0
#     nr_recordings = [] # number of recordings per patient (each recording contains 280 measurements)
#     for _, item in enumerate(temp_values):
#         l = item.shape[0] # number of recordings per patient (each recording contains 280 measurements)
#         nr_recordings.append(l)
#         for itm in item:
#             if count_b == num_annots: # hence all recordings have been added
#                 break
#             beats.append(itm[0]) # itm is one recording, with 280 measurements
#             count_b += 1

#     # add all labels together
#     count_l  = 0
#     t_labels = []
#     for _, item in enumerate(labels): 
#         if len(t_labels) == num_annots: # break if all labels have been added
#             break
#         item = item[0]
#         # iterate over all recordings per patient
#         for lbl in item: 
#             if count_l == num_annots: # break if all labels have been added
#                 break
#             t_labels.append(str(lbl))
#             count_l += 1
    
#     del temp_values
#     # convert list to array & reshape
#     beats = np.asarray(beats)
#     t_labels = np.asarray(t_labels)  
#     shape_v = beats.shape # 109338 rows with 280 entries each: (109338, 280, 1)
#     beats = np.reshape(beats, [shape_v[0], -1]) # new shape = (109338, 280)

#     # Create empty arrays for data and labels
#     random_beats  = np.asarray([],dtype=np.float64).reshape(0,shape_v[1])
#     random_labels = np.asarray([],dtype=np.dtype('|S1')).reshape(0,)

#     # iterate over all classes and truncate to max_label samples, so that all classes are equally represented
#     for cl in classes:
#         _label = np.where(t_labels == cl) # select indices that match the class
#         logger.info(f"Class {cl} is represented {len(_label[0])}")

#         # random permutation of indices
#         permute = np.random.permutation(len(_label[0])) 
#         _label = _label[0][permute[:max_label]] # choose the first X indices
#         logger.info(f"Class {cl} is now represented {len(_label)}")

#         random_beats = np.concatenate((random_beats, beats[_label]))
#         random_labels = np.concatenate((random_labels, t_labels[_label]))

#     # shorten data to multiple of max_time
#     signals = random_beats[:int(len(random_beats)/ max_time) * max_time, :]
#     _labels  = random_labels[:int(len(random_beats) / max_time) * max_time]

#     #  reshape data into groups of max_time
#     data   = [signals[i:i + max_time] for i in range(0, len(signals), max_time)]
#     labels = [_labels[i:i + max_time] for i in range(0, len(_labels), max_time)]

#     permute = np.random.permutation(len(labels)) # random permutation of indices only

#     # transform from list to array
#     data   = np.asarray(data, dtype=object) 
#     labels = np.asarray(labels, dtype=object)

#     # reorder data and labels according to random permute
#     data   = data[permute]
#     labels = labels[permute]

#     logger.info('Signals and labels processed!')

#     return data, labels

In [None]:
# # F1 Read files
# def read_data(filename, values, max_time=100, classes=['N', 'S', 'V'], max_label=100, trainset=1):

#     random.seed(654)
#     beats = [] 
#     dict_samples = spio.loadmat(filename + '.mat')
#     samples = dict_samples['s2s_mitbih'] # 2D array with 2 columns: ecg values and labels
#     labels = samples[0]['seg_labels'] # labels

#     # patient IDs for train / test set
#     # DS1 = [101, 106, 108, 109, 112, 114, 115, 116, 118,119, 122, 124, 201, 203, 205, 207, 208, 209, 215, 220, 223,230];
#     # DS2 = [100, 103, 105, 111, 113, 117, 121, 123,200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233,234];
#     DS1_idx = [1, 6, 8,  9, 11, 13, 14, 15, 17, 18, 20, 22, 24, 26, 27, 28, 29, 30, 35, 38, 41, 43]
#     DS2_idx = [0, 3, 5, 10, 12, 16, 19, 21, 23, 25, 31, 32, 33, 34, 37, 39, 40, 42, 44, 45, 46, 47]

#     # Select train and test data
#     if trainset == 1:
#         temp_values = [values[i] for i in DS1_idx]
#         labels = [labels[i] for i in DS1_idx]

#     elif trainset == 0:
#         temp_values = [values[i] for i in DS2_idx]
#         labels = [labels[i] for i in DS2_idx]

    