Run some attacks on DP-CGANS

In [101]:
import pandas as pd 
import os 
import json 
import numpy as np 

import tapas.datasets
import tapas.generators
import tapas.threat_models
import tapas.attacks
import tapas.report


Define some parameters

In [102]:
# Where the real data lives; the CSV path below is joined onto datapath/dataset_name when loading.
datapath = "../datasets"
dataset_name = "Adult"
model_name = "DPCGANS"
file = f"Real/real_{dataset_name.lower()}_data.csv"

# JSON data schema for the dataset, and the script that is wrapped as the generator.
schema = "data_schemas/adult.json"
executable_generator = "src/generator_dp_cgans.py"


# Make some restrictions to speed up training.
# Number of rows drawn at random from the real data.
N_subsample = 500 
# Keep only these columns for faster training. The columns need to be kept in the same order.
# Larger alternative column set:
# columns_to_keep = ["age", "education", "marital-status", "occupation", "race", "sex", "label"]
columns_to_keep = ["age", "occupation", "race", "label"]

# Seed NumPy's global random number generator for reproducibility.
np.random.seed(1)


Notes
- I created a JSON file with the data schema in `data_schemas/adult.json`. I am not sure that "countable" is the right data type for every column, but none of the input data seems to be continuous or to have decimals. See the TAPAS documentation: https://privacy-sdg-toolbox.readthedocs.io/en/latest/dataset-schema.html

In [103]:
def load_tabular_dataset(filename_data, filename_schema, N_subsample=None, columns_to_keep=None):
    """Load a tabular dataset for use with TAPAS.
    
    Arguments
    ---------
    filename_data: str
        Full path to the CSV file to load. Its first column is used as the index.
    filename_schema: str
        Full path to the json file with the data schema (a list of column
        descriptions, each with a "name" entry).
    N_subsample: int, optional
        Number of rows to draw at random from the original data set.
        If None (default), all rows are kept.
    columns_to_keep: list, optional
        If specified, only work with a dataframe with those columns.
        If None (default), all columns are kept.

    Returns
    -------
    tapas.datasets.TabularDataset
        The (optionally subsampled and column-restricted) data together with
        the schema entries of its columns.

    Raises
    ------
    AssertionError
        If a column of the dataframe has no entry in the data schema.
    """

    df = pd.read_csv(filename_data, index_col=0)
    # Only subsample on request: `df.sample(None)` would silently return a single row.
    if N_subsample is not None:
        df = df.sample(N_subsample)
    if columns_to_keep is not None:
        df = df.loc[:, columns_to_keep]

    with open(filename_schema) as schema_file:
        # Load the JSON data into a list of column descriptions
        full_schema = json.load(schema_file)

    # Pick the schema entries from the dataframe's actual columns, in the
    # dataframe's column order. This also works when `columns_to_keep` is None
    # and keeps schema and data aligned if the requested column order differs
    # from the order in the schema file.
    schema_by_name = {col["name"]: col for col in full_schema}
    missing = [name for name in df.columns if name not in schema_by_name]
    assert not missing, (
        "all columns in the dataframe need to be in the data schema"
        f" (missing: {missing})"
    )
    data_schema = [schema_by_name[name] for name in df.columns]

    data_description = tapas.datasets.DataDescription(schema=data_schema)
    data = tapas.datasets.TabularDataset(data=df, description=data_description)
    return data


def load_generator(filename_exec, data):
    """Wrap an executable as a TAPAS generator and fit it to `data`.

    Arguments
    ---------
    filename_exec: str
        Full path and filename of the executable.
    data: tapas.TabularDataset
        The real data the generator is fitted to before being returned.

    Returns
    -------
    The generator object, after `fit(data)` has been called on it.
    """

    fitted_generator = tapas.generators.GeneratorFromExecutable(exe=filename_exec)
    fitted_generator.fit(data)
    return fitted_generator

Open questions
- How is the data knowledge given by `specific_data` related to the data set itself? Is it the same data? A subset? Is the target record included or not?

Load the data and the generator

In [104]:
# Read the real data, subsampled to N_subsample rows and restricted to columns_to_keep.
data = load_tabular_dataset(
    filename_data=os.path.join(datapath, dataset_name, file),
    filename_schema=schema,
    N_subsample=N_subsample,
    columns_to_keep=columns_to_keep
)

# Wrap the generator executable and fit it to the loaded data.
generator = load_generator(executable_generator, data=data)

We need to separate the target record from the original data.

In [105]:
# Id of the record that is taken out of `data` and used as the attack target.
attack_id = 0

In [106]:
# Fetch the target record, then remove it from `data`.
# NOTE(review): the drop happens in place, so this cell is not idempotent -- re-running it
# without reloading `data` presumably fails or picks a different record; TODO confirm.
target_record = data.get_records([attack_id])
data.drop_records([attack_id], in_place=True)

In [107]:
# Size of the sample drawn from `data` (target record already removed); this sample is
# handed to the threat model as the attacker's data knowledge.
training_data_set_size = 10
specific_data = data.sample(n_samples=training_data_set_size)
# Number of synthetic records requested from the generator in the threat model.
synthetic_dataset_size = 9

In [108]:
# Targeted membership-inference threat model for `target_record`.
threat_model = tapas.threat_models.TargetedMIA(
    # Attacker's data knowledge: built from the `specific_data` sample.
    attacker_knowledge_data=tapas.threat_models.ExactDataKnowledge(
        specific_data),       
    # Attacker's generator knowledge: black-box access to the fitted generator,
    # with `synthetic_dataset_size` synthetic records per generated dataset.
    attacker_knowledge_generator=tapas.threat_models.BlackBoxKnowledge(
            generator, num_synthetic_records=synthetic_dataset_size,
        ),
    target_record=target_record,
    # NOTE(review): the exact semantics of the two flags below are not visible in this
    # notebook -- check the TAPAS documentation for TargetedMIA before changing them.
    generate_pairs=False,
    replace_target=False
)

In [109]:
# Closest-distance membership inference attack, with "accuracy" passed as the criterion.
attack = tapas.attacks.ClosestDistanceMIA(criterion="accuracy", label="Closest-Distance")

In [110]:
# Exploratory: show the attributes of the (still untrained) attack object.
vars(attack)

{'target_criterion': 'accuracy',
 'positive_label': None,
 'negative_label': None,
 '_threshold': None,
 'distance': <tapas.attacks.distances.HammingDistance at 0x7fa7b1508d60>,
 '_label': 'Closest-Distance'}

In [111]:
# Number of samples used to train the attack.
num_training = 10
# Timing note: this is fast -- 55.2 seconds with few columns.
attack.train(threat_model, num_samples=num_training)

In [112]:
# Evaluate the trained attack on test samples.
# Timing note: 100 samples take about 10 minutes, 10 samples about 40 seconds.
attack_summary = threat_model.test(attack, num_samples=10)

In [113]:
# Attack scores, one per test sample.
# TODO: what do the scores mean? Try with smaller samples?
display(attack_summary.scores)
# Number of scores; this equals the number of samples passed to test().
display(len(attack_summary.scores))
# Presumably the indicators for whether each test dataset contains the target record
# or not -- TODO confirm.
display(attack_summary.labels)

# Predictions of the attack for each test sample (shown as the cell output).
# Earlier note: all-zero predictions would explain an FPR and TPR of 0. How can I change it?
attack_summary.predictions

array([-2., -2., -2., -2., -2., -1., -1., -1., -2., -1.])

10

array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0])

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 1])

Why do we get all 0s?
- Because we do not use the generator properly? To check this, we need some functioning code (try one of their examples).
- Because we are too strict (threshold too high/low)? But shouldn't this threshold be learned in training?

In [114]:
# NOTE: this call again emits "RuntimeWarning: invalid value encountered in divide", raised from
#   return np.log(max(np.max(tp / fp), np.max((1 - fp) / (1 - tp))))
# -- but the TP/FP below are not ... (the original note breaks off here)
metrics = attack_summary.get_metrics()

  return np.log(max(np.max(tp / fp), np.max((1 - fp) / (1 - tp))))


In [115]:
# Show the metrics table computed from the attack summary.
metrics

Unnamed: 0,dataset,target_id,generator,attack,accuracy,true_positive_rate,false_positive_rate,mia_advantage,privacy_gain,auc,effective_epsilon
0,Unnamed dataset (EXACT),0,src/generator_dp_cgans.py,Closest-Distance,0.5,0.333333,0.428571,-0.095238,1.095238,0.452381,0.0


### Notes
- `attack_score` takes the minimum distance between the target record and all synthetic datasets in the list `datasets`.
    - So, do I need more than one synthetic dataset as input?
    - Should they stem from the same generator?
    - Read some papers on these attacks; I need to understand the background better.
