Run some attacks on DP-CGANS

In [24]:
import pandas as pd 
import os 
import tapas.datasets
import tapas.generators
import json 


In [25]:
# Notebook configuration: which dataset/model is attacked and where the data lives.
dataset_name = "Adult"
model_name = "DPCGANS"
datapath = "../datasets"
# Relative path (below datapath/dataset_name) of the real-data CSV.
file = "Real/real_{}_data.csv".format(dataset_name.lower())

In [26]:
# Read the real dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    os.path.join(datapath, dataset_name, file),
    index_col=0,
)

In [27]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,27,Private,177119,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,44,United-States,<=50K
1,27,Private,216481,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K
2,25,Private,256263,Assoc-acdm,12,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K
3,46,Private,147640,5th-6th,3,Married-civ-spouse,Transport-moving,Husband,Amer-Indian-Eskimo,Male,0,1902,40,United-States,<=50K
4,45,Private,172822,11th,7,Divorced,Transport-moving,Not-in-family,White,Male,0,2824,76,United-States,>50K


In [28]:
# Look at the distinct values of one column to decide how to type it in the schema.
var = "fnlwgt"
pd.unique(df[var])

array([177119, 216481, 256263, ...,  45317, 215862, 186925])

I created a JSON file with the data schema in `data_schemas/adult.json`. I am not sure about all the columns typed as "countable", but none of the input columns seems to be continuous or to contain decimal values. See the TAPAS documentation on dataset schemas: https://privacy-sdg-toolbox.readthedocs.io/en/latest/dataset-schema.html

In [29]:
# Load the JSON data schema into a dictionary.
# The handle is called `schema_file` (not `file`) so that it does not overwrite
# the `file` path string that the CSV-loading cell reads; with the name reused,
# re-running that cell after this one would pass a closed file object to
# os.path.join and fail.
with open('data_schemas/adult.json') as schema_file:
    data_schema = json.load(schema_file)


In [30]:
# Wrap the loaded schema dictionary in a TAPAS DataDescription.
data_description = tapas.datasets.DataDescription(schema=data_schema)

In [31]:
# Pair the DataFrame with its description to obtain a TAPAS TabularDataset.
data = tapas.datasets.TabularDataset(data=df, description=data_description)

We need to separate the target record from the original data before setting up the attack.

In [32]:
# Identifier of the record that is attacked; it is passed to get_records/drop_records below.
attack_id = 0

In [33]:
# Start from a freshly built full dataset so that this cell is idempotent.
# An in-place drop on `data` would remove one more record every time the cell is
# re-run, changing both `target_record` and `data` and breaking the size check
# in the next cell.
full_data = tapas.datasets.TabularDataset(data=df, description=data_description)

# The record under attack.
target_record = full_data.get_records([attack_id])

# `data` becomes the dataset without the target record; the following cells
# fit the generator on it and sample from it.
data = full_data.drop_records([attack_id], in_place=False)

In [34]:
# Sanity check: exactly one record (the target) was removed from the original table.
assert len(data.data) == len(df) - 1, "data does not contain 1 record less than original df"

In [35]:
# Wrap the external script as a TAPAS generator and fit it on `data`
# (the dataset from which the target record was dropped above).
# NOTE(review): presumably the script must be directly executable — TODO confirm.
generator = tapas.generators.GeneratorFromExecutable(exe="src/generator_from_dataset.py")
generator.fit(data)

In [36]:
# Sizes used by the attack setup below.
training_data_set_size = 9    # number of records sampled into `specific_data`
synthetic_dataset_size = 10   # passed as num_synthetic_records to BlackBoxKnowledge

# Records handed to ExactDataKnowledge in the threat model.
specific_data = data.sample(n_samples=training_data_set_size)

### New approach to the generator

In [37]:
import tapas.datasets
import tapas.generators
import tapas.threat_models
import tapas.attacks
import tapas.report

In [38]:
# Auxiliary-knowledge variant of the attacker's data knowledge.
# See the source code and the paper: there can be exact knowledge, and auxiliary knowledge.
# NOTE(review): the threat model cell below uses ExactDataKnowledge, not this object.
data_knowledge = tapas.threat_models.AuxiliaryDataKnowledge(
    data,
    auxiliary_split=0.5,
    num_training_records=1000,
)

In [39]:
# Targeted MIA threat model: the attacker knows `specific_data` exactly and can
# query the generator as a black box for `synthetic_dataset_size` records.
threat_model = tapas.threat_models.TargetedMIA(
    attacker_knowledge_data=tapas.threat_models.ExactDataKnowledge(specific_data),
    attacker_knowledge_generator=tapas.threat_models.BlackBoxKnowledge(
        generator,
        num_synthetic_records=synthetic_dataset_size,
    ),
    target_record=target_record,
    generate_pairs=False,
    replace_target=False,
)

In [40]:
# Closest-distance attack; the decision criterion is set to "accuracy".
attack = tapas.attacks.ClosestDistanceMIA(
    criterion="accuracy",
    label="Closest-Distance",
)

In [41]:
# Number of samples passed to attack.train for the threat model.
num_training = 1000
attack.train(threat_model, num_samples=num_training)

In [59]:
# Evaluate the trained attack against the threat model with num_samples=100.
attack_summary = threat_model.test(attack, num_samples=100)

In [60]:
# Inspect the raw contents of the attack summary.
display(attack_summary.scores)       # what do the scores mean? try with smaller samples?
display(len(attack_summary.scores))  # presumably the number of samples in the test() — TODO confirm
display(attack_summary.labels)       # I guess these are the indicators for whether the dataset contains the record or not?

# A bare `type(...)` in the middle of a cell is evaluated and then discarded;
# wrap it in display() so that the type is actually shown.
display(type(attack_summary))

# Last expression of the cell, rendered as the cell output.
attack_summary.predictions  # so this explains why the FPR and TPR are 0. How can I change it?

array([ -8.,  -5.,  -6.,  -7., -10.,  -6.,  -9.,  -8.,  -9.,  -8.,  -8.,
        -7.,  -7.,  -8.,  -8.,  -6.,  -9.,  -8.,  -7.,  -6.,  -7.,  -8.,
        -8.,  -9.,  -8.,  -8.,  -7.,  -6.,  -5.,  -6.,  -8.,  -7., -10.,
        -7.,  -7.,  -6.,  -8.,  -9.,  -9.,  -8.,  -7.,  -8.,  -6.,  -8.,
       -10.,  -7.,  -8.,  -9.,  -5.,  -8.,  -9.,  -7.,  -7.,  -7.,  -8.,
        -8.,  -8.,  -7.,  -8.,  -5.,  -7.,  -8.,  -7.,  -8.,  -8.,  -9.,
        -9.,  -8.,  -7.,  -9.,  -7.,  -6.,  -5.,  -6., -10.,  -6.,  -5.,
        -5.,  -5.,  -7.,  -9.,  -4.,  -7.,  -8.,  -8.,  -4.,  -7.,  -9.,
        -6.,  -8.,  -7.,  -8.,  -7.,  -8., -10.,  -8.,  -7.,  -8.,  -6.,
        -8.,  -8.,  -7.,  -7.,  -7.,  -9.,  -6.,  -7.,  -4.,  -8.,  -8.,
        -8.,  -9.,  -9.,  -5.,  -8.,  -8.,  -8.,  -9.,  -6.,  -8.,  -9.,
        -8.,  -8.,  -6.,  -9.,  -8.,  -7.,  -9.,  -6.,  -6.,  -9.,  -6.,
        -8.,  -7.,  -7.,  -8.,  -8.,  -8.,  -7.,  -7.,  -8.,  -7.,  -7.,
        -9.,  -8.,  -8.,  -9.,  -8.,  -8.,  -7.,  -

200

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1])

array([0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1])

Why do we get all 0s?
- Because we do not use the generator properly? To check this, we need to compare against some functioning code (try one of the TAPAS examples).
- Because we are too strict (threshold too high/low)? But shouldn't this threshold be learned during training?

In [61]:
# Collect the metrics of the attack summary.
metrics = attack_summary.get_metrics()

In [62]:
# Show the metrics as the cell output.
metrics

Unnamed: 0,dataset,target_id,generator,attack,accuracy,true_positive_rate,false_positive_rate,mia_advantage,privacy_gain,auc,effective_epsilon
0,Unnamed dataset (EXACT),0,src/generator_from_dataset.py,Closest-Distance,0.475,0.427083,0.480769,-0.053686,1.053686,0.45623,-0.002478


### Notes
- `attack_score` takes the minimum distance between the target record and all synthetic datasets in the list `datasets`.
    - So, do I need more than one synthetic dataset as input?
    - Should they all stem from the same generator?
    - TODO: read some papers on these attacks; I need to understand the background better.
