Try to build a very simple generator executable that works with the tapas library

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
    # this line: https://github.com/alan-turing-institute/privacy-sdg-toolbox/blob/ebe3e94bc08f91290e809d4cadd2cfd72d9ea561/tapas/datasets/utils.py#L157
    # creates some FutureWarnings when running the notebook. Ignoring for now


import tapas.datasets
import tapas.generators
import numpy as np 
import pandas as pd

first we need to create an instance of the TabularDataset class

In [2]:
def create_data(n=2000):
    "Create dataframe and schema as input for TabularDataset"
    rng = np.random.default_rng(3)
    wage = rng.random(n)
    identifier = np.arange(n)
    raw_data = np.stack([identifier, wage], axis=1)
    df = pd.DataFrame(raw_data, columns=["id", "wage"])
    df["id"] = df["id"].astype(int)

    schema = [
        {"name": "id", "type": "finite", "representation": df["id"].unique()},
        {"name": "wage", "type": "finite", "representation": df["wage"].unique()}
    ]
    return df, schema

In [3]:
df_data, data_schema = create_data()
data_description = tapas.datasets.DataDescription(schema=data_schema)
data = tapas.datasets.TabularDataset(data=df_data, description=data_description)

In [4]:
data.data.head()

Unnamed: 0,id,wage
0,0,0.085649
1,1,0.236811
2,2,0.801274
3,3,0.582162
4,4,0.094129


this is our "original" dataset; now we want to load a generator that takes the original dataset and returns synthetic data based on it (?)

## The generator

We make a generator in the file `src/myexecutable.py`, and need to make it executable with `chmod +x src/myexecutable.py`. 
Key features of the generator:
- it has the same structure as the original data set 
- it prints out the generated data set to the console, and the console output is read back into the python by tapas 


Then we can instantiate the generator.

In [5]:
generator = tapas.generators.GeneratorFromExecutable(exe="src/myexecutable.py")

In [6]:
generator.fit(data)

In [7]:
synthetic_data = generator.generate(3)

In [8]:
synthetic_data.data.head()

Unnamed: 0,id,wage
0,0.0,0.682352
1,1.0,0.053821
2,2.0,0.22036


### Now let's try to run an attack on this model

In [9]:
import tapas.datasets
import tapas.generators
import tapas.threat_models
import tapas.attacks
import tapas.report

In [10]:
df_data, data_schema = create_data(100000)
data_description = tapas.datasets.DataDescription(schema=data_schema)
data = tapas.datasets.TabularDataset(data=df_data, description=data_description)

In [11]:
generator = tapas.generators.GeneratorFromExecutable(exe="src/myexecutable.py")

In [12]:
data_knowledge = tapas.threat_models.AuxiliaryDataKnowledge(
      data,
      auxiliary_split=0.5,
      num_training_records=5000
)

In [13]:
sdg_knowledge = tapas.threat_models.BlackBoxKnowledge(
    generator,
    num_synthetic_records=5000,
)

In [14]:
threat_model = tapas.threat_models.TargetedMIA(
   attacker_knowledge_data=data_knowledge,
   target_record=data.get_records([0]),
   attacker_knowledge_generator=sdg_knowledge,
   generate_pairs=True,
   replace_target=True
)

In [15]:
attacker = tapas.attacks.GroundhogAttack()

the cell below fails with an attribute error: numpy.ndarray has no attribute index.
I don't think the reason is a missing index on the generated dataset; see https://github.com/alan-turing-institute/privacy-sdg-toolbox/blob/ebe3e94bc08f91290e809d4cadd2cfd72d9ea561/tapas/datasets/dataset.py#L14: the method assumes no index column when the data are read 

In [16]:
attacker.train(threat_model, num_samples=5)


AttributeError: 'numpy.ndarray' object has no attribute 'index'

### Try some other attacks

https://privacy-sdg-toolbox.readthedocs.io/en/latest/library-of-attacks.html

use one threat model for all of them 

In [37]:
threat_model = tapas.threat_models.TargetedMIA(
   attacker_knowledge_data=data_knowledge,
   target_record=data.get_records(range(10)), # focus on the first 10 records
   attacker_knowledge_generator=sdg_knowledge,
   generate_pairs=True,
   replace_target=True
)

synth_datasets = [generator.generate(100), generator.generate(50)] # note they use the same seed (can I change it anywhere), so the records overlap

#### ClosestDistanceMIA

In [38]:
attacker = tapas.attacks.ClosestDistanceMIA()


In [39]:
attacker.train(threat_model, num_samples=10)

In [40]:
attacker.attack_score(synth_datasets) 

array([-1., -1.])

what does this output mean? it returns 1 number per data set, irrespectively of the number of targeted records in the threat model 

#### LocalNeighbourhoodAttack

In [43]:
attacker = tapas.attacks.LocalNeighbourhoodAttack()
attacker.train(threat_model, num_samples=10)

In [45]:
attacker.attack_score(synth_datasets)

array([0.01, 0.02])

#### ProbabilityEstimationAttack

In [47]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity(kernel='gaussian', bandwidth=0.2)

In [50]:
attacker = tapas.attacks.ProbabilityEstimationAttack(estimator=kde, criterion="accuracy")
attacker.train(threat_model, num_samples=10)

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [51]:
dir(attacker)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_label',
 '_threshold',
 '_train_attack_score',
 'attack',
 'attack_score',
 'estimator',
 'label',
 'negative_label',
 'positive_label',
 'target_criterion',
 'threat_model',
 'train']

effective epsilons??

### Evaluation and reports

tbd