In [1]:
from math import ceil
from os import path, getcwd
import random
from types import SimpleNamespace

In [2]:
import numpy as np
import torch
from torch import nn, optim, Tensor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
import sys
!{sys.executable} -m pip install -e ../.

from privacy_meter.audit import Audit
from privacy_meter.audit_report import ROCCurveReport, SignalHistogramReport
from privacy_meter.constants import InferenceGame
from privacy_meter.dataset import Dataset
from privacy_meter.hypothesis_test import threshold_func
from privacy_meter.information_source import InformationSource
from privacy_meter.information_source_signal import ModelLoss
from privacy_meter.metric import ShadowMetric
from privacy_meter.model import PytorchModel

Obtaining file:///home/victor/ml_privacy_meter
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: privacy-meter
  Attempting uninstall: privacy-meter
    Found existing installation: privacy-meter 1.0
    Uninstalling privacy-meter-1.0:
      Successfully uninstalled privacy-meter-1.0
  Running setup.py develop for privacy-meter
Successfully installed privacy-meter-1.0


In [4]:
# Seed the three random number generators used in this notebook
# (NumPy, Python's `random` module and PyTorch) with a fixed value.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f8e0c11caf0>

In [5]:
# Number of independent training rounds; one metric and one log directory per round.
N_TRAININGS = 5
# Number of shadow (reference) models per round; each round trains N_SHADOW_MODELS + 1
# models, the extra one being the target model.
N_SHADOW_MODELS = 3
# Number of passes over the training split for every model.
EPOCHS = 10
# Target mini-batch size used to derive the number of batches per epoch.
BATCH_SIZE = 32

In [6]:
def preprocess_purchase100(dataset_path="../privacy_meter/dataset_purchase"):
    """
    Load the Purchase100 dataset and split it into train and test sets.

    Cf. https://www.cs.cornell.edu/~shmat/shmat_oak17.pdf page 7

    Every non-empty line of the file is parsed as comma-separated values: the
    first value is the 1-based class label, the remaining values are the
    features.

    Args:
        dataset_path: Path of the raw dataset file. Defaults to the location
            used by this notebook (relative to the working directory).

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``. Features are float32
        arrays; labels are one-hot float64 arrays with one column per distinct
        label found in the file, in ascending label order. 25% of the rows are
        assigned to the test set, using a fixed ``random_state``.
    """
    # Read raw dataset and separate features and labels into different lists
    x, y = [], []
    with open(dataset_path, "r") as f:
        for datapoint in f:
            datapoint = datapoint.rstrip()
            if not datapoint:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            split = datapoint.split(",")
            y.append(int(split[0]) - 1)  # The first value is the label
            x.append(np.array(split[1:], dtype=np.float32))  # The next values are the features
    # Make sure the datatype is correct
    x = np.array(x, dtype=np.float32)
    # Convert labels into one hot vectors. This is done with NumPy rather than
    # OneHotEncoder(sparse=False) because the `sparse` keyword was removed from
    # scikit-learn (1.4+); the result is the same: one float64 column per
    # distinct label, columns sorted by label value.
    classes, class_indices = np.unique(np.asarray(y), return_inverse=True)
    y = np.eye(len(classes), dtype=np.float64)[class_indices.reshape(-1)]
    # Split data into train, test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1234)
    return x_train, y_train, x_test, y_test

In [7]:
def generate_splits(x_train, y_train, x_test, y_test, n_shadow_models):
    """
    Wrap the train/test arrays in a ``Dataset`` and subdivide it.

    Args:
        x_train, y_train: Features and labels stored under the 'train' split.
        x_test, y_test: Features and labels stored under the 'test' split.
        n_shadow_models: Number of shadow models; ``n_shadow_models + 1``
            splits are requested (one more for the target model).

    Returns:
        Whatever ``Dataset.subdivide`` returns for ``return_results=True``
        with the 'independent' method (used downstream as a list of datasets).
    """
    splits = {
        'train': {'x': x_train, 'y': y_train},
        'test': {'x': x_test, 'y': y_test},
    }
    full_dataset = Dataset(data_dict=splits, default_input='x', default_output='y')
    # One sub-dataset per model: the target model plus every shadow model
    n_splits = n_shadow_models + 1
    return full_dataset.subdivide(num_splits=n_splits, return_results=True, method='independent')

In [8]:
def generate_datasets(n_trainings, n_shadow_models):
    """
    Build one list of sub-datasets per training round.

    The raw data is loaded and preprocessed once, then ``generate_splits`` is
    called ``n_trainings`` times on the same arrays.

    Returns:
        A list of length ``n_trainings``; each element is the result of
        ``generate_splits`` for that round.
    """
    # (x_train, y_train, x_test, y_test), in the order generate_splits expects
    arrays = preprocess_purchase100()
    datasets_lists = []
    for _ in range(n_trainings):
        datasets_lists.append(generate_splits(*arrays, n_shadow_models))
    return datasets_lists

In [9]:
def get_torch_models(n_trainings, n_shadow_models):
    """
    Create the untrained networks for every training round.

    Each network is a 600 -> 350 -> 100 fully connected model with a Tanh
    hidden activation and a Softmax over dimension 1 on the output.

    NOTE(review): because of the final Softmax the networks output
    probabilities, whereas this notebook pairs them with
    ``nn.CrossEntropyLoss``, which applies its own log-softmax to its input.
    Confirm that this double normalisation is intended.

    Returns:
        A list of ``n_trainings`` lists, each holding ``n_shadow_models + 1``
        freshly initialised ``nn.Sequential`` models.
    """
    def build_network():
        # A new set of layers (and therefore new weights) on every call
        layers = [
            nn.Linear(in_features=600, out_features=350),
            nn.Tanh(),
            nn.Linear(in_features=350, out_features=100),
            nn.Softmax(dim=1),
        ]
        return nn.Sequential(*layers)

    models_per_round = n_shadow_models + 1
    torch_models = []
    for _ in range(n_trainings):
        round_models = []
        for _ in range(models_per_round):
            round_models.append(build_network())
        torch_models.append(round_models)
    return torch_models

In [10]:
def get_trained_torch_models(n_trainings, n_shadow_models, criterion, datasets_lists, batch_size, epochs):
    """
    Create and train every model of every training round.

    For round ``i`` and model ``j``, the model is trained with Adam (default
    hyper-parameters) on the 'train' split of ``datasets_lists[i][j]``.

    Args:
        n_trainings: Number of training rounds.
        n_shadow_models: Number of shadow models per round; each round trains
            ``n_shadow_models + 1`` models.
        criterion: Loss callable invoked as ``criterion(prediction, target)``.
            The reported train_loss is the sum of batch losses divided by the
            number of samples, i.e. a per-sample mean when the criterion uses
            a 'sum' reduction.
        datasets_lists: ``datasets_lists[i][j]`` must expose ``get_feature``
            returning the features and labels of the 'train' split (used here
            as NumPy arrays; labels are expected to be one-hot rows, since the
            accuracy is computed with ``argmax`` along axis 1).
        batch_size: Target batch size; the data is cut into
            ``ceil(n_samples / batch_size)`` nearly equal batches.
        epochs: Number of passes over the training data.

    Returns:
        The nested list of trained models, indexed ``[round][model]``.
    """
    torch_models = get_torch_models(n_trainings, n_shadow_models)
    for i in range(n_trainings):
        for j in range(n_shadow_models + 1):
            model = torch_models[i][j]
            optimizer = optim.Adam(model.parameters())
            x = datasets_lists[i][j].get_feature(split_name='train', feature_name='<default_input>')
            y = datasets_lists[i][j].get_feature(split_name='train', feature_name='<default_output>')
            n_samples = x.shape[0]
            n_batches = ceil(n_samples / batch_size)
            # Convert the batches to tensors once instead of at every step of every epoch
            x_batches = [Tensor(batch) for batch in np.array_split(x, n_batches)]
            y_batches = [Tensor(batch) for batch in np.array_split(y, n_batches)]
            for epoch in range(epochs):
                epoch_loss, n_correct = 0.0, 0
                for x_batch, y_batch in zip(x_batches, y_batches):
                    optimizer.zero_grad()
                    y_pred = model(x_batch)
                    # PyTorch losses take (input, target): the prediction comes
                    # first, the ground truth second.
                    loss = criterion(y_pred, y_batch)
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                    n_correct += (y_pred.argmax(dim=1) == y_batch.argmax(dim=1)).sum().item()
                acc = n_correct / n_samples
                epoch_loss /= n_samples
                print(f'round #{i+1:02d}/{n_trainings:02d}, model #{j+1:02d}/{n_shadow_models+1:02d}, epoch #{epoch+1:02d}/{epochs:02d}:\ttrain_acc = {acc:.3f}\ttrain_loss = {epoch_loss:.3e}')
    return torch_models

In [11]:
def get_models(n_trainings, n_shadow_models, datasets_lists, batch_size, epochs):
    """
    Train all networks and wrap each of them in a ``PytorchModel``.

    A single summed cross-entropy criterion is created here; it is used for
    training and is also handed to every ``PytorchModel`` as its ``loss_fn``.

    Returns:
        A nested list indexed ``[round][model]`` with ``n_trainings`` rows of
        ``n_shadow_models + 1`` ``PytorchModel`` wrappers.
    """
    criterion = nn.CrossEntropyLoss(reduction='sum')
    trained_nets = get_trained_torch_models(
        n_trainings, n_shadow_models, criterion, datasets_lists, batch_size, epochs
    )
    models = []
    for i in range(n_trainings):
        wrapped_round = []
        for j in range(n_shadow_models + 1):
            wrapped_round.append(PytorchModel(model_obj=trained_nets[i][j], loss_fn=criterion))
        models.append(wrapped_round)
    return models

In [12]:
def get_info_sources(models, datasets_lists, n_trainings):
    """
    Build the target and reference information sources of every round.

    For round ``i``, the first model/dataset pair (index 0) forms the target
    source, and all remaining pairs (index 1 onwards) form the reference
    source.

    Returns:
        A tuple ``(target_info_sources, reference_info_sources)`` of two lists
        of ``InformationSource``, each of length ``n_trainings``.
    """
    target_info_sources = []
    for i in range(n_trainings):
        target_model, target_dataset = models[i][0], datasets_lists[i][0]
        target_info_sources.append(
            InformationSource(models=[target_model], datasets=[target_dataset])
        )

    reference_info_sources = []
    for i in range(n_trainings):
        shadow_models, shadow_datasets = models[i][1:], datasets_lists[i][1:]
        reference_info_sources.append(
            InformationSource(models=shadow_models, datasets=shadow_datasets)
        )

    return target_info_sources, reference_info_sources

In [13]:
def get_metrics(models, datasets_lists, n_trainings, empty=False):
    """
    Create one ``ShadowMetric`` per training round.

    Args:
        models: Nested ``[round][model]`` list of wrapped models; ignored when
            ``empty`` is true.
        datasets_lists: Nested ``[round][model]`` list of datasets; ignored
            when ``empty`` is true.
        n_trainings: Number of metrics to create.
        empty: When true, the metrics receive placeholder information sources
            (objects whose only attribute is an empty ``models`` list) instead
            of real ones.

    Returns:
        A list of ``n_trainings`` ``ShadowMetric`` objects, each using the
        ``ModelLoss`` signal and the ``threshold_func`` hypothesis test.
    """
    if empty:
        # Placeholder sources: a distinct object per round, with no models
        target_sources = [SimpleNamespace(models=[]) for _ in range(n_trainings)]
        reference_sources = [SimpleNamespace(models=[]) for _ in range(n_trainings)]
    else:
        target_sources, reference_sources = get_info_sources(models, datasets_lists, n_trainings)

    metrics = []
    for i in range(n_trainings):
        metric = ShadowMetric(
            target_info_source=target_sources[i],
            reference_info_source=reference_sources[i],
            signals=[ModelLoss()],
            hypothesis_test_func=threshold_func,
            unique_dataset=False,
            reweight_samples=True
        )
        metrics.append(metric)
    return metrics

In [14]:
def get_audit_results(n_trainings, n_shadow_models, batch_size, epochs):
    """
    Run the privacy audit and return its results.

    One log directory per round (``test5_00``, ``test5_01``, ...) is expected
    under the current working directory. When every one of them already
    exists, no data is loaded and no model is trained: the audit is given
    placeholder metrics instead (presumably so that it can reuse what is
    stored in the log directories; confirm against ``Audit``). Otherwise the
    datasets are generated, the models trained and real metrics built.

    Returns:
        The value returned by ``Audit.run()``.
    """
    working_dir_logs = []
    for i in range(n_trainings):
        working_dir_logs.append(path.join(getcwd(), f'test5_{i:02d}'))

    logs_already_exist = all(map(path.isdir, working_dir_logs))
    if not logs_already_exist:
        datasets_lists = generate_datasets(n_trainings, n_shadow_models)
        models = get_models(n_trainings, n_shadow_models, datasets_lists, batch_size, epochs)
        metrics = get_metrics(models, datasets_lists, n_trainings)
    else:
        metrics = get_metrics(None, None, n_trainings, True)

    audit = Audit(
        metrics=metrics,
        inference_game_type=InferenceGame.AVG_PRIVACY_LOSS_TRAINING_ALGO,
        logs_directory_names=working_dir_logs
    )
    audit.prepare()
    return audit.run()

In [None]:
# Run the full audit with the configuration constants defined above and print
# each element of the returned results on its own line.
results = get_audit_results(N_TRAININGS, N_SHADOW_MODELS, BATCH_SIZE, EPOCHS)
for result in results:
    print(result)

In [None]:
# This instruction won't be needed once the tool is on pip.
# Until then, override the module-level REPORT_FILES_DIR of `audit_report` with the
# report files directory of the local checkout (path relative to the working directory).
from privacy_meter import audit_report
audit_report.REPORT_FILES_DIR = '../privacy_meter/report_files'

In [None]:
# Generate the ROC curve report from the audit results, for the same inference game
# as the audit; the report is requested with show=True and save=False.
ROCCurveReport.generate_report(
    metric_result=results,
    inference_game_type=InferenceGame.AVG_PRIVACY_LOSS_TRAINING_ALGO,
    save=False,
    show=True
)

In [None]:
# Generate the signal histogram report from the audit results, for the same inference
# game as the audit; the report is requested with show=True and save=False.
SignalHistogramReport.generate_report(
    metric_result=results,
    inference_game_type=InferenceGame.AVG_PRIVACY_LOSS_TRAINING_ALGO,
    save=False,
    show=True
)