# Experiment 1: Comparison of copying algorithms in dimension 2 with SNN copies

In this notebook, we perform the computations for Experiment 1, described in the thesis report. Specifically, we use Algorithms 1 and 2 to train Small Neural Network (SNN) copies on the two-dimensional datasets, for each of the 3 black box models considered in the experiment, and we then compare them with the corresponding SNN hard copies. Computations are limited to 1,000,000 synthetic samples and 240 seconds. Results are stored in the results folder of the repository.

As a remark, this particular notebook corresponds to the execution with seed 45. Nevertheless, the computations have been repeated for five different seeds (41, 42, 43, 44, and 45), aiming to increase the reliability and significance of the obtained results.

In [1]:
# All necessary imports
import numpy as np
import os
import types
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from time import perf_counter
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.colors as mcolors
import random
import pickle
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras import Model as KerasModel
import gc


# The helper module lives in ../utils; temporarily switch the working directory
# so that `utils` can be imported, then always switch back. The try/finally
# guarantees the working directory is restored even if the import fails, so the
# relative data/result paths used below (and a re-run of this cell) keep working.
original_cwd = os.getcwd()
os.chdir('../utils')
try:
    from utils import *
finally:
    os.chdir(original_cwd)

In [2]:
# Set the seed
# One seed value is used for every random number generator touched by the notebook.
seed = 45
# Python's built-in RNG.
random.seed(seed)
# NumPy's legacy global RNG (consumed by the np.random.uniform calls below).
np.random.seed(seed)
# NOTE(review): changing PYTHONHASHSEED at runtime only affects child processes;
# the hash seed of the already-running interpreter is fixed at start-up.
os.environ['PYTHONHASHSEED'] = str(seed)
# TensorFlow global seed.
tf.random.set_seed(seed)
# Keras helper that seeds Python, NumPy and TensorFlow in one call.
tf.keras.utils.set_random_seed(seed)
# Ask TensorFlow to use deterministic op implementations.
tf.config.experimental.enable_op_determinism()

# Create a wrapper for our Neural network black boxes
def bbmodelW(x):
    """Label the points ``x`` with the current global black box ``bbmodel``.

    A Keras model is called directly on ``x``; any other model is queried
    through its ``predict`` method. Outputs strictly above 0.5 are mapped to
    label 1 and everything else to label -1. The result is a flat array.
    """
    is_keras = isinstance(bbmodel, tf.keras.models.Model)
    raw_output = bbmodel(x) if is_keras else bbmodel.predict(x)
    labels = np.where(raw_output > 0.5, 1, -1)
    return labels.flatten()

## Overlapping Gaussians dataset

In [3]:
# Import dataset
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open; the context manager closes it once the arrays have been extracted.
with np.load("../data/Synth_dataset_1_train.npz") as data_train:
    X_train = data_train["X"]
    y_train = data_train["y"]

with np.load("../data/Synth_dataset_1_test.npz") as data_test:
    X_test = data_test["X"]
    y_test = data_test["y"]

print("Size of training dataset:", len(X_train))
print("Size of test dataset:", len(X_test))

Size of training dataset: 800
Size of test dataset: 200


### Black box 1: Random Forest classifier

In [4]:
# Define, train and evaluate the black box
# NOTE(review): no random_state is passed, so scikit-learn presumably falls back
# to NumPy's global RNG seeded above; reproducibility then depends on running
# the notebook top to bottom.
bbmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=5)
bbmodel.fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Accuracy of the black box on the real test set
accbb = np.mean(yhat == y_test)

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with the black box, using labels 1 and -1 (see bbmodelW)
y_test_syn = bbmodelW(data_test_syn)

In [5]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 1659 points in 240.11 seconds


In [6]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 1660 points done


In [7]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [8]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [9]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import (scipy.stats.qmc and a
# "smallest power of two >= n" helper) — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [10]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [11]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS1_1_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

17061

### Black box 2: Gradient Boosting classifier

In [12]:
# Define, train and evaluate the black box
# NOTE(review): no random_state is passed, so scikit-learn presumably falls back
# to NumPy's global RNG seeded above; reproducibility then depends on running
# the notebook top to bottom.
bbmodel = HistGradientBoostingClassifier()
bbmodel.fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Accuracy of the black box on the real test set
accbb = np.mean(yhat == y_test)

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with the black box, using labels 1 and -1 (see bbmodelW)
y_test_syn = bbmodelW(data_test_syn)

In [13]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 1907 points in 240.08 seconds


In [14]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 1908 points done


In [15]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [16]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [17]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [18]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [19]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS1_2_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

36118

### Black box 3: Neural Network classifier

In [20]:
# Define, train and evaluate the black box
bbmodel = keras.Sequential(
    [
        layers.Dense(128, activation = "relu"),
        layers.Dense(64, activation = "relu"),
        layers.Dense(32, activation = "relu"),
        layers.Dense(16, activation = "relu"),
        layers.Dense(1, activation = "sigmoid"),
    ]
)

# Pass the configured optimizer object to compile(). Previously the string
# "adam" was passed instead, so this object (and its learning rate of 0.01)
# was silently ignored and the default Adam settings were used.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
bbmodel.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy())
bbmodel.fit(X_train, y_train, batch_size=32, epochs=50, verbose = 0)

# Black-box predictions with labels 1 and -1
yhat = bbmodelW(X_test)

# Accuracy of the black box; 2*y_test-1 maps the 0/1 test labels to -1/1 so
# they are comparable with the wrapper output
accbb = np.mean(yhat == (2*y_test-1))

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with labels 1 and -1
y_test_syn = bbmodelW(data_test_syn)

In [21]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 4283 points in 240.03 seconds


In [22]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 4284 points done


In [23]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [24]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [25]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [26]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [27]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS1_3_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

28629

## Two spirals dataset

In [28]:
# Import dataset
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open; the context manager closes it once the arrays have been extracted.
with np.load("../data/Synth_dataset_2_train.npz") as data_train:
    X_train = data_train["X"]
    y_train = data_train["y"]

with np.load("../data/Synth_dataset_2_test.npz") as data_test:
    X_test = data_test["X"]
    y_test = data_test["y"]

print("Size of training dataset:", len(X_train))
print("Size of test dataset:", len(X_test))

Size of training dataset: 8000
Size of test dataset: 2000


### Black box 1: Random Forest classifier

In [29]:
# Define, train and evaluate the black box
# NOTE(review): no random_state is passed, so scikit-learn presumably falls back
# to NumPy's global RNG seeded above; reproducibility then depends on running
# the notebook top to bottom.
bbmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=5)
bbmodel.fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Accuracy of the black box on the real test set
accbb = np.mean(yhat == y_test)

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with the black box, using labels 1 and -1 (see bbmodelW)
y_test_syn = bbmodelW(data_test_syn)

In [30]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 1425 points in 240.16 seconds


In [31]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 1426 points done


In [32]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [33]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [34]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [35]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [36]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS2_1_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

12559

### Black box 2: Gradient Boosting classifier

In [37]:
# Define, train and evaluate the black box
# NOTE(review): no random_state is passed, so scikit-learn presumably falls back
# to NumPy's global RNG seeded above; reproducibility then depends on running
# the notebook top to bottom.
bbmodel = HistGradientBoostingClassifier()
bbmodel.fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Accuracy of the black box on the real test set
accbb = np.mean(yhat == y_test)

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with the black box, using labels 1 and -1 (see bbmodelW)
y_test_syn = bbmodelW(data_test_syn)

In [38]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 1861 points in 240.05 seconds


In [39]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 1862 points done


In [40]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [41]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [42]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [43]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [44]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS2_2_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

58180

### Black box 3: Neural Network classifier

In [45]:
# Define, train and evaluate the black box
bbmodel = keras.Sequential(
    [
        layers.Dense(128, activation = "relu"),
        layers.Dense(64, activation = "relu"),
        layers.Dense(32, activation = "relu"),
        layers.Dense(16, activation = "relu"),
        layers.Dense(1, activation = "sigmoid"),
    ]
)

# Pass the configured optimizer object to compile(). Previously the string
# "adam" was passed instead, so this object (and its learning rate of 0.01)
# was silently ignored and the default Adam settings were used.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
bbmodel.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy())
bbmodel.fit(X_train, y_train, batch_size=32, epochs=50, verbose = 0)

# Black-box predictions with labels 1 and -1
yhat = bbmodelW(X_test)

# Accuracy of the black box; 2*y_test-1 maps the 0/1 test labels to -1/1 so
# they are comparable with the wrapper output
accbb = np.mean(yhat == (2*y_test-1))

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with labels 1 and -1
y_test_syn = bbmodelW(data_test_syn)

In [46]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 4125 points in 240.02 seconds


In [47]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 4126 points done


In [48]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [49]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [50]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [51]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [52]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS2_3_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

60674

## Space-filling and convoluted dataset

In [53]:
# Import dataset
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open; the context manager closes it once the arrays have been extracted.
with np.load("../data/Synth_dataset_3_train.npz") as data_train:
    X_train = data_train["X"]
    y_train = data_train["y"]

with np.load("../data/Synth_dataset_3_test.npz") as data_test:
    X_test = data_test["X"]
    y_test = data_test["y"]

print("Size of training dataset:", len(X_train))
print("Size of test dataset:", len(X_test))

Size of training dataset: 8000
Size of test dataset: 2000


### Black box 1: Random Forest classifier

In [54]:
# Define, train and evaluate the black box
# NOTE(review): no random_state is passed, so scikit-learn presumably falls back
# to NumPy's global RNG seeded above; reproducibility then depends on running
# the notebook top to bottom.
bbmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=5)
bbmodel.fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Accuracy of the black box on the real test set
accbb = np.mean(yhat == y_test)

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with the black box, using labels 1 and -1 (see bbmodelW)
y_test_syn = bbmodelW(data_test_syn)

In [55]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 993 points in 240.16 seconds


In [56]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 994 points done


In [57]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 700025 points in 255.08 seconds


In [58]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 700025 points done


In [59]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [60]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [61]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS3_1_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

57508

### Black box 2: Gradient Boosting classifier

In [62]:
# Define, train and evaluate the black box
# NOTE(review): no random_state is passed, so scikit-learn presumably falls back
# to NumPy's global RNG seeded above; reproducibility then depends on running
# the notebook top to bottom.
bbmodel = HistGradientBoostingClassifier()
bbmodel.fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Accuracy of the black box on the real test set
accbb = np.mean(yhat == y_test)

# We generate 1,000,000 points uniformly in [-1, 1) x [-1, 1) to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with the black box, using labels 1 and -1 (see bbmodelW)
y_test_syn = bbmodelW(data_test_syn)

In [63]:
# Algorithm 1: query the black box (through bbmodelW) to build the synthetic
# training set of the first copy.
# NOTE(review): the positional arguments are defined in utils — presumably the
# dimension (2) and the range [-1, 1] come first; confirm the rest there.
pts_1, data_1, lab_1 = generate_distances_algo1(2, -1, 1, 3, 2, bbmodelW)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 2077 points in 240.1 seconds


In [64]:
# Train the SNN copy on the Algorithm 1 data; the log reports one step per
# training-set size.
# NOTE(review): efe_1 / acc_1 / efe_unif_1 are presumably the fidelity error on
# the test set, the accuracy, and the fidelity error on the uniform synthetic
# sample — confirm in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(pts_1, data_1, lab_1, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 2078 points done


In [65]:
# Algorithm 2: build the synthetic training set of the second copy, again
# labelled through bbmodelW.
# NOTE(review): the numeric hyper-parameters are defined in utils; confirm
# their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(2, -1, 1, 40_000, 25, 1_200, 1, 0.1, bbmodelW, 2_000)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [66]:
# Train the SNN copy on the Algorithm 2 data, with the same test sets as for
# the Algorithm 1 copy.
# NOTE(review): return values presumably have the same meaning as for the
# Algorithm 1 copy — confirm in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(pts_2, data_2, lab_2, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [67]:
# Training set for the hard copy: the first 1,000,000 points of an unscrambled
# Sobol sequence, rescaled to the square [-1, 1) x [-1, 1) and labelled by the
# black box.
# NOTE(review): `qmc` and `next_power_of_2` are not imported in this notebook;
# presumably they come from the `utils` star import — confirm.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
# random_base2 draws 2**m points, hence the rounding up and the truncation below.
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))
sobol_points = sobol_points[:1_000_000]
data_3 = 2*sobol_points - 1
lab_3 = bbmodelW(data_3)

# Training-set sizes stored with the hard-copy results. The list is not passed
# to train_copy_SNNh; it mirrors the sizes printed in that function's log.
pts_3 = [50, 200, 500, 1_000, 5_000, 10_000, 50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000]

In [68]:
# Train the SNN hard copy on the Sobol points and their black-box labels,
# evaluated on the same test sets as the other two copies.
# NOTE(review): return values presumably mirror train_copy_SNNd — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(data_3, lab_3, X_test, y_test, data_test_syn, y_test_syn, bbmodelW)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [69]:
# Bundle the results of the three copies (Algorithm 1, Algorithm 2, hard copy)
# together with the black box, and persist them for this dataset / black box / seed.
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    # Per-copy results live in notebook globals named model<i>, pts_<i>, efe_<i>, ...
    data_to_store[model_key] = {
        "model": globals()[f"model{i}"],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"]
    }

data_to_store["blackb"] = {
    "model": bbmodel,
    "acc": accbb
}

filename = f"../results/results_DS3_2_1_seed{seed}.pkl"
# Create the results folder if it is missing, so that a long run is not lost to
# a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects before the next black box is
# trained, then force a garbage collection (its return value is the cell output).
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    del globals()[f"model{i}"]
    del globals()[f"pts_{i}"]
    del globals()[f"efe_{i}"]
    del globals()[f"acc_{i}"]
    del globals()[f"efe_unif_{i}"]
del bbmodel
del accbb
gc.collect()

58180

### Black box 3: Neural Network classifier

In [70]:
# Define, train and evaluate the black box: a fully connected binary classifier
# whose sigmoid output is thresholded at 0.5 by the bbmodelW wrapper.
bbmodel = keras.Sequential(
    [
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(16, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

# NOTE(review): this cell previously built Adam(learning_rate=0.01) but then
# compiled with the string "adam", so that object was never used and training
# ran with Keras' default Adam learning rate (0.001). The optimizer object is
# now passed to compile() explicitly, configured with the learning rate that
# was actually in effect, so the behaviour of the cell is unchanged. If 0.01
# was the intended setting, change it here (stored results would then need to
# be regenerated).
optimizer = keras.optimizers.Adam(learning_rate=0.001)
bbmodel.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy())
bbmodel.fit(X_train, y_train, batch_size=32, epochs=50, verbose=0)

# Black-box predictions on the real test set, mapped to {-1, 1} by bbmodelW.
yhat = bbmodelW(X_test)

# Accuracy of the black box. 2*y_test-1 rescales the test labels so they can be
# compared with the wrapper's {-1, 1} output (assumes y_test holds 0/1 labels —
# TODO confirm where y_test is created).
accbb = np.mean(yhat == (2*y_test-1))

# We generate 1,000,000 points uniformly in [-1, 1]^2 to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 2))

# We label these points with labels 1 and -1 using the black box.
y_test_syn = bbmodelW(data_test_syn)

In [71]:
# Algorithm 1 sampling against the current black box (queried via bbmodelW).
# The returned triple is handed unchanged to train_copy_SNNd in the next cell.
# NOTE(review): the leading 2, -1, 1 look like the dimension and the bounds of
# the sampling domain; the meaning of the following 3 and 2 is defined by
# generate_distances_algo1 in utils — confirm there.
pts_1, data_1, lab_1 = generate_distances_algo1(
    2, -1, 1,
    3, 2,
    bbmodelW,
)

We have labelled 50 points
We have labelled 200 points
We have labelled 500 points
We have labelled 1000 points
We have labelled 3639 points in 240.01 seconds


In [72]:
# Train the copy on the Algorithm 1 samples and evaluate it against both the
# real test set (X_test, y_test) and the uniform synthetic test set
# (data_test_syn, y_test_syn) labelled by the black box.
# NOTE(review): efe_* / acc_* are presumably the fidelity-error and accuracy
# curves per sample size — confirm against train_copy_SNNd in utils.
efe_1, acc_1, efe_unif_1, model1 = train_copy_SNNd(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 3640 points done


In [73]:
# Algorithm 2 sampling against the current black box (queried via bbmodelW).
# The returned triple is handed unchanged to train_copy_SNNd in the next cell.
# NOTE(review): the leading 2, -1, 1 look like the dimension and the bounds of
# the sampling domain; the remaining numeric settings are defined by
# generate_distances_algo2 in utils — confirm their meaning there.
pts_2, data_2, lab_2 = generate_distances_algo2(
    2, -1, 1,
    40_000, 25, 1_200, 1, 0.1,
    bbmodelW,
    2_000,
)

We have labelled 10000 points
We have labelled 50000 points
We have labelled 200000 points
We have labelled 400000 points
We have labelled 600000 points
We have labelled 800000 points
We have labelled 1000000 points


In [74]:
# Train the copy on the Algorithm 2 samples and evaluate it against both the
# real test set (X_test, y_test) and the uniform synthetic test set
# (data_test_syn, y_test_syn) labelled by the black box.
# NOTE(review): efe_* / acc_* are presumably the fidelity-error and accuracy
# curves per sample size — confirm against train_copy_SNNd in utils.
efe_2, acc_2, efe_unif_2, model2 = train_copy_SNNd(
    pts_2, data_2, lab_2,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [75]:
# Data for the third copy (trained with train_copy_SNNh): points from an
# unscrambled 2-D Sobol sequence, labelled by the black box.
# random_base2 draws 2**m points, so we draw the block sized by
# next_power_of_2 (presumably the smallest power of two >= 1,000,000 — confirm
# in utils) and keep only the first 1,000,000 points.
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=2, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Rescale the Sobol points with x -> 2x - 1 and query the black box wrapper.
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample-size grid kept alongside the third copy's results when they are stored.
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [76]:
# Train the copy on the Sobol-sampled data (data_3, lab_3) and evaluate it on
# the real test set and on the uniform synthetic test set.
# NOTE(review): train_copy_SNNh is presumably the "hard copy" trainer mentioned
# in the notebook introduction — confirm in utils.
efe_3, acc_3, efe_unif_3, model3 = train_copy_SNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done
Computations for 200 points done
Computations for 500 points done
Computations for 1000 points done
Computations for 5000 points done
Computations for 10000 points done
Computations for 50000 points done
Computations for 200000 points done
Computations for 400000 points done
Computations for 600000 points done
Computations for 800000 points done
Computations for 1000000 points done


In [77]:
# Gather, for each of the three copies, the trained model together with its
# sample-size grid and metric curves. The variables are looked up in the
# notebook namespace by their numeric suffix (model1, pts_1, efe_1, ...).
data_to_store = {}
for i in (1, 2, 3):
    model_key = f"model{i}"
    data_to_store[model_key] = dict(
        model=globals()[model_key],
        pts=globals()[f"pts_{i}"],
        efe=globals()[f"efe_{i}"],
        acc=globals()[f"acc_{i}"],
        efe_unif=globals()[f"efe_unif_{i}"],
    )

# The black box and its test accuracy are stored next to the copies.
data_to_store["blackb"] = dict(model=bbmodel, acc=accbb)

# Persist the bundle; the file name carries the seed of this run.
filename = f"../results/results_DS3_3_1_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every reference to the stored objects, then force a collection pass so
# their memory can be reclaimed.
data_to_store.clear()
del data_to_store
for i in (1, 2, 3):
    del (
        globals()[f"model{i}"],
        globals()[f"pts_{i}"],
        globals()[f"efe_{i}"],
        globals()[f"acc_{i}"],
        globals()[f"efe_unif_{i}"],
    )
del bbmodel, accbb
gc.collect()

60612