# Extension: Comparison of copy stages in higher dimensions with MNN copies

In this notebook, we perform the computations corresponding to the Two-stage Distance Copying Extension, described in Appendix D of the thesis report. Specifically, we train two-stage Medium Neural Network (MNN) copies on the high-dimensional UCI datasets, for each of the 3 black box models considered, and later compare them to the corresponding MNN hard copies. Computations are limited to 1,000,000 synthetic samples generated with Algorithm 2 and to 600 seconds. Results are stored in the corresponding results folder of the repository.

As a remark, this particular notebook corresponds to the execution with seed 45. Nevertheless, the computations have been repeated for five different seeds (41, 42, 43, 44, and 45), aiming to increase the reliability and significance of the obtained results.

In [1]:
# All necessary imports
import numpy as np
import os
import types
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from time import perf_counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.colors as mcolors
import random
import pickle
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras import Model as KerasModel
from ucimlrepo import fetch_ucirepo
from scipy.stats import qmc
import gc


# Import the project helpers from ../utils by temporarily switching the working
# directory. The names used later in this notebook that are not imported above
# (normalize, next_power_of_2, generate_distances_algo3, train_copy_MNNd_2_stages,
# train_copy_MNNh) are presumably provided by this star import - confirm in utils.
original_cwd = os.getcwd()
os.chdir('../utils')
try:
    from utils import *
finally:
    # Always return to the notebook directory, even when the import fails;
    # otherwise every later relative path (e.g. ../results/...) would resolve
    # from ../utils instead.
    os.chdir(original_cwd)

In [2]:
# Seed used by this execution of the notebook
seed = 45

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the Python, NumPy and TensorFlow/Keras random generators
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
tf.keras.utils.set_random_seed(seed)

# Ask TensorFlow to run its ops deterministically
tf.config.experimental.enable_op_determinism()

# Wrapper that exposes whichever black box is currently bound to the global
# `bbmodel` (Keras model or any object with .predict) as a 1 / -1 labeller
def bbmodelW(x):
    """Label the points in `x` with the current global black box `bbmodel`.

    A Keras model is called directly on `x`; any other model is queried
    through its `predict` method. Outputs strictly greater than 0.5 become 1,
    everything else becomes -1, and the result is flattened to one dimension.
    """
    if isinstance(bbmodel, tf.keras.models.Model):
        raw_output = bbmodel(x)
    else:
        raw_output = bbmodel.predict(x)
    return np.where(raw_output > 0.5, 1, -1).flatten()

## Breast cancer dataset

In [3]:
# Download the Breast Cancer Wisconsin (Diagnostic) dataset (UCI repository id 17)
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Features go through the project's normalize helper;
# the target is 1.0 for class 'M' and 0.0 otherwise
X = normalize(breast_cancer_wisconsin_diagnostic.data.features.values)
y = np.where(breast_cancer_wisconsin_diagnostic.data.targets.values.flatten() == 'M', 1.,0.)

# Keep 20% of the real data aside as test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

### Black box 1: Random Forest classifier

In [4]:
# Fit the Random Forest black box on the training split and predict the test split
bbmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=5).fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Black box accuracy: fraction of test labels predicted correctly
accbb = (yhat == y_test).mean()

# Synthetic evaluation set for the copies: 1,000,000 uniform points with 30 features
data_test_syn = np.random.uniform(low=-1, high=1, size=(1000000, 30))

# The wrapper labels those points as 1 / -1 according to the black box
y_test_syn = bbmodelW(data_test_syn)

In [5]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 30, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    30, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [6]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [7]:
# Draw an unscrambled 30-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=30, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [8]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [9]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS4_1_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

13567

### Black box 2: Gradient Boosting classifier

In [10]:
# Fit the Gradient Boosting black box on the training split and predict the test split
bbmodel = HistGradientBoostingClassifier().fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Black box accuracy: fraction of test labels predicted correctly
accbb = (yhat == y_test).mean()

# Synthetic evaluation set for the copies: 1,000,000 uniform points with 30 features
data_test_syn = np.random.uniform(low=-1, high=1, size=(1000000, 30))

# The wrapper labels those points as 1 / -1 according to the black box
y_test_syn = bbmodelW(data_test_syn)

In [11]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 30, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    30, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [12]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [13]:
# Draw an unscrambled 30-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=30, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [14]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [15]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS4_2_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

26659

### Black box 3: Neural Network classifier

In [16]:
# Define, train and evaluate the neural network black box:
# four ReLU hidden layers (128-64-32-16) and a single sigmoid output unit
bbmodel = keras.Sequential(
    [
        layers.Dense(128, activation = "relu"),
        layers.Dense(64, activation = "relu"),
        layers.Dense(32, activation = "relu"),
        layers.Dense(16, activation = "relu"),
        layers.Dense(1, activation = "sigmoid"),
    ]
)

# Pass the configured optimizer object itself. Compiling with the string "adam"
# builds a fresh default Adam and silently discards the learning rate set here.
# NOTE(review): this changes the trained black box, so previously stored
# results for this experiment must be regenerated.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
bbmodel.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy())
bbmodel.fit(X_train, y_train, batch_size=32, epochs=50, verbose = 0)

# The wrapper returns labels in {-1, 1}
yhat = bbmodelW(X_test)

# Accuracy of the black box; the {0, 1} targets are mapped to {-1, 1} to
# match the wrapper's encoding
accbb = np.mean(yhat == (2*y_test-1))

# We generate 1,000,000 points uniformly to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 30))

# We label these points with labels 1 and -1
y_test_syn = bbmodelW(data_test_syn)

In [17]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 30, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    30, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [18]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [19]:
# Draw an unscrambled 30-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=30, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [20]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [21]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS4_3_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

49608

## Rice dataset

In [22]:
# Download the Rice (Cammeo and Osmancik) dataset (UCI repository id 545)
rice_cammeo_and_osmancik = fetch_ucirepo(id=545)

# Features go through the project's normalize helper;
# the target is 1.0 for class 'Cammeo' and 0.0 otherwise
X = normalize(rice_cammeo_and_osmancik.data.features.values)
y = np.where(rice_cammeo_and_osmancik.data.targets.values.flatten() == 'Cammeo', 1.,0.)

# Keep 20% of the real data aside as test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

### Black box 1: Random Forest classifier

In [23]:
# Fit the Random Forest black box on the training split and predict the test split
bbmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=5).fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Black box accuracy: fraction of test labels predicted correctly
accbb = (yhat == y_test).mean()

# Synthetic evaluation set for the copies: 1,000,000 uniform points with 7 features
data_test_syn = np.random.uniform(low=-1, high=1, size=(1000000, 7))

# The wrapper labels those points as 1 / -1 according to the black box
y_test_syn = bbmodelW(data_test_syn)

In [24]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 7, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    7, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [25]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [26]:
# Draw an unscrambled 7-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=7, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [27]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [28]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS5_1_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

26633

### Black box 2: Gradient Boosting classifier

In [29]:
# Fit the Gradient Boosting black box on the training split and predict the test split
bbmodel = HistGradientBoostingClassifier().fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Black box accuracy: fraction of test labels predicted correctly
accbb = (yhat == y_test).mean()

# Synthetic evaluation set for the copies: 1,000,000 uniform points with 7 features
data_test_syn = np.random.uniform(low=-1, high=1, size=(1000000, 7))

# The wrapper labels those points as 1 / -1 according to the black box
y_test_syn = bbmodelW(data_test_syn)

In [30]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 7, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    7, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [31]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [32]:
# Draw an unscrambled 7-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=7, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [33]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [34]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS5_2_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

8785

### Black box 3: Neural Network classifier

In [35]:
# Define, train and evaluate the neural network black box:
# four ReLU hidden layers (128-64-32-16) and a single sigmoid output unit
bbmodel = keras.Sequential(
    [
        layers.Dense(128, activation = "relu"),
        layers.Dense(64, activation = "relu"),
        layers.Dense(32, activation = "relu"),
        layers.Dense(16, activation = "relu"),
        layers.Dense(1, activation = "sigmoid"),
    ]
)

# Pass the configured optimizer object itself. Compiling with the string "adam"
# builds a fresh default Adam and silently discards the learning rate set here.
# NOTE(review): this changes the trained black box, so previously stored
# results for this experiment must be regenerated.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
bbmodel.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy())
bbmodel.fit(X_train, y_train, batch_size=32, epochs=50, verbose = 0)

# The wrapper returns labels in {-1, 1}
yhat = bbmodelW(X_test)

# Accuracy of the black box; the {0, 1} targets are mapped to {-1, 1} to
# match the wrapper's encoding
accbb = np.mean(yhat == (2*y_test-1))

# We generate 1,000,000 points uniformly to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 7))

# We label these points with labels 1 and -1
y_test_syn = bbmodelW(data_test_syn)

In [36]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 7, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    7, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [37]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [38]:
# Draw an unscrambled 7-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=7, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [39]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [40]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS5_3_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

99259

## Connectionist bench (mines vs rocks) dataset

In [41]:
# Download the Connectionist Bench (Sonar, Mines vs. Rocks) dataset (UCI repository id 151)
connectionist_bench_sonar_mines_vs_rocks = fetch_ucirepo(id=151)

# Features go through the project's normalize helper;
# the target is 1.0 for class 'M' and 0.0 otherwise
X = normalize(connectionist_bench_sonar_mines_vs_rocks.data.features.values)
y = np.where(connectionist_bench_sonar_mines_vs_rocks.data.targets.values.flatten() == 'M', 1.,0.)

# Keep 20% of the real data aside as test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

### Black box 1: Random Forest classifier

In [42]:
# Fit the Random Forest black box on the training split and predict the test split
bbmodel = RandomForestClassifier(max_depth=10, min_samples_leaf=5).fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Black box accuracy: fraction of test labels predicted correctly
accbb = (yhat == y_test).mean()

# Synthetic evaluation set for the copies: 1,000,000 uniform points with 60 features
data_test_syn = np.random.uniform(low=-1, high=1, size=(1000000, 60))

# The wrapper labels those points as 1 / -1 according to the black box
y_test_syn = bbmodelW(data_test_syn)

In [43]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 60, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    60, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [44]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [45]:
# Draw an unscrambled 60-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=60, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [46]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [47]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS6_1_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

91768

### Black box 2: Gradient Boosting classifier

In [48]:
# Fit the Gradient Boosting black box on the training split and predict the test split
bbmodel = HistGradientBoostingClassifier().fit(X_train, y_train)
yhat = bbmodel.predict(X_test)

# Black box accuracy: fraction of test labels predicted correctly
accbb = (yhat == y_test).mean()

# Synthetic evaluation set for the copies: 1,000,000 uniform points with 60 features
data_test_syn = np.random.uniform(low=-1, high=1, size=(1000000, 60))

# The wrapper labels those points as 1 / -1 according to the black box
y_test_syn = bbmodelW(data_test_syn)

In [49]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 60, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    60, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 840020 points in 619.83 seconds


In [50]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 840020 points done


In [51]:
# Draw an unscrambled 60-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=60, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [52]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [53]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS6_2_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

93064

### Black box 3: Neural Network classifier

In [54]:
# Define, train and evaluate the neural network black box:
# four ReLU hidden layers (128-64-32-16) and a single sigmoid output unit
bbmodel = keras.Sequential(
    [
        layers.Dense(128, activation = "relu"),
        layers.Dense(64, activation = "relu"),
        layers.Dense(32, activation = "relu"),
        layers.Dense(16, activation = "relu"),
        layers.Dense(1, activation = "sigmoid"),
    ]
)

# Pass the configured optimizer object itself. Compiling with the string "adam"
# builds a fresh default Adam and silently discards the learning rate set here.
# NOTE(review): this changes the trained black box, so previously stored
# results for this experiment must be regenerated.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
bbmodel.compile(optimizer=optimizer, loss=keras.losses.BinaryCrossentropy())
bbmodel.fit(X_train, y_train, batch_size=32, epochs=50, verbose=0)

# The wrapper returns labels in {-1, 1}
yhat = bbmodelW(X_test)

# Accuracy of the black box; the {0, 1} targets are mapped to {-1, 1} to
# match the wrapper's encoding
accbb = np.mean(yhat == (2*y_test-1))

# We generate 1,000,000 points uniformly to test the copy.
data_test_syn = np.random.uniform(-1,1, (1000000, 60))

# We label these points with labels 1 and -1
y_test_syn = bbmodelW(data_test_syn)

In [55]:
# Generate the synthetic training set, labelled by the black box wrapper.
# NOTE(review): the positional arguments are defined by generate_distances_algo3
# in utils; the leading 60, -1, 1 presumably give the input dimension and the
# sampling bounds - confirm there.
pts_1, data_1, lab_1 = generate_distances_algo3(
    60, -1, 1,
    50_000, 20, 2_500, 3, 1.5,
    bbmodelW,
    2_000,
)

# The second copy is stored against the same list of sample sizes as the first
pts_2 = pts_1

We have labelled 10000 points


We have labelled 50000 points


We have labelled 200000 points


We have labelled 400000 points


We have labelled 600000 points


We have labelled 800000 points


We have labelled 1000000 points


In [56]:
# Train the two-stage MNN copies. The eight return values are unpacked as
# (efe, acc, efe_unif, model) for copy 1 followed by the same four for copy 2.
(efe_1, acc_1, efe_unif_1, model1,
 efe_2, acc_2, efe_unif_2, model2) = train_copy_MNNd_2_stages(
    pts_1, data_1, lab_1,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [57]:
# Draw an unscrambled 60-dimensional Sobol sequence: generate 2**m points,
# where 2**m is the power of two returned by next_power_of_2(1_000_000),
# and keep only the first 1,000,000 of them
n_pow2 = next_power_of_2(1_000_000)
sampler = qmc.Sobol(d=60, scramble=False)
sobol_points = sampler.random_base2(m=int(np.log2(n_pow2)))[:1_000_000]

# Map the unit-cube samples onto [-1, 1] and label them with the black box
data_3 = sobol_points * 2 - 1
lab_3 = bbmodelW(data_3)

# Sample sizes associated with the hard copy
pts_3 = [
    50, 200, 500, 1_000, 5_000, 10_000,
    50_000, 200_000, 400_000, 600_000, 800_000, 1_000_000,
]

In [58]:
# Train the MNN hard copy on the Sobol samples and unpack its metrics and model
efe_3, acc_3, efe_unif_3, model3 = train_copy_MNNh(
    data_3, lab_3,
    X_test, y_test,
    data_test_syn, y_test_syn,
    bbmodelW,
)

Computations for 50 points done


Computations for 200 points done


Computations for 500 points done


Computations for 1000 points done


Computations for 5000 points done


Computations for 10000 points done


Computations for 50000 points done


Computations for 200000 points done


Computations for 400000 points done


Computations for 600000 points done


Computations for 800000 points done


Computations for 1000000 points done


In [59]:
# Bundle the three trained copies (model, sample sizes, metrics) under the
# keys "model1".."model3", followed by the black box and its accuracy
data_to_store = {}

for i in range(1, 4):
    model_key = f"model{i}"
    data_to_store[model_key] = {
        "model": globals()[model_key],
        "pts": globals()[f"pts_{i}"],
        "efe": globals()[f"efe_{i}"],
        "acc": globals()[f"acc_{i}"],
        "efe_unif": globals()[f"efe_unif_{i}"],
    }

data_to_store["blackb"] = {"model": bbmodel, "acc": accbb}

# Persist the bundle for this dataset / black box / seed
filename = f"../results/extension_results_DS6_3_2_seed{seed}.pkl"
with open(filename, "wb") as f:
    pickle.dump(data_to_store, f)

# Drop every per-experiment global so the next experiment starts clean,
# then force a garbage collection pass
data_to_store.clear()
del data_to_store
for i in range(1, 4):
    globals().pop(f"model{i}")
    globals().pop(f"pts_{i}")
    globals().pop(f"efe_{i}")
    globals().pop(f"acc_{i}")
    globals().pop(f"efe_unif_{i}")
del bbmodel, accbb
gc.collect()

93302