In [1]:
# Automatically reload modules after executing each cell.
%load_ext autoreload
%autoreload 2

In [2]:
# General imports
import os
import tensorflow as tf
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from joblib import dump, load 

# Utility imports
from utils.losses import *
from utils.plotting import *
from utils.training import *

np.random.seed(666) # Need to do more to ensure data is the same across runs.

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # pick a number < 4 on ML4HEP; < 3 on Voltan 
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Vertical

## $d = 1$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 1
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d)).reshape(-1, 1)
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

bkgd = stats.norm(-0.1, 1)
sgnl = stats.norm(+0.1, 1)

lr = make_lr(bkgd, sgnl)

In [None]:
N = 10**6
data, m, s = split_data(X[:N], y[:N])

bce_lrs = [None] * reps
gbc_lrs = [None] * reps
for i in range(reps):
    print(i, end = ' ')
    bce_model = create_model(**bce_params)
    bce_model.load_weights(bce_filestr.format(N, i))
    bce_lrs[i] = odds_lr(bce_model, m, s)

    gbc_model = load(gbc_filestr.format(N, i))
    gbc_lrs[i] = tree_lr(gbc_model)

In [None]:
xs = np.linspace(-6, 6, 1201).reshape(-1, 1)

In [None]:
bce_preds = get_preds(bce_lrs, xs)

In [None]:
gbc_preds = get_preds(gbc_lrs, xs)

In [None]:
avg_bce = bce_preds.mean(axis = 0)
avg_gbc = gbc_preds.mean(axis = 0)

In [None]:
ratio_plot([bce_preds, gbc_preds], ['BCE', 'GBC'], lr, xs.reshape(-1), 
           figsize = (w, h), title = '\it Likelihood Ratio Models', 
           filename = 'plots/lr_models.png') 

## $d = 2$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 2
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()

## $d=4$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 4
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(91, reps):
        print(i, end = ' ')
        # Train BCE model
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()

## $d=8$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 8
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()

## $d=16$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 16
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()

## $d = 32$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 32
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()

# Zenodo

## $d = 1$

In [4]:
# Experiment parameters
num = 0
reps = 100
d = 1
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d)).reshape(-1, 1)
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [5]:
Ns = [10**7]
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(87, reps):
        print(i, end = ' ')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
        print()
    print()

10000000
0.6910528750873685 	 45
87 

2023-03-10 19:32:06.351600: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-10 19:32:07.004419: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22243 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:01:00.0, compute capability: 7.5


0.6913653612136841 	 100	
88 0.6912950277328491 	 100	
89 0.6913024187088013 	 100	
90 0.6912943720817566 	 100	
91 0.6913145780563354 	 100	
92 0.691287100315094 	 100	
93 0.6912788152694702 	 100	
94 0.6912984848022461 	 100	
95 0.6912990808486938 	 100	
96 0.6913532018661499 	 100	
97 0.6912767887115479 	 100	
98 0.6912504434585571 	 100	
99 0.6913511753082275 	 100	



## $d = 2$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 2
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
        print()
    print()

## $d = 4$

In [8]:
# Experiment parameters
num = 0
reps = 100
d = 4
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
        print()
    print()

100
1.0303720071911813 	 11
0 0.7152805924415588 	 15	
1 0.7930371165275574 	 21	
2 0.7494280338287354 	 20	
3 0.7697917222976685 	 20	
4 0.688759982585907 	 17	
5 0.7402358055114746 	 19	
6 0.7901978492736816 	 21	
7 0.7297656536102295 	 14	
8 0.7083227634429932 	 18	
9 0.7640546560287476 	 22	
10 0.7565174102783203 	 22	
11 0.7596052289009094 	 20	
12 0.6614600419998169 	 14	
13 0.7304542064666748 	 17	
14 0.722554624080658 	 19	
15 0.7085067629814148 	 20	
16 0.7608336806297302 	 17	
17 0.7507644891738892 	 21	
18 0.7018964886665344 	 17	
19 0.7381808757781982 	 17	
20 0.7348161339759827 	 21	
21 0.7106039524078369 	 18	
22 0.7307910323143005 	 16	
23 0.7920247912406921 	 22	
24 0.7406968474388123 	 18	
25 0.7073522210121155 	 22	
26 0.7665988802909851 	 21	
27 0.7552981376647949 	 21	
28 0.7021545171737671 	 18	
29 0.719815731048584 	 20	
30 0.7171556949615479 	 19	
31 0.7294284701347351 	 22	
32 0.7039250135421753 	 21	
33 0.6961967945098877 	 17	
34 0.7264349460601807 	 20	
35 0.

## $d = 8$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 8
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()

## $d = 11$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 11
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')
    
if not os.path.isdir(filestr + 'gbc/'):
    os.mkdir(filestr + 'gbc/')

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])
        dump(gbc_model, gbc_filestr.format(N, i))
    print()