In [1]:
# Load IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell's code is executed, so edits to the local utils/ modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# General imports
import os
import tensorflow as tf
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from joblib import dump, load 

# Utility imports
from utils.losses import *
from utils.plotting import *
from utils.training import *

# Seed NumPy's legacy global RNG. Only NumPy is seeded here; no TensorFlow or
# other RNG is seeded in this cell, which is what the original note below means.
# NOTE(review): `np` is not imported explicitly in this notebook; it is
# presumably re-exported by one of the `from utils.* import *` lines — confirm.
np.random.seed(666) # Need to do more to ensure data is the same across runs.

In [3]:
# Expose a single GPU to this process and let TensorFlow grow its memory use on
# demand instead of reserving the whole card up front.
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # pick a number < 4 on ML4HEP; < 3 on Voltan
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    # Without this guard, indexing the empty list raised IndexError on
    # machines where no GPU is visible.
    print('No GPU visible to TensorFlow; continuing on CPU.')

# Vertical

## $d = 1$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained/loaded per training-set size
d = 1                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are read back with
# joblib.load in the next cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/trees/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
# The features are reshaped into a single column for the d = 1 case.
X = np.load('data/trees/{}/X_trn.npy'.format(d)).reshape(-1, 1)
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

# Unit-width normal distributions centred at -0.1 and +0.1.
bkgd = stats.norm(-0.1, 1)
sgnl = stats.norm(+0.1, 1)

# NOTE(review): make_lr comes from the utils star imports; presumably it builds
# the exact likelihood ratio of the two distributions — confirm.
lr = make_lr(bkgd, sgnl)

In [None]:
# Reload the ensembles that were trained on the first N samples and wrap each
# saved model as a likelihood-ratio estimate.
N = 10**6
data, m, s = split_data(X[:N], y[:N])

# One slot per repetition, filled in the loop below.
bce_lrs = [None for _ in range(reps)]
gbc_lrs = [None for _ in range(reps)]
for i in range(reps):
    print(i, end = ' ')

    # Neural network: rebuild the architecture, then restore the saved weights.
    bce_weights = bce_filestr.format(N, i)
    bce_model = create_model(**bce_params)
    bce_model.load_weights(bce_weights)
    bce_lrs[i] = odds_lr(bce_model, m, s)

    # Gradient-boosting classifier stored with joblib.
    # NOTE: joblib.load unpickles the file, so only load files you produced yourself.
    gbc_pickle = gbc_filestr.format(N, i)
    gbc_model = load(gbc_pickle)
    gbc_lrs[i] = tree_lr(gbc_model)

In [None]:
# Evaluation grid: 1201 evenly spaced points on [-6, 6], stored as a column.
xs = np.linspace(-6, 6, 1201)[:, np.newaxis]

In [None]:
# Evaluate the BCE likelihood-ratio models on the grid xs.
# NOTE(review): get_preds comes from the utils star imports; the mean(axis = 0)
# taken later suggests the result has one row per model — confirm.
bce_preds = get_preds(bce_lrs, xs)

In [None]:
# Evaluate the GBC likelihood-ratio models on the grid xs.
# NOTE(review): get_preds comes from the utils star imports; the mean(axis = 0)
# taken later suggests the result has one row per model — confirm.
gbc_preds = get_preds(gbc_lrs, xs)

In [None]:
# Average the predictions along the first axis, separately for each model family.
avg_bce = np.mean(bce_preds, axis = 0)
avg_gbc = np.mean(gbc_preds, axis = 0)

In [None]:
# Plot both ensembles against the reference lr and save the figure.
# The title is a raw string: '\i' is not a valid Python escape sequence, so the
# non-raw literal triggered an invalid-escape warning. The string value is unchanged.
# NOTE(review): w and h are not defined in this notebook; presumably they are
# exported by the utils.plotting star import — confirm.
ratio_plot([bce_preds, gbc_preds], ['BCE', 'GBC'], lr, xs.reshape(-1),
           figsize = (w, h), title = r'\it Likelihood Ratio Models',
           filename = 'plots/lr_models.png')

## $d = 2$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 2                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/trees/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Sweep the training-set sizes. In this cell only the GBC ensemble is trained;
# the BDT and BCE steps are kept below as disabled (commented-out) code.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples. The split is only consumed by the
    # disabled steps, but the call is kept exactly as before.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # Disabled: a single XGBoost BDT per N.
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Disabled: BCE network training.
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))

        # A fresh gradient-boosting classifier per repetition, fitted on the
        # full N-sample slice and pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

## $d = 4$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 4                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/trees/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Sweep the training-set sizes. In this cell only the GBC ensemble is trained;
# the BDT and BCE steps are kept below as disabled (commented-out) code.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    # The loop previously started at a hard-coded 91 for every N, so a fresh
    # run produced only models 91..99 per size. Iterate over all repetitions
    # and skip the ones already on disk, which keeps an interrupted run
    # resumable without editing the range by hand.
    for i in range(reps):
        gbc_path = gbc_filestr.format(N, i)
        if os.path.isfile(gbc_path):
            # NOTE: a file left truncated by an earlier crash is also skipped;
            # delete it manually to force retraining.
            continue

        print(i, end = ' ')
        # Train BCE model
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))
        
        # Train GBC model
        gbc_model = GradientBoostingClassifier(validation_fraction = 0.25,
                                               n_iter_no_change = 10)
        gbc_model.fit(X[:N], y[:N])

        # Write to a temporary name and rename, so a model file only ever
        # appears under its final name once it is complete.
        tmp_path = gbc_path + '.tmp'
        dump(gbc_model, tmp_path)
        os.replace(tmp_path, gbc_path)
    print()

## $d = 8$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 8                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/trees/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Sweep the training-set sizes. In this cell only the GBC ensemble is trained;
# the BDT and BCE steps are kept below as disabled (commented-out) code.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples. The split is only consumed by the
    # disabled steps, but the call is kept exactly as before.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # Disabled: a single XGBoost BDT per N.
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Disabled: BCE network training.
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))

        # A fresh gradient-boosting classifier per repetition, fitted on the
        # full N-sample slice and pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

## $d = 16$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 16                     # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/trees/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Sweep the training-set sizes. In this cell only the GBC ensemble is trained;
# the BDT and BCE steps are kept below as disabled (commented-out) code.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples. The split is only consumed by the
    # disabled steps, but the call is kept exactly as before.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # Disabled: a single XGBoost BDT per N.
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Disabled: BCE network training.
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))

        # A fresh gradient-boosting classifier per repetition, fitted on the
        # full N-sample slice and pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

## $d = 32$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 32                     # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/trees/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Sweep the training-set sizes. In this cell only the GBC ensemble is trained;
# the BDT and BCE steps are kept below as disabled (commented-out) code.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples. The split is only consumed by the
    # disabled steps, but the call is kept exactly as before.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # Disabled: a single XGBoost BDT per N.
    #bdt_model = XGBClassifier(early_stopping_rounds = 10)
    #X_trn, X_vld, y_trn, y_vld = data
    #bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    #trace = bdt_model.evals_result()['validation_0']
    #print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    #bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Disabled: BCE network training.
        #bce_model, trace = train(data, **bce_params)
        #bce_model.save_weights(bce_filestr.format(N, i))

        # A fresh gradient-boosting classifier per repetition, fitted on the
        # full N-sample slice and pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

# Zenodo

## $d = 1$

In [4]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 1                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/zenodo/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
# The features are reshaped into a single column for the d = 1 case.
X = np.load('data/zenodo/{}/X_trn.npy'.format(d)).reshape(-1, 1)
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Train and save every model family at each training-set size in Ns.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples; split_data also returns m and s,
    # which are not used in this cell.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # One XGBoost BDT per N, early-stopped on the validation part of `data`.
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    logloss = trace['logloss']
    # Report the last validation log-loss and the number of recorded rounds.
    print(logloss[-1], '\t', len(logloss), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Neural network trained on the split data; only its weights are saved.
        bce_model, trace = train(data, **bce_params)
        bce_path = bce_filestr.format(N, i)
        bce_model.save_weights(bce_path)

        # Gradient-boosting classifier fitted on the full N-sample slice and
        # pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

100
0.9300370988249779 	 11
0 

2023-03-10 08:06:47.141047: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-10 08:06:47.715220: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22243 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:01:00.0, compute capability: 7.5


0.7343477606773376 	 11	1 0.7395363450050354 	 13	2 0.7321134805679321 	 11	3 0.7334631085395813 	 11	4 0.7235618829727173 	 11	5 0.7205410599708557 	 11	6 0.7434526085853577 	 11	7 0.7404569983482361 	 11	8 0.739315927028656 	 13	9 0.7340620160102844 	 11	10 0.7243669629096985 	 12	11 0.7452961802482605 	 18	12 0.7284832000732422 	 11	13 0.7443119883537292 	 11	14 0.7229510545730591 	 11	15 0.7280807495117188 	 11	16 0.7416480779647827 	 11	17 0.7267101407051086 	 12	18 0.727083683013916 	 11	19 0.7150323390960693 	 11	20 0.7355436086654663 	 11	21 0.7389186024665833 	 11	22 0.7348420023918152 	 11	23 0.7321674823760986 	 12	24 0.7340015172958374 	 11	25 0.7204300761222839 	 13	26 0.7411311864852905 	 11	27 0.72242271900177 	 11	28 0.7186024188995361 	 11	29 0.7358741164207458 	 11	30 0.7357438802719116 	 13	31 0.7386594414710999 	 14	32 0.7243064045906067 	 13	33 0.7336468696594238 	 11	34 0.7279472947120667 	 11	35 0.7523735761642456 	 11	36 0.7208036780357361 	 11	37 0.727706015110

## $d = 2$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 2                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/zenodo/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Train and save every model family at each training-set size in Ns.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples; split_data also returns m and s,
    # which are not used in this cell.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # One XGBoost BDT per N, early-stopped on the validation part of `data`.
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    logloss = trace['logloss']
    # Report the last validation log-loss and the number of recorded rounds.
    print(logloss[-1], '\t', len(logloss), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Neural network trained on the split data; only its weights are saved.
        bce_model, trace = train(data, **bce_params)
        bce_path = bce_filestr.format(N, i)
        bce_model.save_weights(bce_path)

        # Gradient-boosting classifier fitted on the full N-sample slice and
        # pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

## $d = 4$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 4                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/zenodo/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Train and save every model family at each training-set size in Ns.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples; split_data also returns m and s,
    # which are not used in this cell.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # One XGBoost BDT per N, early-stopped on the validation part of `data`.
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    logloss = trace['logloss']
    # Report the last validation log-loss and the number of recorded rounds.
    print(logloss[-1], '\t', len(logloss), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Neural network trained on the split data; only its weights are saved.
        bce_model, trace = train(data, **bce_params)
        bce_path = bce_filestr.format(N, i)
        bce_model.save_weights(bce_path)

        # Gradient-boosting classifier fitted on the full N-sample slice and
        # pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

## $d = 8$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 8                      # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/zenodo/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Train and save every model family at each training-set size in Ns.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples; split_data also returns m and s,
    # which are not used in this cell.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # One XGBoost BDT per N, early-stopped on the validation part of `data`.
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    logloss = trace['logloss']
    # Report the last validation log-loss and the number of recorded rounds.
    print(logloss[-1], '\t', len(logloss), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Neural network trained on the split data; only its weights are saved.
        bce_model, trace = train(data, **bce_params)
        bce_path = bce_filestr.format(N, i)
        bce_model.save_weights(bce_path)

        # Gradient-boosting classifier fitted on the full N-sample slice and
        # pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()

## $d = 11$

In [None]:
# Experiment parameters
num = 0                    # index of the model set; used in the output path below
reps = 100                 # number of models trained per training-set size
d = 11                     # input dimension; also selects the data/model subdirectory
Ns = 10**np.arange(2, 8)   # training-set sizes 10^2 ... 10^7

# Model parameters
bce_params = {'loss':bce, 'd': d}

# Path templates for the saved models. The gbc files are written with
# joblib.dump in the training cell, so despite the .h5 suffix they are not HDF5.
filestr = 'models/zenodo/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'
gbc_filestr = filestr + 'gbc/model_{}_{}.h5'

# os.makedirs creates missing parents (models/zenodo/<d>/set_<num>/) as well;
# the previous os.mkdir calls raised FileNotFoundError on a fresh checkout.
for subdir in ('bce/', 'bdt/', 'gbc/'):
    os.makedirs(filestr + subdir, exist_ok = True)

# Data parameters
X = np.load('data/zenodo/{}/X_trn.npy'.format(d))
y = np.load('data/zenodo/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
# Train and save every model family at each training-set size in Ns.
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Restrict to the first N samples; split_data also returns m and s,
    # which are not used in this cell.
    X_N, y_N = X[:N], y[:N]
    data, m, s = split_data(X_N, y_N)

    # One XGBoost BDT per N, early-stopped on the validation part of `data`.
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    logloss = trace['logloss']
    # Report the last validation log-loss and the number of recorded rounds.
    print(logloss[-1], '\t', len(logloss), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ' ')
        # Neural network trained on the split data; only its weights are saved.
        bce_model, trace = train(data, **bce_params)
        bce_path = bce_filestr.format(N, i)
        bce_model.save_weights(bce_path)

        # Gradient-boosting classifier fitted on the full N-sample slice and
        # pickled with joblib.
        gbc_model = GradientBoostingClassifier(n_iter_no_change = 10,
                                               validation_fraction = 0.25)
        gbc_model.fit(X_N, y_N)
        gbc_path = gbc_filestr.format(N, i)
        dump(gbc_model, gbc_path)
    print()