In [1]:
# Automatically reload modules after executing each cell.
%load_ext autoreload
%autoreload 2

In [2]:
# General imports
import os
import tensorflow as tf
from scipy import stats
from xgboost import XGBClassifier

# Utility imports
from utils.losses import *
from utils.plotting import *
from utils.training import *

# Seeds NumPy's global RNG only. No TensorFlow seed is set in this cell, and the
# XGBClassifier instances created further down are built without a random_state.
# NOTE(review): `np` is not imported explicitly in this notebook; presumably it is
# re-exported by one of the `from utils.* import *` lines above -- confirm.
np.random.seed(666) # Need to do more to ensure data is the same across runs.

In [3]:
# Restrict this process to one GPU. This is set before the first device query
# below so that TensorFlow only enumerates the selected card.
os.environ["CUDA_VISIBLE_DEVICES"] = "3" # pick a number < 4 on ML4HEP; < 3 on Voltan 
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    # Indexing physical_devices[0] on an empty list would raise a bare IndexError;
    # report the situation explicitly and carry on with whatever device is left.
    print('WARNING: no GPU is visible to TensorFlow; continuing without one.')
# Enable memory growth on every visible GPU (only one is expected, given the
# CUDA_VISIBLE_DEVICES setting above) so memory is not reserved all at once.
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

# $d = 1$

In [27]:
# Experiment parameters
num = 0                     # selects the 'set_<num>' output directory
reps = 100                  # number of BCE models trained per sample size
d = 1                       # forwarded to train() as 'd' and used in the data/model paths
Ns = 10**np.arange(2, 8)    # sample sizes 10^2, 10^3, ..., 10^7

# Model parameters
# NOTE(review): `bce` is presumably the loss exported by utils.losses -- confirm.
bce_params = {'loss':bce, 'd': d}

# Output path templates: bce_filestr is filled with (N, rep index),
# bdt_filestr with N, by the training cell.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

# os.makedirs also creates missing parents (os.mkdir fails with
# FileNotFoundError when 'models/trees/<d>/' does not exist yet), and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(filestr + 'bce/', exist_ok = True)
os.makedirs(filestr + 'bdt/', exist_ok = True)

# Data parameters
# X is reshaped into a single-column 2-D array; y is cast to float32.
X = np.load('data/trees/{}/X_trn.npy'.format(d)).reshape(-1, 1)
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [28]:
# NOTE(review): this overrides the Ns set with the experiment parameters, so only
# N = 10^5, 10^6, 10^7 are trained here -- presumably to resume a partial run; confirm.
Ns = 10**np.arange(5, 8)
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    # NOTE(review): m and s are not used in this cell; presumably normalisation
    # statistics returned by split_data -- confirm.
    data, m, s = split_data(X[:N], y[:N])
    X_trn, X_vld, y_trn, y_vld = data

    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    # Last recorded logloss on (X_vld, y_vld) and the number of recorded rounds.
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Keras keeps global state for every model that gets built. Reset it
        # before each repetition so memory does not grow across the
        # reps * len(Ns) models created by this cell. Doing it before train()
        # leaves the last bce_model untouched once the loop has finished.
        tf.keras.backend.clear_session()
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

100000
0.6890942058837414 	 20
0:	0.6882143616676331 	 16	
1:	0.6881936192512512 	 15	
2:	0.688223659992218 	 19	
3:	0.6882207989692688 	 22	
4:	0.6881642937660217 	 15	
5:	0.6881690621376038 	 18	
6:	0.6882678270339966 	 12	
7:	0.6882228851318359 	 18	
8:	0.6881581544876099 	 16	
9:	0.6882997155189514 	 14	
10:	0.6882254481315613 	 15	
11:	0.6882297396659851 	 17	
12:	0.688232958316803 	 12	
13:	0.6881539821624756 	 18	
14:	0.6882113814353943 	 15	
15:	0.6881851553916931 	 12	
16:	0.6882152557373047 	 12	
17:	0.6882097721099854 	 15	
18:	0.6882142424583435 	 13	
19:	0.6882191300392151 	 16	
20:	0.6882660388946533 	 19	
21:	0.6882752180099487 	 14	
22:	0.6881738901138306 	 15	
23:	0.6882686614990234 	 19	
24:	0.6881874799728394 	 15	
25:	0.6881445050239563 	 20	
26:	0.6882646083831787 	 14	
27:	0.6881853342056274 	 20	
28:	0.6881673336029053 	 15	
29:	0.688194215297699 	 15	
30:	0.6882137656211853 	 14	
31:	0.6883093118667603 	 14	
32:	0.688245952129364 	 16	
33:	0.6882208585739136 	 1

# $d = 2$

In [16]:
# Experiment parameters
num = 0                     # selects the 'set_<num>' output directory
reps = 100                  # number of BCE models trained per sample size
d = 2                       # forwarded to train() as 'd' and used in the data/model paths
Ns = 10**np.arange(2, 8)    # sample sizes 10^2, 10^3, ..., 10^7

# Model parameters
# NOTE(review): `bce` is presumably the loss exported by utils.losses -- confirm.
bce_params = {'loss':bce, 'd': d}

# Output path templates: bce_filestr is filled with (N, rep index),
# bdt_filestr with N, by the training cell.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

# os.makedirs also creates missing parents (os.mkdir fails with
# FileNotFoundError when 'models/trees/<d>/' does not exist yet), and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(filestr + 'bce/', exist_ok = True)
os.makedirs(filestr + 'bdt/', exist_ok = True)

# Data parameters
# X is used as stored on disk; y is cast to float32.
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [18]:
# NOTE(review): this overrides the Ns set with the experiment parameters, so only
# N = 10^5, 10^6, 10^7 are trained here -- presumably to resume a partial run; confirm.
Ns = 10**np.arange(5, 8)
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    # NOTE(review): m and s are not used in this cell; presumably normalisation
    # statistics returned by split_data -- confirm.
    data, m, s = split_data(X[:N], y[:N])
    X_trn, X_vld, y_trn, y_vld = data

    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    # Last recorded logloss on (X_vld, y_vld) and the number of recorded rounds.
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Keras keeps global state for every model that gets built. Reset it
        # before each repetition so memory does not grow across the
        # reps * len(Ns) models created by this cell. Doing it before train()
        # leaves the last bce_model untouched once the loop has finished.
        tf.keras.backend.clear_session()
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

100000
0.6897383813726902 	 16
0:	0.6885854601860046 	 13	
1:	0.6886233687400818 	 29	
2:	0.688554584980011 	 15	
3:	0.688561201095581 	 14	
4:	0.6885870099067688 	 14	
5:	0.6886135935783386 	 16	
6:	0.688571572303772 	 12	
7:	0.688606321811676 	 22	
8:	0.6885364055633545 	 14	
9:	0.6886024475097656 	 14	
10:	0.688636064529419 	 15	
11:	0.6885892748832703 	 14	
12:	0.6885815858840942 	 14	
13:	0.688644528388977 	 21	
14:	0.6885932087898254 	 15	
15:	0.6885793805122375 	 18	
16:	0.6886373162269592 	 18	
17:	0.6886154413223267 	 19	
18:	0.6886027455329895 	 16	
19:	0.6886194348335266 	 21	
20:	0.6885715126991272 	 15	
21:	0.6885250210762024 	 14	
22:	0.6886364817619324 	 17	
23:	0.6885867714881897 	 15	
24:	0.6885355710983276 	 14	
25:	0.6886297464370728 	 21	
26:	0.6885899901390076 	 14	
27:	0.6885555386543274 	 14	
28:	0.6885570883750916 	 24	
29:	0.6886312365531921 	 14	
30:	0.6886309385299683 	 18	
31:	0.6886470913887024 	 17	
32:	0.688592791557312 	 22	
33:	0.6886105537414551 	 18	


# $d=4$

In [21]:
# Experiment parameters
num = 0                     # selects the 'set_<num>' output directory
reps = 100                  # number of BCE models trained per sample size
d = 4                       # forwarded to train() as 'd' and used in the data/model paths
Ns = 10**np.arange(2, 8)    # sample sizes 10^2, 10^3, ..., 10^7

# Model parameters
# NOTE(review): `bce` is presumably the loss exported by utils.losses -- confirm.
bce_params = {'loss':bce, 'd': d}

# Output path templates: bce_filestr is filled with (N, rep index),
# bdt_filestr with N, by the training cell.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

# os.makedirs also creates missing parents (os.mkdir fails with
# FileNotFoundError when 'models/trees/<d>/' does not exist yet), and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(filestr + 'bce/', exist_ok = True)
os.makedirs(filestr + 'bdt/', exist_ok = True)

# Data parameters
# X is used as stored on disk; y is cast to float32.
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [22]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    # NOTE(review): m and s are not used in this cell; presumably normalisation
    # statistics returned by split_data -- confirm.
    data, m, s = split_data(X[:N], y[:N])
    X_trn, X_vld, y_trn, y_vld = data

    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    # Last recorded logloss on (X_vld, y_vld) and the number of recorded rounds.
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Keras keeps global state for every model that gets built. Reset it
        # before each repetition so memory does not grow across the
        # reps * len(Ns) models created by this cell. Doing it before train()
        # leaves the last bce_model untouched once the loop has finished.
        tf.keras.backend.clear_session()
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

100
1.2132698711752892 	 11
0:	1.1392271518707275 	 11	
1:	1.0972708463668823 	 11	
2:	1.0643147230148315 	 11	
3:	1.1282507181167603 	 11	
4:	1.0355725288391113 	 11	
5:	1.1457513570785522 	 12	
6:	1.0880184173583984 	 11	
7:	1.0901670455932617 	 11	
8:	0.9783453345298767 	 11	
9:	1.112248182296753 	 11	
10:	1.1720819473266602 	 11	
11:	1.0681439638137817 	 11	
12:	1.0900442600250244 	 11	
13:	1.0243535041809082 	 11	
14:	1.065598487854004 	 11	
15:	1.0911563634872437 	 11	
16:	1.0760564804077148 	 11	
17:	1.114027976989746 	 11	
18:	1.0206708908081055 	 11	
19:	1.0512977838516235 	 11	
20:	1.0820294618606567 	 11	
21:	1.0495824813842773 	 11	
22:	1.069369912147522 	 11	
23:	1.1231400966644287 	 11	
24:	0.9952241778373718 	 11	
25:	1.1331915855407715 	 11	
26:	1.0562050342559814 	 11	
27:	1.0912673473358154 	 11	
28:	1.0572243928909302 	 11	
29:	1.0950396060943604 	 11	
30:	1.0500222444534302 	 11	
31:	1.037409782409668 	 11	
32:	1.0728603601455688 	 11	
33:	1.0947318077087402 	 11	
3

# $d=8$

In [23]:
# Experiment parameters
num = 0                     # selects the 'set_<num>' output directory
reps = 100                  # number of BCE models trained per sample size
d = 8                       # forwarded to train() as 'd' and used in the data/model paths
Ns = 10**np.arange(2, 8)    # sample sizes 10^2, 10^3, ..., 10^7

# Model parameters
# NOTE(review): `bce` is presumably the loss exported by utils.losses -- confirm.
bce_params = {'loss':bce, 'd': d}

# Output path templates: bce_filestr is filled with (N, rep index),
# bdt_filestr with N, by the training cell.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

# os.makedirs also creates missing parents (os.mkdir fails with
# FileNotFoundError when 'models/trees/<d>/' does not exist yet), and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(filestr + 'bce/', exist_ok = True)
os.makedirs(filestr + 'bdt/', exist_ok = True)

# Data parameters
# X is used as stored on disk; y is cast to float32.
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [24]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    # NOTE(review): m and s are not used in this cell; presumably normalisation
    # statistics returned by split_data -- confirm.
    data, m, s = split_data(X[:N], y[:N])
    X_trn, X_vld, y_trn, y_vld = data

    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    # Last recorded logloss on (X_vld, y_vld) and the number of recorded rounds.
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Keras keeps global state for every model that gets built. Reset it
        # before each repetition so memory does not grow across the
        # reps * len(Ns) models created by this cell. Doing it before train()
        # leaves the last bce_model untouched once the loop has finished.
        tf.keras.backend.clear_session()
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

100
0.8161247763037681 	 11
0:	0.845988392829895 	 11	
1:	0.7803048491477966 	 11	
2:	0.8140516877174377 	 11	
3:	0.9187474846839905 	 11	
4:	0.9603713750839233 	 13	
5:	0.806976318359375 	 11	
6:	0.8726725578308105 	 11	
7:	0.8822808265686035 	 11	
8:	0.82355135679245 	 11	
9:	0.8435800075531006 	 11	
10:	0.8601838946342468 	 11	
11:	0.8769786357879639 	 11	
12:	0.7496845722198486 	 11	
13:	0.921619176864624 	 11	
14:	0.8317793011665344 	 13	
15:	0.8853189945220947 	 11	
16:	0.835055410861969 	 12	
17:	0.7931764125823975 	 11	
18:	0.8814626932144165 	 11	
19:	0.8971964120864868 	 13	
20:	0.8035995960235596 	 11	
21:	0.8035678267478943 	 11	
22:	0.8431211113929749 	 11	
23:	0.8974059820175171 	 12	
24:	0.9004664421081543 	 11	
25:	0.8459731936454773 	 11	
26:	0.8231882452964783 	 11	
27:	0.8270463347434998 	 12	
28:	0.8848233819007874 	 11	
29:	0.7932478189468384 	 11	
30:	0.8189102411270142 	 11	
31:	0.8290271162986755 	 11	
32:	0.8902076482772827 	 11	
33:	0.8329524397850037 	 11	
34

# $d=16$

In [25]:
# Experiment parameters
num = 0                     # selects the 'set_<num>' output directory
reps = 100                  # number of BCE models trained per sample size
d = 16                      # forwarded to train() as 'd' and used in the data/model paths
Ns = 10**np.arange(2, 8)    # sample sizes 10^2, 10^3, ..., 10^7

# Model parameters
# NOTE(review): `bce` is presumably the loss exported by utils.losses -- confirm.
bce_params = {'loss':bce, 'd': d}

# Output path templates: bce_filestr is filled with (N, rep index),
# bdt_filestr with N, by the training cell.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

# os.makedirs also creates missing parents (os.mkdir fails with
# FileNotFoundError when 'models/trees/<d>/' does not exist yet), and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(filestr + 'bce/', exist_ok = True)
os.makedirs(filestr + 'bdt/', exist_ok = True)

# Data parameters
# X is used as stored on disk; y is cast to float32.
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [26]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    # NOTE(review): m and s are not used in this cell; presumably normalisation
    # statistics returned by split_data -- confirm.
    data, m, s = split_data(X[:N], y[:N])
    X_trn, X_vld, y_trn, y_vld = data

    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    # Last recorded logloss on (X_vld, y_vld) and the number of recorded rounds.
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Keras keeps global state for every model that gets built. Reset it
        # before each repetition so memory does not grow across the
        # reps * len(Ns) models created by this cell. Doing it before train()
        # leaves the last bce_model untouched once the loop has finished.
        tf.keras.backend.clear_session()
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

100
0.8878225636482239 	 11
0:	0.8421497344970703 	 14	
1:	0.7959799766540527 	 11	
2:	0.840049147605896 	 12	
3:	0.8344448208808899 	 13	
4:	0.7635347843170166 	 14	
5:	0.794751763343811 	 11	
6:	0.9265921115875244 	 15	
7:	0.8118041753768921 	 11	
8:	0.8609725832939148 	 13	
9:	0.9237632155418396 	 11	
10:	0.9997531771659851 	 13	
11:	0.930898129940033 	 14	
12:	0.8204989433288574 	 12	
13:	0.8422346711158752 	 12	
14:	0.8205422759056091 	 11	
15:	0.9314346313476562 	 11	
16:	0.7772156596183777 	 13	
17:	0.8900584578514099 	 14	
18:	0.7886533141136169 	 15	
19:	0.853091835975647 	 13	
20:	0.8329296112060547 	 11	
21:	0.897925853729248 	 12	
22:	0.8238064050674438 	 15	
23:	0.7742559313774109 	 11	
24:	0.8114219903945923 	 11	
25:	0.8524292707443237 	 13	
26:	0.9202665686607361 	 11	
27:	0.8460264801979065 	 11	
28:	0.8669698238372803 	 13	
29:	0.908454954624176 	 12	
30:	0.8716256022453308 	 11	
31:	0.9306991696357727 	 15	
32:	0.7616233825683594 	 11	
33:	0.9581162333488464 	 13	
34

# $d = 32$

In [30]:
# Experiment parameters
num = 0                     # selects the 'set_<num>' output directory
reps = 100                  # number of BCE models trained per sample size
d = 32                      # forwarded to train() as 'd' and used in the data/model paths
Ns = 10**np.arange(2, 8)    # sample sizes 10^2, 10^3, ..., 10^7

# Model parameters
# NOTE(review): `bce` is presumably the loss exported by utils.losses -- confirm.
bce_params = {'loss':bce, 'd': d}

# Output path templates: bce_filestr is filled with (N, rep index),
# bdt_filestr with N, by the training cell.
filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

# os.makedirs also creates missing parents (os.mkdir fails with
# FileNotFoundError when 'models/trees/<d>/' does not exist yet), and
# exist_ok=True makes re-running this cell harmless.
os.makedirs(filestr + 'bce/', exist_ok = True)
os.makedirs(filestr + 'bdt/', exist_ok = True)

# Data parameters
# X is used as stored on disk; y is cast to float32.
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    # NOTE(review): m and s are not used in this cell; presumably normalisation
    # statistics returned by split_data -- confirm.
    data, m, s = split_data(X[:N], y[:N])
    X_trn, X_vld, y_trn, y_vld = data

    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    # Last recorded logloss on (X_vld, y_vld) and the number of recorded rounds.
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Keras keeps global state for every model that gets built. Reset it
        # before each repetition so memory does not grow across the
        # reps * len(Ns) models created by this cell. Doing it before train()
        # leaves the last bce_model untouched once the loop has finished.
        tf.keras.backend.clear_session()
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

100
1.1042766082286835 	 11
0:	1.175673246383667 	 11	
1:	1.188735008239746 	 11	
2:	1.1000075340270996 	 11	
3:	1.2252099514007568 	 11	
4:	1.393407940864563 	 11	
5:	1.0724115371704102 	 14	
6:	0.8996843099594116 	 11	
7:	1.5201584100723267 	 11	
8:	1.1904983520507812 	 11	
9:	1.1521830558776855 	 11	
10:	1.2806000709533691 	 11	
11:	1.0063570737838745 	 12	
12:	1.3078508377075195 	 13	
13:	1.6024388074874878 	 12	
14:	1.3089947700500488 	 11	
15:	1.3294837474822998 	 13	
16:	1.2809847593307495 	 11	
17:	1.214433193206787 	 11	
18:	1.4200842380523682 	 15	
19:	1.2920008897781372 	 11	
20:	1.215727686882019 	 11	
21:	1.5087207555770874 	 12	
22:	1.3249294757843018 	 11	
23:	1.1299859285354614 	 13	
24:	1.2084264755249023 	 11	
25:	1.1566591262817383 	 12	
26:	1.039830207824707 	 11	
27:	1.3354700803756714 	 12	
28:	1.2894092798233032 	 13	
29:	1.4113234281539917 	 13	
30:	1.159742832183838 	 11	
31:	1.0342679023742676 	 11	
32:	1.449352741241455 	 12	
33:	1.2929284572601318 	 11	
34:	