In [1]:
# Automatically reload modules after executing each cell.
%load_ext autoreload
%autoreload 2

In [2]:
# General imports
import os
import tensorflow as tf
from scipy import stats
from xgboost import XGBClassifier

# Utility imports
from utils.losses import *
from utils.plotting import *
from utils.training import *

np.random.seed(666) # Need to do more to ensure data is the same across runs.

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "1" # pick a number < 4 on ML4HEP; < 3 on Voltan 
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# $d = 1$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 1
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d)).reshape(-1, 1)
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))

    for i in range(reps):
        print(i, end = ':\t')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

# $d = 2$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 2
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
Ns = 10**np.arange(5, 8)
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))
    
    for i in range(reps):
        print(i, end = ':\t')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

# $d=4$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 4
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))
    
    for i in range(reps):
        print(i, end = ':\t')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

# $d=8$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 8
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))
    
    for i in range(reps):
        print(i, end = ':\t')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

# $d=16$

In [None]:
# Experiment parameters
num = 0
reps = 100
d = 16
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [None]:
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))
    
    for i in range(reps):
        print(i, end = ':\t')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

# $d = 32$

In [4]:
# Experiment parameters
num = 0
reps = 100
d = 32
Ns = 10**np.arange(2, 8)

# Model parameters
bce_params = {'loss':bce, 'd': d}

filestr = 'models/trees/{}/set_{}/'.format(d, num)
bce_filestr = filestr + 'bce/model_{}_{}.h5'
bdt_filestr = filestr + 'bdt/model_{}.h5'

if not os.path.isdir(filestr):
    os.mkdir(filestr)

if not os.path.isdir(filestr + 'bce/'):
    os.mkdir(filestr + 'bce/')
    
if not os.path.isdir(filestr + 'bdt/'):
    os.mkdir(filestr + 'bdt/')

# Data parameters
X = np.load('data/trees/{}/X_trn.npy'.format(d))
y = np.load('data/trees/{}/y_trn.npy'.format(d)).astype('float32')

In [5]:
Ns = [10**7]
for N in Ns:
    print('===================================================\n{}'.format(N))
    # Take the first N samples.
    data, m, s = split_data(X[:N], y[:N])
    
    # Train BDT model (only need to train 1)
    bdt_model = XGBClassifier(early_stopping_rounds = 10)
    X_trn, X_vld, y_trn, y_vld = data
    bdt_model.fit(X_trn, y_trn, eval_set = [(X_vld, y_vld)], verbose = 0)
    trace = bdt_model.evals_result()['validation_0']
    print(trace['logloss'][-1], '\t', len(trace['logloss']), end = '\n')
    bdt_model.save_model(bdt_filestr.format(N))
    
    for i in range(31, 100):
        print(i, end = ':\t')
        # Train BCE model
        bce_model, trace = train(data, **bce_params)
        bce_model.save_weights(bce_filestr.format(N, i))
        print()

10000000
0.6882202447947383 	 21
31:	

2023-02-05 13:59:40.661488: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-05 13:59:41.261656: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21271 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:41:00.0, compute capability: 7.5


0.6882731914520264 	 100	
32:	0.688291609287262 	 100	
33:	0.6882471442222595 	 94	
34:	0.6882668733596802 	 55	
35:	0.6882753968238831 	 76	
36:	0.6882576942443848 	 100	
37:	0.688265323638916 	 69	
38:	0.6882765293121338 	 100	
39:	0.6882578730583191 	 89	
40:	0.6882681846618652 	 76	
41:	0.6882495880126953 	 62	
42:	0.6882864832878113 	 100	
43:	0.688258171081543 	 87	
44:	0.6882637143135071 	 81	
45:	0.688267707824707 	 100	
46:	0.6882681250572205 	 74	
47:	0.6882712244987488 	 100	
48:	0.6882588863372803 	 63	
49:	0.6882491707801819 	 75	
50:	0.6882544755935669 	 95	
51:	0.688260018825531 	 57	
52:	0.6882814764976501 	 100	
53:	0.6882668137550354 	 91	
54:	0.6882739067077637 	 91	
55:	0.6882550120353699 	 86	
56:	0.6882740259170532 	 100	
57:	0.6882724761962891 	 87	
58:	0.6882892847061157 	 82	
59:	0.6882728934288025 	 88	
60:	0.6882638335227966 	 81	
61:	0.688271701335907 	 70	
62:	0.6882848739624023 	 76	
63:	0.6882895827293396 	 100	
64:	0.6882631778717041 	 86	
65:	0.68826007