In [1]:
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt

plt.style.use("science.mplstyle")
matplotlib.rcParams['text.latex.preamble'] = r'\usepackage{amsmath} \usepackage{amssymb}'

colors_dict = {"truth":"grey",
               "salad": "#009E73",
               "feta": "#CC79A7",
               "cathode":"#D55E00",
               "curtains":"#E69F00"}

In [None]:
# loading and understanding the data
data_path = '/global/home/users/rrmastandrea/scaled_data_wide/'

cathode_data = np.load(f"{data_path}nsig_injected_0/cathode.npy")
curtains_data = np.load(f"{data_path}nsig_injected_0/curtains.npy")
feta_data = np.load(f"{data_path}nsig_injected_0/feta_o6.npy")
salad_data = np.load(f"{data_path}nsig_injected_0/salad.npy")
salad_weights = np.load(f"{data_path}nsig_injected_0/salad_weights.npy")

truth_data = np.load(f"{data_path}nsig_injected_0/data.npy")


print("data contains: mj1, delta mj, tau21_j1, tau21_d2, deltaR, m_jj")
for data in [cathode_data, curtains_data, feta_data, salad_data, truth_data]:
    print("shape: ", data.shape)
    print("min: ", data.min(axis=0))
    print("mean: ", data.mean(axis=0))
    print("max: ", data.max(axis=0))

Looking at these numbers, we see that we can train a **balanced** multiclass classifier of CATHODE vs CURTAINS vs FETA vs SALAD  using 400k events of each method. We evaluate on the 120k truth events in the end. 

For training, a train / test / val split of 60/20/20 would mean 240k train, 80k test, and 80k val events. 

Before training the classifier, let's have a look at a few histograms in physical space.

In [None]:
# from https://github.com/rmastand/FETA/blob/ee4942e668b94df7b504b1503b027bdc28827eb1/helpers/datasets.py#L227

# standardization transformation and its inverse:
def minmaxscale(data, col_minmax, lower = -3.0, upper = 3.0, forward = True):
    if forward:    
        minmaxscaled_data = np.zeros(data.shape)
        for col in range(data.shape[1]):
            X_std = (data[:, col] - col_minmax[col][0]) / (col_minmax[col][1] - col_minmax[col][0])
            minmaxscaled_data[:, col] = X_std * (upper - lower) + lower      
        return minmaxscaled_data

    else:  
        reversescaled_data = np.zeros(data.shape)
        for col in range(data.shape[1]):
            X_std = (data[:, col] - lower) / (upper - lower)
            reversescaled_data[:, col] = X_std * (col_minmax[col][1] - col_minmax[col][0]) + col_minmax[col][0]
        return reversescaled_data
    


In [None]:
# load minmax and transform data to physical space

col_minmax = np.load(f"{data_path}col_minmax.npy")
cathode_physical = minmaxscale(cathode_data, col_minmax, lower=0, upper=1, forward=False)
curtains_physical = minmaxscale(curtains_data, col_minmax, lower=0, upper=1, forward=False)
feta_physical = minmaxscale(feta_data, col_minmax, lower=0, upper=1, forward=False)
salad_physical = minmaxscale(salad_data, col_minmax, lower=0, upper=1, forward=False)
truth_physical = minmaxscale(truth_data, col_minmax, lower=0, upper=1, forward=False)

In [None]:
# plot them in comparison to see if transformation etc. worked correctly


bins = 40
sim_color = "#990000"
dat_color = "#1c4587"
linewidth = 2

fig, ax = plt.subplots(2, 3, figsize = (12, 8))
fig.subplots_adjust(right=0.8, left = 0.2)

title_dict = {0: f"$m_{{J1}}$ [GeV]", 1: f"$\Delta m_{{JJ}}$ [GeV]", 2: f"$\\tau_{{J1}}^{{21}}$", 
              3: f"$\\tau_{{J2}}^{{21}}$", 4: f"$\Delta R_{{JJ}}$", 5: f"$m_{{JJ}}$ [GeV]"}
kwargs = {'density': True, 'histtype': 'step', 'lw': 2., 'alpha': 0.6}
legend_loc = ['upper right', 'upper right', 'upper left', 'upper left', 'upper left', 'lower left']


for ax_x in range(2):
    for ax_y in range(3):
        i = ax_x*3 + ax_y
        ax[ax_x, ax_y].hist(salad_physical[:,i], bins = np.linspace(min(truth_physical[:,i]), max(truth_physical[:,i]), bins), density = True, label = "SALAD", 
                            histtype = "step", linewidth = linewidth, weights = salad_weights,
                           color=colors_dict['salad'])
        ax[ax_x, ax_y].hist(cathode_physical[:,i], bins =  np.linspace(min(truth_physical[:,i]), max(truth_physical[:,i]), bins), density = True, label = "CATHODE", 
                            histtype = "step",linewidth = linewidth,
                           color=colors_dict['cathode'])
        ax[ax_x, ax_y].hist(curtains_physical[:,i], bins =  np.linspace(min(truth_physical[:,i]), max(truth_physical[:,i]), bins), density = True, label = "CURTAINS", 
                            histtype = "step",  linewidth = linewidth,
                           color=colors_dict['curtains'])
        ax[ax_x, ax_y].hist(feta_physical[:,i], bins =  np.linspace(min(truth_physical[:,i]), max(truth_physical[:,i]), bins), density = True, label = "FETA", 
                            histtype = "step", linewidth = linewidth,
                           color=colors_dict['feta'])
        ax[ax_x, ax_y].hist(truth_physical[:,i], bins =  np.linspace(min(truth_physical[:,i]), max(truth_physical[:,i]), bins), density = True, label = "Truth", 
                            histtype = "stepfilled", linewidth = linewidth,
                           color=colors_dict['truth'], alpha = 0.6)
        
        
        
        ax[ax_x, ax_y].set_xlabel(title_dict[i], fontsize = 30)
        ax[ax_x, ax_y].tick_params(axis='both', which='major', labelsize=20)
        ax[ax_x, ax_y].set_yticks([])

ax[0,0].set_ylabel("Density", fontsize = 30)  
ax[1,0].set_ylabel("Density", fontsize = 30) 
ax[0,0].legend(fontsize = 19, loc = "upper right")  
#plt.legend(loc='lower center', ncol=5, bbox_to_anchor=(0.0, -0.45), fontsize = 15)

plt.tight_layout()


dpi = 800 
#plt.subplot(2,3,5)
plt.savefig(f'plots/background_validation.pdf', facecolor='white', dpi = dpi)
plt.show()

The distributions seem to agree with fig. 4 of FETA (2212.11285).

In [11]:
# plot average of the log posterior of the runs above, new layout by Radha

l = 3
f_1 = 20
f_2 = 30
alpha = 0.4
dpi = 800
ms = 12

with open(f'log_posterior_mod.npy', 'rb') as f:
    to_plot_central = np.load(f)
    to_plot_err = np.load(f)

ymin, ymax = np.min(to_plot_central-1.05*to_plot_err), np.max(to_plot_central+1.05*to_plot_err)

order = ["cathode", "curtains", "feta", "salad"]


#order = ["salad", "cathode", "curtains", "feta"]


fig = plt.figure(figsize = (16, 5)) 

to_bold = [0, 5, 10, 15]
legends = [ '$p_{\mathrm{CATHODE}}(x)$', '$p_{\mathrm{CURTAINs}}(x)$','$p_{\mathrm{FETA}}(x)$','$p_{\mathrm{SALAD}}(x)$', ]
for i in range(20):
    
    if i in to_bold:
        plt.errorbar(i, to_plot_central[i], to_plot_err[i], fmt='o', color = colors_dict[order[i%4]],
                     elinewidth=5, markersize = 15 )
    elif i >= 16:
        plt.errorbar(i, to_plot_central[i], to_plot_err[i], fmt='x', color = colors_dict[order[i%4]], 
                     elinewidth=3, markersize = 15, label = legends[i-16] )

    else:
        plt.errorbar(i, to_plot_central[i], to_plot_err[i], fmt='x', color = colors_dict[order[i%4]], 
                     elinewidth=3, markersize = 15)

    
plt.vlines([3.5, 7.5, 11.5, 15.5], ymin, ymax, ls='dashed', color='k')
plt.ylim((ymin, ymax))
plt.xlim((-0.5, 19.5))
plt.xticks([])
plt.yticks(fontsize = f_1)

plt.ylabel('$\\langle LP(\\text{model } i|\\text{samples } j) \\rangle$', fontsize = f_2)


plt.text(1.5, to_plot_central+1.05*to_plot_err, '$x \in$ CATHODE', ha='center', fontsize = f_1)
plt.text(5.5, to_plot_central+1.05*to_plot_err, '$x \in$ CURTAINs', ha='center', fontsize = f_1)
plt.text(9.5, to_plot_central+1.05*to_plot_err, '$x \in$ FETA', ha='center', fontsize = f_1)
plt.text(13.5, to_plot_central+1.05*to_plot_err, '$x \in$ SALAD', ha='center', fontsize = f_1)
plt.text(17.5, -1.345, '$x \in$ Truth', ha='center', fontsize = f_1)

plt.legend(fontsize = f_1, loc = "lower right")


fig.savefig(f"plots/discrim_methods.pdf", dpi = dpi)

plt.show()




TypeError: only size-1 arrays can be converted to Python scalars

TypeError: only size-1 arrays can be converted to Python scalars

<Figure size 1152x360 with 1 Axes>