In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import uniform

import weakref 

from bayes_opt import BayesianOptimization


from data_cleaning import clean_df
from KFPF_lambda_cuts import KFPF_lambda_cuts
from plot_tools import AMS, preds_prob, plot_confusion_matrix, plt_sig_back
import tree_importer 
import uproot


# Lifetime probe used while deleting unused variables to save memory:
# it reports on stdout when an instance exists and when it is finalized.
class TestClass:
    """Tiny helper whose methods print the object's lifetime state."""

    def check(self):
        """Print a message confirming that the instance still exists."""
        print("object is alive!")

    def __del__(self):
        """Print a message at the moment the instance is finalized."""
        print("object deleted")
        
from concurrent.futures import ThreadPoolExecutor

# Shared pool of 7 worker threads; it is passed to the uproot calls in this
# notebook as both the decompression and the interpretation executor.
executor = ThreadPoolExecutor(max_workers=7)

import gc
import ROOT

In [None]:
# Mapping applied to the 'issignal' column of every loaded frame:
# the values 3, 4 and 5 are all folded into 2.
ISSIGNAL_MERGE = {3: 2, 4: 2, 5: 2}


def _load_plain_tree(path, columns):
    """Read the ROOT object at ``path`` into a pandas DataFrame and relabel it.

    Parameters
    ----------
    path : str
        uproot location string of the form ``"<file>.root:<object>"``.
    columns : list of str
        New column labels. They are applied positionally, so their order must
        match the order of the columns uproot returns; pandas raises
        ``ValueError`` if the number of labels differs.

    Returns
    -------
    pandas.DataFrame
        The tree content with ``columns`` as column labels.
    """
    frame = uproot.open(
        path,
        decompression_executor=executor,
        interpretation_executor=executor,
    ).arrays(
        library='pd',
        decompression_executor=executor,
        interpretation_executor=executor,
    )
    frame.columns = columns
    return frame


def _merge_issignal_labels(frame):
    """Apply ``ISSIGNAL_MERGE`` to ``frame['issignal']`` and return ``frame``.

    The column is re-assigned explicitly. The previous form,
    ``frame["issignal"].replace(..., inplace=True)``, edits a temporary Series
    and is not guaranteed to update the frame (it is a no-op under pandas
    copy-on-write).
    """
    frame['issignal'] = frame['issignal'].replace(ISSIGNAL_MERGE)
    return frame


def _select_signal_window(frame):
    """Return the rows of ``frame`` inside the signal selection window.

    A row is kept when its 'mass' lies strictly within 1.5 standard deviations
    of the mean 'mass' of ``frame`` and its 'M' lies strictly between 200 and 250.
    """
    mass = frame['mass']
    # Compute the statistics once instead of once per comparison.
    mass_mean, mass_std = mass.mean(), mass.std()
    in_mass_window = (mass > mass_mean - 1.5 * mass_std) & (mass < mass_mean + 1.5 * mass_std)
    in_m_window = (frame['M'] > 200) & (frame['M'] < 250)
    return frame[in_mass_window & in_m_window]


# Column labels assigned (positionally) to the signal tree.
aa = ['b', 'chi2geo','chi2primneg', 'chi2primpos', 'chi2_topo', 'cosine_first', 'cosine_second', 'cosine_topo', 'distance', 'eta', 'l', 'ldl', 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', 'vtx_chi2_first', 'vtx_chi2_second',
 'z_first', 'z_second', 'z_smaller', 'M', 'issignal', 'id', 'nhits_mvd_first', 'nhits_mvd_second', 'nhits_mvd_sum', 'nhits_tot_first', 'nhits_tot_second', 'nhits_tot_sum', 'pid']

df_clean_signal = _load_plain_tree(
    '/home/shahid/cbmsoft/Cut_optimization/uncut_data/Project/dcm/dcm_signal.root:plain_tree', aa)
signal = _select_signal_window(df_clean_signal)
# Drop the full frame before touching `signal` so only the selection stays in memory.
del df_clean_signal
signal = _merge_issignal_labels(signal)


# Column labels assigned (positionally) to the two trees loaded below.
a = ['index', 'chi2geo','chi2primneg', 'chi2primpos','distance', 'ldl','l','cosine_first', 'cosine_second', 'chi2_topo', 'cosine_topo',
 'mass', 'pT', 'b', 'eta', 'p', 'phi', 'rapidity', 'vtx_chi2_first', 'vtx_chi2_second', 'z_first', 'z_second', 'z_smaller', 'M', 'issignal', 'nhits_mvd_first',
 'nhits_mvd_second', 'nhits_mvd_sum', 'nhits_tot_first', 'nhits_tot_second', 'nhits_tot_sum']

df_clean_urqmd = _merge_issignal_labels(_load_plain_tree(
    '/home/shahid/Mount/gsi/u/Mount/lustre/khan/cbmsoft/at_tree_plainer/install/bin/urqmd/c_0_pt_0_9_y_0_9_M_200_250_urqmd.root:plain_tree', a))

df_clean = _merge_issignal_labels(_load_plain_tree(
    '/home/shahid/Mount/gsi/u/Mount/lustre/khan/cbmsoft/at_tree_plainer/install/bin/dcm/c_0_pt_0_9_y_0_9_M_200_250_dcm.root:plain_tree', a))

In [None]:
# Seed for every random draw in this cell, so the training sample is reproducible
# (the same value was already used for train_test_split).
SEED = 324

# Background: rows with issignal == 0 whose mass lies in one of the two sidebands
# (1.077, 1.1) or (1.135, 1.2). The grouping is written out explicitly.
urqmd_mass = df_clean_urqmd['mass']
in_sidebands = (((urqmd_mass > 1.077) & (urqmd_mass < 1.1))
                | ((urqmd_mass > 1.135) & (urqmd_mass < 1.2)))
back = df_clean_urqmd[(df_clean_urqmd['issignal'] == 0) & in_sidebands]
del urqmd_mass, in_sidebands

# Signal: rows of `signal` with mass strictly inside (1.1, 1.135).
signal_selected = signal[(signal['mass'] > 1.1) & (signal['mass'] < 1.135)]
# Draw three background rows per selected signal row. `sample` raises ValueError
# if `back` holds fewer rows than requested.
background_selected = back.sample(n=3 * signal_selected.shape[0], random_state=SEED)
del back
gc.collect()

# Stack both samples and shuffle the rows.
df_scaled = pd.concat([signal_selected, background_selected]).sample(frac=1, random_state=SEED)
del signal, signal_selected, background_selected

print(df_scaled.shape)
print(df_scaled[df_scaled['issignal']==1].shape)
print(df_scaled[df_scaled['issignal']==2].shape)

fig, axs = plt_sig_back(df_scaled)
fig.set_figheight(5)
fig.set_figwidth(8)
axs.text(1.13, 6000, r'DCM-QGSM-SMM', color = 'magenta',  fontsize=15)
axs.text(1.13, 4000, r'Au+Au @ 12 $A$GeV/$c$', color = 'magenta',  fontsize=15)
axs.text(1.13, 2000, r'URQMD, Au+Au @ 12 $A$GeV/$c$', fontsize=15)
fig.savefig("hists.pdf")

# Feature columns fed to the classifier.
cuts = [ 'chi2geo', 'chi2primneg', 'chi2primpos', 'chi2_topo', 'cosine_topo', 'distance', 'ldl', 'nhits_mvd_first','nhits_mvd_second','nhits_tot_first','nhits_tot_second']
x = df_scaled[cuts].copy()
y = pd.DataFrame(df_scaled['issignal'], dtype='int8')
# 50/50 split, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=SEED, stratify=y)
dtrain = xgb.DMatrix(x_train, label = y_train)
gc.collect()

In [None]:
import time
starttime = time.time()


# Bayesian-optimization objective for xgboost.
# The parameters to tune are the keyword arguments of the function.
def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate):
    """Score one hyper-parameter point with 5-fold cross-validation.

    Reads the module-level ``dtrain`` DMatrix.

    Parameters
    ----------
    max_depth, n_estimators : float
        Rounded to the nearest integer before use, the same rounding applied
        when the final model is built from the best point.
    gamma, alpha, learning_rate : float
        Passed to xgboost unchanged.

    Returns
    -------
    float
        The value of ``'test-auc-mean'`` after the last boosting round.
    """
    params = {'max_depth': int(round(max_depth)),
              'gamma': gamma,
              'alpha': alpha,
              'learning_rate': learning_rate,
              'subsample': 0.8,
              'num_class': np.unique(dtrain.get_label()).shape[0],
              'eval_metric': 'auc', 'tree_method': 'hist', 'nthread': 7}
    # NOTE(review): 'num_class' is set but no 'objective' is given here — confirm
    # the cross-validation objective matches the one used for the final model.

    # In the native API the number of trees is `num_boost_round`; an
    # 'n_estimators' entry in `params` has no effect. Previously it was fixed at
    # 10, so the tuned n_estimators never influenced the score.
    # This makes each evaluation cost up to `n_estimators` rounds per fold.
    cv_result = xgb.cv(params=params, dtrain=dtrain,
                       num_boost_round=int(round(n_estimators)), nfold=5)
    return cv_result['test-auc-mean'].iloc[-1]

# Invoke the Bayesian optimizer over the parameter ranges to tune.
# random_state fixes the random exploration so reruns propose the same points.
xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10),
                                             'gamma': (0, 1),
                                             'alpha': (2, 20),
                                             'learning_rate': (0.01, 1),
                                             'n_estimators': (100, 1000)
                                            },
                              random_state=324)
# Run 5 steps of random exploration followed by 5 optimization iterations,
# with expected improvement ('ei') as the acquisition function.
xgb_bo.maximize(n_iter=5, init_points=5, acq='ei')

# Report the elapsed time only after the optimization has actually run
# (it used to be printed before `maximize` was called).
cpproot_time = time.time() - starttime
print(f"total time: {cpproot_time} sec")

In [None]:
# Best point found by the optimizer.
max_param = xgb_bo.max['params']

# Continuous parameters are copied as they are ...
param = {key: max_param[key] for key in ('alpha', 'gamma', 'learning_rate')}
# ... the two integer-valued ones are rounded to the nearest integer ...
param['max_depth'] = int(round(max_param['max_depth'], 0))
param['n_estimators'] = int(round(max_param['n_estimators'], 0))
# ... and the fixed settings are appended last.
param.update(objective='binary:logistic', tree_method='hist', nthread=7)

# Train the classifier on the training split.
bst = xgb.XGBClassifier(**param).fit(x_train, y_train)

# The training split is not needed any more; release its memory.
del x_train, y_train
gc.collect()

In [None]:
def _add_class_probabilities(frame, model, features, prefix='xgb_preds'):
    """Add one probability column per class predicted by ``model`` to ``frame``.

    ``model.predict_proba`` is called on ``frame[features]``. Column ``k`` of the
    result is stored as ``f"{prefix}{k}"``. The values are assigned
    positionally (as arrays), so the result does not depend on the index labels
    of ``frame``. Wrapping them in a new DataFrame first would align on the
    index and yield NaN for any row whose label is not in 0..len-1.

    Parameters
    ----------
    frame : pandas.DataFrame
        Modified in place.
    model : object
        Anything exposing ``predict_proba`` that returns a 2-D array with one
        row per row of ``frame``.
    features : list of str
        Columns of ``frame`` passed to the model.
    prefix : str
        Prefix of the new column names.

    Returns
    -------
    pandas.DataFrame
        ``frame`` itself.
    """
    probabilities = model.predict_proba(frame[features])
    for class_idx in range(probabilities.shape[1]):
        frame[f'{prefix}{class_idx}'] = probabilities[:, class_idx]
    return frame


# Adds xgb_preds0, xgb_preds1, ... to both full samples.
_add_class_probabilities(df_clean, bst, cuts)
_add_class_probabilities(df_clean_urqmd, bst, cuts)

In [None]:
# The default figure size must be set before the figure is created; it was
# previously set afterwards and so never applied to the importance plot.
plt.rcParams['figure.figsize'] = [5, 3]
ax = xgb.plot_importance(bst)
# Finalize the layout and write the file before showing the figure.
ax.figure.tight_layout()
ax.figure.savefig("hits.png")
plt.show()

In [None]:
def preds_prob(df,preds,true,df1,preds1, true1):
    """Overlay per-class distributions of a prediction column for two samples.

    Defining this function overrides the ``preds_prob`` imported from
    ``plot_tools`` at the top of the notebook.

    The first sample (``df``) is drawn as filled histograms labelled
    "... in train"; the second (``df1``) as points with sqrt(N) error bars
    labelled "... in test". Rows with label 0 are drawn as background (blue),
    label 1 as signal (red) and, if ``df[true]`` holds more than two distinct
    values, labels > 1 as secondaries (green).

    Parameters
    ----------
    df, df1 : pandas.DataFrame
        First and second sample.
    preds, preds1 : str
        Column with the predicted value in ``df`` / ``df1``.
    true, true1 : str
        Column with the true label in ``df`` / ``df1``.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    n_bins = 100

    # First sample: background and signal, each binned over its own range.
    ax.hist(df.loc[df[true] == 0, preds], bins=n_bins, facecolor='blue', alpha=0.3,
            label='background in train')
    ax.hist(df.loc[df[true] == 1, preds], bins=n_bins, facecolor='red', alpha=0.3,
            label='signal in train')

    # Second sample: the background histogram defines the bin edges, which
    # every histogram below reuses.
    counts_bkg, edges = np.histogram(df1.loc[df1[true1] == 0, preds1], bins=n_bins)
    centers = (edges[:-1] + edges[1:]) / 2
    ax.errorbar(centers, counts_bkg, yerr=np.sqrt(counts_bkg), fmt='o',
                c='blue', label='background in test')

    counts_sig, _ = np.histogram(df1.loc[df1[true1] == 1, preds1], bins=edges)
    ax.errorbar(centers, counts_sig, yerr=np.sqrt(counts_sig), fmt='o',
                c='red', label='signal in test')

    # Third class, only when the first sample has more than two label values.
    # The former unconditional `del TP2` after this branch raised
    # UnboundLocalError whenever the branch was skipped.
    if df[true].unique().shape[0] > 2:
        ax.hist(df.loc[df[true] > 1, preds], bins=edges, facecolor='green', alpha=0.3,
                label='secondaries in train')
        counts_sec, _ = np.histogram(df1.loc[df1[true1] > 1, preds1], bins=edges)
        ax.errorbar(centers, counts_sec, yerr=np.sqrt(counts_sec), fmt='o',
                    c='green', label='secondaries in test')

    ax.set_yscale('log')
    ax.set_xlabel('Probability', fontsize=18)
    ax.set_ylabel('Counts', fontsize=18)
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.tick_params(axis='both', which='minor', labelsize=16)
    ax.legend(fontsize=18)
    # Fix the layout first so the shown figure matches what gets saved later.
    fig.tight_layout()
    fig.show()

    return fig, ax

In [None]:
# Keep the rows of df_clean whose 'xgb_preds0' value is below the threshold.
# (Further selections on 'xgb_preds1' / 'xgb_preds2' could be chained here.)
cut = 0.08
df = df_clean[df_clean['xgb_preds0'] < cut]

# One figure per probability column; the same selection is passed as both the
# first and the second sample of preds_prob.
for i in ('xgb_preds0', 'xgb_preds1', 'xgb_preds2'):
    fig, ax = preds_prob(df, i, 'issignal', df, i, 'issignal')
    plt.legend(["back", "prim", "second"], fontsize=18)
    plt.title(f"{i}")
    fig.savefig(f"hists{i}.png")

In [None]:
cut3 = 0.08
df3_base = df[df['xgb_preds0'] < cut3]

# Fraction of issignal == 1 rows that survive the selection.
# NOTE(review): the numerator comes from `df` (derived from df_clean) while the
# denominator counts rows of df_clean_urqmd — confirm both should refer to the
# same sample.
n_selected = int((df3_base['issignal'] == 1).sum())
n_total = int((df_clean_urqmd['issignal'] == 1).sum())
eff1 = n_selected / n_total

fig, axs = plt.subplots(figsize=(12, 8))
range1 = (1.105, 1.14)
bins1 = 150

# Raw strings keep the LaTeX '\Lambda' free of invalid-escape warnings.
df3_base['mass'].plot.hist(ax=axs, bins=bins1, range=range1, facecolor='red', alpha=0.3,
                           grid=True, sharey=True, label=r'XGB selected $\Lambda$s')
df3_base[df3_base['issignal'] == 0]['mass'].plot.hist(
    ax=axs, bins=bins1, range=range1, facecolor='green', alpha=0.3, grid=True, sharey=True,
    label='\n False positives = \n (MC =0)\n background in \n the distribution')

axs.legend(fontsize=18, loc='upper right')
axs.set_title(r"XGB selected $\Lambda$ candidates with a cut of %.3f " % cut3
              + "on the XGB back probability distribution", fontsize=18)
axs.set_xlabel("Mass (GeV/${c^2}$)", fontsize=18)
axs.set_ylabel("Counts", fontsize=18)
axs.text(1.123, 4000, 'CBM Performance', fontsize=18)
axs.text(1.123, 3500, 'URQMD, Au+Au @ 12A GeV/$c$', fontsize=18)
# The efficiency is stored in `eff1`; the former reference to the undefined
# name `eff` raised NameError.
axs.text(1.123, 3000, "Primaries efficiency = %.3f" % eff1, fontsize=18)
axs.tick_params(labelsize=18)
fig.tight_layout()
fig.savefig("whole_sample_invmass_with_ML.png")
del df3_base

In [None]:
import uproot
import awkward as ak
cut = 0.8
export_columns = ['pT', 'rapidity', 'mass', 'issignal', 'xgb_preds1']

# Tree "t2": rows of df_clean with xgb_preds0 < cut and issignal > 0, columns
# renamed with an "MC" prefix. `rename` replaces the former in-place edit of
# `columns.values`, which mutates the Index buffer behind pandas' back and is
# not guaranteed to take effect.
df3 = (
    df_clean.loc[(df_clean['xgb_preds0'] < cut) & (df_clean['issignal'] > 0), export_columns]
    .rename(columns={'pT': 'MCpT', 'rapidity': 'MCrapidity', 'mass': 'MCmass',
                     'issignal': 'MCissignal', 'xgb_preds1': 'MCxgb_preds'})
    .astype({'MCissignal': 'float', 'MCxgb_preds': 'float'})
)
# NOTE(review): a later cell in this notebook reads df_clean and df_clean_urqmd
# again; it cannot run after the two `del` statements in this cell.
del df_clean

# Tree "t1": rows of df_clean_urqmd with xgb_preds0 < cut.
# The code used to read a column 'xgb_preds' that does not exist (KeyError); the
# probability column is 'xgb_preds1'. It is renamed to 'xgb_preds' here, mirroring
# 'MCxgb_preds' above — NOTE(review): confirm the intended branch name.
df3_base3 = (
    df_clean_urqmd.loc[df_clean_urqmd['xgb_preds0'] < cut, export_columns]
    .rename(columns={'xgb_preds1': 'xgb_preds'})
    .astype({'issignal': 'double', 'xgb_preds': 'double'})
)
del df_clean_urqmd, export_columns

# The context manager closes the output file once both trees are written.
with uproot.recreate("new_c3_pt_y_y_yield_bdt_cut_0.8.root") as file:
    file["t1"] = df3_base3
    file["t2"] = df3

In [None]:
import uproot
import awkward as ak
# Writes every column of the rows with xgb_preds0 < cut.
# NOTE(review): an earlier cell in this notebook deletes df_clean and
# df_clean_urqmd and writes to this same file name; run in order, this cell
# raises NameError, and run on its own it overwrites that cell's output.
# Confirm which of the two export cells should be kept.
cut = 0.8
df3 = df_clean[df_clean['xgb_preds0'] < cut]
df3_base = df_clean_urqmd[df_clean_urqmd['xgb_preds0'] < cut]

# The context manager closes the output file once both trees are written.
with uproot.recreate("new_c3_pt_y_y_yield_bdt_cut_0.8.root") as file:
    file["t1"] = df3_base
    file["t2"] = df3