In [None]:
import os
import sys
import importlib
import inspect
import numpy as np
import awkward
import ROOT

In [None]:
class args:
    """Static settings for this notebook, read as plain class attributes."""

    # Directory that receives everything this notebook writes.
    out_dir = "BDT_Ada_1"

    # Name of the module imported later to obtain the preselection function.
    training_preselection = "training_01_W"

    # Directory holding the input data frames and the `selections` modules.
    in_dir = "../plotting"

    # Serialized data frames loaded with awkward.load.
    data_frames = in_dir + "/df_step1.awkd"

In [None]:
# Load the serialized data frames. `dfs` is indexed below by sample key, and
# each entry exposes a "dframe" item holding the per-event columns.
dfs = awkward.load(args.data_frames)

In [None]:
# Every sample key available in the loaded data frames.
keys = list(dfs.keys())

# Signal samples are those whose key contains "VBS_EWK/".
sigs = [sample for sample in keys if "VBS_EWK/" in sample]

# Background samples are those whose key contains "WJets/".
bkgs = [sample for sample in keys if "WJets/" in sample]

In [None]:
# Create the output directory; exist_ok=True makes re-running this cell harmless.
os.makedirs(args.out_dir, exist_ok=True)

In [None]:
# Columns used as classifier inputs. The order matters: it is the column order
# of the feature matrix and the order in which variables are declared to TMVA.
training_variables = [
    # lept1_*
    "lept1_pt",
    "lept1_eta",
    # pf_met_*
    "pf_met_corr",
    # vbf_*
    "vbf_jj_m",
    "vbf_jj_Deta",
    "vbf_j1_pt",
    "vbf_j1_eta",
    "vbf_j2_pt",
    "vbf_j2_eta",
    # fatjet_*
    "fatjet_m",
    "fatjet_pt",
    "fatjet_eta",
    # wv_*
    "wv_m",
    "wv_pt",
    "wv_eta",
    # centrality / zeppenfeld
    "boson_centrality",
    "zeppenfeld_w_Deta",
    "zeppenfeld_v_Deta",
    # *_type0 angles
    "costheta1_type0",
    "costheta2_type0",
    "phi_type0",
    "phi1_type0",
    "costhetastar_type0",
    # w_*
    "w_pt",
    "w_eta",
    "w_mt",
    # ht
    "ht",
]

# Columns multiplied together to form the per-event weight.
weight_variables = [
    "gen_weight",
    # "pu_weight",  (currently disabled)
]

# Record the training variables, one per line, next to the other outputs.
# The context manager closes the file even if a write raises part-way through.
with open(f"{args.out_dir}/variable_list.txt", "w") as variables_:
    for variable_name in training_variables:
        print(variable_name, file=variables_)

In [None]:
# Make the selection modules under <in_dir>/selections importable. The guard
# keeps sys.path from gaining a duplicate entry every time this cell is re-run.
selections_dir = f"{args.in_dir}/selections"
if selections_dir not in sys.path:
    sys.path.append(selections_dir)

# Import the module named in the configuration.
preselection_code = importlib.import_module(args.training_preselection)

# `region_` is the selection function; it is called below as
# preselection(df, channel) and its results are combined with `|`.
preselection = preselection_code.region_

# Show the selection source so the notebook output records what was applied.
print(inspect.getsource(preselection))

In [None]:
# Build the signal feature matrix X_sig (one row per selected event, one column
# per training variable) and the matching per-event weights w_sig.
#
# Per-sample pieces are collected in lists and concatenated once at the end,
# instead of re-copying the accumulated arrays on every iteration.
sig_feature_parts = []
sig_weight_parts = []

for sig_key in sigs:
    df = dfs[sig_key]["dframe"]

    # Keep events passing the preselection called with "m" or with "e".
    preselect_df = df[preselection(df, "m") | preselection(df, "e")]

    X_ = np.column_stack([preselect_df[name] for name in training_variables])

    # The event weight is the product of all configured weight columns.
    w_ = 1.0
    for name in weight_variables:
        w_ = w_ * preselect_df[name]
    # Guarantee one weight per row: with an empty weight_variables list w_ is
    # still the scalar 1.0, which could be neither concatenated nor sliced later.
    w_ = np.broadcast_to(np.asarray(w_), (X_.shape[0],))

    sig_feature_parts.append(X_)
    sig_weight_parts.append(w_)

if not sig_feature_parts:
    # Fail here with a clear message rather than on `None.shape` below.
    raise RuntimeError("No signal samples to load: `sigs` is empty")

X_sig = np.concatenate(sig_feature_parts)
w_sig = np.concatenate(sig_weight_parts)

print("Signal dataset shape: ", X_sig.shape)

In [None]:
# Build the background feature matrix X_bkg (one row per selected event, one
# column per training variable) and the matching per-event weights w_bkg.
#
# Per-sample pieces are collected in lists and concatenated once at the end,
# instead of re-copying the accumulated arrays on every iteration.
bkg_feature_parts = []
bkg_weight_parts = []

for bkg_key in bkgs:
    df = dfs[bkg_key]["dframe"]

    # Keep events passing the preselection called with "m" or with "e".
    preselect_df = df[preselection(df, "m") | preselection(df, "e")]

    X_ = np.column_stack([preselect_df[name] for name in training_variables])

    # The event weight is the product of all configured weight columns.
    w_ = 1.0
    for name in weight_variables:
        w_ = w_ * preselect_df[name]
    # Guarantee one weight per row: with an empty weight_variables list w_ is
    # still the scalar 1.0, which could be neither concatenated nor sliced later.
    w_ = np.broadcast_to(np.asarray(w_), (X_.shape[0],))

    bkg_feature_parts.append(X_)
    bkg_weight_parts.append(w_)

if not bkg_feature_parts:
    # Fail here with a clear message rather than on `None.shape` below.
    raise RuntimeError("No background samples to load: `bkgs` is empty")

X_bkg = np.concatenate(bkg_feature_parts)
w_bkg = np.concatenate(bkg_weight_parts)

print("Background dataset shape: ", X_bkg.shape)

In [None]:
def make_std_vector(X):
    """Convert every row of ``X`` into a ROOT ``std::vector<double>``.

    Parameters
    ----------
    X : iterable of iterables of numbers
        Each inner iterable becomes one vector, filled element by element.

    Returns
    -------
    list
        One ``std::vector<double>`` per row of ``X``, in the same order.
    """

    def row_to_vector(values):
        # A fresh vector per row, filled in the row's own element order.
        vec = ROOT.std.vector("double")()
        for value in values:
            vec.push_back(value)
        return vec

    return [row_to_vector(row) for row in X]

In [None]:
# Shuffle the events and split each class 50/50 into training and test halves.
#
# Fixed seed so that Restart & Run All reproduces the same split.
shuffle_rng = np.random.RandomState(12345)

# The SAME permutation must be applied to the features and to the weights;
# permuting only the features would pair every event with another event's weight.
permuate_sig = shuffle_rng.permutation(X_sig.shape[0])
X_sig = X_sig[permuate_sig]
w_sig = np.asarray(w_sig)[permuate_sig]

permuate_bkg = shuffle_rng.permutation(X_bkg.shape[0])
X_bkg = X_bkg[permuate_bkg]
w_bkg = np.asarray(w_bkg)[permuate_bkg]

# First half (rounded down) goes to training, the remainder to testing.
ns_train = X_sig.shape[0] // 2
nb_train = X_bkg.shape[0] // 2

X_sig_train, w_sig_train = X_sig[:ns_train], w_sig[:ns_train]
X_sig_test, w_sig_test = X_sig[ns_train:], w_sig[ns_train:]

X_bkg_train, w_bkg_train = X_bkg[:nb_train], w_bkg[:nb_train]
X_bkg_test, w_bkg_test = X_bkg[nb_train:], w_bkg[nb_train:]

In [None]:
# Turn each split into a list of per-event vectors via make_std_vector;
# these are what get passed to the TMVA dataloader event by event.
X_sig_train_vec, X_sig_test_vec = (
    make_std_vector(split) for split in (X_sig_train, X_sig_test)
)

X_bkg_train_vec, X_bkg_test_vec = (
    make_std_vector(split) for split in (X_bkg_train, X_bkg_test)
)

In [None]:
# ROOT file handed to the TMVA factory.
outfile = ROOT.TFile(f"{args.out_dir}/tmva_output.root", "recreate")

# Factory options, joined with ":" into a single option string.
factory_options = [
    "!V",
    "!Silent",
    "Color",
    "DrawProgressBar",
    "Transformations=I,G",
    "AnalysisType=Classification",
]
factory = ROOT.TMVA.Factory("VBS", outfile, ":".join(factory_options))

dataloader = ROOT.TMVA.DataLoader(args.out_dir)

# Declare the input variables in the same order as the feature-matrix columns.
for var in training_variables:
    dataloader.AddVariable(var, "F")

# (adder, events, weights) for each class/split, registered in this order:
# signal train, signal test, background train, background test.
event_sources = (
    (dataloader.AddSignalTrainingEvent, X_sig_train_vec, w_sig_train),
    (dataloader.AddSignalTestEvent, X_sig_test_vec, w_sig_test),
    (dataloader.AddBackgroundTrainingEvent, X_bkg_train_vec, w_bkg_train),
    (dataloader.AddBackgroundTestEvent, X_bkg_test_vec, w_bkg_test),
)
for add_event, events, weights in event_sources:
    for event, w_ in zip(events, weights):
        add_event(event, w_)

dataloader.PrepareTrainingAndTestTree(ROOT.TCut(""), "SplitMode=Random:NormMode=NumEvents")

In [None]:
# AdaBoost BDT configuration, joined with ":" into a single option string.
bdt_options = [
    "!H",
    "!V",
    "NTrees=500",
    "MinNodeSize=10%",
    "BoostType=AdaBoost",
    "AdaBoostBeta=0.05",
    "NegWeightTreatment=Pray",
]

# Book the method on the dataloader; "BDT" is passed as both type and title.
factory.BookMethod(dataloader, "BDT", "BDT", ":".join(bdt_options))

#factory.BookMethod(
#    dataloader,
#    "BDT",
#    "BDT",
#    ":".join([
#        "!H", "!V",
#        "NTrees=500",
#        "MinNodeSize=10%",
#        "BoostType=Grad",
#        "Shrinkage=0.05",
#        "UseBaggedBoost", "BaggedSampleFraction=0.5",
#        "NegWeightTreatment=Pray"
#    ])
#)

In [None]:
# Train, test and evaluate every method booked on the factory, in that order.
factory.TrainAllMethods()
factory.TestAllMethods()
factory.EvaluateAllMethods()
   
# Switch to the output file and close it so its contents are written out.
outfile.cd()
outfile.Close()