<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#signal" data-toc-modified-id="signal-1">signal</a></span></li><li><span><a href="#background" data-toc-modified-id="background-2">background</a></span></li><li><span><a href="#combine-together" data-toc-modified-id="combine-together-3">combine together</a></span></li></ul></div>

Preserve features which have power to discriminate signal from background.

Taken from `./DiscriminateVariables.ipynb`; the shape comparisons are plotted there.

In [1]:
import utils.histoHelpers as uhh
import utils.uprootHelpers as uuh
import mvatrain.preprocessors as mpp
from utils.SignalDescription import SignalDescription
from utils.BackgroundDescription import BackgroundDescription

import numpy as np
import awkward

# signal

In [2]:
# Look up the signal files for both final states.
sd = SignalDescription()
sig4mu = sd.files('4mu', grouped=True, nth=-1)
sig2mu2e = sd.files('2mu2e', grouped=True, nth=-1)

# Each lookup is a mapping whose values are iterables of files; flatten
# every mapping into a single list of files.
sigfs4mu = [path for group in sig4mu.values() for path in group]
sigfs2mu2e = [path for group in sig2mu2e.values() for path in group]

In [15]:
def match(obj):
    """Return the flattened jet-side match mask used as the training target.

    Gen particles with ``gen_pid == 32`` (named "dark photon" in this
    notebook) are matched against the ``pfjet_p4`` four-vectors through
    ``uuh.MaskArraysFromMatching``. Only the second mask it returns (the
    jet-side one) is kept; it is then restricted with ``obj.mHLT`` and
    flattened.
    """
    tree = obj.tree
    isDarkPhoton = tree['gen_pid'].array() == 32
    darkPhotonP4 = uuh.p4Array(tree['gen_p4'])[isDarkPhoton]
    pfjetP4 = uuh.p4Array(tree['pfjet_p4'])
    # The gen-side mask is not needed here.
    _, jetMatched = uuh.MaskArraysFromMatching(darkPhotonP4, pfjetP4)
    return jetMatched[obj.mHLT].flatten()

def pt(obj):
    """Return the flattened ``pt`` of the ``pfjet_p4`` four-vectors after the ``obj.mHLT`` mask."""
    jetP4 = uuh.p4Array(obj.tree['pfjet_p4'])
    selected = jetP4[obj.mHLT]
    return selected.pt.flatten()

def eta(obj):
    """Return the flattened ``eta`` of the ``pfjet_p4`` four-vectors after the ``obj.mHLT`` mask."""
    jetP4 = uuh.p4Array(obj.tree['pfjet_p4'])
    selected = jetP4[obj.mHLT]
    return selected.eta.flatten()

def neutralEnergyFrac(obj):
    """Return the neutral energy fraction of each jet, masked by ``obj.mHLT`` and flattened.

    The fraction is ``(pfjet_neutralEmE + pfjet_neutralHadronE)`` divided by
    the energy of the ``pfjet_p4`` four-vector.
    """
    tree = obj.tree
    neutralE = tree['pfjet_neutralEmE'].array() + tree['pfjet_neutralHadronE'].array()
    jetE = uuh.p4Array(tree['pfjet_p4']).energy
    frac = neutralE / jetE
    return frac[obj.mHLT].flatten()

def pickExtreme(ja, maxmin):
    """Pick, for every entry of ``ja``, the element with the extreme absolute value.

    NaN elements are removed and absolute values are taken before the
    extreme is selected.

    Parameters
    ----------
    ja : jagged array
        Presumably an awkward ``JaggedArray`` with one sub-array per jet
        (TODO confirm against the callers). It must support ``.size``,
        boolean-mask indexing, ``argmax``/``argmin`` and ``count``.
    maxmin : {'max', 'min'}
        Which extreme of the absolute values to keep.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one value per entry of the selected result;
        NaN where an entry has no element left after NaN removal. An empty
        array is returned when ``ja`` (or its NaN-filtered version) has
        size 0.

    Raises
    ------
    ValueError
        If ``maxmin`` is neither ``'max'`` nor ``'min'``. This used to
        surface as an ``UnboundLocalError`` further down.
    """
    if ja.size == 0:
        return np.array([])
    _ja = np.abs(ja[~np.isnan(ja)])
    if _ja.size == 0:
        return np.array([])

    if maxmin == 'max':
        _res = _ja[_ja.argmax()]
    elif maxmin == 'min':
        _res = _ja[_ja.argmin()]
    else:
        raise ValueError("maxmin must be 'max' or 'min', got {!r}".format(maxmin))

    # After the arg-selection each entry holds either one element or none.
    cnts = _res.count()
    res = np.zeros(len(_res), dtype='float32')
    singletonIdx, zerotonIdx = np.where(cnts == 1), np.where(cnts == 0)
    res[singletonIdx] = _res[singletonIdx]
    # Entries with nothing left to pick from are flagged as missing.
    res[zerotonIdx] = np.nan
    return res

def maxd0(obj):
    """Return, per entry, the largest absolute ``pfjet_pfcand_tkD0`` value.

    The branch is converted with ``uuh.NestNestObjArrayToJagged``, masked by
    ``obj.mHLT`` and flattened before ``pickExtreme`` selects the maximum.
    """
    rawD0 = obj.tree['pfjet_pfcand_tkD0'].array()
    selectedD0 = uuh.NestNestObjArrayToJagged(rawD0)[obj.mHLT].flatten()
    return pickExtreme(selectedD0, 'max')

def mind0(obj):
    """Return, per entry, the smallest absolute ``pfjet_pfcand_tkD0`` value.

    The branch is converted with ``uuh.NestNestObjArrayToJagged``, masked by
    ``obj.mHLT`` and flattened before ``pickExtreme`` selects the minimum.
    """
    rawD0 = obj.tree['pfjet_pfcand_tkD0'].array()
    selectedD0 = uuh.NestNestObjArrayToJagged(rawD0)[obj.mHLT].flatten()
    return pickExtreme(selectedD0, 'min')

def tkiso(obj):
    """Return the ``pfjet_tkIsolation05`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_tkIsolation05'].array()
    return values[obj.mHLT].flatten()

def pfiso(obj):
    """Return the ``pfjet_pfIsolation05`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_pfIsolation05'].array()
    return values[obj.mHLT].flatten()

def ptspread(obj):
    """Return the ``pfjet_ptDistribution`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_ptDistribution'].array()
    return values[obj.mHLT].flatten()

def drspread(obj):
    """Return the ``pfjet_dRSpread`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_dRSpread'].array()
    return values[obj.mHLT].flatten()

def jetsub_lambda(obj):
    """Return the ``pfjet_subjet_lambda`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_subjet_lambda'].array()
    return values[obj.mHLT].flatten()

def jetsub_epsilon(obj):
    """Return the ``pfjet_subjet_epsilon`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_subjet_epsilon'].array()
    return values[obj.mHLT].flatten()

def ecf1(obj):
    """Return the ``pfjet_subjet_ecf1`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_subjet_ecf1'].array()
    return values[obj.mHLT].flatten()

def ecf2(obj):
    """Return the ``pfjet_subjet_ecf2`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_subjet_ecf2'].array()
    return values[obj.mHLT].flatten()

def ecf3(obj):
    """Return the ``pfjet_subjet_ecf3`` branch, masked by ``obj.mHLT`` and flattened."""
    values = obj.tree['pfjet_subjet_ecf3'].array()
    return values[obj.mHLT].flatten()

# ------------------

In [4]:
# Pick methods for the signal sample: output key -> function that takes an
# object exposing `.tree` and `.mHLT` and returns a flat array.
# 'target' is the truth label produced by `match`; every other entry is an
# input feature.
# NOTE(review): presumably ffMultiPicker calls each function once per file
# and concatenates the results -- confirm in mvatrain.preprocessors.
_pm = {
    'target': match,
    'pt': pt,
    'eta': eta,
    'neufrac': neutralEnergyFrac,
    'maxd0': maxd0,
    'mind0': mind0,
    'tkiso': tkiso,
    'pfiso': pfiso,
    'spreadpt': ptspread,
    'spreaddr': drspread,
    'lambda': jetsub_lambda,
    'epsilon': jetsub_epsilon,
    'ecf1': ecf1,
    'ecf2': ecf2,
    'ecf3': ecf3,
}

In [5]:
# Picker over all signal files (4mu list followed by 2mu2e list), configured
# with the pick methods in `_pm`.
mp = mpp.ffMultiPicker(sigfs4mu+sigfs2mu2e, pickmethods=_pm)

In [6]:
# Run the picker; the result is used afterwards as a mapping from the keys
# of `_pm` to arrays.
mp_res = mp.pick()

In [7]:
# Number of entries in every picked signal array (expected to agree).
[arr.size for arr in mp_res.values()]

[295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891,
 295891]

In [8]:
# Write the picked signal arrays to data/signal_190513.awkd.
awkward.save('data/signal_190513.awkd', mp_res, mode='w')

In [9]:
# Label balance within the signal sample: counts of False vs. True targets.
np.unique(mp_res['target'], return_counts=True)

(array([False,  True]), array([ 46991, 248900]))

----

# background

In [10]:
# Collect the background files to pick from.
bd = BackgroundDescription()
bkgfs = bd.getTotalFiles()

In [16]:
# Pick methods for the background sample: the same features as for signal
# but without 'target' -- no matching is run here, and the label is added
# afterwards as an all-False array.
# NOTE(review): this rebinds `_pm`; the feature keys must stay identical to
# the signal mapping because the two samples are concatenated key by key.
_pm = {
    'pt': pt,
    'eta': eta,
    'neufrac': neutralEnergyFrac,
    'maxd0': maxd0,
    'mind0': mind0,
    'tkiso': tkiso,
    'pfiso': pfiso,
    'spreadpt': ptspread,
    'spreaddr': drspread,
    'lambda': jetsub_lambda,
    'epsilon': jetsub_epsilon,
    'ecf1': ecf1,
    'ecf2': ecf2,
    'ecf3': ecf3,
}

In [20]:
# Development aid: reload mvatrain.preprocessors so that edits made to the
# module on disk are picked up without restarting the kernel.
# NOTE(review): the import belongs with the other imports at the top, and
# the reload can be dropped (or replaced by %autoreload) once the module is
# stable.
import importlib
importlib.reload(mpp)

<module 'mvatrain.preprocessors' from '/uscms/home/wsi/nobackup/lpcdm/ffAna/mvatrain/preprocessors.py'>

In [21]:
# Picker over the background files, configured with the background `_pm`.
# NOTE(review): rebinds `mp`, which previously held the signal picker.
mp = mpp.ffMultiPicker(bkgfs, pickmethods=_pm)

In [22]:
# Run the background picker; rebinds `mp_res` (the signal result was saved
# to disk beforehand).
# NOTE(review): the recorded output shows per-file exceptions being reported
# rather than raised -- presumably those files are missing from the result;
# confirm in ffMultiPicker.pick().
mp_res = mp.pick()

Exception occured in ffMultiPicker.pick()
File:  root://cmseos.fnal.gov//eos/uscms/store/group/lpcmetx/MCSIDM/ffNtuple/2018/QCD_Pt-20to30_MuEnrichedPt5_TuneCP5_13TeV_pythia8/RunIIAutumn18DRPremix-102X_upgrade2018_realistic_v15-v4/190508_203602/0000/ffNtuple_1.root
Msg:  zero-size array to reduction operation maximum which has no identity
Exception occured in ffMultiPicker.pick()
File:  root://cmseos.fnal.gov//eos/uscms/store/group/lpcmetx/MCSIDM/ffNtuple/2018/QCD_Pt-20to30_MuEnrichedPt5_TuneCP5_13TeV_pythia8/RunIIAutumn18DRPremix-102X_upgrade2018_realistic_v15-v4/190508_203602/0000/ffNtuple_106.root
Msg:  zero-size array to reduction operation maximum which has no identity
Exception occured in ffMultiPicker.pick()
File:  root://cmseos.fnal.gov//eos/uscms/store/group/lpcmetx/MCSIDM/ffNtuple/2018/QCD_Pt-20to30_MuEnrichedPt5_TuneCP5_13TeV_pythia8/RunIIAutumn18DRPremix-102X_upgrade2018_realistic_v15-v4/190508_203602/0000/ffNtuple_143.root
Msg:  zero-size array to reduction operation maximu

In [23]:
# Number of entries in every picked background array (expected to agree).
[arr.size for arr in mp_res.values()]

[2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372,
 2577372]

In [24]:
# Background label: one False per picked entry, sized from the 'pt' array.
target_ = np.zeros(len(mp_res['pt']), dtype=bool)

In [25]:
# Attach the all-False label so the background has the same keys as signal.
mp_res['target'] = target_

In [26]:
# Write the picked background arrays (including 'target') to data/bkg_190513.awkd.
awkward.save('data/bkg_190513.awkd', mp_res, mode='w')

----

# combine together

In [27]:
# Read both samples back from disk so this section does not depend on the
# in-memory `mp_res`.
sig = awkward.load('data/signal_190513.awkd')
bkg = awkward.load('data/bkg_190513.awkd')

In [28]:
# Stack signal and background feature by feature, signal entries first.
# Both samples must expose exactly the same keys: an extra background key
# would otherwise be dropped silently, and a missing one would only fail
# part-way through the loop.
mismatch_ = set(sig.keys()) ^ set(bkg.keys())
assert not mismatch_, 'signal/background feature mismatch: {}'.format(sorted(mismatch_))
combo_ = {k: np.concatenate([sig[k], bkg[k]]) for k in sig.keys()}

In [29]:
# Number of entries in every combined array (signal + background).
[arr.size for arr in combo_.values()]

[2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263,
 2873263]

In [30]:
# Write the combined signal + background arrays to data/combo_190513.awkd.
awkward.save('data/combo_190513.awkd', combo_, mode='w')

----