# Interfacing Coffea processor/plotting with the xgboost model

This notebook is a clean version of the coffea processor notebook I've been working on in an attempt to interface the coffea processor and histogramming tools with the xgboost BDT model, so as to be able to plot BDT prediction values using the coffea histogramming and plotting tools. This way, alternate models could be substituted in, and predictions are made as part of the processor, rather than saved in the ntuples.

First, imports (though not all of these are probably necessary for this notebook, they are the ones I have in my longer notebook).

In [13]:
%matplotlib inline
np.seterr(divide='ignore', invalid='ignore', over='ignore')
import os
import time
import json
import uproot
import awkward
import xgboost as xgb
import pandas as pd
import numpy as np
import utils.histoHelpers as uhh
import utils.uprootHelpers as uuh
import mvatrain.preprocessors as mpp
import matplotlib.pyplot as plt
import coffea.processor as processor
from mvatrain.metfilter import *
from os.path import join
from coffea import hist
from coffea.analysis_objects import JaggedCandidateArray
from coffea.processor import defaultdict_accumulator
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from mvatrain.ROCPlot import ROCPlot
from mvatrain.hist_errorbars import hist_errorbars
from FireHydrant.Tools.uproothelpers import NestNestObjArrayToJagged
from FireHydrant.Tools.trigger import Triggers
from FireHydrant.Tools.metfilter import MetFilters

Set the directory for our model (I have various models trained, earl_grey_strong was trained on a mix of background + 2mu2e signal + 4mu signal, for a longer training time).

In [16]:
OUTPUT_DIR = join(os.environ["FFANA_BASE"], f"mvatrain/outputs/earl_grey_strong")  #model

Load in the model so that we can use it to make predictions later:

In [17]:
print("loading model...")
xgbm_default = xgb.Booster({"nthread": 16})
xgbm_default.load_model(join(OUTPUT_DIR, "model_default/model.bin"))
xgbm_optimized = xgb.Booster({"nthread": 16})
xgbm_optimized.load_model(join(OUTPUT_DIR, "model_optimized/model.bin"))
print("model loaded.")

loading model...
model loaded.


Load in the files from the long 2018.json file, and split them into background and signal:

In [18]:
print("Getting background files from .json...")
datasets_ = json.load(open('2018.json'))
bkgdatasets = {}
for group in datasets_.keys():
    if 'DoubleMuon' in group: continue #not data
    if 'CRAB_PrivateMC' in group: continue #not signal
    files = datasets_[group]['files']
    bkgdatasets[group] = [files] #normal background
            
print("Getting signal files from .json...")
datasets_ = json.load(open('2018.json'))
sigdatasets = []
for group in datasets_.keys():
    if not ('CRAB_PrivateMC' in group): continue #not data or background
    files = datasets_[group]['files']
    for file in files: sigdatasets.append(file)
    
print("Files gotten!")

Getting background files from .json...
Getting signal files from .json...
Files gotten!


Split the signal files into 2mu2e and 4mu subsets:

In [19]:
lssig4mu = []
lssig2mu2e = []

for path in sigdatasets:
    if '4Mu' in path:
        lssig4mu.append(path)
    else:
        lssig2mu2e.append(path)

Put those subsets into a dictionary with treenames so that the processor can read them later:/

In [11]:
dataset=dict(
    sig4mu={'files': [], 'treename': 'ffNtuplizer/ffNtuple'},
    sig2mu2e={'files': [], 'treename': 'ffNtuplizer/ffNtuple'},
)

dataset['sig4mu']['files'].extend(lssig4mu)
dataset['sig2mu2e']['files'].extend(lssig2mu2e)

Now, we set up our coffea processor to predictions and back-save predictions into an output so that we can plot them associated with our respective events (this is currently the part that's broken):

In [14]:
class LeptonJetProcessor(processor.ProcessorABC):
    def __init__(self):
        dataset_axis = hist.Cat('dataset', 'signal datasets')
        match_axis   = hist.Cat('match', 'matched')
        pt_axis       = hist.Bin("pt", "pT [GeV]", 50, 0, 800)
        eta_axis      = hist.Bin("eta", 'eta', 50, -2.4, 2.4)
        nef_axis      = hist.Bin("nef", "neutral energy fraction", 50, 0, 1)
        maxd0_axis    = hist.Bin("maxd0", 'track max |d0|', 50, 0, 0.5)
        mind0_axis    = hist.Bin("mind0", 'track min |d0|', 50, 0, 0.5)
        tkiso_axis    = hist.Bin('tkiso', 'track isolation', 50, 0, 1)
        pfiso_axis    = hist.Bin("pfiso", "PFCands isolation", 50, 0, 1)
        spreadpt_axis = hist.Bin("spreadpt", "spreadpt", 50, 0, 1)
        spreaddr_axis = hist.Bin("spreaddr", "spreaddr", 50, 0, 0.1)
        lambda_axis   = hist.Bin('lamb', 'jet sub - lambda', 50, -8, 0)
        epsilon_axis  = hist.Bin('epsi', 'jet sub - epsilon', 50, 0, 0.25)
        ecfe1_axis    = hist.Bin('ecfe1', 'energy correlation function - e1', 50, 0, 750)
        ecfe2_axis    = hist.Bin('ecfe2', 'energy correlation function - e2', 50, 0, 2000)
        ecfe3_axis    = hist.Bin('ecfe3', 'energy correlation function - e3', 50, 0, 1000)
        
        self._accumulator = processor.dict_accumulator({
            'pt': hist.Hist("#counts/16GeV", dataset_axis, pt_axis, match_axis),
            "eta": hist.Hist("#counts/0.096", dataset_axis, eta_axis, match_axis),
            "nef": hist.Hist("#counts/0.02", dataset_axis, nef_axis, match_axis),
            "maxd0": hist.Hist("#counts/0.01cm", dataset_axis, maxd0_axis, match_axis),
            "mind0": hist.Hist("#counts/0.01cm", dataset_axis, mind0_axis, match_axis),
            "tkiso": hist.Hist("#counts/0.02", dataset_axis, tkiso_axis, match_axis),
            "pfiso": hist.Hist("#counts/0.02", dataset_axis, pfiso_axis, match_axis),
            "spreadpt": hist.Hist("#counts/0.02", dataset_axis, spreadpt_axis, match_axis),
            "spreaddr": hist.Hist("#counts/0.002", dataset_axis, spreaddr_axis, match_axis),
            "lamb": hist.Hist("#counts/0.16", dataset_axis, lambda_axis, match_axis),
            "epsi": hist.Hist("#counts/0.005", dataset_axis, epsilon_axis, match_axis),
            "ecfe1": hist.Hist("#counts/25", dataset_axis, ecfe1_axis, match_axis),
            "ecfe2": hist.Hist("#counts/40", dataset_axis, ecfe2_axis, match_axis),
            "ecfe3": hist.Hist("#counts/20", dataset_axis, ecfe3_axis, match_axis),
        })

    @property
    def accumulator(self):
        return self._accumulator
    
    def process(self, df):
        output = self.accumulator.identity()
        
        dataset = df['dataset']  
        
        maxd0_ = np.abs(NestNestObjArrayToJagged(df['pfjet_pfcand_tkD0'])).fillna(0).max()
        mind0_ = np.abs(NestNestObjArrayToJagged(df['pfjet_pfcand_tkD0'])).fillna(0).min()
        
        leptonjets = JaggedCandidateArray.candidatesfromcounts(
            df['pfjet_p4'],
            px=df['pfjet_p4.fCoordinates.fX'],
            py=df['pfjet_p4.fCoordinates.fY'],
            pz=df['pfjet_p4.fCoordinates.fZ'],
            energy=df['pfjet_p4.fCoordinates.fT'],
            nef=(df['pfjet_neutralEmE']+df['pfjet_neutralHadronE'])/df['pfjet_p4.fCoordinates.fT'],
            maxd0=maxd0_.content,
            mind0=mind0_.content,
            tkiso=df['pfjet_tkIsolation05'],
            pfiso=df['pfjet_pfIsolation05'],
            spreadpt=df['pfjet_ptDistribution'],
            spreaddr=df['pfjet_dRSpread'],
            lamb=df['pfjet_subjet_lambda'],
            epsi=df['pfjet_subjet_epsilon'],
            ecf1=df['pfjet_subjet_ecf1'],
            ecf2=df['pfjet_subjet_ecf2'],
            ecf3=df['pfjet_subjet_ecf3'],
        )
        
        genparticles = JaggedCandidateArray.candidatesfromcounts(
            df['gen_p4'],
            px=df['gen_p4.fCoordinates.fX'],
            py=df['gen_p4.fCoordinates.fY'],
            pz=df['gen_p4.fCoordinates.fZ'],
            energy=df['gen_p4.fCoordinates.fT'],
            pid=df['gen_pid']
        )
        darkphotons = genparticles[genparticles.pid==32]
        matchmask = leptonjets.match(darkphotons, deltaRCut=0.3)
        
        metfiltermask = np.logical_and.reduce([df[mf] for mf in MetFilters])
        triggermask = np.logical_or.reduce([df[tp] for tp in Triggers])
        
        leptonjets_t = leptonjets[matchmask][metfiltermask&triggermask]
        leptonjets_f = leptonjets[~matchmask][metfiltermask&triggermask]
        
        flatleptonjets_t = leptonjets_t.flatten()
        flatleptonjets_f = leptonjets_f.flatten()
        dfleptonjets_t = pd.DataFrame(flatleptonjets_t)
        dfleptonjets_f = pd.DataFrame(flatleptonjets_f)
        dfleptonjets_t.fillna(0)
        dfleptonjets_f.fillna(0)
        xglj_t = xgb.DMatrix(dfleptonjets_t)
        xglj_f = xgb.DMatrix(dfleptonjets_f)
        predictions_t = xgbm_optimized.predict(xglj_t)
        predictions_f = xgbm_optimized.predict(xglj_f)
        
        offsets_t = leptonjets_t.offsets
        offsets_f = leptonjets_f.offsets
        jaggedpredictions_t = JaggedCandidateArray.candidatesfromoffsets(predictions_t, offsets_t)
        jaggedpredictions_f = JaggedCandidateArray.candidatesfromoffsets(predictions_f, offsets_f)
        
        output['pt']      .fill(dataset=dataset, match='matched', pt=leptonjets_t.pt.flatten())
        output['eta']     .fill(dataset=dataset, match='matched', eta=leptonjets_t.eta.flatten())
        output['nef']     .fill(dataset=dataset, match='matched', nef=leptonjets_t.nef.flatten())
        output['maxd0']   .fill(dataset=dataset, match='matched', maxd0=leptonjets_t.maxd0.flatten())
        output['mind0']   .fill(dataset=dataset, match='matched', mind0=leptonjets_t.mind0.flatten())
        output['tkiso']   .fill(dataset=dataset, match='matched', tkiso=leptonjets_t.tkiso.flatten())
        output['pfiso']   .fill(dataset=dataset, match='matched', pfiso=leptonjets_t.pfiso.flatten())
        output['spreadpt'].fill(dataset=dataset, match='matched', spreadpt=leptonjets_t.spreadpt.flatten())
        output['spreaddr'].fill(dataset=dataset, match='matched', spreaddr=leptonjets_t.spreaddr.flatten())
        output['lamb']    .fill(dataset=dataset, match='matched', lamb=leptonjets_t.lamb.flatten())
        output['epsi']    .fill(dataset=dataset, match='matched', epsi=leptonjets_t.epsi.flatten())
        output['ecfe1']   .fill(dataset=dataset, match='matched', ecfe1=leptonjets_t.ecf1.flatten())
        output['ecfe2']   .fill(dataset=dataset, match='matched', ecfe2=leptonjets_t.ecf2.flatten())
        output['ecfe3']   .fill(dataset=dataset, match='matched', ecfe3=leptonjets_t.ecf3.flatten())
        output['mva']     .fill(dataset=dataset, match='matched', mva=jaggedpredictions_t.flatten())
        
        output['pt']      .fill(dataset=dataset, match='unmatched', pt=leptonjets_f.pt.flatten())
        output['eta']     .fill(dataset=dataset, match='unmatched', eta=leptonjets_f.eta.flatten())
        output['nef']     .fill(dataset=dataset, match='unmatched', nef=leptonjets_f.nef.flatten())
        output['maxd0']   .fill(dataset=dataset, match='unmatched', maxd0=leptonjets_f.maxd0.flatten())
        output['mind0']   .fill(dataset=dataset, match='unmatched', mind0=leptonjets_f.mind0.flatten())
        output['tkiso']   .fill(dataset=dataset, match='unmatched', tkiso=leptonjets_f.tkiso.flatten())
        output['pfiso']   .fill(dataset=dataset, match='unmatched', pfiso=leptonjets_f.pfiso.flatten())
        output['spreadpt'].fill(dataset=dataset, match='unmatched', spreadpt=leptonjets_f.spreadpt.flatten())
        output['spreaddr'].fill(dataset=dataset, match='unmatched', spreaddr=leptonjets_f.spreaddr.flatten())
        output['lamb']    .fill(dataset=dataset, match='unmatched', lamb=leptonjets_f.lamb.flatten())
        output['epsi']    .fill(dataset=dataset, match='unmatched', epsi=leptonjets_f.epsi.flatten())
        output['ecfe1']   .fill(dataset=dataset, match='unmatched', ecfe1=leptonjets_f.ecf1.flatten())
        output['ecfe2']   .fill(dataset=dataset, match='unmatched', ecfe2=leptonjets_f.ecf2.flatten())
        output['ecfe3']   .fill(dataset=dataset, match='unmatched', ecfe3=leptonjets_f.ecf3.flatten())
        output['mva']     .fill(dataset=dataset, match='unmatched', mva=jaggedpredictions_f.flatten())
        
        return output
    
    def postprocess(self, accumulator):
        return accumulator

In [15]:
output = processor.run_uproot_job(dataset,
                                  treename=None,
                                  processor_instance=LeptonJetProcessor(),
                                  executor=processor.futures_executor,
                                  executor_args=dict(workers=12, flatten=True),
                                  chunksize=500000,
                                 )

Preprocessing: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
Processing:   0%|          | 0/300 [00:00<?, ?items/s]


ValueError: feature_names mismatch: ['target', 'pt', 'eta', 'neufrac', 'maxd0', 'mind0', 'tkiso', 'pfiso', 'spreadpt', 'spreaddr', 'lambda', 'epsilon', 'ecf1', 'ecf2', 'ecf3'] []
expected ecf1, ecf2, target, tkiso, spreadpt, mind0, spreaddr, epsilon, pt, ecf3, lambda, maxd0, neufrac, pfiso, eta in input data