<center>
    <img src="img/CMS_Jets.png" width="40%" />
    <br />
    <h1>Implementing a Charm Tagger with Scikit-Learn</h1>
    <br /><br />
    Seth Moortgat, December 14, 2015
    <br /><br />
    Machine Learning Seminar @ IIHE
</center>

<center>
    <h1>Overview: What is a charm tagger?</h1>
    <img src="img/CSV.png" width="40%" />
</center>
* Charm tagging in CMS: exploit the lifetime of D mesons 
→ a D meson travels some distance in the tracker before it decays, producing a secondary vertex (SV) with displaced tracks
* Combine information from secondary vertices, displaced tracks and soft leptons inside the jet to distinguish charm-quark jets from bottom- or light-flavour jets.

<center>
    <h1>Use Multivariate Analysis (MVA) techniques</h1>
    <img src="img/MVA.png" width="80%" />
</center>

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)
plt.rcParams["figure.max_open_warning"] = -1

# Print options
import numpy as np
np.set_printoptions(precision=3)

In [None]:
# Full list of candidate input features (tree branch names), grouped by the
# kind of object they describe. The order defines the column order of X.
variables = [
    # --- Displaced tracks ---
    "trackSip2dSig_0",
    "trackSip2dSig_1",
    "trackSip3dSig_0",
    "trackSip3dSig_1",
    "trackPtRel_0",
    "trackPtRel_1",
    "trackPPar_0",
    "trackPPar_1",
    "trackEtaRel_0",
    "trackEtaRel_1",
    "trackDeltaR_0",
    "trackDeltaR_1",
    "trackPtRatio_0",
    "trackPtRatio_1",
    "trackPParRatio_0",
    "trackPParRatio_1",
    "trackJetDist_0",
    "trackJetDist_1",
    "trackDecayLenVal_0",
    "trackDecayLenVal_1",
    "trackSip2dSigAboveCharm_0",
    "trackSip3dSigAboveCharm_0",
    "trackSumJetEtRatio",
    "trackSumJetDeltaR",
    # --- Secondary vertex ---
    "vertexMass_0",
    "vertexEnergyRatio_0",
    "flightDistance2dSig_0",
    "flightDistance3dSig_0",
    "vertexJetDeltaR_0",
    "massVertexEnergyFraction_0",
    "vertexBoostOverSqrtJetPt_0",
    "jetNSecondaryVertices",
    "jetNTracks",
    "vertexNTracks_0",
    # --- Soft leptons ---
    "leptonPtRel_0",
    "leptonPtRel_1",
    "leptonSip3d_0",
    "leptonSip3d_1",
    "leptonDeltaR_0",
    "leptonDeltaR_1",
    "leptonRatioRel_0",
    "leptonRatioRel_1",
    "leptonEtaRel_0",
    "leptonEtaRel_1",
    "leptonRatio_0",
    "leptonRatio_1",
]

## Discriminate charm-jets from light jets

In [None]:
# Directory holding the skimmed QCD flat trees used for training.
# Edit this single line to run the notebook from another location.
QCD_DIR = "/Users/Moortgat/Documents/VUB/PhD/CharmTagging/FlatTrees_SL_7_5_1/QCD"

# One file per vertex category (NoVertex / PseudoVertex / RecoVertex).
signal_files = [  # C = charm
    QCD_DIR + "/skimmed_20k_eachptetabin_CombinedSVNoVertex_C.root",
    QCD_DIR + "/skimmed_20k_eachptetabin_CombinedSVPseudoVertex_C.root",
    QCD_DIR + "/skimmed_20k_eachptetabin_CombinedSVRecoVertex_C.root",
]
bckgr_files = [  # DUSG = light
    QCD_DIR + "/skimmed_20k_eachptetabin_CombinedSVNoVertex_DUSG.root",
    QCD_DIR + "/skimmed_20k_eachptetabin_CombinedSVPseudoVertex_DUSG.root",
    QCD_DIR + "/skimmed_20k_eachptetabin_CombinedSVRecoVertex_DUSG.root",
]

In [None]:
print('Merging and converting the samples')
# NOTE: despite their names, these two values are forwarded to root2array as
# `stop` and `step`: `stop` is the entry index at which reading ends
# (None = no limit) and `step` keeps only every N-th entry of each tree.
nfiles_per_sample = None
skip_n_events = 300

import root_numpy as rootnp

# root2array is called with keyword arguments on purpose: its positional
# parameter list is not the same in every root_numpy release (some have an
# extra `object_selection` parameter after `selection`), so positional
# start/stop/step values can silently bind to the wrong parameters.
# Start from empty float arrays so the merged result is always float.
signal_merged = np.ndarray((0, len(variables)), float)
bckgr_merged = np.ndarray((0, len(variables)), float)
for f_sig in signal_files:
    signal = rootnp.root2array(f_sig, treename='tree', branches=variables,
                               start=0, stop=nfiles_per_sample, step=skip_n_events)
    # Structured (record) array -> plain 2-D array, one column per variable.
    signal = rootnp.rec2array(signal)
    signal_merged = np.concatenate((signal_merged, signal), 0)
for f_bck in bckgr_files:
    bckgr = rootnp.root2array(f_bck, treename='tree', branches=variables,
                              start=0, stop=nfiles_per_sample, step=skip_n_events)
    bckgr = rootnp.rec2array(bckgr)
    bckgr_merged = np.concatenate((bckgr_merged, bckgr), 0)

In [None]:
# Stack signal rows on top of background rows; label signal 1 and background 0.
X = np.concatenate((signal_merged, bckgr_merged))
y = np.concatenate((np.ones(signal_merged.shape[0]), np.zeros(bckgr_merged.shape[0])))
# Peek at the first two feature rows only (never dump the full array).
print(X[:2])
# The first row is signal and the last row is background by construction.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('signal: {0} bckgr: {1}'.format(y[0], y[-1]))

## Apply weights from a branch called 'weight'

In [None]:
print('Getting event weights from the trees')
# Read the 'weight' branch with the same start/stop/step as the feature
# arrays and in the same file order (signal first, then background), so that
# weights[i] belongs to row i of X.
# Keyword arguments protect against differences in root2array's positional
# parameter order between root_numpy releases.
weights = np.ones(0)  # empty float array to append to
for f_sig in signal_files:
    weights_sig = rootnp.root2array(f_sig, treename='tree', branches='weight',
                                    start=0, stop=nfiles_per_sample, step=skip_n_events)
    weights = np.concatenate((weights, weights_sig), 0)
for f_bck in bckgr_files:
    weights_bckgr = rootnp.root2array(f_bck, treename='tree', branches='weight',
                                      start=0, stop=nfiles_per_sample, step=skip_n_events)
    weights = np.concatenate((weights, weights_bckgr), 0)

# Fails loudly if this cell was run with a different stop/step than the one
# used to build X (e.g. after running cells out of order).
assert weights.shape[0] == X.shape[0], 'weights and X have different numbers of events'

## Step 1: Feature (variable) selection

In [None]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Random forest with 10 trees, at least 10 events required to split a node,
# running as a single job (n_jobs=1).
# random_state is fixed so that the feature ranking is reproducible from one
# run of the notebook to the next.
clf = RandomForestClassifier(n_estimators=10, min_samples_split=10, n_jobs=1,
                             verbose=0, random_state=42)
# Recursive feature elimination: keep the 20 top-ranked features.
feature_selector = RFE(clf, n_features_to_select=20, verbose=1)

In [None]:
# Run the recursive feature elimination and report the wall-clock time.
# Note that this fit uses X and y only; no event weights are passed.
start = time.time()
feature_selector.fit(X, y)
end = time.time()
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('training completed --> Elapsed time:  {0} minutes'.format((end - start) / 60))

In [None]:
# print the final set of parameters
print 'variables = ['
for idx,ft in enumerate(feature_selector.get_support()):
	if ft:
		print '\t\''+variables[idx]+'\','
print ']'		

In [None]:
# Reduced set of 20 input features used for the final training. The layout
# matches what the feature-selection printout produces; this replaces the
# full `variables` list from here on.
variables = [
    # Displaced tracks
    "trackSip2dSig_0", "trackSip2dSig_1",
    "trackSip3dSig_0", "trackSip3dSig_1",
    "trackPtRel_0",
    "trackPPar_0", "trackPPar_1",
    "trackDeltaR_0", "trackDeltaR_1",
    "trackPtRatio_0",
    "trackJetDist_0", "trackJetDist_1",
    "trackDecayLenVal_0", "trackDecayLenVal_1",
    "trackSip2dSigAboveCharm_0",
    "trackSip3dSigAboveCharm_0",
    "trackSumJetEtRatio",
    "trackSumJetDeltaR",
    # Secondary vertex
    "flightDistance3dSig_0",
    "vertexBoostOverSqrtJetPt_0",
]

In [None]:
print('Merging and converting the samples')
# NOTE: despite their names, these two values are forwarded to root2array as
# `stop` and `step`: `stop` is the entry index at which reading ends
# (None = no limit) and `step` keeps only every N-th entry of each tree.
# They must stay identical to the values used when the event weights were
# read, otherwise `weights` no longer lines up with X.
nfiles_per_sample = None
skip_n_events = 300

import root_numpy as rootnp

# Re-read the same trees, this time with the reduced `variables` list only.
# root2array is called with keyword arguments on purpose: its positional
# parameter list is not the same in every root_numpy release (some have an
# extra `object_selection` parameter after `selection`).
signal_merged = np.ndarray((0, len(variables)), float)
bckgr_merged = np.ndarray((0, len(variables)), float)
for f_sig in signal_files:
    signal = rootnp.root2array(f_sig, treename='tree', branches=variables,
                               start=0, stop=nfiles_per_sample, step=skip_n_events)
    # Structured (record) array -> plain 2-D array, one column per variable.
    signal = rootnp.rec2array(signal)
    signal_merged = np.concatenate((signal_merged, signal), 0)
for f_bck in bckgr_files:
    bckgr = rootnp.root2array(f_bck, treename='tree', branches=variables,
                              start=0, stop=nfiles_per_sample, step=skip_n_events)
    bckgr = rootnp.rec2array(bckgr)
    bckgr_merged = np.concatenate((bckgr_merged, bckgr), 0)

# Signal rows first (label 1), background rows after (label 0).
X = np.concatenate((signal_merged, bckgr_merged))
y = np.concatenate((np.ones(signal_merged.shape[0]), np.zeros(bckgr_merged.shape[0])))

# Now train a larger random forest using only the selected variables

In [None]:
# The per-event weights were read in an earlier cell; make sure they still
# line up row-by-row with the feature matrix before using them.
assert weights.shape[0] == X.shape[0], 'weights and X have different numbers of events'

# Final classifier: 100 trees, at least 50 events required to split a node.
# random_state is fixed so the training is reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, min_samples_split=50, n_jobs=1,
                             verbose=3, random_state=42)
# Pass the weights by keyword so it is explicit that they are sample weights.
clf.fit(X, y, sample_weight=weights)

# Run validation on a different set of samples

In [None]:
# Directory holding the TTbar flat trees used for validation.
# Edit this single line to run the notebook from another location.
TTBAR_DIR = "/Users/Moortgat/Documents/VUB/PhD/CharmTagging/FlatTrees_SL_7_5_1/TTbar"

val_signal_files = [
    TTBAR_DIR + "/CombinedSVNoVertex_C.root",
    TTBAR_DIR + "/CombinedSVPseudoVertex_C.root",
    TTBAR_DIR + "/CombinedSVRecoVertex_C.root",
]
val_bckgr_files = [
    TTBAR_DIR + "/CombinedSVNoVertex_DUSG.root",
    TTBAR_DIR + "/CombinedSVPseudoVertex_DUSG.root",
    TTBAR_DIR + "/CombinedSVRecoVertex_DUSG.root",
]


print('Starting validation')
# Keep every 10th entry of the validation trees. This uses its own name
# instead of overwriting `skip_n_events`, so re-running the training or weight
# cells after this one still reads the training trees with the original step.
val_skip_n_events = 10

# root2array is called with keyword arguments on purpose: its positional
# parameter list is not the same in every root_numpy release (some have an
# extra `object_selection` parameter after `selection`).
val_signal_merged = np.ndarray((0, len(variables)), float)
val_bckgr_merged = np.ndarray((0, len(variables)), float)
for f_sig in val_signal_files:
    val_signal = rootnp.root2array(f_sig, treename='tree', branches=variables,
                                   start=0, stop=nfiles_per_sample, step=val_skip_n_events)
    # Structured (record) array -> plain 2-D array, one column per variable.
    val_signal = rootnp.rec2array(val_signal)
    val_signal_merged = np.concatenate((val_signal_merged, val_signal), 0)
for f_bck in val_bckgr_files:
    val_bckgr = rootnp.root2array(f_bck, treename='tree', branches=variables,
                                  start=0, stop=nfiles_per_sample, step=val_skip_n_events)
    val_bckgr = rootnp.rec2array(val_bckgr)
    val_bckgr_merged = np.concatenate((val_bckgr_merged, val_bckgr), 0)

# Signal rows first (label 1), background rows after (label 0).
X_val = np.concatenate((val_signal_merged, val_bckgr_merged))
y_val = np.concatenate((np.ones(val_signal_merged.shape[0]), np.zeros(val_bckgr_merged.shape[0])))

In [None]:
from sklearn.metrics import roc_curve

# ROC curve on the validation sample, scored with the predicted probability
# of the signal class (column 1 of predict_proba).
fpr, tpr, thresholds = roc_curve(y_val, clf.predict_proba(X_val)[:, 1])
# Take the number of trees from the classifier itself so the legend cannot
# drift away from the actual training configuration.
plt.semilogy(tpr, fpr, label='RFC, ntrees = {0}'.format(clf.n_estimators))
plt.ylabel("Light Efficiency")   # background (DUSG) efficiency = false positive rate
plt.xlabel("Charm Efficiency")   # signal (C) efficiency = true positive rate
plt.legend(loc='best')
plt.grid(True)
plt.show()