In [1]:
from __future__ import print_function
from glob import glob
from ROOT import vector, RDataFrame, RDF, TFile, TH1F, TH2F, gInterpreter, TMath
import ROOT
import sys
import yaml
from bff_processor.cpp_function import def_cpp
from bff_processor.utils import toVector, get_nEvents
import pandas as pd
from time import perf_counter
import uproot
from pathlib import Path

Welcome to JupyROOT 6.22/09


In [2]:
from src.RDF_tools.df_definitions import *

In [3]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before executing code),
# so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Compile the C++ helper functions for the dataframe (def_cpp is imported from
# bff_processor.cpp_function). Presumably this must run before any Define/Filter
# expression that calls those helpers -- TODO confirm.
def_cpp()

True

In [5]:
# Multithreading setup: set the flag to True to turn on ROOT's implicit multithreading.
multiThreading = False
if multiThreading:
    ROOT.ROOT.EnableImplicitMT()
# Alias used further down when the dataframes are built.
RDFrame = RDataFrame

In [6]:
# Set up metadata for files: the era selects the sample list (yml) and the
# b-tag discriminator threshold used later on.
era = '2016'
deepflavour = False
deepcsv = True
extras = False
csv_v_flavor = False

# One entry per supported era: (sample list file, b-tag discriminator value).
ERA_SETTINGS = {
    "2016": ("samplesCR_2016_Apr2020.yml", 0.3093),
    "2017": ("samplesCR_2017_Apr2020.yml", 0.3033),
    "2018": ("samplesCR_2018_Apr2020.yml", 0.2770),
}
# Fail early with a clear message instead of a NameError on `fname` further down.
if era not in ERA_SETTINGS:
    raise ValueError("Unsupported era {!r}; expected one of {}".format(era, sorted(ERA_SETTINGS)))
fname, bDiscValue = ERA_SETTINGS[era]

sampleDir = "samples"
# The output ROOT file shares the yml base name; the yml itself lives in sampleDir.
outname = fname.replace('.yml','.root')
fname = "{}/{}".format(sampleDir, fname)
outname, fname

('samplesCR_2016_Apr2020.root', 'samples/samplesCR_2016_Apr2020.yml')

In [7]:
# make_columns and the initial columns_data are not defined in this notebook; presumably
# both come from this star import -- TODO confirm.
from src.assets.bff_skimmer_bffv2 import *
# Build the column lists for data and MC plus var_postfix (used below to build the MC
# region selection). Note that columns_data is both an input and rebound by this call.
columns_data, columns_mc, var_postfix = make_columns(era, columns_data)

In [8]:
# Path templates: the era is filled in here, while the doubled braces leave a '{}'
# placeholder that process_sample fills with the sample's 'fileglob' entry.
sample_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/{}/{{}}'.format(era)
eff_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/{}/{{}}'.format(era)

In [9]:
class sample_processor():
    '''Manage the samples listed in a yml config together with the ROOT output file.

    On construction the yml file is parsed, the output file is (re)created with one
    directory per sample, and a one-bin "lumi" histogram holding the configured
    luminosity is written to it.
    '''
    def __init__(self,file_name,outname,bDiscValue,is_inclusive=0):
        # read the sample configuration
        self.file_name = file_name
        with open(file_name,'r') as config_file:
            self.sample_dict = yaml.load(config_file, Loader=yaml.FullLoader)
        # (re)create the output file and give every sample its own directory
        self.outname = outname
        self.out = TFile(outname, 'recreate')
        self.outdirs_dict = {
            entry['name']: self.out.mkdir(entry['name'])
            for entry in self.samples()
        }
        # store the luminosity in a single-bin histogram inside the output file
        self.lumi = self.sample_dict['lumi']
        lumi_hist = TH1F("lumi", "lumi", 1, 0, 1)
        lumi_hist.SetDirectory(self.out)
        lumi_hist.SetBinContent(1, self.lumi)
        lumi_hist.Write()
        # remaining settings are only stored for later use by callers
        self.bDiscValue = bDiscValue
        self.is_inclusive = is_inclusive

    def samples(self):
        '''Return the list of sample entries from the config.'''
        return self.sample_dict['samples']

    def sample_names(self):
        '''Return the 'name' field of every sample entry, in config order.'''
        names = []
        for entry in self.samples():
            names.append(entry['name'])
        return names

    def close(self):
        '''Close the ROOT output file.'''
        self.out.Close()

    def __repr__(self):
        template = '''from {fn} to {on}\nlumi: {lumi}\nSamples {samples}'''
        return template.format(
            fn=self.file_name,
            on=self.outname,
            lumi=self.lumi,
            samples=self.sample_names(),
        )

In [10]:
# Create the sample manager for the configured yml file. Note that the constructor
# opens the output ROOT file in 'recreate' mode, so an existing file is overwritten.
sp = sample_processor(fname, outname, bDiscValue)
print(sp)

from samples/samplesCR_2016_Apr2020.yml to samplesCR_2016_Apr2020.root
lumi: 35.5
Samples ['y3_250_deepflavour_bff', 'y3_400_deepflavour_bff', 'y3_1000_deepflavour_bff', 'y3_700_deepflavour_bff', 'ST_t-channel_top_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1', 'ST_s-channel_4f_leptonDecays_13TeV-amcatnlo-pythia8_TuneCUETP8M1', 'WWZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'ZZZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'WZZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'WWW_4F_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'WJetsToQQ_HT-800toInf_qc19_3j_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'WJetsToQQ_HT-600ToInf_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'WJetsToLNu_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'TTWJetsToQQ_TuneCUETP8M1_13TeV-amcatnloFXFX-madspin-pythia8', 'TTWJetsToLNu_TuneCUETP8M1_13TeV-amcatnloFXFX-madspin-pythia8', 'TTZToQQ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'TTZToLL_M-1to10_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'TTZToLLNuNu_M-10_TuneCUETP8M1_13TeV-amcatnlo-pythia8', '

In [11]:
# Build one long "A or B or ..." expression over all region flags, used to keep only
# events that fall in at least one region.
rs = ["CR10", "CR11", "CR12", "CR13", "CR14", "CR20", "CR21", "CR22", "CR23", "CR24", "SR1", "SR2"]
JERC_var = ['jet_nom_muon_corrected_pt_ele_pt']

# Data only has the nominal flags ("<region>_<variation>").
data_terms = ["{}_{}".format(r, jv) for jv in JERC_var for r in rs]
# MC checks the flags for every entry in var_postfix first (the postfix is appended
# directly to the region name), followed by the nominal flags.
mc_terms = ["{}{}".format(r, jv) for jv in var_postfix for r in rs] + data_terms

# join() avoids the dangling separator that manual trimming of the last " or " left behind.
data_region = " or ".join(data_terms)
mc_region = " or ".join(mc_terms)
mc_region

'CR10_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR11_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR12_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR13_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR14_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR20_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR21_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR22_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR23_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR24_jet_jesTotalUp_muon_corrected_pt_ele_pt or SR1_jet_jesTotalUp_muon_corrected_pt_ele_pt or SR2_jet_jesTotalUp_muon_corrected_pt_ele_pt or CR10_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR11_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR12_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR13_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR14_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR20_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR21_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR22_jet_jesTotalDown_muon_corrected_pt_ele_pt or CR23_jet_jesTotalDown_muo

In [12]:
# List the contents of the crab_bff_eff directory of one 2016 DY sample on EOS.
!ls /eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/2016/DYJetsToLL_M-200to400_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8

DYJLL_M_200_400_deepflavour_bff_eff


In [13]:
def process_sample(sp,sample,era,verbose=1):
    '''Process one sample and write its selected events to a csv file.

    Parameters
    ----------
    sp : sample_processor
        Supplies lumi, bDiscValue and is_inclusive.
    sample : dict
        One entry of the yml sample list; the keys 'name', 'xsec', 'nevts',
        'ismc' and 'fileglob' are read.
    era : str
        Data-taking era; forwarded to the weight helpers and used in the csv name.
    verbose : int
        When truthy, print a one-line summary of the sample metadata.

    Returns
    -------
    tuple
        (sample name, pandas DataFrame of the selected columns, input directory path).
        The same DataFrame is also written to data/tw_<era>_<name>.csv.
    '''
    # get metadata
    name,xsec,nevts = sample['name'],sample['xsec'],sample['nevts']
    ismc,dirName = int(sample['ismc']),sample['fileglob']
    filePathName = sample_path.format(dirName)
    # make file list
    print("filePathName", filePathName)
    files = [str(p) for p in Path(filePathName).rglob('*.root')]
    # btag efficiency file list
    bTagEffGlobName = eff_path.format(dirName)
    # FIXME(review): the per-sample path above is immediately overridden, so every sample
    # (in every era) uses the 2016 DYJetsToLL_M-200to400 efficiency files. Kept as-is to
    # preserve current results; confirm whether this is intended.
    bTagEffGlobName = '/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/2016/DYJetsToLL_M-200to400_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8'
    eff_paths = list(Path(bTagEffGlobName).rglob('*.root'))+ list(Path(bTagEffGlobName+"_eff").rglob('*.root'))
    list_effs = [str(p) for p in eff_paths]
    # get n events from skim if 0
    if not nevts:
        nevts = get_nEvents(files)
        print(nevts)
    sample_weight = float(xsec)*sp.lumi/float(nevts)
    print(float(xsec), sp.lumi, float(nevts), sample_weight)
    if verbose: print("name: {} , xsec: {}, nevents: {} ismc: {}".format(name,xsec,nevts,ismc))
    # set up btagging and puid sf files; the return values are not used here, the call
    # is kept for whatever setup it performs
    bTagFile, PUIDSFfile = setup_btag_puid(ismc, era, list_effs)
    #make rdf
    df = RDFrame('Events', files)
    ##
    ## filter
    ##
    df = df.Filter("DiLepMass_jet_nom_muon_corrected_pt_ele_pt>105", "mass_cut")
    print("ismc", ismc)
    if ismc:
        df = df.Filter(mc_region, "in_region")
    else:
        df = df.Filter(data_region, "in_region")
    df = df.Filter("Flag_METFilters==1", "METFilter")
    ##
    ## set up objects
    ##
    var_string = '_jet_nom_muon_corrected_pt_ele_pt'
    df = def_good_leptons(df, ismc, era, var_string)
    ##
    ## set weights
    ##
    df = def_HLT(df, ismc, era)
    df = bjet_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = pdf_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = fsr_isr_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = muon_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = electron_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = k_factor(df,ismc, sp.is_inclusive, name, sample_weight, era)
    # use the threshold carried by the sample manager rather than the notebook global
    df = PU_weight(df,ismc, sp.is_inclusive, name, sample_weight, era, sp.bDiscValue)
    df = finalize_weights(df,ismc, sp.is_inclusive, name, sample_weight, era)
    if ismc:
        df = df.Define("JER", "SysPercPerObj(Jet_pt_nom, Jet_pt_jerUp, Jet_pt_jerDown, 0)")
        df = df.Define("AvgJER", "clip(CalcAverage(JER))")
        df = df.Define("JES", "SysPercPerObj(Jet_pt_nom, Jet_pt_jesTotalUp, Jet_pt_jesTotalDown, 0)")
        df = df.Define("AvgJES", "CalcAverage(JES)")
        df = df.Define("HEM", "SysPercPerObj(Jet_pt_nom, Jet_pt_jesHEMIssueUp, Jet_pt_jesHEMIssueDown, 0)")
        df = df.Define("AvgHEM", "CalcAverage(HEM)")
    else:
        # data carries no jet systematics: define the same columns as constants so the
        # columns requested below exist for data as well
        df = df.Define("JER", "1.")
        df = df.Define("AvgJER", "1.")
        df = df.Define("JES", "1.")
        df = df.Define("AvgJES", "1.")
        df = df.Define("HEM", "1.")
        df = df.Define("AvgHEM", "1.")
    # Copy the base column list: extending it in place would grow the shared
    # columns_mc / columns_data lists on every call.
    base_columns = list(columns_mc if ismc else columns_data)
    extra_columns = ['PUIDWeight','PUIDWeightUp','PUIDWeightDown',
                     'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown',
                     'genWeight',
                     'k_factor',
                     'puWeight',
                     'MuonSFweight',
                     'ElectronSFweight',
                     'TriggerWeight',
                     'AvgMuonRecoIdIsoSFPerMuon',
                     'AvgMuonRocPer',
                     'AvgJER', 'AvgJES', 'AvgHEM',
                     'AvgPUIDWeightsPerJet',
                     'AvgBtagWeight',
                     ]
    # drop repeated names while keeping first-occurrence order
    lcolumn = list(dict.fromkeys(base_columns + extra_columns))
    df_np = df.AsNumpy(lcolumn)
    df_df = pd.DataFrame(df_np)
    # make sure the output directory exists before writing
    Path('data').mkdir(parents=True, exist_ok=True)
    df_df.to_csv('data/tw_{}_{}.csv'.format(era,name))
    return name,df_df,filePathName

In [None]:
# Run process_sample over the configured samples and report the wall time of each.
# To restrict the run further, add more name-based `continue` filters below.
try:
    for sample in sp.samples():
        name = sample['name']
        # skip samples whose name contains 'y3' or 'BFF' (presumably the signal samples)
        if 'y3' in name or 'BFF' in name:
            continue
        print(name)
        start_time = perf_counter()
        name,df,fileglob = process_sample(sp,sample,era, verbose=1)
        end_time = perf_counter()
        print(name,end_time-start_time)
finally:
    # always close the ROOT output file, even when a sample fails part-way through
    sp.close()

ST_t-channel_top_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1
filePathName /eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/2016/ST_t-channel_top_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1
116806840.0
113300.0 35.5 116806840.0 0.03443419923011358
name: ST_t-channel_top_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1 , xsec: 113300, nevents: 116806840.0 ismc: 1
ismc 1
ST_t-channel_top_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1 63.872497729957104
ST_s-channel_4f_leptonDecays_13TeV-amcatnlo-pythia8_TuneCUETP8M1
filePathName /eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/2016/ST_s-channel_4f_leptonDecays_13TeV-amcatnlo-pythia8_TuneCUETP8M1
1245980.0
3365.0 35.5 1245980.0 0.09587433185123356
name: ST_s-channel_4f_leptonDecays_13TeV-amcatnlo-pythia8_TuneCUETP8M1 , xsec: 3365, nevents: 1245980.0 ismc: 1
ismc 1
ST_s-channel_4f_leptonDecays_13TeV-amcatnlo-pythia8_TuneCUETP8M1 10.

In [15]:
import pandas as pd

In [16]:
# Load a previously written csv for inspection.
# NOTE(review): this is a 2017 'BFF' file, while era is set to '2016' at the top of the
# notebook and the sample loop skips names containing 'BFF'; the file must come from an
# earlier run with different settings.
path = 'data/tw_2017_BFFZprimeToMuMu_M_750_dbs0p04.csv'
df = pd.read_csv(path)

In [17]:
# print every column name stored in the csv, one per line
for column_name in df.columns:
    print(column_name)

Unnamed: 0
Weight_PuUp
Weight_PuDown
Weight_BTagUp
Weight_BTagDown
Weight_PUIDUp
Weight_PUIDDown
Weight_PDF_Up
Weight_PDF_Down
Weight_ISRFSR_Up
Weight_ISRFSR_Down
Weight_MuonSFUp
Weight_MuonSFDown
Weight_ElectronSFUp
Weight_ElectronSFDown
Weight
sample_weight
TriggerWeight
Flag_goodVertices
Flag_globalSuperTightHalo2016Filter
Flag_HBHENoiseFilter
Flag_HBHENoiseIsoFilter
Flag_EcalDeadCellTriggerPrimitiveFilter
Flag_BadPFMuonFilter
Flag_eeBadScFilter
Flag_METFilters
minGoodJetElDR_jet_nom_muon_corrected_pt_ele_pt
minGoodJetMuDR_jet_nom_muon_corrected_pt_ele_pt
DiLepMass_jet_nom_muon_corrected_pt_ele_pt
HTLT_jet_nom_muon_corrected_pt_ele_pt
RelMET_jet_nom_muon_corrected_pt_ele_pt
TMB_jet_nom_muon_corrected_pt_ele_pt
TMBMin_jet_nom_muon_corrected_pt_ele_pt
TMBMax_jet_nom_muon_corrected_pt_ele_pt
SR2_jet_nom_muon_corrected_pt_ele_pt
SR1_jet_nom_muon_corrected_pt_ele_pt
CR10_jet_nom_muon_corrected_pt_ele_pt
CR11_jet_nom_muon_corrected_pt_ele_pt
CR12_jet_nom_muon_corrected_pt_ele_pt
CR13_jet_

In [18]:
# Mean of the Weight_MuonTriggerUp column, multiplied by 100.
df.Weight_MuonTriggerUp.mean()*100

0.01304017195398331

In [19]:
# Mean of the Weight_BTagUp column, multiplied by 100.
df.Weight_BTagUp.mean()*100

0.20212692828851295

In [20]:
import uproot as upr

In [21]:
# A single skimmed 2017 ROOT file, used below to inspect branches directly.
skim_path = '/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/2017/BFF_175_dbs0p5_deepflavour_bffv2/221019_071128/0000/tree_1.root'

In [22]:
# Open the skim file with uproot (imported as upr) and take its 'Events' object.
upf = upr.open(skim_path)['Events']

In [23]:
# Read the three Muon_effSF trigger branches as a pandas object and show the mean of each.
upf.arrays(['Muon_effSF_sys_triggerUp',
           'Muon_effSF_trigger',
           'Muon_effSF_sys_triggerDown'], library='pd').mean()

Muon_effSF_sys_triggerUp      0.003894
Muon_effSF_trigger            0.981585
Muon_effSF_sys_triggerDown    0.003894
dtype: float32

In [24]:
import pandas as pd

In [25]:
# NOTE(review): the recorded run of this cell raised FileNotFoundError. The sample loop
# in this notebook skips names containing 'BFF', so it does not write this csv as shown.
df = pd.read_csv('data/tw_2016_BFFZprimeToMuMu_M_250_dbs0p04.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/tw_2016_BFFZprimeToMuMu_M_250_dbs0p04.csv'

In [None]:
# Column names containing 'HLT', and column names containing 'SR'.
[x for x in df.keys() if 'HLT' in x], [x for x in df.keys() if 'SR' in x]

In [None]:
# Sum of the SR2 column, and sum of the SR2 column multiplied by HLT_Mu50.
df.SR2_jet_nom_muon_corrected_pt_ele_pt.sum(), (df.SR2_jet_nom_muon_corrected_pt_ele_pt*df.HLT_Mu50).sum()

In [None]:
# Mean of each selected column, multiplied by 100.
df[[
    'MuonTriggerEff',
    'Weight_MuonTriggerUp',
    'Weight_MuonTriggerDown',
    'AvgMuonRecoIdIsoSFPerMuon',
    'AvgMuonRocPer',
    'AvgJER',
    'AvgJES',
    'AvgHEM',
    'AvgPUIDWeightsPerJet',
    'AvgBtagWeight',
]].mean()*100

In [None]:
# Mean of the AvgJER column.
df.AvgJER.mean()

In [None]:
from src.data_tools.get_file_list import get_file_df


In [None]:
# Select the csv files of one era from the file table.
# A separate name is used for the integer era so the string `era` configured at the top
# of the notebook (used by the sample loop) is not rebound to a different type.
file_era = 2016
file_df = get_file_df()
# chained reset_index() instead of inplace=True on a filtered frame
file_df = file_df[file_df.era == file_era].reset_index()

In [None]:
# Columns whose per-file means (times 100) are collected.
summary_columns = [
    'MuonTriggerEff',
    'Weight_MuonTriggerUp',
    'Weight_MuonTriggerDown',
    'AvgMuonRecoIdIsoSFPerMuon',
    'AvgMuonRocPer',
    'AvgJER',
    'AvgJES',
    'AvgHEM',
    'AvgPUIDWeightsPerJet',
    'AvgBtagWeight',
]
means_list = []
for _, row in file_df.iterrows():
    print(row.file)
    sample_df = pd.read_csv(row.file)
    # keep only rows with AvgJER below 5
    sample_df = sample_df[sample_df.AvgJER<5]
    record = (sample_df[summary_columns].mean()*100).to_dict()
    # tag the record with the mass and dbs values of its file
    record['mass'] = row.mass
    record['dbs'] = row.dbs
    means_list.append(record)

In [None]:
# One row per csv file, built from the dicts collected in means_list.
mean_df = pd.DataFrame(means_list)

In [None]:
# Column-wise minimum over all files, rounded to one decimal.
mean_df.min().round(1)

In [None]:
# Column-wise maximum over all files, rounded to one decimal.
mean_df.max().round(1)

In [None]:
import numpy as np

In [None]:
# Show the AvgJER column of the per-file summary.
mean_df.AvgJER