In [None]:
from __future__ import print_function
from glob import glob
from ROOT import vector, RDataFrame, RDF, TFile, TH1F, TH2F, gInterpreter, TMath
import ROOT
import sys
import yaml
from bff_processor.cpp_function import def_cpp
from bff_processor.utils import toVector, get_nEvents
from bff_processor.df_definitions import *
import pandas as pd
from time import perf_counter
import uproot
from pathlib import Path

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Compile functions in C++ for the dataframe.
# def_cpp is imported from bff_processor.cpp_function; presumably it declares the
# helpers (e.g. max_vec) that later Define/Filter strings call - TODO confirm.
def_cpp()

In [None]:
# Multithreading switch: set to True to enable ROOT's implicit multithreading.
multiThreading = False
if multiThreading:
    ROOT.ROOT.EnableImplicitMT()
# Alias under which process_sample constructs its dataframes.
RDFrame = RDataFrame

In [None]:
# Run configuration: data-taking era plus option flags.
era = '2016'
deepflavour = False
deepcsv = True
extras = False
csv_v_flavor=False

# Per-era sample list (YAML file name) and b-tag discriminator threshold.
# NOTE(review): deepcsv is set above, but these thresholds look like DeepJet/
# DeepFlavour medium working points - confirm they match the tagger in use.
ERA_CONFIG = {
    "2016": ("samplesCR_2016_Apr2020.yml", 0.3093),
    "2017": ("samplesCR_2017_Apr2020.yml", 0.3033),
    "2018": ("samplesCR_2018_Apr2020.yml", 0.2770),
}
# Fail with a clear message instead of a NameError further down when the era
# is not one of the configured ones.
if era not in ERA_CONFIG:
    raise ValueError("Unsupported era {!r}; expected one of {}".format(era, sorted(ERA_CONFIG)))
fname, bDiscValue = ERA_CONFIG[era]

# The output ROOT file is named after the sample list; the list itself lives
# in sampleDir.
sampleDir = "samples"
outname = fname.replace('.yml','.root')
fname = "{}/{}".format(sampleDir, fname)
outname, fname

In [None]:
# Wildcard import of the skimmer helpers. Presumably this is what provides
# `columns`, `columns_data`, `make_columns` and `setup_btag_puid` used in this
# notebook - TODO confirm (they could also come from bff_processor.df_definitions).
from src.assets.bff_skimmer import *
# Derive the era-specific column list from the imported `columns`.
columns = make_columns(era, columns)

In [None]:
# Path templates for the skimmed samples and for the b-tag efficiency files.
# The era is filled in here; the escaped `{{}}` leaves one `{}` placeholder that
# process_sample fills with each sample's `fileglob` directory name.
sample_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff/{}/{{}}'.format(era)
eff_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/{}/{{}}'.format(era)

In [None]:
class sample_processor():
    '''Load a YAML sample configuration and manage the output ROOT file.

    The configuration must provide a ``samples`` list (each entry having at
    least a ``name``) and a ``lumi`` value. On construction the output file is
    (re)created, one directory per sample is made in it, and the luminosity is
    written to a one-bin histogram called ``lumi``.

    Instances can be used as context managers so that the output file is
    closed even if processing raises::

        with sample_processor(fname, outname, bDiscValue) as sp:
            ...

    Parameters
    ----------
    file_name : path of the YAML configuration file.
    outname : path of the ROOT file to (re)create.
    bDiscValue : b-tag discriminator threshold, stored as ``self.bDiscValue``.
    is_inclusive : flag stored as ``self.is_inclusive`` (default 0).

    Raises
    ------
    KeyError
        If the configuration lacks ``lumi`` or ``samples`` (or a sample lacks
        ``name``). This is raised before the output file is touched.
    '''
    def __init__(self,file_name,outname,bDiscValue,is_inclusive=0):
        #load config
        self.file_name = file_name
        with open(file_name,'r') as f:
            # NOTE(review): yaml.safe_load is the recommended loader for files
            # that are not fully trusted; FullLoader is kept to preserve behaviour.
            self.sample_dict = yaml.load(f, Loader=yaml.FullLoader)
        # Read every required key BEFORE opening the output file: 'recreate'
        # truncates an existing file, so a malformed config must fail first.
        self.lumi = self.sample_dict['lumi']
        names = self.sample_names()
        self.bDiscValue = bDiscValue
        self.is_inclusive = is_inclusive
        #setup outfile
        self.outname = outname
        self.out = TFile(outname, 'recreate')
        # one output directory per sample, looked up by sample name
        self.outdirs_dict = {}
        for name in names:
            self.outdirs_dict[name] = self.out.mkdir(name)
        #write lumi info as a single-bin histogram
        hlumi = TH1F("lumi", "lumi", 1, 0, 1)
        hlumi.SetDirectory(self.out)
        hlumi.SetBinContent(1, self.lumi)
        hlumi.Write()
    def __enter__(self):
        '''Return the instance itself for use in a ``with`` statement.'''
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        '''Close the output file; exceptions from the ``with`` body propagate.'''
        self.close()
    def samples(self):
        '''Return the list of sample entries from the configuration.'''
        return self.sample_dict['samples']
    def sample_names(self):
        '''Return the ``name`` of every configured sample, in config order.'''
        return [s['name'] for s in self.samples()]
    def close(self):
        '''Close the output ROOT file.'''
        self.out.Close()
    def __repr__(self):
        text_dict = {"fn":self.file_name,
                     "on":self.outname,
                     "lumi":self.lumi,
                     "samples": self.sample_names()}
        return '''from {fn} to {on}\nlumi: {lumi}\nSamples {samples}'''.format(**text_dict)

In [None]:
# Create the sample manager instance. Note that this (re)creates the output
# ROOT file `outname`, replacing any existing file of that name.
sp = sample_processor(fname, outname, bDiscValue)
# Show the config path, output path, lumi and sample names.
print(sp)

In [None]:
# Build the filter expressions that keep events present in at least one region:
# every "<region>_<variation>" flag, OR-ed together.
def _region_filter_expr(regions, variations):
    '''Return all "<region>_<variation>" flags joined with " or ".

    Flags are ordered variation by variation, each listing every region.
    '''
    return " or ".join("{}_{}".format(r, jv) for jv in variations for r in regions)

rs = ["CR10", "CR11", "CR12", "CR13", "CR14", "CR20", "CR21", "CR22", "CR23", "CR24", "SR1", "SR2"]
# MC is tested against the nominal regions and every jet energy variation.
JERC_var = ['nom','jerUp','jerDown','jesTotalUp','jesTotalDown']
if era=="2018":
    JERC_var+= ['jesHEMIssueUp', 'jesHEMIssueDown']
mc_region = _region_filter_expr(rs, JERC_var)
# Data is tested against the nominal regions only. JERC_var is rebound here on
# purpose so that its value after this cell is still ['nom'].
JERC_var = ['nom']
data_region = _region_filter_expr(rs, JERC_var)
mc_region

In [None]:
def process_sample(sp,sample,era,verbose=1):
    '''Process one sample and write its selected events to CSV and ROOT files.

    Parameters
    ----------
    sp : sample_processor
        Supplies ``lumi``, ``bDiscValue`` and ``is_inclusive``.
    sample : dict
        One entry of the sample list; the keys ``name``, ``xsec``, ``nevts``,
        ``ismc`` and ``fileglob`` are read.
    era : str
        Passed to the helper functions and used in the output file names.
    verbose : int
        When truthy, print the sample metadata and the normalisation inputs.

    Returns
    -------
    tuple
        ``(name, df, filePathName)``: the sample name, the RDataFrame node
        after all filters and definitions, and the input directory.

    Raises
    ------
    FileNotFoundError
        If no ``*.root`` input file is found for the sample.

    Notes
    -----
    Reads the notebook-level names ``sample_path``, ``eff_path``,
    ``mc_region``, ``data_region``, ``columns`` and ``columns_data``.
    Outputs are written to ``data/tw_<era>_<name>.csv`` and ``.root``.
    '''
    #get metadata
    name,xsec,nevts = sample['name'],sample['xsec'],sample['nevts']
    ismc,dirName = int(sample['ismc']),sample['fileglob']
    filePathName = sample_path.format(dirName)
    #make file list (recursive search for ROOT files)
    files = [str(p) for p in Path(filePathName).rglob('*.root')]
    if not files:
        # fail early with a clear message instead of a downstream ROOT error
        raise FileNotFoundError(
            "no .root files found under {} for sample {}".format(filePathName, name))
    #btageff file list: both "<dir>" and "<dir>_eff" are searched
    bTagEffGlobName = eff_path.format(dirName)
    eff_dirs = (bTagEffGlobName, bTagEffGlobName+"_eff")
    list_effs = [str(p) for d in eff_dirs for p in Path(d).rglob('*.root')]
    # make sure the output directory exists before the expensive event loops
    Path('data').mkdir(parents=True, exist_ok=True)
    # get n events from skim if 0
    if not nevts:
        nevts = get_nEvents(files)
        if verbose: print(nevts)
    # per-event normalisation: cross section * luminosity / number of events
    sample_weight = float(xsec)*sp.lumi/float(nevts)
    if verbose:
        print(float(xsec), sp.lumi, float(nevts), sample_weight)
        print("name: {} , xsec: {}, nevents: {} ismc: {}".format(name,xsec,nevts,ismc))
    #set up btagging and puid sf files
    # The returned values are not used below; the call is kept because it
    # presumably registers the scale-factor inputs as a side effect - TODO confirm.
    bTagFile, PUIDSFfile = setup_btag_puid(ismc, era, list_effs)
    #make rdf
    df = RDFrame('Events', files)
    #some filters
    df = df.Filter("DiLepMass>105", "mass_cut")
    # MC is kept if it is in any region for any variation, data only for nominal
    df = df.Filter(mc_region if ismc else data_region, "in_region")
    #met filter
    df = df.Filter("Flag_METFilters==1", "METFilter")

    df = def_good_leptons(df, ismc, era)
    # use the threshold carried by sp rather than a notebook-level global
    df = def_good_jet(df,ismc, sp.bDiscValue)
    #delta r
    df = df.Filter("minGoodJetMuDR>0.4", 'MuJDeltaR')
    df = df.Filter("minGoodJetElDR>0.4", 'ElJDeltaR')

    df = def_HLT(df, ismc, era)
    df = def_sf_and_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = def_lep_selections(df)
    # throw out events with more than 2 leptons
    df = df.Filter('nLowPtLep<3', 'lowPtLeptonFilter')
    # EE L1 prefiring test https://twiki.cern.ch/twiki/bin/viewauth/CMS/ExoPreapprovalChecklist
    # NOTE(review): only positive eta is selected here; confirm whether the
    # check was meant to apply to |eta|.
    df = df.Define("EE_L1_range", "GoodJetPt > 100 && GoodJetEta > 2.25 && GoodJetEta < 3.0")
    df = df.Define("EE_L1_prefire_test", "max_vec(EE_L1_range)!=0")

    # data uses its own column list
    lcolumns = columns if ismc else columns_data
    # Note: AsNumpy and Snapshot (not booked lazily) each run their own event loop.
    #save as CSV
    df_np = df.AsNumpy(lcolumns)
    df_df = pd.DataFrame(df_np)
    df_df.to_csv('data/tw_{}_{}.csv'.format(era,name))
    # reduced set of columns saved to the ROOT snapshot (each column listed once)
    hem_columns = ['GoodJetEta','GoodJetPhi','GoodJetPt','GoodBJet',
                   'GoodMuonPt','GoodMuonEta','GoodMuonPhi',
                   'GoodElePt','GoodEleEta','GoodElePhi',
                   'DiLepMass','HTLT_nom','RelMET_nom','TMB_nom',
                   'SR2_nom','SR1_nom',
                   'CR10_nom','CR11_nom','CR12_nom','CR13_nom','CR14_nom',
                   'CR20_nom','CR21_nom','CR22_nom','CR23_nom','CR24_nom',
                   'MET_phi','MET_pt',]
    if not ismc: hem_columns.append('run')
    df.Snapshot("Events", "data/tw_{}_{}.root".format(era,name), hem_columns)
    return name,df,filePathName

In [None]:
# Process every configured sample and report the time each one took. The
# output file is closed in `finally` so it is not left open when a sample fails.
try:
    for sample in sp.samples():
        name = sample['name']
        # Uncomment one of these filters to process only a subset of the samples.
        #if not 'ZToEE_M_800_1400_flavour' in name: continue
        #if not (('250' in name) or ('300' in name) or ('450' in name)): continue
        #if not 'data' in name: continue
        print(name)
        start_time = perf_counter()
        name,df,fileglob = process_sample(sp,sample,era, verbose=1)
        end_time = perf_counter()
        print(name,end_time-start_time)
finally:
    sp.close()