In [None]:
from __future__ import print_function
from glob import glob
from ROOT import vector, RDataFrame, RDF, TFile, TH1F, TH2F, gInterpreter, TMath
import ROOT
import sys
import yaml
from bff_processor.cpp_function import def_cpp
from bff_processor.utils import toVector, get_nEvents
from bff_processor.df_definitions import *
import pandas as pd
from time import perf_counter
import uproot

In [None]:
# Branch lists written out per sample: `columns` for simulation, `columns_data` for data.
# Both are assembled from the pieces below instead of being spelled out by hand.

# Systematic event weights: every source has an "Up" and a "Down" branch.
_weight_sources = ['Weight_Pu', 'Weight_BTag', 'Weight_PUID', 'Weight_PDF_ISRFSR_', 'Weight_MuonSF', 'Weight_ElectronSF']
_weight_systematics = [source + shift for source in _weight_sources for shift in ('Up', 'Down')]

# Branches that do not depend on the jet-energy variation (nominal weights, mass, filter flags).
_event_branches = [
    'Weight', 'sample_weight', 'DiLepMass', 'TriggerWeight',
    'Flag_goodVertices', 'Flag_globalSuperTightHalo2016Filter',
    'Flag_HBHENoiseFilter', 'Flag_HBHENoiseIsoFilter',
    'Flag_EcalDeadCellTriggerPrimitiveFilter', 'Flag_BadPFMuonFilter',
    'Flag_eeBadScFilter', 'Flag_METFilters',
]

# Branch stems that exist once per jet-energy variation.
# 'SR2' appears twice on purpose: the hand-written list had it twice and the order is kept as is.
_variation_stems = [
    'HTLT', 'RelMET', 'TMB', 'SR2', 'SR1',
    'CR10', 'CR11', 'CR12', 'CR13', 'CR14',
    'SR2',
    'CR20', 'CR21', 'CR22', 'CR23', 'CR24',
]


def _with_variation(variation):
    """Return every per-variation branch name for one jet-energy variation."""
    return ['{}_{}'.format(stem, variation) for stem in _variation_stems]


columns = list(_weight_systematics) + list(_event_branches)
for _variation in ('nom', 'jerDown', 'jerUp', 'jesTotalDown', 'jesTotalUp'):
    columns.extend(_with_variation(_variation))

# Data only carries the nominal variation and no systematic weights.
columns_data = list(_event_branches) + _with_variation('nom')

In [None]:
# Run the project's C++ setup helper once, before any dataframe is built.
# NOTE(review): presumably declares C++ helper functions to the ROOT interpreter —
# confirm in bff_processor.cpp_function.
def_cpp()

In [None]:
# Implicit multithreading is left switched off; uncomment the next line to enable it.
#ROOT.ROOT.EnableImplicitMT()
# Short alias for RDataFrame, used when the per-sample dataframe is constructed.
RDFrame = RDataFrame

In [None]:
# Data-taking era to process; must be one of the keys of ERA_CONFIG.
era = '2018'

# Per-era settings: (sample configuration file, b-discriminator value).
ERA_CONFIG = {
    "2016": ("samplesCR_2016_Apr2020.yml", 0.6321),
    "2017": ("samplesCR_2017_Apr2020.yml", 0.4941),
    "2018": ("samplesCR_2018_Apr2020.yml", 0.4184),
}

# Fail right here with a readable message instead of a NameError further down
# when the era is misspelled or not configured.
if era not in ERA_CONFIG:
    raise ValueError("unsupported era {!r}; expected one of {}".format(era, sorted(ERA_CONFIG)))

fname, bDiscValue = ERA_CONFIG[era]
# The output ROOT file is named after the configuration file.
outname = fname.replace('.yml','.root')
outname

In [None]:
class sample_processor():
    """Sample configuration plus the output ROOT file that belongs to it.

    On construction the YAML configuration is read, the output file is
    (re)created with one directory per configured sample, and a one-bin
    histogram named "lumi" holding the configured luminosity is written to it.
    """

    def __init__(self,file_name,outname,bDiscValue,is_inclusive=0):
        """Read ``file_name`` and prepare the output file ``outname``.

        ``bDiscValue`` and ``is_inclusive`` are only stored on the instance.
        """
        # --- configuration ---
        self.file_name = file_name
        # NOTE(review): yaml.FullLoader is not the restricted loader; consider
        # yaml.safe_load if the configuration can come from an untrusted source.
        with open(file_name,'r') as config_file:
            self.sample_dict = yaml.load(config_file, Loader=yaml.FullLoader)

        # --- output file with one directory per sample ---
        self.outname = outname
        self.out = TFile(outname, 'recreate')
        self.outdirs_dict = {
            entry['name']: self.out.mkdir(entry['name'])
            for entry in self.samples()
        }

        # --- luminosity, stored in the file as a single-bin histogram ---
        self.lumi = self.sample_dict['lumi']
        lumi_hist = TH1F("lumi", "lumi", 1, 0, 1)
        lumi_hist.SetDirectory(self.out)
        lumi_hist.SetBinContent(1, self.lumi)
        lumi_hist.Write()

        self.bDiscValue = bDiscValue
        self.is_inclusive = is_inclusive

    def samples(self):
        """Return the list of sample entries from the configuration."""
        return self.sample_dict['samples']

    def sample_names(self):
        """Return the 'name' of every configured sample, in configuration order."""
        names = []
        for entry in self.samples():
            names.append(entry['name'])
        return names

    def close(self):
        """Close the output ROOT file."""
        self.out.Close()

    def __repr__(self):
        """Summarise input file, output file, luminosity and sample names."""
        return 'from {fn} to {on}\nlumi: {lumi}\nSamples {samples}'.format(
            fn=self.file_name,
            on=self.outname,
            lumi=self.lumi,
            samples=self.sample_names(),
        )

In [None]:
# Build the processor for the selected era; this reads the YAML configuration and
# recreates the output ROOT file. Printing it shows the files, luminosity and sample names.
sp = sample_processor(fname, outname, bDiscValue)
print(sp)

In [None]:
# Region flags an event can carry (control regions CR1x/CR2x and signal regions SR1/SR2).
rs = ["CR10", "CR11", "CR12", "CR13", "CR14", "CR20", "CR21", "CR22", "CR23", "CR24", "SR1", "SR2"]
# Jet-energy variations whose region flags are considered for simulation and for data.
MC_JERC_VARIATIONS = ['nom','jerUp','jerDown','jesTotalUp','jesTotalDown']
DATA_JERC_VARIATIONS = ['nom']


def _region_expression(region_names, variations):
    """Join every '<region>_<variation>' flag with ' or '.

    Flags are ordered variation by variation, regions in the given order
    within each variation. Joining avoids the trailing separator that
    slicing a hand-built string left behind.
    """
    return " or ".join(
        "{}_{}".format(region, variation)
        for variation in variations
        for region in region_names
    )


# Filter expressions selecting events that fall in at least one region.
mc_region = _region_expression(rs, MC_JERC_VARIATIONS)
data_region = _region_expression(rs, DATA_JERC_VARIATIONS)
mc_region

In [None]:
def create_regions(df, ismc, regions=None):
    """Book the per-region filters on ``df`` and return them.

    For every region/variation pair four filters are booked, in this order:
    pre-selection, full selection, and two "200 GeV" selections.

    Parameters
    ----------
    df : object with a ``Filter(expression, name)`` method
        Dataframe the filters are booked on.
    ismc : int-like
        Truthy (after ``int()``) for simulation, which adds the JER/JES variations.
    regions : list, optional
        List to append to. A new list is created when omitted, so the function
        no longer depends on a pre-existing global called ``regions``.

    Returns
    -------
    list
        ``regions`` with one ``(filtered_df, format_dict)`` tuple appended per
        booked filter; ``format_dict`` holds the names and cut values used.
    """
    if regions is None:
        regions = []

    if int(ismc):
        # NOTE(review): the region filter strings and branch lists elsewhere in this
        # notebook use 'jesTotalUp'/'jesTotalDown'; confirm 'jesUp'/'jesDown' exist here.
        JERC_var = ['nom','jerUp','jerDown','jesUp','jesDown']
    else:
        JERC_var = ['nom']
    rs = ["CR10", "CR11", "CR12", "CR13", "CR14", "CR20", "CR21", "CR22", "CR23", "CR24", "SR1", "SR2"]

    # Filter templates, keyed by the suffix appended to '<region>_<variation>'.
    # NOTE(review): the two "200_GeV" selections are identical; confirm whether
    # one of them was meant to carry a different cut.
    selections = [
        ("{}pre_bff", "{r} && DiLepMass>54"),
        ("{}", "{r} && DiLepMass>54 && {HTLT_string}<{HTLT} && {RelMET_string}<{RelMET} && {SBM_string}>{SBM}"),
        ("{}_200_GeV_htlt_sig", "{r} && DiLepMass>54 && {RelMET_string}<{RelMET} && {SBM_string}>{SBM}"),
        ("{}_200_GeV_sig", "{r} && DiLepMass>54 && {RelMET_string}<{RelMET} && {SBM_string}>{SBM}"),
    ]

    for reg in rs:
        for var in JERC_var:
            r = '{}_{}'.format(reg,var)

            # The third character of the name is the jet-multiplicity digit
            # ("CR2x"/"SR2" vs "CR1x"/"SR1"); the "2" regions use different cuts.
            if r[2] == "2":
                HTLT,RelMET,SBM = -60,0.22,150
            else:
                HTLT,RelMET,SBM = -120,0.22,0

            cuts = {
                "r": r,
                "HTLT": HTLT,
                "RelMET": RelMET,
                "SBM": SBM,
                "HTLT_string": 'HTLT_{}'.format(var),
                "RelMET_string": 'RelMET_{}'.format(var),
                "SBM_string": 'SBM_{}'.format(var),
            }

            for name_template, filter_template in selections:
                region_string = name_template.format(r)
                # Fresh dict per filter so callers can keep them independently.
                format_dict = dict(cuts, region_string=region_string)
                booked = df.Filter(filter_template.format(**format_dict), region_string)
                regions.append((booked, format_dict))

    return regions

def process_sample(sp,sample,era,verbose=1):
    """Skim one sample and write the selected branches to a ROOT file and a CSV file.

    Parameters
    ----------
    sp : sample_processor
        Its ``lumi``, ``bDiscValue`` and ``is_inclusive`` attributes are read.
    sample : dict
        Sample entry with the keys 'name', 'xsec', 'nevts', 'ismc',
        'fileglob' and 'bTagEff'.
    era : str
        Era label; forwarded to the helper functions and used in the output
        file names.
    verbose : int, optional
        When truthy, print the sample metadata.

    Returns
    -------
    tuple
        ``(name, df, fileglob)`` where ``df`` is the dataframe after all
        filters and definitions have been booked.

    Raises
    ------
    IOError
        If ``fileglob`` matches no files.
    """
    import os  # local import: only needed to make sure the output directory exists

    # sample metadata
    name,xsec,nevts = sample['name'],sample['xsec'],sample['nevts']
    ismc,fileglob,bTagEff = int(sample['ismc']),sample['fileglob'],sample['bTagEff']

    # resolve the input files; stop early with a readable message if there are none
    files = glob(fileglob)
    if not files:
        raise IOError("no input files match '{}' for sample '{}'".format(fileglob, name))

    # a falsy event count in the configuration means "take it from the input files"
    if not nevts:
        nevts = get_nEvents(files)
        if verbose: print(nevts)

    sample_weight = float(xsec)*sp.lumi/float(nevts)
    if verbose: print("name: {} , xsec: {}, nevents: {} ismc: {}".format(name,xsec,nevts,ismc))

    # set up btagging and puid sf files
    # NOTE(review): the returned objects are not used below; they stay bound so that
    # whatever they hold remains alive while the dataframe is processed.
    bTagFile, PUIDSFfile = setup_btag_puid(ismc, era, bTagEff)

    # build the dataframe and apply the pre-selection
    df = RDFrame('Events', files)
    df = df.Filter("DiLepMass>100", "mass_cut")
    df = df.Filter(mc_region if ismc else data_region, "in_region")

    # object definitions, trigger, scale factors and lepton selections
    # (the b-discriminator value comes from the processor, not from a notebook global)
    df = def_good_jet(df,ismc, sp.bDiscValue)
    df = def_good_leptons(df, ismc)
    df = def_HLT(df, ismc, era)
    df = def_sf_and_weight(df,ismc, sp.is_inclusive, name, sample_weight)
    df = def_lep_selections(df)

    # data has no systematic weights or jet-energy variations
    lcolumns = columns if ismc else columns_data

    # write the skim as a ROOT tree and as CSV
    os.makedirs("data", exist_ok=True)
    df.Snapshot("Events", "data/tw_{}_{}.root".format(era,name), lcolumns)
    df_np = df.AsNumpy(lcolumns)
    df_df = pd.DataFrame(df_np)
    df_df.to_csv('data/tw_{}_{}.csv'.format(era,name))
    return name,df, fileglob

In [None]:
# Process every configured sample and report how long each one took.
# The output file is closed even when a sample fails, so it is never left open.
try:
    for sample in sp.samples():
        name = sample['name']
        print(name)
        start_time = perf_counter()
        name,df,fileglob = process_sample(sp,sample,era)
        end_time = perf_counter()
        print(name,end_time-start_time)
finally:
    sp.close()