In [None]:
from __future__ import print_function
from glob import glob
from ROOT import vector, RDataFrame, RDF, TFile, TH1F, TH2F, gInterpreter, TMath
import ROOT
import sys
import yaml
from src.RDF_tools.cpp_function import def_cpp
from src.general.utils import toVector, get_files, prep_filelist
import pandas as pd
from time import perf_counter
import uproot
from pathlib import Path
from os.path import exists
from src.general.make_noise import beep_on_error, beep_repeat
beep_on_error()

In [None]:
from src.RDF_tools.df_definitions import *

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Compile functions in c++ for the dataframe
def_cpp()

In [None]:
# Switch for ROOT's implicit multithreading (off by default).
# NOTE(review): process_sample applies df.Range(...) to MC samples; the RDataFrame
# documentation lists Range as unsupported with implicit MT -- confirm before enabling.
multiThreading = False
if multiThreading:
    ROOT.ROOT.EnableImplicitMT()
# Alias through which process_sample constructs its dataframe.
RDFrame = RDataFrame

In [None]:
# Base directory for the outputs: the per-sample csv files are written to and looked up
# under '<output_dir>/data/'.
output_dir = 'assets_feb_23'

In [None]:
# Set up the metadata for the era being processed.
era = '2016'
# Switches that are only assigned here; nothing else in the visible cells reads them.
deepflavour = False
deepcsv = True
extras = False
csv_v_flavor = False

# Per-era sample list (yml file name) and bDiscValue.
era_settings = {
    "2016": ("samplesCR_2016_Apr2020.yml", 0.3093),
    "2017": ("samplesCR_2017_Apr2020.yml", 0.3033),
    "2018": ("samplesCR_2018_Apr2020.yml", 0.2770),
}
# Fail loudly on an unknown era instead of hitting a NameError below (fresh kernel) or
# silently reusing the fname/bDiscValue left over from a previous run of this cell.
if era not in era_settings:
    raise ValueError("unknown era {!r}; expected one of {}".format(era, sorted(era_settings)))
fname, bDiscValue = era_settings[era]

sampleDir = "samples"
# The ROOT output name is derived from the bare yml name, before the directory is prepended.
outname = fname.replace('.yml', '.root')
fname = "{}/{}".format(sampleDir, fname)
outname, fname

In [None]:
# Star import; presumably the source of make_columns and of the initial columns_data
# passed to it below (neither is defined in this notebook itself) -- TODO confirm.
from src.assets.bff_skimmer_bffv2 import *
# Note that columns_data is rebound by this call: the imported value goes in, the
# returned list replaces it. columns_mc and var_postfix are used in later cells.
columns_data, columns_mc, var_postfix = make_columns(era, columns_data)

In [None]:
columns_data

In [None]:
columns_mc

In [None]:
# Path templates for the inputs of the selected era. `.format(era)` fills the first
# placeholder now; the escaped `{{}}` is left behind as a literal `{}` to be filled per
# sample later (process_sample formats eff_path with the sample's 'fileglob' entry and
# hands sample_path to get_files).
sample_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/{}/{{}}'.format(era)
eff_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/{}/{{}}'.format(era)

In [None]:
class sample_processor():
    '''Bookkeeping helper built from a yml sample list.

    On construction it loads the yml configuration, (re)creates the output ROOT file
    with one directory per sample name and writes the luminosity into a one-bin
    histogram called "lumi".
    '''
    def __init__(self,file_name,outname,bDiscValue,is_inclusive=0):
        # plain settings, kept for later use by the processing code
        self.file_name = file_name
        self.outname = outname
        self.bDiscValue = bDiscValue
        self.is_inclusive = is_inclusive
        # read the yml configuration
        with open(file_name,'r') as cfg:
            self.sample_dict = yaml.load(cfg, Loader=yaml.FullLoader)
        # output file, with a directory for every sample in the configuration
        self.out = TFile(outname, 'recreate')
        self.outdirs_dict = {}
        for sample_name in self.sample_names():
            self.outdirs_dict[sample_name] = self.out.mkdir(sample_name)
        # luminosity is stored both on the instance and inside the output file
        self.lumi = self.sample_dict['lumi']
        hlumi = TH1F("lumi", "lumi", 1, 0, 1)
        hlumi.SetDirectory(self.out)
        hlumi.SetBinContent(1, self.lumi)
        hlumi.Write()

    def samples(self):
        '''Return the list of sample entries from the configuration.'''
        return self.sample_dict['samples']

    def sample_names(self):
        '''Return the 'name' field of every sample, in configuration order.'''
        names = []
        for entry in self.samples():
            names.append(entry['name'])
        return names

    def close(self):
        '''Close the output ROOT file.'''
        self.out.Close()

    def __repr__(self):
        template = '''from {fn} to {on}\nlumi: {lumi}\nSamples {samples}'''
        return template.format(fn=self.file_name,
                               on=self.outname,
                               lumi=self.lumi,
                               samples=self.sample_names())

In [None]:
# create instance of sample manager class 
sp = sample_processor(fname, outname, bDiscValue)
#print(sp)

In [None]:
sp

In [None]:
# Build one long "A or B or ..." expression per dataset type, covering all permutations,
# to select events that are present in at least one region.
rs = ["CR10", "CR11", "CR12", "CR13", "CR14", "CR20", "CR21", "CR22", "CR23", "CR24", "SR1", "SR2"]
JERC_var = ['jet_nom_muon_corrected_pt_ele_pt']

# Terms "<region>_<JERC_var>": the whole data selection, and the tail of the MC one.
string = "".join("{}_{} or ".format(r, jv) for jv in JERC_var for r in rs)
# MC first tests every region combined with each entry of var_postfix.
mcstring = "".join("{}{} or ".format(r, jv) for jv in var_postfix for r in rs) + string

# Cut the last three characters ("or ") that follow the final term.
data_region = string[:-3]
mc_region = mcstring[:-3]
#mc_region

In [None]:
def process_sample(sp,sample,era,verbose=1, maxEvents=1e6):
    '''Process one sample and write its selected events to a csv file.

    Besides its arguments the function reads the notebook globals sample_path,
    eff_path, mc_region, data_region, columns_mc, columns_data and output_dir.

    Parameters
    ----------
    sp : sample_processor
        Supplies ``lumi``, ``is_inclusive`` and ``bDiscValue``.
    sample : dict
        One entry of the yml sample list; the keys 'name', 'xsec', 'nevts', 'ismc'
        and 'fileglob' are read.
    era : str
        Era label, forwarded to the definition/weight helpers and used in the csv name.
    verbose : int or bool
        When true, print a one-line summary; also forwarded to prep_filelist.
    maxEvents : number
        Forwarded to prep_filelist and, for MC, used as the upper bound of df.Range.

    Returns
    -------
    tuple
        ``(name, df_df)``: the sample name and the pandas DataFrame that was written to
        '<output_dir>/data/tw_<era>_<name>.csv'.

    Raises
    ------
    ValueError
        If prep_filelist reports zero events, because the sample weight would be undefined.
    '''
    #get metadata
    name,xsec,nEvents = sample['name'],sample['xsec'],sample['nevts']
    ismc,dirName = int(sample['ismc']),sample['fileglob']

    # get files fit for processing
    # filters out files that can't be opened, or are from old runs
    # then it selects a fewer number of files to open if there are too many files
    files_df = get_files(dirName, sample_path)
    # the event count returned here replaces the 'nevts' value read from the yml;
    # the caller's verbosity is honoured instead of always passing True
    files, nEvents = prep_filelist(files_df, ismc, verbose=bool(verbose), maxEvents=maxEvents)
    #btageff file list
    bTagEffGlobName = eff_path.format(dirName)
    # NOTE(review): the per-sample path computed above is immediately replaced by one
    # fixed 2016 DY directory, so every sample and era uses these efficiency files.
    # Kept unchanged because it may be a deliberate workaround -- confirm, and drop the
    # override if it is a debugging leftover.
    bTagEffGlobName = '/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/2016/DYJetsToLL_M-200to400_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8'
    eff_paths = list(Path(bTagEffGlobName).rglob('*.root'))+ list(Path(bTagEffGlobName+"_eff").rglob('*.root'))
    list_effs = [str(p) for p in eff_paths]
    # a zero event count would otherwise surface as a bare ZeroDivisionError
    if not nEvents:
        raise ValueError("no events found for sample {}; cannot compute its weight".format(name))
    sample_weight = float(xsec)*sp.lumi/float(nEvents)

    if verbose: print("name: {} , xsec: {}, nevents: {} ismc: {}, nfiles: {}".format(name,xsec,
                                                                                     nEvents,ismc,
                                                                                    len(files)))
    #set up btagging and puid sf files
    # (return values are not used below; the call is kept for whatever setup it performs)
    bTagFile, PUIDSFfile = setup_btag_puid(ismc, era, list_effs)
    #make rdf
    df = RDFrame('Events', files)
    if ismc:
        df = df.Range(0, int(maxEvents))
    ##
    ## filter
    ##
    df = df.Filter("DiLepMass_jet_nom_muon_corrected_pt_ele_pt>105", "mass_cut")
    if ismc:
        df = df.Filter(mc_region, "in_region")
    else:
        df = df.Filter(data_region, "in_region")
    df = df.Filter("Flag_METFilters==1", "METFilter")
    ##
    ## set up objects
    ##
    var_string = '_jet_nom_muon_corrected_pt_ele_pt'
    df = def_good_leptons(df, ismc, era, var_string)
    ##
    ## set weights
    ##
    df = def_HLT(df, ismc, era)
    df = bjet_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = pdf_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = fsr_isr_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = muon_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = electron_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = k_factor(df,ismc, sp.is_inclusive, name, sample_weight, era)
    # take the value stored on the sample manager rather than the notebook global, so
    # it always matches the configuration `sp` was built with
    df = PU_weight(df,ismc, sp.is_inclusive, name, sample_weight, era, sp.bDiscValue)
    df = finalize_weights(df,ismc, sp.is_inclusive, name, sample_weight, era)
    if ismc:
        df = df.Define("JER", "SysPercPerObj(Jet_pt_nom, Jet_pt_jerUp, Jet_pt_jerDown, 0)")
        df = df.Define("AvgJER", "clip(CalcAverage(JER))")
        df = df.Define("JES", "SysPercPerObj(Jet_pt_nom, Jet_pt_jesTotalUp, Jet_pt_jesTotalDown, 0)")
        df = df.Define("AvgJES", "CalcAverage(JES)")
        df = df.Define("HEM", "SysPercPerObj(Jet_pt_nom, Jet_pt_jesHEMIssueUp, Jet_pt_jesHEMIssueDown, 0)")
        df = df.Define("AvgHEM", "CalcAverage(HEM)")
    else:
        # data carries no systematic variations: constant placeholders keep the columns present
        df = df.Define("JER", "1.")
        df = df.Define("AvgJER", "1.")
        df = df.Define("JES", "1.")
        df = df.Define("AvgJES", "1.")
        df = df.Define("HEM", "1.")
        df = df.Define("AvgHEM", "1.")
    base_columns = columns_mc if ismc else columns_data
    extra_columns = ['PUIDWeight','PUIDWeightUp','PUIDWeightDown',
                     'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown',
                     'genWeight',
                     'k_factor',
                     'puWeight',
                     'MuonSFweight',
                     'ElectronSFweight',
                     'TriggerWeight',
                     'AvgMuonRecoIdIsoSFPerMuon',
                     'AvgMuonRocPer',
                     'AvgJER', 'AvgJES', 'AvgHEM',
                     'AvgPUIDWeightsPerJet',
                     'AvgBtagWeightCorr',
                     'AvgBtagWeightUncorr',
                     ]
    # Build a fresh list: the previous in-place `+=` appended to the shared global
    # column list on every call. dict.fromkeys drops repeated names, keeping first-seen order.
    lcolumn = list(dict.fromkeys(list(base_columns) + extra_columns))
    df_np = df.AsNumpy(lcolumn)
    df_df = pd.DataFrame(df_np)
    csv_name = '{}/data/tw_{}_{}.csv'.format(output_dir, era,name)
    # make sure the target directory exists before writing
    Path(csv_name).parent.mkdir(parents=True, exist_ok=True)
    df_df.to_csv(csv_name)
    return name,df_df

In [None]:
# csv paths of samples whose processing raised / whose csv was already present
failed_files = []
existing_files = []

for sample in sp.samples():
    name = sample['name']
    # samples with 'ZTo' or 'DY' in their name get a much larger event cap
    maxEvents=int(2e10) if (('ZTo' in name) or ('DY' in name)) else int(1e7)
    #if not 'Private' in name: continue
    print(name, maxEvents)
    if  'y3' in name: continue
    start_time = perf_counter()
    # Deliberately NOT called `outname`: that notebook-wide name holds the ROOT output
    # file name handed to sample_processor, and overwriting it here would make a later
    # re-run of the sample_processor cell recreate the wrong file.
    csv_path = '{}/data/tw_{}_{}.csv'.format(output_dir, era,sample['name'])
    try:
        # skip samples that already have a csv from an earlier run
        if not exists(csv_path):
            print("running......")
            name,df = process_sample(sp,sample,era, verbose=1, maxEvents=maxEvents)
        else:
            print("exists.......")
            existing_files.append(csv_path)
    except Exception as err:
        # best effort: remember the failure and carry on with the next sample
        failed_files.append(csv_path)
        print(err)

    #count = df.Count()
    end_time = perf_counter()
    print("sample {} took {:.1f} seconds".format(name,end_time-start_time))

sp.close()

In [None]:
existing_files

In [None]:
failed_files

In [None]:
beep_repeat()

In [None]:
import pandas as pd

In [None]:
# NOTE(review): this path is relative to 'data/', while process_sample writes its csv
# files under '<output_dir>/data/' -- confirm which working directory is expected.
path = 'data/tw_2017_BFFZprimeToMuMu_M_750_dbs0p04.csv'
df = pd.read_csv(path)

In [None]:
for x in df:
    print(x)

In [None]:
df.Weight_MuonTriggerUp.mean()*100

In [None]:
df.Weight_BTagUp.mean()*100

In [None]:
import uproot as upr

In [None]:
skim_path = '/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/2017/BFF_175_dbs0p5_deepflavour_bffv2/221019_071128/0000/tree_1.root'

In [None]:
upf = upr.open(skim_path)['Events']

In [None]:
upf.arrays(['Muon_effSF_sys_triggerUp',
           'Muon_effSF_trigger',
           'Muon_effSF_sys_triggerDown'], library='pd').mean()

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('data/tw_2016_BFFZprimeToMuMu_M_250_dbs0p04.csv')

In [None]:
[x for x in df.keys() if 'HLT' in x], [x for x in df.keys() if 'SR' in x]

In [None]:
df.SR2_jet_nom_muon_corrected_pt_ele_pt.sum(), (df.SR2_jet_nom_muon_corrected_pt_ele_pt*df.HLT_Mu50).sum()

In [None]:
# Column means, scaled by 100, for the selected weight / systematic columns.
# NOTE(review): process_sample explicitly adds 'AvgBtagWeightCorr' and
# 'AvgBtagWeightUncorr'; 'AvgBtagWeight' has to come from columns_mc/columns_data,
# otherwise this selection raises a KeyError -- confirm.
df[[ 
    'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown'
    ,'AvgMuonRecoIdIsoSFPerMuon'
                ,'AvgMuonRocPer',
               'AvgJER', 'AvgJES', 'AvgHEM',
                'AvgPUIDWeightsPerJet', 'AvgBtagWeight']].mean()*100

In [None]:
df.AvgJER.mean()

In [None]:
from src.data_tools.get_file_list import get_file_df


In [None]:
# NOTE(review): this rebinds the notebook-wide `era` from the string '2016' (metadata
# cell) to the integer 2016; any earlier cell re-run after this one sees the int.
era = 2016
file_df = get_file_df()
# keep only the rows whose `era` column equals the value above
file_df = file_df[file_df.era==era]
# in-place reset; the previous index is kept as an extra column
file_df.reset_index(inplace=True)

In [None]:
# Columns whose per-file mean (scaled by 100) is collected.
summary_columns = [
    'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown',
    'AvgMuonRecoIdIsoSFPerMuon',
    'AvgMuonRocPer',
    'AvgJER', 'AvgJES', 'AvgHEM',
    'AvgPUIDWeightsPerJet', 'AvgBtagWeight',
]

# One record per csv listed in file_df: the column means plus the row's mass and dbs.
means_list = []
for _, row in file_df.iterrows():
    print(row.file)
    sample_df = pd.read_csv(row.file)
    # only rows with AvgJER below 5 enter the averages
    selected = sample_df[sample_df.AvgJER<5]
    record = (selected[summary_columns].mean()*100).to_dict()
    record['mass'] = row.mass
    record['dbs'] = row.dbs
    means_list.append(record)

In [None]:
mean_df = pd.DataFrame(means_list)

In [None]:
mean_df.min().round(1)

In [None]:
mean_df.max().round(1)

In [None]:
import numpy as np

In [None]:
mean_df.AvgJER