In [1]:
from __future__ import print_function
from glob import glob
from ROOT import vector, RDataFrame, RDF, TFile, TH1F, TH2F, gInterpreter, TMath
import ROOT
import sys
import yaml
from src.RDF_tools.cpp_function import def_cpp
from src.general.utils import toVector, get_files, prep_filelist
import pandas as pd
from time import perf_counter
import uproot
from pathlib import Path
from os.path import exists
from src.general.make_noise import beep_on_error, beep_repeat
beep_on_error()

Welcome to JupyROOT 6.22/09


In [20]:
from src.RDF_tools.df_definitions import *

In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Compile functions in c++ for the dataframe
def_cpp()

False

input_line_182:6:7: error: redefinition of 'bTagEff'
TH2F *bTagEff = 0;
      ^
input_line_62:6:7: note: previous definition is here
TH2F *bTagEff = 0;
      ^
input_line_182:11:7: error: redefinition of 'CalcMinPhi'
float CalcMinPhi(float phi1, float phi2){
      ^
input_line_62:11:7: note: previous definition is here
float CalcMinPhi(float phi1, float phi2){
      ^
input_line_182:19:7: error: redefinition of 'CalcDeltaR'
float CalcDeltaR(float eta1, float  eta2, float  phi1, float  phi2) {
      ^
input_line_62:19:7: note: previous definition is here
float CalcDeltaR(float eta1, float  eta2, float  phi1, float  phi2) {
      ^
input_line_182:23:6: error: redefinition of 'hard_process'
bool hard_process(int value){
     ^
input_line_62:23:6: note: previous definition is here
bool hard_process(int value){
     ^
input_line_182:28:5: error: redefinition of 'GetGenMultiplicity'
int GetGenMultiplicity(const RVec<int> &GenPart_statusFlags, const RVec<int> &GenPart_pdgId,
    ^
input_line_

In [48]:
# Multithreading switch: ROOT's implicit multithreading is only turned on when
# the flag below is set to True.
# NOTE(review): process_sample calls df.Range(...) for MC samples; ROOT
# documents Range as unavailable once implicit MT is enabled -- confirm before
# flipping this flag.
multiThreading = False
if multiThreading:
    ROOT.ROOT.EnableImplicitMT()

# Alias used further down when the dataframe for a sample is constructed.
RDFrame = RDataFrame

In [49]:
from src.assets.output_dir import output_dir
output_dir

'/eos/cms/store/group/phys_exotica/bffZprime/assets_june_23'

In [50]:
# Set up metadata for files: the era selects the sample list (yml) and the
# b-tag discriminator thresholds used by the selections below.
era = '2016'
deepflavour = False
deepcsv = True
extras = False
csv_v_flavor=False

# One entry per supported era. Keeping the values in a table (instead of three
# independent `if` blocks) guarantees that every name below is defined, and
# that an unsupported era fails right here with a clear message rather than
# with a NameError in a later cell.
# NOTE(review): bDiscValueMedian is identical for all eras -- presumably a
# measured median of the discriminator rather than an official working point;
# confirm.
ERA_CONFIG = {
    "2016": {
        # https://twiki.cern.ch/twiki/bin/viewauth/CMS/BtagRecommendation2016Legacy
        "fname": "samplesCR_2016_Apr2020.yml",
        "bDiscValue": 0.3093,
        "bDiscValueLoose": 0.0614,
        "bDiscValueMedian": 0.015163421630859375,
        "bDiscValueTight": 0.7221,
    },
    "2017": {
        "fname": "samplesCR_2017_Apr2020.yml",
        "bDiscValue": 0.3033,
        "bDiscValueLoose": 0.0521,
        "bDiscValueMedian": 0.015163421630859375,
        "bDiscValueTight": 0.7489,
    },
    "2018": {
        "fname": "samplesCR_2018_Apr2020.yml",
        "bDiscValue": 0.2770,
        "bDiscValueLoose": 0.0494,
        "bDiscValueMedian": 0.015163421630859375,
        "bDiscValueTight": 0.7264,
    },
}

if era not in ERA_CONFIG:
    raise ValueError("unsupported era {!r}; expected one of {}".format(
        era, sorted(ERA_CONFIG)))

era_cfg = ERA_CONFIG[era]
fname = era_cfg["fname"]
bDiscValue = era_cfg["bDiscValue"]
bDiscValueLoose = era_cfg["bDiscValueLoose"]
bDiscValueMedian = era_cfg["bDiscValueMedian"]
bDiscValueTight = era_cfg["bDiscValueTight"]

# The output ROOT file is named after the yml file; the yml itself lives in
# the sample directory.
sampleDir = "samples"
outname = fname.replace('.yml','.root')
fname = "{}/{}".format(sampleDir, fname)
outname, fname

('samplesCR_2016_Apr2020.root', 'samples/samplesCR_2016_Apr2020.yml')

In [51]:
# The star import supplies make_columns and, presumably, the initial
# columns_data list handed to it below (columns_data is not defined anywhere
# else in the visible notebook) -- TODO confirm against
# src/assets/bff_skimmer_bffv2.
from src.assets.bff_skimmer_bffv2 import *
# Produces the column lists written out for data and MC, plus var_postfix,
# which is used further down to build the MC region selection string.
columns_data, columns_mc, var_postfix = make_columns(era, columns_data)

In [52]:
# Path templates for the skimmed inputs and the b-tag efficiency files.
# Only the era is substituted here: the escaped '{{}}' survives .format() as a
# literal '{}' placeholder. eff_path is completed later via
# eff_path.format(<directory name>) in process_sample; sample_path is handed to
# get_files together with the sample's directory name.
sample_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/{}/{{}}'.format(era)
eff_path ='/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/{}/{{}}'.format(era)

In [53]:
class sample_processor():
    '''Bookkeeping helper built from a yml sample list.

    On construction it loads the yml configuration, (re)creates the output
    ROOT file, makes one directory per sample inside that file and writes the
    luminosity into a one-bin histogram called "lumi".
    '''
    def __init__(self,file_name,outname,bDiscValue,is_inclusive=0):
        # Plain settings that need no I/O.
        self.bDiscValue = bDiscValue
        self.is_inclusive = is_inclusive
        self.file_name = file_name
        self.outname = outname

        # Sample configuration.
        # NOTE(review): FullLoader can construct more than plain data types;
        # only point this at trusted yml files.
        with open(file_name,'r') as cfg_file:
            self.sample_dict = yaml.load(cfg_file, Loader=yaml.FullLoader)

        # Output file with one directory per sample, keyed by sample name.
        self.out = TFile(outname, 'recreate')
        self.outdirs_dict = {
            entry['name']: self.out.mkdir(entry['name'])
            for entry in self.samples()
        }

        # Store the luminosity both on the instance and in the output file.
        self.lumi = self.sample_dict['lumi']
        lumi_hist = TH1F("lumi", "lumi", 1, 0, 1)
        lumi_hist.SetDirectory(self.out)
        lumi_hist.SetBinContent(1, self.lumi)
        lumi_hist.Write()

    def samples(self):
        '''Return the list of sample entries from the yml configuration.'''
        return self.sample_dict['samples']

    def sample_names(self):
        '''Return the 'name' field of every sample, in configuration order.'''
        names = []
        for entry in self.samples():
            names.append(entry['name'])
        return names

    def close(self):
        '''Close the output ROOT file.'''
        self.out.Close()

    def __repr__(self):
        template = '''from {fn} to {on}\nlumi: {lumi}\nSamples {samples}'''
        return template.format(fn=self.file_name,
                               on=self.outname,
                               lumi=self.lumi,
                               samples=self.sample_names())

In [54]:
fname

'samples/samplesCR_2016_Apr2020.yml'

In [55]:
# create instance of sample manager class 
sp = sample_processor(fname, outname, bDiscValue)
#print(sp)

In [56]:
sp

from samples/samplesCR_2016_Apr2020.yml to samplesCR_2016_Apr2020.root
lumi: 36.31
Samples ['y3_250_deepflavour_bff', 'y3_400_deepflavour_bff', 'y3_1000_deepflavour_bff', 'y3_700_deepflavour_bff', 'ST_t-channel_top_4f_inclusiveDecays_13TeV-powhegV2-madspin-pythia8_TuneCUETP8M1', 'ST_s-channel_4f_leptonDecays_13TeV-amcatnlo-pythia8_TuneCUETP8M1', 'WWZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'ZZZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'WZZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'WWW_4F_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'WJetsToQQ_HT-800toInf_qc19_3j_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'WJetsToQQ_HT-600ToInf_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'WJetsToLNu_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'TTWJetsToQQ_TuneCUETP8M1_13TeV-amcatnloFXFX-madspin-pythia8', 'TTWJetsToLNu_TuneCUETP8M1_13TeV-amcatnloFXFX-madspin-pythia8', 'TTZToQQ_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 'TTZToLL_M-1to10_TuneCUETP8M1_13TeV-madgraphMLM-pythia8', 'TTZToLLNuNu_M-10_TuneCUETP8M1_13TeV-amcatnlo-pythia8', 

In [57]:
# Build one long "A or B or ..." filter expression per dataset type, so that
# only events present in at least one control/signal region are kept.
rs = ["CR10", "CR11", "CR12", "CR13", "CR14", "CR20", "CR21", "CR22", "CR23", "CR24", "SR1", "SR2"]

# Region flags for the nominal correction variant; used for data and MC.
JERC_var = ['jet_nom_muon_corrected_pt_ele_pt']
nominal_regions = ["{}_{}".format(r, jv) for jv in JERC_var for r in rs]

# Region flags for every variation postfix; these only enter the MC selection.
varied_regions = ["{}{}".format(r, jv) for jv in var_postfix for r in rs]

# str.join puts the separator only between terms, so there is no trailing
# " or " to slice off (the previous [:-3] slice left a dangling space).
# Term order is unchanged: variations first, then the nominal regions.
data_region = " or ".join(nominal_regions)
mc_region = " or ".join(varied_regions + nominal_regions)

In [58]:
eff_path

'/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bff_eff/2016/{}'

In [67]:
def process_sample(sp,sample,era,verbose=1, maxEvents=1e6):
    '''Process one sample and write the selected events to a csv file.

    Parameters
    ----------
    sp : sample_processor
        Supplies the luminosity (``sp.lumi``) and ``sp.is_inclusive``.
    sample : dict
        Sample entry from the yml file. The keys 'name', 'xsec', 'ismc' and
        'fileglob' are read; 'eff_path' is optional and overrides the
        directory name used to look up the b-tag efficiency files.
    era : str
        Era label, forwarded to the weight/selection helpers and used in the
        csv file name.
    verbose : int
        When truthy, print a one-line summary of the sample.
    maxEvents : number
        Forwarded to prep_filelist; for MC it is also the upper bound of the
        event range that is processed.

    Returns
    -------
    tuple
        ``(name, df_df, info_dict)``: the sample name, the pandas DataFrame
        that was written to csv, and a copy of ``sample`` extended with
        bookkeeping values.

    Raises
    ------
    ValueError
        If prep_filelist reports zero events, which would make the sample
        weight undefined.
    '''
    #get metadata
    name,xsec = sample['name'],sample['xsec']
    ismc,dirName = int(sample['ismc']),sample['fileglob']

    # get files fit for processing
    # filters out files that can't be opened, or are from old runs
    # then it selects a fewer number of files to open if there are too many files
    files_df = get_files(dirName, sample_path)

    # The event count used for normalisation comes from the file list.
    files, nEvents = prep_filelist(files_df, ismc, verbose=True, maxEvents=maxEvents)

    #btageff file list: search both "<dir>" and "<dir>_eff"; an explicit
    #'eff_path' entry in the sample replaces the default directory name
    eff_dir = eff_path.format(sample.get('eff_path', dirName))
    eff_files = (list(Path(eff_dir).rglob('*.root'))
                 + list(Path(eff_dir + "_eff").rglob('*.root')))
    list_effs = [str(eff_file) for eff_file in eff_files]

    if float(nEvents) == 0:
        raise ValueError(
            "sample {}: prep_filelist returned 0 events, cannot compute the "
            "sample weight".format(name))
    sample_weight = float(xsec)*sp.lumi/float(nEvents)

    # Copy so the caller's sample dict is not modified.
    info_dict = dict(sample)
    info_dict['sample_weight'] = sample_weight
    info_dict['len(files)'] = len(files)
    info_dict['lumi'] = sp.lumi
    info_dict['nEvents_from_files'] = nEvents

    if verbose: print("name: {} , xsec: {}, nevents: {} ismc: {}, nfiles: {}".format(name,xsec,
                                                                                     nEvents,ismc,
                                                                                    len(files)))
    #set up btagging and puid sf files
    # NOTE(review): the return values are not used in this function; the call
    # is kept because it presumably loads the inputs used by the weight
    # helpers below -- confirm.
    bTagFile, PUIDSFfile = setup_btag_puid(ismc, era, list_effs)
    #make rdf
    df = RDFrame('Events', files)
    if ismc:
        # NOTE(review): Range is documented by ROOT as unavailable with
        # implicit multithreading -- confirm before enabling MT.
        df = df.Range(0, int(maxEvents))

    ##
    ## filter
    ##
    df = df.Filter("DiLepMass_jet_nom_muon_corrected_pt_ele_pt>105", "mass_cut")
    if ismc:
        df = df.Filter(mc_region, "in_region")
    else:
        df = df.Filter(data_region, "in_region")
    df = df.Filter("Flag_METFilters==1", "METFilter")
    ##
    ## set up objects
    ##
    var_string = '_jet_nom_muon_corrected_pt_ele_pt'
    df = def_good_leptons(df, ismc, era, var_string)
    ##
    ## set weights
    ##
    df = def_HLT(df, ismc, era)
    df = bjet_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = pdf_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = fsr_isr_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = muon_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = electron_weight(df,ismc, sp.is_inclusive, name, sample_weight, era)
    df = k_factor(df,ismc, sp.is_inclusive, name, sample_weight, era)
    # NOTE(review): this reads the module-level bDiscValue (and the
    # Loose/Median/Tight values below), not sp.bDiscValue -- confirm intended.
    df = PU_weight(df,ismc, sp.is_inclusive, name, sample_weight, era, bDiscValue)
    df = finalize_weights(df,ismc, sp.is_inclusive, name, sample_weight, era)

    df = def_lep_selections(df)
    df = test_data_abcd(df, ismc, bDiscValueLoose, postfix="" )
    df = test_data_abcd(df, ismc, bDiscValueMedian, postfix="_median" )

    df = medium_to_tight_region(df, ismc, bDiscValueTight )
    #gen information
    if ismc:
        df = df.Define("GenMultiplicity", "GetGenMultiplicity(GenPart_statusFlags, GenPart_pdgId, GenPart_pt, GenPart_eta, GenPart_phi)")
        df = df.Define("JER", "SysPercPerObj(Jet_pt_nom, Jet_pt_jerUp, Jet_pt_jerDown, 0)")
        df = df.Define("AvgJER", "clip(CalcAverage(JER))")
        df = df.Define("JES", "SysPercPerObj(Jet_pt_nom, Jet_pt_jesTotalUp, Jet_pt_jesTotalDown, 0)")
        df = df.Define("AvgJES", "CalcAverage(JES)")
        df = df.Define("HEM", "SysPercPerObj(Jet_pt_nom, Jet_pt_jesHEMIssueUp, Jet_pt_jesHEMIssueDown, 0)")
        df = df.Define("AvgHEM", "CalcAverage(HEM)")
    else:
        # Data has no generator information: fill placeholder constants so the
        # same output columns exist for data and MC.
        df = df.Define("GenMultiplicity", "0.")
        df = df.Define("JER", "1.")
        df = df.Define("AvgJER", "1.")
        df = df.Define("JES", "1.")
        df = df.Define("AvgJES", "1.")
        df = df.Define("HEM", "1.")
        df = df.Define("AvgHEM", "1.")

    # Output columns. A NEW list is built on every call: extending
    # columns_mc / columns_data in place made those module-level lists grow
    # (with duplicates) each time a sample was processed.
    base_columns = columns_mc if ismc else columns_data
    extra_columns = ['PUIDWeight','PUIDWeightUp','PUIDWeightDown',
                     'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown',
                     'genWeight',
                     'k_factor',
                     'puWeight',
                     'MuonSFweight',
                     'ElectronSFweight',
                     'TriggerWeight',
                     'AvgMuonRecoIdIsoSFPerMuon',
                     'AvgMuonRocPer',
                     'AvgJER', 'AvgJES', 'AvgHEM',
                     'AvgPUIDWeightsPerJet',
                     'AvgBtagWeightCorr',
                     'AvgBtagWeightUncorr', 'GenMultiplicity',
                     'CRA','CRB','CRC','CRD',
                     'CRA2','CRB2','CRC2','CRD2',
                     'CRA_median','CRB_median','CRC_median','CRD_median',
                     'CRA2_median','CRB2_median','CRC2_median','CRD2_median',
                     'CRA_tight_to_medium','CRA2_tight_to_medium'
                    ]
    # Drop repeated names while keeping the first occurrence, so the csv
    # column order is stable.
    lcolumn = []
    seen_columns = set()
    for column in list(base_columns) + extra_columns:
        if column not in seen_columns:
            seen_columns.add(column)
            lcolumn.append(column)

    df_np = df.AsNumpy(lcolumn)
    df_df = pd.DataFrame(df_np)
    csv_path = '{}/data/tw_{}_{}.csv'.format(output_dir, era,name)
    print(csv_path)
    df_df.to_csv(csv_path)
    return name,df_df, info_dict

In [68]:
info_list = []

In [69]:
sample

{'fileglob': 'BFF_125_dbs0p04_deepflavour_bffv2',
 'ismc': 1,
 'name': 'BFFZprimeToMuMu_M_125_dbs0p04',
 'nevts': 0,
 'xsec': 1528,
 'eff_path': 'BFF_125_dbs0p04_deepflavour_bff_eff'}

In [None]:
# Run process_sample over the configured samples, skipping those whose csv
# already exists. Failures are recorded and reported but do not stop the loop.
failed_files = []
existing_files = []

try:
    for sample in sp.samples():
        name = sample['name']
        # Current selection: skip the 'y3' samples and anything that is not a
        # 'BFF' sample.
        if 'y3' in name or 'BFF' not in name:
            continue
        # Z/DY samples get a much larger event cap than everything else.
        maxEvents = int(2e10) if (('ZTo' in name) or ('DY' in name)) else int(1e7)
        start_time = perf_counter()
        print(sample)
        # Deliberately NOT called `outname`: that module-level name holds the
        # ROOT output file name passed to sample_processor, and rebinding it
        # to a csv path would make a re-run of that cell recreate (and so
        # overwrite) the csv as a ROOT file.
        csv_path = '{}/data/tw_{}_{}.csv'.format(output_dir, era, name)
        try:
            if not exists(csv_path):
                print("running......")
                name, df, info_dict = process_sample(sp,sample,era, verbose=1, maxEvents=maxEvents)
                info_list.append(info_dict)
            else:
                print("exits.......")
                existing_files.append(csv_path)
        except Exception as err:
            # Best effort: remember the failure and carry on with the next
            # sample, but report the exception type as well as its message.
            failed_files.append(csv_path)
            print("{} failed with {}: {}".format(name, type(err).__name__, err))

        end_time = perf_counter()
        print("sample {} took {:.1f} seconds".format(name,end_time-start_time))
finally:
    # Close the output ROOT file even if the loop is interrupted.
    sp.close()

{'fileglob': 'BFFZprimeToMuMu_M_250_TuneCUETP8M1_13TeV-madgraph-pythia8', 'ismc': 1, 'name': 'BFFZprimeToMuMu_M_250_dbs0p04', 'nevts': 0, 'xsec': 122.5}
exits.......
sample BFFZprimeToMuMu_M_250_dbs0p04 took 0.0 seconds
{'fileglob': 'BFFZprimeToMuMu_M_300_TuneCUETP8M1_13TeV-madgraph-pythia8', 'ismc': 1, 'name': 'BFFZprimeToMuMu_M_300_dbs0p04', 'nevts': 0, 'xsec': 58.86}
exits.......
sample BFFZprimeToMuMu_M_300_dbs0p04 took 0.0 seconds
{'fileglob': 'BFFZprimeToMuMu_M_400_TuneCUETP8M1_13TeV-madgraph-pythia8', 'ismc': 1, 'name': 'BFFZprimeToMuMu_M_400_dbs0p04', 'nevts': 0, 'xsec': 17.42}
exits.......
sample BFFZprimeToMuMu_M_400_dbs0p04 took 0.0 seconds
{'fileglob': 'BFFZprimeToMuMu_M_500_TuneCUETP8M1_13TeV-madgraph-pythia8', 'ismc': 1, 'name': 'BFFZprimeToMuMu_M_500_dbs0p04', 'nevts': 0, 'xsec': 6.42}
exits.......
sample BFFZprimeToMuMu_M_500_dbs0p04 took 0.0 seconds
{'fileglob': 'BFFZprimeToMuMu_M_500_dbs0p5_TuneCUETP8M1_13TeV-madgraph-pythia8', 'ismc': 1, 'name': 'BFFZprimeToMuMu_M_50

In [None]:
tdf.filter(regex='CR').sum()

In [None]:
172+263

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError. Presumably this
# cell is a deliberate stop so that "Run All" halts before the exploratory
# cells below -- confirm, and consider an explicit `raise` with a message.
break

In [None]:
df = pd.DataFrame(info_list)
df.to_csv('{}/data/info_dict_{}.csv'.format(output_dir, era))
df

In [None]:
existing_files

In [None]:
failed_files

In [None]:
beep_repeat()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
path = 'assets_feb_23/data/tw_2016_BFFZprimeToMuMu_M_250_dbs0p04.csv'
df = pd.read_csv(path)

In [None]:
for x in df:
    print(x)

In [None]:
df.shape

In [None]:
tdf = df

In [None]:
tdf = df[df.SR2_jet_nom_muon_corrected_pt_ele_pt==1]
hist = np.histogram(tdf.GenMultiplicity, bins=np.linspace(-.5,7.5, 9))
hist[0], sum(hist[0])

In [None]:
tdf = df[df.SR1_jet_nom_muon_corrected_pt_ele_pt==1]
hist = np.histogram(tdf.GenMultiplicity, bins=np.linspace(-.5,7.5, 9))
hist[0], sum(hist[0])

In [None]:
tdf.GenNbJets.hist(bins=np.linspace(-.5,7.5, 9))
plt.xlabel('n b-jets')
plt.title('SR1')


In [None]:
tdf.GenNsJets.hist(bins=np.linspace(-.5,7.5, 9))
plt.xlabel('n s-jets')
plt.title('SR1')

In [None]:
np.histogram(tdf.GenNsJets, bins=np.linspace(-.5,7.5, 9))

In [None]:
tdf = df[df.SR2_jet_nom_muon_corrected_pt_ele_pt==1]

In [None]:
tdf.GenNbJets.hist(bins=np.linspace(-.5,7.5, 9))
plt.xlabel('n b-jets')
plt.title('SR2')

In [None]:
tdf.GenNsJets.hist(bins=np.linspace(-.5,7.5, 9))
plt.xlabel('n s-jets')
plt.title('SR2')

In [None]:
df.Weight_MuonTriggerUp.mean()*100

In [None]:
df.Weight_BTagUp.mean()*100

In [None]:
import uproot as upr

In [None]:
skim_path = '/eos/cms/store/group/phys_exotica/bffZprime/nanoAODskimmed/crab_bffv2/2017/BFF_175_dbs0p5_deepflavour_bffv2/221019_071128/0000/tree_1.root'

In [None]:
upf = upr.open(skim_path)['Events']

In [None]:
upf.arrays(['Muon_effSF_sys_triggerUp',
           'Muon_effSF_trigger',
           'Muon_effSF_sys_triggerDown'], library='pd').mean()

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('data/tw_2016_BFFZprimeToMuMu_M_250_dbs0p04.csv')

In [None]:
[x for x in df.keys() if 'HLT' in x], [x for x in df.keys() if 'SR' in x]

In [None]:
df.SR2_jet_nom_muon_corrected_pt_ele_pt.sum(), (df.SR2_jet_nom_muon_corrected_pt_ele_pt*df.HLT_Mu50).sum()

In [None]:
df[[ 
    'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown'
    ,'AvgMuonRecoIdIsoSFPerMuon'
                ,'AvgMuonRocPer',
               'AvgJER', 'AvgJES', 'AvgHEM',
                'AvgPUIDWeightsPerJet', 'AvgBtagWeight']].mean()*100

In [None]:
df.AvgJER.mean()

In [None]:
from src.data_tools.get_file_list import get_file_df


In [None]:
# Keep only the files of one era.
# NOTE: `era` is rebound to an int here, whereas the processing cells above
# use the string '2016'; re-run the configuration cell before processing again.
era = 2016
file_df = get_file_df()
# reset_index() keeps the previous index as an 'index' column, as before.
file_df = file_df.loc[file_df.era == era].reset_index()

In [None]:
# For every file listed in file_df: read the csv, keep rows with AvgJER < 5 and
# record the column means (scaled by 100) together with the file's mass and
# dbs values.
# NOTE(review): 'AvgBtagWeight' must be present in the csv files; the csv
# writer in this notebook explicitly adds 'AvgBtagWeightCorr' /
# 'AvgBtagWeightUncorr' -- confirm the column exists.
summary_columns = [
    'MuonTriggerEff', 'Weight_MuonTriggerUp', 'Weight_MuonTriggerDown',
    'AvgMuonRecoIdIsoSFPerMuon',
    'AvgMuonRocPer',
    'AvgJER', 'AvgJES', 'AvgHEM',
    'AvgPUIDWeightsPerJet', 'AvgBtagWeight',
]

means_list = []
for _, row in file_df.iterrows():
    print(row.file)
    sample_df = pd.read_csv(row.file)
    sample_df = sample_df[sample_df.AvgJER<5]
    means_dict = (sample_df[summary_columns].mean()*100).to_dict()
    means_dict['mass'] = row.mass
    means_dict['dbs'] = row.dbs
    means_list.append(means_dict)

In [None]:
mean_df = pd.DataFrame(means_list)

In [None]:
mean_df.min().round(1)

In [None]:
mean_df.max().round(1)

In [None]:
import numpy as np

In [None]:
mean_df.AvgJER