In [None]:
import pandas as pd
import numpy as np
from bff_processor.data_tools import regex_select, get_files, make_df
from bff_processor.bff_meta import preselection, band_cut, isin, identity, all_reg, jet_variations
from bff_processor.bff_cuts import *
from glob import glob
import re

In [None]:
# Data-taking era; only files named tw_<era>_*.csv under data/ are picked up.
era = 2016
# Raw string so that `\.` reaches the regex engine as an escaped dot instead of
# being an invalid string escape. The capture group is the sample name.
# NOTE: this rebinds `regex_select`, shadowing the helper of the same name
# imported from bff_processor.data_tools; process_file uses the string value.
regex_select = r".+/tw_{}_(.+)\.csv".format(era)
files = [x for x in glob('data/*') if re.match(regex_select, x)]

# Partition the files into physics categories by filename pattern.
DY = [x for x in files if re.match('.+ZTo(?:Mu|EE).+', x)]
ST = [x for x in files if re.match('.+top.csv', x)]
VB = [x for x in files if re.match('.+mc_(?:ww|wz|zz)', x)]
TT = [x for x in files if re.match('.+ttbar', x)]
BFF = [x for x in files if re.match('.+BFFZp', x)]
data = [x for x in files if re.match('.+_data_', x)]

# Every file must land in exactly one category. Comparing the sorted lists
# (not just the lengths) also catches the case where one file is matched twice
# while another is not matched at all.
assert sorted(files) == sorted(DY+ST+TT+VB+data+BFF), "duplicate or uncaught file"

In [None]:
# Ordered [filename regex, sample type] pairs; consumers take the first
# pattern that matches a file path. All Monte Carlo background patterns share
# the 'bck' label, followed by the signal and data patterns.
sample_types = [
    [background_pattern, 'bck']
    for background_pattern in (
        '.+ZTo(?:Mu|EE).+',
        '.+top.csv',
        '.+mc_(?:ww|wz|zz)',
        '.+ttbar',
    )
] + [
    ['.+BFFZp', 'sig'],
    ['.+_data_', 'data'],
]

In [None]:
def process_file(file):
    """Load one sample CSV, apply the region selection and attach metadata.

    Parameters
    ----------
    file : str
        Path to a CSV file. It must match the module-level ``regex_select``
        pattern; the first capture group becomes the sample name.

    Returns
    -------
    pandas.DataFrame
        Rows with at least one non-zero ``SR*/CR*`` region column and
        ``deltaR > 0.4``, with ``Unnamed*`` columns dropped and the columns
        ``name``, ``dbs``, ``mass`` and ``type`` added.

    Raises
    ------
    IndexError
        If ``file`` does not match ``regex_select``.
    """
    name = re.findall(regex_select, file)[0]

    # Signal-sample parameters encoded in the name, e.g. "dbs0p04" -> 0.04 and
    # "_M_200" -> 200. They stay None unless exactly one match is found.
    dbs = re.findall(r'dbs(\dp\d+)', name)
    mass = re.findall(r'_M_(\d+)', name)
    dbs = float(dbs[0].replace('p', '.')) if len(dbs) == 1 else None
    mass = int(mass[0]) if len(mass) == 1 else None

    # First matching pattern in sample_types wins; 0 is kept as the fallback
    # label for files that match none of them.
    type_label = 0
    for re_sample, type_sample in sample_types:
        if re.match(re_sample, file):
            type_label = type_sample
            break

    # open file and filter out events with bff selection
    df = preselection()(pd.read_csv(file))

    # Multiply every region flag by the matching cut from bff_no_tmb_value,
    # keyed by the first digit of the region number and the jet variation.
    region_pattern = r'(?:SR|CR)\d+_.+'
    for reg in df.filter(regex=region_pattern):
        nJets, jv = re.findall(r'(?:SR|CR)(\d)\d*_(.+)', reg)[0]
        df[reg] = df[reg] * bff_no_tmb_value[nJets](df, jv)
    selected_events = df.filter(regex=region_pattern).sum(axis=1) > 0

    # Smallest jet-lepton separation over electrons and muons.
    df['deltaR'] = df[["minGoodJetElDR", "minGoodJetMuDR"]].min(axis=1)
    deltaR = df['deltaR'] > 0.4

    # Reports the fraction passing the region selection only (before deltaR).
    print("{} remaining".format(selected_events.mean()))
    df = df[selected_events & deltaR]

    # Drop the "Unnamed" columns read from the CSV. The explicit copy makes
    # the result an independent frame, so the metadata assignments below do
    # not trigger SettingWithCopyWarning.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()

    # add metadata
    df['name'] = name
    df['dbs'] = dbs
    df['mass'] = mass
    df['type'] = type_label
    return df


In [None]:
# Process every file first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously accumulated rows on each iteration.
processed_frames = []
for file in files:
    print(file)
    processed_frames.append(process_file(file))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found (the same result the incremental version produced).
combined_df = pd.concat(processed_frames) if processed_frames else pd.DataFrame()

In [None]:
# Store the TriggerWeight column as a boolean flag.
# NOTE(review): astype(bool) presumably turns any non-zero weight (and NaN)
# into True; confirm that is the intended meaning.
trigger_flags = combined_df['TriggerWeight'].astype(bool)
combined_df['TriggerWeight'] = trigger_flags

In [None]:
import pyarrow.feather as feather

# Persist the combined frame for this era in Feather format.
feather_path = 'data/combined_{}.feather'.format(era)
feather.write_feather(combined_df, feather_path)

In [None]:
# Persist the combined frame for this era in Parquet format as well.
parquet_path = 'data/combined_{}.parquet'.format(era)
combined_df.to_parquet(parquet_path)