# Making stacked histograms
- processes an `events[year][ch][sample]` object using `make_events_dict()`
- uses `plot_hists()` to make stacked histograms

In [1]:
import glob
import os
import json
import pickle
import yaml
import math

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pyarrow.parquet as pq
from sklearn.metrics import auc, roc_curve
from scipy.special import softmax

import hist as hist2
import matplotlib.pyplot as plt
import mplhep as hep

plt.style.use(hep.style.CMS)

import sys
# Put ../python on the import path so local modules (presumably `utils`,
# imported right below) can be found. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate entries.
if "../python/" not in sys.path:
    sys.path.append("../python/")

import utils

plt.rcParams.update({"font.size": 20})

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [3]:
# Read the luminosity table (nested as luminosity[channel][year]) from the fileset JSON.
with open("../fileset/luminosity.json") as lumi_file:
    luminosity = json.loads(lumi_file.read())

# Display the table as the cell output.
luminosity

{'ele': {'Run2': 137640.0,
  '2016APV': 19492.72,
  '2016': 16809.96,
  '2017': 41476.02,
  '2018': 59816.23},
 'mu': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'lep': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96},
 'had': {'Run2': 137640.0,
  '2016APV': 19436.16,
  '2016': 16810.81,
  '2017': 41475.26,
  '2018': 59781.96}}

In [4]:
def get_lumi(years, channels):
    """Sum over ``years`` of the channel-averaged luminosity.

    For every year, the values ``luminosity[ch][year]`` (read from the
    notebook-level ``luminosity`` table) are each divided by 1000.0 and
    averaged over ``channels``; the per-year averages are then added up.
    NOTE(review): the 1/1000 factor presumably converts pb^-1 to fb^-1 - confirm.

    Args:
        years: iterable of year keys, e.g. ``["2017", "2018"]``.
        channels: sized collection of channel keys, e.g. ``["ele", "mu"]``.

    Returns:
        The accumulated luminosity; the integer ``0`` when ``years`` is empty.

    Raises:
        ZeroDivisionError: if ``channels`` is empty while ``years`` is not.
        KeyError: if a channel/year pair is missing from ``luminosity``.
    """
    total = 0
    for yr in years:
        # Scaled luminosity of each requested channel for this year.
        scaled = [luminosity[channel][yr] / 1000.0 for channel in channels]

        # Accumulate left to right, starting from 0, to keep the same
        # floating-point summation order as a plain running sum.
        year_sum = 0
        for value in scaled:
            year_sum = year_sum + value

        total = total + year_sum / len(channels)
    return total

In [5]:
! ls ../eos/Lumi_July17_2016/SingleElectron_Run2016G/outfiles/

0-20.parquet    180-200.pkl     280-300.parquet 380-400.pkl     480-500.parquet
0-20.pkl        20-40.parquet   280-300.pkl     40-60.parquet   480-500.pkl
100-120.parquet 20-40.pkl       300-320.parquet 40-60.pkl       500-520.parquet
100-120.pkl     200-220.parquet 300-320.pkl     400-420.parquet 500-520.pkl
120-140.parquet 200-220.pkl     320-340.parquet 400-420.pkl     520-540.parquet
120-140.pkl     220-240.parquet 320-340.pkl     420-440.parquet 520-540.pkl
140-160.parquet 220-240.pkl     340-360.parquet 420-440.pkl     60-80.parquet
140-160.pkl     240-260.parquet 340-360.pkl     440-460.parquet 60-80.pkl
160-180.parquet 240-260.pkl     360-380.parquet 440-460.pkl     80-100.parquet
160-180.pkl     260-280.parquet 360-380.pkl     460-480.parquet 80-100.pkl
180-200.parquet 260-280.pkl     380-400.parquet 460-480.pkl


In [6]:
# Data-taking years to process.
# Other available options (currently disabled): "2017", "2016", "2016APV".
years = ["2018"]

In [7]:
# Map each year to the directory holding its processed samples;
# all directories share the "../eos/Lumi_July17_<year>" naming pattern.
samples_dir = {
    tag: f"../eos/Lumi_July17_{tag}"
    for tag in ("2016", "2016APV", "2017", "2018")
}

In [8]:
# Build events_dict[year][sample]: one DataFrame per sample directory,
# read from all parquet files under <samples_dir[year]>/<sample>/outfiles/.
events_dict = {}
for year in years:
    print(year)

    events_dict[year] = {}
    for sample in os.listdir(samples_dir[year]):
        # glob returns files in arbitrary order; sort so the row order of the
        # combined frame is the same on every run.
        parquet_files = sorted(glob.glob(f"{samples_dir[year]}/{sample}/outfiles/*.parquet"))

        # os.listdir also returns stray entries (hidden files, samples without
        # any output). There is nothing to load for those, so skip them rather
        # than handing read_parquet an empty file list.
        if not parquet_files:
            print("   ", sample, "(no parquet files found, skipping)")
            continue

        print("   ", sample)
        events_dict[year][sample] = pd.read_parquet(parquet_files)

2018
    EGamma_Run2018A
    SingleMuon_Run2018A
    EGamma_Run2018C
    EGamma_Run2018D
    EGamma_Run2018B
    SingleMuon_Run2018C
    SingleMuon_Run2018D
    SingleMuon_Run2018B


In [None]:
# Report, per year and era, the percentage of rows that are exact duplicates
# of an earlier row in the same DataFrame.
for year in years:
    for era, df in events_dict[year].items():
        # Boolean mask flagging rows identical (in every column) to a previous row.
        is_duplicate = df.duplicated()
        duplicate_pct = 100 * is_duplicate.sum() / len(df)
        print(year, era, f"Duplicate events: {duplicate_pct:.2f}%")