# Phase 2 — Build certified training table (NanoAOD → Parquet)

We will:
1) load validated run/lumi JSON (muons-only)
2) stream ROOT files with uproot.iterate (memory-safe)
3) build dimuon candidates + features
4) add trigger labels from `HLT_*`
5) write Parquet shards for ML


In [1]:
# Cell 1 — Install deps (no XRootD needed)
!pip -q install "uproot>=5" awkward vector rich tqdm pandas pyarrow fastparquet matplotlib


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.8/393.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.7/656.7 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.2/181.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Cell P2-1 — Config
from pathlib import Path
import numpy as np
import pandas as pd
import awkward as ak
import uproot
import vector

vector.register_awkward()

BASE_PATH = Path("/kaggle/input/datasets/katakuricharlotte/doublemuon2016g-rootfiles/root_converted")
ROOT_FILES = sorted(map(str, BASE_PATH.glob("*.root")))
len(ROOT_FILES), ROOT_FILES[:3]


(5,
 ['/kaggle/input/datasets/katakuricharlotte/doublemuon2016g-rootfiles/root_converted/doublemuon2016g_0.root',
  '/kaggle/input/datasets/katakuricharlotte/doublemuon2016g-rootfiles/root_converted/doublemuon2016g_1.root',
  '/kaggle/input/datasets/katakuricharlotte/doublemuon2016g-rootfiles/root_converted/doublemuon2016g_2.root'])

In [3]:
# Cell P2-2 — Download validated runs JSON (muons only) and parse
# The dataset record links validated runs JSONs; for this project we use "muons only". [page:14]
import json, urllib.request

VALID_JSON_URL = "https://opendata.cern.ch/record/14221/files/Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON_MuonPhys.txt"
json_path = Path("/kaggle/working/validated_runs_muons_2016.txt")

if not json_path.exists():
    urllib.request.urlretrieve(VALID_JSON_URL, json_path)

with open(json_path, "r") as f:
    certified = json.load(f)

# certified is dict: { "run": [ [lumiStart, lumiEnd], ... ], ... }
len(certified), list(certified.items())[:1]


(423, [('273150', [[64, 75]])])

In [4]:
# Cell P2-3 — Fast certified filter helpers
def is_certified(run, lumi, certified_map):
    # run: int, lumi: int
    rs = str(int(run))
    if rs not in certified_map:
        return False
    for lo, hi in certified_map[rs]:
        if lo <= int(lumi) <= hi:
            return True
    return False

def mask_certified(runs, lumis, certified_map):
    # vectorized-ish for numpy arrays
    out = np.zeros(len(runs), dtype=bool)
    for i, (r, l) in enumerate(zip(runs, lumis)):
        out[i] = is_certified(r, l, certified_map)
    return out


In [5]:
# Cell P2-4 — Choose triggers to emulate (auto-detect, then pick 3)
# Record 30522 lists possible HLT trigger paths for this dataset. [page:14]
test = uproot.open(ROOT_FILES[0])["Events"]
branches = test.keys()

hlt = sorted([b for b in branches if b.startswith("HLT_") and "Mu" in b])
hlt[:50], len(hlt)


(['HLT_BTagMu_AK8Jet300_Mu5',
  'HLT_BTagMu_DiJet110_Mu5',
  'HLT_BTagMu_DiJet170_Mu5',
  'HLT_BTagMu_DiJet20_Mu5',
  'HLT_BTagMu_DiJet40_Mu5',
  'HLT_BTagMu_DiJet70_Mu5',
  'HLT_BTagMu_Jet300_Mu5',
  'HLT_DiMu9_Ele9_CaloIdL_TrackIdL',
  'HLT_DiPFJet40_DEta3p5_MJJ600_PFMETNoMu140',
  'HLT_DiPFJet40_DEta3p5_MJJ600_PFMETNoMu80',
  'HLT_Dimuon0_Jpsi_Muon',
  'HLT_Dimuon0_Upsilon_Muon',
  'HLT_DoubleIsoMu17_eta2p1',
  'HLT_DoubleIsoMu17_eta2p1_noDzCut',
  'HLT_DoubleMu18NoFiltersNoVtx',
  'HLT_DoubleMu23NoFiltersNoVtxDisplaced',
  'HLT_DoubleMu28NoFiltersNoVtxDisplaced',
  'HLT_DoubleMu33NoFiltersNoVtx',
  'HLT_DoubleMu38NoFiltersNoVtx',
  'HLT_DoubleMu3_PFMET50',
  'HLT_DoubleMu3_Trk_Tau3mu',
  'HLT_DoubleMu4_3_Bs',
  'HLT_DoubleMu4_3_Jpsi_Displaced',
  'HLT_DoubleMu4_JpsiTrk_Displaced',
  'HLT_DoubleMu4_LowMassNonResonantTrk_Displaced',
  'HLT_DoubleMu4_PsiPrimeTrk_Displaced',
  'HLT_DoubleMu8_Mass8_PFHT250',
  'HLT_DoubleMu8_Mass8_PFHT300',
  'HLT_FullTracks_Multiplicity100',
  'HLT_Ful

In [6]:
# Cell P2-5 — Pick 3 practical labels (prefer common dimuon triggers if present)
preferred = [
    "HLT_Mu17_Mu8",
    "HLT_Mu17_Mu8_DZ",
    "HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ",
]
LABELS = [x for x in preferred if x in branches]
if len(LABELS) < 1:
    # fallback: take the first few muon triggers present
    LABELS = hlt[:3]

LABELS


['HLT_Mu17_Mu8', 'HLT_Mu17_Mu8_DZ', 'HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ']

In [7]:
# Cell P2-6 — Feature engineering (dimuon-level) + write Parquet shards (FIXED)

import pyarrow as pa
import pyarrow.parquet as pq

MUON_MASS = 0.105658

FEATURES = [
    "run", "luminosityBlock", "event",
    "PV_npvs", "MET_pt",
    "Muon_pt", "Muon_eta", "Muon_phi", "Muon_charge",
    "Muon_pfRelIso03_all", "Muon_tightId", "Muon_mediumId",
]

# Keep only branches that exist
BASE_READ = [b for b in FEATURES if b in branches] + LABELS

OUT_DIR = Path("/kaggle/working/parquet_dimuon")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def build_dimuon_table(a):
    # Certified mask at event level
    runs = ak.to_numpy(a["run"])
    lumis = ak.to_numpy(a["luminosityBlock"])
    good = mask_certified(runs, lumis, certified)
    a = a[good]

    if len(a["run"]) == 0:
        return pd.DataFrame()

    # Build vector muons
    mu = vector.zip({
        "pt": a["Muon_pt"],
        "eta": a["Muon_eta"],
        "phi": a["Muon_phi"],
        "mass": ak.ones_like(a["Muon_pt"]) * MUON_MASS,
        "charge": a["Muon_charge"],
    })

    # quality: prefer mediumId if present
    qual = ak.ones_like(a["Muon_pt"], dtype=bool)
    if "Muon_mediumId" in a.fields:
        qual = qual & (a["Muon_mediumId"] == 1)
    elif "Muon_tightId" in a.fields:
        qual = qual & (a["Muon_tightId"] == 1)

    mu = mu[qual]

    # Opposite-sign pairs
    pairs = ak.combinations(mu, 2, fields=["m1", "m2"])
    os_pairs = pairs[(pairs.m1.charge * pairs.m2.charge) < 0]
    dimu = os_pairs.m1 + os_pairs.m2

    # If no pairs, return empty
    if ak.sum(ak.num(dimu.mass)) == 0:
        return pd.DataFrame()

    # Build per-pair feature table (event keys repeated for each pair)
    out = {
        "run": ak.flatten(ak.broadcast_arrays(a["run"], dimu.mass)[0]),
        "lumi": ak.flatten(ak.broadcast_arrays(a["luminosityBlock"], dimu.mass)[0]),
        "event": ak.flatten(ak.broadcast_arrays(a["event"], dimu.mass)[0]),
        "m_mumu": ak.to_numpy(ak.flatten(dimu.mass)),
        "pt_mumu": ak.to_numpy(ak.flatten(dimu.pt)),
        "eta_mumu": ak.to_numpy(ak.flatten(dimu.eta)),
        "dR_mumu": ak.to_numpy(ak.flatten(os_pairs.m1.deltaR(os_pairs.m2))),
    }

    # Add event-level context (repeat per pair)
    if "PV_npvs" in a.fields:
        out["PV_npvs"] = ak.to_numpy(ak.flatten(ak.broadcast_arrays(a["PV_npvs"], dimu.mass)[0]))
    if "MET_pt" in a.fields:
        out["MET_pt"] = ak.to_numpy(ak.flatten(ak.broadcast_arrays(a["MET_pt"], dimu.mass)[0]))

    # Add trigger labels (repeat per pair)
    for lab in LABELS:
        if lab in a.fields:
            out[lab] = ak.to_numpy(ak.flatten(ak.broadcast_arrays(a[lab], dimu.mass)[0])).astype(np.int8)

    return pd.DataFrame(out)

# Iterate files in chunks (safe for memory)
MAX_EVENTS_TOTAL = 1_000_000   # scale later
events_seen = 0
shard = 0

# FIX: specify tree as part of each file path
FILES = [f"{fp}:Events" for fp in ROOT_FILES]

for arrays in uproot.iterate(
    FILES,
    expressions=BASE_READ,
    step_size="100 MB",
    library="ak",
    allow_missing=True
):
    df = build_dimuon_table(arrays)
    if len(df) == 0:
        continue

    out_path = OUT_DIR / f"dimuon_shard_{shard:03d}.parquet"
    df.to_parquet(out_path, index=False)
    shard += 1

    events_seen += len(arrays["run"])
    if events_seen >= MAX_EVENTS_TOTAL:
        break

shard, events_seen


(1, 2315223)

In [8]:
import os, glob
sorted(glob.glob("/kaggle/working/parquet_dimuon/*.parquet"))[:10], len(glob.glob("/kaggle/working/parquet_dimuon/*.parquet"))


(['/kaggle/working/parquet_dimuon/dimuon_shard_000.parquet'], 1)

In [9]:
df[LABELS].mean()

HLT_Mu17_Mu8                           0.023145
HLT_Mu17_Mu8_DZ                        0.020059
HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ    0.452260
dtype: float64

In [10]:
df.head()

Unnamed: 0,run,lumi,event,m_mumu,pt_mumu,eta_mumu,dR_mumu,PV_npvs,MET_pt,HLT_Mu17_Mu8,HLT_Mu17_Mu8_DZ,HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ
0,278822,951,1657783742,0.717688,27.446304,0.381257,0.051071,12,27.271524,0,0,0
1,278822,951,1657783742,17.536045,5.466623,1.782907,3.233651,12,27.271524,0,0,0
2,278822,951,1658163927,0.328497,31.051395,-1.63333,0.00813,10,24.15033,0,0,1
3,278822,951,1658163927,15.175849,15.982114,-1.596437,2.230811,10,24.15033,0,0,1
4,278822,951,1658526011,19.699652,39.251945,-0.040246,0.976195,19,29.609695,0,0,1
