In [19]:
!pip -q install "uproot>=5" awkward vector rich tqdm pandas pyarrow fastparquet matplotlib

In [20]:
import os, glob
from pathlib import Path

print("Top-level /kaggle/input entries:")
print(os.listdir("/kaggle/input"))

def quick_scan(p):
    print("\n---")
    print("Path:", p, "exists:", os.path.exists(p))
    if os.path.exists(p):
        # show a few files
        files = glob.glob(p + "/**/*", recursive=True)
        print("Total entries under path:", len(files))
        print("Sample:", files[:15])
        roots = glob.glob(p + "/**/*.root*", recursive=True)
        print("ROOT-like files found:", len(roots))
        print("ROOT sample:", roots[:10])
        idx = glob.glob(p + "/**/*file_index.json*", recursive=True)
        print("file_index.json* found:", len(idx))
        print("index sample:", idx[:10])

DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

for k,v in DATASET_PATHS.items():
    print(f"\nDataset key: {k}")
    quick_scan(v)


Top-level /kaggle/input entries:
['datasets']

Dataset key: SMS-TChiWZ_ZToLL

---
Path: /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll exists: True
Total entries under path: 4
Sample: ['/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_5', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_2', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgrap

In [21]:
# !pip -q install uproot awkward numpy
# NOTE: if your index contains root:// URIs, we'll install xrootd automatically (needs Internet ON).

import os, glob, json, re, subprocess, sys
import gzip
from pathlib import Path

import numpy as np
import awkward as ak
import uproot

# ----------------------------
# Inputs
# ----------------------------
DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

OUT_BASE = "/kaggle/working/derivedroot"
os.makedirs(OUT_BASE, exist_ok=True)

STEP_SIZE = 50_000     # uproot.iterate chunk size [web:14]
MAX_EVENTS = None      # set e.g. 200000 for testing
MAX_FILES  = None      # set e.g. 5 for testing

JET_PT_MIN = 30.0

FEATURES50 = [
    "nMuon","nElectron","nJet","MET_pt","MET_phi","MET_sumEt",
    "Muon_pt_0","Muon_eta_0","Muon_phi_0",
    "Muon_pt_1","Muon_eta_1","Muon_phi_1",
    "Electron_pt_0","Electron_eta_0","Electron_phi_0",
    "Electron_pt_1","Electron_eta_1","Electron_phi_1",
    "Jet_pt_0","Jet_eta_0","Jet_phi_0",
    "Jet_pt_1","Jet_eta_1","Jet_phi_1",
    "Jet_pt_2","Jet_eta_2","Jet_phi_2",
    "Jet_pt_3","Jet_eta_3","Jet_phi_3",
    "HT","ST",
    "M_ll","M_jj_01","M_jj_12",
    "delta_phi_MET_j0","delta_phi_MET_j1","min_delta_phi_MET_jets",
    "delta_R_j0_j1","delta_phi_ll","delta_R_ll",
    "Jet_btagDeepB_0","Jet_btagDeepB_1",
    "MT_lep_MET","HT_ratio","MET_pt_HT_ratio",
    "nJet_pt30","Jet_mass_0","LeadLepton_pt","sum_pt_leptons"
]

# ----------------------------
# Parse *_file_index.json_* robustly
# ----------------------------
ROOT_RE = re.compile(r'(root://[^\s"\']+?\.root(\?[^\s"\']+)?)|((http|https)://[^\s"\']+?\.root(\?[^\s"\']+)?)|([^\s"\']+?\.root(\?[^\s"\']+)?)',
                     re.IGNORECASE)

ROOT_PAT = re.compile(r'(root://[^\s"\']+?\.root(?:\?[^\s"\']+)?)|((?:https?://)[^\s"\']+?\.root(?:\?[^\s"\']+)?)|((?:/store/)[^\s"\']+?\.root)',
                      re.IGNORECASE)

DEFAULT_XROOTD_PREFIX = "root://xrootd-cms.infn.it//"  # used only when index gives "/store/..." [web:43]

def read_index_text(fp):
    """Read potentially-non-UTF8 index file robustly; also auto-decompress gzip by magic bytes."""
    with open(fp, "rb") as f:
        b = f.read()

    # gzip magic number starts with 1f 8b [web:63]
    if len(b) >= 2 and b[:2] == b"\x1f\x8b":
        b = gzip.decompress(b)

    # Try utf-8 first, then latin-1 (latin-1 never errors and preserves byte values) [web:58]
    try:
        return b.decode("utf-8")
    except UnicodeDecodeError:
        return b.decode("latin-1")
        

def extract_root_like_strings(text):
    out = []
    for m in ROOT_RE.finditer(text):
        s = next(g for g in m.groups() if g)
        out.append(s)
    return out

def collect_root_uris(dataset_dir):
    dataset_dir = str(dataset_dir)
    idx_files = sorted(glob.glob(os.path.join(dataset_dir, "**", "*file_index.json*"), recursive=True))
    if not idx_files:
        raise RuntimeError(f"No *file_index.json* files found under {dataset_dir}")

    uris = set()

    for fp in idx_files:
        txt = read_index_text(fp)

        # 1) Try JSON parse (works if the file is actually JSON)
        parsed = None
        try:
            parsed = json.loads(txt)
        except Exception:
            parsed = None

        if parsed is not None:
            blob = json.dumps(parsed)
            for m in ROOT_PAT.finditer(blob):
                s = next(g for g in m.groups() if g)
                uris.add(s)
        else:
            # 2) Otherwise regex scan raw text
            for m in ROOT_PAT.finditer(txt):
                s = next(g for g in m.groups() if g)
                uris.add(s)

    # Normalize "/store/..." into full root:// URIs (so uproot can open them via XRootD) [web:43]
    norm = []
    for u in sorted(uris):
        if u.startswith("/store/"):
            norm.append(DEFAULT_XROOTD_PREFIX + u.lstrip("/"))
        else:
            norm.append(u)

    return norm, idx_files

# ----------------------------
# Ensure XRootD if needed
# uproot needs the optional 'xrootd' dependency for root:// URIs [web:44][web:43]
# ----------------------------
def ensure_xrootd_if_needed(uris):
    need = any(u.lower().startswith("root://") for u in uris)
    if not need:
        return
    try:
        import XRootD  # noqa: F401
        return
    except Exception:
        print("[INFO] Installing xrootd (required for root:// URIs)...")
        subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", "xrootd"])
        import XRootD  # noqa: F401

# ----------------------------
# Physics helpers
# ----------------------------
def delta_phi(phi1, phi2):
    d = phi1 - phi2
    return ak.abs((d + np.pi) % (2*np.pi) - np.pi)

def delta_r(eta1, phi1, eta2, phi2):
    return ak.sqrt((eta1 - eta2)**2 + delta_phi(phi1, phi2)**2)

def pad_take(jagged, i, fill=0.0):
    padded = ak.pad_none(jagged, i+1, clip=True)
    return ak.fill_none(padded[:, i], fill)

def safe_div(num, den):
    return ak.where(den != 0, num/den, 0)

def vec_mass(pt1, eta1, phi1, m1, pt2, eta2, phi2, m2):
    px1 = pt1 * np.cos(phi1); py1 = pt1 * np.sin(phi1); pz1 = pt1 * np.sinh(eta1)
    px2 = pt2 * np.cos(phi2); py2 = pt2 * np.sin(phi2); pz2 = pt2 * np.sinh(eta2)
    e1 = ak.sqrt(px1*px1 + py1*py1 + pz1*pz1 + m1*m1)
    e2 = ak.sqrt(px2*px2 + py2*py2 + pz2*pz2 + m2*m2)
    px = px1 + px2; py = py1 + py2; pz = pz1 + pz2; e = e1 + e2
    m2tot = e*e - (px*px + py*py + pz*pz)
    return ak.sqrt(ak.where(m2tot > 0, m2tot, 0))

# ----------------------------
# Tree + btag detection
# ----------------------------
TREE_PREFERRED = ["Events", "tree", "ntuple", "T"]

def pick_tree_name(fin):
    keys = [k.split(";")[0] for k in fin.keys()]
    for cand in TREE_PREFERRED:
        if cand in keys:
            return cand
    for k in keys:
        try:
            obj = fin[k]
            if isinstance(obj, uproot.behaviors.TTree.TTree):
                return k
        except Exception:
            pass
    raise RuntimeError(f"No TTree found. Keys sample: {keys[:30]}")

def choose_btag_branch(branches_set):
    for cand in ["Jet_btagDeepB", "Jet_btagDeepFlavB", "Jet_btagCSVV2"]:
        if cand in branches_set:
            return cand
    return None

# ----------------------------
# Derive your 50 features from NanoAOD-style branches
# ----------------------------
MU_MASS = 0.105658
EL_MASS = 0.000511

def compute_features(chunk, btag_branch=None):
    # required branches default to empty jagged if absent
    n_events = len(next(iter(chunk.values())))

    Mu_pt  = chunk.get("Muon_pt",  ak.Array([[]]*n_events))
    Mu_eta = chunk.get("Muon_eta", ak.Array([[]]*n_events))
    Mu_phi = chunk.get("Muon_phi", ak.Array([[]]*n_events))

    El_pt  = chunk.get("Electron_pt",  ak.Array([[]]*n_events))
    El_eta = chunk.get("Electron_eta", ak.Array([[]]*n_events))
    El_phi = chunk.get("Electron_phi", ak.Array([[]]*n_events))

    J_pt   = chunk.get("Jet_pt",   ak.Array([[]]*n_events))
    J_eta  = chunk.get("Jet_eta",  ak.Array([[]]*n_events))
    J_phi  = chunk.get("Jet_phi",  ak.Array([[]]*n_events))
    J_mass = chunk.get("Jet_mass", ak.Array([[]]*n_events))

    MET_pt   = chunk.get("MET_pt",   ak.zeros(n_events))
    MET_phi  = chunk.get("MET_phi",  ak.zeros(n_events))
    MET_sumE = chunk.get("MET_sumEt", ak.zeros(n_events))

    nMuon = ak.num(Mu_pt, axis=1)
    nElectron = ak.num(El_pt, axis=1)
    nJet = ak.num(J_pt, axis=1)

    # Leading/subleading
    out = {}
    out["nMuon"] = ak.to_numpy(nMuon).astype(np.int32)
    out["nElectron"] = ak.to_numpy(nElectron).astype(np.int32)
    out["nJet"] = ak.to_numpy(nJet).astype(np.int32)

    out["MET_pt"] = ak.to_numpy(MET_pt).astype(np.float32)
    out["MET_phi"] = ak.to_numpy(MET_phi).astype(np.float32)
    out["MET_sumEt"] = ak.to_numpy(MET_sumE).astype(np.float32)

    out["Muon_pt_0"]  = ak.to_numpy(pad_take(Mu_pt, 0, 0.0)).astype(np.float32)
    out["Muon_eta_0"] = ak.to_numpy(pad_take(Mu_eta,0, 0.0)).astype(np.float32)
    out["Muon_phi_0"] = ak.to_numpy(pad_take(Mu_phi,0, 0.0)).astype(np.float32)
    out["Muon_pt_1"]  = ak.to_numpy(pad_take(Mu_pt, 1, 0.0)).astype(np.float32)
    out["Muon_eta_1"] = ak.to_numpy(pad_take(Mu_eta,1, 0.0)).astype(np.float32)
    out["Muon_phi_1"] = ak.to_numpy(pad_take(Mu_phi,1, 0.0)).astype(np.float32)

    out["Electron_pt_0"]  = ak.to_numpy(pad_take(El_pt, 0, 0.0)).astype(np.float32)
    out["Electron_eta_0"] = ak.to_numpy(pad_take(El_eta,0, 0.0)).astype(np.float32)
    out["Electron_phi_0"] = ak.to_numpy(pad_take(El_phi,0, 0.0)).astype(np.float32)
    out["Electron_pt_1"]  = ak.to_numpy(pad_take(El_pt, 1, 0.0)).astype(np.float32)
    out["Electron_eta_1"] = ak.to_numpy(pad_take(El_eta,1, 0.0)).astype(np.float32)
    out["Electron_phi_1"] = ak.to_numpy(pad_take(El_phi,1, 0.0)).astype(np.float32)

    for i in range(4):
        out[f"Jet_pt_{i}"]  = ak.to_numpy(pad_take(J_pt, i, 0.0)).astype(np.float32)
        out[f"Jet_eta_{i}"] = ak.to_numpy(pad_take(J_eta,i, 0.0)).astype(np.float32)
        out[f"Jet_phi_{i}"] = ak.to_numpy(pad_take(J_phi,i, 0.0)).astype(np.float32)

    out["Jet_mass_0"] = ak.to_numpy(pad_take(J_mass, 0, 0.0)).astype(np.float32)

    # HT, nJet_pt30
    HT = ak.sum(J_pt[J_pt > JET_PT_MIN], axis=1)
    nJet_pt30 = ak.sum(J_pt > JET_PT_MIN, axis=1)
    out["HT"] = ak.to_numpy(HT).astype(np.float32)
    out["nJet_pt30"] = ak.to_numpy(nJet_pt30).astype(np.int32)

    # LeadLepton_pt
    lead_mu_pt = pad_take(Mu_pt, 0, 0.0)
    lead_el_pt = pad_take(El_pt, 0, 0.0)
    LeadLepton_pt = ak.where(lead_mu_pt > lead_el_pt, lead_mu_pt, lead_el_pt)
    out["LeadLepton_pt"] = ak.to_numpy(LeadLepton_pt).astype(np.float32)

    # Two leading leptons with "prefer muons" rule:
    # - if >=2 muons: mu0, mu1
    # - elif mu>=1 and e>=1: mu0, e0
    # - elif e>=2: e0, e1
    # - else: none
    mu0_pt, mu0_eta, mu0_phi = pad_take(Mu_pt,0,0.0), pad_take(Mu_eta,0,0.0), pad_take(Mu_phi,0,0.0)
    mu1_pt, mu1_eta, mu1_phi = pad_take(Mu_pt,1,0.0), pad_take(Mu_eta,1,0.0), pad_take(Mu_phi,1,0.0)
    e0_pt,  e0_eta,  e0_phi  = pad_take(El_pt,0,0.0), pad_take(El_eta,0,0.0), pad_take(El_phi,0,0.0)
    e1_pt,  e1_eta,  e1_phi  = pad_take(El_pt,1,0.0), pad_take(El_eta,1,0.0), pad_take(El_phi,1,0.0)

    cond_mumu = nMuon >= 2
    cond_mue  = (nMuon >= 1) & (nElectron >= 1) & (~cond_mumu)
    cond_ee   = (nElectron >= 2) & (~cond_mumu) & (~cond_mue)

    l0_pt  = ak.where(cond_mumu | cond_mue, mu0_pt, ak.where(cond_ee, e0_pt, 0.0))
    l0_eta = ak.where(cond_mumu | cond_mue, mu0_eta, ak.where(cond_ee, e0_eta, 0.0))
    l0_phi = ak.where(cond_mumu | cond_mue, mu0_phi, ak.where(cond_ee, e0_phi, 0.0))
    l0_m   = ak.where(cond_mumu | cond_mue, MU_MASS, ak.where(cond_ee, EL_MASS, 0.0))

    l1_pt  = ak.where(cond_mumu, mu1_pt, ak.where(cond_mue, e0_pt, ak.where(cond_ee, e1_pt, 0.0)))
    l1_eta = ak.where(cond_mumu, mu1_eta, ak.where(cond_mue, e0_eta, ak.where(cond_ee, e1_eta, 0.0)))
    l1_phi = ak.where(cond_mumu, mu1_phi, ak.where(cond_mue, e0_phi, ak.where(cond_ee, e1_phi, 0.0)))
    l1_m   = ak.where(cond_mumu, MU_MASS, ak.where(cond_mue, EL_MASS, ak.where(cond_ee, EL_MASS, 0.0)))

    has2lep = cond_mumu | cond_mue | cond_ee
    out["sum_pt_leptons"] = ak.to_numpy(ak.where(has2lep, l0_pt + l1_pt, 0.0)).astype(np.float32)

    # ST
    out["ST"] = ak.to_numpy(HT + MET_pt + ak.where(has2lep, l0_pt + l1_pt, 0.0)).astype(np.float32)

    # M_ll, delta_phi_ll, delta_R_ll
    mll = ak.where(has2lep, vec_mass(l0_pt,l0_eta,l0_phi,l0_m, l1_pt,l1_eta,l1_phi,l1_m), 0.0)
    out["M_ll"] = ak.to_numpy(mll).astype(np.float32)

    out["delta_phi_ll"] = ak.to_numpy(ak.where(has2lep, delta_phi(l0_phi, l1_phi), 0.0)).astype(np.float32)
    out["delta_R_ll"]   = ak.to_numpy(ak.where(has2lep, delta_r(l0_eta,l0_phi, l1_eta,l1_phi), 0.0)).astype(np.float32)

    # Dijet masses
    j0_pt, j0_eta, j0_phi, j0_m = pad_take(J_pt,0,0.0), pad_take(J_eta,0,0.0), pad_take(J_phi,0,0.0), pad_take(J_mass,0,0.0)
    j1_pt, j1_eta, j1_phi, j1_m = pad_take(J_pt,1,0.0), pad_take(J_eta,1,0.0), pad_take(J_phi,1,0.0), pad_take(J_mass,1,0.0)
    j2_pt, j2_eta, j2_phi, j2_m = pad_take(J_pt,2,0.0), pad_take(J_eta,2,0.0), pad_take(J_phi,2,0.0), pad_take(J_mass,2,0.0)

    has2j = nJet >= 2
    has3j = nJet >= 3
    out["M_jj_01"] = ak.to_numpy(ak.where(has2j, vec_mass(j0_pt,j0_eta,j0_phi,j0_m, j1_pt,j1_eta,j1_phi,j1_m), 0.0)).astype(np.float32)
    out["M_jj_12"] = ak.to_numpy(ak.where(has3j, vec_mass(j1_pt,j1_eta,j1_phi,j1_m, j2_pt,j2_eta,j2_phi,j2_m), 0.0)).astype(np.float32)

    # Angular: MET vs jets
    has1j = nJet >= 1
    out["delta_phi_MET_j0"] = ak.to_numpy(ak.where(has1j, delta_phi(MET_phi, pad_take(J_phi,0,0.0)), 0.0)).astype(np.float32)
    out["delta_phi_MET_j1"] = ak.to_numpy(ak.where(has2j, delta_phi(MET_phi, pad_take(J_phi,1,0.0)), 0.0)).astype(np.float32)

    jphi_pad = ak.pad_none(J_phi, 4, clip=True)
    jphi0 = ak.fill_none(jphi_pad[:,0], 0.0)
    jphi1 = ak.fill_none(jphi_pad[:,1], 0.0)
    jphi2 = ak.fill_none(jphi_pad[:,2], 0.0)
    jphi3 = ak.fill_none(jphi_pad[:,3], 0.0)
    dphis = ak.stack([delta_phi(MET_phi, jphi0),
                      delta_phi(MET_phi, jphi1),
                      delta_phi(MET_phi, jphi2),
                      delta_phi(MET_phi, jphi3)], axis=1)
    out["min_delta_phi_MET_jets"] = ak.to_numpy(ak.where(has1j, ak.min(dphis, axis=1), 0.0)).astype(np.float32)

    # Angular: jets
    out["delta_R_j0_j1"] = ak.to_numpy(ak.where(has2j, delta_r(pad_take(J_eta,0,0.0), pad_take(J_phi,0,0.0),
                                                              pad_take(J_eta,1,0.0), pad_take(J_phi,1,0.0)), 0.0)).astype(np.float32)

    # B-tag (store into Jet_btagDeepB_* outputs even if source is DeepFlav/CSVV2)
    if btag_branch is not None and btag_branch in chunk:
        b = chunk[btag_branch]
        out["Jet_btagDeepB_0"] = ak.to_numpy(pad_take(b, 0, 0.0)).astype(np.float32)
        out["Jet_btagDeepB_1"] = ak.to_numpy(pad_take(b, 1, 0.0)).astype(np.float32)
    else:
        out["Jet_btagDeepB_0"] = np.zeros(n_events, dtype=np.float32)
        out["Jet_btagDeepB_1"] = np.zeros(n_events, dtype=np.float32)

    # MT_lep_MET using leading lepton = max(leading mu, leading e)
    # choose its phi accordingly
    lead_is_mu = lead_mu_pt >= lead_el_pt
    lead_phi = ak.where(lead_is_mu, pad_take(Mu_phi,0,0.0), pad_take(El_phi,0,0.0))
    lead_pt  = ak.where(lead_is_mu, lead_mu_pt, lead_el_pt)
    has1lep = (nMuon + nElectron) >= 1
    dphi_lep_met = ak.where(has1lep, delta_phi(lead_phi, MET_phi), 0.0)
    MT = ak.where(has1lep, ak.sqrt(2*lead_pt*MET_pt*(1 - ak.cos(dphi_lep_met))), 0.0)
    out["MT_lep_MET"] = ak.to_numpy(MT).astype(np.float32)

    # Ratios
    out["HT_ratio"] = ak.to_numpy(safe_div(HT, HT + MET_pt)).astype(np.float32)
    out["MET_pt_HT_ratio"] = ak.to_numpy(safe_div(MET_pt, HT)).astype(np.float32)

    # Sanity
    for k in FEATURES50:
        if k not in out:
            raise RuntimeError(f"Missing output feature '{k}'")

    return out

# ----------------------------
# Stream remote ROOT files and write derived ROOT
# ----------------------------
def write_derived_root(uris, out_root_path, step_size=50_000, max_events=None, max_files=None):
    if max_files is not None:
        uris = uris[:max_files]

    # Detect tree/branches from first readable URI
    tree_name, branches, btag_branch = None, None, None
    for u in uris:
        try:
            with uproot.open(u) as fin:
                tree_name = pick_tree_name(fin)
                branches = set(fin[tree_name].keys())
                btag_branch = choose_btag_branch(branches)
                break
        except Exception as e:
            continue
    if tree_name is None:
        raise RuntimeError("Could not open any URI from the index (network/xrootd/internet issue likely).")

    needed = [
        "Muon_pt","Muon_eta","Muon_phi",
        "Electron_pt","Electron_eta","Electron_phi",
        "Jet_pt","Jet_eta","Jet_phi","Jet_mass",
        "MET_pt","MET_phi","MET_sumEt",
    ]
    if btag_branch is not None:
        needed.append(btag_branch)
    expressions = [b for b in needed if b in branches]

    n_written = 0
    wrote_tree = False

    with uproot.recreate(out_root_path) as fout:  # creates/overwrites output ROOT [web:11]
        for u in uris:
            try:
                with uproot.open(u) as fin:
                    if tree_name not in [k.split(";")[0] for k in fin.keys()]:
                        continue
                    tree = fin[tree_name]

                    # iterate returns chunks; library="ak" uses Awkward Arrays [web:14]
                    for chunk in tree.iterate(expressions, step_size=step_size, library="ak"):
                        feats = compute_features(chunk, btag_branch=btag_branch)

                        if not wrote_tree:
                            branch_types = {k: v.dtype for k, v in feats.items()}
                            fout.mktree("Events", branch_types)
                            wrote_tree = True

                        fout["Events"].extend(feats)
                        n_written += len(next(iter(feats.values())))

                        if max_events is not None and n_written >= max_events:
                            return n_written
            except Exception as e:
                print(f"[WARN] Failed URI: {u}\n       {e}")

    return n_written

# ----------------------------
# Run all datasets
# ----------------------------
for ds_name, ds_path in DATASET_PATHS.items():
    ds_out_dir = os.path.join(OUT_BASE, ds_name)
    os.makedirs(ds_out_dir, exist_ok=True)

    uris, idx_files = collect_root_uris(ds_path)
    print(f"\n=== {ds_name} ===")
    print("Index files:", len(idx_files))
    print("ROOT URIs found:", len(uris))
    print("URI preview:", uris[:3])

    if not uris:
        raise RuntimeError(f"No ROOT URIs extracted from index files for {ds_name}.")

    ensure_xrootd_if_needed(uris)

    out_root = os.path.join(ds_out_dir, f"derived_{ds_name}.root")

    manifest = {
        "dataset": ds_name,
        "dataset_path": ds_path,
        "out_root": out_root,
        "index_files": idx_files,
        "n_uris": len(uris),
        "uri_preview": uris[:20],
        "step_size": STEP_SIZE,
        "max_events": MAX_EVENTS,
        "max_files": MAX_FILES,
        "features_out": FEATURES50,
    }
    with open(os.path.join(ds_out_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    n = write_derived_root(uris, out_root, step_size=STEP_SIZE, max_events=MAX_EVENTS, max_files=MAX_FILES)
    print(f"[DONE] events written: {n}")
    print(f"[DONE] output: {out_root}")



=== SMS-TChiWZ_ZToLL ===
Index files: 4
ROOT URIs found: 0
URI preview: []


RuntimeError: No ROOT URIs extracted from index files for SMS-TChiWZ_ZToLL.

In [23]:
import glob, os, binascii

ds = "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll"
idx = sorted(glob.glob(ds + "/**/*file_index.json*", recursive=True))
print("Index files:", len(idx))
print("First index:", idx[0])
print("Size (bytes):", os.path.getsize(idx[0]))

with open(idx[0], "rb") as f:
    head = f.read(64)
print("Head hex:", binascii.hexlify(head).decode())
print("Head bytes:", head)


Index files: 4
First index: /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0
Size (bytes): 2062410813
Head hex: 726f6f740010322100000064000000007aede43d000000007aede3f600000047000000010000003a08000000d1000000007aedd019000013dd000118ace12865
Head bytes: b'root\x00\x102!\x00\x00\x00d\x00\x00\x00\x00z\xed\xe4=\x00\x00\x00\x00z\xed\xe3\xf6\x00\x00\x00G\x00\x00\x00\x01\x00\x00\x00:\x08\x00\x00\x00\xd1\x00\x00\x00\x00z\xed\xd0\x19\x00\x00\x13\xdd\x00\x01\x18\xac\xe1(e'


In [None]:
uris, idx = collect_root_uris("/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll")
print("index files:", len(idx))
print("uris found:", len(uris))
print("preview:", uris[:10])
