In [55]:
!pip -q install "uproot>=5" awkward vector rich tqdm pandas pyarrow fastparquet matplotlib

In [56]:
import os, glob
from pathlib import Path

# Show what is mounted under the Kaggle input root before probing the
# individual dataset directories.
print("Top-level /kaggle/input entries:")
print(os.listdir("/kaggle/input"))

def quick_scan(p):
    """Print a short inventory of the directory ``p``.

    A separator line and the existence of ``p`` are always printed. When the
    path exists, three recursive searches are reported in turn (everything,
    ``*.root*`` entries, ``*file_index.json*`` entries), each as a count
    followed by a truncated sample of the matches.

    Returns None; the function only prints.
    """
    path_exists = os.path.exists(p)
    print("\n---")
    print("Path:", p, "exists:", path_exists)
    if not path_exists:
        return

    # (glob suffix, label for the count, label for the sample, sample size)
    reports = (
        ("/**/*", "Total entries under path:", "Sample:", 15),
        ("/**/*.root*", "ROOT-like files found:", "ROOT sample:", 10),
        ("/**/*file_index.json*", "file_index.json* found:", "index sample:", 10),
    )
    for suffix, count_label, sample_label, sample_size in reports:
        matches = glob.glob(p + suffix, recursive=True)
        print(count_label, len(matches))
        print(sample_label, matches[:sample_size])

# Kaggle input directory for each sample, keyed by a short sample name.
DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

# Print an inventory of every dataset directory so that missing or
# unexpectedly named inputs are visible before any processing starts.
for k,v in DATASET_PATHS.items():
    print(f"\nDataset key: {k}")
    quick_scan(v)


Top-level /kaggle/input entries:
['datasets']

Dataset key: SMS-TChiWZ_ZToLL

---
Path: /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll exists: True
Total entries under path: 4
Sample: ['/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_5', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_2', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgrap

In [57]:
import glob, os, binascii

# Probe the first "*file_index.json*" entry of one dataset: report how many
# such entries exist, the size of the first one, and its leading 64 bytes so
# the on-disk format can be identified from the magic number.
ds = "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll"
idx = sorted(glob.glob(ds + "/**/*file_index.json*", recursive=True))
print("Index files:", len(idx))

first_index = idx[0]
print("First index:", first_index)
print("Size (bytes):", os.path.getsize(first_index))

with open(first_index, "rb") as f:
    head = f.read(64)

# bytes.hex() yields the same lowercase hex string as hexlify(...).decode().
print("Head hex:", head.hex())
print("Head bytes:", head)


Index files: 4
First index: /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0
Size (bytes): 2062410813
Head hex: 726f6f740010322100000064000000007aede43d000000007aede3f600000047000000010000003a08000000d1000000007aedd019000013dd000118ace12865
Head bytes: b'root\x00\x102!\x00\x00\x00d\x00\x00\x00\x00z\xed\xe4=\x00\x00\x00\x00z\xed\xe3\xf6\x00\x00\x00G\x00\x00\x00\x01\x00\x00\x00:\x08\x00\x00\x00\xd1\x00\x00\x00\x00z\xed\xd0\x19\x00\x00\x13\xdd\x00\x01\x18\xac\xe1(e'


In [58]:
import os, glob, re, gzip, bz2, lzma, zipfile

# Pattern for ROOT file locations embedded in raw bytes. Three alternatives,
# each captured in its own group (exactly one group is non-empty per match):
#   1. root://...  XRootD-style URIs ending in ".root", optional "?query"
#   2. http(s)://... URLs ending in ".root", optional "?query"
#   3. /store/...  bare paths ending in ".root"
# re.IGNORECASE makes the scheme and the ".root" suffix case-insensitive.
ROOT_BYTES_RE = re.compile(
    rb'(root://[A-Za-z0-9_.:/-]+?\.root(?:\?[A-Za-z0-9_.=&%-]+)?)'
    rb'|((?:https?://)[A-Za-z0-9_.:/%-]+?\.root(?:\?[A-Za-z0-9_.=&%-]+)?)'
    rb'|(/store/[A-Za-z0-9_./-]+?\.root)',
    re.IGNORECASE
)

# Prefix that collect_root_uris() puts in front of bare "/store/..." paths.
EOSPUBLIC_PREFIX = "root://eospublic.cern.ch//"  # eospublic XRootD endpoint (CERN Open Data)

def maybe_decompress(data: bytes, fp_hint: str = "") -> bytes:
    """Return ``data`` decompressed if it starts with a known container magic.

    Recognised formats (checked in this order): gzip, bzip2, xz, zip and
    Zstandard. Anything else is returned unchanged.

    Parameters
    ----------
    data:
        The complete file contents.
    fp_hint:
        Path the bytes were read from. Kept for backward compatibility only;
        the zip branch now reads the archive from ``data`` itself, so the
        function works without a path and never re-reads the file from disk.

    Returns
    -------
    bytes
        The decompressed payload, or ``data`` itself when no decompression
        applies. For zip archives only the first member is returned; an empty
        archive yields ``data`` unchanged.

    Raises
    ------
    OSError, EOFError, lzma.LZMAError
        Propagated from the gzip / bz2 / lzma decoders when the payload has the
        right magic but is corrupt.
    """
    import io  # local import keeps this function self-contained within the notebook

    # gzip magic: 1f 8b
    if data[:2] == b"\x1f\x8b":
        return gzip.decompress(data)
    # bzip2 magic: "BZh"
    if data[:3] == b"BZh":
        return bz2.decompress(data)
    # xz magic: fd 37 7a 58 5a 00
    if data[:6] == b"\xfd7zXZ\x00":
        return lzma.decompress(data)

    # zip magic: "PK"
    if data[:2] == b"PK":
        # Open the archive from memory: the caller already holds the bytes,
        # and fp_hint may be empty or point somewhere else entirely.
        try:
            with zipfile.ZipFile(io.BytesIO(data), "r") as archive:
                members = archive.namelist()
                if not members:
                    return data
                return archive.read(members[0])
        except zipfile.BadZipFile:
            # A two-byte magic is a weak signal; payloads that only happen to
            # start with "PK" are treated as uncompressed.
            return data

    # zstd magic: 28 b5 2f fd (optional dependency)
    if data[:4] == b"\x28\xb5\x2f\xfd":
        try:
            import zstandard as zstd
            return zstd.ZstdDecompressor().decompress(data)
        except Exception:
            # Best effort: without the `zstandard` package (or if decoding
            # fails) the raw bytes are handed back.
            # To enable this branch: pip install zstandard
            return data

    return data

def collect_root_uris(dataset_dir):
    """Scan every ``*file_index.json*`` under ``dataset_dir`` for ROOT locations.

    Each index file is read fully, passed through ``maybe_decompress`` and
    searched with ``ROOT_BYTES_RE``. Matches are de-duplicated and sorted, and
    bare ``/store/...`` paths are turned into URIs by prepending
    ``EOSPUBLIC_PREFIX``.

    Returns
    -------
    (list[str], list[str])
        The normalised, sorted URIs and the sorted list of index files scanned.

    Raises
    ------
    RuntimeError
        If no index file is found under ``dataset_dir``.
    """
    pattern = os.path.join(dataset_dir, "**", "*file_index.json*")
    idx_files = sorted(glob.glob(pattern, recursive=True))
    if not idx_files:
        raise RuntimeError(f"No *file_index.json* under {dataset_dir}")

    found = set()
    for index_path in idx_files:
        with open(index_path, "rb") as handle:
            payload = handle.read()
        payload = maybe_decompress(payload, fp_hint=index_path)

        for match in ROOT_BYTES_RE.finditer(payload):
            # Exactly one alternative of the pattern captured something.
            hit = next(group for group in match.groups() if group)
            found.add(hit.decode("ascii", errors="ignore"))

    # CMS-style logical paths (/store/...) become root:// URIs; the rest pass through.
    norm = [
        EOSPUBLIC_PREFIX + uri.lstrip("/") if uri.startswith("/store/") else uri
        for uri in sorted(found)
    ]
    return norm, idx_files


In [59]:
# !pip -q install uproot awkward numpy

import os, glob, json
import numpy as np
import awkward as ak
import uproot

# ----------------------------
# Configuration: input datasets, output location, processing knobs
# ----------------------------
# Kaggle input directory for each sample, keyed by the sample name that is
# also used for the per-dataset output folder and file name.
DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

# Root folder for the derived ROOT files; created here if it does not exist.
OUT_BASE = "/kaggle/working/derivedroot"
os.makedirs(OUT_BASE, exist_ok=True)

STEP_SIZE = 50_000    # passed as step_size to write_derived_root()
MAX_EVENTS = None     # set e.g. 200_000 for quick testing
JET_PT_MIN = 30.0     # Jet_pt threshold used for HT and nJet_pt30

# Names of the 50 output features. compute_features() verifies that each of
# them is produced, and the list is also recorded in every manifest.json.
FEATURES50 = [
    "nMuon","nElectron","nJet","MET_pt","MET_phi","MET_sumEt",
    "Muon_pt_0","Muon_eta_0","Muon_phi_0",
    "Muon_pt_1","Muon_eta_1","Muon_phi_1",
    "Electron_pt_0","Electron_eta_0","Electron_phi_0",
    "Electron_pt_1","Electron_eta_1","Electron_phi_1",
    "Jet_pt_0","Jet_eta_0","Jet_phi_0",
    "Jet_pt_1","Jet_eta_1","Jet_phi_1",
    "Jet_pt_2","Jet_eta_2","Jet_phi_2",
    "Jet_pt_3","Jet_eta_3","Jet_phi_3",
    "HT","ST",
    "M_ll","M_jj_01","M_jj_12",
    "delta_phi_MET_j0","delta_phi_MET_j1","min_delta_phi_MET_jets",
    "delta_R_j0_j1","delta_phi_ll","delta_R_ll",
    "Jet_btagDeepB_0","Jet_btagDeepB_1",
    "MT_lep_MET","HT_ratio","MET_pt_HT_ratio",
    "nJet_pt30","Jet_mass_0","LeadLepton_pt","sum_pt_leptons"
]

# ----------------------------
# Utilities (NumPy math ufuncs everywhere)
# ----------------------------
def is_root_file(fp):
    """Return True when the file at ``fp`` begins with the 4-byte magic ``b"root"``.

    Any error while opening or reading (missing file, directory, permission
    problem, ...) is reported as False rather than raised.
    """
    magic = b""
    try:
        with open(fp, "rb") as handle:
            magic = handle.read(4)
    except Exception:
        return False
    return magic == b"root"

def collect_input_root_files(dataset_dir):
    """Return the sorted, de-duplicated ROOT inputs found under ``dataset_dir``.

    Two recursive searches are combined: every path matching ``*.root*``, and
    every path matching ``*file_index.json_*`` whose content passes
    ``is_root_file`` (ROOT files stored under an index-like name).
    """
    by_name = glob.glob(os.path.join(dataset_dir, "**", "*.root*"), recursive=True)
    index_named = glob.glob(os.path.join(dataset_dir, "**", "*file_index.json_*"), recursive=True)
    by_magic = [path for path in index_named if is_root_file(path)]
    return sorted(set(by_name) | set(by_magic))

def delta_phi(phi1, phi2):
    """Absolute azimuthal separation between ``phi1`` and ``phi2``, in [0, pi].

    The raw difference is wrapped into [-pi, pi) before the absolute value is
    taken. Works element-wise on scalars and arrays.
    """
    full_turn = 2*np.pi
    wrapped = (phi1 - phi2 + np.pi) % full_turn - np.pi
    return np.abs(wrapped)

def delta_r(eta1, phi1, eta2, phi2):
    """Angular distance sqrt(d_eta**2 + d_phi**2) between two directions.

    ``d_phi`` comes from ``delta_phi`` and is therefore already wrapped.
    """
    d_eta = eta1 - eta2
    d_phi = delta_phi(phi1, phi2)
    return np.sqrt(d_eta**2 + d_phi**2)

def pad_take(jagged, i, fill=0.0):
    """Take element ``i`` from every sub-list of ``jagged``, using ``fill`` where absent.

    Each sub-list is padded/clipped to length ``i + 1`` with ``None``; column
    ``i`` is then selected and the remaining ``None`` entries are replaced by
    ``fill``.
    """
    column = ak.pad_none(jagged, i+1, clip=True)[:, i]
    return ak.fill_none(column, fill)

def safe_div(num, den):
    """Element-wise ``num / den`` that yields 0 wherever ``den`` is 0.

    Zero denominators are swapped for 1 before dividing so the division itself
    never sees a zero; those positions are then overwritten with 0.
    """
    nonzero = den != 0
    quotient = num / ak.where(nonzero, den, 1)
    return ak.where(nonzero, quotient, 0)


def vec_mass(pt1, eta1, phi1, m1, pt2, eta2, phi2, m2):
    """Invariant mass of the sum of two (pt, eta, phi, mass) four-vectors.

    Each input is converted to Cartesian (px, py, pz, E), the two are added,
    and sqrt(E^2 - |p|^2) is returned. Negative squared masses are clamped to
    0 before the square root.
    """
    def to_cartesian(pt, eta, phi, mass):
        # (pt, eta, phi, mass) -> (px, py, pz, E)
        px = pt * np.cos(phi)
        py = pt * np.sin(phi)
        pz = pt * np.sinh(eta)
        energy = np.sqrt(px*px + py*py + pz*pz + mass*mass)
        return px, py, pz, energy

    px1, py1, pz1, e1 = to_cartesian(pt1, eta1, phi1, m1)
    px2, py2, pz2, e2 = to_cartesian(pt2, eta2, phi2, m2)

    px = px1 + px2
    py = py1 + py2
    pz = pz1 + pz2
    e = e1 + e2

    m2tot = e*e - (px*px + py*py + pz*pz)
    return np.sqrt(ak.where(m2tot > 0, m2tot, 0))

def pick_tree_name(fin):
    """Choose the name of the tree to read from the opened file ``fin``.

    Key names are compared with their ``;cycle`` suffix removed. Well-known
    names are tried first, in the order "Events", "tree", "ntuple", "T". If
    none is present, the first key whose object is an uproot TTree is used;
    keys that fail to load or check are skipped.

    Raises
    ------
    RuntimeError
        If no tree can be identified.
    """
    names = []
    for raw_key in fin.keys():
        names.append(raw_key.split(";")[0])

    preferred = ("Events", "tree", "ntuple", "T")
    hit = next((cand for cand in preferred if cand in names), None)
    if hit is not None:
        return hit

    for name in names:
        try:
            is_tree = isinstance(fin[name], uproot.behaviors.TTree.TTree)
        except Exception:
            continue
        if is_tree:
            return name

    raise RuntimeError(f"No TTree found. Keys sample: {names[:30]}")

def choose_btag_branch(branches_set):
    """Return the first available b-tag branch name, or None if there is none.

    Preference order: "Jet_btagDeepB", "Jet_btagDeepFlavB", "Jet_btagCSVV2".
    """
    preference = ("Jet_btagDeepB", "Jet_btagDeepFlavB", "Jet_btagCSVV2")
    return next((name for name in preference if name in branches_set), None)

# Lepton rest masses used by compute_features() when building the dilepton
# invariant mass (M_ll). Values correspond to the muon and electron masses
# expressed in GeV.
MU_MASS = 0.105658
EL_MASS = 0.000511

# ----------------------------
# FULL 50-feature builder
# `events` is an ak.Array of records, as yielded by uproot.iterate(..., library="ak")
# ----------------------------
def compute_features(events, btag_branch=None):
    """Build the FEATURES50 columns for one chunk of events.

    Parameters
    ----------
    events:
        Record array for one chunk; must support ``len()``, ``.fields`` and
        lookup by branch name. Branches that are absent are replaced by empty
        per-event lists (object branches) or zeros (MET branches).
    btag_branch:
        Name of the per-jet b-tag branch to read, or None. Whatever branch is
        given, its values are stored under the ``Jet_btagDeepB_*`` keys; if it
        is None or not present in ``events`` those columns are all zero.

    Returns
    -------
    dict[str, numpy.ndarray]
        One array per feature name, each of length ``len(events)``. Counts are
        int32, everything else float32. Quantities that are undefined for an
        event (missing object, fewer than two leptons/jets, ...) are 0.

    Raises
    ------
    RuntimeError
        If a name from FEATURES50 was not produced or has the wrong length.
    """
    n_events = len(events)
    fields = set(events.fields)

    def get(name, default):
        # Branch `name` if the chunk has it, otherwise the supplied stand-in.
        return events[name] if name in fields else default

    # Stand-ins for missing branches: one empty list per event / one zero per event.
    empty_jagged = ak.Array([[]] * n_events)
    zeros = np.zeros(n_events, dtype=np.float32)

    Mu_pt  = get("Muon_pt",  empty_jagged)
    Mu_eta = get("Muon_eta", empty_jagged)
    Mu_phi = get("Muon_phi", empty_jagged)

    El_pt  = get("Electron_pt",  empty_jagged)
    El_eta = get("Electron_eta", empty_jagged)
    El_phi = get("Electron_phi", empty_jagged)

    J_pt   = get("Jet_pt",   empty_jagged)
    J_eta  = get("Jet_eta",  empty_jagged)
    J_phi  = get("Jet_phi",  empty_jagged)
    J_mass = get("Jet_mass", empty_jagged)

    MET_pt   = get("MET_pt",    zeros)
    MET_phi  = get("MET_phi",   zeros)
    MET_sumE = get("MET_sumEt", zeros)

    # Multiplicities are taken from the lengths of the pt lists.
    nMuon = ak.num(Mu_pt, axis=1)
    nElectron = ak.num(El_pt, axis=1)
    nJet = ak.num(J_pt, axis=1)

    out = {}

    # Object counts and MET
    out["nMuon"] = ak.to_numpy(nMuon).astype(np.int32)
    out["nElectron"] = ak.to_numpy(nElectron).astype(np.int32)
    out["nJet"] = ak.to_numpy(nJet).astype(np.int32)
    out["MET_pt"] = ak.to_numpy(MET_pt).astype(np.float32)
    out["MET_phi"] = ak.to_numpy(MET_phi).astype(np.float32)
    out["MET_sumEt"] = ak.to_numpy(MET_sumE).astype(np.float32)

    # Kinematics of the first two muons and first two electrons (0 if absent)
    out["Muon_pt_0"]  = ak.to_numpy(pad_take(Mu_pt, 0, 0.0)).astype(np.float32)
    out["Muon_eta_0"] = ak.to_numpy(pad_take(Mu_eta,0, 0.0)).astype(np.float32)
    out["Muon_phi_0"] = ak.to_numpy(pad_take(Mu_phi,0, 0.0)).astype(np.float32)
    out["Muon_pt_1"]  = ak.to_numpy(pad_take(Mu_pt, 1, 0.0)).astype(np.float32)
    out["Muon_eta_1"] = ak.to_numpy(pad_take(Mu_eta,1, 0.0)).astype(np.float32)
    out["Muon_phi_1"] = ak.to_numpy(pad_take(Mu_phi,1, 0.0)).astype(np.float32)

    out["Electron_pt_0"]  = ak.to_numpy(pad_take(El_pt, 0, 0.0)).astype(np.float32)
    out["Electron_eta_0"] = ak.to_numpy(pad_take(El_eta,0, 0.0)).astype(np.float32)
    out["Electron_phi_0"] = ak.to_numpy(pad_take(El_phi,0, 0.0)).astype(np.float32)
    out["Electron_pt_1"]  = ak.to_numpy(pad_take(El_pt, 1, 0.0)).astype(np.float32)
    out["Electron_eta_1"] = ak.to_numpy(pad_take(El_eta,1, 0.0)).astype(np.float32)
    out["Electron_phi_1"] = ak.to_numpy(pad_take(El_phi,1, 0.0)).astype(np.float32)

    # Kinematics of the first four jets (0 if absent)
    for i in range(4):
        out[f"Jet_pt_{i}"]  = ak.to_numpy(pad_take(J_pt, i, 0.0)).astype(np.float32)
        out[f"Jet_eta_{i}"] = ak.to_numpy(pad_take(J_eta,i, 0.0)).astype(np.float32)
        out[f"Jet_phi_{i}"] = ak.to_numpy(pad_take(J_phi,i, 0.0)).astype(np.float32)

    # Mass of the first jet
    out["Jet_mass_0"] = ak.to_numpy(pad_take(J_mass, 0, 0.0)).astype(np.float32)

    # HT = scalar sum of Jet_pt over jets above JET_PT_MIN; nJet_pt30 counts those jets
    HT = ak.sum(J_pt[J_pt > JET_PT_MIN], axis=1)
    nJet_pt30 = ak.sum(J_pt > JET_PT_MIN, axis=1)
    out["HT"] = ak.to_numpy(HT).astype(np.float32)
    out["nJet_pt30"] = ak.to_numpy(nJet_pt30).astype(np.int32)

    # LeadLepton_pt: the larger of the first-muon and first-electron pt
    lead_mu_pt = pad_take(Mu_pt, 0, 0.0)
    lead_el_pt = pad_take(El_pt, 0, 0.0)
    LeadLepton_pt = ak.where(lead_mu_pt > lead_el_pt, lead_mu_pt, lead_el_pt)
    out["LeadLepton_pt"] = ak.to_numpy(LeadLepton_pt).astype(np.float32)

    # Lepton pair (l0, l1) by flavour preference, not by pt:
    #   mu+mu if there are >= 2 muons, else mu+e, else e+e.
    mu0_pt, mu0_eta, mu0_phi = pad_take(Mu_pt,0,0.0), pad_take(Mu_eta,0,0.0), pad_take(Mu_phi,0,0.0)
    mu1_pt, mu1_eta, mu1_phi = pad_take(Mu_pt,1,0.0), pad_take(Mu_eta,1,0.0), pad_take(Mu_phi,1,0.0)
    e0_pt,  e0_eta,  e0_phi  = pad_take(El_pt,0,0.0), pad_take(El_eta,0,0.0), pad_take(El_phi,0,0.0)
    e1_pt,  e1_eta,  e1_phi  = pad_take(El_pt,1,0.0), pad_take(El_eta,1,0.0), pad_take(El_phi,1,0.0)

    # The three cases are mutually exclusive by construction.
    cond_mumu = nMuon >= 2
    cond_mue  = (nMuon >= 1) & (nElectron >= 1) & (~cond_mumu)
    cond_ee   = (nElectron >= 2) & (~cond_mumu) & (~cond_mue)
    has2lep = cond_mumu | cond_mue | cond_ee

    # l0 is the first muon in the mu+mu and mu+e cases, the first electron in e+e.
    l0_pt  = ak.where(cond_mumu | cond_mue, mu0_pt, ak.where(cond_ee, e0_pt, 0.0))
    l0_eta = ak.where(cond_mumu | cond_mue, mu0_eta, ak.where(cond_ee, e0_eta, 0.0))
    l0_phi = ak.where(cond_mumu | cond_mue, mu0_phi, ak.where(cond_ee, e0_phi, 0.0))
    l0_m   = ak.where(cond_mumu | cond_mue, MU_MASS, ak.where(cond_ee, EL_MASS, 0.0))

    # l1 is the second muon (mu+mu), the first electron (mu+e) or the second electron (e+e).
    l1_pt  = ak.where(cond_mumu, mu1_pt, ak.where(cond_mue, e0_pt, ak.where(cond_ee, e1_pt, 0.0)))
    l1_eta = ak.where(cond_mumu, mu1_eta, ak.where(cond_mue, e0_eta, ak.where(cond_ee, e1_eta, 0.0)))
    l1_phi = ak.where(cond_mumu, mu1_phi, ak.where(cond_mue, e0_phi, ak.where(cond_ee, e1_phi, 0.0)))
    l1_m   = ak.where(cond_mumu, MU_MASS, ak.where(cond_mue, EL_MASS, ak.where(cond_ee, EL_MASS, 0.0)))

    sum_pt_leptons = ak.where(has2lep, l0_pt + l1_pt, 0.0)
    out["sum_pt_leptons"] = ak.to_numpy(sum_pt_leptons).astype(np.float32)

    # ST = HT + MET + pt sum of the selected lepton pair
    ST = HT + MET_pt + sum_pt_leptons
    out["ST"] = ak.to_numpy(ST).astype(np.float32)

    # Invariant masses: dilepton, then jet pairs (0,1) and (1,2)
    out["M_ll"] = ak.to_numpy(ak.where(has2lep, vec_mass(l0_pt,l0_eta,l0_phi,l0_m, l1_pt,l1_eta,l1_phi,l1_m), 0.0)).astype(np.float32)

    j0_pt, j0_eta, j0_phi, j0_m = pad_take(J_pt,0,0.0), pad_take(J_eta,0,0.0), pad_take(J_phi,0,0.0), pad_take(J_mass,0,0.0)
    j1_pt, j1_eta, j1_phi, j1_m = pad_take(J_pt,1,0.0), pad_take(J_eta,1,0.0), pad_take(J_phi,1,0.0), pad_take(J_mass,1,0.0)
    j2_pt, j2_eta, j2_phi, j2_m = pad_take(J_pt,2,0.0), pad_take(J_eta,2,0.0), pad_take(J_phi,2,0.0), pad_take(J_mass,2,0.0)

    # Jet-multiplicity flags use all jets (nJet), not only those above JET_PT_MIN.
    has2j = nJet >= 2
    has3j = nJet >= 3
    out["M_jj_01"] = ak.to_numpy(ak.where(has2j, vec_mass(j0_pt,j0_eta,j0_phi,j0_m, j1_pt,j1_eta,j1_phi,j1_m), 0.0)).astype(np.float32)
    out["M_jj_12"] = ak.to_numpy(ak.where(has3j, vec_mass(j1_pt,j1_eta,j1_phi,j1_m, j2_pt,j2_eta,j2_phi,j2_m), 0.0)).astype(np.float32)

    # Azimuthal separation between MET and the first two jets
    has1j = nJet >= 1
    out["delta_phi_MET_j0"] = ak.to_numpy(ak.where(has1j, delta_phi(MET_phi, pad_take(J_phi,0,0.0)), 0.0)).astype(np.float32)
    out["delta_phi_MET_j1"] = ak.to_numpy(ak.where(has2j, delta_phi(MET_phi, pad_take(J_phi,1,0.0)), 0.0)).astype(np.float32)

    # Minimum over the first four jets. Missing jets get the sentinel `big`
    # (larger than any delta_phi, which is at most pi) so they never win the minimum.
    big = 10.0
    dphi0 = ak.where(nJet > 0, delta_phi(MET_phi, pad_take(J_phi,0,0.0)), big)
    dphi1 = ak.where(nJet > 1, delta_phi(MET_phi, pad_take(J_phi,1,0.0)), big)
    dphi2 = ak.where(nJet > 2, delta_phi(MET_phi, pad_take(J_phi,2,0.0)), big)
    dphi3 = ak.where(nJet > 3, delta_phi(MET_phi, pad_take(J_phi,3,0.0)), big)
    # NOTE(review): building an ak.Array from a Python list of arrays may be
    # slow for large chunks; an element-wise minimum could be cheaper - confirm.
    min_dphi = ak.min(ak.Array([dphi0, dphi1, dphi2, dphi3]), axis=0)
    out["min_delta_phi_MET_jets"] = ak.to_numpy(ak.where(has1j, min_dphi, 0.0)).astype(np.float32)

    # Angular separations: leading jet pair and the selected lepton pair
    out["delta_R_j0_j1"] = ak.to_numpy(
        ak.where(has2j, delta_r(pad_take(J_eta,0,0.0), pad_take(J_phi,0,0.0),
                                pad_take(J_eta,1,0.0), pad_take(J_phi,1,0.0)), 0.0)
    ).astype(np.float32)
    out["delta_phi_ll"] = ak.to_numpy(ak.where(has2lep, delta_phi(l0_phi, l1_phi), 0.0)).astype(np.float32)
    out["delta_R_ll"]   = ak.to_numpy(ak.where(has2lep, delta_r(l0_eta,l0_phi, l1_eta,l1_phi), 0.0)).astype(np.float32)

    # B-tag score of the first two jets. The output keys are always
    # Jet_btagDeepB_*, whichever branch `btag_branch` actually names.
    if btag_branch is not None and btag_branch in fields:
        b = events[btag_branch]
        out["Jet_btagDeepB_0"] = ak.to_numpy(pad_take(b, 0, 0.0)).astype(np.float32)
        out["Jet_btagDeepB_1"] = ak.to_numpy(pad_take(b, 1, 0.0)).astype(np.float32)
    else:
        out["Jet_btagDeepB_0"] = np.zeros(n_events, dtype=np.float32)
        out["Jet_btagDeepB_1"] = np.zeros(n_events, dtype=np.float32)

    # Transverse mass of the leading lepton and MET:
    #   MT = sqrt(2 * pt_lep * MET * (1 - cos(dphi)))
    # The leading lepton is the higher-pt of first muon / first electron
    # (the muon wins ties).
    has1lep = (nMuon + nElectron) >= 1
    lead_is_mu = lead_mu_pt >= lead_el_pt
    lead_phi = ak.where(lead_is_mu, pad_take(Mu_phi,0,0.0), pad_take(El_phi,0,0.0))
    lead_pt  = ak.where(lead_is_mu, lead_mu_pt, lead_el_pt)
    dphi_lep_met = ak.where(has1lep, delta_phi(lead_phi, MET_phi), 0.0)
    MT = ak.where(has1lep, np.sqrt(2*lead_pt*MET_pt*(1 - np.cos(dphi_lep_met))), 0.0)
    out["MT_lep_MET"] = ak.to_numpy(MT).astype(np.float32)

    # Ratios (safe_div returns 0 where the denominator is 0)
    out["HT_ratio"] = ak.to_numpy(safe_div(HT, HT + MET_pt)).astype(np.float32)
    out["MET_pt_HT_ratio"] = ak.to_numpy(safe_div(MET_pt, HT)).astype(np.float32)

    # Sanity check: every advertised feature exists and has one value per event.
    for k in FEATURES50:
        if k not in out:
            raise RuntimeError(f"Missing output feature '{k}'")
        if len(out[k]) != n_events:
            raise RuntimeError(f"Bad length for '{k}': {len(out[k])} vs {n_events}")

    return out

# ----------------------------
# Stream files -> write derived ROOT
# ----------------------------
def write_derived_root(input_files, out_root_path, step_size=50_000, max_events=None):
    """Stream events from ``input_files`` and write derived features to a ROOT file.

    The tree name, the available branches and the b-tag branch are detected
    from the first input file that can be opened and inspected. Events are then
    read chunk by chunk, converted with ``compute_features`` and appended to an
    ``Events`` tree in ``out_root_path`` (the file is recreated).

    Parameters
    ----------
    input_files:
        Paths of the input ROOT files.
    out_root_path:
        Path of the output ROOT file; overwritten if it exists.
    step_size:
        Chunk size forwarded to ``uproot.iterate``.
    max_events:
        Upper bound on the number of events written, or None for no limit.
        The final chunk is truncated so the bound is never exceeded.

    Returns
    -------
    int
        Number of events written.

    Raises
    ------
    RuntimeError
        If ``input_files`` is empty, or no input file could be opened and
        inspected.
    """
    if not input_files:
        raise RuntimeError("No input ROOT files found.")

    # Detect tree + branches from the first readable file. The three values are
    # published together only after all lookups succeeded, so a file that fails
    # half-way cannot leave `tree_name` set while `branches` is still None.
    tree_name, branches, btag_branch = None, None, None
    for fp in input_files:
        try:
            with uproot.open(fp) as fin:
                found_tree = pick_tree_name(fin)
                found_branches = set(fin[found_tree].keys())
        except Exception:
            # Best effort: unreadable or tree-less files are skipped here.
            continue
        tree_name = found_tree
        branches = found_branches
        btag_branch = choose_btag_branch(found_branches)
        break
    if tree_name is None:
        raise RuntimeError("Could not open any input ROOT file to detect tree/branches.")

    needed = [
        "Muon_pt","Muon_eta","Muon_phi",
        "Electron_pt","Electron_eta","Electron_phi",
        "Jet_pt","Jet_eta","Jet_phi","Jet_mass",
        "MET_pt","MET_phi","MET_sumEt",
    ]
    if btag_branch is not None:
        needed.append(btag_branch)
    # Request only the branches the reference file actually has.
    expressions = [b for b in needed if b in branches]

    # {path: tree name} mapping, one entry per input file.
    file_specs = {fp: tree_name for fp in input_files}

    n_written = 0
    wrote_tree = False

    with uproot.recreate(out_root_path) as fout:
        for events in uproot.iterate(
            file_specs,
            expressions,
            step_size=step_size,
            library="ak",
            allow_missing=True,
        ):
            if max_events is not None:
                remaining = max_events - n_written
                if remaining <= 0:
                    break
                if len(events) > remaining:
                    # Trim the last chunk so the cap is honoured exactly.
                    events = events[:remaining]

            feats = compute_features(events, btag_branch=btag_branch)

            if not wrote_tree:
                # The output schema is fixed by the dtypes of the first chunk.
                fout.mktree("Events", {k: v.dtype for k, v in feats.items()})
                wrote_tree = True

            fout["Events"].extend(feats)
            n_written += len(events)

            if max_events is not None and n_written >= max_events:
                break

    return n_written

# ----------------------------
# Run all datasets: separate folder per dataset
# ----------------------------
# For every configured dataset: create its output folder, discover the input
# ROOT files, record a manifest, then write the derived ROOT file.
for ds_name, ds_path in DATASET_PATHS.items():
    ds_out_dir = os.path.join(OUT_BASE, ds_name)
    os.makedirs(ds_out_dir, exist_ok=True)

    input_files = collect_input_root_files(ds_path)
    print(f"\n=== {ds_name} ===")
    print("Inputs found:", len(input_files))
    print("First inputs:", input_files[:3])

    out_root = os.path.join(ds_out_dir, f"derived_{ds_name}.root")

    # The manifest is written before processing starts, so it describes the
    # requested configuration rather than the outcome of the run.
    manifest = {
        "dataset": ds_name,
        "dataset_path": ds_path,
        "inputs_count": len(input_files),
        "inputs_preview": input_files[:30],
        "out_root": out_root,
        "features_out": FEATURES50,
        "step_size": STEP_SIZE,
        "max_events": MAX_EVENTS,
    }
    with open(os.path.join(ds_out_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Raises RuntimeError (and stops the loop) if the dataset has no usable inputs.
    n = write_derived_root(input_files, out_root, step_size=STEP_SIZE, max_events=MAX_EVENTS)
    print("[DONE] events written:", n)
    print("[DONE] output:", out_root)
    print("[DONE] folder:", ds_out_dir)



=== SMS-TChiWZ_ZToLL ===
Inputs found: 4
First inputs: ['/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_1', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_2']
[DONE] events written: 200000
[DONE] output: /kaggle/working/derivedroot/SMS-TChiWZ_ZToLL/derived_SMS-TChiWZ_ZToLL.root
[DONE] folder: /kaggle/working/derivedroot/SMS-TChiWZ_ZToLL

=== DYJetsToLL_0J_TuneCP5 ===
Inputs found: 4
First inputs: ['/kaggle/input/datasets/g0ldeneagle/dy