In [2]:
!pip -q install "uproot>=5" awkward vector rich tqdm pandas pyarrow fastparquet matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.8/393.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.7/656.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.2/181.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# If needed:
# !pip -q install uproot awkward numpy

import os, glob, json
from pathlib import Path

import numpy as np
import uproot
import awkward as ak

# ----------------------------
# Inputs (your dataset paths)
# ----------------------------
DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

OUT_BASE = "/kaggle/working/derivedroot"
os.makedirs(OUT_BASE, exist_ok=True)

STEP_SIZE = 100_000     # entries per chunk (tune if needed)
MAX_EVENTS = None       # set an int for quick testing

JET_PT_MIN = 30.0

# -----------------------------------------
# Output feature names (exactly your 50)
# -----------------------------------------
FEATURES50 = [
    "nMuon","nElectron","nJet","MET_pt","MET_phi","MET_sumEt",
    "Muon_pt_0","Muon_eta_0","Muon_phi_0",
    "Muon_pt_1","Muon_eta_1","Muon_phi_1",
    "Electron_pt_0","Electron_eta_0","Electron_phi_0",
    "Electron_pt_1","Electron_eta_1","Electron_phi_1",
    "Jet_pt_0","Jet_eta_0","Jet_phi_0",
    "Jet_pt_1","Jet_eta_1","Jet_phi_1",
    "Jet_pt_2","Jet_eta_2","Jet_phi_2",
    "Jet_pt_3","Jet_eta_3","Jet_phi_3",
    "HT","ST",
    "M_ll","M_jj_01","M_jj_12",
    "delta_phi_MET_j0","delta_phi_MET_j1","min_delta_phi_MET_jets",
    "delta_R_j0_j1","delta_phi_ll","delta_R_ll",
    "Jet_btagDeepB_0","Jet_btagDeepB_1",
    "MT_lep_MET","HT_ratio","MET_pt_HT_ratio",
    "nJet_pt30","Jet_mass_0","LeadLepton_pt","sum_pt_leptons"
]

# -----------------------------------------
# Helpers: find ROOT files via file_index.json
# -----------------------------------------
def _find_root_strings(obj):
    if isinstance(obj, str):
        if obj.lower().endswith(".root"):
            yield obj
    elif isinstance(obj, dict):
        for v in obj.values():
            yield from _find_root_strings(v)
    elif isinstance(obj, (list, tuple)):
        for it in obj:
            yield from _find_root_strings(it)

def collect_root_files(dataset_dir):
    dataset_dir = str(dataset_dir)
    json_paths = glob.glob(os.path.join(dataset_dir, "**", "*file_index.json*"), recursive=True)
    roots = set()

    for jp in json_paths:
        try:
            with open(jp, "r") as f:
                data = json.load(f)
            for s in _find_root_strings(data):
                roots.add(s)
        except Exception as e:
            print(f"[WARN] Could not parse {jp}: {e}")

    # fallback: scan for .root directly if no index was found
    if not roots:
        for rp in glob.glob(os.path.join(dataset_dir, "**", "*.root"), recursive=True):
            roots.add(rp)

    return sorted(roots), sorted(json_paths)

# -----------------------------------------
# Physics/math helpers
# -----------------------------------------
def delta_phi(phi1, phi2):
    d = phi1 - phi2
    return ak.abs((d + np.pi) % (2*np.pi) - np.pi)

def delta_r(eta1, phi1, eta2, phi2):
    dphi = delta_phi(phi1, phi2)
    return ak.sqrt((eta1 - eta2)**2 + dphi**2)

def pad_take(jagged, i, fill=0.0):
    # jagged: ak.Array of per-event variable-length arrays
    padded = ak.pad_none(jagged, i+1, clip=True)
    return ak.fill_none(padded[:, i], fill)

def vec_mass(pt1, eta1, phi1, m1, pt2, eta2, phi2, m2):
    # Build 4-vectors from (pt, eta, phi, m)
    px1 = pt1 * np.cos(phi1); py1 = pt1 * np.sin(phi1); pz1 = pt1 * np.sinh(eta1)
    px2 = pt2 * np.cos(phi2); py2 = pt2 * np.sin(phi2); pz2 = pt2 * np.sinh(eta2)
    e1 = ak.sqrt(px1*px1 + py1*py1 + pz1*pz1 + m1*m1)
    e2 = ak.sqrt(px2*px2 + py2*py2 + pz2*pz2 + m2*m2)
    px = px1 + px2; py = py1 + py2; pz = pz1 + pz2; e = e1 + e2
    m2tot = e*e - (px*px + py*py + pz*pz)
    return ak.sqrt(ak.where(m2tot > 0, m2tot, 0))

def safe_div(num, den):
    return ak.where(den != 0, num/den, 0)

# -----------------------------------------
# Determine tree name + pick btag branch
# -----------------------------------------
TREE_PREFERRED = ["Events", "tree", "ntuple", "T"]

def pick_tree_name(fin):
    keys = [k.split(";")[0] for k in fin.keys()]
    for cand in TREE_PREFERRED:
        if cand in keys:
            return cand
    # fallback: first TTree
    for k in keys:
        try:
            obj = fin[k]
            if isinstance(obj, uproot.behaviors.TTree.TTree):
                return k
        except Exception:
            pass
    raise RuntimeError(f"No TTree found. Keys sample: {keys[:30]}")

def choose_btag_branch(branches_set):
    # Prefer DeepCSV-style name if present; fallback options
    for cand in ["Jet_btagDeepB", "Jet_btagDeepFlavB", "Jet_btagCSVV2"]:
        if cand in branches_set:
            return cand
    return None

# -----------------------------------------
# Compute 50 derived features for one chunk
# chunk is dict-like mapping branch -> ak.Array
# -----------------------------------------
def compute_features(chunk, btag_branch=None):
    # Raw branches (NanoAOD-style)
    Mu_pt  = chunk.get("Muon_pt",  ak.Array([[]]*len(next(iter(chunk.values())))))
    Mu_eta = chunk.get("Muon_eta", ak.Array([[]]*len(next(iter(chunk.values())))))
    Mu_phi = chunk.get("Muon_phi", ak.Array([[]]*len(next(iter(chunk.values())))))

    El_pt  = chunk.get("Electron_pt",  ak.Array([[]]*len(next(iter(chunk.values())))))
    El_eta = chunk.get("Electron_eta", ak.Array([[]]*len(next(iter(chunk.values())))))
    El_phi = chunk.get("Electron_phi", ak.Array([[]]*len(next(iter(chunk.values())))))

    J_pt   = chunk.get("Jet_pt",   ak.Array([[]]*len(next(iter(chunk.values())))))
    J_eta  = chunk.get("Jet_eta",  ak.Array([[]]*len(next(iter(chunk.values())))))
    J_phi  = chunk.get("Jet_phi",  ak.Array([[]]*len(next(iter(chunk.values())))))
    J_mass = chunk.get("Jet_mass", ak.zeros_like(pad_take(J_pt, 0, 0.0)))  # scalar default; overwritten below if present
    if "Jet_mass" in chunk:
        J_mass = chunk["Jet_mass"]

    MET_pt   = chunk.get("MET_pt",   ak.zeros_like(pad_take(J_pt, 0, 0.0)))
    MET_phi  = chunk.get("MET_phi",  ak.zeros_like(MET_pt))
    MET_sumE = chunk.get("MET_sumEt", ak.zeros_like(MET_pt))

    # Counts
    nMuon = ak.num(Mu_pt, axis=1)
    nElectron = ak.num(El_pt, axis=1)
    nJet = ak.num(J_pt, axis=1)

    # Leading/subleading objects
    out = {}
    out["nMuon"] = ak.to_numpy(nMuon, allow_missing=False).astype(np.int32)
    out["nElectron"] = ak.to_numpy(nElectron, allow_missing=False).astype(np.int32)
    out["nJet"] = ak.to_numpy(nJet, allow_missing=False).astype(np.int32)

    out["MET_pt"] = ak.to_numpy(MET_pt).astype(np.float32)
    out["MET_phi"] = ak.to_numpy(MET_phi).astype(np.float32)
    out["MET_sumEt"] = ak.to_numpy(MET_sumE).astype(np.float32)

    # Muons 0/1
    out["Muon_pt_0"]  = ak.to_numpy(pad_take(Mu_pt, 0, 0.0)).astype(np.float32)
    out["Muon_eta_0"] = ak.to_numpy(pad_take(Mu_eta,0, 0.0)).astype(np.float32)
    out["Muon_phi_0"] = ak.to_numpy(pad_take(Mu_phi,0, 0.0)).astype(np.float32)

    out["Muon_pt_1"]  = ak.to_numpy(pad_take(Mu_pt, 1, 0.0)).astype(np.float32)
    out["Muon_eta_1"] = ak.to_numpy(pad_take(Mu_eta,1, 0.0)).astype(np.float32)
    out["Muon_phi_1"] = ak.to_numpy(pad_take(Mu_phi,1, 0.0)).astype(np.float32)

    # Electrons 0/1
    out["Electron_pt_0"]  = ak.to_numpy(pad_take(El_pt, 0, 0.0)).astype(np.float32)
    out["Electron_eta_0"] = ak.to_numpy(pad_take(El_eta,0, 0.0)).astype(np.float32)
    out["Electron_phi_0"] = ak.to_numpy(pad_take(El_phi,0, 0.0)).astype(np.float32)

    out["Electron_pt_1"]  = ak.to_numpy(pad_take(El_pt, 1, 0.0)).astype(np.float32)
    out["Electron_eta_1"] = ak.to_numpy(pad_take(El_eta,1, 0.0)).astype(np.float32)
    out["Electron_phi_1"] = ak.to_numpy(pad_take(El_phi,1, 0.0)).astype(np.float32)

    # Jets 0-3
    for i in range(4):
        out[f"Jet_pt_{i}"]  = ak.to_numpy(pad_take(J_pt, i, 0.0)).astype(np.float32)
        out[f"Jet_eta_{i}"] = ak.to_numpy(pad_take(J_eta,i, 0.0)).astype(np.float32)
        out[f"Jet_phi_{i}"] = ak.to_numpy(pad_take(J_phi,i, 0.0)).astype(np.float32)

    out["Jet_mass_0"] = ak.to_numpy(pad_take(J_mass, 0, 0.0)).astype(np.float32)

    # HT, nJet_pt30
    jet_pt30 = J_pt[J_pt > JET_PT_MIN]
    HT = ak.sum(jet_pt30, axis=1)
    nJet_pt30 = ak.sum(J_pt > JET_PT_MIN, axis=1)
    out["HT"] = ak.to_numpy(HT).astype(np.float32)
    out["nJet_pt30"] = ak.to_numpy(nJet_pt30).astype(np.int32)

    # Build combined leptons for M_ll, delta_phi_ll, delta_R_ll, sum_pt_leptons, MT_lep_MET
    MU_MASS = 0.105658
    EL_MASS = 0.000511

    mu = ak.zip({"pt": Mu_pt, "eta": Mu_eta, "phi": Mu_phi, "m": ak.zeros_like(Mu_pt) + MU_MASS})
    el = ak.zip({"pt": El_pt, "eta": El_eta, "phi": El_phi, "m": ak.zeros_like(El_pt) + EL_MASS})
    lep = ak.concatenate([mu, el], axis=1)
    lep_sorted = lep[ak.argsort(lep.pt, axis=1, ascending=False)]

    lep0 = ak.pad_none(lep_sorted, 1, clip=True)[:, 0]
    lep1 = ak.pad_none(lep_sorted, 2, clip=True)[:, 1]

    lep0_pt  = ak.fill_none(lep0.pt, 0.0)
    lep0_eta = ak.fill_none(lep0.eta, 0.0)
    lep0_phi = ak.fill_none(lep0.phi, 0.0)

    lep1_pt  = ak.fill_none(lep1.pt, 0.0)
    lep1_eta = ak.fill_none(lep1.eta, 0.0)
    lep1_phi = ak.fill_none(lep1.phi, 0.0)

    # LeadLepton_pt = max(leading mu pT, leading e pT) with 0 if none
    lead_mu_pt = pad_take(Mu_pt, 0, 0.0)
    lead_el_pt = pad_take(El_pt, 0, 0.0)
    LeadLepton_pt = ak.where(lead_mu_pt > lead_el_pt, lead_mu_pt, lead_el_pt)
    out["LeadLepton_pt"] = ak.to_numpy(LeadLepton_pt).astype(np.float32)

    sum_pt_leptons = lep0_pt + lep1_pt
    out["sum_pt_leptons"] = ak.to_numpy(sum_pt_leptons).astype(np.float32)

    # ST = HT + MET + sum(leading lepton pT)  (here: first two combined leptons)
    ST = HT + MET_pt + sum_pt_leptons
    out["ST"] = ak.to_numpy(ST).astype(np.float32)

    # M_ll (0 if <2 leptons)
    has2lep = ak.num(lep_sorted, axis=1) >= 2
    mll = ak.where(
        has2lep,
        vec_mass(lep0_pt, lep0_eta, lep0_phi, ak.zeros_like(lep0_pt),
                 lep1_pt, lep1_eta, lep1_phi, ak.zeros_like(lep1_pt)),
        0.0
    )
    out["M_ll"] = ak.to_numpy(mll).astype(np.float32)

    # delta_phi_ll, delta_R_ll
    dphi_ll = ak.where(has2lep, delta_phi(lep0_phi, lep1_phi), 0.0)
    dR_ll = ak.where(has2lep, delta_r(lep0_eta, lep0_phi, lep1_eta, lep1_phi), 0.0)
    out["delta_phi_ll"] = ak.to_numpy(dphi_ll).astype(np.float32)
    out["delta_R_ll"] = ak.to_numpy(dR_ll).astype(np.float32)

    # Dijet masses
    j0_pt, j0_eta, j0_phi, j0_m = pad_take(J_pt,0,0.0), pad_take(J_eta,0,0.0), pad_take(J_phi,0,0.0), pad_take(J_mass,0,0.0)
    j1_pt, j1_eta, j1_phi, j1_m = pad_take(J_pt,1,0.0), pad_take(J_eta,1,0.0), pad_take(J_phi,1,0.0), pad_take(J_mass,1,0.0)
    j2_pt, j2_eta, j2_phi, j2_m = pad_take(J_pt,2,0.0), pad_take(J_eta,2,0.0), pad_take(J_phi,2,0.0), pad_take(J_mass,2,0.0)

    has2j = nJet >= 2
    has3j = nJet >= 3

    mjj01 = ak.where(has2j, vec_mass(j0_pt,j0_eta,j0_phi,j0_m, j1_pt,j1_eta,j1_phi,j1_m), 0.0)
    mjj12 = ak.where(has3j, vec_mass(j1_pt,j1_eta,j1_phi,j1_m, j2_pt,j2_eta,j2_phi,j2_m), 0.0)
    out["M_jj_01"] = ak.to_numpy(mjj01).astype(np.float32)
    out["M_jj_12"] = ak.to_numpy(mjj12).astype(np.float32)

    # Angular: MET vs jets
    dphi_met_j0 = ak.where(has2j | (nJet>=1), delta_phi(MET_phi, pad_take(J_phi,0,0.0)), 0.0)
    dphi_met_j1 = ak.where(has2j, delta_phi(MET_phi, pad_take(J_phi,1,0.0)), 0.0)

    jets_phi_0to3 = ak.concatenate([ak.unzip(ak.zip({"phi": J_phi})).phi], axis=0)  # no-op; keep simple
    jphi_pad = ak.pad_none(J_phi, 4, clip=True)
    jphi0 = ak.fill_none(jphi_pad[:,0], 0.0)
    jphi1 = ak.fill_none(jphi_pad[:,1], 0.0)
    jphi2 = ak.fill_none(jphi_pad[:,2], 0.0)
    jphi3 = ak.fill_none(jphi_pad[:,3], 0.0)
    dphis = ak.stack([delta_phi(MET_phi, jphi0),
                      delta_phi(MET_phi, jphi1),
                      delta_phi(MET_phi, jphi2),
                      delta_phi(MET_phi, jphi3)], axis=1)
    # If fewer than 1 jet, define as 0
    min_dphi = ak.where(nJet > 0, ak.min(dphis, axis=1), 0.0)

    out["delta_phi_MET_j0"] = ak.to_numpy(dphi_met_j0).astype(np.float32)
    out["delta_phi_MET_j1"] = ak.to_numpy(dphi_met_j1).astype(np.float32)
    out["min_delta_phi_MET_jets"] = ak.to_numpy(min_dphi).astype(np.float32)

    # Angular: jets
    dR_j0_j1 = ak.where(has2j, delta_r(pad_take(J_eta,0,0.0), pad_take(J_phi,0,0.0),
                                      pad_take(J_eta,1,0.0), pad_take(J_phi,1,0.0)), 0.0)
    out["delta_R_j0_j1"] = ak.to_numpy(dR_j0_j1).astype(np.float32)

    # B-tag leading/subleading
    if btag_branch is not None and btag_branch in chunk:
        b = chunk[btag_branch]
        out["Jet_btagDeepB_0"] = ak.to_numpy(pad_take(b, 0, 0.0)).astype(np.float32)
        out["Jet_btagDeepB_1"] = ak.to_numpy(pad_take(b, 1, 0.0)).astype(np.float32)
    else:
        out["Jet_btagDeepB_0"] = ak.to_numpy(ak.zeros_like(MET_pt)).astype(np.float32)
        out["Jet_btagDeepB_1"] = ak.to_numpy(ak.zeros_like(MET_pt)).astype(np.float32)

    # MT_lep_MET using leading combined lepton (lep0)
    has1lep = ak.num(lep_sorted, axis=1) >= 1
    dphi_lep_met = ak.where(has1lep, delta_phi(lep0_phi, MET_phi), 0.0)
    MT = ak.where(has1lep, ak.sqrt(2*lep0_pt*MET_pt*(1 - ak.cos(dphi_lep_met))), 0.0)
    out["MT_lep_MET"] = ak.to_numpy(MT).astype(np.float32)

    # Ratios
    out["HT_ratio"] = ak.to_numpy(safe_div(HT, HT + MET_pt)).astype(np.float32)
    out["MET_pt_HT_ratio"] = ak.to_numpy(safe_div(MET_pt, HT)).astype(np.float32)

    # Sanity: enforce all 50 keys exist
    for k in FEATURES50:
        if k not in out:
            raise RuntimeError(f"Missing output feature '{k}' (bug in compute_features).")

    return out

# -----------------------------------------
# Main loop per dataset: stream -> write ROOT
# -----------------------------------------
def write_derived_root(root_files, out_root_path, step_size=100_000, max_events=None):
    if not root_files:
        raise RuntimeError("No ROOT files found (from file_index.json or .root scan).")

    n_written = 0
    wrote_tree = False

    # Find a readable file to decide tree name + available branches
    tree_name = None
    branches = None
    btag_branch = None

    for rf in root_files:
        try:
            with uproot.open(rf) as fin:
                tree_name = pick_tree_name(fin)
                branches = set(fin[tree_name].keys())
                btag_branch = choose_btag_branch(branches)
                break
        except Exception:
            continue

    if tree_name is None:
        raise RuntimeError("Could not open any ROOT file to detect TTree/branches.")

    # Branches we want to read (intersection only; missing are handled as defaults)
    needed = [
        "Muon_pt","Muon_eta","Muon_phi",
        "Electron_pt","Electron_eta","Electron_phi",
        "Jet_pt","Jet_eta","Jet_phi","Jet_mass",
        "MET_pt","MET_phi","MET_sumEt",
    ]
    if btag_branch is not None:
        needed.append(btag_branch)

    expressions = [b for b in needed if b in branches]

    with uproot.recreate(out_root_path) as fout:
        for rf in root_files:
            try:
                with uproot.open(rf) as fin:
                    if tree_name not in [k.split(";")[0] for k in fin.keys()]:
                        continue
                    tree = fin[tree_name]

                    for chunk in tree.iterate(expressions, step_size=step_size, library="ak"):
                        feats = compute_features(chunk, btag_branch=btag_branch)

                        if not wrote_tree:
                            # Define output types from first chunk
                            branch_types = {k: v.dtype for k, v in feats.items()}
                            fout.mktree("Events", branch_types)
                            wrote_tree = True

                        fout["Events"].extend(feats)
                        n_written += len(next(iter(feats.values())))

                        if max_events is not None and n_written >= max_events:
                            return n_written
            except Exception as e:
                print(f"[WARN] Failed file {rf}: {e}")

    return n_written

# Run all datasets
for ds_name, ds_path in DATASET_PATHS.items():
    ds_out_dir = os.path.join(OUT_BASE, ds_name)
    os.makedirs(ds_out_dir, exist_ok=True)

    root_files, json_paths = collect_root_files(ds_path)
    out_root = os.path.join(ds_out_dir, f"derived_{ds_name}.root")

    manifest = {
        "dataset": ds_name,
        "dataset_path": ds_path,
        "out_root": out_root,
        "root_files_found_count": len(root_files),
        "root_files_found_preview": root_files[:30],
        "file_index_json_found": json_paths,
        "features_out": FEATURES50,
        "step_size": STEP_SIZE,
        "max_events": MAX_EVENTS,
    }
    with open(os.path.join(ds_out_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    print(f"\n=== {ds_name} ===")
    print(f"[INFO] ROOT files found: {len(root_files)}")
    print(f"[INFO] Writing: {out_root}")

    n = write_derived_root(root_files, out_root, step_size=STEP_SIZE, max_events=MAX_EVENTS)
    print(f"[DONE] events written: {n}")
    print(f"[DONE] folder: {ds_out_dir}")


[WARN] Could not parse /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_5: 'utf-8' codec can't decode byte 0x85 in position 16: invalid start byte
[WARN] Could not parse /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_2: 'utf-8' codec can't decode byte 0xef in position 6: invalid continuation byte
[WARN] Could not parse /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0: 'utf-8' codec can't decode byte 0xed in position 17: invalid continuation byte
[WARN] Could not parse /kaggle/input/datasets

RuntimeError: No ROOT files found (from file_index.json or .root scan).

In [4]:
import os, glob
from pathlib import Path

print("Top-level /kaggle/input entries:")
print(os.listdir("/kaggle/input"))

def quick_scan(p):
    print("\n---")
    print("Path:", p, "exists:", os.path.exists(p))
    if os.path.exists(p):
        # show a few files
        files = glob.glob(p + "/**/*", recursive=True)
        print("Total entries under path:", len(files))
        print("Sample:", files[:15])
        roots = glob.glob(p + "/**/*.root*", recursive=True)
        print("ROOT-like files found:", len(roots))
        print("ROOT sample:", roots[:10])
        idx = glob.glob(p + "/**/*file_index.json*", recursive=True)
        print("file_index.json* found:", len(idx))
        print("index sample:", idx[:10])

DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

for k,v in DATASET_PATHS.items():
    print(f"\nDataset key: {k}")
    quick_scan(v)


Top-level /kaggle/input entries:
['datasets']

Dataset key: SMS-TChiWZ_ZToLL

---
Path: /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll exists: True
Total entries under path: 4
Sample: ['/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_5', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_2', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0', '/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgrap

In [7]:
import os, glob, json, re
from pathlib import Path

ROOT_RE = re.compile(r'([^\s"\']+?\.root)(\?[^\s"\']+)?', re.IGNORECASE)

def resolve_dataset_dir(p):
    """If user gave a non-existent path, try to find the dataset under /kaggle/input/<slug>."""
    if os.path.exists(p):
        return p
    slug = Path(p).name  # last token, often the dataset slug
    cand = f"/kaggle/input/{slug}"
    if os.path.exists(cand):
        return cand
    matches = glob.glob(f"/kaggle/input/**/{slug}", recursive=True)
    if matches:
        return matches[0]
    return p  # fallback (will likely fail, but gives a clear debug)

def find_root_strings_anywhere(obj):
    """Find substrings containing '.root' even if they have querystrings or extra text."""
    if isinstance(obj, str):
        m = ROOT_RE.search(obj)
        if m:
            yield m.group(1) + (m.group(2) or "")
    elif isinstance(obj, dict):
        for v in obj.values():
            yield from find_root_strings_anywhere(v)
    elif isinstance(obj, (list, tuple)):
        for it in obj:
            yield from find_root_strings_anywhere(it)

def collect_root_files(dataset_dir):
    dataset_dir = resolve_dataset_dir(str(dataset_dir))

    json_paths = glob.glob(os.path.join(dataset_dir, "**", "*file_index.json*"), recursive=True)
    roots = set()

    # 1) parse file_index.json* (json or json-lines)
    for jp in json_paths:
        try:
            with open(jp, "r") as f:
                txt = f.read()

            # try normal JSON
            try:
                data = json.loads(txt)
                for s in find_root_strings_anywhere(data):
                    roots.add(s)
            except Exception:
                # try json-lines
                for line in txt.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                        for s in find_root_strings_anywhere(data):
                            roots.add(s)
                    except Exception:
                        # last resort: regex scan line
                        m = ROOT_RE.search(line)
                        if m:
                            roots.add(m.group(1) + (m.group(2) or ""))

        except Exception as e:
            print(f"[WARN] Could not parse {jp}: {e}")

    # 2) fallback: direct local scan for .root files
    for rp in glob.glob(os.path.join(dataset_dir, "**", "*.root*"), recursive=True):
        roots.add(rp)

    roots = sorted(roots)
    return roots, sorted(json_paths), dataset_dir


In [8]:
for ds_name, ds_path in DATASET_PATHS.items():
    root_files, json_paths, resolved = collect_root_files(ds_path)
    print(f"\n=== {ds_name} ===")
    print("given:", ds_path)
    print("resolved:", resolved)
    print("file_index.json*:", len(json_paths))
    print("root files:", len(root_files))
    print("root preview:", root_files[:5])

    if not root_files:
        raise RuntimeError(
            f"No ROOT files found for {ds_name}. "
            f"Check the resolved path above and whether the dataset actually contains ROOT/tar/zip files."
        )

    # ... now call write_derived_root(root_files, ...)


[WARN] Could not parse /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_5: 'utf-8' codec can't decode byte 0x85 in position 16: invalid start byte
[WARN] Could not parse /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_2: 'utf-8' codec can't decode byte 0xef in position 6: invalid continuation byte
[WARN] Could not parse /kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll/CMS_mc_RunIISummer20UL16NanoAODv9_SMS-TChiWZ_ZToLL_mZMin-0p1_TuneCP5_13TeV-madgraphMLM-pythia8_NANOAODSIM_106X_mcRun2_asymptotic_v17-v1_80000_file_index.json_0: 'utf-8' codec can't decode byte 0xed in position 17: invalid continuation byte
[WARN] Could not parse /kaggle/input/datasets

RuntimeError: No ROOT files found for SMS-TChiWZ_ZToLL. Check the resolved path above and whether the dataset actually contains ROOT/tar/zip files.