In [None]:
# Kaggle-ready: ROOT slimming (keep only 50 features) + per-dataset output folders

# If uproot/awkward are missing in your image, uncomment:
# !pip -q install uproot awkward

import os, glob, json
from pathlib import Path

import numpy as np
import uproot
import awkward as ak

# 1) Set your 50 features here (branch names exactly as in the TTree).
#    The run section at the bottom of this cell refuses to process anything
#    unless this list holds exactly 50 entries.
FEATURES50 = [
    # Example (replace with your real 50 branch names):
    # "MET_pt", "MET_phi", "Electron_pt", ...
]

# 2) Your dataset paths (as you provided).
#    Maps a dataset label (used for the output folder and file name) to the
#    input directory that is searched for ROOT files.
DATASET_PATHS = {
    "SMS-TChiWZ_ZToLL": "/kaggle/input/datasets/katakuricharlotte/sms-tchiwz-ztoll",
    "DYJetsToLL_0J_TuneCP5": "/kaggle/input/datasets/g0ldeneagle/dyjetstoll-0j-tunecp5",
    "WJetsToLNu_TuneCP5": "/kaggle/input/datasets/prajwalaaryan/wjetstolnu-tunecp5",
    "TTJets_TuneCP5": "/kaggle/input/datasets/darkangel411/ttjets-tunecp5",
}

# Base directory under which one sub-folder per dataset is created.
OUT_BASE = "/kaggle/working/derivedroot"
MAX_EVENTS = None          # set int (e.g., 2_000_000) to cap for testing
STEP_SIZE = 100_000        # chunk size for streaming
TREE_PREFERRED = ["Events", "tree", "ntuple", "T"]  # try in this order

# Create the output base directory up front; a pre-existing one is fine.
os.makedirs(OUT_BASE, exist_ok=True)

def _find_root_strings(obj):
    """Recursively yield any strings ending with .root from an arbitrary JSON structure."""
    if isinstance(obj, str):
        if obj.lower().endswith(".root"):
            yield obj
    elif isinstance(obj, dict):
        for v in obj.values():
            yield from _find_root_strings(v)
    elif isinstance(obj, (list, tuple)):
        for it in obj:
            yield from _find_root_strings(it)

def collect_root_files(dataset_dir):
    """Locate the ROOT files belonging to one dataset directory.

    Every path below ``dataset_dir`` whose name matches ``*file_index.json*``
    is parsed as JSON and searched for strings ending in ``.root``; those
    strings are returned verbatim. An index that cannot be read or parsed is
    reported with a ``[WARN]`` line and skipped. Only when the indexes yield
    nothing is the directory tree itself scanned for ``*.root`` files.

    Returns:
        A ``(root_files, json_paths)`` tuple of sorted lists.
    """
    base = str(dataset_dir)
    index_files = glob.glob(os.path.join(base, "**", "*file_index.json*"), recursive=True)

    found = set()
    for index_path in index_files:
        try:
            with open(index_path) as handle:
                payload = json.load(handle)
            found.update(_find_root_strings(payload))
        except Exception as e:
            print(f"[WARN] Could not parse {index_path}: {e}")

    # Fallback: .root files stored directly inside the dataset directory.
    if not found:
        found.update(glob.glob(os.path.join(base, "**", "*.root"), recursive=True))

    return sorted(found), sorted(index_files)

def pick_tree_name(uproot_file):
    """Choose which object in an opened ROOT file to read as the event tree.

    Key names are compared with their ``;cycle`` suffix stripped. The first
    entry of ``TREE_PREFERRED`` that appears among the keys wins. Otherwise
    the first key whose object is an uproot ``TTree`` is returned; keys that
    fail to load are skipped.

    Raises:
        RuntimeError: If neither search finds a tree.
    """
    names = [key.split(";")[0] for key in uproot_file.keys()]

    preferred = next((cand for cand in TREE_PREFERRED if cand in names), None)
    if preferred is not None:
        return preferred

    # Fallback: first object that is actually a TTree.
    for name in names:
        try:
            is_tree = isinstance(uproot_file[name], uproot.behaviors.TTree.TTree)
        except Exception:
            continue
        if is_tree:
            return name

    raise RuntimeError(f"No TTree found in file. Keys: {names[:50]} ...")

def to_numpy_dict(ak_arrays, features):
    """Convert the requested fields of an awkward record array to NumPy arrays.

    Names in ``features`` that are not fields of ``ak_arrays`` are silently
    left out. The remaining fields are converted with ``ak.to_numpy``; the
    caller is expected to request flat scalar branches only.

    Returns:
        A dict mapping field name to the converted array, in ``features`` order.
    """
    return {
        name: ak.to_numpy(ak_arrays[name])
        for name in features
        if name in ak_arrays.fields
    }

def write_slim_root(root_files, out_root_path, features, max_events=None, step_size=100_000):
    """Copy the requested branches from many ROOT files into one slim ROOT file.

    Each input file is opened in turn, its tree is chosen with
    ``pick_tree_name``, and the branches from ``features`` that exist in that
    tree are streamed chunk by chunk into a single output tree named
    ``"Events"``. A file that fails for any reason is reported with a
    ``[WARN]`` line and skipped, so one bad input does not abort the run.

    Args:
        root_files: Paths of the input ROOT files, processed in order.
        out_root_path: Path of the output ROOT file; it is created or overwritten.
        features: Branch names to keep. Branches absent from a file are skipped.
        max_events: Optional cap on the total number of entries written. The
            last chunk is trimmed so the cap is exact; ``None`` means no cap.
        step_size: Chunk size handed to ``tree.iterate``.

    Returns:
        The number of entries written to the output tree.

    Raises:
        RuntimeError: If ``root_files`` is empty.
    """
    if not root_files:
        raise RuntimeError("No ROOT files found (from file_index.json or direct .root scan).")

    n_written = 0
    wrote_tree = False
    # Tracks the one-time report so it describes the first file that could
    # actually be opened, even when the first entry in the list is unreadable.
    announced = False

    # uproot.recreate creates the output file, overwriting an existing one.
    with uproot.recreate(out_root_path) as fout:
        for rf in root_files:
            try:
                with uproot.open(rf) as fin:
                    tree_name = pick_tree_name(fin)
                    tree = fin[tree_name]

                    available = set(tree.keys())
                    keep = [b for b in features if b in available]
                    missing = [b for b in features if b not in available]

                    if not announced:
                        announced = True
                        print(f"[INFO] Using tree '{tree_name}' from first readable file.")
                        if missing:
                            print(f"[WARN] Missing {len(missing)} branches (will be skipped): {missing[:10]} ...")

                    if not keep:
                        print(f"[WARN] No requested branches found in {rf}; skipping.")
                        continue

                    for chunk in tree.iterate(keep, step_size=step_size, library="ak"):
                        chunk_np = to_numpy_dict(chunk, keep)
                        if not chunk_np:
                            continue

                        if max_events is not None:
                            remaining = max_events - n_written
                            if remaining <= 0:
                                # Only reachable with a non-positive cap: write nothing.
                                print(f"[INFO] Reached MAX_EVENTS={max_events}; stopping.")
                                return n_written
                            # Trim the final chunk so the cap is exact instead of
                            # being rounded up to a whole chunk.
                            chunk_np = {k: v[:remaining] for k, v in chunk_np.items()}

                        # Create the output tree on the first chunk, using the
                        # NumPy dtypes of that chunk as the branch types.
                        # NOTE(review): a later file exposing a different set of
                        # branches presumably fails in extend() and is skipped
                        # by the handler below -- confirm this is intended.
                        if not wrote_tree:
                            branch_types = {k: v.dtype for k, v in chunk_np.items()}
                            fout.mktree("Events", branch_types)  # output tree name
                            wrote_tree = True

                        fout["Events"].extend(chunk_np)

                        # All arrays in a chunk share one length; use the first.
                        n_written += len(next(iter(chunk_np.values())))

                        if (max_events is not None) and (n_written >= max_events):
                            print(f"[INFO] Reached MAX_EVENTS={max_events}; stopping.")
                            return n_written

            except Exception as e:
                # Deliberate best-effort: report the file and move on to the next.
                print(f"[WARN] Failed to process {rf}: {e}")

    return n_written

# --- Run for each dataset ---
# Nothing is processed until FEATURES50 holds exactly 50 branch names.
if not FEATURES50 or len(FEATURES50) != 50:
    print(f"[ACTION REQUIRED] Please set FEATURES50 with exactly 50 branch names. Current len={len(FEATURES50)}")
else:
    for ds_name, ds_path in DATASET_PATHS.items():
        # Each dataset gets its own folder holding the slim ROOT file and a manifest.
        ds_out_dir = os.path.join(OUT_BASE, ds_name)
        os.makedirs(ds_out_dir, exist_ok=True)

        root_files, json_paths = collect_root_files(ds_path)

        out_root = os.path.join(ds_out_dir, f"derived_{ds_name}.root")
        manifest = {
            "dataset": ds_name,
            "dataset_path": ds_path,
            "out_root": out_root,
            "n_features_requested": len(FEATURES50),
            "features_requested": FEATURES50,
            "file_index_json_found": json_paths,
            "root_files_found_count": len(root_files),
            "root_files_found_preview": root_files[:20],
            "max_events": MAX_EVENTS,
            "step_size": STEP_SIZE,
        }

        # The manifest is written before slimming, so it records what was
        # requested and found even if the slimming step fails.
        with open(os.path.join(ds_out_dir, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)

        print(f"\n=== {ds_name} ===")
        print(f"[INFO] root files found: {len(root_files)}")

        if not root_files:
            # write_slim_root raises RuntimeError on an empty list; skip this
            # dataset so the remaining ones are still processed.
            print(f"[WARN] No ROOT files found for {ds_name}; skipping.")
            continue

        print(f"[INFO] writing -> {out_root}")

        n_written = write_slim_root(
            root_files=root_files,
            out_root_path=out_root,
            features=FEATURES50,
            max_events=MAX_EVENTS,
            step_size=STEP_SIZE,
        )

        print(f"[DONE] Wrote {n_written} events to {out_root}")
        print(f"[DONE] Folder: {ds_out_dir}")
