In [5]:
!pip -q install -U uproot awkward

import os, re, glob, shutil
import uproot


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.8/393.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.7/656.7 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import os, re, glob, shutil, pathlib

# Source dataset directory and destination directory for the converted files.
DATA_DIR = "/kaggle/input/datasets/katakuricharlotte/doublemuon2016g"
OUT_DIR  = "/kaggle/working/root_converted"

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)

# Recursively glob everything below DATA_DIR, keep only regular files,
# and order the resulting paths lexicographically.
all_files = sorted(filter(os.path.isfile, glob.glob(DATA_DIR + "/**", recursive=True)))

print("Total files:", len(all_files))
print("First 20:")
for found_path in all_files[:20]:
    print(found_path)


Total files: 5
First 20:
/kaggle/input/datasets/katakuricharlotte/doublemuon2016g/CMS_Run2016G_DoubleMuon_NANOAOD_UL2016_MiniAODv2_NanoAODv9-v2_2430000_file_index.json_0
/kaggle/input/datasets/katakuricharlotte/doublemuon2016g/CMS_Run2016G_DoubleMuon_NANOAOD_UL2016_MiniAODv2_NanoAODv9-v2_2430000_file_index.json_1
/kaggle/input/datasets/katakuricharlotte/doublemuon2016g/CMS_Run2016G_DoubleMuon_NANOAOD_UL2016_MiniAODv2_NanoAODv9-v2_2430000_file_index.json_10
/kaggle/input/datasets/katakuricharlotte/doublemuon2016g/CMS_Run2016G_DoubleMuon_NANOAOD_UL2016_MiniAODv2_NanoAODv9-v2_2430000_file_index.json_11
/kaggle/input/datasets/katakuricharlotte/doublemuon2016g/CMS_Run2016G_DoubleMuon_NANOAOD_UL2016_MiniAODv2_NanoAODv9-v2_2430000_file_index.json_12


In [7]:
def is_root_file(path):
    """Return True if the file at ``path`` begins with the four bytes ``b"root"``.

    Only the first four bytes are read. Files shorter than four bytes
    (including empty files) yield False. Errors from ``open`` propagate
    to the caller.
    """
    with open(path, "rb") as handle:
        magic = handle.read(4)
    # ROOT magic bytes
    return magic == b"root"

# Captures (everything before the last "_<digits>", the digits) for paths
# such as "..._0", "..._1", "..._12".
part_re = re.compile(r"^(.*)_(\d+)$")  # matches "..._0", "..._1", ...


def group_by_base(files):
    """Separate paths with a trailing ``_<number>`` suffix from those without.

    Parameters
    ----------
    files : iterable of str
        Paths to classify.

    Returns
    -------
    (singles, groups)
        ``singles`` is the list of paths that do not end in ``_<digits>``,
        in input order. ``groups`` maps each base (the path with the suffix
        removed) to the list of its full paths, ordered by the integer value
        of the suffix. Paths with equal suffix values keep their input order.
    """
    loose = []
    indexed = {}
    for path in files:
        hit = part_re.match(path)
        if hit is None:
            loose.append(path)
            continue
        stem, number = hit.groups()
        indexed.setdefault(stem, []).append((int(number), path))

    # Order each group's members by numeric suffix and drop the suffix values.
    ordered = {
        stem: [path for _, path in sorted(pairs, key=lambda pair: pair[0])]
        for stem, pairs in indexed.items()
    }
    return loose, ordered

# Split the discovered paths into unsuffixed files (`singles`) and
# "_N"-suffixed groups keyed by their base path (`groups`), then report
# how many of each were found.
singles, groups = group_by_base(all_files)
print("Singles:", len(singles), "Grouped (have _N suffix):", len(groups))


Singles: 0 Grouped (have _N suffix): 1


In [None]:
converted = []  # destination paths of ROOT files written into OUT_DIR
skipped = []    # inputs whose (first) file does not start with the ROOT magic bytes
failed = []     # (input, error message) pairs for anything that raised


def _with_root_ext(name):
    """Return ``name`` unchanged if it ends in ``.root`` (any case), else with ``.root`` appended."""
    return name if name.lower().endswith(".root") else name + ".root"


def _join_parts(parts, dst):
    """Concatenate ``parts`` in order into ``dst``; delete ``dst`` again if anything fails."""
    try:
        with open(dst, "wb") as w:
            for part in parts:
                with open(part, "rb") as r:
                    shutil.copyfileobj(r, w)

        # quick validation: should still start with ROOT magic bytes
        if not is_root_file(dst):
            raise RuntimeError("Joined file does not start with ROOT header")
    except Exception:
        # Do not leave a truncated/corrupt output behind for later cells to pick up.
        if os.path.exists(dst):
            os.remove(dst)
        raise


# 1) Handle single files: copy anything with a ROOT header, ensuring a .root extension.
for src in singles:
    try:
        if is_root_file(src):
            dst = os.path.join(OUT_DIR, _with_root_ext(os.path.basename(src)))
            shutil.copyfile(src, dst)
            converted.append(dst)
        else:
            skipped.append(src)
    except Exception as e:
        failed.append((src, str(e)))

# 2) Handle files that share a base name and differ only in a numeric "_N" suffix.
#    Such a group is either one ROOT file split into byte chunks (only the first
#    chunk carries the header) or a set of independent ROOT files that were merely
#    numbered (every file carries its own header). Byte-concatenating the latter
#    would yield a file in which only the first member is readable, so the two
#    cases are told apart by checking the magic bytes of every part.
for base, parts in groups.items():
    try:
        has_header = [is_root_file(part) for part in parts]

        if not has_header[0]:
            skipped.append(base + " (group not ROOT)")

        elif len(parts) > 1 and all(has_header):
            # Every part is a complete ROOT file: export each one separately.
            for part in parts:
                try:
                    dst = os.path.join(OUT_DIR, _with_root_ext(os.path.basename(part)))
                    shutil.copyfile(part, dst)
                    converted.append(dst)
                except Exception as e:
                    failed.append((part, str(e)))

        elif any(has_header[1:]):
            # Some, but not all, later parts carry a header: cannot tell where one
            # file ends and the next begins, so refuse rather than guess.
            raise RuntimeError(
                "Ambiguous group: some parts after the first start with the ROOT header"
            )

        else:
            # Genuine split file. The suffix is everything after the last "_";
            # a gap or duplicate in the numbering means data is missing and the
            # joined file would be corrupt.
            indices = [int(part.rsplit("_", 1)[1]) for part in parts]
            expected = list(range(indices[0], indices[0] + len(indices)))
            if indices != expected:
                raise RuntimeError(
                    f"Split parts are not consecutive: found suffixes {indices}"
                )

            dst = os.path.join(OUT_DIR, _with_root_ext(os.path.basename(base)))
            _join_parts(parts, dst)
            converted.append(dst)
    except Exception as e:
        failed.append((base, str(e)))

print("Converted ROOT files:", len(converted))
print("Skipped (non-ROOT):", len(skipped))
print("Failed:", len(failed))

print("\nExamples converted:")
for p in converted[:10]:
    print(p)

if failed:
    print("\nFirst failures:")
    for x in failed[:5]:
        print(x)


In [4]:
import uproot

# Open up to five of the converted files and list their top-level contents.
for root_path in converted[:5]:
    with uproot.open(root_path) as root_file:
        file_label = os.path.basename(root_path)
        print("\nFILE:", file_label)

        # Top-level keys only, without cycle numbers; show at most 20 of them.
        top_level_keys = root_file.keys(recursive=False, cycle=False)
        print("keys:", top_level_keys[:20])

        # Report the entry count when the file contains an "Events" object.
        if "Events" in root_file:
            events = root_file["Events"]
            print("Events entries:", events.num_entries)


ModuleNotFoundError: No module named 'uproot'