In [9]:
!pip install uproot



In [10]:
!pip install uproot awkward vector numpy scipy



In [12]:
import uproot
import awkward as ak
import numpy as np

# ============================================================
# CONFIG
# ============================================================
INPUT_ROOT  = "/kaggle/input/cms2011b-requiredbranches/merged.root"
OUTPUT_ROOT = "stageB_jet_composition_id.root"

# ============================================================
# OPEN FILE
# ============================================================
tree = uproot.open(INPUT_ROOT)["Events"]
branches = tree.keys()

# ============================================================
# HELPER
# ============================================================
def find(token):
    matches = [b for b in branches if token in b]
    if not matches:
        raise RuntimeError(f"Missing branch containing: {token}")
    if len(matches) > 1:
        print(f"⚠ Multiple matches for {token}, using {matches[0]}")
    return matches[0]

# ============================================================
# BRANCHES (CONFIRMED FROM YOUR LIST)
# ============================================================
b_pt       = find("recoPFJets_ak5PFJets__RECO.obj.pt_")
b_area     = find("recoPFJets_ak5PFJets__RECO.obj.mJetArea")

b_neuHadE  = find("mNeutralHadronEnergy")
b_neuEmE   = find("mNeutralEmEnergy")
b_chHadE   = find("mChargedHadronEnergy")
b_chEmE    = find("mChargedEmEnergy")

b_chMult   = find("mChargedMultiplicity")
b_neuMult  = find("mNeutralMultiplicity")

print("✔ All required branches resolved")

# ============================================================
# LOAD ARRAYS (ONE JET PER EVENT)
# ============================================================
arr = tree.arrays(
    [
        b_pt, b_area,
        b_neuHadE, b_neuEmE, b_chHadE, b_chEmE,
        b_chMult, b_neuMult
    ],
    library="ak"
)

def safe(x):
    return ak.fill_none(x, 0)

pt      = safe(arr[b_pt])
area    = safe(arr[b_area])

neuHadE = safe(arr[b_neuHadE])
neuEmE  = safe(arr[b_neuEmE])
chHadE  = safe(arr[b_chHadE])
chEmE   = safe(arr[b_chEmE])

chMult  = safe(arr[b_chMult])
neuMult = safe(arr[b_neuMult])

# ============================================================
# ENERGY FRACTIONS
# ============================================================
E_tot = neuHadE + neuEmE + chHadE + chEmE
E_tot = ak.where(E_tot > 0, E_tot, 1.0)

j1_neutralHadronEF = neuHadE / E_tot
j1_neutralEmEF     = neuEmE  / E_tot
j1_chargedHadronEF = chHadE  / E_tot
j1_chargedEmEF     = chEmE   / E_tot

# ============================================================
# MULTIPLICITIES
# ============================================================
j1_chargedMultiplicity  = chMult
j1_neutralMultiplicity  = neuMult
j1_numberOfConstituents = chMult + neuMult

# ============================================================
# CMS 2011 PF JET ID (CORRECT)
# ============================================================
j1_jetID_loose = (
    (j1_neutralHadronEF < 0.99) &
    (j1_neutralEmEF     < 0.99) &
    (j1_numberOfConstituents > 1) &
    (j1_chargedHadronEF > 0) &
    (j1_chargedMultiplicity > 0) &
    (j1_chargedEmEF < 0.99)
)

j1_jetID_tight = (
    (j1_neutralHadronEF < 0.90) &
    (j1_neutralEmEF     < 0.90) &
    (j1_numberOfConstituents > 1) &
    (j1_chargedHadronEF > 0) &
    (j1_chargedMultiplicity > 0) &
    (j1_chargedEmEF < 0.90)
)

# ============================================================
# ROOT-SAFE OUTPUT
# ============================================================
def f32(x): return ak.to_numpy(x).astype(np.float32)
def i32(x): return ak.to_numpy(x).astype(np.int32)

out = {
    "j1_neutralHadronEF":      f32(j1_neutralHadronEF),
    "j1_neutralEmEF":          f32(j1_neutralEmEF),
    "j1_chargedHadronEF":      f32(j1_chargedHadronEF),
    "j1_chargedEmEF":          f32(j1_chargedEmEF),
    "j1_numberOfConstituents": i32(j1_numberOfConstituents),
    "j1_chargedMultiplicity": i32(j1_chargedMultiplicity),
    "j1_neutralMultiplicity": i32(j1_neutralMultiplicity),
    "j1_jetArea":              f32(area),
    "j1_jetID_loose":          i32(j1_jetID_loose),
    "j1_jetID_tight":          i32(j1_jetID_tight),
}

with uproot.recreate(OUTPUT_ROOT) as fout:
    fout["Events"] = out

print(f"\n✅ Stage-B Jet Composition & ID written → {OUTPUT_ROOT}")


⚠ Multiple matches for recoPFJets_ak5PFJets__RECO.obj.pt_, using nrecoPFJets_ak5PFJets__RECO./recoPFJets_ak5PFJets__RECO.obj/recoPFJets_ak5PFJets__RECO.obj.pt_
⚠ Multiple matches for recoPFJets_ak5PFJets__RECO.obj.mJetArea, using nrecoPFJets_ak5PFJets__RECO./recoPFJets_ak5PFJets__RECO.obj/recoPFJets_ak5PFJets__RECO.obj.mJetArea
⚠ Multiple matches for mNeutralHadronEnergy, using nrecoPFJets_ak5PFJets__RECO./recoPFJets_ak5PFJets__RECO.obj/recoPFJets_ak5PFJets__RECO.obj.m_specific.mNeutralHadronEnergy
⚠ Multiple matches for mNeutralEmEnergy, using nrecoPFJets_ak5PFJets__RECO./recoPFJets_ak5PFJets__RECO.obj/recoPFJets_ak5PFJets__RECO.obj.m_specific.mNeutralEmEnergy
⚠ Multiple matches for mChargedHadronEnergy, using nrecoPFJets_ak5PFJets__RECO./recoPFJets_ak5PFJets__RECO.obj/recoPFJets_ak5PFJets__RECO.obj.m_specific.mChargedHadronEnergy
⚠ Multiple matches for mChargedEmEnergy, using nrecoPFJets_ak5PFJets__RECO./recoPFJets_ak5PFJets__RECO.obj/recoPFJets_ak5PFJets__RECO.obj.m_specific.mCharge

In [13]:
import uproot
import numpy as np
import pandas as pd

# ===============================
# CONFIG
# ===============================
INPUT_ROOT = "/kaggle/working/stageB_jet_composition_id.root"

# ===============================
# LOAD DATA
# ===============================
with uproot.open(INPUT_ROOT) as f:
    tree = f["Events"]
    arr = tree.arrays(library="np")

print(f"Total events: {len(arr['j1_jetArea'])}")
print(f"Available branches: {list(arr.keys())}")

# ===============================
# BASIC STATS
# ===============================
features_float = [
    "j1_neutralHadronEF",
    "j1_neutralEmEF",
    "j1_chargedHadronEF",
    "j1_chargedEmEF",
    "j1_jetArea",
]

features_int = [
    "j1_numberOfConstituents",
    "j1_chargedMultiplicity",
    "j1_neutralMultiplicity",
    "j1_jetID_loose",
    "j1_jetID_tight",
]

print("\nBasic stats:")
for k in features_float + features_int:
    v = arr[k]
    print(f"{k:30s} min/max: {np.nanmin(v)} {np.nanmax(v)}")

# ===============================
# PHYSICS SANITY CHECKS
# ===============================
print("\nPhysics sanity checks:")

# Energy fractions
ef_sum = (
    arr["j1_neutralHadronEF"]
    + arr["j1_neutralEmEF"]
    + arr["j1_chargedHadronEF"]
    + arr["j1_chargedEmEF"]
)

print("EF outside [0,1]:",
      np.sum(
          (arr["j1_neutralHadronEF"] < 0) |
          (arr["j1_neutralHadronEF"] > 1) |
          (arr["j1_neutralEmEF"] < 0) |
          (arr["j1_neutralEmEF"] > 1) |
          (arr["j1_chargedHadronEF"] < 0) |
          (arr["j1_chargedHadronEF"] > 1) |
          (arr["j1_chargedEmEF"] < 0) |
          (arr["j1_chargedEmEF"] > 1)
      )
)

print("EF sum < 0.9:", np.sum(ef_sum < 0.9))
print("EF sum > 1.1:", np.sum(ef_sum > 1.1))

# Multiplicity consistency
mult_mismatch = (
    arr["j1_numberOfConstituents"]
    != arr["j1_chargedMultiplicity"] + arr["j1_neutralMultiplicity"]
)
print("Multiplicity mismatch events:", np.sum(mult_mismatch))

# Jet area
print("Unique jet area values (first 5):",
      np.unique(arr["j1_jetArea"])[:5])

# JetID logic
tight_not_loose = np.sum(
    (arr["j1_jetID_tight"] == 1) & (arr["j1_jetID_loose"] == 0)
)
print("Tight && !Loose violations:", tight_not_loose)

# ===============================
# EVENT COUNTS
# ===============================
print("\nEvent counts:")
n_loose = np.sum(arr["j1_jetID_loose"] == 1)
n_tight = np.sum(arr["j1_jetID_tight"] == 1)

print("Loose JetID:", n_loose)
print("Tight JetID:", n_tight)
print("Loose fraction:", n_loose / len(arr["j1_jetArea"]))
print("Tight fraction:", n_tight / len(arr["j1_jetArea"]))

# ===============================
# CONTINGENCY TABLE
# ===============================
table = np.zeros((2, 2), dtype=int)
for l, t in zip(arr["j1_jetID_loose"], arr["j1_jetID_tight"]):
    table[int(l), int(t)] += 1

print("\nLoose vs Tight contingency table:")
print("rows = loose (0,1), cols = tight (0,1)")
print(table)


Total events: 452055
Available branches: ['j1_neutralHadronEF', 'j1_neutralEmEF', 'j1_chargedHadronEF', 'j1_chargedEmEF', 'j1_numberOfConstituents', 'j1_chargedMultiplicity', 'j1_neutralMultiplicity', 'j1_jetArea', 'j1_jetID_loose', 'j1_jetID_tight']

Basic stats:
j1_neutralHadronEF             min/max: 0.0 0.25
j1_neutralEmEF                 min/max: 0.0 0.25
j1_chargedHadronEF             min/max: 0.0 0.25
j1_chargedEmEF                 min/max: 0.0 0.25
j1_jetArea                     min/max: 0.0 118.0
j1_numberOfConstituents        min/max: 0 236
j1_chargedMultiplicity         min/max: 0 118
j1_neutralMultiplicity         min/max: 0 118
j1_jetID_loose                 min/max: 0 1
j1_jetID_tight                 min/max: 0 1

Physics sanity checks:
EF outside [0,1]: 0
EF sum < 0.9: 257
EF sum > 1.1: 0
Multiplicity mismatch events: 0
Unique jet area values (first 5): [0. 1. 2. 3. 4.]
Tight && !Loose violations: 0

Event counts:
Loose JetID: 451798
Tight JetID: 451798
Loose fraction: 0