In [None]:
# Cell 1 — Install deps (no XRootD needed)
!pip -q install "uproot>=5" awkward vector rich tqdm pandas pyarrow fastparquet matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.8/393.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.7/656.7 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.2/181.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import uproot


In [None]:
# Kaggle input directory of the derived samples; each sample is read from its
# own sub-folder of this path by load_root_folder (which globs "*.root").
cmsderived = "/kaggle/input/datasets/katakuricharlotte/cms-derivedroot/derivedroot"
# Kaggle input directory that is loaded with label -1 and source "REAL".
cms2016G   = "/kaggle/input/datasets/hiteshrs/cms2016g29-5785/processed_events"


In [None]:
def load_root_folder(folder, label, source, max_files=None):
    """Read every ``*.root`` file directly inside ``folder`` into one DataFrame.

    Parameters
    ----------
    folder : str
        Directory searched (non-recursively) for ``*.root`` files.
    label : scalar
        Value written to the ``label`` column of every row.
    source : scalar
        Value written to the ``source`` column of every row.
    max_files : int or None, optional
        Keep only the first ``max_files`` files in sorted order. ``None``
        (the default) keeps all of them.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames with a fresh integer
        index, plus the ``label`` and ``source`` columns.

    Raises
    ------
    ValueError
        If no file is selected (empty or missing folder, or ``max_files=0``).
    """
    files = sorted(glob.glob(folder + "/*.root"))
    # Compare against None so that max_files=0 is not mistaken for "no limit".
    if max_files is not None:
        files = files[:max_files]
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail early with the path that was searched instead.
        raise ValueError(
            f"No .root files selected from '{folder}' (max_files={max_files})"
        )

    dfs = []
    for f in files:
        with uproot.open(f) as file:
            # Prefer the first TTree in the file; the first key overall may be
            # a directory or histogram. Fall back to the first key when no
            # object is classified as a TTree.
            tree_keys = file.keys(filter_classname="TTree") or file.keys()
            tree = file[tree_keys[0]]
            df = tree.arrays(library="pd")
        # The frame is fully materialised, so the file can be closed before tagging.
        df["label"] = label
        df["source"] = source
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)


In [None]:
# Simulated samples, one sub-folder of `cmsderived` each.
# The first three are tagged label 0, the SUSY sample is tagged label 1.
dy = load_root_folder(
    folder=f"{cmsderived}/DYJetsToLL_0J_TuneCP5", label=0, source="DY"
)
wj = load_root_folder(
    folder=f"{cmsderived}/WJetsToLNu_TuneCP5", label=0, source="WJets"
)
tt = load_root_folder(
    folder=f"{cmsderived}/TTJets_TuneCP5", label=0, source="TT"
)
susy = load_root_folder(
    folder=f"{cmsderived}/SMS-TChiWZ_ZToLL", label=1, source="SUSY"
)


In [None]:
# Events from the `cms2016G` folder; label -1 marks rows that carry no 0/1 class.
real = load_root_folder(folder=cms2016G, label=-1, source="REAL")


In [None]:
# Stack all five loaded frames into a single table with a fresh index.
all_frames = [dy, wj, tt, susy, real]
df = pd.concat(all_frames, ignore_index=True)

print(df.shape)
df.head()


In [None]:
# Column dtypes and non-null counts first ...
df.info()
# ... then summary statistics, one row per described column (first 20 shown).
summary_stats = df.describe().T
summary_stats.head(20)


In [None]:
# Overlay the normalised MET_pt shape of each simulated sample.
met_by_source = {
    name: df.loc[df.source == name, "MET_pt"].dropna()  # NaNs would break the binning
    for name in ["DY", "WJets", "TT", "SUSY"]
}
# One set of bin edges for all samples: with a bare bin count every sample is
# binned over its own min/max, so the overlaid curves would not share bins.
met_edges = np.histogram_bin_edges(
    pd.concat(list(met_by_source.values())), bins=100
)

fig, ax = plt.subplots(figsize=(7,5))
for s, subset in met_by_source.items():
    ax.hist(subset, bins=met_edges, histtype="step", density=True, label=s)

ax.set_yscale("log")
ax.set_xlabel("MET_pt")
ax.set_ylabel("Normalised density")
ax.legend()
ax.set_title("MET comparison")
plt.show()


In [None]:
# Overlay the normalised nJet distribution of each simulated sample.
njet_by_source = {
    name: df.loc[df.source == name, "nJet"].dropna()  # NaNs would break the binning
    for name in ["DY", "WJets", "TT", "SUSY"]
}
njet_all = pd.concat(list(njet_by_source.values()))
if njet_all.empty:
    # Nothing to align the bins to; keep the plain bin count.
    njet_edges = 10
else:
    # Assumes nJet holds integer counts (the plot is titled "Jet multiplicity")
    # — TODO confirm. One unit-wide bin centred on each integer avoids the
    # uneven coverage that 10 equal-width bins give, and all samples share
    # the same edges.
    njet_edges = np.arange(
        np.floor(njet_all.min()), np.ceil(njet_all.max()) + 2
    ) - 0.5

fig, ax = plt.subplots(figsize=(7,5))
for s, subset in njet_by_source.items():
    ax.hist(subset, bins=njet_edges, histtype="step", density=True, label=s)

ax.set_xlabel("nJet")
ax.set_ylabel("Normalised density")
ax.legend()
ax.set_title("Jet multiplicity")
plt.show()


In [None]:
# Overlay the normalised HT shape of each simulated sample.
ht_by_source = {
    name: df.loc[df.source == name, "HT"].dropna()  # NaNs would break the binning
    for name in ["DY", "WJets", "TT", "SUSY"]
}
# One set of bin edges for all samples: with a bare bin count every sample is
# binned over its own min/max, so the overlaid curves would not share bins.
ht_edges = np.histogram_bin_edges(
    pd.concat(list(ht_by_source.values())), bins=100
)

fig, ax = plt.subplots(figsize=(7,5))
for s, subset in ht_by_source.items():
    ax.hist(subset, bins=ht_edges, histtype="step", density=True, label=s)

ax.set_yscale("log")
ax.set_xlabel("HT")
ax.set_ylabel("Normalised density")
ax.legend()
ax.set_title("HT comparison")
plt.show()


In [None]:
# Overlay the normalised M_ll shape of the DY, TT and SUSY samples.
mll_by_source = {
    name: df.loc[df.source == name, "M_ll"].dropna()  # NaNs would break the binning
    for name in ["DY", "TT", "SUSY"]
}
# One set of bin edges for all samples: with a bare bin count every sample is
# binned over its own min/max, so the overlaid curves would not share bins.
mll_edges = np.histogram_bin_edges(
    pd.concat(list(mll_by_source.values())), bins=120
)

fig, ax = plt.subplots(figsize=(7,5))
for s, subset in mll_by_source.items():
    ax.hist(subset, bins=mll_edges, histtype="step", density=True, label=s)

ax.set_xlabel("M_ll")
ax.set_ylabel("Normalised density")
ax.legend()
ax.set_title("Dilepton mass")
plt.show()


In [None]:
# Pairwise correlation of all numeric columns, excluding rows with label -1.
# The numeric `label` column itself is part of the matrix.
corr = df[df.label!=-1].corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12,10))
# center=0 pins the middle of the diverging colour map to zero correlation.
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Feature correlations (MC only)")
plt.show()


In [None]:
# Split the labelled rows: label 1 is the SUSY sample, label 0 the other simulated samples.
susy_df = df[df.label==1]
sm_df   = df[df.label==0]

sm_met = sm_df["MET_pt"].dropna()      # NaNs would break the binning
susy_met = susy_df["MET_pt"].dropna()
# Shared bin edges so the two filled histograms line up bin by bin; a bare
# bin count would bin each class over its own min/max.
met_edges = np.histogram_bin_edges(pd.concat([sm_met, susy_met]), bins=100)

fig, ax = plt.subplots(figsize=(7,5))
ax.hist(sm_met, bins=met_edges, alpha=0.5, density=True, label="SM")
ax.hist(susy_met, bins=met_edges, alpha=0.5, density=True, label="SUSY")
ax.set_xlabel("MET_pt")
ax.set_ylabel("Normalised density")
ax.legend()
ax.set_title("SM vs SUSY MET")
plt.show()
