# PHYS805 Final Project Notebook

- Overview:
    - Supervised
        - Use transformer over jets
        - Attach classifier head
        - Train end-to-end to predict signal vs. background using cross-entropy (CE) loss
    - Self-supervised + supervised probe
        - Pretrain: 
            - First train using InfoNCE. 
            - No labels, no pretext objective. 
            - The goal is just to learn a good representation. 
            - Would NOT use labels here.
        - Freeze the transformer encoder, then train a small classification head to predict signal vs. background. Labels are used here.

<!-- ```
ml4phys$ eosls /store/group/lpcemj/EMJAnalysis2025
QCD_PT-1000to1400_TuneCP5_13p6TeV_pythia8
QCD_PT-100to1400_TuneCP5_13p6TeV_pythia8
QCD_PT-120to170_TuneCP5_13p6TeV_pythia8
QCD_PT-1400to1800_TuneCP5_13p6TeV_pythia8
QCD_PT-15to30_TuneCP5_13p6TeV_pythia8
QCD_PT-170to300_TuneCP5_13p6TeV_pythia8
QCD_PT-1800to2400_TuneCP5_13p6TeV_pythia8
QCD_PT-2400to3200_TuneCP5_13p6TeV_pythia8
QCD_PT-300to470_TuneCP5_13p6TeV_pythia8
QCD_PT-30to50_TuneCP5_13p6TeV_pythia8
QCD_PT-3200_TuneCP5_13p6TeV_pythia8
QCD_PT-470to600_TuneCP5_13p6TeV_pythia8
QCD_PT-50to80_TuneCP5_13p6TeV_pythia8
QCD_PT-600to800_TuneCP5_13p6TeV_pythia8
QCD_PT-800to1000_TuneCP5_13p6TeV_pythia8
QCD_PT-80to120_TuneCP5_13p6TeV_pythia8
``` -->

In [None]:
import uproot
import awkward as ak
import numpy as np
import matplotlib.pyplot as plt
import yaml
import torch
from sklearn.model_selection import train_test_split

# auto reload of imported modules
%load_ext autoreload
%autoreload 2

import importlib
from utils import utils

def reload_utils():
    """Force a reload of the project ``utils`` module.

    Rebinds the notebook-level name ``utils`` to the reloaded module so that
    later cells pick up edits made to the module's source file.

    Returns:
        The reloaded module object.
    """
    # `utils` must be declared global: importing it inside the function body
    # would make the name local to the function, and the reload call would
    # then fail with UnboundLocalError before the import ever ran.
    global utils
    utils = importlib.reload(utils)
    return utils

In [None]:
# Data configuration: dataset catalogue, input branches, split fractions, RNG seed.
with open("datasets.yaml", "r") as f:
    ds_cfg = yaml.safe_load(f)

# Per-jet branches used as model inputs.
features = [
    "Jet_pt",
    "Jet_eta",
    "Jet_phi",
    "Jet_mass",
]
# Additional branches read alongside the features (nJet drives the event selection).
other_branches = [
    "nJet",
    "Pileup_nPU",
]
branches = features + other_branches

# Fraction of all events held out from training (validation + test together).
test_split = 0.2
# Fraction used to divide the held-out events between validation and test
# (0.5 gives two equal halves).
val_split = 0.5

# Seed the global NumPy and PyTorch RNGs so that shuffling, the train/val/test
# split and DataLoader ordering are reproducible under Restart & Run All.
rng_seed = 42
np.random.seed(rng_seed)
torch.manual_seed(rng_seed)

In [None]:
# Read the signal (EMJ) and background (QCD) samples with identical settings:
# only the configured branches, capped at 20k entries per call.
sig, bkg = (
    utils.load_data(ds_cfg, sample_key, filter_name=branches, entry_stop=20_000)
    for sample_key in ("EMJ", "QCD")
)

In [None]:
def _first_two_jets(events):
    """Keep events with at least two jets and slice the first two jets' features."""
    at_least_two = events['nJet'] >= 2
    return events[at_least_two][features][:, :2]


# Feature selection, then conversion to tensors (signal tagged 1, background 0).
sig_ftrs = _first_two_jets(sig)
bkg_ftrs = _first_two_jets(bkg)
sig_tensor = utils.ak_to_torch(sig_ftrs, label=1)
bkg_tensor = utils.ak_to_torch(bkg_ftrs, label=0)

# Stack both classes along the first axis and shuffle the rows so the two
# classes are interleaved.
data_tensor = torch.cat((sig_tensor, bkg_tensor), dim=0)
row_order = torch.randperm(data_tensor.size(0))
data_tensor = data_tensor[row_order]

sig_tensor.shape, bkg_tensor.shape

In [None]:
# Split: hold out `test_split` of all events from training, then divide the
# held-out part into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    data_tensor[:, :-1],  # inputs: every column except the last
    data_tensor[:, -1],   # targets: last column (presumably the label set via ak_to_torch — confirm)
    test_size=test_split,
    shuffle=True,
)

# `val_split` is the share of the held-out events that becomes the validation
# set. The first pair returned by train_test_split is the "train" side, so the
# fraction is passed as `train_size`; passing it as `test_size` would size the
# test set instead and only agree when val_split == 0.5.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    train_size=val_split,
)

y_train.shape, y_val.shape, y_test.shape

In [None]:
# Wrap each (inputs, targets) pair in a JetDataset, in train / val / test order.
train_ds, val_ds, test_ds = (
    utils.JetDataset(inputs, targets)
    for inputs, targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
batch_size = 32


def _make_loader(dataset, reshuffle):
    """Build a DataLoader over `dataset` using the notebook-wide batch size."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=reshuffle,
    )


# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = _make_loader(train_ds, True)
val_loader = _make_loader(val_ds, False)
test_loader = _make_loader(test_ds, False)