In [1]:
# Standard library
import json

# Arrays / numerics
import awkward as ak
import numpy as np
# Hists
# import hist
# from hist import Hist
# NanoEvents
from coffea.nanoevents import NanoEventsFactory, BaseSchema, NanoAODSchema
# Processors
# from coffea import processor
from coffea.analysis_tools import PackedSelection

# import utils  # project-local; still needed by the "File Construction" cell

# Torch / PyTorch Geometric
import torch
import torch.nn as nn
import torch.nn.functional as F
# import torch_geometric.data as geom_data
# import torch_geometric.nn as geom_nn
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv

: 

In [3]:
### GLOBAL CONFIGURATION
# Number of input files per process, e.g. 10 (a smaller number runs faster).
N_FILES_MAX_PER_SAMPLE = 1

# Switches for Dask and ServiceX (both disabled).
USE_DASK, USE_SERVICEX = False, False

### ML-INFERENCE SETTINGS
# Switches for ML inference and for inference through an NVIDIA Triton server (both disabled).
USE_INFERENCE, USE_TRITON = False, False

# From nanoaod_inputs.json

## ttbar
- nominal: {'nevts_total': 276079127, 'files': [{'path': ..., 'nevts': ...}, ...]}
- scaledown: {'nevts_total': 39329663, 'files': [{'path': ..., 'nevts': ...}, ...]}
- scaleup: {'nevts_total': 38424467, 'files': [{'path': ..., 'nevts': ...}, ...]}
- ME_var: {'nevts_total': 19098219, 'files': [{'path': ..., 'nevts': ...}, ...]}
- PS_var: {'nevts_total': 19337064, 'files': [{'path': ..., 'nevts': ...}, ...]}

## single_top_s_chan
- "nominal": {"nevts_total": 2867199, "files": [{"path": ..., "nevts": ...}, ...]}

## single_top_t_chan
- "nominal": {"nevts_total": 109305936, "files": [{"path": ..., "nevts": ...}, ...]}

## single_top_tW
- "nominal": {"nevts_total": 1999400, "files": [{"path": ..., "nevts": ...}, ...]}

## wjets
- "nominal": {"nevts_total": 433719099, "files": [{"path": ..., "nevts": ...}, ...]}

open('').keys()

['tag;6',
 'tag;5',
 'tag;4',
 'tag;3',
 'tag;2',
 'tag;1',
 'Events;1',
 'LuminosityBlocks;1',
 'Runs;1',
 'MetaData;1',
 'ParameterSets;1']


In [3]:
# Manual cross-check of the sample catalogue, read directly from the JSON file instead of going through
# utils.file_input.construct_fileset(N_FILES_MAX_PER_SAMPLE=1, use_xcache=False, af_name="coffea_casa",
#                                    input_from_eos=False, xcache_atlas_prefix=None).
# `json` is provided by the notebook's import cell.
with open('nanoaod_inputs.json', encoding='utf-8') as f:
    file_info = json.load(f)

# The top-level keys are the process names.
file_info.keys()

dict_keys(['ttbar', 'single_top_s_chan', 'single_top_t_chan', 'single_top_tW', 'wjets'])

In [4]:
# Cross section used for every variation of a given process.
xsec_info = {
    "ttbar": 396.87 + 332.97, # nonallhad + allhad, keep same x-sec for all
    "single_top_s_chan": 2.0268 + 1.2676,
    "single_top_t_chan": (36.993 + 22.175)/0.252,  # scale from lepton filter to inclusive
    "single_top_tW": 37.936 + 37.906,
    "wjets": 61457 * 0.252,  # e/mu+nu final states
    "data": None
}

# Build one fileset entry per (process, variation) pair, keyed as '<process>__<variation>'.
fileset = {}
for process, variations in file_info.items():  # 'ttbar', 'single_top_s_chan', 'single_top_t_chan', 'single_top_tW', 'wjets'
    for variation, sample in variations.items():  # each variation carries 'nevts_total' and 'files'
        file_list = sample['files']  # every file entry has 'path' and 'nevts'
        file_path = [entry['path'] for entry in file_list]
        # Summed per-file counts; the author's check against sample['nevts_total'] reported a match.
        nevts_total = sum(entry['nevts'] for entry in file_list)
        metadata = {"process": process, "variation": variation, "nevts": nevts_total, "xsec": xsec_info[process]}
        fileset[f'{process}__{variation}'] = {'files': file_path, 'metadata': metadata}
        
    

In [5]:
# Show the '<process>__<variation>' keys of the fileset.
fileset.keys()

dict_keys(['ttbar__nominal', 'ttbar__scaledown', 'ttbar__scaleup', 'ttbar__ME_var', 'ttbar__PS_var', 'single_top_s_chan__nominal', 'single_top_t_chan__nominal', 'single_top_tW__nominal', 'wjets__nominal'])

In [6]:
# Inspect the metadata (process, variation, nevts, xsec) stored for one ttbar variation.
fileset['ttbar__scaledown']['metadata']

{'process': 'ttbar',
 'variation': 'scaledown',
 'nevts': 39329663,
 'xsec': 729.84}

## File Construction

In [7]:
# NOTE(review): `utils` only appears as a commented-out import at the top of this notebook, so that
# import has to be restored before this cell can run on a fresh kernel.
# This call also rebinds `fileset`, replacing any dictionary assembled by hand earlier.
benchmark_cfg = utils.config["benchmarking"]
fileset = utils.file_input.construct_fileset(
    N_FILES_MAX_PER_SAMPLE,
    use_xcache=False,
    af_name=benchmark_cfg["AF_NAME"],  # "coffea_casa"; local files on /data for af_name="ssl-dev"
    input_from_eos=benchmark_cfg["INPUT_FROM_EOS"],  # False
    xcache_atlas_prefix=benchmark_cfg["XCACHE_ATLAS_PREFIX"],  # None
)

# Summarise the result: all keys, then the first file and the metadata of the nominal ttbar sample.
print(f"processes in fileset: {list(fileset.keys())}")
print(f"\nexample of information in fileset:\n{{\n  'files': [{fileset['ttbar__nominal']['files'][0]}, ...],")
print(f"  'metadata': {fileset['ttbar__nominal']['metadata']}\n}}")

processes in fileset: ['ttbar__nominal', 'ttbar__scaledown', 'ttbar__scaleup', 'ttbar__ME_var', 'ttbar__PS_var', 'single_top_s_chan__nominal', 'single_top_t_chan__nominal', 'single_top_tW__nominal', 'wjets__nominal']

example of information in fileset:
{
  'files': [https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root, ...],
  'metadata': {'process': 'ttbar', 'variation': 'nominal', 'nevts': 1334428, 'xsec': 729.84}
}


# coffea

In [4]:
# Open the 'Events' tree of a single remote ttbar file with the NanoAOD schema.
# (Dict-style alternative kept for reference: NanoEventsFactory.from_root({file_path: tree_name}, schemaclass=NanoAODSchema).)
tree_name = 'Events'
file_path = 'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root'
event_factory = NanoEventsFactory.from_root(
    file_path,
    treepath=tree_name,
    schemaclass=NanoAODSchema,
)
events = event_factory.events()



In [6]:
# Short handles for the three object collections used by the selection.
Muon, Electron, Jet = events.Muon, events.Electron, events.Jet

In [9]:
# Object-level requirements, split into a kinematic part and a quality part per collection.
muon_kinematics = (Muon.pt > 30) & (np.abs(Muon.eta) < 2.1)
muon_quality = Muon.tightId & (Muon.sip3d < 4) & (Muon.pfRelIso04_all < 0.15)
muon_mask = muon_kinematics & muon_quality

electron_kinematics = (Electron.pt > 30) & (np.abs(Electron.eta) < 2.1)
electron_quality = (Electron.cutBased == 4) & (Electron.sip3d < 4)
electron_mask = electron_kinematics & electron_quality

jet_kinematics = (Jet.pt > 30) & (np.abs(Jet.eta) < 2.4)
jet_mask = jet_kinematics & Jet.isTightLeptonVeto

# Keep only the objects that pass their mask (event structure is preserved).
muons, electrons, jets = Muon[muon_mask], Electron[electron_mask], Jet[jet_mask]

In [10]:
B_TAG_THRESHOLD = 0.5

# Per-event multiplicities that the event-level requirements are expressed in.
n_leptons = ak.num(electrons) + ak.num(muons)
n_jets = ak.num(jets)
n_btagged = ak.sum(jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1)

selections = PackedSelection(dtype='uint64')
# Elementary requirements, registered in the same order as before.
for selection_name, selection_mask in (
    ('exactly_1l', n_leptons == 1),  # author's note: 111711
    ('atleast_4j', n_jets >= 4),
    ('max_6j', n_jets <= 6),
    ('exactly_1b', n_btagged == 1),
    ('atleast_2b', n_btagged >= 2),
):
    selections.add(selection_name, selection_mask)

# Combined flags for each region; 'max_6j' is not part of them and has to be requested separately.
selections.add('4j1b', selections.all('atleast_4j', 'exactly_1l', 'exactly_1b'))
selections.add('4j2b', selections.all('atleast_4j', 'exactly_1l', 'atleast_2b'))

TimeoutError: The read operation timed out

In [5]:
# List the registered selection names, then show the per-event boolean mask of the '4j2b' flag.
print(selections.names)
selections.all('4j2b')

['exactly_1l', 'atleast_4j', 'max_6j', 'exactly_1b', 'atleast_2b', '4j1b', '4j2b']


array([False, False, False, ..., False, False, False])

### 4j2b region

In [None]:
# Events in the 4j2b region that also have at most six selected jets.
region_selection = selections.all('4j2b', 'max_6j')

# Rebuild the region-level collections from the object-level inputs (collection[mask]) instead of
# slicing `jets` / `electrons` / `muons` in place: the event mask then always meets arrays of the
# full event length, so re-running this cell gives the same result.
jets = Jet[jet_mask][region_selection]
electrons = Electron[electron_mask][region_selection]
muons = Muon[muon_mask][region_selection]

# `GenPar` was never defined in this notebook.
# NOTE(review): presumably the NanoAOD generator-particle collection `events.GenPart` was meant — confirm.
genpar = events.GenPart[region_selection]

# Flag events with an even event number, restricted to the selected region.
even = (events.event % 2 == 0)[region_selection]
#region_weights = np.ones(len(region_jets)) * xsec_weight

## preprocess

In [7]:
# Design notes for constructing the GNN input matrices (node and edge features).
# NOTE(review): the notes are a bare string literal, so the cell echoes them back as its output;
# a markdown cell would carry the same text without that echo.
'''
nodes construction:
    row would be node graph (muons, electrons, jet)? or should it be the number of permutation
    columns would be properties: pt, mass, btag, qgl (maybe eta and phi as well)

edge construction:
    features: deltaR, combined mass, combine mass

Probably have to create torch.Tensor that match the size of input for node features
'''

'\nnodes construction:\n    row would be node graph (muons, electrons, jet)? or should it be the number of permutation\n    columns would be properties: pt, mass, btag, qgl (maybe eta and phi as well)\n\nedge construction:\n    features: deltaR, combined mass, combine mass\n\nProbably have to create torch.Tensor that match the size of input for node features\n'

In [None]:
def _as_momentum4d(collection):
    """Zip a collection's pt/eta/phi/mass into a record array named 'Momentum4D'."""
    # NOTE(review): 'Momentum4D' presumably targets the `vector` library's awkward behaviors, which
    # this notebook neither imports nor registers — confirm before relying on four-vector methods.
    components = {name: getattr(collection, name) for name in ('pt', 'eta', 'phi', 'mass')}
    return ak.zip(components, with_name='Momentum4D')


el_p4 = _as_momentum4d(electrons)
mu_p4 = _as_momentum4d(muons)
jet_p4 = _as_momentum4d(jets)
# Electrons and muons of the same event are merged into one lepton list.
lep_p4 = ak.concatenate((el_p4, mu_p4), axis=1)

In [None]:
def array_to_tensor(array, chunk_size=1000, dtype=torch.float32):
    """Convert an array to a ``torch.Tensor``, ``chunk_size`` rows at a time.

    Args:
        array: Array-like accepted by ``np.asarray``; chunks are taken along axis 0.
        chunk_size: Positive number of rows converted per step.
        dtype: Torch dtype of the returned tensor (default ``torch.float32``).

    Returns:
        A tensor with the same shape as ``array`` and the requested ``dtype``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    array = np.asarray(array)
    # Scalars and empty inputs have nothing to chunk, and torch.cat rejects an empty list.
    if array.ndim == 0 or array.shape[0] == 0:
        return torch.tensor(array, dtype=dtype)

    chunks = [
        torch.tensor(array[row_start:row_start + chunk_size], dtype=dtype)
        for row_start in range(0, array.shape[0], chunk_size)
    ]
    return torch.cat(chunks, dim=0)

In [10]:
# lep_phi = ak.flatten(lep_p4.phi, axis=1).to_numpy()
# lep_eta = ak.flatten(lep_p4.eta, axis=1).to_numpy()
# lep_pt = ak.flatten(lep_p4.pt, axis=1).to_numpy()
# lep_mass = ak.flatten(lep_p4.mass, axis=1).to_numpy()
# lep_features = np.vstack((lep_pt, lep_mass, lep_phi, lep_eta)).T

In [None]:
def _flat_numpy(per_event_values):
    """Remove the per-event nesting and return a NumPy array with one entry per jet."""
    return ak.flatten(per_event_values, axis=1).to_numpy()


jet_pt = _flat_numpy(jets.pt)
jet_mass = _flat_numpy(jets.mass)
jet_phi = _flat_numpy(jets.phi)
jet_eta = _flat_numpy(jets.eta)
jet_btag = _flat_numpy(jets.btagCSVV2)
jet_qgl = _flat_numpy(jets.qgl)

# Feature matrix with one row per jet (all events concatenated) and one column per quantity:
#   0: pt   1: mass   2: phi   3: eta   4: btag   5: qgl
feature_columns = (jet_pt, jet_mass, jet_phi, jet_eta, jet_btag, jet_qgl)
jet_features = np.vstack(feature_columns).T


In [None]:
# Convert the per-jet feature matrix to a torch tensor (array_to_tensor defaults to float32).
node_features = array_to_tensor(jet_features)

## Define GAT model

In [None]:
def create_adjacency_matrix(eta, phi, threshold=0.4):
    """Build a symmetric 0/1 adjacency matrix linking particles that are close in (eta, phi).

    Two distinct particles are connected when sqrt(d_eta**2 + d_phi**2) < ``threshold``,
    where d_phi is the absolute azimuthal difference folded back as ``2*pi - d_phi``
    whenever it exceeds pi. The diagonal is always zero.

    Args:
        eta: 1-D array-like of pseudorapidities (anything ``np.asarray`` accepts).
        phi: 1-D array-like of azimuthal angles, same length as ``eta``.
        threshold: Maximum distance (exclusive) for two particles to be connected.

    Returns:
        ``float64`` array of shape ``(n, n)`` holding 0.0 / 1.0 entries.

    Raises:
        ValueError: If ``eta`` and ``phi`` are not 1-D arrays of the same length.
    """
    eta = np.atleast_1d(np.asarray(eta, dtype=float))
    phi = np.atleast_1d(np.asarray(phi, dtype=float))
    if eta.ndim != 1 or eta.shape != phi.shape:
        raise ValueError(
            f"eta and phi must be 1-D and of equal length, got shapes {eta.shape} and {phi.shape}"
        )

    # Pairwise differences via broadcasting replace the Python-level double loop.
    # Note that this materialises dense (n, n) arrays.
    delta_eta = eta[:, None] - eta[None, :]
    delta_phi = np.abs(phi[:, None] - phi[None, :])
    delta_phi = np.where(delta_phi > np.pi, 2 * np.pi - delta_phi, delta_phi)
    distance = np.sqrt(delta_eta**2 + delta_phi**2)

    # A particle is never its own neighbour, even though its self-distance is 0.
    connected = (distance < threshold) & ~np.eye(eta.shape[0], dtype=bool)
    return connected.astype(float)

In [4]:
# Adjacency matrix for the first 30 rows of node_features; columns 3 and 2 are eta and phi
# (column layout noted where jet_features is built). The rows are jets flattened over events.
adj_30sams = create_adjacency_matrix(eta=node_features[:30,3], phi=node_features[:30,2])

NameError: name 'create_adjacency_matrix' is not defined

In [None]:
# Create a graph for each event
# `jet_features`, `jet_eta` and `jet_phi` are flattened over all events (one row / entry per jet),
# so indexing them with an event index would pick a single jet. The per-event slices are
# recovered from the jet multiplicities instead.
jets_per_event = ak.to_numpy(ak.num(jets, axis=1))
event_stops = np.cumsum(jets_per_event)
event_starts = event_stops - jets_per_event
assert int(jets_per_event.sum()) == len(jet_features), "jet multiplicities do not match jet_features"

graphs = []
for start, stop in zip(event_starts, event_stops):
    # Node features of this event: one row per jet.
    features = torch.tensor(jet_features[start:stop], dtype=torch.float)
    adjacency_matrix = create_adjacency_matrix(jet_eta[start:stop], jet_phi[start:stop])
    # Non-zero entries of the adjacency matrix become the (2, n_edges) edge list.
    edge_index = torch.tensor(np.array(np.nonzero(adjacency_matrix)), dtype=torch.long)
    graph = Data(x=features, edge_index=edge_index)
    graphs.append(graph)

In [None]:
class GAT(torch.nn.Module):
    """Two-layer graph attention network returning per-node log-probabilities.

    Args:
        in_channels: Number of input features per node.
        out_channels: Number of class labels (width of the output layer).
        hidden_channels: Per-head width of the first attention layer (default 8).
        heads: Number of attention heads in the first layer (default 8).
        dropout: Dropout probability, applied to the node features in ``forward``
            and passed to both ``GATConv`` layers (default 0.6).
    """

    def __init__(self, in_channels, out_channels, hidden_channels=8, heads=8, dropout=0.6):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        # The first layer's head outputs feed the second layer, hence hidden_channels * heads inputs.
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False, dropout=dropout)

    def forward(self, data):
        """Return log-softmax class scores for every node of ``data`` (needs ``x`` and ``edge_index``)."""
        x, edge_index = data.x, data.edge_index
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        # One score per class; the highest score is the prediction.
        return F.log_softmax(x, dim=1)

In [None]:
# Fix the RNG seed so placeholder labels, weight initialisation, shuffling and dropout are repeatable.
torch.manual_seed(42)

# Assume we have labels for each node (0: background, 1: signal).
# Random labels serve as a placeholder here; replace them with the actual labels.
# (`num_nodes` is a property of a PyG `Data` object, so it is read without calling it.)
labels = [torch.randint(0, 2, (g.num_nodes,), dtype=torch.long) for g in graphs]
labelled_graphs = [Data(x=g.x, edge_index=g.edge_index, y=y) for g, y in zip(graphs, labels)]

# The per-event graphs carry no train/val/test masks, so the split is done per graph, in dataset
# order: the first 60% train, the next 20% validate, the last 20% test.
n_graphs = len(labelled_graphs)
split_train_end, split_val_end = int(0.6 * n_graphs), int(0.8 * n_graphs)
loader = DataLoader(labelled_graphs[:split_train_end], batch_size=32, shuffle=True)
val_loader = DataLoader(labelled_graphs[split_train_end:split_val_end], batch_size=32)
test_loader = DataLoader(labelled_graphs[split_val_end:], batch_size=32)

# Initialize the model, optimizer, and loss function
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = GAT(in_channels=6, out_channels=2).to(device)
# NOTE(review): the placeholder labels only take the values 0/1 while the model has 4 outputs
# (original comment: "output dim would be 4?") — settle the number of classes.
model = GAT(in_channels=6, out_channels=4)
lss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)


def train():
    """Run one training epoch, then evaluate on the validation graphs.

    Returns:
        (lss, val_lss): per-batch training losses and per-batch validation losses, as floats.
    """
    model.train()
    lss = []
    for data in loader:
        # data = data.to(device)
        optimizer.zero_grad()  # clear gradients so they do not accumulate across batches
        out = model(data)  # the model reads both data.x and data.edge_index
        loss = lss_fn(out, data.y)
        loss.backward()  # derive gradients
        optimizer.step()  # update parameters based on gradients
        lss.append(loss.item())  # store the number only, not the autograd graph

    # Validation runs in eval mode (dropout off) and without gradient tracking.
    model.eval()
    val_lss = []
    with torch.no_grad():
        for data in val_loader:
            val_lss.append(lss_fn(model(data), data.y).item())
    return lss, val_lss


def test():
    """Return the per-batch node classification accuracy on the test graphs."""
    model.eval()
    acc = []
    with torch.no_grad():
        for data in test_loader:
            pred = model(data).argmax(dim=1)  # choose the class with the highest score
            n_nodes = data.y.numel()
            if n_nodes == 0:
                continue  # nothing to score in this batch
            acc.append(int((pred == data.y).sum()) / n_nodes)
    return acc


# Training loop; the per-epoch loss lists are kept instead of being overwritten.
train_history, val_history = [], []
for epoch in range(100):  # Number of epochs
    lss, val_lss = train()
    train_history.append(lss)
    val_history.append(val_lss)

# Save the model
torch.save(model.state_dict(), 'gat_model.pth')


## Dynamic Edge

In [None]:
# knn_graph is exported by torch_geometric.nn, not torch_geometric.utils.
from torch_geometric.nn import knn_graph

class DynamicEdgeGAT(torch.nn.Module):
    """Two-layer GAT whose edges are rebuilt on every forward pass as a k-nearest-neighbour graph.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Per-head width of the first attention layer.
        out_dim: Number of output classes.
        heads: Number of attention heads in the first layer (default 4).
        k: Number of nearest neighbours used to build the graph (default 5).
        dropout: Dropout probability, applied to the node features in ``forward``
            and passed to both ``GATConv`` layers (default 0.6).
    """

    def __init__(self, input_dim, hidden_dim, out_dim, heads=4, k=5, dropout=0.6):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = out_dim
        self.k = k
        self.dropout = dropout
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim * heads, out_dim, heads=1, concat=False, dropout=dropout)

    def forward(self, x, batch=None):
        """Return per-node log-probabilities.

        Args:
            x: Node feature matrix; neighbours are searched in this feature space.
            batch: Optional vector assigning each node to a graph. When given, neighbours
                are only searched within the same graph; when omitted, all nodes are
                treated as one graph.
        """
        edge_index = knn_graph(x, k=self.k, batch=batch)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Example usage:
# Assuming 16 hidden units, 5 classes, and 4 attention heads
# model = DynamicEdgeGAT(input_dim=x.size(1), hidden_dim=16, out_dim=5, heads=4)


In [68]:
# List the fields available on the selected jets.
jets.fields

['area',
 'btagCSVV2',
 'btagDeepB',
 'btagDeepCvB',
 'btagDeepCvL',
 'btagDeepFlavB',
 'btagDeepFlavCvB',
 'btagDeepFlavCvL',
 'btagDeepFlavQG',
 'chEmEF',
 'chFPV0EF',
 'chHEF',
 'eta',
 'mass',
 'muEF',
 'muonSubtrFactor',
 'neEmEF',
 'neHEF',
 'phi',
 'pt',
 'puIdDisc',
 'qgl',
 'rawFactor',
 'bRegCorr',
 'bRegRes',
 'cRegCorr',
 'cRegRes',
 'electronIdx1',
 'electronIdx2',
 'jetId',
 'muonIdx1',
 'muonIdx2',
 'nElectrons',
 'nMuons',
 'puId',
 'nConstituents',
 'genJetIdx',
 'hadronFlavour',
 'partonFlavour',
 'cleanmask',
 'electronIdx1G',
 'electronIdx2G',
 'genJetIdxG',
 'muonIdx1G',
 'muonIdx2G',
 'muonIdxG',
 'electronIdxG']

In [None]:
# Flatten the per-event jetId values to one entry per jet and wrap them in a float32 tensor.
# NOTE(review): this rebinds `labels`, which the training cell uses for a list of per-graph label
# tensors; jetId also looks like an identification flag rather than a truth label — confirm intent.
jet_id_flat = ak.flatten(jets.jetId, axis=1).to_numpy()
labels = torch.tensor(jet_id_flat.T, dtype=torch.float32)


In [65]:
# Node features. `x` was never defined in this notebook.
# NOTE(review): presumably the per-jet tensor `node_features` was meant — confirm.
x = node_features

# Extract labels
# The earlier float32 `labels` assignment was dead code (overwritten immediately), and
# `events['LabelBranch'].array()` looked like an unfilled template: an uproot-style call on a
# NanoEvents array with a placeholder branch name.
# NOTE(review): jetId is kept only as a stand-in; it appears to be an identification flag, not a
# truth label, and its values need not be contiguous class indices — replace with real labels.
labels = ak.flatten(jets.jetId, axis=1).to_numpy()
y = torch.tensor(labels, dtype=torch.long)

# Create masks for splitting data
num_nodes = x.size(0)
assert y.size(0) == num_nodes, "labels and node features must describe the same jets"

# Define the split by node position (60% training, 20% validation, 20% test)
mask_train_end, mask_val_end = int(0.6 * num_nodes), int(0.8 * num_nodes)
node_position = torch.arange(num_nodes)
train_mask = node_position < mask_train_end
val_mask = (node_position >= mask_train_end) & (node_position < mask_val_end)
test_mask = node_position >= mask_val_end

# Create the data object (`Data` comes from the notebook's import cell).
# No edge_index is attached here, so a model that reads data.edge_index cannot consume this object as is.
data = Data(x=x, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
