In [42]:
import awkward as ak
import numpy as np
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
# from coffea.analysis_tools import PackedSelection
import torch
# from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch_geometric
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader 
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
import utils
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
file_path = 'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root'
tree_name = 'Events'
# events = NanoEventsFactory.from_root({file_path: tree_name}, schemaclass=NanoAODSchema).events()
events = NanoEventsFactory.from_root(file_path, treepath=tree_name, schemaclass=NanoAODSchema).events()



In [3]:
#code from jetassignment_training
# events filtering
selected_electrons = events.Electron[(events.Electron.pt > 30) & (np.abs(events.Electron.eta)<2.1) & 
                                        (events.Electron.cutBased==4) & (events.Electron.sip3d < 4)]
selected_muons = events.Muon[(events.Muon.pt > 30) & (np.abs(events.Muon.eta)<2.1) & (events.Muon.tightId) & 
                                (events.Muon.sip3d < 4) & (events.Muon.pfRelIso04_all < 0.15)]
jet_filter = (events.Jet.pt > 30) & (np.abs(events.Jet.eta) < 2.4) & (events.Jet.isTightLeptonVeto)
selected_jets = events.Jet[jet_filter]
selected_genpart = events.GenPart
even = (events.event%2==0)
    
# single lepton requirement
event_filters = ((ak.count(selected_electrons.pt, axis=1) + ak.count(selected_muons.pt, axis=1)) == 1)
# require at least 4 jets
event_filters = event_filters & (ak.count(selected_jets.pt, axis=1) >= 4)
# require at least one jet above B_TAG_THRESHOLD
B_TAG_THRESHOLD = 0.5
event_filters = event_filters & (ak.sum(selected_jets.btagCSVV2 >= B_TAG_THRESHOLD, axis=1) >= 1)
    
# apply event filters
selected_electrons = selected_electrons[event_filters]
selected_muons = selected_muons[event_filters]
selected_jets = selected_jets[event_filters]
selected_genpart = selected_genpart[event_filters]
even = even[event_filters]
    
### only consider 4j2b (signal) region
region_filter = ak.sum(selected_jets.btagCSVV2 > B_TAG_THRESHOLD, axis=1) >= 2 # at least two b-tagged jets
selected_jets_region = selected_jets[region_filter][:,:4] # only keep top 4 jets
selected_electrons_region = selected_electrons[region_filter]
selected_muons_region = selected_muons[region_filter]
selected_genpart_region = selected_genpart[region_filter]
even = even[region_filter]

In [4]:
#don't need even
jets, electrons, muons, labels, even = utils.ml.training_filter(selected_jets_region, 
                                                        selected_electrons_region, 
                                                        selected_muons_region, 
                                                        selected_genpart_region,
                                                        even)

In [70]:
def jet_lepton_dR(jet_p4, lep_p4): #each event has 4 jets
    dR_array = np.array([])
    for event in range(len(jet_p4)):
        dR = np.array([np.sqrt((lep_p4[event].eta - jet_p4[event,i].eta)**2 + (lep_p4[event].phi - jet_p4[event,i].phi)**2) for i in range(4)])
        dR_array = np.append(dR_array, dR)
    return dR_array

def combinedMass(jet_p4, lep_p4):
    mass_array = np.array([])
    for event in range(len(jet_p4)):
        mass = [lep_p4[event].mass + jet_p4[event].mass]
        mass_array = np.append(mass_array, mass)
    return mass_array

def array_to_tensor(array, chunk_size=1000, dtype=torch.float32):
    # converting numpy.array into torch.tensor
    # chunk iteration because of too many event
    chunks_tensor = []
    num_rows, num_col = array.shape
    for row_start in range(0, num_rows, chunk_size):
        row_end = min(row_start+chunk_size, num_rows)
        small_array = array[row_start:row_end]
        small_tensor = torch.tensor(small_array, dtype=torch.float32)
        chunks_tensor.append(small_tensor)
    return torch.cat(chunks_tensor, dim=0)

# creating edge_indices for each event
def sub_event_edge_indices(num_jets_in_event):
    row = np.tile(np.arange(num_jets_in_event), num_jets_in_event)
    col = np.repeat(np.arange(num_jets_in_event), num_jets_in_event)
    #excluding edge with itself
    mask = row != col
    row = row[mask]
    col = col[mask]
    edge_idx = np.stack([row, col], axis=0)
    return edge_idx

# creating edge_indices for all event
def full_edge_indices(num_jets_in_event, num_events):
    edge_indices = []
    offset = 0
    for _ in range(num_events):
        edge_index = sub_event_edge_indices(num_jets_in_event)
        edge_index += offset
        edge_indices.append(edge_index)
        offset += num_jets_in_event
    concat_edge_indices = np.concatenate(edge_indices, axis=1)
    return (array_to_tensor(concat_edge_indices.T).T).to(dtype=int)

In [72]:
el_p4 = ak.zip({'pt': electrons.pt, 'eta': electrons.eta, 'phi': electrons.phi, 'mass': electrons.mass}, with_name= 'Momentum4D')
mu_p4 = ak.zip({'pt': muons.pt, 'eta': muons.eta, 'phi': muons.phi, 'mass': muons.mass}, with_name= 'Momentum4D')
lep_p4 = ak.concatenate((el_p4, mu_p4), axis=1)
jet_p4 = ak.zip({'pt': jets.pt, 'eta': jets.eta, 'phi': jets.phi, 'mass': jets.mass}, with_name= 'Momentum4D')

#node features from jets
jet_pt = ak.flatten(jets.pt, axis=1).to_numpy()
jet_mass = ak.flatten(jets.mass, axis=1).to_numpy()
jet_phi = ak.flatten(jets.phi, axis=1).to_numpy()
jet_eta = ak.flatten(jets.eta, axis=1).to_numpy()
jet_btag = ak.flatten(jets.btagCSVV2, axis=1).to_numpy()
jet_qgl = ak.flatten(jets.qgl, axis=1).to_numpy()
# node features from jets+leptons
jet_lep_dR = jet_lepton_dR(jet_p4, lep_p4)
jet_lep_mass = combinedMass(jet_p4, lep_p4)

# creating 2d array. Row is nodes (the total number of jets) with columns are features 
node_features = np.vstack((jet_pt, 
                           jet_mass,
                           jet_phi, 
                           jet_eta, 
                           jet_btag, 
                           jet_qgl, 
                           jet_lep_dR, #dR between each jet and lepton (in the same event, 4 jets and 1 lepton)
                           jet_lep_mass)).T #combined mass of each jet and lepton (in the same event, 4 jets and 1 lepton)

# convert node_features_array to ninde_features_tensor
node_features = array_to_tensor(node_features)
# labels_tensor for jet assignment 
labels = torch.tensor(labels.flatten(), dtype=torch.float32)
#edge_index_tensor
num_events = ak.num(jets, axis=0)
num_jets_in_event = 4
edge_index = full_edge_indices(num_jets_in_event, num_events)

  labels = torch.tensor(labels.flatten(), dtype=torch.float32)


In [73]:
node_features.shape, edge_index.shape, labels.shape

(torch.Size([25648, 8]), torch.Size([2, 76944]), torch.Size([25648]))

In [75]:
class JetGraphDataset(torch.utils.data.Dataset):
    def __init__(self, node_features, edge_index, labels):
        self.node_features = node_features
        self.edge_index = edge_index
        self.labels = labels
        self.num_examples = len(labels) // 4  # Assuming labels determine the number of examples

    def __len__(self):
        return self.num_examples

    def __getitem__(self, idx):
        node_features = self.node_features[idx * 4:(idx + 1) * 4]  # Adjust indexing as per your data structure
        label = self.labels[idx * 4:(idx + 1) * 4]  # Adjust indexing as per your data structure
        data = Data(x=node_features, edge_index=self.edge_index, y=label)
        return data


class GATModel(nn.Module):
    def __init__(self, num_features, hidden_dim, num_labels, num_heads, dropout):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(num_features, hidden_dim, heads=num_heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads, dropout=dropout)
        self.conv3 = GATConv(hidden_dim * num_heads, num_labels, heads=1, concat=True, dropout=dropout)
        self.dropout = torch.nn.Dropout(p=0.6)
        
    def forward(self, x, edge_index):
        # x, edge_index = data.x, data.edge_index
        # First GAT layer
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        # Second GAT layer (optional)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        # last layer
        x = self.dropout(x)
        x = self.conv3(x, edge_index)
        return F.softmax(x, dim=1) #F.log_softmax(x, dim=1)


def train(model, optimizer, loss_fn, train_loader, epochs=100):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for data in train_loader:
            optimizer.zero_grad()
            output = model(data.x, data.edge_index)
            loss = loss_fn(output, data.y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Train Epoch: [{epoch + 1}/{epochs}], Loss: {running_loss / len(train_loader):.4f}')

def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            output = model(data.x, data.edge_index)
            _, predicted = torch.max(output.data, 1)
            total += data.y.size(0)
            correct += (predicted == data.y).sum().item()
    accuracy = 100 * correct / total
    return accuracy

In [76]:
#Using cross-validation
num_features = node_features.shape[1]
hidden_dim = 32
num_labels = 4
num_heads = 4
dropout = 0.6
lr = 0.005
weight_decay = 5e-4
epochs = 50

# Convert labels tensor to numpy for StratifiedKFold
labels_np = labels.numpy()
# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
splits = list(skf.split(np.arange(len(labels_np) // 4), labels_np[::4]))

In [77]:
for fold, (train_idx, test_idx) in enumerate(splits):
    print(f'Fold {fold + 1}/{len(splits)}')
    
    train_mask = np.repeat(train_idx, 4)
    test_mask = np.repeat(test_idx, 4)
    
    train_dataset = JetGraphDataset(node_features[train_mask], edge_index, labels[train_mask])
    test_dataset = JetGraphDataset(node_features[test_mask], edge_index, labels[test_mask])

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = GATModel(num_features=num_features, hidden_dim=hidden_dim, num_labels=num_labels, num_heads=num_heads, dropout=dropout)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.CrossEntropyLoss()

    train(model, optimizer, loss_fn, train_loader, epochs=epochs)
    test_accuracy = test(model, test_loader)
    
    # train(model, optimizer, loss_fn, train_loader, epochs=epochs)
    # test_accuracy = test(model, test_loader)
    
    print(f'Fold {fold + 1} - Test Accuracy: {test_accuracy:.2f}%')

Fold 1/5


IndexError: Encountered an index error. Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 127] (got interval [0, 25771])