In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch 
import sys

## N-hop Efficiency & Purity

In [None]:
import scipy as sp
import numpy as np
from scipy import sparse
def efficiency_performance_wrt_distance(gnn_graph, pred_graph, truth_graph, n_hop):
    """Measure how many true edges are recovered, binned by hop distance.

    All three graphs are treated as undirected. For every hop distance
    ``k = 1 .. n_hop`` the node pairs whose shortest-path distance in
    ``gnn_graph`` is exactly ``k`` are selected, and the fraction of
    ``truth_graph`` edges among those pairs that also appear in
    ``pred_graph`` is reported.

    Args:
        gnn_graph: ``(2, E)`` edge index (torch tensor or numpy array) of the
            graph that defines the hop distance.
        pred_graph: ``(2, E)`` edge index of the predicted edges.
        truth_graph: ``(2, E)`` edge index of the true (signal) edges.
        n_hop: largest hop distance to evaluate.

    Returns:
        ``(n_hop_eff, n_hop_neighbors)`` where ``n_hop_eff[k - 1]`` is the
        tuple ``(efficiency, n_found, n_signal)`` for distance ``k`` (counts
        are undirected pairs) and ``n_hop_neighbors[k - 1]`` is the symmetric
        0/1 CSR matrix (float32) of node pairs at distance exactly ``k``.
        A node is never counted as its own neighbour.
    """

    def _edges_to_numpy(edge_index):
        # Accept both torch tensors (any device) and array-likes.
        if torch.is_tensor(edge_index):
            return edge_index.cpu().numpy()
        return np.asarray(edge_index)

    gnn_edges = _edges_to_numpy(gnn_graph)
    pred_edges = _edges_to_numpy(pred_graph)
    truth_edges = _edges_to_numpy(truth_graph)

    # Size the matrices from the largest node index; skip empty edge lists,
    # on which .max() would raise.
    array_size = 1 + max(
        (int(edges.max()) for edges in (gnn_edges, pred_edges, truth_edges) if edges.size),
        default=-1,
    )

    def _symmetric_adjacency(edges):
        # Duplicate edges are summed by the COO->CSR conversion; thresholding
        # after symmetrization brings every stored entry back to 1.
        adjacency = sp.sparse.coo_matrix(
            (np.ones(edges.shape[1]), edges), shape=(array_size, array_size)
        ).tocsr()
        return ((adjacency + adjacency.T) > 0).astype(np.float32)

    e_pred = _symmetric_adjacency(pred_edges)
    e_truth = _symmetric_adjacency(truth_edges)
    e_gnn = _symmetric_adjacency(gnn_edges)

    # Breadth-first expansion: `frontier` holds the pairs at distance exactly k,
    # `visited` the pairs at distance <= k (starting from the identity, i.e.
    # distance 0). One sparse product per hop replaces recomputing A^k from
    # scratch, and subtracting `visited` keeps self-pairs out of the result.
    visited = sp.sparse.eye(array_size, dtype=np.float32, format="csr")
    frontier = visited
    n_hop_neighbors = []
    for _ in range(n_hop):
        reached = ((frontier @ e_gnn) > 0).astype(np.float32)
        frontier = ((reached - visited) > 0).astype(np.float32)
        n_hop_neighbors.append(frontier)
        # Supports are disjoint, so `visited` stays a 0/1 matrix.
        visited = visited + frontier

    n_hop_eff = []
    for neighbors in n_hop_neighbors:
        signal_num = e_truth.multiply(neighbors).sum()
        found_num = e_truth.multiply(e_pred.multiply(neighbors)).sum()
        # Matrices are symmetric, so each undirected pair is counted twice;
        # the epsilon avoids 0/0 when no true edge lies at this distance.
        n_hop_eff.append(
            (found_num / (signal_num + 1e-12), int(found_num / 2), int(signal_num / 2))
        )

    return n_hop_eff, n_hop_neighbors

In [None]:
path = "/global/cfs/cdirs/m3443/data/ITk-upgrade/processed/gnn_processed/0GeV_barrel_v3/"
event = torch.load(path + "test/0008", map_location = "cpu")

In [None]:
torch.unique(event.pid, return_inverse = True)[1].max()

In [None]:
event

In [None]:
pred_graph = event.edge_index[:,event.scores > 0.0]
gnn_graph = event.edge_index[:,(event.scores > 0.15) | (torch.rand(event.scores.shape) < 0.1)]
signal_graph = event.signal_true_edges
# n_hop_eff, n_hop_neighbors = efficiency_performance_wrt_distance(gnn_graph, pred_graph, signal_graph, 10)
# print(n_hop_eff)
# print(*[n_hop_neighbors[i].sum() for i in range(len(n_hop_neighbors))])

In [None]:
uniques, inverse = event.pid[(event.pt > 1000.) & (event.primary == 1) & (event.nhits >= 3)].unique(return_inverse = True)
print(inverse.max())

In [None]:
# Boolean mask over hits: True for every node that is an endpoint of at least
# one edge with score > 0.15.
mask = torch.tensor(len(event.pid) * [False])
conncted_nodes = torch.unique(event.edge_index[:,event.scores > 0.15])
mask[conncted_nodes] = True

# Map original node index -> compact index 0..mask.sum()-1 for the kept nodes
# (entries for dropped nodes stay 0 and must not be used).
inverse_mask = torch.zeros(len(event.pid)).long()
inverse_mask[mask] = torch.arange(mask.sum())

In [None]:
pred_graph = pred_graph[:, mask[pred_graph].all(0)]
pred_graph = inverse_mask[pred_graph]

In [None]:
from scipy.sparse.csgraph import connected_components

graph = sp.sparse.coo_matrix((np.ones(pred_graph.shape[1]), pred_graph), shape=(pred_graph.max()+1, pred_graph.max()+1)).tocsr()
n_components, labels = connected_components(graph, directed=False, return_labels=True)
print(n_components, labels)
# print(pred_graph.unique(return_inverse = True)[1])

In [None]:
score_cut = 0.5
eff = event.y[event.scores > score_cut].sum()/event.signal_true_edges.shape[1]
pur = event.y[event.scores > score_cut].sum()/(event.scores > score_cut).sum()
print("eff:{:.3f}, pur:{:.3f}".format(eff.item(), pur.item()))

In [None]:
from sklearn.metrics import RocCurveDisplay
from sklearn import svm
RocCurveDisplay.from_predictions(event.y.bool().numpy(), event.scores.numpy())

## Toy Model

In [None]:
import torch 
import sys
from scipy.sparse.csgraph import connected_components
import scipy as sp
import numpy as np
from scipy import sparse

sys.path.append('../..')

from LightningModules.GNNEmbeddings.utils import generate_toys
    
def graph_intersection(
    pred_graph, truth_graph, using_weights=False, weights_bidir=None
):
    """Label the edges of ``pred_graph`` against ``truth_graph``.

    Both edge lists are turned into sparse adjacency matrices and combined
    into a signed matrix whose stored entries are positive where a predicted
    edge is also a truth edge and negative where it is only predicted.

    Args:
        pred_graph: ``(2, E)`` edge index (torch tensor or numpy array).
        truth_graph: ``(2, E)`` edge index (torch tensor or numpy array).
        using_weights: when True, also look up a weight for every returned edge.
        weights_bidir: tensor of per-truth-edge weights, aligned with the
            columns of ``truth_graph``; only read when ``using_weights`` is True.

    Returns:
        ``(new_pred_graph, y)`` or, with ``using_weights``,
        ``(new_pred_graph, y, new_weights)``. ``new_pred_graph`` is a long
        tensor of edges taken from the signed matrix and ``y`` is True where
        the edge is also in the truth graph.
    """
    n_nodes = max(pred_graph.max().item(), truth_graph.max().item()) + 1
    dims = (n_nodes, n_nodes)

    pred_edges = pred_graph.cpu().numpy() if torch.is_tensor(pred_graph) else pred_graph
    truth_edges = truth_graph.cpu().numpy() if torch.is_tensor(truth_graph) else truth_graph

    pred_adj = sp.sparse.coo_matrix(
        (np.ones(pred_edges.shape[1]), pred_edges), shape=dims
    ).tocsr()
    truth_adj = sp.sparse.coo_matrix(
        (np.ones(truth_edges.shape[1]), truth_edges), shape=dims
    ).tocsr()
    del pred_edges

    # > 0 on edges present in both graphs, < 0 on edges only in the prediction.
    signed = pred_adj.multiply(truth_adj) - ((pred_adj - truth_adj) > 0)
    del pred_adj, truth_adj

    new_weights = None
    if using_weights:
        weight_values = weights_bidir.cpu().numpy()
        weight_adj = sp.sparse.coo_matrix(
            (weight_values, truth_edges), shape=dims
        ).tocsr()
        del weight_values
        del truth_edges
        # Pick the weight at every stored position of the signed matrix.
        picked = weight_adj[signed.astype("bool")]
        del weight_adj
        new_weights = torch.from_numpy(np.array(picked)[0])

    signed = signed.tocoo()
    stacked = np.vstack([signed.row, signed.col])
    new_pred_graph = torch.from_numpy(stacked).long()  # .to(device)
    y = torch.from_numpy(signed.data > 0)  # .to(device)
    del signed

    if using_weights:
        return new_pred_graph, y, new_weights
    return new_pred_graph, y

In [None]:
event = generate_toys(100, 10, 1, 5, 2, 1000., 0.9, 0.5)

In [None]:
del_y = (event.x[event.signal_true_edges[0], 1] - event.x[event.signal_true_edges[1], 1])
x = (event.x[event.signal_true_edges[0], 0] + event.x[event.signal_true_edges[1], 0])/2
print((del_y).abs().mean())
print((del_y).square().mean().sqrt())
print(del_y.abs().max())

In [None]:
import networkx as nx
from matplotlib import cm
import matplotlib.pyplot as plt

# Build a networkx graph with one node per hit and one edge per column of
# event.graph; true edges (event.y) are drawn red, the rest gray.
G = nx.Graph()
G.add_nodes_from(range(len(event.x)))

# Convert the edge tensor to a Python list once; doing it inside the loop
# re-materialises the full list for every single edge.
edge_list = event.graph.T.tolist()
for i, edge in enumerate(edge_list):
    color = "red" if event.y[i] else "gray"
    G.add_edge(*edge, color = color)

# Node positions are the hit coordinates themselves.
pos = {i: event.x[i].numpy() for i in range(len(event.x))}

# Colour nodes by particle id, normalised to [0, 1] for the colormap.
node_color = cm.jet(event.pid/event.pid.max())

plt.figure(figsize=(16,8))
nx.draw(G, pos=pos, node_size = 100, node_color = node_color, edge_color = nx.get_edge_attributes(G,'color').values())
plt.draw()

In [None]:
batch = event

unnormalized_assignment = torch.rand((len(batch.pid), 10))
assignments = torch.nn.Softmax(dim = -1)(unnormalized_assignment)

In [None]:
from torch_scatter import scatter_mean, scatter_add, scatter_min
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from torch import nn

# Re-index particle ids to 0..n_particles-1 and take the minimum pt per particle.
_, pid = torch.unique(batch.pid.long(), return_inverse = True)
pt = scatter_min(batch.pt, pid, dim=0, dim_size = pid.max()+1)[0]
matching = torch.zeros(pid.max()+1).long()

# Sum the per-hit soft assignments per particle. A new name is used so the
# per-hit `assignments` from the previous cell survives and this cell can be
# re-run without a shape mismatch.
particle_assignments = scatter_add(assignments, pid, dim = 0, dim_size = pid.max()+1)
probability = particle_assignments/particle_assignments.sum(1).unsqueeze(1)
signal_sample = (pt > 1000.)
# Position within the signal-only sub-matrix -> particle index.
inverse_mask = torch.arange(len(signal_sample))[signal_sample]

# One-to-one matching between signal particles and assignment columns that
# maximises the summed probability.
bipartite_graph = csr_matrix(probability[signal_sample].cpu().numpy())
row_match, col_match = min_weight_full_bipartite_matching(bipartite_graph, maximize=True)
# row_match indexes rows of probability[signal_sample], so it has to be mapped
# back to particle indices before writing into the per-particle `matching`.
matched_particles = inverse_mask[torch.as_tensor(row_match).long()]
matching[matched_particles] = torch.tensor(col_match).long()

if (~signal_sample).any():
    # Non-signal particles take their best column among those not already
    # claimed by a signal particle.
    particle_assignments[:,col_match] = -1
    matching[~signal_sample] = particle_assignments[~signal_sample].argmax(1)

labels = matching[pid]
loss = nn.functional.cross_entropy(unnormalized_assignment, labels, reduction = "none")
print(batch.pid, signal_sample, pt, probability, labels, row_match, col_match, sep = "\n")
print(torch.stack([batch.pid, labels]).T)

# Random per-hit weights, normalised to sum to 1, for the weighted loss.
weights = torch.rand(len(batch.pid))
weights = weights/weights.sum()

loss = torch.dot(loss, weights)

## HDBSCAN

In [None]:
import torch
import cudf
from cuml.neighbors import NearestNeighbors
from cuml.datasets import make_blobs
import cupy as cp
from cuml.cluster import HDBSCAN
import matplotlib.pyplot as plt
model = HDBSCAN(min_cluster_size=5, cluster_selection_epsilon=0.0, metric='euclidean', gen_min_span_tree=True, cluster_selection_method = "leaf")


In [None]:
%%time
import time
for i in range(1000):
    s = time.time()
    X, _ = make_blobs(n_samples=100000, centers=10000,
                        n_features=15)
    X = torch.as_tensor(X).cuda()

    clusters = model.fit_predict(X)
    print(torch.as_tensor(clusters).max())
    print(time.time()-s)

In [None]:
plt.figure(figsize=(8, 6), dpi= 500, facecolor='w', edgecolor='k')
model.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6), dpi= 500, facecolor='w', edgecolor='k')
model.minimum_spanning_tree_.plot(edge_cmap='viridis',
                                      edge_alpha=0.6,
                                      node_size=5,
                                      edge_linewidth=0.5)

In [None]:
import seaborn as sns
plt.figure(figsize=(8, 6), dpi= 500, facecolor='w', edgecolor='k')
model.condensed_tree_.plot()

## GNN Embedding

In [None]:
import sys
import torch
import cudf
from cuml.neighbors import NearestNeighbors
from cuml.datasets import make_blobs
import cupy as cp
from cuml.cluster import HDBSCAN, KMeans
import cuml
import matplotlib.pyplot as plt
sys.path.append('../..')
from LightningModules.GNNNodeEmbedding.Models.gnn_embedding import InteractionGNN
from LightningModules.GNNEmbeddings.utils import generate_toys
model = InteractionGNN.load_from_checkpoint("/global/cfs/cdirs/m3443/usr/ryanliu/ITk_gnn_embedding/ITk_barrel_gnn_embedding/nay5rxu1/checkpoints/last.ckpt").to("cuda")

In [None]:
%%time
import sys
import cuml
sys.path.append('../..')
from LightningModules.GNNEmbeddings.utils import generate_toys
event = generate_toys(100, 10, 10, 1, 5, 1, 1000., 0.9, 0.5).cuda()

In [None]:
with torch.no_grad():
    embeddings,  old_embeddings = model(event.x, event.graph)

In [None]:
HDBSCANmodel = HDBSCAN(min_cluster_size = 3, min_samples = 3, max_cluster_size=15, metric='euclidean', cluster_selection_method = "leaf", verbose = cuml.common.logger.level_critical)
clusters = cp.asnumpy(HDBSCANmodel.fit_predict(old_embeddings))

In [None]:
import networkx as nx
from matplotlib import cm
import matplotlib.pyplot as plt

event = event.cpu().detach()
G = nx.Graph()
G.add_nodes_from(range(len(event.x)))

# Convert the edge tensor to a Python list once instead of once per edge.
edge_list = event.graph.T.tolist()
for i, edge in enumerate(edge_list):
    # True edges are semi-transparent red, all others semi-transparent black.
    color = [1, 0, 0, 0.5] if event.y[i] else [0, 0, 0, 0.5]
    G.add_edge(*edge, color = color)

# Node positions are the hit coordinates themselves.
pos = {i: event.x[i].numpy() for i in range(len(event.x))}

# Colour nodes by particle id; column 3 of the RGBA array is the alpha channel.
node_color = cm.jet(event.pid/event.pid.max())
node_color[:, 3] = 0.5

plt.figure(figsize=(16,12), dpi = 300)
nx.draw(G, pos=pos, node_size = 50, node_color = node_color, edge_color = nx.get_edge_attributes(G,'color').values())
plt.grid()
# Save before showing: after plt.show() the current figure is released, so a
# later savefig would write an empty image.
plt.savefig('misc/toy_example.png')
plt.show()

In [None]:
import networkx as nx
from matplotlib import cm
import matplotlib.pyplot as plt

event = event.cpu().detach()
G = nx.Graph()
G.add_nodes_from(range(len(event.x)))

# Convert the edge tensor to a Python list once instead of once per edge.
edge_list = event.graph.T.tolist()
for i, edge in enumerate(edge_list):
    color = [1, 0, 0, 0.5] if event.y[i] else [0, 0, 0, 0.3]
    G.add_edge(*edge, color = color)

# Node positions are the hit coordinates themselves.
pos = {i: event.x[i].numpy() for i in range(len(event.x))}

# Colour nodes by cluster label (shifted by one so label -1 maps to 0), make
# them nearly transparent, then paint label -1 nodes opaque black.
node_color = cm.gist_rainbow((clusters + 1)/(clusters + 1).max())
node_color[:, -1]=0.1
node_color[clusters == -1] = [0,0,0,1]

plt.figure(figsize=(16,8))
nx.draw(G, pos=pos, node_size = 50, node_color = node_color, edge_color = nx.get_edge_attributes(G,'color').values())
plt.draw()

In [None]:
event.y.sum()/len(event.y)

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from torch import nn
from torch_scatter import scatter_mean, scatter_add, scatter_min

# One-hot encode the cluster labels, shifted by one so label -1 lands in column 0.
# NOTE(review): column 0 is kept here, so label -1 hits take part in the matching
# like any other cluster; other metric cells in this notebook drop it with labels[:, 1:].
labels = torch.tensor(clusters+1).long()
labels = nn.functional.one_hot(labels, num_classes=labels.max()+1).float()
_, pid = event.pid.unique(return_inverse = True)
# pid_cluster_counts[p, c] = number of hits of particle p carrying cluster label c.
pid_cluster_counts = scatter_add(labels, pid, dim = 0, dim_size = pid.max()+1)
# Small random jitter on the one-hot rows before matching
# (presumably to break ties and keep every matrix entry non-zero — TODO confirm).
original_assignments = labels + 0.1*torch.rand(labels.shape)
_, pid = event.pid.unique(return_inverse = True)
bipartite_matrix = csr_matrix(scatter_add(original_assignments, pid, dim = 0, dim_size = pid.max()+1).cpu().numpy())
# One-to-one particle <-> cluster matching that maximises the summed (jittered) hit counts.
row_match, col_match = min_weight_full_bipartite_matching(bipartite_matrix, maximize=True)
# Minimum pt over the hits of each particle.
pt = scatter_min(event.pt, pid, dim=0, dim_size = pid.max()+1)[0]

# A match counts when the particle owns more than half of the hits of its matched cluster.
majority_mask = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0) > 0.5)
pt_mask = (pt[row_match] > 1000.)

# eff: majority-matched particles above the pt cut / all particles above the pt cut.
# pur: mean in-cluster hit fraction over the majority-matched pairs.
eff = (majority_mask & pt_mask).sum()/(pt > 1000.).sum()
pur = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0))[majority_mask].mean()

In [None]:
print(eff, pur)

In [None]:
plt.figure(figsize=(8, 6), dpi= 200, facecolor='w', edgecolor='k')
HDBSCANmodel.minimum_spanning_tree_.plot(edge_cmap='viridis',
                                      edge_alpha=0.6,
                                      node_size=5,
                                      edge_linewidth=0.5)

In [None]:
plt.figure(figsize=(8, 6), dpi= 200, facecolor='w', edgecolor='k')
HDBSCANmodel.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

In [None]:
import seaborn as sns
plt.figure(figsize=(8, 6), dpi= 500, facecolor='w', edgecolor='k')
plt.yscale('log')
HDBSCANmodel.condensed_tree_.plot()

In [None]:
from cuml.cluster import KMeans

In [None]:
%%time
kmean = KMeans(handle=None, n_clusters=1000, max_iter=300, tol=0.0001, verbose=False, random_state=1, init='scalable-k-means++', n_init=1, oversampling_factor=2.0, max_samples_per_batch=32768, output_type=None)

## Torch Sparse

In [None]:
import torch

In [None]:
a = torch.arange(10).cuda()
b = torch.randperm(10).cuda()
inds = torch.stack([a, b], dim = 0)
values = torch.rand(10).cuda()
sparse_tensor = torch.sparse_coo_tensor(inds, values, (10, 10)).coalesce().requires_grad_(True)
sparse_tensor2 = torch.sparse_coo_tensor(inds, values, (10, 10)).coalesce().requires_grad_(True)

In [None]:
# sparse_tensor.requires_grad_(True)

In [None]:
a = torch.rand((10,10)).requires_grad_(True)
b = torch.randint(10, (10, 5))
torch.einsum("ij, ikj -> ik", a, a[b]).sum().backward()

In [None]:
# NOTE(review): `tot` is not assigned in any visible cell of this notebook, so
# this cell raises NameError on a fresh kernel — TODO restore the cell that built it.
tot.backward()

In [None]:
# NOTE(review): `dense` is not assigned in any visible cell of this notebook, so
# the second print raises NameError on a fresh kernel — TODO restore its definition.
print(sparse_tensor.grad)
print(dense.grad)

In [None]:
import torch

In [None]:
a = torch.rand(10, 10)

In [None]:
a[torch.arange(10), torch.randperm(10)]

In [None]:
import torch
embeddings = torch.load("/global/homes/r/ryanliu/Tracking-ML-Exa.TrkX/Pipelines/Common_Tracking_Example/LightningModules/GNNEmbeddings/test")

In [None]:
print(embeddings)

In [None]:
import torch
import cudf
from cuml.neighbors import NearestNeighbors
from cuml.datasets import make_blobs
import cupy as cp
from cuml.cluster import HDBSCAN
import matplotlib.pyplot as plt
import cudf
model = HDBSCAN(min_cluster_size=3, cluster_selection_epsilon=0.0, metric='euclidean', cluster_selection_method = "eom")

In [None]:
model.fit_predict(embeddings.cuda())

In [None]:
dist = torch.einsum("ijk -> ij", (embeddings.unsqueeze(0).expand(len(embeddings), len(embeddings), 15) - embeddings.unsqueeze(1).expand(len(embeddings), len(embeddings), 15)).square())

In [None]:
print(dist.max())
print(dist.argmin())
print((dist == 0).sum())
len(dist)

## HDBSCAN on precomputed edge-score distances

In [None]:
import cuml
import torch
event = torch.load("/global/cfs/cdirs/m3443/data/ITk-upgrade/processed/gnn_processed/0GeV_barrel_v3/test/0008", map_location = "cpu")
event.pid[(event.nhits <= 3)] = 0

In [None]:
# Boolean mask over hits: True for every node that is an endpoint of at least
# one edge with score > 0.15.
mask = torch.tensor(len(event.pid) * [False])
conncted_nodes = torch.unique(event.edge_index[:, event.scores > 0.15])
mask[conncted_nodes] = True
# Map original node index -> compact index 0..mask.sum()-1 for the kept nodes.
inverse_mask = torch.zeros(len(event.pid)).long()
inverse_mask[mask] = torch.arange(mask.sum())
# Keep only edges whose two endpoints are both kept nodes. The scores must be
# filtered first, because the filter reads the still-unfiltered edge_index.
# Note that this overwrites event.scores and event.edge_index in place.
event.scores = event.scores[mask[event.edge_index].all(0)]
event.edge_index = event.edge_index[:, mask[event.edge_index].all(0)]

In [None]:
scores_matrix = 1e4*torch.ones((mask.sum(), mask.sum()))

In [None]:
scores_matrix[inverse_mask[event.edge_index[0]], inverse_mask[event.edge_index[1]]] = 1 - event.scores
scores_matrix[inverse_mask[event.edge_index[1]], inverse_mask[event.edge_index[0]]] = 1 - event.scores

In [None]:
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=3, metric = "precomputed")

In [None]:
import numpy as np
clusterer.fit(scores_matrix.numpy().astype(np.float64))

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from torch import nn
from torch_scatter import scatter_mean, scatter_add, scatter_min

# Per-hit cluster label, shifted by one; hits outside `mask` were not clustered
# and keep label 0, the same column that clusterer label -1 maps to.
labels = torch.zeros(len(event.pid)).long()
labels[mask] = torch.tensor(clusterer.labels_+1)
labels = nn.functional.one_hot(labels, num_classes=labels.max()+1).float()
# Drop column 0 so only real clusters remain as columns.
labels = labels[:, 1:]
_, pid = event.pid.unique(return_inverse = True)
# pid_cluster_counts[p, c] = number of hits of particle p in cluster c.
pid_cluster_counts = scatter_add(labels, pid, dim = 0, dim_size = pid.max()+1)
# Small random jitter on the one-hot rows before matching
# (presumably to break ties and keep every matrix entry non-zero — TODO confirm).
original_assignments = labels + 0.001*torch.rand(labels.shape)
_, pid = event.pid.unique(return_inverse = True)
bipartite_matrix = csr_matrix(scatter_add(original_assignments, pid, dim = 0, dim_size = pid.max()+1).cpu().numpy())
# One-to-one particle <-> cluster matching that maximises the summed (jittered) hit counts.
row_match, col_match = min_weight_full_bipartite_matching(bipartite_matrix, maximize=True)
# Minimum pt over the hits of each particle.
pt = scatter_min(event.pt, pid, dim=0, dim_size = pid.max()+1)[0]

# A match counts when the particle owns more than half of the hits of its matched cluster.
majority_mask = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0) > 0.5)
pt_mask = (pt[row_match] > 1000.)

# track_eff: majority-matched particles above the pt cut / all particles above the pt cut.
# track_pur: mean in-cluster hit fraction over the majority-matched pairs.
track_eff = (majority_mask & pt_mask).sum()/(pt > 1000).sum()
track_pur = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0))[majority_mask].mean()

# fake_rate: 1 - good matches / (number of clusters minus the clusters that are
# majority-matched to a particle with 0 < pt <= 1000).
# particle_eff: hits inside good matched clusters / hits with pt > 1000 and pid != 0.
fake_rate = 1 - (majority_mask & pt_mask).sum()/(labels.shape[1] - (majority_mask & (pt[row_match] > 0) & (~pt_mask)).sum())
particle_eff = pid_cluster_counts[row_match, col_match][majority_mask & pt_mask].sum()/((event.pt > 1000)&(event.pid != 0)).sum()

In [None]:
print(track_eff, track_pur, fake_rate, particle_eff)

## HGNN

In [None]:
import sys
import torch
import cudf
import cupy as cp
from cuml.cluster import HDBSCAN
import cuml
import matplotlib.pyplot as plt
sys.path.append('../..')
from LightningModules.GNNEmbeddings.Models.models import SparseHierarchicalGNN
from LightningModules.GNNEmbeddings.utils import generate_toys
model = SparseHierarchicalGNN.load_from_checkpoint("/global/cfs/cdirs/m3443/usr/ryanliu/ITk_barrel_embedding/ITk_barrel_embedding/yg20n9v3/checkpoints/last.ckpt").to("cuda")

In [None]:
event = generate_toys(400, 50, 10, 1, 5, 2, 1000., 0.9, 0.5).cuda()

In [None]:
with torch.no_grad():
    embeddings, old_embeddings = model(event.x, event.graph)

In [None]:
HDBSCANmodel = HDBSCAN(min_cluster_size = 3, min_samples = 3, metric='euclidean', cluster_selection_method = "eom", verbose = cuml.common.logger.level_critical)
clusters = cp.asnumpy(HDBSCANmodel.fit_predict(old_embeddings))

In [None]:
import networkx as nx
from matplotlib import cm
from sklearn.manifold import TSNE
from torch_scatter import scatter_add, scatter_mean, scatter_max, scatter_min
import numpy as np
import matplotlib.pyplot as plt

event = event.cpu().detach()
G = nx.Graph()
G.add_nodes_from(range(len(event.x)))

# Convert the edge tensor to a Python list once instead of once per edge.
edge_list = event.graph.T.tolist()
for i, edge in enumerate(edge_list):
    color = [1, 0, 0, 0.3] if event.y[i] else [0, 0, 0, 0.3]
    G.add_edge(*edge, color = color)

# Node positions are the hit coordinates themselves.
pos = {i: event.x[i].numpy() for i in range(len(event.x))}

# clusters = torch.tensor(clusters)
# centroids = scatter_mean(old_embeddings[clusters>=0], clusters[clusters>=0].cuda().long(), dim=0, dim_size=clusters.max()+1)
# centroids = centroids/torch.sqrt(centroids.square().sum(-1)).unsqueeze(1)
# centroids = torch.tensor(TSNE(n_components = 1, init = "pca").fit_transform(centroids.cpu().numpy()))
# centroids = (centroids-centroids.min())/(centroids.max()-centroids.min())
# clusters = clusters.numpy()
    
# # node_color = cm.gist_rainbow((clusters + 1)/(clusters + 1).max())
# # node_color[:, -1]=0.1
# node_color = np.ones((len(clusters), 4))
# node_color[clusters >= 0] = cm.gist_rainbow(centroids[clusters[clusters >= 0]].squeeze())
# node_color[clusters == -1] = [0,0,0,1]

# Project the embeddings to one t-SNE dimension, rescale to [0, 1] and use the
# value as the node colour. A separate name keeps the model output `embeddings`
# from being overwritten by the projection.
# NOTE(review): fit_transform with n_components=1 returns an (N, 1) array, so the
# colormap is applied to a 2-D input; the commented variant above squeezes first —
# confirm nx.draw accepts the resulting colour array.
embedding_1d = torch.tensor(TSNE(n_components = 1, init = "pca").fit_transform(old_embeddings.cpu().numpy()))
embedding_1d = (embedding_1d-embedding_1d.min())/(embedding_1d.max()-embedding_1d.min())
node_color = cm.gist_rainbow(embedding_1d)

plt.figure(figsize=(16,8), dpi = 300)
nx.draw(G, pos=pos, node_size = 50, node_color = node_color, edge_color = nx.get_edge_attributes(G,'color').values())
plt.draw()
plt.savefig('misc/clustering.png')

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from torch import nn
from torch_scatter import scatter_mean, scatter_add, scatter_min
# All hits were clustered here, so the mask selects everything.
mask = torch.tensor([True]*len(event.pid)).bool()
# Per-hit cluster label, shifted by one so label -1 maps to 0.
labels = torch.zeros(len(event.pid)).long()
labels[mask] = torch.tensor(HDBSCANmodel.labels_+1).long()
labels = nn.functional.one_hot(labels, num_classes=labels.max()+1).float()
# Drop column 0 so only real clusters remain as columns.
labels = labels[:, 1:]
_, pid = event.pid.unique(return_inverse = True)
# pid_cluster_counts[p, c] = number of hits of particle p in cluster c.
pid_cluster_counts = scatter_add(labels, pid, dim = 0, dim_size = pid.max()+1)
# Small random jitter on the one-hot rows before matching
# (presumably to break ties and keep every matrix entry non-zero — TODO confirm).
original_assignments = labels + 0.001*torch.rand(labels.shape)
_, pid = event.pid.unique(return_inverse = True)
bipartite_matrix = csr_matrix(scatter_add(original_assignments, pid, dim = 0, dim_size = pid.max()+1).cpu().numpy())
# One-to-one particle <-> cluster matching that maximises the summed (jittered) hit counts.
row_match, col_match = min_weight_full_bipartite_matching(bipartite_matrix, maximize=True)
# Minimum pt over the hits of each particle.
pt = scatter_min(event.pt, pid, dim=0, dim_size = pid.max()+1)[0]

# A match counts when the particle owns more than half of the hits of its matched cluster.
majority_mask = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0) > 0.5)
pt_mask = (pt[row_match] > 1000.)

# track_eff: majority-matched particles above the pt cut / all particles above the pt cut.
# track_pur: mean in-cluster hit fraction over the majority-matched pairs.
track_eff = (majority_mask & pt_mask).sum()/(pt > 1000).sum()
track_pur = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0))[majority_mask].mean()

# fake_rate: 1 - good matches / (number of clusters minus the clusters that are
# majority-matched to a particle with 0 < pt <= 1000).
# particle_eff: hits inside good matched clusters / hits with pt > 1000 and pid != 0.
fake_rate = 1 - (majority_mask & pt_mask).sum()/(labels.shape[1] - (majority_mask & (pt[row_match] > 0) & (~pt_mask)).sum())
particle_eff = pid_cluster_counts[row_match, col_match][majority_mask & pt_mask].sum()/((event.pt > 1000)&(event.pid != 0)).sum()

In [None]:
print(track_eff, track_pur, fake_rate, particle_eff)

## HGNN model

In [None]:
import sys
import torch
import cudf
import cupy as cp
from cuml.cluster import HDBSCAN, DBSCAN
import cuml
import matplotlib.pyplot as plt
sys.path.append('../..')
from LightningModules.GNNEmbeddings.Models.models import SparseHierarchicalGNN
from LightningModules.GNNEmbeddings.utils import generate_toys
model = SparseHierarchicalGNN.load_from_checkpoint("/global/cfs/cdirs/m3443/usr/ryanliu/ITk_barrel_embedding/ITk_barrel_embedding/1fp3lv99/checkpoints/last.ckpt").to("cuda")

In [None]:
model.setup("test")
data_loader = model.val_dataloader()

In [None]:
event = next(iter(data_loader)).cuda()

In [None]:
with torch.no_grad():
    input_data = model.get_input_data(event)
    embeddings = model(input_data, event.graph)

In [None]:
# Imported here as well so this section does not depend on the import cells of
# other sections having been run first.
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from torch import nn
from torch_scatter import scatter_add, scatter_min

# Quantities that do not depend on the clustering parameters are computed once.
event = event.cpu()
_, pid = event.pid.unique(return_inverse = True)
n_particles = pid.max()+1
# Minimum pt over the hits of each particle.
pt = scatter_min(event.pt, pid, dim=0, dim_size = n_particles)[0]

# Grid scan over the HDBSCAN parameters; one line of tracking metrics per setting.
for min_cluster_size in range(3, 10):
    for min_sample in range(3, 10):
        HDBSCANmodel = HDBSCAN(min_cluster_size =min_cluster_size, min_samples = min_sample, metric='euclidean', cluster_selection_method = "eom", verbose = cuml.common.logger.level_critical)
        clusters = cp.asnumpy(HDBSCANmodel.fit_predict(embeddings))

        # One-hot cluster membership per hit; labels are shifted by one so
        # label -1 falls in column 0, which is then dropped.
        labels = torch.tensor(clusters+1).long()
        labels = nn.functional.one_hot(labels, num_classes=labels.max()+1).float()
        labels = labels[:, 1:]

        # pid_cluster_counts[p, c] = number of hits of particle p in cluster c.
        pid_cluster_counts = scatter_add(labels, pid, dim = 0, dim_size = n_particles)

        # Tiny random jitter before matching; the jittered counts are computed
        # once and reused for both the matching and the validity filter.
        original_assignments = labels + 1e-12*torch.rand(labels.shape)
        jittered_counts = scatter_add(original_assignments, pid, dim = 0, dim_size = n_particles).cpu().numpy()
        bipartite_matrix = csr_matrix(jittered_counts)
        row_match, col_match = min_weight_full_bipartite_matching(bipartite_matrix, maximize=True)

        # Discard matched pairs that share no actual hit (only jitter).
        valid_match = jittered_counts[row_match, col_match] >= 1
        row_match = row_match[valid_match]
        col_match = col_match[valid_match]

        # A match counts when the particle owns more than half of the hits of its matched cluster.
        majority_mask = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0) > 0.5)
        pt_mask = (pt[row_match] > 1000.)

        track_eff = (majority_mask & pt_mask).sum()/(pt > 1000).sum()
        track_pur = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0))[majority_mask].mean()

        fake_rate = 1 - (majority_mask & pt_mask).sum()/(labels.shape[1] - (majority_mask & (pt[row_match] > 0) & (~pt_mask)).sum())
        particle_eff = pid_cluster_counts[row_match, col_match][majority_mask & pt_mask].sum()/((event.pt > 1000)&(event.pid != 0)).sum()
        print("min_cluster_size: {}, min_sample: {}, tracking efficiency: {:.2f}, tracking purity: {:.2f}, fake_rate: {:.2f}, particle efficiency: {:.2f}".format(min_cluster_size, min_sample, track_eff.item(), track_pur.item(), fake_rate.item(), particle_eff.item()))

In [None]:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from torch import nn
from torch_scatter import scatter_mean, scatter_add, scatter_min
import numpy as np

# Quantities that do not depend on the clustering parameters are computed once.
event = event.cpu()
_, pid = event.pid.unique(return_inverse = True)
n_particles = pid.max()+1
# Minimum pt over the hits of each particle.
pt = scatter_min(event.pt, pid, dim=0, dim_size = n_particles)[0]

# Grid scan over the DBSCAN parameters; one line of tracking metrics per setting.
for eps in np.linspace(0.1, 0.2, 10):
    for min_sample in range(3, 7):
        DBSCANmodel = DBSCAN(eps=eps, min_samples=min_sample)
        clusters = cp.asnumpy(DBSCANmodel.fit_predict(embeddings))

        # One-hot cluster membership per hit; labels are shifted by one so
        # label -1 falls in column 0, which is then dropped.
        labels = torch.tensor(clusters+1).long()
        labels = nn.functional.one_hot(labels, num_classes=labels.max()+1).float()
        labels = labels[:, 1:]

        # pid_cluster_counts[p, c] = number of hits of particle p in cluster c.
        pid_cluster_counts = scatter_add(labels, pid, dim = 0, dim_size = n_particles)

        # Tiny random jitter before matching; the jittered counts are computed
        # once and reused for both the matching and the validity filter.
        original_assignments = labels + 1e-12*torch.rand(labels.shape)
        jittered_counts = scatter_add(original_assignments, pid, dim = 0, dim_size = n_particles).cpu().numpy()
        bipartite_matrix = csr_matrix(jittered_counts)
        row_match, col_match = min_weight_full_bipartite_matching(bipartite_matrix, maximize=True)

        # Discard matched pairs that share no actual hit (only jitter).
        valid_match = jittered_counts[row_match, col_match] >= 1
        row_match = row_match[valid_match]
        col_match = col_match[valid_match]

        # A match counts when the particle owns more than half of the hits of its matched cluster.
        majority_mask = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0) > 0.5)
        pt_mask = (pt[row_match] > 1000.)

        track_eff = (majority_mask & pt_mask).sum()/(pt > 1000).sum()
        track_pur = (pid_cluster_counts[row_match, col_match]/pid_cluster_counts[:, col_match].sum(0))[majority_mask].mean()

        fake_rate = 1 - (majority_mask & pt_mask).sum()/(labels.shape[1] - (majority_mask & (pt[row_match] > 0) & (~pt_mask)).sum())
        particle_eff = pid_cluster_counts[row_match, col_match][majority_mask & pt_mask].sum()/((event.pt > 1000)&(event.pid != 0)).sum()
        print("eps: {:.2f}, min_sample: {}, tracking efficiency: {:.2f}, tracking purity: {:.2f}, fake_rate: {:.2f}, particle efficiency: {:.2f}".format(eps, min_sample, track_eff.item(), track_pur.item(), fake_rate.item(), particle_eff.item()))

In [None]:
print(len(row_match), pt_mask.sum().item(), majority_mask.sum().item(), labels.shape[1] - (majority_mask & (pt[row_match] > 0) & (~pt_mask)).sum())

In [7]:
import cuml
import torch
event = torch.load("/global/cfs/cdirs/m3443/data/ITk-upgrade/processed/gnn_processed/0GeV_barrel_v3/test/0008")

In [10]:
event.primary.unique()

tensor([0., 1., nan,  ..., nan, nan, nan], device='cuda:0',
       dtype=torch.float64)