In [1]:
# Index of the CUDA device selected with torch.cuda.set_device in the import cell.
CUDA_INDEX = 0
# Dataset name; interpolated into the ../data/{NAME}/... and ../runlogs/{NAME}/... paths used below.
NAME = 'Amazon'
# Passed as the second constructor argument to models.NormSEDModel when the model is rebuilt.
CLASSES = 2

In [2]:
import sys
# Prepend two relative directories to the module search path before the project imports run.
# '../..' is where the `neuro` package is picked up from (see the reload output of the next cell);
# '../../pyged/lib' is presumably where the `pyged` module lives - TODO confirm.
# NOTE: both paths are relative, so the notebook must be started from its own directory.
sys.path.insert(0, '../..')
sys.path.insert(0, '../../pyged/lib')

In [3]:
import os
import pickle
import random
import time

import IPython as ipy
import matplotlib.pyplot as plt
import numpy as np
import torch
# Make CUDA_INDEX (first cell) the current CUDA device before any GPU work is issued.
torch.cuda.set_device(CUDA_INDEX)
# Turn on the cuDNN benchmark flag of the PyTorch backend.
torch.backends.cudnn.benchmark = True
import torch.optim
import torch_geometric as tg
import torch_geometric.data
from tqdm.auto import tqdm

from neuro import config, datasets, index, metrics, models, train, utils, viz
import pyged

from importlib import reload
# Re-execute the project modules so that edits made on disk are picked up without
# restarting the kernel. The value of the last call is what the cell displays.
reload(config)
reload(datasets)
reload(index)
reload(metrics)
reload(models)
reload(pyged)
reload(train)
reload(utils)
reload(viz)

<module 'neuro.viz' from '../../neuro/viz.py'>

In [4]:
# Disabled step: would build `graphs` from the torch_geometric CitationFull dataset.
# Nothing that is currently enabled in this notebook reads `graphs`; it is only needed by the
# (also disabled) decomposition code in the next cell.
# graphs = utils.remove_extra_attrs(utils.label_graphs(tg.datasets.CitationFull(root=f'../data/{NAME}/tg', name=f'{NAME}')))

In [5]:
# Disabled step: recomputes the 2-hop neighbourhood decomposition of `graphs`, times it and
# writes the result to ../data/{NAME}/nbrs.pt. It requires `graphs` from the previous cell.
# tic = time.time()
# nbrs = datasets.decompose(graphs, n_hops=2)
# toc = time.time()
# torch.save(nbrs, f'../data/{NAME}/nbrs.pt')
# tqdm.write(f'neighborhood decomposition time: {toc-tic:.3} s')
# Load the previously saved decomposition instead of recomputing it.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
nbrs = torch.load(f'../data/{NAME}/nbrs.pt')

In [6]:
# Build the query set from the target neighbourhoods loaded above.
# NOTE(review): the sampling looks stochastic (`trav_prob`) and no seed is set in this
# notebook, so a re-run may produce different queries - confirm and seed if needed.
queries = datasets.make_queries(
    nbrs,
    n_queries=10,
    n_hops=4,
    trav_prob=0.3,
    node_lim=25,
)

sample queries from targets


sampled queries:   0%|          | 0/10 [00:00<?, ?it/s]

## Embedding

In [7]:
# Recreate the network, restore the saved weights (deserialised onto the CPU first),
# then switch to evaluation mode and move the model to the configured device.
model = models.NormSEDModel(8, CLASSES, 64, 64)
model.load_state_dict(
    torch.load(
        f'../runlogs/{NAME}/1628189994.2692797/best_model.pt',
        map_location='cpu',
    )
)
model = model.eval().to(config.device)

In [8]:
# Embed every target neighbourhood in fixed-size batches and report the wall-clock cost.
# CUDA kernels are launched asynchronously, so the device is synchronised around the timed
# region; otherwise the host timer can stop before the GPU has actually finished.
if torch.cuda.is_available():
    torch.cuda.synchronize()  # do not bill earlier pending GPU work to this measurement
tic = time.time()
target_embs_list = []
batch_size = 4096
with torch.no_grad():  # inference only, no autograd bookkeeping
    for i in tqdm(range(0, len(nbrs), batch_size), 'target batches'):
        target_embs_list.append(
            model.embed_model(
                tg.data.Batch.from_data_list(nbrs[i:i+batch_size]).to(config.device)
            )
        )
target_embs = torch.cat(target_embs_list)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for all queued kernels before reading the clock
toc = time.time()
tqdm.write(f'total target embedding time: {toc-tic:.3} s')
tqdm.write(f'per target embedding time: {(toc-tic)/len(nbrs):.3} s')

target batches:   0%|          | 0/82 [00:00<?, ?it/s]

total target embedding time: 17.9 s
per target embedding time: 5.34e-05 s


In [9]:
# Embed all queries in a single batch and report the wall-clock cost.
# The device is synchronised around the timed region because CUDA work is asynchronous:
# without it the timer mostly measures kernel-launch latency rather than the computation.
if torch.cuda.is_available():
    torch.cuda.synchronize()  # do not bill earlier pending GPU work to this measurement
tic = time.time()
with torch.no_grad():  # inference only, no autograd bookkeeping
    query_embs = model.embed_model(tg.data.Batch.from_data_list(queries).to(config.device))
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for the embedding kernels to finish
toc = time.time()
tqdm.write(f'total query embedding time: {toc-tic:.3} s')
tqdm.write(f'per query embedding time: {(toc-tic)/len(queries):.3} s')

total query embedding time: 0.00386 s
per query embedding time: 0.000386 s


In [10]:
# Short aliases for the target / query embeddings used by every index section below.
temb, qemb = target_embs, query_embs

## Unvectorised Linear Scan on CPU (threads=1)

In [11]:
# Build the pure-Python linear-scan index on the CPU and time its construction.
tic = time.time()
# Both settings are global and stay in effect for later cells until changed again:
# config.device is reassigned on the project's config module, and the torch thread
# count is pinned to one for the single-threaded baseline.
config.device = torch.device('cpu')
torch.set_num_threads(1)
# Rebinds `index_str`, which every query cell below reads; the cell that ran last decides
# which index those queries hit.
index_str = index.LinearScan(temb, dist_fn=utils.norm_sed_func)
toc = time.time()
tqdm.write(f'preprocessing time: {toc-tic:.3} s')

slow (pythonic) linear scan index
config.device: cpu
preprocessing time: 0.0767 s


In [17]:
# k-nearest-neighbour benchmark: query the current index once per query embedding and
# report the mean wall-clock time per query. Results are discarded; only timing matters.
k = 10
tic = time.time()
for q in tqdm(qemb, desc=f'k = {k} | queries'):
    index_str.knn_query(q, k, verbose=False)
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

k = 10 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 10.3


In [12]:
# Range-query benchmark: query the current index with radius r once per query embedding
# and report the mean wall-clock time per query. Results are discarded.
r = 2
tic = time.time()
for q in tqdm(qemb, desc=f'r = {r} | queries'):
    index_str.range_query(q, r, verbose=False)
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

r = 2 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 6.47


## Unvectorised Quasi-Metric Tree on CPU (threads=1)

In [13]:
# Build the quasi-metric tree index on the CPU and time its construction.
tic = time.time()
# Global settings, repeated here so this section does not depend on the linear-scan
# section having been run first.
config.device = torch.device('cpu')
torch.set_num_threads(1)
# Rebinds `index_str`; the query cells that follow now hit the tree instead of the scan.
# NOTE(review): the build log reports a max leaf size of 2448 although max_leaf_size=128
# is requested - presumably some leaves cannot be split further; confirm in index.py.
index_str = index.QuasiMetricTree(temb, dist_fn=utils.norm_sed_func, max_leaf_size=128)
toc = time.time()
tqdm.write(f'preprocessing time: {toc-tic:.3} s')

construct quasi metric tree for asymmetric/symmetric distance function
config.device: cpu


  0%|          | 0/334863 [00:00<?, ?it/s]

distance computations: 4950266
internal nodes: 2088 / 8353
leaf nodes: 6265 / 8353
big leaf nodes: 16 / 6265
max leaf size: 2448
preprocessing time: 2.81 s


In [52]:
# k-nearest-neighbour benchmark on the tree index, using its `slow=True` code path.
# Reports the mean wall-clock time per query; results are discarded.
k = 10
tic = time.time()
for q in tqdm(qemb, desc=f'k = {k} | queries'):
    index_str.knn_query(q, k, slow=True, verbose=False)
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

k = 10 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 3.96


In [14]:
# Range-query benchmark on the tree index, using its `slow=True` code path.
# Reports the mean wall-clock time per query; results are discarded.
r = 2
tic = time.time()
for q in tqdm(qemb, desc=f'r = {r} | queries'):
    index_str.range_query(q, r, slow=True, verbose=False)
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

r = 2 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 4.67


## Vectorised Linear Scan on GPU

In [11]:
# Build the vectorised linear-scan index on the GPU and time its construction.
# CUDA work is asynchronous, so the device is synchronised around the timed region to make
# the reported time cover the actual GPU work rather than just its submission.
if torch.cuda.is_available():
    torch.cuda.synchronize()  # do not bill earlier pending GPU work to this measurement
tic = time.time()
config.device = torch.device('cuda')  # global switch read by the project code
index_str = index.FastLinearScan(temb, dist_fn=utils.norm_sed_func)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for any transfers/kernels issued by the constructor
toc = time.time()
tqdm.write(f'preprocessing time: {toc-tic:.3} s')

fast (vectorised) linear scan index
config.device: cuda
preprocessing time: 0.000402 s


In [17]:
# k-nearest-neighbour benchmark on the GPU index: mean wall-clock time per query.
# The device is synchronised before starting and before stopping the timer, because CUDA
# kernels run asynchronously and would otherwise be only partially measured.
k = 10
if torch.cuda.is_available():
    torch.cuda.synchronize()  # start from an idle device
tic = time.time()
for q in tqdm(qemb, desc=f'k = {k} | queries'):
    index_str.knn_query(q, k, verbose=False)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # make sure every query has really completed
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

k = 10 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 10.3


In [16]:
# Range-query benchmark on the GPU index: mean wall-clock time per query.
# The device is synchronised before starting and before stopping the timer, because CUDA
# kernels run asynchronously and would otherwise be only partially measured.
r = 2
if torch.cuda.is_available():
    torch.cuda.synchronize()  # start from an idle device
tic = time.time()
for q in tqdm(qemb, desc=f'r = {r} | queries'):
    index_str.range_query(q, r, verbose=False)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # make sure every query has really completed
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

r = 2 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 0.00557


## Alignment

In [12]:
# Retrieve the k nearest targets for every query with the current index and keep them in
# `topk` (one entry per query, in query order) for the alignment cells below.
# The device is synchronised around the timed region so that asynchronous CUDA work is
# fully included in the reported per-query time.
if torch.cuda.is_available():
    torch.cuda.synchronize()  # start from an idle device
tic = time.time()
k = 10
topk = []
for q in tqdm(qemb, desc=f'k = {k} | queries'):
    topk.append(index_str.knn_query(q, k, verbose=False))
if torch.cuda.is_available():
    torch.cuda.synchronize()  # make sure every query has really completed
toc = time.time()
tqdm.write(f'time in s: {(toc - tic) / qemb.shape[0]:.3}')

k = 10 | queries:   0%|          | 0/10 [00:00<?, ?it/s]

time in s: 0.00197


In [13]:
# Align every query with each of its retrieved targets using pyged.sed_align (method 'f2',
# empty option string) and print the returned (lower, upper) bounds, one output row per query.
# The total wall-clock time of the whole double loop is printed at the end.
# NOTE(review): utils.to_pyged(q) is re-evaluated for every target; it could be hoisted out
# of the inner loop if the conversion and sed_align do not mutate the graph - confirm.
tic = time.time()
for q, ts in zip(tqdm(queries, desc='queries'), topk):
    for ti in tqdm(ts, desc='targets'):
        query_graph = utils.to_pyged(q)
        target_graph = utils.to_pyged(nbrs[ti])
        node_map, (lb, ub) = pyged.sed_align(query_graph, target_graph, 'f2', '')
        print(lb, ub, end=' | ')
    print()  # terminate the row of bounds for this query
toc = time.time()
print(toc - tic)

queries:   0%|          | 0/10 [00:00<?, ?it/s]

targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 1.0 1.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 1.0 1.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 
684.7798104286194


In [14]:
# Same alignment benchmark as the previous cell, but pyged.sed_align receives the option
# string '--threads 64'. Prints the (lower, upper) bounds per query/target pair, one output
# row per query, followed by the total wall-clock time of the double loop.
# NOTE(review): utils.to_pyged(q) is re-evaluated for every target; it could be hoisted out
# of the inner loop if the conversion and sed_align do not mutate the graph - confirm.
tic = time.time()
for q, ts in zip(tqdm(queries, desc='queries'), topk):
    for ti in tqdm(ts, desc='targets'):
        query_graph = utils.to_pyged(q)
        target_graph = utils.to_pyged(nbrs[ti])
        node_map, (lb, ub) = pyged.sed_align(query_graph, target_graph, 'f2', '--threads 64')
        print(lb, ub, end=' | ')
    print()  # terminate the row of bounds for this query
toc = time.time()
print(toc - tic)

queries:   0%|          | 0/10 [00:00<?, ?it/s]

targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 1.0 1.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 1.0 1.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 


targets:   0%|          | 0/10 [00:00<?, ?it/s]

0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 
229.58446645736694
