In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import logging

import requests
from tqdm import tqdm_notebook
tqdm = tqdm_notebook

import numpy as np
import pandas as pd

from dvidutils import LabelMapper
from libdvid import DVIDNodeService

from neuclease.dvid import *

In [3]:
handler = logging.StreamHandler(sys.stdout)
root_logger = logging.getLogger()
root_logger.handlers = []
root_logger.addHandler(handler)
root_logger.setLevel(logging.INFO)
logging.getLogger('kafka').setLevel(logging.WARNING)

In [4]:
pwd

'/nrs/flyem/bergs/complete-ffn-agglo'

In [7]:
from neuclease.util import read_csv_header

CSV_DTYPES = { 'id_a': np.uint64, 'id_b': np.uint64, # Use'id_a', and 'id_b' for consistency with our other code.
               'xa': np.int32, 'ya': np.int32, 'za': np.int32,
               'xb': np.int32, 'yb': np.int32, 'zb': np.int32,
               'caa': np.float32, 'cab': np.float32, 'cba': np.float32, 'cbb': np.float32,
               'iou': np.float32,
               'da': np.float32, 'db': np.float32 }

def load_focused_table(path):
    """
    Load an edge table from the given path.
    Must have at least the required columns for an edge table (id_a, id_b, and coordinates),
    but may include extra.  All columns are loaded and included in the result.
    """
    REQUIRED_COLUMNS = ['id_a', 'id_b', 'xa', 'ya', 'za', 'xb', 'yb', 'zb']
    ext = os.path.splitext(path)[1]
    assert ext in ('.csv', '.npy')
    if ext == '.csv':
        header = read_csv_header(path)
        if header is None:
            raise RuntimeError(f"CSV has no header: {path}")
        df = pd.read_csv(path, header=0, dtype=CSV_DTYPES)

    if ext == '.npy':
        df = pd.DataFrame(np.load(path))

    if not set(REQUIRED_COLUMNS).issubset(df.columns):
        raise RuntimeError(f"file ({path}) does not contain the required columns: {REQUIRED_COLUMNS}")

    return df

In [8]:
table = load_focused_table('hemibrain_662edc_equivs_from_v2_32nm_5563382-FILTERED-UNREVIEWED.csv')

In [12]:
table.duplicated(['body_a', 'body_b']).sum()

0

In [24]:
_tmp = table[['body_a', 'body_b']]
_tmp = _tmp.sort_values(0, axis=1)
_tmp.duplicated().sum()

0

In [36]:
master_node = ('emdata3:8900', '7254')
master_mapping = fetch_complete_mappings(*master_node, 'segmentation')

Reading kafka messages from kafka.int.janelia.org:9092 for emdata3:8900 / 7254 / segmentation
Reading 517976 kafka messages took 11.947327375411987 seconds
Fetching http://emdata3:8900/api/node/7254/segmentation/mappings...
Fetching http://emdata3:8900/api/node/7254/segmentation/mappings took 0:00:33.518943
Parsing mapping...
Parsing mapping took 0:00:06.575215
Constructing missing identity-mappings...
Constructing missing identity-mappings took 0:00:19.660588


In [26]:
master_mapper = LabelMapper(master_mapping.index.values, master_mapping.values)

In [37]:
table['body_a'] = master_mapper.apply(table['id_a'].values, True)
table['body_b'] = master_mapper.apply(table['id_b'].values, True)

In [38]:
table[['body_a', 'body_b']].sort_values(0, axis=1).duplicated().sum()

1797

In [39]:
len(table)

121433

In [40]:
1797 / 121433

0.01479828382729571

In [41]:
table.query('body_a == body_b').shape

(5058, 10)

In [43]:
table.shape

(121433, 10)

In [44]:
5058 / 121433

0.041652598552288096

In [None]:
sorted_sv_counts = master_mapping.value_counts().sort_values(ascending=False)

In [46]:
sorted_sv_counts.loc[106979579]

2

In [60]:
%time tar_bytes = fetch_tarfile('emdata3:8900', '7254', 'segmentation_sv_meshes', 106979579)

CPU times: user 3.23 ms, sys: 1.73 ms, total: 4.96 ms
Wall time: 50.3 s


In [59]:
fetch_exists('emdata3:8900', '7254', 'segmentation_sv_meshes', [106979579, 2386468249])

sv
106979579     False
2386468249    False
Name: exists, dtype: bool

In [62]:
print(tar_bytes.decode())

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [74]:
%time master_mapping[master_mapping == 106979579]

CPU times: user 46.3 ms, sys: 0 ns, total: 46.3 ms
Wall time: 46.2 ms


sv
106979579     106979579
2386468249    106979579
Name: body, dtype: uint64

In [69]:
%time fetch_sizes('emdata3:8900', '7254', 'segmentation', [106979579, 2386468249])

CPU times: user 4 ms, sys: 274 µs, total: 4.28 ms
Wall time: 47.1 s


[2683339214778, 0]

In [65]:
master_mapping.iloc[:10]

sv
106979579    106979579
108002724    108002724
108343758    108002724
108343759    108002724
108343809    108002724
108348135    108348135
108684808    108684808
108684825    108684808
108684845    108684808
108684852    108684852
Name: body, dtype: uint64

In [70]:
2683339214778 / 1e12

2.683339214778

In [71]:
megablocks = fetch_sparsevol_coarse('emdata3:8900', '7254', 'segmentation', 106979579, supervoxels=True)

In [72]:
len(megablocks)

14597201

In [78]:
len(master_mapping) * 2 * 8 / 1e9 * 30

19.72450704

In [81]:
40000**2 * 8 / 1e9

12.8

In [83]:
%time fetch_mapping('emdata3:8900', '7254', 'segmentation', [106979579])

CPU times: user 4.45 ms, sys: 217 µs, total: 4.67 ms
Wall time: 47.3 s


[106979579]

In [85]:
index = pd.MultiIndex.from_arrays([master_mapping.index.values, master_mapping.values], names=['sv', 'body'])