In [1]:
# Reload imported modules automatically before each cell executes, so that edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import os
import sys
import logging
from itertools import chain

import requests
from tqdm import tqdm, tqdm_notebook, tnrange
#tqdm = tqdm_notebook

import vigra
import numpy as np
import pandas as pd

from dvidutils import LabelMapper
from libdvid import DVIDNodeService

from neuclease.dvid import *
from neuclease.util import Timer
from neuclease.misc import find_best_plane

In [3]:
from DVIDSparkServices.spark_launch_scripts.janelia_lsf.lsf_utils import get_hostgraph_url

In [4]:
# Route all log records to stdout (so they show up in the cell output) through a
# single handler, discarding whatever handlers were attached to the root logger before.
root_logger = logging.getLogger()
handler = logging.StreamHandler(sys.stdout)
root_logger.handlers = [handler]
root_logger.setLevel(logging.INFO)

# The kafka client is only allowed to report warnings and above.
logging.getLogger('kafka').setLevel(logging.WARNING)

In [5]:
# Work from the project directory: the relative paths used in later cells
# (e.g. 'split-coords-fixed/...', 'updated-tables/...') are resolved against it.
cd /nrs/flyem/bergs/complete-ffn-agglo/

/nrs/flyem/bergs/complete-ffn-agglo


In [6]:
!uname -n

h11u11.int.janelia.org


In [7]:
pwd

'/nrs/flyem/bergs/complete-ffn-agglo'

In [8]:
# NOTE(review): `sc` is not created in any cell shown in this notebook -- presumably a
# SparkContext injected by the cluster launcher; confirm before running on a fresh kernel.
sc

### Hostgraph URLs

In [12]:
# Print one hostgraph URL per LSF job: the job ID is read from the named environment variable.
for _title, _job_id_var in (("This notebook:", "LSB_JOBID"),
                            ("Cluster:", "MASTER_BJOB_ID")):
    print(_title)
    print(get_hostgraph_url(os.environ[_job_id_var]))

This notebook:
http://lsf-rtm/cacti/plugins/grid/grid_bjobs.php?action=viewjob&tab=hostgraph&clusterid=1&indexid=0&jobid=44073279&submit_time=1532443822
Cluster:
http://lsf-rtm/cacti/plugins/grid/grid_bjobs.php?action=viewjob&tab=hostgraph&clusterid=1&indexid=0&jobid=44073287&submit_time=1532443858


### UUIDs

In [13]:
# Every node below is addressed on the same DVID server, under the same instance name.
DVID_SERVER = 'emdata3:8900'
SEGMENTATION_INSTANCE = 'segmentation'

# The agglomeration we started from
initial_agglo = DvidInstanceInfo(DVID_SERVER, 'ac901', SEGMENTATION_INSTANCE)

# The uuid that was used when the neo4j instance was loaded (source of the 'important bodies')
neo4j_reference = DvidInstanceInfo(DVID_SERVER, '52f9', SEGMENTATION_INSTANCE)

# Holds the last supervoxel splits: one node past the neo4j node
analysis_node = DvidInstanceInfo(DVID_SERVER, '662e', SEGMENTATION_INSTANCE)

# Defined for reference only; we won't be using this...
current_master = DvidInstanceInfo(DVID_SERVER, 'f545', SEGMENTATION_INSTANCE)

### Load neo4j-defined important bodies; append final splits

In [15]:
# Body IDs exported from neuprint; this list was generated from node 52f9.
important_bodies_path = '/nrs/flyem/bergs/complete-ffn-agglo/bodies-0.5-from-neuprint-52f9.csv'
neuprint_body_ids = pd.read_csv(
    important_bodies_path,
    header=0,
    usecols=['bodyId'],
    dtype=np.uint64,
)['bodyId']

# Last set of new bodies (from the analysis node, after neo4j was loaded):
# collect the 'Target' and 'NewLabel' of every split message.
msgs = read_kafka_messages(analysis_node, 'split', 'leaf-only')
final_new_bodies = set(chain.from_iterable((msg['Target'], msg['NewLabel']) for msg in msgs))

# Important bodies = neuprint list plus the final set of split bodies.
important_bodies = set(neuprint_body_ids) | final_new_bodies

Reading kafka messages from kafka.int.janelia.org:9092 for emdata3:8900 / 662e / segmentation
Reading 166499 kafka messages took 6.998676776885986 seconds


### Load pre-filtered table

In [None]:
# Columns of the edge table:
#
# id_a, id_b -- the two supervoxel IDs
# xa, ya, za -- point from which segmentation of 'a' was started, 8 nm coordinates
# xb, yb, zb -- point from which segmentation of 'b' was started, 8 nm coordinates
# caa, cab, cba, cbb -- cXY means: fraction of voxels from the original segment Y recovered when seeding from X
# iou -- Jaccard index of the two local segmentations
# da, db -- dX means: fraction of voxels that changed value from >0.8 to <0.5 when segmenting & seeding from X;
#                     the higher this value is, the more "internally inconsistent" the segmentation resolution
#                     potentially is; higher thresholds for iou, cXY might be warranted
#
# NOTE: 'combined-filtered-table.npy' is written by the np.save() cell at the end of this
# notebook, so this shortcut only works once that pipeline has been run at least once.

%time df = pd.DataFrame(np.load('combined-filtered-table.npy'))

In [33]:
# df = pd.DataFrame(np.load('32nm/data-00000-of-00100.npy'))
# print("Selecting retired supervoxels")
# rows_to_fix = df.eval('(id_a in @retired_svs) or (id_b in @retired_svs)')
# print(f"Found {rows_to_fix.sum()} rows")
# df_to_fix = df[rows_to_fix]
# df_to_fix.iloc[37:38]

In [28]:
#repair_coords_on_splits('32nm/data-00000-of-00100.npy')

In [36]:
# Run repair_coords_on_splits on every original table, distributed over the Spark cluster.
# NOTE(review): neither `orig_npy_paths` nor `repair_coords_on_splits` is defined in the cells
# shown in this notebook -- they must come from cells that are no longer present; confirm.
sc.parallelize(orig_npy_paths).foreach(repair_coords_on_splits)
print("Done.")

Done.


In [20]:
# Absolute paths of all coordinate-repaired tables: 32nm first, then 16nm, then 8nm,
# sorted by name within each resolution.
fixed_npy_paths = [
    os.path.abspath(npy_path)
    for pattern in ('split-coords-fixed/32nm/*.npy',
                    'split-coords-fixed/16nm/*.npy',
                    'split-coords-fixed/8nm/*.npy')
    for npy_path in sorted(glob.glob(pattern))
]

In [21]:
%%time
def count_unfixable(npy_path):
    """
    Count the rows of the table stored at ``npy_path`` whose 'za' field is negative.

    NOTE(review): presumably a negative 'za' marks a row whose coordinates could not
    be repaired by the previous step -- confirm against repair_coords_on_splits.
    """
    za = np.load(npy_path)['za']
    return (za < 0).sum()
# Evaluate count_unfixable on every repaired table via Spark and add up the per-file counts.
unfixable_count = sc.parallelize(fixed_npy_paths).map(count_unfixable).sum()

CPU times: user 26.1 ms, sys: 10.6 ms, total: 36.7 ms
Wall time: 11 s


### Relabel table SVs from init agglo to the analysis node
(and drop bad edges)

In [126]:
# Create one output directory per resolution for the remapped tables (no-op if already present).
for _out_dir in ('updated-tables/32nm', 'updated-tables/16nm', 'updated-tables/8nm'):
    os.makedirs(_out_dir, exist_ok=True)

# Replace old SV ids with updated IDs by sampling from those coordinates.
def remap_split_svs(npy_path):
    """
    Replace retired supervoxel IDs in one edge table and save the result
    under 'updated-tables/'.

    For every row whose ``id_a`` and/or ``id_b`` is in ``retired_svs``, the
    replacement ID is fetched from ``analysis_node`` at that side's stored
    coordinate, (za, ya, xa) or (zb, yb, xb). Rows that do not reference a
    retired supervoxel are written out unchanged.

    NOTE(review): count_unfixable() treats rows with za < 0 as unfixable, but such
    rows are still looked up here like any other row -- confirm that is intended.

    Args:
        npy_path:
            '/'-separated path to a structured .npy table whose third-from-last
            component is 'split-coords-fixed'. The output is written to the same
            path with that component replaced by 'updated-tables'.

    Returns:
        The number of rows that referenced at least one retired supervoxel.
    """
    df = pd.DataFrame(np.load(npy_path))
    assert df['id_a'].dtype == np.uint64
    assert df['id_b'].dtype == np.uint64

    retired_svs # Reference this variable to ensure that it gets captured when pickling this function.
    rows_to_fix = df.eval('(id_a in @retired_svs) or (id_b in @retired_svs)')

    fixed_ids = []
    df_to_fix = df[rows_to_fix]
    for row in tqdm(df_to_fix.itertuples(), total=len(df_to_fix)):
        id_a, id_b = row.id_a, row.id_b
        if id_a in retired_svs:
            id_a = fetch_label_for_coordinate(analysis_node, (row.za, row.ya, row.xa), supervoxels=True)
        if id_b in retired_svs:
            id_b = fetch_label_for_coordinate(analysis_node, (row.zb, row.yb, row.xb), supervoxels=True)
        fixed_ids.append( (id_a, id_b) )

    # Force an (N, 2) shape so that a table with nothing to fix yields a (0, 2) array
    # instead of a 1-D empty array that cannot be matched against two columns.
    fixed_ids = np.array(fixed_ids, np.uint64).reshape(-1, 2)

    # Patch the IDs on plain uint64 arrays and reassign whole columns, so the values never
    # pass through pandas' setitem casting rules and the columns stay uint64.
    fix_mask = np.asarray(rows_to_fix, dtype=bool)
    for col_index, col_name in enumerate(['id_a', 'id_b']):
        ids = df[col_name].values.copy()
        ids[fix_mask] = fixed_ids[:, col_index]
        df[col_name] = ids

    assert df['id_a'].dtype == np.uint64
    assert df['id_b'].dtype == np.uint64

    # Mirror the input's location under 'updated-tables/'.
    parts = npy_path.split('/')
    assert parts[-3] == 'split-coords-fixed'
    parts[-3] = 'updated-tables'
    new_npy_path = '/'.join(parts)
    np.save(new_npy_path, df.to_records(index=False))

    return rows_to_fix.sum()

In [28]:
#remap_split_svs(fixed_npy_paths[0])

In [26]:
# Remap every repaired table on the cluster. The sum of the per-file return values is the
# total number of rows that referenced at least one retired supervoxel.
%time updated_row_count = sc.parallelize(fixed_npy_paths).map(remap_split_svs).sum()

CPU times: user 204 ms, sys: 46.3 ms, total: 250 ms
Wall time: 13min 29s


In [27]:
updated_row_count

2081409

### Body mapping

In [70]:
# Fetch the label mappings of the analysis node. The result is used below through
# `mapping.index.values` and `mapping.values` to build a LabelMapper
# (presumably supervoxel ID -> body ID; confirm against fetch_mappings).
mapping = fetch_mappings(analysis_node)

Fetching http://emdata3:8900/api/node/662e/segmentation/mappings...
Fetching http://emdata3:8900/api/node/662e/segmentation/mappings took 0:00:31.333353
Parsing mapping...
Parsing mapping took 0:00:07.734782


In [73]:
# Driver-side mapper, timed to see how long construction takes. The Spark job in the
# 'Filter' section does not use this object: it builds its own LabelMapper per partition.
%time mapper = LabelMapper(mapping.index.values, mapping.values)

CPU times: user 24 s, sys: 471 ms, total: 24.4 s
Wall time: 24.3 s


### Filter

In [133]:
# Create one output directory per resolution for the filtered tables (no-op if already present).
for _out_dir in ('filtered-tables/32nm', 'filtered-tables/16nm', 'filtered-tables/8nm'):
    os.makedirs(_out_dir, exist_ok=True)

# Add body IDs to each table, then keep only the inter-body edges that touch an important body.
def apply_mapping_and_filter_to_partition(paths):
    """
    Process every table in one Spark partition.

    For each path in ``paths`` the table is loaded, 'body_a'/'body_b' columns are
    computed from 'id_a'/'id_b' with a LabelMapper, rows are filtered, and the
    result is saved to the same path with 'updated-tables' replaced by 'filtered-tables'.

    Returns:
        A list with the number of rows kept for each input path, in input order.
    """
    # LabelMapper cannot be pickled, so it is constructed here, once per partition.
    mapper = LabelMapper(mapping.index.values, mapping.values)

    def apply_mapping_and_filter(npy_path):
        """Map, filter and save a single table; return how many rows survived."""
        table = pd.DataFrame(np.load(npy_path))

        for id_col, body_col in (('id_a', 'body_a'), ('id_b', 'body_b')):
            # A bug above caused the type to be int64. Fix that now.
            table[id_col] = table[id_col].astype(np.uint64)
            table[body_col] = mapper.apply(table[id_col].values, allow_unmapped=True)

        important_bodies # Referenced to ensure capture in this closure

        # Drop internal edges (both ends in the same body), and keep an edge only if an
        # important body is on at least one end -- this captures 1-hop and 2-hop edges.
        keep_query = '(body_a != body_b) and ((body_a in @important_bodies) or (body_b in @important_bodies))'
        table = table.query(keep_query)

        # Mirror the input's location under 'filtered-tables/'.
        path_parts = npy_path.split('/')
        assert path_parts[-3] == 'updated-tables'
        path_parts[-3] = 'filtered-tables'
        np.save('/'.join(path_parts), table.to_records(index=False))

        return len(table)

    return [apply_mapping_and_filter(npy_path) for npy_path in paths]

In [134]:
# Absolute paths of all remapped tables: 32nm first, then 16nm, then 8nm,
# sorted by name within each resolution.
updated_npy_paths = [
    os.path.abspath(npy_path)
    for pattern in ('updated-tables/32nm/*.npy',
                    'updated-tables/16nm/*.npy',
                    'updated-tables/8nm/*.npy')
    for npy_path in sorted(glob.glob(pattern))
]

In [135]:
%%time 
# Run the map-and-filter step on the cluster, one LabelMapper per partition.
# Each partition yields a list of per-file row counts; their sum is the total number of rows kept.
filtered_row_count = (sc.parallelize(updated_npy_paths)
                        .mapPartitions(apply_mapping_and_filter_to_partition)
                        .sum())

CPU times: user 23.6 s, sys: 4.18 s, total: 27.7 s
Wall time: 2min 45s


In [136]:
print(filtered_row_count)

755536110


In [137]:
# Absolute paths of all filtered tables: 32nm first, then 16nm, then 8nm,
# sorted by name within each resolution.
filtered_npy_paths = [
    os.path.abspath(npy_path)
    for pattern in ('filtered-tables/32nm/*.npy',
                    'filtered-tables/16nm/*.npy',
                    'filtered-tables/8nm/*.npy')
    for npy_path in sorted(glob.glob(pattern))
]

In [138]:
# Load every filtered table (with a progress bar) and stack them into one structured array.
combined_table = np.concatenate([np.load(npy_path) for npy_path in tqdm(filtered_npy_paths)])

100%|██████████| 300/300 [01:14<00:00,  4.02it/s]


In [147]:
# Report the size of the combined table: row count in millions, memory footprint in gigabytes.
print(len(combined_table) / 1e6, "M")
print(combined_table.nbytes / 1e9, "GB")

755.53611 M
64.22056935 GB


In [None]:
# DataFrame version of the combined table.
# NOTE(review): `combined_df` is not used by any cell shown here, and the table is ~64 GB;
# building the frame presumably needs a comparable amount of additional memory -- confirm before running.
combined_df = pd.DataFrame(combined_table)

In [154]:
# Persist the combined table. This is the file that the 'Load pre-filtered table' cell
# near the top of the notebook reads back with np.load().
%time np.save('combined-filtered-table.npy', combined_table)

CPU times: user 3.28 s, sys: 1min 7s, total: 1min 10s
Wall time: 2min 50s


In [151]:
ls

[0m[38;5;27m16nm[0m/                               [38;5;27mnotebook-cluster--20180723.094528[0m/
[38;5;27m32nm[0m/                               [38;5;27mnotebook-cluster--20180723.094651[0m/
[38;5;27m8nm[0m/                                [38;5;27mnotebook-cluster--20180723.180735[0m/
bodies-0.5-from-neuprint-52f9.csv   spark-focused.ipynb
[38;5;27mfiltered-tables[0m/                    [38;5;27msplit-coords-fixed[0m/
[38;5;27mnotebook-cluster--20180722.201030[0m/  [38;5;27mupdated-tables[0m/
