## LSH Prototype 2

In [3]:
h = None           # Number of individual signed projections to compute
g = None           # Number of composite hashes (groupings of signs)
sample_rate = None # Variant sampling rate used in extraction
ds_name = None     # Dataset name 
n_workers = 8      # Number of dask workers
mem_fraction = .9  # Maximum fraction of system memory to use

# Example Settings
# h = 24
# g = 100        
# sample_rate = .05
# ds_name = 'hapmap'
# n_workers = 8
# mem_fraction = .9

In [4]:
import os
import pandas as pd
import numpy as np
from dask.distributed import Client
import dask.array as da
import dask.dataframe as dd
import xarray as xr
%run {os.environ['NB_DIR']}/nb.py
%run $BENCHMARK_METHOD_DIR/common.py
assert h is not None
assert g is not None
assert sample_rate is not None
assert ds_name is not None
ds_config = DATASET_CONFIG[ds_name]
n_projections = h * g
ds_path = dataset_path(ds_name, sr=sample_rate)

### Initialization

In [3]:
client = get_dask_client(n_workers=n_workers, max_mem_fraction=mem_fraction)
client

0,1
Client  Scheduler: tcp://127.0.0.1:40373  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 120.00 GB


In [4]:
# Load the coded call data
X = da.from_zarr(ds_path + '.zarr')

# Rechunk to ensure that more than one worker is used downstream
# (which is the case with a relatively small number of variants < 1M)
def blocks(n, n_workers):
    if n <= n_workers:
        return n
    return n // n_workers
X = X.rechunk(chunks=(blocks(X.shape[0], n_workers), -1))
X

Unnamed: 0,Array,Chunk
Bytes,12.00 MB,1.50 MB
Shape,"(72732, 165)","(9091, 165)"
Count,20 Tasks,9 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 12.00 MB 1.50 MB Shape (72732, 165) (9091, 165) Count 20 Tasks 9 Chunks Type uint8 numpy.ndarray",165  72732,

Unnamed: 0,Array,Chunk
Bytes,12.00 MB,1.50 MB
Shape,"(72732, 165)","(9091, 165)"
Count,20 Tasks,9 Chunks
Type,uint8,numpy.ndarray


In [5]:
# Convert to xarray and center variant vectors (to make dot product equivalent to un-normalized cosine)
X = xr.DataArray(
    data=X, 
    dims=['variant', 'sample'],
    coords=dict(
        variant=da.arange(X.shape[0], dtype=np.int32),
        sample=da.arange(X.shape[1], dtype=np.int32)
    ),
    name='calls'
)
X -= X.mean(dim='sample')
X

In [6]:
# Generate random vectors for projection
da.random.seed(seed=1)
R = xr.DataArray(
    data=da.random.normal(
        size=(len(X['sample']), n_projections), 
        # Split the random array into chunks here as well
        chunks=(-1, blocks(n_projections, n_workers))
    ), 
    dims=('sample', 'projection'),
    coords=dict(
        sample=da.arange(len(X['sample']), dtype=np.int32),
        projection=da.arange(n_projections, dtype=np.int32)
    ),
    name='random'
)
R

### Compute Hash Bits

In [7]:
# Apply projection and attach hash signature grouping
P = (
    (X @ R)
    .rename('projections')
    .pipe(lambda x: x.assign_coords(hash_group=('projection', x['projection']//h)))
)
P

In [9]:
%%time
# Group by hash signature group (i.e. columns) and compute row-wise hashes
# within those columns, which will each be boolean vectors indicating sign
# TODO: This should operate on rows across hash group boundaries since the number
# of groups can be high and making many small chunks is inefficient
def hash_bits(x):
    return np.expand_dims(np.apply_along_axis(
        lambda r: hash(np.asarray(r).tobytes()), 
        axis=1, arr=x
    ), 1)
H = xr.DataArray(
    (P > 0).data.rechunk(chunks=(P.data.chunksize[0], h))
    .map_blocks(hash_bits, chunks=(P.data.chunksize[0], 1))
    .compute(),
    dims=('variant', 'hash_group'),
    coords=dict(
        variant=P['variant'],
        hash_group=np.arange(g)
    ),
    name='hash_value'
)
H

CPU times: user 4.08 s, sys: 200 ms, total: 4.28 s
Wall time: 10.2 s


Note that the above is unfortunately much faster than using the xarray API directly:

In [10]:
# %%time
# def hash_bits(x, axis=None):
#     # NOTE: It makes little difference if the np conversion is done
#     # per row or initially for this group (it must all be loaded into memory already)
#     return xr.DataArray(da.apply_along_axis(
#         lambda r: hash(np.asarray(r).tobytes()), 
#         axis=axis, arr=x
#     ))

# H = (
#     (P > 0)
#     .groupby('hash_group')
#     .reduce(hash_bits, dim='projection')
#     .rename('hash_value')
# )
# H

# For h = 24, g = 100:
# CPU times: user 24 s, sys: 1.64 s, total: 25.6 s
# Wall time: 2min 10s

### Compute Hash Bucket

In [11]:
L = H.stack(i=('variant', 'hash_group')).reset_index('i')
L

In [12]:
%%time
def hash_bucket(x, axis=None):
    return xr.DataArray(da.apply_along_axis(
        lambda r: hash(np.asarray(r).tobytes()), 
        axis=axis, arr=x
    ))
L = L.assign_coords(hash_bucket=
    xr.concat([L, L['hash_group']], dim='component', coords='minimal').T
    .pipe(lambda x: x.reduce(hash_bucket, dim='component'))
    .rename('hash_bucket')
)
L

CPU times: user 2.49 s, sys: 431 ms, total: 2.92 s
Wall time: 28.2 s


### Export

In [13]:
df = L.to_dataset().to_dask_dataframe().drop('i', axis='columns')
df.head(8)

Unnamed: 0,variant,hash_group,hash_bucket,hash_value
0,0,0,4518865617056631551,-4202982439074213779
1,0,1,-2374908829845851768,-622209708047959191
2,0,2,-6764392033969002082,-8577516402077061696
3,0,3,43936362975166207,-4381640997820518551
4,0,4,-6561558303003957345,-5538741014653971404
5,0,5,-773945013534815288,3493395928185676612
6,0,6,8866745322780316067,3710387948626310814
7,0,7,-6049193602095995057,-637540166530277304


In [14]:
%%time
path = dataset_path(ds_name, sr=sample_rate, h=h, g=g) + '.parquet'
df.to_parquet(path)
path

CPU times: user 314 ms, sys: 209 ms, total: 523 ms
Wall time: 2.66 s


'/tmp/benchmark_datasets/hapmap-sr=0.05-hash=lsh.parquet'

In [16]:
client.shutdown()