# Locality Sensitive Hashing: Exploration

This is a redo of Alex Klibisz's [sample notebook](https://github.com/alexklibisz/elastik-nearest-neighbors/blob/master/scratch/lsh-experiments/lsh-explore.ipynb) using [`plotly express`](https://medium.com/@plotlygraphs/introducing-plotly-express-808df010143d) instead of `matplotlib`.

In [None]:
import plotly_express as px
import plotly.graph_objs as go
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors

In [None]:
# First pass at LSH with random hyperplanes. It doesn't handle the case where
# points are not centered at (0, 0); the implementation further down handles
# that case nicely.
rng = np.random.RandomState(22)

# Generate random points in 2D space.
X = rng.normal(0, 1, size=(100, 2))

# Sample random coefficients. In practice you probably don't know
# the true distribution, so sample from the dataset. You might
# also benefit from adding some gaussian noise.
bits = 20  # number of hash bits == number of hyperplanes

# Klibisz's original drew every coefficient independently from the flattened data:
#   C = rng.choice(X.ravel(), size=(bits, X.shape[-1]))
# Here `bits` distinct rows of X are used as coefficient vectors instead
# (open question from the author: is this what Klibisz intended?).
C = X[rng.choice(X.shape[0], bits, replace=False)]

# Compute bias: one offset per hyperplane.
# Klibisz-style alternative: B = rng.choice(X.ravel(), size=(bits,))
B = rng.normal(0, 2, size=bits)

# Hash the points: bit j of a point is 1 when X . C[j] + B[j] >= 0.
H = ((X.dot(C.T) + B) >= 0).astype(np.uint8)

# Values closer to 0.5 are good here. Average over the points (axis=0) so that
# one proportion is reported per hash bit; a single overall mean can look
# balanced even when individual bits are almost always 0 or almost always 1.
print('Proportion positive at each hash: ', H.mean(axis=0))

# Compute the recall @ 10: for every point, count how many of its 10 nearest
# neighbours in the original space are also among its 10 nearest neighbours in
# hash space. The queries are the fitted points themselves, and the result is
# a count in [0, 10] rather than a fraction.
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='euclidean')
nbrs_true = knn.fit(X).kneighbors(X, return_distance=False)
nbrs_hash = knn.fit(H).kneighbors(H, return_distance=False)

recalls = np.array([len(np.intersect1d(a, b)) for a, b in zip(nbrs_true, nbrs_hash)])
print('Recall @10 min, mean, median, max = %.2lf, %.2lf, %.2lf, %.2lf' % (
    recalls.min(), recalls.mean(), np.median(recalls), recalls.max()))

# Scatter the data points and overlay every hash hyperplane as a dashed red line.
df = pd.DataFrame(X, columns=['x', 'y'])
fig = px.scatter(df, x='x', y='y', labels={'x':'', 'y':''}, width=600, height=600)
fig.layout.xaxis.range = fig.layout.yaxis.range = [-2, 2]
fig.layout.xaxis.rangemode = fig.layout.yaxis.rangemode = 'tozero'

# x-coordinates at which each line is evaluated; spans one unit beyond the
# smallest and largest values found anywhere in X.
lineX = np.linspace(X.min() - 1, X.max() + 1, 3)
for (c0, c1), b in zip(C, B):
    # A point hashes to 1 when c0*x + c1*y + b >= 0, so the decision boundary
    # is c0*x + c1*y + b = 0, i.e. y = -(c0*x + b) / c1. Without the leading
    # minus sign the drawn line is the boundary mirrored across the x-axis.
    # The small epsilon avoids dividing by zero when c1 == 0.
    y = -((c0 * lineX) + b) / (c1 + 1e-7)
    fig.add_scatter(x=lineX, y=y, mode='lines',
                    line={'dash': 'dash', 'color': 'red'}, showlegend=False)

fig

In [None]:
# Sanity check: coefficient vectors picked by row index must all be rows of X,
# so this loop is expected to print nothing.
for c in X[np.random.choice(X.shape[0], 20, replace=False)]:
    # `c in X` on an ndarray evaluates (X == c).any(), which is True as soon as
    # a single coordinate matches. Compare whole rows instead.
    if not (X == c).all(axis=1).any():
        print(c)

In [None]:
# Klibisz-style sampling draws every coefficient independently from the
# flattened data, so a sampled pair need not be a row of X. Print each sampled
# pair that is not an actual data point.
for c in rng.choice(X.ravel(), size=(bits, X.shape[-1])):
    # `c in X` on an ndarray evaluates (X == c).any(), which is True as soon as
    # a single coordinate matches. Compare whole rows instead.
    if not (X == c).all(axis=1).any():
        print(c)

In [None]:
# Display the bias vector B (one offset per hash bit) for inspection.
B