# Locality Sensitive Hashing: Exploration

This is a redo of Alex Klibisz's [sample notebook](https://github.com/alexklibisz/elastik-nearest-neighbors/blob/master/scratch/lsh-experiments/lsh-explore.ipynb) using [`plotly express`](https://medium.com/@plotlygraphs/introducing-plotly-express-808df010143d) instead of `matplotlib`.

In [None]:
import plotly_express as px
import plotly.graph_objs as go
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors

In [None]:
# Sample 200 points in 2D from a normal distribution (mean 5, std 1).
# The seed is fixed so every run of the notebook produces the same points.
rng = np.random.RandomState(33)
X = rng.normal(5, 1, size=(200, 2))

# Wrap the points in a DataFrame and show them as a marker-only scatter
# inside an interactive figure widget (required for hover callbacks).
df = pd.DataFrame(X, columns=['x', 'y'])
fig = go.FigureWidget(data=[{
    'type': 'scatter',
    'x': df['x'],
    'y': df['y'],
    'mode': 'markers',
    'showlegend': False,
}])

# Optional: force both axes to start at zero.
# fig.layout.xaxis.rangemode = fig.layout.yaxis.rangemode = 'tozero'

# Square 600x600 canvas.
fig.layout.width = 600
fig.layout.height = 600
# Anchor the y axis scale to the x axis with ratio 1 so distances are not distorted.
fig.layout.yaxis = dict(scaleanchor='x', scaleratio=1)
# Hover events should report the single closest point.
fig.layout.hovermode = 'closest'

In [None]:
# Exact (brute-force, Euclidean) 10-nearest-neighbor model used as ground truth.
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='euclidean')

# Fit on the raw points, then look up the neighbor indices of every point.
knn.fit(X)
nbrs_true = knn.kneighbors(X, return_distance=False)

# TODO: the recall@10 evaluation below is disabled because it needs `H`
# (presumably the matrix of hash codes), which no visible cell defines.
#
# Values closer to 0.5 are good here.
# print('Proportion positive at each hash: ', H.mean())
#
# nbrs_hash = knn.fit(H).kneighbors(H, return_distance=False)
#
# recalls = np.array([len(np.intersect1d(a, b)) for a, b in zip(nbrs_true, nbrs_hash)])
# print('Recall @10 min, mean, median, max = %.2lf, %.2lf, %.2lf, %.2lf' % (
#     recalls.min(), recalls.mean(), np.median(recalls), recalls.max()))

In [None]:
def hover_fn(trace, points, state):
    """Recolor the scatter so the hovered point and its nearest neighbors stand out.

    Intended to be registered with ``fig.data[0].on_hover``. Colors, in
    increasing priority:

    * ``'blue'``   - every point by default,
    * ``'orange'`` - the nearest neighbors of the hovered point (per ``knn``),
    * ``'green'``  - the hovered point itself,
    * ``'red'``    - points of ``X`` found in the optional global ``C``.

    Args:
        trace: The trace that fired the event. Unused; the function always
            recolors ``fig.data[0]``.
        points: Hover payload; ``points.point_inds[0]`` is taken as the index
            of the hovered point within ``X``.
        state: Input-device state supplied by plotly. Unused.

    Relies on the notebook globals ``X``, ``knn`` and ``fig``.
    """
    # Robustness: an event without any point has nothing to highlight.
    if not points.point_inds:
        return

    point_of_interest = points.point_inds[0]

    # Refit on X so the lookup does not depend on what `knn` was last fitted
    # to, then query only the hovered row instead of every point in X.
    neighbors = knn.fit(X).kneighbors(
        X[[point_of_interest]], return_distance=False)[0]

    neighbor_colors = ['blue'] * len(X)
    for n in neighbors:
        neighbor_colors[n] = 'orange'
    neighbor_colors[point_of_interest] = 'green'

    # `C` is not defined by any live cell, so referencing it directly raised
    # NameError on every hover. Treat it as optional and only mark points red
    # when the notebook actually provides it.
    # NOTE(review): for numpy arrays `x in C` is true when ANY element of
    # `C == x` is true, not only on a full-row match - confirm the intent.
    highlighted = globals().get('C')
    if highlighted is not None:
        for n, x in enumerate(X):
            if x in highlighted:
                neighbor_colors[n] = 'red'

    # Push the final colors to the widget once instead of twice.
    fig.data[0].marker.color = neighbor_colors
    fig.data[0].marker.opacity = 1

In [None]:
# One separating line is drawn per bit.
bits = 2

# Draw `bits` pairs of distinct row indices into X (no index is reused).
vector_sample = rng.choice(np.arange(len(X)), size=(bits, 2), replace=False)

# Every line is drawn between the same two x-coordinates (0 and 10).
line_x = np.linspace(0, 10, 2)

for idx_p, idx_q in vector_sample:
    p = X[idx_p]
    q = X[idx_q]

    # The midpoint and the normal vector are effectively the only
    # information that needs to be stored for each line.
    midpoint = (p + q) / 2
    normal = midpoint - q

    # Optional: also mark the two sampled points.
    # fig.add_scatter(x=[p[0], q[0]], y=[p[1], q[1]], mode='markers')

    # Points z on the line satisfy normal . (z - midpoint) = 0; solve for y
    # at the two x-coordinates.
    line_y = (normal[0] * line_x - normal.dot(midpoint)) / (-1 * normal[1])
    fig.add_scatter(
        x=line_x,
        y=line_y,
        mode='lines',
        line={'dash':'dash', 'color':'red'},
        showlegend=False,
    )

# Drive the highlighting from hover events on the point trace and suppress
# the default hover tooltip.
fig.data[0].on_hover(hover_fn)
fig.data[0].hoverinfo = 'none'

# Last expression of the cell: renders the widget.
fig

In [None]:
# # Values closer to 0.5 are good here.
# print('Proportion positive at each hash: ', H.mean())

# # Compute the recall @ 10...
# knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='euclidean')
# nbrs_true = knn.fit(X).kneighbors(X, return_distance=False)
# nbrs_hash = knn.fit(H).kneighbors(H, return_distance=False)

# recalls = np.array([len(np.intersect1d(a, b)) for a, b in zip(nbrs_true, nbrs_hash)])
# print('Recall @10 min, mean, median, max = %.2lf, %.2lf, %.2lf, %.2lf' % (
#     recalls.min(), recalls.mean(), np.median(recalls), recalls.max()))

# lineX = np.linspace(X.min() - 1, X.max() + 1, 3)
# for i in range(bits):
#     c0, c1 = C[i]
#     b = B[i]
#     y = ((c0 * lineX) + b) / (c1 + 1e-7)
#     df_line = pd.DataFrame({'x':lineX, 'y':y})
#     fig.add_scatter(x=df_line.x, y=df_line.y, showlegend=False)
#     fig.data[i+1].update(mode='lines', line={'dash':'dash', 'color':'red'})
    

# point_of_interest = 55
# neighbor_colors = ['blue']*len(X)
# neighbors = [n for n,h in enumerate(H) if (h == H[point_of_interest]).all()]
# for n in nbrs_true[point_of_interest]:
#     neighbor_colors[n] = 'orange'
# neighbor_colors[point_of_interest] = 'green'
# fig.data[0].marker.color = neighbor_colors
# fig.data[0].marker.opacity = 1
# fig.data[0].on_hover(hover_fn)
# fig.data[0].hoverinfo = 'none'

# C_index = [n for n,x in enumerate(X) if (x in C)]
# c_colors = ['blue']*len(X)
# for n in C_index:
#     c_colors[n] = 'red'
# fig.data[0].marker.color = c_colors

# fig.layout.hovermode = 'closest'
# fig

In [None]:
# Refit `knn` on X and display the array of neighbor indices for every point
# in X (indices only; distances are not returned).
knn.fit(X).kneighbors(X, return_distance=False)