In [1]:
import numpy as np
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Problem size: a handful of points living in a high-dimensional space
n_features = 100
n_samples = 10
n_components = 3  # projected dimensions, i.e. one bit per component in each hash

# Reproducible standard-normal data, one row per sample
np.random.seed(42)
data = np.random.randn(n_samples, n_features)

# LSH projector: a sparse random projection fitted on the data.
# fit() hands back the fitted estimator, so construction and fitting are chained.
lsh = SparseRandomProjection(random_state=42, n_components=n_components).fit(data)


# Hash function: simply binarize the projected data
def hash_vector(v, projector=None):
    """Return the binary LSH signature of a single vector as a bit string.

    The vector is projected, and every projected coordinate contributes one
    character: "1" if it is strictly positive, "0" otherwise.

    Parameters
    ----------
    v : numpy.ndarray
        One sample; it is reshaped to a single row before projection.
    projector : object with a ``transform`` method, optional
        Projection to apply. When omitted, the notebook-level ``lsh`` model is
        used, so existing ``hash_vector(v)`` calls behave exactly as before.
        Passing a projector explicitly removes the dependency on global state.

    Returns
    -------
    str
        One "0"/"1" character per projected component.
    """
    if projector is None:
        # Resolved at call time so the currently fitted notebook model is used
        projector = lsh
    projection = projector.transform(v.reshape(1, -1))
    # Sign-binarize: strictly positive -> 1, zero or negative -> 0
    bits = (projection > 0).astype(int).flatten()
    return "".join(bits.astype(str))


# Compute the binary LSH code of every row in the dataset
hashes = list(map(hash_vector, data))

# List each vector (first five coordinates only) next to its hash
for i in range(len(hashes)):
    vec, hsh = data[i], hashes[i]
    print(f"Vector {i}: {vec[:5]}... Hash: {hsh}")

# Side-by-side look at the hashes of the first two vectors.
# NOTE: the rows are independent random draws, so vectors 0 and 1 are not
# constructed to be similar and their hashes are not expected to match.
print("\nExample of similarity in hashes:")
print(f"Vector 0 hash: {hashes[0]}")
print(f"Vector 1 hash: {hashes[1]}")

Vector 0: [ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337]... Hash: 100
Vector 1: [-1.41537074 -0.42064532 -0.34271452 -0.80227727 -0.16128571]... Hash: 001
Vector 2: [ 0.35778736  0.56078453  1.08305124  1.05380205 -1.37766937]... Hash: 110
Vector 3: [-0.82899501 -0.56018104  0.74729361  0.61037027 -0.02090159]... Hash: 111
Vector 4: [-1.59442766 -0.59937502  0.0052437   0.04698059 -0.45006547]... Hash: 100
Vector 5: [ 0.92617755  1.90941664 -1.39856757  0.56296924 -0.65064257]... Hash: 110
Vector 6: [ 0.75698862 -0.92216532  0.86960592  1.35563786  0.4134349 ]... Hash: 001
Vector 7: [-0.52272302  1.04900923 -0.70434369 -1.4084613  -1.55662917]... Hash: 111
Vector 8: [ 0.93828381 -0.51604473  0.09612078 -0.46227529 -0.43449623]... Hash: 001
Vector 9: [ 0.36867331 -0.39333881  0.02874482  1.27845186  0.19109907]... Hash: 001

Example of similarity in hashes:
Vector 0 hash: 100
Vector 1 hash: 001


In [4]:
# Function to calculate cosine similarity for all pairs
def compute_all_cosine_similarities(data):
    """Brute-force baseline: evaluate cosine similarity for every unordered pair.

    Each row is compared with every row that follows it. The similarity values
    are thrown away; only the number of evaluations is reported.

    Parameters
    ----------
    data : indexable of numpy.ndarray rows
        ``data[i]`` must support ``reshape``.

    Returns
    -------
    int
        How many pairwise cosine-similarity evaluations were performed.
    """
    total_rows = len(data)
    evaluations = 0
    for left in range(total_rows):
        right = left + 1
        while right < total_rows:
            row_a = data[left].reshape(1, -1)
            row_b = data[right].reshape(1, -1)
            cosine_similarity(row_a, row_b)
            evaluations += 1
            right += 1
    return evaluations


# Function to calculate cosine similarities using LSH
def compute_lsh_cosine_similarities(data, hashes):
    """Evaluate cosine similarity only for pairs of rows that share an LSH hash.

    Rows are grouped into buckets by hash value, and within each bucket every
    unordered pair is compared. The similarity values are discarded; the
    function only reports how many comparisons were needed.

    Parameters
    ----------
    data : indexable of numpy.ndarray rows
        ``data[i]`` is the vector whose hash is ``hashes[i]``.
    hashes : sequence of hashable
        One hash per row of ``data``.

    Returns
    -------
    int
        Number of cosine-similarity evaluations performed.
    """
    # Bucket row positions by hash in a single pass, instead of rescanning the
    # whole hash list once per distinct hash value.
    buckets = {}
    for position, code in enumerate(hashes):
        buckets.setdefault(code, []).append(position)

    count = 0
    for members in buckets.values():
        bucket_size = len(members)
        for a in range(bucket_size):
            for b in range(a + 1, bucket_size):
                cosine_similarity(
                    data[members[a]].reshape(1, -1),
                    data[members[b]].reshape(1, -1),
                )
                count += 1
    return count


# Brute-force baseline: one evaluation for every unordered pair of rows
all_cosine_calculations = compute_all_cosine_similarities(data)

# LSH-pruned variant: only rows whose hashes are identical are compared
lsh_cosine_calculations = compute_lsh_cosine_similarities(data, hashes)

# Report both comparison counts so the reduction from bucketing is visible
print("Number of calculations for naive cosine similarity:", all_cosine_calculations)
print("Number of calculations for LSH cosine similarity:", lsh_cosine_calculations)

Number of calculations for naive cosine similarity: 45
Number of calculations for LSH cosine similarity: 9
