In [2]:
import numpy as np
import pickle

from scipy.sparse import csr_matrix
from scripts.caracteristica import caracteristica
from datasketch import MinHash, MinHashLSH

from tqdm import tqdm

In [3]:
n_perm = 128


def minhash_vector(sparse_vector: csr_matrix, num_perm: int = n_perm):
    """Build a MinHash signature from the stored column indices of a sparse vector.

    Args:
        sparse_vector: CSR matrix whose ``indices`` array lists the entries
            to hash (presumably a single row of the feature matrix -- confirm
            against callers).
        num_perm: Number of permutations handed to ``MinHash``; defaults to
            the module-level ``n_perm``.

    Returns:
        The ``MinHash`` object after it has been updated with every index.
    """
    signature = MinHash(num_perm=num_perm)
    # Each index is fed to the hash as its decimal string, UTF-8 encoded.
    for column in map(str, sparse_vector.indices):
        signature.update(column.encode('utf8'))
    return signature

In [4]:
# NOTE(review): k is presumably the shingle/n-gram size used by
# scripts.caracteristica -- confirm against that module.
k = 2

# Optional size limit for quick experiments, e.g. sample_size = 20_000.
# None (the default) processes the whole dataset.
# Caution: the pickle file name used later only encodes k and threshold, so an
# index built from a sample would overwrite the full-data index.
sample_size = None

# With no sample size the call is exactly caracteristica(k); otherwise the
# limit is forwarded as the second positional argument.
if sample_size is None:
    matrix, tweets = caracteristica(k)
else:
    matrix, tweets = caracteristica(k, sample_size)

In [5]:
# Similarity threshold handed to MinHashLSH; num_perm must match the
# signatures produced by minhash_vector.
threshold = 0.2
lsh = MinHashLSH(threshold=threshold, num_perm=n_perm)

# Pass 1: one (row index, signature) pair per row of `matrix`, in row order.
# extend() consumes the generator item by item, so `hashes` grows
# incrementally just like an explicit append loop.
hashes = []
hashes.extend(
    (row, minhash_vector(matrix[row]))
    for row in tqdm(range(matrix.shape[0]))
)

# Pass 2: insert every signature into the index, keyed by its row index,
# through a single insertion session.
with lsh.insertion_session() as session:
    for i, minhash in tqdm(hashes):
        session.insert(i, minhash)

100%|██████████| 1371764/1371764 [17:31<00:00, 1304.02it/s]
100%|██████████| 1371764/1371764 [01:16<00:00, 18027.95it/s]


In [6]:
# Persist the LSH index. The context manager guarantees the file handle is
# flushed and closed, even if pickling fails part-way through.
with open(f'../Data/lsh/k{k}_th{threshold}.pkl', 'wb') as lsh_file:
    pickle.dump(lsh, lsh_file)

In [12]:
# Spot check: look up the candidate neighbours of row 1500.
m = minhash_vector(matrix[1500])
# Query the index once and reuse the result for both printouts.
neighbours = lsh.query(m)
print(neighbours)
print(len(neighbours))

[565250, 1122311, 36873, 917520, 1099796, 256023, 612375, 653337, 1357863, 518189, 991278, 1026095, 782384, 217134, 1361970, 202800, 1108017, 714797, 194614, 260151, 686134, 755769, 280636, 200774, 528455, 88141, 839757, 1359949, 833621, 630871, 1065048, 1189977, 839767, 634969, 827484, 1304673, 931938, 1060963, 1153124, 129128, 579689, 1263721, 1181800, 1116268, 798825, 10346, 1265776, 964721, 573555, 1200246, 618618, 751739, 983164, 206975, 1165440, 688258, 10372, 49285, 440454, 1259655, 1011852, 921742, 280720, 1095825, 311442, 620690, 735386, 92315, 641181, 176289, 491681, 280741, 190630, 1093798, 1106088, 1200299, 821423, 1052850, 329912, 364728, 1192126, 1134783, 977088, 6337, 1343685, 620741, 573639, 817352, 735433, 221387, 248012, 1360076, 1319119, 1192144, 794834, 544979, 897235, 41175, 92381, 280797, 1046751, 266464, 192737, 1046752, 809187, 233701, 336102, 1059053, 176368, 28913, 1235186, 264432, 1007860, 82163, 401656, 1265915, 350461, 1235198, 1353983, 1310977, 358660, 739