In [10]:
from datasketch import MinHash, MinHashLSH

import glob, json

def make_windows(doc, n):
    '''
    Yield consecutive, non-overlapping slices of `doc`, each holding n items.

    - chunk into n token blocks (at least for now); the last block is
      shorter when len(doc) is not a multiple of n
    - an empty doc yields nothing
    - no rolling windows. too big! something to improve later

    doc: any sliceable sequence that supports len() (e.g. a list of tokens)
    n:   block size, used as the step; n == 0 raises ValueError once the
         generator is iterated
    '''
    # https://gist.github.com/moshekaplan/4678925
    # range() instead of the Python 2-only xrange(): same start offsets,
    # but it does not raise NameError under Python 3.
    for start in range(0, len(doc), n):
        yield doc[start:start + n]

def get_tokens(fn):
    '''
    Return the tokens of the JSON file `fn`, in document order, skipping
    every token whose lowercased form is in STOPS.

    The file is read as JSON with a top-level "sentences" list whose entries
    each carry a "tokens" list. Tokens are returned with their original
    casing; lowercasing is only used for the STOPS lookup.
    '''
    with open(fn, "r") as handle:
        parsed = json.load(handle)
    return [
        tok
        for sent in parsed["sentences"]
        for tok in sent["tokens"]
        if tok.lower() not in STOPS
    ]

def get_stops():
    '''
    Build the set of tokens to ignore: every character of
    string.punctuation plus the entries of get_stop_words('en').
    '''
    import string
    from stop_words import get_stop_words
    ignored = set(string.punctuation)
    ignored.update(get_stop_words('en'))
    return ignored
    
STOPS = get_stops()

# data maps "<file name>-<window number>" to one window of at most 50
# stop-filtered tokens, for every *.anno file in the working directory.
data = {}
for dt in glob.glob("*.anno"):
    for wno, window in enumerate(make_windows(get_tokens(dt), 50)):
        key = "{}-{}".format(dt, wno)
        data[key] = window


# Create three MinHash objects, each with 128 permutation functions
m1, m2, m3 = (MinHash(num_perm=128) for _ in range(3))

# Feed every token of every window into all three sketches.
# NOTE(review): m1, m2 and m3 receive exactly the same updates, so they end
# up identical and the query below matches both indexed keys by construction.
for tokens in data.values():
    for tok in tokens:
        encoded = tok.encode('utf8')
        for sketch in (m1, m2, m3):
            sketch.update(encoded)

# Create a MinHashLSH index optimized for Jaccard threshold 0.5
# that accepts MinHash objects with 128 permutation functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)

# Insert m2 and m3 into the index under the keys "m2" and "m3"
for key, sketch in (("m2", m2), ("m3", m3)):
    lsh.insert(key, sketch)


# Using m1 as the query, retrieve the keys of the qualifying datasets
result = lsh.query(m1)
print("Candidates with Jaccard similarity > 0.5", result)


('Candidates with Jaccard similarity > 0.5', ['m3', 'm2'])


0.5