In [199]:
from __future__ import division
import numpy as np
import ujson as json
import hashlib
import ipdb
import cPickle as pickle
import itertools
import glob
import datetime
import pandas as pd
import numpy as np
# Sanity check: hashing the same input twice must yield the same digest.
# NOTE(review): both sides are the identical expression, so this can only
# fail if md5 itself were non-deterministic.
assert hashlib.md5("a").hexdigest() == hashlib.md5("a").hexdigest()

from pi import make_pi

import logging
# Send all log records (DEBUG and above) to windows.log; do_doc logs one
# line per processed document there.
logging.basicConfig(filename='windows.log', level=logging.DEBUG)

import os

def reset_file():
    '''Recreate shingles.congress so that it holds only the CSV header line.'''
    header_line = 'fn,digit,window_no,window_size,iter' + "\n"
    # Mode "w" truncates any rows left over from a previous run.
    with open("shingles.congress", "w") as csv_out:
        csv_out.write(header_line)

# Start from an output file that contains only the CSV header.
reset_file()
# Background reading on shingling and near-duplicate detection:
# http://nlp.stanford.edu/IR-book/html/htmledition/near-duplicates-and-shingling-1.html

In [201]:
HASH_LEN = 32  # length of an md5 hexdigest in hex characters; also the size passed to make_pi
SHINGLESIZE = 2  # number of consecutive tokens joined into one shingle
iters = 100  # number of permutation rounds (reassigned further down before the full run)

def make_windows(doc, n):
    '''Return every run of n consecutive items of doc (i.e. all n-grams).'''
    # Build n copies of doc, the k-th one shifted left by k positions;
    # zipping them lines up each item with its n-1 successors and stops
    # when the shortest (most shifted) copy runs out.
    shifted = []
    for offset in range(n):
        shifted.append(doc[offset:])
    return zip(*shifted)

def shingle(doc, n):
    '''Return the list of shingles of doc: each run of n consecutive
    tokens joined into a single space-separated string.'''
    offsets = [doc[start:] for start in range(n)]
    joined = []
    for gram in zip(*offsets):
        joined.append(" ".join(gram))
    return joined

def get_pi():
    '''Load and return the object pickled in "pi.p".

    Presumably this is the permutation most recently written by make_pi --
    confirm against the pi module.
    '''
    # Pickles are byte streams: read in binary mode so loading does not
    # depend on newline translation or text decoding.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only use this with a pi.p that was generated locally.
    with open("pi.p", "rb") as inf:
        return pickle.load(inf)

def permute(hashed, pi_):
    '''Permute the characters of hashed and return them as a "0x..." string.

    hashed -- string of hex characters (e.g. an md5 hexdigest)
    pi_    -- permutation of range(len(hashed)); the character at position
              i of hashed is moved to position pi_[i] of the result

    Raises AssertionError if hashed and pi_ differ in length.
    '''
    assert len(hashed) == len(pi_)
    # Size the buffer from the permutation itself rather than from the
    # global HASH_LEN, so any digest/permutation pair of equal length works.
    out2 = [None] * len(pi_)

    for pno, p in enumerate(pi_):
        out2[p] = hashed[pno]

    return "0x" + "".join(out2)

def shingle_hash_permute_min(tokens):
    '''Shingle the tokens, md5-hash every shingle, permute each digest with
    the stored permutation and return the smallest result as an integer.'''
    permutation = get_pi()
    permuted = []
    for text in shingle(tokens, SHINGLESIZE):
        digest = hashlib.md5(text).hexdigest()
        permuted.append(permute(digest, permutation))
    # The minimum is taken over the "0x..." strings (string comparison) and
    # only then parsed as a base-16 integer.
    return int(min(permuted), 16)

def jaccard(a, b):
    '''Jaccard similarity of two collections: |A & B| / |A | B|.

    a, b -- any iterables of hashable items (here: shingles of a document);
            duplicates are ignored.

    Returns a float in [0, 1]. Two empty collections are treated as
    identical and yield 1.0 instead of raising ZeroDivisionError.
    '''
    set_a, set_b = set(a), set(b)
    union = set_a | set_b
    if not union:
        return 1.0
    return len(set_a & set_b) / len(union)

def sanity_check(d1, d2):
    '''Compare the min-hash estimate with the exact Jaccard similarity.

    d1, d2 -- indexes into the module-level `docs` sequence (not defined in
              this part of the notebook; each entry is expected to be a
              token sequence -- TODO confirm).

    Returns (estimate, exact): the fraction of the `iters` rounds in which
    both documents produced the same minimum, and the Jaccard similarity
    of their shingle sets. As iters grows the two should converge.
    '''
    equal = 0
    for _ in range(iters):
        make_pi(HASH_LEN)
        # Hash the documents themselves (not their indexes), and only the
        # two being compared.
        if shingle_hash_permute_min(docs[d1]) == shingle_hash_permute_min(docs[d2]):
            equal += 1
    exact = jaccard(set(shingle(docs[d1], SHINGLESIZE)), set(shingle(docs[d2], SHINGLESIZE)))
    return equal/iters, exact
    
def sketch_docs():
    '''Return `iters` sketches of the module-level `docs`.

    Each sketch is a list with one (min_value, doc_index) pair per
    document, computed after a fresh make_pi(HASH_LEN) call.
    '''
    out = []
    for _ in range(iters):
        make_pi(HASH_LEN)
        # Pass the document's tokens, not its index, to the hasher.
        out.append([(shingle_hash_permute_min(doc), j) for j, doc in enumerate(docs)])
    return out

def get_tokens(fn):
    '''Read the JSON file fn and return all of its tokens as one flat list,
    in sentence order (every entry of "tokens" for each item of "sentences").'''
    with open(fn, "r") as handle:
        parsed = json.load(handle)
    return [tok for sent in parsed["sentences"] for tok in sent["tokens"]]

In [None]:
def do_doc(fn, window_size, iter_no):
    '''
    Sketch every window of one document and append the results to
    shingles.congress.

       - split the tokens of fn into windows of window_size tokens
       - shingle_hash_permute_min each window
       - append one CSV row per window: fn,digit,window_no,window_size,iter
       - log a debug line once the document is done

    fn          -- path of the annotated JSON document
    window_size -- number of tokens per window
    iter_no     -- index of the current permutation round (written as-is)
    '''
    windows = make_windows(get_tokens(fn), window_size)
    # Open the output once per document instead of once per window; rows are
    # still flushed when the block exits.
    with open("shingles.congress", "a") as outf:
        for window_no, window in enumerate(windows):
            min_digit = shingle_hash_permute_min(window)
            out_str = ",".join([fn, str(min_digit), str(window_no), str(window_size), str(iter_no)])
            outf.write(out_str + "\n")
    logging.debug("{}\t{}\tran permute".format(datetime.datetime.utcnow(), fn))

# Regenerate the sketch file from scratch.
reset_file()
iters = 10000  # permutation rounds; also the denominator used by fancy_jaccard
WINDOWSIZE = 3  # tokens per window

for iter_ in range(iters):
    # One make_pi call per round, shared by all documents of that round.
    make_pi(HASH_LEN)
    for doc in glob.glob("demos/*anno"):
        # Write each (document, round) exactly once: duplicated rows would
        # multiply the matches found by the merge in fancy_jaccard.
        do_doc(doc, WINDOWSIZE, iter_)

def fancy_jaccard():
    '''Print the min-hash estimate of the Jaccard similarity between the
    first windows of demos/1.anno and demos/2.anno.

    The estimate is the number of rounds in which both windows produced
    the same digit, divided by the module-level `iters`.
    '''
    # Digits are parsed from 32 hex characters, so they can exceed 64 bits;
    # read them as strings so equality is exact.
    df = pd.read_csv("shingles.congress", dtype={"digit": str})
    keys = ['digit', 'iter']
    first_window = df.window_no == 0

    # Drop repeated rows so that each round is counted at most once even if
    # a document was written more than once for the same round.
    aa = df[(df.fn == "demos/1.anno") & first_window].drop_duplicates(subset=keys)
    bb = df[(df.fn == "demos/2.anno") & first_window].drop_duplicates(subset=keys)

    matches = pd.merge(aa, bb, on=keys, how='inner')["iter"].count()
    print("fancy Jaccard {}".format(matches/iters))


def get_window(fn, size, window_no):
    '''Return window number window_no (0-based) among the windows of `size`
    consecutive tokens of the document fn.

    Raises IndexError if the document has no such window.
    '''
    # make_windows is the n-gram helper defined in this notebook; the result
    # is materialised as a list so that it can be indexed.
    ngrams = list(make_windows(get_tokens(fn), size))
    return ngrams[window_no]

def poormans_jaccard():
    '''Print the exact Jaccard similarity between the shingle sets of the
    first windows of demos/1.anno and demos/2.anno.'''
    window1 = get_window("demos/1.anno", WINDOWSIZE, 0)
    window2 = get_window("demos/2.anno", WINDOWSIZE, 0)

    # Shingle the two windows that were just loaded.
    shingles1 = shingle(window1, SHINGLESIZE)
    shingles2 = shingle(window2, SHINGLESIZE)
    print("Poor man's Jaccard {}".format(jaccard(set(shingles1), set(shingles2))))

# Print the sketch-based estimate and the exact Jaccard similarity for the
# same pair of windows so they can be compared.
fancy_jaccard()
poormans_jaccard()