In [None]:
# EE226 - Coding 2
## Streaming algorithm & Locality Sensitive Hashing

### Streaming: DGIM

DGIM is an efficient algorithm for processing large streams. When it is infeasible to store the entire binary stream, DGIM can estimate the number of 1-bits in the most recent window. In this coding assignment, you are given *stream_data.txt* (a binary stream), and you need to implement the DGIM algorithm to count the number of 1-bits. Write code and answer the questions below.

### Your task

1. Set the window size to 1000, and count the number of 1-bits in the current window.

In [None]:
import time
import sys as sys

# DGIM state shared by Merge_buckets() and DGIM().
# Each bucket is a dict {"timestamp": ..., "bitsum": ...}; new buckets are appended at the end,
# so index 0 always holds the oldest bucket.
buckets = []
# Number of most recent bits that make up the window (set to 1000 by the task statement).
window_size = 1000
# Number of stream bits considered so far; the window covers the last `window_size` of them.
current_time = 4000

def Merge_buckets(bucket_list=None):
    """Restore the DGIM invariant after a new bucket has been appended.

    Buckets are ordered oldest-first.  Whenever four buckets of the same size
    exist (the bucket at index ``i`` has the same ``bitsum`` as the one at
    ``i - 3``), the two oldest of them are merged into a single bucket of
    twice the size that keeps the more recent of the two timestamps.  The scan
    runs from the newest bucket towards the oldest, so a merge can cascade
    into the next larger size within the same call.

    Args:
        bucket_list: list of ``{"timestamp": int, "bitsum": int}`` dicts to
            merge in place.  Defaults to the module-level ``buckets`` list so
            existing ``Merge_buckets()`` calls keep working.

    Returns:
        None.  The list is modified in place.
    """
    if bucket_list is None:
        bucket_list = buckets
    # The range is fixed up front; each merge removes one element located
    # before the next index visited, so every later index stays valid.
    for newest in range(len(bucket_list) - 1, 2, -1):
        oldest = newest - 3
        if bucket_list[newest]['bitsum'] == bucket_list[oldest]['bitsum']:
            # Fold the second-oldest equal-sized bucket into the oldest one.
            absorbed = bucket_list.pop(oldest + 1)
            bucket_list[oldest]['bitsum'] += absorbed['bitsum']
            bucket_list[oldest]['timestamp'] = absorbed['timestamp']
        
def DGIM():
    """Estimate the number of 1-bits in the current window with DGIM buckets.

    Reads the first line of the stream file, walks over the positions that
    belong to the current window (the last ``window_size`` of the first
    ``current_time`` bits) and maintains the module-level ``buckets`` list:
    every 1-bit creates a bucket of size 1 and ``Merge_buckets()`` then
    restores the bucket invariant.

    Returns:
        tuple: ``(estimate, elapsed_seconds)``.  ``estimate`` is the sum of all
        bucket sizes minus half of the oldest bucket, or ``0`` when the window
        contains no 1-bits.  ``elapsed_seconds`` includes reading the file.
    """
    start_time = time.time()
    with open('../input/coding2/stream_data.txt','r') as f:
        bits = f.readline().split()
    start_stamp = 0 if current_time <= window_size else current_time - window_size
    for i in range(min(window_size, current_time)):
        bit = bits[start_stamp + i]
        # Drop the oldest bucket once it lies entirely before the window.
        # Position `start_stamp` itself is still inside the window, hence `<`.
        if buckets and buckets[0]["timestamp"] < start_stamp:
            del buckets[0]
        if int(bit) == 1:
            buckets.append({"timestamp": start_stamp + i, "bitsum": 1})
            Merge_buckets()
    # Guard first: with no buckets there is no oldest bucket to halve.
    if not buckets:
        return 0, time.time() - start_time
    bitsum = sum(bucket["bitsum"] for bucket in buckets)
    # Only part of the oldest bucket may still be inside the window, so DGIM
    # counts half of it.
    bitsum -= buckets[0]['bitsum'] / 2
    return bitsum, time.time() - start_time

# Run the DGIM estimate once and report the estimated count with its elapsed wall-clock time.
bit_sum, DGIM_time = DGIM()
print("The number of 1-bits in the current window:",bit_sum," Using time: ",DGIM_time)

2. Write a function that accurately counts the number of 1-bits in the current window, and compare its running time and space usage with those of the DGIM algorithm.

In [None]:
import csv
import numpy as np

def Acc_count_bits():
    """Count the 1-bits in the current window exactly (brute force).

    Reads the first line of the stream file and inspects every position of
    the window, i.e. the last ``window_size`` of the first ``current_time``
    bits.

    Returns:
        tuple: ``(exact_count, elapsed_seconds)``; the elapsed time includes
        reading the file.
    """
    started = time.time()
    with open('../input/coding2/stream_data.txt','r') as f:
        tokens = f.readline().split()
        first = max(current_time - window_size, 0)
        span = min(window_size, current_time)
        # Index each position explicitly so a stream that is too short still
        # raises IndexError instead of being silently truncated.
        bitsum = sum(1 for offset in range(span) if int(tokens[first + offset]) == 1)
    return bitsum, time.time()-started

# Exact count for comparison; `bit_sum` and `DGIM_time` come from the DGIM cell executed earlier.
acc_bit_sum, acc_time = Acc_count_bits()
print("The number of 1-bits counted by DGIM in the current window:",bit_sum," Using time: ",DGIM_time)
print("The accurate number of 1-bits in the current window:",acc_bit_sum," Using time: ", acc_time)

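From the results we can see that the brute-force count takes less running time than DGIM on this window, but in a true streaming setting DGIM needs much less space: it keeps only a small number of buckets instead of storing every bit of the window.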

### Locality Sensitive Hashing

The locality sensitive hashing (LSH) algorithm is efficient for near-duplicate document detection. In this coding assignment, you are given *docs_for_lsh.csv*, where the documents have been processed into sets of k-shingles (k = 8, 9, 10). *docs_for_lsh.csv* contains 201 columns: column 'doc_id' holds the unique id of each document, and each of the columns '0' to '199' represents a unique shingle. If a document contains the shingle numbered **i**, then the corresponding row has value 1 in column **'i'**; otherwise the value is 0. You need to implement the LSH algorithm and answer the questions below.

### Your task

Use the MinHash algorithm to create a signature for each document, and find 'the most similar' documents under Jaccard similarity. 
Parameters you need to determine:
1) Length of the signature (number of distinct minhash functions) *n*. Recommended value: n > 20.

2) Number of bands *b* into which the signature matrix is divided. Recommended value: b > n // 10.

In [None]:
import csv
import numpy as np

# Load the file
# Read the shingle indicator table: one CSV row per document, first column is the doc id.
# newline='' is the csv-module recommended way to open files for csv.reader.
with open('../input/coding2/docs_for_lsh.csv','r', newline='') as f:
    data = csv.reader(f)
    # Skip the header explicitly instead of parsing it with int() and deleting it afterwards,
    # which only worked because the shingle column names happen to be numeric.
    next(data, None)
    # Drop the leading doc_id column; ignore completely blank lines.
    data_matrix = [list(map(int, row[1:])) for row in data if row]
# Transpose so that rows are shingles and columns are documents.
matrix = np.array(data_matrix).T
print("Loading the csv files...")
print(matrix.shape)
print(matrix)

In [None]:
import random

# Minhashing
def MinHash(data, b, r, seed=None):
    """Build a MinHash signature matrix from a shingle/document indicator matrix.

    For each of the ``n = b * r`` hash functions a random permutation assigns
    every row (shingle) a rank in ``1..rows``; the signature entry of a
    document is the smallest rank among the rows where the document has a 1.

    Args:
        data: 2-D array of shape ``(rows, docs)`` with 1 where a document
            contains a shingle.
        b: number of bands.
        r: number of signature rows per band.
        seed: optional seed for a private random generator, making the result
            reproducible.  With the default ``None`` the global ``random``
            module state is used, as before.

    Returns:
        numpy.ndarray of shape ``(b * r, docs)`` with integer ranks.  A
        document without any shingle gets the sentinel rank ``rows + 1`` in
        every row, so columns always stay aligned with document indices.
    """
    rng = random if seed is None else random.Random(seed)
    data = np.asarray(data)
    num_rows, num_docs = data.shape
    n = b * r
    # Rank that no real row can have; marks documents that contain no shingle.
    empty_rank = num_rows + 1
    has_shingle = (data == 1)
    signature = np.empty((n, num_docs), dtype=int)
    for h in range(n):
        # permutation[row] is the rank (1-based) of `row` under this hash function.
        permutation = list(range(1, num_rows + 1))
        rng.shuffle(permutation)
        ranks = np.array(permutation, dtype=int)[:, None]
        # Smallest rank among the rows each document contains; `initial`
        # also covers the degenerate case of a matrix with zero rows.
        signature[h] = np.where(has_shingle, ranks, empty_rank).min(axis=0, initial=empty_rank)
    return signature

# b bands of r rows each, so the signature length is n = b * r = 50.
b = 10
r = 5
# `matrix` is the shingle-by-document array built in the loading cell.
res_signature = MinHash(matrix,b,r)
print('MinHashing...')


Problem: For document 0 (the one with id '0'), list the ids of the 30 most similar documents (excluding document 0 itself). You can validate your results with the sklearn.metrics.jaccard_score() function.

Tips: You can adjust your parameters to hash the documents with similarity s > 0.8 into the same bucket.

In [None]:
import hashlib
from sklearn.metrics import jaccard_score

# LSH functions
def LSH(signature, b, r):
    """Hash every band of every document signature into candidate buckets.

    Args:
        signature: 2-D numpy array of shape ``(b * r, docs)`` (one column per
            document).
        b: number of bands.
        r: number of signature rows per band.

    Returns:
        dict mapping an MD5 hex digest to the list of document indices whose
        signature has identical values in one particular band.  The band
        index is part of the hashed text, so equal values that appear in
        *different* bands never land in the same bucket.
    """
    num_docs = signature.shape[1]
    table = {}
    for band_index in range(b):
        start = band_index * r
        for j in range(num_docs):
            band_text = str(signature[start:start+r, j])
            # Prefix with the band index so every band has its own bucket space.
            keyed_text = "{}:{}".format(band_index, band_text)
            digest = hashlib.md5(keyed_text.encode()).hexdigest()
            # Each document is visited once per band, so it cannot already be
            # present in this band's bucket.
            table.setdefault(digest, []).append(j)
    return table

print('LSH...')
# Bucket table built from the MinHash signatures with the same b and r used to create them.
LSH_table = LSH(res_signature,b,r)

In [None]:
# Nearest Neighbor Searching
def NNS(LSH_table, num):
    """Count how many LSH buckets each other document shares with ``num``.

    Args:
        LSH_table: dict mapping a bucket key to a list of document indices.
        num: index of the query document.

    Returns:
        dict mapping every document that shares at least one bucket with
        ``num`` to the number of shared buckets.  ``num`` itself is never
        included; the result is empty when it shares no bucket with anyone.
    """
    res = {}
    for members in LSH_table.values():
        # Buckets without the query document are irrelevant.  A bucket that
        # holds only the query document contributes nothing in the loop
        # below, so no separate size check is needed.
        if num not in members:
            continue
        for docnum in members:
            if docnum != num:
                res[docnum] = res.get(docnum, 0) + 1
    return res

result = NNS(LSH_table,0)
result = sorted(result.items(),key=lambda item:item[1])     # sort the result dictionary in an ascending order by the value of each key 

nearest_neighbor_num = 30
# Take the candidates with the most shared buckets first.  Slicing the reversed list
# simply yields fewer entries when there are fewer than `nearest_neighbor_num` candidates,
# instead of wrapping around to negative indices and eventually raising IndexError.
nearest_neighbor = result[::-1][:nearest_neighbor_num]
print('The nearest documents with times in the buckets are {}. '.format(nearest_neighbor))

# One row per document again, so a row can be compared directly with document 0.
check_data = matrix.T

LSH_neighbor = []
# Check the accuracy by calculating the jaccard score between the found documents and the document 0
for check_doc, shared_buckets in nearest_neighbor:
    score = jaccard_score(check_data[check_doc],check_data[0], pos_label=1, average = 'binary')
    print("doc {}'s score with doc 0 is : {}".format(check_doc,score))
    LSH_neighbor.append((check_doc,score))
print('The LSH results of nearest documents are {}'.format(LSH_neighbor))