In [None]:
# EE226 - Coding 2
## Streaming algorithm & Locality Sensitive Hashing

### Streaming: DGIM

DGIM is an efficient algorithm for processing large streams. When it is infeasible to store the entire binary stream, DGIM can estimate the number of 1-bits in the most recent window. In this coding assignment, you are given *stream_data.txt* (a binary stream), and you need to implement the DGIM algorithm to count the number of 1-bits. Write the code and answer the questions below.

### Your task

1. Set the window size to 1000, and count the number of 1-bits in the current window.

In [None]:
import time
from math import *
      
window_size = 1000      # length of the sliding window, in bits
time_location = 1000    # current time: how many stream bits have been consumed
n_max_bucket = 2        # maximum number of buckets allowed per bucket size

# Bucket sizes are the powers of two 1, 2, 4, ... up to 2**int(log2(window_size)).
list_1 = [2 ** exponent for exponent in range(int(log(window_size, 2)) + 1)]

# One (initially empty) list of bucket timestamps per bucket size.
container = {bucket_size: [] for bucket_size in list_1}

def DGIM(data,container,keylist,window_size,n_max_bucket,time_location):
    """Estimate the number of 1-bits in the most recent window with DGIM.

    Args:
        data: indexable sequence of bits; an element counts as a 1 only when
            it equals the string '1'.
        container: dict mapping every bucket size in ``keylist`` to a list of
            bucket timestamps, oldest first. It is updated in place, so pass
            emptied lists when an independent run is wanted.
        keylist: bucket sizes in ascending order, starting at 1 and doubling.
        window_size: window length in bits; timestamps are stored modulo it.
        n_max_bucket: number of buckets tolerated per size before the two
            oldest buckets of that size are merged.
        time_location: number of elements of ``data`` to consume from index 0.

    Returns:
        ``(estimate, elapsed_seconds)``: the sum of all bucket sizes with the
        oldest bucket counted at half its size, and the wall-clock time spent.

    Raises:
        IndexError: if ``data`` has fewer than ``time_location`` elements.
    """
    start_time = time.time()
    timestamp = 0   # arrival time of the current bit, kept modulo window_size

    for i in range(time_location):
        timestamp = (timestamp + 1) % window_size

        # Expire buckets: a stored stamp equal to the current one is exactly
        # window_size bits old. Filter in place rather than calling remove()
        # on a list that is being iterated.
        for stamps in container.values():
            if timestamp in stamps:
                stamps[:] = [stamp for stamp in stamps if stamp != timestamp]

        if data[i] == '1':
            container[1].append(timestamp)
            # Cascade merges upwards while a size holds too many buckets.
            for key in keylist:
                if len(container[key]) <= n_max_bucket:
                    break
                if key == keylist[-1]:
                    # No larger size exists to merge into: discard only the
                    # oldest bucket instead of emptying the whole level.
                    container[key].pop(0)
                else:
                    # Merge the two oldest buckets; the merged bucket keeps
                    # the more recent of the two timestamps.
                    container[key].pop(0)
                    container[key*2].append(container[key].pop(0))

    total = 0
    oldest_key = None   # size of the oldest surviving bucket (largest non-empty size)
    for key in keylist:
        if len(container[key]) > 0:
            oldest_key = key
        for tmpstamp in container[key]:
            print("Bucket's size: {}.Timestamp: {}".format(key,tmpstamp))
        total += key * len(container[key])

    # The oldest bucket may straddle the window edge, so count half of it.
    bit_num = total if oldest_key is None else total - 0.5*oldest_key

    end_time = time.time()
    return bit_num,end_time-start_time
    
# Read the tab-separated bit stream; the file is closed before the estimate runs.
with open('../input/coding2/stream_data.txt','r') as stream_file:
    data = stream_file.read().split('\t')

# Run DGIM on the loaded stream and report the estimate and its running time.
res, cost_time = DGIM(data, container, list_1, window_size, n_max_bucket, time_location)
print("1s in the last 1000 window_size of {} bits: {}".format(time_location, res))
print("Running time with DGIM algorithm:{}".format(cost_time))

2. Write a function that accurately counts the number of 1-bits in the current window, and compare its running time and space usage with those of the DGIM algorithm.

In [None]:
# Your code here, you can add cells if necessary
def BF(data,window_size,time_location):
    """Brute-force exact count of 1-bits in the current window.

    Args:
        data: indexable sequence of bits; an element counts as a 1 only when
            it equals the string '1'.
        window_size: window length in bits.
        time_location: current time; the window is the ``window_size``
            elements ending just before index ``time_location``.

    Returns:
        ``(count, elapsed_seconds)``.

    Raises:
        IndexError: if ``data`` has fewer than ``time_location`` elements.
    """
    start_time = time.time()

    # Clamp at 0: when fewer than window_size bits have arrived, a negative
    # start index would wrap around and count bits from the end of data.
    window_start = max(0, time_location - window_size)
    bit_num = sum(1 for i in range(window_start, time_location) if data[i] == '1')

    end_time = time.time()
    return bit_num, end_time-start_time
        
# Read the tab-separated bit stream; the file is closed before counting starts.
with open('../input/coding2/stream_data.txt','r') as stream_file:
    data = stream_file.read().split('\t')

# Exact count over the current window, for comparison with the DGIM estimate.
res, cost_time = BF(data, window_size, time_location)
print("1s in the last 1000 window_size of  {} bits: {}".format(time_location, res))
print("Running time with BF algorithm:{}".format(cost_time))

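**Answer**: In terms of ***running time***, the BF algorithm is faster than the DGIM algorithm here: BF does a single comparison per bit of the window, while DGIM also has to expire and merge buckets for every bit it processes.      
           In terms of ***space***, the BF algorithm costs more memory than the DGIM algorithm: BF needs every bit of the window to be available, whereas DGIM keeps only a few bucket timestamps per bucket size.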


### Locality Sensitive Hashing

The locality sensitive hashing (LSH) algorithm is efficient for near-duplicate document detection. In this coding assignment, you are given *docs_for_lsh.csv*, where the documents have been processed into sets of k-shingles (k = 8, 9, 10). *docs_for_lsh.csv* contains 201 columns, where column 'doc_id' holds the unique id of each document, and each of the columns '0' to '199' represents a unique shingle. If a document contains the shingle numbered **i**, then the corresponding row has value 1 in column **'i'**; otherwise it is 0. You need to implement the LSH algorithm and answer the questions below.

### Your task

Use the MinHash algorithm to create a signature for each document, and find 'the most similar' documents under Jaccard similarity. 
Parameters you need to determine:
1) Length of the signature (number of distinct MinHash functions) *n*. Recommended value: n > 20.

2) Number of bands that divide the signature matrix *b*. Recommended value: b > n // 10.

In [None]:
# Your code here, you can add cells if necessary

import numpy as np
import csv
import random

# Pre-process the CSV into a 2-D array of strings: one row per CSV data row,
# with the first column (doc_id, per the task description) removed.
# The header is skipped with next() rather than a counter named `time`, which
# would rebind the imported `time` module and break time.time() on later re-runs.
data = []
with open('../input/coding2/docs_for_lsh.csv', newline='') as f:
    csvmap = csv.reader(f)
    next(csvmap, None)              # skip the header row (no-op on an empty file)
    for row in csvmap:
        data.append(row[1:])        # drop the leading doc_id column

data = np.array(data)
print(data.T)
print('-------------------------------------------------------')

In [None]:
def MinHash(data, b, r):
    """Build a MinHash signature matrix for the columns of ``data``.

    Each of the ``b * r`` hash functions is a random permutation of the row
    indices, drawn with ``random.shuffle``. A column's hash value is the
    1-based rank of the first row, in permuted order, whose entry equals the
    string '1'.

    Args:
        data: 2-D array whose entries are compared against the string '1'.
            Every column is treated as one set; every row as one element.
        b: number of bands.
        r: number of rows per band.

    Returns:
        Array of shape ``(b * r, data.shape[1])``. A column with no '1' gets
        the sentinel ``data.shape[0] + 1`` in every row, so it keeps its
        position instead of being dropped and shifting later column indices.
    """
    n = b*r
    n_rows, n_cols = data.shape
    empty_value = n_rows + 1        # larger than any real rank
    signature = []

    for _ in range(n):
        permutation = list(range(1, n_rows + 1))
        random.shuffle(permutation)

        # Invert the permutation once: row_order[k] is the row ranked k+1.
        # This replaces a permutation.index() scan inside the inner loop.
        row_order = [0] * n_rows
        for row, rank in enumerate(permutation):
            row_order[rank - 1] = row

        single_signature = []
        for j in range(n_cols):
            value = empty_value
            for k, row in enumerate(row_order):
                if data[row][j] == '1':
                    value = k + 1
                    break
            single_signature.append(value)

        signature.append(single_signature)
    return np.array(signature)

b = 10      # number of bands
r = 5       # rows per band, so the signature length is n = b * r = 50

# Seed the RNG so the random permutations, and therefore the signatures and
# the LSH candidates derived from them, are the same on every run.
random.seed(0)

# NOTE(review): MinHash signs each COLUMN of its input, while `data` holds one
# row per CSV row (documents, per the task description). Confirm that passing
# `data` rather than `data.T` is the intended orientation.
res_signature = MinHash(data,b,r)
print(res_signature)
print('----------------------------------------')

In [None]:
import hashlib
from sklearn.metrics import jaccard_score
def LSH(signature, b, r):
    """Hash every band of every signature column into buckets.

    The signature matrix is cut into ``b`` consecutive bands of ``r`` rows.
    Two columns land in the same bucket only when they agree on all values of
    the SAME band: the band index is part of the hashed key, so equal values
    that occur in different bands do not collide.

    Args:
        signature: 2-D array of shape ``(signature_length, n_columns)``.
        b: number of bands.
        r: number of rows per band.

    Returns:
        dict mapping an MD5 hex digest (one per distinct band content) to the
        list of column indices that fell into that bucket, in ascending order.
    """
    docnum = signature.shape[1]
    buckets = {}

    for band in range(b):
        start = band * r            # first signature row of this band
        for j in range(docnum):
            band_values = ','.join(str(value) for value in signature[start:start+r, j])
            band_key = '{}:{}'.format(band, band_values)
            hashed_band = hashlib.md5(band_key.encode()).hexdigest()
            buckets.setdefault(hashed_band, []).append(j)

    return buckets

# Bucket every band of the signature matrix computed above.
LSH_table = LSH(signature=res_signature, b=b, r=r)
print('-------------------------------------------------------')

def NNS(LSH_table, num):
    """Count, for every other item, how many buckets it shares with ``num``.

    Args:
        LSH_table: dict mapping a bucket key to the list of items in that bucket.
        num: the query item.

    Returns:
        dict mapping each item that shares at least one bucket with ``num``
        to the number of shared buckets. ``num`` itself is never included.
        A table that consists of a single bucket is handled like any other.
    """
    res = {}
    for members in LSH_table.values():
        if num not in members:
            continue
        for docnum in members:
            if docnum != num:
                res[docnum] = res.get(docnum, 0) + 1
    return res

# Candidates for item 0, sorted by ascending number of shared buckets.
result = NNS(LSH_table,0)
result = sorted(result.items(),key=lambda item:item[1])

nearest_neighbor_num = 30
# Take the candidates with the most shared buckets first. Slicing stops at the
# end of the list, so fewer than nearest_neighbor_num candidates no longer
# yields negative indices (duplicate entries or an IndexError).
nearest_neighbor = result[::-1][:nearest_neighbor_num]
print('time: {}. '.format(nearest_neighbor))

check_data = data.T

# Score every candidate against item 0 with the exact Jaccard similarity.
LSH_neighbor = []
for check_doc, shared_buckets in nearest_neighbor:
    score = jaccard_score(check_data[check_doc],check_data[0], pos_label= '1', average = 'binary')
    print("Number:{}        Score(with doc 0): {}".format(check_doc,score))
    LSH_neighbor.append((check_doc,score))
print('result: {}'.format(LSH_neighbor))

Problem: For document 0 (the one with id '0'), list the **30** most similar document ids (excluding document 0 itself). You can validate your results with the [sklearn.metrics.jaccard_score()](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html) function.

Tips: You can adjust your parameters to hash the documents with similarity *s > 0.8* into the same bucket.

In [None]:
# Your code here, you can add cells if necessary
# Exact Jaccard similarity between item 0 and every other row of check_data.
true_res = {}
for i in range(1,data.shape[1]):
    score = jaccard_score(check_data[0],check_data[i],pos_label= '1', average = 'binary')
    true_res[i] = score
true_res = sorted(true_res.items(),key=lambda item:item[1])

# Highest scores first. Slicing stops at the end of the list, so having fewer
# than nearest_neighbor_num items cannot produce negative indices.
true_result = true_res[::-1][:nearest_neighbor_num]
print('results{}'.format(true_result))

# Match on the item id only, so the overlap does not depend on two separately
# computed float scores comparing exactly equal.
true_ids = {doc for doc, true_score in true_result}
same_neighbor = [pair for pair in LSH_neighbor if pair[0] in true_ids]
samecnt = len(same_neighbor)

print('The documents with score in both LSH result and brute force result are: {}'.format(same_neighbor))
print('The same documents number is {}, the accuracy is {}.'.format(samecnt,float(samecnt/nearest_neighbor_num)))