# EE226 - Coding 2
## Streaming algorithm & Locality Sensitive Hashing

### Streaming: DGIM

DGIM is an efficient algorithm for processing large streams. When it is infeasible to store the whole incoming binary stream, DGIM can estimate the number of 1-bits in the current window. In this coding assignment, you are given *stream_data.txt* (a binary stream), and you need to implement the DGIM algorithm to count the number of 1-bits. Write code and answer the questions below.

### Your task

1. Set the window size to 1000, and count the number of 1-bits in the current window.

## Import libraries

In [None]:
import numpy as np
from collections import deque
import time
from copy import deepcopy

## Look up data

In [None]:
# Read the tab-separated bit stream and convert every field to an integer.
file = "../input/coding2/stream_data.txt"
with open(file, "r") as f:    # the context manager closes the file even if read() raises
    org_data = f.read()
print(f"type of data: {type(org_data)}")
# Skip empty or whitespace-only fields (e.g. a trailing tab or newline) so int() cannot fail on them.
org_data = [int(d) for d in org_data.split('\t') if d.strip()]

In [None]:
# Display how many bits were parsed from the stream file.
len(org_data)

## Define DGIM

In [None]:
class DGIM:
    """Approximate counter of 1-bits in a sliding window (DGIM-style buckets).

    Buckets are grouped by level: a bucket stored under level ``k`` stands for
    ``2**k`` 1-bits.  Every level is a deque of timestamps ordered from newest
    (left) to oldest (right).  Timestamps are kept modulo ``window_length``.

    When a level overflows, its two oldest buckets are merged into one bucket
    of the next level, and the merged bucket keeps the timestamp of the OLDER
    of the two.  A stored timestamp therefore marks the oldest 1-bit of its
    bucket, and a bucket is dropped as soon as that bit leaves the window.

    Args:
        window_length: Number of most recent bits the window covers (>= 1).
        bucket_size: Number of buckets a level may hold after merging
            (1 <= bucket_size <= window_length).

    Raises:
        ValueError: If the arguments are outside the ranges above.
    """

    def __init__(self, window_length=1000, bucket_size=2):
        if window_length < 1:
            raise ValueError("window_length must be at least 1")
        if bucket_size < 1 or bucket_size > window_length:
            raise ValueError("bucket_size must satisfy 1 <= bucket_size <= window_length")

        self.window_length = window_length
        self.bucket_size = bucket_size    # a level holds at most this many buckets once merged
        # Number of levels, as a plain int so that level keys and counts stay
        # integers instead of NumPy floats.
        self.key_number = int(np.floor(np.log2(self.window_length / self.bucket_size))) + 1
        self.buckets_keys = list(range(self.key_number))          # levels, smallest first
        self.buckets_keys_reversed = self.buckets_keys[::-1]      # levels, largest first

        self.reset()

    def reset(self):
        """Drop every bucket and rewind the timestamp to "nothing seen yet"."""
        # One extra slot per level so a level can briefly hold bucket_size + 1
        # buckets before append() merges the two oldest.
        self.buckets = {
            key: deque(maxlen=self.bucket_size + 1) for key in self.buckets_keys
        }
        # The top level has nowhere to merge into: it is capped at bucket_size
        # and appendleft() on a full deque silently discards its oldest entry.
        self.buckets[self.key_number - 1] = deque(maxlen=self.bucket_size)

        self.current_stamp = -1    # the first streamed bit gets timestamp 0

    def stream(self, bit):
        """Consume the next bit of the stream.

        Args:
            bit: The incoming bit; only a value equal to 1 creates a bucket.
        """
        self.current_stamp = (self.current_stamp + 1) % self.window_length

        # Only the oldest bucket can expire; it is the right-most entry of the
        # highest non-empty level.  Its stamp equals the current stamp exactly
        # when it was recorded window_length steps ago.
        for key in self.buckets_keys_reversed:
            level = self.buckets[key]
            if level:
                if level[-1] == self.current_stamp:
                    level.pop()
                break

        if bit == 1:
            self.append(self.current_stamp)

    def append(self, stamp):
        """Record a 1-bit seen at ``stamp`` and cascade merges upwards.

        Args:
            stamp: Timestamp (already reduced modulo ``window_length``).
        """
        self.buckets[0].appendleft(stamp)
        for key in self.buckets_keys:
            level = self.buckets[key]
            overflowing = len(level) == self.bucket_size + 1
            if not (overflowing and key + 1 < self.key_number):
                break
            # Merge the two oldest buckets: the older stamp moves up one level,
            # the younger one is discarded.
            self.buckets[key + 1].appendleft(level.pop())
            level.pop()

    def count(self, window_length=None):
        """Estimate the number of 1-bits among the last ``window_length`` bits.

        Buckets are visited from newest to oldest.  Each bucket whose age is
        below ``window_length`` contributes its full size; the first bucket
        that is older contributes half its size and ends the scan.  The result
        is capped at ``window_length``.

        Args:
            window_length: Size of the queried window; defaults to the window
                length given at construction.

        Returns:
            The estimate: an int, or a float when a half bucket was added.
        """
        if window_length is None:
            window_length = self.window_length

        cnt = 0
        for key in self.buckets_keys:
            for stamp in self.buckets[key]:
                # Age of the bucket; the modulo handles a wrapped current stamp.
                index = (self.current_stamp - stamp + self.window_length) % self.window_length
                if index < window_length:
                    cnt += 2**key
                else:
                    cnt += 2**key * 0.5    # estimate for the bucket beyond the queried window
                    return min(cnt, window_length)
        return min(cnt, window_length)

## Instantiate DGIM

In [None]:
# DGIM counter over a 1000-bit window, constructed with bucket_size = 2.
window_length = 1000
bucket_size = 2
DGIM_handler = DGIM(window_length=window_length, bucket_size=bucket_size)

## Test DGIM

In [None]:
# Sanity check: stream n consecutive 1-bits, then compare DGIM's count with n.
n = 560
for _ in range(n):
    DGIM_handler.stream(1)
DGIM_cnt = DGIM_handler.count()
print(f"number of bit 1 actually: {n}, number of bit 1 DGIM counts: {DGIM_cnt}")

In [None]:
# Discard the buckets and timestamp accumulated so far before streaming the real data.
DGIM_handler.reset()

In [None]:
# Stream the recorded bits through DGIM and print the estimate every time the
# timestamp reaches the end of a window.
# The list is iterated directly: popping from the front of a list is O(n) per
# bit and would dominate the time being measured here.
start_time = time.perf_counter()

for bit in org_data:
    DGIM_handler.stream(bit)
    if DGIM_handler.current_stamp == window_length - 1:
        DGIM_cnt = DGIM_handler.count()
        print(f"number of bit 1 DGIM counts: {DGIM_cnt}")

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"total time DGIM spends: {total_time}")

2. Write a function that accurately counts the number of 1-bits in the current window, and compare its running time and memory usage with those of the DGIM algorithm.

In [None]:
# State of the exact sliding-window counter.
window_size = 1000
current_stamp = -1                    # no bit consumed yet
windows = deque([], window_size)      # keeps only the latest `window_size` bits

In [None]:
# Exact counter: keep the last `window_size` bits in the bounded deque and sum
# them each time a full window has been consumed.
# The list is iterated directly: popping from the front of a list is O(n) per
# bit and would dominate the time being measured here.
start_time = time.perf_counter()

for bit in org_data:
    windows.appendleft(bit)    # the deque's maxlen drops the oldest bit automatically
    current_stamp = (current_stamp + 1) % window_size
    if current_stamp == window_size - 1:
        NORMAL_cnt = sum(windows)
        print(f"number of bit 1 normal counter counts: {NORMAL_cnt}")

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"total time normal counter spends: {total_time}")

## Increase bucket size

In [None]:
# Same 1000-bit window, but constructed with bucket_size = 10 this time.
window_length = 1000
bucket_size = 10
DGIM_handler = DGIM(window_length=window_length, bucket_size=bucket_size)

In [None]:
# Stream the recorded bits through the reconfigured DGIM counter and print the
# estimate every time the timestamp reaches the end of a window.
# The list is iterated directly: popping from the front of a list is O(n) per
# bit and would dominate the time being measured here.
start_time = time.perf_counter()

for bit in org_data:
    DGIM_handler.stream(bit)
    if DGIM_handler.current_stamp == window_length - 1:
        DGIM_cnt = DGIM_handler.count()
        print(f"number of bit 1 DGIM counts: {DGIM_cnt}")

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"total time DGIM spends: {total_time}")

可以看到，增加 bucket_size 后，误差确实变小了很多！

### Locality Sensitive Hashing

The locality sensitive hashing (LSH) algorithm is efficient for near-duplicate document detection. In this coding assignment, you are given *docs_for_lsh.csv*, where the documents have been processed into sets of k-shingles (k = 8, 9, 10). *docs_for_lsh.csv* contains 201 columns, where column 'doc_id' represents the unique id of each document, and from column '0' to column '199', each column represents a unique shingle. If a document contains the shingle with index **i**, then the corresponding row has value 1 in column **'i'**; otherwise it is 0. You need to implement the LSH algorithm and answer the questions below.

### Your task

Use the minhash algorithm to create a signature for each document, and find 'the most similar' documents under Jaccard similarity. 
Parameters you need to determine:
1) Length of signature (number of distinct minhash functions) *n*. Recommended value: n > 20.

2) Number of bands that divide the signature matrix *b*. Recommended value: b > n // 10.

## Import libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the LSH input table from CSV into a DataFrame.
file = "../input/coding2/docs_for_lsh.csv"
data = pd.read_csv(file)
data.head()    # preview the first rows

In [None]:
# Remove the 'doc_id' column and show the table shape before and after.
# print(data.columns)
print(f"Before delete: {data.shape}")
data = data.drop('doc_id', axis=1)      # NOTE: drop() is not in-place; axis=1 means columns, axis=0 means rows
print(f"After delete: {data.shape}")

In [None]:
# The min_hashing function
def minHashing(shingles,  signature_number):
    """Build a MinHash signature matrix for the columns of ``shingles``.

    Each signature row uses one random hash
    ``h(r) = ((a * r + b) % p) % shingle_number`` of the row positions ``r``;
    the signature of a column is the smallest hash among the rows where that
    column is non-zero.  Progress is printed while the rows are processed.

    NOTE(review): the task text describes rows as documents and columns as
    shingles, whereas this function treats every COLUMN as a file and every
    row as a shingle - confirm the orientation of the table passed in.

    Args:
        shingles: Table of shape (shingle_number, file_number) whose columns
            are addressed by the labels '0', '1', ..., as ``shingles[str(i)]``.
        signature_number: Number of hash functions, i.e. signature rows.

    Returns:
        Float array of shape (signature_number, file_number).  A column with
        no non-zero entry keeps the sentinel value ``shingle_number``.
    """
    shingle_number, file_number = shingles.shape
    # Sentinel: larger than any hash value (hashes lie in [0, shingle_number)).
    signatures = np.full((signature_number, file_number), float(shingle_number))
    hash_p = 10000019    # a prime number

    # The row positions of the non-zero entries do not depend on the hash
    # function, so they are extracted once instead of once per signature.
    member_rows = [np.where(shingles[str(file)])[0] for file in range(file_number)]

    for signature in range(signature_number):
        # a starts at 1: a == 0 would map every row to the same hash value.
        hash_a = np.random.randint(1, 1000)
        hash_b = np.random.randint(1000)
        print(f"Process signature: {signature}\t", end="\t")
        if signature % 5 == 4:
            print(f"\n", end='')
        for file, indexes in enumerate(member_rows):
            if indexes.size == 0:
                continue    # empty column: keep the sentinel, min() of nothing is undefined
            hash_indexes = ((hash_a * indexes + hash_b) % hash_p) % shingle_number
            signatures[signature, file] = hash_indexes.min()

    return signatures

In [None]:
# The LSH hashing for bands
def LSHHash(signatures, row_number):
    """Hash every band of a signature matrix to one number per file.

    The signature rows are split into consecutive bands of ``row_number``
    rows; leftover rows at the end that do not fill a band are ignored.  The
    hash of a band is the truncated mean of the running sums of its squared
    entries, so identical bands always receive identical hashes.

    Args:
        signatures: Array of shape (signature_number, file_number).
        row_number: Number of signature rows per band (>= 1).

    Returns:
        Float array of shape (signature_number // row_number, file_number)
        holding integer-valued band hashes.

    Raises:
        ValueError: If ``row_number`` is smaller than 1.
    """
    if row_number < 1:
        raise ValueError("row_number must be a positive integer")

    signatures = np.asarray(signatures, dtype=float)
    signature_number, file_number = signatures.shape
    band_number = signature_number // row_number

    # View the used rows as (band, row-within-band, file) so that all bands of
    # all files are hashed in one vectorised pass.
    banded = signatures[:band_number * row_number].reshape(
        band_number, row_number, file_number
    )
    band_scores = np.cumsum(banded ** 2, axis=1).mean(axis=1)
    return np.trunc(band_scores)    # truncation mirrors int() on the per-band mean

In [None]:
def get_Jaccard_similarity(fileA, fileB):
    """Return the Jaccard similarity of two indicator vectors.

    Any non-zero entry counts as "present", so the result is
    ``|A and B| / |A or B|`` over the positions of the two vectors.

    Args:
        fileA: Indicator vector of the first file (array-like).
        fileB: Indicator vector of the second file, same length as ``fileA``.

    Returns:
        The similarity as a float in [0, 1]; 0.0 when both vectors are empty
        (no non-zero entry), which avoids a 0/0 division.
    """
    presentA = np.asarray(fileA).astype(bool)
    presentB = np.asarray(fileB).astype(bool)

    AorB = np.count_nonzero(presentA | presentB)
    if AorB == 0:
        return 0.0
    AandB = np.count_nonzero(presentA & presentB)
    return AandB / AorB

In [None]:
# Build the MinHash signatures with 200 hash functions and time the call.
signature_number = 200
start_time = time.time()
signatures = minHashing(data,  signature_number)
end_time = time.time()
print(f"total time: {end_time - start_time}")

In [None]:
# Display the signature matrix returned by minHashing.
signatures

In [None]:
# Hash the signature matrix band by band (5 rows per band) and time the call.
row_number = 5    # band_number = signature_number // row_number
start_time = time.time()
LSH_hash =  LSHHash(signatures, row_number)
end_time = time.time()
print(f"total time: {end_time - start_time}")

In [None]:
# Display the band-hash matrix returned by LSHHash.
LSH_hash

Problem: For document 0 (the one with id '0'), list the ids of the **30** most similar documents (excluding document 0 itself). You can validate your results with the [sklearn.metrics.jaccard_score()](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html) function.

Tips: You can adjust your parameters to hash the documents with similarity *s > 0.8* into the same bucket.

In [None]:
# Candidate selection: file j (j >= 1) is a candidate when at least one of its
# band hashes equals the hash of file 0 in the same band.  Broadcasting the
# first column against the remaining ones performs all comparisons at once.
similar_bools = (LSH_hash[:, 1:] == LSH_hash[:, [0]]).any(axis=0)
similar_numbers = similar_bools.sum()
print(f"similar numbers: {similar_numbers}")
# Position k of the mask refers to file k + 1 because file 0 was sliced off.
similar_file_indexes = np.flatnonzero(similar_bools) + 1
print(f"similar files: {similar_file_indexes}")

In [None]:
from sklearn.metrics import jaccard_score

In [None]:
# Exact Jaccard similarity between file 0 and every LSH candidate, computed on
# the original indicator columns.
fileA_index = 0
fileA = data[str(fileA_index)]
# for fileB_index in data.columns:
for fileB_index in similar_file_indexes:
    fileB = data[str(fileB_index)]
    Jaccard_similarity_A_B = get_Jaccard_similarity(fileA, fileB)
    # Jaccard_similarity_A_B = jaccard_score(fileA, fileB)    # very slow; the hand-written version is used instead
    print(f"Jaccard similarity for file {fileA_index} and file {fileB_index}: {round(Jaccard_similarity_A_B, 3)}")

In [None]:
# Exact Jaccard similarity between file 0 and every column of the table.
# NOTE: data.columns also contains file 0's own column, so the loop includes
# the comparison of file 0 with itself.
fileA_index = 0
fileA = data[str(fileA_index)]
for fileB_index in data.columns:
# for fileB_index in similar_file_indexes:
    fileB = data[str(fileB_index)]
    Jaccard_similarity_A_B = get_Jaccard_similarity(fileA, fileB)
    # Jaccard_similarity_A_B = jaccard_score(fileA, fileB)    # very slow; the hand-written version is used instead
    print(f"Jaccard similarity for file {fileA_index} and file {fileB_index}: {round(Jaccard_similarity_A_B, 3)}")

In [None]:
# Similarity estimated from the signatures: the fraction of signature rows in
# which file 0 and a candidate hold the same value (printed under the same
# "Jaccard similarity" label as the exact value).
fileA_index = 0
fileA = signatures[:, fileA_index]
# for fileB_index in data.columns:
for fileB_index in similar_file_indexes:
    fileB = signatures[:, fileB_index]
    Jaccard_similarity_A_B = (fileA == fileB).sum() / len(fileA)
    print(f"Jaccard similarity for file {fileA_index} and file {fileB_index}: {round(Jaccard_similarity_A_B, 3)}")

In [None]:
# Signature-based similarity estimate between file 0 and every other file
# (columns 1 .. last of the signature matrix).
fileA_index = 0
fileA = signatures[:, fileA_index]
for fileB_index in range(1, signatures.shape[1]):
# for fileB_index in similar_file_indexes:
    fileB = signatures[:, fileB_index]
    # Fraction of signature rows on which the two files agree.
    Jaccard_similarity_A_B = (fileA == fileB).sum() / len(fileA)
    print(f"Jaccard similarity for file {fileA_index} and file {fileB_index}: {round(Jaccard_similarity_A_B, 3)}")