In [None]:
import math
import random
import collections
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
class TabulationHash:
    """Hash function for hashing by tabulation.

    The 32-bit key is split into four 8-bit parts. Each part indexes
    a separate table of 256 randomly generated 32-bit values. The four
    looked-up values are XORed together and reduced modulo ``num_buckets``.
    """

    def __init__(self, num_buckets):
        """Draw four fresh random lookup tables.

        num_buckets: size of the output range; every hash value lies in
        ``0 .. num_buckets - 1``.
        """
        self.tables = [
            [random.randint(0, 0xffffffff) for _ in range(256)]
            for _ in range(4)
        ]
        self.num_buckets = num_buckets

    def hash(self, key):
        """Return the bucket of an integer key.

        Only the low 32 bits of ``key`` are used (one byte per table); any
        higher bits are ignored.
        """
        t = self.tables
        h0 = key & 0xff
        h1 = (key >> 8) & 0xff
        h2 = (key >> 16) & 0xff
        h3 = (key >> 24) & 0xff
        return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets

    def binary_conv(self, num):
        """Return ``num`` as a string of binary digits, zero-padded on the left to 32 characters.

        Values below 1 give 32 zeros. Values that need more than 32 bits are
        returned at their full length, without truncation.
        """
        if num < 1:
            return '0' * 32
        # Pure integer formatting: the previous int(num / 2) loop went through
        # floating point and dropped low bits of large integers.
        return format(int(num), '032b')

    def converter(self, key, maximum_coordinate):
        '''Hash a 2-D integer coordinate.

        key: pair (x, y) of integer coordinates.
        maximum_coordinate: largest value the second coordinate can take.

        The pair is flattened to the single integer
        x * (maximum_coordinate + 1) + y, which is then passed to hash().
        For non-negative coordinates with y <= maximum_coordinate, distinct
        pairs give distinct integers. The flattened integer should fit in
        32 bits, because hash() ignores anything above that.
        '''
        x, y = key[0], key[1]
        # The radix is one more than the largest coordinate; using the largest
        # coordinate itself made (0, max) and (1, 0) share a key.
        int_num = x * (maximum_coordinate + 1) + y
        # Hash the integer itself. Re-parsing its binary digit string with
        # int() read the digits as decimal and produced an unrelated key.
        return self.hash(int_num)



            
    

In [None]:
# Smoke test: hash the grid cell (14, 15) into one of 36 buckets.
# The result depends on the randomly drawn tables, so it varies between runs.
table = TabulationHash(num_buckets=36)
table.converter(key=(14, 15), maximum_coordinate=16)

1

In [None]:
class Differential_Privacy_CMS():
    """Count-Mean-Sketch frequency estimator with client-side randomisation.

    Each data element is turned into a randomised +/-1 vector on the "client"
    (CLient_Side), the vectors are accumulated into a depth x width sketch
    matrix (Compute_Sketch_Matrix), and per-element frequencies are read back
    from that matrix (Server_Side). Count_Mean_Sketch chains the three steps.
    """

    def __init__(self, width, depth, epsilon, max_coor):
        ''' Method to initialize the data structure
        @param width int: Width of the table (hash values range over 0..width-1)
        @param depth int: Depth of the table (number of hash functions)
        @param epsilon: privacy parameter, must be positive
        @param max_coor: coordinate bound forwarded to the hash functions
        @raises ValueError: if width < 2, depth < 1 or epsilon <= 0
        '''
        # Reject configurations that can only fail later with a division by
        # zero: Server_Side divides by (width - 1) and by depth, and
        # Compute_Sketch_Matrix divides by (exp(epsilon / 2) - 1).
        if width < 2:
            raise ValueError("width must be at least 2")
        if depth < 1:
            raise ValueError("depth must be at least 1")
        if epsilon <= 0:
            raise ValueError("epsilon must be positive")
        self.width = width
        self.depth = depth
        self.epsilon = epsilon
        self.max_coor = max_coor
        # One independent hash function per sketch row.
        self.tables = [TabulationHash(width) for _ in range(depth)]

    def CLient_Side(self, d):
        '''Randomise a single data element.

        A row index j is drawn uniformly from 0..depth-1. A vector of length
        width is filled with -1 and set to +1 at the position the j-th hash
        function assigns to d. Every entry is then multiplied by an
        independent random sign that is +1 with probability
        e^(epsilon/2) / (e^(epsilon/2) + 1) and -1 otherwise.

        @param d: data element, an (x, y) integer pair
        @return: tuple (vector of shape (1, width), j)
        '''
        j = np.random.randint(self.depth)
        v = np.full((1, self.width), -1.0)
        v[0, self.tables[j].converter(d, self.max_coor)] = 1
        val = np.exp(self.epsilon / 2)
        probability_of_1 = val / (val + 1)
        probability_of_neg_1 = 1 / (val + 1)
        b = np.random.choice([1, -1], self.width,
                             p=[probability_of_1, probability_of_neg_1])
        return (v * b, j)

    # Correctly spelled alias; the original name stays for existing callers.
    Client_Side = CLient_Side

    def Compute_Sketch_Matrix(self, D):
        '''Accumulate client reports into the depth x width sketch matrix.

        With c = (e^(epsilon/2) + 1) / (e^(epsilon/2) - 1), each report
        vector v is rescaled to depth * (c/2 * v + 1/2) and added to row j
        of the matrix.

        @param D: iterable of (v, j) tuples as returned by CLient_Side
        @return: numpy array of shape (depth, width)
        '''
        val = np.exp(self.epsilon / 2)
        c_epsilon = (val + 1) / (val - 1)
        M = np.zeros((self.depth, self.width))
        for vector, row in D:
            flat = np.asarray(vector).reshape(self.width)
            # Whole-row update; replaces the per-element Python loop and the
            # n x width intermediate matrix.
            M[row] += self.depth * ((c_epsilon / 2) * flat + 0.5)
        return M

    def Server_Side(self, Sketch_Matrix, d, length):
        '''Return the estimated frequency of one data element.

        Computes width/(width-1) * (mean over rows i of
        Sketch_Matrix[i, h_i(d)] - length/width), where h_i is the i-th
        hash function.

        @param Sketch_Matrix: array from Compute_Sketch_Matrix
        @param d: data element to query
        @param length: number of elements in the data stream
        '''
        row_sum = sum(
            Sketch_Matrix[i, self.tables[i].converter(d, self.max_coor)]
            for i in range(self.depth)
        )
        avg_row_sum = row_sum / self.depth
        return (self.width / (self.width - 1)) * (avg_row_sum - length / self.width)

    def Count_Mean_Sketch(self, D_s, D):
        '''Estimate the frequency of every element of D in the stream D_s.

        @param D_s: the data stream (a sized iterable of data elements)
        @param D: the elements to query
        @return: dict mapping each element of D to its estimated frequency
        '''
        length = len(D_s)
        reports = [self.CLient_Side(item) for item in D_s]
        sketch = self.Compute_Sketch_Matrix(reports)
        return {d: self.Server_Side(sketch, d, length) for d in D}

In [None]:
# Hard-coded sample stream of (x, y) integer grid cells. It is passed to
# frequency_counter_on_average below as the data stream whose per-cell
# frequencies are estimated; cells repeat, and path_list.count(cell) is the
# true frequency the estimates are compared against.
path_list = [(15, 9), (16, 9), (17, 9), (18, 9), (19, 9), (19, 10), (19, 9), (18, 9), (18, 8), (18, 7), (18, 8), (18, 9), (19, 9), (19, 8), (19, 9), (19, 8), (18, 8), (18, 7), (18, 6), (18, 5), (17, 5), (18, 5), (17, 5), (16, 5), (17, 5), (16, 5), (16, 4), (16, 3), (15, 3), (15, 2), (14, 2), (14, 1), (13, 1), (12, 1), (13, 1), (13, 0), (12, 0), (11, 0), (12, 0), (11, 0), (10, 0), (9, 0), (8, 0), (7, 0), (7, 1), (7, 0), (6, 0), (6, 1), (5, 1), (5, 0), (5, 1), (6, 1), (6, 0), (5, 0), (4, 0), (3, 0), (2, 0), (2, 1), (1, 1), (1, 0), (2, 0), (2, 1), (2, 2), (2, 1), (3, 1), (4, 1), (4, 2), (4, 3), (3, 3), (4, 3), (5, 3), (6, 3), (6, 2), (6, 1), (7, 1), (8, 1), (7, 1), (8, 1), (8, 2), (8, 1), (7, 1), (7, 2), (6, 2), (6, 1), (5, 1), (4, 1), (3, 1), (4, 1), (3, 1), (4, 1), (4, 2), (3, 2), (3, 3), (2, 3), (2, 2), (1, 2), (1, 1), (0, 1), (1, 1), (1, 2), (2, 2), (2, 1), (2, 0), (3, 0), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (3, 5), (3, 6), (3, 7), (3, 6), (2, 6), (2, 5), (1, 5), (0, 5), (1, 5), (1, 4), (1, 3), (2, 3), (2, 2), (1, 2), (2, 2), (2, 1), (2, 2), (1, 2), (2, 2), (2, 3), (1, 3), (1, 4), (1, 5), (0, 5), (0, 6), (0, 5), (0, 6), (0, 5), (1, 5), (1, 6), (0, 6), (1, 6), (0, 6), (1, 6), (1, 7), (2, 7), (1, 7), (0, 7), (1, 7), (2, 7), (1, 7), (1, 6), (0, 6), (0, 5), (0, 6), (0, 7), (0, 6), (1, 6), (1, 5), (1, 6), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (0, 10), (1, 10), (1, 9), (0, 9), (1, 9), (1, 10), (2, 10), (1, 10), (1, 9), (1, 8), (1, 7), (1, 6), (1, 5), (2, 5), (1, 5), (0, 5), (0, 4), (0, 3), (1, 3), (0, 3), (0, 4), (1, 4), (1, 3), (0, 3), (1, 3), (1, 4), (1, 3), (1, 2), (2, 2), (3, 2), (4, 2), (4, 1), (5, 1), (5, 2)]

In [None]:
def frequency_counter_on_average(path_list, epsilon, run_number,
                                 grid_size=20, width=100, depth=100):
    '''Run the whole sketch pipeline several times and average the estimates.

    Every run builds a fresh Differential_Privacy_CMS (new random hash
    functions and new client-side noise), estimates the frequency of every
    cell of the grid_size x grid_size grid, and folds the result into a
    running mean to reduce the estimation error.

    path_list: data stream of (x, y) tuples.
    epsilon: privacy parameter handed to Differential_Privacy_CMS.
    run_number: number of independent runs to average over.
    grid_size: side of the square grid of candidate cells (coordinates
        0..grid_size-1).
    width, depth: sketch dimensions handed to Differential_Privacy_CMS.

    Returns (freq_counter_cum, an_list): the averaged estimate for every
    grid cell, and a list of (cell, estimate, true count) for the cells that
    occur in path_list and whose averaged estimate is at least 1.
    '''
    u_list = [(x, y) for x in range(grid_size) for y in range(grid_size)]
    # One pass over the stream instead of an O(n) membership test plus an
    # O(n) count() for every grid cell.
    true_counts = Counter(path_list)
    freq_counter_cum = {d: 0 for d in u_list}
    for i in range(run_number):
        # The coordinate bound given to the hash must be strictly greater than
        # every coordinate so that distinct cells never share a hash key;
        # the grid side satisfies that, the largest coordinate does not.
        class_instance = Differential_Privacy_CMS(width, depth, epsilon, grid_size)
        freq_counter_new = class_instance.Count_Mean_Sketch(path_list, u_list)
        for d in freq_counter_new:
            # Running mean over the i + 1 runs seen so far.
            freq_counter_cum[d] = (i * freq_counter_cum[d] + freq_counter_new[d]) / (i + 1)
    an_list = [
        (d, estimate, true_counts[d])
        for d, estimate in freq_counter_cum.items()
        if estimate >= 1 and d in true_counts
    ]
    return freq_counter_cum, an_list
# Bind the results at notebook scope: inside the function both names are
# locals, so without this assignment later cells cannot refer to them.
freq_counter_cum, an_list = frequency_counter_on_average(path_list,5,400)
freq_counter_cum, an_list

({(0, 0): 0.0974001399971508,
  (0, 1): 1.320904565864386,
  (0, 2): 0.11526151847696493,
  (0, 3): 2.779583808382505,
  (0, 4): 1.6334786892611286,
  (0, 5): 6.345905711518641,
  (0, 6): 7.1288294682171385,
  (0, 7): 2.0502441871234445,
  (0, 8): 0.16586875750310348,
  (0, 9): 0.5171425342727733,
  (0, 10): 0.5796573589521214,
  (0, 11): -0.042513991428056536,
  (0, 12): 0.22838358218245147,
  (0, 13): 0.040839108144407374,
  (0, 14): 0.17479944674300996,
  (0, 15): -0.5128636247298164,
  (0, 16): -0.1526591587202407,
  (0, 17): -0.16158984796014775,
  (0, 18): 0.18373013598291732,
  (0, 19): 0.9249773428951855,
  (1, 0): 0.9249773428951855,
  (1, 1): 2.833167943821946,
  (1, 2): 4.887226469000525,
  (1, 3): 6.092869516387946,
  (1, 4): 3.7649365211855668,
  (1, 5): 8.021898392207822,
  (1, 6): 8.069528734820654,
  (1, 7): 6.092869516387943,
  (1, 8): 2.2943496930142366,
  (1, 9): 3.964388580876817,
  (1, 10): 4.110256505128627,
  (1, 11): 0.4665352952466337,
  (1, 12): -0.12884398741

In [None]:
# NOTE(review): `freq_counter_cum` is created only as a local variable inside
# frequency_counter_on_average. This cell works only if the function's return
# value has been bound to that name at notebook scope beforehand; otherwise it
# raises NameError.
freq_counter_cum

NameError: ignored