# Chapter 10 - Private Entity Resolution

In [35]:
import requests
import private_set_intersection.python as psi
from pandas import read_csv

In [36]:
# Base URL of the local server; endpoint names are appended directly to it.
url = "http://localhost:5000/"

## Client Setup

In [None]:
# Load the cleaned records and build one matching key per row:
# company name and postcode joined by a single space.
df_m = read_csv('mari_clean.csv')
client_items = (df_m['CompanyName']+' '+df_m['Postcode']).to_list()

# Create the PSI client with a freshly generated key.
# NOTE(review): the positional True is presumably the "reveal intersection"
# flag — confirm against the library documentation.
c = psi.client.CreateWithNewKey(True)

# Build the request once and reuse it for both the serialized payload sent to
# the server and the cell's displayed output (previously it was built twice).
request_proto = c.CreateRequest(client_items)
psirequest = request_proto.SerializeToString()
request_proto

### Get Server-encrypted Client Values

In [None]:
# Send the serialized client request to the server's match endpoint.
response = requests.post(url+'match', headers={'Content-Type': 'application/protobuf'}, data=psirequest)
# Fail fast on an HTTP error so an error body is never parsed as a protobuf message.
response.raise_for_status()

# Decode the reply body into a PSI response message and display it.
psiresponse = psi.Response()
psiresponse.ParseFromString(response.content)
psiresponse

### Get Server Setup - Raw

In [39]:
# Fetch the server setup message from the 'rawsetup' endpoint.
setupresponse = requests.get(url+'rawsetup')
# Fail fast on an HTTP error so an error body is never parsed as a protobuf message.
setupresponse.raise_for_status()

# Decode the reply body into a ServerSetup message and display it.
rawsetup = psi.ServerSetup()
rawsetup.ParseFromString(setupresponse.content)
rawsetup

raw {
  encrypted_elements: "\003\001v\262\314I~\252\2021.53}\236\232\351+\267\323\220\006\362\005~,~\266u\303\333\315$"
  encrypted_elements: "\003\266\000\301ov\007\345Ao?\0331x\022$\2115e\322.F\237N\007\353\200\037\016\302\235vg"
  encrypted_elements: "\003\331\316\347\240\314r\250\210\270B7\244D\023k*\025O\274p\374\325\2428\3555-\212\306\244D\373"
}

## Bloom Decode

In [40]:
# Fetch the server setup message from the 'bloomsetup' endpoint.
setupresponse = requests.get(url+'bloomsetup')
# Fail fast on an HTTP error so an error body is never parsed as a protobuf message.
setupresponse.raise_for_status()

# Decode the reply body into a ServerSetup message and display it.
bloomsetup = psi.ServerSetup()
bloomsetup.ParseFromString(setupresponse.content)
bloomsetup

bloom_filter {
  num_hash_functions: 14
  bits: "\020\000\000\000\000\000\000\000\000\010\000\000\000\000\000\000\000\000\000\000\000\000\000\010\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\200\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\020\000\000\000\000\000@ \000\000\001\000\000\001\000\000\000\000\200\004\000\000\000\000\000\020\000@\000\000\000B\000\000\000\000 \000\001\000\000\000\000\000\004\020\000\000\000\000\020\000\000\000(\000@\000\000\000\000@\000\005\000\000\000\000\000\004\000\000\002\000\000\020\000\000\000\000\000A\000\000\000\004\000\000\001\200\000\000\000\000\000\000\000\000@\000\000\000\000\000\000\000\000 @\000\000\000\000\000\000\200"
}

### Server Calculation

In [41]:
from math import ceil, log, log2

# Overall false-positive rate and the number of client inputs it is spread over.
fpr = 0.01
num_client_inputs = 100
correctedfpr = fpr / num_client_inputs

# The filter is sized for whichever side holds more elements.
len_server_items = 2
max_elements = max(num_client_inputs, len_server_items)

# Size in bits before rounding, then rounded up to a whole number of bytes.
bits_exact = -max_elements * log2(correctedfpr) / log(2)
num_bits = ceil(bits_exact / 8) * 8
num_bits

1920

In [42]:
from hashlib import sha256

# Alternative sizing, read from the server's filter instead of computed locally:
#num_bits = len(bloomsetup.bloom_filter.bits)*8
filterlist = ['0'] * num_bits
hash_count = bloomsetup.bloom_filter.num_hash_functions

for element in rawsetup.raw.encrypted_elements:
    # Two base hashes: SHA-256 of the element prefixed with the byte '1' and
    # with the byte '2', each reduced modulo the filter size.
    h1 = int(sha256(b'1' + element).hexdigest(), 16) % num_bits
    h2 = int(sha256(b'2' + element).hexdigest(), 16) % num_bits

    # Derive every probe position from the two base hashes and set that bit.
    # Positions are counted from the right-hand end of the bit string.
    for i in range(hash_count):
        pos = (h1 + i * h2) % num_bits
        filterlist[num_bits - 1 - pos] = '1'

filterstring = ''.join(filterlist)

In [43]:
# Render the server's filter bytes, last byte first, as 8-bit binary strings
# and check the result against the locally rebuilt filter.
server_bytes_reversed = reversed(bloomsetup.bloom_filter.bits)
bloombits = ''.join(map('{:08b}'.format, server_bytes_reversed))
bloombits == filterstring

True

In [44]:
# Number of hash functions: negated base-2 log of the per-element
# false-positive rate, rounded up.
neg_log2_fpr = -log2(correctedfpr)
num_hash_functions = ceil(neg_log2_fpr)
num_hash_functions

14

## GCS Decode

In [45]:
# Fetch the server setup message from the 'gcssetup' endpoint.
setupresponse = requests.get(url+'gcssetup')
# Fail fast on an HTTP error so an error body is never parsed as a protobuf message.
setupresponse.raise_for_status()

# Decode the reply body into a ServerSetup message and display it.
gcssetup = psi.ServerSetup()
gcssetup.ParseFromString(setupresponse.content)
gcssetup

gcs {
  div: 18
  hash_range: 1000000
  bits: "C\022sW0\216;\000"
}

In [46]:
from math import ceil, log, log2

# Same false-positive settings as in the Bloom filter sizing cell.
fpr = 0.01
num_client_inputs = 100
# Per-element rate: the overall rate divided across the client inputs.
correctedfpr = fpr/num_client_inputs

# Hash range recomputed locally; max_elements is the value defined in the
# Bloom filter sizing cell. True division makes the result a float.
hash_range = max_elements/correctedfpr
hash_range

1000000.0

In [47]:
from hashlib import sha256

# Bucket value for each server-encrypted element: its SHA-256 digest read as
# an integer and reduced modulo the hash range, collected in ascending order.
ulist = sorted(
    int(sha256(element).hexdigest(), 16) % gcssetup.gcs.hash_range
    for element in rawsetup.raw.encrypted_elements
)

# First bucket value as-is, then the gap between each consecutive pair.
udiff = [ulist[0]] + [later - earlier for earlier, later in zip(ulist, ulist[1:])]

In [48]:
# Average spacing between bucket values, taken over the span up to the
# largest value, and the matching per-slot probability.
avg = (ulist[-1] + 1) / len(ulist)
prob = 1 / avg

# Divisor exponent derived from that probability, rounded and clamped at zero.
div_estimate = -log2(-log2(1.0 - prob))
gcsdiv = max(0, round(div_estimate))
gcsdiv

18

In [49]:
# Encode every non-zero delta as one chunk of bits and concatenate the chunks.
# Each chunk is laid out, left to right, as:
#   - the remainder in binary, left-padded with zeros to `div` bits,
#   - a single '1' terminator,
#   - `quot` zeros (the quotient in unary).
# Later chunks are placed in front of earlier ones.
#
# Integer divmod replaces float division + int(), which is only exact for
# small values, and the chunk variable no longer shadows the builtin `next`.

divisor = 1 << gcssetup.gcs.div
chunks = []
for diff in udiff:
    if diff != 0:
        quot, rem = divmod(diff, divisor)
        chunk = '{0:b}'.format(rem) + '1' + ('0' * quot)
        # zfill supplies the leading zeros of the fixed-width remainder.
        chunks.append(chunk.zfill(quot + gcssetup.gcs.div + 1))

# Joining in reverse order is equivalent to prepending each chunk in turn.
encoded = ''.join(reversed(chunks))

In [50]:
# Left-pad the encoded bit string with zeros up to a whole number of bytes.

from math import ceil

byte_count = ceil(len(encoded) / 8)
padlength = byte_count * 8
padded = encoded.rjust(padlength, '0')

In [51]:
# Render the gcs.bits bytes returned by the server, last byte first, as 8-bit
# binary strings, then check that they equal the locally encoded bits.

server_bytes_reversed = reversed(gcssetup.gcs.bits)
gcsbits = ''.join(map('{:08b}'.format, server_bytes_reversed))
gcsbits == padded

True

### Calculate Set Intersection

In [52]:
# Compute the intersection from the server's setup message and the server's
# response to our request. The returned values are used below as indices
# into client_items.
intersection = c.GetIntersection(gcssetup, psiresponse)
# The Bloom-filter and raw setup messages can be passed here instead:
#intersection = c.GetIntersection(bloomsetup, psiresponse)
#intersection = c.GetIntersection(rawsetup, psiresponse)

# Keep a set copy alongside the displayed, sorted list of matches.
iset = set(intersection)
sorted(intersection)

[1, 2]

In [54]:
# Print the client record behind each matched index, in ascending index order.
matched_items = (client_items[index] for index in sorted(intersection))
for item in matched_items:
    print(item)

ABLY RESOURCES G2 1PB
ADVANCE GLOBAL RECRUITMENT EH7 4HG
