<a href="https://colab.research.google.com/github/sierrarowley/research_fall2020/blob/master/data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [132]:
import random
import string
import numpy as np

# Grid dimensions of the simulated database: valid coordinates are
# 0..xbound-1 and 0..ybound-1.
xbound = 10
ybound = 10
# Upper bound on the number of random insertions attempted per database.
point_bound = 1000

def generatedata():
  """Build a random xbound-by-ybound database of hashed values.

  Between 1 and point_bound insertions are attempted. Each one picks a
  random cell and stores the hash of a random 100-character string
  concatenated with the cell coordinates. A cell holds at most 10 values;
  further insertions into a full cell are dropped.

  Returns:
    A 2D numpy object array of shape (xbound, ybound). Each cell is either
    None (no value was ever stored there) or a list of 1 to 10 ints.

  Note:
    The built-in hash() only stands in for encryption here. String hashes
    are salted per interpreter process unless PYTHONHASHSEED is set, so the
    stored integers are not reproducible across runs even with a seeded
    `random`.
  """
  max_values_per_cell = 10
  db = np.empty((xbound, ybound), dtype=object)

  # Attempt a random number of insertions.
  for _ in range(random.randint(1, point_bound)):
    x = random.randint(0, xbound - 1)
    y = random.randint(0, ybound - 1)

    # Stand-in "ciphertext" for the value stored at this point.
    randstring = ''.join(random.choices(string.ascii_letters + string.digits, k=100))
    enc = hash(randstring + str(x) + str(y))

    # Cells start out as None; the first value creates the list.
    if db[x, y] is None:
      db[x, y] = [enc]
    elif len(db[x, y]) < max_values_per_cell:
      db[x, y].append(enc)

  return db

The database is classified as either sparse (0) or dense (1): it is dense only when every point in the grid holds at least one value, and sparse otherwise.

In [133]:
def produce_leakage(db):
  """Enumerate every rectangle query over db and collect what each returns.

  Args:
    db: 2D numpy object array whose cells are either None or a list of
      values.

  Returns:
    A tuple (leakage, dense_flag, query_len):
      leakage: one entry per rectangle query that returned something. Each
        entry is the list of non-empty cells inside the rectangle, in
        row-major order, and the entries are shuffled.
      dense_flag: 1 if no cell of db is None, otherwise 0.
      query_len: the largest number of non-empty cells in any one query
        (0 if every cell is None).
  """
  # Use the array's own extent so the function works for any grid size
  # rather than only for the module-level xbound/ybound.
  rows, cols = db.shape

  # The database is dense only when every cell holds a value.
  dense_flag = int(all(cell is not None for cell in db.flat))

  leakage = []
  query_len = 0
  # All possible rectangle sizes ...
  for height in range(1, rows + 1):
    for width in range(1, cols + 1):
      # ... at every starting point that keeps the rectangle in bounds.
      for x in range(rows - height + 1):
        for y in range(cols - width + 1):
          window = db[x:x + height, y:y + width]
          # .flat walks the window row by row, the same order as nested
          # i/j loops over the rectangle.
          curr_query = [cell for cell in window.flat if cell is not None]

          # Queries that return nothing leak nothing.
          if curr_query:
            leakage.append(curr_query)
            # Track the longest query for sizing the input array later.
            query_len = max(query_len, len(curr_query))

  # Shuffle to make the location of queries less obvious.
  random.shuffle(leakage)
  return (leakage, dense_flag, query_len)

create_input() calls generatedata() and produce_leakage(), which create the database and the search-query leakage from that database. It then takes the returned list of leakages and transforms it into a 2D numpy array with dimensions (number of queries, length of longest query). Every query shorter than the longest one is padded with 0s at the end. Each entry of the array represents one point in a query and is stored as a list of the value(s) at that point. create_input() returns the numpy array, which can now be fed into a TensorFlow neural network, together with the classification of the database.

In [137]:
def create_input():
  """Generate a random database and return its padded leakage array.

  Calls generatedata() and produce_leakage(), then lays the leakage out as
  a 2D object array with one row per query. Each occupied entry is the
  list of values stored at one point of that query; rows shorter than the
  longest query are padded at the end with the integer 0.

  Returns:
    A tuple (array, c): the object array of shape
    (number of queries, length of longest query) and the sparse/dense
    flag returned by produce_leakage().
  """
  leakage, c, querylength = produce_leakage(generatedata())

  # Start from all zeros so every unused trailing slot is already padding.
  padded = np.zeros((len(leakage), querylength), dtype=object)

  # Assign point by point. Converting a query with np.array() would merge
  # equal-length value lists into a numeric 2D array (losing the
  # one-list-per-point layout and possibly overflowing the row), and it
  # raises on ragged input in recent NumPy versions.
  for row, query in enumerate(leakage):
    for col, point in enumerate(query):
      padded[row, col] = point

  return (padded, c)

# Demo run: build one random database and display the resulting
# (padded leakage array, sparse/dense flag) tuple.
create_input()

(array([[list([-3256506557335454334]), list([9068507468738152949]),
         list([5618558401670450457, -1075273649716704027, -9034128101618050107]),
         ..., 0, 0, 0],
        [list([-1827411471930281147, 2534460368314086198, 2935261470265920565]),
         list([-6246637671289476312]), list([-3856353503613942023]), ...,
         0, 0, 0],
        [list([-334823265489042667, -255600828581533805]),
         list([3339592978246353040]), list([4883026770447550994]), ..., 0,
         0, 0],
        ...,
        [list([7432543942090274769]), list([-3318036558576878488]),
         list([-5290195023202205286, -4712227084718497383, 2708348145642834868]),
         ..., 0, 0, 0],
        [list([-692801589995772069]), list([-3495830199094040265]),
         list([7550648942710716011]), ..., 0, 0, 0],
        [list([7432543942090274769]), list([-3318036558576878488]),
         list([-666635612110227659, 2884447474467833026]), ..., 0, 0, 0]],
       dtype=object), 0)