In [1]:
import sys
# Add the parent directory to the module search path; the `lib` package
# imported in the next cell is presumably located there.
sys.path.append("..")

In [2]:
import math
import os
import time


import numpy as np
np.set_printoptions(legacy='1.25')

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import matplotlib.patches as patches

import seaborn as sns
import time
import json
import pandas as pd
from ctypes import c_int32
from itertools import product
import copy


from tqdm import tqdm

from scipy.stats import pearsonr
from importlib import reload

import orjson
import gzip

from scipy.stats import entropy



from lib import sketches, visualization_utils, encoders, ploting, pacha_sketch
# Re-execute the local modules so the `from lib.<module> import ...` lines
# below bind to their current code instead of a version cached in the kernel.
reload(ploting)
reload(sketches)
reload(visualization_utils)
reload(encoders)

reload(pacha_sketch)

from lib.sketches import BloomFilter, CountMinSketch, H3HashFunctions, HashFunctionFamily,\
      CountMinSketchHadamard, CountMinSketchLocalHashing, deterministic_hash, simple_deterministic_hash
from lib.visualization_utils import visualize_badic_cover, plot_b_adic_cubes
from lib.encoders import minimal_b_adic_cover, minimal_spatial_b_adic_cover, BAdicCube, BAdicRange, \
      minimal_b_adic_cover_array, downgrade_b_adic_range_indices
from lib.pacha_sketch import PachaSketch, ADTree, BFParameters, CMParameters, cartesian_product

from lib.ploting import set_style, plot_ylabel, plot_legend

from lib import baselines
# Re-execute lib.baselines so the names imported from it below are up to date.
reload(baselines)

from lib.baselines import CentralDPServer, LDPServer, LDPEncoderGRR, filter_df, query_df, \
      infer_domains_and_ranges, translate_query_region, evaluate_queries, check_accruracy, \
      evaluate_queries_baselines, evaluate_equivalent_pacha_sketches

# set_style()

  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)


# Different Datasets

In [3]:
# Parameters shared by every PachaSketch built in this notebook.
# `delta` is passed to CMParameters as `delta`.
delta = 0.01
# Divided by the number of rows of the dataset to obtain CMParameters' `error_eps`.
abs_error_eps = 1.0
# Passed as `p` to every BFParameters instance.
bloom_p = 0.01

## Retail

In [9]:
# Multipliers applied to len(retail_df) when sizing the retail sketch's indexes.
# 4 and 6 equal the cat_index (n_cat + 1) and num_index (levels) values that
# compute_number_of_updates prints for the Retail dataset's current solution.
cat_updates = 4
num_updates = 6

In [4]:
# Load the retail dataset; its row count is used to size the sketch built below.
retail_df = pd.read_parquet("../data/clean/online_retail_no_outliers.parquet")

In [7]:
# Load the two retail query workloads. Later cells read the "queries" entry of
# each parsed document. The file names suggest queries over 2 and 4 columns —
# TODO confirm against the query generator.
query_path = "../queries/online_retail_2_cols.json"
with open(query_path, 'rb') as f:
    retail_queries_2 = orjson.loads(f.read())

query_path = "../queries/online_retail_4_cols.json"
with open(query_path, 'rb') as f:
    retail_queries_4 = orjson.loads(f.read())

# AD-tree for the retail dataset, passed to the sketch constructor below.
retail_ad_tree = ADTree.from_json("../sketches/ad_trees/online_retail.json")


In [14]:
# Build the retail sketch: columns 0-2 are mapped as categorical, columns 3-5
# as numerical, with one base per numerical column.
# The count-min error is abs_error_eps divided by the number of rows, and each
# index is sized as (number of rows) x (an update multiplier).
# NOTE(review): the region index is sized with cat_updates + num_updates, while
# compute_number_of_updates in this notebook counts cat_index * num_index
# region-index updates per element — confirm which one is intended.
retail_p_sketch = PachaSketch.build_with_uniform_size(
    levels=6,
    num_dimensions=6,
    cat_col_map=[0,1,2],
    num_col_map=[3,4,5],
    bases=[2,2,2],
    ad_tree=retail_ad_tree,
    cm_params=CMParameters(delta=delta, error_eps=abs_error_eps / len(retail_df)),
    cat_index_parameters=BFParameters(n_values=len(retail_df)*cat_updates, p=bloom_p),
    num_index_parameters=BFParameters(n_values=len(retail_df)*num_updates, p=bloom_p),
    region_index_parameters= BFParameters(n_values=len(retail_df)*(cat_updates+num_updates), p=bloom_p))
# Display the size reported by the sketch (unit not visible here — TODO confirm).
retail_p_sketch.get_size()

179.22578525543213

In [15]:
# Insert the whole DataFrame into the sketch. The recorded run shows a progress
# bar over 536494 items that took about three minutes.
retail_p_sketch.update_data_frame(retail_df)

  cat_values = tuple(element[i] for i in self.cat_col_map)
  num_values = tuple(element[i] for i in self.num_col_map)
Updating: 100%|██████████| 536494/536494 [03:02<00:00, 2942.80it/s]


<lib.pacha_sketch.PachaSketch at 0x706ae028ae70>

In [19]:
# Evaluate the first retail workload. The recorded output shows a "true counts"
# pass followed by an "estimates" pass (about 8 minutes for 200 queries).
# `path_to_file` is presumably where the results are written — confirm in lib.baselines.
retail_results_2 = evaluate_queries(retail_df, retail_queries_2["queries"], retail_p_sketch, path_to_file="../results/accuracy/retail_2_p_sketch.csv")

Computing true counts...


True Count: 100%|██████████| 200/200 [00:05<00:00, 39.52it/s]


Computing estimates...


Estimates: 100%|██████████| 200/200 [08:19<00:00,  2.50s/it]


In [20]:
# Evaluate the second retail workload against the same sketch. In the recorded
# run the estimates pass took about 26 seconds for 200 queries.
retail_results_4 = evaluate_queries(retail_df, retail_queries_4["queries"], retail_p_sketch, path_to_file="../results/accuracy/retail_4_p_sketch.csv")

Computing true counts...


True Count: 100%|██████████| 200/200 [00:06<00:00, 29.57it/s]


Computing estimates...


Estimates: 100%|██████████| 200/200 [00:26<00:00,  7.47it/s]


## Bank Marketing

In [5]:
# Multipliers applied to len(bank_df) when sizing the bank sketch's indexes.
# This overwrites the values used for the retail sketch. 7 and 6 equal the
# cat_index (n_cat + 1) and num_index (levels) values that
# compute_number_of_updates prints for the Bank dataset's current solution.
cat_updates = 7
num_updates = 6

In [6]:
# Load the bank marketing dataset; its row count sizes the sketch built below.
bank_df = pd.read_parquet("../data/clean/bank_marketing.parquet")

# Load the two bank query workloads; later cells read their "queries" entry.
query_path = "../queries/bank_marketing_2_cols.json"
with open(query_path, 'rb') as f:
    bank_queries_2 = orjson.loads(f.read())

query_path = "../queries/bank_marketing_4_cols.json"
with open(query_path, 'rb') as f:
    bank_queries_4 = orjson.loads(f.read())
# AD-tree for the bank dataset, passed to the sketch constructor below.
bank_ad_tree = ADTree.from_json("../sketches/ad_trees/bank_marketing.json")

In [39]:
# Build the bank sketch: columns 0-5 are mapped as categorical, columns 6-9 as
# numerical, with one base per numerical column.
# The count-min error is abs_error_eps divided by the number of rows, and each
# index is sized as (number of rows) x (an update multiplier).
# NOTE(review): the region index is sized with cat_updates + num_updates, while
# compute_number_of_updates in this notebook counts cat_index * num_index
# region-index updates per element — confirm which one is intended.
bank_p_sketch = PachaSketch.build_with_uniform_size(
    levels=6,
    num_dimensions=10,
    cat_col_map=[0,1,2,3,4,5],
    num_col_map=[6,7,8,9],
    bases=[4, 5, 2, 2],
    # bases=[5, 5, 5, 5],
    ad_tree=bank_ad_tree,
    cm_params=CMParameters(delta=delta, error_eps=abs_error_eps / len(bank_df)),
    cat_index_parameters=BFParameters(n_values=len(bank_df)*cat_updates, p=bloom_p),
    num_index_parameters=BFParameters(n_values=len(bank_df)*num_updates, p=bloom_p),
    region_index_parameters= BFParameters(n_values=len(bank_df)*(cat_updates+num_updates), p=bloom_p))
# Display the size reported by the sketch (unit not visible here — TODO confirm).
bank_p_sketch.get_size()

15.502957344055176

In [40]:
# Insert the whole DataFrame into the sketch. The recorded run shows a progress
# bar over 45211 items that took about 21 seconds.
bank_p_sketch.update_data_frame(bank_df)

Updating: 100%|██████████| 45211/45211 [00:21<00:00, 2131.08it/s]


<lib.pacha_sketch.PachaSketch at 0x734a593065d0>

In [15]:
# Debugging aid: spatial b-adic cover for four numerical ranges, one per
# numerical column of the bank sketch. [125, 803] is the range that appears in
# the first query of bank_queries_2; the other three are presumably the full
# column domains — TODO confirm.
cover = bank_p_sketch.minimal_spatial_b_adic_cover([(0.0, 8191.0), [125, 803], (0.0, 127.0), (0.0, 383.0)])

In [13]:
# Shape of the spatial cover; the recorded value is (154789372, 5).
cover.shape

(154789372, 5)

In [16]:
# Build the cover of each numerical range separately, using the same four
# ranges as the spatial cover above.
num_predicates = [(0.0, 8191.0), [125, 803], (0.0, 127.0), (0.0, 383.0)]
minimal_b_adic_covers = []
for i in range(len(num_predicates)):
    # b-adic cover of [lower, upper] in the base of dimension i.
    cover_ranges = minimal_b_adic_cover_array(bank_p_sketch.bases[i], num_predicates[i][0], num_predicates[i][1])
    # Pass the cover through the dimension's numerical bitmap.
    # NOTE(review): the name suggests these are the ranges that survive pruning — confirm.
    unpruned_ranges = bank_p_sketch.numerical_bitmaps[i].prune_b_adic_array(cover_ranges)
    minimal_b_adic_covers.append(unpruned_ranges)

In [33]:
# Rewrite each per-dimension cover so that no entry has a level above
# `bank_p_sketch.levels - 1`.
covers_down = []
for i, cover in enumerate(minimal_b_adic_covers):
    new_cover = []
    for (level, idx) in cover:
        if not (level > bank_p_sketch.levels - 1):
            # Level is within bounds: keep the range as a single (level, idx) row.
            new_cover.append(np.array([[level, idx]]))
            continue
        # Level is too high: replace the range with the indices returned by
        # downgrade_b_adic_range_indices, each paired with level `levels - 1`.
        indices = downgrade_b_adic_range_indices(
            base=bank_p_sketch.bases[i],
            level=level,
            idx=idx,
            new_level=bank_p_sketch.levels - 1,
        )
        new_cover.append(
            np.vstack((np.full(len(indices), bank_p_sketch.levels - 1), indices)).T
        )
    # One array of (level, idx) rows per dimension.
    covers_down.append(np.concatenate(new_cover))

In [34]:
# Display the downgraded covers: one array of (level, idx) rows per dimension.
covers_down

[array([[5, 0],
        [5, 1],
        [5, 2],
        [5, 3],
        [5, 4],
        [5, 5],
        [5, 6],
        [5, 7]]),
 array([[  3,   1],
        [  3,   2],
        [  3,   3],
        [  3,   4],
        [  3,   5],
        [  2,  30],
        [  2,  31],
        [  0, 800],
        [  0, 801],
        [  0, 802],
        [  0, 803]]),
 array([[5, 0],
        [5, 1],
        [5, 2],
        [5, 3]]),
 array([[ 5,  0],
        [ 5,  1],
        [ 5,  2],
        [ 5,  3],
        [ 5,  4],
        [ 5,  5],
        [ 5,  6],
        [ 5,  7],
        [ 5,  8],
        [ 5,  9],
        [ 5, 10],
        [ 5, 11]])]

In [36]:
# Number of (level, idx) rows in each dimension's downgraded cover.
list(map(len, covers_down))

[8, 11, 4, 12]

In [44]:
def _print_update_counts(title, cat_index, num_index):
    """Print the per-structure update counts of one scheme.

    The region index and the base sketches each receive one update per
    (categorical, numerical) combination, i.e. ``cat_index * num_index``.

    Args:
        title: Scheme name inserted into the "Nr. of updates in ...:" header.
        cat_index: Number of categorical-index updates per element.
        num_index: Number of numerical-index updates per element.
    """
    region_index = cat_index * num_index
    base_sketches = region_index
    total = cat_index + num_index + region_index + base_sketches

    print(f"Nr. of updates in {title}:")
    print(f"cat_index: {cat_index}")
    print(f"num_index: {num_index}")
    print(f"region_index: {region_index}")
    print(f"base_sketches: {base_sketches}")
    print(f"Total: {total}\n")


def compute_number_of_updates(levels, n_cat, n_num):
    """Print the number of updates per element for three update schemes.

    All schemes use ``n_cat + 1`` categorical-index updates and differ only in
    the number of numerical-index updates:

    * current solution: ``levels``
    * new solution 1:   ``levels ** n_num``
    * new solution 2:   ``2 ** n_num * levels``

    Args:
        levels: Number of levels of the sketch.
        n_cat: Number of categorical columns.
        n_num: Number of numerical columns.

    Returns:
        None. The counts are only printed.
    """
    cat_index = n_cat + 1
    schemes = (
        ("current solution", levels),
        ("new solution 1", levels**n_num),
        ("new solution 2", 2**n_num * levels),
    )
    for title, num_index in schemes:
        _print_update_counts(title, cat_index, num_index)

In [46]:
levels = 6

# (dataset label, number of categorical columns, number of numerical columns)
for dataset_label, n_cat, n_num in (
    ("Retail", 3, 3),
    ("Bank", 6, 4),
    ("Folktables", 7, 3),
):
    print("--------------------------------------------------------")
    print(f"{dataset_label} Dataset")
    compute_number_of_updates(levels, n_cat, n_num)



--------------------------------------------------------
Retail Dataset
Nr. of updates in current solution:
cat_index: 4
num_index: 6
region_index: 24
base_sketches: 24
Total: 58

Nr. of updates in new solution 1:
cat_index: 4
num_index: 216
region_index: 864
base_sketches: 864
Total: 1948

Nr. of updates in new solution 2:
cat_index: 4
num_index: 48
region_index: 192
base_sketches: 192
Total: 436

--------------------------------------------------------
Bank Dataset
Nr. of updates in current solution:
cat_index: 7
num_index: 6
region_index: 42
base_sketches: 42
Total: 97

Nr. of updates in new solution 1:
cat_index: 7
num_index: 1296
region_index: 9072
base_sketches: 9072
Total: 19447

Nr. of updates in new solution 2:
cat_index: 7
num_index: 96
region_index: 672
base_sketches: 672
Total: 1447

--------------------------------------------------------
Folktables Dataset
Nr. of updates in current solution:
cat_index: 8
num_index: 6
region_index: 48
base_sketches: 48
Total: 110

Nr. of u

In [None]:
# NOTE(review): unexecuted copy of the constants cell near the top of the
# notebook; the values are identical.
delta = 0.01
abs_error_eps = 1.0
bloom_p = 0.01

## Retail

In [None]:
# NOTE(review): unexecuted copy of the retail update-count cell. When run after
# the bank section it resets cat_updates from 7 back to 4.
cat_updates = 4
num_updates = 6

In [None]:
# NOTE(review): unexecuted copy of the retail loading cell; it reloads the same file.
retail_df = pd.read_parquet("../data/clean/online_retail_no_outliers.parquet")

In [None]:
# NOTE(review): unexecuted copy of the retail query-loading cell; it reads the
# same two workload files and the same AD-tree as the earlier cell.
query_path = "../queries/online_retail_2_cols.json"
with open(query_path, 'rb') as f:
    retail_queries_2 = orjson.loads(f.read())

query_path = "../queries/online_retail_4_cols.json"
with open(query_path, 'rb') as f:
    retail_queries_4 = orjson.loads(f.read())

retail_ad_tree = ADTree.from_json("../sketches/ad_trees/online_retail.json")


In [None]:
# NOTE(review): unexecuted copy of the retail sketch construction cell; the
# arguments are identical to the earlier cell. Running it replaces
# retail_p_sketch with a new, empty sketch.
# NOTE(review): the region index is sized with cat_updates + num_updates, while
# compute_number_of_updates in this notebook counts cat_index * num_index
# region-index updates per element — confirm which one is intended.
retail_p_sketch = PachaSketch.build_with_uniform_size(
    levels=6,
    num_dimensions=6,
    cat_col_map=[0,1,2],
    num_col_map=[3,4,5],
    bases=[2,2,2],
    ad_tree=retail_ad_tree,
    cm_params=CMParameters(delta=delta, error_eps=abs_error_eps / len(retail_df)),
    cat_index_parameters=BFParameters(n_values=len(retail_df)*cat_updates, p=bloom_p),
    num_index_parameters=BFParameters(n_values=len(retail_df)*num_updates, p=bloom_p),
    region_index_parameters= BFParameters(n_values=len(retail_df)*(cat_updates+num_updates), p=bloom_p))
# Display the size reported by the sketch (unit not visible here — TODO confirm).
retail_p_sketch.get_size()

179.22578525543213

In [None]:
# NOTE(review): unexecuted copy of the retail update cell. The recorded run of
# the original took about three minutes for 536494 items.
retail_p_sketch.update_data_frame(retail_df)

  cat_values = tuple(element[i] for i in self.cat_col_map)
  num_values = tuple(element[i] for i in self.num_col_map)
Updating: 100%|██████████| 536494/536494 [03:02<00:00, 2942.80it/s]


<lib.pacha_sketch.PachaSketch at 0x706ae028ae70>

In [None]:
# NOTE(review): unexecuted copy of the first retail evaluation cell; it passes
# the same path_to_file as the earlier cell.
retail_results_2 = evaluate_queries(retail_df, retail_queries_2["queries"], retail_p_sketch, path_to_file="../results/accuracy/retail_2_p_sketch.csv")

Computing true counts...


True Count: 100%|██████████| 200/200 [00:05<00:00, 39.52it/s]


Computing estimates...


Estimates: 100%|██████████| 200/200 [08:19<00:00,  2.50s/it]


In [None]:
# NOTE(review): unexecuted copy of the second retail evaluation cell; it passes
# the same path_to_file as the earlier cell.
retail_results_4 = evaluate_queries(retail_df, retail_queries_4["queries"], retail_p_sketch, path_to_file="../results/accuracy/retail_4_p_sketch.csv")

Computing true counts...


True Count: 100%|██████████| 200/200 [00:06<00:00, 29.57it/s]


Computing estimates...


Estimates: 100%|██████████| 200/200 [00:26<00:00,  7.47it/s]


In [35]:
# Product of the per-dimension cover sizes, i.e. the number of combinations
# that take one entry from each dimension's cover.
np.prod(list(map(len, covers_down)))

4224

In [11]:
# Inspect the first query of the first bank workload.
bank_queries_2["queries"][0]

[['unknown'], '*', '*', '*', '*', '*', '*', [125, 803], '*', '*']

In [12]:
# Decompose the first bank query into subqueries.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, i.e.
# it was stopped manually before returning.
bank_p_sketch.get_subqueries(bank_queries_2["queries"][0])

KeyboardInterrupt: 

In [28]:
# Evaluate the first bank workload.
# NOTE(review): in the recorded run the estimates pass was interrupted after
# 57 seconds with 0 of 200 queries completed, so bank_results_2 was never assigned.
bank_results_2 = evaluate_queries(bank_df, bank_queries_2["queries"], bank_p_sketch, path_to_file="../results/accuracy/bank_2_p_sketch.csv")

Computing true counts...


True Count: 100%|██████████| 200/200 [00:00<00:00, 456.91it/s]


Computing estimates...


Estimates:   0%|          | 0/200 [00:57<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Evaluate the second bank workload. This cell has no recorded execution.
bank_results_4 = evaluate_queries(bank_df, bank_queries_4["queries"], bank_p_sketch, path_to_file="../results/accuracy/bank_4_p_sketch.csv")