In [1]:
import sys

# Make the parent directory importable (the `lib` package imported below is
# presumably located there). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate ".." entry to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [80]:
import math
import os
import time


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import matplotlib.patches as patches

import seaborn as sns
import time
import json
import pandas as pd
from ctypes import c_int32
from itertools import product

from scipy.stats import pearsonr
from importlib import reload

import orjson
import gzip



from lib import sketches, visualization_utils, encoders, ploting, pacha_sketch_new
# Re-execute each local module so that source edits made since the kernel
# started are picked up; the `from lib.<module> import ...` lines that follow
# then bind names from the freshly reloaded modules.
reload(ploting)
reload(sketches)
reload(visualization_utils)
reload(encoders)

reload(pacha_sketch_new)

from lib.sketches import BloomFilter, CountMinSketch, H3HashFunctions, HashFunctionFamily, CountMinSketchHadamard, CountMinSketchLocalHashing
from lib.visualization_utils import visualize_badic_cover, plot_b_adic_cubes
from lib.encoders import minimal_b_adic_cover, minimal_spatial_b_adic_cover, BAdicCube, BAdicRange
from lib.pacha_sketch_new import PachaSketch, ADTree, BFParameters, CMParameters, build_pacha_sketch_uniform_size, \
    build_pacha_sketch_from_json_file

from lib.ploting import set_style, plot_ylabel, plot_legend

from lib import baselines
# Reload before the `from lib.baselines import ...` line that follows so the
# imported names come from the current version of the module's source.
reload(baselines)

from lib.baselines import CentralDPServer, LDPServer, LDPEncoderGRR, query_df, infer_domains_and_ranges

# set_style()

In [28]:
# Count-min sketch settings used when the sketch is built further down:
# `delta` is passed to CMParameters unchanged, while `abs_error_eps` is divided
# by the number of rows to obtain the `error_eps` argument.
delta, abs_error_eps = 0.05, 1.0

# Online Retail

In [None]:
# Load the cleaned Online Retail table; the path is relative to the notebook's
# working directory.
retail_df = pd.read_parquet("../data/clean/online_retail.parquet")

In [51]:
# Number of rows; this count is reused below to size the sketch structures.
len(retail_df)

541909

In [10]:
# Pre-built AD-tree for the Online Retail dataset.
ad_tree = ADTree.from_json("../sketches/ad_trees/online_retail.json")

# Benchmark queries: read the raw bytes and let orjson parse them.
query_path = "../queries/online_retail_2_cols.json"
with open(query_path, "rb") as query_file:
    queries = orjson.loads(query_file.read())


In [21]:
# Attribute names covered by the AD-tree.
ad_tree.names

['category', 'region', 'gender']

In [22]:
# Column order of the data frame; the column index maps passed to the sketch
# builder below presumably refer to these positions.
retail_df.columns

Index(['category', 'region', 'gender', 'date', 'age', 'total'], dtype='object')

In [84]:
# Every structure is sized from the number of rows that will be ingested.
n_rows = len(retail_df)

# Count-min parameters: the absolute error budget is divided by the row count.
cm_params = CMParameters(delta=delta, error_eps=abs_error_eps / n_rows)

# Index parameters (BFParameters — presumably Bloom-filter settings) for the
# categorical and numerical regions.
# NOTE(review): the 4x / 5x capacity multipliers are not explained here — confirm.
cat_index_params = BFParameters(n_values=n_rows * 4, p=0.001)
num_index_params = BFParameters(n_values=n_rows * 5, p=0.001)

# Column maps presumably index into retail_df.columns: 0-2 are
# category/region/gender and 3-5 are date/age/total.
p_sketch = build_pacha_sketch_uniform_size(
    levels=5,
    num_dimensions=6,
    cat_col_map=[0, 1, 2],
    num_col_map=[3, 4, 5],
    bases=[2, 2, 2],
    ad_tree=ad_tree,
    cm_params=cm_params,
    cat_index_parameters=cat_index_params,
    num_index_parameters=num_index_params,
)

In [85]:
# Size of the freshly built sketch, before any data is inserted.
# NOTE(review): the unit is not visible from here (presumably MB) — confirm.
p_sketch.get_size()

235.4842987060547

In [86]:
# Ingest the retail data into the sketch, asking for one worker per logical CPU.
# The recorded run below took about 3m42s for 541,909 rows.
# NOTE(review): os.cpu_count() can return None when the CPU count cannot be
# determined — confirm update_data_frame accepts workers=None.
p_sketch.update_data_frame(retail_df, workers=os.cpu_count())

Updating: 100%|██████████| 541909/541909 [03:42<00:00, 2436.16it/s]


<lib.pacha_sketch_new.PachaSketch at 0x75db4c81df10>

In [33]:
# Persist the sketch to disk (the target name suggests gzip-compressed JSON).
p_sketch.save_to_file("../sketches/online_retail.json.gz")

In [49]:
# Rebuild a second sketch from the file written by save_to_file above.
p_sketch2 = build_pacha_sketch_from_json_file("../sketches/online_retail.json.gz")

In [45]:
# Round-trip check: the sketch reloaded from disk should compare equal to the
# in-memory sketch it was saved from.
p_sketch == p_sketch2

True

In [None]:
# Run the first benchmark query against the sketch reloaded from disk.
# The debug/detailed flags presumably drive the printed per-level breakdown and
# the extra `details` return value, respectively.
est, details = p_sketch2.query(queries["queries"][0], debug=True, detailed=True)

Categorical regions: 125
Indexed categorical regions: 125
Numerical regions: 1420
Indexed numerical regions: 967
Query regions: 120875
Level 0 queries: 18000
Level 1 queries: 58875
Level 2 queries: 19000
Level 3 queries: 25000


In [87]:
# Same query against the in-memory sketch; this rebinds `est` and `details`
# from the previous cell.
# NOTE(review): the recorded breakdown differs from the p_sketch2 run (949 vs
# 967 indexed numerical regions), so p_sketch was apparently rebuilt after the
# file was saved — re-run save/load before comparing the two.
est, details = p_sketch.query(queries["queries"][0], debug=True, detailed=True)

Categorical regions: 125
Indexed categorical regions: 125
Numerical regions: 1420
Indexed numerical regions: 949
Query regions: 118625
Level 0 queries: 16000
Level 1 queries: 58625
Level 2 queries: 19000
Level 3 queries: 25000


In [74]:
# Normalise the per-level query counts by the number of categorical regions.
per_level = details['queries_per_level']
n_cat_regions = details['cat_regions']
for level in range(len(per_level)):
    ratio = per_level[level] / n_cat_regions
    print(f"Level {level}: {ratio}")

Level 0: 144.0
Level 1: 471.0
Level 2: 152.0
Level 3: 200.0
Level 4: 0.0


In [76]:
# Show the min/max values stored on the reloaded sketch, one list per line.
print(p_sketch2.min_values, p_sketch2.max_values, sep="\n")

[0, 18, 0]
[304, 71, 168469]


In [54]:
# Evaluate the same query directly on the data frame with the baseline helper
# (presumably the exact answer to compare the sketch estimate against).
query_df(retail_df, queries["queries"][0])

49540

In [75]:
# Bounds of the integer range under consideration.
range_lo, range_hi = 8, 15

# log2 of the number of integers in [range_lo, range_hi], then log2 of the
# upper bound on its own.
print(np.log2(range_hi - range_lo + 1))
print(np.log2(range_hi))

3.0
3.9068905956085187


In [42]:
# Inspect the first benchmark query: one entry per column, where '*'
# presumably means the column is unconstrained.
queries["queries"][0]

['*', ['United Kingdom'], '*', '*', '*', [8, 15]]

In [62]:
# Number of possible values for each dimension of the AD-tree.
domain_sizes = np.array(
    [
        len(ad_tree.possible_values[axis])
        for axis in range(ad_tree.num_dimensions)
    ]
)

In [63]:
# Display the per-dimension domain sizes.
domain_sizes

array([125,  38,   3])

In [64]:
# One extra slot per dimension (presumably for the '*' wildcard value).
with_wildcard = 1 + domain_sizes

In [60]:
# Number of distinct value combinations across all dimensions.
domain_sizes.prod()

np.int64(14250)

In [65]:
# Same product, using the per-dimension sizes that include the extra slot.
with_wildcard.prod()

np.int64(19656)

In [66]:
# Product over every dimension except the first one.
with_wildcard[1:].prod()

np.int64(156)