In [1]:
import sys
# append the path of the parent directory
sys.path.append("..")

In [2]:
import math
import os

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import matplotlib.patches as patches

import seaborn as sns
import time
import json
import pandas as pd
from ctypes import c_int32
from itertools import product


from importlib import reload

from lib import sketches
from lib import visualization_utils
reload(sketches)

from lib.sketches import CountMinSketch, minimal_b_adic_cover, sort_b_adic_ranges, BAdicRange, BAdicCube, minimal_spatial_b_adic_cover
from lib.visualization_utils import visualize_badic_cover, plot_b_adic_cubes

  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)


# Data Generation

In [4]:
def generate_normal_dataset(size, num_columns, mean, std_dev, output_path=None):
    """
    Generates a dataset with the specified parameters where each column
    contains integer values following a normal distribution.

    Parameters:
    - size (int): Number of rows in the dataset.
    - num_columns (int): Number of columns in the dataset.
    - mean (float): Mean of the normal distribution.
    - std_dev (float): Standard deviation of the normal distribution.
    - output_dir (str, optional): Directory to save the dataset in Parquet format. Defaults to None.

    Returns:
    - pd.DataFrame: Generated dataset as a pandas DataFrame.
    """
    # Generate random data for each column
    data = {
        f"d_{i}": np.random.normal(loc=mean, scale=std_dev, size=size).astype(int)
        for i in range(num_columns)
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Save to output directory if specified
    if output_path:
        df.to_parquet(output_path, index=False)
        print(f"Dataset saved to {output_path}")

    return df


In [3]:
df = generate_normal_dataset(size=100000, num_columns=3, mean=5000, std_dev=500, output_path="data/normal_3d_100k.parquet")

NameError: name 'generate_normal_dataset' is not defined

In [4]:
df = pd.read_parquet("data/normal_3d_100k.parquet")

In [4]:
df.head()

Unnamed: 0,d_0,d_1,d_2
0,5140,4001,4377
1,5130,4674,5048
2,4887,4876,4978
3,3998,3944,4732
4,5088,4845,4431


In [5]:
df.describe()

Unnamed: 0,d_0,d_1,d_2
count,100000.0,100000.0,100000.0
mean,5001.03781,5000.45389,4999.3106
std,498.926127,499.127459,500.522806
min,2990.0,2535.0,2898.0
25%,4663.0,4662.0,4661.0
50%,4999.0,5001.0,4998.0
75%,5337.0,5336.0,5335.0
max,7264.0,7239.0,7098.0


# Build Count-Min Sketches

In [None]:
base = 2
levels = 10
dimensions = 3

In [5]:
def encode(tuple_value):
    return c_int32(hash(tuple_value)).value

In [6]:
def build_sketches(df, base, levels, dimensions):
    cm_sketches = []
    for i in range(levels):
        cm_sketches.append(CountMinSketch(1000, 5))

    for index, row in df.iterrows():
        # Encode the tuple as a 32-bit integer
        row = tuple(row)
        for l in range(levels):
            cube = []
            for d in range(dimensions):
                cube.append(row[d] // base**l)
            cube = tuple(cube)
            encoded_value = encode(cube)
            cm_sketches[l].update(encoded_value)

    return cm_sketches

In [7]:
cm_sketches_2 = build_sketches(df, base=2, levels=10, dimensions=3)

KeyboardInterrupt: 

In [8]:
cm_sketches_5 = build_sketches(df, base=5, levels=5, dimensions=3)

In [14]:
5**3

125

# Evaluation

## Base = 2  :  Levels = 10 

In [15]:
levels = 5
base = 5
dimensions = 3
cm_sketches = cm_sketches_5

lower_bound = 4000
upper_bound = 6000 - base ** (levels-1)
n_samples = 10

true_results = []
estimates = []
n_cubes = []
absolute_errors = []
relative_errors = []

for l in range(levels):
    l_true_results = []
    l_estimates = []
    l_n_cubes = []
    l_absolute_errors = []
    l_relative_errors = []

    for i in range(n_samples):
        start = np.random.randint(lower_bound, upper_bound)
        end = start + base**l - 1
        query = [(int(start), int(end))] * dimensions

        true_result = df.query(" and ".join([f"d_{i} >= {query[i][0]} and d_{i} <= {query[i][1]}" for i in range(dimensions)])).shape[0]
        bases = [base] * dimensions
        cubes = minimal_spatial_b_adic_cover(query, bases)
        estimate = 0
        for cube in cubes:
            tuple_value = tuple([r.index for r in cube.b_adic_ranges])
            estimate += cm_sketches[cube.level].query(encode(tuple_value))

        l_true_results.append(true_result)
        l_estimates.append(estimate)
        l_n_cubes.append(len(cubes))
        l_absolute_errors.append(abs(true_result - estimate))
        if true_result != 0:
            l_relative_errors.append(abs(true_result - estimate) / true_result)
    
    print(f"level compleated: {l}")
    true_results.append(l_true_results)
    estimates.append(l_estimates)
    n_cubes.append(l_n_cubes)
    absolute_errors.append(l_absolute_errors)
    relative_errors.append(l_relative_errors)

level compleated: 0
level compleated: 1
level compleated: 2
level compleated: 3


KeyboardInterrupt: 

In [29]:
query = [(base**2*201, base**2*202-1)] * dimensions
true_result = df.query(" and ".join([f"d_{i} >= {query[i][0]} and d_{i} <= {query[i][1]}" for i in range(dimensions)])).shape[0]
bases = [base] * dimensions
cubes = minimal_spatial_b_adic_cover(query, bases)
estimate = 0
for cube in cubes:
    tuple_value = tuple([r.index for r in cube.b_adic_ranges])
    estimate += cm_sketches[cube.level].query(encode(tuple_value))

In [21]:
base**2*200

5000

In [26]:
query = [(base**2*200, base**2*201-1)] * dimensions

cubes = minimal_spatial_b_adic_cover(query, bases)

len(cubes)

1

In [30]:
print(true_result)
print(estimate)

0
75


In [78]:
true_results

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
 [10, 4, 0, 12, 6, 6, 8, 7, 1, 2, 0, 8, 4, 0, 1, 11, 2, 11, 3, 6],
 [101, 90, 2, 2, 9, 2, 1, 99, 97, 94, 74, 88, 36, 10, 97, 2, 5, 22, 100, 1]]

In [79]:
2**7

128

In [74]:
start = np.random.randint(lower_bound, upper_bound)
end = start + base**l - 1
query = [(int(start), int(end))] * dimensions
print(query)

[(3399, 3526), (3399, 3526), (3399, 3526)]


In [35]:
levels = 10
base = 2
dimensions = 3

lower_bound = 3000
upper_bound = 7300 - base ** levels
n_samples = 100

starts = np.random.randint(lower_bound, upper_bound, dimensions)
ends = starts + base**l - 1
query = [(int(x), int(y)) for x, y in zip(starts, ends)]

print(starts)
print(ends)
print(query)

[5534 3450 4220]
[6045 3961 4731]
[(5534, 6045), (3450, 3961), (4220, 4731)]


In [59]:
range(3)

range(0, 3)

In [54]:
true_result = df.query(" and ".join([f"d_{i} >= {query[i][0]} and d_{i} <= {query[i][1]}" for i in np.arange(dimensions)])).shape[0]
true_result

44

In [None]:
query_conditions = [ for i in range(3)]
query_conditions

TypeError: 'numpy.ndarray' object is not callable

In [51]:
test = []
for i in np.arange(3):
    test.append(f"d_{i} >= {query[i][0]} and d_{i} <= {query[i][1]}")

In [52]:
" and ".join([f"d_{i} >= {query[i][0]} and d_{i} <= {query[i][1]}" for i in np.arange(dimensions)])

'd_0 >= 5534 and d_0 <= 6045 and d_1 >= 3450 and d_1 <= 3961 and d_2 >= 4220 and d_2 <= 4731'