In [1]:
import sys
# append the path of the parent directory
sys.path.append("..")

In [2]:
import math
import os

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd
import datetime

import random
random.seed(7)

import orjson
import gzip

from importlib import reload
from lib import sketches, visualization_utils, encoders, ploting, pacha_sketch_new

reload(pacha_sketch_new)
from lib.pacha_sketch_new import ADTree

  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)


# Bank Marketing

https://archive.ics.uci.edu/dataset/222/bank+marketing

## Data Cleaning

In [None]:
df = pd.read_parquet("data/raw/bank_marketing/bank_full.parquet")

In [None]:
# Convert month and day to day of the year indexed from 0
df['date'] = df.apply(lambda row: (datetime.datetime.strptime(f"{row['month']} {row['day']}", "%b %d").timetuple().tm_yday - 1), axis=1)

In [None]:
clean_df = df[['poutcome', 'job', 'education', 'housing', 'loan', 'marital', 'duration', 'balance','campaign', 'age', 'date']]

In [None]:
clean_df.to_parquet("data/clean/bank_marketing.parquet", index=False)

## Queries and AD-Tree Generation

In [3]:
bank_df = pd.read_parquet("data/clean/bank_marketing.parquet")
cat_cols = ['poutcome', 'job', 'education', 'housing', 'loan', 'marital']
num_cols = ['duration', 'balance','campaign', 'age', 'date']

In [4]:
bank_df.nunique()

poutcome        4
job            12
education       4
housing         2
loan            2
marital         3
duration     1573
balance      7168
campaign       48
age            77
date          318
dtype: int64

In [None]:
ad_tree = ADTree()

for col in cat_cols:
    ad_tree.add_dimension(set(bank_df[col].unique().tolist()), name=col)

ad_tree.save_to_file("sketches/ad_trees/bank_marketing.json")
print(ad_tree.names)

In [None]:
df =bank_df
col = 'balance'
range_size = 0.1
max_val = int(df[col].max())
min_val = int(df[col].min())
range_size = int((max_val - min_val) * range_size)



In [None]:
start = random.randint(min_val, max_val-range_size)
end = start + range_size

In [None]:
start

In [5]:
def generate_queries(df: pd.DataFrame, num_cols: list, cat_cols: list, num_queries: int = 200, n_cat: int = 1, n_num: int = 1,
                      decay_rate: float = 0.5, range_portion:float = 0.1, dataset_name: str = None, file_path: str = None) -> list:
    # Exponentially decaying probability for each index - smaller decay_rate means faster decay
    probs_cat = np.array([decay_rate**i for i in range(len(cat_cols))])
    probs_cat = probs_cat / probs_cat.sum()

    probs_num = np.array([decay_rate**i for i in range(len(num_cols))])
    probs_num = probs_num / probs_num.sum()

    queries = []
    for i in range(num_queries):
        filter_predicates = {}

        picked_cat_cols = np.random.choice(cat_cols, size=n_cat, replace=False, p=probs_cat)
        for col in picked_cat_cols:
            val_counts = df[col].value_counts(normalize=True)
            predicate = set(np.random.choice(val_counts.index, size=1, p=val_counts.values))
            filter_predicates[col] = predicate

        picked_num_cols = np.random.choice(num_cols, size=n_num, replace=False, p=probs_num)
        for col in picked_num_cols:
            max_val = int(df[col].max())
            min_val = int(df[col].min())
            range_size = int((max_val - min_val) * range_portion)
            start = random.randint(min_val, max_val-range_size)
            end = start + range_size
            filter_predicates[col] = (start, end)

        all_predicates = []
        for col in df.columns:
            if col in filter_predicates:
                all_predicates.append(filter_predicates[col])
            else:
                all_predicates.append("*")
        queries.append(all_predicates)
    
    query_set = {
        "dataset_name": dataset_name,
        "n_cat": n_cat,
        "n_num": n_num,
        "num_queries": num_queries,
        "range_size": range_size,
        "queries": queries
    }

    if file_path is not None:
        if file_path.endswith('.gz'):
            with gzip.open(file_path, "wb") as f:
                f.write(orjson.dumps(query_set))
        else:
            with open(file_path, 'wb') as f:
                f.write(orjson.dumps(query_set))


In [8]:
generate_queries(df=bank_df, num_cols=num_cols, cat_cols=cat_cols, num_queries=3, n_cat=1, n_num=1, dataset_name="bank_marketing_2_cols", file_path="queries/bank_marketing_2_cols.json")

TypeError: Type is not JSON serializable: set

In [None]:
generate_queries(df=bank_df, num_cols=num_cols, cat_cols=cat_cols, num_queries=200, n_cat=2, n_num=2, dataset_name="bank_marketing_4_cols", file_path="queries/bank_marketing_4_cols.json")

In [None]:
# Queries with two predicates
n_cat = 1
n_num = 1

range_size = 0.1

predicates = {}

cat_cols = np.random.choice(cat_cols, size=n_cat, replace=False, p=probs_cat)
cat_predicates = {}
for col in cat_cols:
    val_counts = bank_df[col].value_counts(normalize=True)
    predicate = set(np.random.choice(val_counts.index, size=1, p=val_counts.values))
    predicates[col] = predicate

# print(cat_predicates) 

num_cols = np.random.choice(num_cols, size=n_num, replace=False, p=probs_num)
num_predicates = {}
for col in num_cols:
    max_val = bank_df[col].max()
    min_val = bank_df[col].min()
    range_size = int((max_val - min_val) * range_size)

    start = random.randint(min_val, max_val-range_size)
    end = start + range_size
    predicates[col] = (start, end)

print(predicates)

final_predicates = []
for col in bank_df.columns:
    if col in predicates:
        final_predicates.append(predicates[col])
    else:
        final_predicates.append("*")
        
print(final_predicates)


In [None]:
'poutcome' in predicates

# Online Retail

https://archive.ics.uci.edu/dataset/352/online+retail

In [None]:
df = pd.read_parquet("data/raw/online_retail/online_retail.parquet")

In [None]:
df['category'] = df['StockCode'].str.slice(0, 3)
df['date'] = pd.to_datetime(df['InvoiceDate'], format="%d.%m.%Y %H:%M").dt.strftime('%d.%m.%Y').rank(method='dense').astype(int) - 1

In [None]:
df_customer = pd.DataFrame(df['CustomerID'].unique(), columns=['CustomerID'])
df_customer['age'] = generate_bounded_normal(loc=35, scale=10, size=len(df_customer), low=18, high=76)
df_customer['gender'] = np.random.choice(['m', 'f', 'd'], size=len(df_customer), replace=True, p=[0.48, 0.49, 03])
df_merged = df.merge(df_customer, on='CustomerID', how='inner')
df_merged["region"] = df_merged['Country']
df_merged['total'] = df_merged['Quantity'] * df_merged['UnitPrice']
df_merged['total'] = df_merged['total'].abs().fillna(0).astype(int)

In [None]:
clean_df = df_merged[['region', 'gender', 'category', 'date', 'total', 'age']].copy()

In [None]:
clean_df.to_parquet("data/clean/online_retail.parquet", index=False)

# Folktables

https://github.com/socialfoundations/folktables

| Column  | Description                                                                       |
| ------- | --------------------------------------------------------------------------------- |
| `SEX`   | **Sex** — 1 for male, 2 for female.                                               |
| `RAC1P` | **Race** — Detailed race code (White, Black, Asian, etc.).                        |
| `SCHL`  | **Educational attainment** — Highest degree or level of school completed.         |
| `MAR`   | **Marital status** — E.g., married, divorced, widowed, never married.             |
| `POBP`  | **Place of birth** — Numeric code for U.S. state or foreign country of birth.     |
| `COW`   | **Class of worker** — Employment type (e.g., private, government, self-employed). |
| `OCCP`  | **Occupation code** — Detailed job classification (4-digit code).                 |
| `AGEP`  | **Age of person** — Age in years (0–99, top-coded at 99).                         |
| `PWGTP` | **Person’s weight** — Statistical weight used to produce population estimates.    |
| `PINCP` | **Total person income** — Total pre-tax income in the past 12 months.             |



In [None]:
from folktables import ACSDataSource, ACSIncome, generate_categories, adult_filter

In [None]:
data_source = ACSDataSource(survey_year='2017', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=False)


In [None]:
cat_cols = ["SEX", "RAC1P", "SCHL", "MAR", "POBP", "COW", "OCCP"]
num_cols = ["AGEP", "PWGTP", "PINCP"]

In [None]:
folk_df = ca_data[cat_cols + num_cols]
folk_df.fillna(0, inplace=True)
folk_df = folk_df.astype(int)

In [None]:
folk_df.nunique()

In [None]:
folk_df.to_parquet("data/clean/acs_folktables.parquet", index=False)

# OLD

### Self-Generated Datasets

In [None]:
def generate_normal_dataset(size, num_columns, mean, std_dev, output_path=None):
    """
    Generates a dataset with the specified parameters where each column
    contains integer values following a normal distribution.

    Parameters:
    - size (int): Number of rows in the dataset.
    - num_columns (int): Number of columns in the dataset.
    - mean (float): Mean of the normal distribution.
    - std_dev (float): Standard deviation of the normal distribution.
    - output_dir (str, optional): Directory to save the dataset in Parquet format. Defaults to None.

    Returns:
    - pd.DataFrame: Generated dataset as a pandas DataFrame.
    """
    # Generate random data for each column
    data = {
        f"d_{i}": np.random.normal(loc=mean, scale=std_dev, size=size).astype(int)
        for i in range(num_columns)
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Save to output directory if specified
    if output_path:
        df.to_parquet(output_path, index=False)
        print(f"Dataset saved to {output_path}")

    return df


In [None]:
df = generate_normal_dataset(size=100000, num_columns=3, mean=5000, std_dev=50, output_path="data/normal_3d_100k.parquet")

In [None]:
df = pd.read_parquet("data/normal_3d_100k.parquet")

In [None]:
df.describe()

## 6 Dimensions Dataset

In [None]:
def generate_bounded_normal(loc, scale, size, low, high):
    result = []
    while len(result) < size:
        samples = np.random.normal(loc=loc, scale=scale, size=size)
        valid_samples = samples[(samples >= low) & (samples <= high)]
        result.extend(valid_samples.astype(int))
    return np.array(result[:size])

In [None]:
def generate_fake_dataset(size, output_path=None):

    regions = [
        "Baden-Württemberg", "Bavaria", "Berlin", "Brandenburg", "Bremen", 
        "Hamburg", "Hesse", "Lower Saxony", "Mecklenburg-Vorpommern", 
        "North Rhine-Westphalia", "Rhineland-Palatinate", "Saarland", 
        "Saxony", "Saxony-Anhalt", "Schleswig-Holstein", "Thuringia"
    ]
    gender = ["m","f","d"]
    product_category = [chr(i) for i in range(97, 123)]

    p_product_category = np.random.zipf(1.5, len(product_category))
    p_product_category = p_product_category / np.sum(p_product_category)

    reference_dist = pd.read_parquet("data/reference_dist.parquet")

    ages_array = generate_bounded_normal(loc=35, scale=10, size=size, low=18, high=76)

    data = {
        "region": np.random.choice(regions, size=size, replace=True),
        "gender": np.random.choice(gender, size=size, replace=True, p=[0.48, 0.49, 03]),
        "category": np.random.choice(product_category, size=size, replace=True, p=p_product_category),
        "date": np.random.choice(reference_dist['date'], size=size, replace=True),
        "total": np.random.choice(reference_dist['total'], size=size, replace=True),
        "age": ages_array      
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Save to output directory if specified
    if output_path:
        df.to_parquet(output_path, index=False)
        print(f"Dataset saved to {output_path}")

    return df


In [None]:
df = generate_fake_dataset(size=200000, output_path="data/paper_example_200k.parquet")

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], bins=30, kde=True, color='blue')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Generate random ages with a normal distribution
ages_array = np.random.normal(loc=35, scale=10, size=1000).astype(int)

# Clip the ages to ensure they fall within the range of 18 to 76
ages_array = np.clip(ages_array, 18, 76)

## Kaggle

## E-Commerce Sales Dataset

In [None]:
df = pd.read_csv("data/raw/amazon_sale_report.csv", low_memory=False)
df.to_parquet("data/raw/amazon_sale_report.parquet", index=False)

In [None]:
df = pd.read_parquet("data/raw/amazon_sale_report.parquet")

In [None]:
df.columns

In [None]:
df_column_subset = df[['Date', 'Status',
       'Style', 'Category', 'Size',
       'Qty', 'Amount',
       'ship-state']].copy()

In [None]:
df_column_subset['total'] = df_column_subset['Amount'].fillna(0).astype(int)

In [None]:
df_column_subset['date'] = pd.to_datetime(df_column_subset['Date'], format='%m-%d-%y').rank(method='dense').astype(int) - 1

In [None]:
df_column_subset[['date', 'total']].to_parquet("data/raw/reference_dist.parquet", index=False)

In [None]:
df_column_subset.nunique()