In [None]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# CONFIG
# Users are sampled by hashing their id into TOTAL_BUCKETS buckets and keeping
# the buckets numbered below SELECTION_THRESHOLD.
SAMPLE_FRACTION = 0.001  # 0.1% of users
TOTAL_BUCKETS = 10000
# round() instead of int(): the float product can land just below the intended
# integer (e.g. 10000 * 0.0003 == 2.9999999999999996), which int() would
# truncate to one bucket too few.
SELECTION_THRESHOLD = round(TOTAL_BUCKETS * SAMPLE_FRACTION)
if not 0 < SELECTION_THRESHOLD <= TOTAL_BUCKETS:
    # A threshold of 0 would silently select no users at all.
    raise ValueError(
        f"SAMPLE_FRACTION={SAMPLE_FRACTION} with TOTAL_BUCKETS={TOTAL_BUCKETS} "
        f"gives an unusable selection threshold of {SELECTION_THRESHOLD}"
    )
START_DATE = '2025-06-15'
END_DATE = '2025-06-22'  # 7 days

# Output directory for the parquet files; parents=True so a nested DATA_DIR
# can be configured without creating it by hand.
DATA_DIR = Path('./data')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Load environment variables (e.g. the SNOWFLAKE_* settings) from a .env file.
load_dotenv()
print(f"Config: {SAMPLE_FRACTION*100}% sample, dates {START_DATE} to {END_DATE}")
print(f"Selection threshold: {SELECTION_THRESHOLD} / {TOTAL_BUCKETS} buckets")

In [None]:
# Fail fast with a readable message when credentials are absent; otherwise the
# missing values would be passed to the connector as None and surface as a
# less obvious login error.
required_env = ('SNOWFLAKE_USER', 'SNOWFLAKE_PASSWORD', 'SNOWFLAKE_ACCOUNT')
missing_env = [name for name in required_env if not os.getenv(name)]
if missing_env:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env)
    )

# Open the Snowflake session used by every query in this notebook. Warehouse,
# database and schema are passed through as-is (None when unset).
conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
    database=os.getenv('SNOWFLAKE_DATABASE'),
    schema=os.getenv('SNOWFLAKE_SCHEMA')
)
print("Connected to Snowflake")

In [None]:
# Shared CTE that is prepended to the auction and event pull queries.
# SAMPLED_USERS holds a hash-based sample of users: each distinct
# OPAQUE_USER_ID seen in AUCTIONS_USERS inside the date window is mapped to
# MOD(ABS(HASH(id)), TOTAL_BUCKETS) and kept when that bucket number is below
# SELECTION_THRESHOLD.
# NOTE(review): BETWEEN includes both endpoints, so how much of END_DATE is
# covered depends on whether CREATED_AT is a DATE or a timestamp - confirm the
# window is the intended 7 days.
CTE_SQL = f"""
WITH SAMPLED_USERS AS (
    SELECT OPAQUE_USER_ID FROM (
        SELECT OPAQUE_USER_ID, MOD(ABS(HASH(OPAQUE_USER_ID)), {TOTAL_BUCKETS}) AS bucket
        FROM (SELECT DISTINCT OPAQUE_USER_ID FROM AUCTIONS_USERS
              WHERE CREATED_AT BETWEEN '{START_DATE}' AND '{END_DATE}')
    ) WHERE bucket < {SELECTION_THRESHOLD}
)
"""
print(f"CTE defined: {SAMPLE_FRACTION*100}% sample, dates {START_DATE} to {END_DATE}")

In [None]:
def _pull_table(label, select_sql):
    """Run ``CTE_SQL + select_sql`` on the open connection and return a DataFrame.

    Prints a "Pulling <label>..." banner before the query and the row count
    after it.

    Column labels are lower-cased before returning: Snowflake reports unquoted
    aliases such as ``as product_id`` in upper case, while the rest of this
    notebook indexes the frames by the lower-case names.
    """
    print(f"Pulling {label}...")
    frame = pd.read_sql(CTE_SQL + select_sql, conn)
    frame.columns = frame.columns.str.lower()
    print(f"  {len(frame):,} rows")
    return frame


# AUCTIONS_USERS
# Every column is qualified with `au.`: OPAQUE_USER_ID exists in both
# AUCTIONS_USERS and SAMPLED_USERS, so an unqualified reference is ambiguous.
auctions_users = _pull_table("AUCTIONS_USERS", f"""
SELECT
    LOWER(HEX_ENCODE(au.AUCTION_ID)) as auction_id,
    TRIM(au.OPAQUE_USER_ID) as user_id,
    au.CREATED_AT as auction_time,
    au.PLACEMENT as placement
FROM AUCTIONS_USERS au
JOIN SAMPLED_USERS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
WHERE au.CREATED_AT BETWEEN '{START_DATE}' AND '{END_DATE}'
""")

# AUCTIONS_RESULTS
# Results carry no user id, so they are restricted to sampled users by joining
# through AUCTIONS_USERS on AUCTION_ID.
# NOTE(review): assumes one AUCTIONS_USERS row per AUCTION_ID; if that does not
# hold, result rows are duplicated by this join - confirm.
auctions_results = _pull_table("AUCTIONS_RESULTS", f"""
SELECT
    LOWER(HEX_ENCODE(ar.AUCTION_ID)) as auction_id,
    LOWER(HEX_ENCODE(ar.VENDOR_ID)) as vendor_id,
    LOWER(HEX_ENCODE(ar.CAMPAIGN_ID)) as campaign_id,
    TRIM(ar.PRODUCT_ID) as product_id,
    ar.RANKING as ranking,
    ar.IS_WINNER as is_winner,
    ar.QUALITY as quality,
    ar.FINAL_BID as final_bid,
    ar.PRICE as price
FROM AUCTIONS_RESULTS ar
JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
JOIN SAMPLED_USERS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
WHERE au.CREATED_AT BETWEEN '{START_DATE}' AND '{END_DATE}'
""")

# IMPRESSIONS
impressions = _pull_table("IMPRESSIONS", f"""
SELECT
    TRIM(i.INTERACTION_ID) as interaction_id,
    TRIM(i.AUCTION_ID) as auction_id,
    TRIM(i.PRODUCT_ID) as product_id,
    TRIM(i.USER_ID) as user_id,
    LOWER(HEX_ENCODE(i.VENDOR_ID)) as vendor_id,
    i.OCCURRED_AT as impression_time
FROM IMPRESSIONS i
JOIN SAMPLED_USERS s ON i.USER_ID = s.OPAQUE_USER_ID
WHERE i.OCCURRED_AT BETWEEN '{START_DATE}' AND '{END_DATE}'
""")

# CLICKS
clicks = _pull_table("CLICKS", f"""
SELECT
    TRIM(c.INTERACTION_ID) as interaction_id,
    TRIM(c.AUCTION_ID) as auction_id,
    TRIM(c.PRODUCT_ID) as product_id,
    TRIM(c.USER_ID) as user_id,
    LOWER(HEX_ENCODE(c.VENDOR_ID)) as vendor_id,
    c.OCCURRED_AT as click_time
FROM CLICKS c
JOIN SAMPLED_USERS s ON c.USER_ID = s.OPAQUE_USER_ID
WHERE c.OCCURRED_AT BETWEEN '{START_DATE}' AND '{END_DATE}'
""")

# PURCHASES
purchases = _pull_table("PURCHASES", f"""
SELECT
    TRIM(p.PURCHASE_ID) as purchase_id,
    p.PURCHASED_AT as purchase_time,
    TRIM(p.PRODUCT_ID) as product_id,
    p.QUANTITY as quantity,
    p.UNIT_PRICE as unit_price,
    TRIM(p.USER_ID) as user_id
FROM PURCHASES p
JOIN SAMPLED_USERS s ON p.USER_ID = s.OPAQUE_USER_ID
WHERE p.PURCHASED_AT BETWEEN '{START_DATE}' AND '{END_DATE}'
""")

In [None]:
def _column(frame, name):
    """Return the column of ``frame`` whose label equals ``name`` ignoring case.

    Snowflake reports unquoted aliases in upper case, so a pulled frame may
    expose PRODUCT_ID rather than product_id; this lookup accepts either.
    Raises KeyError when no column matches.
    """
    by_lower = {str(col).lower(): col for col in frame.columns}
    return frame[by_lower[name]]


# Collect product IDs from event tables only
all_products = set()
for events in (impressions, clicks, purchases):
    all_products.update(_column(events, 'product_id').dropna().unique())
print(f"Unique products to fetch: {len(all_products):,}")

# Pull catalog in batches
# Sorted so batch contents are the same from run to run (set order is not).
product_list = sorted(str(product_id) for product_id in all_products)
batch_size = 10000
catalog_columns = ['product_id', 'name', 'categories', 'price', 'vendors']
catalog_dfs = []

for i in tqdm(range(0, len(product_list), batch_size), desc="Fetching catalog"):
    batch = product_list[i:i+batch_size]
    # Bind the ids as parameters (the connector's default pyformat style)
    # instead of splicing them into the SQL text, so ids containing quotes or
    # backslashes cannot break or alter the statement.
    placeholders = ", ".join(["%s"] * len(batch))
    # TRIM in the WHERE clause as well: the ids collected above were trimmed,
    # so comparing against the raw column would miss padded catalog ids.
    query = f"""
    SELECT
        TRIM(PRODUCT_ID) as product_id,
        NAME as name,
        CATEGORIES as categories,
        PRICE as price,
        VENDORS as vendors
    FROM CATALOG
    WHERE TRIM(PRODUCT_ID) IN ({placeholders})
    """
    batch_df = pd.read_sql(query, conn, params=batch)
    batch_df.columns = batch_df.columns.str.lower()
    catalog_dfs.append(batch_df)

# With no products there is nothing to concatenate; keep the expected columns
# so the saved file still has the catalog schema.
catalog = (
    pd.concat(catalog_dfs, ignore_index=True)
    if catalog_dfs
    else pd.DataFrame(columns=catalog_columns)
)
print(f"Catalog: {len(catalog):,} rows")

In [None]:
# Frames produced by this notebook, keyed by output file stem.
frames_to_save = {
    'auctions_users': auctions_users,
    'auctions_results': auctions_results,
    'impressions': impressions,
    'clicks': clicks,
    'purchases': purchases,
    'catalog': catalog,
}

saved_paths = []
try:
    for stem, frame in frames_to_save.items():
        path = DATA_DIR / f'{stem}.parquet'
        frame.to_parquet(path, index=False)
        saved_paths.append(path)

    # Read each file back as a round-trip check. Only files written by this
    # run are listed, so stale parquet files left in DATA_DIR are not reported
    # as outputs.
    print("\nSaved files:")
    for path in saved_paths:
        print(f"  {path}: {pd.read_parquet(path).shape}")
finally:
    # Release the Snowflake session even when a write or read-back fails.
    conn.close()

print("\nData pull complete.")