In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

# Populate the process environment via python-dotenv so credentials stay out
# of the notebook source.
load_dotenv()

# Open the notebook-wide Snowflake connection. User, password and account are
# read from the environment (os.getenv yields None when a variable is unset);
# the warehouse falls back to COMPUTE_WH, while database and schema are fixed.
conn = snowflake.connector.connect(**{
    'user': os.getenv('SNOWFLAKE_USER'),
    'password': os.getenv('SNOWFLAKE_PASSWORD'),
    'account': os.getenv('SNOWFLAKE_ACCOUNT'),
    'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    'database': 'INCREMENTALITY',
    'schema': 'INCREMENTALITY_RESEARCH',
})

print("Connected")

Connected


In [3]:
# Sanity check: aggregate a single day of PURCHASES per product before running
# the full date-range export. The window is half-open
# [2025-07-01, 2025-07-02), so it covers exactly one calendar day.
# NOTE(review): UNIT_PRICE is presumably stored in cents (hence the
# `revenue_cents` alias) - confirm against the table definition.
query = """
SELECT
    PRODUCT_ID,
    COUNT(*) AS purchases,
    SUM(QUANTITY) AS units,
    SUM(QUANTITY * UNIT_PRICE) AS revenue_cents
FROM PURCHASES
WHERE
    PURCHASED_AT >= '2025-07-01'::TIMESTAMP_NTZ
    AND PURCHASED_AT < '2025-07-02'::TIMESTAMP_NTZ
GROUP BY PRODUCT_ID
"""

# Use the cursor as a context manager so it is closed even if the query
# raises, instead of leaving an open cursor behind on the shared connection.
with conn.cursor() as cursor:
    cursor.execute(query)
    results = cursor.fetchall()

# Column names are assigned positionally, in the same order as the SELECT list.
df = pd.DataFrame(results, columns=['product_id', 'purchases', 'units', 'revenue_cents'])
print(f"Got {len(df)} rows")
df.head()

Got 125952 rows


Unnamed: 0,product_id,purchases,units,revenue_cents
0,684bebdd842dd45b19d96fcd,1,1,500
1,6861810b2061e405e1aaa0eb,1,1,2000
2,638030667dfcc2b9ef83ff17,1,1,1500
3,68275b7449e17b77e67a3126,1,1,2000
4,678c01f6b9db3a5f6626dc5f,1,1,1500


In [None]:
import os
from datetime import datetime, timedelta
from pathlib import Path

# Export per-product daily purchase aggregates, issuing one query per day.
#
# Resumability: each day's result is written to its own parquet file inside
# `output_dir` BEFORE the checkpoint marker is touched, and a day that is
# skipped on a rerun is reloaded from that file. Keeping the results only in
# memory would make a rerun after an interruption skip the checkpointed days
# and write a final file that silently lacks them.

# Date range (both ends inclusive)
START_DATE = '2025-03-14'
END_DATE = '2025-09-07'

# Create output directory (holds per-day parquet files and checkpoint markers)
output_dir = Path('product_daily_purchases')
output_dir.mkdir(exist_ok=True)

# Generate all dates
current_date = datetime.strptime(START_DATE, '%Y-%m-%d')
end_date = datetime.strptime(END_DATE, '%Y-%m-%d')

# Positional names for the SELECT list below; keep the two in the same order.
columns = ['product_id', 'date', 'units_sold', 'revenue_dollars', 'avg_unit_price']

all_results = []
failed_dates = []  # days whose query raised, reported in the final summary

while current_date <= end_date:
    # Advance the loop variable up front so every path below (skip, success,
    # failure) moves on to the next day without repeating the increment.
    day = current_date
    current_date += timedelta(days=1)

    date_str = day.strftime('%Y-%m-%d')
    next_date_str = current_date.strftime('%Y-%m-%d')

    checkpoint_file = output_dir / f"checkpoint_{date_str}.txt"
    day_file = output_dir / f"{date_str}.parquet"

    # A day counts as done only when both the marker and its data exist. A
    # marker without a data file (e.g. left by a run that kept results only
    # in memory) carries nothing to reload, so that day is queried again.
    if checkpoint_file.exists() and day_file.exists():
        all_results.append(pd.read_parquet(day_file))
        print(f"Skipping {date_str} - already processed")
        continue

    print(f"Processing {date_str}...", end='')

    # The interpolated values come from strftime above, not from user input.
    # The window is half-open: [date_str 00:00, next_date_str 00:00).
    # NOTE(review): dividing by 100.0 presumes UNIT_PRICE is in cents - confirm.
    query = f"""
    SELECT
        PRODUCT_ID,
        '{date_str}'::DATE as date,
        SUM(QUANTITY) AS units_sold,
        SUM(QUANTITY * UNIT_PRICE) / 100.0 AS revenue_dollars,
        AVG(UNIT_PRICE) / 100.0 AS avg_unit_price
    FROM PURCHASES
    WHERE
        PURCHASED_AT >= '{date_str}'::TIMESTAMP_NTZ
        AND PURCHASED_AT < '{next_date_str}'::TIMESTAMP_NTZ
    GROUP BY PRODUCT_ID
    """

    try:
        # Context manager closes the cursor even when execute/fetch raises.
        with conn.cursor() as cursor:
            cursor.execute(query)
            results = cursor.fetchall()

        df_day = pd.DataFrame(results, columns=columns)

        # Persist the data first, then mark the day complete: an interruption
        # between the two leaves no marker, so the day is simply re-queried.
        df_day.to_parquet(day_file, index=False)
        checkpoint_file.touch()

        all_results.append(df_day)

        print(f" {len(df_day)} products, ${df_day['revenue_dollars'].sum():,.2f} revenue")

    except Exception as e:
        # Best effort: keep going with the remaining days, but remember the
        # failure so the summary can flag the export as incomplete.
        failed_dates.append(date_str)
        print(f" FAILED: {e}")

# Combine all results and save
if all_results:
    df_final = pd.concat(all_results, ignore_index=True)
    output_file = 'product_daily_purchases.parquet'
    df_final.to_parquet(output_file, index=False)
    print("\n=== FINAL SUMMARY ===")
    print(f"Saved {len(df_final):,} total rows to {output_file}")
    print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")
    print(f"Unique products: {df_final['product_id'].nunique():,}")
    print(f"Total revenue: ${df_final['revenue_dollars'].sum():,.2f}")
else:
    print("No data collected")

# Failed days have no checkpoint, so rerunning this cell retries only them.
if failed_dates:
    print(f"WARNING: {len(failed_dates)} day(s) failed and are missing from the output: {', '.join(failed_dates)}")

Processing 2025-03-14... 134987 products, $5,479,403.45 revenue
Processing 2025-03-15... 139931 products, $5,549,483.55 revenue
Processing 2025-03-16... 154200 products, $6,136,198.04 revenue
Processing 2025-03-17... 132478 products, $5,338,554.86 revenue
Processing 2025-03-18... 122066 products, $4,920,453.93 revenue
Processing 2025-03-19... 124741 products, $5,075,497.60 revenue
Processing 2025-03-20... 127585 products, $5,105,777.90 revenue
Processing 2025-03-21... 135142 products, $5,466,935.00 revenue
Processing 2025-03-22... 137343 products, $5,513,254.68 revenue
Processing 2025-03-23... 152562 products, $5,944,814.20 revenue
Processing 2025-03-24... 135434 products, $5,343,544.89 revenue
Processing 2025-03-25... 126705 products, $5,088,534.30 revenue
Processing 2025-03-26... 127048 products, $5,189,178.19 revenue
Processing 2025-03-27... 129513 products, $5,233,173.44 revenue
Processing 2025-03-28... 134964 products, $5,484,752.15 revenue
Processing 2025-03-29... 137429 products

: 