In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

load_dotenv()

conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH'
)

print("Connected")

Connected


In [2]:
query = """
SELECT
    PRODUCT_ID,
    COUNT(*) AS purchases,
    SUM(QUANTITY) AS units,
    SUM(QUANTITY * UNIT_PRICE) AS revenue_cents
FROM PURCHASES
WHERE
    PURCHASED_AT >= '2025-07-01'::TIMESTAMP_NTZ
    AND PURCHASED_AT < '2025-07-02'::TIMESTAMP_NTZ
GROUP BY PRODUCT_ID
"""

cursor = conn.cursor()
cursor.execute(query)
results = cursor.fetchall()

df = pd.DataFrame(results, columns=['product_id', 'purchases', 'units', 'revenue_cents'])
print(f"Got {len(df)} rows")
df.head()

Got 125952 rows


Unnamed: 0,product_id,purchases,units,revenue_cents
0,6861fe83850012da8905c1d8,1,1,1200
1,64b44c338634cb9c8b5c26aa,1,1,19900
2,660f7b17b635f8f5feb9d525,1,1,2000
3,66eef251ce706502c3f06e31,1,1,1000
4,630123ec253a8cba6646c89a,1,1,3500


In [None]:
import pandas as pd
from pathlib import Path
from datetime import date, timedelta
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import snowflake.connector

# CELL 1: SETUP AND CONFIGURATION
load_dotenv()
# conn = snowflake.connector.connect(...) 

ANALYSIS_START = date(2025, 3, 14)
ANALYSIS_END = date(2025, 9, 7)

BASE_PATH = Path("/Users/pranjal/Code/marketplace-incrementality/daily_summaries/data")
OUTPUT_DIR = BASE_PATH / "product_daily_purchases_dataset"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"--- Configuration ---")
print(f"Analysis Period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"Output Directory: {OUTPUT_DIR}")

# CELL 2: DAILY PRODUCT-LEVEL PURCHASES EXTRACTION

def get_purchases_query(date_str: str, next_date_str: str) -> str:
    return f"""
    SELECT
        PRODUCT_ID,
        '{date_str}'::DATE AS date,
        COUNT(DISTINCT PURCHASE_ID) AS purchases,
        COUNT(*) AS lines_sold,
        COALESCE(SUM(QUANTITY), 0) AS units_sold,
        COALESCE(SUM(QUANTITY * UNIT_PRICE), 0) AS revenue_cents,
        AVG(UNIT_PRICE) AS avg_unit_price_cents,
        MIN(UNIT_PRICE) AS min_unit_price_cents,
        MAX(UNIT_PRICE) AS max_unit_price_cents,
        STDDEV(UNIT_PRICE) AS stddev_unit_price_cents,
        APPROX_COUNT_DISTINCT(USER_ID) AS distinct_users_purchased
    FROM
        PURCHASES
    WHERE
        PURCHASED_AT >= '{date_str}'::TIMESTAMP_NTZ AND PURCHASED_AT < '{next_date_str}'::TIMESTAMP_NTZ
        AND PRODUCT_ID IS NOT NULL
    GROUP BY
        PRODUCT_ID;
    """

print("\n--- Starting Daily Product-Level Purchases Extraction ---")
date_list = pd.date_range(start=ANALYSIS_START, end=ANALYSIS_END, freq='D')

for current_date in tqdm(date_list, desc="Extracting Daily Purchases"):
    date_str = current_date.strftime('%Y-%m-%d')
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')
    
    output_file = OUTPUT_DIR / f"data_{date_str}.parquet"

    if output_file.exists():
        continue

    try:
        query = get_purchases_query(date_str, next_date_str)
        df_day = pd.read_sql(query, conn)

        if not df_day.empty:
            df_day.to_parquet(output_file, index=False, engine='pyarrow', compression='snappy')

    except Exception as e:
        print(f"\nProcessing for {date_str} FAILED: {e}")
        
print("\n--- Daily Purchase Extraction Complete ---")


# CELL 3: VERIFICATION

print("\n--- Verifying the Complete Purchases Dataset ---")
try:
    df_full_purchases = pd.read_parquet(OUTPUT_DIR)
    
    print(f"✅ Successfully loaded complete dataset with {len(df_full_purchases):,} rows.")
    
    print(f"\n=== PURCHASES EXTRACTION FINAL SUMMARY ===")
    print(f"Date range: {df_full_purchases['DATE'].min()} to {df_full_purchases['DATE'].max()}")
    print(f"Unique products with purchases: {df_full_purchases['PRODUCT_ID'].nunique():,}")
    
    total_revenue_dollars = df_full_purchases['REVENUE_CENTS'].sum() / 100.0
    print(f"Total revenue: ${total_revenue_dollars:,.2f}")
    
    print("\nSample of 5 rows:")
    print(df_full_purchases.head().to_markdown(index=False))
    
    print("\nSchema and Memory Usage:")
    df_full_purchases.info(memory_usage='deep')

except Exception as e:
    print(f"\n❌ Could not load or verify the full dataset. Error: {e}")
    print("Please check the contents of the output directory.")

--- Configuration ---
Analysis Period: 2025-03-14 to 2025-09-07
Output Directory: /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/product_daily_purchases_dataset

--- Starting Daily Product-Level Purchases Extraction ---


Extracting Daily Purchases:   0%|          | 0/178 [00:00<?, ?it/s]

  df_day = pd.read_sql(query, conn)



--- Daily Purchase Extraction Complete ---

--- Verifying the Complete Purchases Dataset ---
✅ Successfully loaded complete dataset with 23,765,830 rows.

=== PURCHASES EXTRACTION FINAL SUMMARY ===

❌ Could not load or verify the full dataset. Error: 'datetime.date' object has no attribute 'date'
Please check the contents of the output directory.
