In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

# Load settings before reading the environment below. load_dotenv comes from
# python-dotenv and is expected to populate os.environ from a local .env file.
load_dotenv()

# Short alias for environment lookups; a missing variable yields None
# unless a fallback is supplied.
read_env = os.environ.get

# Open the session used by the cells below. Credentials and account come from
# the environment; the warehouse falls back to COMPUTE_WH when unset, while
# the database and schema are fixed for this analysis.
conn = snowflake.connector.connect(
    schema='INCREMENTALITY_RESEARCH',
    database='INCREMENTALITY',
    warehouse=read_env('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    account=read_env('SNOWFLAKE_ACCOUNT'),
    password=read_env('SNOWFLAKE_PASSWORD'),
    user=read_env('SNOWFLAKE_USER'),
)

print("Connected")

Connected


In [7]:

import os
from datetime import datetime, timedelta
from pathlib import Path

# Export daily click counts per (date, product, vendor, campaign) for every
# day in [START_DATE, END_DATE], one query per day, then combine everything
# into a single parquet file.
#
# Each day's rows are written to their own parquet file inside `output_dir`
# as soon as they are fetched. That file doubles as the resume checkpoint:
# on a re-run, finished days are reloaded from disk instead of re-queried,
# so an interrupted run no longer drops the days it had already completed.

# Date range (both endpoints inclusive)
START_DATE = '2025-03-14'
END_DATE = '2025-09-07'

# Create output directory (holds per-day data files and checkpoint markers)
output_dir = Path('daily_granular_clicks')
output_dir.mkdir(exist_ok=True)

# Generate all dates
current_date = datetime.strptime(START_DATE, '%Y-%m-%d')
end_date = datetime.strptime(END_DATE, '%Y-%m-%d')

all_results = []
failed_date = None  # set to the day whose query failed, if any

while current_date <= end_date:
    date_str = current_date.strftime('%Y-%m-%d')
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

    day_file = output_dir / f"clicks_{date_str}.parquet"
    checkpoint_file = output_dir / f"checkpoint_{date_str}.txt"

    # A day counts as processed only when its data file exists. A bare
    # checkpoint marker (as written by earlier versions of this cell) carries
    # no rows, so such days are queried again rather than silently skipped.
    if day_file.exists():
        print(f"Skipping {date_str} - already processed")
        all_results.append(pd.read_parquet(day_file))
        current_date += timedelta(days=1)
        continue

    print(f"Processing {date_str}...", end='')

    # Query for one day only (half-open interval: [date_str, next_date_str))
    query = f"""
    SELECT
        DATE(OCCURRED_AT) AS date,
        PRODUCT_ID,
        VENDOR_ID, 
        CAMPAIGN_ID,
        COUNT(*) AS clicks
    FROM CLICKS
    WHERE OCCURRED_AT >= '{date_str}' AND OCCURRED_AT < '{next_date_str}'
    GROUP BY DATE(OCCURRED_AT), PRODUCT_ID, VENDOR_ID, CAMPAIGN_ID
    """

    try:
        # The context manager closes the cursor even when the query raises.
        with conn.cursor() as cursor:
            cursor.execute(query)
            results = cursor.fetchall()

        df_day = pd.DataFrame(results, columns=['date', 'product_id', 'vendor_id', 'campaign_id', 'clicks'])

        # Write to a temporary name first and rename afterwards, so a run
        # interrupted mid-write never leaves a truncated file that would be
        # mistaken for a finished day.
        tmp_file = day_file.with_name(day_file.name + '.tmp')
        df_day.to_parquet(tmp_file, index=False)
        tmp_file.replace(day_file)

        # Save checkpoint (marker kept for compatibility with earlier runs)
        checkpoint_file.touch()

        all_results.append(df_day)

        print(f" {len(df_day):,} rows, {df_day['clicks'].sum():,} clicks")

    except Exception as e:
        print(f" FAILED: {e}")
        failed_date = date_str
        break

    current_date += timedelta(days=1)

if failed_date is not None:
    # Make it explicit that whatever is saved below is incomplete.
    print(f"Stopped at {failed_date}; later days are missing. Re-run this cell to resume.")

# Combine all results
if all_results:
    df_final = pd.concat(all_results, ignore_index=True)
    output_file = 'daily_product_vendor_campaign_clicks.parquet'
    df_final.to_parquet(output_file, index=False)

    print(f"\n=== FINAL SUMMARY ===")
    print(f"Saved {len(df_final):,} total rows to {output_file}")
    print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")
    print(f"Unique products: {df_final['product_id'].nunique():,}")
    print(f"Unique vendors: {df_final['vendor_id'].nunique():,}")
    print(f"Unique campaigns: {df_final['campaign_id'].nunique():,}")
    print(f"Total clicks: {df_final['clicks'].sum():,}")
else:
    print("No data collected")

Processing 2025-03-14... 289,743 rows, 476,306 clicks
Processing 2025-03-15... 315,281 rows, 527,057 clicks
Processing 2025-03-16... 321,127 rows, 534,071 clicks
Processing 2025-03-17... 283,847 rows, 463,985 clicks
Processing 2025-03-18... 293,106 rows, 482,605 clicks
Processing 2025-03-19... 285,193 rows, 461,796 clicks
Processing 2025-03-20... 287,043 rows, 461,217 clicks
Processing 2025-03-21... 294,073 rows, 468,827 clicks
Processing 2025-03-22... 321,064 rows, 518,526 clicks
Processing 2025-03-23... 328,001 rows, 537,301 clicks
Processing 2025-03-24... 299,747 rows, 488,395 clicks
Processing 2025-03-25... 307,374 rows, 511,954 clicks
Processing 2025-03-26... 309,743 rows, 517,883 clicks
Processing 2025-03-27... 310,835 rows, 521,574 clicks
Processing 2025-03-28... 312,048 rows, 521,893 clicks
Processing 2025-03-29... 334,594 rows, 562,562 clicks
Processing 2025-03-30... 338,423 rows, 569,625 clicks
Processing 2025-03-31... 306,162 rows, 502,463 clicks
Processing 2025-04-01... 314

: 

In [None]:
import os
from datetime import datetime, timedelta
from pathlib import Path

# Export daily click counts per (product, campaign, vendor) for every day in
# [START_DATE, END_DATE], one query per day, then combine everything into a
# single parquet file. A failed day is reported and the loop moves on.
#
# Each day's rows are written to their own parquet file inside `output_dir`
# as soon as they are fetched. That file doubles as the resume checkpoint:
# on a re-run, finished days are reloaded from disk instead of re-queried,
# so an interrupted run no longer drops the days it had already completed.

# Date range (both endpoints inclusive)
START_DATE = '2025-03-14'
END_DATE = '2025-09-07'

# Create output directory (holds per-day data files and checkpoint markers)
output_dir = Path('granular_daily_clicks')
output_dir.mkdir(exist_ok=True)

# Generate all dates
current_date = datetime.strptime(START_DATE, '%Y-%m-%d')
end_date = datetime.strptime(END_DATE, '%Y-%m-%d')

all_results = []
failed_dates = []  # days whose query or save raised; absent from the output

while current_date <= end_date:
    date_str = current_date.strftime('%Y-%m-%d')
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

    day_file = output_dir / f"clicks_{date_str}.parquet"
    checkpoint_file = output_dir / f"checkpoint_{date_str}.txt"

    # A day counts as processed only when its data file exists. A bare
    # checkpoint marker (as written by earlier versions of this cell) carries
    # no rows, so such days are queried again rather than silently skipped.
    if day_file.exists():
        print(f"Skipping {date_str} - already processed")
        all_results.append(pd.read_parquet(day_file))
        current_date += timedelta(days=1)
        continue

    print(f"Processing {date_str}...", end='')

    # One day per query (half-open interval: [date_str, next_date_str)).
    # GROUP BY 1, 2, 3 refers to the first three select-list columns; the
    # date column is a constant for the whole result set.
    query = f"""
    SELECT
        PRODUCT_ID,
        CAMPAIGN_ID,
        VENDOR_ID,
        '{date_str}'::DATE as date,
        COUNT(*) AS clicks
    FROM CLICKS
    WHERE
        OCCURRED_AT >= '{date_str}'::TIMESTAMP_NTZ
        AND OCCURRED_AT < '{next_date_str}'::TIMESTAMP_NTZ
    GROUP BY 1, 2, 3
    """

    try:
        # The context manager closes the cursor even when the query raises.
        with conn.cursor() as cursor:
            cursor.execute(query)
            results = cursor.fetchall()

        # Must match the select-list order of the query above.
        columns = ['product_id', 'campaign_id', 'vendor_id', 'date', 'clicks']

        df_day = pd.DataFrame(results, columns=columns)

        # Write to a temporary name first and rename afterwards, so a run
        # interrupted mid-write never leaves a truncated file that would be
        # mistaken for a finished day.
        tmp_file = day_file.with_name(day_file.name + '.tmp')
        df_day.to_parquet(tmp_file, index=False)
        tmp_file.replace(day_file)

        # Save checkpoint (marker kept for compatibility with earlier runs)
        checkpoint_file.touch()

        all_results.append(df_day)

        print(f" {len(df_day)} rows, {df_day['clicks'].sum():,} total clicks")

    except Exception as e:
        # Best effort: record the failure and carry on with the next day.
        print(f" FAILED: {e}")
        failed_dates.append(date_str)

    current_date += timedelta(days=1)

if failed_dates:
    # Make it explicit that whatever is saved below is incomplete.
    print(f"{len(failed_dates)} day(s) failed and are missing: {', '.join(failed_dates)}. Re-run this cell to retry them.")

# Combine all results and save
if all_results:
    df_final = pd.concat(all_results, ignore_index=True)
    output_file = 'granular_daily_clicks.parquet'
    df_final.to_parquet(output_file, index=False)
    print(f"\n=== FINAL SUMMARY ===")
    print(f"Saved {len(df_final):,} total rows to {output_file}")
    print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")
    print(f"Unique products: {df_final['product_id'].nunique():,}")
    print(f"Unique campaigns: {df_final['campaign_id'].nunique():,}")
    print(f"Unique vendors: {df_final['vendor_id'].nunique():,}")
    print(f"Total clicks: {df_final['clicks'].sum():,}")
else:
    print("No data collected")