# Extract Daily Impressions Data
This notebook extracts daily impression counts per product, vendor, and campaign from Snowflake.

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from datetime import datetime, timedelta
from pathlib import Path

# Pull variables from a local .env file into the process environment before
# any of them are read below.
load_dotenv()

# Single lookup helper so every setting is read the same way; a missing
# variable yields None (or the given fallback for the warehouse).
read_env = os.environ.get

# Open the session used by every later cell. Credentials and account come from
# the environment; database and schema are fixed for this extraction.
conn = snowflake.connector.connect(
    schema='INCREMENTALITY_RESEARCH',
    database='INCREMENTALITY',
    warehouse=read_env('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    account=read_env('SNOWFLAKE_ACCOUNT'),
    password=read_env('SNOWFLAKE_PASSWORD'),
    user=read_env('SNOWFLAKE_USER'),
)

print("Connected to Snowflake")

In [None]:
# Test query - confirm the IMPRESSIONS table is readable before starting the
# long-running extraction, and show a small labelled sample of its rows.
test_query = """
SELECT * FROM IMPRESSIONS LIMIT 5;
"""

print("Testing IMPRESSIONS table access...")
# The context manager closes the cursor even when the query raises, so the
# smoke test does not leave an open cursor behind on the shared connection.
with conn.cursor() as cursor:
    cursor.execute(test_query)
    results = cursor.fetchall()
    # DB-API: the first item of each description entry is the column name.
    # Using it labels the sample instead of showing integer column headers.
    column_names = [column[0] for column in cursor.description]

df_test = pd.DataFrame(results, columns=column_names)
print(f"✅ Got {len(df_test)} test records")
print(df_test)

In [None]:
# Main extraction loop - one day at a time.
#
# For every calendar day in [START_DATE, END_DATE] the loop counts impressions
# per (date, product, vendor, campaign) in Snowflake. Each day's result is
# persisted as its own parquet file in the checkpoint directory, so a re-run
# reloads finished days from disk. An empty marker file alone cannot restore
# the rows, which previously made resumed runs overwrite the combined output
# with only the newly fetched days.
START_DATE = '2025-03-14'
END_DATE = '2025-09-07'

# Create local checkpoint directory
checkpoint_dir = Path('daily_granular_impressions')
checkpoint_dir.mkdir(exist_ok=True)

# External drive path for final output
EXTERNAL_DRIVE_PATH = '/Volumes/rawat/data/topsort/daily_summaries'

# Generate all dates
current_date = datetime.strptime(START_DATE, '%Y-%m-%d')
end_date = datetime.strptime(END_DATE, '%Y-%m-%d')

all_results = []

while current_date <= end_date:
    date_str = current_date.strftime('%Y-%m-%d')
    # Exclusive upper bound of the one-day window, computed once per iteration.
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

    # Marker kept for compatibility with earlier runs; the parquet file next to
    # it is what actually makes a day resumable.
    checkpoint_file = checkpoint_dir / f"checkpoint_{date_str}.txt"
    day_file = checkpoint_dir / f"impressions_{date_str}.parquet"

    # Check if already processed. A day counts as done only when its data file
    # exists; a marker without data (written by an older run) is re-queried
    # because its rows are not recoverable.
    if day_file.exists():
        all_results.append(pd.read_parquet(day_file))
        print(f"Skipping {date_str} - already processed")
        current_date += timedelta(days=1)
        continue

    print(f"Processing {date_str}...", end='')

    # Query for one day only (half-open interval [date_str, next_date_str))
    query = f"""
    SELECT
        DATE(OCCURRED_AT) AS date,
        PRODUCT_ID,
        VENDOR_ID,
        CAMPAIGN_ID,
        COUNT(*) AS impressions
    FROM IMPRESSIONS
    WHERE OCCURRED_AT >= '{date_str}' AND OCCURRED_AT < '{next_date_str}'
    GROUP BY DATE(OCCURRED_AT), PRODUCT_ID, VENDOR_ID, CAMPAIGN_ID
    """

    try:
        # Close the cursor after every day instead of leaking one per iteration.
        with conn.cursor() as cursor:
            cursor.execute(query)
            results = cursor.fetchall()

        df_day = pd.DataFrame(results, columns=['date', 'product_id', 'vendor_id', 'campaign_id', 'impressions'])

        # Save checkpoint: write to a temporary name and rename, so an
        # interrupted write never leaves a partial file that a later run would
        # treat as a completed day.
        tmp_file = day_file.with_name(day_file.name + '.tmp')
        df_day.to_parquet(tmp_file, index=False)
        tmp_file.replace(day_file)
        checkpoint_file.touch()

        all_results.append(df_day)

        print(f" {len(df_day):,} rows, {df_day['impressions'].sum():,} impressions")

    except Exception as e:
        # Stop at the first failing day; completed days stay on disk and are
        # reloaded by the next run. The combined file written below then covers
        # only the days up to this point.
        print(f" FAILED: {e}")
        break

    current_date += timedelta(days=1)

# Combine all results and save to external drive
if all_results:
    df_final = pd.concat(all_results, ignore_index=True)
    # Make sure the target folder exists so the final write does not fail on a
    # missing directory after the whole extraction has already run.
    Path(EXTERNAL_DRIVE_PATH).mkdir(parents=True, exist_ok=True)
    output_file = f'{EXTERNAL_DRIVE_PATH}/daily_product_vendor_campaign_impressions.parquet'
    df_final.to_parquet(output_file, index=False)

    print(f"\n=== FINAL SUMMARY ===")
    print(f"Saved {len(df_final):,} total rows to {output_file}")
    print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")
    print(f"Unique products: {df_final['product_id'].nunique():,}")
    print(f"Unique vendors: {df_final['vendor_id'].nunique():,}")
    print(f"Unique campaigns: {df_final['campaign_id'].nunique():,}")
    print(f"Total impressions: {df_final['impressions'].sum():,}")
else:
    print("No data collected")

In [None]:
# Close the Snowflake connection opened in the first cell. Run this only after
# the query cells above have finished, since they all reuse `conn`.
conn.close()
print("Connection closed")