In [4]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

# Populate the process environment from a local .env file (python-dotenv)
# before any credential is read, so secrets never appear in the notebook.
load_dotenv()

# Open the shared Snowflake connection used by the cells below.
# Account, user and password must be present in the environment; the warehouse
# falls back to COMPUTE_WH. Database and schema are fixed for this analysis.
conn = snowflake.connector.connect(
    account=os.environ.get('SNOWFLAKE_ACCOUNT'),
    user=os.environ.get('SNOWFLAKE_USER'),
    password=os.environ.get('SNOWFLAKE_PASSWORD'),
    warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH',
)

print("Connected")

Connected


In [5]:

import os
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import snowflake.connector
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# ==============================================================================
# CELL: SETUP AND CONFIGURATION
# ==============================================================================
# Load .env again so this cell picks up environment overrides when run on its
# own. The Snowflake connection (`conn`) is NOT created here; it must already
# exist in the kernel from the connection cell.
load_dotenv()

# First and last calendar day to extract (both inclusive).
ANALYSIS_START = date(2025, 3, 14)
ANALYSIS_END = date(2025, 9, 7)

# Root data directory. Defaults to the original location, but can be redirected
# on another machine by setting INCREMENTALITY_DATA_DIR (in the shell or .env)
# instead of editing the notebook.
BASE_PATH = Path(
    os.environ.get(
        "INCREMENTALITY_DATA_DIR",
        "/Users/pranjal/Code/marketplace-incrementality/daily_summaries/data",
    )
)

# One parquet file per day is written here; created up front so the extraction
# loop can write without further checks.
OUTPUT_DIR = BASE_PATH / "product_daily_impressions_dataset"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("--- Configuration ---")
print(f"Analysis Period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"Output Directory: {OUTPUT_DIR}")


# ==============================================================================
# CELL: DAILY PRODUCT-LEVEL IMPRESSIONS EXTRACTION
# ==============================================================================

def get_impressions_query(date_str: str, next_date_str: str) -> str:
    """Build the SQL text that aggregates one day of IMPRESSIONS per product.

    The two arguments are interpolated directly into the SQL text (they are not
    bind parameters), so callers must pass trusted, pre-formatted date strings.

    Args:
        date_str: Day being summarised; used as the output `date` column and as
            the inclusive lower bound on OCCURRED_AT.
        next_date_str: Exclusive upper bound on OCCURRED_AT.

    Returns:
        A SQL statement producing one row per non-null PRODUCT_ID with the raw
        row count, the distinct INTERACTION_ID count, and an approximate
        distinct USER_ID count.
    """
    # Half-open window [date_str, next_date_str) on the event timestamp.
    window_clause = (
        f"OCCURRED_AT >= '{date_str}'::TIMESTAMP_NTZ"
        f" AND OCCURRED_AT < '{next_date_str}'::TIMESTAMP_NTZ"
    )

    # The leading and trailing entries reproduce the blank first line and the
    # indented last line of the statement.
    sql_lines = [
        "",
        "    SELECT",
        "        PRODUCT_ID,",
        f"        '{date_str}'::DATE AS date,",
        "        ANY_VALUE(VENDOR_ID) AS vendor_id,",
        "        ANY_VALUE(CAMPAIGN_ID) AS campaign_id,",
        "        COUNT(*) AS total_impressions,",
        "        COUNT(DISTINCT INTERACTION_ID) AS impressions,",
        "        APPROX_COUNT_DISTINCT(USER_ID) AS distinct_users_impressed",
        "    FROM",
        "        IMPRESSIONS",
        "    WHERE",
        f"        {window_clause}",
        "        AND PRODUCT_ID IS NOT NULL",
        "    GROUP BY",
        "        PRODUCT_ID;",
        "    ",
    ]
    return "\n".join(sql_lines)

# --- Main Extraction Loop ---
# Runs one query per calendar day and writes one parquet file per day. Days
# whose file already exists are skipped, so re-running the cell resumes an
# interrupted extraction instead of starting over.
print("\n--- Starting Daily Product-Level Impressions Extraction ---")
date_list = pd.date_range(start=ANALYSIS_START, end=ANALYSIS_END, freq='D')

failed_dates = []  # days whose query or write raised; reported after the loop
empty_dates = []   # days that returned no rows; no file is written for these

for current_date in tqdm(date_list, desc="Extracting Daily Impressions"):
    date_str = current_date.strftime('%Y-%m-%d')
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

    output_file = OUTPUT_DIR / f"data_{date_str}.parquet"
    if output_file.exists():
        continue

    # Write to a scratch file and rename only after the write has finished.
    # Otherwise an interrupted write leaves a truncated file under the final
    # name, and the exists() check above would skip that day on every later run.
    # The name is dot-prefixed so a leftover scratch file is treated as hidden.
    tmp_file = OUTPUT_DIR / f".data_{date_str}.parquet.tmp"

    try:
        query = get_impressions_query(date_str, next_date_str)
        # pd.read_sql is kept (despite the pandas warning for non-SQLAlchemy
        # connections) so new files keep the same dtypes as files already written.
        df_day = pd.read_sql(query, conn)

        if df_day.empty:
            empty_dates.append(date_str)
            continue

        df_day.to_parquet(tmp_file, index=False, engine='pyarrow', compression='snappy')
        tmp_file.replace(output_file)

    except Exception as e:
        # Best-effort: one bad day must not abort the whole range, but it is
        # recorded so the gap is visible in the summary below.
        failed_dates.append(date_str)
        print(f"\nProcessing for {date_str} FAILED: {e}")

    finally:
        # Also runs on KeyboardInterrupt, so a half-written scratch file is
        # removed. After a successful rename the scratch file no longer exists.
        if tmp_file.exists():
            tmp_file.unlink()

print("\n--- Daily Impression Extraction Complete ---")
if empty_dates:
    print(f"Days with no impressions (no file written): {len(empty_dates)}")
if failed_dates:
    print(
        f"WARNING: {len(failed_dates)} day(s) FAILED and are missing from the dataset: "
        f"{', '.join(failed_dates)}"
    )

# ==============================================================================
# CELL: VERIFICATION
# ==============================================================================
# Reads every daily parquet file in OUTPUT_DIR back as a single frame and prints
# summary statistics. Any failure is reported rather than raised.

print("\n--- Verifying the Complete Impressions Dataset ---")
try:
    df_full_impressions = pd.read_parquet(OUTPUT_DIR)

    print(f"✅ Successfully loaded complete dataset with {len(df_full_impressions):,} rows.")

    # DATE can come back from parquet as plain datetime.date objects, which have
    # no .date() method. Normalising to datetime64 makes min()/max() return
    # Timestamps whatever the stored type. The frame itself is left unchanged.
    impression_dates = pd.to_datetime(df_full_impressions['DATE'])

    print("\n=== IMPRESSIONS EXTRACTION FINAL SUMMARY ===")
    print(f"Date range: {impression_dates.min().date()} to {impression_dates.max().date()}")
    print(f"Unique products with impressions: {df_full_impressions['PRODUCT_ID'].nunique():,}")

    total_impressions = df_full_impressions['IMPRESSIONS'].sum()
    print(f"Total unique impressions extracted: {total_impressions:,}")

    print("\nSample of 5 rows:")
    sample_rows = df_full_impressions.head()
    try:
        print(sample_rows.to_markdown(index=False))
    except ImportError:
        # to_markdown needs the optional `tabulate` package. Fall back to plain
        # text so a missing extra is not reported as a dataset failure.
        print(sample_rows.to_string(index=False))

    print("\nSchema and Memory Usage:")
    df_full_impressions.info(memory_usage='deep')

except Exception as e:
    print(f"\n❌ Could not load or verify the full dataset. Error: {e}")

--- Configuration ---
Analysis Period: 2025-03-14 to 2025-09-07
Output Directory: /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/product_daily_impressions_dataset

--- Starting Daily Product-Level Impressions Extraction ---


Extracting Daily Impressions:   0%|          | 0/178 [00:00<?, ?it/s]

  df_day = pd.read_sql(query, conn)



--- Daily Impression Extraction Complete ---

--- Verifying the Complete Impressions Dataset ---


: 