In [1]:
import os
import pandas as pd
from tabulate import tabulate
from dotenv import load_dotenv
import snowflake.connector
import sys

# load_dotenv() (python-dotenv) is expected to pull variables from a local .env
# file into the environment so the SNOWFLAKE_* lookups below can resolve.
load_dotenv()

# One Snowflake session for the whole notebook, pinned to the database/schema
# that hold the Clicks and Purchases tables. Per the original author's note the
# session is only used for reading (no objects are created), so the context is
# set once here and never switched.
conn = snowflake.connector.connect(
    database="INCREMENTALITY",
    schema="INCREMENTALITY_RESEARCH",
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),  # fallback when unset
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
)

# Module-level cursor; run_query() executes every statement through it.
cursor = conn.cursor()

def run_query(query):
    """Execute ``query`` on the module-level ``cursor`` and return a DataFrame.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A ``pandas.DataFrame`` holding every fetched row, with columns named
        after the first element of each ``cursor.description`` entry. When the
        statement leaves ``cursor.description`` empty/None (no result set), an
        empty DataFrame is returned instead.

    Raises:
        snowflake.connector.ProgrammingError: re-raised after the failing query
            text and the error details have been printed, so the caller stops.
    """
    try:
        cursor.execute(query)

        # Guard clause: nothing to fetch when the statement produced no result set.
        if not cursor.description:
            return pd.DataFrame()

        rows = cursor.fetchall()
        column_names = [field[0] for field in cursor.description]
        return pd.DataFrame(rows, columns=column_names)
    except snowflake.connector.ProgrammingError as err:
        print(f"\nERROR executing query:\n{query}\nDetails: {err}")
        raise  # propagate so execution halts on a failed query

def show_table(df, title=""):
    """Print ``df`` as a grid-style text table, optionally under a heading.

    Args:
        df: Tabular data handed to ``tabulate`` (its keys become the headers;
            the index is not shown).
        title: Optional heading. When non-empty it is printed after a blank
            line and underlined with ``=`` characters of the same length.
    """
    if title:
        underline = "=" * len(title)
        print(f"\n{title}")
        print(underline)

    rendered = tabulate(df, headers='keys', tablefmt='grid', showindex=False)
    print(rendered)

# Reached only when the connection setup earlier in this cell did not raise.
print("✅ Connected to Snowflake")

# Pilot-week window, kept as constants so every query uses the same bounds.
# The interval is half-open: START is included, END is excluded.
PILOT_WEEK_START = "2025-07-01 00:00:00"
PILOT_WEEK_END = "2025-07-08 00:00:00"

# Two-line banner (blank line first) announcing the window in use.
print(
    "\n--- Generating User-Vendor-Week Panel for Pilot Week ---",
    f"   Using data from {PILOT_WEEK_START} to {PILOT_WEEK_END} (exclusive)",
    sep="\n",
)

✅ Connected to Snowflake

--- Generating User-Vendor-Week Panel for Pilot Week ---
   Using data from 2025-07-01 00:00:00 to 2025-07-08 00:00:00 (exclusive)


In [None]:
# --- Full-history vendor panel: banner and query definition ---
# The date range in the banner is a hard-coded note from an earlier date check;
# the query itself carries no date filter.
print(
    "--- Generating Full-History Clicks-Only Vendor Panel ---",
    "   Based on the date check, this will process data from roughly 2025-03-14 to 2025-09-07.",
    sep="\n",
)

# Vendor x week panel over all rows of CLICKS and PURCHASES (no WHERE on dates).
#   CLICKS_WEEKLY             -> distinct INTERACTION_IDs per vendor per week
#   PRODUCT_VENDOR_MAP_CLICKS -> distinct (PRODUCT_ID, VENDOR_ID) pairs seen in CLICKS
#   PURCHASES_WEEKLY          -> purchases/revenue credited to vendors via that map
#   final SELECT              -> CLICKS_WEEKLY LEFT JOIN PURCHASES_WEEKLY on (vendor, week)
# NOTE(review): CLICKS_WEEKLY does not exclude NULL VENDOR_ID (the product map does);
#   such rows would reach the panel and can never match purchases in the join — confirm intended.
# NOTE(review): a product paired with several vendors in the map credits the same
#   purchase to each of them — confirm this attribution is intended.
# NOTE(review): the panel is driven by CLICKS_WEEKLY, so vendor-weeks with purchases
#   but no clicks are absent from the output.
vendor_panel_query_full_history = """
WITH
-- Step 1: Aggregate all historical clicks per vendor *per week*.
CLICKS_WEEKLY AS (
    SELECT
        VENDOR_ID,
        DATE_TRUNC('WEEK', OCCURRED_AT) AS week,
        COUNT(DISTINCT INTERACTION_ID) AS click_count
    FROM CLICKS
    GROUP BY VENDOR_ID, week
),

-- Step 2: Create a CLICK-BASED map from all historical data.
PRODUCT_VENDOR_MAP_CLICKS AS (
    SELECT DISTINCT PRODUCT_ID, VENDOR_ID
    FROM CLICKS
    WHERE VENDOR_ID IS NOT NULL AND PRODUCT_ID IS NOT NULL
),

-- Step 3: Aggregate all historical attributed purchases and revenue per vendor *per week*.
PURCHASES_WEEKLY AS (
    SELECT
        pvm.VENDOR_ID,
        DATE_TRUNC('WEEK', p.PURCHASED_AT) AS week,
        COUNT(DISTINCT p.PURCHASE_ID) AS purchase_count,
        COALESCE(SUM(p.QUANTITY * p.UNIT_PRICE), 0) AS total_revenue_cents
    FROM PURCHASES AS p
    JOIN PRODUCT_VENDOR_MAP_CLICKS AS pvm ON p.PRODUCT_ID = pvm.PRODUCT_ID
    GROUP BY pvm.VENDOR_ID, week
)

-- Final Step: Join all weekly aggregates into the final panel.
SELECT
    c.week,
    c.vendor_id,
    c.click_count AS clicks,
    COALESCE(p.purchase_count, 0) AS purchases,
    (COALESCE(p.total_revenue_cents, 0) / 100)::DECIMAL(18, 2) AS revenue_dollars
    
FROM CLICKS_WEEKLY AS c
LEFT JOIN PURCHASES_WEEKLY AS p ON c.vendor_id = p.vendor_id AND c.week = p.week
ORDER BY c.vendor_id, c.week;
"""

# Build the full-history panel, normalise its column names, persist it to
# Parquet and print a short preview.
try:
    # Execute the query to build the full panel
    vendor_panel_full_df = run_query(vendor_panel_query_full_history)
    print(f"\n✅ Successfully generated full-history panel with {len(vendor_panel_full_df):,} rows.")

    # Lower-case the column labels returned by the query before saving.
    vendor_panel_full_df.columns = [col.lower() for col in vendor_panel_full_df.columns]

    # Save to a new Parquet file (path is relative to the working directory).
    output_filename = "vendor_panel_full_history_clicks_only.parquet"
    vendor_panel_full_df.to_parquet(output_filename, index=False, engine='pyarrow')

    print(f"\n✅ Data successfully processed and saved to '{output_filename}'")
    show_table(vendor_panel_full_df.head(10), f"Data Sample from '{output_filename}'")

except Exception as e:
    print(f"\nAn error occurred during panel generation: {e}")
    # Re-raise instead of swallowing: run_query() deliberately propagates query
    # failures to stop execution. Suppressing the error here would let later
    # cells run against a missing or half-processed `vendor_panel_full_df`, with
    # no Parquet file written.
    raise