In [1]:
import os
import pandas as pd
from tabulate import tabulate
from dotenv import load_dotenv
import snowflake.connector
import sys

# Pull SNOWFLAKE_* settings from a local .env file into the process environment.
load_dotenv()

# --- Snowflake session ---
# This session is used for reading from INCREMENTALITY. No objects are created,
# so the database/schema context is set once here and never switched.
conn = snowflake.connector.connect(
    user=os.environ.get("SNOWFLAKE_USER"),
    password=os.environ.get("SNOWFLAKE_PASSWORD"),
    account=os.environ.get("SNOWFLAKE_ACCOUNT"),
    warehouse=os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),  # default when the variable is unset
    database="INCREMENTALITY",         # source database for Clicks and Purchases
    schema="INCREMENTALITY_RESEARCH",  # schema for Clicks and Purchases
)
# Single module-level cursor; run_query() executes every statement through it.
cursor = conn.cursor()

def run_query(query):
    """Execute ``query`` on the module-level ``cursor`` and return the result as a DataFrame.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A ``pandas.DataFrame`` holding every fetched row, with column names taken
        from ``cursor.description``. When the cursor reports no description
        (a statement that yields no result set), an empty DataFrame is returned.

    Raises:
        snowflake.connector.ProgrammingError: printed together with the offending
            query text and then re-raised, so execution stops on the error.
    """
    try:
        cursor.execute(query)
        # No description means there is no result set to fetch.
        if not cursor.description:
            return pd.DataFrame()
        rows = cursor.fetchall()
        header = [column_meta[0] for column_meta in cursor.description]
        return pd.DataFrame(rows, columns=header)
    except snowflake.connector.ProgrammingError as e:
        print(f"\nERROR executing query:\n{query}\nDetails: {e}")
        raise  # propagate so the calling cell halts instead of continuing with bad data

def show_table(df, title=""):
    """Print ``df`` as a grid-formatted text table, optionally under a heading.

    Args:
        df: Tabular data handed to ``tabulate``; its keys become the header row
            and the index is not shown.
        title: Optional heading. When non-empty it is printed after a blank
            line and underlined with ``=`` characters of the same length.
    """
    if title:
        underline = "=" * len(title)
        print(f"\n{title}")
        print(underline)
    rendered = tabulate(df, headers='keys', tablefmt='grid', showindex=False)
    print(rendered)

print("✅ Connected to Snowflake")

# Pilot-week bounds, kept in one place so every query uses the same window.
# The window is half-open: the start is included, the end is exclusive.
PILOT_WEEK_START = '2025-07-01 00:00:00'
PILOT_WEEK_END = '2025-07-08 00:00:00'

# Announce the window being used (banner first, then the bounds).
print(
    "\n--- Generating User-Vendor-Week Panel for Pilot Week ---",
    f"   Using data from {PILOT_WEEK_START} to {PILOT_WEEK_END} (exclusive)",
    sep="\n",
)

✅ Connected to Snowflake

--- Generating User-Vendor-Week Panel for Pilot Week ---
   Using data from 2025-07-01 00:00:00 to 2025-07-08 00:00:00 (exclusive)


In [26]:
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
# Window bounds are kept as 'YYYY-MM-DD HH:MM:SS' strings because they are
# interpolated directly into the SQL text of the queries below.
TARGET_DATETIME_START = '2025-07-06 16:00:00'

# Parse the start once and derive the other bounds from it.
_auction_start = datetime.strptime(TARGET_DATETIME_START, '%Y-%m-%d %H:%M:%S')
_auction_end = _auction_start + timedelta(minutes=2)

# Auction window: two minutes from the start.
TARGET_DATETIME_END = _auction_end.strftime('%Y-%m-%d %H:%M:%S')
# Impression window: extends 30 seconds past the end of the auction window.
IMPRESSION_WINDOW_END = (_auction_end + timedelta(seconds=30)).strftime('%Y-%m-%d %H:%M:%S')

# Fully qualified database.schema prefix for every table reference.
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Define Helper for Clean Reports ---
def format_df_to_text(df, title):
    """Render a DataFrame as a plain-text grid table beneath a title banner.

    Every value is converted to ``str``; each column is as wide as its longest
    entry (header included) and cells are left-aligned with one space of
    padding on each side.

    Layout, top to bottom: a rule of ``=`` characters, the title indented by two
    spaces, the same rule again, a ``+---+`` border, the header row, a
    ``+===+`` border, one line per data row, and a closing ``+---+`` border.

    Args:
        df: The DataFrame to render.
        title: Text shown in the banner above the table.

    Returns:
        The whole report as one newline-joined string (no trailing newline).
    """
    text_df = df.astype(str)
    column_names = text_df.columns.tolist()
    rows = text_df.values.tolist()

    # Width of a column = longest string among its header and all of its cells.
    widths = []
    for column_cells in zip(column_names, *rows):
        widths.append(max(map(len, column_cells)))

    def render_row(cells):
        padded = [f' {cell:<{width}} ' for cell, width in zip(cells, widths)]
        return '|' + '|'.join(padded) + '|'

    border = '+' + '+'.join('-' * (width + 2) for width in widths) + '+'
    banner_rule = "=" * (len(title) + 4)

    lines = [banner_rule, f"  {title}  ", banner_rule, border, render_row(column_names)]
    lines.append(border.replace('-', '='))  # heavier rule separating header from data
    lines += [render_row(row) for row in rows]
    lines.append(border)

    return "\n".join(lines)

# --- Spike Test Logic ---
# Runs three counts over one 2-minute window -- rank-1 winning bids, impressions,
# and the rows produced by joining the two on (auction, user, vendor, product) --
# then writes a short text report and echoes it to the cell output.
try:
    # Derive the report date from the configured window start. Previously this cell
    # read a TARGET_DATE variable that no cell defines, so it only worked when the
    # name was left over in the kernel from an earlier session.
    TARGET_DATE = TARGET_DATETIME_START.split(' ')[0]

    print(f"--- Running 2-Minute Spike Test ---")
    print(f"Window: {TARGET_DATETIME_START} to {TARGET_DATETIME_END}")

    # Query 1: Count Rank 1 Winners in the window
    query1 = f"""
    SELECT COUNT(*) AS count_rank_1_wins
    FROM {DB_SCHEMA}.AUCTIONS_RESULTS
    WHERE CREATED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{TARGET_DATETIME_END}'
      AND IS_WINNER = TRUE
      AND RANKING = 1;
    """
    
    # Query 2: Count Impressions in the window
    # (uses IMPRESSION_WINDOW_END, i.e. the auction window plus a short grace period)
    query2 = f"""
    SELECT COUNT(*) AS count_impressions
    FROM {DB_SCHEMA}.IMPRESSIONS
    WHERE OCCURRED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{IMPRESSION_WINDOW_END}';
    """
    
    # Query 3: The join test. IMPRESSIONS.AUCTION_ID and IMPRESSIONS.VENDOR_ID are
    # hex-decoded to binary before being compared with the AUCTIONS_RESULTS columns.
    # NOTE(review): this presumes the AUCTIONS_RESULTS ids are BINARY and that
    # USER_ID / PRODUCT_ID share a type and encoding across tables -- TODO confirm,
    # since any mismatch would also produce zero matches.
    query3 = f"""
    WITH WindowWinningBids AS (
        SELECT
            ar.AUCTION_ID,
            -- CORRECTED: Use OPAQUE_USER_ID from AUCTIONS_USERS and alias it
            au.OPAQUE_USER_ID AS USER_ID,
            ar.VENDOR_ID,
            ar.PRODUCT_ID
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS ar
        JOIN {DB_SCHEMA}.AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        WHERE ar.CREATED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{TARGET_DATETIME_END}'
          AND ar.IS_WINNER = TRUE
          AND ar.RANKING = 1
    ),
    WindowImpressions AS (
        SELECT
            TRY_HEX_DECODE_BINARY(AUCTION_ID) AS AUCTION_ID_BINARY,
            USER_ID,
            TRY_HEX_DECODE_BINARY(VENDOR_ID) AS VENDOR_ID_BINARY,
            PRODUCT_ID
        FROM {DB_SCHEMA}.IMPRESSIONS
        WHERE OCCURRED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{IMPRESSION_WINDOW_END}'
    )
    SELECT
        COUNT(*) AS successful_joins
    FROM WindowWinningBids b
    INNER JOIN WindowImpressions i
        ON b.AUCTION_ID = i.AUCTION_ID_BINARY
        AND b.USER_ID = i.USER_ID
        AND b.VENDOR_ID = i.VENDOR_ID_BINARY
        AND b.PRODUCT_ID = i.PRODUCT_ID;
    """

    # --- Execute Queries ---
    # Each query returns a single row with a single column; take that scalar.
    print("Executing queries...")
    rank_1_wins = run_query(query1).iloc[0, 0]
    impressions_count = run_query(query2).iloc[0, 0]
    join_count = run_query(query3).iloc[0, 0]
    
    # --- Generate Final Report ---
    report_filename = f"spike_test_report_{TARGET_DATE.replace('-', '')}.txt"
    with open(report_filename, "w") as f:
        f.write(f"Final Diagnostic Spike Test Report\n")
        f.write("="*40 + "\n")
        f.write(f"Time Window: {TARGET_DATETIME_START} to {TARGET_DATETIME_END}\n")
        f.write("-" * 40 + "\n\n")
        
        f.write(f"1. Total Rank 1 Winning Bids in Window: {rank_1_wins:,}\n")
        f.write(f"2. Total Impressions Logged in Window:   {impressions_count:,}\n")
        f.write(f"3. Verifiable Matches (Joins) Found:    {join_count:,}\n\n")
        
        f.write("CONCLUSION:\n")
        
        if join_count > 0:
            # join_count is a count of joined rows, so several impressions for one
            # win would each add to it and the rate below can exceed 100%.
            success_rate = (join_count / rank_1_wins) if rank_1_wins > 0 else 0
            f.write(f"SUCCESS. A data link exists. The match rate is {success_rate:.2%}.\n")
            f.write("The issue is not a total system failure but likely a data quality problem or a partial logging issue that causes the broader queries to fail.\n")
            f.write("Next step is to debug the keys on the non-matching records.\n")
        else:
            if rank_1_wins > 0 and impressions_count > 0:
                f.write("TOTAL FAILURE CONFIRMED.\n")
                f.write("Both winning bids and impressions were logged in this window, but ZERO records can be joined.\n")
                f.write("The data link is fundamentally broken or the join keys are incorrect. Escalate to Engineering.\n")
            elif rank_1_wins == 0:
                f.write("INCONCLUSIVE (No Wins).\n")
                f.write("There were no winning bids in this time window to test. Please select a different time.\n")
            else: # impressions_count == 0
                f.write("FAILURE (No Impressions).\n")
                f.write("There were winning bids, but zero impressions were logged in this window. The impression logging system appears to be down. Escalate to Engineering.\n")

    # Echo the saved report so the result is visible in the notebook output.
    print(f"\nDiagnostic complete. Report saved to '{report_filename}'.")
    with open(report_filename, 'r') as f:
        print(f.read())

except NameError as e:
    # Any missing name lands here (run_query, a configuration constant, ...), so
    # report the name Python actually complained about instead of assuming run_query.
    print(f"\nERROR: A required name is not defined (run the setup and configuration cells first): {e}")
except Exception as e:
    print(f"\nAn error occurred during the spike test: {e}")

--- Running 2-Minute Spike Test ---
Window: 2025-07-06 16:00:00 to 2025-07-06 16:02:00
Executing queries...

Diagnostic complete. Report saved to 'spike_test_report_20250706.txt'.
Final Diagnostic Spike Test Report
Time Window: 2025-07-06 16:00:00 to 2025-07-06 16:02:00
----------------------------------------

1. Total Rank 1 Winning Bids in Window: 13,653
2. Total Impressions Logged in Window:   65,306
3. Verifiable Matches (Joins) Found:    0

CONCLUSION:
TOTAL FAILURE CONFIRMED.
Both winning bids and impressions were logged in this window, but ZERO records can be joined.
The data link is fundamentally broken or the join keys are incorrect. Escalate to Engineering.



In [27]:
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
# Window bounds are kept as 'YYYY-MM-DD HH:MM:SS' strings because they are
# interpolated directly into the SQL text of the query below.
TARGET_DATETIME_START = '2025-07-06 16:00:00'

# Parse the start once and derive the other bounds from it.
_auction_start = datetime.strptime(TARGET_DATETIME_START, '%Y-%m-%d %H:%M:%S')
_auction_end = _auction_start + timedelta(minutes=2)

# Auction window: two minutes from the start.
TARGET_DATETIME_END = _auction_end.strftime('%Y-%m-%d %H:%M:%S')
# Generous look-ahead for impressions: 30 minutes past the end of the auction window.
LOOKAHEAD_WINDOW_END = (_auction_end + timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S')

# Fully qualified database.schema prefix for every table reference.
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Final Diagnostic Logic ---
# Tests whether each winning bid in the 2-minute auction window is followed, within
# 30 minutes, by an impression for the same (USER, VENDOR, PRODUCT) tuple. AUCTION_ID
# is not used to link wins to impressions. Writes a text report and echoes it.
try:
    # Derive the report date from the configured window start. Previously this cell
    # read a TARGET_DATE variable that no cell defines, so it only worked when the
    # name was left over in the kernel from an earlier session.
    TARGET_DATE = TARGET_DATETIME_START.split(' ')[0]

    print(f"--- Running Final Look-Ahead Diagnostic (30-Minute Window) ---")
    print(f"Auction Window: {TARGET_DATETIME_START} to {TARGET_DATETIME_END}")

    # This query tests for a time-based link, ignoring the AUCTION_ID in the join.
    #
    # Two corrections relative to the earlier version of this query:
    #   * Matches are aggregated per win (WIN_ID) before counting. Counting rows of the
    #     LEFT JOIN directly counted one row per (win, impression) pair, so a win with
    #     several impressions inflated both the total and the matched figure.
    #   * The 30-minute bound is an exact timestamp comparison. DATEDIFF('minute', ...)
    #     counts minute boundaries crossed, which admitted gaps of up to 30m59s.
    # NOTE(review): the tuple join presumes USER_ID / PRODUCT_ID share a type and
    # encoding across tables and that AUCTIONS_RESULTS.VENDOR_ID is BINARY -- TODO
    # confirm, since any mismatch would also produce zero matches.
    lookahead_query = f"""
    WITH WindowWinningBids AS (
        -- Step 1: Get all winning bids in the window, with their exact win time and user.
        -- WIN_ID gives every win row a unique key for the per-win aggregation below.
        SELECT
            ROW_NUMBER() OVER (ORDER BY ar.CREATED_AT) AS WIN_ID,
            au.OPAQUE_USER_ID AS USER_ID,
            ar.VENDOR_ID,
            ar.PRODUCT_ID,
            ar.CREATED_AT AS WIN_TIME
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS ar
        JOIN {DB_SCHEMA}.AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        WHERE ar.CREATED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{TARGET_DATETIME_END}'
          AND ar.IS_WINNER = TRUE
    ),
    ImpressionsInLookahead AS (
        -- Step 2: Get all impressions in the wider look-ahead window.
        SELECT
            USER_ID,
            TRY_HEX_DECODE_BINARY(VENDOR_ID) AS VENDOR_ID_BINARY,
            PRODUCT_ID,
            OCCURRED_AT AS IMPRESSION_TIME
        FROM {DB_SCHEMA}.IMPRESSIONS
        -- Filter impressions to the relevant time window for performance.
        WHERE OCCURRED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{LOOKAHEAD_WINDOW_END}'
    ),
    WinMatches AS (
        -- Step 3: For every win, count impressions for the same tuple that occurred
        -- at or after the win and no more than 30 minutes later.
        SELECT
            b.WIN_ID,
            COUNT(i.IMPRESSION_TIME) AS matching_impressions
        FROM WindowWinningBids b
        LEFT JOIN ImpressionsInLookahead i
            -- The join key is the (USER, VENDOR, PRODUCT) tuple.
            ON b.USER_ID = i.USER_ID
            AND b.VENDOR_ID = i.VENDOR_ID_BINARY
            AND b.PRODUCT_ID = i.PRODUCT_ID
            -- The critical look-ahead condition:
            AND i.IMPRESSION_TIME >= b.WIN_TIME
            AND i.IMPRESSION_TIME <= DATEADD('minute', 30, b.WIN_TIME)
        GROUP BY b.WIN_ID
    )
    -- Step 4: One row per win, so these are counts of wins, not of joined rows.
    SELECT
        COUNT(*) AS total_wins_in_window,
        COUNT_IF(matching_impressions > 0) AS wins_with_lookahead_impression
    FROM WinMatches;
    """

    # --- Execute the query ---
    print("Executing look-ahead query...")
    result_df = run_query(lookahead_query)
    # Lower-case the column names so the lookups below do not depend on the
    # casing returned by the database.
    result_df.columns = result_df.columns.str.lower()
    r = result_df.iloc[0].to_dict()

    # `or 0` turns a None/NULL aggregate into 0 before the arithmetic below.
    total_wins = r.get('total_wins_in_window', 0) or 0
    matched_wins = r.get('wins_with_lookahead_impression', 0) or 0
    match_rate = (matched_wins / total_wins) if total_wins > 0 else 0

    # --- Generate Final Report ---
    report_filename = f"lookahead_diagnostic_report_{TARGET_DATE.replace('-', '')}.txt"
    with open(report_filename, "w") as f:
        f.write(f"Final Look-Ahead Diagnostic Report: {TARGET_DATE}\n")
        f.write("="*60 + "\n\n")
        f.write("This report tests if a winning bid is followed by an impression for the same (USER, VENDOR, PRODUCT) within 30 minutes, ignoring AUCTION_ID.\n\n")
        
        f.write(f"Total Winning Bids in 2-Min Window: {total_wins:,}\n")
        f.write(f"Wins with a Subsequent Impression (within 30 mins): {matched_wins:,}\n")
        f.write(f"Successful Look-Ahead Match Rate: {match_rate:.2%}\n\n")
        
        f.write("-" * 60 + "\n")
        f.write("CONCLUSION:\n")
        
        if match_rate > 0.01: # Use a small threshold like 1% to declare success
            f.write("POTENTIAL SUCCESS. A time-based data link exists.\n")
            f.write("The AUCTION_ID key is confirmed to be unreliable for joining wins to impressions.\n")
            f.write("The correct method is to attribute impressions to wins based on the (USER, VENDOR, PRODUCT) tuple within a time window.\n")
            f.write("Next Step: Re-architect the main EDA using this look-ahead logic to find the true visibility cliff.\n")
        else:
            f.write("ULTIMATE FAILURE CONFIRMED.\n")
            f.write("Even with a generous 30-minute look-ahead window, no meaningful link can be established between a winning bid and a subsequent impression.\n")
            f.write("We have now exhausted all plausible data linkage hypotheses. The data instrumentation is fundamentally and completely broken.\n")
            f.write("Recommendation: Escalate to Engineering. All incrementality analysis is BLOCKED.\n")

    # Echo the saved report so the result is visible in the notebook output.
    print(f"Diagnostic complete. Report saved to '{report_filename}'.")
    with open(report_filename, 'r') as f:
        print(f.read())

except NameError as e:
    # Any missing name lands here (run_query, a configuration constant, ...), so
    # report the name Python actually complained about instead of assuming run_query.
    print(f"\nERROR: A required name is not defined (run the setup and configuration cells first): {e}")
except Exception as e:
    print(f"\nAn error occurred during the look-ahead diagnostic: {e}")

--- Running Final Look-Ahead Diagnostic (30-Minute Window) ---
Auction Window: 2025-07-06 16:00:00 to 2025-07-06 16:02:00
Executing look-ahead query...
Diagnostic complete. Report saved to 'lookahead_diagnostic_report_20250706.txt'.
Final Look-Ahead Diagnostic Report: 2025-07-06

This report tests if a winning bid is followed by an impression for the same (USER, VENDOR, PRODUCT) within 30 minutes, ignoring AUCTION_ID.

Total Winning Bids in 2-Min Window: 370,505
Wins with a Subsequent Impression (within 30 mins): 0
Successful Look-Ahead Match Rate: 0.00%

------------------------------------------------------------
CONCLUSION:
ULTIMATE FAILURE CONFIRMED.
Even with a generous 30-minute look-ahead window, no meaningful link can be established between a winning bid and a subsequent impression.
We have now exhausted all plausible data linkage hypotheses. The data instrumentation is fundamentally and completely broken.
Recommendation: Escalate to Engineering. All incrementality analysis is 

In [28]:
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
# Window bounds are kept as 'YYYY-MM-DD HH:MM:SS' strings because they are
# interpolated directly into the SQL text of the query below.
TARGET_DATETIME_START = '2025-07-06 16:00:00'

# Parse the start once and derive the other bounds from it.
_auction_start = datetime.strptime(TARGET_DATETIME_START, '%Y-%m-%d %H:%M:%S')
_auction_end = _auction_start + timedelta(minutes=2)

# Auction window: two minutes from the start.
TARGET_DATETIME_END = _auction_end.strftime('%Y-%m-%d %H:%M:%S')
# Generous look-ahead for impressions: 30 minutes past the end of the auction window.
LOOKAHEAD_WINDOW_END = (_auction_end + timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S')

# Fully qualified database.schema prefix for every table reference.
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Final Diagnostic Logic ---
# Tests whether each winning bid in the 2-minute auction window is followed, within
# 30 minutes, by an impression for the same (USER, VENDOR, PRODUCT) tuple. AUCTION_ID
# is not used to link wins to impressions. Writes a text report and echoes it.
try:
    # Derive the report date from the configured window start. Previously this cell
    # read a TARGET_DATE variable that no cell defines, so it only worked when the
    # name was left over in the kernel from an earlier session.
    TARGET_DATE = TARGET_DATETIME_START.split(' ')[0]

    print(f"--- Running Final Look-Ahead Diagnostic (30-Minute Window) ---")
    print(f"Auction Window: {TARGET_DATETIME_START} to {TARGET_DATETIME_END}")

    # This query tests for a time-based link, ignoring the AUCTION_ID in the join.
    #
    # Two corrections relative to the earlier version of this query:
    #   * Matches are aggregated per win (WIN_ID) before counting. Counting rows of the
    #     LEFT JOIN directly counted one row per (win, impression) pair, so a win with
    #     several impressions inflated both the total and the matched figure.
    #   * The 30-minute bound is an exact timestamp comparison. DATEDIFF('minute', ...)
    #     counts minute boundaries crossed, which admitted gaps of up to 30m59s.
    # NOTE(review): the tuple join presumes USER_ID / PRODUCT_ID share a type and
    # encoding across tables and that AUCTIONS_RESULTS.VENDOR_ID is BINARY -- TODO
    # confirm, since any mismatch would also produce zero matches.
    lookahead_query = f"""
    WITH WindowWinningBids AS (
        -- Step 1: Get all winning bids in the window, with their exact win time and user.
        -- WIN_ID gives every win row a unique key for the per-win aggregation below.
        SELECT
            ROW_NUMBER() OVER (ORDER BY ar.CREATED_AT) AS WIN_ID,
            au.OPAQUE_USER_ID AS USER_ID,
            ar.VENDOR_ID,
            ar.PRODUCT_ID,
            ar.CREATED_AT AS WIN_TIME
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS ar
        JOIN {DB_SCHEMA}.AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        WHERE ar.CREATED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{TARGET_DATETIME_END}'
          AND ar.IS_WINNER = TRUE
    ),
    ImpressionsInLookahead AS (
        -- Step 2: Get all impressions in the wider look-ahead window.
        SELECT
            USER_ID,
            TRY_HEX_DECODE_BINARY(VENDOR_ID) AS VENDOR_ID_BINARY,
            PRODUCT_ID,
            OCCURRED_AT AS IMPRESSION_TIME
        FROM {DB_SCHEMA}.IMPRESSIONS
        -- Filter impressions to the relevant time window for performance.
        WHERE OCCURRED_AT BETWEEN '{TARGET_DATETIME_START}' AND '{LOOKAHEAD_WINDOW_END}'
    ),
    WinMatches AS (
        -- Step 3: For every win, count impressions for the same tuple that occurred
        -- at or after the win and no more than 30 minutes later.
        SELECT
            b.WIN_ID,
            COUNT(i.IMPRESSION_TIME) AS matching_impressions
        FROM WindowWinningBids b
        LEFT JOIN ImpressionsInLookahead i
            -- The join key is the (USER, VENDOR, PRODUCT) tuple.
            ON b.USER_ID = i.USER_ID
            AND b.VENDOR_ID = i.VENDOR_ID_BINARY
            AND b.PRODUCT_ID = i.PRODUCT_ID
            -- The critical look-ahead condition:
            AND i.IMPRESSION_TIME >= b.WIN_TIME
            AND i.IMPRESSION_TIME <= DATEADD('minute', 30, b.WIN_TIME)
        GROUP BY b.WIN_ID
    )
    -- Step 4: One row per win, so these are counts of wins, not of joined rows.
    SELECT
        COUNT(*) AS total_wins_in_window,
        COUNT_IF(matching_impressions > 0) AS wins_with_lookahead_impression
    FROM WinMatches;
    """

    # --- Execute the query ---
    print("Executing look-ahead query...")
    result_df = run_query(lookahead_query)
    # Lower-case the column names so the lookups below do not depend on the
    # casing returned by the database.
    result_df.columns = result_df.columns.str.lower()
    r = result_df.iloc[0].to_dict()

    # `or 0` turns a None/NULL aggregate into 0 before the arithmetic below.
    total_wins = r.get('total_wins_in_window', 0) or 0
    matched_wins = r.get('wins_with_lookahead_impression', 0) or 0
    match_rate = (matched_wins / total_wins) if total_wins > 0 else 0

    # --- Generate Final Report ---
    report_filename = f"lookahead_diagnostic_report_{TARGET_DATE.replace('-', '')}.txt"
    with open(report_filename, "w") as f:
        f.write(f"Final Look-Ahead Diagnostic Report: {TARGET_DATE}\n")
        f.write("="*60 + "\n\n")
        f.write("This report tests if a winning bid is followed by an impression for the same (USER, VENDOR, PRODUCT) within 30 minutes, ignoring AUCTION_ID.\n\n")
        
        f.write(f"Total Winning Bids in 2-Min Window: {total_wins:,}\n")
        f.write(f"Wins with a Subsequent Impression (within 30 mins): {matched_wins:,}\n")
        f.write(f"Successful Look-Ahead Match Rate: {match_rate:.2%}\n\n")
        
        f.write("-" * 60 + "\n")
        f.write("CONCLUSION:\n")
        
        if match_rate > 0.01: # Use a small threshold like 1% to declare success
            f.write("POTENTIAL SUCCESS. A time-based data link exists.\n")
            f.write("The AUCTION_ID key is confirmed to be unreliable for joining wins to impressions.\n")
            f.write("The correct method is to attribute impressions to wins based on the (USER, VENDOR, PRODUCT) tuple within a time window.\n")
            f.write("Next Step: Re-architect the main EDA using this look-ahead logic to find the true visibility cliff.\n")
        else:
            f.write("ULTIMATE FAILURE CONFIRMED.\n")
            f.write("Even with a generous 30-minute look-ahead window, no meaningful link can be established between a winning bid and a subsequent impression.\n")
            f.write("We have now exhausted all plausible data linkage hypotheses. The data instrumentation is fundamentally and completely broken.\n")
            f.write("Recommendation: Escalate to Engineering. All incrementality analysis is BLOCKED.\n")

    # Echo the saved report so the result is visible in the notebook output.
    print(f"Diagnostic complete. Report saved to '{report_filename}'.")
    with open(report_filename, 'r') as f:
        print(f.read())

except NameError as e:
    # Any missing name lands here (run_query, a configuration constant, ...), so
    # report the name Python actually complained about instead of assuming run_query.
    print(f"\nERROR: A required name is not defined (run the setup and configuration cells first): {e}")
except Exception as e:
    print(f"\nAn error occurred during the look-ahead diagnostic: {e}")

--- Running Final Look-Ahead Diagnostic (30-Minute Window) ---
Auction Window: 2025-07-06 16:00:00 to 2025-07-06 16:02:00
Executing look-ahead query...
Diagnostic complete. Report saved to 'lookahead_diagnostic_report_20250706.txt'.
Final Look-Ahead Diagnostic Report: 2025-07-06

This report tests if a winning bid is followed by an impression for the same (USER, VENDOR, PRODUCT) within 30 minutes, ignoring AUCTION_ID.

Total Winning Bids in 2-Min Window: 370,505
Wins with a Subsequent Impression (within 30 mins): 0
Successful Look-Ahead Match Rate: 0.00%

------------------------------------------------------------
CONCLUSION:
ULTIMATE FAILURE CONFIRMED.
Even with a generous 30-minute look-ahead window, no meaningful link can be established between a winning bid and a subsequent impression.
We have now exhausted all plausible data linkage hypotheses. The data instrumentation is fundamentally and completely broken.
Recommendation: Escalate to Engineering. All incrementality analysis is 