# config

In [2]:
import os
import pandas as pd
from tabulate import tabulate
from dotenv import load_dotenv
import snowflake.connector
import sys

load_dotenv()

# --- Your provided Snowflake connection details ---
# This connection is for *reading* from INCREMENTALITY.
# We no longer need to switch contexts, as we are not creating objects.
conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY', # Source database for Clicks and Purchases
    schema='INCREMENTALITY_RESEARCH' # Schema for Clicks and Purchases
)
cursor = conn.cursor()

def run_query(query):
    try:
        cursor.execute(query)
        # For SELECT queries, fetch results
        if cursor.description:
            results = cursor.fetchall()
            columns = [desc[0] for desc in cursor.description]
            return pd.DataFrame(results, columns=columns)
        # For DDL/DML, no results to fetch (though we won't be doing DDL now)
        return pd.DataFrame()
    except snowflake.connector.ProgrammingError as e:
        print(f"\nERROR executing query:\n{query}\nDetails: {e}")
        raise # Re-raise the exception to stop execution on error

def show_table(df, title=""):
    if title:
        print(f"\n{title}")
        print("="*len(title))
    print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

print("✅ Connected to Snowflake")

# Define the pilot week for consistency
PILOT_WEEK_START = '2025-07-01 00:00:00'
PILOT_WEEK_END = '2025-07-08 00:00:00' # End is exclusive

print("\n--- Generating User-Vendor-Week Panel for Pilot Week ---")
print(f"   Using data from {PILOT_WEEK_START} to {PILOT_WEEK_END} (exclusive)")

✅ Connected to Snowflake

--- Generating User-Vendor-Week Panel for Pilot Week ---
   Using data from 2025-07-01 00:00:00 to 2025-07-08 00:00:00 (exclusive)


# Daily Funnel

In [8]:
import pandas as pd
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

# --- Configuration ---
TARGET_DATE = '2025-07-07'
NEXT_DAY = (datetime.strptime(TARGET_DATE, '%Y-%m-%d') + timedelta(days=1)).strftime('%Y-%m-%d')
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Define All Queries ---
queries = {
    "auctions": f"SELECT COUNT(DISTINCT AUCTION_ID) AS val FROM {DB_SCHEMA}.AUCTIONS_USERS WHERE CREATED_AT >= '{TARGET_DATE}' AND CREATED_AT < '{NEXT_DAY}'",
    "auction_results": f"SELECT COUNT(*) AS val FROM {DB_SCHEMA}.AUCTIONS_RESULTS WHERE CREATED_AT >= '{TARGET_DATE}' AND CREATED_AT < '{NEXT_DAY}'",
    "winning_bids": f"SELECT COUNT(*) AS val FROM {DB_SCHEMA}.AUCTIONS_RESULTS WHERE IS_WINNER = TRUE AND CREATED_AT >= '{TARGET_DATE}' AND CREATED_AT < '{NEXT_DAY}'",
    "impressions": f"SELECT COUNT(*) AS val FROM {DB_SCHEMA}.IMPRESSIONS WHERE OCCURRED_AT >= '{TARGET_DATE}' AND OCCURRED_AT < '{NEXT_DAY}'",
    "clicks": f"SELECT COUNT(*) AS val FROM {DB_SCHEMA}.CLICKS WHERE OCCURRED_AT >= '{TARGET_DATE}' AND OCCURRED_AT < '{NEXT_DAY}'",
    "purchases": f"SELECT COUNT(DISTINCT PURCHASE_ID) AS val FROM {DB_SCHEMA}.PURCHASES WHERE PURCHASED_AT >= '{TARGET_DATE}' AND PURCHASED_AT < '{NEXT_DAY}'",
    "revenue_cents": f"SELECT COALESCE(SUM(QUANTITY * UNIT_PRICE), 0) AS val FROM {DB_SCHEMA}.PURCHASES WHERE PURCHASED_AT >= '{TARGET_DATE}' AND PURCHASED_AT < '{NEXT_DAY}'",
    "distinct_users_impression": f"SELECT APPROX_COUNT_DISTINCT(USER_ID) AS val FROM {DB_SCHEMA}.IMPRESSIONS WHERE OCCURRED_AT >= '{TARGET_DATE}' AND OCCURRED_AT < '{NEXT_DAY}'",
    "distinct_users_click": f"SELECT APPROX_COUNT_DISTINCT(USER_ID) AS val FROM {DB_SCHEMA}.CLICKS WHERE OCCURRED_AT >= '{TARGET_DATE}' AND OCCURRED_AT < '{NEXT_DAY}'",
    "distinct_users_purchase": f"SELECT APPROX_COUNT_DISTINCT(USER_ID) AS val FROM {DB_SCHEMA}.PURCHASES WHERE PURCHASED_AT >= '{TARGET_DATE}' AND PURCHASED_AT < '{NEXT_DAY}'"
}

# --- Execute Queries with TQDM Progress Bar ---
try:
    results = {}
    for name, query in tqdm(queries.items(), desc=f"Fetching metrics for {TARGET_DATE}"):
        # Assumes run_query is defined and returns a pandas DataFrame
        results[name] = run_query(query).iloc[0, 0]

    # --- Calculations for Summary Table ---
    r = results
    r['revenue_dollars'] = r.get('revenue_cents', 0) / 100
    
    slots_per_auction = (r['winning_bids'] / r['auctions']) if r['auctions'] > 0 else 0
    show_rate = (r['impressions'] / r['winning_bids']) if r['winning_bids'] > 0 else 0
    ctr = (r['clicks'] / r['impressions']) if r['impressions'] > 0 else 0
    cvr_click = (r['purchases'] / r['clicks']) if r['clicks'] > 0 else 0
    cvr_impression = (r['purchases'] / r['impressions']) if r['impressions'] > 0 else 0
    avg_purchase_val = (r['revenue_dollars'] / r['purchases']) if r['purchases'] > 0 else 0
    
    # --- Create and Display Final Summary Table ---
    summary_data = {
        "Metric": [
            "Auctions: Total Search Events", "Auctions: Total Bids Submitted by Vendors", "Auctions: Total Winning Bids (Impression Slots Won)", "Auctions: Average Ad Slots Filled per Search Event",
            "-----", "Ad Delivery: Impression-to-Win Ratio (Show Rate)", "-----",
            "Conversion Funnel: Total Impressions Delivered", "Conversion Funnel: Total Clicks on Ads", "Conversion Funnel: Total Unique Purchase Transactions", "Conversion Funnel: Total Revenue (in dollars)",
            "-----", "Key Ratios: Click-Through Rate (CTR)", "Key Ratios: Conversion Rate (from Click to Purchase)", "Key Ratios: Conversion Rate (from Impression to Purchase)", "Key Ratios: Average Purchase Value (Cart Size)",
            "-----", "User Engagement: Approx. Unique Users with Impressions", "User Engagement: Approx. Unique Users with Clicks", "User Engagement: Approx. Unique Users with Purchases"
        ],
        "Description": [
            "The number of distinct user actions (e.g., a search query) that triggered an ad auction.", "The total number of ad bids submitted by all vendors across all auctions.",
            "The number of bids that won a slot and were eligible to be shown to the user.", "The average number of sponsored listings shown for a single user search event.",
            "-----", "The ratio of actual impressions to winning bids. A value < 100% can indicate ads that won but were not rendered (e.g., below the fold).", "-----",
            "The total number of times an ad was actually displayed to a user.", "The total number of user clicks on the displayed ads.",
            "The number of distinct purchase events (e.g., shopping carts) that occurred.", "The total gross merchandise value (GMV) generated from purchases, converted from cents to dollars.",
            "-----", "Impressions / Clicks. The percentage of displayed ads that were clicked on.", "Purchases / Clicks. The percentage of clicks that resulted in a purchase transaction.",
            "Purchases / Impressions. The overall efficiency of the ad funnel from view to purchase.", "Total Revenue / Total Purchases. The average dollar value of a single purchase transaction.",
            "-----", "The approximate number of distinct users who were shown at least one ad.", "The approximate number of distinct users who clicked on at least one ad.", "The approximate number of distinct users who made a purchase."
        ],
        "Value": [
            f"{r.get('auctions', 0):,}", f"{r.get('auction_results', 0):,}", f"{r.get('winning_bids', 0):,}", f"{slots_per_auction:.2f}",
            "-----", f"{show_rate:.2%}", "-----",
            f"{r.get('impressions', 0):,}", f"{r.get('clicks', 0):,}", f"{r.get('purchases', 0):,}", f"${r['revenue_dollars']:,.2f}",
            "-----", f"{ctr:.2%}", f"{cvr_click:.2%}", f"{cvr_impression:.3%}", f"${avg_purchase_val:,.2f}",
            "-----", f"~ {r.get('distinct_users_impression', 0):,}", f"~ {r.get('distinct_users_click', 0):,}", f"~ {r.get('distinct_users_purchase', 0):,}"
        ]
    }
    summary_df = pd.DataFrame(summary_data)
    
    # Assumes show_table is defined for clean, untruncated output
    show_table(summary_df, f"Daily Metrics Summary for {TARGET_DATE}")

except Exception as e:
    print(f"\nAn error occurred while running metrics queries: {e}")

Fetching metrics for 2025-07-07:   0%|          | 0/10 [00:00<?, ?it/s]


Daily Metrics Summary for 2025-07-07
+-----------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+---------------+
| Metric                                                    | Description                                                                                                                             | Value         |
| Auctions: Total Search Events                             | The number of distinct user actions (e.g., a search query) that triggered an ad auction.                                                | 7,420,764     |
+-----------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+---------------+
| Auctions: Total Bids Submitted by Vendors                 | The total number of ad bids submitte

## Vendor-Product Mapping Problem

In [22]:
import pandas as pd

# ==============================================================================
# VALIDATING THE IMPROVED PRODUCT-VENDOR MAP (FACTS ONLY)
# ==============================================================================
print("### Validating the Improved Product-Vendor Map ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
LOOKBACK_DAYS = 30
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Print a clear header with the exact filters being used ---
print("\n--- Map Configuration ---")
print(f"  Mapping Source:      AUCTIONS_RESULTS table")
print(f"  Lookback Window:     {LOOKBACK_DAYS} days from {PILOT_WEEK_START}")
print("-------------------------")

# --- SQL Query to Calculate Revenue Coverage with the New, Improved Map ---
coverage_query_new_map = f"""
WITH
-- STEP 1: Build the new, more comprehensive product-vendor map
improved_product_vendor_map AS (
    SELECT PRODUCT_ID, VENDOR_ID
    FROM (
        SELECT
            PRODUCT_ID, VENDOR_ID,
            ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS
        WHERE CREATED_AT >= DATEADD(day, -{LOOKBACK_DAYS}, '{PILOT_WEEK_START}')
          AND CREATED_AT < '{PILOT_WEEK_END}'
        GROUP BY 1, 2
    )
    WHERE rn = 1
),
-- STEP 2: Pre-aggregate all purchases to measure coverage
all_purchases_in_period AS (
    SELECT
        PRODUCT_ID,
        (QUANTITY * UNIT_PRICE) AS revenue_cents
    FROM {DB_SCHEMA}.PURCHASES
    WHERE PURCHASED_AT >= '{PILOT_WEEK_START}' AND PURCHASED_AT < '{PILOT_WEEK_END}'
)
-- STEP 3: Calculate the final coverage metrics
SELECT
    SUM(revenue_cents) / 100.0 AS grand_total_revenue,
    SUM(IFF(
        PRODUCT_ID IN (SELECT PRODUCT_ID FROM improved_product_vendor_map),
        revenue_cents,
        0
    )) / 100.0 AS revenue_from_mapped_products
FROM all_purchases_in_period;
"""

print(f"\n--- Running query to calculate business coverage of the new map... ---")
try:
    # Assuming `run_query` and `show_table` are defined
    df_coverage = run_query(coverage_query_new_map)
    
    if not df_coverage.empty:
        # Extract facts
        r = df_coverage.iloc[0]
        grand_total = r['GRAND_TOTAL_REVENUE']
        mapped_revenue = r['REVENUE_FROM_MAPPED_PRODUCTS']
        unmapped_revenue = grand_total - mapped_revenue
        coverage_percent = (mapped_revenue / grand_total * 100) if grand_total > 0 else 0
        
        # --- Create and Display Summary Table of Facts ---
        summary_data = {
            "Metric": [
                "1. Grand Total Revenue (Entire Platform)",
                "2. Revenue from Mapped Products (using new map)",
                "3. Revenue from Unmapped Products",
                "4. % of Grand Total Revenue Covered by New Map"
            ],
            "Value": [
                f"${grand_total:,.2f}",
                f"${mapped_revenue:,.2f}",
                f"${unmapped_revenue:,.2f}",
                f"{coverage_percent:.2f}%"
            ]
        }
        summary_df = pd.DataFrame(summary_data)
        
        show_table(summary_df, "Validation Facts for Improved Product-Vendor Map")

    else:
        print("Warning: Could not retrieve coverage facts.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

### Validating the Improved Product-Vendor Map ###

--- Map Configuration ---
  Mapping Source:      AUCTIONS_RESULTS table
  Lookback Window:     30 days from 2025-07-07
-------------------------

--- Running query to calculate business coverage of the new map... ---

Validation Facts for Improved Product-Vendor Map
+-------------------------------------------------+----------------+
| Metric                                          | Value          |
| 1. Grand Total Revenue (Entire Platform)        | $34,515,572.49 |
+-------------------------------------------------+----------------+
| 2. Revenue from Mapped Products (using new map) | $5,064,090.00  |
+-------------------------------------------------+----------------+
| 3. Revenue from Unmapped Products               | $29,451,482.49 |
+-------------------------------------------------+----------------+
| 4. % of Grand Total Revenue Covered by New Map  | 14.67%         |
+-------------------------------------------------+---------

## Vendor-Revenue CDF

In [23]:
import pandas as pd

# ==============================================================================
# VENDOR REVENUE CDF (Using the Improved, Robust Map)
# ==============================================================================
print("### Analyzing Vendor Revenue CDF (Using Improved Map) ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
LOOKBACK_DAYS = 30
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Print a clear header with the exact filters being used ---
print("\n--- Filters and Mapping Applied ---")
print(f"  Analysis Period: {PILOT_WEEK_START} to {PILOT_WEEK_END}")
print(f"  Mapping Source:  AUCTIONS_RESULTS table")
print(f"  Lookback Window: {LOOKBACK_DAYS} days")
print("-----------------------------------")

# --- CORRECTED SQL Query to Generate the Full CDF Table ---
vendor_cdf_query = f"""
WITH
-- STEP 1: Build the new, more comprehensive product-vendor map (our best available)
improved_product_vendor_map AS (
    SELECT PRODUCT_ID, VENDOR_ID
    FROM (
        SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS
        WHERE CREATED_AT >= DATEADD(day, -{LOOKBACK_DAYS}, '{PILOT_WEEK_START}')
          AND CREATED_AT < '{PILOT_WEEK_END}'
        GROUP BY 1, 2
    )
    WHERE rn = 1
),
-- STEP 2: Calculate total revenue per vendor ONLY for the revenue we can map
vendor_sales AS (
    SELECT
        pv.VENDOR_ID,
        COALESCE(SUM(p.QUANTITY * p.UNIT_PRICE), 0) AS total_revenue_cents
    FROM {DB_SCHEMA}.PURCHASES p
    -- Use an INNER JOIN to consider only the sales we can successfully map
    INNER JOIN improved_product_vendor_map pv ON p.PRODUCT_ID = pv.PRODUCT_ID
    WHERE p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
    GROUP BY 1
),
-- 3. Calculate cumulative distribution and assign percentile buckets
cumulative_sales AS (
    SELECT
        VENDOR_ID, total_revenue_cents,
        SUM(total_revenue_cents) OVER (ORDER BY total_revenue_cents DESC) as cumulative_revenue_cents,
        CEIL(SUM(total_revenue_cents) OVER (ORDER BY total_revenue_cents DESC) * 10.0 / SUM(total_revenue_cents) OVER ()) as percentile_bucket
    FROM vendor_sales WHERE VENDOR_ID IS NOT NULL AND total_revenue_cents > 0
),
-- 4. Group by the buckets
bucketed_summary AS (
    SELECT
        percentile_bucket,
        COUNT(VENDOR_ID) AS vendors_in_bucket,
        SUM(total_revenue_cents) as revenue_in_bucket_cents
    FROM cumulative_sales GROUP BY 1
)
-- 5. Final aggregation to create a true cumulative view
SELECT
    bs.percentile_bucket * 10 AS cumulative_percent_revenue,
    bs.vendors_in_bucket,
    SUM(bs.vendors_in_bucket) OVER (ORDER BY bs.percentile_bucket) as cumulative_vendor_count,
    bs.revenue_in_bucket_cents,
    SUM(bs.revenue_in_bucket_cents) OVER (ORDER BY bs.percentile_bucket) as cumulative_revenue_cents,
    SUM(bs.vendors_in_bucket) OVER (ORDER BY bs.percentile_bucket) * 100.0 / (SELECT COUNT(*) FROM vendor_sales WHERE VENDOR_ID IS NOT NULL AND total_revenue_cents > 0) as cumulative_percent_of_vendors
FROM bucketed_summary bs
ORDER BY 1;
"""

print(f"\n--- Running query to generate the vendor revenue CDF for MAPPED REVENUE... ---")
try:
    df_cdf = run_query(vendor_cdf_query)
    
    if not df_cdf.empty:
        # Define descriptive column names for the final, corrected table
        df_cdf.columns = [
            'Revenue Tier (% of Mapped Revenue)',
            'Vendors in this Tier',
            'Cumulative Vendor Count',
            'Revenue in this Tier ($)',
            'Cumulative Revenue ($)',
            'Cumulative % of All Selling Vendors'
        ]
        
        # Format all columns for clear, readable output
        df_cdf['Vendors in this Tier'] = df_cdf['Vendors in this Tier'].apply(lambda x: f"{x:,}")
        df_cdf['Cumulative Vendor Count'] = df_cdf['Cumulative Vendor Count'].apply(lambda x: f"{x:,}")
        df_cdf['Revenue in this Tier ($)'] = (df_cdf['Revenue in this Tier ($)'] / 100.0).apply(lambda x: f"${x:,.2f}")
        df_cdf['Cumulative Revenue ($)'] = (df_cdf['Cumulative Revenue ($)'] / 100.0).apply(lambda x: f"${x:,.2f}")
        df_cdf['Cumulative % of All Selling Vendors'] = df_cdf['Cumulative % of All Selling Vendors'].apply(lambda x: f"{x:.2f}%")
        
        show_table(df_cdf, "Distribution of Mapped Vendor Revenue")
    else:
        print("Warning: Could not generate the CDF table.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

### Analyzing Vendor Revenue CDF (Using Improved Map) ###

--- Filters and Mapping Applied ---
  Analysis Period: 2025-07-07 to 2025-07-14
  Mapping Source:  AUCTIONS_RESULTS table
  Lookback Window: 30 days
-----------------------------------

--- Running query to generate the vendor revenue CDF for MAPPED REVENUE... ---

Distribution of Mapped Vendor Revenue
+--------------------------------------+------------------------+---------------------------+----------------------------+--------------------------+---------------------------------------+
|   Revenue Tier (% of Mapped Revenue) | Vendors in this Tier   | Cumulative Vendor Count   | Revenue in this Tier ($)   | Cumulative Revenue ($)   | Cumulative % of All Selling Vendors   |
|                                   10 | 39                     | 39                        | $504,297.00                | $504,297.00              | 0.13%                                 |
+--------------------------------------+------------------------+--

## User-Revenue CDF

In [24]:
import pandas as pd

# ==============================================================================
# USER REVENUE CDF (Focused on Mappable Revenue)
# ==============================================================================
print("### Analyzing User Revenue CDF for Mappable Purchases ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
LOOKBACK_DAYS = 30
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Print a clear header with the exact filters being used ---
print("\n--- Filters and Mapping Applied ---")
print(f"  Analysis Period: {PILOT_WEEK_START} to {PILOT_WEEK_END}")
print(f"  Revenue Base:    Only revenue from purchases that can be mapped to a vendor.")
print(f"  Mapping Source:  AUCTIONS_RESULTS table ({LOOKBACK_DAYS}-day lookback)")
print("-----------------------------------------------------")

# --- CORRECTED SQL Query to Generate the Full User Revenue CDF Table ---
user_cdf_query = f"""
WITH
-- 1. Build the best possible product-vendor map
improved_product_vendor_map AS (
    SELECT PRODUCT_ID, VENDOR_ID FROM (
        SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS
        WHERE CREATED_AT >= DATEADD(day, -{LOOKBACK_DAYS}, '{PILOT_WEEK_START}')
          AND CREATED_AT < '{PILOT_WEEK_END}'
        GROUP BY 1, 2
    ) WHERE rn = 1
),
-- 2. Calculate total MAPPABLE revenue per user
user_sales AS (
    SELECT
        p.USER_ID,
        COALESCE(SUM(p.QUANTITY * p.UNIT_PRICE), 0) AS total_revenue_cents
    FROM {DB_SCHEMA}.PURCHASES p
    -- INNER JOIN ensures we only count revenue from products we can map
    INNER JOIN improved_product_vendor_map pv ON p.PRODUCT_ID = pv.PRODUCT_ID
    WHERE p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
    GROUP BY 1
),
-- 3. Calculate cumulative distribution and assign percentile buckets
cumulative_sales AS (
    SELECT
        USER_ID, total_revenue_cents,
        SUM(total_revenue_cents) OVER (ORDER BY total_revenue_cents DESC) as cumulative_revenue_cents,
        CEIL(SUM(total_revenue_cents) OVER (ORDER BY total_revenue_cents DESC) * 10.0 / SUM(total_revenue_cents) OVER ()) as percentile_bucket
    FROM user_sales WHERE total_revenue_cents > 0
),
-- 4. Group by the buckets
bucketed_summary AS (
    SELECT
        percentile_bucket,
        COUNT(USER_ID) AS users_in_bucket,
        SUM(total_revenue_cents) as revenue_in_bucket_cents
    FROM cumulative_sales GROUP BY 1
)
-- 5. Final aggregation to create a true cumulative view
SELECT
    bs.percentile_bucket * 10 AS cumulative_percent_revenue,
    bs.users_in_bucket,
    SUM(bs.users_in_bucket) OVER (ORDER BY bs.percentile_bucket) as cumulative_user_count,
    bs.revenue_in_bucket_cents,
    SUM(bs.revenue_in_bucket_cents) OVER (ORDER BY bs.percentile_bucket) as cumulative_revenue_cents,
    SUM(bs.users_in_bucket) OVER (ORDER BY bs.percentile_bucket) * 100.0 / (SELECT COUNT(*) FROM user_sales WHERE total_revenue_cents > 0) as cumulative_percent_of_users
FROM bucketed_summary bs
ORDER BY 1;
"""

print(f"\n--- Running query to generate the full user revenue CDF... ---")
try:
    df_cdf_user = run_query(user_cdf_query)
    
    if not df_cdf_user.empty:
        # Define descriptive column names for the final table
        df_cdf_user.columns = [
            'Revenue Tier (% of Mapped Revenue)',
            'Users in this Tier',
            'Cumulative User Count',
            'Revenue in this Tier ($)',
            'Cumulative Revenue ($)',
            'Cumulative % of All Purchasing Users'
        ]
        
        # Format all columns for clear, readable output
        df_cdf_user['Users in this Tier'] = df_cdf_user['Users in this Tier'].apply(lambda x: f"{x:,}")
        df_cdf_user['Cumulative User Count'] = df_cdf_user['Cumulative User Count'].apply(lambda x: f"{x:,}")
        df_cdf_user['Revenue in this Tier ($)'] = (df_cdf_user['Revenue in this Tier ($)'] / 100.0).apply(lambda x: f"${x:,.2f}")
        df_cdf_user['Cumulative Revenue ($)'] = (df_cdf_user['Cumulative Revenue ($)'] / 100.0).apply(lambda x: f"${x:,.2f}")
        df_cdf_user['Cumulative % of All Purchasing Users'] = df_cdf_user['Cumulative % of All Purchasing Users'].apply(lambda x: f"{x:.2f}%")
        
        show_table(df_cdf_user, "User Revenue Distribution for Mappable Purchases")
    else:
        print("Warning: Could not generate the CDF table for users.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

### Analyzing User Revenue CDF for Mappable Purchases ###

--- Filters and Mapping Applied ---
  Analysis Period: 2025-07-07 to 2025-07-14
  Revenue Base:    Only revenue from purchases that can be mapped to a vendor.
  Mapping Source:  AUCTIONS_RESULTS table (30-day lookback)
-----------------------------------------------------

--- Running query to generate the full user revenue CDF... ---

User Revenue Distribution for Mappable Purchases
+--------------------------------------+----------------------+-------------------------+----------------------------+--------------------------+----------------------------------------+
|   Revenue Tier (% of Mapped Revenue) | Users in this Tier   | Cumulative User Count   | Revenue in this Tier ($)   | Cumulative Revenue ($)   | Cumulative % of All Purchasing Users   |
|                                   10 | 312                  | 312                     | $505,808.00                | $505,808.00              | 0.33%                             

## User-Vendor Panel Shape

In [25]:
import pandas as pd

# ==============================================================================
# "BUSINESS COVERAGE" ANALYSIS (Using Best Available Map)
# ==============================================================================
print("### Analyzing the Business Coverage of the 'Power Panel' ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
TOP_N_VENDORS = 5000
TOP_N_USERS = 100000
LOOKBACK_DAYS = 30
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Print a clear header with the exact filters being used ---
print("\n--- Population & Universe Definitions ---")
print(f"  Analysis Period: {PILOT_WEEK_START} to {PILOT_WEEK_END}")
print(f"  User Population: Top {TOP_N_USERS:,} users by total spend (across all purchases).")
print(f"  Vendor Universe: Top {TOP_N_VENDORS:,} vendors by total MAPPED sales revenue.")
print(f"  Mapping Source:  AUCTIONS_RESULTS table ({LOOKBACK_DAYS}-day lookback)")
print("-----------------------------------------------------")

# --- SQL Query to Calculate Revenue Coverage with the Improved Map ---
coverage_query = f"""
WITH
-- 1. Build our best possible product-vendor map
improved_product_vendor_map AS (
    SELECT PRODUCT_ID, VENDOR_ID FROM (
        SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS
        WHERE CREATED_AT >= DATEADD(day, -{LOOKBACK_DAYS}, '{PILOT_WEEK_START}') AND CREATED_AT < '{PILOT_WEEK_END}'
        GROUP BY 1, 2
    ) WHERE rn = 1
),
-- 2. Get the list of Top Users by their total spend (on all products)
top_users_cte AS (
    SELECT USER_ID FROM {DB_SCHEMA}.PURCHASES
    WHERE PURCHASED_AT >= '{PILOT_WEEK_START}' AND PURCHASED_AT < '{PILOT_WEEK_END}'
    GROUP BY 1 ORDER BY SUM(QUANTITY * UNIT_PRICE) DESC LIMIT {TOP_N_USERS}
),
-- 3. Get the list of Top Vendors by their total MAPPED sales revenue
top_vendors_cte AS (
    SELECT pv.VENDOR_ID
    FROM {DB_SCHEMA}.PURCHASES p
    INNER JOIN improved_product_vendor_map pv ON p.PRODUCT_ID = pv.PRODUCT_ID
    WHERE p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
    GROUP BY 1 ORDER BY SUM(p.QUANTITY * p.UNIT_PRICE) DESC LIMIT {TOP_N_VENDORS}
),
-- 4. Pre-aggregate all purchases with mapped vendors
all_purchases_mapped AS (
    SELECT
        p.USER_ID,
        pv.VENDOR_ID, -- Note: This will be NULL for unmapped products
        (p.QUANTITY * p.UNIT_PRICE) AS revenue_cents
    FROM {DB_SCHEMA}.PURCHASES p
    LEFT JOIN improved_product_vendor_map pv ON p.PRODUCT_ID = pv.PRODUCT_ID
    WHERE p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
)
-- 5. Calculate the final coverage metrics
SELECT
    -- Grand total revenue (mapped + unmapped)
    SUM(revenue_cents) / 100.0 AS grand_total_revenue,
    -- Total MAPPED revenue
    SUM(IFF(VENDOR_ID IS NOT NULL, revenue_cents, 0)) / 100.0 AS total_mapped_revenue,
    -- Revenue from Top Users (across all their purchases)
    SUM(IFF(USER_ID IN (SELECT USER_ID FROM top_users_cte), revenue_cents, 0)) / 100.0 AS revenue_from_top_users,
    -- Revenue FOR Top Vendors (only their mapped purchases)
    SUM(IFF(VENDOR_ID IN (SELECT VENDOR_ID FROM top_vendors_cte), revenue_cents, 0)) / 100.0 AS revenue_for_top_vendors,
    -- Revenue from Top Users buying from Top Vendors (only mapped purchases)
    SUM(IFF(USER_ID IN (SELECT USER_ID FROM top_users_cte) AND VENDOR_ID IN (SELECT VENDOR_ID FROM top_vendors_cte), revenue_cents, 0)) / 100.0 AS revenue_top_user_top_vendor
FROM all_purchases_mapped;
"""

print(f"\n--- Running query to calculate business coverage... ---")
try:
    df_coverage = run_query(coverage_query)
    
    if not df_coverage.empty:
        r = df_coverage.iloc[0]
        grand_total = r['GRAND_TOTAL_REVENUE']
        mapped_total = r['TOTAL_MAPPED_REVENUE']
        
        # Calculate percentages against BOTH grand total and mapped total
        panel_coverage_of_grand_total = (r['REVENUE_TOP_USER_TOP_VENDOR'] / grand_total * 100) if grand_total > 0 else 0
        panel_coverage_of_mapped_total = (r['REVENUE_TOP_USER_TOP_VENDOR'] / mapped_total * 100) if mapped_total > 0 else 0

        summary_data = {
            "Metric": [
                "1. Grand Total Revenue (Entire Platform)",
                "2. Total Mapped Revenue (Ad-Engaged Ecosystem)",
                "   -> % of Grand Total",
                "3. Revenue from Top 100k Users buying from Top 5k Vendors",
                "   -> % of Grand Total (Panel's overall business share)",
                "   -> % of Mapped Revenue (Panel's share of the ad ecosystem)"
            ],
            "Value": [
                f"${grand_total:,.2f}",
                f"${mapped_total:,.2f}",
                f"{(mapped_total / grand_total * 100):.2f}%",
                f"${r['REVENUE_TOP_USER_TOP_VENDOR']:,.2f}",
                f"{panel_coverage_of_grand_total:.2f}%",
                f"{panel_coverage_of_mapped_total:.2f}%"
            ]
        }
        summary_df = pd.DataFrame(summary_data)
        show_table(summary_df, "Business Coverage of the 'Power Panel' Population")

except Exception as e:
    print(f"\nAn error occurred: {e}")

### Analyzing the Business Coverage of the 'Power Panel' ###

--- Population & Universe Definitions ---
  Analysis Period: 2025-07-07 to 2025-07-14
  User Population: Top 100,000 users by total spend (across all purchases).
  Vendor Universe: Top 5,000 vendors by total MAPPED sales revenue.
  Mapping Source:  AUCTIONS_RESULTS table (30-day lookback)
-----------------------------------------------------

--- Running query to calculate business coverage... ---

Business Coverage of the 'Power Panel' Population
+------------------------------------------------------------+----------------+
| Metric                                                     | Value          |
| 1. Grand Total Revenue (Entire Platform)                   | $34,515,572.49 |
+------------------------------------------------------------+----------------+
| 2. Total Mapped Revenue (Ad-Engaged Ecosystem)             | $5,064,090.00  |
+------------------------------------------------------------+----------------+
| -> %

In [26]:
import pandas as pd
from datetime import datetime, timedelta

# ==============================================================================
# FINAL PANEL SIZING (Using Pre-Treatment Population Definition)
# ==============================================================================
print("### Sizing the Final Panel based on Pre-Treatment Cohorts ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
# Define the 30-day pre-treatment period
PRE_TREATMENT_END = PILOT_WEEK_START
PRE_TREATMENT_START = (datetime.strptime(PRE_TREATMENT_END, '%Y-%m-%d') - timedelta(days=30)).strftime('%Y-%m-%d')

TOP_N_VENDORS = 4000
TOP_N_USERS = 100000
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"

# --- Print a clear header with the exact filters being used ---
print("\n--- Population & Universe Definitions ---")
print(f"  Pre-Treatment Period (for defining cohorts): {PRE_TREATMENT_START} to {PRE_TREATMENT_END}")
print(f"  Analysis Period (for measuring eligibility): {PILOT_WEEK_START} to {PILOT_WEEK_END}")
print(f"  User Cohort: Top {TOP_N_USERS:,} users by spend in pre-treatment period.")
print(f"  Vendor Cohort: Top {TOP_N_VENDORS:,} vendors by sales in pre-treatment period.")
print("-----------------------------------------------------")

# --- SQL Query to Find the "Eligible Top Users" from our Pre-Defined Cohort ---
query_eligible_top_users = f"""
WITH
-- Step 1: Identify the top 100,000 users by total spend in the PRE-TREATMENT month
top_users_pretreatment_cte AS (
    SELECT USER_ID
    FROM {DB_SCHEMA}.PURCHASES
    WHERE PURCHASED_AT >= '{PRE_TREATMENT_START}' AND PURCHASED_AT < '{PRE_TREATMENT_END}'
    GROUP BY USER_ID
    ORDER BY SUM(QUANTITY * UNIT_PRICE) DESC
    LIMIT {TOP_N_USERS}
)
-- Step 2: From that pre-defined cohort, count how many were "eligible" (had impressions)
-- during the ACTUAL PILOT WEEK.
SELECT
    COUNT(DISTINCT i.USER_ID) as eligible_top_user_count
FROM {DB_SCHEMA}.IMPRESSIONS i
-- We only care about users who are in our pre-defined cohort
INNER JOIN top_users_pretreatment_cte top_users ON i.USER_ID = top_users.USER_ID
WHERE i.OCCURRED_AT >= '{PILOT_WEEK_START}' AND i.OCCURRED_AT < '{PILOT_WEEK_END}';
"""

print(f"\n--- Running query to find the count of eligible users from the pre-treatment cohort... ---")
try:
    df_eligible_top_users = run_query(query_eligible_top_users)
    
    if not df_eligible_top_users.empty:
        num_eligible_top_users = df_eligible_top_users['ELIGIBLE_TOP_USER_COUNT'].iloc[0]
        num_top_vendors = TOP_N_VENDORS
        estimated_final_size = num_eligible_top_users * num_top_vendors

        # --- Create and Display Summary Table of Facts ---
        summary_data = {
            "Fact": [
                "1. Pre-Treatment User Cohort Size",
                "2. Eligible Users from Cohort (Impressions in pilot week)",
                "3. Pre-Treatment Vendor Cohort Size",
                "4. Final Panel Rows (Eligible Users x Top Vendors)"
            ],
            "Description": [
                f"Top {TOP_N_USERS:,} spenders from the month prior to the pilot.",
                "The number of users from the pre-defined cohort who were active during the pilot week.",
                f"Top {TOP_N_VENDORS:,} sellers from the month prior to the pilot.",
                "The final row count for the panel before negative sampling."
            ],
            "Value": [
                f"{TOP_N_USERS:,}",
                f"{num_eligible_top_users:,}",
                f"{num_top_vendors:,}",
                f"{estimated_final_size:,}"
            ]
        }
        summary_df = pd.DataFrame(summary_data)
        
        show_table(summary_df, "Final Panel Sizing Facts (Based on Pre-Treatment Cohorts)")
    else:
        print("Warning: Could not determine the count of eligible top users.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

### Sizing the Final Panel based on Pre-Treatment Cohorts ###

--- Population & Universe Definitions ---
  Pre-Treatment Period (for defining cohorts): 2025-06-07 to 2025-07-07
  Analysis Period (for measuring eligibility): 2025-07-07 to 2025-07-14
  User Cohort: Top 100,000 users by spend in pre-treatment period.
  Vendor Cohort: Top 4,000 vendors by sales in pre-treatment period.
-----------------------------------------------------

--- Running query to find the count of eligible users from the pre-treatment cohort... ---

Final Panel Sizing Facts (Based on Pre-Treatment Cohorts)
+-----------------------------------------------------------+----------------------------------------------------------------------------------------+-------------+
| Fact                                                      | Description                                                                            | Value       |
| 1. Pre-Treatment User Cohort Size                         | Top 100,000 spende

# panel and downsampling sizing

In [29]:
import pandas as pd
from datetime import datetime, timedelta

# ==============================================================================
# SIZING & WEIGHT CALCULATION (Using Pre-Treatment Cohorts)
# ==============================================================================
print("### Sizing Panel & Calculating Weights for Negative Sampling ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
PRE_TREATMENT_END = PILOT_WEEK_START
PRE_TREATMENT_START = (datetime.strptime(PRE_TREATMENT_END, '%Y-%m-%d') - timedelta(days=30)).strftime('%Y-%m-%d')

TOP_N_VENDORS = 4000
TOP_N_USERS = 100000
NEGATIVE_TO_POSITIVE_RATIO = 5
DB_SCHEMA = "INCREMENTALITY.INCREMENTALITY_RESEARCH"
LOOKBACK_DAYS_MAP = 30 # For our best product-vendor map

# --- Print a clear header ---
print("\n--- Population & Universe Definitions ---")
print(f"  Pre-Treatment Period: {PRE_TREATMENT_START} to {PRE_TREATMENT_END}")
print(f"  Analysis Period:      {PILOT_WEEK_START} to {PILOT_WEEK_END}")
print(f"  User Cohort:          Top {TOP_N_USERS:,} spenders from pre-treatment period.")
print(f"  Vendor Cohort:          Top {TOP_N_VENDORS:,} sellers from pre-treatment period.")
print("-----------------------------------------------------")

# --- SQL Query to Gather All Necessary Facts for Weight Calculation ---
sizing_query = f"""
WITH
-- 1. Define the User Cohort from the pre-treatment period
top_users_pretreatment_cte AS (
    SELECT USER_ID FROM {DB_SCHEMA}.PURCHASES
    WHERE PURCHASED_AT >= '{PRE_TREATMENT_START}' AND PURCHASED_AT < '{PRE_TREATMENT_END}'
    GROUP BY 1 ORDER BY SUM(QUANTITY * UNIT_PRICE) DESC LIMIT {TOP_N_USERS}
),
-- 2. Define the Vendor Cohort from the pre-treatment period (using our best map logic)
top_vendors_pretreatment_cte AS (
    SELECT VENDOR_ID FROM (
        SELECT pv.VENDOR_ID, SUM(p.QUANTITY * p.UNIT_PRICE) AS rev
        FROM {DB_SCHEMA}.PURCHASES p
        INNER JOIN (
            SELECT PRODUCT_ID, VENDOR_ID FROM (
                SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
                FROM {DB_SCHEMA}.AUCTIONS_RESULTS
                WHERE CREATED_AT >= DATEADD(day, -{LOOKBACK_DAYS_MAP}, '{PRE_TREATMENT_END}') AND CREATED_AT < '{PRE_TREATMENT_END}'
                GROUP BY 1, 2
            ) WHERE rn = 1
        ) pv ON p.PRODUCT_ID = pv.PRODUCT_ID
        WHERE p.PURCHASED_AT >= '{PRE_TREATMENT_START}' AND p.PURCHASED_AT < '{PRE_TREATMENT_END}'
        GROUP BY 1
    ) ORDER BY rev DESC LIMIT {TOP_N_VENDORS}
),
-- 3. Find the subset of the user cohort who were "eligible" during the pilot week
eligible_users_in_pilot_cte AS (
    SELECT DISTINCT i.USER_ID
    FROM {DB_SCHEMA}.IMPRESSIONS i
    INNER JOIN top_users_pretreatment_cte tu ON i.USER_ID = tu.USER_ID
    WHERE i.OCCURRED_AT >= '{PILOT_WEEK_START}' AND i.OCCURRED_AT < '{PILOT_WEEK_END}'
),
-- 4. Count the "positive" interactions (purchases) during the pilot week for our cohorts
positive_interactions_in_pilot_cte AS (
    SELECT COUNT(DISTINCT p.USER_ID, pv.VENDOR_ID) as positive_interaction_count
    FROM {DB_SCHEMA}.PURCHASES p
    INNER JOIN (
        SELECT PRODUCT_ID, VENDOR_ID FROM (
            SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
            FROM {DB_SCHEMA}.AUCTIONS_RESULTS
            WHERE CREATED_AT >= DATEADD(day, -{LOOKBACK_DAYS_MAP}, '{PILOT_WEEK_START}') AND CREATED_AT < '{PILOT_WEEK_END}'
            GROUP BY 1, 2
        ) WHERE rn = 1
    ) pv ON p.PRODUCT_ID = pv.PRODUCT_ID
    WHERE p.USER_ID IN (SELECT USER_ID FROM eligible_users_in_pilot_cte)
      AND pv.VENDOR_ID IN (SELECT VENDOR_ID FROM top_vendors_pretreatment_cte)
      AND p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
)
-- 5. Final SELECT to get all the numbers we need
SELECT
    (SELECT COUNT(*) FROM eligible_users_in_pilot_cte) AS eligible_user_count,
    (SELECT COUNT(*) FROM top_vendors_pretreatment_cte) AS top_vendor_count,
    (SELECT positive_interaction_count FROM positive_interactions_in_pilot_cte) AS positive_interaction_count
;
"""

print(f"\n--- Running query to get sizing facts... ---")
try:
    df_facts = run_query(sizing_query)
    
    if not df_facts.empty:
        # Extract the facts
        num_eligible_users = df_facts['ELIGIBLE_USER_COUNT'].iloc[0]
        num_top_vendors = df_facts['TOP_VENDOR_COUNT'].iloc[0]
        num_positives = df_facts['POSITIVE_INTERACTION_COUNT'].iloc[0]
        
        # Calculate the derived metrics for our final panel
        total_potential_panel_size = num_eligible_users * num_top_vendors
        total_negatives = total_potential_panel_size - num_positives
        num_sampled_negatives = num_positives * NEGATIVE_TO_POSITIVE_RATIO
        final_panel_size = num_positives + num_sampled_negatives
        sampling_fraction = num_sampled_negatives / total_negatives if total_negatives > 0 else 0
        negative_weight = total_negatives / num_sampled_negatives if num_sampled_negatives > 0 else 1

        # --- Create and Display Final Summary Table of Facts ---
        summary_data = {
            "Fact": [
                "1. Eligible Users from Cohort (active in pilot week)",
                "2. Vendor Cohort Size",
                "3. Total Potential Panel Size (Users x Vendors)",
                "4. 'Positive' Interactions (Purchases in pilot week)",
                "5. 'Negative' Interactions (Potential but no purchase)",
                "6. Negatives to Sample (at a 5:1 ratio)",
                "7. Final Downsampled Panel Size (Positives + Sampled Negatives)",
                "8. Statistical Weight for each Negative Sample"
            ],
            "Value": [
                f"{num_eligible_users:,}", f"{num_top_vendors:,}", f"{total_potential_panel_size:,}",
                f"{num_positives:,}", f"{total_negatives:,}", f"{num_sampled_negatives:,}",
                f"{final_panel_size:,}", f"{negative_weight:.2f}"
            ]
        }
        summary_df = pd.DataFrame(summary_data)
        show_table(summary_df, "Final Sizing & Weight Calculation Facts")
    else:
        print("Warning: Could not retrieve sizing facts.")

except Exception as e:
    print(f"\nAn error occurred: {e}")

### Sizing Panel & Calculating Weights for Negative Sampling ###

--- Population & Universe Definitions ---
  Pre-Treatment Period: 2025-06-07 to 2025-07-07
  Analysis Period:      2025-07-07 to 2025-07-14
  User Cohort:          Top 100,000 spenders from pre-treatment period.
  Vendor Cohort:          Top 4,000 sellers from pre-treatment period.
-----------------------------------------------------

--- Running query to get sizing facts... ---

Final Sizing & Weight Calculation Facts
+-----------------------------------------------------------------+-------------+
| Fact                                                            | Value       |
| 1. Eligible Users from Cohort (active in pilot week)            | 72,935      |
+-----------------------------------------------------------------+-------------+
| 2. Vendor Cohort Size                                           | 4,000       |
+-----------------------------------------------------------------+-------------+
| 3. Total Potenti

# data pull

In [3]:
import pandas as pd
from datetime import datetime, timedelta

# ==============================================================================
# FINAL PANEL BUILD & SAVE (Optimized SQL with Type-Casting Fix)
# ==============================================================================
print("### Building Final Panel via Optimized SQL (1-Week, 1% Negative Sample) ###")

# --- Configuration ---
PILOT_WEEK_START = '2025-07-07'
PILOT_WEEK_END = '2025-07-14'
PRE_TREATMENT_START = (datetime.strptime(PILOT_WEEK_START, '%Y-%m-%d') - timedelta(days=30)).strftime('%Y-%m-%d')
PRE_TREATMENT_END = PILOT_WEEK_START

TOP_N_VENDORS = 1000
TOP_N_USERS = 100000
NEGATIVE_SAMPLING_FRACTION = 0.01
DB_SCHEMA = 'INCREMENTALITY.INCREMENTALITY_RESEARCH'
MAP_LOOKBACK_DAYS = 30

# --- Print a clear header of the methodology ---
print("\n--- Methodology ---")
print(f"  Cohorts defined from pre-treatment period: {PRE_TREATMENT_START} to {PRE_TREATMENT_END}")
print(f"  Analysis of activity during pilot week:    {PILOT_WEEK_START} to {PILOT_WEEK_END}")
print(f"  User Cohort: Top {TOP_N_USERS:,} click-active spenders.")
print(f"  Vendor Cohort: Top {TOP_N_VENDORS:,} sellers.")
print(f"  Sampling: Keeping all positives and a {NEGATIVE_SAMPLING_FRACTION:.0%} random sample of negatives.")
print("---------------------")

# --- The Final, Optimized, and Corrected SQL Query ---
final_panel_query = f"""
WITH
-- ==============================================================================
-- STEP 1: DEFINE COHORTS & MAPS (with Type Casting)
-- ==============================================================================
top_vendors_pretreatment_cte AS (
    SELECT VENDOR_ID FROM (
        SELECT pv.VENDOR_ID, SUM(p.QUANTITY * p.UNIT_PRICE) AS rev
        FROM {DB_SCHEMA}.PURCHASES p
        INNER JOIN (
            -- MAP with CASTING
            SELECT CAST(PRODUCT_ID AS VARCHAR) as PRODUCT_ID, CAST(VENDOR_ID AS VARCHAR) as VENDOR_ID
            FROM (
                SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
                FROM {DB_SCHEMA}.AUCTIONS_RESULTS
                WHERE CREATED_AT >= DATEADD(day, -{MAP_LOOKBACK_DAYS}, '{PRE_TREATMENT_END}') AND CREATED_AT < '{PRE_TREATMENT_END}'
                GROUP BY 1, 2
            ) WHERE rn = 1
        ) pv ON p.PRODUCT_ID = pv.PRODUCT_ID
        WHERE p.PURCHASED_AT >= '{PRE_TREATMENT_START}' AND p.PURCHASED_AT < '{PRE_TREATMENT_END}' GROUP BY 1
    ) ORDER BY rev DESC LIMIT {TOP_N_VENDORS}
),
top_click_active_users_pretreatment_cte AS (
    SELECT USER_ID FROM (
        SELECT p.USER_ID, SUM(p.QUANTITY * p.UNIT_PRICE) as total_spend
        FROM {DB_SCHEMA}.PURCHASES p
        WHERE p.USER_ID IN (
            SELECT DISTINCT USER_ID FROM {DB_SCHEMA}.CLICKS
            WHERE OCCURRED_AT >= '{PRE_TREATMENT_START}' AND OCCURRED_AT < '{PRE_TREATMENT_END}'
        )
        AND p.PURCHASED_AT >= '{PRE_TREATMENT_START}' AND p.PURCHASED_AT < '{PRE_TREATMENT_END}'
        GROUP BY 1
    ) ORDER BY total_spend DESC LIMIT {TOP_N_USERS}
),
eligible_users_in_pilot_cte AS (
    SELECT DISTINCT i.USER_ID
    FROM {DB_SCHEMA}.IMPRESSIONS i
    WHERE i.USER_ID IN (SELECT USER_ID FROM top_click_active_users_pretreatment_cte)
      AND i.OCCURRED_AT >= '{PILOT_WEEK_START}' AND i.OCCURRED_AT < '{PILOT_WEEK_END}'
),
pilot_week_map_cte AS (
    -- MAP with CASTING
    SELECT CAST(PRODUCT_ID AS VARCHAR) as PRODUCT_ID, CAST(VENDOR_ID AS VARCHAR) as VENDOR_ID
    FROM (
        SELECT PRODUCT_ID, VENDOR_ID, ROW_NUMBER() OVER (PARTITION BY PRODUCT_ID ORDER BY COUNT(*) DESC) as rn
        FROM {DB_SCHEMA}.AUCTIONS_RESULTS
        WHERE CREATED_AT >= DATEADD(day, -{MAP_LOOKBACK_DAYS}, '{PILOT_WEEK_START}') AND CREATED_AT < '{PILOT_WEEK_END}'
        GROUP BY 1, 2
    ) WHERE rn = 1
),
-- ==============================================================================
-- STEP 2: IDENTIFY POSITIVES & NEGATIVES DURING PILOT WEEK
-- ==============================================================================
base_panel_cte AS (
    SELECT u.USER_ID, v.VENDOR_ID FROM eligible_users_in_pilot_cte u CROSS JOIN top_vendors_pretreatment_cte v
),
positive_interactions_cte AS (
    SELECT DISTINCT p.USER_ID, pv.VENDOR_ID
    FROM {DB_SCHEMA}.PURCHASES p
    INNER JOIN pilot_week_map_cte pv ON p.PRODUCT_ID = pv.PRODUCT_ID
    WHERE p.USER_ID IN (SELECT USER_ID FROM eligible_users_in_pilot_cte)
      AND pv.VENDOR_ID IN (SELECT VENDOR_ID FROM top_vendors_pretreatment_cte)
      AND p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
),
-- ==============================================================================
-- STEP 3: ASSEMBLE FINAL PANEL WITH SAMPLED NEGATIVES
-- ==============================================================================
final_panel_keys AS (
    SELECT USER_ID, VENDOR_ID, 1.0 AS weight FROM positive_interactions_cte
    UNION ALL
    SELECT USER_ID, VENDOR_ID, (1.0 / {NEGATIVE_SAMPLING_FRACTION}) AS weight
    FROM (
        SELECT USER_ID, VENDOR_ID FROM base_panel_cte
        EXCEPT
        SELECT USER_ID, VENDOR_ID FROM positive_interactions_cte
    )
    WHERE MOD(HASH(USER_ID, VENDOR_ID), 100) < ({NEGATIVE_SAMPLING_FRACTION} * 100)
),
-- ==============================================================================
-- STEP 4: PRE-AGGREGATE FEATURES & ATTACH
-- ==============================================================================
clicks_agg_pilot AS (
    SELECT USER_ID, VENDOR_ID, COUNT(*) as total_clicks
    FROM {DB_SCHEMA}.CLICKS
    WHERE OCCURRED_AT >= '{PILOT_WEEK_START}' AND OCCURRED_AT < '{PILOT_WEEK_END}'
    GROUP BY 1, 2
),
purchases_agg_pilot AS (
    SELECT USER_ID, VENDOR_ID, SUM(QUANTITY * UNIT_PRICE) as total_revenue, COUNT(DISTINCT PURCHASE_ID) as total_sales
    FROM (
        SELECT p.USER_ID, pv.VENDOR_ID, p.QUANTITY, p.UNIT_PRICE, p.PURCHASE_ID
        FROM {DB_SCHEMA}.PURCHASES p
        INNER JOIN pilot_week_map_cte pv ON p.PRODUCT_ID = pv.PRODUCT_ID
        WHERE p.PURCHASED_AT >= '{PILOT_WEEK_START}' AND p.PURCHASED_AT < '{PILOT_WEEK_END}'
    )
    GROUP BY 1, 2
)
-- Final SELECT to join all features to our downsampled panel keys
SELECT
    DATE_TRUNC('WEEK', TO_TIMESTAMP_NTZ('{PILOT_WEEK_START}')) AS week,
    fpk.user_id,
    fpk.vendor_id,
    fpk.weight,
    COALESCE(c.total_clicks, 0) AS total_clicks,
    COALESCE(p.total_revenue, 0) / 100.0 AS total_revenue,
    COALESCE(p.total_sales, 0) AS total_sales
FROM final_panel_keys fpk
LEFT JOIN clicks_agg_pilot c ON fpk.user_id = c.user_id AND fpk.vendor_id = c.vendor_id
LEFT JOIN purchases_agg_pilot p ON fpk.user_id = p.user_id AND fpk.vendor_id = p.vendor_id;
"""

try:
    print("\n--- Executing final panel build query on Snowflake... ---")
    df_final_panel = run_query(final_panel_query)

    parquet_filename_final = 'final_analysis_panel.parquet'
    if not df_final_panel.empty:
        print(f"✅ Panel with {len(df_final_panel):,} rows extracted.")
        
        df_to_save = df_final_panel.copy()
        df_to_save.columns = [col.lower() for col in df_to_save.columns]
        df_to_save['week'] = pd.to_datetime(df_to_save['week'])
        for col in df_to_save.select_dtypes(include=['object']).columns:
            df_to_save[col] = df_to_save[col].astype(str)

        print(f"\n💾 --- Saving final panel to '{parquet_filename_final}'... ---")
        df_to_save.to_parquet(parquet_filename_final, engine='pyarrow', index=False)
        print(f"✅ Panel saved. Ready for analysis.")
    else:
        print("Warning: The panel query returned no data.")

except Exception as e:
    print(f"\nAn error occurred while building the panel: {e}")

### Building Final Panel via Optimized SQL (1-Week, 1% Negative Sample) ###

--- Methodology ---
  Cohorts defined from pre-treatment period: 2025-06-07 to 2025-07-07
  Analysis of activity during pilot week:    2025-07-07 to 2025-07-14
  User Cohort: Top 100,000 click-active spenders.
  Vendor Cohort: Top 1,000 sellers.
  Sampling: Keeping all positives and a 1% random sample of negatives.
---------------------

--- Executing final panel build query on Snowflake... ---
✅ Panel with 41,049,173 rows extracted.

💾 --- Saving final panel to 'final_analysis_panel.parquet'... ---
✅ Panel saved. Ready for analysis.


# analysis - feols, feglm

In [3]:

  import pandas as pd
  import numpy as np
  import os
  from IPython.display import display, Markdown

  # Set R environment variables BEFORE importing rpy2
  os.environ['R_HOME'] = '/Library/Frameworks/R.framework/Resources'
  os.environ['DYLD_LIBRARY_PATH'] = f"{os.environ['R_HOME']}/lib:{os.environ.get('DYLD_LIBRARY_PATH', '')}"

  # rpy2 imports for the Python-to-R bridge
  import rpy2.robjects as ro
  from rpy2.robjects.packages import importr, isinstalled
  from rpy2.robjects import pandas2ri
  from rpy2.robjects.conversion import localconverter

  # ==============================================================================
  # FINAL ANALYSIS: Core Models with Negative Sampling
  # ==============================================================================
  print("### FINAL ANALYSIS: Core Fixed Effects Models (10% Negative Sampling) ###")

  # --- 1. Set up rpy2 Environment ---
  try:
      print("\n--- Initializing rpy2 and importing R's `fixest` package... ---")

      utils = importr('utils')
      if not isinstalled('fixest'):
          print("Installing fixest package...")
          utils.install_packages('fixest', repos='https://cran.rstudio.com/')

      fixest = importr('fixest')
      print("✅ R's `fixest` package loaded successfully.")
      rpy2_setup_failed = False
  except Exception as e:
      print(f"ERROR: Failed to initialize rpy2 or load `fixest`: {e}")
      rpy2_setup_failed = True

  if not rpy2_setup_failed:
      # --- 2. Load and Sample the Data ---
      parquet_filename = 'final_analysis_panel.parquet'
      try:
          print(f"\n--- Loading data from '{parquet_filename}' for modeling... ---")
          df_full = pd.read_parquet(parquet_filename, engine='pyarrow')
          print(f"✅ Successfully loaded DataFrame with {len(df_full):,} rows.")
          print(f"   Columns: {df_full.columns.tolist()}")

          # Detect column names
          if 'total_revenue_vendor_product' in df_full.columns:
              revenue_col = 'total_revenue_vendor_product'
          elif 'total_revenue' in df_full.columns:
              revenue_col = 'total_revenue'
          else:
              revenue_col = 'revenue'

          if 'total_clicks_promoted' in df_full.columns:
              clicks_col = 'total_clicks_promoted'
          elif 'total_clicks' in df_full.columns:
              clicks_col = 'total_clicks'
          else:
              clicks_col = 'clicks'

          print(f"\nUsing revenue column: {revenue_col}")
          print(f"Using clicks column: {clicks_col}")

          # Convert columns to numeric
          print("\n--- Converting columns to numeric ---")
          df_full[revenue_col] = pd.to_numeric(df_full[revenue_col], errors='coerce')
          df_full[clicks_col] = pd.to_numeric(df_full[clicks_col], errors='coerce')

          # Convert weight column if it exists
          if 'weight' in df_full.columns:
              df_full['weight'] = pd.to_numeric(df_full['weight'], errors='coerce')
              print(f"  weight dtype: {df_full['weight'].dtype}")

          # Check data types
          print(f"  {revenue_col} dtype: {df_full[revenue_col].dtype}")
          print(f"  {clicks_col} dtype: {df_full[clicks_col].dtype}")

          # Drop NaN values
          before_rows = len(df_full)
          df_full = df_full.dropna(subset=[revenue_col, clicks_col])
          after_rows = len(df_full)
          if before_rows != after_rows:
              print(f"  Dropped {before_rows - after_rows} rows with NaN values")

          # SAMPLING STRATEGY: Keep all positives, sample 10% of negatives
          print("\n--- Applying negative sampling strategy ---")
          positives = df_full[df_full[revenue_col] > 0]
          negatives = df_full[df_full[revenue_col] == 0]

          print(f"  Positive revenue observations: {len(positives):,} (keeping all)")
          print(f"  Zero revenue observations: {len(negatives):,} (sampling 10%)")

          # Sample 10% of negatives
          np.random.seed(42)
          sampled_negatives = negatives.sample(frac=0.1)

          # Combine positives and sampled negatives
          df_analysis = pd.concat([positives, sampled_negatives]).sort_index()

          # Add sampling weight column in Python
          df_analysis['sampling_weight'] = np.where(df_analysis[revenue_col] > 0, 1, 10)

          print(f"✅ Final analysis dataset: {len(df_analysis):,} rows")
          print(f"   Reduction: {100 * (1 - len(df_analysis)/len(df_full)):.1f}%")

      except Exception as e:
          print(f"ERROR during data loading/sampling: {e}")
          import traceback
          traceback.print_exc()
          df_analysis = pd.DataFrame()

      if not df_analysis.empty:
          # --- 3. Run Models on Sampled Data ---
          try:
              print("\n--- Running Core Fixed Effects Models on Sampled Data ---")

              # Use the conversion context
              with localconverter(ro.default_converter + pandas2ri.converter):
                  r_df = pandas2ri.py2rpy(df_analysis)
                  ro.globalenv['df_for_r'] = r_df

              # Prepare data in R
              print("\n--- Preparing data in R ---")
              ro.r(f"""
              library(fixest)
              
              df_analysis <- df_for_r
              
              # Ensure all numeric columns are properly typed in R
              df_analysis[['{revenue_col}']] <- as.numeric(as.character(df_analysis[['{revenue_col}']]))
              df_analysis[['{clicks_col}']] <- as.numeric(as.character(df_analysis[['{clicks_col}']]))
              df_analysis$sampling_weight <- as.numeric(as.character(df_analysis$sampling_weight))
              
              # Check if original weight exists and convert it
              has_weights <- "weight" %in% names(df_analysis)
              if (has_weights) {{
                  df_analysis$weight <- as.numeric(as.character(df_analysis$weight))
                  # Combine original weight with sampling weight
                  df_analysis$final_weight <- df_analysis$weight * df_analysis$sampling_weight
              }} else {{
                  # Use only sampling weight
                  df_analysis$final_weight <- df_analysis$sampling_weight
              }}
              
              # Prepare variables
              df_analysis$ihs_clicks <- asinh(df_analysis[['{clicks_col}']])
              df_analysis$ihs_revenue <- asinh(df_analysis[['{revenue_col}']])
              df_analysis$purchase_binary <- as.integer(df_analysis[['{revenue_col}']] > 0)
              df_analysis$user_id <- as.factor(df_analysis$user_id)
              df_analysis$vendor_id <- as.factor(df_analysis$vendor_id)
              
              print(paste("Sampled observations:", nrow(df_analysis)))
              print(paste("Unique users:", length(unique(df_analysis$user_id))))
              print(paste("Unique vendors:", length(unique(df_analysis$vendor_id))))
              print(paste("Percent with purchases (in sample):", 
                         round(100 * mean(df_analysis$purchase_binary), 2), "%"))
              print(paste("Using original weights:", has_weights))
              print(paste("Average final weight:", round(mean(df_analysis$final_weight), 2)))
              """)

              # Run OLS model
              print("\n" + "="*60)
              print("MODEL 1: OLS FOR REVENUE")
              print("="*60)

              ro.r(f"""
              start_time <- Sys.time()
              
              model_ols <- feols(ihs_revenue ~ ihs_clicks | user_id + vendor_id, 
                                data = df_analysis, 
                                weights = ~ final_weight, 
                                cluster = ~ user_id)
              
              print(summary(model_ols))
              
              ols_coef <- coef(model_ols)["ihs_clicks"]
              ols_se <- se(model_ols)["ihs_clicks"]
              ols_pval <- pvalue(model_ols)["ihs_clicks"]
              
              print("")
              print(paste("→ Coefficient:", round(ols_coef, 4)))
              print(paste("→ Std Error:", round(ols_se, 4)))
              print(paste("→ P-value:", format(ols_pval, scientific = TRUE, digits = 3)))
              
              end_time <- Sys.time()
              print(paste("→ Time taken:", round(difftime(end_time, start_time, units = "secs"), 1), "seconds"))
              """)

              print("\n✅ OLS model completed\n")

              # Run Logit model
              print("="*60)
              print("MODEL 2: LOGIT FOR PURCHASE PROBABILITY")
              print("="*60)

              ro.r(f"""
              start_time <- Sys.time()
              
              model_logit <- feglm(purchase_binary ~ ihs_clicks | user_id + vendor_id, 
                                  family = binomial(), 
                                  data = df_analysis, 
                                  weights = ~ final_weight, 
                                  cluster = ~ user_id)
              
              print(summary(model_logit))
              
              logit_coef <- coef(model_logit)["ihs_clicks"]
              logit_se <- se(model_logit)["ihs_clicks"]
              logit_pval <- pvalue(model_logit)["ihs_clicks"]
              odds_ratio <- exp(logit_coef)
              
              print("")
              print(paste("→ Coefficient:", round(logit_coef, 4)))
              print(paste("→ Std Error:", round(logit_se, 4)))
              print(paste("→ P-value:", format(logit_pval, scientific = TRUE, digits = 3)))
              print(paste("→ Odds ratio:", round(odds_ratio, 4)))
              
              end_time <- Sys.time()
              print(paste("→ Time taken:", round(difftime(end_time, start_time, units = "secs"), 1), "seconds"))
              """)

              print("\n✅ Both models completed successfully")
              print("\n" + "="*60)
              print("Note: Results use 10% negative sampling with weight adjustment")

          except Exception as e:
              print(f"\nERROR running the models in R: {e}")
              import traceback
              traceback.print_exc()

          print("\n--- Analysis completed ---")


### FINAL ANALYSIS: Core Fixed Effects Models (10% Negative Sampling) ###

--- Initializing rpy2 and importing R's `fixest` package... ---
✅ R's `fixest` package loaded successfully.

--- Loading data from 'final_analysis_panel.parquet' for modeling... ---
✅ Successfully loaded DataFrame with 41,049,173 rows.
   Columns: ['week', 'user_id', 'vendor_id', 'weight', 'total_clicks', 'total_revenue', 'total_sales']

Using revenue column: total_revenue
Using clicks column: total_clicks

--- Converting columns to numeric ---
  weight dtype: float64
  total_revenue dtype: float64
  total_clicks dtype: int64

--- Applying negative sampling strategy ---
  Positive revenue observations: 5,770 (keeping all)
  Zero revenue observations: 41,043,403 (sampling 10%)
✅ Final analysis dataset: 4,110,110 rows
   Reduction: 90.0%

--- Running Core Fixed Effects Models on Sampled Data ---

--- Preparing data in R ---
[1] "Sampled observations: 4110110"
[1] "Unique users: 81286"
[1] "Unique vendors: 1000"
[1

R callback write-console: Error: in feols(ihs_revenue ~ ihs_clicks | user_id + vendor...: 
The only variable, 'ihs_clicks', is collinear with the fixed effects.
Without doubt, your model is misspecified.
  



ERROR running the models in R: Error: in feols(ihs_revenue ~ ihs_clicks | user_id + vendor...: 
The only variable, 'ihs_clicks', is collinear with the fixed effects.
Without doubt, your model is misspecified.


--- Analysis completed ---


Traceback (most recent call last):
  File "/var/folders/b7/1tvk5qmx0ds9c6gk2lrlhv380000gn/T/ipykernel_39849/899109279.py", line 166, in <module>
    ro.r(f"""
    ~~~~^^^^^
    start_time <- Sys.time()
    ^^^^^^^^^^^^^^^^^^^^^^^^
    ...<18 lines>...
    print(paste("→ Time taken:", round(difftime(end_time, start_time, units = "secs"), 1), "seconds"))
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    """)
    ^^^^
  File "/Users/pranjal/Code/topsort-incrementality/venv/lib/python3.13/site-packages/rpy2/robjects/__init__.py", line 552, in __call__
    res, visible = rinterface.evalr_expr_with_visible(   # type: ignore
                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
        r_expr
        ^^^^^^
    )
    ^
  File "/Users/pranjal/Code/topsort-incrementality/venv/lib/python3.13/site-packages/rpy2/rinterface/__init__.py", line 205, in evalr_expr_with_visible
    raise embedded.RRuntimeError(_rinterface._geter

In [7]:

import pandas as pd
import numpy as np

# Load data
parquet_filename = 'final_analysis_panel.parquet'
df = pd.read_parquet(parquet_filename, engine='pyarrow')

print("### DATA STRUCTURE DIAGNOSIS ###\n")

# Convert to numeric first
print("Converting columns to numeric...")
df['total_revenue'] = pd.to_numeric(df['total_revenue'], errors='coerce')
df['total_clicks'] = pd.to_numeric(df['total_clicks'], errors='coerce')
df['total_sales'] = pd.to_numeric(df['total_sales'], errors='coerce')

print(f"\nTotal rows: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")

# Check the actual data
print("\n" + "="*60)
print("KEY FINDINGS")
print("="*60)

print(f"1. Rows per user-vendor: {df.groupby(['user_id', 'vendor_id']).size().mean():.1f}")
print(f"   → This is CROSS-SECTIONAL data (one obs per user-vendor)\n")

print(f"2. Total clicks statistics:")
print(f"   - Sum: {df['total_clicks'].sum()}")
print(f"   - Max: {df['total_clicks'].max()}")
print(f"   - Non-zero: {(df['total_clicks'] > 0).sum()}")
print(f"   → NO CLICK DATA (all zeros!)\n")

print(f"3. Revenue statistics:")
print(f"   - Positive revenue obs: {(df['total_revenue'] > 0).sum():,}")
print(f"   - Max revenue: ${df['total_revenue'].max():,.2f}")

print(f"\n4. Sales statistics:")
print(f"   - Positive sales obs: {(df['total_sales'] > 0).sum():,}")
print(f"   - Max sales: {df['total_sales'].max()}")

# Check if we have the right columns
print("\n" + "="*60)
print("COLUMN CHECK")
print("="*60)
available_cols = df.columns.tolist()
print("Available columns:", available_cols)

# Look for potential click columns
potential_click_cols = [col for col in available_cols if 'click' in col.lower()]
potential_impression_cols = [col for col in available_cols if 'impression' in col.lower()]

if potential_click_cols:
    print(f"\nFound click columns: {potential_click_cols}")
    for col in potential_click_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        print(f"  {col}: sum={df[col].sum()}, max={df[col].max()}")

if potential_impression_cols:
    print(f"\nFound impression columns: {potential_impression_cols}")


### DATA STRUCTURE DIAGNOSIS ###

Converting columns to numeric...

Total rows: 41,049,173
Columns: ['week', 'user_id', 'vendor_id', 'weight', 'total_clicks', 'total_revenue', 'total_sales']

KEY FINDINGS
1. Rows per user-vendor: 1.0
   → This is CROSS-SECTIONAL data (one obs per user-vendor)

2. Total clicks statistics:
   - Sum: 0
   - Max: 0
   - Non-zero: 0
   → NO CLICK DATA (all zeros!)

3. Revenue statistics:
   - Positive revenue obs: 5,770
   - Max revenue: $10,833.00

4. Sales statistics:
   - Positive sales obs: 5,770
   - Max sales: 13

COLUMN CHECK
Available columns: ['week', 'user_id', 'vendor_id', 'weight', 'total_clicks', 'total_revenue', 'total_sales']

Found click columns: ['total_clicks']
  total_clicks: sum=0, max=0
