In [1]:
import pandas as pd
import numpy as np
import os
import sys
import time
from tabulate import tabulate
from dotenv import load_dotenv
import snowflake.connector

# --- Initialize Connection ---
load_dotenv()
conn = None
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("✅ Connection to Snowflake successful!")
except Exception as e:
    print(f"❌ ERROR: Could not connect to Snowflake. {e}", file=sys.stderr)
    exit()

# --- Load the Master Purchaser Universe ---
UNIVERSE_FILE = "final_purchaser_universe.parquet"
print(f"\n--- Loading master list from '{UNIVERSE_FILE}' ---")
try:
    df_universe = pd.read_parquet(UNIVERSE_FILE)
    print(f"✅ Loaded {len(df_universe):,} total unique purchasing users.")
except FileNotFoundError:
    print(f"❌ FATAL ERROR: The universe file '{UNIVERSE_FILE}' was not found.")
    exit()

✅ Connection to Snowflake successful!

--- Loading master list from 'final_purchaser_universe.parquet' ---
✅ Loaded 4,926,305 total unique purchasing users.


In [2]:
import polars as pl
import os
from tabulate import tabulate

# --- Configuration ---
# This is the main 32M row panel dataset
PANEL_FILE = "/Users/pranjal/Code/topsort-incrementality/panel/user_panel_full_history.parquet"
# This is the final output from your long-running script
HOLDOUT_FILE = "final_holdout_user_ids_final.parquet"
# This is the file we will create
OUTPUT_FILE = "user_panel_with_holdout_flag.parquet"

# --- Load the datasets using Polars ---
print("--- Loading Datasets ---")
try:
    print(f"-> Loading main panel from '{PANEL_FILE}'...")
    df_panel = pl.read_parquet(PANEL_FILE)
    print(f"   ✅ Loaded panel with {df_panel.height:,} rows.")

    print(f"-> Loading final holdout user list from '{HOLDOUT_FILE}'...")
    df_holdouts = pl.read_parquet(HOLDOUT_FILE)
    print(f"   ✅ Loaded {df_holdouts.height:,} holdout user IDs.")

    # --- NEW: Identify and load the "purchaser" list from the panel ---
    print(f"-> Identifying all users with at least one purchase from the panel...")
    df_purchasers = df_panel.filter(pl.col("purchases") > 0).select(pl.col("user_id").unique())
    print(f"   ✅ Identified {df_purchasers.height:,} unique purchasing users.")

except FileNotFoundError as e:
    print(f"\n❌ FATAL ERROR: A required file was not found. Please ensure the previous scripts have completed.")
    print(f"   Details: {e}")
    df_panel = None 

--- Loading Datasets ---
-> Loading main panel from '/Users/pranjal/Code/topsort-incrementality/panel/user_panel_full_history.parquet'...
   ✅ Loaded panel with 32,060,768 rows.
-> Loading final holdout user list from 'final_holdout_user_ids_final.parquet'...
   ✅ Loaded 784,133 holdout user IDs.
-> Identifying all users with at least one purchase from the panel...
   ✅ Identified 4,926,674 unique purchasing users.


In [3]:
if df_panel is not None:
    print("\n--- Enriching Panel with Holdout and Purchaser Flags ---")
    
    # --- Standardize column names to prevent errors ---
    print("-> Standardizing column names to lowercase...")
    df_panel.columns = [col.lower() for col in df_panel.columns]
    df_holdouts.columns = [col.lower() for col in df_holdouts.columns]
    df_purchasers.columns = [col.lower() for col in df_purchasers.columns]
    print("   ✅ Column names standardized.")
    
    # Use sets for maximum performance
    holdout_set = set(df_holdouts['user_id'])
    purchaser_set = set(df_purchasers['user_id'])

    # --- Create the flag columns using efficient expressions ---
    print("-> Creating 'is_holdout' and 'is_purchaser' columns...")
    df_enriched = df_panel.with_columns(
        is_holdout = pl.col('user_id').is_in(holdout_set).cast(pl.UInt8),
        is_purchaser = pl.col('user_id').is_in(purchaser_set).cast(pl.UInt8)
    )
    print("   ✅ Flag columns created successfully.")

    # --- Verification Step ---
    print("\n--- Verifying the new flags ---")
    print("Distribution of the 'is_holdout' flag:")
    print(df_enriched['is_holdout'].value_counts())
    print("\nDistribution of the 'is_purchaser' flag:")
    print(df_enriched['is_purchaser'].value_counts())


--- Enriching Panel with Holdout and Purchaser Flags ---
-> Standardizing column names to lowercase...
   ✅ Column names standardized.
-> Creating 'is_holdout' and 'is_purchaser' columns...
   ✅ Flag columns created successfully.

--- Verifying the new flags ---
Distribution of the 'is_holdout' flag:
shape: (2, 2)
┌────────────┬──────────┐
│ is_holdout ┆ count    │
│ ---        ┆ ---      │
│ u8         ┆ u32      │
╞════════════╪══════════╡
│ 0          ┆ 31220194 │
│ 1          ┆ 840574   │
└────────────┴──────────┘

Distribution of the 'is_purchaser' flag:
shape: (2, 2)
┌──────────────┬──────────┐
│ is_purchaser ┆ count    │
│ ---          ┆ ---      │
│ u8           ┆ u32      │
╞══════════════╪══════════╡
│ 0            ┆ 7703346  │
│ 1            ┆ 24357422 │
└──────────────┴──────────┘


In [4]:
if 'df_enriched' in locals():
    print("\n--- Performing Overall and Temporal EDA on the Enriched Panel ---")

    # --- 1. Calculate Overall Summary Statistics ---
    print("-> Calculating overall summary stats...")
    summary_stats = {
        "Total Rows (User-Weeks)": df_enriched.height,
        "Total Unique Users": df_enriched['user_id'].n_unique(),
        "Total Unique Purchasers": df_enriched.filter(pl.col('is_purchaser') == 1)['user_id'].n_unique(),
        "Total Unique Holdouts": df_enriched.filter(pl.col('is_holdout') == 1)['user_id'].n_unique(),
        "Total Clicks": df_enriched['clicks'].sum(),
        "Total Purchases": df_enriched['purchases'].sum(),
        "Total Revenue": df_enriched['revenue_dollars'].sum()
    }
    print("   ✅ Overall stats calculated.")

    # --- 2. Calculate Weekly Evolution of Metrics ---
    print("-> Calculating weekly evolution of metrics...")
    df_weekly = df_enriched.group_by('week').agg(
        active_users=pl.n_unique('user_id'),
        purchasing_users=pl.col('user_id').filter(pl.col('is_purchaser') == 1).n_unique(),
        holdout_users=pl.col('user_id').filter(pl.col('is_holdout') == 1).n_unique(),
        total_clicks=pl.sum('clicks'),
        total_purchases=pl.sum('purchases'),
        total_revenue=pl.sum('revenue_dollars')
    ).sort('week')
    print("   ✅ Weekly aggregates calculated.")

    # --- 3. Calculate Key Weekly Fractions ---
    print("-> Calculating weekly fractions...")
    df_weekly_fractions = df_weekly.with_columns(
        # What percentage of active users in a week are purchasers?
        purchaser_rate = pl.col('purchasing_users') / pl.col('active_users'),
        # What percentage of active users in a week are from the holdout group?
        holdout_rate = pl.col('holdout_users') / pl.col('active_users'),
        # How many clicks per active user?
        clicks_per_user = pl.col('total_clicks') / pl.col('active_users')
    ).select(['week', 'purchaser_rate', 'holdout_rate', 'clicks_per_user'])
    print("   ✅ Weekly fractions calculated.")
    
    # --- 4. Generate the Final Report ---
    report_filename = "panel_eda_summary_report.txt"
    with open(report_filename, "w") as f:
        f.write("Exploratory Data Analysis of the Enriched User-Week Panel\n")
        f.write("=" * 58 + "\n\n")

        f.write("Overall Dataset Summary\n")
        f.write("-----------------------\n")
        for key, value in summary_stats.items():
            f.write(f"- {key}: {value:,.0f}\n")
        f.write("\n\n")

        f.write("Evolution of Key Weekly Metrics\n")
        f.write("-------------------------------\n")
        f.write(tabulate(df_weekly.to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=",.0f"))
        f.write("\n\n")

        f.write("Evolution of Key Weekly Fractions\n")
        f.write("---------------------------------\n")
        f.write(tabulate(df_weekly_fractions.to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=".4f"))

    print(f"\n✅ ANALYSIS COMPLETE. A detailed EDA report has been saved to '{report_filename}'")
    
    # --- 5. Print a sample of the results to the console ---
    print("\n--- Evolution of Key Weekly Metrics (First 10 Weeks) ---")
    print(tabulate(df_weekly.head(10).to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=",.0f"))
    
    print("\n--- Evolution of Key Weekly Fractions (First 10 Weeks) ---")
    print(tabulate(df_weekly_fractions.head(10).to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=".4f"))


--- Performing Overall and Temporal EDA on the Enriched Panel ---
-> Calculating overall summary stats...
   ✅ Overall stats calculated.
-> Calculating weekly evolution of metrics...
   ✅ Weekly aggregates calculated.
-> Calculating weekly fractions...
   ✅ Weekly fractions calculated.

✅ ANALYSIS COMPLETE. A detailed EDA report has been saved to 'panel_eda_summary_report.txt'

--- Evolution of Key Weekly Metrics (First 10 Weeks) ---
+---------------------+----------------+--------------------+-----------------+----------------+-------------------+-----------------+
| week                |   active_users |   purchasing_users |   holdout_users |   total_clicks |   total_purchases |   total_revenue |
| 2025-03-10 00:00:00 |         571746 |             459848 |           14772 |        1537431 |            374695 |      17,165,085 |
+---------------------+----------------+--------------------+-----------------+----------------+-------------------+-----------------+
| 2025-03-17 00:00:00

In [1]:
import polars as pl
from tabulate import tabulate

# --- Configuration ---
ENRICHED_PANEL_FILE = "user_panel_with_holdout_flag.parquet"
FINAL_REPORT_FILE = "covariate_balance_check_report.txt"
CUTOFF_DATE = "2025-07-01"

# ==============================================================================
# PHASE 1: LOAD DATA AND DEFINE TIME PERIODS
# ==============================================================================
print("--- Phase 1: Loading Data and Defining Periods ---")
try:
    df_enriched = pl.read_parquet(ENRICHED_PANEL_FILE)
    print(f"✅ Successfully loaded panel with {df_enriched.height:,} rows.")
except FileNotFoundError:
    print(f"❌ FATAL ERROR: The enriched panel file '{ENRICHED_PANEL_FILE}' was not found.")
    df_enriched = None

if df_enriched is not None:
    cutoff_date_pl = pl.lit(CUTOFF_DATE).str.to_date()
    df_period1 = df_enriched.filter(pl.col("week") < cutoff_date_pl)
    df_period2 = df_enriched.filter(pl.col("week") >= cutoff_date_pl)
    print(f"   -> Split data with cutoff: {CUTOFF_DATE}")

    # ==============================================================================
    # PHASE 2: CONSTRUCT THE ANALYSIS DATAFRAME
    # ==============================================================================
    print("\n--- Phase 2: Constructing DataFrame with Controls ---")

    # Define Base Population (>= 3 purchases in P1)
    p1_user_purchases = df_period1.group_by("user_id").agg(total_purchases_p1=pl.sum("purchases"))
    base_users = p1_user_purchases.filter(pl.col("total_purchases_p1") >= 3).select("user_id")
    print(f"   -> Identified {base_users.height:,} users for the analysis base.")

    # Engineer Controls (X) from Period 1
    df_controls_p1 = df_period1.group_by("user_id").agg(
        revenue_p1=pl.sum("revenue_dollars"),
        purchases_p1=pl.sum("purchases"),
        clicks_p1=pl.sum("clicks")
    )
    
    # Get user holdout status
    user_holdout_status = df_enriched.select(["user_id", "is_holdout"]).unique(subset="user_id")
    
    # Construct the final DataFrame
    df_analysis = base_users.join(user_holdout_status, on="user_id", how="inner")
    df_analysis = df_analysis.join(df_controls_p1, on="user_id", how="left").fill_null(0)
    print(f"   -> Final analysis DataFrame has {df_analysis.height:,} users (rows).")
    
    # ==============================================================================
    # PHASE 3: COVARIATE BALANCE CHECK (INSPECTING THE CONTROLS)
    # ==============================================================================
    print("\n--- Phase 3: Performing Covariate Balance Check ---")

    # Group by cohort and calculate summary stats for Period 1 controls
    control_summary = df_analysis.group_by("is_holdout").agg(
        user_count=pl.len(),
        avg_revenue_p1=pl.mean("revenue_p1"),
        avg_purchases_p1=pl.mean("purchases_p1"),
        avg_clicks_p1=pl.mean("clicks_p1"),
        median_revenue_p1=pl.median("revenue_p1")
    ).with_columns(
        cohort = pl.when(pl.col('is_holdout') == 1).then(pl.lit("Control")).otherwise(pl.lit("Treatment"))
    ).select("cohort", "user_count", "avg_revenue_p1", "avg_purchases_p1", "avg_clicks_p1", "median_revenue_p1")
    
    print("   -> Summary statistics for controls calculated.")
    
    # ==============================================================================
    # PHASE 4: FINAL REPORTING
    # ==============================================================================
    print(f"\n--- Phase 4: Generating Final Report ---")

    with open(FINAL_REPORT_FILE, "w") as f:
        f.write("Covariate Balance Check Report (Comparison of Period 1 Controls)\n")
        f.write("=" * 64 + "\n\n")
        f.write("Methodology:\n")
        f.write("This table compares the average characteristics of the Treatment and Control groups\n")
        f.write("based *only* on their activity in Period 1 (before the outcome period).\n")
        f.write("Large differences here indicate strong selection bias that regression must control for.\n\n")
        
        f.write(tabulate(control_summary.to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=".2f"))

    print(f"\n✅ ANALYSIS COMPLETE. Covariate balance report saved to '{FINAL_REPORT_FILE}'")
    
    # Print the results to the console
    print("\n--- Covariate Balance Check ---")
    print(tabulate(control_summary.to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=".2f"))

--- Phase 1: Loading Data and Defining Periods ---
✅ Successfully loaded panel with 32,060,768 rows.
   -> Split data with cutoff: 2025-07-01

--- Phase 2: Constructing DataFrame with Controls ---
   -> Identified 1,119,128 users for the analysis base.
   -> Final analysis DataFrame has 1,119,128 users (rows).

--- Phase 3: Performing Covariate Balance Check ---
   -> Summary statistics for controls calculated.

--- Phase 4: Generating Final Report ---

✅ ANALYSIS COMPLETE. Covariate balance report saved to 'covariate_balance_check_report.txt'

--- Covariate Balance Check ---
+-----------+--------------+------------------+--------------------+-----------------+---------------------+
| cohort    |   user_count |   avg_revenue_p1 |   avg_purchases_p1 |   avg_clicks_p1 |   median_revenue_p1 |
| Treatment |      1111277 |           391.34 |               9.30 |           48.92 |              173.00 |
+-----------+--------------+------------------+--------------------+-----------------+----

In [8]:
!pip install doubleml scikit-learn lightgbm -q

import polars as pl
import pandas as pd
import numpy as np
import os
from tabulate import tabulate
import doubleml as dml
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.base import clone

# --- Configuration ---
ENRICHED_PANEL_FILE = "user_panel_with_holdout_flag.parquet"
FINAL_REPORT_FILE = "doubleml_irm_tenure_control_report.txt"
CUTOFF_DATE = "2025-07-01"

# ==============================================================================
# PHASE 1: FEATURE & OUTCOME ENGINEERING (WITH TENURE PROXY)
# ==============================================================================
print("--- Phase 1: Engineering Features, Outcomes, and Tenure Proxy ---")

df_enriched = pl.read_parquet(ENRICHED_PANEL_FILE)
cutoff_date_pl = pl.lit(CUTOFF_DATE).str.to_date()
df_period1 = df_enriched.filter(pl.col("week") < cutoff_date_pl)
df_period2 = df_enriched.filter(pl.col("week") >= cutoff_date_pl)

# Define Base Population (>= 3 purchases in P1)
p1_user_purchases = df_period1.group_by("user_id").agg(total_purchases_p1=pl.sum("purchases"))
base_users = p1_user_purchases.filter(pl.col("total_purchases_p1") >= 3).select("user_id")
print(f"   -> Identified {base_users.height:,} users for the analysis base.")

# --- Engineer the Tenure Proxy ---
df_tenure = df_period1.group_by("user_id").agg(
    first_week_p1=pl.min("week")
)
period1_start_date = df_period1.select(pl.min("week"))[0, 0]

# --- THE FIX IS HERE: Use .dt.total_days() instead of .dt.days() ---
df_tenure = df_tenure.with_columns(
    join_week_index_p1 = ((pl.col('first_week_p1') - period1_start_date).dt.total_days() // 7)
).select(["user_id", "join_week_index_p1"])
print("   -> Engineered 'join_week_index_p1' as a tenure proxy.")

# Engineer Per-Week Controls (X) from Period 1
df_controls_p1 = df_period1.group_by("user_id").agg(
    avg_weekly_revenue_p1=pl.mean("revenue_dollars"),
    avg_weekly_purchases_p1=pl.mean("purchases"),
    avg_weekly_clicks_p1=pl.mean("clicks")
)

# Engineer Per-Week Outcome (Y) from Period 2
df_outcomes_p2 = df_period2.group_by("user_id").agg(
    avg_weekly_revenue_p2=pl.mean("revenue_dollars")
)

# ==============================================================================
# PHASE 2: CONSTRUCT FINAL REGRESSION DATAFRAME
# ==============================================================================
print("\n--- Phase 2: Constructing Final DataFrame for Regression ---")
user_holdout_status = df_enriched.select(["user_id", "is_holdout"]).unique(subset="user_id")

df_analysis = base_users.join(user_holdout_status, on="user_id", how="inner")
df_analysis = df_analysis.join(df_controls_p1, on="user_id", how="left").fill_null(0)
df_analysis = df_analysis.join(df_outcomes_p2, on="user_id", how="left").fill_null(0)
df_analysis = df_analysis.join(df_tenure, on="user_id", how="left").fill_null(0)
df_analysis = df_analysis.with_columns(is_treated = 1 - pl.col("is_holdout"))

data_pd = df_analysis.to_pandas()
float_cols = [col for col in data_pd.columns if 'revenue' in col or 'purchases' in col or 'clicks' in col or 'index' in col]
data_pd[float_cols] = data_pd[float_cols].astype('float64')

print(f"   -> Final DataFrame for analysis has {len(data_pd):,} users (rows).")

# ==============================================================================
# PHASE 3: DOUBLEML SETUP AND ATE ESTIMATION
# ==============================================================================
print("\n--- Phase 3: Estimating ATE with Tenure Control ---")

x_cols = ['avg_weekly_revenue_p1', 'avg_weekly_purchases_p1', 'avg_weekly_clicks_p1', 'join_week_index_p1']
dml_data = dml.DoubleMLData(data_pd,
                              y_col='avg_weekly_revenue_p2',
                              d_cols='is_treated',
                              x_cols=x_cols)

learner_g = LGBMRegressor(n_jobs=-1, random_state=42, verbose=-1)
learner_m = LGBMClassifier(n_jobs=-1, random_state=42, verbose=-1)

dml_irm_obj = dml.DoubleMLIRM(dml_data,
                            ml_g=clone(learner_g),
                            ml_m=clone(learner_m))

dml_irm_obj.fit(store_predictions=True)
print("   -> DoubleML IRM model fitting complete.")
print(dml_irm_obj.summary)

# ==============================================================================
# PHASE 4: HETEROGENEITY ANALYSIS (GATES)
# ==============================================================================
print("\n--- Phase 4: Analyzing Heterogeneous Effects with GATEs ---")

tenure_median = data_pd['join_week_index_p1'].median()

groups = pd.DataFrame({
    'Early_Joiners_P1': (data_pd['join_week_index_p1'] <= tenure_median),
    'Late_Joiners_P1': (data_pd['join_week_index_p1'] > tenure_median)
})

gate_results = dml_irm_obj.gate(groups)
print("   -> Group Average Treatment Effects (GATEs) calculated.")
print(gate_results)

# ==============================================================================
# PHASE 5: FINAL REPORTING
# ==============================================================================
print(f"\n--- Phase 5: Generating Final Report ---")

with open(FINAL_REPORT_FILE, "w") as f:
    f.write("DoubleML IRM Analysis with Tenure Control\n")
    f.write("=" * 41 + "\n\n")
    f.write("Methodology:\n")
    f.write("A DoubleML IRM model was used to estimate the causal effect of ad exposure on the\n")
    f.write("average weekly revenue in Period 2. The model now includes a 'join_week_index_p1'\n")
    f.write("as a tenure proxy to control for when a user first became active.\n\n")
    
    f.write("Overall Average Treatment Effect (ATE) on Weekly Revenue\n")
    f.write("--------------------------------------------------------\n")
    f.write(str(dml_irm_obj.summary))
    f.write("\n\n")
    
    f.write("Heterogeneous Effects by User Tenure (GATEs)\n")
    f.write("--------------------------------------------\n")
    f.write("This table shows if the ATE on weekly revenue is different for users who joined\n")
    f.write("early in Period 1 vs. those who joined later.\n\n")
    f.write(str(gate_results))

print(f"\n✅ ANALYSIS COMPLETE. Final report saved to '{FINAL_REPORT_FILE}'")

--- Phase 1: Engineering Features, Outcomes, and Tenure Proxy ---
   -> Identified 1,119,128 users for the analysis base.
   -> Engineered 'join_week_index_p1' as a tenure proxy.

--- Phase 2: Constructing Final DataFrame for Regression ---
   -> Final DataFrame for analysis has 1,119,128 users (rows).

--- Phase 3: Estimating ATE with Tenure Control ---




   -> DoubleML IRM model fitting complete.
                coef   std err         t  P>|t|    2.5 %   97.5 %
is_treated  7.837105  0.171577  45.67686    0.0  7.50082  8.17339

--- Phase 4: Analyzing Heterogeneous Effects with GATEs ---
   -> Group Average Treatment Effects (GATEs) calculated.

------------------ Fit summary ------------------
                      coef   std err          t         P>|t|    [0.025  \
Early_Joiners_P1  9.797284  0.200115  48.958217  0.000000e+00  9.405065   
Late_Joiners_P1   4.873772  0.306875  15.881931  8.453542e-57  4.272307   

                     0.975]  
Early_Joiners_P1  10.189502  
Late_Joiners_P1    5.475236  

--- Phase 5: Generating Final Report ---

✅ ANALYSIS COMPLETE. Final report saved to 'doubleml_irm_tenure_control_report.txt'


In [16]:
!pip install doubleml scikit-learn lightgbm -q

import polars as pl
import pandas as pd
import numpy as np
import os
from tabulate import tabulate
import doubleml as dml
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.base import clone

# --- Configuration ---
ENRICHED_PANEL_FILE = "user_panel_with_holdout_flag.parquet"
CUTOFF_DATE = "2025-07-01"

# ==============================================================================
# PHASE 1: FEATURE & OUTCOME ENGINEERING (WITH TENURE PROXY)
# ==============================================================================
print("--- Phase 1: Engineering Features, Outcomes, and Tenure Proxy ---")

df_enriched = pl.read_parquet(ENRICHED_PANEL_FILE)
cutoff_date_pl = pl.lit(CUTOFF_DATE).str.to_date()
df_period1 = df_enriched.filter(pl.col("week") < cutoff_date_pl)
df_period2 = df_enriched.filter(pl.col("week") >= cutoff_date_pl)

# Define Base Population (>= 3 purchases in P1)
p1_user_purchases = df_period1.group_by("user_id").agg(total_purchases_p1=pl.sum("purchases"))
base_users = p1_user_purchases.filter(pl.col("total_purchases_p1") >= 3).select("user_id")
print(f"   -> Identified {base_users.height:,} users for the analysis base.")

# Engineer the Tenure Proxy
df_tenure = df_period1.group_by("user_id").agg(first_week_p1=pl.min("week"))
period1_start_date = df_period1.select(pl.min("week"))[0, 0]
df_tenure = df_tenure.with_columns(
    join_week_index_p1 = ((pl.col('first_week_p1') - period1_start_date).dt.total_days() // 7)
).select(["user_id", "join_week_index_p1"])
print("   -> Engineered 'join_week_index_p1' as a tenure proxy.")

# Engineer Per-Week Controls (X) from Period 1
df_controls_p1 = df_period1.group_by("user_id").agg(
    avg_weekly_revenue_p1=pl.mean("revenue_dollars"),
    avg_weekly_purchases_p1=pl.mean("purchases"),
    avg_weekly_clicks_p1=pl.mean("clicks")
)

# Engineer Per-Week Outcome (Y) from Period 2
df_outcomes_p2 = df_period2.group_by("user_id").agg(
    avg_weekly_revenue_p2=pl.mean("revenue_dollars")
)

# ==============================================================================
# PHASE 2: CONSTRUCT FINAL REGRESSION DATAFRAME
# ==============================================================================
print("\n--- Phase 2: Constructing Final DataFrame for Regression ---")
user_holdout_status = df_enriched.select(["user_id", "is_holdout"]).unique(subset="user_id")

df_analysis = base_users.join(user_holdout_status, on="user_id", how="inner")
df_analysis = df_analysis.join(df_controls_p1, on="user_id", how="left").fill_null(0)
df_analysis = df_analysis.join(df_outcomes_p2, on="user_id", how="left").fill_null(0)
df_analysis = df_analysis.join(df_tenure, on="user_id", how="left").fill_null(0)
df_analysis = df_analysis.with_columns(is_treated = 1 - pl.col("is_holdout"))

data_pd = df_analysis.to_pandas()
float_cols = [col for col in data_pd.columns if 'revenue' in col or 'purchases' in col or 'clicks' in col or 'index' in col]
data_pd[float_cols] = data_pd[float_cols].astype('float64')

print(f"   -> Final DataFrame for analysis has {len(data_pd):,} users (rows).")

# ==============================================================================
# PHASE 3: DOUBLEML ATE ESTIMATION WITH TENURE CONTROL
# ==============================================================================
print("\n--- Phase 3: Estimating ATE with Tenure Control ---")

x_cols = ['avg_weekly_revenue_p1', 'avg_weekly_purchases_p1', 'avg_weekly_clicks_p1', 'join_week_index_p1']
dml_data = dml.DoubleMLData(data_pd,
                              y_col='avg_weekly_revenue_p2',
                              d_cols='is_treated',
                              x_cols=x_cols)

learner_g = LGBMRegressor(n_jobs=-1, random_state=42, verbose=-1)
learner_m = LGBMClassifier(n_jobs=-1, random_state=42, verbose=-1)

dml_irm_obj = dml.DoubleMLIRM(dml_data,
                            ml_g=clone(learner_g),
                            ml_m=clone(learner_m))

dml_irm_obj.fit(store_predictions=True)
print("   -> DoubleML IRM model fitting complete.")

# ==============================================================================
# PHASE 4: FINAL INTERPRETABLE REPORTING
# ==============================================================================
print("\n--- Final Analysis: Interpreting Heterogeneous Treatment Effects ---")

# --- 4.1. INTERPRET THE OVERALL AVERAGE TREATMENT EFFECT (ATE) ---
control_group_data = data_pd[data_pd['is_treated'] == 0]
baseline_revenue = control_group_data['avg_weekly_revenue_p2'].mean()

ate_summary = dml_irm_obj.summary
ate_abs = ate_summary.loc['is_treated', 'coef']
ate_lower = ate_summary.loc['is_treated', '2.5 %']
ate_upper = ate_summary.loc['is_treated', '97.5 %']

ate_pct = (ate_abs / baseline_revenue) * 100 if baseline_revenue > 0 else float('inf')
ate_pct_lower = (ate_lower / baseline_revenue) * 100 if baseline_revenue > 0 else float('inf')
ate_pct_upper = (ate_upper / baseline_revenue) * 100 if baseline_revenue > 0 else float('inf')

print("\n" + "="*60)
print("Overall Average Treatment Effect (ATE) on Weekly Revenue")
print("="*60)
print(f"  - Baseline (Control Group Avg. Weekly Revenue): ${baseline_revenue:.2f}")
print(f"  - Absolute Incremental Lift:                    ${ate_abs:.2f} per user per week")
print(f"  - 95% Confidence Interval (Absolute):         (${ate_lower:.2f}, ${ate_upper:.2f})")
print(f"  - Relative Incremental Lift:                    {ate_pct:+.2f}%")
print(f"  - 95% Confidence Interval (Relative):         ({ate_pct_lower:+.2f}%, {ate_pct_upper:+.2f}%)")

# --- 4.2. COMPREHENSIVE HETEROGENEITY ANALYSIS (GATES) ---
control_vars = {
    'avg_weekly_revenue_p1': 'Avg. Weekly Revenue (P1)',
    'avg_weekly_purchases_p1': 'Avg. Weekly Purchases (P1)',
    'avg_weekly_clicks_p1': 'Avg. Weekly Clicks (P1)',
    'join_week_index_p1': 'Tenure (Join Week Index P1)'
}

for var, name in control_vars.items():
    print("\n" + "="*60)
    print(f"Heterogeneous Effects by: {name}")
    print("="*60)

    median_val = data_pd[var].median()
    groups = pd.DataFrame({
        f'High ({var} > {median_val:.2f})': (data_pd[var] > median_val),
        f'Low ({var} <= {median_val:.2f})': (data_pd[var] <= median_val)
    })

    gate_results = dml_irm_obj.gate(groups)
    gate_summary_df = gate_results.summary

    interpreted_gates = []
    for group_name in gate_summary_df.index:
        gate_abs = gate_summary_df.loc[group_name, 'coef']
        # --- THE FIX IS HERE ---
        # Use the correct, specific column names from the GATEs summary object
        gate_lower = gate_summary_df.loc[group_name, '[0.025']
        gate_upper = gate_summary_df.loc[group_name, '0.975]']
        
        group_mask = groups[group_name]
        control_in_gate = data_pd[(data_pd['is_treated'] == 0) & (group_mask)]
        gate_baseline = control_in_gate['avg_weekly_revenue_p2'].mean()
        
        if gate_baseline > 0:
            gate_pct = (gate_abs / gate_baseline) * 100
            gate_pct_lower = (gate_lower / gate_baseline) * 100
            gate_pct_upper = (gate_upper / gate_baseline) * 100
        else:
            gate_pct, gate_pct_lower, gate_pct_upper = np.nan, np.nan, np.nan
        
        interpreted_gates.append({
            'Subgroup': group_name,
            'ATE ($ Lift)': f"${gate_abs:.2f}",
            'ATE (% Lift)': f"{gate_pct:+.2f}%",
            '95% CI (% Lift)': f"({gate_pct_lower:+.2f}%, {gate_pct_upper:+.2f}%)",
            'Baseline Revenue': f"${gate_baseline:.2f}"
        })
    
    gates_df = pd.DataFrame(interpreted_gates)
    print(tabulate(gates_df, headers='keys', tablefmt='grid', showindex=False))

print("\n✅ ANALYSIS COMPLETE.")

--- Phase 1: Engineering Features, Outcomes, and Tenure Proxy ---
   -> Identified 1,119,128 users for the analysis base.
   -> Engineered 'join_week_index_p1' as a tenure proxy.

--- Phase 2: Constructing Final DataFrame for Regression ---
   -> Final DataFrame for analysis has 1,119,128 users (rows).

--- Phase 3: Estimating ATE with Tenure Control ---




   -> DoubleML IRM model fitting complete.

--- Final Analysis: Interpreting Heterogeneous Treatment Effects ---

Overall Average Treatment Effect (ATE) on Weekly Revenue
  - Baseline (Control Group Avg. Weekly Revenue): $27.32
  - Absolute Incremental Lift:                    $7.58 per user per week
  - 95% Confidence Interval (Absolute):         ($7.26, $7.91)
  - Relative Incremental Lift:                    +27.76%
  - 95% Confidence Interval (Relative):         (+26.56%, +28.96%)

Heterogeneous Effects by: Avg. Weekly Revenue (P1)
+--------------------------------------+----------------+----------------+--------------------+--------------------+
| Subgroup                             | ATE ($ Lift)   | ATE (% Lift)   | 95% CI (% Lift)    | Baseline Revenue   |
| High (avg_weekly_revenue_p1 > 30.07) | $13.21         | +41.68%        | (+39.69%, +43.68%) | $31.69             |
+--------------------------------------+----------------+----------------+--------------------+------------

## User-Vendor Spend Panel

In [20]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
import time
from tabulate import tabulate

# --- Standard Functions ---
load_dotenv()

def connect_to_snowflake():
    """Establishes a connection to Snowflake."""
    try:
        conn = snowflake.connector.connect(
            user=os.getenv('SNOWFLAKE_USER'),
            password=os.getenv('SNOWFLAKE_PASSWORD'),
            account=os.getenv('SNOWFLAKE_ACCOUNT'),
            warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
            database='INCREMENTALITY',
            schema='INCREMENTALITY_RESEARCH'
        )
        print("✅ Connected to Snowflake")
        return conn
    except snowflake.connector.Error as e:
        print(f"❌ Could not connect to Snowflake: {e}", file=sys.stderr)
        return None

def fetch_data(conn, query, query_name="Query"):
    """Executes a query, times it, and returns a pandas DataFrame."""
    print(f"   -> Executing: {query_name}...")
    start_time = time.time()
    try:
        df = pd.read_sql(query, conn)
        duration = time.time() - start_time
        print(f"   ✅ Query successful in {duration:.2f}s.")
        return df
    except Exception as e:
        print(f"   ❌ ERROR executing {query_name}: {e}", file=sys.stderr)
        return pd.DataFrame()

# --- NEW Refactored Query Functions ---

def get_purchases_date_range(conn):
    """Queries the PURCHASES table to find its min and max dates."""
    print("\n--- Phase 1: Discovering Full Date Range from PURCHASES ---")
    query = "SELECT MIN(PURCHASED_AT) AS min_date, MAX(PURCHASED_AT) AS max_date FROM PURCHASES;"
    df_dates = fetch_data(conn, query, "Date Range Query")
    if df_dates.empty:
        return None, None
    
    start_date = pd.to_datetime(df_dates['MIN_DATE'].iloc[0])
    end_date = pd.to_datetime(df_dates['MAX_DATE'].iloc[0])
    
    print(f"   -> Purchases data ranges from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}.")
    return start_date, end_date

def build_weekly_purchases_query(week_start, week_end):
    """Builds the SQL to get one week of purchase-based panel data for ALL users and vendors."""
    return f"""
    WITH
    -- This map is necessary to link a purchase's product_id to a vendor_id.
    PRODUCT_VENDOR_MAP AS (
        SELECT DISTINCT PRODUCT_ID, VENDOR_ID 
        FROM CLICKS 
        WHERE PRODUCT_ID IS NOT NULL AND VENDOR_ID IS NOT NULL
    )
    -- The main query only uses the PURCHASES table for activity.
    SELECT
        p.USER_ID,
        pvm.VENDOR_ID,
        DATE_TRUNC('WEEK', p.PURCHASED_AT) AS week,
        COUNT(DISTINCT p.PURCHASE_ID) AS purchases,
        (COALESCE(SUM(p.QUANTITY * p.UNIT_PRICE), 0) / 100)::DECIMAL(18, 2) AS revenue_dollars
    FROM PURCHASES AS p
    JOIN PRODUCT_VENDOR_MAP AS pvm ON p.PRODUCT_ID = pvm.PRODUCT_ID
    WHERE p.PURCHASED_AT >= '{week_start}' AND p.PURCHASED_AT < '{week_end}'
    GROUP BY 1, 2, 3;
    """

def main():
    # --- CONFIGURATION ---
    OUTPUT_FILENAME = "full_purchaser_vendor_panel_unfiltered.parquet"
    REPORT_FILENAME = "full_purchaser_vendor_panel_build_summary.txt"
    
    conn = connect_to_snowflake()
    if not conn:
        sys.exit(1)

    try:
        # Phase 1: Get the full date range dynamically
        start_date, end_date = get_purchases_date_range(conn)
        if not start_date or not end_date:
            print("Could not determine date range. Halting.")
            return

        # Phase 2: Loop through each week of the analysis period
        print("\n--- Phase 2: Building Purchaser Panel Incrementally ---")
        weekly_panels = []
        weekly_logs = []
        date_range = pd.date_range(start=start_date, end=end_date, freq='W-MON')

        for i, week_start in enumerate(date_range):
            week_end = week_start + pd.DateOffset(weeks=1)
            print(f"\nProcessing Week {i+1}/{len(date_range)}: {week_start.strftime('%Y-%m-%d')}...")
            
            weekly_query = build_weekly_purchases_query(week_start.strftime('%Y-%m-%d'), week_end.strftime('%Y-%m-%d'))
            weekly_df = fetch_data(conn, weekly_query, f"Week {i+1} Panel Query")
            
            if not weekly_df.empty:
                log_entry = {
                    "Week": week_start.strftime('%Y-%m-%d'),
                    "Rows": len(weekly_df),
                    "Unique Users": weekly_df['USER_ID'].nunique(),
                    "Unique Vendors": weekly_df['VENDOR_ID'].nunique()
                }
                print(f"   -> Data Shape: {weekly_df.shape}")
                print(f"   -> Weekly Stats: {log_entry['Unique Users']:,} users, {log_entry['Unique Vendors']:,} vendors")
                weekly_logs.append(log_entry)
                weekly_panels.append(weekly_df)
            else:
                print("   -> No data returned for this week.")

        # Phase 3: Concatenate, save, and report
        print("\n--- Phase 3: Finalizing Panel and Generating Report ---")
        if weekly_panels:
            final_panel_df = pd.concat(weekly_panels, ignore_index=True)
            print(f"   -> Concatenated all weekly data into a final panel.")
            
            # Add a 'clicks' column with all zeros for schema consistency
            final_panel_df['clicks'] = 0
            final_panel_df.columns = [col.lower() for col in final_panel_df.columns]
            
            # Reorder columns to the desired format
            final_panel_df = final_panel_df[['user_id', 'vendor_id', 'week', 'purchases', 'revenue_dollars', 'clicks']]
            
            final_panel_df.to_parquet(OUTPUT_FILENAME, index=False, engine='pyarrow')
            print(f"✅ Final purchaser-only panel successfully saved to '{OUTPUT_FILENAME}'")

            with open(REPORT_FILENAME, "w") as f:
                f.write("Full Purchaser-Vendor Panel Build Summary\n")
                f.write("=" * 42 + "\n\n")
                f.write("Overall Panel Statistics\n------------------------\n")
                f.write(f"- Total Rows: {len(final_panel_df):,}\n")
                f.write(f"- Total Unique Users: {final_panel_df['user_id'].nunique():,}\n")
                f.write(f"- Total Unique Vendors: {final_panel_df['vendor_id'].nunique():,}\n\n")
                f.write("Weekly Build Log\n----------------\n")
                summary_df = pd.DataFrame(weekly_logs)
                f.write(tabulate(summary_df, headers='keys', tablefmt='grid', showindex=False))
            print(f"✅ Summary report saved to '{REPORT_FILENAME}'")
        else:
            print("   -> No data was generated. No file saved.")

    finally:
        if conn and not conn.is_closed():
            conn.close()
            print("\n✅ Snowflake connection closed.")

if __name__ == "__main__":
    main()

✅ Connected to Snowflake

--- Phase 1: Discovering Full Date Range from PURCHASES ---
   -> Executing: Date Range Query...


  df = pd.read_sql(query, conn)


   ✅ Query successful in 53.56s.
   -> Purchases data ranges from 2025-03-14 to 2025-09-09.

--- Phase 2: Building Purchaser Panel Incrementally ---

Processing Week 1/26: 2025-03-17...
   -> Executing: Week 1 Panel Query...
   ✅ Query successful in 219.35s.
   -> Data Shape: (51317, 5)
   -> Weekly Stats: 45,780 users, 18,205 vendors

Processing Week 2/26: 2025-03-24...
   -> Executing: Week 2 Panel Query...
   ✅ Query successful in 217.03s.
   -> Data Shape: (60056, 5)
   -> Weekly Stats: 53,472 users, 20,330 vendors

Processing Week 3/26: 2025-03-31...
   -> Executing: Week 3 Panel Query...
   ✅ Query successful in 216.31s.
   -> Data Shape: (65819, 5)
   -> Weekly Stats: 58,067 users, 22,003 vendors

Processing Week 4/26: 2025-04-07...
   -> Executing: Week 4 Panel Query...
   ✅ Query successful in 215.01s.
   -> Data Shape: (69756, 5)
   -> Weekly Stats: 61,582 users, 22,894 vendors

Processing Week 5/26: 2025-04-14...
   -> Executing: Week 5 Panel Query...
   ✅ Query successful i

In [21]:
import polars as pl
import os
from tabulate import tabulate

# --- Configuration ---
# Input files from previous steps
USER_WEEK_PANEL_FILE = "user_panel_with_holdout_flag.parquet"
USER_VENDOR_PANEL_FILE = "full_purchaser_vendor_panel_unfiltered.parquet"

# Final output file for the analysis-ready dataset
OUTPUT_FILE = "final_analysis_user_level_dataset.parquet"
REPORT_FILE = "final_dataset_build_summary.txt"
CUTOFF_DATE = "2025-07-01"

# ==============================================================================
# PHASE 1: LOAD ALL SOURCE DATA
# ==============================================================================
print("--- Phase 1: Loading All Source Datasets ---")
try:
    df_user_panel = pl.read_parquet(USER_WEEK_PANEL_FILE)
    df_vendor_panel = pl.read_parquet(USER_VENDOR_PANEL_FILE)
    print("✅ All source panels loaded successfully.")
except FileNotFoundError as e:
    print(f"❌ FATAL ERROR: A required input file was not found. Details: {e}")
    df_user_panel = None

if df_user_panel is not None:
    # ==============================================================================
    # PHASE 2: DEFINE TIME PERIODS AND BASE POPULATION
    # ==============================================================================
    print("\n--- Phase 2: Defining Time Periods and Base Population ---")
    cutoff_date_pl = pl.lit(CUTOFF_DATE).str.to_date()

    # Split both panels by the cutoff date
    df_user_p1 = df_user_panel.filter(pl.col("week") < cutoff_date_pl)
    df_user_p2 = df_user_panel.filter(pl.col("week") >= cutoff_date_pl)
    df_vendor_p1 = df_vendor_panel.filter(pl.col("week") < cutoff_date_pl)
    df_vendor_p2 = df_vendor_panel.filter(pl.col("week") >= cutoff_date_pl)

    # Define Base Population (>= 3 purchases in P1) from the user panel
    p1_user_purchases = df_user_p1.group_by("user_id").agg(total_purchases_p1=pl.sum("purchases"))
    base_users = p1_user_purchases.filter(pl.col("total_purchases_p1") >= 3).select("user_id")
    print(f"   -> Identified {base_users.height:,} users for the analysis base (>= 3 purchases in P1).")

    # ==============================================================================
    # PHASE 3: FEATURE ENGINEERING FROM PERIOD 1
    # ==============================================================================
    print("\n--- Phase 3: Engineering All Features from Period 1 ---")

    # Engineer "older" features (clicks, tenure) from the user panel
    df_controls_user_p1 = df_user_p1.group_by("user_id").agg(
        clicks_p1=pl.sum("clicks")
    )
    df_tenure = df_user_p1.group_by("user_id").agg(first_week_p1=pl.min("week"))
    period1_start_date = df_user_p1.select(pl.min("week"))[0, 0]
    df_tenure = df_tenure.with_columns(
        join_week_index_p1=((pl.col('first_week_p1') - period1_start_date).dt.total_days() // 7)
    ).select(["user_id", "join_week_index_p1"])
    print("   -> Engineered clicks and tenure features.")

    # Engineer "newer" features (vendor loyalty/variety) from the vendor panel
    df_controls_vendor_p1_agg1 = df_vendor_p1.group_by(["user_id", "vendor_id"]).agg(
        revenue_per_vendor=pl.sum("revenue_dollars")
    )
    df_controls_vendor_p1 = df_controls_vendor_p1_agg1.group_by("user_id").agg(
        revenue_p1=pl.sum("revenue_per_vendor"),
        distinct_vendors_p1=pl.n_unique("vendor_id"),
        spend_on_top_vendor_p1=pl.max("revenue_per_vendor")
    ).with_columns(
        spend_concentration_p1 = pl.col("spend_on_top_vendor_p1") / pl.col("revenue_p1")
    ).drop("spend_on_top_vendor_p1")
    print("   -> Engineered vendor-based loyalty and variety features.")
    
    # Also get purchases_p1 from the base definition step
    df_purchases_p1 = p1_user_purchases.select(["user_id", pl.col("total_purchases_p1").alias("purchases_p1")])

    # ==============================================================================
    # PHASE 4: OUTCOME ENGINEERING FROM PERIOD 2
    # ==============================================================================
    print("\n--- Phase 4: Engineering Outcome from Period 2 ---")
    df_outcome_p2 = df_user_p2.group_by("user_id").agg(
        revenue_p2=pl.sum("revenue_dollars")
    )
    print(f"   -> Calculated Period 2 revenue for {df_outcome_p2.height:,} users.")

    # ==============================================================================
    # PHASE 5: CONSTRUCT AND SAVE THE FINAL DATASET
    # ==============================================================================
    print("\n--- Phase 5: Constructing and Saving the Final Analysis Dataset ---")
    
    # Get the static holdout flag for all users
    user_holdout_status = df_user_panel.select(["user_id", "is_holdout"]).unique(subset="user_id")
    
    # Start with our base of qualified users and their holdout status
    df_final = base_users.join(user_holdout_status, on="user_id", how="inner")
    
    # Sequentially join all our engineered feature sets
    df_final = df_final.join(df_purchases_p1, on="user_id", how="left")
    df_final = df_final.join(df_controls_user_p1, on="user_id", how="left")
    df_final = df_final.join(df_tenure, on="user_id", how="left")
    df_final = df_final.join(df_controls_vendor_p1, on="user_id", how="left")
    df_final = df_final.join(df_outcome_p2, on="user_id", how="left")
    
    # Create the 'is_treated' (T) flag and fill any nulls that may have resulted from joins
    df_final = df_final.with_columns(is_treated = 1 - pl.col("is_holdout")).fill_null(0)
    
    print(f"   -> Final dataset constructed with {df_final.height:,} users and {df_final.width} columns.")
    
    # Save the final dataset
    df_final.write_parquet(OUTPUT_FILE)
    print(f"✅ Final analysis-ready dataset saved to '{OUTPUT_FILE}'")

    # Generate a final summary report of the created dataset
    with open(REPORT_FILE, "w") as f:
        f.write("Summary of Final Analysis-Ready Dataset\n")
        f.write("=" * 40 + "\n\n")
        f.write("This dataset contains a single row for each user in the 'Base' population\n")
        f.write("(>= 3 purchases in Period 1), with features from Period 1 and the outcome from Period 2.\n\n")
        f.write(f"Total Users (Rows): {df_final.height:,}\n")
        f.write(f"Total Columns: {df_final.width}\n\n")
        f.write("Schema and Sample Data:\n")
        f.write("-----------------------\n")
        # Use pandas for tabulate, as it handles formatting well
        f.write(tabulate(df_final.head(10).to_pandas(), headers='keys', tablefmt='grid', showindex=False, floatfmt=".2f"))

    print(f"✅ A summary report of the new dataset has been saved to '{REPORT_FILE}'")

--- Phase 1: Loading All Source Datasets ---
✅ All source panels loaded successfully.

--- Phase 2: Defining Time Periods and Base Population ---
   -> Identified 1,119,128 users for the analysis base (>= 3 purchases in P1).

--- Phase 3: Engineering All Features from Period 1 ---
   -> Engineered clicks and tenure features.
   -> Engineered vendor-based loyalty and variety features.

--- Phase 4: Engineering Outcome from Period 2 ---
   -> Calculated Period 2 revenue for 5,397,437 users.

--- Phase 5: Constructing and Saving the Final Analysis Dataset ---
   -> Final dataset constructed with 1,119,128 users and 10 columns.
✅ Final analysis-ready dataset saved to 'final_analysis_user_level_dataset.parquet'
✅ A summary report of the new dataset has been saved to 'final_dataset_build_summary.txt'


In [23]:
!pip install doubleml scikit-learn lightgbm -q

import polars as pl
import pandas as pd
import numpy as np
import os
from tabulate import tabulate
import doubleml as dml
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.base import clone

# --- Configuration ---
ANALYSIS_DATA_FILE = "final_analysis_user_level_dataset.parquet"

# ==============================================================================
# PHASE 1 & 2: LOAD AND PREPARE DATA (Unchanged)
# ==============================================================================
print("--- Phase 1 & 2: Loading and Preparing Data ---")

data_pd = pd.read_parquet(ANALYSIS_DATA_FILE)
# Ensure all key numeric columns are standard float64
float_cols = [col for col in data_pd.columns if col not in ['user_id', 'is_holdout', 'is_treated']]
data_pd[float_cols] = data_pd[float_cols].astype('float64')
print(f"✅ Successfully loaded and prepared data with {len(data_pd):,} users.")

# ==============================================================================
# PHASE 3: DOUBLEML ATE ESTIMATION (Unchanged)
# ==============================================================================
print("\n--- Phase 3: Estimating Overall Average Treatment Effect (ATE) ---")

x_cols = ['revenue_p1', 'purchases_p1', 'clicks_p1', 'join_week_index_p1', 'distinct_vendors_p1', 'spend_concentration_p1']
dml_data = dml.DoubleMLData(data_pd, y_col='revenue_p2', d_cols='is_treated', x_cols=x_cols)
learner_g = LGBMRegressor(n_jobs=-1, random_state=42, verbose=-1)
learner_m = LGBMClassifier(n_jobs=-1, random_state=42, verbose=-1)
dml_irm_obj = dml.DoubleMLIRM(dml_data, ml_g=clone(learner_g), ml_m=clone(learner_m))
dml_irm_obj.fit(store_predictions=True)
print("   -> DoubleML IRM model fitting complete.")

# ==============================================================================
# PHASE 4: GRANULAR HETEROGENEITY ANALYSIS (NEW LOGIC)
# ==============================================================================
print("\n--- Final Analysis: Interpreting Granular Heterogeneous Treatment Effects ---")

# --- Helper function to create quantile groups robustly ---
def create_quantile_groups(data, var, n_quantiles, var_name):
    try:
        # Use qcut to create bins. duplicates='drop' handles non-unique bin edges.
        quantiles = pd.qcut(data[var], q=n_quantiles, labels=False, duplicates='drop')
        n_groups = quantiles.nunique()
        
        # If we get enough groups, use them
        if n_groups >= min(n_quantiles, 4):
            print(f"   -> Creating {n_groups} quantiles for '{var_name}'...")
            labels = [f"Quantile {i+1}" for i in range(n_groups)]
            if n_groups > 2:
                labels[0] += " (Lowest)"
                labels[-1] += " (Highest)"
            
            groups = pd.get_dummies(quantiles, prefix=f"Q").rename(
                columns={f"Q_{i}.0": labels[i] for i in range(n_groups)}
            )
            return groups
    except Exception:
        pass # Fallback if qcut fails for any reason
    
    # Fallback to median split if quantiles are not possible
    print(f"   -> Could not create quantiles for '{var_name}'. Falling back to median split.")
    median_val = data[var].median()
    groups = pd.DataFrame({
        f'High ({var_name} > {median_val:.2f})': (data[var] > median_val),
        f'Low ({var_name} <= {median_val:.2f})': (data[var] <= median_val)
    })
    return groups

# --- Overall ATE Interpretation ---
control_group_data = data_pd[data_pd['is_treated'] == 0]
baseline_revenue = control_group_data['revenue_p2'].mean()
ate_summary = dml_irm_obj.summary
ate_abs = ate_summary.loc['is_treated', 'coef']
ate_lower = ate_summary.loc['is_treated', '2.5 %']
ate_upper = ate_summary.loc['is_treated', '97.5 %']
ate_pct = (ate_abs / baseline_revenue) * 100 if baseline_revenue > 0 else float('inf')
ate_pct_lower = (ate_lower / baseline_revenue) * 100 if baseline_revenue > 0 else float('inf')
ate_pct_upper = (ate_upper / baseline_revenue) * 100 if baseline_revenue > 0 else float('inf')

print("\n" + "="*80)
print("Overall Average Treatment Effect (ATE) on Period 2 Revenue")
print("="*80)
print(f"  - Baseline (Control Group Avg. Revenue): ${baseline_revenue:.2f}")
print(f"  - Absolute Incremental Lift:             ${ate_abs:.2f} per user")
print(f"  - 95% Confidence Interval (Absolute):  (${ate_lower:.2f}, ${ate_upper:.2f})")
print(f"  - Relative Incremental Lift:             {ate_pct:+.2f}%")
print(f"  - 95% Confidence Interval (Relative):  ({ate_pct_lower:+.2f}%, {ate_pct_upper:+.2f}%)")

# --- Comprehensive Heterogeneity Analysis Loop ---
control_vars = {
    'revenue_p1': 'Revenue (P1)',
    'purchases_p1': 'Purchases (P1)',
    'clicks_p1': 'Clicks (P1)',
    'join_week_index_p1': 'Tenure (Join Week P1)',
    'distinct_vendors_p1': 'Vendor Variety (P1)',
    'spend_concentration_p1': 'Spend Concentration (P1)'
}

for var, name in control_vars.items():
    print("\n" + "="*80)
    print(f"Heterogeneous Effects by: {name}")
    print("="*80)
    
    # Use our robust helper function to create groups, trying for deciles first
    groups = create_quantile_groups(data_pd, var, 10, name)
    if len(groups.columns) < 4: # Fallback to quartiles if deciles failed
        groups = create_quantile_groups(data_pd, var, 4, name)

    gate_results = dml_irm_obj.gate(groups)
    gate_summary_df = gate_results.summary

    interpreted_gates = []
    for group_name in sorted(gate_summary_df.index): # Sort to ensure quantiles are in order
        gate_abs = gate_summary_df.loc[group_name, 'coef']
        gate_lower = gate_summary_df.loc[group_name, '[0.025']
        gate_upper = gate_summary_df.loc[group_name, '0.975]']
        
        group_mask = groups[group_name]
        control_in_gate = data_pd[(data_pd['is_treated'] == 0) & (group_mask)]
        gate_baseline = control_in_gate['revenue_p2'].mean()
        
        if gate_baseline > 0:
            gate_pct = (gate_abs / gate_baseline) * 100
            gate_pct_lower = (gate_lower / gate_baseline) * 100
            gate_pct_upper = (gate_upper / gate_baseline) * 100
        else:
            gate_pct, gate_pct_lower, gate_pct_upper = np.nan, np.nan, np.nan
        
        interpreted_gates.append({
            'Subgroup': group_name,
            'ATE ($ Lift)': f"${gate_abs:.2f}",
            'ATE (% Lift)': f"{gate_pct:+.2f}%",
            '95% CI (% Lift)': f"({gate_pct_lower:+.2f}%, {gate_pct_upper:+.2f}%)",
            'Baseline Revenue': f"${gate_baseline:.2f}"
        })
    
    gates_df = pd.DataFrame(interpreted_gates)
    print(tabulate(gates_df, headers='keys', tablefmt='grid', showindex=False))

print("\n✅ ANALYSIS COMPLETE.")

--- Phase 1 & 2: Loading and Preparing Data ---
✅ Successfully loaded and prepared data with 1,119,128 users.

--- Phase 3: Estimating Overall Average Treatment Effect (ATE) ---




   -> DoubleML IRM model fitting complete.

--- Final Analysis: Interpreting Granular Heterogeneous Treatment Effects ---

Overall Average Treatment Effect (ATE) on Period 2 Revenue
  - Baseline (Control Group Avg. Revenue): $84.32
  - Absolute Incremental Lift:             $33.72 per user
  - 95% Confidence Interval (Absolute):  ($30.00, $37.44)
  - Relative Incremental Lift:             +39.99%
  - 95% Confidence Interval (Relative):  (+35.58%, +44.41%)

Heterogeneous Effects by: Revenue (P1)
   -> Creating 5 quantiles for 'Revenue (P1)'...
+------------+----------------+----------------+--------------------+--------------------+
| Subgroup   | ATE ($ Lift)   | ATE (% Lift)   | 95% CI (% Lift)    | Baseline Revenue   |
| Q_0        | $28.29         | +56.92%        | (+51.85%, +61.99%) | $49.70             |
+------------+----------------+----------------+--------------------+--------------------+
| Q_1        | $23.77         | +35.08%        | (+26.94%, +43.22%) | $67.75           

In [25]:
import polars as pl
import os
from tabulate import tabulate

# --- 1. Configuration ---
ORIGINAL_HOLDOUT_FILE = "final_holdout_user_ids_final.parquet"
USER_PANEL_FILE = "user_panel_with_holdout_flag.parquet"
CLEANED_HOLDOUT_FILE = "final_holdout_user_ids_CLEANED.parquet"
REPORT_FILE = "holdout_cleaning_summary_report.txt"

# ==============================================================================
# PHASE 1: LOAD DATA
# ==============================================================================
print("--- Phase 1: Loading Data for Cleaning ---")
try:
    df_holdouts_original = pl.read_parquet(ORIGINAL_HOLDOUT_FILE)
    df_panel = pl.read_parquet(USER_PANEL_FILE)
    print("✅ Successfully loaded original holdout list and the main user panel.")
except FileNotFoundError as e:
    print(f"❌ FATAL ERROR: A required input file was not found. Details: {e}")
    df_holdouts_original = None

if df_holdouts_original is not None:
    # --- FIX: Standardize column names immediately after loading ---
    print("-> Standardizing column names to lowercase for consistency...")
    df_holdouts_original.columns = [col.lower() for col in df_holdouts_original.columns]
    df_panel.columns = [col.lower() for col in df_panel.columns]
    print("   ✅ Column names standardized.")
    
    # ==============================================================================
    # PHASE 2: IDENTIFY AND SIZE THE "LEAKED" HOLDOUTS
    # ==============================================================================
    print("\n--- Phase 2: Identifying Holdouts with Click Activity ---")

    df_all_clickers = df_panel.filter(pl.col("clicks") > 0).select("user_id").unique()
    
    # Now the join will work correctly.
    df_leaked_holdouts = df_holdouts_original.join(df_all_clickers, on="user_id", how="inner")
    
    original_holdout_count = df_holdouts_original.height
    leaked_holdout_count = df_leaked_holdouts.height
    
    print(f"   -> Original Holdout Count: {original_holdout_count:,}")
    print(f"   -> Found {leaked_holdout_count:,} holdout users with at least one click.")
    
    # ==============================================================================
    # PHASE 3: REMOVE LEAKED USERS AND CREATE THE CLEANED LIST
    # ==============================================================================
    print("\n--- Phase 3: Creating the Cleaned Holdout List ---")

    df_holdouts_cleaned = df_holdouts_original.join(df_leaked_holdouts, on="user_id", how="anti")
    
    cleaned_holdout_count = df_holdouts_cleaned.height
    
    print(f"   -> Removed {leaked_holdout_count:,} users.")
    print(f"   -> New, Cleaned Holdout Count: {cleaned_holdout_count:,}")

    # ==============================================================================
    # PHASE 4: SAVE THE CLEANED LIST AND GENERATE REPORT
    # ==============================================================================
    print(f"\n--- Phase 4: Saving Cleaned List and Generating Report ---")
    
    df_holdouts_cleaned.write_parquet(CLEANED_HOLDOUT_FILE)
    print(f"✅ Cleaned holdout user list successfully saved to '{CLEANED_HOLDOUT_FILE}'")

    with open(REPORT_FILE, "w") as f:
        f.write("Holdout Group Cleaning Summary\n")
        f.write("=" * 30 + "\n\n")
        f.write("Methodology:\n")
        f.write("The original holdout list was checked against all users with click activity.\n")
        f.write("Any holdout user found to have at least one click was removed to create a\n")
        f.write("new, clean, and definitive holdout list for analysis.\n\n")
        
        f.write("Summary of Cleaning Process\n")
        f.write("---------------------------\n")
        f.write(f"- Original Holdout Count: {original_holdout_count:,}\n")
        f.write(f"- Users Removed (with clicks): {leaked_holdout_count:,}\n")
        f.write(f"- Final Cleaned Holdout Count: {cleaned_holdout_count:,}\n\n")
        f.write(f"The cleaned list has been saved to:\n'{CLEANED_HOLDOUT_FILE}'\n")

    print(f"✅ A summary report has been saved to '{REPORT_FILE}'")

--- Phase 1: Loading Data for Cleaning ---
✅ Successfully loaded original holdout list and the main user panel.
-> Standardizing column names to lowercase for consistency...
   ✅ Column names standardized.

--- Phase 2: Identifying Holdouts with Click Activity ---
   -> Original Holdout Count: 784,133
   -> Found 3,164 holdout users with at least one click.

--- Phase 3: Creating the Cleaned Holdout List ---
   -> Removed 3,164 users.
   -> New, Cleaned Holdout Count: 780,969

--- Phase 4: Saving Cleaned List and Generating Report ---
✅ Cleaned holdout user list successfully saved to 'final_holdout_user_ids_CLEANED.parquet'
✅ A summary report has been saved to 'holdout_cleaning_summary_report.txt'


In [26]:
import polars as pl
import pandas as pd
import numpy as np
import os
from tabulate import tabulate
import doubleml as dml
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.base import clone

# --- Configuration ---
USER_WEEK_PANEL_FILE = "user_panel_with_holdout_flag.parquet"
USER_VENDOR_PANEL_FILE = "full_purchaser_vendor_panel_unfiltered.parquet"
CLEANED_HOLDOUT_FILE = "final_holdout_user_ids_CLEANED.parquet"
FINAL_REPORT_FILE = "final_multi_outcome_causal_report.txt"
CUTOFF_DATE = "2025-07-01"

# ==============================================================================
# PHASE 1: COMPREHENSIVE FEATURE AND OUTCOME ENGINEERING
# ==============================================================================
print("--- Phase 1: Engineering All Features and Outcomes ---")

# --- Load Data ---
df_user_panel = pl.read_parquet(USER_WEEK_PANEL_FILE)
df_vendor_panel = pl.read_parquet(USER_VENDOR_PANEL_FILE)
df_holdouts_cleaned = pl.read_parquet(CLEANED_HOLDOUT_FILE)

# --- Define Time Periods ---
cutoff_date_pl = pl.lit(CUTOFF_DATE).str.to_date()
df_user_p1 = df_user_panel.filter(pl.col("week") < cutoff_date_pl)
df_user_p2 = df_user_panel.filter(pl.col("week") >= cutoff_date_pl)
df_vendor_p1 = df_vendor_panel.filter(pl.col("week") < cutoff_date_pl)
df_vendor_p2 = df_vendor_panel.filter(pl.col("week") >= cutoff_date_pl)

# --- Define Base Population (>= 3 purchases in P1) ---
p1_user_purchases = df_user_p1.group_by("user_id").agg(total_purchases_p1=pl.sum("purchases"))
base_users = p1_user_purchases.filter(pl.col("total_purchases_p1") >= 3).select("user_id")
print(f"   -> Identified {base_users.height:,} users for the analysis base.")

# --- Engineer Period 1 Controls (X) ---
df_controls_user_p1 = df_user_p1.group_by("user_id").agg(clicks_p1=pl.sum("clicks"))
df_tenure = df_user_p1.group_by("user_id").agg(first_week_p1=pl.min("week"))
period1_start_date = df_user_p1.select(pl.min("week"))[0, 0]
df_tenure = df_tenure.with_columns(
    join_week_index_p1=((pl.col('first_week_p1') - period1_start_date).dt.total_days() // 7)
).select(["user_id", "join_week_index_p1"])
df_controls_vendor_p1_agg1 = df_vendor_p1.group_by(["user_id", "vendor_id"]).agg(revenue_per_vendor=pl.sum("revenue_dollars"))
df_controls_vendor_p1 = df_controls_vendor_p1_agg1.group_by("user_id").agg(
    revenue_p1=pl.sum("revenue_per_vendor"),
    distinct_vendors_p1=pl.n_unique("vendor_id"),
    spend_on_top_vendor_p1=pl.max("revenue_per_vendor")
).with_columns(spend_concentration_p1 = pl.col("spend_on_top_vendor_p1") / pl.col("revenue_p1")).drop("spend_on_top_vendor_p1")
df_purchases_p1 = p1_user_purchases.select(["user_id", pl.col("total_purchases_p1").alias("purchases_p1")])
print("   -> Engineered all Period 1 control variables.")

# --- Engineer Period 2 Outcomes (Y vector) ---
df_outcomes_p2_base = df_user_p2.group_by("user_id").agg(
    revenue_p2=pl.sum("revenue_dollars"),
    purchases_p2=pl.sum("purchases")
)
df_outcomes_p2_vendor_agg1 = df_vendor_p2.group_by(["user_id", "vendor_id"]).agg(revenue_per_vendor=pl.sum("revenue_dollars"))
df_outcomes_p2_vendor = df_outcomes_p2_vendor_agg1.group_by("user_id").agg(
    p2_total_rev=pl.sum("revenue_per_vendor"), # temp column
    distinct_vendors_p2=pl.n_unique("vendor_id"),
    p2_spend_on_top=pl.max("revenue_per_vendor")
).with_columns(spend_concentration_p2 = pl.col("p2_spend_on_top") / pl.col("p2_total_rev")).select(["user_id", "distinct_vendors_p2", "spend_concentration_p2"])
print("   -> Engineered all Period 2 outcome variables.")

# --- Construct Final Analysis DataFrame ---
holdout_set = set(df_holdouts_cleaned['user_id'])
df_final = base_users.with_columns(is_holdout = pl.col("user_id").is_in(holdout_set).cast(pl.UInt8))
# Join all features and outcomes
df_final = df_final.join(df_purchases_p1, on="user_id", how="left")
df_final = df_final.join(df_controls_user_p1, on="user_id", how="left")
df_final = df_final.join(df_tenure, on="user_id", how="left")
df_final = df_final.join(df_controls_vendor_p1, on="user_id", how="left")
df_final = df_final.join(df_outcomes_p2_base, on="user_id", how="left")
df_final = df_final.join(df_outcomes_p2_vendor, on="user_id", how="left")
df_final = df_final.with_columns(is_treated = 1 - pl.col("is_holdout")).fill_null(0)
data_pd = df_final.to_pandas().astype({col: 'float64' for col in df_final.columns if col != 'user_id'})
print(f"   -> Final DataFrame constructed with {len(data_pd):,} users.")

# ==============================================================================
# PHASE 2: MULTI-OUTCOME CAUSAL ANALYSIS
# ==============================================================================
print("\n--- Phase 2: Running DoubleML Causal Analysis for Multiple Outcomes ---")

outcome_vars = {
    'revenue_p2': 'Revenue ($)',
    'purchases_p2': 'Purchases (#)',
    'distinct_vendors_p2': 'Vendor Variety (#)',
    'spend_concentration_p2': 'Spend Concentration (%)'
}
x_cols = ['revenue_p1', 'purchases_p1', 'clicks_p1', 'join_week_index_p1', 'distinct_vendors_p1', 'spend_concentration_p1']
all_results = []

for y_var, y_name in outcome_vars.items():
    print(f"\n--- Analyzing Outcome: {y_name} ---")
    
    dml_data = dml.DoubleMLData(data_pd, y_col=y_var, d_cols='is_treated', x_cols=x_cols)
    learner_g = LGBMRegressor(n_jobs=-1, random_state=42, verbose=-1)
    learner_m = LGBMClassifier(n_jobs=-1, random_state=42, verbose=--1)
    dml_irm_obj = dml.DoubleMLIRM(dml_data, ml_g=clone(learner_g), ml_m=clone(learner_m))
    dml_irm_obj.fit()
    
    # --- Interpret and store results ---
    control_baseline = data_pd[data_pd['is_treated'] == 0][y_var].mean()
    ate_summary = dml_irm_obj.summary
    ate_abs = ate_summary.loc['is_treated', 'coef']
    ate_lower = ate_summary.loc['is_treated', '2.5 %']
    ate_upper = ate_summary.loc['is_treated', '97.5 %']
    ate_pct = (ate_abs / control_baseline) * 100 if control_baseline != 0 else float('inf')
    
    all_results.append({
        "Outcome Variable": y_name,
        "Control Group Baseline": f"{control_baseline:.2f}",
        "Incremental Lift (ATE)": f"{ate_abs:+.2f}",
        "Relative Lift (%)": f"{ate_pct:+.2f}%",
        "95% CI (Absolute)": f"({ate_lower:.2f}, {ate_upper:.2f})"
    })
    print(f"   -> ATE for {y_name}: {ate_abs:+.2f} ({ate_pct:+.2f}%)")

# ==============================================================================
# PHASE 3: FINAL REPORTING
# ==============================================================================
print("\n--- Phase 3: Generating Final Report ---")

results_df = pd.DataFrame(all_results)
with open(FINAL_REPORT_FILE, "w") as f:
    f.write("Multi-Outcome Causal Analysis of Advertising\n")
    f.write("=" * 44 + "\n\n")
    f.write("Methodology:\n")
    f.write("A DoubleML IRM model was run independently for four different outcome variables from Period 2.\n")
    f.write("Each model controlled for a user's complete behavioral profile from Period 1.\n\n")
    f.write("Summary of Average Treatment Effects (ATE)\n")
    f.write("----------------------------------------\n")
    f.write(tabulate(results_df, headers='keys', tablefmt='grid', showindex=False))

print(f"\n✅ ANALYSIS COMPLETE. Final multi-outcome report saved to '{FINAL_REPORT_FILE}'")

--- Phase 1: Engineering All Features and Outcomes ---
   -> Identified 1,119,128 users for the analysis base.
   -> Engineered all Period 1 control variables.
   -> Engineered all Period 2 outcome variables.
   -> Final DataFrame constructed with 1,119,128 users.

--- Phase 2: Running DoubleML Causal Analysis for Multiple Outcomes ---

--- Analyzing Outcome: Revenue ($) ---




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001945 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001653 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




   -> ATE for Revenue ($): +39.13 (+46.48%)

--- Analyzing Outcome: Purchases (#) ---




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1083
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002012 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1080
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




   -> ATE for Purchases (#): +0.75 (+39.16%)

--- Analyzing Outcome: Vendor Variety (#) ---




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1084
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




   -> ATE for Vendor Variety (#): +0.12 (+57.70%)

--- Analyzing Outcome: Spend Concentration (%) ---




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1080
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889654, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1080
[LightGBM] [Info] Number of data points in the train set: 895302, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059531
[LightGBM] [Info] Start training from score 5.059531




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




[LightGBM] [Info] Number of positive: 889655, number of negative: 5648
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 895303, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.993692 -> initscore=5.059532
[LightGBM] [Info] Start training from score 5.059532




   -> ATE for Spend Concentration (%): +0.06 (+54.11%)

--- Phase 3: Generating Final Report ---

✅ ANALYSIS COMPLETE. Final multi-outcome report saved to 'final_multi_outcome_causal_report.txt'
