In [2]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

# Load environment variables from the .env file
# This command looks for the .env file in the same directory as your notebook
load_dotenv()

# --- Snowflake Connection Block ---
conn = None  # Initialize conn to None
try:
    # Establish the connection using credentials from the .env file
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("✅ Connection to Snowflake successful!")

    # Optional: Verify the connection with a simple query
    cursor = conn.cursor()
    cursor.execute("SELECT CURRENT_VERSION()")
    one_row = cursor.fetchone()
    print(f"   Snowflake version: {one_row[0]}")

except Exception as e:
    print(f"❌ ERROR: Could not connect to Snowflake.", file=sys.stderr)
    print(f"   Please check your credentials in the .env file and network connection.", file=sys.stderr)
    print(f"   Details: {e}", file=sys.stderr)


✅ Connection to Snowflake successful!
   Snowflake version: 9.27.0


In [2]:
from tabulate import tabulate
import pandas as pd

# We assume 'conn' is an active Snowflake connection.

# --- 1. Define the Query and Day for Analysis ---
# We'll analyze a single, representative day. A weekday is often a good choice.
TARGET_DATE = '2025-07-08'

# This query is designed to be extremely fast and low-cost.
# It uses ::DATE to cast the timestamp to a date for a clean filter.
one_day_count_query = f"""
SELECT
    APPROX_COUNT_DISTINCT(USER_ID) AS approximate_distinct_users
FROM
    IMPRESSIONS
WHERE
    OCCURRED_AT::DATE = '{TARGET_DATE}';
"""

# --- 2. Execute the Query ---
print(f"--- Counting Distinct Impressed Users for {TARGET_DATE} ---")
try:
    cursor = conn.cursor()
    cursor.execute(one_day_count_query)
    # Fetch the single result from the first row, first column
    result = cursor.fetchone()
    approx_count = result[0] if result else 0
    print(f"✅ Query successful.")

    # --- 3. Generate the Report ---
    report_filename = "daily_impressed_users_sample_count.txt"
    with open(report_filename, "w") as f:
        f.write("Daily Impressed User Count (One-Day Sample)\n")
        f.write("=" * 43 + "\n\n")
        f.write("Methodology: A fast approximation query was run on the IMPRESSIONS table\n")
        f.write("             to estimate the number of unique users shown at least one ad\n")
        f.write("             within a single, representative day.\n\n")
        
        f.write(f"Date Analyzed: {TARGET_DATE}\n")
        f.write("-" * 43 + "\n")
        f.write(f"Approximate Distinct Users Shown Ads: {approx_count:,.0f}\n")
        f.write("-" * 43 + "\n")
        
    print(f"✅ Analysis complete. Report saved to '{report_filename}'")

    print(f"\n--- Report Content for '{report_filename}' ---")
    with open(report_filename, 'r') as f:
        print(f.read())

except Exception as e:
    print(f"❌ ERROR executing query: {e}", file=sys.stderr)
finally:
    pass

--- Counting Distinct Impressed Users for 2025-07-08 ---
✅ Query successful.
✅ Analysis complete. Report saved to 'daily_impressed_users_sample_count.txt'

--- Report Content for 'daily_impressed_users_sample_count.txt' ---
Daily Impressed User Count (One-Day Sample)

Methodology: A fast approximation query was run on the IMPRESSIONS table
             to estimate the number of unique users shown at least one ad
             within a single, representative day.

Date Analyzed: 2025-07-08
-------------------------------------------
Approximate Distinct Users Shown Ads: 673,017
-------------------------------------------



In [3]:
from tabulate import tabulate
import pandas as pd

# We assume 'conn' is an active Snowflake connection.

# --- 1. Define the Query and Time Period ---
# We'll analyze a single, representative month.
START_DATE = '2025-07-01'
END_DATE = '2025-08-01'  # The end date is exclusive, capturing all of July.

# This query is designed to be very efficient.
one_month_purchasers_query = f"""
SELECT
    APPROX_COUNT_DISTINCT(USER_ID) AS approximate_distinct_users
FROM
    PURCHASES
WHERE
    PURCHASED_AT >= '{START_DATE}' AND PURCHASED_AT < '{END_DATE}';
"""

# --- 2. Execute the Query ---
print(f"--- Counting Distinct Purchasing Users for {START_DATE} to {END_DATE} ---")
try:
    cursor = conn.cursor()
    cursor.execute(one_month_purchasers_query)
    # Fetch the single result from the first row, first column
    result = cursor.fetchone()
    approx_count = result[0] if result else 0
    print(f"✅ Query successful.")

    # --- 3. Generate the Report ---
    report_filename = "monthly_purchasing_users_count.txt"
    with open(report_filename, "w") as f:
        f.write("Monthly Purchasing User Count (One-Month Sample)\n")
        f.write("=" * 48 + "\n\n")
        f.write("Methodology: A fast approximation query was run on the PURCHASES table\n")
        f.write("             to estimate the number of unique users who made at least one\n")
        f.write("             purchase within a single, representative month.\n\n")
        
        f.write(f"Time Period Analyzed: {START_DATE} to {END_DATE} (exclusive)\n")
        f.write("-" * 58 + "\n")
        f.write(f"Approximate Distinct Purchasing Users: {approx_count:,.0f}\n")
        f.write("-" * 58 + "\n")
        
    print(f"✅ Analysis complete. Report saved to '{report_filename}'")

    print(f"\n--- Report Content for '{report_filename}' ---")
    with open(report_filename, 'r') as f:
        print(f.read())

except Exception as e:
    print(f"❌ ERROR executing query: {e}", file=sys.stderr)
finally:
    if 'cursor' in locals() and cursor:
        cursor.close()

--- Counting Distinct Purchasing Users for 2025-07-01 to 2025-08-01 ---
✅ Query successful.
✅ Analysis complete. Report saved to 'monthly_purchasing_users_count.txt'

--- Report Content for 'monthly_purchasing_users_count.txt' ---
Monthly Purchasing User Count (One-Month Sample)

Methodology: A fast approximation query was run on the PURCHASES table
             to estimate the number of unique users who made at least one
             purchase within a single, representative month.

Time Period Analyzed: 2025-07-01 to 2025-08-01 (exclusive)
----------------------------------------------------------
Approximate Distinct Purchasing Users: 1,475,138
----------------------------------------------------------



In [5]:
import pandas as pd
import os
import sys
from tabulate import tabulate

# We assume 'conn' is an active Snowflake connection from a previous cell.

# --- Helper Function to Execute Queries ---
def run_query(connection, query):
    """Executes a query and returns the results as a Pandas DataFrame."""
    print("Executing query on Snowflake...")
    try:
        # The UserWarning is expected and can be ignored.
        df = pd.read_sql(query, connection)
        print(f"✅ Query successful. Fetched {len(df):,} rows.")
        return df
    except Exception as e:
        print(f"❌ ERROR executing query: {e}", file=sys.stderr)
        return pd.DataFrame()

# --- 1. Configuration ---
# Define the overall date range for the extraction.
START_MONTH = '2025-03-01'
END_MONTH = '2025-09-01'

# Create a dedicated directory to store the output files.
OUTPUT_DIR = "monthly_purchaser_lists"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output will be saved to the '{OUTPUT_DIR}' directory.")

# --- 2. Generate Monthly Date Ranges ---
# 'MS' frequency gives the start of each month.
monthly_start_dates = pd.date_range(start=START_MONTH, end=END_MONTH, freq='MS')

# --- 3. Loop Through Each Month and Extract Data ---
summary_data = []
print("\n--- Starting Monthly Extraction Process ---")

for start_date in monthly_start_dates:
    # Calculate the end of the month (which is the start of the next month)
    end_date = start_date + pd.DateOffset(months=1)
    
    # Format dates for the SQL query
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')
    year_month_str = start_date.strftime('%Y-%m')

    print(f"\nProcessing Month: {year_month_str}...")

    # Define the query for the current month
    monthly_purchasers_query = f"""
    SELECT DISTINCT USER_ID
    FROM PURCHASES
    WHERE PURCHASED_AT >= '{start_date_str}' AND PURCHASED_AT < '{end_date_str}';
    """
    
    # Execute the query
    df_monthly_users = run_query(conn, monthly_purchasers_query)

    # Save the result to a Parquet file
    if not df_monthly_users.empty:
        file_path = os.path.join(OUTPUT_DIR, f"purchasers_{year_month_str}.parquet")
        df_monthly_users.to_parquet(file_path, index=False)
        
        user_count = len(df_monthly_users)
        print(f"   -> Found {user_count:,.0f} unique users. Saved to '{file_path}'")
        summary_data.append([year_month_str, user_count, file_path])
    else:
        print(f"   -> No purchasing users found for this month.")
        summary_data.append([year_month_str, 0, "N/A"])

# --- 4. Generate a Final Summary Report ---
print("\n\n--- Extraction Process Complete ---")

summary_df = pd.DataFrame(summary_data, columns=['Month', 'Unique Purchasers', 'File Path'])

report_filename = "monthly_purchaser_extraction_summary.txt"
with open(report_filename, "w") as f:
    f.write("Summary of Monthly Purchaser Extractions\n")
    f.write("=" * 40 + "\n\n")
    f.write("Methodology: The PURCHASES table was queried for each month in the specified\n")
    f.write("             range. A list of unique USER_IDs for each month was saved to a\n")
    f.write("             separate Parquet file for further analysis.\n\n")
    
    f.write(tabulate(summary_df, headers='keys', tablefmt='grid', showindex=False))

print(f"\n✅ All months processed. A summary report has been saved to '{report_filename}'")

print(f"\n--- Summary Report ---")
print(tabulate(summary_df, headers='keys', tablefmt='grid', showindex=False))

Output will be saved to the 'monthly_purchaser_lists' directory.

--- Starting Monthly Extraction Process ---

Processing Month: 2025-03...
Executing query on Snowflake...


  df = pd.read_sql(query, connection)


✅ Query successful. Fetched 976,617 rows.
   -> Found 976,617 unique users. Saved to 'monthly_purchaser_lists/purchasers_2025-03.parquet'

Processing Month: 2025-04...
Executing query on Snowflake...
✅ Query successful. Fetched 1,399,898 rows.
   -> Found 1,399,898 unique users. Saved to 'monthly_purchaser_lists/purchasers_2025-04.parquet'

Processing Month: 2025-05...
Executing query on Snowflake...
✅ Query successful. Fetched 1,451,601 rows.
   -> Found 1,451,601 unique users. Saved to 'monthly_purchaser_lists/purchasers_2025-05.parquet'

Processing Month: 2025-06...
Executing query on Snowflake...
✅ Query successful. Fetched 1,396,621 rows.
   -> Found 1,396,621 unique users. Saved to 'monthly_purchaser_lists/purchasers_2025-06.parquet'

Processing Month: 2025-07...
Executing query on Snowflake...
✅ Query successful. Fetched 1,436,993 rows.
   -> Found 1,436,993 unique users. Saved to 'monthly_purchaser_lists/purchasers_2025-07.parquet'

Processing Month: 2025-08...
Executing query 

In [6]:
import pandas as pd
import os
from tabulate import tabulate

# --- 1. Configuration ---
INPUT_DIR = "monthly_purchaser_lists"
FINAL_OUTPUT_FILE = "final_purchaser_universe.parquet"

# --- 2. Discover and Process Monthly Files ---
# Find all the Parquet files in the input directory
try:
    monthly_files = [f for f in os.listdir(INPUT_DIR) if f.endswith('.parquet')]
    print(f"Found {len(monthly_files)} monthly files to process in '{INPUT_DIR}'.")
except FileNotFoundError:
    print(f"❌ ERROR: Directory not found: '{INPUT_DIR}'. Please run the previous extraction script first.")
    # Exit the cell if the directory doesn't exist
    exit()

# Use a set for efficient, memory-safe de-duplication
all_unique_users = set()
files_processed = []

print("\n--- Consolidating Monthly User Lists ---")
for filename in monthly_files:
    file_path = os.path.join(INPUT_DIR, filename)
    print(f"Processing {filename}...")
    
    df_month = pd.read_parquet(file_path)
    
    # The .update() method efficiently adds all items from the list to the set
    all_unique_users.update(df_month['USER_ID'])
    
    print(f"   -> Read {len(df_month):,} users. Total unique users so far: {len(all_unique_users):,}")
    files_processed.append(filename)


# --- 3. Finalize and Save the Master List ---
print("\n--- Consolidation Complete ---")
final_df = pd.DataFrame(list(all_unique_users), columns=['USER_ID'])
final_df.to_parquet(FINAL_OUTPUT_FILE, index=False)

final_user_count = len(final_df)
print(f"The final, de-duplicated list contains {final_user_count:,.0f} unique purchasing users.")
print(f"Master list saved to '{FINAL_OUTPUT_FILE}'")


# --- 4. Generate a Final Summary Report ---
report_filename = "purchaser_universe_consolidation_report.txt"
with open(report_filename, "w") as f:
    f.write("Purchaser Universe Consolidation Report\n")
    f.write("=" * 39 + "\n\n")
    f.write("Methodology: All monthly Parquet files containing unique purchaser IDs\n")
    f.write("             were read and consolidated. A de-duplicated master list of all\n")
    f.write("             users who made at least one purchase was generated.\n\n")

    f.write("Final Result\n")
    f.write("-" * 12 + "\n")
    f.write(f"Total Unique Purchasing Users: {final_user_count:,.0f}\n\n")

    f.write("Source Files Processed\n")
    f.write("-" * 22 + "\n")
    for fname in sorted(files_processed):
        f.write(f"- {fname}\n")

print(f"\n✅ A summary report has been saved to '{report_filename}'")

Found 7 monthly files to process in 'monthly_purchaser_lists'.

--- Consolidating Monthly User Lists ---
Processing purchasers_2025-08.parquet...
   -> Read 1,561,961 users. Total unique users so far: 1,561,961
Processing purchasers_2025-09.parquet...
   -> Read 600,029 users. Total unique users so far: 1,870,438
Processing purchasers_2025-03.parquet...
   -> Read 976,617 users. Total unique users so far: 2,466,845
Processing purchasers_2025-06.parquet...
   -> Read 1,396,621 users. Total unique users so far: 3,215,571
Processing purchasers_2025-07.parquet...
   -> Read 1,436,993 users. Total unique users so far: 3,836,549
Processing purchasers_2025-05.parquet...
   -> Read 1,451,601 users. Total unique users so far: 4,425,325
Processing purchasers_2025-04.parquet...
   -> Read 1,399,898 users. Total unique users so far: 4,926,305

--- Consolidation Complete ---
The final, de-duplicated list contains 4,926,305 unique purchasing users.
Master list saved to 'final_purchaser_universe.parq

In [4]:
import pandas as pd
import os
import time
from tabulate import tabulate

# --- 1. Configuration ---
UNIVERSE_FILE = "final_purchaser_universe.parquet"
CHECKPOINT_FILE = "potential_holdouts_checkpoint.parquet"
FINAL_HOLDOUT_FILE = "final_holdout_user_ids_final.parquet"
LOG_REPORT_FILE = "final_holdout_run_log_with_dissolution.txt"
DISSOLVED_LOGS_DIR = "dissolved_holdout_logs"

START_DATE = "2025-03-10"
END_DATE = "2025-09-15"

# --- 2. Setup: Load Universe or Resume from Checkpoint ---
print("--- SCRIPT INITIATED: Holdout Identification (Max Verbosity) ---")
os.makedirs(DISSOLVED_LOGS_DIR, exist_ok=True)

if os.path.exists(CHECKPOINT_FILE):
    print(f"Resuming from checkpoint: '{CHECKPOINT_FILE}'")
    df_checkpoint = pd.read_parquet(CHECKPOINT_FILE)
    potential_holdouts_set = set(df_checkpoint['USER_ID'])
    last_date_processed = pd.to_datetime(df_checkpoint.attrs['last_date_processed'])
    start_date_obj = last_date_processed + pd.DateOffset(days=1)
    print(f"Resuming process from {start_date_obj.strftime('%Y-%m-%d')}")
else:
    print(f"Loading master universe from '{UNIVERSE_FILE}'")
    df_universe = pd.read_parquet(UNIVERSE_FILE)
    potential_holdouts_set = set(df_universe['USER_ID'])
    start_date_obj = pd.to_datetime(START_DATE)

initial_universe_size = len(potential_holdouts_set)
print(f"Loaded {initial_universe_size:,} potential holdout users to start.")

# --- 3. Main Processing Loop ---
weekly_date_ranges = pd.date_range(start=start_date_obj, end=END_DATE, freq='W-MON')
weekly_logs = []
total_start_time = time.time()
cursor = None

try:
    current_start = start_date_obj
    for i, week_end in enumerate(weekly_date_ranges):
        week_start_str = current_start.strftime('%Y-%m-%d')
        week_end_str = (week_end + pd.DateOffset(days=1)).strftime('%Y-%m-%d')
        
        print(f"\n{'='*80}\n--- Processing Week {i+1}/{len(weekly_date_ranges)} ({week_start_str} to {week_end_str}) ---\n{'='*80}")
        
        holdouts_before_week = len(potential_holdouts_set)
        print(f"  [Step 1] Holdouts at start of week: {holdouts_before_week:,}")

        # --- Query Snowflake ---
        query_start_time = time.time()
        weekly_impressed_query = f"SELECT DISTINCT USER_ID FROM IMPRESSIONS WHERE OCCURRED_AT >= '{week_start_str}' AND OCCURRED_AT < '{week_end_str}';"
        
        print(f"  [Step 2] Querying Snowflake for users with impressions this week...")
        cursor = conn.cursor()
        cursor.execute(weekly_impressed_query)
        
        impressed_this_week_set = set()
        for batch in cursor.fetch_pandas_batches():
            impressed_this_week_set.update(batch['USER_ID'])
        
        query_duration = time.time() - query_start_time
        impressed_count_this_week = len(impressed_this_week_set)
        print(f"     -> Query complete in {query_duration:.2f}s. Found {impressed_count_this_week:,} distinct impressed users.")

        # --- Calculate and Log Overlap ---
        print(f"  [Step 3] Calculating overlap between {holdouts_before_week:,} holdouts and {impressed_count_this_week:,} impressed users...")
        dissolved_this_week_set = potential_holdouts_set.intersection(impressed_this_week_set)
        dissolved_count = len(dissolved_this_week_set)
        print(f"     -> Overlap found: {dissolved_count:,} users. These are the dissolved holdouts.")

        if dissolved_count > 0:
            dissolved_df = pd.DataFrame(list(dissolved_this_week_set), columns=['USER_ID'])
            log_path = os.path.join(DISSOLVED_LOGS_DIR, f"dissolved_{week_start_str}.parquet")
            dissolved_df.to_parquet(log_path, index=False)
            print(f"     -> Logged these {dissolved_count:,} user IDs to '{log_path}'")

        # --- Update Main Set and Report ---
        print(f"  [Step 4] Removing dissolved users from the main set...")
        potential_holdouts_set.difference_update(dissolved_this_week_set)
        holdouts_after_week = len(potential_holdouts_set)
        print(f"     -> Holdouts remaining: {holdouts_after_week:,} ({holdouts_before_week:,} -> {holdouts_after_week:,})")
        
        weekly_logs.append({
            "Week Start": week_start_str,
            "Query Time (s)": f"{query_duration:.2f}",
            "Holdouts Before": f"{holdouts_before_week:,}",
            "Impressed This Week": f"{impressed_count_this_week:,}",
            "Holdouts Dissolved": f"{dissolved_count:,}",
            "Holdouts After": f"{holdouts_after_week:,}"
        })

        # --- Checkpoint ---
        print(f"  [Step 5] Saving weekly checkpoint with {holdouts_after_week:,} users...")
        checkpoint_df = pd.DataFrame(list(potential_holdouts_set), columns=['USER_ID'])
        checkpoint_df.attrs['last_date_processed'] = week_end.strftime('%Y-%m-%d')
        checkpoint_df.to_parquet(CHECKPOINT_FILE, index=False)
        print("     -> Checkpoint saved successfully.")
        
        current_start = week_end + pd.DateOffset(days=1)

finally:
    total_duration_minutes = (time.time() - total_start_time) / 60
    
    print(f"\n{'='*80}\n--- PROCESS COMPLETE OR INTERRUPTED: FINALIZING ---\n{'='*80}")
    
    final_holdouts_df = pd.DataFrame(list(potential_holdouts_set), columns=['USER_ID'])
    final_count = len(final_holdouts_df)
    
    print(f"  -> Saving final list of {final_count:,} holdout users to '{FINAL_HOLDOUT_FILE}'...")
    final_holdouts_df.to_parquet(FINAL_HOLDOUT_FILE, index=False)
    
    log_df = pd.DataFrame(weekly_logs)
    
    print(f"  -> Saving detailed run log to '{LOG_REPORT_FILE}'...")
    with open(LOG_REPORT_FILE, "w") as f:
        f.write("Optimized Holdout Identification - Run Log & Summary\n")
        f.write("=" * 51 + "\n\n")
        f.write(f"Total Run Time: {total_duration_minutes:.2f} minutes\n")
        f.write(f"Weeks Processed: {len(weekly_logs)}\n")
        if 'df_universe' in locals():
            f.write(f"Initial User Universe: {len(df_universe):,}\n")
        f.write(f"Final Holdout Users Found: {final_count:,}\n\n")
        f.write(f"NOTE: Detailed lists of dissolved holdouts are in '{DISSOLVED_LOGS_DIR}/'\n\n")
        f.write("Weekly Processing Log\n---------------------\n")
        f.write(tabulate(log_df, headers='keys', tablefmt='grid', showindex=False))
    
    if os.path.exists(CHECKPOINT_FILE):
        print(f"  -> Cleaning up checkpoint file...")
        os.remove(CHECKPOINT_FILE)
    
    print("\n✅ All operations complete.")

--- SCRIPT INITIATED: Holdout Identification (Max Verbosity) ---
Loading master universe from 'final_purchaser_universe.parquet'
Loaded 4,926,305 potential holdout users to start.

--- Processing Week 1/28 (2025-03-10 to 2025-03-11) ---
  [Step 1] Holdouts at start of week: 4,926,305
  [Step 2] Querying Snowflake for users with impressions this week...
     -> Query complete in 2.72s. Found 0 distinct impressed users.
  [Step 3] Calculating overlap between 4,926,305 holdouts and 0 impressed users...
     -> Overlap found: 0 users. These are the dissolved holdouts.
  [Step 4] Removing dissolved users from the main set...
     -> Holdouts remaining: 4,926,305 (4,926,305 -> 4,926,305)
  [Step 5] Saving weekly checkpoint with 4,926,305 users...
     -> Checkpoint saved successfully.

--- Processing Week 2/28 (2025-03-11 to 2025-03-18) ---
  [Step 1] Holdouts at start of week: 4,926,305
  [Step 2] Querying Snowflake for users with impressions this week...
     -> Query complete in 74.27s. F