In [1]:
import os
import pandas as pd
from tabulate import tabulate
from dotenv import load_dotenv
import snowflake.connector
import sys

load_dotenv()

# --- Your provided Snowflake connection details ---
# This connection is for *reading* from INCREMENTALITY.
# We no longer need to switch contexts, as we are not creating objects.
conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY', # Source database for Clicks and Purchases
    schema='INCREMENTALITY_RESEARCH' # Schema for Clicks and Purchases
)
cursor = conn.cursor()

def run_query(query):
    try:
        cursor.execute(query)
        # For SELECT queries, fetch results
        if cursor.description:
            results = cursor.fetchall()
            columns = [desc[0] for desc in cursor.description]
            return pd.DataFrame(results, columns=columns)
        # For DDL/DML, no results to fetch (though we won't be doing DDL now)
        return pd.DataFrame()
    except snowflake.connector.ProgrammingError as e:
        print(f"\nERROR executing query:\n{query}\nDetails: {e}")
        raise # Re-raise the exception to stop execution on error

def show_table(df, title=""):
    if title:
        print(f"\n{title}")
        print("="*len(title))
    print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

print("✅ Connected to Snowflake")

# Define the pilot week for consistency
PILOT_WEEK_START = '2025-07-01 00:00:00'
PILOT_WEEK_END = '2025-07-08 00:00:00' # End is exclusive

print("\n--- Generating User-Vendor-Week Panel for Pilot Week ---")
print(f"   Using data from {PILOT_WEEK_START} to {PILOT_WEEK_END} (exclusive)")

✅ Connected to Snowflake

--- Generating User-Vendor-Week Panel for Pilot Week ---
   Using data from 2025-07-01 00:00:00 to 2025-07-08 00:00:00 (exclusive)


# purchases

In [5]:
# --- CONFIGURATION ---
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30' # Multi-week range for example

print(f"\n--- Collecting Hourly Raw Purchase Metrics (Weekly Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_hourly_data = []
current_week_start = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

while current_week_start < end_date_obj:
    current_week_end = current_week_start + timedelta(days=7)

    week_start_str = current_week_start.strftime('%Y-%m-%d 00:00:00')
    week_end_str = current_week_end.strftime('%Y-%m-%d 00:00:00')

    print(f"Processing week: {current_week_start.date()} to {current_week_end.date()}...")

    query = f"""
    SELECT
        -- 1. Time bucket
        DATE_TRUNC('HOUR', PURCHASED_AT)::TIMESTAMP_NTZ AS activity_hour,

        -- 2. Raw Aggregate Metrics
        COUNT(DISTINCT PURCHASE_ID) AS hourly_transaction_count,
        COALESCE(SUM(QUANTITY * UNIT_PRICE), 0) AS hourly_gmv,
        COALESCE(SUM(QUANTITY), 0) AS hourly_units_sold,
        COUNT(DISTINCT USER_ID) AS hourly_purchasing_users,
        COUNT(DISTINCT PRODUCT_ID) AS hourly_products_purchased
    FROM
        PURCHASES
    WHERE
        PURCHASED_AT >= '{week_start_str}'::TIMESTAMP_NTZ
        AND PURCHASED_AT < '{week_end_str}'::TIMESTAMP_NTZ
    GROUP BY
        1
    ORDER BY
        1;
    """

    try:
        hourly_df_for_week = run_query(query)
        if not hourly_df_for_week.empty:
            all_hourly_data.append(hourly_df_for_week)
            print(f"   -> Success: Found {hourly_df_for_week.shape[0]} hourly records for the week.")
        else:
            print(f"   -> Info: No purchase data found for this week.")

    except Exception as e:
        print(f"   -> ERROR processing week starting {current_week_start.date()}: {e}")

    # Move to the next week
    current_week_start = current_week_end

# --- Final Processing and Display ---
if all_hourly_data:
    final_df = pd.concat(all_hourly_data, ignore_index=True)
    final_df['HOURLY_GMV'] = pd.to_numeric(final_df['HOURLY_GMV'])
    final_df['HOURLY_UNITS_SOLD'] = pd.to_numeric(final_df['HOURLY_UNITS_SOLD'])

    # --- SAVE TO PARQUET ---
    output_filename = f"hourly_purchases_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df, "Aggregated Hourly Purchase Metrics")
else:
    print("\nNo purchase data found for the entire specified period.")


--- Collecting Hourly Raw Purchase Metrics (Weekly Pulls) ---
Period: 2025-03-01 to 2025-09-30
Processing week: 2025-03-01 to 2025-03-08...
   -> Info: No purchase data found for this week.
Processing week: 2025-03-08 to 2025-03-15...
   -> Success: Found 24 hourly records for the week.
Processing week: 2025-03-15 to 2025-03-22...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-03-22 to 2025-03-29...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-03-29 to 2025-04-05...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-05 to 2025-04-12...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-12 to 2025-04-19...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-19 to 2025-04-26...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-26 to 2025-05-03...
   -> Success: Found 168 hourly records for the week.
Processing week: 2

In [6]:
# --- CONFIGURATION ---
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'

print(f"\n--- Collecting Hourly Raw Click Metrics (Weekly Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_hourly_data = []
current_week_start = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

while current_week_start < end_date_obj:
    current_week_end = current_week_start + timedelta(days=7)

    week_start_str = current_week_start.strftime('%Y-%m-%d 00:00:00')
    week_end_str = current_week_end.strftime('%Y-%m-%d 00:00:00')

    print(f"Processing week: {current_week_start.date()} to {current_week_end.date()}...")

    query = f"""
    SELECT
        -- 1. Time bucket
        DATE_TRUNC('HOUR', OCCURRED_AT)::TIMESTAMP_NTZ AS activity_hour,

        -- 2. Raw Aggregate Metrics
        COUNT(INTERACTION_ID) AS hourly_click_count,
        COUNT(DISTINCT USER_ID) AS hourly_clicking_users,
        COUNT(DISTINCT VENDOR_ID) AS hourly_clicked_vendors,
        COUNT(DISTINCT CAMPAIGN_ID) AS hourly_clicked_campaigns,
        COUNT(DISTINCT PRODUCT_ID) AS hourly_clicked_products
    FROM
        CLICKS
    WHERE
        OCCURRED_AT >= '{week_start_str}'::TIMESTAMP_NTZ
        AND OCCURRED_AT < '{week_end_str}'::TIMESTAMP_NTZ
    GROUP BY
        1
    ORDER BY
        1;
    """

    try:
        hourly_df_for_week = run_query(query)
        if not hourly_df_for_week.empty:
            all_hourly_data.append(hourly_df_for_week)
            print(f"   -> Success: Found {hourly_df_for_week.shape[0]} hourly records for the week.")
        else:
            print(f"   -> Info: No click data found for this week.")

    except Exception as e:
        print(f"   -> ERROR processing week starting {current_week_start.date()}: {e}")

    # Move to the next week
    current_week_start = current_week_end

# --- Final Processing and Display ---
if all_hourly_data:
    final_df = pd.concat(all_hourly_data, ignore_index=True)

    # --- SAVE TO PARQUET ---
    output_filename = f"hourly_clicks_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df, "Aggregated Hourly Click Metrics")
else:
    print("\nNo click data found for the entire specified period.")


--- Collecting Hourly Raw Click Metrics (Weekly Pulls) ---
Period: 2025-03-01 to 2025-09-30
Processing week: 2025-03-01 to 2025-03-08...
   -> Info: No click data found for this week.
Processing week: 2025-03-08 to 2025-03-15...
   -> Success: Found 24 hourly records for the week.
Processing week: 2025-03-15 to 2025-03-22...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-03-22 to 2025-03-29...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-03-29 to 2025-04-05...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-05 to 2025-04-12...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-12 to 2025-04-19...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-19 to 2025-04-26...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-04-26 to 2025-05-03...
   -> Success: Found 168 hourly records for the week.
Processing week: 2025-05

# impresssions

In [28]:
# --- CONFIGURATION ---
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'

print(f"\n--- Collecting Hourly Raw Impression Metrics (Daily Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_hourly_data = []
current_date = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

while current_date <= end_date_obj:
    day_start_str = current_date.strftime('%Y-%m-%d 00:00:00')
    next_day = current_date + timedelta(days=1)
    day_end_str = next_day.strftime('%Y-%m-%d 00:00:00')

    print(f"Processing date: {current_date.date()}...")

    query = f"""
    SELECT
        -- 1. Time bucket
        DATE_TRUNC('HOUR', OCCURRED_AT)::TIMESTAMP_NTZ AS activity_hour,

        -- 2. Raw Aggregate Metrics
        COUNT(INTERACTION_ID) AS hourly_impression_count,
        COUNT(DISTINCT USER_ID) AS hourly_impressed_users,
        COUNT(DISTINCT VENDOR_ID) AS hourly_impressed_vendors,
        COUNT(DISTINCT CAMPAIGN_ID) AS hourly_impressed_campaigns,
        COUNT(DISTINCT PRODUCT_ID) AS hourly_impressed_products
    FROM
        IMPRESSIONS
    WHERE
        OCCURRED_AT >= '{day_start_str}'::TIMESTAMP_NTZ
        AND OCCURRED_AT < '{day_end_str}'::TIMESTAMP_NTZ
    GROUP BY
        1
    ORDER BY
        1;
    """

    try:
        hourly_df_for_day = run_query(query)
        if not hourly_df_for_day.empty:
            all_hourly_data.append(hourly_df_for_day)
            print(f"   -> Success: Found {hourly_df_for_day.shape[0]} hourly records for the day.")
        else:
            print(f"   -> Info: No impression data found for this day.")

    except Exception as e:
        print(f"   -> ERROR processing {current_date.date()}: {e}")

    # Move to the next day
    current_date = next_day

# --- Final Processing and Display ---
if all_hourly_data:
    final_df = pd.concat(all_hourly_data, ignore_index=True)

    # --- SAVE TO PARQUET ---
    output_filename = f"hourly_impressions_{ANALYSIS_START_ДATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df, "Aggregated Hourly Impression Metrics")
else:
    print("\nNo impression data found for the entire specified period.")


--- Collecting Hourly Raw Impression Metrics (Daily Pulls) ---
Period: 2025-03-01 to 2025-09-30
Processing date: 2025-03-01...
   -> Info: No impression data found for this day.
Processing date: 2025-03-02...
   -> Info: No impression data found for this day.
Processing date: 2025-03-03...
   -> Info: No impression data found for this day.
Processing date: 2025-03-04...
   -> Info: No impression data found for this day.
Processing date: 2025-03-05...
   -> Info: No impression data found for this day.
Processing date: 2025-03-06...
   -> Info: No impression data found for this day.
Processing date: 2025-03-07...
   -> Info: No impression data found for this day.
Processing date: 2025-03-08...
   -> Info: No impression data found for this day.
Processing date: 2025-03-09...
   -> Info: No impression data found for this day.
Processing date: 2025-03-10...
   -> Info: No impression data found for this day.
Processing date: 2025-03-11...
   -> Info: No impression data found for this day.
P

NameError: name 'ANALYSIS_START_ДATE' is not defined

In [29]:
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'
# --- Final Processing and Display ---
if all_hourly_data:
    final_df = pd.concat(all_hourly_data, ignore_index=True)

    # --- SAVE TO PARQUET ---
    output_filename = f"hourly_impressions_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df, "Aggregated Hourly Impression Metrics")
else:
    print("\nNo impression data found for the entire specified period.")


✅ Data successfully saved to hourly_impressions_2025-03-01_to_2025-09-30.parquet

--- Aggregated Hourly Impression Metrics (Showing first 25 rows) ---
+---------------------+---------------------------+--------------------------+----------------------------+------------------------------+-----------------------------+
| ACTIVITY_HOUR       |   HOURLY_IMPRESSION_COUNT |   HOURLY_IMPRESSED_USERS |   HOURLY_IMPRESSED_VENDORS |   HOURLY_IMPRESSED_CAMPAIGNS |   HOURLY_IMPRESSED_PRODUCTS |
| 2025-03-14 00:00:00 |                    967755 |                    55780 |                      17005 |                        17055 |                      386059 |
+---------------------+---------------------------+--------------------------+----------------------------+------------------------------+-----------------------------+
| 2025-03-14 01:00:00 |                    804213 |                    57538 |                       9643 |                         9690 |                      184819 |
+--

# auctions

In [30]:
# --- CONFIGURATION ---
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'

print(f"\n--- Collecting Hourly Raw Auction Metrics (Daily Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_hourly_data = []
current_date = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

while current_date <= end_date_obj:
    day_start_str = current_date.strftime('%Y-%m-%d 00:00:00')
    next_day = current_date + timedelta(days=1)
    day_end_str = next_day.strftime('%Y-%m-%d 00:00:00')

    print(f"Processing date: {current_date.date()}...")

    query = f"""
    SELECT
        -- 1. Time bucket
        DATE_TRUNC('HOUR', CREATED_AT)::TIMESTAMP_NTZ AS activity_hour,

        -- 2. Raw Aggregate Metrics
        COUNT(AUCTION_ID) AS hourly_auction_count,
        COUNT(DISTINCT OPAQUE_USER_ID) AS hourly_auction_users
    FROM
        AUCTIONS_USERS
    WHERE
        CREATED_AT >= '{day_start_str}'::TIMESTAMP_NTZ
        AND CREATED_AT < '{day_end_str}'::TIMESTAMP_NTZ
    GROUP BY
        1
    ORDER BY
        1;
    """

    try:
        hourly_df_for_day = run_query(query)
        if not hourly_df_for_day.empty:
            all_hourly_data.append(hourly_df_for_day)
            print(f"   -> Success: Found {hourly_df_for_day.shape[0]} hourly records for the day.")
        else:
            print(f"   -> Info: No auction data found for this day.")

    except Exception as e:
        print(f"   -> ERROR processing {current_date.date()}: {e}")

    # Move to the next day
    current_date = next_day

# --- Final Processing and Display ---
if all_hourly_data:
    final_df = pd.concat(all_hourly_data, ignore_index=True)

    # --- SAVE TO PARQUET ---
    output_filename = f"hourly_auctions_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df, "Aggregated Hourly Auction Metrics")
else:
    print("\nNo auction data found for the entire specified period.")


--- Collecting Hourly Raw Auction Metrics (Daily Pulls) ---
Period: 2025-03-01 to 2025-09-30
Processing date: 2025-03-01...
   -> Info: No auction data found for this day.
Processing date: 2025-03-02...
   -> Info: No auction data found for this day.
Processing date: 2025-03-03...
   -> Info: No auction data found for this day.
Processing date: 2025-03-04...
   -> Info: No auction data found for this day.
Processing date: 2025-03-05...
   -> Info: No auction data found for this day.
Processing date: 2025-03-06...
   -> Info: No auction data found for this day.
Processing date: 2025-03-07...
   -> Info: No auction data found for this day.
Processing date: 2025-03-08...
   -> Info: No auction data found for this day.
Processing date: 2025-03-09...
   -> Info: No auction data found for this day.
Processing date: 2025-03-10...
   -> Info: No auction data found for this day.
Processing date: 2025-03-11...
   -> Info: No auction data found for this day.
Processing date: 2025-03-12...
   -> 