In [1]:

import os
import pandas as pd
from tabulate import tabulate
from dotenv import load_dotenv
import snowflake.connector
import sys
from datetime import datetime, timedelta
from tqdm import tqdm

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Output directory used by the extract cells below; no-op when it already exists.
os.makedirs('data', exist_ok=True)

# --- Snowflake connection ---
# Credentials are read from the environment; the database and schema are fixed here.
# The warehouse falls back to 'COMPUTE_WH' when SNOWFLAKE_WAREHOUSE is not set.
conn = snowflake.connector.connect(
    account=os.environ.get('SNOWFLAKE_ACCOUNT'),
    user=os.environ.get('SNOWFLAKE_USER'),
    password=os.environ.get('SNOWFLAKE_PASSWORD'),
    warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH',
)

# Single module-level cursor shared by run_query().
cursor = conn.cursor()

def run_query(query):
    """Execute ``query`` on the shared module-level cursor and return a DataFrame.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A DataFrame holding every fetched row, with column names taken from the
        first field of each ``cursor.description`` entry. An empty DataFrame is
        returned when the cursor reports no description after execution.

    Raises:
        snowflake.connector.ProgrammingError: re-raised after the failing query
            and the error details have been printed.
    """
    try:
        cursor.execute(query)
        if not cursor.description:
            # Nothing describable came back, so there is nothing to fetch.
            return pd.DataFrame()
        rows = cursor.fetchall()
        header = [field[0] for field in cursor.description]
        return pd.DataFrame(rows, columns=header)
    except snowflake.connector.ProgrammingError as e:
        print(f"\nERROR executing query:\n{query}\nDetails: {e}")
        raise

def show_table(df, title=""):
    """Print ``df`` as a grid-formatted table, optionally under an underlined title.

    Args:
        df: Tabular data handed to ``tabulate`` (column keys become headers,
            the index is not shown).
        title: Optional heading. When non-empty it is printed after a blank
            line and underlined with ``=`` characters of the same length.
    """
    if title:
        underline = "=" * len(title)
        print(f"\n{title}\n{underline}")
    rendered = tabulate(df, headers='keys', tablefmt='grid', showindex=False)
    print(rendered)

# Only reached when the connect() and cursor() calls above did not raise.
print("✅ Connected to Snowflake")


✅ Connected to Snowflake


# purchases

In [5]:

  # Pulls half-hourly purchase aggregates from PURCHASES one weekly window at a
  # time, concatenates the weekly frames, and writes the result to a parquet
  # file under data/.

  # --- CONFIGURATION ---
  # Both values are calendar days; ANALYSIS_END_DATE is treated as INCLUSIVE.
  ANALYSIS_START_DATE = '2025-03-01'
  ANALYSIS_END_DATE = '2025-09-30'

  print(f"\n--- Collecting Half-Hourly Raw Purchase Metrics (Weekly Pulls) ---")
  print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

  # --- Main Loop ---
  all_half_hourly_data = []
  failed_weeks = []  # (start_date, end_date) of every window whose query raised
  current_week_start = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
  end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

  # Exclusive upper bound of the whole pull: midnight after the inclusive end
  # date. Every weekly window is clamped to it so the final window can no
  # longer read rows dated after ANALYSIS_END_DATE.
  range_end_exclusive = end_date_obj + timedelta(days=1)

  # Ceiling division, so the progress-bar total equals the number of windows
  # the loop below actually visits (0 when the range is empty or inverted).
  total_days = (range_end_exclusive - current_week_start).days
  week_count = max(-(-total_days // 7), 0)

  # The context manager closes the bar even if the loop is interrupted.
  with tqdm(total=week_count, desc="Processing weeks") as progress_bar:
      while current_week_start < range_end_exclusive:
          current_week_end = min(current_week_start + timedelta(days=7), range_end_exclusive)

          week_start_str = current_week_start.strftime('%Y-%m-%d 00:00:00')
          week_end_str = current_week_end.strftime('%Y-%m-%d 00:00:00')

          # tqdm.write keeps log lines from being interleaved with the bar.
          tqdm.write(f"Processing week: {current_week_start.date()} to {current_week_end.date()}...")

          # Window is half-open: [week_start_str, week_end_str).
          query = f"""
          SELECT
              -- Half-hourly time bucket
              DATEADD(MINUTE, FLOOR(EXTRACT(MINUTE FROM PURCHASED_AT)/30)*30, 
                      DATE_TRUNC('HOUR', PURCHASED_AT))::TIMESTAMP_NTZ AS activity_half_hour,

              -- Raw Aggregate Metrics
              COUNT(DISTINCT PURCHASE_ID) AS half_hourly_transaction_count,
              COALESCE(SUM(QUANTITY * UNIT_PRICE), 0) AS half_hourly_gmv,
              COALESCE(SUM(QUANTITY), 0) AS half_hourly_units_sold,
              COUNT(DISTINCT USER_ID) AS half_hourly_purchasing_users,
              COUNT(DISTINCT PRODUCT_ID) AS half_hourly_products_purchased,
              COUNT(*) AS half_hourly_total_purchase_lines,
              
              -- Price and transaction metrics
              COALESCE(AVG(QUANTITY * UNIT_PRICE), 0) AS half_hourly_avg_transaction_value,
              COALESCE(AVG(QUANTITY), 0) AS half_hourly_avg_units_per_line,
              COALESCE(AVG(UNIT_PRICE), 0) AS half_hourly_avg_unit_price,
              COALESCE(MIN(UNIT_PRICE), 0) AS half_hourly_min_unit_price,
              COALESCE(MAX(UNIT_PRICE), 0) AS half_hourly_max_unit_price,
              COALESCE(STDDEV(UNIT_PRICE), 0) AS half_hourly_stddev_unit_price,
              
              -- Quantity metrics
              COALESCE(MIN(QUANTITY), 0) AS half_hourly_min_quantity,
              COALESCE(MAX(QUANTITY), 0) AS half_hourly_max_quantity,
              COALESCE(STDDEV(QUANTITY), 0) AS half_hourly_stddev_quantity
          FROM
              PURCHASES
          WHERE
              PURCHASED_AT >= '{week_start_str}'::TIMESTAMP_NTZ
              AND PURCHASED_AT < '{week_end_str}'::TIMESTAMP_NTZ
          GROUP BY
              1
          ORDER BY
              1;
          """

          try:
              half_hourly_df_for_week = run_query(query)
              if not half_hourly_df_for_week.empty:
                  all_half_hourly_data.append(half_hourly_df_for_week)
                  tqdm.write(f"   -> Success: Found {half_hourly_df_for_week.shape[0]} half-hourly records for the week.")
              else:
                  tqdm.write(f"   -> Info: No purchase data found for this week.")

          except Exception as e:
              # Best effort: keep pulling the remaining weeks, but remember the
              # gap so it is reported before anything is saved.
              failed_weeks.append((current_week_start.date(), current_week_end.date()))
              tqdm.write(f"   -> ERROR processing week starting {current_week_start.date()}: {e}")

          # Move to the next week
          current_week_start = current_week_end
          progress_bar.update(1)

  # Surface failed windows so an incomplete extract is never mistaken for a full one.
  if failed_weeks:
      print(f"\n⚠️ {len(failed_weeks)} weekly window(s) failed and are MISSING from the result:")
      for failed_start, failed_end in failed_weeks:
          print(f"   - {failed_start} to {failed_end}")

  # --- Final Processing and Display ---
  if all_half_hourly_data:
      final_df = pd.concat(all_half_hourly_data, ignore_index=True)

      # Convert every metric column (everything except the time bucket) to a numeric dtype
      numeric_columns = [col for col in final_df.columns if col != 'ACTIVITY_HALF_HOUR']
      for col in numeric_columns:
          final_df[col] = pd.to_numeric(final_df[col])

      # --- SAVE TO PARQUET IN DATA FOLDER ---
      output_filename = f"data/half_hourly_purchases_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
      final_df.to_parquet(output_filename, index=False)
      print(f"\n✅ Data successfully saved to {output_filename}")

      show_table(final_df.head(25), "Aggregated Half-Hourly Purchase Metrics")

      print(f"\nDataset Summary:")
      print(f"Total records: {final_df.shape[0]:,}")
      print(f"Total columns: {final_df.shape[1]}")
      print(f"Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
  else:
      print("\nNo purchase data found for the entire specified period.")


--- Collecting Half-Hourly Raw Purchase Metrics (Weekly Pulls) ---
Period: 2025-03-01 to 2025-09-30


Processing weeks:  10%|▉         | 3/31 [03:41<34:31, 73.97s/it]

Processing week: 2025-03-01 to 2025-03-08...



Processing weeks:   3%|▎         | 1/31 [00:02<01:21,  2.71s/it]

   -> Info: No purchase data found for this week.
Processing week: 2025-03-08 to 2025-03-15...


Processing weeks:   6%|▋         | 2/31 [00:04<01:00,  2.09s/it]

   -> Success: Found 48 half-hourly records for the week.
Processing week: 2025-03-15 to 2025-03-22...


Processing weeks:  10%|▉         | 3/31 [00:07<01:11,  2.56s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-03-22 to 2025-03-29...


Processing weeks:  13%|█▎        | 4/31 [00:11<01:22,  3.05s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-03-29 to 2025-04-05...


Processing weeks:  16%|█▌        | 5/31 [00:14<01:23,  3.19s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-05 to 2025-04-12...


Processing weeks:  19%|█▉        | 6/31 [00:17<01:20,  3.20s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-12 to 2025-04-19...


Processing weeks:  23%|██▎       | 7/31 [00:22<01:28,  3.70s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-19 to 2025-04-26...


Processing weeks:  26%|██▌       | 8/31 [00:27<01:36,  4.19s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-26 to 2025-05-03...


Processing weeks:  29%|██▉       | 9/31 [00:33<01:38,  4.48s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-03 to 2025-05-10...


Processing weeks:  32%|███▏      | 10/31 [00:37<01:31,  4.37s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-10 to 2025-05-17...


Processing weeks:  35%|███▌      | 11/31 [00:40<01:21,  4.10s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-17 to 2025-05-24...


Processing weeks:  39%|███▊      | 12/31 [00:44<01:16,  4.03s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-24 to 2025-05-31...


Processing weeks:  42%|████▏     | 13/31 [00:47<01:04,  3.58s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-31 to 2025-06-07...


Processing weeks:  45%|████▌     | 14/31 [00:51<01:07,  3.96s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-07 to 2025-06-14...


Processing weeks:  48%|████▊     | 15/31 [00:54<00:54,  3.41s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-14 to 2025-06-21...


Processing weeks:  52%|█████▏    | 16/31 [00:58<00:56,  3.75s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-21 to 2025-06-28...


Processing weeks:  55%|█████▍    | 17/31 [01:04<00:59,  4.26s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-28 to 2025-07-05...


Processing weeks:  58%|█████▊    | 18/31 [01:10<01:04,  4.99s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-05 to 2025-07-12...


Processing weeks:  61%|██████▏   | 19/31 [01:18<01:10,  5.89s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-12 to 2025-07-19...


Processing weeks:  65%|██████▍   | 20/31 [01:23<01:01,  5.58s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-19 to 2025-07-26...


Processing weeks:  68%|██████▊   | 21/31 [01:28<00:52,  5.28s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-26 to 2025-08-02...


Processing weeks:  71%|███████   | 22/31 [01:32<00:45,  5.09s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-02 to 2025-08-09...


Processing weeks:  74%|███████▍  | 23/31 [01:39<00:44,  5.54s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-09 to 2025-08-16...


Processing weeks:  77%|███████▋  | 24/31 [01:45<00:39,  5.61s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-16 to 2025-08-23...


Processing weeks:  81%|████████  | 25/31 [01:51<00:34,  5.75s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-23 to 2025-08-30...


Processing weeks:  84%|████████▍ | 26/31 [01:59<00:32,  6.48s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-30 to 2025-09-06...


Processing weeks:  87%|████████▋ | 27/31 [02:08<00:28,  7.23s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-09-06 to 2025-09-13...


Processing weeks:  90%|█████████ | 28/31 [02:17<00:22,  7.65s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-09-13 to 2025-09-20...


Processing weeks:  97%|█████████▋| 30/31 [02:21<00:04,  4.66s/it]

   -> Success: Found 191 half-hourly records for the week.
Processing week: 2025-09-20 to 2025-09-27...
   -> Info: No purchase data found for this week.
Processing week: 2025-09-27 to 2025-10-04...


Processing weeks: 100%|██████████| 31/31 [02:21<00:00,  4.58s/it]

   -> Info: No purchase data found for this week.

✅ Data successfully saved to data/half_hourly_purchases_2025-03-01_to_2025-09-30.parquet

Aggregated Half-Hourly Purchase Metrics
+----------------------+---------------------------------+-------------------+--------------------------+--------------------------------+----------------------------------+------------------------------------+-------------------------------------+----------------------------------+------------------------------+------------------------------+------------------------------+---------------------------------+----------------------------+----------------------------+-------------------------------+
| ACTIVITY_HALF_HOUR   |   HALF_HOURLY_TRANSACTION_COUNT |   HALF_HOURLY_GMV |   HALF_HOURLY_UNITS_SOLD |   HALF_HOURLY_PURCHASING_USERS |   HALF_HOURLY_PRODUCTS_PURCHASED |   HALF_HOURLY_TOTAL_PURCHASE_LINES |   HALF_HOURLY_AVG_TRANSACTION_VALUE |   HALF_HOURLY_AVG_UNITS_PER_LINE |   HALF_HOURLY_AVG_UNIT_PRICE |   H




# clicks

In [6]:

# Pulls half-hourly click aggregates from CLICKS one weekly window at a time,
# concatenates the weekly frames, and writes the result to a parquet file
# under data/.

# --- CONFIGURATION ---
# Both values are calendar days; ANALYSIS_END_DATE is treated as INCLUSIVE.
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'

print(f"\n--- Collecting Half-Hourly Raw Click Metrics (Weekly Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_half_hourly_data = []
failed_weeks = []  # (start_date, end_date) of every window whose query raised
current_week_start = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

# Exclusive upper bound of the whole pull: midnight after the inclusive end
# date. Every weekly window is clamped to it so the final window can no longer
# read rows dated after ANALYSIS_END_DATE.
range_end_exclusive = end_date_obj + timedelta(days=1)

# Ceiling division, so the progress-bar total equals the number of windows the
# loop below actually visits (0 when the range is empty or inverted).
total_days = (range_end_exclusive - current_week_start).days
week_count = max(-(-total_days // 7), 0)

# The context manager closes the bar even if the loop is interrupted.
with tqdm(total=week_count, desc="Processing weeks") as progress_bar:
    while current_week_start < range_end_exclusive:
        current_week_end = min(current_week_start + timedelta(days=7), range_end_exclusive)

        week_start_str = current_week_start.strftime('%Y-%m-%d 00:00:00')
        week_end_str = current_week_end.strftime('%Y-%m-%d 00:00:00')

        # tqdm.write keeps log lines from being interleaved with the bar.
        tqdm.write(f"Processing week: {current_week_start.date()} to {current_week_end.date()}...")

        # Window is half-open: [week_start_str, week_end_str).
        # NOTE(review): the *_pairs metrics count distinct CONCAT(a, '_', b)
        # strings; two different pairs could collide if an ID itself contains
        # '_' -- confirm the ID formats rule that out.
        query = f"""
        SELECT
            -- Half-hourly time bucket
            DATEADD(MINUTE, FLOOR(EXTRACT(MINUTE FROM OCCURRED_AT)/30)*30, 
                    DATE_TRUNC('HOUR', OCCURRED_AT))::TIMESTAMP_NTZ AS activity_half_hour,

            -- Raw Aggregate Metrics
            COUNT(INTERACTION_ID) AS half_hourly_click_count,
            COUNT(DISTINCT USER_ID) AS half_hourly_clicking_users,
            COUNT(DISTINCT VENDOR_ID) AS half_hourly_clicked_vendors,
            COUNT(DISTINCT CAMPAIGN_ID) AS half_hourly_clicked_campaigns,
            COUNT(DISTINCT PRODUCT_ID) AS half_hourly_clicked_products,
            COUNT(DISTINCT AUCTION_ID) AS half_hourly_clicked_auctions,
            
            -- Engagement intensity metrics
            COUNT(DISTINCT CONCAT(USER_ID, '_', VENDOR_ID)) AS half_hourly_user_vendor_pairs,
            COUNT(DISTINCT CONCAT(USER_ID, '_', PRODUCT_ID)) AS half_hourly_user_product_pairs,
            COUNT(DISTINCT CONCAT(VENDOR_ID, '_', PRODUCT_ID)) AS half_hourly_vendor_product_pairs,
            COUNT(DISTINCT CONCAT(CAMPAIGN_ID, '_', PRODUCT_ID)) AS half_hourly_campaign_product_pairs,
            COUNT(DISTINCT CONCAT(USER_ID, '_', CAMPAIGN_ID)) AS half_hourly_user_campaign_pairs
        FROM
            CLICKS
        WHERE
            OCCURRED_AT >= '{week_start_str}'::TIMESTAMP_NTZ
            AND OCCURRED_AT < '{week_end_str}'::TIMESTAMP_NTZ
        GROUP BY
            1
        ORDER BY
            1;
        """

        try:
            half_hourly_df_for_week = run_query(query)
            if not half_hourly_df_for_week.empty:
                all_half_hourly_data.append(half_hourly_df_for_week)
                tqdm.write(f"   -> Success: Found {half_hourly_df_for_week.shape[0]} half-hourly records for the week.")
            else:
                tqdm.write(f"   -> Info: No click data found for this week.")

        except Exception as e:
            # Best effort: keep pulling the remaining weeks, but remember the
            # gap so it is reported before anything is saved.
            failed_weeks.append((current_week_start.date(), current_week_end.date()))
            tqdm.write(f"   -> ERROR processing week starting {current_week_start.date()}: {e}")

        # Move to the next week
        current_week_start = current_week_end
        progress_bar.update(1)

# Surface failed windows so an incomplete extract is never mistaken for a full one.
if failed_weeks:
    print(f"\n⚠️ {len(failed_weeks)} weekly window(s) failed and are MISSING from the result:")
    for failed_start, failed_end in failed_weeks:
        print(f"   - {failed_start} to {failed_end}")

# --- Final Processing and Display ---
if all_half_hourly_data:
    final_df = pd.concat(all_half_hourly_data, ignore_index=True)

    # Convert every metric column (everything except the time bucket) to a numeric dtype
    numeric_columns = [col for col in final_df.columns if col != 'ACTIVITY_HALF_HOUR']
    for col in numeric_columns:
        final_df[col] = pd.to_numeric(final_df[col])

    # --- SAVE TO PARQUET IN DATA FOLDER ---
    output_filename = f"data/half_hourly_clicks_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df.head(25), "Aggregated Half-Hourly Click Metrics")

    print(f"\nDataset Summary:")
    print(f"Total records: {final_df.shape[0]:,}")
    print(f"Total columns: {final_df.shape[1]}")
    print(f"Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
else:
    print("\nNo click data found for the entire specified period.")



--- Collecting Half-Hourly Raw Click Metrics (Weekly Pulls) ---
Period: 2025-03-01 to 2025-09-30


Processing weeks:   0%|          | 0/31 [00:00<?, ?it/s]

Processing week: 2025-03-01 to 2025-03-08...


Processing weeks:   3%|▎         | 1/31 [00:03<01:43,  3.47s/it]

   -> Info: No click data found for this week.
Processing week: 2025-03-08 to 2025-03-15...


Processing weeks:   6%|▋         | 2/31 [00:09<02:24,  5.00s/it]

   -> Success: Found 48 half-hourly records for the week.
Processing week: 2025-03-15 to 2025-03-22...


Processing weeks:  10%|▉         | 3/31 [00:21<03:45,  8.07s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-03-22 to 2025-03-29...


Processing weeks:  13%|█▎        | 4/31 [00:34<04:27,  9.91s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-03-29 to 2025-04-05...


Processing weeks:  16%|█▌        | 5/31 [00:47<04:53, 11.28s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-05 to 2025-04-12...


Processing weeks:  19%|█▉        | 6/31 [01:02<05:13, 12.55s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-12 to 2025-04-19...


Processing weeks:  23%|██▎       | 7/31 [01:39<08:10, 20.43s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-19 to 2025-04-26...


Processing weeks:  26%|██▌       | 8/31 [02:16<09:55, 25.89s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-04-26 to 2025-05-03...


Processing weeks:  29%|██▉       | 9/31 [02:33<08:27, 23.07s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-03 to 2025-05-10...


Processing weeks:  32%|███▏      | 10/31 [02:52<07:33, 21.62s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-10 to 2025-05-17...


Processing weeks:  35%|███▌      | 11/31 [03:39<09:51, 29.58s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-17 to 2025-05-24...


Processing weeks:  39%|███▊      | 12/31 [04:22<10:37, 33.57s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-24 to 2025-05-31...


Processing weeks:  42%|████▏     | 13/31 [04:46<09:14, 30.81s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-05-31 to 2025-06-07...


Processing weeks:  45%|████▌     | 14/31 [05:33<10:02, 35.43s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-07 to 2025-06-14...


Processing weeks:  48%|████▊     | 15/31 [06:02<08:58, 33.64s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-14 to 2025-06-21...


Processing weeks:  52%|█████▏    | 16/31 [06:23<07:26, 29.80s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-21 to 2025-06-28...


Processing weeks:  55%|█████▍    | 17/31 [06:44<06:18, 27.05s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-06-28 to 2025-07-05...


Processing weeks:  58%|█████▊    | 18/31 [07:05<05:28, 25.24s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-05 to 2025-07-12...


Processing weeks:  61%|██████▏   | 19/31 [07:52<06:21, 31.78s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-12 to 2025-07-19...


Processing weeks:  65%|██████▍   | 20/31 [08:39<06:41, 36.52s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-19 to 2025-07-26...


Processing weeks:  68%|██████▊   | 21/31 [09:02<05:25, 32.51s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-07-26 to 2025-08-02...


Processing weeks:  71%|███████   | 22/31 [09:53<05:40, 37.80s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-02 to 2025-08-09...


Processing weeks:  74%|███████▍  | 23/31 [10:42<05:31, 41.44s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-09 to 2025-08-16...


Processing weeks:  77%|███████▋  | 24/31 [11:30<05:02, 43.21s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-16 to 2025-08-23...


Processing weeks:  81%|████████  | 25/31 [12:19<04:29, 44.91s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-23 to 2025-08-30...


Processing weeks:  84%|████████▍ | 26/31 [12:43<03:13, 38.80s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-08-30 to 2025-09-06...


Processing weeks:  87%|████████▋ | 27/31 [13:37<02:52, 43.22s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-09-06 to 2025-09-13...


Processing weeks:  90%|█████████ | 28/31 [14:13<02:03, 41.08s/it]

   -> Success: Found 336 half-hourly records for the week.
Processing week: 2025-09-13 to 2025-09-20...


Processing weeks:  94%|█████████▎| 29/31 [14:27<01:06, 33.16s/it]

   -> Success: Found 191 half-hourly records for the week.
Processing week: 2025-09-20 to 2025-09-27...


Processing weeks:  97%|█████████▋| 30/31 [14:28<00:23, 23.41s/it]

   -> Info: No click data found for this week.
Processing week: 2025-09-27 to 2025-10-04...


Processing weeks: 100%|██████████| 31/31 [14:29<00:00, 28.04s/it]

   -> Info: No click data found for this week.

✅ Data successfully saved to data/half_hourly_clicks_2025-03-01_to_2025-09-30.parquet

Aggregated Half-Hourly Click Metrics
+----------------------+---------------------------+------------------------------+-------------------------------+---------------------------------+--------------------------------+--------------------------------+---------------------------------+----------------------------------+------------------------------------+--------------------------------------+-----------------------------------+
| ACTIVITY_HALF_HOUR   |   HALF_HOURLY_CLICK_COUNT |   HALF_HOURLY_CLICKING_USERS |   HALF_HOURLY_CLICKED_VENDORS |   HALF_HOURLY_CLICKED_CAMPAIGNS |   HALF_HOURLY_CLICKED_PRODUCTS |   HALF_HOURLY_CLICKED_AUCTIONS |   HALF_HOURLY_USER_VENDOR_PAIRS |   HALF_HOURLY_USER_PRODUCT_PAIRS |   HALF_HOURLY_VENDOR_PRODUCT_PAIRS |   HALF_HOURLY_CAMPAIGN_PRODUCT_PAIRS |   HALF_HOURLY_USER_CAMPAIGN_PAIRS |
| 2025-03-14 00:00:00  |          




# impressions

In [5]:

# Pulls half-hourly impression aggregates from IMPRESSIONS one weekly window at
# a time, concatenates the weekly frames, and writes the result to a parquet
# file under data/.

# --- CONFIGURATION ---
# Both values are calendar days; ANALYSIS_END_DATE is treated as INCLUSIVE.
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'

print(f"\n--- Collecting Half-Hourly Raw Impression Metrics (Weekly Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_half_hourly_data = []
failed_weeks = []  # (start_date, end_date) of every window whose query raised
current_week_start = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

# Exclusive upper bound of the whole pull: midnight after the inclusive end
# date. Every weekly window is clamped to it so the final window can no longer
# read rows dated after ANALYSIS_END_DATE.
range_end_exclusive = end_date_obj + timedelta(days=1)

# Ceiling division, so the progress-bar total equals the number of windows the
# loop below actually visits (0 when the range is empty or inverted).
total_days = (range_end_exclusive - current_week_start).days
week_count = max(-(-total_days // 7), 0)

# The context manager closes the bar even if the loop is interrupted.
with tqdm(total=week_count, desc="Processing weeks") as progress_bar:
    while current_week_start < range_end_exclusive:
        current_week_end = min(current_week_start + timedelta(days=7), range_end_exclusive)

        week_start_str = current_week_start.strftime('%Y-%m-%d 00:00:00')
        week_end_str = current_week_end.strftime('%Y-%m-%d 00:00:00')

        # tqdm.write keeps log lines from being interleaved with the bar.
        tqdm.write(f"Processing week: {current_week_start.date()} to {current_week_end.date()}...")

        # Window is half-open: [week_start_str, week_end_str).
        # NOTE(review): the *_pairs metrics count distinct CONCAT(a, '_', b)
        # strings; two different pairs could collide if an ID itself contains
        # '_' -- confirm the ID formats rule that out.
        query = f"""
        SELECT
            -- Half-hourly time bucket
            DATEADD(MINUTE, FLOOR(EXTRACT(MINUTE FROM OCCURRED_AT)/30)*30, 
                    DATE_TRUNC('HOUR', OCCURRED_AT))::TIMESTAMP_NTZ AS activity_half_hour,

            -- Raw Aggregate Metrics
            COUNT(INTERACTION_ID) AS half_hourly_impression_count,
            COUNT(DISTINCT USER_ID) AS half_hourly_impressed_users,
            COUNT(DISTINCT VENDOR_ID) AS half_hourly_impressed_vendors,
            COUNT(DISTINCT CAMPAIGN_ID) AS half_hourly_impressed_campaigns,
            COUNT(DISTINCT PRODUCT_ID) AS half_hourly_impressed_products,
            COUNT(DISTINCT AUCTION_ID) AS half_hourly_impressed_auctions,
            
            -- Concentration metrics
            COUNT(DISTINCT CONCAT(USER_ID, '_', VENDOR_ID)) AS half_hourly_user_vendor_pairs,
            COUNT(DISTINCT CONCAT(USER_ID, '_', PRODUCT_ID)) AS half_hourly_user_product_pairs,
            COUNT(DISTINCT CONCAT(VENDOR_ID, '_', PRODUCT_ID)) AS half_hourly_vendor_product_pairs,
            COUNT(DISTINCT CONCAT(CAMPAIGN_ID, '_', PRODUCT_ID)) AS half_hourly_campaign_product_pairs
        FROM
            IMPRESSIONS
        WHERE
            OCCURRED_AT >= '{week_start_str}'::TIMESTAMP_NTZ
            AND OCCURRED_AT < '{week_end_str}'::TIMESTAMP_NTZ
        GROUP BY
            1
        ORDER BY
            1;
        """

        try:
            half_hourly_df_for_week = run_query(query)
            if not half_hourly_df_for_week.empty:
                all_half_hourly_data.append(half_hourly_df_for_week)
                tqdm.write(f"   -> Success: Found {half_hourly_df_for_week.shape[0]} half-hourly records for the week.")
            else:
                tqdm.write(f"   -> Info: No impression data found for this week.")

        except Exception as e:
            # Best effort: keep pulling the remaining weeks, but remember the
            # gap so it is reported before anything is saved.
            failed_weeks.append((current_week_start.date(), current_week_end.date()))
            tqdm.write(f"   -> ERROR processing week starting {current_week_start.date()}: {e}")

        # Move to the next week
        current_week_start = current_week_end
        progress_bar.update(1)

# Surface failed windows so an incomplete extract is never mistaken for a full one.
if failed_weeks:
    print(f"\n⚠️ {len(failed_weeks)} weekly window(s) failed and are MISSING from the result:")
    for failed_start, failed_end in failed_weeks:
        print(f"   - {failed_start} to {failed_end}")

# --- Final Processing and Display ---
if all_half_hourly_data:
    final_df = pd.concat(all_half_hourly_data, ignore_index=True)

    # Convert every metric column (everything except the time bucket) to a numeric dtype
    numeric_columns = [col for col in final_df.columns if col != 'ACTIVITY_HALF_HOUR']
    for col in numeric_columns:
        final_df[col] = pd.to_numeric(final_df[col])

    # --- SAVE TO PARQUET IN DATA FOLDER ---
    output_filename = f"data/half_hourly_impressions_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df.head(25), "Aggregated Half-Hourly Impression Metrics")

    print(f"\nDataset Summary:")
    print(f"Total records: {final_df.shape[0]:,}")
    print(f"Total columns: {final_df.shape[1]}")
    print(f"Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
else:
    print("\nNo impression data found for the entire specified period.")


--- Collecting Half-Hourly Raw Impression Metrics (Weekly Pulls) ---
Period: 2025-03-01 to 2025-09-30


Processing remaining weeks:   3%|▎         | 1/31 [12:12<6:06:24, 732.82s/it]

Processing week: 2025-03-01 to 2025-03-08...





   -> Info: No impression data found for this week.
Processing week: 2025-03-08 to 2025-03-15...




   -> Success: Found 48 half-hourly records for the week.
Processing week: 2025-03-15 to 2025-03-22...


KeyboardInterrupt: 

# auctions - users

In [None]:

# Pulls half-hourly auction/user aggregates from AUCTIONS_USERS one weekly
# window at a time, concatenates the weekly frames, and writes the result to a
# parquet file under data/.

# --- CONFIGURATION ---
# Both values are calendar days; ANALYSIS_END_DATE is treated as INCLUSIVE.
ANALYSIS_START_DATE = '2025-03-01'
ANALYSIS_END_DATE = '2025-09-30'

print(f"\n--- Collecting Half-Hourly Raw Auction User Metrics (Weekly Pulls) ---")
print(f"Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")

# --- Main Loop ---
all_half_hourly_data = []
failed_weeks = []  # (start_date, end_date) of every window whose query raised
current_week_start = datetime.strptime(ANALYSIS_START_DATE, '%Y-%m-%d')
end_date_obj = datetime.strptime(ANALYSIS_END_DATE, '%Y-%m-%d')

# Exclusive upper bound of the whole pull: midnight after the inclusive end
# date. Every weekly window is clamped to it so the final window can no longer
# read rows dated after ANALYSIS_END_DATE.
range_end_exclusive = end_date_obj + timedelta(days=1)

# Ceiling division, so the progress-bar total equals the number of windows the
# loop below actually visits (0 when the range is empty or inverted).
total_days = (range_end_exclusive - current_week_start).days
week_count = max(-(-total_days // 7), 0)

# The context manager closes the bar even if the loop is interrupted.
with tqdm(total=week_count, desc="Processing weeks") as progress_bar:
    while current_week_start < range_end_exclusive:
        current_week_end = min(current_week_start + timedelta(days=7), range_end_exclusive)

        week_start_str = current_week_start.strftime('%Y-%m-%d 00:00:00')
        week_end_str = current_week_end.strftime('%Y-%m-%d 00:00:00')

        # tqdm.write keeps log lines from being interleaved with the bar.
        tqdm.write(f"Processing week: {current_week_start.date()} to {current_week_end.date()}...")

        # Two-level aggregation over a single scan of the window:
        #   bucketed            -> one row per source row, tagged with its half-hour bucket
        #   bucket_totals       -> per-bucket auction / user counts
        #   user_auction_counts -> auctions per (bucket, user)
        #   user_stats          -> per-bucket AVG/MIN/MAX/STDDEV over USERS
        # The previous self-join attached each user's count to every one of
        # that user's auction rows, so AVG and STDDEV were weighted by auction
        # volume instead of being per-user statistics.
        # Rows with a NULL OPAQUE_USER_ID are left out of the per-user stats,
        # matching COUNT(DISTINCT OPAQUE_USER_ID), which also ignores NULLs.
        query = f"""
        WITH bucketed AS (
            SELECT
                -- Half-hourly time bucket
                DATEADD(MINUTE, FLOOR(EXTRACT(MINUTE FROM CREATED_AT)/30)*30,
                        DATE_TRUNC('HOUR', CREATED_AT))::TIMESTAMP_NTZ AS activity_half_hour,
                AUCTION_ID,
                OPAQUE_USER_ID
            FROM
                AUCTIONS_USERS
            WHERE
                CREATED_AT >= '{week_start_str}'::TIMESTAMP_NTZ
                AND CREATED_AT < '{week_end_str}'::TIMESTAMP_NTZ
        ),
        bucket_totals AS (
            SELECT
                activity_half_hour,
                COUNT(AUCTION_ID) AS half_hourly_auction_count,
                COUNT(DISTINCT AUCTION_ID) AS half_hourly_unique_auctions,
                COUNT(DISTINCT OPAQUE_USER_ID) AS half_hourly_auction_users
            FROM bucketed
            GROUP BY activity_half_hour
        ),
        user_auction_counts AS (
            SELECT
                activity_half_hour,
                OPAQUE_USER_ID,
                COUNT(AUCTION_ID) AS auctions_per_user
            FROM bucketed
            WHERE OPAQUE_USER_ID IS NOT NULL
            GROUP BY activity_half_hour, OPAQUE_USER_ID
        ),
        user_stats AS (
            SELECT
                activity_half_hour,
                AVG(auctions_per_user) AS avg_auctions_per_user,
                MIN(auctions_per_user) AS min_auctions_per_user,
                MAX(auctions_per_user) AS max_auctions_per_user,
                STDDEV(auctions_per_user) AS stddev_auctions_per_user
            FROM user_auction_counts
            GROUP BY activity_half_hour
        )
        SELECT
            t.activity_half_hour AS activity_half_hour,

            -- Auction metrics
            t.half_hourly_auction_count,
            t.half_hourly_unique_auctions,
            t.half_hourly_auction_users,

            -- User activity intensity (one observation per user per bucket)
            COALESCE(s.avg_auctions_per_user, 0) AS half_hourly_avg_auctions_per_user,
            COALESCE(s.min_auctions_per_user, 0) AS half_hourly_min_auctions_per_user,
            COALESCE(s.max_auctions_per_user, 0) AS half_hourly_max_auctions_per_user,
            COALESCE(s.stddev_auctions_per_user, 0) AS half_hourly_stddev_auctions_per_user
        FROM
            bucket_totals t
        LEFT JOIN
            user_stats s ON s.activity_half_hour = t.activity_half_hour
        ORDER BY
            1;
        """

        try:
            half_hourly_df_for_week = run_query(query)
            if not half_hourly_df_for_week.empty:
                all_half_hourly_data.append(half_hourly_df_for_week)
                tqdm.write(f"   -> Success: Found {half_hourly_df_for_week.shape[0]} half-hourly records for the week.")
            else:
                tqdm.write(f"   -> Info: No auction user data found for this week.")

        except Exception as e:
            # Best effort: keep pulling the remaining weeks, but remember the
            # gap so it is reported before anything is saved.
            failed_weeks.append((current_week_start.date(), current_week_end.date()))
            tqdm.write(f"   -> ERROR processing week starting {current_week_start.date()}: {e}")

        # Move to the next week
        current_week_start = current_week_end
        progress_bar.update(1)

# Surface failed windows so an incomplete extract is never mistaken for a full one.
if failed_weeks:
    print(f"\n⚠️ {len(failed_weeks)} weekly window(s) failed and are MISSING from the result:")
    for failed_start, failed_end in failed_weeks:
        print(f"   - {failed_start} to {failed_end}")

# --- Final Processing and Display ---
if all_half_hourly_data:
    final_df = pd.concat(all_half_hourly_data, ignore_index=True)

    # Convert every metric column (everything except the time bucket) to a numeric dtype
    numeric_columns = [col for col in final_df.columns if col != 'ACTIVITY_HALF_HOUR']
    for col in numeric_columns:
        final_df[col] = pd.to_numeric(final_df[col])

    # --- SAVE TO PARQUET IN DATA FOLDER ---
    output_filename = f"data/half_hourly_auction_users_{ANALYSIS_START_DATE}_to_{ANALYSIS_END_DATE}.parquet"
    final_df.to_parquet(output_filename, index=False)
    print(f"\n✅ Data successfully saved to {output_filename}")

    show_table(final_df.head(25), "Aggregated Half-Hourly Auction User Metrics")

    print(f"\nDataset Summary:")
    print(f"Total records: {final_df.shape[0]:,}")
    print(f"Total columns: {final_df.shape[1]}")
    print(f"Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
else:
    print("\nNo auction user data found for the entire specified period.")



--- Collecting Hourly Raw Auction Metrics (Daily Pulls) ---
Period: 2025-03-01 to 2025-09-30
Processing date: 2025-03-01...
   -> Info: No auction data found for this day.
Processing date: 2025-03-02...
   -> Info: No auction data found for this day.
Processing date: 2025-03-03...
   -> Info: No auction data found for this day.
Processing date: 2025-03-04...
   -> Info: No auction data found for this day.
Processing date: 2025-03-05...
   -> Info: No auction data found for this day.
Processing date: 2025-03-06...
   -> Info: No auction data found for this day.
Processing date: 2025-03-07...
   -> Info: No auction data found for this day.
Processing date: 2025-03-08...
   -> Info: No auction data found for this day.
Processing date: 2025-03-09...
   -> Info: No auction data found for this day.
Processing date: 2025-03-10...
   -> Info: No auction data found for this day.
Processing date: 2025-03-11...
   -> Info: No auction data found for this day.
Processing date: 2025-03-12...
   -> 