In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

# Read variables from a local .env file into the process environment so the
# credentials below never have to appear in the notebook itself.
load_dotenv()

# Open the Snowflake session used by the rest of the notebook.
# Credentials and account come from the environment; only the warehouse has a
# fallback value. Database and schema are fixed for this analysis.
conn = snowflake.connector.connect(
    account=os.environ.get('SNOWFLAKE_ACCOUNT'),
    user=os.environ.get('SNOWFLAKE_USER'),
    password=os.environ.get('SNOWFLAKE_PASSWORD'),
    warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH',
)

print("Connected")

Connected


In [4]:

import pandas as pd
from pathlib import Path
from datetime import date, timedelta
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import snowflake.connector

# SETUP AND CONFIGURATION
load_dotenv()
# `conn` is not re-created in this cell; the extraction loop below uses the
# Snowflake connection object that is already present in the kernel.

# Inclusive first and last calendar day of the extraction window.
ANALYSIS_START = date(year=2025, month=3, day=14)
ANALYSIS_END = date(year=2025, month=9, day=7)

# One parquet file per day is written under OUTPUT_DIR; the directory (and any
# missing parents) is created up front so the loop can write straight into it.
BASE_PATH = Path("/Users/pranjal/Code/marketplace-incrementality/daily_summaries/data")
OUTPUT_DIR = BASE_PATH.joinpath("product_daily_auctions_dataset")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the effective settings so the run log is self-describing.
print(
    "--- Configuration ---",
    f"Analysis Period: {ANALYSIS_START} to {ANALYSIS_END}",
    f"Output Directory: {OUTPUT_DIR}",
    sep="\n",
)

# OPTIMIZED AUCTION QUERY - NO JOINS
def get_auctions_query(date_str: str, next_date_str: str) -> str:
    """Build the SQL that aggregates one day of AUCTIONS_RESULTS per product.

    The query touches a single table (no joins), keeps rows whose CREATED_AT
    falls in the half-open window ``[date_str, next_date_str)`` and whose
    PRODUCT_ID is not null, and returns one row per PRODUCT_ID.

    Args:
        date_str: Window start (inclusive). Also emitted, cast to DATE, as the
            ``date`` column of every result row.
        next_date_str: Window end (exclusive).

    Returns:
        The SQL text with both values substituted in as quoted literals.

    Note:
        The arguments are interpolated verbatim into the statement, so they
        must be trusted, well-formed date strings.
    """
    sql_template = """
    SELECT
        PRODUCT_ID,
        '{date_str}'::DATE AS date,
        ANY_VALUE(VENDOR_ID) AS vendor_id,
        ANY_VALUE(CAMPAIGN_ID) AS campaign_id,
        COUNT(DISTINCT AUCTION_ID) AS product_auctions_count,
        COUNT(*) AS total_bids_for_product,
        SUM(IFF(IS_WINNER, 1, 0)) AS total_wins_for_product,
        ROUND(AVG(RANKING), 2) AS avg_bid_rank_for_product,
        COUNT(DISTINCT VENDOR_ID) AS distinct_bidders_for_product,
    FROM
        AUCTIONS_RESULTS
    WHERE
        CREATED_AT >= '{date_str}'::TIMESTAMP_NTZ 
        AND CREATED_AT < '{next_date_str}'::TIMESTAMP_NTZ
        AND PRODUCT_ID IS NOT NULL
    GROUP BY
        PRODUCT_ID
    """
    return sql_template.format(date_str=date_str, next_date_str=next_date_str)

# Pull one day of per-product auction aggregates at a time and persist each day
# as its own parquet file, so an interrupted run can resume where it stopped.
print("\n--- Starting Daily Product-Level Auction Extraction (Optimized) ---")
date_list = pd.date_range(start=ANALYSIS_START, end=ANALYSIS_END, freq='D')

# Days that need attention after the run: queries that raised, and days for
# which the query returned no rows (no file is written for those).
failed_dates = []
empty_dates = []

for current_date in tqdm(date_list, desc="Extracting Daily Auction Data"):
    date_str = current_date.strftime('%Y-%m-%d')
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

    output_file = OUTPUT_DIR / f"data_{date_str}.parquet"

    # A finished file means this day was extracted by an earlier run.
    if output_file.exists():
        continue

    # Write to a scratch file first and rename it into place only once it is
    # complete. Writing directly to `output_file` would let an interrupted run
    # leave a truncated file that the exists() check above then skips forever.
    # The leading dot keeps a stray scratch file out of directory-level
    # parquet reads, which ignore dot-prefixed files by default.
    tmp_file = OUTPUT_DIR / f".data_{date_str}.parquet.tmp"

    try:
        query = get_auctions_query(date_str, next_date_str)
        df_day = pd.read_sql(query, conn)

        if df_day.empty:
            empty_dates.append(date_str)
            continue

        df_day.to_parquet(tmp_file, index=False, engine='pyarrow', compression='snappy')
        tmp_file.replace(output_file)  # atomic rename within the same directory
        print(f"  {date_str}: {len(df_day):,} products with auction activity")

    except Exception as e:
        # Best-effort: keep going with the remaining days, but remember the
        # failure so it is surfaced in the end-of-run summary.
        failed_dates.append(date_str)
        print(f"\nProcessing for {date_str} FAILED: {e}")

    finally:
        # Remove a partially written scratch file on any exit path
        # (including a manual kernel interrupt).
        if tmp_file.exists():
            tmp_file.unlink()

print("\n--- Daily Auction Extraction Complete ---")
if empty_dates:
    print(f"Days with no auction rows (no file written): {', '.join(empty_dates)}")
if failed_dates:
    print(f"Days that FAILED and must be re-run: {', '.join(failed_dates)}")

# VERIFICATION
# Load every daily parquet file back as one frame and print headline totals as
# a sanity check on the extraction.
print("\n--- Verifying the Complete Auctions Dataset ---")
try:
    # NOTE(review): the extraction log shows roughly 5M rows per day, so this
    # reads the whole analysis window into memory at once. If that does not
    # fit, aggregate per file instead of loading the directory in one call.
    df_full_auctions = pd.read_parquet(OUTPUT_DIR)

    print(f"✅ Successfully loaded complete dataset with {len(df_full_auctions):,} rows.")

    print(f"\n=== AUCTIONS EXTRACTION FINAL SUMMARY ===")

    # The DATE column may come back from parquet as plain `datetime.date`
    # objects, which have no `.date()` method. Normalising to pandas
    # timestamps first makes the range report work for either representation.
    auction_dates = pd.to_datetime(df_full_auctions['DATE'])
    print(f"Date range: {auction_dates.min().date()} to {auction_dates.max().date()}")
    print(f"Unique products with auction activity: {df_full_auctions['PRODUCT_ID'].nunique():,}")

    total_bids = df_full_auctions['TOTAL_BIDS_FOR_PRODUCT'].sum()
    total_wins = df_full_auctions['TOTAL_WINS_FOR_PRODUCT'].sum()
    print(f"Total bids extracted: {total_bids:,}")
    print(f"Total wins extracted: {total_wins:,}")
    # Guard the ratio so a dataset with zero bids reports cleanly instead of
    # dividing by zero.
    if total_bids:
        print(f"Overall win rate: {(total_wins/total_bids*100):.2f}%")
    else:
        print("Overall win rate: n/a (no bids)")

    print("\nSample of 5 rows:")
    print(df_full_auctions.head().to_markdown(index=False))

    print("\nSchema and Memory Usage:")
    df_full_auctions.info(memory_usage='deep')

except Exception as e:
    print(f"\n❌ Could not load or verify the full dataset. Error: {e}")


--- Configuration ---
Analysis Period: 2025-03-14 to 2025-09-07
Output Directory: /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/product_daily_auctions_dataset

--- Starting Daily Product-Level Auction Extraction (Optimized) ---


Extracting Daily Auction Data:   0%|          | 0/178 [00:00<?, ?it/s]

  df_day = pd.read_sql(query, conn)


  2025-04-15: 5,073,047 products with auction activity
  2025-04-16: 5,067,440 products with auction activity
  2025-04-17: 5,134,052 products with auction activity
  2025-04-18: 5,112,569 products with auction activity
  2025-04-19: 5,100,738 products with auction activity
  2025-04-20: 5,055,776 products with auction activity
  2025-04-21: 5,075,690 products with auction activity
  2025-04-22: 5,084,195 products with auction activity
  2025-04-23: 5,065,633 products with auction activity
  2025-04-24: 5,059,353 products with auction activity
  2025-04-25: 5,075,229 products with auction activity
  2025-04-26: 5,094,619 products with auction activity
  2025-04-27: 5,079,354 products with auction activity
  2025-04-28: 5,093,009 products with auction activity
  2025-04-29: 5,127,847 products with auction activity
  2025-04-30: 5,133,006 products with auction activity
  2025-05-01: 5,157,220 products with auction activity
  2025-05-02: 5,154,319 products with auction activity
  2025-05-

: 