# Extract Daily Auctions Data
This notebook extracts daily auction data from the Snowflake AUCTIONS_USERS table.

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from datetime import datetime, timedelta
from pathlib import Path

# Populate the process environment from a local .env file (if present) so the
# Snowflake credentials never have to be hard-coded in the notebook.
load_dotenv()

env = os.environ

# Open the session every later cell queries through. The database and schema
# are fixed; the warehouse falls back to COMPUTE_WH when it is not configured.
conn = snowflake.connector.connect(
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH',
    warehouse=env.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    account=env.get('SNOWFLAKE_ACCOUNT'),
    user=env.get('SNOWFLAKE_USER'),
    password=env.get('SNOWFLAKE_PASSWORD'),
)

print("Connected to Snowflake")

In [None]:
# Test query - check that the AUCTIONS_USERS table is accessible before
# starting the long-running extraction below.
test_query = """
SELECT * FROM AUCTIONS_USERS LIMIT 5;
"""

print("Testing AUCTIONS_USERS table access...")
# The context manager closes the cursor even when the query raises.
with conn.cursor() as cursor:
    cursor.execute(test_query)
    results = cursor.fetchall()
    # The first field of each description entry is the column name; using it
    # gives the preview real headers instead of positional 0..N labels.
    column_names = [column[0] for column in cursor.description]

df_test = pd.DataFrame(results, columns=column_names)
print(f"✅ Got {len(df_test)} test records")
print(df_test)

In [None]:
# Main extraction loop - one day at a time.
#
# Each day's per-user auction counts are written to a parquet file inside the
# checkpoint directory *before* the day's marker file is created. A resumed run
# reloads those files for already-processed days, so the combined output always
# covers every day handled so far, not only the days queried in this run.
START_DATE = '2025-03-14'
END_DATE = '2025-09-07'  # inclusive

# Create local checkpoint directory
checkpoint_dir = Path('daily_granular_auctions')
checkpoint_dir.mkdir(exist_ok=True)

# External drive path for final output
EXTERNAL_DRIVE_PATH = '/Volumes/rawat/data/topsort/daily_vendor_summaries'
# Touch the output location up front so a missing or unwritable destination
# fails now rather than after the whole date range has been extracted.
Path(EXTERNAL_DRIVE_PATH).mkdir(parents=True, exist_ok=True)

# Query for one day only - aggregate by user. The half-open range
# [day, next day) is supplied through connector parameter binding (%s) instead
# of being formatted into the SQL text, and the template is built once.
query = """
    SELECT
        DATE(CREATED_AT) AS date,
        OPAQUE_USER_ID AS user_id,
        COUNT(*) AS auctions
    FROM AUCTIONS_USERS
    WHERE CREATED_AT >= %s AND CREATED_AT < %s
    GROUP BY DATE(CREATED_AT), OPAQUE_USER_ID
"""

# Generate all dates
current_date = datetime.strptime(START_DATE, '%Y-%m-%d')
end_date = datetime.strptime(END_DATE, '%Y-%m-%d')

all_results = []
extraction_complete = True

while current_date <= end_date:
    date_str = current_date.strftime('%Y-%m-%d')
    next_date_str = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

    checkpoint_file = checkpoint_dir / f"checkpoint_{date_str}.txt"
    day_data_file = checkpoint_dir / f"auctions_{date_str}.parquet"

    # A day counts as processed only when both its marker and its data exist.
    # A marker without data (left by an earlier version of this loop that kept
    # results only in memory) is re-extracted so the day is not lost.
    if checkpoint_file.exists() and day_data_file.exists():
        print(f"Skipping {date_str} - already processed")
        all_results.append(pd.read_parquet(day_data_file))
        current_date += timedelta(days=1)
        continue

    print(f"Processing {date_str}...", end='')

    try:
        # The context manager closes the cursor even when the query fails.
        with conn.cursor() as cursor:
            cursor.execute(query, (date_str, next_date_str))
            results = cursor.fetchall()

        df_day = pd.DataFrame(results, columns=['date', 'user_id', 'auctions'])

        # Persist the data first and create the marker last, so an interrupted
        # write can never be mistaken for a completed day.
        df_day.to_parquet(day_data_file, index=False)
        checkpoint_file.touch()
        all_results.append(df_day)

        print(f" {len(df_day):,} users, {df_day['auctions'].sum():,} auctions")

    except Exception as e:
        # Stop at the first failure; completed days are already on disk and
        # will be reloaded by the next run.
        print(f" FAILED: {e}")
        extraction_complete = False
        break

    current_date += timedelta(days=1)

# Combine all results and save to external drive
if all_results:
    df_final = pd.concat(all_results, ignore_index=True)
    output_file = f'{EXTERNAL_DRIVE_PATH}/daily_user_auctions.parquet'
    df_final.to_parquet(output_file, index=False)

    print("\n=== FINAL SUMMARY ===")
    if not extraction_complete:
        print("WARNING: extraction stopped early - output is partial; re-run to resume")
    print(f"Saved {len(df_final):,} total rows to {output_file}")
    print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")
    print(f"Unique users: {df_final['user_id'].nunique():,}")
    print(f"Total auctions: {df_final['auctions'].sum():,}")
else:
    print("No data collected")

In [None]:
# Close connection
# Releases the Snowflake session held by `conn`. Run this cell last: the query
# cells above call conn.cursor() and need the connection to still be open.
conn.close()
print("Connection closed")