# Snowflake

### Config

In [1]:
import snowflake.connector
import dotenv
import os

# Load environment variables. load_dotenv() reads a local .env file (if one
# exists) into the process environment, so the password can live outside the
# notebook; variables that are already set are left untouched.
dotenv.load_dotenv()
SNOWFLAKE_PASSWORD = os.getenv("SNOWFLAKE_PASSWORD")

# Snowflake connection configuration (passed as keyword arguments to
# snowflake.connector.connect)
config = {
    "account": "SFEDU02-KFB85562",
    "user": "BISON",
    "password": SNOWFLAKE_PASSWORD,
    "role": "TRAINING_ROLE",
    "warehouse": "ANIMAL_TASK_WH",
    "database": "CATCH_ME_IF_YOU_CAN",
    "schema": "PUBLIC"
}

### Create the table (replacing any existing one)

In [2]:
# Define the schema of the target table.
# CREATE OR REPLACE recreates live_queries from scratch, discarding any
# existing table of the same name together with its rows.
create_table_query = """
CREATE OR REPLACE TABLE live_queries (
    instance_id VARCHAR,
    cluster_size INTEGER,
    user_id VARCHAR,
    database_id VARCHAR,
    query_id VARCHAR,
    arrival_timestamp TIMESTAMP,
    compile_duration_ms INTEGER,
    queue_duration_ms INTEGER,
    execution_duration_ms INTEGER,
    feature_fingerprint VARCHAR,
    was_aborted BOOLEAN,
    was_cached BOOLEAN,
    cache_source_query_id VARCHAR,
    query_type VARCHAR,
    num_permanent_tables_accessed INTEGER,
    num_external_tables_accessed INTEGER,
    num_system_tables_accessed INTEGER,
    read_table_ids VARCHAR,
    write_table_ids VARCHAR,
    mbytes_scanned INTEGER,
    mbytes_spilled INTEGER,
    num_joins INTEGER,
    num_scans INTEGER,
    num_aggregations INTEGER,
    dataset_type VARCHAR
);
"""

# Connect to Snowflake and execute the DDL. The context managers close the
# cursor and the connection even when the statement raises, so a failed run
# does not leave a session open.
with snowflake.connector.connect(**config) as conn, conn.cursor() as cur:
    cur.execute(create_table_query)

print("Snowflake table created successfully.")

Snowflake table created successfully.


### Insert data into the table at a fixed interval to simulate streaming

In [6]:
import snowflake.connector
import pandas as pd
import pyarrow.parquet as pq
import time
from tqdm import tqdm

# Playback settings
PLAYBACKSPEED = 10
FILEPATH = '../combined_sorted_redset_datasets.parquet'
INTERVAL = 1
_seconds = 60 * 60 * 24 * 90  # 3 months in seconds

# The row count is read from the Parquet metadata, so the data itself is not
# loaded at this point
_parquet_file = pq.ParquetFile(FILEPATH)
_entries = _parquet_file.metadata.num_rows
print(f"Entries: {_entries}")

# Batch size = rows that fall into one INTERVAL of the 3-month span, scaled
# by the playback speed and never smaller than a single row
_rows_per_interval = _entries / (_seconds / INTERVAL)
BATCH_SIZE = max(1, int(_rows_per_interval * PLAYBACKSPEED))
print(f"Interval: {INTERVAL}\nBatch size: {BATCH_SIZE}")

# Expected logical type of each known column (modify according to your
# schema). Columns that are not listed are handled as strings.
_COLUMN_TYPES = {
    'instance_id': 'str',
    'cluster_size': 'int',
    'user_id': 'str',
    'database_id': 'str',
    'query_id': 'str',
    'arrival_timestamp': 'datetime',
    'compile_duration_ms': 'int',
    'queue_duration_ms': 'int',
    'execution_duration_ms': 'int',
    'feature_fingerprint': 'str',
    'was_aborted': 'bool',
    'was_cached': 'bool',
    'cache_source_query_id': 'str',
    'query_type': 'str',
    'num_permanent_tables_accessed': 'int',
    'num_external_tables_accessed': 'int',
    'num_system_tables_accessed': 'int',
    'read_table_ids': 'str',
    'write_table_ids': 'str',
    'mbytes_scanned': 'int',
    'mbytes_spilled': 'int',
    'num_joins': 'int',
    'num_scans': 'int',
    'num_aggregations': 'int',
    'dataset_type': 'str'
}

_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'


def process_batch(batch):
    """Clean and convert a single batch.

    Works on a copy, so the DataFrame passed in is left unchanged.

    Per-column handling, driven by ``_COLUMN_TYPES``:

    * int / float: missing values become 0, then the column is cast.
    * bool: missing values become False, then the column is cast to bool.
    * datetime: values are parsed with ``errors='coerce'`` and rendered as
      ``'YYYY-MM-DD HH:MM:SS'`` strings; values that are missing or cannot
      be parsed become ``None`` (not NaN), so they can be bound as NULL.
    * str (also the fallback for unknown columns): values are stringified
      and stripped; missing values, and the literal texts ``'nan'``,
      ``'None'`` and ``'null'``, become the empty string.

    Args:
        batch: pandas DataFrame holding one batch of rows.

    Returns:
        A new DataFrame with the same columns, cleaned as described above.
    """
    batch = batch.copy()  # do not mutate the caller's frame

    for col in batch.columns:
        col_type = _COLUMN_TYPES.get(col, 'str')

        if col_type in ('int', 'float'):
            batch[col] = batch[col].fillna(0).astype(col_type)
        elif col_type == 'bool':
            batch[col] = batch[col].fillna(False).astype(bool)
        elif col_type == 'datetime':
            parsed = pd.to_datetime(batch[col], errors='coerce')
            formatted = parsed.dt.strftime(_TIMESTAMP_FORMAT)
            # strftime renders NaT as NaN; swap those for None so the value
            # is sent as NULL instead of a float NaN.
            batch[col] = pd.Series(
                [None if pd.isna(value) else value for value in formatted],
                index=batch.index,
                dtype=object,
            )
        else:  # string
            batch[col] = batch[col].astype(str).str.strip()
            batch[col] = batch[col].replace({'nan': '', 'None': '', 'null': ''})
            # Covers string dtypes where astype(str) keeps missing values.
            batch[col] = batch[col].fillna('')

    return batch

def main():
    """Main processing loop.

    Replays the Parquet file at FILEPATH into the ``live_queries`` table:
    each batch of BATCH_SIZE rows is cleaned with ``process_batch``,
    inserted and committed, and the loop then waits out the remainder of
    INTERVAL seconds. When the file is exhausted, all rows are deleted from
    the table and playback starts again, so this function only ends when it
    is interrupted or an error escapes the per-batch handler.

    A failure while converting or inserting a batch is printed and that
    batch is skipped; playback continues with the next one.
    """
    parquet_file = pq.ParquetFile(FILEPATH)
    # Ceiling division for the progress-bar total: avoids counting an extra
    # batch when the row count is an exact multiple of BATCH_SIZE.
    total_batches = (_entries + BATCH_SIZE - 1) // BATCH_SIZE

    with snowflake.connector.connect(**config) as conn, conn.cursor() as cur:
        while True:
            batches = parquet_file.iter_batches(batch_size=BATCH_SIZE)
            for i, arrow_batch in enumerate(tqdm(batches, total=total_batches)):
                start_time = time.time()

                try:
                    # Convert Arrow batch to pandas DataFrame
                    raw_batch = arrow_batch.to_pandas()
                    batch = process_batch(raw_batch)
                    rows = [tuple(row) for row in batch.to_numpy()]

                    columns = ', '.join(batch.columns)
                    placeholders = ', '.join(['%s'] * len(batch.columns))

                    cur.executemany(
                        f"INSERT INTO live_queries ({columns}) VALUES ({placeholders})",
                        rows
                    )
                    conn.commit()

                    print(f"{time.ctime()} - Batch {i+1}: Inserted {len(batch)} rows")

                except Exception as e:
                    # Best effort: report the failed batch and keep streaming
                    print(f"Error processing batch {i+1}: {str(e)}")

                # Only sleep for what is left of the interval after the work
                # above; announce it before sleeping so the log is in order.
                sleep_time = max(0, INTERVAL - (time.time() - start_time))
                print(f"Sleeping for {sleep_time} seconds")
                time.sleep(sleep_time)

            # If run once, delete the table data and start again
            cur.execute("DELETE FROM live_queries")
            conn.commit()
            print("Table data deleted, starting again")

# Entry-point guard: start the streaming loop only when this code is executed
# as the main program, not when it is imported as a module.
if __name__ == "__main__":
    main()


Entries: 12575460
Interval: 1
Batch size: 16


  0%|          | 1/785967 [00:01<252:06:20,  1.15s/it]

Mon Jan 27 10:53:29 2025 - Batch 1: Inserted 16 rows
Sleeping for 0 seconds


  0%|          | 2/785967 [00:02<232:31:58,  1.07s/it]

Mon Jan 27 10:53:30 2025 - Batch 2: Inserted 16 rows
Sleeping for 0.05856513977050781 seconds


  0%|          | 3/785967 [00:03<232:35:35,  1.07s/it]

Mon Jan 27 10:53:31 2025 - Batch 3: Inserted 16 rows
Sleeping for 0 seconds


  0%|          | 4/785967 [00:04<227:25:08,  1.04s/it]

Mon Jan 27 10:53:32 2025 - Batch 4: Inserted 16 rows
Sleeping for 0.0562591552734375 seconds


  0%|          | 5/785967 [00:05<224:34:32,  1.03s/it]

Mon Jan 27 10:53:33 2025 - Batch 5: Inserted 16 rows
Sleeping for 0.061862945556640625 seconds


  0%|          | 5/785967 [00:06<288:15:07,  1.32s/it]


KeyboardInterrupt: 