# Snowflake Realtime Data Simulation

In [None]:
import os
import time
import pandas as pd
import pyarrow.parquet as pq
import snowflake.connector
from tqdm.notebook import tqdm

# Load environment variables
# The Snowflake password is read from the process environment (None if unset).
DATA_FILE_PATH = '../streaming_data.parquet'
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")

# Snowflake connection configuration, passed as keyword arguments to
# snowflake.connector.connect().
SNOWFLAKE_CONFIG = dict(
    account="SFEDU02-KFB85562",
    user="BISON",
    password=SNOWFLAKE_PASSWORD,
    role="TRAINING_ROLE",
    warehouse="ANIMAL_TASK_WH",
    database="CATCH_ME_IF_YOU_CAN",
    schema="PUBLIC",
)

# Simulation configuration
INSERT_INTERVAL_SECONDS = 1  # Seconds between two consecutive inserts
SIMULATION_DURATION_MINUTES = 60  # Length of one simulation pass, in minutes
simulation_duration_seconds = 60 * SIMULATION_DURATION_MINUTES

# The row count comes from the parquet metadata, so the data itself is not
# loaded here.
parquet_file = pq.ParquetFile(DATA_FILE_PATH)
total_entries = parquet_file.metadata.num_rows
print(
    f"Total Entries: {total_entries}",
    f"Simulation Duration: {SIMULATION_DURATION_MINUTES} minutes",
    f"Interval Between Inserts: {INSERT_INTERVAL_SECONDS} seconds",
    sep="\n",
)


def process_batch(batch_df):
    """Normalise the columns of one batch and return the same DataFrame.

    Each column is converted according to its expected kind; columns that
    are not listed below are treated as text. The frame is modified in
    place and also returned.

    Conversions:
        * integer columns: missing values become 0, then cast to int
        * boolean columns: missing values become False, then cast to bool
        * timestamp columns: parsed with pandas, unparseable values -> NaT
        * text columns: cast to str, stripped, and the tokens
          'nan' / 'None' / 'null' replaced by the empty string
    """
    integer_columns = (
        'cluster_size',
        'compile_duration_ms',
        'queue_duration_ms',
        'execution_duration_ms',
        'num_permanent_tables_accessed',
        'num_external_tables_accessed',
        'num_system_tables_accessed',
        'mbytes_scanned',
        'mbytes_spilled',
        'num_joins',
        'num_scans',
        'num_aggregations',
    )
    boolean_columns = ('was_aborted', 'was_cached')
    timestamp_columns = ('arrival_timestamp',)
    text_columns = (
        'instance_id',
        'user_id',
        'database_id',
        'query_id',
        'feature_fingerprint',
        'cache_source_query_id',
        'query_type',
        'read_table_ids',
        'write_table_ids',
        'dataset_type',
    )

    kind_of = {
        **dict.fromkeys(text_columns, 'str'),
        **dict.fromkeys(integer_columns, 'int'),
        **dict.fromkeys(boolean_columns, 'bool'),
        **dict.fromkeys(timestamp_columns, 'datetime'),
    }
    # String spellings of "missing" that appear once a column is cast to str
    blank_tokens = {'nan': '', 'None': '', 'null': ''}

    for name in batch_df.columns:
        kind = kind_of.get(name, 'str')
        series = batch_df[name]

        if kind == 'datetime':
            converted = pd.to_datetime(series, errors='coerce')
        elif kind == 'bool':
            converted = series.fillna(False).astype(bool)
        elif kind in ('int', 'float'):
            converted = series.fillna(0).astype(kind)
        else:
            stripped = series.astype(str).str.strip()
            converted = stripped.replace(blank_tokens).fillna('')

        batch_df[name] = converted

    return batch_df


def _insert_interval_rows(conn, cur, interval_df, simulation_time):
    """Insert one interval's rows into live_queries with a single INSERT.

    Args:
        conn: Open Snowflake connection; committed after a successful insert.
        cur: Cursor belonging to ``conn``.
        interval_df: Rows selected for the current interval. Not modified.
        simulation_time: Current simulation second, used only in the error
            message.

    Errors raised by the insert are printed and swallowed so that one failed
    interval does not stop the simulation.
    """
    # Format the timestamp on an explicit copy: the caller passes a selection
    # of the pending buffer, and assigning into it directly triggers pandas'
    # SettingWithCopyWarning.
    interval_df = interval_df.copy()
    interval_df['arrival_timestamp'] = interval_df['arrival_timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # Convert to list of tuples for SQL insertion
    rows = [tuple(row) for row in interval_df.to_numpy()]

    # Generate bulk INSERT query: one "(%s, ..., %s)" group per row
    columns = ', '.join(interval_df.columns)
    placeholders_per_row = ', '.join(['%s'] * len(interval_df.columns))
    values_placeholders = ', '.join([f'({placeholders_per_row})'] * len(rows))
    sql = f"INSERT INTO live_queries ({columns}) VALUES {values_placeholders}"

    # Flatten rows into a single parameter list matching the placeholders
    params = [param for row in rows for param in row]

    try:
        cur.execute(sql, params)
        conn.commit()
    except Exception as e:
        print(f"Error inserting data at simulation time {simulation_time}: {e}")


def main():
    """Replay the parquet file into the live_queries table, forever.

    The time span between the first and last ``arrival_timestamp`` in the file
    is compressed into ``simulation_duration_seconds``. Every
    ``INSERT_INTERVAL_SECONDS`` the rows whose original timestamp falls into
    the matching (compressed) window are inserted. When a pass finishes, the
    rows written by it are deleted and the simulation starts over.

    NOTE(review): the first row / last row are taken as the earliest / latest
    timestamps, and batches are read only until the buffered maximum passes
    the window end - this presumably relies on the file being ordered by
    arrival_timestamp; confirm.

    Returns early (after printing an error) if the timestamps cannot be read
    or the original time span is zero. Otherwise it does not return.
    """
    parquet_file = pq.ParquetFile(DATA_FILE_PATH)

    # Get the earliest and latest arrival timestamps
    try:
        first_batch_arrow = next(parquet_file.iter_batches(batch_size=1))
        first_batch_df = process_batch(first_batch_arrow.to_pandas())
        original_start_time = pd.to_datetime(first_batch_df['arrival_timestamp'].iloc[0])

        num_row_groups = parquet_file.num_row_groups
        last_row_group_table = parquet_file.read_row_group(num_row_groups - 1)
        last_row_group_df = process_batch(last_row_group_table.to_pandas())
        original_end_time = pd.to_datetime(last_row_group_df['arrival_timestamp'].iloc[-1])
    except Exception as e:
        print(f"Error reading timestamps from parquet file: {e}")
        return

    original_time_span_seconds = (original_end_time - original_start_time).total_seconds()
    if original_time_span_seconds == 0:
        print("Error: Original data time span is zero.")
        return

    compression_factor = original_time_span_seconds / simulation_duration_seconds
    print(f"Original Data Time Span: {original_time_span_seconds} seconds")
    print(f"Compression Factor: {compression_factor}")

    start_date_str = original_start_time.strftime('%Y-%m-%d %H:%M:%S')
    print(f"Simulation Data Starts and Static Data Ends at: {start_date_str}")

    # Pause between passes, also used as back-off after a failed DELETE
    restart_delay_seconds = 5
    # Bound like the INSERT parameters instead of being formatted into the SQL
    delete_sql = "DELETE FROM live_queries WHERE arrival_timestamp >= %s"

    with snowflake.connector.connect(**SNOWFLAKE_CONFIG) as conn:
        with conn.cursor() as cur:
            while True:
                # Delete previous data in live_queries with arrival_timestamp >= start_date
                try:
                    cur.execute(delete_sql, (start_date_str,))
                    delete_count = cur.rowcount
                    conn.commit()
                    print(f"Deleted {delete_count} rows from previous simulation.")
                except Exception as e:
                    print(f"Error deleting previous data: {e}")
                    # Back off before retrying so a persistent failure does
                    # not become a tight loop against the warehouse.
                    time.sleep(restart_delay_seconds)
                    continue  # Skip to the next iteration

                pending_data = pd.DataFrame()
                batch_iter = parquet_file.iter_batches(batch_size=10000)
                simulation_time = 0
                data_iterator_exhausted = False

                total_simulation_steps = int(simulation_duration_seconds / INSERT_INTERVAL_SECONDS) + 1
                progress_bar = tqdm(total=total_simulation_steps, desc="Simulation Time")

                try:
                    while simulation_time <= simulation_duration_seconds:
                        start_time = time.time()

                        # Compute original time window for current interval
                        original_start_time_interval = original_start_time + pd.Timedelta(
                            seconds=simulation_time * compression_factor
                        )
                        original_end_time_interval = original_start_time + pd.Timedelta(
                            seconds=(simulation_time + INSERT_INTERVAL_SECONDS) * compression_factor
                        )

                        # Read data batches until we have all data up to original_end_time_interval
                        while not data_iterator_exhausted:
                            buffer_covers_window = (
                                not pending_data.empty
                                and pending_data['arrival_timestamp'].max() >= original_end_time_interval
                            )
                            if buffer_covers_window:
                                break
                            try:
                                arrow_batch = next(batch_iter)
                            except StopIteration:
                                data_iterator_exhausted = True
                                break
                            batch_df = process_batch(arrow_batch.to_pandas())
                            if pending_data.empty:
                                # Nothing buffered: adopt the batch directly
                                # instead of concatenating onto an empty frame.
                                pending_data = batch_df.reset_index(drop=True)
                            else:
                                pending_data = pd.concat([pending_data, batch_df], ignore_index=True)

                        # Select data for current interval (start inclusive, end exclusive)
                        mask = (
                            (pending_data['arrival_timestamp'] >= original_start_time_interval)
                            & (pending_data['arrival_timestamp'] < original_end_time_interval)
                        )
                        current_interval_data = pending_data.loc[mask]

                        if not current_interval_data.empty:
                            _insert_interval_rows(conn, cur, current_interval_data, simulation_time)

                        # Remove the selected rows from pending_data (also when
                        # the insert failed: the interval is not retried)
                        pending_data = pending_data.loc[~mask].reset_index(drop=True)

                        # Sleep for any remaining time in interval
                        elapsed_time = time.time() - start_time
                        sleep_time = max(0, INSERT_INTERVAL_SECONDS - elapsed_time)
                        time.sleep(sleep_time)

                        simulation_time += INSERT_INTERVAL_SECONDS
                        progress_bar.update(1)

                        # If data iterator is exhausted and pending_data is empty, we can break early
                        if data_iterator_exhausted and pending_data.empty:
                            break
                finally:
                    # Close the bar even if the pass is interrupted or raises
                    progress_bar.close()

                print("Simulation complete. Restarting...")
                time.sleep(restart_delay_seconds)  # Sleep for 5 seconds before restarting


if __name__ == "__main__":
    main()


Total Entries: 1500598
Simulation Duration: 60 minutes
Interval Between Inserts: 5 seconds
Original Data Time Span: 2637643.610532 seconds
Compression Factor: 732.6787807033334
Simulation Data Starts and Static Data Ends at: 2024-04-30 11:18:59
Deleted 0 rows from previous simulation.


Simulation Time:   0%|          | 0/721 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_interval_data['arrival_timestamp'] = current_interval_data['arrival_timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_interval_data['arrival_timestamp'] = current_interval_data['arrival_timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

## Create or Replace Table (DON'T RUN THIS — IT WILL DELETE ALL THE DATA)

In [None]:
import snowflake.connector
import os

# Load environment variables
# To use, create a .env file in the root directory and insert the following:
# SNOWFLAKE_PASSWORD=your_password
# NOTE(review): this cell only calls os.getenv, which reads the process
# environment; nothing here loads a .env file itself - confirm how the
# variable is exported before running.
SNOWFLAKE_PASSWORD = os.getenv("SNOWFLAKE_PASSWORD")

# Snowflake connection configuration
SNOWFLAKE_CONFIG = {
    "account": "SFEDU02-KFB85562",
    "user": "BISON",
    "password": SNOWFLAKE_PASSWORD,
    "role": "TRAINING_ROLE",
    "warehouse": "ANIMAL_TASK_WH",
    "database": "CATCH_ME_IF_YOU_CAN",
    "schema": "PUBLIC"
}

# CREATE OR REPLACE drops any existing live_queries table and its rows.
create_table_query = """
CREATE OR REPLACE TABLE live_queries (
    instance_id VARCHAR,
    cluster_size INTEGER,
    user_id VARCHAR,
    database_id VARCHAR,
    query_id VARCHAR,
    arrival_timestamp TIMESTAMP,
    compile_duration_ms INTEGER,
    queue_duration_ms INTEGER,
    execution_duration_ms INTEGER,
    feature_fingerprint VARCHAR,
    was_aborted BOOLEAN,
    was_cached BOOLEAN,
    cache_source_query_id VARCHAR,
    query_type VARCHAR,
    num_permanent_tables_accessed INTEGER,
    num_external_tables_accessed INTEGER,
    num_system_tables_accessed INTEGER,
    read_table_ids VARCHAR,
    write_table_ids VARCHAR,
    mbytes_scanned INTEGER,
    mbytes_spilled INTEGER,
    num_joins INTEGER,
    num_scans INTEGER,
    num_aggregations INTEGER,
    dataset_type VARCHAR
);
"""

# Keep the answer in its own name: rebinding `input` would shadow the builtin
# and make this cell fail with a TypeError when it is run a second time.
confirmation = input("Do you really want to create the table? (yes/no): ")
if confirmation != "yes":
    # No exit() here: the cell simply ends without touching the database.
    print("Aborting table creation.")
else:
    # The context managers close the cursor and the connection even if the
    # statement raises.
    with snowflake.connector.connect(**SNOWFLAKE_CONFIG) as conn:
        with conn.cursor() as cur:
            cur.execute(create_table_query)

    print("Snowflake table created successfully.")