# DuckDB

### Delete all rows from the `live_queries` table

In [1]:
import duckdb

# Create a new DuckDB database (or connect to an existing one)
conn = duckdb.connect('redshift_queries.duckdb')

try:
    # Remove every row from live_queries; the table itself is kept
    conn.execute("DELETE FROM live_queries")
finally:
    # Always release the database file, even if the DELETE fails
    # (e.g. because the table has not been created yet)
    conn.close()

### Drop the old table and create a new one

In [2]:
import duckdb

# Create a new DuckDB database (or connect to an existing one)
conn = duckdb.connect('redshift_queries.duckdb')

# Define the schema of the live_queries table
create_table_query = """
CREATE TABLE live_queries (
    instance_id VARCHAR,
    cluster_size INTEGER,
    user_id VARCHAR,
    database_id VARCHAR,
    query_id VARCHAR,
    arrival_timestamp TIMESTAMP,
    compile_duration_ms INTEGER,
    queue_duration_ms INTEGER,
    execution_duration_ms INTEGER,
    feature_fingerprint VARCHAR,
    was_aborted BOOLEAN,
    was_cached BOOLEAN,
    cache_source_query_id VARCHAR,
    query_type VARCHAR,
    num_permanent_tables_accessed INTEGER,
    num_external_tables_accessed INTEGER,
    num_system_tables_accessed INTEGER,
    read_table_ids VARCHAR,
    write_table_ids VARCHAR,
    mbytes_scanned INTEGER,
    mbytes_spilled INTEGER,
    num_joins INTEGER,
    num_scans INTEGER,
    num_aggregations INTEGER,
    dataset_type VARCHAR,
);
"""

try:
    # Remove any previous version of the table so the CREATE below starts clean
    conn.execute("DROP TABLE IF EXISTS live_queries")

    # Execute the query to create the table
    conn.execute(create_table_query)
finally:
    # Always release the database file, even if one of the statements fails
    conn.close()

# Only reached when both statements succeeded
print("DuckDB database and table created successfully.")


DuckDB database and table created successfully.


### Insert data into the table at a fixed interval to simulate streaming

In [7]:
import duckdb
import pandas as pd
import time

# Speed-up factor: how many times faster than real time the 3 months of
# data are played back
_playbackspeed = 100

# 3 months of data (90 days), expressed in seconds
_seconds = 60 * 60 * 24 * 90
# NOTE(review): this reads the whole Parquet file only to count its rows, and
# load_and_clean_data() reads the same file again — consider sharing one load.
_entries = len(pd.read_parquet('combined_sorted_redset_datasets.parquet'))
print(f"Entries: {_entries}")

# Seconds to sleep between two consecutive inserts
INTERVAL = 0.5
print(f"Interval: {INTERVAL}")

# Rows per insert so that all entries are replayed within
# _seconds / _playbackspeed seconds of wall-clock time.
# Clamped to at least 1: int() truncates, and a batch size of 0 would make
# range(0, len(df), 0) in batch_generator raise ValueError.
BATCH_SIZE = max(1, int(_entries / (_seconds / INTERVAL) * _playbackspeed))
print(f"Batch size: {BATCH_SIZE}")


def load_and_clean_data(parquet_file):
    """Read a Parquet file into a DataFrame and normalise selected columns.

    Every column named in ``column_conversions`` is passed value by value
    through its conversion function; columns that are not listed are
    returned unchanged. A warning is printed for each listed column that
    is missing from the file.

    Args:
        parquet_file: Path of the Parquet file, handed to ``pd.read_parquet``.

    Returns:
        The loaded DataFrame with the listed columns converted in place.
    """
    # Read the entire Parquet file once
    df = pd.read_parquet(parquet_file)
    print(len(df), "rows loaded from", parquet_file)

    # Conversion helpers. Each one maps missing, empty or unconvertible
    # values to None. They catch Exception rather than using a bare
    # ``except`` so that KeyboardInterrupt / SystemExit still stop a long
    # conversion instead of being silently turned into a None value.
    def convert_to_string(s):
        """Return ``str(s)`` without surrounding whitespace, or None."""
        try:
            return str(s).strip() if not pd.isna(s) and s != '' else None
        except Exception:
            return None

    def convert_to_integer(s):
        """Return ``s`` as an int (going through float, so '2.7' -> 2), or None."""
        try:
            return int(float(s)) if not pd.isna(s) and s != '' else None
        except Exception:
            return None

    # ... include all other conversion functions from your code ...

    # Define column conversions
    column_conversions = {
        'instance_id': convert_to_string,
        'cluster_size': convert_to_integer,
        # ... include all other columns ...
    }

    # Apply conversions
    for col, func in column_conversions.items():
        if col in df.columns:
            df[col] = df[col].apply(func)
        else:
            print(f"Warning: Column '{col}' not found")

    return df

def batch_generator(df, batch_size=10):
    """Lazily split ``df`` into consecutive row slices.

    Each yielded slice holds ``batch_size`` rows taken in order with
    ``df.iloc``; the final slice holds whatever rows remain.
    """
    yield from (
        df.iloc[offset:offset + batch_size]
        for offset in range(0, len(df), batch_size)
    )

def main():
    """Replay the cleaned dataset into ``live_queries`` one batch at a time.

    The Parquet file is loaded and cleaned once. Each loop pass then takes
    the next batch, opens a connection to ``redshift_queries.duckdb``,
    inserts the batch, closes the connection and sleeps for ``INTERVAL``
    seconds. The loop ends when the batches are exhausted. Any other error
    is printed and the loop carries on with the following batch.
    """
    # One-time load and clean-up, then a lazy stream of row batches
    batch_iter = batch_generator(
        load_and_clean_data('combined_sorted_redset_datasets.parquet'),
        batch_size=BATCH_SIZE,
    )

    conn = None        # most recently opened connection; None until the first connect
    finished = False   # set once the batch stream is exhausted

    while not finished:
        try:
            batch = next(batch_iter)

            # A connection is opened for every batch and closed again below
            # (presumably so the database file is released between inserts
            # — confirm against the reader side)
            conn = duckdb.connect('redshift_queries.duckdb')

            # Expose the DataFrame to SQL under the name current_batch
            conn.register('current_batch', batch)

            # Append the batch to live_queries
            conn.execute("""
                INSERT INTO live_queries 
                SELECT * FROM current_batch
            """)
            conn.commit()
            print(f"{time.ctime()}: Inserted {len(batch)} rows into live_queries")

        except StopIteration:
            print("All data processed. Exiting...")
            finished = True

        except Exception as e:
            # Best effort: report the failure and move on to the next batch
            print(f"Writer Error: {e}")

        finally:
            if conn is not None:
                conn.close()

        # Pace the inserts; no pause after the final pass
        if not finished:
            time.sleep(INTERVAL)

# Entry point: start the replay only when this code runs as the main module
# (the guard is false when the file is imported).
if __name__ == "__main__":
    main()

Entries: 12575460
Batch size: 80
12575460 rows loaded from combined_sorted_redset_datasets.parquet


KeyboardInterrupt: 