# Download data, sort by date, split into static and streaming data, and save

In [8]:
import pandas as pd
import os
import gc  # Import garbage collector interface


# Check if the data is already downloaded
if not os.path.exists("sample_0.01_serverless.parquet"):
      !aws s3 cp --no-sign-request s3://redshift-downloads/redset/serverless/sample_0.01.parquet sample_0.01_serverless.parquet
      print("Serverless data downloaded.")
else:
      print("Serverless data already exists.")

if not os.path.exists("sample_0.01_provisioned.parquet"):
      !aws s3 cp --no-sign-request s3://redshift-downloads/redset/provisioned/sample_0.01.parquet sample_0.01_provisioned.parquet
      print("Provisioned data downloaded.")
else:
      print("Provisioned data already exists.")


# Read both Redset samples, tagging every row with the dataset it came from
loaded_frames = []
for dataset_kind in ("serverless", "provisioned"):
    print(f"Loading {dataset_kind} dataset...")
    frame = pd.read_parquet(f"sample_0.01_{dataset_kind}.parquet")
    frame['dataset_type'] = dataset_kind  # Provenance flag for this sample
    loaded_frames.append(frame)

# Stack the two samples into a single DataFrame with a fresh positional index
print("Combining datasets...")
combined_df = pd.concat(loaded_frames, ignore_index=True)

# The per-dataset frames are no longer needed once concatenated
del frame, loaded_frames, dataset_kind
gc.collect()  # Force garbage collection

# Order the rows chronologically by their arrival time
print("Sorting by arrival_timestamp...")
combined_df = combined_df.sort_values('arrival_timestamp')

# Remove rows that are identical in every column
print("Dropping duplicates...")
combined_df = combined_df.drop_duplicates()

# Row position of the 66% mark: rows before it form the static dataset,
# rows from it onwards form the streaming dataset
split_index = int(len(combined_df) * 0.66)

# (label used in the log line, output file, row positions) for each partition
partitions = [
    ("Static", "static_data.parquet", slice(None, split_index)),
    ("Streaming", "streaming_data.parquet", slice(split_index, None)),
]

for label, parquet_path, row_slice in partitions:
    subset = combined_df.iloc[row_slice]

    # Report the first and last arrival timestamp in this partition
    timestamps = subset['arrival_timestamp']
    print(f"{label} dataset - Start day: {timestamps.iloc[0]}, "
          f"End day: {timestamps.iloc[-1]}")

    # Write the partition to Parquet without the DataFrame index
    print(f"Saving {parquet_path}...")
    subset.to_parquet(parquet_path, index=False)

    # Drop the references to this partition before handling the next one
    del subset, timestamps
    gc.collect()  # Force garbage collection

# Drop the loop bookkeeping and the combined DataFrame itself
del partitions, label, parquet_path, row_slice, combined_df
gc.collect()  # Force garbage collection

print("Done!")

Serverless data already exists.
Provisioned data already exists.
Loading serverless dataset...
Loading provisioned dataset...
Combining datasets...
Sorting by arrival_timestamp...
Dropping duplicates...
Static dataset - Start day: 2024-03-01 00:00:05.086395, End day: 2024-04-30 11:18:57.975306
Saving static_data.parquet...
Streaming dataset - Start day: 2024-04-30 11:18:59.069925, End day: 2024-05-30 23:59:42.680457
Saving streaming_data.parquet...
Done!


## Convert data types in the static data for direct upload to Snowflake

In [9]:
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm

# Paths to the input and output Parquet files for the type-conversion step.
# The input is read batch by batch and a cleaned copy is written to the output;
# the input file itself is not modified.
INPUT_PARQUET_FILE = 'static_data.parquet'  # Replace with your input file path
OUTPUT_PARQUET_FILE = 'static_data_ready2upload.parquet'  # Replace with your desired output file path

# Function to process a batch
# Target type for each known column (modify according to your schema).
# Columns that are not listed here are treated as strings by process_batch.
_COLUMN_TYPES = {
    'instance_id': 'str',
    'cluster_size': 'int',
    'user_id': 'str',
    'database_id': 'str',
    'query_id': 'str',
    'arrival_timestamp': 'datetime',
    'compile_duration_ms': 'int',
    'queue_duration_ms': 'int',
    'execution_duration_ms': 'int',
    'feature_fingerprint': 'str',
    'was_aborted': 'bool',
    'was_cached': 'bool',
    'cache_source_query_id': 'str',
    'query_type': 'str',
    'num_permanent_tables_accessed': 'int',
    'num_external_tables_accessed': 'int',
    'num_system_tables_accessed': 'int',
    'read_table_ids': 'str',
    'write_table_ids': 'str',
    'mbytes_scanned': 'int',
    'mbytes_spilled': 'int',
    'num_joins': 'int',
    'num_scans': 'int',
    'num_aggregations': 'int',
    'dataset_type': 'str'
}


def process_batch(batch):
    """Clean one batch and coerce every column to its target type.

    The DataFrame is modified in place and also returned.

    Conversion rules, selected per column via ``_COLUMN_TYPES``:

    * ``int`` / ``float``: missing values become 0, then the column is cast.
    * ``bool``: missing values become ``False``, then the column is cast.
    * ``datetime``: values are parsed (unparseable ones become missing) and
      rendered as ``'YYYY-MM-DD HH:MM:SS'`` text; sub-second precision is
      dropped by that format, and missing values stay missing.
    * anything else (including unknown columns): values are converted to
      stripped text; missing values and the literal texts ``'nan'``,
      ``'None'`` and ``'null'`` become the empty string.

    Args:
        batch: pandas DataFrame holding one chunk of the dataset.

    Returns:
        The same DataFrame object, with its columns converted.
    """
    for col in batch.columns:
        col_type = _COLUMN_TYPES.get(col, 'str')

        if col_type in ('int', 'float'):
            batch[col] = batch[col].fillna(0).astype(col_type)
        elif col_type == 'bool':
            batch[col] = batch[col].fillna(False).astype(bool)
        elif col_type == 'datetime':
            parsed = pd.to_datetime(batch[col], errors='coerce')
            batch[col] = parsed.dt.strftime('%Y-%m-%d %H:%M:%S')
        else:  # string
            # Record which cells are missing BEFORE stringifying: astype(str)
            # turns pd.NA into '<NA>' and NaT into 'NaT', which a text-based
            # check for 'nan'/'None' would let through into the output.
            was_missing = batch[col].isna()
            text = batch[col].astype(str).str.strip()
            # Literal null markers stored as text are blanked as well.
            text = text.replace({'nan': '', 'None': '', 'null': ''})
            batch[col] = text.mask(was_missing, '')
    return batch

# Function to process the entire Parquet file in batches
def process_parquet_file(input_file, output_file, batch_size=10000):
    """Stream a Parquet file through ``process_batch`` and write the result.

    The input is read in batches of at most ``batch_size`` rows so the whole
    file never has to be held in memory. Each batch is converted to pandas,
    cleaned by ``process_batch`` and appended to ``output_file``.

    Args:
        input_file: Path of the Parquet file to read.
        output_file: Path of the Parquet file to create.
        batch_size: Maximum number of rows per batch.

    Returns:
        None. Progress and the final status are reported on stdout.
    """
    # Open the Parquet file using ParquetFile
    parquet_file = pq.ParquetFile(input_file)

    # Expected number of batches (ceiling division); only sizes the progress bar
    total_rows = parquet_file.metadata.num_rows
    num_batches = (total_rows + batch_size - 1) // batch_size

    # The writer is created lazily because its schema comes from the first
    # processed batch.
    writer = None
    output_schema = None

    try:
        with tqdm(total=num_batches, desc='Processing batches', unit='batch') as pbar:
            for batch in parquet_file.iter_batches(batch_size=batch_size):
                # RecordBatch -> pandas -> cleaned pandas
                processed_batch = process_batch(batch.to_pandas())

                # Reuse the first batch's schema for every later batch so a
                # column that happens to be entirely null in one batch is not
                # inferred as a different Arrow type than the writer expects.
                # preserve_index=False keeps the per-batch pandas index out of
                # the output.
                table = pa.Table.from_pandas(
                    processed_batch, schema=output_schema, preserve_index=False
                )

                if writer is None:
                    output_schema = table.schema
                    writer = pq.ParquetWriter(output_file, output_schema)

                writer.write_table(table)
                pbar.update(1)
    finally:
        # Close the writer even if a batch failed, so the file handle is
        # released.
        if writer is not None:
            writer.close()

    if writer is None:
        # No batch was produced, so no output file was created.
        print(f"No rows found in {input_file}; nothing was written.")
        return

    print(f"Processed data written to {output_file}")

# Convert the configured input file, streaming it in 10,000-row batches
process_parquet_file(
    input_file=INPUT_PARQUET_FILE,
    output_file=OUTPUT_PARQUET_FILE,
    batch_size=10000,
)


Processing batches: 100%|██████████| 292/292 [00:17<00:00, 17.13batch/s]


Processed data written to static_data_ready2upload.parquet
