# Final version: stream Parquet record batches to Kafka with PyArrow

In [None]:
import pyarrow.parquet as pq
import confluent_kafka
import json
from datetime import datetime
import time
import pandas as pd

# Custom JSON encoder to handle Timestamp
def json_serializer(obj):
    """Fallback encoder for ``json.dumps(..., default=json_serializer)``.

    Returns the ISO-8601 string of *obj* when it is a ``datetime`` instance
    (including subclasses). Any other type is rejected with ``TypeError``,
    which is how ``json`` expects a ``default`` hook to signal an
    unsupported value.
    """
    # Guard clause: reject everything that is not a datetime up front.
    if not isinstance(obj, datetime):
        raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')
    return obj.isoformat()

def stream_parquet_to_kafka(parquet_file_path, kafka_topic, bootstrap_servers):
    """Stream a Parquet file to a Kafka topic, one JSON message per batch.

    The file is read in record batches of 800 rows. For each batch a fixed
    set of columns is dropped, the remaining rows are serialized as a JSON
    array of row objects, and that array is produced as a single Kafka
    message. The producer is flushed after every batch, so each batch is
    delivered (or reported as failed) before the next one is read.
    Timing statistics are printed when the file has been fully streamed.

    Args:
        parquet_file_path: Path of the Parquet file to read.
        kafka_topic: Name of the Kafka topic to produce to.
        bootstrap_servers: Value for the producer's ``bootstrap.servers``
            setting.

    Raises:
        KeyError: If a batch lacks one of the columns to drop (raised by
            ``DataFrame.drop``).
        TypeError: If a value in a batch is neither natively JSON
            serializable nor handled by ``json_serializer``.
    """
    # Start total time tracking
    total_start_time = time.time()
    print(f"Starting to stream Parquet file: {parquet_file_path}")

    # Configure Kafka producer
    producer_config = {
        'bootstrap.servers': bootstrap_servers,
        'client.id': 'parquet-streamer',
        'batch.size': 50000000,       # Increase batch size (50MB)
        'linger.ms': 100,              # Small delay for better batching
        'compression.type': 'gzip',    # Compress messages
        'message.max.bytes': 104857600, # 100MB
        'queue.buffering.max.messages': 1000000, # Handle large queues
    }
    producer = confluent_kafka.Producer(producer_config)

    # Open Parquet file and get file metadata
    parquet_file = pq.ParquetFile(parquet_file_path)
    total_rows = parquet_file.metadata.num_rows
    print(f"Total rows in Parquet file: {total_rows}")

    # Columns removed from every batch before it is serialized. Built once
    # here instead of on every loop iteration.
    columns_to_drop = [
        "mbytes_scanned",
        "instance_id",
        "cluster_size",
        "database_id",
        "queue_duration_ms",
        "feature_fingerprint",
        "was_aborted",
        "cache_source_query_id",
        "num_permanent_tables_accessed",
        "num_external_tables_accessed",
        "num_system_tables_accessed",
        "mbytes_spilled",
        "num_aggregations",
        "rn",
    ]

    # Tracking variables
    batch_count = 0
    total_rows_processed = 0
    total_batch_time = 0
    total_kafka_time = 0
    delivery_failures = 0

    def _on_delivery(err, _msg):
        # Invoked by the client from flush()/poll(); without a callback a
        # failed delivery would go completely unnoticed.
        nonlocal delivery_failures
        if err is not None:
            delivery_failures += 1
            print(f"Message delivery failed: {err}")

    # Iterate through batches instead of individual rows
    for batch in parquet_file.iter_batches(batch_size=800):
        batch_count += 1
        batch_start_time = time.time()

        # Convert batch to pandas for easier processing
        df_batch = batch.to_pandas().drop(columns=columns_to_drop)

        # Convert entire batch to JSON and stream as a single message
        kafka_start_time = time.time()
        batch_messages = json.dumps(df_batch.to_dict('records'), default=json_serializer)
        producer.produce(
            topic=kafka_topic,
            value=batch_messages.encode('utf-8'),
            on_delivery=_on_delivery,
        )

        # Flush to ensure messages are sent (also serves delivery callbacks)
        producer.flush()

        total_kafka_time += time.time() - kafka_start_time
        total_rows_processed += len(df_batch)
        total_batch_time += time.time() - batch_start_time

    # Final flush instead of producer.close(): close() is not available on
    # the Producer in many confluent-kafka releases (AttributeError there),
    # while flush() is the documented way to wait for outstanding messages.
    producer.flush()

    # Print final statistics
    total_runtime = time.time() - total_start_time
    # Guard against a file that yields no batches (avoids dividing by zero).
    average_batch_time = total_batch_time / batch_count if batch_count else 0.0
    print("\n--- Streaming Complete ---")
    print(f"Total runtime: {total_runtime:.2f} seconds")
    print(f"Total rows processed: {total_rows_processed}")
    print(f"Total Kafka production time: {total_kafka_time:.2f} seconds")
    print(f"Average batch processing time: {average_batch_time:.2f} seconds")
    if delivery_failures:
        print(f"Messages that failed delivery: {delivery_failures}")

# Example usage: stream the sorted Parquet file to the local Kafka broker.
stream_parquet_to_kafka(
    '../sorted_p.parquet',  # parquet_file_path
    'muhid',                # kafka_topic
    'localhost:9092',       # bootstrap_servers
)

Starting to stream Parquet file: ../sorted_p.parquet
Total rows in Parquet file: 433107811
