In [1]:
import pandas as pd

# Load the Parquet file into a pandas DataFrame.
parquet_path = "full.parquet"
df = pd.read_parquet(parquet_path)

# Print every column name next to its pandas dtype.
column_dtypes = df.dtypes
print(column_dtypes)

instance_id                               int32
cluster_size                            float64
user_id                                   int64
database_id                               int64
query_id                                  int64
arrival_timestamp                datetime64[us]
compile_duration_ms                     float64
queue_duration_ms                         int64
execution_duration_ms                     int64
feature_fingerprint                      object
was_aborted                               int32
was_cached                                int32
cache_source_query_id                   float64
query_type                               object
num_permanent_tables_accessed           float64
num_external_tables_accessed            float64
num_system_tables_accessed              float64
read_table_ids                           object
write_table_ids                          object
mbytes_scanned                          float64
mbytes_spilled                          

In [8]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under the app name
# "ParquetFileExplorer" (getOrCreate reuses an active session if one exists).
spark = (
    SparkSession.builder
    .appName("ParquetFileExplorer")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/22 17:04:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# Read "full.parquet" through Spark. Note: this rebinds `df`, which the
# pandas cell earlier in the notebook also assigns.
parquet_reader = spark.read
df = parquet_reader.parquet("full.parquet")

                                                                                

In [10]:
# Print the first 20 rows of the Spark DataFrame as a text table.
df.show(n=20)

                                                                                

+-----------+------------+-------+-----------+--------+--------------------+-------------------+-----------------+---------------------+--------------------+-----------+----------+---------------------+----------+-----------------------------+----------------------------+--------------------------+--------------------+---------------+--------------+--------------+---------+---------+----------------+
|instance_id|cluster_size|user_id|database_id|query_id|   arrival_timestamp|compile_duration_ms|queue_duration_ms|execution_duration_ms| feature_fingerprint|was_aborted|was_cached|cache_source_query_id|query_type|num_permanent_tables_accessed|num_external_tables_accessed|num_system_tables_accessed|      read_table_ids|write_table_ids|mbytes_scanned|mbytes_spilled|num_joins|num_scans|num_aggregations|
+-----------+------------+-------+-----------+--------+--------------------+-------------------+-----------------+---------------------+--------------------+-----------+----------+------------

In [4]:
from pyspark.sql import SparkSession
from kafka import KafkaProducer
import json
from datetime import datetime
import time

# Function to convert row to JSON serializable format
def row_to_dict(row):
    """Convert a row into a dict that ``json.dumps`` can serialize.

    Args:
        row: Any object exposing ``asDict()`` that returns a mapping of
            column name to value (the notebook passes rows collected from
            a Spark DataFrame).

    Returns:
        A new dict with the same keys. ``datetime`` and ``date`` values are
        replaced by their ``isoformat()`` strings; every other value is
        passed through unchanged.
    """
    # Local import: the notebook's import cell only brings in ``datetime``.
    from datetime import date

    # ``datetime`` is a subclass of ``date``, so a single check covers both.
    # Plain ``date`` values would otherwise reach ``json.dumps`` untouched
    # and raise TypeError in the producer's value serializer.
    return {
        key: value.isoformat() if isinstance(value, date) else value
        for key, value in row.asDict().items()
    }

# Create Spark session
spark = (
    SparkSession.builder
    .appName("KafkaPublisher")
    .getOrCreate()
)

# Load your dataset into a Spark DataFrame
df = spark.read.parquet("full.parquet")

# Publishing parameters. NUM_BATCHES = 1 matches what the cell actually did
# before (its `while True` loop ended in an unconditional `break`); raise it
# to re-publish the same rows several times, BATCH_INTERVAL_S seconds apart.
KAFKA_TOPIC = 'sagnik'
NUM_BATCHES = 1
BATCH_INTERVAL_S = 5

# Initialize Kafka producer; each message dict is JSON-encoded to UTF-8 bytes
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

try:
    # Select the first 20 rows once; every batch re-sends this same list
    first_20_rows = df.limit(20).collect()

    for batch_index in range(NUM_BATCHES):
        # Publish each row to Kafka
        for row in first_20_rows:
            message = row_to_dict(row)
            print(f"Broadcasting message: {message}")
            producer.send(KAFKA_TOPIC, message)

        # Pause only between batches; sleeping after the final batch would
        # just delay the flush/close below.
        if batch_index < NUM_BATCHES - 1:
            time.sleep(BATCH_INTERVAL_S)
finally:
    # Always flush and close the producer, even if collecting or sending
    # raised.
    try:
        producer.flush()
    finally:
        producer.close()

Broadcasting message: {'instance_id': 0, 'cluster_size': None, 'user_id': 1, 'database_id': 0, 'query_id': 4241475, 'arrival_timestamp': '2024-02-29T23:59:59.462678', 'compile_duration_ms': 8265.0, 'queue_duration_ms': 0, 'execution_duration_ms': 8281, 'feature_fingerprint': '42e3eae744df217d738077f11c29c7fe25d752f66229273a7eda9be6aa2c34d1', 'was_aborted': 0, 'was_cached': 0, 'cache_source_query_id': None, 'query_type': 'select', 'num_permanent_tables_accessed': 0.0, 'num_external_tables_accessed': 0.0, 'num_system_tables_accessed': 15.0, 'read_table_ids': None, 'write_table_ids': None, 'mbytes_scanned': 0.0, 'mbytes_spilled': 0.0, 'num_joins': 22, 'num_scans': 0, 'num_aggregations': 23}
Broadcasting message: {'instance_id': 0, 'cluster_size': None, 'user_id': 1, 'database_id': 0, 'query_id': 3379841, 'arrival_timestamp': '2024-03-01T00:00:00.503618', 'compile_duration_ms': 8220.0, 'queue_duration_ms': 0, 'execution_duration_ms': 8235, 'feature_fingerprint': '42e3eae744df217d738077f11c