In [1]:
from pyspark.sql import SparkSession

# Obtain a Spark session (an already-running one is reused by getOrCreate)
spark = (
    SparkSession.builder
    .appName("ParquetExplorer")
    .getOrCreate()
)

# Load the parquet data set into a DataFrame
df = spark.read.parquet("sorted.parquet")

# Preview the first five rows, then print the column names and types
df.show(n=5)
df.printSchema()

# Exploration is finished: shut the session down.
# `df` is bound to this session and cannot be used after this point.
spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/12 09:53:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-----------+------------+-------+-----------+--------+--------------------+-------------------+-----------------+---------------------+--------------------+-----------+----------+---------------------+----------+-----------------------------+----------------------------+--------------------------+--------------+---------------+--------------+--------------+---------+---------+----------------+---------+
|instance_id|cluster_size|user_id|database_id|query_id|   arrival_timestamp|compile_duration_ms|queue_duration_ms|execution_duration_ms| feature_fingerprint|was_aborted|was_cached|cache_source_query_id|query_type|num_permanent_tables_accessed|num_external_tables_accessed|num_system_tables_accessed|read_table_ids|write_table_ids|mbytes_scanned|mbytes_spilled|num_joins|num_scans|num_aggregations|       rn|
+-----------+------------+-------+-----------+--------+--------------------+-------------------+-----------------+---------------------+--------------------+-----------+----------+----

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    hour, 
    mean, 
    sum, 
    count, 
    date_format, 
    concat, 
    col, 
    lit,  # Added lit
    lpad  # Added lpad
)

# Obtain a Spark session for the hourly aggregation (getOrCreate reuses a live one)
spark = (
    SparkSession.builder
    .appName("HourlyAnalysis")
    .getOrCreate()
)

# Read the parquet file
df = spark.read.parquet("sorted.parquet")

# Grouping key of the form "yyyyMMdd HH" (e.g. "20240229 23"): the calendar
# date of arrival_timestamp, a space, and the hour left-padded to two digits
# so that the string ordering used by orderBy below is also chronological.
date_hour_col = concat(
    date_format(col("arrival_timestamp"), "yyyyMMdd"),
    lit(" "),
    lpad(hour(col("arrival_timestamp")).cast("string"), 2, "0"),
).alias("date_hour")

# One output row per date/hour bucket.
# NOTE: `mean`, `sum` and `count` here are the pyspark.sql.functions column
# aggregates imported at the top of this cell, not the Python builtins.
hourly_stats = (
    df.groupBy(date_hour_col)
    .agg(
        # Performance metrics
        mean("compile_duration_ms").alias("avg_compile_time"),
        mean("queue_duration_ms").alias("avg_queue_time"),
        mean("execution_duration_ms").alias("avg_execution_time"),

        # Query characteristics
        mean("cluster_size").alias("avg_cluster_size"),
        # Summing the flag columns counts the flagged queries; this assumes the
        # flags are stored as 0/1 numerics -- TODO confirm against the schema.
        sum("was_cached").alias("total_cached_queries"),
        sum("was_aborted").alias("total_aborted_queries"),

        # Resource usage
        mean("mbytes_scanned").alias("avg_mbytes_scanned"),
        mean("mbytes_spilled").alias("avg_mbytes_spilled"),

        # Query complexity metrics
        mean("num_joins").alias("avg_joins"),
        mean("num_scans").alias("avg_scans"),
        mean("num_aggregations").alias("avg_aggregations"),

        # Count of queries
        count("*").alias("total_queries"),
    )
    .orderBy("date_hour")
)

# Display the first 24 hourly buckets without truncating cell values
hourly_stats.show(24, truncate=False)

# Persist the aggregation as CSV. Spark writes a *directory* named
# "hourly_analysis.csv" containing part files. The default save mode raises
# if that path already exists, which made this cell fail on every re-run
# (including Restart & Run All); overwrite makes the cell idempotent.
hourly_stats.write.csv("hourly_analysis.csv", header=True, mode="overwrite")

# Release the session now that the results have been written
spark.stop()

                                                                                

+-----------+------------------+------------------+------------------+------------------+--------------------+---------------------+------------------+------------------+-------------------+------------------+------------------+-------------+
|date_hour  |avg_compile_time  |avg_queue_time    |avg_execution_time|avg_cluster_size  |total_cached_queries|total_aborted_queries|avg_mbytes_scanned|avg_mbytes_spilled|avg_joins          |avg_scans         |avg_aggregations  |total_queries|
+-----------+------------------+------------------+------------------+------------------+--------------------+---------------------+------------------+------------------+-------------------+------------------+------------------+-------------+
|20240229 23|776.3225806451613 |0.0               |2565.232558139535 |7.565217391304348 |12                  |1                    |3976.516129032258 |0.0               |0.9069767441860465 |1.0465116279069768|1.8837209302325582|43           |
|20240301 00|3136.0850959884

                                                                                