In [None]:
# Notebook: 05_Spark_Analysis.ipynb

import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# --- 1. Configure and Start Spark Session ---
# MinIO connection settings. Each one can be overridden through an environment
# variable so credentials do not have to live in the notebook; the fallbacks
# are the values this notebook has always used, so behaviour is unchanged when
# the variables are not set.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("F1 Telemetry Analysis")
    .master("spark://spark-master:7077")
    # S3A filesystem settings so "s3a://" paths resolve against MinIO.
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Maven coordinates of the jars that provide the S3A filesystem class above.
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262",
    )
    .getOrCreate()
)

print("Spark session created!")

# Everything that uses the session runs inside try/finally so the session is
# stopped even when the read, the count or the write raises.
try:
    # --- 2. Load Data from MinIO into Spark ---
    print("Loading telemetry data from S3a...")
    telemetry_spark_df = spark.read.parquet(
        "s3a://raw-data/2024_bahrain_Q_telemetry.parquet"
    )

    # Quick sanity check of what was loaded: schema plus the first five rows.
    telemetry_spark_df.printSchema()
    telemetry_spark_df.show(5)

    # --- 3. Process Data ---
    # For this analysis, the "processing" is simply selecting the columns
    # needed for the plot and ordering the rows by driver and distance.
    processed_tel_df = telemetry_spark_df.select(
        "Driver",
        "Distance",
        "Speed",
        "Throttle",
        "Brake",
    ).orderBy("Driver", "Distance")

    # count() is an action: it runs a Spark job to compute the row total.
    print(f"Total rows: {processed_tel_df.count()}")

    # --- 4. Save Processed Data back to MinIO ---
    print("Saving processed data...")
    # mode("overwrite") replaces any existing output at the target path.
    processed_tel_df.write.mode("overwrite").parquet(
        "s3a://processed-data/bahrain_Q_telemetry_processed.parquet"
    )

    print("Analysis complete and results saved!")
finally:
    # Always stop the session, on success and on failure alike.
    spark.stop()