In [2]:
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# --- 1. Configure and Start Spark Session ---
# Connection settings for the S3-compatible MinIO store. Each value can be
# overridden through an environment variable so credentials do not have to be
# edited in the source; the defaults are the values that were previously
# hard-coded here, so behaviour is unchanged when the variables are unset.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# spark.jars.packages lists the Maven coordinates (hadoop-aws and the AWS SDK
# bundle) that provide the S3A filesystem used by the s3a:// paths in this
# script.
spark = (
    SparkSession.builder
    .appName("F1 Analysis")
    .master("spark://spark-master:7077")
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Address buckets as <endpoint>/<bucket> instead of as a subdomain.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262",
    )
    .getOrCreate()
)

print("Spark session created!")
sc = spark.sparkContext

# --- 2. Load Data from MinIO into Spark ---
print("Loading data from S3a...")
# Read the raw lap records from the "raw-data" bucket as a Spark DataFrame.
laps_spark_df = spark.read.format("parquet").load(
    "s3a://raw-data/2024_bahrain_laps.parquet"
)
# Print the column names/types and a five-row preview of the loaded data.
laps_spark_df.printSchema()
laps_spark_df.show(n=5)

# --- 3. Perform Analysis (Tire Degradation Model) ---
# This is a sample analysis. You can build on this.

# Keep only laps where neither a pit entry nor a pit exit was recorded
# (i.e. drop in/out laps) and where a lap time is present.
stint_laps = laps_spark_df.filter(
    F.col('PitInTime').isNull()
    & F.col('PitOutTime').isNull()
    & F.col('LapTime').isNotNull()
)

# Sanity check on how many rows the filter removed. Each count() is a Spark
# action and triggers its own job.
print(f"Total laps loaded: {laps_spark_df.count()}")
print(f"Laps after filtering: {stint_laps.count()}")

# 'LapTime' is loaded as a double (per the original note it already holds
# seconds), so it only needs renaming for clarity. The rename is done exactly
# once: a second withColumnRenamed on the same name would be a no-op.
stint_laps = stint_laps.withColumnRenamed('LapTime', 'LapTimeSeconds')

print("Filtered for non-pit laps.")

# Summarise each (Driver, Stint) pair: mean, fastest and slowest lap time plus
# the number of rows with a non-null LapNumber, sorted by driver then stint.
stint_analysis = (
    stint_laps
    .groupBy("Driver", "Stint")
    .agg(
        F.avg("LapTimeSeconds").alias("AvgLapTime"),
        F.min("LapTimeSeconds").alias("MinLapTime"),
        F.max("LapTimeSeconds").alias("MaxLapTime"),
        F.count("LapNumber").alias("LapsInStint"),
    )
    .sort("Driver", "Stint")
)

print("Tire stint analysis:")
# Preview the first 20 summary rows.
stint_analysis.show(n=20)

# --- 4. Save Processed Data back to MinIO ---
print("Saving processed data...")
# mode="overwrite" replaces any output already present at the target path.
stint_analysis.write.parquet(
    "s3a://processed-data/bahrain_stint_analysis.parquet",
    mode="overwrite",
)

print("Analysis complete and results saved!")

# Shut down the Spark session once the results have been written.
spark.stop()

Spark session created!
Loading data from S3a...
root
 |-- Time: double (nullable = true)
 |-- Driver: string (nullable = true)
 |-- DriverNumber: string (nullable = true)
 |-- LapTime: double (nullable = true)
 |-- LapNumber: double (nullable = true)
 |-- Stint: double (nullable = true)
 |-- PitOutTime: double (nullable = true)
 |-- PitInTime: double (nullable = true)
 |-- Sector1Time: double (nullable = true)
 |-- Sector2Time: double (nullable = true)
 |-- Sector3Time: double (nullable = true)
 |-- Sector1SessionTime: double (nullable = true)
 |-- Sector2SessionTime: double (nullable = true)
 |-- Sector3SessionTime: double (nullable = true)
 |-- SpeedI1: double (nullable = true)
 |-- SpeedI2: double (nullable = true)
 |-- SpeedFL: double (nullable = true)
 |-- SpeedST: double (nullable = true)
 |-- IsPersonalBest: boolean (nullable = true)
 |-- Compound: string (nullable = true)
 |-- TyreLife: double (nullable = true)
 |-- FreshTyre: boolean (nullable = true)
 |-- Team: string (nullab