In [1]:
# Notebook: 08_ML_Model_Training.ipynb

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# --- 1. Configure and Start Spark Session ---
spark = SparkSession.builder \
    .appName("F1 ML Model Training") \
    .master("spark://spark-master:7077") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262") \
    .getOrCreate()

print("Spark session created!")

# --- 2. Load "Massive" Data from MinIO ---
# This is the key: We read the *entire directory* as one DataFrame
laps_dir = "s3a://raw-data/laps/"
print(f"Loading all lap data from {laps_dir}...")

df = spark.read.parquet(laps_dir)
print(f"Total laps loaded from all 24 races: {df.count()}")

# --- 3. Feature Engineering and Cleaning ---
print("Cleaning data and preparing features...")

# Select only the columns we need
# Label (what we predict): LapTime
# Features (what we use to predict): LapNumber, TyreLife, Compound
feature_df = df.select(
    "LapTime", 
    "LapNumber", 
    "TyreLife", 
    "Compound",
    "IsAccurate" # For filtering
)

# Clean the data
# 1. Filter for "good" laps only
clean_df = feature_df.filter(
    (F.col('IsAccurate') == True) &
    (F.col('LapTime').isNotNull()) &
    (F.col('LapNumber').isNotNull()) &
    (F.col('TyreLife').isNotNull()) &
    (F.col('Compound').isNotNull()) &
    (F.col('Compound') != 'UNKNOWN')
)

# 2. Drop any remaining nulls
clean_df = clean_df.na.drop()

print(f"Total 'clean' laps for training: {clean_df.count()}")
clean_df.show(5)

# --- 4. Define the ML Pipeline ---
print("Defining ML pipeline...")

# Stage 1: Convert 'Compound' (SOFT, MEDIUM, HARD) to a number (e.g., 0.0, 1.0, 2.0)
compound_indexer = StringIndexer(inputCol="Compound", outputCol="CompoundIndex")

# Stage 2: Assemble all feature columns into a single "features" vector
feature_cols = ["LapNumber", "TyreLife", "CompoundIndex"]
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Stage 3: (Optional but good practice) Scale features
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Stage 4: Define the Machine Learning model
# We're predicting LapTime (a number), so we use a regression model
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="LapTime")

# Chain all stages together into a single pipeline
pipeline = Pipeline(stages=[compound_indexer, vector_assembler, scaler, lr])

# --- 5. Train the Model ---
print("Splitting data and training the model...")
(training_data, test_data) = clean_df.randomSplit([0.8, 0.2], seed=42)

# This is the "product": the trained model
model = pipeline.fit(training_data)

print("Model training complete!")

# --- 6. Evaluate the Model ---
print("Evaluating model performance...")
predictions = model.transform(test_data)

# Show a few predictions
predictions.select("LapTime", "prediction", "Compound", "TyreLife").show(10)

# Get the Root Mean Squared Error (RMSE)
evaluator = RegressionEvaluator(labelCol="LapTime", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Model Performance (RMSE): {rmse}")
print("This means our model's predictions are, on average, off by {rmse} seconds.")

# --- 7. Save the "Product" (The Trained Model) ---
model_path = "s3a://processed-data/models/f1_laptime_model"
print(f"Saving model to {model_path}...")

model.write().overwrite().save(model_path)

print("--- ML Model Trained and Saved to MinIO! ---")
spark.stop()

Spark session created!
Loading all lap data from s3a://raw-data/laps/...
Total laps loaded from all 24 races: 27050
Cleaning data and preparing features...
Total 'clean' laps for training: 23976
+-------+---------+--------+------------+----------+
|LapTime|LapNumber|TyreLife|    Compound|IsAccurate|
+-------+---------+--------+------------+----------+
| 97.672|      2.0|     2.0|INTERMEDIATE|      true|
| 96.443|      3.0|     3.0|INTERMEDIATE|      true|
| 95.105|      4.0|     4.0|INTERMEDIATE|      true|
| 93.859|      5.0|     5.0|INTERMEDIATE|      true|
| 92.634|      6.0|     6.0|INTERMEDIATE|      true|
+-------+---------+--------+------------+----------+
only showing top 5 rows

Defining ML pipeline...
Splitting data and training the model...
Model training complete!
Evaluating model performance...
+-------+-----------------+--------+--------+
|LapTime|       prediction|Compound|TyreLife|
+-------+-----------------+--------+--------+
| 73.883|82.52775088951172|    SOFT|     2.