In [0]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from pyspark.sql import functions as F
import pandas as pd

# Training settings, defined once so the values used for the split/model and
# the values logged to MLflow cannot drift apart.
N_ESTIMATORS = 50
RANDOM_STATE = 42
TEST_SIZE = 0.2
FEATURE_COLUMNS = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "sepal_area",
]
TARGET_COLUMN = "class"

# Load data from Delta table.
# `spark` is not defined in this cell; it is expected to be the session object
# supplied by the notebook runtime.
iris_df = spark.table("dev.iris_data")

# Feature engineering: add 'sepal_area' column (sepal_length * sepal_width)
iris_df = iris_df.withColumn(
    "sepal_area", F.col("sepal_length") * F.col("sepal_width")
)

# Convert to pandas DataFrame for sklearn (collects the whole table locally).
# NOTE(review): the split below is only reproducible if the rows arrive in a
# stable order; Spark table reads may not guarantee one -- confirm.
iris_pd = iris_df.toPandas()

# Define features and target variable
X = iris_pd[FEATURE_COLUMNS]
y = iris_pd[TARGET_COLUMN]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Train a Random Forest model with MLflow tracking
with mlflow.start_run(run_name="rf_dev"):
    model = RandomForestClassifier(
        n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Infer model signature for logging (input/output schema) from a small
    # sample of the training features and the model's predictions on it.
    example_input = X_train.head(5)
    signature = infer_signature(example_input, model.predict(example_input))

    # Log the value the estimator was actually built with, rather than a
    # second hard-coded literal that could fall out of sync.
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.log_metric("accuracy", acc)

    # Log the fitted model as a run artifact and register it under
    # "iris_classifier_dev" in the model registry.
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="iris_classifier_dev",
        signature=signature,
        input_example=example_input,
    )

print("Model accuracy:", acc)
