In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time

In [0]:
spark.sql("USE CATALOG hive_metastore")
spark.sql("USE default")

print("="*70)
print("ENVIRONMENT SETUP")
print("="*70)
print(f"✓ Current Catalog: {spark.sql('SELECT current_catalog()').collect()[0][0]}")
print(f"✓ Current Database: {spark.sql('SELECT current_database()').collect()[0][0]}")
print(f"✓ Spark version: {spark.version}")
print("="*70)

ENVIRONMENT SETUP
✓ Current Catalog: hive_metastore
✓ Current Database: default
✓ Spark version: 4.0.0


In [0]:
# ============================================
# Setup MLFlow
# ============================================

# Set experiment
username = spark.sql("SELECT current_user()").collect()[0][0]
experiment_name = f"/Users/{username}/nyc-taxi-prediction"

mlflow.set_experiment(experiment_name)
mlflow.spark.autolog()

print("="*70)
print("MLFLOW EXPERIMENT SETUP")
print("="*70)
print(f"Experiment: {experiment_name}")
print(f"Autologging: ENABLED")
print(f"MLFlow version: {mlflow.__version__}")
print("="*70)



MLFLOW EXPERIMENT SETUP
Experiment: /Users/hingushrey2707@gmail.com/nyc-taxi-prediction
Autologging: ENABLED
MLFlow version: 3.0.1


In [0]:
# ============================================
# Load ML Data
# ============================================

train = spark.table("taxi_ml_train")
test = spark.table("taxi_ml_test")

# Force evaluation and cache
train.cache().count()
test.cache().count()

print(f"\nTraining data: {train.count():,} samples")
print(f"Test data: {test.count():,} samples")

# Define evaluator
evaluator = RegressionEvaluator(
    labelCol="trip_duration_minutes",
    predictionCol="prediction"
)


Training data: 42,487,529 samples
Test data: 10,619,390 samples


In [0]:
# ============================================
# Model 1 - Linear Regression (Baseline)
# ============================================

print("\n" + "="*60)
print("MODEL 1: LINEAR REGRESSION (BASELINE)")
print("="*60)

with mlflow.start_run(run_name="01_linear_regression_baseline") as run:
    
    print(f"MLFlow Run ID: {run.info.run_id}")
    
    # Train model
    start_time = time.time()
    
    lr = LinearRegression(
        featuresCol="features",
        labelCol="trip_duration_minutes",
        maxIter=10,
        regParam=0.01
    )
    
    lr_model = lr.fit(train)
    training_time = time.time() - start_time
    
    # Predictions
    predictions_lr = lr_model.transform(test)
    
    # Evaluate
    rmse_lr = evaluator.evaluate(predictions_lr, {evaluator.metricName: "rmse"})
    mae_lr = evaluator.evaluate(predictions_lr, {evaluator.metricName: "mae"})
    r2_lr = evaluator.evaluate(predictions_lr, {evaluator.metricName: "r2"})
    
    # Log additional metrics
    mlflow.log_metric("training_time_seconds", training_time)
    mlflow.log_metric("rmse", rmse_lr)  
    mlflow.log_metric("mae", mae_lr)    
    mlflow.log_metric("r2", r2_lr)    
    mlflow.log_param("model_type", "Linear Regression")
    mlflow.log_param("distributed_training", "False")
    
    print(f"\nTraining time: {training_time:.2f} seconds")
    print(f"RMSE: {rmse_lr:.2f} minutes")
    print(f"MAE: {mae_lr:.2f} minutes")
    print(f"R²: {r2_lr:.4f}")


MODEL 1: LINEAR REGRESSION (BASELINE)
MLFlow Run ID: 1440703963464261b8376c9d0792e991

Training time: 44.88 seconds
RMSE: 7.30 minutes
MAE: 4.22 minutes
R²: 0.7151


In [0]:
# ============================================
# Model 2 - Random Forest
# DS FEATURE: DISTRIBUTED ML TRAINING (Data Parallelism)
# ============================================

print("\n" + "="*60)
print("MODEL 2: RANDOM FOREST (DISTRIBUTED TRAINING)")
print("="*60)

with mlflow.start_run(run_name="02_random_forest_v1") as run:
    
    print(f"MLFlow Run ID: {run.info.run_id}")
    print("\nTraining Random Forest with distributed data parallelism...")
    print("- Each worker trains subset of trees independently")
    print("- Trees aggregated at driver to form ensemble")
    
    # Train model
    start_time = time.time()
    
    rf = RandomForestRegressor(
        featuresCol="features",
        labelCol="trip_duration_minutes",
        numTrees=50,
        maxDepth=7,
        minInstancesPerNode=1,
        seed=42
    )
    
    rf_model = rf.fit(train)
    training_time = time.time() - start_time
    
    # Predictions
    predictions_rf = rf_model.transform(test)
    
    # Evaluate
    rmse_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "rmse"})
    mae_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "mae"})
    r2_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "r2"})
    
    # Log metrics
    mlflow.log_metric("training_time_seconds", training_time)
    mlflow.log_metric("rmse", rmse_rf)  
    mlflow.log_metric("mae", mae_rf)    
    mlflow.log_metric("r2", r2_rf)      
    mlflow.log_param("model_type", "Random Forest")
    mlflow.log_param("distributed_training", "True")
    
    # Log feature importance
    feature_importance = rf_model.featureImportances.toArray()
    for idx, importance in enumerate(feature_importance):
        mlflow.log_metric(f"feature_{idx}_importance", float(importance))
    
    print(f"\nTraining time: {training_time:.2f} seconds")
    print(f"RMSE: {rmse_rf:.2f} minutes")
    print(f"MAE: {mae_rf:.2f} minutes")
    print(f"R²: {r2_rf:.4f}")
    
    # Show feature importance
    print("\nTop 5 Feature Importances:")
    feature_names = ["trip_distance", "passenger_count", "fare_amount", 
                     "hour_of_day", "day_of_week", "is_weekend", "PULocationID_idx", "DOLocationID_idx"]
    importance_pairs = list(zip(feature_names, feature_importance))
    importance_pairs.sort(key=lambda x: x[1], reverse=True)
    for name, imp in importance_pairs[:5]:
        print(f"  {name}: {imp:.4f}")


MODEL 2: RANDOM FOREST (DISTRIBUTED TRAINING)
MLFlow Run ID: 0af8d89acebb4a67958621b6fccdf8e8

Training Random Forest with distributed data parallelism...
- Each worker trains subset of trees independently
- Trees aggregated at driver to form ensemble

Training time: 809.72 seconds
RMSE: 5.17 minutes
MAE: 2.55 minutes
R²: 0.8571

Top 5 Feature Importances:
  fare_amount: 0.5900
  trip_distance: 0.2691
  PULocationID_idx: 0.0806
  hour_of_day: 0.0281
  DOLocationID_idx: 0.0259


In [0]:
# ============================================
# Model 3 - Gradient Boosted Trees
# ============================================

print("\n" + "="*60)
print("MODEL 3: GRADIENT BOOSTED TREES")
print("="*60)

with mlflow.start_run(run_name="03_gbt_v1") as run:
    
    print(f"MLFlow Run ID: {run.info.run_id}")
    
    # Train model
    start_time = time.time()
    
    gbt = GBTRegressor(
        featuresCol="features",
        labelCol="trip_duration_minutes",
        maxIter=20,
        maxDepth=5,
        seed=42
    )
    
    gbt_model = gbt.fit(train)
    training_time = time.time() - start_time
    
    # Predictions
    test_sample = test.sample(fraction=0.3, seed=42)
    predictions_gbt = gbt_model.transform(test_sample)
    
    # Evaluate
    rmse_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "rmse"})
    mae_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "mae"})
    r2_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "r2"})
    
    # Log metrics
    mlflow.log_metric("training_time_seconds", training_time)
    mlflow.log_metric("rmse", rmse_gbt)  
    mlflow.log_metric("mae", mae_gbt)    
    mlflow.log_metric("r2", r2_gbt)     
    mlflow.log_param("model_type", "Gradient Boosted Trees")
    
    print(f"\nTraining time: {training_time:.2f} seconds")
    print(f"RMSE: {rmse_gbt:.2f} minutes")
    print(f"MAE: {mae_gbt:.2f} minutes")
    print(f"R²: {r2_gbt:.4f}")


MODEL 3: GRADIENT BOOSTED TREES
MLFlow Run ID: 9254956582fb4d19bec2029d0e6f6fb4

Training time: 3061.72 seconds
RMSE: 4.79 minutes
MAE: 2.22 minutes
R²: 0.8776


In [0]:
# ============================================
# Compare All Models
# ============================================

print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)

comparison_data = [
    ("Linear Regression", rmse_lr, mae_lr, r2_lr),
    ("Random Forest", rmse_rf, mae_rf, r2_rf),
    ("Gradient Boosted Trees", rmse_gbt, mae_gbt, r2_gbt)
]

print(f"\n{'Model':<25} {'RMSE (min)':<15} {'MAE (min)':<15} {'R²':<10}")
print("-" * 70)
for model, rmse, mae, r2 in comparison_data:
    print(f"{model:<25} {rmse:<15.2f} {mae:<15.2f} {r2:<10.4f}")

# Find best model
best_model_idx = min(range(len(comparison_data)), key=lambda i: comparison_data[i][1])
best_model_name = comparison_data[best_model_idx][0]

print("\n" + "="*70)
print(f"BEST MODEL: {best_model_name}")
print(f"RMSE: {comparison_data[best_model_idx][1]:.2f} minutes")
print("="*70)



MODEL COMPARISON SUMMARY

Model                     RMSE (min)      MAE (min)       R²        
----------------------------------------------------------------------
Linear Regression         7.30            4.22            0.7151    
Random Forest             5.17            2.55            0.8571    
Gradient Boosted Trees    4.79            2.22            0.8776    

BEST MODEL: Gradient Boosted Trees
RMSE: 4.79 minutes


In [0]:
# ============================================
# Store Best Model Metadata for Registry
# ============================================

print("\n" + "="*70)
print("STORING BEST MODEL METADATA")
print("="*70)

# Find best model by lowest RMSE
best_metrics = {
    "linear_regression": {"rmse": rmse_lr, "mae": mae_lr, "r2": r2_lr},
    "random_forest": {"rmse": rmse_rf, "mae": mae_rf, "r2": r2_rf},
    "gradient_boosted_trees": {"rmse": rmse_gbt, "mae": mae_gbt, "r2": r2_gbt}
}

# Find model with lowest RMSE
best_model_name = min(best_metrics.keys(), key=lambda k: best_metrics[k]["rmse"])
best_model_metrics = best_metrics[best_model_name]

print(f"\nBest Model: {best_model_name}")
print(f"  RMSE: {best_model_metrics['rmse']:.2f} minutes")
print(f"  MAE: {best_model_metrics['mae']:.2f} minutes")
print(f"  R²: {best_model_metrics['r2']:.4f}")

# Save to Delta table for easy retrieval
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

schema = StructType([
    StructField("model_name", StringType(), False),
    StructField("rmse", DoubleType(), False),
    StructField("mae", DoubleType(), False),
    StructField("r2", DoubleType(), False),
    StructField("timestamp", StringType(), False)
])

from datetime import datetime

model_comparison_data = [
    ("linear_regression", rmse_lr, mae_lr, r2_lr, datetime.now().isoformat()),
    ("random_forest", rmse_rf, mae_rf, r2_rf, datetime.now().isoformat()),
    ("gradient_boosted_trees", rmse_gbt, mae_gbt, r2_gbt, datetime.now().isoformat())
]

model_comparison_df = spark.createDataFrame(model_comparison_data, schema=schema)

# Save to Delta table
model_comparison_df.write.format("delta").mode("overwrite").saveAsTable("model_comparison_metrics")

print("\nModel metrics saved to table: model_comparison_metrics")


STORING BEST MODEL METADATA

Best Model: gradient_boosted_trees
  RMSE: 4.79 minutes
  MAE: 2.22 minutes
  R²: 0.8776

Model metrics saved to table: model_comparison_metrics


In [0]:
# ============================================
# Save Best Model (Dynamic Selection)
# ============================================

print(f"\nSaving best model ({best_model_name})...")

# Save the actual best model based on metrics
if best_model_name == "linear_regression":
    best_model_object = lr_model
elif best_model_name == "random_forest":
    best_model_object = rf_model
else:  # gradient_boosted_trees
    best_model_object = gbt_model

# Save to DBFS with dynamic name
model_save_path = f"/mnt/taxi-data/models/best_{best_model_name}"
best_model_object.write().overwrite().save(model_save_path)

print(f"Best model saved to: {model_save_path}")
print(f"   Model Type: {best_model_name}")
print(f"   RMSE: {best_model_metrics['rmse']:.2f} minutes")



Saving best model (gradient_boosted_trees)...
Best model saved to: /mnt/taxi-data/models/best_gradient_boosted_trees
   Model Type: gradient_boosted_trees
   RMSE: 4.79 minutes
