### Azure Databricks Notebook: Serverless ETL & ML on NYC Taxi Data
**Goal**: Load, clean, and transform Yellow Taxi Trip data, then train a KMeans model using Spark MLlib.
![](path)

In [0]:
from pyspark.sql.functions import *
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
import mlflow.spark


In [0]:
# Read the Yellow Taxi CSV from DBFS, treating the first row as the header and
# letting Spark infer each column's type, then show the schema and a sample.
raw_csv_path = "dbfs:/FileStore/shared_uploads/yellow_tripdata_2024_01-1.csv"
df = spark.read.csv(raw_csv_path, header=True, inferSchema=True)
df.printSchema()
df.show(5)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- Airport_fee: string (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+

### Data Cleaning

In [0]:
# Keep only trips with a positive distance, fare and passenger count, then
# discard every row that still contains a null in any column.
has_distance = col("trip_distance") > 0
has_fare = col("fare_amount") > 0
has_passengers = col("passenger_count") > 0

df_clean = df.filter(has_distance & has_fare & has_passengers).dropna()

# Summary statistics for the three columns of interest after cleaning.
summary_cols = ["trip_distance", "fare_amount", "passenger_count"]
df_clean.select(*summary_cols).describe().show()

+-------+-----------------+------------------+------------------+
|summary|    trip_distance|       fare_amount|   passenger_count|
+-------+-----------------+------------------+------------------+
|  count|          2723805|           2723805|           2723805|
|   mean|3.302459328035661|18.442573117384878|1.3547287709656162|
| stddev|12.32758977075121|17.436109682284737|0.8448002586726229|
|    min|             0.01|              0.01|                 1|
|    max|         15400.32|            2221.3|                 9|
+-------+-----------------+------------------+------------------+



### Feature Engineering

In [0]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import mlflow
import mlflow.spark

# passenger_count was read as a string column, so convert it to an integer
# before using it as a numeric feature.
df_clean = df_clean.withColumn("passenger_count", df_clean["passenger_count"].cast("int"))

# Pack the three numeric columns into a single vector column.
feature_columns = ["trip_distance", "fare_amount", "passenger_count"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_raw")
df_vector = assembler.transform(df_clean)

# Standardise each feature (centre on the mean, scale to unit standard deviation).
# NOTE(review): the scaler is fit on the full dataset before the train/test split
# in a later cell, so test rows influence the scaling statistics — consider
# fitting on the training split only.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

### Train KMeans Clustering Model

In [0]:
# Split the scaled rows 80/20 into training and test sets, using a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = df_scaled.randomSplit(split_weights, seed=42)

In [0]:
from pyspark.ml.clustering import KMeans

# Number of clusters; reused by later cells when logging the run.
k = 4

# Fit KMeans on the scaled training features with a fixed seed.
kmeans = KMeans(
    k=k,
    seed=42,
    featuresCol="features",
    predictionCol="prediction",
)
model = kmeans.fit(train_data)

In [0]:
# Score the held-out test rows with the fitted KMeans model; the result keeps
# the input columns and adds the `prediction` column.
predictions = model.transform(test_data)
# Printing a Spark DataFrame shows its column names and types, not its rows.
print(predictions)

DataFrame[VendorID: int, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: int, trip_distance: double, RatecodeID: string, store_and_fwd_flag: string, PULocationID: int, DOLocationID: int, payment_type: int, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: string, Airport_fee: string, features_raw: vector, features: vector, prediction: int]


In [0]:
# Evaluate with silhouette score
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the test-set cluster assignments with the silhouette metric.
evaluator = ClusteringEvaluator(
    metricName="silhouette",
    featuresCol="features",
    predictionCol="prediction",
)
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette Score (Test Data): {silhouette}")

Silhouette Score (Test Data): 0.8409256608088838


In [0]:
#Log with MLFlow
import mlflow
import mlflow.spark

# Record the cluster count, the test silhouette score and the fitted model
# in a named MLflow run under the shared experiment.
experiment_path = "/Shared/NYC_Taxi_KMeans_FullPipeline_Cleaned"
mlflow.set_experiment(experiment_path)

with mlflow.start_run(run_name="Final_KMeans_Run"):
    mlflow.log_param(key="k", value=k)
    mlflow.log_metric(key="silhouette", value=silhouette)
    mlflow.spark.log_model(model, "kmeans_model")

2025/07/15 11:17:14 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


In [0]:
# Preview ten test rows: the three raw features next to the assigned cluster.
preview_cols = ["trip_distance", "fare_amount", "passenger_count", "prediction"]
predictions.select(*preview_cols).show(10)

+-------------+-----------+---------------+----------+
|trip_distance|fare_amount|passenger_count|prediction|
+-------------+-----------+---------------+----------+
|          1.9|       16.3|              1|         0|
|          0.5|        5.1|              4|         3|
|          1.0|        8.6|              1|         0|
|          8.7|       34.5|              1|         0|
|          1.2|       10.0|              1|         0|
|          4.0|       21.9|              1|         0|
|          0.6|        5.8|              1|         0|
|          5.0|       33.8|              2|         0|
|          2.6|       14.9|              1|         0|
|          2.7|       17.7|              1|         0|
+-------------+-----------+---------------+----------+
only showing top 10 rows



In [0]:
#Cluster distribution
from pyspark.sql.functions import col

# Number of test rows assigned to each cluster, ordered by cluster id.
rows_per_cluster = predictions.groupBy("prediction").count()
cluster_counts = rows_per_cluster.orderBy("prediction")
display(cluster_counts)

prediction,count
0,451364
1,56147
3,36224


Databricks visualization. Run in Databricks to view.

In [0]:
# Per-cluster mean of each raw (unscaled) feature, ordered by cluster id.
averaged_cols = ("trip_distance", "fare_amount", "passenger_count")
grouped_by_cluster = predictions.groupBy("prediction")
cluster_avg = grouped_by_cluster.avg(*averaged_cols).orderBy("prediction")
display(cluster_avg)

prediction,avg(trip_distance),avg(fare_amount),avg(passenger_count)
0,1.9954654558183649,13.466840775959088,1.1497106548151823
1,14.291601866528948,60.30749425614899,1.3167221757173135
3,2.5960435070671397,15.9945246245583,3.997460247349824


In [0]:
# List the fitted cluster centers; they are in the scaled feature space,
# in the order trip_distance, fare_amount, passenger_count.
print("Cluster Centers:")
centers = model.clusterCenters()
for i in range(len(centers)):
    center = centers[i]
    print(f"Cluster {i}: {center}")


Cluster Centers:
Cluster 0: [-0.10579908 -0.28467819 -0.24333289]
Cluster 1: [ 0.88776676  2.40235224 -0.04659966]
Cluster 2: [ 1.06561767e+03  1.77834548e+00 -4.19896617e-01]
Cluster 3: [-0.05864871 -0.14326695  3.11311511]


🔸 Cluster 0:

[-0.10, -0.28, -0.24]
Slightly below average distance and fare

Slightly fewer passengers

Likely short, single-passenger trips — by far the most common group (451,364 rows in the test set)

🔸 Cluster 1:

[0.88, 2.40, -0.04]
Above average distance

Much higher than average fare

Typical passenger count

 Possibly longer expensive trips, maybe airport rides

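🔸 Cluster 2:

[1065.62, 1.78, -0.42]
The scaled distance is more than 1,000 standard deviations above the mean — far outside the normal range.

This is most likely a data error or an extreme outlier trip (the cleaned data still contains a maximum trip_distance of 15400.32).
 No test rows were assigned to this cluster, so ignore it or treat it as an outlier group rather than a real segment.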

🔸 Cluster 3:
[-0.05, -0.14, 3.11]
Roughly average distance and fare

Much higher than average passenger count (about 4 passengers per trip on average)
 Group or shared rides, such as families or other large parties



In [0]:
# Save the scored test rows as Parquet under the mounted storage path (ADLS),
# overwriting any earlier output at that location.
output_path = "/mnt/yellowtaxidata/clustered_test_output_cleaned/"
predictions.write.parquet(output_path, mode="overwrite")

In [0]:
import mlflow
import mlflow.spark

# Log the trained KMeans model, its cluster count and its test silhouette
# score to the shared training experiment.
mlflow.set_experiment("/Shared/NYC_Taxi_KMeans_Training")

with mlflow.start_run():
    # Reuse the `k` the model was trained with instead of a hard-coded 4, so the
    # logged parameter cannot drift from the actual model.
    mlflow.log_param("k", k)
    # The evaluation cell stores the score in `silhouette`; the name
    # `silhouette_score` was never defined and raised a NameError here.
    mlflow.log_metric("silhouette_score", silhouette)
    mlflow.spark.log_model(model, "kmeans_model")
