In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [0]:
spark.sql("USE CATALOG hive_metastore")
spark.sql("USE default")

print("="*70)
print("ENVIRONMENT SETUP")
print("="*70)
print(f"✓ Current Catalog: {spark.sql('SELECT current_catalog()').collect()[0][0]}")
print(f"✓ Current Database: {spark.sql('SELECT current_database()').collect()[0][0]}")
print(f"✓ Spark version: {spark.version}")
print("="*70)

ENVIRONMENT SETUP
✓ Current Catalog: hive_metastore
✓ Current Database: default
✓ Spark version: 4.0.0


In [0]:
# ============================================
# Load Data and Split
# ============================================

# Load the clean data
df = spark.table("taxi_trips")

# Verify data
print(f"Loaded {df.count():,} rows")

# Create train/test split (80/20)
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Cache for performance
train.cache()
test.cache()

print(f"\nTraining set: {train.count():,} rows")
print(f"Test set: {test.count():,} rows")

Loaded 53,106,919 rows

Training set: 42,487,529 rows
Test set: 10,619,390 rows


In [0]:
# ============================================
# Define Feature Columns
# ============================================

# Numeric features
numeric_features = [
    "trip_distance",
    "passenger_count",
    "fare_amount",
    "hour_of_day",
    "day_of_week",
    "is_weekend"
]

# Categorical features (location IDs)
categorical_features = [
    "PULocationID",
    "DOLocationID"
]

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


Numeric features: 6
Categorical features: 2


In [0]:
# ============================================
# Feature Preprocessing Pipeline
# DS FEATURE: Distributed feature transformation
# ============================================

print("\nBuilding feature preprocessing pipeline...")

# Stage 1: Index categorical features
indexers = [
    StringIndexer(
        inputCol=col, 
        outputCol=f"{col}_indexed",
        handleInvalid="keep"
    )
    for col in categorical_features
]

# Stage 2: Assemble all features
indexed_categorical = [f"{col}_indexed" for col in categorical_features]
all_feature_cols = numeric_features + indexed_categorical

assembler = VectorAssembler(
    inputCols=all_feature_cols,
    outputCol="features_raw",
    handleInvalid="skip"
)

# Stage 3: Scale features
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,
    withMean=False
)

# Create pipeline
preprocessing_pipeline = Pipeline(stages=indexers + [assembler, scaler])

print("Pipeline stages:")
for i, stage in enumerate(preprocessing_pipeline.getStages()):
    print(f"  {i+1}. {stage.__class__.__name__}")


Building feature preprocessing pipeline...
Pipeline stages:
  1. StringIndexer
  2. StringIndexer
  3. VectorAssembler
  4. StandardScaler


In [0]:
# ============================================
# Fit Pipeline and Transform Data
# ============================================

print("\nFitting preprocessing pipeline...")
import time

start_time = time.time()

# Fit on training data
pipeline_model = preprocessing_pipeline.fit(train)

# Transform both train and test
train_processed = pipeline_model.transform(train)
test_processed = pipeline_model.transform(test)

processing_time = time.time() - start_time

print(f"Pipeline fitted and applied in {processing_time:.2f} seconds")

# Select final columns for ML
final_cols = ["features", "trip_duration_minutes"]

train_ml = train_processed.select(final_cols)
test_ml = test_processed.select(final_cols)

# Cache processed data
train_ml.cache()
test_ml.cache()

print(f"\nTraining samples: {train_ml.count():,}")
print(f"Test samples: {test_ml.count():,}")
print(f"Feature vector size: {len(all_feature_cols)}")


Fitting preprocessing pipeline...
Pipeline fitted and applied in 47.23 seconds

Training samples: 42,487,529
Test samples: 10,619,390
Feature vector size: 8


In [0]:
# ============================================
# Save ML-Ready Data
# ============================================

print("\nSaving ML-ready datasets...")

# Save as Delta tables
train_ml.write.format("delta").mode("overwrite").saveAsTable("taxi_ml_train")
test_ml.write.format("delta").mode("overwrite").saveAsTable("taxi_ml_test")

# Save preprocessing model
pipeline_model.write().overwrite().save("/mnt/taxi-data/models/preprocessing_pipeline")

print("ML datasets saved:")
print("  - taxi_ml_train")
print("  - taxi_ml_test")
print("  - Preprocessing pipeline saved")


Saving ML-ready datasets...
ML datasets saved:
  - taxi_ml_train
  - taxi_ml_test
  - Preprocessing pipeline saved


In [0]:
# ============================================
# CELL 6: Verify ML Data
# ============================================

print("\n" + "="*60)
print("ML DATA VERIFICATION")
print("="*60)

# Show sample
print("\nSample ML data:")
train_ml.show(5, truncate=False)

# Feature vector inspection
sample_row = train_ml.first()
print(f"\nFeature vector size: {len(sample_row['features'])}")
print(f"Target variable: {sample_row['trip_duration_minutes']:.2f} minutes")

print("\n" + "="*60)
print("ML DATA PREPARATION COMPLETE")
print("Ready for model training")
print("="*60)


ML DATA VERIFICATION

Sample ML data:
+---------------------------------------------------------------------------------------------------------------------------+---------------------+
|features                                                                                                                   |trip_duration_minutes|
+---------------------------------------------------------------------------------------------------------------------------+---------------------+
|[1.0899394519394712,1.2792148626010864,2.4495977928807227,0.0,3.0709646716449024,0.0,2.3271751888860606,0.6753788003241491]|55.733333333333334   |
|[0.22243662284479002,1.2792148626010864,0.6798195709600171,0.0,3.0709646716449024,0.0,1.606859058992756,1.2796650953510194]|13.983333333333333   |
|[0.378142258836143,1.2792148626010864,0.7191479758915884,0.0,3.0709646716449024,0.0,0.609498263755873,0.03554625264863943] |10.083333333333334   |
|[0.5338478948274961,1.2792148626010864,1.309074049865157,0.0,3.070964671