In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, hour, dayofweek
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import math

# Create (or reuse) the Spark session that every DataFrame in this notebook runs on
spark = (
    SparkSession.builder
    .appName("NYC Taxi Trip Duration Analysis and Prediction")
    .getOrCreate()
)

## Load Training Data and Initial Exploration
We load the `train.csv` file, which contains the NYC Taxi Trip Duration dataset with features like pickup/dropoff locations, timestamps, and the target variable `trip_duration`. We display the schema to check column names and data types, and show a sample of the data to understand its content.

In [2]:
# Read the training CSV: the first row supplies column names and Spark infers column types
train_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/kaggle/input/problem02/train.csv")
)

# Inspect the inferred schema, then preview the first few rows
print("Schema of the training dataset:")
train_data.printSchema()

print("\nSample of the training dataset:")
train_data.show(n=5)

Schema of the training dataset:
root
 |-- id: string (nullable = true)
 |-- vendor_id: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- trip_duration: integer (nullable = true)


Sample of the training dataset:
+---------+---------+-------------------+-------------------+---------------+------------------+------------------+------------------+------------------+------------------+-------------+
|       id|vendor_id|    pickup_datetime|   dropoff_datetime|passenger_count|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|store_and_fwd_flag|trip_duration|
+---------+---------+-------------------+-

## EDA: Summary Statistics and Missing Values
We compute summary statistics for numerical columns to understand their distributions and identify potential outliers. We also check for missing values in each column to ensure data quality, which is crucial for model performance.

In [3]:
# Summary statistics for numerical columns
numerical_cols = ["passenger_count", "pickup_longitude", "pickup_latitude",
                  "dropoff_longitude", "dropoff_latitude", "trip_duration"]
print("\nSummary statistics for numerical columns in train.csv:")
train_data.select(numerical_cols).describe().show()

# Check for missing values.
# Every column is turned into a 0/1 "is null" flag and all flags are summed in a
# single aggregation, so the data is scanned once instead of once per column.
print("\nMissing values in each column in train.csv:")
null_flags = train_data.select(
    [col(name).isNull().cast("int").alias(name) for name in train_data.columns]
)
# groupBy() with no keys yields exactly one row holding the per-column sums,
# in the same order as train_data.columns.
null_totals = null_flags.groupBy().sum().first()
for column, missing_count in zip(train_data.columns, null_totals):
    # The sum is NULL when the DataFrame has no rows; report that as 0 missing values.
    print(f"{column}: {missing_count or 0}")


Summary statistics for numerical columns in train.csv:
+-------+------------------+-------------------+-------------------+-------------------+-------------------+-----------------+
|summary|   passenger_count|   pickup_longitude|    pickup_latitude|  dropoff_longitude|   dropoff_latitude|    trip_duration|
+-------+------------------+-------------------+-------------------+-------------------+-------------------+-----------------+
|  count|           1458644|            1458644|            1458644|            1458644|            1458644|          1458644|
|   mean|1.6645295219395548| -73.97348630489282| 40.750920908391734|  -73.9734159469458|   40.7517995149002|959.4922729603659|
| stddev|  1.31424216782312| 0.0709018584227037|0.03288118625763338| 0.0706432680972028|0.03589055560563534|5237.431724497609|
|    min|                 0|-121.93334197998047|  34.35969543457031|-121.93330383300781|   32.1811408996582|                1|
|    max|                 9| -61.33552932739258|  51.88

## EDA: Feature Distribution Analysis
We explore the distributions of key features to guide preprocessing:
- `trip_duration`: To identify outliers in the target variable.
- `passenger_count`: To understand the distribution of passengers per trip.
- `store_and_fwd_flag`: To check the balance of this categorical feature.
These insights help us decide on outlier removal and feature encoding.

In [4]:
# Quantiles of the target variable, to spot extreme trip durations
print("\nTrip duration distribution (quantiles) in train.csv:")
quantile_stats = ("min", "25%", "50%", "75%", "max")
train_data.select("trip_duration").summary(*quantile_stats).show()

# Number of trips for each passenger count, in ascending passenger order
print("\nPassenger count distribution in train.csv:")
passenger_counts = train_data.groupBy("passenger_count").count()
passenger_counts.orderBy("passenger_count").show()

# Number of trips for each value of the store-and-forward flag
print("\nStore and forward flag distribution in train.csv:")
flag_counts = train_data.groupBy("store_and_fwd_flag").count()
flag_counts.show()


Trip duration distribution (quantiles) in train.csv:
+-------+-------------+
|summary|trip_duration|
+-------+-------------+
|    min|            1|
|    25%|          397|
|    50%|          662|
|    75%|         1075|
|    max|      3526282|
+-------+-------------+


Passenger count distribution in train.csv:
+---------------+-------+
|passenger_count|  count|
+---------------+-------+
|              0|     60|
|              1|1033540|
|              2| 210318|
|              3|  59896|
|              4|  28404|
|              5|  78088|
|              6|  48333|
|              7|      3|
|              8|      1|
|              9|      1|
+---------------+-------+


Store and forward flag distribution in train.csv:
+------------------+-------+
|store_and_fwd_flag|  count|
+------------------+-------+
|                 Y|   8045|
|                 N|1450599|
+------------------+-------+



## Feature Engineering: Temporal Features and Distance
We engineer new features to improve model performance:
- Extract `pickup_hour` and `pickup_dayofweek` from `pickup_datetime` to capture temporal patterns (e.g., rush hour effects).
- Calculate `distance_km` between pickup and dropoff locations using the Haversine formula, which computes the great-circle distance in kilometers. This feature is likely a strong predictor of trip duration.

In [5]:
# Derive the temporal features from the pickup timestamp in one chained expression.
# dayofweek numbers days from 1 (Sunday) to 7 (Saturday).
train_data = (
    train_data
    .withColumn("pickup_datetime", to_timestamp("pickup_datetime"))
    .withColumn("pickup_hour", hour("pickup_datetime"))
    .withColumn("pickup_dayofweek", dayofweek("pickup_datetime"))
)

print("\nSample with new temporal features in train.csv:")
temporal_cols = ["pickup_datetime", "pickup_hour", "pickup_dayofweek"]
train_data.select(*temporal_cols).show(5)

# Calculate distance between pickup and dropoff (Haversine formula)
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometers between two points.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in degrees.
        lon2, lat2: Longitude and latitude of the second point, in degrees.

    Returns:
        The Haversine distance in kilometers as a float, or None when any
        coordinate is None (so a null coordinate yields a null distance instead
        of raising inside the Spark UDF that wraps this function).
    """
    if lon1 is None or lat1 is None or lon2 is None or lat2 is None:
        return None

    R = 6371  # Earth radius in kilometers
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) ** 2
         + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) fail; clamp it.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

# Register UDF for distance calculation
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
# Wrap the Python haversine function as a Spark UDF that returns a double
haversine_udf = udf(haversine, returnType=DoubleType())

# Argument order matches haversine(lon1, lat1, lon2, lat2)
coordinate_cols = ["pickup_longitude", "pickup_latitude",
                   "dropoff_longitude", "dropoff_latitude"]
train_data = train_data.withColumn(
    "distance_km",
    haversine_udf(*[col(name) for name in coordinate_cols]),
)

print("\nSample with distance feature in train.csv:")
train_data.select(*coordinate_cols, "distance_km").show(5)


Sample with new temporal features in train.csv:
+-------------------+-----------+----------------+
|    pickup_datetime|pickup_hour|pickup_dayofweek|
+-------------------+-----------+----------------+
|2016-03-14 17:24:55|         17|               2|
|2016-06-12 00:43:35|          0|               1|
|2016-01-19 11:35:24|         11|               3|
|2016-04-06 19:32:31|         19|               4|
|2016-03-26 13:30:55|         13|               7|
+-------------------+-----------+----------------+
only showing top 5 rows


Sample with distance feature in train.csv:
+------------------+------------------+------------------+------------------+------------------+
|  pickup_longitude|   pickup_latitude| dropoff_longitude|  dropoff_latitude|       distance_km|
+------------------+------------------+------------------+------------------+------------------+
| -73.9821548461914| 40.76793670654297|-73.96463012695312|40.765602111816406|1.4985207796474773|
|-73.98041534423828|40.738563537597

## Preprocess Training Data
We preprocess the training data to ensure it is suitable for model training:
- Encode the categorical `store_and_fwd_flag` using `StringIndexer`.
- Drop rows with missing values.
- Filter `trip_duration` to a reasonable range (60 seconds to 4 hours) to remove outliers.
- Filter coordinates to NYC bounds (latitude: 40-41, longitude: -74 to -73) to exclude invalid locations.
These steps improve data quality and consistency.

In [6]:
# Handle missing values first: StringIndexer (default handleInvalid="error") fails on
# null labels, so rows with nulls must be gone before the indexer is fitted and applied.
train_data = train_data.na.drop()

# Encode store_and_fwd_flag.
# Any previously created output column is removed first (a no-op on the first run) so
# re-running this cell does not trip over an already-existing output column.
indexer = StringIndexer(inputCol="store_and_fwd_flag", outputCol="store_and_fwd_flag_indexed")
train_data = train_data.drop("store_and_fwd_flag_indexed")
# Keep the fitted model: it holds the label -> index mapping learned from the training
# data, which is the mapping any other dataset must be encoded with.
indexer_model = indexer.fit(train_data)
train_data = indexer_model.transform(train_data)

# Filter outliers in trip_duration (keep trips between 60 seconds and 4 hours = 14400 s)
train_data = train_data.filter((col("trip_duration") >= 60) & (col("trip_duration") <= 14400))

# Filter reasonable coordinates (bounding box: latitude 40..41, longitude -74..-73, inclusive)
train_data = train_data.filter((col("pickup_latitude").between(40, 41)) &
                               (col("pickup_longitude").between(-74, -73)) &
                               (col("dropoff_latitude").between(40, 41)) &
                               (col("dropoff_longitude").between(-74, -73)))

## Feature Selection, Vector Assembly, and Train-Test Split
We select features for the model, including numerical features, engineered features (`pickup_hour`, `pickup_dayofweek`, `distance_km`), and the encoded `store_and_fwd_flag`. We use `VectorAssembler` to combine these features into a single vector column. The data is then split into training and validation sets (80-20 split) for model training and evaluation.

In [7]:
# Columns fed to the model, in the order they appear inside the feature vector
feature_cols = [
    "vendor_id",
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "pickup_hour",
    "pickup_dayofweek",
    "distance_km",
    "store_and_fwd_flag_indexed",
]

# Pack the feature columns into a single "features" vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_train_data = (
    assembler.transform(train_data)
    .select("features", "trip_duration", "id")
)

# 80/20 split into training and validation sets, seeded for reproducibility
train_split, val_split = assembled_train_data.randomSplit(weights=[0.8, 0.2], seed=42)

## Train, Tune, and Analyze Decision Tree Regressor
We first train an initial Decision Tree Regressor with a maximum depth of 5 and variance as the impurity measure to establish a baseline. After training on the training split, we analyze its tree structure and feature importances to understand its decision rules and influential features. To improve performance, we then perform hyperparameter tuning using 3-fold cross-validation, testing combinations of `maxDepth` (5, 7, 10) and `minInstancesPerNode` (1, 10, 50) to optimize the model based on RMSE. The best model is selected, and we analyze its tree structure and feature importances to compare with the initial model and identify key predictors.

In [8]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

def _describe_tree(fitted_tree, label):
    """Print a fitted tree's split rules, then one importance line per feature."""
    print(f"\n{label} Tree Structure:")
    print(fitted_tree.toDebugString)

    print(f"\n{label} Feature Importances:")
    for feature, importance in zip(feature_cols, fitted_tree.featureImportances.toArray()):
        print(f"{feature}: {importance:.4f}")


# Settings shared by the baseline tree and the tuned tree
shared_tree_params = dict(
    featuresCol="features",
    labelCol="trip_duration",
    impurity="variance",
    seed=42,
)

# Baseline: a depth-5 tree fitted on the training split (for comparison)
dt_initial = DecisionTreeRegressor(maxDepth=5, **shared_tree_params)
initial_model = dt_initial.fit(train_split)
_describe_tree(initial_model, "Initial")

# Hyperparameter Tuning with Cross-Validation
print("\n--- Starting Hyperparameter Tuning with Cross-Validation ---")

# Estimator whose maxDepth / minInstancesPerNode are supplied by the grid below
dt_tune = DecisionTreeRegressor(**shared_tree_params)

# 3 x 3 grid of candidate settings
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt_tune.maxDepth, [5, 7, 10])
    .addGrid(dt_tune.minInstancesPerNode, [1, 10, 50])
    .build()
)

# RMSE is the metric used to rank the candidates
evaluator = RegressionEvaluator(
    labelCol="trip_duration", predictionCol="prediction", metricName="rmse"
)

# 3-fold cross-validation over the grid, fitted on the training split
crossval = CrossValidator(
    estimator=dt_tune,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)
cv_model = crossval.fit(train_split)

# From here on, `model` refers to the best model found by cross-validation
model = cv_model.bestModel

print("\nBest Model Parameters:")
print(f"maxDepth: {model.getMaxDepth()}")
print(f"minInstancesPerNode: {model.getMinInstancesPerNode()}")

# Score the best model on the held-out validation split
best_val_predictions = model.transform(val_split)
best_rmse = evaluator.evaluate(best_val_predictions)
print(f"\nRoot Mean Squared Error (RMSE) on validation split (best model): {best_rmse:.4f}")

r2_metric = RegressionEvaluator(
    labelCol="trip_duration", predictionCol="prediction", metricName="r2"
)
best_r2 = r2_metric.evaluate(best_val_predictions)
print(f"R-squared (R²) on validation split (best model): {best_r2:.4f}")

# Same structure / importance report as for the baseline, now for the tuned tree
_describe_tree(model, "Best Model")


Initial Tree Structure:
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_63af35238698, depth=5, numNodes=63, numFeatures=10
  If (feature 8 <= 5.423026623607559)
   If (feature 8 <= 1.9973042790322402)
    If (feature 8 <= 1.2045349743906306)
     If (feature 5 <= 40.767473220825195)
      If (feature 6 <= 7.5)
       Predict: 292.7657350433687
      Else (feature 6 > 7.5)
       Predict: 423.01783888279334
     Else (feature 5 > 40.767473220825195)
      If (feature 8 <= 0.9413700670582281)
       Predict: 271.28365300017595
      Else (feature 8 > 0.9413700670582281)
       Predict: 335.8212076121858
    Else (feature 8 > 1.2045349743906306)
     If (feature 5 <= 40.769575119018555)
      If (feature 6 <= 7.5)
       Predict: 432.2597942073171
      Else (feature 6 > 7.5)
       Predict: 641.2321411512261
     Else (feature 5 > 40.769575119018555)
      If (feature 8 <= 1.6655255165486342)
       Predict: 413.8078877262366
      Else (feature 8 > 1.6655255165486342)
       Pre

## Evaluate Model on Validation Split
We evaluate the model on the validation split using:
- **Root Mean Squared Error (RMSE)**: The square root of the mean squared prediction error, expressed in seconds; large errors are penalized more heavily than small ones.
- **R-squared (R²)**: Indicates the proportion of variance in `trip_duration` explained by the model.
These metrics estimate how the model performs on data it was not fitted on, using the 20% of `train.csv` held out as the validation split.

In [9]:
# Predictions of the selected model on the held-out validation split
val_predictions = model.transform(val_split)

# Both evaluators compare the same label / prediction columns and differ only in metric
metric_args = {"labelCol": "trip_duration", "predictionCol": "prediction"}
rmse_evaluator = RegressionEvaluator(metricName="rmse", **metric_args)
r2_evaluator = RegressionEvaluator(metricName="r2", **metric_args)

# RMSE
rmse = rmse_evaluator.evaluate(val_predictions)
print(f"\nRoot Mean Squared Error (RMSE) on validation split: {rmse:.4f}")

# R²
r2 = r2_evaluator.evaluate(val_predictions)
print(f"R-squared (R²) on validation split: {r2:.4f}")


Root Mean Squared Error (RMSE) on validation split: 344.3223
R-squared (R²) on validation split: 0.7305


## Preprocess Test Data, Generate Predictions, and Cleanup
We load and preprocess `test.csv` to match the training data format by adding temporal features, calculating `distance_km`, encoding `store_and_fwd_flag`, and filtering coordinates. We then generate predictions and save them to `submission_combined.csv` in the Kaggle submission format (`id`, `trip_duration`). Finally, we stop the Spark session to free up resources.

**Note**: `test.csv` lacks `trip_duration`, so we cannot evaluate directly. Submit `submission_combined.csv` to Kaggle to get the test set performance.

In [10]:
import shutil
import os
from pyspark.sql.functions import col

# Load the test data
test_data = spark.read.csv("/kaggle/input/problem02/test.csv", header=True, inferSchema=True)

# A submission needs exactly one row per test id, so remember every id before any
# row is set aside below.
all_test_ids = test_data.select("id")

# Add temporal features (same derivation as for the training data)
test_data = test_data.withColumn("pickup_datetime", to_timestamp(col("pickup_datetime")))
test_data = test_data.withColumn("pickup_hour", hour(col("pickup_datetime")))
test_data = test_data.withColumn("pickup_dayofweek", dayofweek(col("pickup_datetime")))

# Rows with a missing model input cannot be assembled into a feature vector. They are
# excluded from scoring here and receive a fallback value further down, so they still
# appear in the submission. distance_km and the indexed flag are derived afterwards.
derived_later = ("distance_km", "store_and_fwd_flag_indexed")
required_inputs = [name for name in feature_cols if name not in derived_later]
required_inputs.append("store_and_fwd_flag")
test_data = test_data.na.drop(subset=required_inputs)

# Calculate distance (haversine_udf is defined in the feature-engineering cell)
test_data = test_data.withColumn("distance_km",
    haversine_udf(col("pickup_longitude"), col("pickup_latitude"),
                  col("dropoff_longitude"), col("dropoff_latitude")))

# Encode store_and_fwd_flag with the label -> index mapping learned from the TRAINING
# data. Fitting the indexer on the test data could assign different indices than the
# ones the model was trained on. Only the input column is selected because train_data
# already carries the indexer's output column.
flag_indexer_model = indexer.fit(train_data.select("store_and_fwd_flag"))
test_data = flag_indexer_model.transform(test_data)

# NOTE: no coordinate filter is applied to the test data. That filter removes outliers
# from the training set; applied here it would drop ids from the submission.

# Assemble features for test data (assembler is defined in the feature-selection cell)
assembled_test_data = assembler.transform(test_data).select("features", "id")

# Make predictions on test data (model is the best cross-validated model)
test_predictions = model.transform(assembled_test_data)

# Rename the prediction column to the name required by the submission format
mllib_predictions = test_predictions.select("id", col("prediction").alias("trip_duration"))

# Re-attach the predictions to the full id list. Ids that could not be scored get the
# mean training trip duration, so every test id is present in the output.
fallback_duration = train_data.agg({"trip_duration": "avg"}).first()[0]
combined_predictions = (
    all_test_ids.join(mllib_predictions, on="id", how="left")
    .na.fill({"trip_duration": fallback_duration})
    .cache()  # reused by both the CSV write and the sample display below
)

# Write the predictions to a temporary directory with a single partition
temp_dir = "temp_submission_combined"
combined_predictions.coalesce(1).write.csv(temp_dir, header=True, mode="overwrite")

# Find the single CSV part file in the temporary directory
part_files = [f for f in os.listdir(temp_dir) if f.startswith("part-") and f.endswith(".csv")]
if not part_files:
    raise FileNotFoundError(f"No part-*.csv file was written to {temp_dir}")
csv_file = part_files[0]
csv_file_path = os.path.join(temp_dir, csv_file)

# Move and rename the CSV file to submission_combined.csv
final_submission_path = "submission_combined.csv"
shutil.move(csv_file_path, final_submission_path)

# Clean up the temporary directory
shutil.rmtree(temp_dir)

print(f"Combined predictions have been saved to {final_submission_path}")

# Print sample predictions
print("\nSample Combined Predictions:")
combined_predictions.show(5, truncate=False)

# Stop Spark session
spark.stop()

Combined predictions have been saved to submission_combined.csv

Sample Combined Predictions:
+---------+------------------+
|id       |trip_duration     |
+---------+------------------+
|id3004672|836.5085236410422 |
|id3505355|729.7780443911217 |
|id1217141|533.6348159509203 |
|id2150126|1010.321692411014 |
|id1598245|415.70353830298865|
+---------+------------------+
only showing top 5 rows

