In [1]:
import pandas as pd
import pickle
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Helper Function for MAPE ---
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Samples whose true value is exactly zero are left out of the average,
    because their percentage error is undefined (division by zero).

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The MAPE in percent over the samples with a non-zero true value, or
        ``nan`` when there is no such sample (empty input or all-zero
        targets).

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Reject mismatched shapes up front: e.g. (n, 1) vs (n,) would otherwise
    # broadcast into an (n, n) matrix after masking and yield a wrong score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{y_true.shape} and {y_pred.shape}"
        )

    # Avoid division by zero
    non_zero_mask = y_true != 0
    if not non_zero_mask.any():
        # Nothing to average; return nan explicitly instead of letting
        # np.mean emit a RuntimeWarning on an empty selection.
        return np.nan

    actual = y_true[non_zero_mask]
    predicted = y_pred[non_zero_mask]
    return np.mean(np.abs((actual - predicted) / actual)) * 100

# --- 1. Load the Dataset ---
# Single source of truth for the input path, so the error message below
# always names the file that was actually looked up.
DATA_PATH = 'data/eta_trip_data.csv'

print("Loading the generated trip data...")
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' not found. Please run the data generation script first.")
    # Stop with a non-zero status. The `exit()` helper is injected by the
    # `site` module for interactive use and would report success (status 0).
    raise SystemExit(1)

print("Data loaded successfully.")

# --- 2. Feature Engineering & Preprocessing ---
# Replace each categorical column with one indicator column per category.
print("\nPerforming feature engineering (One-Hot Encoding)...")
df_processed = pd.get_dummies(
    df,
    columns=['vehicle_type', 'weather', 'load_type'],
)
print("Categorical features have been encoded.")

# --- 3. Define Features (X) and Target (y) ---
print("\nDefining features (X) and target (y)...")
# Features are every column except the target and the route identifier.
X = df_processed.drop(['actual_eta_hours', 'route_id'], axis=1)
y = df_processed['actual_eta_hours']
# Remember the exact feature order the model is trained on.
model_columns = list(X.columns)

# --- 4. Split Data into Training and Testing Sets ---
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
print("Splitting data into training and testing sets (80/20 split)...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")

# --- 5. Train the Regression Model ---
print("\nTraining the RandomForestRegressor model...")
# Fixed random_state so repeated runs build the same forest.
model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)
print("Model training complete.")

# --- 6. Evaluate the Model with Multiple Metrics ---
print("\nEvaluating the model on the unseen test set...")
predictions = model.predict(X_test)

# Error metrics on the held-out rows.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # RMSE is the square root of MSE
r2 = r2_score(y_test, predictions)
mape = mean_absolute_percentage_error(y_test, predictions)

# NOTE(review): despite its name this is simply the MAE rounded to two
# decimals, not a statistical confidence interval.
confidence_interval = round(mae, 2)

# One print call, one line per argument: same output as printing each line.
print(
    "\n--- Model Performance Metrics ---",
    f"Mean Absolute Error (MAE): {mae:.2f} hours",
    f"  -> Interpretation: On average, our model's prediction is off by ±{confidence_interval} hours.",
    f"Root Mean Squared Error (RMSE): {rmse:.2f} hours",
    "  -> Interpretation: Similar to MAE, but penalizes larger errors more heavily.",
    f"R-squared (R²): {r2:.3f}",
    f"  -> Interpretation: Our model explains {r2:.1%} of the variance in the trip times.",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
    f"  -> Interpretation: On average, our model's prediction is off by {mape:.2f}%.",
    "---------------------------------",
    sep="\n",
)

# --- 7. Save the Model and Supporting Assets ---
print("\nSaving model and all performance assets to disk...")

# a) Serialize the fitted estimator with pickle
with open('eta_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("- Model saved as 'eta_model.pkl'")

# b) Bundle the training-time feature order with the rounded test metrics
asset_data = dict(
    model_columns=model_columns,
    performance_metrics=dict(
        confidence_mae_hours=confidence_interval,
        rmse_hours=round(rmse, 2),
        r2_score=round(r2, 3),
        mape_percent=round(mape, 2),
    ),
)
with open('eta_model_assets.json', 'w') as f:
    json.dump(asset_data, f, indent=4)
print("- Model assets (columns and metrics) saved as 'eta_model_assets.json'")

print("\n✅ All steps completed successfully!")

Loading the generated trip data...
Data loaded successfully.

Performing feature engineering (One-Hot Encoding)...
Categorical features have been encoded.

Defining features (X) and target (y)...
Splitting data into training and testing sets (80/20 split)...
Training set has 800 samples.
Testing set has 200 samples.

Training the RandomForestRegressor model...
Model training complete.

Evaluating the model on the unseen test set...

--- Model Performance Metrics ---
Mean Absolute Error (MAE): 0.63 hours
  -> Interpretation: On average, our model's prediction is off by ±0.63 hours.
Root Mean Squared Error (RMSE): 0.84 hours
  -> Interpretation: Similar to MAE, but penalizes larger errors more heavily.
R-squared (R²): 0.992
  -> Interpretation: Our model explains 99.2% of the variance in the trip times.
Mean Absolute Percentage Error (MAPE): 4.79%
  -> Interpretation: On average, our model's prediction is off by 4.79%.
---------------------------------

Saving model and all performance a