NYC Taxi Fleet Recommender Pipeline
Predicts the optimal fleet mix (sedan/SUV/van) for the next 15 days,
based on passenger demand patterns.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import logging
from google.cloud import bigquery, storage
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

# Configuration

In [None]:
# Configuration
# PROJECT_ID is used for the BigQuery client and as the prefix of every
# fully-qualified table name built in this file.
PROJECT_ID = "nyctaxi-467111"
# Dataset the feature-extraction queries read trips from.
SOURCE_DATASET = "CleanSilver"
# Dataset that receives the prediction tables, metric tables and summary view.
OUTPUT_DATASET = "PostMlGold"
# GCS bucket and folder (object-name prefix) for the joblib model artifacts.
BUCKET_NAME = "nyc_raw_data_bucket"
MODEL_FOLDER = "fleet_recommender_models"

# Taxi types to process
# Each entry is used as a table-name prefix (`<type>*`) in SOURCE_DATASET;
# 'yellow' selects the tpep_ pickup column, anything else the lpep_ one.
TAXI_TYPES = ["yellow", "green"]

# Vehicle throughput assumptions (trips/vehicle/hour)

In [None]:
# Assumed trips one vehicle of each class serves per hour; used as the
# divisor when converting predicted trips into vehicle counts.
THROUGHPUT = dict(sedan=1.8, suv=1.6, van=1.3)

# Setup logging

In [None]:
# Root logger at INFO with "<timestamp> - <level> - <message>" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)


# Initialize clients

In [None]:
# Shared clients, created once at import/cell-execution time.
bq_client = bigquery.Client(project=PROJECT_ID)
# NOTE(review): no project is passed here, unlike the BigQuery client;
# presumably it is resolved from the environment's defaults - confirm.
storage_client = storage.Client()

# helpers

In [None]:
def extract_features_from_bigquery(taxi_type, end_date=None):
    """Build the hourly, per-zone demand feature table for one taxi type.

    The query creates a dense grid of (date, hour, pickup zone) from
    2024-01-01 through ``end_date``, counts trips per passenger bucket in
    each cell (cells without trips become 0), and adds calendar fields,
    1-day and 7-day lags per bucket, and the mean of ``total_trips`` over
    the 7 preceding days (current day excluded). Lags and the rolling mean
    are computed per (zone, hour). Rows before 2024-01-08 are dropped so
    the 7-day lag is always populated.

    Passenger buckets: <=1.5 'single', <=3.5 'small', <=5.5 'medium',
    otherwise 'large'.
    NOTE(review): a NULL passenger_count fails every comparison and so
    lands in the ELSE branch ('large') - confirm the source data has none.

    Args:
        taxi_type: Table-name prefix in SOURCE_DATASET. 'yellow' uses the
            ``tpep_pickup_datetime`` column; any other value uses
            ``lpep_pickup_datetime``.
        end_date: Last date (inclusive) to include. When None, the latest
            pickup date found in the source tables is used.

    Returns:
        DataFrame ordered by zone_id, d, hr with the target counts
        (y_single/y_small/y_medium/y_large), total_trips, dow, month,
        the ``*_lag1``/``*_lag7`` columns and ``total_trips_7dma``.

    Raises:
        ValueError: If ``end_date`` is None and the source tables contain
            no pickup dates.
    """
    logger.info(f"Extracting features for {taxi_type} taxi...")

    # Resolved once and reused by both queries below.
    pickup_col = 'tpep_pickup_datetime' if taxi_type == 'yellow' else 'lpep_pickup_datetime'

    if end_date is None:
        # Get the latest date from available data
        latest_date_query = f"""
        SELECT MAX(DATE(pickup_datetime)) as max_date
        FROM (
            SELECT {pickup_col} as pickup_datetime
            FROM `{PROJECT_ID}.{SOURCE_DATASET}.{taxi_type}*`
        )
        """
        end_date = list(bq_client.query(latest_date_query).result())[0].max_date
        if end_date is None:
            # MAX() over an empty table is NULL; without this guard the main
            # query would be sent with DATE('None') and fail obscurely.
            raise ValueError(
                f"No pickup dates found for {taxi_type} tables in "
                f"{PROJECT_ID}.{SOURCE_DATASET}"
            )

    query = f"""
    WITH bounds AS (
        SELECT
            DATE('2024-01-01') AS min_d,  -- Start from 2024 for consistency
            DATE('{end_date}') AS max_d
    ),
    zones AS (
        SELECT DISTINCT PULocationID AS zone_id
        FROM `{PROJECT_ID}.{SOURCE_DATASET}.{taxi_type}*`
        WHERE PULocationID IS NOT NULL
    ),
    grid AS (
        SELECT
            d,
            hr,
            zone_id
        FROM bounds b,
        UNNEST(GENERATE_DATE_ARRAY(b.min_d, b.max_d)) AS d,
        UNNEST(GENERATE_ARRAY(0, 23)) AS hr
        CROSS JOIN zones
    ),
    raw_trips AS (
        SELECT
            DATE({pickup_col}) AS d,
            EXTRACT(HOUR FROM {pickup_col}) AS hr,
            PULocationID AS zone_id,
            CASE
                WHEN passenger_count <= 1.5 THEN 'single'
                WHEN passenger_count <= 3.5 THEN 'small'
                WHEN passenger_count <= 5.5 THEN 'medium'
                ELSE 'large'
            END AS bucket
        FROM `{PROJECT_ID}.{SOURCE_DATASET}.{taxi_type}*`
        WHERE DATE({pickup_col}) >= '2024-01-01'
          AND DATE({pickup_col}) <= '{end_date}'
          AND PULocationID IS NOT NULL
    ),
    trip_counts AS (
        SELECT
            d, hr, zone_id,
            COUNTIF(bucket = 'single') AS y_single,
            COUNTIF(bucket = 'small') AS y_small,
            COUNTIF(bucket = 'medium') AS y_medium,
            COUNTIF(bucket = 'large') AS y_large
        FROM raw_trips
        GROUP BY d, hr, zone_id
    ),
    filled AS (
        SELECT
            g.d, g.hr, g.zone_id,
            COALESCE(c.y_single, 0) AS y_single,
            COALESCE(c.y_small, 0) AS y_small,
            COALESCE(c.y_medium, 0) AS y_medium,
            COALESCE(c.y_large, 0) AS y_large,
            COALESCE(c.y_single + c.y_small + c.y_medium + c.y_large, 0) AS total_trips
        FROM grid g
        LEFT JOIN trip_counts c USING (d, hr, zone_id)
    ),
    with_features AS (
        SELECT
            f.*,
            EXTRACT(DAYOFWEEK FROM f.d) AS dow,
            EXTRACT(MONTH FROM f.d) AS month,
            -- Lag features
            LAG(y_single, 1) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_single_lag1,
            LAG(y_single, 7) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_single_lag7,
            LAG(y_small, 1) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_small_lag1,
            LAG(y_small, 7) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_small_lag7,
            LAG(y_medium, 1) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_medium_lag1,
            LAG(y_medium, 7) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_medium_lag7,
            LAG(y_large, 1) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_large_lag1,
            LAG(y_large, 7) OVER (PARTITION BY zone_id, hr ORDER BY d) AS y_large_lag7,
            -- Rolling averages
            AVG(total_trips) OVER (
                PARTITION BY zone_id, hr
                ORDER BY d
                ROWS BETWEEN 7 PRECEDING AND 1 PRECEDING
            ) AS total_trips_7dma
        FROM filled f
    )
    SELECT * FROM with_features
    WHERE d >= DATE('2024-01-08')  -- Ensure we have lag features
    ORDER BY zone_id, d, hr
    """

    df = bq_client.query(query).to_dataframe()
    logger.info(f"Extracted {len(df):,} rows of feature data")
    print(f"Extracted {len(df):,} rows of feature data")
    return df

In [None]:
def prepare_features(df):
    """Clean the extracted feature frame and add sin/cos time encodings.

    Works in place on ``df`` and returns the same object:
    - missing lag values and a missing ``total_trips_7dma`` become 0.0;
    - the count columns and every lag column are cast to float64;
    - ``zone_id`` is cast to str (it is one-hot encoded later);
    - ``hour_*``, ``dow_*`` and ``month_*`` sin/cos columns are appended,
      using periods of 24, 7 and 12 respectively.
    """
    lag_cols = [name for name in df.columns if '_lag' in name]

    # Missing history is treated as "no trips".
    df[lag_cols] = df[lag_cols].fillna(0.0)
    df['total_trips_7dma'] = df['total_trips_7dma'].fillna(0.0)

    # Plain float64 avoids pandas nullable-integer dtypes downstream.
    count_cols = ['total_trips', 'y_single', 'y_small', 'y_medium', 'y_large']
    for name in count_cols + lag_cols:
        if name not in df.columns:
            continue
        df[name] = df[name].astype('float64')

    df['zone_id'] = df['zone_id'].astype(str)

    # (output prefix, source column, cycle length)
    cycles = (('hour', 'hr', 24), ('dow', 'dow', 7), ('month', 'month', 12))
    for prefix, source, period in cycles:
        angle = 2 * np.pi * df[source] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    return df

In [None]:
def build_preprocessor(df):
    """Build the (unfitted) feature preprocessor and the input column list.

    ``zone_id`` is one-hot encoded; categories seen fewer than 50 times are
    grouped as infrequent, and unknown categories at transform time are
    mapped to that group when it exists. All numeric features are passed
    through unchanged.

    ``total_trips`` is deliberately NOT a feature: it is the same-row sum
    of the four target columns, so feeding it to the models leaks the
    answer during training/evaluation while being unknown at forecast
    time. Its lagged summary ``total_trips_7dma`` is kept.

    Args:
        df: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(preprocessor, feature_cols)`` where ``feature_cols`` is
        the categorical column followed by the numeric columns, in the
        order the ColumnTransformer expects them to be selectable.
    """
    cat_features = ['zone_id']
    num_features = [
        'hr', 'dow', 'month',
        'total_trips_7dma',
        # Lag features
        'y_single_lag1', 'y_single_lag7',
        'y_small_lag1', 'y_small_lag7',
        'y_medium_lag1', 'y_medium_lag7',
        'y_large_lag1', 'y_large_lag7',
        # Cyclical features
        'hour_sin', 'hour_cos',
        'dow_sin', 'dow_cos',
        'month_sin', 'month_cos'
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=50), cat_features),
            ('num', 'passthrough', num_features)
        ],
        sparse_threshold=0.3
    )

    return preprocessor, cat_features + num_features


def train_models(df, test_days=7):
    """Fit one Poisson XGBoost regressor per passenger bucket.

    The ``d`` column of ``df`` is converted to datetime in place. Rows dated
    within the last ``test_days`` days form the hold-out set; everything up
    to and including the cutoff is used for training. The preprocessor is
    fitted on the training rows only. Each model is scored on the hold-out
    set with MAE and RMSE, which are logged and printed.

    Args:
        df: Prepared feature frame containing ``d``, the feature columns
            and the four ``y_*`` target columns.
        test_days: Number of trailing days held out for evaluation.

    Returns:
        Tuple ``(models, preprocessor, metrics, feature_cols, cutoff_date)``.
        ``models`` and ``metrics`` are keyed by target name; ``cutoff_date``
        is the last training date (train/test boundary), not the last date
        present in ``df``.
    """
    df['d'] = pd.to_datetime(df['d'])
    cutoff_date = df['d'].max() - timedelta(days=test_days)

    in_train = df['d'] <= cutoff_date
    in_test = df['d'] > cutoff_date
    train_df = df.loc[in_train].copy()
    test_df = df.loc[in_test].copy()

    train_span = f"Training period: {train_df['d'].min()} to {train_df['d'].max()}"
    test_span = f"Test period: {test_df['d'].min()} to {test_df['d'].max()}"
    logger.info(train_span)
    logger.info(test_span)
    print(train_span)
    print(test_span)

    # Fit the encoder on training rows only, then reuse it for the hold-out.
    preprocessor, feature_cols = build_preprocessor(df)
    X_train = preprocessor.fit_transform(train_df[feature_cols])
    X_test = preprocessor.transform(test_df[feature_cols])

    # Same hyper-parameters for every bucket.
    xgb_params = dict(
        objective='count:poisson',
        max_delta_step=1,
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )

    models, metrics = {}, {}
    for target in tqdm(['y_single', 'y_small', 'y_medium', 'y_large']):
        logger.info(f"Training model for {target}...")

        y_train = train_df[target].values
        y_test = test_df[target].values

        model = XGBRegressor(**xgb_params)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # Hold-out evaluation
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        models[target] = model
        metrics[target] = {'mae': mae, 'rmse': rmse}

        score_line = f"  {target} - MAE: {mae:.2f}, RMSE: {rmse:.2f}"
        logger.info(score_line)
        print(score_line)

    return models, preprocessor, metrics, feature_cols, cutoff_date

In [None]:
def predict_future(df, models, preprocessor, feature_cols, last_date, days_ahead=15):
    """Predict per-bucket hourly demand for every zone on upcoming days.

    A grid of (date, hour 0-23, zone) is built for the ``days_ahead`` days
    following ``last_date``. Because true lag values are not known for
    future dates, every lag feature and ``total_trips_7dma`` is filled with
    the historical mean for the same (zone, hour, day-of-week).

    Args:
        df: Historical feature frame; must contain ``zone_id``, ``hr``,
            ``dow`` (1=Sunday .. 7=Saturday, as produced by BigQuery's
            ``EXTRACT(DAYOFWEEK ...)``), the four ``y_*`` columns and
            ``total_trips``.
        models: Mapping from target name (``y_single`` etc.) to an object
            with ``predict(X)``.
        preprocessor: Fitted object with ``transform(frame)``.
        feature_cols: Columns of the future frame handed to ``preprocessor``.
        last_date: Timestamp of the day before the first predicted day.
        days_ahead: Number of days to predict.

    Returns:
        DataFrame with one row per (date, hour, zone) in date/hour/zone
        order, including ``pred_y_single``, ``pred_y_small``,
        ``pred_y_medium`` and ``pred_y_large`` (each clipped at 0).
    """
    buckets = ('y_single', 'y_small', 'y_medium', 'y_large')

    # Get unique zones
    zones = df['zone_id'].unique()

    # Generate future dates
    future_dates = pd.date_range(
        start=last_date + timedelta(days=1),
        end=last_date + timedelta(days=days_ahead),
        freq='D'
    )

    # Cartesian product in date -> hour -> zone order.
    future_df = pd.MultiIndex.from_product(
        [future_dates, range(24), zones],
        names=['d', 'hr', 'zone_id']
    ).to_frame(index=False)

    # pandas counts Monday=0..Sunday=6, but the historical ``dow`` column
    # uses the BigQuery convention Sunday=1..Saturday=7. Convert so the
    # merge below and the dow_sin/dow_cos encodings line up with training.
    future_df['dow'] = (future_df['d'].dt.dayofweek + 1) % 7 + 1
    future_df['month'] = future_df['d'].dt.month

    # Add cyclical features
    future_df['hour_sin'] = np.sin(2 * np.pi * future_df['hr'] / 24)
    future_df['hour_cos'] = np.cos(2 * np.pi * future_df['hr'] / 24)
    future_df['dow_sin'] = np.sin(2 * np.pi * future_df['dow'] / 7)
    future_df['dow_cos'] = np.cos(2 * np.pi * future_df['dow'] / 7)
    future_df['month_sin'] = np.sin(2 * np.pi * future_df['month'] / 12)
    future_df['month_cos'] = np.cos(2 * np.pi * future_df['month'] / 12)

    # Historical mean demand per (zone, hour, day-of-week) stands in for
    # the unknown lag values.
    historical_avgs = df.groupby(['zone_id', 'hr', 'dow']).agg(
        {col: 'mean' for col in buckets + ('total_trips',)}
    ).reset_index()

    # Left merge keeps the grid's row order; unmatched cells stay NaN.
    future_df = future_df.merge(
        historical_avgs,
        on=['zone_id', 'hr', 'dow'],
        how='left',
        suffixes=('', '_avg')
    )

    # Fill lag features with historical averages
    for bucket in buckets:
        future_df[f'{bucket}_lag1'] = future_df[bucket]
        future_df[f'{bucket}_lag7'] = future_df[bucket]
    future_df['total_trips_7dma'] = future_df['total_trips']

    # Make predictions
    X_future = preprocessor.transform(future_df[feature_cols])

    for target in buckets:
        future_df[f'pred_{target}'] = np.maximum(0, models[target].predict(X_future))

    return future_df


In [None]:
def calculate_fleet_requirements(df):
    """Turn predicted trips per passenger bucket into vehicle counts.

    Trips are apportioned to vehicle classes with fixed shares (sedan:
    all 'single' plus half of 'small'; SUV: half of 'small', 70% of
    'medium', 10% of 'large'; van: 30% of 'medium', 90% of 'large'),
    divided by that class's THROUGHPUT and rounded up to whole vehicles.

    Adds ``veh_sedan``, ``veh_suv``, ``veh_van``, ``total_vehicles`` and
    ``total_pred_trips`` to ``df`` in place and returns ``df``.
    """
    single = df['pred_y_single']
    small = df['pred_y_small']
    medium = df['pred_y_medium']
    large = df['pred_y_large']

    # (output column, THROUGHPUT key, trips assigned to that vehicle class)
    assignments = (
        ('veh_sedan', 'sedan', single + 0.5 * small),
        ('veh_suv', 'suv', 0.5 * small + 0.7 * medium + 0.1 * large),
        ('veh_van', 'van', 0.3 * medium + 0.9 * large),
    )
    for column, vehicle, trips in assignments:
        df[column] = np.ceil(trips / THROUGHPUT[vehicle]).astype(int)

    df['total_vehicles'] = df['veh_sedan'] + df['veh_suv'] + df['veh_van']
    df['total_pred_trips'] = single + small + medium + large

    return df


def save_results(df, taxi_type, metrics):
    """Write the forecast and the hold-out metrics to BigQuery.

    Both destination tables live in OUTPUT_DATASET and are replaced on
    every call. ``df`` gains ``taxi_type``, ``prediction_timestamp`` and
    ``date`` columns in place before the export subset is taken.

    NOTE(review): ``DataFrame.to_gbq`` is deprecated in recent pandas in
    favour of ``pandas_gbq.to_gbq`` - confirm the pinned pandas version.

    Args:
        df: Forecast frame with ``d``, the ``pred_*`` and ``veh_*`` columns
            and the totals.
        taxi_type: Used in both table names and stored as a column.
        metrics: Mapping of target name to a dict with 'mae' and 'rmse'.
    """
    table_root = f"{PROJECT_ID}.{OUTPUT_DATASET}.fleet_recommender_{taxi_type}"
    output_table = f"{table_root}_predictions"
    metrics_table = f"{table_root}_metrics"

    # Run metadata
    df['taxi_type'] = taxi_type
    df['prediction_timestamp'] = datetime.now()
    df['date'] = df['d'].dt.date

    exported_columns = [
        'taxi_type', 'date', 'hr', 'zone_id',
        'dow', 'month',
        'pred_y_single', 'pred_y_small', 'pred_y_medium', 'pred_y_large',
        'veh_sedan', 'veh_suv', 'veh_van',
        'total_vehicles', 'total_pred_trips',
        'prediction_timestamp',
    ]
    output_df = df[exported_columns].copy()

    output_df.to_gbq(output_table, project_id=PROJECT_ID, if_exists='replace')
    predictions_msg = f"Saved predictions to {output_table}"
    logger.info(predictions_msg)
    print(predictions_msg)

    # One metrics row per target
    metric_rows = []
    for target, scores in metrics.items():
        metric_rows.append({
            'taxi_type': taxi_type,
            'target': target,
            'mae': scores['mae'],
            'rmse': scores['rmse'],
            'timestamp': datetime.now(),
        })
    metrics_df = pd.DataFrame(metric_rows)

    metrics_df.to_gbq(metrics_table, project_id=PROJECT_ID, if_exists='replace')
    metrics_msg = f"Saved metrics to {metrics_table}"
    logger.info(metrics_msg)
    print(metrics_msg)


def save_models_to_gcs(models, preprocessor, taxi_type):
    """Upload the fitted models and preprocessor to GCS as joblib files.

    Objects are written under MODEL_FOLDER in BUCKET_NAME as
    ``<taxi_type>_<target>_model.joblib`` (one per entry in ``models``)
    and ``<taxi_type>_preprocessor.joblib``.
    """
    bucket = storage_client.get_bucket(BUCKET_NAME)

    def upload(artifact, blob_path):
        # Stream the joblib dump straight into the blob; return its gs:// URI.
        with bucket.blob(blob_path).open('wb') as handle:
            joblib.dump(artifact, handle)
        return f"gs://{BUCKET_NAME}/{blob_path}"

    for target in models:
        uri = upload(models[target], f"{MODEL_FOLDER}/{taxi_type}_{target}_model.joblib")
        logger.info(f"Saved model to {uri}")

    uri = upload(preprocessor, f"{MODEL_FOLDER}/{taxi_type}_preprocessor.joblib")
    preprocessor_msg = f"Saved preprocessor to {uri}"
    logger.info(preprocessor_msg)
    print(preprocessor_msg)



In [None]:
def main():
    """Run the pipeline for every taxi type, then (re)create the summary view.

    For each entry in TAXI_TYPES: extract features, prepare them, train the
    per-bucket models, forecast the days following the last observed date,
    convert the forecast to vehicle counts, and persist results and models.
    A failure for one taxi type is logged with its traceback and does not
    stop the remaining types.

    The summary view unions the yellow and green prediction tables by name
    and aggregates them per taxi type and date.
    """
    logger.info("Starting NYC Taxi Fleet Recommender Pipeline")

    failed_types = []
    for taxi_type in tqdm(TAXI_TYPES):
        logger.info(f"\nProcessing {taxi_type} taxi...")

        try:
            # Extract features
            df = extract_features_from_bigquery(taxi_type)
            df = prepare_features(df)

            # Train models. The fifth return value is the train/test cutoff,
            # which lies before the end of the data, so it is not used as
            # the forecast origin.
            models, preprocessor, metrics, feature_cols, _cutoff_date = train_models(df)

            # Forecast from the last observed date; starting at the cutoff
            # would spend the first days of the horizon on dates that are
            # already in the data.
            last_date = pd.to_datetime(df['d']).max()
            future_df = predict_future(df, models, preprocessor, feature_cols, last_date)
            future_df = calculate_fleet_requirements(future_df)

            # Save results
            save_results(future_df, taxi_type, metrics)
            save_models_to_gcs(models, preprocessor, taxi_type)

            logger.info(f"Completed processing for {taxi_type} taxi")
            print((f"Completed processing for {taxi_type} taxi"))

        except Exception as e:
            # Top-level boundary: keep going with the next taxi type, but
            # record the traceback so the failure can be diagnosed.
            logger.exception(f"Error processing {taxi_type}: {str(e)}")
            failed_types.append(taxi_type)
            continue

    if failed_types:
        logger.warning(f"\nPipeline completed with failures for: {', '.join(failed_types)}")
    else:
        logger.info("\nPipeline completed successfully!")

    # Create summary view
    summary_query = f"""
    CREATE OR REPLACE VIEW `{PROJECT_ID}.{OUTPUT_DATASET}.fleet_recommender_summary` AS
    WITH combined AS (
        SELECT * FROM `{PROJECT_ID}.{OUTPUT_DATASET}.fleet_recommender_yellow_predictions`
        UNION ALL
        SELECT * FROM `{PROJECT_ID}.{OUTPUT_DATASET}.fleet_recommender_green_predictions`
    )
    SELECT
        taxi_type,
        date,
        SUM(total_vehicles) as total_fleet_needed,
        SUM(veh_sedan) as total_sedans,
        SUM(veh_suv) as total_suvs,
        SUM(veh_van) as total_vans,
        SUM(total_pred_trips) as total_expected_trips,
        COUNT(DISTINCT zone_id) as active_zones
    FROM combined
    GROUP BY taxi_type, date
    ORDER BY taxi_type, date
    """

    bq_client.query(summary_query).result()
    logger.info("Created fleet summary view")

In [None]:
# Run the whole pipeline when this cell is executed.
main()