In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
#Libraries

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
import catboost
import lightgbm as lgb


In [None]:
nyc_data_agg_final_df = pd.read_csv('final_dataset_nyc_agg.csv')

In [None]:
nyc_data_agg_final_df.head()

Unnamed: 0,rides_on_previous_21_day,rides_on_previous_20_day,rides_on_previous_19_day,rides_on_previous_18_day,rides_on_previous_17_day,rides_on_previous_16_day,rides_on_previous_15_day,rides_on_previous_14_day,rides_on_previous_13_day,rides_on_previous_12_day,...,avg_speed_previous_3_day,avg_speed_previous_2_day,avg_speed_previous_1_day,avg_temp_previous_3_hours,avg_precip_previous_3_hours,avg_windspeed_previous_3_hours,avg_visibility_previous_3_hours,pickup_hour,location_id,rides_next_hour_target
0,5.416667,1.041667,1.333333,1.375,1.125,2.458333,8.833333,5.666667,1.083333,1.416667,...,8.514062,8.744473,10.991154,46.75,0.0,6.675,9.9,2023-01-29,84,30
1,1.041667,1.333333,1.375,1.125,2.458333,8.833333,5.666667,1.083333,1.416667,1.75,...,8.744473,8.639012,0.0,23.525,0.0,4.7,4.95,2023-01-30,84,0
2,1.333333,1.375,1.125,2.458333,8.833333,5.666667,1.083333,1.416667,1.75,2.166667,...,8.639012,6.791597,0.0,11.575,0.0,1.125,2.475,2023-01-31,84,0
3,1.375,1.125,2.458333,8.833333,5.666667,1.083333,1.416667,1.75,2.166667,4.041667,...,6.791597,7.134792,14.455,35.15,0.0,11.275,9.9,2023-02-01,84,7
4,1.125,2.458333,8.833333,5.666667,1.083333,1.416667,1.75,2.166667,4.041667,10.25,...,7.134792,8.667242,10.1325,32.5,0.0,5.825,9.9,2023-02-02,84,4


In [None]:
nyc_data_agg_final_df.shape

(16120, 342)

## **Train and Test data split**

In [None]:
from datetime import datetime

def train_test_split(df,cutoff_date,target_column_name):

    df['pickup_hour'] = pd.to_datetime(df['pickup_hour'])

    train_data = df[df.pickup_hour < cutoff_date].reset_index(drop=True)
    test_data = df[df.pickup_hour >= cutoff_date].reset_index(drop=True)

    X_train = train_data.drop(columns=[target_column_name])
    X_train = X_train.iloc[:,:-2]
    y_train = train_data[target_column_name]
    X_test = test_data.drop(columns=[target_column_name])
    X_test = X_test.iloc[:,:-2]
    y_test = test_data[target_column_name]

    return X_train, y_train, X_test, y_test



In [None]:
X_train, y_train, X_test, y_test = train_test_split(nyc_data_agg_final_df,datetime(2023, 3, 1, 0, 0, 0),"rides_next_hour_target")

## **Fitting XGBoost Model**

In [45]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)
xgb_predictions


array([2.1911447e+00, 2.9647956e+00, 9.6542616e+00, ..., 5.7757222e-03,
       5.7757222e-03, 5.7757222e-03], dtype=float32)

## **Predictions and evaluating MAE**

In [46]:
from sklearn.metrics import mean_absolute_error
test_mae_xgb = mean_absolute_error(y_test, xgb_predictions)
print(f'{test_mae_xgb=:.4f}')

test_mae_xgb=1.6221


## **Fitting LightGBM Model**

In [53]:
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X_train, y_train)

lgb_predictions = lgb_model.predict(X_test)
lgb_predictions

test_mae_lgb = mean_absolute_error(y_test, lgb_predictions)
print(f'{test_mae_lgb=:.4f}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74810
[LightGBM] [Info] Number of data points in the train set: 8060, number of used features: 339
[LightGBM] [Info] Start training from score 6.215012
test_mae_lgb=1.6144


## **Hyper Parameter Tuning of LightGBM Model**



In [None]:
pip install optuna



In [48]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna


def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = KFold(n_splits=3)
    scores = []

    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train the model
        mod_lgb = lgb.LGBMRegressor(**hyperparams)
        mod_lgb.fit(X_train_, y_train_)

        # evaluate the model
        y_pred = mod_lgb.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    # Return the mean score
    return np.array(scores).mean()

In [54]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)



[I 2024-04-22 05:01:39,714] A new study created in memory with name: no-name-32a1d793-0628-425b-a3e0-700dcedc9932
[I 2024-04-22 05:01:46,068] Trial 0 finished with value: 5.402615158533472 and parameters: {'num_leaves': 31, 'feature_fraction': 0.3169507718803224, 'bagging_fraction': 0.5419423760146582, 'min_child_samples': 100}. Best is trial 0 with value: 5.402615158533472.
[I 2024-04-22 05:01:53,832] Trial 1 finished with value: 2.8061901004179295 and parameters: {'num_leaves': 33, 'feature_fraction': 0.5472983713560933, 'bagging_fraction': 0.7217672757851845, 'min_child_samples': 16}. Best is trial 1 with value: 2.8061901004179295.
[I 2024-04-22 05:02:01,928] Trial 2 finished with value: 3.18223737287032 and parameters: {'num_leaves': 102, 'feature_fraction': 0.21526699521228343, 'bagging_fraction': 0.8376880539884093, 'min_child_samples': 27}. Best is trial 1 with value: 2.8061901004179295.
[I 2024-04-22 05:02:11,890] Trial 3 finished with value: 4.99379652417752 and parameters: {'

In [55]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 33, 'feature_fraction': 0.5472983713560933, 'bagging_fraction': 0.7217672757851845, 'min_child_samples': 16}


In [56]:
mod_lgb = lgb.LGBMRegressor(**best_params)
mod_lgb.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74810
[LightGBM] [Info] Number of data points in the train set: 8060, number of used features: 339
[LightGBM] [Info] Start training from score 6.215012


In [57]:
predictions = mod_lgb.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=1.5577


## **Fitting CatBoost Model**

In [42]:
catb_model = catboost.CatBoostRegressor(loss_function='MAE')
catb_model.fit(X_train, y_train)

0:	learn: 6.1321997	total: 157ms	remaining: 2m 36s
1:	learn: 6.0537372	total: 253ms	remaining: 2m 6s
2:	learn: 5.9949642	total: 353ms	remaining: 1m 57s
3:	learn: 5.9028998	total: 447ms	remaining: 1m 51s
4:	learn: 5.8383131	total: 572ms	remaining: 1m 53s
5:	learn: 5.7573549	total: 705ms	remaining: 1m 56s
6:	learn: 5.6911268	total: 842ms	remaining: 1m 59s
7:	learn: 5.6342055	total: 1.01s	remaining: 2m 5s
8:	learn: 5.5649190	total: 1.16s	remaining: 2m 8s
9:	learn: 5.5036476	total: 1.34s	remaining: 2m 12s
10:	learn: 5.4100279	total: 1.5s	remaining: 2m 15s
11:	learn: 5.3529910	total: 1.67s	remaining: 2m 17s
12:	learn: 5.2763217	total: 1.84s	remaining: 2m 19s
13:	learn: 5.2011669	total: 1.99s	remaining: 2m 20s
14:	learn: 5.1414006	total: 2.16s	remaining: 2m 21s
15:	learn: 5.0931493	total: 2.34s	remaining: 2m 24s
16:	learn: 5.0128492	total: 2.5s	remaining: 2m 24s
17:	learn: 4.9545966	total: 2.68s	remaining: 2m 26s
18:	learn: 4.8870358	total: 2.81s	remaining: 2m 25s
19:	learn: 4.8291674	total:

<catboost.core.CatBoostRegressor at 0x7b5639d1f760>

In [43]:
catb_predictions = catb_model.predict(X_test)
catb_predictions

test_mae_catb = mean_absolute_error(y_test, catb_predictions)
print(f'{test_mae_catb=:.4f}')

test_mae_catb=1.5315


In [44]:
catb_model. get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'MAE',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Exact',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'MAE',
 'learnin

## **Fitting AdaBoost Model**

In [None]:
adb_model = AdaBoostRegressor()
adb_model.fit(X_train, y_train)

adb_predictions = adb_model.predict(X_test)
adb_predictions

test_mae_adb = mean_absolute_error(y_test, adb_predictions)
print(f'{test_mae_adb=:.4f}')


test_mae_adb=13.8870


##**MAE value Comparision**

In [61]:

data = [
    {"Model": "XGBoost Model", "MAE": 1.6221},
    {"Model": "LightGBM Model", "MAE": 1.6144},
    {"Model": "LightGBM Model after Hyper Parameter Tuning", "MAE": 1.5577},
    {"Model": "CatBoost Model", "MAE": 1.5315},
    {"Model": "AdaBoost Model", "MAE": 13.8870}
]


In [62]:
# Creating DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Model,MAE
0,XGBoost Model,1.6221
1,LightGBM Model,1.6144
2,LightGBM Model after Hyper Parameter Tuning,1.5577
3,CatBoost Model,1.5315
4,AdaBoost Model,13.887


Based on the Mean Absolute Error Metric, CatBoost Model performed well with least MAE out of all those boosting techniques.

