In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import TimeSeriesSplit

<center><b>Data Prep</b></center>

In [3]:
data = pd.read_parquet(
    'd:/uber-taxi-demand/data/yellow_trip_features_v4.parquet'
)

data.head().T

Unnamed: 0,0,1,2,3,4
timestamp,2022-01-02 00:00:00,2022-01-02 01:00:00,2022-01-02 02:00:00,2022-01-02 03:00:00,2022-01-02 04:00:00
taxiDemand,1483,1016,703,502,357
timestamp_day_of_week,6,6,6,6,6
passengerDemand_window_7_std,1132.856188,1264.258469,1292.484006,1294.830068,1351.861486
passengerDemand_expanding_std,1926.172935,1920.277885,1941.251618,1983.175723,2029.912596
passengerDemand_lag_8,5754.0,5881.0,5875.0,5296.0,4214.0
passengerDemand_lag_24,6838.0,7738.0,5974.0,4107.0,2485.0
timestamp_year_start,0,0,0,0,0
timestamp_month_end,0,0,0,0,0
passengerDemand_lag_1,2798.0,2288.0,1594.0,1047.0,776.0


In [4]:
tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(data):
    train, test = data.loc[train_index], data.loc[test_index]

In [5]:
train.shape

(7261, 17)

In [6]:
test.shape

(1452, 17)

<center><b>XGBoost</b></center>

In [19]:
import mlflow
from mlflow.xgboost import autolog
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_log_error, r2_score
import xgboost as xgb
from scipy.stats import uniform, randint

In [8]:
if not mlflow.get_experiment_by_name("Uber Taxi-Demand"):
    mlflow.create_experiment("Uber Taxi-Demand")
else:
    print(mlflow.get_experiment_by_name("Uber Taxi-Demand"))

<Experiment: artifact_location='file:///D:/uber-taxi-demand/notebooks/c_ModelExperimentation/mlruns/954115172674949808', creation_time=1703384061341, experiment_id='954115172674949808', last_update_time=1703384061341, lifecycle_stage='active', name='Uber Taxi-Demand', tags={}>


In [9]:
mlflow.set_experiment('Uber Taxi-Demand')
### Start an MLflow run
with mlflow.start_run():
    

    ### Train and Evaluate an XGBoost model
    params = {'objective': 'reg:squarederror', 'colsample_bytree': 0.3, 'learning_rate': 0.1,
              'max_depth': 5, 'alpha': 10, 'n_estimators': 10}
    
    scores = []
    tscv = TimeSeriesSplit(n_splits=5)

    for train_index, test_index in tscv.split(data):
        train, test = data.loc[train_index], data.loc[test_index]
        X_train, y_train = train.drop(columns=['timestamp', 'taxiDemand']), train['taxiDemand']
        X_test, y_test = test.drop(columns=['timestamp', 'taxiDemand']), test['taxiDemand']
        
        ## Train
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train)
        
        ## Evaluate
        predictions = model.predict(X_test)
        score = [
            mean_squared_log_error(y_test, predictions),
            mean_absolute_percentage_error(y_test, predictions),
            r2_score(y_test, predictions)
        ]
        scores.append(score)

    ### Log Params
    mlflow.log_params(params=params)
    
    ### Log metrics
    scores_avg = np.array(scores).mean(axis=1)
    mlflow.log_metric("msle", scores_avg[0])
    mlflow.log_metric("mape", scores_avg[1])
    mlflow.log_metric("r2", scores_avg[2])
        
    ### Log the XGBoost model with autologging
    mlflow.xgboost.autolog(
        importance_types=None,
        log_input_examples=False,
        log_model_signatures=True,
        log_models=True,
        log_datasets=False,
        disable=False,
        exclusive=False,
        disable_for_unsupported_versions=False,
        silent=False,
        registered_model_name=None,
        model_format='xgb',
        extra_tags=None,
    )



    ### Save the model in MLflow format
    mlflow.xgboost.log_model(model, "XGBR")


In [10]:
xgb.XGBRegressor?

In [15]:
param_dist = {
    'learning_rate': uniform(0.01, 0.3),        # Learning rate
    'n_estimators': randint(50, 200),             # Number of trees
    'max_depth': randint(3, 10),                 # Maximum depth of a tree
    'subsample': uniform(0.5, 1.0),              # Subsample ratio of the training instances
    'colsample_bytree': uniform(0.5, 1.0),       # Subsample ratio of columns when constructing each tree
    'gamma': uniform(0, 1),                      # Minimum loss reduction required to make a further partition on a leaf node of the tree
    'reg_alpha': uniform(0, 1),                  # L1 regularization term on weights
    'reg_lambda': uniform(0, 1)                  # L2 regularization term on weights
}

from itertools import permutations, product

In [18]:
product(list(np.(0.01, 0.3)), list(randint(50, 200)))

TypeError: 'rv_continuous_frozen' object is not iterable