In [1]:
import pandas as pd
from datetime import datetime
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso 
  
from src.paths import TEMPORARY_DATA, TRAINING_DATA
from src.hyperparameter_tuning import optimise_hyperparams

  from .autonotebook import tqdm as notebook_tqdm


## Importing and making some changes to the datasets

In [None]:
# Locations of the raw starts/stops training tables.
starts_path = TRAINING_DATA / "mar_to_jan_starts.parquet"
stops_path = TRAINING_DATA / "jan_to_jan_stops.parquet"

# Read each table and order its rows by index.
start_table = pd.read_parquet(starts_path).sort_index()
stop_table = pd.read_parquet(stops_path).sort_index()

## Reducing the Memory Footprint of the Data

### Changing data types to reduce memory load

In [None]:
import numpy as np
from tqdm import tqdm
from src.miscellaneous import change_column_data_type

# Convert every column whose name contains "trips" to np.int8, in both tables.
# NOTE(review): np.int8 only represents -128..127 -- confirm that no trips
# column can exceed that range, otherwise a wider integer type is needed.
for table in [start_table, stop_table]:
    trip_columns = [name for name in table.columns if "trips" in name]

    for name in trip_columns:
        change_column_data_type(data=table, columns=name, to_format=np.int8)

In [None]:
# Write both tables under the "integers" directory.
# (Presumably these are the versions whose trips columns were converted by the
# preceding cell -- confirm that change_column_data_type works in place.)
integer_starts_path = TRAINING_DATA / "integers/mar_to_jan_starts.parquet"
integer_stops_path = TRAINING_DATA / "integers/jan_to_jan_stops.parquet"

start_table.to_parquet(integer_starts_path)
stop_table.to_parquet(integer_stops_path)

### Choosing how many rows of the data to drop to reduce memory load

In [None]:
from datetime import datetime

# Keep only the rows whose start_hour is strictly later than 2023-06-01 00:00,
# i.e. the data from June 2023 onwards (the midnight hour itself is excluded).
june_cutoff = datetime(2023, 6, 1)
is_after_cutoff = start_table["start_hour"] > june_cutoff
start_june = start_table.loc[is_after_cutoff]

In [None]:
# Save the reduced starts table so later cells can reload it from disk.
start_june_path = TRAINING_DATA / "integers/start_june.parquet"
start_june.to_parquet(start_june_path)

## Tuning on Starts Data

In [2]:
# Reload the reduced starts table that was written to the "integers" directory.
start_june_path = TRAINING_DATA / "integers/start_june.parquet"
start_june = pd.read_parquet(start_june_path)

In [4]:
# Hyperparameter search for Lasso on the starts data (5 trials).
# Features are every column except the target, "trips_next_hour".
best_lasso_hyperparams_for_starts = optimise_hyperparams(
    model_fn=Lasso,
    hyperparam_trials=5,
    scenario="start",
    X=start_june.drop(columns="trips_next_hour"),
    y=start_june["trips_next_hour"],
)

ClearML Task: created new task id=4b5b27e8f8384b0fb224fd4399e98e83
2024-02-10 19:31:38,600 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/1b181c40a8b243d5b50eb2bc14ea4d45/experiments/4b5b27e8f8384b0fb224fd4399e98e83/output/log
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


divvy_trips - INFO - Beginning hyperparameter search
[I 2024-02-10 19:31:53,698] A new study created in memory with name: no-name-b47c7789-e378-43b3-9573-5bc9b95ba047
divvy_trips - INFO - Start Trial 0
divvy_trips - INFO - Performing split number 0
divvy_trips - INFO - Performing split number 1


ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


divvy_trips - INFO - Performing split number 2
divvy_trips - INFO - Performing split number 3
divvy_trips - INFO - Performing split number 4
[I 2024-02-10 19:47:38,491] Trial 0 finished with value: 0.012503867935649032 and parameters: {'alpha': 0.015864111625064632}. Best is trial 0 with value: 0.012503867935649032.
divvy_trips - INFO - Start Trial 1
divvy_trips - INFO - Performing split number 0
divvy_trips - INFO - Performing split number 1
divvy_trips - INFO - Performing split number 2
divvy_trips - INFO - Performing split number 3
divvy_trips - INFO - Performing split number 4
[I 2024-02-10 19:48:43,227] Trial 1 finished with value: 0.03251249750141218 and parameters: {'alpha': 0.874849790517695}. Best is trial 0 with value: 0.012503867935649032.
divvy_trips - INFO - Start Trial 2
divvy_trips - INFO - Performing split number 0
divvy_trips - INFO - Performing split number 1
divvy_trips - INFO - Performing split number 2
divvy_trips - INFO - Performing split number 3
divvy_trips - IN

#### LGBMRegressor

In [3]:
# Hyperparameter search for LGBMRegressor on the starts data (5 trials).
# Features are every column except the target, "trips_next_hour".
# The result is stored under an "lgb" name (matching
# best_lgb_hyperparams_for_stops) rather than the previous "lasso" name, which
# mislabelled LightGBM results as Lasso results.
best_lgb_hyperparams_for_starts = optimise_hyperparams(
    model_fn=LGBMRegressor,
    hyperparam_trials=5,
    scenario="start",
    X=start_june.drop(columns="trips_next_hour"),
    y=start_june["trips_next_hour"],
)

ClearML Task: created new task id=19efa9714ca74369ae8b55c0d6175941
2024-02-11 21:30:22,171 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/1b181c40a8b243d5b50eb2bc14ea4d45/experiments/19efa9714ca74369ae8b55c0d6175941/output/log


divvy_trips - INFO - Beginning hyperparameter search
[I 2024-02-11 21:30:32,914] A new study created in memory with name: no-name-ca415d73-2471-4ae7-a491-d163298a4b76
divvy_trips - INFO - Start Trial 0
divvy_trips - INFO - Performing split number 0


ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


divvy_trips - INFO - Performing split number 1
divvy_trips - INFO - Performing split number 2
divvy_trips - INFO - Performing split number 3
divvy_trips - INFO - Performing split number 4
[I 2024-02-11 21:32:48,000] Trial 0 finished with value: 0.008906199082582935 and parameters: {'num_leaves': 20, 'max_depth': 6, 'n_estimators': 69, 'learning_rate': 0.1335340030140376, 'importance_type': 'split', 'subsample': 1, 'feature_fraction': 0.6993501719351904, 'bagging_fraction': 0.3542643035202445}. Best is trial 0 with value: 0.008906199082582935.
divvy_trips - INFO - Start Trial 1
divvy_trips - INFO - Performing split number 0
divvy_trips - INFO - Performing split number 1
divvy_trips - INFO - Performing split number 2


ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


divvy_trips - INFO - Performing split number 3
divvy_trips - INFO - Performing split number 4
[I 2024-02-11 21:35:42,166] Trial 1 finished with value: 0.008658780525690862 and parameters: {'num_leaves': 20, 'max_depth': 4, 'n_estimators': 43, 'learning_rate': 0.206476979086468, 'importance_type': 'split', 'subsample': 1, 'feature_fraction': 0.2869926513900841, 'bagging_fraction': 0.6867430854647192}. Best is trial 1 with value: 0.008658780525690862.
divvy_trips - INFO - Start Trial 2
divvy_trips - INFO - Performing split number 0
divvy_trips - INFO - Performing split number 1
divvy_trips - INFO - Performing split number 2
divvy_trips - INFO - Performing split number 3
divvy_trips - INFO - Performing split number 4
[I 2024-02-11 21:37:47,116] Trial 2 finished with value: 0.008548896283498654 and parameters: {'num_leaves': 44, 'max_depth': 9, 'n_estimators': 76, 'learning_rate': 0.47953726836882615, 'importance_type': 'gain', 'subsample': 1, 'feature_fraction': 0.9565789882653105, 'baggi

Retrying (Retry(total=233, connect=237, read=236, redirect=240, status=240)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7efc18de7f10>: Failed to resolve 'api.clear.ml' ([Errno -2] Name or service not known)")': /v2.23/tasks.get_all


#### XGBRegressor

In [None]:
# Hyperparameter search for XGBRegressor on the starts data (5 trials).
# Features are every column except the target, "trips_next_hour".
# The result is stored under a "starts" name: it was previously bound to
# best_xgb_hyperparams_for_stops, which the stops tuning cell reassigns, so the
# starts result was silently lost.
best_xgb_hyperparams_for_starts = optimise_hyperparams(
    model_fn=XGBRegressor,
    hyperparam_trials=5,
    scenario="start",
    X=start_june.drop(columns="trips_next_hour"),
    y=start_june["trips_next_hour"],
)

## Tuning on Stops Data

#### Lasso

In [None]:
# Hyperparameter search for Lasso on the stops data (5 trials).
# Features are every column except the target, "trips_next_hour".
best_lasso_hyperparams_for_stops = optimise_hyperparams(
    model_fn=Lasso,
    hyperparam_trials=5,
    scenario="stop",
    X=stop_table.drop(columns="trips_next_hour"),
    y=stop_table["trips_next_hour"],
)

#### LGBMRegressor

In [None]:
# Hyperparameter search for LGBMRegressor on the stops data (5 trials).
# Features are every column except the target, "trips_next_hour".
best_lgb_hyperparams_for_stops = optimise_hyperparams(
    model_fn=LGBMRegressor,
    hyperparam_trials=5,
    scenario="stop",
    X=stop_table.drop(columns="trips_next_hour"),
    y=stop_table["trips_next_hour"],
)

#### XGBRegressor

In [None]:
# Hyperparameter search for XGBRegressor on the stops data (5 trials).
# Features are every column except the target, "trips_next_hour".
best_xgb_hyperparams_for_stops = optimise_hyperparams(
    model_fn=XGBRegressor,
    hyperparam_trials=5,
    scenario="stop",
    X=stop_table.drop(columns="trips_next_hour"),
    y=stop_table["trips_next_hour"],
)