In [1]:
import pandas as pd
from datetime import datetime
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso 
  
from src.paths import TEMPORARY_DATA, TRAINING_DATA
from src.hyperparameter_tuning import optimise_hyperparams

  from .autonotebook import tqdm as notebook_tqdm


## Importing and making some changes to the datasets

In [2]:
# Load the starts and stops training tables from the training-data folder,
# sorting each one by its index as it is read.
start_table, stop_table = [
    pd.read_parquet(TRAINING_DATA / file_name).sort_index()
    for file_name in ("mar_to_jan_starts.parquet", "jan_to_jan_stops.parquet")
]

## Tuning on Starts Data

### Changing data types to reduce memory load

In [None]:
import numpy as np
from tqdm import tqdm
from src.miscellaneous import change_column_data_type

# Convert every column whose name contains "trips" to 8-bit integers, in both
# the starts and the stops table. The substring test also matches the target
# column "trips_next_hour".
# NOTE(review): np.int8 can only hold -128..127 -- confirm that no hourly trip
# count in either table exceeds 127 before relying on this downcast.
# NOTE(review): the helper's return value is discarded, so it presumably
# modifies `dataset` in place -- confirm in src.miscellaneous.
for dataset in [start_table, stop_table]:
    for col in [name for name in dataset.columns if "trips" in name]:
        change_column_data_type(data=dataset, columns=col, to_format=np.int8)

In [None]:
# Save both tables under the "integers" sub-folder of the training-data folder.
start_table.to_parquet(path=TRAINING_DATA / "integers/mar_to_jan_starts.parquet")
stop_table.to_parquet(path=TRAINING_DATA / "integers/jan_to_jan_stops.parquet")

### Restricting the starts data to June 2023 onwards to reduce memory load

In [4]:
# Keep only the rows whose "start_hour" is strictly later than 2023-06-01 00:00,
# i.e. June 2023 onwards (a row stamped exactly at midnight on 1 June is excluded).
# `datetime` comes from the import cell at the top of the notebook.
is_after_june_first = start_table["start_hour"] > datetime(2023, 6, 1)
start_june = start_table.loc[is_after_june_first]

In [6]:
# Cache the filtered starts table so it can be reloaded without repeating the filter.
start_june.to_parquet(path=TRAINING_DATA / "start_june.parquet")

In [2]:
# Reload the cached, filtered starts table from the training-data folder.
start_june = pd.read_parquet(path=TRAINING_DATA / "start_june.parquet")

In [4]:
# Hyperparameter search for Lasso on the starts data (5 trials).
# Features are every column except "trips_next_hour"; that column is the target.
# NOTE(review): the result is presumably the best hyperparameters found --
# confirm against src.hyperparameter_tuning.optimise_hyperparams.
best_lasso_hyperparams_for_starts = optimise_hyperparams(
    model_fn=Lasso,
    hyperparam_trials=5,
    scenario="start",
    X=start_june.drop(columns="trips_next_hour"),
    y=start_june["trips_next_hour"],
)

divvy_trips - INFO - Beginning hyperparameter search
[I 2024-02-09 14:29:55,220] A new study created in memory with name: no-name-f8550a6d-cf0c-45e4-b896-bac2cfdb7876
divvy_trips - INFO - Start Trial 0
divvy_trips - INFO - Performing split number 0
divvy_trips - INFO - Performing split number 1
divvy_trips - INFO - Performing split number 2


#### LGBMRegressor

In [None]:
# Hyperparameter search for LGBMRegressor on the starts data (5 trials).
# Features are every column except "trips_next_hour"; that column is the target.
# The result is stored under an "lgb" name because the model being tuned here
# is LGBMRegressor, not Lasso.
# NOTE(review): the result is presumably the best hyperparameters found --
# confirm against src.hyperparameter_tuning.optimise_hyperparams.
best_lgb_hyperparams_for_starts = optimise_hyperparams(
    model_fn=LGBMRegressor,
    hyperparam_trials=5,
    scenario="start",
    X=start_june.drop(columns="trips_next_hour"),
    y=start_june["trips_next_hour"],
)

#### XGBRegressor

In [None]:
# Hyperparameter search for XGBRegressor on the starts data (5 trials).
# Features are every column except "trips_next_hour"; that column is the target.
# The result is stored under a starts-specific name so that the XGBRegressor
# search on the stops data further down the notebook does not overwrite it.
# NOTE(review): the result is presumably the best hyperparameters found --
# confirm against src.hyperparameter_tuning.optimise_hyperparams.
best_xgb_hyperparams_for_starts = optimise_hyperparams(
    model_fn=XGBRegressor,
    hyperparam_trials=5,
    scenario="start",
    X=start_june.drop(columns="trips_next_hour"),
    y=start_june["trips_next_hour"],
)

## Tuning on Stops Data

#### Lasso

In [None]:
# Hyperparameter search for Lasso on the full stops table (5 trials).
# Features are every column except "trips_next_hour"; that column is the target.
best_lasso_hyperparams_for_stops = optimise_hyperparams(
    model_fn=Lasso,
    hyperparam_trials=5,
    scenario="stop",
    X=stop_table.drop(columns="trips_next_hour"),
    y=stop_table["trips_next_hour"],
)

#### LGBMRegressor

In [None]:
# Hyperparameter search for LGBMRegressor on the full stops table (5 trials).
# Features are every column except "trips_next_hour"; that column is the target.
best_lgb_hyperparams_for_stops = optimise_hyperparams(
    model_fn=LGBMRegressor,
    hyperparam_trials=5,
    scenario="stop",
    X=stop_table.drop(columns="trips_next_hour"),
    y=stop_table["trips_next_hour"],
)

#### XGBRegressor

In [None]:
# Hyperparameter search for XGBRegressor on the full stops table (5 trials).
# Features are every column except "trips_next_hour"; that column is the target.
best_xgb_hyperparams_for_stops = optimise_hyperparams(
    model_fn=XGBRegressor,
    hyperparam_trials=5,
    scenario="stop",
    X=stop_table.drop(columns="trips_next_hour"),
    y=stop_table["trips_next_hour"],
)