# Starts

In [1]:
import pandas as pd
from src.paths import TRANSFORMED_DATA

from datetime import datetime
from src.data_splitting import train_test_split

import lightgbm as lgb 
from sklearn.metrics import mean_absolute_error

In [2]:
starts_table = pd.read_parquet(TRANSFORMED_DATA/"starts_table.parquet")

## Train-test split

In [3]:
start_x_train, start_y_train, start_x_test, start_y_test = train_test_split(
    data = starts_table,
    scenario = "start",
    cutoff_date = datetime(2023,6,1,0,0,0),
    target_column = "trips_next_hour"
)

print(f"{start_x_train.shape=}")
print(f"{start_y_train.shape=}")
print(f"{start_x_test.shape=}")
print(f"{start_y_test.shape=}")

start_x_train.shape=(225090, 674)
start_y_train.shape=(225090,)
start_x_test.shape=(333060, 674)
start_y_test.shape=(333060,)


## LightGBM

In [4]:
start_trip_columns = [column for column in start_x_train.columns if column.startswith("trips_")]
start_x_train_numeric = start_x_train[start_trip_columns]

In [5]:
model = lgb.LGBMRegressor()
model.fit(start_x_train_numeric, start_y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.502542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11643
[LightGBM] [Info] Number of data points in the train set: 225090, number of used features: 672
[LightGBM] [Info] Start training from score 0.068897


In [6]:
start_x_test_numeric = start_x_test[start_trip_columns]
start_predictions = model.predict(start_x_test_numeric)

start_predictions

array([0.00755774, 0.02548266, 0.02465195, ..., 0.00536388, 0.00395872,
       0.02641365])

In [7]:
test_mae = mean_absolute_error(start_y_test, predictions)

test_mae

0.16585199387693633

# Stops

In [8]:
stops_table = pd.read_parquet(TRANSFORMED_DATA/"stops_table.parquet")

## Train-test split

In [9]:
stop_x_train, stop_y_train, stop_x_test, stop_y_test = train_test_split(
    data = stops_table,
    scenario = "stop",
    cutoff_date = datetime(2023,6,1,0,0,0), 
    target_column = "trips_next_hour"
)

print(f"{stop_x_train.shape=}")
print(f"{stop_y_train.shape=}")
print(f"{stop_x_test.shape=}")
print(f"{stop_y_test.shape=}")

stop_x_train.shape=(152520, 674)
stop_y_train.shape=(152520,)
stop_x_test.shape=(226920, 674)
stop_y_test.shape=(226920,)


## LightGBM

In [11]:
stop_trip_columns = [column for column in stop_x_train.columns if column.startswith("trips_")]
stop_x_train_numeric = stop_x_train[stop_trip_columns]

In [12]:
model = lgb.LGBMRegressor()
model.fit(start_x_train_numeric, start_y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.507441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11643
[LightGBM] [Info] Number of data points in the train set: 225090, number of used features: 672
[LightGBM] [Info] Start training from score 0.068897


In [16]:
stop_x_test_numeric = stop_x_test[stop_trip_columns]
stop_predictions = model.predict(stop_x_test_numeric)

stop_predictions

array([0.00249174, 0.01722918, 0.0133472 , ..., 0.00249174, 0.00249174,
       0.00249174])

In [17]:
stop_test_mae = mean_absolute_error(stop_y_test, stop_predictions)

stop_test_mae

0.2491278259205201