In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from datetime import date, timedelta, datetime
from src.data_split import train_test_split
from src.model import get_pipeline
import src.config as cfg
import hopsworks
import pandas as pd
import pytz
import numpy as np
import optuna




In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# connect to the project
project = hopsworks.login(
    project=cfg.HOPSWORKS_PROJECT_NAME,
    api_key_value=cfg.HOPSWORKS_API_KEY
)

# connect to the feature store
feature_store = project.get_feature_store()

# connect to the feature grou
feature_group = feature_store.get_or_create_feature_group(
    name=cfg.FEATURE_GROUP_NAME,
    version=cfg.FEATURE_GROUP_VERSION,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/603286
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
try:
    # create feature view if it doesn't exist yet
    feature_store.create_feature_view(
        name=cfg.FEATURE_VIEW_NAME,
        version=cfg.FEATURE_VIEW_VERSION,
        query=feature_group.select_all()
    )
except:
    print('Feature view already existed. Skip creation.')


# get feature view
feature_view = feature_store.get_feature_view(
    name=cfg.FEATURE_VIEW_NAME,
    version=cfg.FEATURE_VIEW_VERSION
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/603286/fs/599109/fv/ts_stations_hourly_feature_view/version/1


In [7]:
ts_pax_data, _ = feature_view.training_data(
    description='Time-series hourly total passengers flow',
)
ts_pax_data

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (114.88s) 




Unnamed: 0,hour_of_entry,line,station,total_pax
0,2023-09-22 20:00:00+00:00,LineaC,Independencia,1.0
1,2024-07-23 17:00:00+00:00,LineaB,Carlos Gardel,1021.0
2,2022-10-05 12:00:00+00:00,LineaA,Castro Barros,702.0
3,2022-12-06 18:00:00+00:00,LineaA,Plaza Miserere,1020.0
4,2024-03-17 06:00:00+00:00,LineaB,Tronador,0.0
...,...,...,...,...
772480,2023-07-20 19:00:00+00:00,LineaB,Federico Lacroze,659.0
772481,2022-10-29 01:00:00+00:00,LineaB,Medrano,0.0
772482,2023-05-31 05:00:00+00:00,LineaA,San Pedrito,22.0
772483,2023-07-15 08:00:00+00:00,LineaB,Echeverria,781.0


In [10]:
# Sort the combined DataFrame by 'hour_of_entry' in ascending order
ts_pax_data = ts_pax_data.sort_values(by='hour_of_entry', ascending=True).reset_index(drop=True)

# Create a label encoder object
label_encoder = LabelEncoder()

#  Transform 'hour_of_entry to Timestamp
ts_pax_data['hour_of_entry'] = pd.to_datetime(ts_pax_data['hour_of_entry'], utc=True)

# Apply label encoding to 'line' and 'station'
ts_pax_data['line'] = label_encoder.fit_transform(ts_pax_data['line'])
ts_pax_data['station'] = label_encoder.fit_transform(ts_pax_data['station'])

# Initialize the scaler
#scaler = MinMaxScaler()
#ts_pax_data['total_pax'] = scaler.fit_transform(ts_pax_data[['total_pax']])
#ts_pax_data
ts_pax_data







Unnamed: 0,hour_of_entry,line,station,total_pax
0,2022-09-19 23:00:00+00:00,0,8,17.0
1,2022-09-19 23:00:00+00:00,2,18,34.0
2,2022-09-19 23:00:00+00:00,1,38,17.0
3,2022-09-19 23:00:00+00:00,1,42,12.0
4,2022-09-19 23:00:00+00:00,1,23,11.0
...,...,...,...,...
772480,2024-09-21 13:00:00+00:00,1,4,609.0
772481,2024-09-21 13:00:00+00:00,2,11,206.0
772482,2024-09-21 13:00:00+00:00,2,18,278.0
772483,2024-09-21 13:00:00+00:00,2,40,220.0


In [11]:
from src.data_training import transform_ts_data_into_features_and_target

features, targets = transform_ts_data_into_features_and_target(
    ts_pax_data,
    input_seq_len=24*14, # Two weeks
    step_size=23, # Hourly step
    output_seq_len=3 # Three days
)

# Initialize MinMaxScaler
#scaler_features = MinMaxScaler()
#scaler_targets = MinMaxScaler()

# Apply MinMaxScaler to the feature columns and target columns
#scaled_features = scaler_features.fit_transform(features.filter(like='total_pax_previous'))
#scaled_targets = scaler_targets.fit_transform(targets)

# Convert the scaled features and targets back into DataFrames
#scaled_features_df = pd.DataFrame(scaled_features, columns=features.filter(like='total_pax_previous').columns)
#scaled_targets_df = pd.DataFrame(scaled_targets, columns=targets.columns)

# Add back non-normalized columns (e.g., 'hour_of_entry', 'station')
#scaled_features_df['hour_of_entry'] = features['hour_of_entry']
#scaled_features_df['station'] = features['station']

# Concatenate scaled features and targets
#features_and_target = pd.concat([scaled_features_df, scaled_targets_df], axis=1)

features_and_target = features.copy()
features_and_target = pd.concat([features_and_target, targets], axis=1) 
#features_and_target['target_rides_next_hour'] = targets

print(f'{features_and_target.shape=}')

100%|██████████| 44/44 [00:13<00:00,  3.32it/s]

features_and_target.shape=(32949, 342)





In [12]:
print(f'{features_and_target.shape=}')
features_and_target

features_and_target.shape=(32949, 342)


Unnamed: 0,total_pax_previous_336_hour,total_pax_previous_335_hour,total_pax_previous_334_hour,total_pax_previous_333_hour,total_pax_previous_332_hour,total_pax_previous_331_hour,total_pax_previous_330_hour,total_pax_previous_329_hour,total_pax_previous_328_hour,total_pax_previous_327_hour,...,total_pax_previous_4_hour,total_pax_previous_3_hour,total_pax_previous_2_hour,total_pax_previous_1_hour,hour_of_entry,station,line,total_pax_next_1_hour,total_pax_next_2_hour,total_pax_next_3_hour
0,17.0,0.0,0.0,0.0,0.0,0.0,2.0,114.0,319.0,914.0,...,400.0,507.0,209.0,206.0,2022-10-03 23:00:00+00:00,8,0,18.0,0.0,0.0
1,257.0,14.0,0.0,0.0,0.0,0.0,0.0,4.0,107.0,206.0,...,902.0,406.0,489.0,255.0,2022-10-04 22:00:00+00:00,8,0,204.0,22.0,0.0
2,184.0,183.0,27.0,0.0,0.0,0.0,0.0,0.0,1.0,113.0,...,627.0,906.0,392.0,483.0,2022-10-05 21:00:00+00:00,8,0,229.0,203.0,23.0
3,522.0,177.0,232.0,27.0,0.0,0.0,0.0,0.0,0.0,4.0,...,980.0,678.0,926.0,431.0,2022-10-06 20:00:00+00:00,8,0,576.0,238.0,231.0
4,372.0,542.0,284.0,194.0,12.0,0.0,0.0,0.0,0.0,0.0,...,204.0,331.0,236.0,375.0,2022-10-07 19:00:00+00:00,8,0,223.0,355.0,149.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32944,303.0,70.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,683.0,563.0,1063.0,370.0,2024-09-16 22:00:00+00:00,24,1,404.0,86.0,9.0
32945,130.0,102.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,219.0,...,274.0,431.0,187.0,34.0,2024-09-17 21:00:00+00:00,24,1,26.0,9.0,0.0
32946,730.0,282.0,275.0,43.0,0.0,0.0,0.0,0.0,0.0,5.0,...,1314.0,877.0,1567.0,592.0,2024-09-18 20:00:00+00:00,24,1,757.0,263.0,284.0
32947,697.0,754.0,344.0,347.0,58.0,0.0,0.0,0.0,0.0,0.0,...,711.0,1432.0,1094.0,1653.0,2024-09-19 19:00:00+00:00,24,1,638.0,747.0,338.0


In [13]:
features_and_target.hour_of_entry.max()

Timestamp('2024-09-20 18:00:00+0000', tz='UTC')

In [16]:
# Define Argentina's timezone (GMT-3)
argentina_tz = pytz.timezone('America/Argentina/Buenos_Aires')

# Get the current date and time in Argentina
#cutoff_date = datetime.now(argentina_tz).replace(minute=0, second=0, microsecond=0).strftime('%Y-%m-%d %H:%M:%S')
# Round down (floor) to the nearest hour by setting minutes, seconds, and microseconds to 0
#cutoff_date = pd.to_datetime(current_time_in_argentina) - timedelta(hours=3)

# Define the cutoff_date (timezone-aware)
cutoff_date = pd.Timestamp("2024-09-06 18:00:00").tz_localize('UTC')

X_train, y_train, X_test, y_test = train_test_split(
    features_and_target,
    cutoff_date,
    targets_columns_names=[c for c in features_and_target.columns if c.startswith('total_pax_next')] 
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')
print({X_train.hour_of_entry.min()}, {X_train.hour_of_entry.max()})
print({X_test.hour_of_entry.min()}, {X_test.hour_of_entry.max()})

X_train.shape=(32289, 339)
y_train.shape=(32289, 3)
X_test.shape=(660, 339)
y_test.shape=(660, 3)
{Timestamp('2022-10-03 23:00:00+0000', tz='UTC')} {Timestamp('2024-09-06 09:00:00+0000', tz='UTC')}
{Timestamp('2024-09-07 02:00:00+0000', tz='UTC')} {Timestamp('2024-09-20 18:00:00+0000', tz='UTC')}


In [17]:
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
    }
       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [18]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1)

[I 2024-09-21 20:28:12,196] A new study created in memory with name: no-name-7b9fe428-4356-4296-99c6-044b057f6940


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

[I 2024-09-21 20:28:44,662] Trial 0 finished with value: 73.41616555538712 and parameters: {'num_leaves': 74, 'feature_fraction': 0.3414660191010661, 'bagging_fraction': 0.3316660761155541, 'min_child_samples': 16, 'max_depth': 31, 'learning_rate': 0.1004626841311671, 'n_estimators': 97}. Best is trial 0 with value: 73.41616555538712.


In [19]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 74, 'feature_fraction': 0.3414660191010661, 'bagging_fraction': 0.3316660761155541, 'min_child_samples': 16, 'max_depth': 31, 'learning_rate': 0.1004626841311671, 'n_estimators': 97}


In [20]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)




[LightGBM] [Info] Total Bins 85815
[LightGBM] [Info] Number of data points in the train set: 32289, number of used features: 343
[LightGBM] [Info] Start training from score 367.151940
[LightGBM] [Info] Total Bins 85815
[LightGBM] [Info] Number of data points in the train set: 32289, number of used features: 343
[LightGBM] [Info] Start training from score 370.474713
[LightGBM] [Info] Total Bins 85815
[LightGBM] [Info] Number of data points in the train set: 32289, number of used features: 343
[LightGBM] [Info] Start training from score 365.153613


In [22]:
# Make predictions for dataset
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.2f}')




test_mae=29.32


In [30]:
predictions

array([[ 3.82582732e+02,  7.23296449e+02,  5.01556892e+02],
       [ 6.12594681e+02,  4.06858544e+02,  8.15502520e+02],
       [ 3.72542849e+02,  6.30166635e+02,  3.87706241e+02],
       ...,
       [ 4.44253959e+01,  7.04885164e+00,  1.17965807e+00],
       [ 2.05832799e+02,  5.08550159e+01,  4.08292634e+00],
       [ 1.25113627e+02,  1.16643086e+02, -6.19208796e-01]])

In [23]:
import joblib
import pickle
from src.paths import MODELS_DIR

# Save the model for pushing it into the registry
MODELS_DIR_ = MODELS_DIR / 'model_222324.pkl'
with open(MODELS_DIR_, "wb") as f:
    pickle.dump(pipeline, f)

In [24]:
import pickle
from src.paths import MODELS_DIR

# Load the model from the pickle file
MODELS_DIR_ = MODELS_DIR / 'model_222324.pkl'
with open(MODELS_DIR_, "rb") as f:
    loaded_model = pickle.load(f)

In [26]:
from src.logger import get_logger
from comet_ml import Experiment
from dotenv import load_dotenv
from src.paths import PARENT_DIR
import src.config as config
import os

load_dotenv(PARENT_DIR / '.env')
COMET_ML_API_KEY = os.environ["COMET_ML_API_KEY"]
COMET_ML_WORKSPACE = os.environ["COMET_ML_WORKSPACE"]
COMET_ML_PROJECT_NAME = os.environ['COMET_ML_PROJECT_NAME']

experiment = Experiment(api_key=COMET_ML_API_KEY, project_name=COMET_ML_PROJECT_NAME, auto_output_logging=None)

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : severe_chapel_2653
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/simonmontt/ba-passenger-flow/32cac82565cc4adea194eea3355de4ae
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     environment details : 1
[1;38;5;39mCOMET INFO:[0m     filename            : 1
[1;38;5;39mCOMET INFO:[0m     git metadata        : 1
[1;38;5;39mCOMET INFO:[0m     installed packages  : 1
[1;38;5;39mCOMET INFO:[0m     notebook            : 1
[1;38;5;39mCOMET INFO:[0m     source_code         : 1
[1;38;5;39mCOMET INFO:[0m 




In [27]:
from src.model_registry import push_model_to_registry
model_comet = push_model_to_registry(model=loaded_model, model_name='subwayBA_passenger_flow_updt') 

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : moral_stain_9552
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/simonmontt/ba-passenger-flow/0830543dae8a42faa3028a0f883d7642
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     environment details : 1
[1;38;5;39mCOMET INFO:[0m     filename            : 1
[1;38;5;39mCOMET INFO:[0m     git metadata        : 1
[1;38;5;39mCOMET INFO:[0m     installed packages  : 1
[1;38;5;39mCOMET INFO:[0m     notebook            : 1
[1;38;5;39mCOMET INFO:[0m     source_code         : 1
[1;38;5;39mCOMET INFO:[0m 


2024-09-21 20:30:35,636 INFO: Starting logging model to Comet ML
2024-09-21 20:30:35,697 INFO: Finished logging model subwayBA_passenger_flow_updt
2024-09-21 20:30:35,698 INFO: Pushing model to the registry as "Production"


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : moral_stain_9552
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/simonmontt/ba-passenger-flow/0830543dae8a42faa3028a0f883d7642
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     model-element : 1 (1.88 MB)
[1;38;5;39mCOMET INFO:[0m 
[1;38;5;39mCOMET INFO:[0m Please wait for metadata to finish uploading (timeout is 3600 seconds)
[1;38;5;39mCOMET INFO:[0m Uploading 18 metrics, params and output messages
[1;38;5;39mCOMET INFO:[0m Uploading 13 metrics, params and output messages
[1;38;5;39mCOME