In [5]:
import os
import numpy as np
import math
import pandas as pd

from datetime import datetime
from datetime import datetime
from pydantic import BaseModel
from sqlalchemy import ForeignKey, create_engine, URL, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column

from dotenv import load_dotenv


In [6]:
url_object = URL.create(
    "postgresql+psycopg",
    username=os.getenv("SJSUBARK_PSQL_USER"),
    password=os.getenv("SJSUBARK_PSQL_PASSWORD"),
    host=os.getenv("SJSUBARK_PSQL_HOST"),
    database=os.getenv("SJSUBARK_PSQL_DB"),
    port=int(os.getenv("SJSUBARK_PSQL_PORT", 5432)),
)
db = create_engine(url_object)


#loads in garage_fullness + calendar
init_query = '''
SELECT gf.utc_timestamp, gf.fullness, gf.garage_id, c.is_weekend, c.is_campus_closed 
FROM garage_fullness gf JOIN calendar c  
ON gf.utc_timestamp >= c.utc_start
   AND gf.utc_timestamp < c.utc_end
;'''
garage_data = pd.read_sql(init_query, db)
garage_data = garage_data.sort_values(["garage_id","utc_timestamp"])

In [7]:
#Remove Non-Interval Data, AKA "if two records of the same garage is not 10 minutes apart, ignore it"


#CONFIGURATION
NUM_OF_GARAGE = 4
INTERVAL_MIN_OFFSET = 1
#######

groups = garage_data.groupby('garage_id')
garage_data['time_to_next'] = groups['utc_timestamp'].diff(-1).abs()
garage_data['fullness_next'] = groups['fullness'].shift(-1)
lower_bound = pd.Timedelta(10 - INTERVAL_MIN_OFFSET, unit="m")
upper_bound = pd.Timedelta(10 + INTERVAL_MIN_OFFSET, unit="m")
garage_data_cleaned = garage_data[
    (garage_data['time_to_next'] >= lower_bound) & 
    (garage_data['time_to_next'] <= upper_bound)
].copy()
garage_data_cleaned['is_weekend'] = garage_data_cleaned['is_weekend'].astype(int)
garage_data_cleaned['is_campus_closed'] = garage_data_cleaned['is_campus_closed'].astype(int)
garage_data_cleaned = garage_data_cleaned.drop(columns=['time_to_next'])
garage_data_cleaned = garage_data_cleaned.sort_values(['utc_timestamp'], ascending=True)
garage_data_cleaned['utc_timestamp'] = pd.to_datetime(garage_data_cleaned['utc_timestamp'], utc=True)
garage_data_cleaned['hour'] = garage_data_cleaned['utc_timestamp'].dt.tz_convert('America/Los_Angeles').dt.hour
garage_data_cleaned['min'] = garage_data_cleaned['utc_timestamp'].dt.tz_convert('America/Los_Angeles').dt.minute
garage_data_cleaned['year'] = garage_data_cleaned['utc_timestamp'].dt.tz_convert('America/Los_Angeles').dt.year
garage_data_cleaned['day'] = garage_data_cleaned['utc_timestamp'].dt.tz_convert('America/Los_Angeles').dt.day
garage_data_cleaned['month'] = garage_data_cleaned['utc_timestamp'].dt.tz_convert('America/Los_Angeles').dt.month
garage_data_cleaned['day_of_week'] = garage_data_cleaned['utc_timestamp'].dt.dayofweek
garage_data_cleaned = garage_data_cleaned.drop(columns=["utc_timestamp"])
garage_data_cleaned = garage_data_cleaned.dropna(subset=['fullness_next'])
garage_data_cleaned

Unnamed: 0,fullness,garage_id,is_weekend,is_campus_closed,fullness_next,hour,min,year,day,month,day_of_week
4,40,1,0,0,38.0,21,10,2025,25,8,1
3769,24,3,0,0,23.0,21,10,2025,25,8,1
3774,15,2,0,0,13.0,21,10,2025,25,8,1
4251,1,4,0,0,0.0,21,10,2025,25,8,1
4255,0,4,0,0,0.0,21,20,2025,25,8,1
...,...,...,...,...,...,...,...,...,...,...,...
4100,40,1,1,0,40.0,13,0,2026,18,1,6
4106,3,2,1,0,3.0,13,10,2026,18,1,6
4105,40,1,1,0,40.0,13,10,2026,18,1,6
4107,13,3,1,0,13.0,13,10,2026,18,1,6


In [8]:
from sklearn import model_selection
y = garage_data_cleaned["fullness_next"]
X = garage_data_cleaned.drop(columns=["fullness_next"])
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=67)

In [9]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
preprocessor = ColumnTransformer(
    transformers=[
        ('garage_id_ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['garage_id'])
    ],
    remainder='passthrough' # Keeps all other numeric columns
)
pipeline = Pipeline([
    ('encoder', preprocessor),
    ('model', HistGradientBoostingRegressor()) 
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('encoder', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('garage_id_ohe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,100
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [14]:
print(X_test.columns)
y_pred = pipeline.predict(X_test)
print("MAE: ", mean_absolute_error(y_test, y_pred))

Index(['fullness', 'garage_id', 'is_weekend', 'is_campus_closed', 'hour',
       'min', 'year', 'day', 'month', 'day_of_week'],
      dtype='object')
MAE:  0.40522812784540513


In [None]:
import joblib
filename = "boost_pipelines.joblib"
joblib.dump(pipeline, filename)

In [23]:
pipeline.predict(X_test)[0]

np.float64(18.722751357596305)