In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

In [3]:
def convert_timestamp(timestamp):
    ts = timestamp.split(" ")
    return datetime.fromisoformat("".join([ts[0], "T", ts[1].split(".")[0], "+00:00"]))
#Testing function
print(convert_timestamp("2025-08-26 04:10:01 +0000 UTC"))

2025-08-26 04:10:01+00:00


In [4]:
#Load in the CSV 
data = pd.read_csv("master.csv")
data["utc_timestamp"] = (data["utc_timestamp"].apply(lambda x: convert_timestamp(x)))
#Need to extract the fullness of the next timestamp
data.sort_values(["utc_timestamp", "name"])
print("Total records: ", len(data))

# Remove any entries such that the time difference is not consistently around 10 mins
garages = ["NorthGarage", "SouthCampusGarage", "SouthGarage", "WestGarage"]
result = []

for garage in garages:
    g = data.loc[data['name'] == garage].copy()
    g['lag1'] = g['fullness'].shift(1)
    g['time_lag1_diff'] = g['utc_timestamp'] - g['utc_timestamp'].shift(1)
    diff = g.loc[(g['time_lag1_diff'] < pd.Timedelta(11, unit="m")) & (g['time_lag1_diff'] > pd.Timedelta(9, unit="m"))]
    result.append(diff)
data = pd.concat(result, ignore_index=True)
data.drop('time_lag1_diff', axis=1)
data['is_weekend'] = data['is_weekend'].astype(int)
data['is_campus_closed'] = data['is_campus_closed'].astype(int)

print("Records with fixed time series: ", len(data))

Total records:  52288
Records with fixed time series:  51072


In [5]:
from sklearn import model_selection

y = data["lag1"]
X = data.drop(columns=["lag1"])
# Split it 75/25 training, testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=10) 
print("training length: ", len(X_train), "testing length: ", len(y_train))

training length:  38304 testing length:  38304


In [6]:
from sklearn.preprocessing import OneHotEncoder
# One Hot Encode the parking garage names
def OHE(dataset):
    garage_name = dataset[['name']]
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    enc.fit(garage_name)
    encoded_data = enc.transform(garage_name)
    feature_names = enc.get_feature_names_out(['name'])
    encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=dataset.index)
    return pd.concat([dataset.drop(['name', 'utc_timestamp', "time_lag1_diff", "second"], axis=1), encoded_df], axis=1)
X_train = OHE(X_train)


In [7]:
from sklearn.ensemble import HistGradientBoostingRegressor

boost = HistGradientBoostingRegressor(random_state=10)
model =  boost.fit(X_train, y_train)
model.score(X_train, y_train)

0.9989711576253533

In [8]:
X_test = OHE(X_test)
model.score(X_test, y_test)

0.9993559236273146

In [9]:
sample = X_test.loc[[30648]]
print(sample)
print("random predicted value", round(model.predict(sample)[0]), ", actual value", y_test.loc[30648])

       fullness  hour  minute  year  month  day  weekday  is_weekend  \
30648        45    20      20  2025     10    1        3           0   

       is_campus_closed  name_NorthGarage  name_SouthCampusGarage  \
30648                 0               0.0                     0.0   

       name_SouthGarage  name_WestGarage  
30648               1.0              0.0  
random predicted value 47 , actual value 47.0


In [10]:
import joblib
filename = "gradient_boost_model.pkl"
joblib.dump(model, filename)

['gradient_boost_model.pkl']

In [11]:
type(model)

sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor

In [12]:
a = joblib.load(filename)

sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor