In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
def convert_timestamp(timestamp):
    ts = timestamp.split(" ")
    return datetime.fromisoformat("".join([ts[0], "T", ts[1].split(".")[0], "+00:00"]))
#Testing function
print(convert_timestamp("2025-08-26 04:10:01 +0000 UTC"))

2025-08-26 04:10:01+00:00


In [21]:
#Load in the CSV 
data = pd.read_csv("master.csv")
data["utc_timestamp"] = (data["utc_timestamp"].apply(lambda x: convert_timestamp(x)))
#Need to extract the fullness of the next timestamp
data.sort_values(["utc_timestamp", "name"])
print("Total records: ", len(data))

# Remove any entries such that the time difference is not consistently around 10 mins
garages = ["NorthGarage", "SouthCampusGarage", "SouthGarage", "WestGarage"]
result = []

for garage in garages:
    g = data.loc[data['name'] == garage].copy()
    g['lag1'] = g['fullness'].shift(1)
    g['time_lag1_diff'] = g['utc_timestamp'] - g['utc_timestamp'].shift(1)
    diff = g.loc[(g['time_lag1_diff'] < pd.Timedelta(11, unit="m")) & (g['time_lag1_diff'] > pd.Timedelta(9, unit="m"))]
    result.append(diff)
data = pd.concat(result, ignore_index=True)
data.drop('time_lag1_diff', axis=1)
data['is_weekend'] = data['is_weekend'].astype(int)
data['is_campus_closed'] = data['is_campus_closed'].astype(int)

print("Records with fixed time series: ", len(data))

Total records:  52288
Records with fixed time series:  51072


In [22]:
from sklearn import model_selection

y = data["lag1"]
X = data.drop(columns=["lag1"])
# Split it 75/25 training, testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=10) 
print("training length: ", len(X_train), "testing length: ", len(y_train))

training length:  38304 testing length:  38304


In [23]:
def drop_unnecessary_cols(df):
    cols_to_drop = ['utc_timestamp', "time_lag1_diff", "second"]
    # We only drop if they exist (to avoid errors during single-record prediction)
    return df.drop(columns=[c for c in cols_to_drop if c in df.columns])
X_train = drop_unnecessary_cols(X_train)

In [24]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('name_ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['name'])
    ],
    remainder='passthrough' # Keeps all other numeric columns
)

pipeline = Pipeline([
    ('encoder', preprocessor),
    ('model', boost) # Add your classifier/regressor here
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('encoder', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('name_ohe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,100
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [26]:
pipeline.score(X_test, y_test)


0.9993558881207871

In [28]:
X_test = drop_unnecessary_cols(X_test)
X_test

Unnamed: 0,name,fullness,hour,minute,year,month,day,weekday,is_weekend,is_campus_closed
3770,NorthGarage,4,18,50,2025,9,21,0,1,0
42278,WestGarage,12,5,30,2025,9,23,2,0,0
30648,SouthGarage,45,20,20,2025,10,1,3,0,0
2916,NorthGarage,32,20,10,2025,9,15,1,0,0
32178,SouthGarage,23,18,0,2025,10,12,0,1,0
...,...,...,...,...,...,...,...,...,...,...
34565,SouthGarage,66,8,30,2025,10,30,4,0,0
40921,WestGarage,17,17,40,2025,9,13,6,1,0
19746,SouthCampusGarage,0,3,20,2025,10,15,3,0,0
37512,SouthGarage,22,3,20,2025,11,20,4,0,0


In [19]:
import joblib
filename = "gradient_boost_pipeline.joblib"
joblib.dump(pipeline, filename)

['gradient_boost_pipeline.joblib']