# Model training and testing

In [1]:
import os
import warnings
from math import sqrt

import numpy as np
import pandas as pd
import pins
import pyodbc
import vetiver
from dotenv import load_dotenv
from pprint import pprint
from rsconnect.api import RSConnectServer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [2]:
# import custom functions
from bikeshare.data import clean_data

In [3]:
# NOTE: this silences every warning for the rest of the session
warnings.filterwarnings('ignore')
# load settings from a .env file; values there override existing environment variables
load_dotenv(override=True)

rsc_server = os.getenv("CONNECT_SERVER")
rsc_key = os.getenv("CONNECT_API_KEY")

# fail fast with a clear message instead of building a server object from
# None values and only discovering the problem at deploy time
_missing_settings = [
    name
    for name, value in (("CONNECT_SERVER", rsc_server), ("CONNECT_API_KEY", rsc_key))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# connection details used later when deploying the model to Posit Connect
connect_server = RSConnectServer(url=rsc_server, api_key=rsc_key)

## Raw data

Read in the raw data from the database.

In [4]:
# read table built by R ETL process in the database
# the query keeps only rows whose date is among the 12 most recent distinct dates
sql = "select * from bike_model_data where date in (select distinct date from bike_model_data order by date desc limit 12);"

connection = pyodbc.connect('DSN=Content DB')
try:
    all_days = pd.read_sql_query(sql, connection)
finally:
    # release the database handle even if the query fails
    connection.close()

# order rows by date descending and rebuild a clean 0..n-1 index,
# without mutating the frame in place
all_days = all_days.sort_values(by='date', ascending=False).reset_index(drop=True)

all_days

Unnamed: 0,id,hour,date,month,dow,n_bikes,lat,lon
0,453,4.0,2023-06-09,6.0,Friday,16.0,38.919086,-77.034502
1,299,0.0,2023-06-09,6.0,Friday,1.0,39.110314,-77.182669
2,298,8.0,2023-06-09,6.0,Friday,13.0,39.114688,-77.171487
3,298,10.0,2023-06-09,6.0,Friday,13.0,39.114688,-77.171487
4,298,12.0,2023-06-09,6.0,Friday,13.0,39.114688,-77.171487
...,...,...,...,...,...,...,...,...
39093,125,8.0,2023-05-29,5.0,Monday,6.0,38.897857,-77.026975
39094,326,16.0,2023-05-29,5.0,Monday,8.0,38.964992,-77.103381
39095,326,18.0,2023-05-29,5.0,Monday,8.0,38.964992,-77.103381
39096,326,20.0,2023-05-29,5.0,Monday,8.0,38.964992,-77.103381


## Train test split

In [5]:
# features: every column except the target and the id/date columns
X = all_days.drop(columns=["n_bikes", "id", "date"])
# target: bike counts, kept as a single-column DataFrame
y = all_days[["n_bikes"]]

# fix the shuffle seed so the split (and the metrics computed from it)
# is the same on every Restart & Run All
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
X_train

Unnamed: 0,hour,month,dow,lat,lon
31122,10.0,5.0,Wednesday,38.934600,-76.995500
37816,8.0,5.0,Monday,38.909394,-77.048728
30580,10.0,5.0,Wednesday,38.813485,-77.049468
2004,18.0,6.0,Thursday,38.903584,-77.044789
30222,14.0,5.0,Wednesday,38.897612,-77.080851
...,...,...,...,...,...
34667,2.0,5.0,Tuesday,38.887312,-77.025762
12461,4.0,6.0,Monday,38.990249,-77.029350
24822,18.0,6.0,Friday,38.908142,-77.038359
3540,0.0,6.0,Thursday,38.988562,-77.096539


In [7]:
y_train

Unnamed: 0,n_bikes
31122,2.0
37816,15.0
30580,6.0
2004,14.0
30222,3.0
...,...
34667,14.0
12461,5.0
24822,7.0
3540,1.0


## Train model

In [8]:
# build a random forest model
# the pipeline first runs the project's `clean_data` function on the features,
# then fits a seeded random forest using all available cores
model = Pipeline(
    steps=[
        ("clean-data", FunctionTransformer(clean_data)),
        ("regressor", RandomForestRegressor(
            n_estimators=100,
            random_state=0,
            n_jobs=-1
        ))
    ]
)

# y_train is a single-column DataFrame; flatten it to 1-D so the regressor
# receives the (n_samples,) target it expects instead of a column vector
model.fit(X_train, y_train.to_numpy().ravel())

In [9]:
# score the fitted pipeline on the held-out rows
y_pred = model.predict(X_test)

# root mean squared error between observed and predicted values
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = sqrt(test_mse)
print('RMSE: %f' % rmse)

RMSE: 2.826245


## Deploy model

Deploy the model with vetiver.

In [10]:
# the pin is named "<user>/<pin>"
user_name = "sam.edwardes"
pin_name = f"{user_name}/bikeshare-rf-python"

In [11]:
# wrap the fitted pipeline in a VetiverModel named after the pin,
# using the first training row as the input prototype
v = vetiver.VetiverModel(
    model=model,
    model_name=pin_name,
    description="A model to predict the number of bikes that will be available.",
    prototype_data=X_train.head(1),
)

In [12]:
# create a board on Posit Connect
# use the same server that `connect_server` is configured with, so the pin and
# the deployed API target one Connect instance; fall back to the original
# hard-coded URL when CONNECT_SERVER is not set
# NOTE(review): allow_pickle_read=True permits reading pickled pins from this
# board; only appropriate when the pins on this server are trusted
board = pins.board_connect(
    server_url=rsc_server or "https://colorado.posit.co/rsc",
    allow_pickle_read=True
)

In [13]:
# store the vetiver model on the Posit Connect board as a pin
vetiver.vetiver_pin_write(board=board, model=v)

Model Cards provide a framework for transparent, responsible reporting. 
 Use the vetiver `.qmd` Quarto template as a place to start, 
 with vetiver.model_card()
Writing pin:
Name: 'sam.edwardes/bikeshare-rf-python'
Version: 20230609T223110Z-a9c79


In [14]:
# list the versions stored on the board for this pin
board.pin_versions(pin_name)

Unnamed: 0,version
0,75565
1,75569
2,75571
3,75685
4,75693
5,75696
6,75701
7,75704
8,75706


In [15]:
# take the version from the last row of the pin's version table
# (assumes pin_versions lists the most recent version last; TODO confirm)
latest_version = board.pin_versions(pin_name).tail(1)["version"].values[0]

# deploy the pinned model to Posit Connect as a FastAPI app using
# vetiver's built-in deployment function
vetiver.deploy_rsconnect(
    connect_server=connect_server,
    board=board,
    pin_name=pin_name,
    version=latest_version,
    title="Random Forest model for Bikeshare Python",
    app_id="28923e33-dcb6-4774-b753-bf1d4c367579",
    extra_files=["requirements.txt"],
)

[0mValidating server...[0m[32;20m 	[OK]
[0m[0mValidating app mode...[0m[32;20m 	[OK]
[0m[0mMaking bundle ...[0m[32;20m 	[OK]
[0m[0mDeploying bundle ...[0m[32;20m 	[OK]
[0m[0mSaving deployed information...[0m[32;20m 	[OK]
[0m[0mBuilding FastAPI application...[0m
[0mBundle created with Python version 3.10.11 is compatible with environment Kubernetes::ghcr.io/rstudio/content-pro:r4.1.3-py3.10.11-ubuntu2204 with Python version 3.10.11 from /opt/python/3.10.11/bin/python3 [0m
[0mBundle requested Python version 3.10.11; using /opt/python/3.10.11/bin/python3 from Kubernetes::ghcr.io/rstudio/content-pro:r4.1.3-py3.10.11-ubuntu2204 which has version 3.10.11[0m
[0mDetermining session server location ...[0m
[0m2023/06/09 22:32:19.381217506 [rsc-session] Content GUID: 28923e33-dcb6-4774-b753-bf1d4c367579[0m
[0m2023/06/09 22:32:19.381937397 [rsc-session] Content ID: 16788[0m
[0m2023/06/09 22:32:19.381954293 [rsc-session] Bundle ID: 75707[0m
[0m2023/06/09 22:32:19.