### model training and testing

In [1]:
import numpy as np
import psycopg2
import pandas as pd
import os
from rsconnect.api import RSConnectServer
from rsconnect.actions import deploy_python_fastapi
from dotenv import load_dotenv
import warnings
import math
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from vetiver import VetiverModel, VetiverAPI, vetiver_endpoint,pin_read_write
from vetiver import deploy_rsconnect
import pins

In [2]:
warnings.filterwarnings('ignore')
load_dotenv(override=True)

rsc_server=os.getenv("CONNECT_SERVER")
rsc_key=os.getenv("CONNECT_API_KEY")
connect_server = RSConnectServer(url=rsc_server,api_key=rsc_key)

db_password=os.getenv("DB_PASSWORD")
db_url=os.getenv("DB_URL")

In [3]:
def add_dow_as_int(df):
    '''One hot encoding the day of the week'''
    df['date']=pd.to_datetime(df['date'])
    one_hot=pd.get_dummies(df['dow'])
    df = df.join(one_hot)
    df = df.drop('dow',axis=1)
    return df

In [4]:
def feature_split(df):
    '''split the df provided into X features and y output'''
    X = df.drop(['n_bikes','date','lat','lon'], axis=1)
    y = df[['n_bikes']]
    return X,y

In [5]:
def add_missing_dow(df):
    ''' add encoding for missing dow in testing dataset'''
    all_dow=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    dow = df.columns.drop(['n_bikes','id','date','hour','month','lat','lon']).to_list()
    a = np.setdiff1d(all_dow,dow,assume_unique=False)
    for i in a:
        df_testing[i]=0
    return df

In [7]:
# read table built by R ETL process in the database
connection = psycopg2.connect(
    user="content",
    password=db_password,
    host=db_url,
    port="5432",
    database="rds")

sql = "select * from bike_model_data where date in (select distinct date from bike_model_data order by date desc limit 12);"

all_days = pd.read_sql_query(sql, connection)
all_days.sort_values(by='date', inplace=True, ascending=False)
all_days=all_days.reset_index(drop=True)
all_days

Unnamed: 0,id,hour,date,month,dow,n_bikes,lat,lon
0,157,8.0,2022-07-21,7.0,Thursday,6.0,38.850337,-77.100989
1,132,0.0,2022-07-21,7.0,Thursday,5.0,38.865590,-76.952103
2,5,6.0,2022-07-21,7.0,Thursday,5.0,38.857866,-77.059490
3,5,8.0,2022-07-21,7.0,Thursday,5.0,38.857866,-77.059490
4,8,6.0,2022-07-21,7.0,Thursday,5.0,38.857405,-77.051132
...,...,...,...,...,...,...,...,...
79636,438,18.0,2022-07-10,7.0,Sunday,6.0,38.957037,-77.359718
79637,438,20.0,2022-07-10,7.0,Sunday,6.0,38.957037,-77.359718
79638,438,22.0,2022-07-10,7.0,Sunday,4.0,38.957037,-77.359718
79639,439,0.0,2022-07-10,7.0,Sunday,2.0,38.884916,-77.005965


In [6]:
# divide in training and testing datasets
all_dates = all_days.date.unique()
testing_dates = all_dates[0:3]
training_dates = all_dates[3:]
df_testing = all_days.loc[all_days['date'].isin(testing_dates)]
df_training = all_days.loc[all_days['date'].isin(training_dates)]

In [7]:
# build a random forest model
df_training = add_dow_as_int(df_training)

X_train,y_train = feature_split(df_training)

from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100,random_state=0,n_jobs=-1)
regressor.fit(X_train, y_train) 

In [8]:
# test the random forest model
df_testing = add_dow_as_int(df_testing)
df_testing = add_missing_dow(df_testing)
X_test,y_test = feature_split(df_testing)
y_pred = regressor.predict(X_test)

In [9]:
# compare predictions
test_mse = mean_squared_error(y_test,y_pred)
rmse = sqrt(test_mse)
print('RMSE: %f' % rmse)

RMSE: 4.927005


### vetiver deployment

In [11]:
# convert the random forest model into a vetiver model
v = VetiverModel(regressor,"gagan/bikeshare-rf-python",save_ptype=True, ptype_data=X_train)

In [12]:
v

<vetiver.vetiver_model.VetiverModel at 0x7fbb3de7ee00>

In [13]:
# create a board on RStudio Connect
board = pins.board_rsconnect(server_url="https://colorado.rstudio.com/rsc",
                            allow_pickle_read=True)

In [14]:
# write the vetiver model as pin to RStudio Connect
pin_read_write.vetiver_pin_write(
    board,
    v
)

Writing pin:
Name: 'gagan/bikeshare-rf-python'
Version: 20220720T220547Z-0a3c9


In [15]:
# use Vetiver provided RStudio Connect deployment function 
# to deploy the model as a FASTApi
deploy_rsconnect(
    connect_server=connect_server,
    board=board,
    pin_name="gagan/bikeshare-rf-python",
    #python=".bike_predict_python/bin/python",
    title = "Random Forest model for Bikeshare Python"
)