# Model training and testing

In [None]:
import os
import warnings
from math import sqrt

import numpy as np
import pandas as pd
import pins
import pyodbc
from dotenv import load_dotenv
from rsconnect.api import RSConnectServer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from vetiver import VetiverModel, deploy_rsconnect, pin_read_write

In [None]:
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Load settings from the .env file; values found there replace any that are
# already set in the process environment.
load_dotenv(override=True)

# Posit Connect URL and API key as configured in the environment.
rsc_server = os.environ.get("CONNECT_SERVER")
rsc_key = os.environ.get("CONNECT_API_KEY")
connect_server = RSConnectServer(api_key=rsc_key, url=rsc_server)

## Raw data

Read in the raw data from the database.

In [None]:
# Read the table built by the R ETL process in the database, keeping only the
# rows that belong to the 12 most recent distinct dates.
sql = "select * from bike_model_data where date in (select distinct date from bike_model_data order by date desc limit 12);"

connection = pyodbc.connect('DSN=Content DB')
try:
    all_days = pd.read_sql_query(sql, connection)
finally:
    # Release the ODBC handle even when the query fails; no later cell in
    # this notebook uses the connection.
    connection.close()

# Newest dates first, with a fresh 0..n-1 index.
all_days = all_days.sort_values(by='date', ascending=False).reset_index(drop=True)
all_days

## Train test split

In [None]:
# Feature columns and target. y is kept as a one-column DataFrame.
X = all_days[["id", "hour", "date", "month", "dow"]]
y = all_days[["n_bikes"]]

# Seed the shuffle so the split, and therefore the RMSE computed from it,
# is reproducible between runs, in line with the seeded regressor.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [None]:
# Display the training features.
X_train

In [None]:
# Display the training target.
y_train

## Data processing

Clean and transform the data.

In [None]:
class DataCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that prepares the raw feature frame for the regressor.

    The input is a DataFrame with the columns ``id``, ``hour``, ``date``,
    ``month`` and ``dow``. The output has the columns ``id``, ``hour`` and
    ``month`` followed by one boolean indicator column per weekday name,
    always in the same order, so training and prediction frames share one
    layout even when some weekdays are absent from the data.
    """

    # Weekday indicator columns, in output order. Defined once so the
    # "missing day" check and the final column selection cannot drift apart.
    DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Columns that are passed through unchanged, in output order.
    PASSTHROUGH_COLUMNS = ['id', 'hour', 'month']

    def __init__(self):
        # The transformer has no hyper-parameters and keeps no state.
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return the cleaned copy of ``X``; ``X`` itself is not modified.

        Raises:
            KeyError: if ``date``, ``dow``, ``id``, ``hour`` or ``month`` is
                missing from ``X``.
        """
        # Both helpers work on copies, so the caller's frame stays untouched.
        return self.add_missing_dow(self.add_dow_as_int(X))

    def add_dow_as_int(self, X):
        """One-hot encode the ``dow`` column.

        Returns a copy of ``X`` in which ``date`` has been parsed with
        ``pd.to_datetime`` and ``dow`` has been replaced by one boolean column
        per distinct value found in it. The method name is historical: the
        new columns are boolean indicators, not integers.
        """
        df = X.copy()
        df['date'] = pd.to_datetime(df['date'])
        # dtype=bool keeps the indicators consistent with the ``False`` fill
        # used for absent weekdays, whatever the pandas default dtype is.
        one_hot = pd.get_dummies(df['dow'], dtype=bool)
        df = df.drop('dow', axis=1)
        # Assign by position rather than joining on the index: an index-based
        # join multiplies rows whenever the index holds duplicate labels.
        for day in one_hot.columns:
            df[day] = one_hot[day].to_numpy()
        return df

    def add_missing_dow(self, X):
        """Add all-``False`` columns for weekdays absent from ``X`` and fix the layout.

        ``X`` is expected to be the output of ``add_dow_as_int``. The ``date``
        column is dropped, and any column that is neither a pass-through
        column nor one of the seven weekday names is discarded by the final
        selection.
        """
        df = X.copy()
        for day in self.DAYS_OF_WEEK:
            if day not in df.columns:
                df[day] = False
        df = df.drop(columns=["date"])
        # Arrange columns in the fixed output order.
        return df[self.PASSTHROUGH_COLUMNS + self.DAYS_OF_WEEK]

## Train model

In [None]:
# Build a random forest pipeline: DataCleaner prepares the raw columns and the
# regressor is fitted on its output.
model = Pipeline(
    steps=[
        ("pre-processor", DataCleaner()),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1))
    ]
)

# y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn expects for single-output regression.
model.fit(X_train, np.ravel(y_train))


In [None]:
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)

# Root-mean-squared error between the observed and the predicted target.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = sqrt(test_mse)
print(f'RMSE: {rmse:f}')

## Deploy model

Deploy the model with vetiver.

In [None]:
# Prefix of the pin name "<user_name>/bikeshare-rf-python" used further down;
# presumably the Posit Connect username that owns the pin.
user_name = "gagan"

In [None]:
# Wrap the fitted pipeline as a vetiver model; the training features are
# supplied as the prototype data.
v = VetiverModel(
    model=model,
    model_name=f"{user_name}/bikeshare-rf-python",
    prototype_data=X_train,
)

In [None]:
# Create a pins board on Posit Connect. Use the same server URL and API key
# that connect_server was built from, rather than a hard-coded URL, so the
# pin and the deployment always target the same server.
board = pins.board_rsconnect(
    server_url=rsc_server,
    api_key=rsc_key,
    allow_pickle_read=True
)

In [None]:
# Store the vetiver model as a pin on the Posit Connect board.
pin_read_write.vetiver_pin_write(board=board, model=v)

In [None]:
# Deploy the pinned model to Posit Connect as a FastAPI service, using the
# deployment helper that vetiver provides.
deploy_rsconnect(
    title="Random Forest model for Bikeshare Python",
    pin_name=f"{user_name}/bikeshare-rf-python",
    board=board,
    connect_server=connect_server,
)