### CitiBike ML Demo
By: David Stearns  
  
This demo (while fully functional *could use a bit of clean up!* :D ) demonstrates Snowpark Python's capability to build, train, and deploy machine learning models. It uses the Scalar UDF function which is great for single predictions, however, if you would like to do full table predictions or use input batches, the best way to accomplish that is to use Vectorized UDF's. 

In [12]:
import snowflake.snowpark
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.functions import udf
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import *
import json
import pandas as pd
import numpy as np
import datetime
import sys
import math
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark import Window
pd.set_option('display.max_columns', None)
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
import credentials
import io
import joblib
import cachetools
conn = {
    "account": credentials.credentials["account"],
    "user": credentials.credentials["username"],
    "password": credentials.credentials["password"],
    "role": credentials.credentials["role"],
    "warehouse": credentials.credentials["warehouse"],
    "database": credentials.credentials["database"],
    "schema": credentials.credentials["schema"]
}
session = Session.builder.configs(conn).create()

In [13]:
def save_model(session, model, path):
  input_stream = io.BytesIO()
  joblib.dump(model, input_stream)
  session._conn._cursor.upload_stream(input_stream, path)
  return "successfully created file: " + path

In [14]:
session.sql(f"create or replace stage citibike_ml").collect()

[Row(status='Stage area CITIBIKE_ML successfully created.')]

In [4]:
df_trips_hourly = session.table("trips_stations_vw")
df_trips_daily = df_trips_hourly.select(F.to_date(F.col("STARTTIME")).as_("ds")).groupBy(F.col("ds")).count().select(F.col("ds"), F.col("COUNT").as_("y"))
df_weather = session.table("weather_vw")
df_trips_weather = df_trips_daily.join(df_weather, df_trips_daily["DS"] == df_weather["OBSERVATION_DATE"])\
                    .select(F.col("DS"),F.col("Y"), F.col("TEMP_AVG_C"), F.col("TOT_PRECIP_IN"))
df_trips_weather = df_trips_weather.withColumn("rain_indicator", F.when(F.col("TOT_PRECIP_IN") > 0, F.lit(1) ).otherwise(F.lit(0)))
df_trips_weather = df_trips_weather.sort(F.col("DS"))
df_trips_weather.select("DS", "Y", "RAIN_INDICATOR", "TEMP_AVG_C").write.mode("overwrite").save_as_table("citibike_ml_demo")

In [40]:
session.add_packages('snowflake-snowpark-python', 'scikit-learn', 'pandas', 'numpy', 'joblib', 'cachetools')
def citibike_ml_demo(session: snowflake.snowpark.Session) -> str:

    import snowflake.snowpark as snp
    from snowflake.snowpark import functions as F
    from snowflake.snowpark import types as T

    model_name = "predict_trips"
    stage_name = "citibike_ml"
    df = session.table("citibike_ml_demo")
    pandas_df = df.toPandas()
    pandas_df["trips_yesterday"] = pandas_df['Y'].shift(1)
    pandas_df["Y_log"] = np.log(pandas_df["Y"])
    pandas_df["trips_yesterday_log"] = np.log(pandas_df["trips_yesterday"])
    pandas_df = pandas_df[1:]

    X = pandas_df[["RAIN_INDICATOR", "TEMP_AVG_C", "trips_yesterday_log"]]
    y = pandas_df[["Y_log"]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33 )
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)

    save_model(session, rf, f"@{stage_name}/{model_name}.joblib")

    model_name_full = session.sql(f"ls @{stage_name}").collect()[0][0]

    performance = cross_validate(rf, X, y, cv=3)

    return f"Created model: {model_name_full}.... Performance Metrics: {performance}"

In [41]:
# Registering my SPROC within Snowflake
train_rfc_model = session.sproc.register(citibike_ml_demo, replace=True)

In [42]:
# Running the SPROC for model training
train_rfc_model(session=session)

"Created model: citibike_ml/predict_trips.joblib.............. {'fit_time': array([0.14057446, 0.13836741, 0.13778305]), 'score_time': array([0.01067162, 0.01074004, 0.01069951]), 'test_score': array([0.67073652, 0.61245731, 0.65127835])}"

In [8]:
# Importing my model into the session
session.add_import("@citibike_ml/predict_trips.joblib")  

# Caching my model so it is only read once
@cachetools.cached(cache={})

# Function to read the model file into another function
def read_file(filename):
       import os
       import_dir = sys._xoptions.get("snowflake_import_directory")
       if import_dir:
              with open(os.path.join(import_dir, filename), 'rb') as file:
                     m = joblib.load(file)
                     return m

In [9]:
# Registering the UDF
@udf(session=session, name="predict_trips", is_permanent=True, stage_location="@citibike_ml", replace=True)
def predict_trips(args: list) -> int:
    m = read_file('predict_trips.joblib')
    if isinstance(m, str):
        return m
    return m.predict([args])

In [50]:
# create a prediction
# inputs are: RAIN(1/0), TEMP(in C), TRIPS YESTERDAY(natural log)
# the trips_yesterday variable has to be in the form of a natural log
# you can change these variables if you want... but it doesn't matter

trips_yesterday_log = 10.1
rain = 0
temp = 22

trips_yesterday = math.floor(np.exp(10.1))
print(f"There were {trips_yesterday} trips yesterday")
trips_predicted_today = math.floor(session.sql(f"select exp(predict_trips([{rain}, {temp}, {trips_yesterday_log}]))").collect()[0][0])
print(f"Based on the inputs: \n RAIN = {rain}, TEMP(C) = {temp}, TRIPS YESTERDAY = {trips_yesterday}\n....\nThere will be {trips_predicted_today} trips today.")

There were 24343 trips yesterday
Based on the inputs: 
 RAIN = 0, TEMP(C) = 22, TRIPS YESTERDAY = 24343
....
There will be 8103 trips today.
