In [None]:
import snowflake.snowpark as snp
from snowflake.snowpark.functions import sproc
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from kmodes.kmodes import KModes
import numpy as np
import pandas as pd
import io

## Data Science

Now that the data engineers have loaded and transformed the data into the `squirrels_engineered_features` table, we can begin our model development. For this, we will leverage Snowpark to do **clustering analysis**.  
<br> 
The Snowpark Python client-side Dataframe API allows us to push down most of the computation for preparation and feature engineering to Snowpark. For security and governance reasons, we can read data into memory for model training and inference, but no intermediate data products will be stored outside of Snowflake. 
Also, since training a model usually requires more intensive computation, we are going to use a larger warehouse for this task. Snowflake supports resizing a warehouse at any time; you can scale up a warehouse in seconds. For this exercise, we are going to use the data science warehouse, so that every team can work completely independently of the others.

### 1. Load the Credentials

In [None]:
# snowpark_connect is a project helper defined outside this notebook
from steps.snowpark_connection import snowpark_connect
# returns the Snowpark session and a state dictionary
# (presumably loaded from the given JSON file — confirm against the helper)
session, state_dict = snowpark_connect('./include/state.json')
# run the rest of the notebook on the data science warehouse named in the state dictionary
session.use_warehouse(state_dict['compute_parameters']['ds_warehouse'])
# sanity check: show the warehouse, database and schema the session is currently using
print(session.sql('select current_warehouse(), current_database(), current_schema()').collect())

### 2. Create stages to save the ML model/pipeline and permanent UDFs

In [None]:
# (Re)create the MODELS stage that will hold the serialized pipeline.
# Adjacent string literals are joined by the parser, so the SQL text is a single statement.
query = (
    "create or replace stage models"
    " directory = (enable = true)"
    " copy_options = (on_error='skip_file')"
)

session.sql(query).collect()

In [None]:
# (Re)create the UDF stage that will hold the permanent UDF's code.
# Adjacent string literals are joined by the parser, so the SQL text is a single statement.
query = (
    "create or replace stage udf"
    " copy_options = (on_error='skip_file')"
)

session.sql(query).collect()

In [None]:
# clear out existing packages and imports
# (start from a clean slate so only the dependencies declared below are attached to the session)
session.clear_packages()
session.clear_imports()

In [None]:
# let the snowflake server know what packages to manage on your behalf
session.add_packages('snowflake-snowpark-python', 'scikit-learn', 'pandas', 'numpy', 'joblib', 'cachetools')
# kmodes is not passed to add_packages, so it is attached as a zipped import instead
session.add_import('./include/kmodes.zip')

In [None]:
# save artifacts to stages
def save_file(session, model, path):
    """Serialize ``model`` with joblib and upload the bytes to a stage location.

    Args:
        session: Snowpark session; its underlying cursor performs the upload.
        model: Any joblib-serializable object (here, the fitted pipeline).
        path: Destination passed to the uploader, e.g. ``"@MODELS/kmodes.joblib"``.

    Returns:
        A confirmation message containing ``path``.
    """
    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)
    # joblib.dump leaves the position at end-of-stream; rewind so the upload
    # always starts at the first byte, whether or not the uploader rewinds itself.
    input_stream.seek(0)
    # NOTE(review): relies on private Snowpark attributes (_conn._cursor).
    session._conn._cursor.upload_stream(input_stream, path)
    return "successfully created file: " + path

### 3. Train model using Kmodes

The Snowpark server-side Anaconda runtime has a large [list of Python modules included](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html) for our UDF.  However, the data scientist built this code based on kmodes which is not currently in the Snowpark distribution.
  
  We can simply add [kmodes](https://github.com/nicodv/kmodes), as well as our own team's python code, as import dependencies.

In [None]:
# train model using engineered features from a snowflake table
def train_model(session: snp.Session) -> float:
    """Fit a one-hot + KModes pipeline on the engineered features and stage it.

    Reads HAS_ACTIVITY, IS_STRESSED and COMFY_WITH_HUMANS from the
    ``squirrels_engineered_features`` table into pandas, fits the pipeline,
    and saves the fitted pipeline to ``@MODELS/kmodes.joblib`` via ``save_file``.

    Args:
        session: Snowpark session used to read the table and upload the model.

    Returns:
        The number of distinct cluster labels predicted on the training rows.
        NOTE(review): the value is a Python int although the annotation says float.
    """
    feature_frame = (
        session.table("squirrels_engineered_features")
        .select("HAS_ACTIVITY", "IS_STRESSED", "COMFY_WITH_HUMANS")
        .to_pandas()
    )

    # every selected column is treated as categorical and one-hot encoded
    encoder_step = (
        'preprocessor',
        ColumnTransformer([("cat", OneHotEncoder(), feature_frame.columns)]),
    )
    cluster_step = ('model', KModes(n_clusters=3, init="Huang", random_state=0))
    full_pipeline = Pipeline([encoder_step, cluster_step])

    full_pipeline.fit(feature_frame)

    # persist preprocessing and model together so inference applies the same encoding
    save_file(session, full_pipeline, "@MODELS/kmodes.joblib")

    cluster_labels = full_pipeline.predict(feature_frame)
    return len(np.unique(cluster_labels))

# Create an instance of StoredProcedure using the sproc() function
# replace=True overwrites a previously registered procedure instead of failing;
# no name is given, so this is presumably a temporary procedure — confirm if it must persist
train_model_sp = sproc(train_model, replace=True)

¯\\\_(ツ)_/¯ ```Clustering``` is an unsupervised learning method whose task is to divide the population or data points into a number of groups, such that data points in a group are more similar to other data points in the same group and dissimilar to the data points in other groups. In other words, each cluster is a collection of objects grouped by how similar they are to one another and how dissimilar they are to the rest.

```KModes``` clustering is one of the unsupervised Machine Learning algorithms that is used to cluster categorical variables (no intrinsic order to the categories).

You might be wondering, why ```KModes``` when we already have ```KMeans```.

KMeans uses mathematical measures (distance) to cluster continuous data. The smaller the distance, the more similar our data points are. Centroids are updated using means.
But for categorical data points, we cannot calculate a meaningful distance, so we use the KModes algorithm. It uses the dissimilarities (total mismatches) between the data points. The fewer the dissimilarities, the more similar our data points are. It uses modes instead of means. The mode is the value that occurs most often in a data set.

### 4. Run the training within the SPROC

In [None]:
# train model directly on the snowflake server: the code is moved to data!
train_model_sp()

### 5. Model/Pipeline Deployment using UDF
Here we can use Snowpark User Defined Functions for inference without having to pull data out of Snowflake.

In [None]:
import sys
import cachetools
import os
from snowflake.snowpark.functions import udf
# attach the staged model file to the session so the UDF can load it from its import directory
session.add_import("@MODELS/kmodes.joblib")  

# get model from stage
@cachetools.cached(cache={})
def read_file(filename):
    """Load a joblib artifact from the Snowflake import directory.

    Results are memoized per ``filename`` by the ``cachetools.cached`` decorator,
    so repeated calls with the same name do not reload the file.

    Args:
        filename: File name relative to the import directory.

    Returns:
        The object returned by ``joblib.load``, or ``None`` when the
        ``snowflake_import_directory`` option is not set.
    """
    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        # no import directory configured for this interpreter
        return None

    artifact_path = os.path.join(import_dir, filename)
    with open(artifact_path, 'rb') as handle:
        return joblib.load(handle)

features = ["HAS_ACTIVITY", "IS_STRESSED", "COMFY_WITH_HUMANS"]

# create udf to apply predictions on new data
@udf(name="predict", is_permanent=True, stage_location="@udf", replace=True)
def predict(HAS_ACTIVITY: int, IS_STRESSED: int, COMFY_WITH_HUMANS: int) -> int:
    """Return the cluster label for one row of engineered features.

    Args:
        HAS_ACTIVITY: Value of the HAS_ACTIVITY column for the row.
        IS_STRESSED: Value of the IS_STRESSED column for the row.
        COMFY_WITH_HUMANS: Value of the COMFY_WITH_HUMANS column for the row.

    Returns:
        The label produced by the staged pipeline's ``predict``, as a built-in int.
    """
    m = read_file('kmodes.joblib')
    # Build the single-row frame from the arguments explicitly; locals() would
    # also capture any other local variable (such as the loaded model).
    row = pd.DataFrame([[HAS_ACTIVITY, IS_STRESSED, COMFY_WITH_HUMANS]], columns=features)
    # predict() yields a NumPy scalar; convert it so the value matches the declared int return type
    return int(m.predict(row)[0])

Let's check how the model performs.

In [None]:
# apply udf to make predictions on new data
snowdf_test = session.table("squirrels_engineered_features")
inputs = snowdf_test.select("HAS_ACTIVITY", "IS_STRESSED", "COMFY_WITH_HUMANS")
# NOTE(review): *inputs relies on unpacking the DataFrame into its columns — confirm
snowdf_results = snowdf_test.select(
    *snowdf_test.columns,
    predict(*inputs).alias('CLUSTER'),
)

# preview two rows: limit inside Snowflake so only those rows are transferred to the client
snowdf_results.limit(2).to_pandas()

In [None]:
# save clusters back to snowflake
# "overwrite" mode replaces the target table if it already exists
snowdf_results.write.mode("overwrite").save_as_table("squirrels_engineered_clusters")

In [None]:
# close the Snowpark session now that the results are written
session.close()