In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

from snowflake.ml.registry import Registry

# We can also use Snowpark for our analyses!
# Obtain the active Snowpark session; later cells reuse this `session` object to read
# the training table, build the model registry and create Snowpark DataFrames.
from snowflake.snowpark.context import get_active_session
session = get_active_session()
# Bare expression on the last line so the notebook displays the session object
session


In [2]:
# Read the MORTGAGE_TRAINING_DATA table through the Snowpark session and convert it to a pandas DataFrame
df = session.table("MORTGAGE_TRAINING_DATA").to_pandas()
# Preview the first rows as the cell output
df.head()

'1.0.2'

In [8]:
# Partition the column names by dtype for later use:
#   catFeats  -> columns stored with object dtype (treated as categorical)
#   contFeats -> every remaining column (treated as continuous)
catFeats = df.columns[df.dtypes == 'O'].tolist()
contFeats = df.columns.drop(catFeats).tolist()

# The target is not an input feature, so take it out of the continuous list
contFeats.remove('MORTGAGERESPONSE')

print(contFeats)
print(catFeats)

In [10]:
# Separate the target from the input features, then hold out 30% of the rows as a test set.
# The fixed random_state keeps the split identical from run to run.
X = df.drop(columns='MORTGAGERESPONSE')
y = df['MORTGAGERESPONSE']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=14,
)

In [11]:
# Preprocessing building blocks, one per feature family:
#   continuous  -> standardise with StandardScaler
#   categorical -> one-hot encode, dropping the first level of each feature
contTransform = Pipeline(steps=[
    ('scaler', StandardScaler()),
])
catTransform = Pipeline(steps=[
    ('oneHots', OneHotEncoder(drop='first')),
])

# Column transformer that routes each list of columns to its matching pipeline
allFeatTransform = ColumnTransformer(transformers=[
    ('numerical', contTransform, contFeats),
    ('categorical', catTransform, catFeats),
])

## Random forest

In [27]:
#Define the full pipeline with the preprocessing steps and the random forest classifier.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All rebuilds the same forest and therefore reports the same metrics;
# without it every run trains a differently-seeded model.
rf_pipe = Pipeline(steps=[
    ('preprocessing_steps', allFeatTransform),
    ('rf_classifier', RandomForestClassifier(random_state=14)),
])

In [28]:
#Train the model
# Fitting the pipeline fits the preprocessing step on X_train and then the random forest
# on the transformed features, so the test split is not used during fitting.
rf_pipe.fit(X_train, y_train)

In [30]:
# Score the held-out test set with the fitted pipeline
rf_yhat = rf_pipe.predict(X_test)

# Confusion matrix with every cell expressed as a percentage of all test rows
print((confusion_matrix(y_test, rf_yhat) / len(y_test)).round(4) * 100)

# F1 score
print(f"F1 score - {f1_score(y_test,rf_yhat)}")

0.7592305053532418
              precision    recall  f1-score   support

           0       0.42      0.32      0.36     23724
           1       0.83      0.88      0.85     87050

    accuracy                           0.76    110774
   macro avg       0.62      0.60      0.61    110774
weighted avg       0.74      0.76      0.75    110774

[[ 7484 16240]
 [10431 76619]]


# MODEL REGISTRY

In [None]:
# Set up the Snowflake model registry handle used by the cells below
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier
from snowflake.ml.model import model_signature

# Name under which every model version logged from this notebook is grouped
model_name = "mortgage_lending_sklearn_pipeline"

# Resolve the session's current database and schema names.
# NOTE(review): `identifier._get_unescaped_name` lives in a private (`_internal`) module —
# confirm it is still available before upgrading snowflake-ml-python.
db = identifier._get_unescaped_name(session.get_current_database())
schema = identifier._get_unescaped_name(session.get_current_schema())

# Registry bound to that database/schema; models are logged to and fetched from it
model_registry = Registry(
    session=session,
    database_name=db,
    schema_name=schema,
)

In [None]:
#First we'll deploy the full pipeline to the model registry and use the raw X_test data as our sample input data

version_name = 'full_sklearn_pipeline'

# Reuse the version if it is already registered; otherwise log it now.
# `except Exception` (instead of a bare `except:`) keeps the fallback for a failed lookup
# while letting KeyboardInterrupt / SystemExit propagate, so interrupting the kernel
# during the lookup can no longer trigger an unintended model upload.
try:
    model_ver_pipeline = model_registry.get_model(model_name).version(version_name)
    print("Found existing model version!")
except Exception:
    print("Logging new model version...")
    model_ver_pipeline = model_registry.log_model(
        model_name=model_name,
        model=rf_pipe, #full model pipeline
        version_name=version_name,
        sample_input_data=X_test, #raw test data
        comment = "full sklearn pipeline with preprocessing transformers",
    )

In [None]:
#List the built-in functions for the model version we have deployed to the model registry
model_ver_pipeline.show_functions()

In [None]:
#Now predict the test data using the model registry model
# The raw X_test is passed because this version was logged as the full pipeline, preprocessing included.
## note that you can pass the pandas df directly to this function rather than using session.create_dataframe to convert it to a snowpark df
model_ver_pipeline.run(session.create_dataframe(X_test), function_name="predict")

In [None]:
#Here we can extract the pipeline object back out of the model registry version if we want it locally
loaded_pipe = model_ver_pipeline.load()
# Bare expression so the notebook displays the loaded object
loaded_pipe

In [None]:
# Quick validation that the two pipeline stages also work when pulled apart:
##  1. the fitted preprocessing step transforms the raw test data
##  2. the fitted classifier predicts directly on the first ten transformed rows
transformed_test_data = rf_pipe.named_steps['preprocessing_steps'].transform(X_test)
rf_pipe.named_steps['rf_classifier'].predict(transformed_test_data[:10])

In [None]:
#Now we'll deploy just the classifier extracted from pipeline to the model registry and use the transformed X_test data as our sample input data
version_name = 'just_model'

# Reuse the version if it is already registered; otherwise log it now.
# `except Exception` (instead of a bare `except:`) keeps the fallback for a failed lookup
# while letting KeyboardInterrupt / SystemExit propagate, so interrupting the kernel
# during the lookup can no longer trigger an unintended model upload.
try:
    model_ver_just_model = model_registry.get_model(model_name).version(version_name)
    print("Found existing model version!")
except Exception:
    print("Logging new model version...")
    model_ver_just_model = model_registry.log_model(
        model_name=model_name,
        model = rf_pipe['rf_classifier'], # extract the classifier from the pipeline
        version_name=version_name,
        sample_input_data=rf_pipe['preprocessing_steps'].transform(X_test), # extract the preprocessing steps from the pipeline and transform the test data
        comment = "Extracted classifier from pipeline"
    )

In [None]:
#List the functions of the classifier-only version, for comparison with the list shown for the full-pipeline version
model_ver_just_model.show_functions()

In [None]:
#Now predict the test data using the model registry model
# The input is `transformed_test_data` from the validation cell (the preprocessing step's
# transform of X_test), because this version holds only the classifier.
# NOTE(review): that value is whatever ColumnTransformer.transform returned, not a
# pandas/Snowpark DataFrame — confirm run() accepts it as-is.
model_ver_just_model.run(transformed_test_data, function_name="PREDICT")

In [None]:
#Here we can extract the classifier object back out of the model registry version if we want it locally

loaded_just_model = model_ver_just_model.load()
# Bare expression so the notebook displays the loaded object
loaded_just_model

## Conclusion

### 🚀 Snowflake model registry gives users a secure and flexible framework to deploy a raw model, or a full model pipeline with pre/post-processing transformers 🚀
#### 🔮 All model versions are logged in the model registry for inference, explainability, lineage tracking, visibility and more 🔮
#### 🌐 The actual model (or pipeline) object can be extracted from the model registry as needed 🌐