# Model Packaging Example

## Before Everything

### Install `snowflake-ml-python` locally

Please refer to our [readme file](https://docs.google.com/document/d/10DmBHYFGKINQwyvJupfuhARDk-cyG5_Fn3Uy2OQcQPk) to install `snowflake-ml-python`.

In [None]:
# Snowpark Connector, Snowpark Library, Session
import snowflake.connector
import snowflake.snowpark
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.ml.utils import connection_params

### Setup Notebook

In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are re-imported automatically.
%load_ext autoreload
%autoreload 2

In [None]:
# Stretch the notebook container to the full browser width so wide .show() tables are not clipped.
from IPython.display import display, HTML

full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

### Start Snowpark Session

To avoid exposing credentials in GitHub, we use a small utility, `SnowflakeLoginOptions`. It allows you to store your default credentials in `~/.snowsql/config` in the following format:
```
[connections]
accountname = <string>   # Account identifier to connect to Snowflake.
username = <string>      # User name in the account. Optional.
password = <string>      # User password. Optional.
dbname = <string>        # Default database. Optional.
schemaname = <string>    # Default schema. Optional.
warehousename = <string> # Default warehouse. Optional.
#rolename = <string>      # Default role. Optional.
#authenticator = <string> # Authenticator: 'snowflake', 'externalbrowser', etc
```
Please follow [this](https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings) for more details.

In [None]:
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session

# Build the Snowpark session from the connection parameters returned by SnowflakeLoginOptions().
session = Session.builder.configs(SnowflakeLoginOptions()).create()

### Open/Create Model Registry

A model registry needs to be created before it can be used. Creating it will create a new database in the current account, so the active role needs permission to create a database. After the first creation, the model registry can be opened without creating it again.

In [None]:
# Database and schema passed to the model-registry calls throughout this notebook.
REGISTRY_DATABASE_NAME = "TEMP"
REGISTRY_SCHEMA_NAME = "WZHAO"

In [None]:
from snowflake.ml.registry import model_registry

# The same session/database/schema triple is used both to create the registry and to open it.
registry_location = {
    "session": session,
    "database_name": REGISTRY_DATABASE_NAME,
    "schema_name": REGISTRY_SCHEMA_NAME,
}
model_registry.create_model_registry(**registry_location)
registry = model_registry.ModelRegistry(**registry_location)

## Use with snowml model

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd


# Load the iris dataset and combine the features and the label into one pandas DataFrame.
iris = load_iris()
raw_column_names = iris["feature_names"] + ["target"]
df = pd.DataFrame(
    data=np.c_[iris["data"], iris["target"]],
    columns=raw_column_names,
)

# Normalize the headers: upper-case them, drop the " (CM)" unit suffix and remove spaces.
df.columns = [
    name.replace(" (CM)", "").replace(" ", "") for name in df.columns.str.upper()
]

# Column roles handed to the snowml estimators in the following cells.
INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"]
LABEL_COLUMNS = "TARGET"
OUTPUT_COLUMNS = "PREDICTED_TARGET"

In [None]:
# Display the prepared DataFrame as the cell output.
df

In [None]:
# First ten rows of df, reused as the inference input for the comparisons in this notebook.
test_features = df[:10]
# Version string under which the models are logged to the registry.
model_version = "1_008"

### XGBoost model

In [None]:
# Configure the snowml XGBoost wrapper with the column roles defined earlier, then train on df.
clf_xgb = XGBClassifier(
    input_cols=INPUT_COLUMNS,
    output_cols=OUTPUT_COLUMNS,
    label_cols=LABEL_COLUMNS,
    drop_input_cols=True,
)
clf_xgb.fit(df)

In [None]:
# Local reference outputs from the fitted model; later cells compare the restored and deployed models against them.
prediction = clf_xgb.predict(test_features)
prediction_proba = clf_xgb.predict_proba(test_features)

In [None]:
# Registry model name and deployment name used for the XGBoost model.
model_name = "SIMPLE_XGB_MODEL"
deploy_name = "xgb_model_predict"

In [None]:
# Log the fitted classifier under model_name / model_version; tags are attached at registration time.
model_id = registry.log_model(
    model_name=model_name,
    model_version=model_version,
    model=clf_xgb,
    tags={"stage": "testing", "classifier_type": "XGBClassifier"},
    options={"embed_local_ml_library": True}
)

# A ModelReference is the object-API handle to the model version that was just logged.
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
print("Registered new model:", model_id)

### Test the result using load_model

In [None]:
# Re-open the registry and load the logged model back from it.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
restored_clf = model.load_model()

# Score the same rows with the restored model.
restored_prediction = restored_clf.predict(test_features)

print("Original prediction:", prediction[:10])
print("Restored prediction:", restored_prediction[:10])

# Select the restored output in the original's column order before the exact comparison.
print("Result comparison:", np.array_equal(prediction, restored_prediction[prediction.columns]))

### Testing on deploy

#### Predict function match/mismatch? - comparison between deploy and local

In [None]:
# Re-open the registry, look up the logged model version and deploy its `predict` method under deploy_name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict` output.
remote_prediction = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction[:10])

# Exact comparison of the local result against the raw values of the remote result.
print("Result comparison:", np.array_equal(prediction, remote_prediction.values))

#### Predict_proba function match/mismatch? - comparison between deploy and local

In [None]:
# Re-open the registry and deploy the model's `predict_proba` method.
# NOTE(review): this reuses the deployment name of the earlier `predict` deployment - confirm the
# registry replaces the existing deployment rather than rejecting the duplicate name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict_proba",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict_proba` output.
remote_prediction_proba = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction_proba[:10])

# Probabilities are floats, so compare within a tolerance rather than exactly.
print("Result comparison:", np.allclose(prediction_proba, remote_prediction_proba.values))

### Random Forest model *from ensemble*


In [None]:
from snowflake.ml.modeling.ensemble import RandomForestClassifier

In [None]:
# Configure the snowml random-forest wrapper with the shared column roles, then train on df.
clf_rf = RandomForestClassifier(
    input_cols=INPUT_COLUMNS,
    output_cols=OUTPUT_COLUMNS,
    label_cols=LABEL_COLUMNS,
    drop_input_cols=True,
)
clf_rf.fit(df)

In [None]:
# Local reference outputs for each inference method; these rebind the names used for the XGBoost results.
prediction = clf_rf.predict(test_features)
prediction_proba = clf_rf.predict_proba(test_features)
prediction_log_proba = clf_rf.predict_log_proba(test_features)

In [None]:
# Registry model name, deployment name and tag value used for the random-forest model.
model_name = "SIMPLE_RF_MODEL"
deploy_name = "rf_model_predict"
classifier_type = "RFClassifier"

In [None]:
# Log the fitted random forest under model_name / model_version; tags are attached at registration time.
model_id = registry.log_model(
    model_name=model_name,
    model_version=model_version,
    model=clf_rf,
    tags={"stage": "testing", "classifier_type": classifier_type},
    options={"embed_local_ml_library": True}
)

# A ModelReference is the object-API handle to the model version that was just logged.
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
print("Registered new model:", model_id)

#### Comparison between load_model

In [None]:
# Re-open the registry and load the logged random-forest model back from it.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
restored_clf = model.load_model()

# Score the same rows with the restored model.
restored_prediction = restored_clf.predict(test_features)

print("Original prediction:", prediction[:10])
print("Restored prediction:", restored_prediction[:10])

# Compare frame to frame, as the XGBoost and logistic-regression checks do. Comparing the single
# 1-D "PREDICTED_TARGET" column against a 2-D column selection has mismatched shapes, so
# np.array_equal would report False even when the predictions agree.
print("Result comparison:", np.array_equal(prediction, restored_prediction[prediction.columns]))

#### Comparison between deploy

In [None]:
# Re-open the registry, look up the logged model version and deploy its `predict` method under deploy_name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict` output.
remote_prediction = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction[:10])

# Exact comparison of the local result against the raw values of the remote result.
print("Result comparison:", np.array_equal(prediction, remote_prediction.values))

In [None]:
# Re-open the registry and deploy the model's `predict_proba` method.
# NOTE(review): this reuses the deployment name of the earlier `predict` deployment - confirm the
# registry replaces the existing deployment rather than rejecting the duplicate name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict_proba",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict_proba` output.
remote_prediction_proba = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction_proba[:10])

# Probabilities are floats: compare within a tolerance, consistent with the XGBoost and
# logistic-regression checks in this notebook, instead of requiring bit-exact equality.
print("Result comparison:", np.allclose(prediction_proba, remote_prediction_proba.values))

In [None]:
# Re-open the registry and deploy the model's `predict_log_proba` method.
# NOTE(review): this reuses the deployment name of the earlier deployments - confirm the
# registry replaces the existing deployment rather than rejecting the duplicate name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict_log_proba",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict_log_proba` output.
remote_prediction_log_proba = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction_log_proba[:10])

# Log-probabilities are floats: compare within a tolerance, consistent with the
# logistic-regression check in this notebook. np.allclose treats infinities of the same sign
# in the same position as equal, so -inf entries (log of a zero probability) still match.
print("Result comparison:", np.allclose(prediction_log_proba, remote_prediction_log_proba.values))

### Logistic Regression model

We test with the LR model because it provides all of the inference methods: `predict`, `predict_log_proba`, `predict_proba`, and `decision_function`.

In [None]:
from snowflake.ml.modeling.linear_model import LogisticRegression

In [None]:
# Configure the snowml logistic-regression wrapper with the shared column roles, then train on df.
clf_lr = LogisticRegression(
    input_cols=INPUT_COLUMNS,
    output_cols=OUTPUT_COLUMNS,
    label_cols=LABEL_COLUMNS,
    drop_input_cols=True,
    max_iter=1000,
)
clf_lr.fit(df)

In [None]:
# Local reference outputs for each inference method; these rebind the names used for the previous model.
prediction = clf_lr.predict(test_features)
prediction_proba = clf_lr.predict_proba(test_features)
prediction_log_proba = clf_lr.predict_log_proba(test_features)
prediction_decision = clf_lr.decision_function(test_features)

In [None]:
# Registry model name, deployment name and tag value used for the logistic-regression model.
model_name = "SIMPLE_LR_MODEL"
deploy_name = "lr_model_predict"
classifier_type = "LogisticRegression"

In [None]:
# Log the fitted logistic regression under model_name / model_version; tags are attached at registration time.
model_id = registry.log_model(
    model_name=model_name,
    model_version=model_version,
    model=clf_lr,
    tags={"stage": "testing", "classifier_type": classifier_type},
    options={"embed_local_ml_library": True}
)

# A ModelReference is the object-API handle to the model version that was just logged.
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
print("Registered new model:", model_id)

#### Comparison between load_model

In [None]:
# Re-open the registry and load the logged logistic-regression model back from it.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
restored_clf = model.load_model()

# Score the same rows with the restored model.
restored_prediction = restored_clf.predict(test_features)

print("Original prediction:", prediction[:10])
print("Restored prediction:", restored_prediction[:10])

# Select the restored output in the original's column order before the exact comparison.
print("Result comparison:", np.array_equal(prediction, restored_prediction[prediction.columns]))

#### Comparison between deploy

In [None]:
# Re-open the registry, look up the logged model version and deploy its `predict` method under deploy_name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict` output.
remote_prediction = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction[:10])

# Exact comparison of the local result against the raw values of the remote result.
print("Result comparison:", np.array_equal(prediction, remote_prediction.values))

In [None]:
# Re-open the registry and deploy the model's `predict_proba` method.
# NOTE(review): this reuses the deployment name of the earlier `predict` deployment - confirm the
# registry replaces the existing deployment rather than rejecting the duplicate name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict_proba",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict_proba` output.
remote_prediction_proba = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction_proba[:10])

# Probabilities are floats, so compare within a tolerance rather than exactly.
print("Result comparison:", np.allclose(prediction_proba, remote_prediction_proba.values))

In [None]:
# Re-open the registry and deploy the model's `predict_log_proba` method.
# NOTE(review): this reuses the deployment name of the earlier deployments - confirm the
# registry replaces the existing deployment rather than rejecting the duplicate name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict_log_proba",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `predict_log_proba` output.
remote_prediction_log_proba = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction_log_proba[:10])

# Log-probabilities are floats, so compare within a tolerance rather than exactly.
print("Result comparison:", np.allclose(prediction_log_proba, remote_prediction_log_proba.values))

In [None]:
# Re-open the registry and deploy the model's `decision_function` method.
# NOTE(review): this reuses the deployment name of the earlier deployments - confirm the
# registry replaces the existing deployment rather than rejecting the duplicate name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="decision_function",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment and compare it with the local `decision_function` output.
remote_prediction_decision_function = model.predict(deployment_name=deploy_name, data=test_features)

print("Remote prediction:", remote_prediction_decision_function[:10])

# Decision scores are floats, so compare within a tolerance rather than exactly.
print("Result comparison:", np.allclose(prediction_decision, remote_prediction_decision_function.values))

### Pipeline model

It is important to verify that the whole pipeline is stored.

In [None]:
def add_simple_category(df):
    """Return a copy of ``df`` with a categorical ``SIMPLE`` column derived from ``SEPALLENGTH``.

    ``SEPALLENGTH`` is bucketed with ``pd.cut`` into the right-closed intervals
    (-1, 4], (4, 5], (5, 6] and (6, 10], labelled 'Unknown', '1_quartile',
    '2_quartile' and '3_quartile' respectively. Values outside (-1, 10] become NaN.

    Args:
        df: pandas DataFrame that contains a numeric ``SEPALLENGTH`` column.

    Returns:
        A new DataFrame with the extra ``SIMPLE`` column; the input frame is left untouched.
    """
    bins = (-1, 4, 5, 6, 10)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile']
    # Work on a copy so the caller's frame (reused by other cells) is not modified as a side effect.
    df_with_category = df.copy()
    df_with_category['SIMPLE'] = pd.cut(df_with_category['SEPALLENGTH'], bins, labels=group_names)
    return df_with_category
# Frame with the extra categorical SIMPLE column, used as the pipeline's training data.
df_cat = add_simple_category(df)

# Column groups for the pipeline: numeric inputs, the categorical input, and the
# names of the scaled numeric outputs (each input name with an "_O" suffix).
numeric_features=['SEPALLENGTH', 'SEPALWIDTH', 'PETALLENGTH', 'PETALWIDTH']
categorical_features = ['SIMPLE']
numeric_features_output = [x + '_O' for x in numeric_features]

In [None]:
# Define the target schema and table name for the demo data.


############################################################################
# NOTE:
#    Set the work_schema variable to a schema that exists in your account.
############################################################################
work_schema = 'TEST'
demo_table = 'IRIS_UPPER'

# Write the pandas DataFrame to a temporary Snowflake table in work_schema, creating the table if needed.
session.write_pandas(df_cat, demo_table, auto_create_table=True, table_type="temporary", schema=work_schema)

In [None]:
# Read back the iris table that was written with schema=work_schema.
# The name is built from work_schema rather than the session's current schema, so the lookup
# targets the schema the table was actually written to even when the two differ.
input_tbl = f"{session.get_current_database()}.{work_schema}.{demo_table}"
iris_df = session.table(input_tbl)
print(iris_df.limit(10).to_pandas())

In [None]:
# Import from the snowflake.ml.modeling namespace, consistent with the other estimators in this notebook.
from snowflake.ml.modeling.linear_model import LogisticRegression
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

# Three-step pipeline: one-hot encode the categorical column, min-max scale the numeric
# columns into their "_O" outputs, then classify with logistic regression.
pipeline = Pipeline(
    steps=[
        ('OHEHOT', OneHotEncoder(input_cols=categorical_features, output_cols='cat_output', drop_input_cols=True), ),
        ('SCALER', MinMaxScaler(clip=True, input_cols=numeric_features, output_cols=numeric_features_output, drop_input_cols=True), ),
        ('CLASSIFIER', LogisticRegression(label_cols=LABEL_COLUMNS))
    ])
# Fit on the Snowpark DataFrame.
pipeline.fit(iris_df)

In [None]:
# Take ten rows of the Snowpark DataFrame as test input and score them with the fitted pipeline.
iris_df_test = iris_df.limit(10)
prediction = pipeline.predict(iris_df_test)

In [None]:
# Fit the same pipeline again, this time on the pandas version of the table.
pipeline.fit(iris_df.to_pandas())

In [None]:
# Local reference outputs on the pandas version of the test rows; `prediction` is rebound here.
# NOTE(review): each call re-runs iris_df_test.to_pandas(); iris_df_test is a limit(10) with no
# visible ordering - confirm the same rows are returned on every call.
prediction = pipeline.predict(iris_df_test.to_pandas())
prediction_log_proba = pipeline.predict_log_proba(iris_df_test.to_pandas())
prediction_proba = pipeline.predict_proba(iris_df_test.to_pandas())

In [None]:
# Registry model name, deployment name and tag value used for the pipeline model.
model_name = "SIMPLE_PP_MODEL"
deploy_name = "pp_model_predict"
classifier_type = "Pipeline"
# Unlike the earlier models, the pipeline gets its own version string derived from the model name.
model_version = f"{model_name}_007"

In [None]:
# Log the fitted pipeline under model_name / model_version; tags are attached at registration time.
model_id = registry.log_model(
    model_name=model_name,
    model_version=model_version,
    model=pipeline,
    tags={"stage": "testing", "classifier_type": classifier_type},
    options={"embed_local_ml_library": True}
)

# A ModelReference is the object-API handle to the model version that was just logged.
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
print("Registered new model:", model_id)

#### Comparison between load_model

In [None]:
# Re-open the registry and load the logged pipeline back from it.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
restored_clf = model.load_model()

# Score the pandas version of the test rows with the restored pipeline.
restored_prediction = restored_clf.predict(iris_df_test.to_pandas())

print("Original prediction:", prediction[:10])
print("Restored prediction:", restored_prediction[:10])

# Select the restored output in the original's column order before the exact comparison.
print("Result comparison:", np.array_equal(prediction, restored_prediction[prediction.columns]))

#### Comparison between deploy predict

In [None]:
# Re-open the registry, look up the logged pipeline version and deploy its `predict` method under deploy_name.
registry = model_registry.ModelRegistry(
    session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME
)
model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)
model.deploy(
    deployment_name=deploy_name,
    target_method="predict",
    options={"relax_version": True},
)

In [None]:
# Run inference through the deployment on the pandas version of the test rows.
remote_prediction = model.predict(deployment_name=deploy_name, data=iris_df_test.to_pandas())

print("Remote prediction:", remote_prediction[:10])

# NOTE(review): np.allclose needs numeric, broadcast-compatible inputs - confirm `prediction`
# holds only the columns that the deployment returns.
print("Result comparison:", np.allclose(prediction, remote_prediction.values))