# ML Platform Demo - 2023-02-21

## Setup Notebook and Import Path

In [1]:
# Scale cell width with the browser window to accommodate .show() commands for wider tables.
from IPython.display import display, HTML

# Let the notebook container span the full browser width.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")

# Strip the `output_scroll` class from output areas to disable scrolling.
disable_scroll_js = HTML("<script>$('.output_scroll').removeClass('output_scroll')</script>")

display(full_width_css, disable_scroll_js)

#display(HTML("<style>.container { font-size: 10px; }</style>"))
#display(HTML("<style>div.output_area pre { font-size: 16px; }</style>"))
# Identifiers for the two interchangeable backends used throughout the notebook.
BACKEND_SKLEARN, BACKEND_SNOWML = "SKLEARN", "SNOWML"

# Backend used for the preprocessing stage and for the estimator stage.
PREPROCESSING_BACKEND = ESTIMATOR_BACKEND = BACKEND_SNOWML

In [2]:
import sys
import os

def add_repo_path(repo_path, enabled=False):
    """Optionally append ``repo_path`` to ``sys.path``.

    Path injection is switched off by default: a call without
    ``enabled=True`` returns immediately and leaves ``sys.path`` untouched.
    The switch is an explicit keyword rather than a bare early ``return`` so
    the path logic below stays reachable when it is needed.

    Args:
        repo_path: Directory to make importable.
        enabled: When True, append ``repo_path`` unless it is already on
            ``sys.path``. When False (default), do nothing.

    Returns:
        None.
    """
    if not enabled:
        return
    if repo_path not in sys.path:
        print(f"Adding {repo_path} to system path")
        sys.path.append(repo_path)

# Reading from the local repository
cwd = os.getcwd()
repo_marker_index = cwd.find("snowflake/ml")
# str.find returns -1 when the marker is absent; slicing with -1 would silently
# drop the last character of cwd and hand a bogus path on, so skip that case.
if repo_marker_index != -1:
    add_repo_path(cwd[:repo_marker_index].rstrip('/'))

## Start Snowpark Session

To avoid exposing credentials in GitHub, we use a small utility, `SnowflakeLoginOptions`. It lets you store your default credentials in `~/.snowsql/config` in the following format:
```
[connections]
accountname = <string>   # Account identifier to connect to Snowflake.
username = <string>      # User name in the account. Optional.
password = <string>      # User password. Optional.
dbname = <string>        # Default database. Optional.
schemaname = <string>    # Default schema. Optional.
warehousename = <string> # Default warehouse. Optional.
#rolename = <string>      # Default role. Optional.
#authenticator = <string> # Authenticator: 'snowflake', 'externalbrowser', etc
```
See the [SnowSQL default connection settings documentation](https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings) for more details.

In [3]:
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session, Column, functions

# Connection parameters come from SnowflakeLoginOptions() rather than being written into the notebook.
connection_parameters = SnowflakeLoginOptions()
session = Session.builder.configs(connection_parameters).create()

In [4]:
#WHEEL_FILE = "snowflake_ml_python-0.0.1-py3-none-any-64c3e057103350427916c3acffcb619aa784ba85.whl"
#session.file.get(f"@SNOWML_WHEEL/{WHEEL_FILE}", "/tmp")
#add_repo_path(f"/tmp/{WHEEL_FILE}")

## Prepare Training Data

In [5]:
from pprint import pprint
from IPython.display import display, Markdown
# Load the bank-marketing table, deriving a numeric LABEL from the textual Y column and dropping Y.
source_query = """SELECT *, IFF(Y = 'yes', 1.0, 0.0) as LABEL FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS"""
all_data = session.sql(source_query).drop(Column("Y"))

# 90/10 train/test split with a fixed seed.
train_data, test_data = all_data.random_split(weights=[0.9, 0.1], seed=23)

# Row counts, rendered as a small Markdown table.
total_row_count = all_data.count()
training_row_count = train_data.count()
test_row_count = test_data.count()
display(Markdown(f"""
|total rows|training rows|test rows|
| :---: | :---: | :---: |
|{total_row_count}|{training_row_count}|{test_row_count}|
"""))

print()
print("Example training row:")
example_row = train_data.limit(1).collect()[0]
pprint(example_row.as_dict())

# Column roles used by the preprocessing and training cells.
categorical_columns = ('AGE', 'CAMPAIGN', 'CONTACT', 'DAY_OF_WEEK', 'DEFAULT', 'EDUCATION', 'HOUSING', 'JOB', 'LOAN', 'MARITAL', 'MONTH', 'POUTCOME')
numerical_columns = ('CONS_CONF_IDX', 'CONS_PRICE_IDX', 'DURATION', 'EMP_VAR_RATE', 'EURIBOR3M', 'NR_EMPLOYED', 'PDAYS', 'PREVIOUS')
label_column = 'LABEL'

# The pandas copies are only materialised for the scikit-learn preprocessing path.
if PREPROCESSING_BACKEND == BACKEND_SKLEARN:
    train_label = train_data.select(label_column).to_pandas()
    train_features = train_data.drop(label_column).to_pandas()
    test_label = test_data.select(label_column).to_pandas()
    test_features = test_data.drop(label_column).to_pandas()
    



|total rows|training rows|test rows|
| :---: | :---: | :---: |
|41188|36973|4215|



Example training row:
{'AGE': 56,
 'CAMPAIGN': 1,
 'CONS_CONF_IDX': -36.4,
 'CONS_PRICE_IDX': 93.994,
 'CONTACT': 'telephone',
 'DAY_OF_WEEK': 'mon',
 'DEFAULT': 'no',
 'DURATION': 261,
 'EDUCATION': 'basic.4y',
 'EMP_VAR_RATE': 1.1,
 'EURIBOR3M': 4.857,
 'HOUSING': 'no',
 'JOB': 'housemaid',
 'LABEL': 0,
 'LOAN': 'no',
 'MARITAL': 'married',
 'MONTH': 'may',
 'NR_EMPLOYED': 5191.0,
 'PDAYS': 999,
 'POUTCOME': 'nonexistent',
 'PREVIOUS': 0}


## Preprocess Data

In [6]:
if PREPROCESSING_BACKEND == BACKEND_SNOWML:
    from snowflake.ml.framework.pipeline import Pipeline
    from snowflake.ml.preprocessing import OneHotEncoder
    from snowflake.ml.preprocessing import MinMaxScaler
    from snowflake.ml.preprocessing import LabelEncoder
elif PREPROCESSING_BACKEND == BACKEND_SKLEARN:
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import LabelEncoder


if PREPROCESSING_BACKEND == BACKEND_SNOWML:
    # One-hot encode the categorical inputs and min-max scale the numeric ones;
    # scaled values are written to new "MMS_"-prefixed columns.
    one_hot_step = OneHotEncoder(
        handle_unknown='ignore',
        input_cols=categorical_columns,
        output_cols=categorical_columns,
    )
    scaling_step = MinMaxScaler(
        clip=True,
        input_cols=numerical_columns,
        output_cols=[f"MMS_{column}" for column in numerical_columns],
    )
    preprocessing = Pipeline(steps=[("OHE", one_hot_step), ("MMS", scaling_step)])
    preprocessing.fit(train_data)

    # After transforming, drop the raw input columns from both splits.
    raw_input_columns = categorical_columns + numerical_columns
    train_preprocessed = preprocessing.transform(train_data).drop(*raw_input_columns)
    test_preprocessed = preprocessing.transform(test_data).drop(*raw_input_columns)

elif PREPROCESSING_BACKEND == BACKEND_SKLEARN:
    # Same two steps, expressed as a scikit-learn ColumnTransformer over the pandas frames.
    column_transformers = [
        ("OHE", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_columns),
        ("MMS", MinMaxScaler(copy=False), numerical_columns),
    ]
    preprocessing = ColumnTransformer(transformers=column_transformers)
    preprocessing.fit(train_features)
    train_preprocessed = preprocessing.transform(train_features)
    test_preprocessed = preprocessing.transform(test_features)

# On the SnowML path, every preprocessed column except the label is a model input.
if PREPROCESSING_BACKEND == BACKEND_SNOWML:
    feature_columns = train_preprocessed.columns
    # Removes the label from the list just obtained. The assert below can only
    # hold if `.columns` hands back a separate list on each access.
    feature_columns.remove(label_column)
    assert len(feature_columns) == len(train_preprocessed.columns) - 1
    print("using features:", feature_columns)
    print("using label:", label_column)



using features: ['"AGE_\'17\'"', '"AGE_\'18\'"', '"AGE_\'19\'"', '"AGE_\'20\'"', '"AGE_\'21\'"', '"AGE_\'22\'"', '"AGE_\'23\'"', '"AGE_\'24\'"', '"AGE_\'25\'"', '"AGE_\'26\'"', '"AGE_\'27\'"', '"AGE_\'28\'"', '"AGE_\'29\'"', '"AGE_\'30\'"', '"AGE_\'31\'"', '"AGE_\'32\'"', '"AGE_\'33\'"', '"AGE_\'34\'"', '"AGE_\'35\'"', '"AGE_\'36\'"', '"AGE_\'37\'"', '"AGE_\'38\'"', '"AGE_\'39\'"', '"AGE_\'40\'"', '"AGE_\'41\'"', '"AGE_\'42\'"', '"AGE_\'43\'"', '"AGE_\'44\'"', '"AGE_\'45\'"', '"AGE_\'46\'"', '"AGE_\'47\'"', '"AGE_\'48\'"', '"AGE_\'49\'"', '"AGE_\'50\'"', '"AGE_\'51\'"', '"AGE_\'52\'"', '"AGE_\'53\'"', '"AGE_\'54\'"', '"AGE_\'55\'"', '"AGE_\'56\'"', '"AGE_\'57\'"', '"AGE_\'58\'"', '"AGE_\'59\'"', '"AGE_\'60\'"', '"AGE_\'61\'"', '"AGE_\'62\'"', '"AGE_\'63\'"', '"AGE_\'64\'"', '"AGE_\'65\'"', '"AGE_\'66\'"', '"AGE_\'67\'"', '"AGE_\'68\'"', '"AGE_\'69\'"', '"AGE_\'70\'"', '"AGE_\'71\'"', '"AGE_\'72\'"', '"AGE_\'73\'"', '"AGE_\'74\'"', '"AGE_\'75\'"', '"AGE_\'76\'"', '"AGE_\'77\'"', '"AGE_\

In [7]:
if ESTIMATOR_BACKEND == BACKEND_SNOWML:
    from snowflake.ml.xgboost.xgb_regressor import XGBRegressor
    from snowflake.ml.ensemble.random_forest_regressor import RandomForestRegressor
    from snowflake.ml.ensemble.gradient_boosting_regressor import GradientBoostingRegressor
    from snowflake.ml.linear_model.logistic_regression import LogisticRegression
    from snowflake.ml.linear_model.linear_regression import LinearRegression
elif ESTIMATOR_BACKEND == BACKEND_SKLEARN:
    from sklearn.utils.validation import column_or_1d
    from xgboost import XGBRegressor
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor    

# Fit a logistic-regression classifier with whichever estimator backend is selected.
if ESTIMATOR_BACKEND == BACKEND_SNOWML:
    prediction_col = "PREDICTION"
    # NOTE(review): input_cols/output_cols are lists while label_cols is a bare string — confirm the estimator accepts both forms.
    # NOTE(review): the output saved with this cell is a SnowparkSQLException raised by this fit
    # ("ValueError: at least one array or dtype is required") — the SnowML path needs attention.
    clf = LogisticRegression(input_cols=feature_columns, output_cols=[prediction_col], label_cols=label_column)
    clf.fit(train_preprocessed)

elif ESTIMATOR_BACKEND == BACKEND_SKLEARN:
    clf = LogisticRegression(max_iter=1000)
    # train_label is a single-column pandas frame; column_or_1d passes it to fit as a 1-D array.
    clf.fit(train_preprocessed, column_or_1d(train_label))

  from pandas import MultiIndex, Int64Index
ERROR:snowflake.snowpark._internal.server_connection:Failed to execute query [queryID: 01aa8df7-0405-c4ee-000c-a901156fcce3] CALL SNOWML_FIT_F3467B65_3D85_4B43_A678_D6F66E1942EB('SELECT "LABEL", iff((("AGE_''17''" = ''NaN'') OR "AGE_''17''" IS NULL), 0, "AGE_''17''") AS "AGE_''17''", iff((("AGE_''18''" = ''NaN'') OR "AGE_''18''" IS NULL), 0, "AGE_''18''") AS "AGE_''18''", iff((("AGE_''19''" = ''NaN'') OR "AGE_''19''" IS NULL), 0, "AGE_''19''") AS "AGE_''19''", iff((("AGE_''20''" = ''NaN'') OR "AGE_''20''" IS NULL), 0, "AGE_''20''") AS "AGE_''20''", iff((("AGE_''21''" = ''NaN'') OR "AGE_''21''" IS NULL), 0, "AGE_''21''") AS "AGE_''21''", iff((("AGE_''22''" = ''NaN'') OR "AGE_''22''" IS NULL), 0, "AGE_''22''") AS "AGE_''22''", iff((("AGE_''23''" = ''NaN'') OR "AGE_''23''" IS NULL), 0, "AGE_''23''") AS "AGE_''23''", iff((("AGE_''24''" = ''NaN'') OR "AGE_''24''" IS NULL), 0, "AGE_''24''") AS "AGE_''24''", iff((("AGE_''25''" = ''NaN'') OR "AGE_''2

SnowparkSQLException: (1304): 01aa8df7-0405-c4ee-000c-a901156fcce3: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "_udf_code.py", line 7, in compute
  File "/opt/homebrew/anaconda3/envs/snowpark/lib/python3.8/site-packages/snowflake/ml/linear_model/logistic_regression.py", line 360, in fit_wrapper_sproc
  File "/usr/lib/python_udf/41c4eeac2aad7a9c9505d8d840e88d4a8bd7375f79c59cce9c55fabcbc38aef5/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1138, in fit
    X, y = self._validate_data(
  File "/usr/lib/python_udf/41c4eeac2aad7a9c9505d8d840e88d4a8bd7375f79c59cce9c55fabcbc38aef5/lib/python3.8/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/lib/python_udf/41c4eeac2aad7a9c9505d8d840e88d4a8bd7375f79c59cce9c55fabcbc38aef5/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/usr/lib/python_udf/41c4eeac2aad7a9c9505d8d840e88d4a8bd7375f79c59cce9c55fabcbc38aef5/lib/python3.8/site-packages/sklearn/utils/validation.py", line 768, in check_array
    dtype_orig = np.result_type(*dtypes_orig)
  File "<__array_function__ internals>", line 180, in result_type
ValueError: at least one array or dtype is required
 in function SNOWML_FIT_F3467B65_3D85_4B43_A678_D6F66E1942EB with handler compute

In [None]:
# Tag a fixed set of previously registered models as experimental.
# Needs `registry`, which is created in the "Open/Create Model Registry" section further down.
EXPERIMENTAL_MODEL_IDS = (
    "0c0e951eb16411ed8846e289b4f89202",
    "ff52e938b16311ed8846e289b4f89202",
    "f268515eb16311ed8846e289b4f89202",
    "e15897cab16311ed8846e289b4f89202",
    "d1e675e6b16311ed8846e289b4f89202",
    "1cf19188b16411ed8846e289b4f89202",
)

# The loop variable is deliberately not called `id`: a top-level loop variable
# stays in the notebook namespace and would shadow the builtin for later cells.
for experimental_model_id in EXPERIMENTAL_MODEL_IDS:
    print(experimental_model_id)
    registry.set_tag(id=experimental_model_id, name="stage", value="experimental")
    


In [None]:
import numpy as np
from sklearn import metrics

def train_and_evaluate(classifier, X_train, y_train, X_test, y_test):
    """Route training and evaluation to the helper matching the classifier.

    A classifier exposing ``set_input_cols`` is handed to
    ``train_and_evaluate_snowml`` together with the train and test feature
    data only; any other classifier goes to ``train_and_evaluate_sklearn``
    with all four datasets. The helper's result is returned unchanged.
    """
    is_snowml_estimator = hasattr(classifier, "set_input_cols")
    if not is_snowml_estimator:
        return train_and_evaluate_sklearn(classifier, X_train, y_train, X_test, y_test)
    return train_and_evaluate_snowml(classifier, X_train, X_test)

def _labels_and_probabilities(classifier, X_test):
    """Return ``(labels, probabilities)`` for ``X_test`` from a fitted estimator.

    Estimators with ``predict_proba`` supply both values directly. Otherwise
    the output of ``score_samples`` (preferred) or ``predict`` is treated as
    the positive-class score: labels are 1 where the score exceeds 0.5, and
    the probability rows are ``[1 - score, score]``.

    Raises:
        TypeError: If the estimator offers none of the three methods.
    """
    # hasattr (not `in dir(...)`) so a method that is listed on the class but
    # raises AttributeError on access is treated as unavailable.
    if hasattr(classifier, "predict_proba"):
        return classifier.predict(X_test), classifier.predict_proba(X_test)

    if hasattr(classifier, "score_samples"):
        positive_scores = classifier.score_samples(X_test)
    elif hasattr(classifier, "predict"):
        positive_scores = classifier.predict(X_test)
    else:
        raise TypeError(
            f"{classifier.__class__.__name__} has none of predict_proba, score_samples or predict"
        )

    positive_scores = np.asarray(positive_scores)
    labels = np.where(positive_scores > 0.5, 1, 0)
    probabilities = np.column_stack([1 - positive_scores, positive_scores])
    return labels, probabilities


def train_and_evaluate_sklearn(classifier, X_train, y_train, X_test, y_test):
    """Fit a scikit-learn style estimator and score it as a binary classifier.

    Args:
        classifier: Unfitted estimator providing ``fit``, ``get_params`` and
            at least one of ``predict_proba``, ``score_samples``, ``predict``.
        X_train: Training features.
        y_train: Training labels; passed through ``column_or_1d`` before fitting.
        X_test: Test features.
        y_test: Test labels used for every metric.

    Returns:
        A dict with three entries: ``"classifier"`` (object, class name,
        params), ``"predictions"`` (probabilities, labels) and ``"metrics"``
        (accuracy, precision, recall, roc_auc, pr_curve, roc_curve).

    Raises:
        TypeError: If the estimator has no usable prediction method.
    """
    # Imported here because the notebook's top-level import of column_or_1d
    # only runs when ESTIMATOR_BACKEND is BACKEND_SKLEARN.
    from sklearn.utils.validation import column_or_1d

    name = classifier.__class__.__name__
    print(f"Training a {name} with params {classifier.get_params()}.")

    classifier.fit(X_train, column_or_1d(y_train))
    prediction_labels, prediction_probabilities = _labels_and_probabilities(classifier, X_test)
    positive_scores = prediction_probabilities[:, 1]

    return {
        "classifier": {
            "object": classifier,
            "name": name,
            "params": classifier.get_params(),
        },
        "predictions": {
            "probabilities": prediction_probabilities,
            "labels": prediction_labels,
        },
        "metrics": {
            "accuracy": metrics.accuracy_score(y_test, prediction_labels),
            "precision": metrics.precision_score(y_test, prediction_labels),
            "recall": metrics.recall_score(y_test, prediction_labels),
            "roc_auc": metrics.roc_auc_score(y_true=y_test, y_score=positive_scores),
            # Scores are passed positionally: the keyword for this argument was
            # renamed from probas_pred to y_score in newer scikit-learn releases.
            "pr_curve": metrics.precision_recall_curve(y_test, positive_scores, pos_label=1),
            "roc_curve": metrics.roc_curve(y_true=y_test, y_score=positive_scores, pos_label=1),
        }
    }
    

In [None]:
# Train and evaluate each candidate model on the same train/test split.
# train_label / test_label are only created when PREPROCESSING_BACKEND is BACKEND_SKLEARN.
candidate_models = (
    (LinearRegression, {}),
    (LogisticRegression, {"verbose": 0, "max_iter": 10000}),
    (RandomForestRegressor, {"n_estimators": 1}),
    (RandomForestRegressor, {"n_estimators": 10}),
    (RandomForestRegressor, {"n_estimators": 100}),
    (GradientBoostingRegressor, {"n_estimators": 100}),
)

all_results = []
for estimator_class, estimator_kwargs in candidate_models:
    # Each estimator is instantiated right before it is trained, one at a time.
    all_results.append(train_and_evaluate(
        estimator_class(**estimator_kwargs),
        train_preprocessed, train_label,
        test_preprocessed, test_label))


In [None]:
#from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

#for r in all_results:
#    print(r["classifier"]["name"], 
#          r["metrics"]["roc_auc"], 
#          sum(r["predictions"]["labels"]), 
#         )
#    RocCurveDisplay(fpr=r["metrics"]["roc_curve"][0], tpr=r["metrics"]["roc_curve"][1]).plot()


## Open/Create Model Registry

In [None]:
import importlib
from snowflake.ml.registry import model_registry
# Re-execute the registry module in place — presumably so local edits to it
# are picked up without restarting the kernel.
importlib.reload(model_registry)

In [None]:
# Create a new model registry. This will be a no-op if the registry already exists.
# The return value is kept in `create_result`; no later cell in this notebook reads it.
create_result = model_registry.create_model_registry(session)

In [None]:
# Open a handle to the registry on the current Snowpark session; the cells below use it for all registry calls.
registry = model_registry.ModelRegistry(session=session)

In [None]:
# Log every trained model under one shared name, then attach each of its metrics.
for result in all_results:
    classifier_info = result["classifier"]
    model_id = registry.log_model(
        model=classifier_info["object"],
        name="uci-bank-marketing",
        tags={
            "classifier": classifier_info["name"],
            "params": classifier_info["params"],
        },
    )
    # model_id keeps the id of the most recently logged model once the loop ends.
    for name, value in result["metrics"].items():
        registry.set_metric(model_id, name, value)
    
    

## Register a new Model

In [None]:
# Register the classifier held in `clf`. The classifier_type tag is derived
# from `clf` itself so it always describes the model that is actually logged.
model_id = registry.log_model(
    model=clf,
    name="my_model",
    tags={"stage": "testing", "classifier_type": type(clf).__name__},
)
# Object-style handle to the same registry entry, used by the "Object API" cells.
model = model_registry.ModelReference(registry=registry, id=model_id)
print("Registered new model:", model_id)

## Add Metrics

In [None]:
from sklearn import metrics
import snowflake.ml.utils.formatting
# Re-execute both modules in place before using the registry again.
importlib.reload(snowflake.ml.utils.formatting)
importlib.reload(snowflake.ml.registry.model_registry)


# NOTE(review): `test_labels` and `prediction` are not assigned in any cell of this
# notebook (the data-preparation cell creates `test_label`, and only for the
# sklearn preprocessing backend) — define them before running this cell.
test_accuracy = metrics.accuracy_score(test_labels, prediction)
print("Model test accuracy:", test_accuracy)

# Simple scalar metrics.
registry.set_metric(id=model_id, name="test_accuracy", value=test_accuracy)
# OR in object model
# NOTE(review): `num_training_examples` is not assigned anywhere in this notebook — confirm its source.
model.set_metric(name="num_training_examples", value=num_training_examples)

# Hierarchical metric.
registry.set_metric(id=model_id, name="dataset_test", value={"accuracy": test_accuracy})

# Multivalent metric:
test_confusion_matrix = metrics.confusion_matrix(test_labels, prediction)
print("Confusion matrix:", test_confusion_matrix)

registry.set_metric(id=model_id, name="confusion_matrix", value=test_confusion_matrix)

## List Model in Registry

In [None]:
# Fetch the registry listing and show the row whose ID matches the current model_id.
model_list = registry.list_models()
print(model_id)
is_current_model = model_list["ID"] == model_id
model_list.filter(is_current_model).select("NAME","TAGS","METRICS").show()

## Metadata: Tags and Name

### Relational API

In [None]:
# Tag lifecycle through the registry: add, remove, then update a tag.
print("Old tags:", registry.get_tags(model_id))

registry.set_tag(model_id, "minor_version", "23")
print("Added tag:", registry.get_tags(model_id))

registry.remove_tag(model_id, "minor_version")
print("Removed tag", registry.get_tags(model_id))
registry.set_tag(model_id, "stage", "production")
print("Updated tag:", registry.get_tags(model_id))

# Rename Model
old_model_name = registry.get_model_name(model_id)
print("Old name:", old_model_name)

# The new name is derived from the current one, so this cell needs no variable
# beyond the registry and model_id (re-running it appends the suffix again).
new_model_name = f"{old_model_name}_renamed"
registry.set_model_name(id=model_id, name=new_model_name)

print("New name:", registry.get_model_name(model_id))

### Object API

In [None]:
# Tag lifecycle through the ModelReference object: add, remove, then update a tag.
print("Old tags:", model.get_tags())

model.set_tag("minor_version", "23")
print("Added tag:", model.get_tags())

model.remove_tag("minor_version")
print("Removed tag", model.get_tags())
model.set_tag("stage", "production")
print("Updated tag:", model.get_tags())

# Rename Model
old_model_name = model.get_model_name()
print("Old name:", old_model_name)

# The new name is derived from the current one, so this cell needs no variable
# beyond `model` (re-running it appends the suffix again).
new_model_name = f"{old_model_name}_renamed"
model.set_model_name(name=new_model_name)

print("New name:", model.get_model_name())

## List recent Models in Registry

In [None]:
# Registry listing ordered by creation time, newest first.
recent_models = model_list.select("ID","NAME","CREATION_TIME","TAGS").order_by("CREATION_TIME", ascending=False)
recent_models.show()

## List all versions of a Model ordered by test set accuracy

In [None]:
# Every model named "uci-bank-marketing", ordered by its test_accuracy metric, highest first.
versions_by_accuracy = (
    model_list
    .select("ID","NAME","TAGS","METRICS")
    .filter(Column("NAME") == "uci-bank-marketing")
    .order_by(Column("METRICS")["test_accuracy"], ascending=False)
)
versions_by_accuracy.show()

## Examine Model History

### Relational API

In [None]:
# Columns shown from the model's history.
history_columns = ("EVENT_TIMESTAMP", "ROLE", "ATTRIBUTE_NAME", "VALUE[ATTRIBUTE_NAME]")
registry.get_model_history(id=model_id).select(*history_columns).show()

### Object API

In [None]:
# Same history view, obtained through the ModelReference object.
history_columns = ("EVENT_TIMESTAMP", "ROLE", "ATTRIBUTE_NAME", "VALUE[ATTRIBUTE_NAME]")
model.get_model_history().select(*history_columns).show()

## Load Model

### Relational API

In [None]:
# Reload through the name bound by `from snowflake.ml.registry import model_registry`
# earlier in the notebook; the dotted path `snowflake.ml.model_registry` does not
# match that import.
importlib.reload(model_registry)
registry = model_registry.ModelRegistry(session=session)

restored_clf = registry.load_model(id=model_id)

# NOTE(review): `test_features` is only created when PREPROCESSING_BACKEND is
# BACKEND_SKLEARN, and `prediction` is not assigned in any cell of this notebook —
# confirm both exist before running.
restored_prediction = restored_clf.predict(test_features)

print("Original prediction:", prediction[:10])
print("Restored prediction:", restored_prediction[:10])

### Object API

In [None]:
# Reload through the name bound by `from snowflake.ml.registry import model_registry`
# earlier in the notebook; the dotted path `snowflake.ml.model_registry` does not
# match that import.
importlib.reload(model_registry)
registry = model_registry.ModelRegistry(session=session)
model = model_registry.ModelReference(registry=registry, id=model_id)
restored_clf = model.load_model()

# NOTE(review): `test_features` is only created when PREPROCESSING_BACKEND is
# BACKEND_SKLEARN, and `prediction` is not assigned in any cell of this notebook —
# confirm both exist before running.
restored_prediction = restored_clf.predict(test_features)

print("Original prediction:", prediction[:10])
print("Restored prediction:", restored_prediction[:10])

## Close Session

In [None]:
# session.close()