23 changes: 23 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,28 @@
# Release History

## 1.0.2 (2023-06-22)

### Behavior Changes
- Model Registry: Prohibit non-snowflake-native models from being logged.
- Model Registry: The `_use_local_snowml` parameter has been removed from the options of `deploy()`.
- Model Registry: An `embed_local_ml_library` parameter (default `False`) has been added to the options of `log_model()`. When set to `False` (the default), the version of the local snowflake-ml-python library is recorded and used when deploying the model. When set to `True`, the local snowflake-ml-python library is embedded into the logged model and used when you load or deploy the model.
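
A minimal, hedged sketch of the new option (the registry construction, session setup, and surrounding argument values are illustrative assumptions; only `embed_local_ml_library` and the `options` dictionary come from this release):

```python
# Illustrative sketch: embed the local snowflake-ml-python library into the logged model.
from snowflake.ml.registry import model_registry
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session

session = Session.builder.configs(SnowflakeLoginOptions()).create()
registry = model_registry.ModelRegistry(session=session, database_name="ML_DB", schema_name="REGISTRY")
registry.log_model(
    model_name="my_model",
    model_version="1.0.0",
    model=trained_model,  # a fitted Snowflake-native model, assumed to exist
    options={"embed_local_ml_library": True},  # default False: record the library version instead of embedding it
)
```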

### New Features
- Model Registry: A new optional `code_paths` argument has been added to `log_model()` so users can specify additional code paths to be imported when loading and deploying the model.
- Model Registry: A new optional `options` argument has been added to `log_model()` to specify any additional options when saving the model.
- Model Development: Added metrics:
  - d2_absolute_error_score
  - d2_pinball_score
  - explained_variance_score
  - mean_absolute_error
  - mean_absolute_percentage_error
  - mean_squared_error
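
A minimal sketch of two of the new metrics on a Snowpark DataFrame (the module path and keyword-argument names are assumptions based on the library's metrics API; `df`, `"LABEL"`, and `"PREDICTION"` are placeholders):

```python
# Illustrative only: compute two of the newly added regression metrics.
from snowflake.ml.modeling.metrics import d2_absolute_error_score, mean_absolute_error

# df is assumed to be a Snowpark DataFrame containing ground-truth and prediction columns.
mae = mean_absolute_error(df=df, y_true_col_names="LABEL", y_pred_col_names="PREDICTION")
d2 = d2_absolute_error_score(df=df, y_true_col_names="LABEL", y_pred_col_names="PREDICTION")
print(mae, d2)
```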

### Bug Fixes

- Model Development: `accuracy_score()` now works when the given label column names are lists containing a single value.


## 1.0.1 (2023-06-16)
### Behavior Changes

203 changes: 203 additions & 0 deletions ci/build_and_run_tests.sh
@@ -0,0 +1,203 @@
#!/bin/bash

# Usage
# build_and_run_tests.sh <workspace> [--env pip|conda] [--with-snowpark]
#
# Args
# workspace: path to the workspace; the SnowML code should be in the snowml directory.
#
# Optional Args
# env: set the test environment; choose from pip or conda.
# with-snowpark: build and test with Snowpark from the snowpark-python directory in the workspace.
#
# Action
# - Copy the integration tests from the workspace folder and execute them in the testing Python env using pytest.
# - This mimics the behavior of using the snowml wheel package in user land.
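#
# Example (illustrative workspace path):
#   ./ci/build_and_run_tests.sh /path/to/workspace --env conda --with-snowpark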

set -o pipefail
set -eu

PROG=$0

help()
{
    exit_code=$1
    echo "Invalid usage, must provide argument for workspace"
    echo "Usage: ${PROG} <workspace> [--env pip|conda] [--with-snowpark]"
    exit ${exit_code}
}

WORKSPACE=$1 && shift || help 1
ENV="pip"
WITH_SNOWPARK=false
SNOWML_DIR="snowml"
SNOWPARK_DIR="snowpark-python"

while (($#)); do
    case $1 in
    -e|--env)
        shift
        if [[ $1 = "pip" || $1 = "conda" ]]; then
            ENV=$1
        else
            help 1
        fi
        ;;
    --with-snowpark)
        WITH_SNOWPARK=true
        ;;
    -h|--help)
        help 0
        ;;
    *)
        help 1
        ;;
    esac
    shift
done

# Check that Python 3.8 exists
# TODO(SNOW-845592): ideally we should download py3.8 from conda if it does not exist. Currently we just fail.
set +eu
source /opt/rh/rh-python38/enable
PYTHON38_EXIST=$?
if [ $PYTHON38_EXIST -ne 0 ]; then
    echo "Failed to execute tests: Python3.8 is not installed."
    rm -rf "${TEMP_TEST_DIR}"
    exit ${PYTHON38_EXIST}
fi
set -eu

cd "${WORKSPACE}"

# Create temp directory for running the tests
TEMP_TEST_DIR=$(mktemp -d "${WORKSPACE}/tmp_XXXXX")

pushd ${SNOWML_DIR}
# Get the version from snowflake/ml/version.bzl
VERSION=$(grep -oE "VERSION = \"[0-9]+\\.[0-9]+\\.[0-9]+.*\"" snowflake/ml/version.bzl | cut -d'"' -f2)
echo "Extracted Package Version from code: ${VERSION}"

# Get optional requirements from snowflake/ml/requirements.bzl
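# The .bzl file is valid Python for this purpose: exec it and print the 'all' extras as space-separated, double-quoted requirement strings.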
OPTIONAL_REQUIREMENTS=$(cat snowflake/ml/requirements.bzl | python3 -c "import sys; exec(sys.stdin.read()); print(' '.join(map(lambda x: '\"'+x+'\"', EXTRA_REQUIREMENTS['all'])))")

# Compare test-required dependencies with the wheel package dependencies and exclude tests if necessary
EXCLUDE_TESTS=$(mktemp "${TEMP_TEST_DIR}/exclude_tests_XXXXX")
./ci/get_excluded_tests.sh -f "${EXCLUDE_TESTS}"
# Copy tests into temp directory
pushd "${TEMP_TEST_DIR}"
rsync -av --exclude-from "${EXCLUDE_TESTS}" "${WORKSPACE}/${SNOWML_DIR}/tests" .
ls tests/integ/snowflake/ml
popd
popd

# Build snowml package
if [ ${ENV} = "pip" ]; then
    # Clean build workspace
    rm -f ${WORKSPACE}/*.whl

    # Build Snowpark
    if [ "${WITH_SNOWPARK}" = true ]; then
        pushd ${SNOWPARK_DIR}
        rm -rf venv
        python3.8 -m venv venv
        source venv/bin/activate
        python3.8 -m pip install -U pip setuptools wheel
        echo "Building snowpark wheel from main:$(git rev-parse HEAD)."
        pip wheel . --no-deps
        cp snowflake_snowpark_python-*.whl ${WORKSPACE}
        deactivate
        popd
    fi

    # Build SnowML
    pushd ${SNOWML_DIR}
    bazel build //snowflake/ml:wheel
    cp bazel-bin/snowflake/ml/snowflake_ml_python-*.whl ${WORKSPACE}
    popd
else
    which conda

    # Clean conda build workspace
    rm -rf ${WORKSPACE}/conda-bld

    # Build Snowpark
    if [ "${WITH_SNOWPARK}" = true ]; then
        pushd ${SNOWPARK_DIR}
        conda build recipe/ --python=3.8 --numpy=1.16 --croot "${WORKSPACE}/conda-bld"
        popd
    fi

    # Build SnowML
    pushd ${SNOWML_DIR}
    # Build conda package
    conda build --channel=conda-forge --prefix-length 50 --croot "${WORKSPACE}/conda-bld" ci/conda_recipe
    conda build purge
    popd
fi

# Start testing
pushd "${TEMP_TEST_DIR}"

# Set up common pytest flags
COMMON_PYTEST_FLAG=()
COMMON_PYTEST_FLAG+=(--strict-markers) # Enforce strict pytest markers to catch typos in marker names
COMMON_PYTEST_FLAG+=(--import-mode=append)
COMMON_PYTEST_FLAG+=(-n 10)
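# Note: --import-mode=append keeps the installed package (wheel or conda) ahead of the copied test
# sources on sys.path, and -n 10 runs the tests on 10 parallel workers via pytest-xdist.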


if [ ${ENV} = "pip" ]; then
    # Copy wheel package
    cp "${WORKSPACE}/snowflake_ml_python-${VERSION}-py3-none-any.whl" "${TEMP_TEST_DIR}"

    # Create testing env
    python3.8 -m venv testenv
    source testenv/bin/activate
    # Install all of the packages in a single command;
    # otherwise dependency resolution will fail.
    python3.8 -m pip install --upgrade pip
    python3.8 -m pip list
    python3.8 -m pip install "snowflake_ml_python-${VERSION}-py3-none-any.whl[all]" pytest-xdist inflection --no-cache-dir --force-reinstall
    if [ "${WITH_SNOWPARK}" = true ]; then
        cp ${WORKSPACE}/snowflake_snowpark_python-*.whl "${TEMP_TEST_DIR}"
        python3.8 -m pip install $(find . -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl') --force-reinstall
    fi
    python3.8 -m pip list

    # Set up pip-specific pytest flags
    PIP_PYTEST_FLAG=()
    PIP_PYTEST_FLAG+=(-m "not pip_incompatible") # Filter out pip-incompatible tests.

    # Run the tests
    set +e
    TEST_SRCDIR="${TEMP_TEST_DIR}" python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" "${PIP_PYTEST_FLAG[@]}" tests/
    TEST_RETCODE=$?
    set -e
else
    # Create local conda channel
    conda index ${WORKSPACE}/conda-bld

    # Clean conda cache
    conda clean --all --force-pkgs-dirs -y

    # Create testing env
    conda create -y -p testenv -c "file://${WORKSPACE}/conda-bld" -c "https://repo.anaconda.com/pkgs/snowflake/" --override-channels "python=3.8" snowflake-ml-python pytest-xdist inflection ${OPTIONAL_REQUIREMENTS}
    conda list -p testenv

    # Run the tests
    set +e
    TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/
    TEST_RETCODE=$?
    set -e

    # Clean the conda environment
    conda env remove -p testenv
fi

popd

# clean up temp dir
rm -rf "${TEMP_TEST_DIR}"

echo "Done running ${PROG}"
exit ${TEST_RETCODE}
2 changes: 1 addition & 1 deletion ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
  noarch: python
package:
  name: snowflake-ml-python
  version: 1.0.1
  version: 1.0.2
requirements:
  build:
    - python
85 changes: 0 additions & 85 deletions ci/copy_and_run_tests.sh

This file was deleted.

45 changes: 28 additions & 17 deletions codegen/sklearn_wrapper_template.py_template
@@ -548,26 +548,37 @@ class {transform.original_class_name}(BaseTransformer):
# input cols need to match unquoted / quoted
input_cols = self.input_cols
unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

estimator = self._sklearn_object

input_df = dataset[input_cols] # Select input columns with quoted column names.
if hasattr(estimator, "feature_names_in_"):
    missing_features = []
    for i, f in enumerate(getattr(estimator, "feature_names_in_")):
        if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
            missing_features.append(f)

    if len(missing_features) > 0:
        raise ValueError(
            "The feature names should match with those that were passed during fit.\n"
            f"Features seen during fit call but not present in the input: {{missing_features}}\n"
            f"Features in the input dataframe : {{input_cols}}\n"
        )
    input_df.columns = getattr(estimator, "feature_names_in_")
else:
    # Just rename the column names to unquoted identifiers.
    input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
missing_features = []
features_in_dataset = set(dataset.columns)
columns_to_select = []
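# For each feature the estimator expects, accept whichever spelling of the column (as given, unquoted, or quoted) exists in the dataset; anything that cannot be matched is reported as missing.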
for i, f in enumerate(features_required_by_estimator):
    if (
        i >= len(input_cols)
        or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
        or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
            and quoted_input_cols[i] not in features_in_dataset)
    ):
        missing_features.append(f)
    elif input_cols[i] in features_in_dataset:
        columns_to_select.append(input_cols[i])
    elif unquoted_input_cols[i] in features_in_dataset:
        columns_to_select.append(unquoted_input_cols[i])
    else:
        columns_to_select.append(quoted_input_cols[i])

if len(missing_features) > 0:
    raise ValueError(
        "The feature names should match with those that were passed during fit.\n"
        f"Features seen during fit call but not present in the input: {{missing_features}}\n"
        f"Features in the input dataframe : {{input_cols}}\n"
    )
input_df = dataset[columns_to_select]
input_df.columns = features_required_by_estimator

transformed_numpy_array = getattr(estimator, inference_method)(
    input_df
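The column-selection change above exists because Snowflake resolves unquoted identifiers to upper case while quoted identifiers keep their exact case, so an estimator's `feature_names_in_` may match the original, unquoted, or quoted spelling of an input column. A simplified, standalone sketch of that selection rule (plain Python; the helper name and sample columns are hypothetical):

```python
from typing import List, Sequence


def select_matching_columns(
    required: Sequence[str],
    candidates_per_feature: Sequence[Sequence[str]],
    dataset_columns: Sequence[str],
) -> List[str]:
    """For each required feature, pick the first candidate spelling present in the dataset."""
    present = set(dataset_columns)
    selected, missing = [], []
    for feature, candidates in zip(required, candidates_per_feature):
        chosen = next((c for c in candidates if c in present), None)
        if chosen is None:
            missing.append(feature)
        else:
            selected.append(chosen)
    if missing:
        raise ValueError(f"Features seen during fit but not present in the input: {missing}")
    return selected


# An unquoted Snowflake identifier like col1 is stored as COL1; a quoted "col1" keeps its case.
print(select_matching_columns(
    required=["col1", "col2"],
    candidates_per_feature=[["COL1", "col1", '"col1"'], ["COL2", "col2", '"col2"']],
    dataset_columns=["col1", "COL2"],
))  # -> ['col1', 'COL2']
```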
3 changes: 3 additions & 0 deletions codegen/transformer_autogen_test_template.py_template
@@ -7,13 +7,16 @@ import numpy as np
import pandas as pd
import json
import random
import pytest

from typing import Optional, Any
from absl.testing.absltest import TestCase, main
{transform.test_estimator_imports}
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session, DataFrame


@pytest.mark.pip_incompatible
class {transform.test_class_name}(TestCase):
    def setUp(self):
        """Creates Snowpark and Snowflake environments for testing."""