23 changes: 21 additions & 2 deletions CHANGELOG.md
@@ -1,13 +1,32 @@
# Release History

## 1.0.9
## 1.0.10

### Behavior Changes

- Model Development: log_loss metric calculation is now distributed.
- Model Development: precision_score, recall_score, f1_score, fbeta_score, precision_recall_fscore_support,
mean_absolute_error, mean_squared_error, and mean_absolute_percentage_error metric calculations are now distributed.
- Model Registry: `deploy` now returns a `Deployment` object containing deployment information.
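
A minimal, hedged sketch of calling one of the now-distributed metrics — an existing Snowpark `session`, the `PREDICTIONS` table, and the column names are illustrative assumptions:

```python
# Compute a distributed metric over a Snowpark DataFrame.
# `session` and the table/column names are assumptions for illustration.
from snowflake.ml.modeling.metrics import mean_squared_error

df = session.table("PREDICTIONS")
mse = mean_squared_error(
    df=df,
    y_true_col_names="LABEL",
    y_pred_col_names="PREDICTION",
)
print(mse)
```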

### New Features

- Model Registry: When the model signature is auto-inferred, it is printed to the log for reference.
- Model Registry: For SPCS deployments, `Deployment` details now contain `image_name`, `service_spec` and `service_function_sql`.
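
A hedged sketch of reading those `Deployment` details after an SPCS deployment — the `registry` handle, the `deploy` arguments, and dictionary-style access are assumptions; only the field names come from the entry above:

```python
# Inspect the Deployment returned by an SPCS deploy.
# `registry` and the deploy() arguments are illustrative assumptions.
deployment = registry.deploy(
    name="MY_MODEL",
    deployment_name="my_service",
)
details = deployment.details  # field names per the changelog entry above
print(details["image_name"])
print(details["service_spec"])
print(details["service_function_sql"])
```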

### Bug Fixes

- Model Development: Fix an issue that led to UTF-8 decoding errors when using modeling modules on Windows.
- Model Development: Fix an issue where alias definitions caused `SnowparkSQLUnexpectedAliasException` during inference.
- Model Registry: Fix an issue where signature inference could be incorrect when using a Snowpark DataFrame as sample input.
- Model Registry: Relax overly strict data type validation when predicting. For example, if the signature declares an INT8
  feature and an INT64 dataframe is provided, prediction no longer fails as long as all values fit within INT8 range.
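
A small sketch of the relaxed validation — the `model` reference and column name are hypothetical:

```python
# int64 input against an INT8 signature now passes when every value fits in int8.
import pandas as pd

input_df = pd.DataFrame({"FEATURE": pd.array([1, 2, 3], dtype="int64")})
predictions = model.predict(input_df)  # previously raised a type-validation error
```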

## 1.0.9 (2023-09-28)

### Behavior Changes

- Model Development: log_loss metric calculation is now distributed.

### Bug Fixes

- Model Registry: Fix an issue where image building fails with specific Docker setups.
1 change: 0 additions & 1 deletion bazel/environments/conda-env-build.yml
@@ -13,7 +13,6 @@ dependencies:
- lightgbm==3.3.5
- numpy==1.24.3
- packaging==23.0
- pytimeparse==1.1.8
- ruamel.yaml==0.17.21
- scikit-learn==1.3.0
- sphinx==5.0.2
5 changes: 3 additions & 2 deletions bazel/environments/conda-env-snowflake.yml
@@ -30,7 +30,7 @@ dependencies:
- packaging==23.0
- pandas==1.5.3
- protobuf==3.20.3
- pytest==7.1.2
- pytest==7.4.0
- pytimeparse==1.1.8
- pytorch==2.0.1
- pyyaml==6.0
@@ -46,8 +46,9 @@ dependencies:
- sphinx==5.0.2
- sqlparse==0.4.4
- tensorflow==2.10.0
- tokenizers==0.13.2
- torchdata==0.6.1
- transformers==4.29.2
- transformers==4.32.1
- types-protobuf==4.23.0.1
- types-requests==2.30.0.0
- typing-extensions==4.5.0
7 changes: 4 additions & 3 deletions bazel/environments/conda-env.yml
@@ -13,7 +13,7 @@ dependencies:
- cachetools==4.2.2
- cloudpickle==2.0.0
- conda-forge::accelerate==0.22.0
- conda-forge::mypy==1.4.1
- conda-forge::mypy==1.5.1
- conda-forge::starlette==0.27.0
- conda-forge::types-PyYAML==6.0.12
- conda-forge::types-cachetools==4.2.2
@@ -35,7 +35,7 @@ dependencies:
- packaging==23.0
- pandas==1.5.3
- protobuf==3.20.3
- pytest==7.1.2
- pytest==7.4.0
- pytimeparse==1.1.8
- pytorch==2.0.1
- pyyaml==6.0
@@ -51,8 +51,9 @@ dependencies:
- sphinx==5.0.2
- sqlparse==0.4.4
- tensorflow==2.10.0
- tokenizers==0.13.2
- torchdata==0.6.1
- transformers==4.29.2
- transformers==4.32.1
- types-protobuf==4.23.0.1
- types-requests==2.30.0.0
- typing-extensions==4.5.0
3 changes: 3 additions & 0 deletions bazel/requirements/parse_and_generate_requirements.py
@@ -361,6 +361,8 @@ def generate_requirements(
)
)
sys.stdout.writelines(results)
elif (mode, format) == ("dev_version", "python"):
sys.stdout.writelines(f"REQUIREMENTS = {repr(snowflake_only_env)}\n")
elif (mode, format) == ("version_requirements", "bzl"):
extras_requirements = list(filter(lambda req_info: filter_by_extras(req_info, True, False), requirements))
extras_results: MutableMapping[str, Sequence[str]] = {}
@@ -479,6 +481,7 @@ def main() -> None:
VALID_SETTINGS = [
("validate", None, False), # Validate the environment
("dev_version", "text", False), # requirements.txt
("dev_version", "python", True), # sproc test dependencies list
("version_requirements", "bzl", False), # wheel rule requirements
("version_requirements", "python", False), # model deployment core dependencies list
("dev_version", "conda_env", False), # dev conda-env.yml file
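
For reference, a hypothetical rendering of what the new `("dev_version", "python")` mode emits — the snowflake-only dev requirements written out as a Python list for the sproc test dependencies (the pins shown are illustrative, not the real list):

```python
# Hypothetical output of the ("dev_version", "python") mode.
REQUIREMENTS = ['absl-py==1.3.0', 'numpy==1.24.3', 'pyyaml==6.0']
```
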
111 changes: 85 additions & 26 deletions ci/build_and_run_tests.sh
@@ -38,6 +38,7 @@ WITH_SNOWPARK=false
MODE="continuous_run"
SNOWML_DIR="snowml"
SNOWPARK_DIR="snowpark-python"
IS_NT=false

while (($#)); do
case $1 in
@@ -74,26 +75,70 @@ while (($#)); do
shift
done

EXT=""
BAZEL_ADDITIONAL_BUILD_FLAGS=()
BAZEL_ADDITIONAL_STARTUP_FLAGS=()

# Computing artifact location
# Detect the platform and update platform-specific bazel settings
case "$(uname)" in
Linux)
PLATFORM="linux" ;;
Darwin)
PLATFORM="darwin" ;;
*NT*)
PLATFORM="windows"
IS_NT=true ;;
esac

# Detect the architecture
ARCH="$(uname -m)"
case "$ARCH" in
aarch64|ppc64le|arm64)
ARCH="arm64" ;;
*)
ARCH="amd64" ;;
esac

# Compute the platform-arch string used to download yq.
case "${PLATFORM}_${ARCH}" in
linux_arm64|linux_amd64|darwin_arm64|darwin_amd64|windows_amd64)
;; # pass
*)
echo "Platform / Architecture is not supported by yq." >&2
exit 1
;;
esac

# Check that Python 3.8 exists
# TODO(SNOW-845592): ideally we should download Python 3.8 from conda if it does not exist. Currently we just fail.
set +eu
source /opt/rh/rh-python38/enable
PYTHON38_EXIST=$?
if [ $PYTHON38_EXIST -ne 0 ]; then
echo "Failed to execute tests: Python3.8 is not installed."
rm -rf "${TEMP_TEST_DIR}"
exit ${PYTHON38_EXIST}
if [ "${ENV}" = "pip" ]; then
set +eu
source /opt/rh/rh-python38/enable
PYTHON38_EXIST=$?
if [ $PYTHON38_EXIST -ne 0 ]; then
echo "Failed to execute tests: Python3.8 is not installed."
rm -rf "${TEMP_TEST_DIR}"
exit ${PYTHON38_EXIST}
fi
set -eu
fi

if [ ${IS_NT} = true ]; then
EXT=".exe"
BAZEL_ADDITIONAL_BUILD_FLAGS+=(--nobuild_python_zip)
BAZEL_ADDITIONAL_BUILD_FLAGS+=(--enable_runfiles)
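# Keep Bazel's output root short to avoid long-path issues on Windows.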
BAZEL_ADDITIONAL_STARTUP_FLAGS+=(--output_user_root=D:/broot)
fi
set -eu

cd "${WORKSPACE}"

# Check for yq and download it if not present.
_YQ_BIN="yq"
_YQ_BIN="yq${EXT}"
if ! command -v "${_YQ_BIN}" &>/dev/null; then
TEMP_BIN=$(mktemp -d "${WORKSPACE}/tmp_bin_XXXXX")
curl -Ls https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o "${TEMP_BIN}/yq" && chmod +x "${TEMP_BIN}/yq"
_YQ_BIN="${TEMP_BIN}/yq"
curl -Lsv https://github.com/mikefarah/yq/releases/latest/download/yq_${PLATFORM}_${ARCH}${EXT} -o "${TEMP_BIN}/yq${EXT}" && chmod +x "${TEMP_BIN}/yq${EXT}"
_YQ_BIN="${TEMP_BIN}/yq${EXT}"
fi

# Create temp release folder
@@ -109,23 +154,39 @@ echo "Extracted Package Version from code: ${VERSION}"
OPTIONAL_REQUIREMENTS=()
while IFS='' read -r line; do OPTIONAL_REQUIREMENTS+=("$line"); done < <("${_YQ_BIN}" '.requirements.run_constrained.[] | ... style=""' ci/conda_recipe/meta.yaml)

# Generate and copy auto-gen tests.
if [[ ${MODE} = "release" ]]; then
"${BAZEL}" build //tests/... --build_tag_filters=autogen_build
cp -r "$("${BAZEL}" info bazel-bin)/tests" "${TEMP_TEST_DIR}"
fi

# Compare test required dependencies with wheel pkg dependencies and exclude tests if necessary
EXCLUDE_TESTS=$(mktemp "${TEMP_TEST_DIR}/exclude_tests_XXXXX")
if [[ ${MODE} = "continuous_run" || ${MODE} = "release" ]]; then
./ci/get_excluded_tests.sh -f "${EXCLUDE_TESTS}" -m unused -b "${BAZEL}"
elif [[ ${MODE} = "merge_gate" ]]; then
./ci/get_excluded_tests.sh -f "${EXCLUDE_TESTS}" -m all -b "${BAZEL}"
fi

# Generate and copy auto-gen tests.
if [[ ${MODE} = "release" ]]; then
# For a release, build all autogen tests
"${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" build "${BAZEL_ADDITIONAL_BUILD_FLAGS[@]+"${BAZEL_ADDITIONAL_BUILD_FLAGS[@]}"}" //tests/integ/...
else
# In other cases, build only the required utilities.
"${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" build --build_tag_filters=-autogen_build,-autogen "${BAZEL_ADDITIONAL_BUILD_FLAGS[@]+"${BAZEL_ADDITIONAL_BUILD_FLAGS[@]}"}" //tests/integ/...
fi

# Rsync does not work well with paths that contain a drive letter on Windows;
# thus, these two rsync calls must use relative paths instead of absolute ones.

rsync -av --exclude '*.runfiles_manifest' --exclude '*.runfiles/**' "bazel-bin/tests" .

# Copy tests into temp directory
pushd "${TEMP_TEST_DIR}"
rsync -av --exclude-from "${EXCLUDE_TESTS}" "${WORKSPACE}/${SNOWML_DIR}/tests" .
rsync -av --exclude-from "${EXCLUDE_TESTS}" "../${SNOWML_DIR}/tests" .
popd

# Bazel on Windows consumes a lot of memory; clean it up before proceeding to avoid OOM.
if [ ${IS_NT} = true ]; then
"${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" clean --expunge
"${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" shutdown
fi

popd

# Build snowml package
@@ -149,12 +210,10 @@ if [ "${ENV}" = "pip" ]; then

# Build SnowML
pushd ${SNOWML_DIR}
"${BAZEL}" build //snowflake/ml:wheel
"${BAZEL}" "${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]+"${BAZEL_ADDITIONAL_STARTUP_FLAGS[@]}"}" build "${BAZEL_ADDITIONAL_BUILD_FLAGS[@]+"${BAZEL_ADDITIONAL_BUILD_FLAGS[@]}"}" //snowflake/ml:wheel
cp "$(${BAZEL} info bazel-bin)/snowflake/ml/snowflake_ml_python-${VERSION}-py3-none-any.whl" "${WORKSPACE}"
popd
else
which conda

# Clean conda cache
conda clean --all --force-pkgs-dirs -y

@@ -183,7 +242,7 @@ pushd "${TEMP_TEST_DIR}"
COMMON_PYTEST_FLAG=()
COMMON_PYTEST_FLAG+=(--strict-markers) # Enforce strict pytest markers to avoid typos in markers
COMMON_PYTEST_FLAG+=(--import-mode=append)
COMMON_PYTEST_FLAG+=(-n 10)
COMMON_PYTEST_FLAG+=(-n logical)
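# "-n logical" = one pytest-xdist worker per logical CPU; requires psutil (installed below).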

if [ "${ENV}" = "pip" ]; then
# Copy wheel package
@@ -196,10 +255,10 @@ if [ "${ENV}" = "pip" ]; then
# otherwise it will fail in dependency resolution.
python3.8 -m pip install --upgrade pip
python3.8 -m pip list
python3.8 -m pip install "snowflake_ml_python-${VERSION}-py3-none-any.whl[all]" pytest-xdist inflection --no-cache-dir --force-reinstall
python3.8 -m pip install "snowflake_ml_python-${VERSION}-py3-none-any.whl[all]" pytest-xdist[psutil] -r "${WORKSPACE}/${SNOWML_DIR}/requirements.txt" --no-cache-dir --force-reinstall
if [ "${WITH_SNOWPARK}" = true ]; then
cp "$(find "${WORKSPACE}" -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" "${TEMP_TEST_DIR}"
python3.8 -m pip install "$(find . -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" --force-reinstall
python3.8 -m pip install "$(find . -maxdepth 1 -iname 'snowflake_snowpark_python-*.whl')" --no-deps --force-reinstall
fi
python3.8 -m pip list

@@ -216,12 +275,12 @@ else
conda clean --all --force-pkgs-dirs -y

# Create testing env
conda create -y -p testenv -c "file://${WORKSPACE}/conda-bld" -c "https://repo.anaconda.com/pkgs/snowflake/" --override-channels "python=3.8" snowflake-ml-python pytest-xdist inflection "${OPTIONAL_REQUIREMENTS[@]}"
conda create -y -p testenv -c "${WORKSPACE}/conda-bld" -c "https://repo.anaconda.com/pkgs/snowflake/" --override-channels "python=3.8" snowflake-ml-python pytest-xdist psutil inflection "${OPTIONAL_REQUIREMENTS[@]}"
conda list -p testenv

# Run integration tests
set +e
TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python3.8 -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/integ/
TEST_SRCDIR="${TEMP_TEST_DIR}" conda run -p testenv --no-capture-output python -m pytest "${COMMON_PYTEST_FLAG[@]}" tests/integ/
TEST_RETCODE=$?
set -e

6 changes: 4 additions & 2 deletions ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
noarch: python
package:
name: snowflake-ml-python
version: 1.0.9
version: 1.0.10
requirements:
build:
- python
@@ -27,11 +27,12 @@ requirements:
- aiohttp!=4.0.0a0, !=4.0.0a1
- anyio>=3.5.0,<4
- cachetools>=3.1.1,<5
- cloudpickle
- cloudpickle>=2.0.0
- fsspec>=2022.11,<2024
- numpy>=1.23,<2
- packaging>=20.9,<24
- pandas>=1.0.0,<2
- pytimeparse>=1.1.8,<2
- pyyaml>=6.0,<7
- requests
- s3fs>=2022.11,<2024
@@ -49,6 +50,7 @@ requirements:
- sentencepiece>=0.1.95,<0.2
- shap==0.42.1
- tensorflow>=2.9,<3
- tokenizers>=0.10,<1
- torchdata>=0.4,<1
- transformers>=4.29.2,<5
source:
2 changes: 1 addition & 1 deletion ci/get_excluded_tests.sh
@@ -87,7 +87,7 @@ if [[ $mode = "unaffected" || $mode = "all" ]]; then
# -- Begin of Query Rules Heredoc --
cat >"${unaffected_test_rule_file}" <<EndOfMessage
let unaffected_targets = //tests/... - rdeps(//tests/..., set($(<"${affected_targets_file}"))) in
kind('source file', labels(srcs, set($(<ci/skip_merge_gate_targets)) + kind('py_test rule', \$unaffected_targets)))
kind('source file', labels(srcs, set($(<ci/skip_merge_gate_targets)) + kind('py_test rule', \$unaffected_targets)) - labels(srcs, rdeps(//tests/..., set($(<"${affected_targets_file}")))))
EndOfMessage
# -- End of Query Rules Heredoc --
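# The new subtraction keeps source files that also feed affected targets, so shared test sources are not excluded.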

1 change: 1 addition & 0 deletions codegen/codegen_rules.bzl
@@ -91,6 +91,7 @@ def autogen_estimators(module, estimator_info_list):
"//snowflake/ml/_internal/utils:identifier",
"//snowflake/ml/model:model_signature",
"//snowflake/ml/model/_signatures:utils",
"//snowflake/ml/modeling/_internal:estimator_utils",
],
)

8 changes: 4 additions & 4 deletions codegen/sklearn_wrapper_autogen.py
@@ -113,7 +113,7 @@ def _generate_src_files(
List of generated files.
"""

template = open(self.template_path).read()
template = open(self.template_path, encoding="utf-8").read()

generated_files_list = []
for generator in generators:
@@ -130,7 +130,7 @@
# Create the output src dir if it doesn't exist already.
os.makedirs("/".join(output_file_name.split("/")[:-1]), exist_ok=True)

open(output_file_name, "w").write(wrapped_transform_string)
open(output_file_name, "w", encoding="utf-8").write(wrapped_transform_string)
logging.info("Wrote file %s", output_file_name)

return generated_files_list
@@ -149,7 +149,7 @@ def _generate_test_files(
Returns:
List of generated files.
"""
test_template = open(self.template_path).read()
test_template = open(self.template_path, encoding="utf-8").read()

generated_files_list = []
for generator in generators:
@@ -166,7 +166,7 @@
# Create the output test dir if it doesn't exist already.
os.makedirs("/".join(test_output_file_name.split("/")[:-1]), exist_ok=True)

open(test_output_file_name, "w").write(wrapped_transform_string)
open(test_output_file_name, "w", encoding="utf-8").write(wrapped_transform_string)
logging.info("Wrote file %s", test_output_file_name)

return generated_files_list
4 changes: 2 additions & 2 deletions codegen/sklearn_wrapper_generator.py
@@ -722,9 +722,9 @@ def _populate_function_names_and_signatures(self) -> None:
for arg_to_transform in args_to_transform:
if arg_to_transform in self.original_init_signature.parameters.keys():
arg_transform_calls.append(
f"{arg_to_transform} = _transform_snowml_obj_to_sklearn_obj({arg_to_transform})"
f"{arg_to_transform} = transform_snowml_obj_to_sklearn_obj({arg_to_transform})"
)
deps_gathering_calls.append(f"deps = deps | _gather_dependencies({arg_to_transform})")
deps_gathering_calls.append(f"deps = deps | gather_dependencies({arg_to_transform})")

self.estimator_init_signature = ",\n ".join(signature_lines) + ","
self.sklearn_init_arguments = ",\n ".join(sklearn_init_lines) + ","