From c94742078ebc15bfc5bd631645dd9a1755cd241a Mon Sep 17 00:00:00 2001
From: Snowflake Authors
Date: Thu, 1 Jun 2023 18:31:30 -0700
Subject: [PATCH] Project import generated by Copybara.

GitOrigin-RevId: 288c0c4da10ce230b81b6eb80316011cbb76252b
---
 ci/conda_recipe/meta.yaml | 2 +-
 ci/type_ignored_targets | 32 +-
 codegen/codegen_rules.bzl | 3 +-
 codegen/sklearn_wrapper_generator.py | 4 +-
 codegen/sklearn_wrapper_template.py_template | 91 +-
 conda-env-extended.yml | 1 -
 conda-env-snowflake.yml | 4 +-
 conda-env.yml | 1 +
 snowflake/ml/BUILD.bazel | 58 +-
 snowflake/ml/_internal/BUILD.bazel | 1 -
 snowflake/ml/_internal/file_utils.py | 21 +-
 snowflake/ml/_internal/file_utils_test.py | 59 +-
 snowflake/ml/_internal/init_utils.py | 2 +-
 snowflake/ml/_internal/telemetry.py | 19 +-
 snowflake/ml/_internal/telemetry_test.py | 20 +-
 snowflake/ml/_internal/utils/identifier.py | 68 +-
 .../ml/_internal/utils/identifier_test.py | 58 +-
 .../ml/{modeling => }/lightgbm/BUILD.bazel | 0
 .../lightgbm/estimators_info.bzl | 0
 snowflake/ml/metrics/_utils.py | 19 +-
 snowflake/ml/metrics/correlation.py | 42 +-
 snowflake/ml/metrics/covariance.py | 44 +-
 snowflake/ml/model/BUILD.bazel | 8 +-
 .../model/_deploy_client/docker/BUILD.bazel | 17 -
 .../docker/client_image_builder.py | 29 -
 .../_deploy_client/image_builds/BUILD.bazel | 43 +
 .../base_image_builder.py | 0
 .../image_builds/client_image_builder.py | 88 +
 .../image_builds/client_image_builder_test.py | 64 +
 .../image_builds/docker_context.py | 56 +
 .../image_builds/docker_context_test.py | 33 +
 .../image_builds/templates/app_template | 1 +
 .../templates/dockerfile_template | 1 +
 .../_deploy_client/snowservice/BUILD.bazel | 12 +-
 .../_deploy_client/snowservice/deploy.py | 50 +-
 .../snowservice/deploy_options.py | 80 +
 .../_deploy_client/snowservice/deploy_test.py | 58 +-
 .../ml/model/_deploy_client/utils/BUILD.bazel | 25 +
 .../model/_deploy_client/utils/constants.py | 40 +
 .../utils/snowservice_client.py | 145 +
 .../utils/snowservice_client_test.py | 164 ++
 snowflake/ml/model/_deployer.py | 124 +-
 snowflake/ml/model/_handlers/BUILD.bazel | 2 +-
 snowflake/ml/model/_handlers/snowmlmodel.py | 14 +-
 snowflake/ml/model/_model.py | 369 ++-
 snowflake/ml/model/_model_meta.py | 1 +
 snowflake/ml/model/_model_test.py | 242 +-
 snowflake/ml/model/_udf_util.py | 77 +-
 snowflake/ml/model/model_signature.py | 163 +-
 snowflake/ml/model/model_signature_test.py | 508 +++-
 snowflake/ml/model/type_hints.py | 27 +-
 snowflake/ml/registry/BUILD.bazel | 3 +-
 snowflake/ml/registry/model_registry.py | 248 +-
 snowflake/ml/registry/model_registry_test.py | 18 +-
 .../notebooks/Model Packaging Example.ipynb | 90 +-
 .../Model Packaging SnowML Examples.ipynb | 2325 +++++++++++++++++
 .../calibration/BUILD.bazel | 0
 .../calibration/estimators_info.bzl | 0
 .../{modeling => sklearn}/cluster/BUILD.bazel | 0
 .../cluster/estimators_info.bzl | 0
 .../{modeling => sklearn}/compose/BUILD.bazel | 0
 .../compose/estimators_info.bzl | 0
 .../covariance/BUILD.bazel | 0
 .../covariance/estimators_info.bzl | 0
 .../decomposition/BUILD.bazel | 0
 .../decomposition/estimators_info.bzl | 0
 .../discriminant_analysis/BUILD.bazel | 0
 .../discriminant_analysis/estimators_info.bzl | 0
 .../ensemble/BUILD.bazel | 0
 .../ensemble/estimators_info.bzl | 0
 .../feature_selection/BUILD.bazel | 0
 .../feature_selection/estimators_info.bzl | 0
 .../ml/{ => sklearn}/framework/BUILD.bazel | 0
 .../ml/{ => sklearn}/framework/_utils.py | 0
 snowflake/ml/{ => sklearn}/framework/base.py | 2 +-
 .../ml/{ => sklearn}/framework/pipeline.py | 2 +-
.../gaussian_process/BUILD.bazel | 0 .../gaussian_process/estimators_info.bzl | 0 .../{modeling => sklearn}/impute/BUILD.bazel | 0 .../impute/estimators_info.bzl | 0 .../isotonic/BUILD.bazel | 0 .../isotonic/estimators_info.bzl | 0 .../kernel_approximation/BUILD.bazel | 0 .../kernel_approximation/estimators_info.bzl | 0 .../kernel_ridge/BUILD.bazel | 0 .../kernel_ridge/estimators_info.bzl | 0 .../linear_model/BUILD.bazel | 0 .../linear_model/estimators_info.bzl | 0 .../manifold/BUILD.bazel | 0 .../manifold/estimators_info.bzl | 0 .../{modeling => sklearn}/mixture/BUILD.bazel | 0 .../mixture/estimators_info.bzl | 0 .../model_selection/BUILD.bazel | 0 .../model_selection/estimators_info.bzl | 0 .../multiclass/BUILD.bazel | 0 .../multiclass/estimators_info.bzl | 0 .../multioutput/BUILD.bazel | 0 .../multioutput/estimators_info.bzl | 0 .../naive_bayes/BUILD.bazel | 0 .../naive_bayes/estimators_info.bzl | 0 .../neighbors/BUILD.bazel | 0 .../neighbors/estimators_info.bzl | 0 .../neural_network/BUILD.bazel | 0 .../neural_network/estimators_info.bzl | 0 .../{ => sklearn}/preprocessing/BUILD.bazel | 23 +- .../{ => sklearn}/preprocessing/__init__.py | 0 .../{ => sklearn}/preprocessing/binarizer.py | 2 +- .../preprocessing/k_bins_discretizer.py | 2 +- .../preprocessing/label_encoder.py | 4 +- .../preprocessing/max_abs_scaler.py | 2 +- .../preprocessing/min_max_scaler.py | 2 +- .../{ => sklearn}/preprocessing/normalizer.py | 2 +- .../preprocessing/one_hot_encoder.py | 13 +- .../preprocessing/ordinal_encoder.py | 4 +- .../preprocessing/robust_scaler.py | 2 +- .../preprocessing/simple_imputer.py | 2 +- .../preprocessing/standard_scaler.py | 2 +- .../semi_supervised/BUILD.bazel | 0 .../semi_supervised/estimators_info.bzl | 0 .../ml/{modeling => sklearn}/svm/BUILD.bazel | 0 .../svm/estimators_info.bzl | 0 .../ml/{modeling => sklearn}/tree/BUILD.bazel | 0 .../tree/estimators_info.bzl | 0 snowflake/ml/version.bzl | 2 +- .../ml/{modeling => }/xgboost/BUILD.bazel | 0 .../xgboost/estimators_info.bzl | 0 .../snowflake/ml/_internal/utils/BUILD.bazel | 2 +- .../snowflake/ml/extra_tests/BUILD.bazel | 42 +- .../extra_tests/test_column_name_inference.py | 2 +- .../ml/extra_tests/test_grid_search.py | 4 +- .../test_grid_search_on_pipeline.py | 10 +- .../ml/extra_tests/test_iterative_imputer.py | 4 +- .../test_pipeline_with_ohe_and_xgbr.py | 6 +- .../ml/extra_tests/test_randomized_search.py | 4 +- .../ml/extra_tests/test_voting_regressor.py | 4 +- .../ml/{modeling => }/lightgbm/BUILD.bazel | 4 +- tests/integ/snowflake/ml/metrics/BUILD.bazel | 2 +- .../ml/metrics/test_accuracy_score.py | 2 +- tests/integ/snowflake/ml/model/BUILD.bazel | 4 +- .../snowflake/ml/model/model_integ_test.py | 347 ++- .../calibration/BUILD.bazel | 4 +- .../{modeling => sklearn}/cluster/BUILD.bazel | 4 +- .../{modeling => sklearn}/compose/BUILD.bazel | 4 +- .../covariance/BUILD.bazel | 4 +- .../decomposition/BUILD.bazel | 4 +- .../discriminant_analysis/BUILD.bazel | 4 +- .../ensemble/BUILD.bazel | 4 +- .../feature_selection/BUILD.bazel | 4 +- .../ml/{ => sklearn}/framework/BUILD.bazel | 8 +- .../ml/{ => sklearn}/framework/test_base.py | 8 +- .../{ => sklearn}/framework/test_pipeline.py | 10 +- .../ml/{ => sklearn}/framework/utils.py | 0 .../gaussian_process/BUILD.bazel | 4 +- .../{modeling => sklearn}/impute/BUILD.bazel | 4 +- .../isotonic/BUILD.bazel | 4 +- .../kernel_approximation/BUILD.bazel | 4 +- .../kernel_ridge/BUILD.bazel | 4 +- .../linear_model/BUILD.bazel | 4 +- .../manifold/BUILD.bazel | 4 +- .../{modeling => 
sklearn}/mixture/BUILD.bazel | 4 +- .../model_selection/BUILD.bazel | 4 +- .../multiclass/BUILD.bazel | 4 +- .../multioutput/BUILD.bazel | 4 +- .../naive_bayes/BUILD.bazel | 4 +- .../neighbors/BUILD.bazel | 4 +- .../neural_network/BUILD.bazel | 4 +- .../{ => sklearn}/preprocessing/BUILD.bazel | 67 +- .../preprocessing/test_binarizer.py | 8 +- .../preprocessing/test_drop_input_cols.py | 8 +- .../preprocessing/test_k_bins_discretizer.py | 6 +- .../preprocessing/test_label_encoder.py | 10 +- .../preprocessing/test_max_abs_scaler.py | 10 +- .../preprocessing/test_min_max_scaler.py | 10 +- .../preprocessing/test_normalizer.py | 8 +- .../preprocessing/test_one_hot_encoder.py | 18 +- .../preprocessing/test_ordinal_encoder.py | 10 +- .../preprocessing/test_robust_scaler.py | 10 +- .../preprocessing/test_simple_imputer.py | 10 +- .../preprocessing/test_standard_scaler.py | 10 +- .../semi_supervised/BUILD.bazel | 4 +- .../ml/{modeling => sklearn}/svm/BUILD.bazel | 4 +- .../ml/{modeling => sklearn}/tree/BUILD.bazel | 4 +- .../ml/{modeling => }/xgboost/BUILD.bazel | 4 +- 183 files changed, 5601 insertions(+), 891 deletions(-) rename snowflake/ml/{modeling => }/lightgbm/BUILD.bazel (100%) rename snowflake/ml/{modeling => }/lightgbm/estimators_info.bzl (100%) delete mode 100644 snowflake/ml/model/_deploy_client/docker/BUILD.bazel delete mode 100644 snowflake/ml/model/_deploy_client/docker/client_image_builder.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel rename snowflake/ml/model/_deploy_client/{docker => image_builds}/base_image_builder.py (100%) create mode 100644 snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/docker_context.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py create mode 100644 snowflake/ml/model/_deploy_client/image_builds/templates/app_template create mode 100644 snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template create mode 100644 snowflake/ml/model/_deploy_client/snowservice/deploy_options.py create mode 100644 snowflake/ml/model/_deploy_client/utils/BUILD.bazel create mode 100644 snowflake/ml/model/_deploy_client/utils/constants.py create mode 100644 snowflake/ml/model/_deploy_client/utils/snowservice_client.py create mode 100644 snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py create mode 100644 snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb rename snowflake/ml/{modeling => sklearn}/calibration/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/calibration/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/cluster/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/cluster/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/compose/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/compose/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/covariance/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/covariance/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/decomposition/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/decomposition/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/discriminant_analysis/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/discriminant_analysis/estimators_info.bzl (100%) 
rename snowflake/ml/{modeling => sklearn}/ensemble/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/ensemble/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/feature_selection/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/feature_selection/estimators_info.bzl (100%) rename snowflake/ml/{ => sklearn}/framework/BUILD.bazel (100%) rename snowflake/ml/{ => sklearn}/framework/_utils.py (100%) rename snowflake/ml/{ => sklearn}/framework/base.py (99%) rename snowflake/ml/{ => sklearn}/framework/pipeline.py (99%) rename snowflake/ml/{modeling => sklearn}/gaussian_process/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/gaussian_process/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/impute/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/impute/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/isotonic/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/isotonic/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/kernel_approximation/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/kernel_approximation/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/kernel_ridge/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/kernel_ridge/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/linear_model/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/linear_model/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/manifold/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/manifold/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/mixture/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/mixture/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/model_selection/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/model_selection/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/multiclass/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/multiclass/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/multioutput/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/multioutput/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/naive_bayes/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/naive_bayes/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/neighbors/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/neighbors/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/neural_network/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/neural_network/estimators_info.bzl (100%) rename snowflake/ml/{ => sklearn}/preprocessing/BUILD.bazel (84%) rename snowflake/ml/{ => sklearn}/preprocessing/__init__.py (100%) rename snowflake/ml/{ => sklearn}/preprocessing/binarizer.py (98%) rename snowflake/ml/{ => sklearn}/preprocessing/k_bins_discretizer.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/label_encoder.py (97%) rename snowflake/ml/{ => sklearn}/preprocessing/max_abs_scaler.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/min_max_scaler.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/normalizer.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/one_hot_encoder.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/ordinal_encoder.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/robust_scaler.py (99%) rename snowflake/ml/{ => 
sklearn}/preprocessing/simple_imputer.py (99%) rename snowflake/ml/{ => sklearn}/preprocessing/standard_scaler.py (99%) rename snowflake/ml/{modeling => sklearn}/semi_supervised/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/semi_supervised/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/svm/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/svm/estimators_info.bzl (100%) rename snowflake/ml/{modeling => sklearn}/tree/BUILD.bazel (100%) rename snowflake/ml/{modeling => sklearn}/tree/estimators_info.bzl (100%) rename snowflake/ml/{modeling => }/xgboost/BUILD.bazel (100%) rename snowflake/ml/{modeling => }/xgboost/estimators_info.bzl (100%) rename tests/integ/snowflake/ml/{modeling => }/lightgbm/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/calibration/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/cluster/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/compose/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/covariance/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/decomposition/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/discriminant_analysis/BUILD.bazel (59%) rename tests/integ/snowflake/ml/{modeling => sklearn}/ensemble/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/feature_selection/BUILD.bazel (60%) rename tests/integ/snowflake/ml/{ => sklearn}/framework/BUILD.bazel (69%) rename tests/integ/snowflake/ml/{ => sklearn}/framework/test_base.py (95%) rename tests/integ/snowflake/ml/{ => sklearn}/framework/test_pipeline.py (97%) rename tests/integ/snowflake/ml/{ => sklearn}/framework/utils.py (100%) rename tests/integ/snowflake/ml/{modeling => sklearn}/gaussian_process/BUILD.bazel (60%) rename tests/integ/snowflake/ml/{modeling => sklearn}/impute/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/isotonic/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/kernel_approximation/BUILD.bazel (59%) rename tests/integ/snowflake/ml/{modeling => sklearn}/kernel_ridge/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/linear_model/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/manifold/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/mixture/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/model_selection/BUILD.bazel (60%) rename tests/integ/snowflake/ml/{modeling => sklearn}/multiclass/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/multioutput/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/naive_bayes/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{modeling => sklearn}/neighbors/BUILD.bazel (62%) rename tests/integ/snowflake/ml/{modeling => sklearn}/neural_network/BUILD.bazel (61%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/BUILD.bazel (53%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_binarizer.py (95%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_drop_input_cols.py (94%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_k_bins_discretizer.py (98%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_label_encoder.py (96%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_max_abs_scaler.py (95%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_min_max_scaler.py (97%) rename 
tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_normalizer.py (96%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_one_hot_encoder.py (99%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_ordinal_encoder.py (99%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_robust_scaler.py (97%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_simple_imputer.py (98%) rename tests/integ/snowflake/ml/{ => sklearn}/preprocessing/test_standard_scaler.py (98%) rename tests/integ/snowflake/ml/{modeling => sklearn}/semi_supervised/BUILD.bazel (60%) rename tests/integ/snowflake/ml/{modeling => sklearn}/svm/BUILD.bazel (63%) rename tests/integ/snowflake/ml/{modeling => sklearn}/tree/BUILD.bazel (63%) rename tests/integ/snowflake/ml/{modeling => }/xgboost/BUILD.bazel (61%) diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index a9fb458f..238929b7 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -25,7 +25,7 @@ requirements: - cloudpickle - fsspec>=2022.11,<=2023.1 - numpy>=1.23,<2 - - packaging>=23.0,<24 + - packaging>=20.9,<24 - pandas>=1.0.0,<2 # Limit since 2.x is not available in Snowflake Anaconda Channel yet. - pyyaml>=6.0,<7 - scikit-learn>=1.2.1,<2 diff --git a/ci/type_ignored_targets b/ci/type_ignored_targets index 7a1bfbd0..d576d2a2 100644 --- a/ci/type_ignored_targets +++ b/ci/type_ignored_targets @@ -1,4 +1,32 @@ //snowflake/ml/experimental/... -//snowflake/ml/modeling/... +//tests/integ/snowflake/ml/_internal/... //tests/integ/snowflake/ml/extra_tests/... -//tests/integ/snowflake/ml/preprocessing/... +//tests/integ/snowflake/ml/sklearn/preprocessing/... + +//snowflake/ml/sklearn/linear_model/... +//snowflake/ml/sklearn/ensemble/... +//snowflake/ml/sklearn/svm/... +//snowflake/ml/sklearn/neural_network/... +//snowflake/ml/sklearn/tree/... +//snowflake/ml/sklearn/calibration/... +//snowflake/ml/sklearn/cluster/... +//snowflake/ml/sklearn/compose/... +//snowflake/ml/sklearn/covariance/... +//snowflake/ml/sklearn/decomposition/... +//snowflake/ml/sklearn/discriminant_analysis/... +//snowflake/ml/sklearn/feature_selection/... +//snowflake/ml/sklearn/gaussian_process/... +//snowflake/ml/sklearn/impute/... +//snowflake/ml/sklearn/isotonic/... +//snowflake/ml/sklearn/kernel_approximation/... +//snowflake/ml/sklearn/kernel_ridge/... +//snowflake/ml/sklearn/manifold/... +//snowflake/ml/sklearn/mixture/... +//snowflake/ml/sklearn/model_selection/... +//snowflake/ml/sklearn/multiclass/... +//snowflake/ml/sklearn/multioutput/... +//snowflake/ml/sklearn/naive_bayes/... +//snowflake/ml/sklearn/neighbors/... +//snowflake/ml/sklearn/semi_supervised/... +//snowflake/ml/xgboost/... +//snowflake/ml/lightgbm/... 
diff --git a/codegen/codegen_rules.bzl b/codegen/codegen_rules.bzl index ab9bad9f..5afd9d26 100644 --- a/codegen/codegen_rules.bzl +++ b/codegen/codegen_rules.bzl @@ -82,12 +82,13 @@ def autogen_estimators(module, estimator_info_list): srcs = [":generate_{}".format(e.normalized_class_name)], deps = [ ":init", - "//snowflake/ml/framework:framework", + "//snowflake/ml/sklearn/framework:framework", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal/utils:temp_file_utils", "//snowflake/ml/_internal/utils:query_result_checker", "//snowflake/ml/_internal/utils:pkg_version_utils", "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/model:model_signature", ], tags = ["skip_mypy_check"], ) diff --git a/codegen/sklearn_wrapper_generator.py b/codegen/sklearn_wrapper_generator.py index ebb716d4..38816d10 100644 --- a/codegen/sklearn_wrapper_generator.py +++ b/codegen/sklearn_wrapper_generator.py @@ -311,9 +311,9 @@ def get_snow_ml_module_name(module_name: str) -> str: """ tokens = module_name.split(".") if tokens[0] == "sklearn": - return "snowflake.ml.modeling." + ".".join(module_name.split(".")[1:]) + return "snowflake.ml.sklearn." + ".".join(module_name.split(".")[1:]) else: - return "snowflake.ml.modeling." + module_name + return "snowflake.ml." + module_name @staticmethod def can_generate_wrapper(class_object: Tuple[str, type]) -> bool: diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template index e45d58cd..28d664cd 100644 --- a/codegen/sklearn_wrapper_template.py_template +++ b/codegen/sklearn_wrapper_template.py_template @@ -12,7 +12,7 @@ import numpy as np {transform.estimator_imports} from sklearn.utils.metaestimators import available_if -from snowflake.ml.framework.base import BaseTransformer +from snowflake.ml.sklearn.framework.base import BaseTransformer from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator from snowflake.ml._internal.utils import pkg_version_utils, identifier @@ -21,6 +21,14 @@ from snowflake.snowpark import DataFrame, Session from snowflake.snowpark.functions import pandas_udf, sproc from snowflake.snowpark.types import PandasSeries +from snowflake.ml.model.model_signature import ( + DataType, + FeatureSpec, + ModelSignature, + _infer_signature, + _rename_features, +) + _PROJECT = "ModelDevelopment" # Derive subproject from module name by removing "sklearn" # and converting module name from underscore to CamelCase @@ -116,6 +124,7 @@ class {transform.original_class_name}(BaseTransformer): self._sklearn_object = {transform.root_module_name}.{transform.original_class_name}( {transform.sklearn_init_arguments} ) + self._model_signature_dict = None {transform.estimator_init_member_args} def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None: @@ -161,6 +170,7 @@ class {transform.original_class_name}(BaseTransformer): "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." 
) self._is_fitted = True + self._get_model_signatures(dataset) return self def _fit_snowpark(self, dataset: DataFrame) -> None: @@ -310,9 +320,9 @@ class {transform.original_class_name}(BaseTransformer): query, stage_transform_file_name, stage_result_file_name, - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols), - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols), - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col), + identifier.get_unescaped_names(self.input_cols), + identifier.get_unescaped_names(self.label_cols), + identifier.get_unescaped_names(self.sample_weight_col), statement_params=statement_params, ) @@ -378,7 +388,7 @@ class {transform.original_class_name}(BaseTransformer): # Input columns for UDF are sorted by column names. # We need actual order of input cols to reorder dataframe before calling inference methods. input_cols = self.input_cols - unquoted_input_cols = identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols) + unquoted_input_cols = identifier.get_unescaped_names(self.input_cols) statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, @@ -511,9 +521,37 @@ class {transform.original_class_name}(BaseTransformer): expected_output_cols_list: List[str] ) -> pd.DataFrame: output_cols = expected_output_cols_list.copy() - transformed_numpy_array = getattr(self._sklearn_object, inference_method)( - dataset[self.input_cols] + + # Model expects exactly the same column names in the input df for the predict call. + # Given the scenario that the user uses a Snowpark DataFrame in the fit call but a pandas DataFrame in the predict call, + # input cols need to match either unquoted or quoted identifiers. + input_cols = self.input_cols + unquoted_input_cols = identifier.get_unescaped_names(self.input_cols) + + estimator = self._sklearn_object + + input_df = dataset[input_cols] # Select input columns with quoted column names. + if hasattr(estimator, "feature_names_in_"): + missing_features = [] + for i, f in enumerate(getattr(estimator, "feature_names_in_")): + if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f): + missing_features.append(f) + + if len(missing_features) > 0: + raise ValueError( + "The feature names should match with those that were passed during fit.\n" + f"Features seen during fit call but not present in the input: {{missing_features}}\n" + f"Features in the input dataframe : {{input_cols}}\n" + ) + input_df.columns = getattr(estimator, "feature_names_in_") + else: + # Just rename the column names to unquoted identifiers. + input_df.columns = unquoted_input_cols # Replace the quoted column identifiers with unquoted column ids.
+ + transformed_numpy_array = getattr(estimator, inference_method)( + input_df ) + if ( isinstance(transformed_numpy_array, list) and len(transformed_numpy_array) > 0 @@ -974,12 +1012,45 @@ class {transform.original_class_name}(BaseTransformer): score_sproc_name, query, stage_score_file_name, - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols), - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols), - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col), + identifier.get_unescaped_names(self.input_cols), + identifier.get_unescaped_names(self.label_cols), + identifier.get_unescaped_names(self.sample_weight_col), statement_params=statement_params, ) cleanup_temp_files([local_score_file_name]) return score + + def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None: + self._model_signature_dict: Dict[str, ModelSignature] = dict() + + PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"] + + inputs = _infer_signature(dataset[self.input_cols], "input") + if hasattr(self, "predict"): + # For classifier, the type of predict is the same as the type of label + if self._sklearn_object._estimator_type == 'classifier': + outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output + outputs = _rename_features(outputs, self.output_cols) # rename the output columns + self._model_signature_dict["predict"] = ModelSignature(inputs, outputs) + # For regressor, the type of predict is float64 + elif self._sklearn_object._estimator_type == 'regressor': + outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols] + self._model_signature_dict["predict"] = ModelSignature(inputs, outputs) + + for prob_func in PROB_FUNCTIONS: + if hasattr(self, prob_func): + output_cols_prefix: str = f"{{prob_func}}_" + output_column_names = self._get_output_column_names(output_cols_prefix) + outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names] + self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs) + + ##TODO: Add support for transform method + + + @property + def model_signatures(self) -> Dict[str, ModelSignature]: + if self._model_signature_dict is None: + raise RuntimeError("Estimator not fitted before accessing property model_signatures! ") + return self._model_signature_dict diff --git a/conda-env-extended.yml b/conda-env-extended.yml index c4eb0517..23af7cb9 100644 --- a/conda-env-extended.yml +++ b/conda-env-extended.yml @@ -8,7 +8,6 @@ channels: - conda-forge dependencies: - - moto==4.0.11 # SNOW-690705 - torchdata==0.4.1 # SNOW-702102 # SNOW-747683: Tensorflow is available on snowflake conda channel, # however, macos-arm64 is only available on conda-forge. diff --git a/conda-env-snowflake.yml b/conda-env-snowflake.yml index 95dd6a79..e18cb088 100644 --- a/conda-env-snowflake.yml +++ b/conda-env-snowflake.yml @@ -17,12 +17,14 @@ dependencies: - boto3==1.24.28 - conda-libmamba-solver==23.1.0 - coverage==6.3.2 # not a package dependency. + - docker-py==4.4.1 - flask-cors==3.0.10 - flask==2.1.3 - fsspec==2022.10.0 - inflection==0.5.1 - joblib==1.1.1 - lightgbm==3.3.5 + - moto==4.0.11 - networkx==2.8.4 - numpy==1.23.4 - packaging==23.0 @@ -38,4 +40,4 @@ dependencies: - sqlparse==0.4.3 - typing-extensions==4.5.0 - xgboost==1.7.3 - - mypy==0.981 # not a package dependency. + - mypy==0.981 # not a package dependency. 
diff --git a/conda-env.yml b/conda-env.yml index 51f264e2..e4f76a04 100644 --- a/conda-env.yml +++ b/conda-env.yml @@ -12,6 +12,7 @@ dependencies: - boto3==1.24.28 - conda-libmamba-solver==23.1.0 - coverage==6.3.2 + - docker-py==4.4.1 - flask-cors==3.0.10 - flask==2.1.3 - fsspec==2022.10.0 diff --git a/snowflake/ml/BUILD.bazel b/snowflake/ml/BUILD.bazel index f4736ab5..f0e35b67 100644 --- a/snowflake/ml/BUILD.bazel +++ b/snowflake/ml/BUILD.bazel @@ -41,7 +41,7 @@ snowml_wheel( "cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it. "fsspec[http]>=2022.11,<=2023.1", "numpy>=1.23,<2", - "packaging>=23.0,<24", + "packaging>=20.9,<24", "pandas>=1.0.0,<2", # Limit since 2.x is not available in Snowflake Anaconda Channel yet. "pyyaml>=6.0,<7", "scikit-learn>=1.2.1,<2", @@ -55,37 +55,37 @@ snowml_wheel( version = VERSION, deps = [ "//snowflake/ml/metrics:metrics_pkg", - "//snowflake/ml/preprocessing:preprocessing_pkg", + "//snowflake/ml/sklearn/preprocessing:preprocessing_pkg", "//snowflake/ml/utils:utils_pkg", "//snowflake/ml/fileset:fileset_pkg", "//snowflake/ml/registry:model_registry_pkg", # Auotgen packages - "//snowflake/ml/modeling/linear_model:sklearn_linear_model_pkg", - "//snowflake/ml/modeling/ensemble:sklearn_ensemble_pkg", - "//snowflake/ml/modeling/svm:sklearn_svm_pkg", - "//snowflake/ml/modeling/neural_network:sklearn_neural_network_pkg", - "//snowflake/ml/modeling/tree:sklearn_tree_pkg", - "//snowflake/ml/modeling/xgboost:xgboost_pkg", - "//snowflake/ml/modeling/calibration:sklearn_calibration_pkg", - "//snowflake/ml/modeling/cluster:sklearn_cluster_pkg", - "//snowflake/ml/modeling/compose:sklearn_compose_pkg", - "//snowflake/ml/modeling/covariance:sklearn_covariance_pkg", - "//snowflake/ml/modeling/decomposition:sklearn_decomposition_pkg", - "//snowflake/ml/modeling/discriminant_analysis:sklearn_discriminant_analysis_pkg", - "//snowflake/ml/modeling/feature_selection:sklearn_feature_selection_pkg", - "//snowflake/ml/modeling/gaussian_process:sklearn_gaussian_process_pkg", - "//snowflake/ml/modeling/impute:sklearn_impute_pkg", - "//snowflake/ml/modeling/isotonic:sklearn_isotonic_pkg", - "//snowflake/ml/modeling/kernel_approximation:sklearn_kernel_approximation_pkg", - "//snowflake/ml/modeling/kernel_ridge:sklearn_kernel_ridge_pkg", - "//snowflake/ml/modeling/manifold:sklearn_manifold_pkg", - "//snowflake/ml/modeling/mixture:sklearn_mixture_pkg", - "//snowflake/ml/modeling/model_selection:sklearn_model_selection_pkg", - "//snowflake/ml/modeling/multiclass:sklearn_multiclass_pkg", - "//snowflake/ml/modeling/multioutput:sklearn_multioutput_pkg", - "//snowflake/ml/modeling/naive_bayes:sklearn_naive_bayes_pkg", - "//snowflake/ml/modeling/neighbors:sklearn_neighbors_pkg", - "//snowflake/ml/modeling/semi_supervised:sklearn_semi_supervised_pkg", - "//snowflake/ml/modeling/lightgbm:lightgbm_pkg", + "//snowflake/ml/sklearn/linear_model:sklearn_linear_model_pkg", + "//snowflake/ml/sklearn/ensemble:sklearn_ensemble_pkg", + "//snowflake/ml/sklearn/svm:sklearn_svm_pkg", + "//snowflake/ml/sklearn/neural_network:sklearn_neural_network_pkg", + "//snowflake/ml/sklearn/tree:sklearn_tree_pkg", + "//snowflake/ml/sklearn/calibration:sklearn_calibration_pkg", + "//snowflake/ml/sklearn/cluster:sklearn_cluster_pkg", + "//snowflake/ml/sklearn/compose:sklearn_compose_pkg", + "//snowflake/ml/sklearn/covariance:sklearn_covariance_pkg", + "//snowflake/ml/sklearn/decomposition:sklearn_decomposition_pkg", + 
"//snowflake/ml/sklearn/discriminant_analysis:sklearn_discriminant_analysis_pkg", + "//snowflake/ml/sklearn/feature_selection:sklearn_feature_selection_pkg", + "//snowflake/ml/sklearn/gaussian_process:sklearn_gaussian_process_pkg", + "//snowflake/ml/sklearn/impute:sklearn_impute_pkg", + "//snowflake/ml/sklearn/isotonic:sklearn_isotonic_pkg", + "//snowflake/ml/sklearn/kernel_approximation:sklearn_kernel_approximation_pkg", + "//snowflake/ml/sklearn/kernel_ridge:sklearn_kernel_ridge_pkg", + "//snowflake/ml/sklearn/manifold:sklearn_manifold_pkg", + "//snowflake/ml/sklearn/mixture:sklearn_mixture_pkg", + "//snowflake/ml/sklearn/model_selection:sklearn_model_selection_pkg", + "//snowflake/ml/sklearn/multiclass:sklearn_multiclass_pkg", + "//snowflake/ml/sklearn/multioutput:sklearn_multioutput_pkg", + "//snowflake/ml/sklearn/naive_bayes:sklearn_naive_bayes_pkg", + "//snowflake/ml/sklearn/neighbors:sklearn_neighbors_pkg", + "//snowflake/ml/sklearn/semi_supervised:sklearn_semi_supervised_pkg", + "//snowflake/ml/xgboost:xgboost_pkg", + "//snowflake/ml/lightgbm:lightgbm_pkg", ], ) diff --git a/snowflake/ml/_internal/BUILD.bazel b/snowflake/ml/_internal/BUILD.bazel index d770f973..6a98bd1f 100644 --- a/snowflake/ml/_internal/BUILD.bazel +++ b/snowflake/ml/_internal/BUILD.bazel @@ -27,7 +27,6 @@ py_library( py_test( name = "file_utils_test", srcs = ["file_utils_test.py"], - timeout = "short", deps = [ ":file_utils", ], diff --git a/snowflake/ml/_internal/file_utils.py b/snowflake/ml/_internal/file_utils.py index 5a0e2b10..11960346 100644 --- a/snowflake/ml/_internal/file_utils.py +++ b/snowflake/ml/_internal/file_utils.py @@ -2,8 +2,9 @@ import io import os import shutil +import tempfile import zipfile -from typing import Generator, Optional +from typing import IO, Generator, Optional GENERATED_PY_FILE_EXT = (".pyc", ".pyo", ".pyd", ".pyi") @@ -69,7 +70,6 @@ def zip_file_or_directory_to_stream( with io.BytesIO() as input_stream: with zipfile.ZipFile(input_stream, mode="w", compression=zipfile.ZIP_DEFLATED) as zf: - if os.path.realpath(path) != os.path.realpath(start_path): cur_path = os.path.dirname(path) while os.path.realpath(cur_path) != os.path.realpath(start_path): @@ -92,3 +92,20 @@ def zip_file_or_directory_to_stream( zf.write(path, os.path.relpath(path, start_path)) yield input_stream + + +@contextlib.contextmanager +def unzip_stream_in_temp_dir(stream: IO[bytes], temp_root: Optional[str] = None) -> Generator[str, None, None]: + """Unzip an IO stream into a temporary directory. + + Args: + stream: The input stream. + temp_root: The root directory where the temporary directory should created in. Defaults to None. + + Yields: + The path to the created temporary directory. 
+ """ + with tempfile.TemporaryDirectory(dir=temp_root) as tempdir: + with zipfile.ZipFile(stream, mode="r", compression=zipfile.ZIP_DEFLATED) as zf: + zf.extractall(path=tempdir) + yield tempdir diff --git a/snowflake/ml/_internal/file_utils_test.py b/snowflake/ml/_internal/file_utils_test.py index 3d3a5eb2..d46aa956 100644 --- a/snowflake/ml/_internal/file_utils_test.py +++ b/snowflake/ml/_internal/file_utils_test.py @@ -1,6 +1,7 @@ -import importlib +# import importlib import os -import sys + +# import sys import tempfile from absl.testing import absltest @@ -15,38 +16,58 @@ def get_file(): """ -class UtilsTest(absltest.TestCase): +class FileUtilsTest(absltest.TestCase): def test_zip_file_or_directory_to_stream(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: leading_path = os.path.join(tmpdir, "test") - fake_mod_dirpath = os.path.join(leading_path, "snowflake", "snowpark", "fake_module") + fake_mod_dirpath = os.path.join(leading_path, "snowflake", "fake", "fake_module") os.makedirs(fake_mod_dirpath) - py_file_path = os.path.join(fake_mod_dirpath, "p.py") - with open(py_file_path, "w") as f: - f.write(PY_SRC) + # TODO(SNOW-831507): Test disabled because it breaks the coverage + # py_file_path = os.path.join(fake_mod_dirpath, "p.py") + # with open(py_file_path, "w") as f: + # f.write(PY_SRC) zip_module_filename = os.path.join(tmpdir, "fake_module.zip") - with file_utils.zip_file_or_directory_to_stream(py_file_path, leading_path) as input_stream: - with open(zip_module_filename, "wb") as f: - f.write(input_stream.getbuffer()) + # with file_utils.zip_file_or_directory_to_stream(py_file_path, leading_path) as input_stream: + # with open(zip_module_filename, "wb") as f: + # f.write(input_stream.getbuffer()) - sys.path.insert(0, os.path.abspath(zip_module_filename)) + # sys.path.insert(0, os.path.abspath(zip_module_filename)) - importlib.import_module("snowflake.snowpark.fake_module.p") + # importlib.import_module("snowflake.fake.fake_module.p") - sys.path.remove(os.path.abspath(zip_module_filename)) + # sys.path.remove(os.path.abspath(zip_module_filename)) - with file_utils.zip_file_or_directory_to_stream(fake_mod_dirpath, leading_path) as input_stream: - with open(zip_module_filename, "wb") as f: - f.write(input_stream.getbuffer()) + # with file_utils.zip_file_or_directory_to_stream(fake_mod_dirpath, leading_path) as input_stream: + # with open(zip_module_filename, "wb") as f: + # f.write(input_stream.getbuffer()) - sys.path.insert(0, os.path.abspath(zip_module_filename)) + # sys.path.insert(0, os.path.abspath(zip_module_filename)) - importlib.import_module("snowflake.snowpark.fake_module.p") + # importlib.import_module("snowflake.fake.fake_module.p") - sys.path.remove(os.path.abspath(zip_module_filename)) + # sys.path.remove(os.path.abspath(zip_module_filename)) with file_utils.zip_file_or_directory_to_stream(fake_mod_dirpath, fake_mod_dirpath) as input_stream: with open(zip_module_filename, "wb") as f: f.write(input_stream.getbuffer()) + + def test_unzip_stream_in_temp_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + leading_path = os.path.join(tmpdir, "test") + fake_mod_dirpath = os.path.join(leading_path, "snowflake", "fake", "fake_module") + os.makedirs(fake_mod_dirpath) + + py_file_path = os.path.join(fake_mod_dirpath, "p.py") + with open(py_file_path, "w") as f: + f.write(PY_SRC) + + with file_utils.zip_file_or_directory_to_stream(py_file_path, leading_path) as input_stream: + with file_utils.unzip_stream_in_temp_dir(input_stream, temp_root=tmpdir) as 
sub_tempdir: + with open(os.path.join(sub_tempdir, "snowflake", "fake", "fake_module", "p.py")) as f: + self.assertEqual(f.read(), PY_SRC) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/_internal/init_utils.py b/snowflake/ml/_internal/init_utils.py index 1a457dd7..ec6d836c 100644 --- a/snowflake/ml/_internal/init_utils.py +++ b/snowflake/ml/_internal/init_utils.py @@ -9,7 +9,7 @@ def fetch_classes_from_modules_in_pkg_dir(pkg_dir: str, pkg_name: str) -> Dict[s Args: pkg_dir: Path of the package directory. - pkg_name: Package name. Example, "snowflake.ml.preprocessing". + pkg_name: Package name. Example, "snowflake.ml.sklearn.preprocessing". Returns: A dict with class_name as key and class object as value. diff --git a/snowflake/ml/_internal/telemetry.py b/snowflake/ml/_internal/telemetry.py index cbdd0a77..5e33265e 100644 --- a/snowflake/ml/_internal/telemetry.py +++ b/snowflake/ml/_internal/telemetry.py @@ -6,6 +6,7 @@ import functools import inspect import operator +import threading import types from typing import ( Any, @@ -28,6 +29,10 @@ from snowflake.snowpark import dataframe, exceptions, session from snowflake.snowpark._internal import utils +_rlock = threading.RLock() +_log_counter = 0 +_FLUSH_SIZE = 10 + _Args = ParamSpec("_Args") _ReturnValue = TypeVar("_ReturnValue") @@ -303,8 +308,12 @@ def wrap(*args: Any, **kwargs: Any) -> _ReturnValue: return res finally: telemetry.send_function_usage_telemetry(**telemetry_args) - if "error" in telemetry_args: - telemetry.send_batch() + with _rlock: + global _log_counter + _log_counter += 1 + if _log_counter >= _FLUSH_SIZE or "error" in telemetry_args: + telemetry.send_batch() + _log_counter = 0 return cast(Callable[_Args, _ReturnValue], wrap) @@ -451,9 +460,6 @@ def _extract_arg_value(field: str, func_spec: inspect.FullArgSpec, args: Any, kw class _SourceTelemetryClient: - - DEFAULT_FORCE_FLUSH_SIZE = 10 - def __init__( self, conn: connector.SnowflakeConnection, @@ -486,9 +492,6 @@ def __init__( self.python_version: str = env.PYTHON_VERSION self.os: str = env.OS - if self._telemetry: - self._telemetry._flush_size = _SourceTelemetryClient.DEFAULT_FORCE_FLUSH_SIZE - def _send(self, msg: Dict[str, Any], timestamp: Optional[int] = None) -> None: """ Add telemetry data to a batch in connector client. 
diff --git a/snowflake/ml/_internal/telemetry_test.py b/snowflake/ml/_internal/telemetry_test.py index 1e831d92..925c5172 100644 --- a/snowflake/ml/_internal/telemetry_test.py +++ b/snowflake/ml/_internal/telemetry_test.py @@ -51,9 +51,6 @@ def foo(self, param: Any) -> None: test_obj = DummyObject() test_obj.foo(param="val") self.mock_telemetry.try_add_log_to_batch.assert_called() - self.assertEqual( - utils_telemetry._SourceTelemetryClient.DEFAULT_FORCE_FLUSH_SIZE, self.mock_telemetry._flush_size - ) message = self.mock_telemetry.try_add_log_to_batch.call_args.args[0].to_dict()["message"] data = message["data"] @@ -338,6 +335,23 @@ def foo2(self) -> "DummyObject": self.assertIn("DummyObject.foo", actual_statement_params[utils_telemetry.TelemetryField.KEY_FUNC_NAME.value]) self.assertFalse(hasattr(test_obj.foo2(), "_statement_params")) + @mock.patch("snowflake.snowpark.session._get_active_sessions") + def test_client_telemetry_flush_size(self, mock_get_active_sessions: mock.MagicMock) -> None: + mock_get_active_sessions.return_value = {self.mock_session} + + class DummyObject: + @utils_telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + ) + def foo(self) -> None: + pass + + test_obj = DummyObject() + for _ in range(utils_telemetry._FLUSH_SIZE): + test_obj.foo() + self.mock_telemetry.send_batch.assert_called() + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/_internal/utils/identifier.py b/snowflake/ml/_internal/utils/identifier.py index a95b32fc..bee92e44 100644 --- a/snowflake/ml/_internal/utils/identifier.py +++ b/snowflake/ml/_internal/utils/identifier.py @@ -1,6 +1,8 @@ import re from typing import Any, List, Optional, Tuple, Union, overload +from snowflake.snowpark._internal.analyzer import analyzer_utils + # Snowflake Identifier Regex. See https://docs.snowflake.com/en/sql-reference/identifiers-syntax.html. _SF_UNQUOTED_IDENTIFIER = "[A-Za-z_][A-Za-z0-9_$]*" SF_QUOTED_IDENTIFIER = '"(?:[^"]|"")*"' @@ -18,7 +20,7 @@ def _is_quoted(id: str) -> bool: NOTE: Snowflake treats all identifiers as UPPERCASE by default. That is 'Hello' would become 'HELLO'. To preserve case, one needs to use quoted identifiers, e.g. "Hello" (note the double quote). Callers must take care of that quoting themselves. This library assumes that if there is double-quote both sides, it is escaped, otherwise does not - require. Anything in the middle is undefined. + require. Args: id: The string to be checked @@ -36,51 +38,35 @@ def _is_quoted(id: str) -> bool: if id[0] == '"' and id[-1] == '"': if len(id) == 2: raise ValueError("Invalid id passed.") + if not QUOTED_IDENTIFIER_RE.match(id): + raise ValueError("Invalid id passed.") return True + if not UNQUOTED_CASE_INSENSITIVE_RE.match(id): + raise ValueError("Invalid id passed.") return False # To keep mypy happy -def remove_quote_if_quoted(id: str) -> str: - """Remove double quotes from id if quoted. +def _get_unescaped_name(id: str) -> str: + """Remove double quotes and unescape quotes between them from id if quoted. + Uppercase if not quoted. NOTE: See note in :meth:`_is_quoted`. Args: id: The string to be checked & treated. - Returns: - String with quotes removed if quoted; original string otherwise. - """ - if _is_quoted(id): - return id[1:-1] - return id - - -def remove_and_unescape_quote_if_quoted(id: str) -> str: - """Remove double quotes and escape quotes between them from id if quoted. - - NOTE: See note in :meth:`_is_quoted`. - - Args: - id: The string to be checked & treated. 
- - Raises: - ValueError: If the identifier is unquoted, it does not match the syntax. - ValueError: There is a continuous odd number of quotes, thus cannot unescape. Example '""a""' is invalid. - Returns: String with quotes removed if quoted; original string otherwise. """ if not _is_quoted(id): - if not UNQUOTED_CASE_INSENSITIVE_RE.match(id): - raise ValueError("Invalid id passed.") - return id - if not QUOTED_IDENTIFIER_RE.match(id): - raise ValueError("Invalid id passed.") + return id.upper() unquoted_id = id[1:-1] return unquoted_id.replace('""', '"') +quote_name_without_upper_casing = analyzer_utils.quote_name_without_upper_casing + + def concat_names(ids: List[str]) -> str: """Concatenates `ids` to form one valid id. @@ -99,11 +85,11 @@ def concat_names(ids: List[str]) -> str: # If any part is quoted, the user cares about case. quotes_needed = True # Remove quotes before using it. - id = id[1:-1] + id = _get_unescaped_name(id) parts.append(id) final_id = "".join(parts) if quotes_needed: - return f'"{final_id}"' + return quote_name_without_upper_casing(final_id) return final_id @@ -133,23 +119,21 @@ def parse_schema_level_object_identifier( @overload -def get_equivalent_identifier_in_the_response_pandas_dataframe(ids: None) -> None: +def get_unescaped_names(ids: None) -> None: ... @overload -def get_equivalent_identifier_in_the_response_pandas_dataframe(ids: str) -> str: +def get_unescaped_names(ids: str) -> str: ... @overload -def get_equivalent_identifier_in_the_response_pandas_dataframe(ids: List[str]) -> List[str]: +def get_unescaped_names(ids: List[str]) -> List[str]: ... -def get_equivalent_identifier_in_the_response_pandas_dataframe( - ids: Optional[Union[str, List[str]]] -) -> Optional[Union[str, List[str]]]: +def get_unescaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[str, List[str]]]: """Given a user provided identifier(s), this method will compute the equivalent column name identifier(s) in the response pandas dataframe(i.e., in the respones of snowpark_df.to_pandas()) using the rules defined here https://docs.snowflake.com/en/sql-reference/identifiers-syntax. @@ -164,19 +148,11 @@ def get_equivalent_identifier_in_the_response_pandas_dataframe( ValueError: if input types is unsupported or column name identifiers are invalid. """ - def _resolve(id: str) -> str: - if UNQUOTED_CASE_INSENSITIVE_RE.fullmatch(id): - # Unquoted case insensitive identifier. Snowflake would convert it to uppercase. - return id.upper() - else: - # Quoted or unquoted identifer with special charcters. Just remove quotes and return. - return remove_quote_if_quoted(id) - if ids is None: return None elif type(ids) is list: - return [_resolve(id) for id in ids] + return [_get_unescaped_name(id) for id in ids] elif type(ids) is str: - return _resolve(ids) + return _get_unescaped_name(ids) else: raise ValueError("Unsupported type. 
Only string or list of string are supported for selecting columns.") diff --git a/snowflake/ml/_internal/utils/identifier_test.py b/snowflake/ml/_internal/utils/identifier_test.py index b27b08cd..78815f1f 100644 --- a/snowflake/ml/_internal/utils/identifier_test.py +++ b/snowflake/ml/_internal/utils/identifier_test.py @@ -4,31 +4,38 @@ class SnowflakeIdentifierTest(absltest.TestCase): - def test_remove_quote(self) -> None: - """Tests if quote is removed correctly.""" - self.assertEqual("foo", identifier.remove_quote_if_quoted('"foo"')) - self.assertEqual('foo"bar', identifier.remove_quote_if_quoted('"foo"bar"')) + def test_is_quote_valid(self) -> None: + self.assertTrue(identifier._is_quoted('"foo"')) + self.assertTrue(identifier._is_quoted('"""foo"""')) + self.assertFalse(identifier._is_quoted("foo")) - def test_quote_not_removed(self) -> None: - """Tests if quote is removed correctly.""" - self.assertEqual('"foo', identifier.remove_quote_if_quoted('"foo')) - self.assertEqual('foo"', identifier.remove_quote_if_quoted('foo"')) - self.assertEqual('foo"bar', identifier.remove_quote_if_quoted('foo"bar')) - - def test_remove_and_unescape_quote_if_quoted(self) -> None: - self.assertEqual("foo", identifier.remove_and_unescape_quote_if_quoted('"foo"')) - self.assertEqual('"foo"', identifier.remove_and_unescape_quote_if_quoted('"""foo"""')) - self.assertEqual('foo"bar', identifier.remove_and_unescape_quote_if_quoted('"foo""bar"')) + def test_is_quote_invalid(self) -> None: with self.assertRaises(ValueError): - identifier.remove_and_unescape_quote_if_quoted('foo"') + identifier._is_quoted('foo"') with self.assertRaises(ValueError): - identifier.remove_and_unescape_quote_if_quoted('"bar') + identifier._is_quoted('"bar') with self.assertRaises(ValueError): - identifier.remove_and_unescape_quote_if_quoted('foo"bar') + identifier._is_quoted('foo"bar') with self.assertRaises(ValueError): - identifier.remove_and_unescape_quote_if_quoted('""foo""') + identifier._is_quoted('""foo""') with self.assertRaises(ValueError): - identifier.remove_and_unescape_quote_if_quoted('"foo"""bar"') + identifier._is_quoted('"foo"""bar"') + + def test_get_unescaped_names(self) -> None: + self.assertEqual("FOO", identifier.get_unescaped_names("foo")) + self.assertEqual("foo", identifier.get_unescaped_names('"foo"')) + self.assertEqual('"foo"', identifier.get_unescaped_names('"""foo"""')) + self.assertEqual('foo"bar', identifier.get_unescaped_names('"foo""bar"')) + + input_and_expected_output_tuples = [ + (None, None), + ("Abc", "ABC"), + ('"Abc"', "Abc"), + (["Abc", '"Abc"'], ["ABC", "Abc"]), + ] + + for input, expected_output in input_and_expected_output_tuples: + self.assertEqual(identifier.get_unescaped_names(input), expected_output) def test_plan_concat(self) -> None: """Test vanilla concat with no quotes.""" @@ -63,19 +70,6 @@ def test_parse_schema_level_object_identifier(self) -> None: tuple(test_case[1:]), identifier.parse_schema_level_object_identifier(test_case[0]) ) - def test_get_equivalent_identifier_in_the_response_pandas_dataframe(self) -> None: - input_and_expected_output_tuples = [ - (None, None), - ("Abc", "ABC"), - ('"Abc"', "Abc"), - (["Abc", '"Abc"'], ["ABC", "Abc"]), - ] - - for (input, expected_output) in input_and_expected_output_tuples: - self.assertEqual( - identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(input), expected_output - ) - if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/modeling/lightgbm/BUILD.bazel b/snowflake/ml/lightgbm/BUILD.bazel similarity 
index 100% rename from snowflake/ml/modeling/lightgbm/BUILD.bazel rename to snowflake/ml/lightgbm/BUILD.bazel diff --git a/snowflake/ml/modeling/lightgbm/estimators_info.bzl b/snowflake/ml/lightgbm/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/lightgbm/estimators_info.bzl rename to snowflake/ml/lightgbm/estimators_info.bzl diff --git a/snowflake/ml/metrics/_utils.py b/snowflake/ml/metrics/_utils.py index 12a76cc9..e99d8942 100644 --- a/snowflake/ml/metrics/_utils.py +++ b/snowflake/ml/metrics/_utils.py @@ -11,9 +11,6 @@ from snowflake import snowpark from snowflake.snowpark import Session, functions as F, types as T -_PROJECT = "ModelDevelopment" -_SUBPROJECT = "Metrics" - def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, str]) -> str: """Registers accumulator UDTF in Snowflake and returns the name of the UDTF. @@ -36,7 +33,7 @@ def process(self, input_row: bytes) -> None: """Accumulates rows. Args: - input_row (bytes): numpy array serialized using cloudpickle. + input_row: numpy array serialized using cloudpickle. """ row = cloudpickle.loads(input_row) if self._accumulated_row is None: @@ -107,19 +104,19 @@ def __init__(self) -> None: # Square root of count - ddof self._sqrt_count_d = -1.0 - def process(self, input_row: List[float], count: str, ddof: str) -> None: + def process(self, input_row: List[float], count: int, ddof: int) -> None: """Computes sum and dot product. Args: - input_row (List[float]): List of floats. - count (str): Number of rows in the table. - ddof (str): delta degree of freedom + input_row: List of floats. + count: Number of rows in the table. + ddof: delta degree of freedom """ # 1. initialization of variables if not self._variables_initialized: self._n_cols = len(input_row) - self._count = int(count) - self._ddof = int(ddof) + self._count = count + self._ddof = ddof self._sqrt_count_d = math.sqrt(self._count - self._ddof) self._sum_by_count = np.zeros(self._n_cols) self._sum_by_countd = np.zeros(self._n_cols) @@ -168,7 +165,7 @@ def accumulate_batch_sum_and_dot_prod(self) -> None: T.StructField("part", T.StringType()), ] ), - input_types=[T.ArrayType(), T.StringType(), T.StringType()], + input_types=[T.ArrayType(), T.IntegerType(), T.IntegerType()], packages=["numpy", "cloudpickle"], name=sharded_dot_and_sum_computer, is_permanent=False, diff --git a/snowflake/ml/metrics/correlation.py b/snowflake/ml/metrics/correlation.py index 4d2287bc..413616d3 100644 --- a/snowflake/ml/metrics/correlation.py +++ b/snowflake/ml/metrics/correlation.py @@ -1,7 +1,6 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # -import inspect from typing import Collection, Optional import cloudpickle @@ -11,6 +10,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry from snowflake.ml.metrics import _utils +from snowflake.snowpark import functions as F _PROJECT = "ModelDevelopment" _SUBPROJECT = "Metrics" @@ -46,34 +46,28 @@ def correlation(*, df: snowpark.DataFrame, columns: Optional[Collection[str]] = Returns: Correlation matrix in pandas.DataFrame format. 
""" - statement_params = telemetry.get_function_usage_statement_params( - project=_PROJECT, - subproject=_SUBPROJECT, - function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), None), - ) + assert df._session is not None + session = df._session + statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) input_df, columns = _utils.validate_and_return_dataframe_and_columns(df=df, columns=columns) - assert input_df._session is not None, "input_df._session cannot be None" + count = input_df.count(statement_params=statement_params) + + # Register UDTFs. sharded_dot_and_sum_computer = _utils.register_sharded_dot_sum_computer( - session=input_df._session, statement_params=statement_params + session=session, statement_params=statement_params ) - dot_and_sum_accumulator = _utils.register_accumulator_udtf( - session=input_df._session, statement_params=statement_params - ) - count = input_df.count(statement_params=statement_params) + sharded_dot_and_sum_computer_udtf = F.table_function(sharded_dot_and_sum_computer) + accumulator = _utils.register_accumulator_udtf(session=session, statement_params=statement_params) + accumulator_udtf = F.table_function(accumulator) - # TODO: Move the below to snowpark dataframe operations - input_query = input_df.queries["queries"][-1] - query = f""" - with temp_table1 as - (select array_construct(*) as col from ({input_query})), - temp_table2 as - (select result as res, part from temp_table1, - table({sharded_dot_and_sum_computer}(temp_table1.col, '{str(count)}', '0'))) - select result, temp_table2.part from temp_table2, - table({dot_and_sum_accumulator}(temp_table2.res) over (partition by part)) - """ - results = input_df._session.sql(query).collect(statement_params=statement_params) + # Compute the confusion matrix. + temp_df1 = input_df.select(F.array_construct(*input_df.columns).alias("ARR_COL")) # type: ignore[arg-type] + temp_df2 = temp_df1.select( + sharded_dot_and_sum_computer_udtf(F.col("ARR_COL"), F.lit(count), F.lit(0)) # type: ignore[arg-type] + ).with_column_renamed("RESULT", "RES") + res_df = temp_df2.select(accumulator_udtf(F.col("RES")).over(partition_by="PART"), F.col("PART")) + results = res_df.collect(statement_params=statement_params) # The below computation can be moved to a third udtf. But there is not much benefit in terms of client side # resource consumption as the below computation is very fast (< 1 sec for 1000 cols). Memory is in the same order diff --git a/snowflake/ml/metrics/covariance.py b/snowflake/ml/metrics/covariance.py index dd995e19..8d5e11bb 100644 --- a/snowflake/ml/metrics/covariance.py +++ b/snowflake/ml/metrics/covariance.py @@ -1,7 +1,6 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # -import inspect from typing import Collection, Optional import cloudpickle @@ -10,7 +9,7 @@ from snowflake.ml._internal import telemetry from snowflake.ml.metrics import _utils -from snowflake.snowpark import DataFrame +from snowflake.snowpark import DataFrame, functions as F _PROJECT = "ModelDevelopment" _SUBPROJECT = "Metrics" @@ -49,35 +48,28 @@ def covariance(*, df: DataFrame, columns: Optional[Collection[str]] = None, ddof Returns: Covariance matrix in pandas.DataFrame format. 
""" - - statement_params = telemetry.get_function_usage_statement_params( - project=_PROJECT, - subproject=_SUBPROJECT, - function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), None), - ) + assert df._session is not None + session = df._session + statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) input_df, columns = _utils.validate_and_return_dataframe_and_columns(df=df, columns=columns) - assert input_df._session is not None, "input_df._session cannot be None" + count = input_df.count(statement_params=statement_params) + + # Register UDTFs. sharded_dot_and_sum_computer = _utils.register_sharded_dot_sum_computer( - session=input_df._session, statement_params=statement_params + session=session, statement_params=statement_params ) - dot_and_sum_accumulator = _utils.register_accumulator_udtf( - session=input_df._session, statement_params=statement_params - ) - count = input_df.count(statement_params=statement_params) + sharded_dot_and_sum_computer_udtf = F.table_function(sharded_dot_and_sum_computer) + accumulator = _utils.register_accumulator_udtf(session=session, statement_params=statement_params) + accumulator_udtf = F.table_function(accumulator) - # TODO: Move the below to snowpark dataframe operations - input_query = input_df.queries["queries"][-1] - query = f""" - with temp_table1 as - (select array_construct(*) as col from ({input_query})), - temp_table2 as - (select result as res, part from temp_table1, - table({sharded_dot_and_sum_computer}(temp_table1.col, '{str(count)}', '{str(ddof)}'))) - select result, temp_table2.part from temp_table2, - table({dot_and_sum_accumulator}(temp_table2.res) over (partition by part)) - """ - results = input_df._session.sql(query).collect(statement_params=statement_params) + # Compute the confusion matrix. + temp_df1 = input_df.select(F.array_construct(*input_df.columns).alias("ARR_COL")) # type: ignore[arg-type] + temp_df2 = temp_df1.select( + sharded_dot_and_sum_computer_udtf(F.col("ARR_COL"), F.lit(count), F.lit(ddof)) # type: ignore[arg-type] + ).with_column_renamed("RESULT", "RES") + res_df = temp_df2.select(accumulator_udtf(F.col("RES")).over(partition_by="PART"), F.col("PART")) + results = res_df.collect(statement_params=statement_params) # The below computation can be moved to a third udtf. But there is not much benefit in terms of client side # resource consumption as the below computation is very fast (< 1 sec for 1000 cols). 
Memory is in the same order diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index 647b9672..8ce82f4d 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -6,7 +6,7 @@ py_library( name = "type_hints", srcs = ["type_hints.py"], deps = [ - "//snowflake/ml/framework:framework" + "//snowflake/ml/sklearn/framework:framework" ] ) @@ -93,11 +93,12 @@ py_library( ":custom_model", ":model_signature", ":type_hints", + "//snowflake/ml/_internal:file_utils", "//snowflake/ml/model/_handlers:custom", "//snowflake/ml/model/_handlers:sklearn", "//snowflake/ml/model/_handlers:snowmlmodel", "//snowflake/ml/model/_handlers:xgboost", - "//snowflake/ml/framework:framework" + "//snowflake/ml/sklearn/framework:framework" ], ) @@ -161,6 +162,7 @@ py_test( ":custom_model", ":model_signature", ":type_hints", - "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/test_utils:mock_session", + "//snowflake/ml/sklearn/linear_model:linear_regression", ], ) diff --git a/snowflake/ml/model/_deploy_client/docker/BUILD.bazel b/snowflake/ml/model/_deploy_client/docker/BUILD.bazel deleted file mode 100644 index 1e5468f8..00000000 --- a/snowflake/ml/model/_deploy_client/docker/BUILD.bazel +++ /dev/null @@ -1,17 +0,0 @@ -load("//bazel:py_rules.bzl", "py_library") - -package(default_visibility = ["//visibility:public"]) - - -py_library( - name = "base_image_builder", - srcs = ["base_image_builder.py"], -) - -py_library( - name = "client_image_builder", - srcs = ["client_image_builder.py"], - deps = [ - ":base_image_builder", - ] -) diff --git a/snowflake/ml/model/_deploy_client/docker/client_image_builder.py b/snowflake/ml/model/_deploy_client/docker/client_image_builder.py deleted file mode 100644 index cdf5c016..00000000 --- a/snowflake/ml/model/_deploy_client/docker/client_image_builder.py +++ /dev/null @@ -1,29 +0,0 @@ -from snowflake.ml.model._deploy_client.docker import base_image_builder - - -class ClientImageBuilder(base_image_builder.ImageBuilder): - """ - Class for client-side image building and upload to model registry. - """ - - def build_and_upload_image(self) -> None: - """ - Builds and uploads an image to the model registry. - TODO: Actual implementation coming. - """ - self._build() - self._upload() - - def _build(self) -> None: - """ - Builds image in client side. - TODO: Actual implementation coming. - """ - pass - - def _upload(self) -> None: - """ - Uploads image to image registry. - TODO: Actual implementation coming. 
- """ - pass diff --git a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel new file mode 100644 index 00000000..e5f6f6b4 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel @@ -0,0 +1,43 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + + +py_library( + name = "base_image_builder", + srcs = ["base_image_builder.py"], +) + +py_library( + name = "client_image_builder", + srcs = ["client_image_builder.py"], + deps = [ + ":base_image_builder", + ":docker_context" + ] +) + +py_library( + name = "docker_context", + srcs = ["docker_context.py"], +) + +py_test( + name = "client_image_builder_test", + srcs = ["client_image_builder_test.py"], + deps = [ + ":client_image_builder", + ] +) + +py_test( + name = "docker_context_test", + srcs = ["docker_context_test.py"], + deps = [ + ":docker_context" + ], + data = [ + "templates/dockerfile_template", + "templates/app_template", + ] +) diff --git a/snowflake/ml/model/_deploy_client/docker/base_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py similarity index 100% rename from snowflake/ml/model/_deploy_client/docker/base_image_builder.py rename to snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py new file mode 100644 index 00000000..0626eb14 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py @@ -0,0 +1,88 @@ +import tempfile +from enum import Enum + +import docker + +from snowflake.ml.model._deploy_client.image_builds import ( + base_image_builder, + docker_context, +) + + +class Platform(Enum): + LINUX_AMD64 = "linux/amd64" + + +class ClientImageBuilder(base_image_builder.ImageBuilder): + """ + Client-side image building and upload to model registry. + + Requires prior installation and running of Docker. + + See https://docs.docker.com/engine/install/ for official installation instructions. + """ + + def __init__(self, *, id: str, image_repo: str, model_dir: str, use_gpu: bool = False) -> None: + """Initialization + + Args: + id: A hexadecimal string used for naming the image tag. + image_repo: Path to image repository. + model_dir: Path to model directory. + use_gpu: Boolean flag for generating the CPU or GPU base image. + """ + self.image_tag = "/".join([image_repo.rstrip("/"), id]) + self.image_repo = image_repo + self.model_dir = model_dir + self.use_gpu = use_gpu + self._docker_client = None + + @property + def docker_client(self) -> docker.DockerClient: + """Creates a Docker client object for interacting with the Docker daemon running on the local machine. + + Raises: + ConnectionError: Occurs when Docker is not installed or is not running. + + Returns: + A docker.DockerClient object representing the Docker client. + """ + if self._docker_client is None: + try: + self._docker_client = docker.from_env() + except docker.errors.DockerException: + raise ConnectionError( + "Failed to initialize Docker client. Please ensure Docker is installed and running." + ) + return self._docker_client + + def build_and_upload_image(self) -> None: + """ + Builds and uploads an image to the model registry. 
+ """ + self._build() + self._upload() + + def _build(self) -> None: + """ + Constructs the Docker context directory and then builds a Docker image based on that context. + """ + with tempfile.TemporaryDirectory() as context_dir: + dc = docker_context.DockerContext(context_dir=context_dir, model_dir=self.model_dir, use_gpu=self.use_gpu) + dc.build() + self._build_image_from_context(context_dir) + + def _build_image_from_context(self, context_dir: str, *, platform: Platform = Platform.LINUX_AMD64) -> None: + """Builds a Docker image based on provided context. + + Args: + context_dir: Path to context directory. + platform: Target platform for the build output, in the format "os[/arch[/variant]]". + """ + self.docker_client.images.build(path=context_dir, tag=self.image_tag, platform=platform) + + def _upload(self) -> None: + """ + Uploads image to image registry. + """ + pass diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py new file mode 100644 index 00000000..f1706891 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py @@ -0,0 +1,64 @@ +import docker +from absl.testing import absltest +from absl.testing.absltest import mock + +from snowflake.ml.model._deploy_client.image_builds import client_image_builder + + +class ClientImageBuilderTestCase(absltest.TestCase): + def setUp(self) -> None: + super().setUp() + self.idempotent_key = "mock_idempotent_key" + self.image_repo = "mock_image_repo" + self.model_dir = "mock_model_dir" + self.use_gpu = False + + self.client_image_builder = client_image_builder.ClientImageBuilder( + id=self.idempotent_key, + image_repo=self.image_repo, + model_dir=self.model_dir, + use_gpu=self.use_gpu, + ) + + @mock.patch("docker.from_env") # type: ignore + @mock.patch( + "snowflake.ml.model._deploy_client.image_builds.client_image_builder" ".docker_context.DockerContext" + ) # type: ignore + @mock.patch("tempfile.TemporaryDirectory") # type: ignore + def test_build(self, m_tempdir: mock.MagicMock, m_docker_context_class: mock.MagicMock, _) -> None: + m_docker_context = m_docker_context_class.return_value + m_context_dir = "mock_context_dir" + # Modify the m_tempdir mock to return the desired TemporaryDirectory object + m_tempdir.return_value.__enter__.return_value = m_context_dir + + with mock.patch.object(m_docker_context, "build") as m_build, mock.patch.object( + self.client_image_builder, "_build_image_from_context" + ) as m_build_image_from_context: + self.client_image_builder._build() + + m_docker_context_class.assert_called_once_with( + context_dir=m_context_dir, model_dir=self.model_dir, use_gpu=self.use_gpu + ) + m_build.assert_called_once() + m_build_image_from_context.assert_called_once() + + @mock.patch("docker.from_env") # type: ignore + def test_build_image_from_context_with_docker_daemon_running(self, m_docker_from_env: mock.MagicMock) -> None: + m_docker_client = m_docker_from_env.return_value + m_context_dir = "mock_context_dir" + m_docker_client.images.build.return_value = None + self.client_image_builder._build_image_from_context(context_dir=m_context_dir) + m_docker_client.images.build.assert_called_once_with( + path=m_context_dir, + tag=self.client_image_builder.image_tag, + platform=client_image_builder.Platform.LINUX_AMD64, + ) + + def test_build_image_from_context_without_docker_daemon_running(self) -> None: + with mock.patch("docker.from_env", side_effect=docker.errors.DockerException): 
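A usage sketch of the client-side image builder defined above. The id, repository path, and model directory are placeholders, and a running Docker daemon is assumed; without one, accessing the docker_client property raises ConnectionError as documented.

from snowflake.ml.model._deploy_client.image_builds import client_image_builder

# Placeholder values for illustration only.
builder = client_image_builder.ClientImageBuilder(
    id="0123456789abcdef0123456789abcdef",  # hex id supplied by the model registry
    image_repo="my-image-repo",             # placeholder repository path
    model_dir="/tmp/my_model",              # placeholder local model directory
    use_gpu=False,
)

# Builds the Docker context, builds the image locally, then hands off to _upload().
builder.build_and_upload_image()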
+ with self.assertRaises(ConnectionError): + self.client_image_builder._build_image_from_context(context_dir="dummy") + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py new file mode 100644 index 00000000..8c564936 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -0,0 +1,56 @@ +import os +import shutil +import string +from abc import ABC + + +class DockerContext(ABC): + """ + Constructs the Docker context directory required for image building. + """ + + def __init__(self, context_dir: str, model_dir: str, *, use_gpu: bool = False) -> None: + """Initialization + + Args: + context_dir: Path to context directory. + model_dir: Path to model directory. + use_gpu: Boolean flag for generating the CPU or GPU base image. + """ + self.context_dir = context_dir + self.model_dir = model_dir + # TODO(shchen): SNOW-825995, Define dockerfile template used for model deployment. use_gpu will be used. + self.use_gpu = use_gpu + + def build(self) -> None: + """ + Generates and/or moves resources into the Docker context directory. + """ + # TODO(shchen): SNOW-826705, Install SnowML wheel on the inference container + shutil.copytree(self.model_dir, "/".join([self.context_dir.rstrip("/"), os.path.basename(self.model_dir)])) + self._generate_docker_file() + self._generate_inference_code() + + def _generate_docker_file(self) -> None: + """ + Generates dockerfile based on dockerfile template. + """ + docker_file_path = os.path.join(self.context_dir, "Dockerfile") + docker_file_template = os.path.join(os.path.dirname(__file__), "templates/dockerfile_template") + + with open(docker_file_path, "w") as dockerfile, open(docker_file_template) as template: + dockerfile_content = string.Template(template.read()).safe_substitute() + dockerfile.write(dockerfile_content) + + def _generate_inference_code(self) -> None: + """ + Generates inference code based on the app template and creates a folder named 'server' to house the inference + server code. 
+ """ + server_dir = os.path.join(self.context_dir, "server") + os.makedirs(server_dir, exist_ok=True) + + app_file_path = os.path.join(server_dir, "app.py") + app_file_template = os.path.join(os.path.dirname(__file__), "templates/app_template") + with open(app_file_path, "w") as app_file, open(app_file_template) as template: + app_file.write(template.read()) diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py new file mode 100644 index 00000000..bbbdabcb --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py @@ -0,0 +1,33 @@ +import os +import shutil +import tempfile +import unittest + +from absl.testing import absltest + +from snowflake.ml.model._deploy_client.image_builds.docker_context import DockerContext + + +class DockerContextTest(absltest.TestCase): + def setUp(self) -> None: + self.context_dir = tempfile.mkdtemp() + self.model_dir = tempfile.mkdtemp() + self.use_gpu = False + self.docker_context = DockerContext(self.context_dir, model_dir=self.model_dir, use_gpu=False) + + def tearDown(self) -> None: + shutil.rmtree(self.model_dir) + shutil.rmtree(self.context_dir) + + def test_build(self) -> None: + expected_files = [os.path.basename(self.model_dir), "Dockerfile", "server"] + self.docker_context.build() + generated_files = os.listdir(self.context_dir) + self.assertCountEqual(expected_files, generated_files) + + actual_inference_files = os.listdir(os.path.join(self.context_dir, "server")) + self.assertCountEqual(["app.py"], actual_inference_files) + + +if __name__ == "__main__": + unittest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/app_template b/snowflake/ml/model/_deploy_client/image_builds/templates/app_template new file mode 100644 index 00000000..7872234c --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/app_template @@ -0,0 +1 @@ +# TODO(shchen), SNOW-825996, Define inference server code template used for model deployment diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template new file mode 100644 index 00000000..7fdf68ba --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template @@ -0,0 +1 @@ +# TODO(shchen): SNOW-825995, Define dockerfile template used for model deployment diff --git a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel index 7ee33ba4..22285028 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel @@ -2,13 +2,21 @@ load("//bazel:py_rules.bzl", "py_library", "py_test") package(default_visibility = ["//visibility:public"]) +py_library( + name = "deploy_options", + srcs = ["deploy_options.py"], + deps = [ + "//snowflake/ml/model/_deploy_client/utils:constants" + ] +) py_library( name = "deploy", srcs = ["deploy.py"], deps = [ - "//snowflake/ml/model/_deploy_client/docker:base_image_builder", - "//snowflake/ml/model/_deploy_client/docker:client_image_builder" + "//snowflake/ml/model/_deploy_client/image_builds:base_image_builder", + "//snowflake/ml/model/_deploy_client/image_builds:client_image_builder", + ":deploy_options" ] ) diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy.py b/snowflake/ml/model/_deploy_client/snowservice/deploy.py index 
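The Dockerfile generation above relies on string.Template. A local-only sketch of that step follows; the template text and placeholder name are made up, since the real template file is still a stub in this patch. safe_substitute() tolerates placeholders without values, which matches how the patch calls it with no arguments.

import os
import string
import tempfile

# Illustrative template text; the real content lives in templates/dockerfile_template.
template_text = "FROM $base_image\nCOPY . /app\n"

with tempfile.TemporaryDirectory() as context_dir:
    dockerfile_path = os.path.join(context_dir, "Dockerfile")
    content = string.Template(template_text).safe_substitute(base_image="python:3.8-slim")
    with open(dockerfile_path, "w") as dockerfile:
        dockerfile.write(content)
    with open(dockerfile_path) as f:
        print(f.read())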
7e8c5fe6..330f9b2d 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy.py @@ -1,15 +1,23 @@ from abc import ABC -from typing import Dict +from typing import Any, Dict, cast -from snowflake.ml.model._deploy_client.docker import ( +from typing_extensions import Unpack + +from snowflake.ml.model._deploy_client.image_builds import ( base_image_builder, client_image_builder, ) +from snowflake.ml.model._deploy_client.snowservice import deploy_options from snowflake.snowpark import Session def _deploy( - session: Session, *, model_id: str, service_func_name: str, model_dir: str, options: Dict[str, str] + session: Session, + *, + model_id: str, + service_func_name: str, + model_dir: str, + **kwargs: Unpack[deploy_options.SnowServiceDeployOptionsTypedHint], ) -> None: """Entrypoint for model deployment to SnowService. This function will trigger a docker image build followed by workflow deployment to SnowService. @@ -19,15 +27,23 @@ def _deploy( model_id: Unique hex string of length 32, provided by model registry. service_func_name: The service function name in SnowService associated with the created service. model_dir: Path to model directory. - options: various SnowService deployment options. + **kwargs: various SnowService deployment options. Raises: - ValueError: Raised when target method does not exist in model. + ValueError: Raised when model_id is empty. + ValueError: Raised when service_func_name is empty. + ValueError: Raised when model_dir is empty. """ if not model_id: - raise ValueError('Must provide a non-empty string for the "model_id" when deploying to SnowService') - - image_builder = client_image_builder.ClientImageBuilder() + raise ValueError('Must provide a non-empty string for "model_id" when deploying to SnowService') + if not service_func_name: + raise ValueError('Must provide a non-empty string for "service_func_name" when deploying to SnowService') + if not model_dir: + raise ValueError('Must provide a non-empty string for "model_dir" when deploying to SnowService') + options = deploy_options.SnowServiceDeployOptions.from_dict(cast(Dict[str, Any], kwargs)) + image_builder = client_image_builder.ClientImageBuilder( + id=model_id, image_repo=options.image_repo, model_dir=model_dir + ) ss_deployment = SnowServiceDeployment( session=session, model_id=model_id, @@ -42,36 +58,42 @@ def _deploy( class SnowServiceDeployment(ABC): """ Class implementation that encapsulates image build and workflow deployment to SnowService + + #TODO[shchen], SNOW-830093 GPU support on model deployment to SnowService """ def __init__( self, session: Session, - *, model_id: str, service_func_name: str, model_dir: str, image_builder: base_image_builder.ImageBuilder, - options: Dict[str, str] + options: deploy_options.SnowServiceDeployOptions, ) -> None: - """Initialization. + """Initialization Args: session: Snowpark session model_id: Unique hex string of length 32, provided by model registry; if not provided, auto-generate one for - resource naming.The model_id serves as an idempotent key throughout the deployment workflow. + resource naming.The model_id serves as an idempotent key throughout the deployment workflow. service_func_name: The service function name in SnowService associated with the created service. model_dir: Path to model directory. image_builder: InferenceImageBuilder instance that handles image build and upload to image registry. - options: various SnowService deployment options. 
+ options: A SnowServiceDeployOptions object containing deployment options. """ + self.session = session - self.idempotent_key = model_id + self.id = model_id self.service_func_name = service_func_name self.model_dir = model_dir self.image_builder = image_builder self.options = options + self._stage_location = "/".join([options.stage.rstrip("/"), "models", self.id]) + self._service_name = f"service_{model_id}" + self._spec_file_location = "/".join([self._stage_location.rstrip("/"), f"{self.id}.yaml"]) + def deploy(self) -> None: """ This function triggers image build followed by workflow deployment to SnowService. diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py new file mode 100644 index 00000000..98032126 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py @@ -0,0 +1,80 @@ +from typing import Any, Dict, Optional, TypedDict + +from typing_extensions import NotRequired + +from snowflake.ml.model._deploy_client.utils import constants + + +class SnowServiceDeployOptionsTypedHint(TypedDict): + """Deployment options for deploying to SnowService. + + stage: the name of the stage for uploading artifacts. + compute_pool: SnowService compute pool name. + image_repo: SnowService image repo path. e.g. "///" + min_instances: Minimum number of service replicas. + max_instances: Maximum number of service replicas. + endpoint: The specific name of the endpoint that the service function will communicate with. Default to + "predict". This option is useful when service has multiple endpoints. + overridden_base_image: When provided, it will override the base image. + """ + + stage: str + compute_pool: str + image_repo: str + min_instances: NotRequired[int] + max_instances: NotRequired[int] + endpoint: NotRequired[str] + overridden_base_image: NotRequired[str] + + +class SnowServiceDeployOptions: + def __init__( + self, + stage: str, + compute_pool: str, + image_repo: str, + *, + min_instances: int = 1, + max_instances: int = 1, + endpoint: str = constants.PREDICT_ENDPOINT, + overridden_base_image: Optional[str] = None, + ) -> None: + """Initialization + + Args: + stage: the name of the stage for uploading artifacts. + compute_pool: SnowService compute pool name. + image_repo: SnowService image repo path. e.g. "///" + min_instances: Minimum number of service replicas. + max_instances: Maximum number of service replicas. + endpoint: The specific name of the endpoint that the service function will communicate with. Default to + "predict". This option is useful when service has multiple endpoints. + overridden_base_image: When provided, it will override the base image. + """ + + self.stage = stage + self.compute_pool = compute_pool + self.image_repo = image_repo + self.min_instances = min_instances + self.max_instances = max_instances + self.endpoint = endpoint + self.overridden_base_image = overridden_base_image + + @classmethod + def from_dict(cls, options_dict: Dict[str, Any]) -> "SnowServiceDeployOptions": + """Construct SnowServiceDeployOptions instance based from an option dictionary. + + Args: + options_dict: The dict containing various deployment options. + + Raises: + ValueError: When required option is missing. 
+ + Returns: + A SnowServiceDeployOptions object + """ + required_options = [constants.STAGE, constants.COMPUTE_POOL, constants.IMAGE_REPO] + missing_keys = [key for key in required_options if options_dict.get(key) is None] + if missing_keys: + raise ValueError(f"Must provide options when deploying to SnowService: {', '.join(missing_keys)}") + return cls(**options_dict) diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py index 9a353ece..8d35582d 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py @@ -1,9 +1,10 @@ -from typing import Dict, cast +from typing import Any, Dict, cast from absl.testing import absltest from absl.testing.absltest import mock -from snowflake.ml.model._deploy_client.docker import client_image_builder +from snowflake.ml.model._deploy_client.image_builds import client_image_builder +from snowflake.ml.model._deploy_client.snowservice import deploy_options from snowflake.ml.model._deploy_client.snowservice.deploy import ( SnowServiceDeployment, _deploy, @@ -16,6 +17,11 @@ class DeployTestCase(absltest.TestCase): def setUp(self) -> None: super().setUp() self.m_session = cast(session.Session, mock_session.MockSession(conn=None, test_case=self)) + self.options: Dict[str, Any] = { + "stage": "mock_stage", + "compute_pool": "mock_compute_pool", + "image_repo": "mock_image_repo", + } @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock) -> None: @@ -26,7 +32,7 @@ def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock) -> None: model_id="provided_model_id", service_func_name="mock_service_func", model_dir="mock_model_dir", - options={}, + **self.options, ) m_deployment_class.assert_called_once_with( @@ -35,7 +41,7 @@ def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock) -> None: service_func_name="mock_service_func", model_dir="mock_model_dir", image_builder=mock.ANY, - options={}, + options=mock.ANY, ) m_deployment.deploy.assert_called_once() @@ -47,7 +53,41 @@ def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> service_func_name="mock_service_func", model_id="", model_dir="mock_model_dir", - options={}, + **self.options, + ) + + m_deployment_class.assert_not_called() + + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore + def test_deploy_with_missing_required_options(self, m_deployment_class: mock.MagicMock) -> None: + with self.assertRaisesRegex(ValueError, "stage, image_repo"): + options: Dict[str, Any] = {"compute_pool": "mock_compute_pool"} + _deploy( + session=self.m_session, + service_func_name="mock_service_func", + model_id="mock_model_id", + model_dir="mock_model_dir", + **options, + ) + + with self.assertRaisesRegex(ValueError, "stage"): + options = {"compute_pool": "mock_compute_pool", "image_repo": "mock_image_repo"} + _deploy( + session=self.m_session, + service_func_name="mock_service_func", + model_id="mock_model_id", + model_dir="mock_model_dir", + **options, + ) + + with self.assertRaisesRegex(ValueError, "image_repo"): + options = {"stage": "mock_stage", "compute_pool": "mock_compute_pool"} + _deploy( + session=self.m_session, + service_func_name="mock_service_func", + model_id="mock_model_id", + model_dir="mock_model_dir", + **options, ) 
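A usage sketch of the options object defined above: stage, compute_pool, and image_repo are required, everything else falls back to the constructor defaults. The option values are placeholders.

from snowflake.ml.model._deploy_client.snowservice import deploy_options

opts = deploy_options.SnowServiceDeployOptions.from_dict(
    {
        "stage": "MY_STAGE",                # placeholder names
        "compute_pool": "MY_COMPUTE_POOL",
        "image_repo": "my-image-repo",
        "max_instances": 2,
    }
)
assert opts.endpoint == "predict"  # default taken from constants.PREDICT_ENDPOINT

# Omitting a required key raises ValueError naming the missing option(s).
try:
    deploy_options.SnowServiceDeployOptions.from_dict({"compute_pool": "MY_COMPUTE_POOL"})
except ValueError as e:
    print(e)  # "Must provide options when deploying to SnowService: stage, image_repo"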
m_deployment_class.assert_not_called() @@ -61,7 +101,11 @@ def setUp(self) -> None: self.m_model_id = "provided_model_id" self.m_service_func_name = "provided_service_func_name" self.m_model_dir = "provided_model_dir" - self.m_options: Dict[str, str] = {} + self.m_options = { + "stage": "mock_stage", + "compute_pool": "mock_compute_pool", + "image_repo": "mock_image_repo", + } self.deployment = SnowServiceDeployment( self.m_session, @@ -69,7 +113,7 @@ def setUp(self) -> None: service_func_name=self.m_service_func_name, model_dir=self.m_model_dir, image_builder=self.m_image_builder, - options=self.m_options, + options=deploy_options.SnowServiceDeployOptions.from_dict(self.m_options), ) def test_deploy(self) -> None: diff --git a/snowflake/ml/model/_deploy_client/utils/BUILD.bazel b/snowflake/ml/model/_deploy_client/utils/BUILD.bazel new file mode 100644 index 00000000..47207297 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/utils/BUILD.bazel @@ -0,0 +1,25 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "constants", + srcs = ["constants.py"] +) + +py_library( + name = "snowservice_client", + srcs = ["snowservice_client.py"], + deps = [ + ":constants" + ] +) + +py_test( + name = "snowservice_client_test", + srcs = ["snowservice_client_test.py"], + deps = [ + ":snowservice_client", + "//snowflake/ml/test_utils:mock_session", + ] +) diff --git a/snowflake/ml/model/_deploy_client/utils/constants.py b/snowflake/ml/model/_deploy_client/utils/constants.py new file mode 100644 index 00000000..55b51929 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/utils/constants.py @@ -0,0 +1,40 @@ +from enum import Enum + + +class ResourceType(Enum): + SERVICE = "service" + JOB = "job" + + +""" +Potential SnowService status based on existing ResourceSetStatus proto: + +github.com/snowflakedb/snowflake/blob/main/GlobalServices/src/main/protobuf/snowservices_resourceset_reconciler.proto +""" + + +class ResourceStatus(Enum): + UNKNOWN = "UNKNOWN" # status is unknown because we have not received enough data from K8s yet. + PENDING = "PENDING" # resource set is being created, can't be used yet + READY = "READY" # resource set has been deployed. + DELETING = "DELETING" # resource set is being deleted + FAILED = "FAILED" # resource set has failed and cannot be used anymore + DONE = "DONE" # resource set has finished running + NOT_FOUND = "NOT_FOUND" # not found or deleted + INTERNAL_ERROR = "INTERNAL_ERROR" # there was an internal service error. + + +RESOURCE_TO_STATUS_FUNCTION_MAPPING = { + ResourceType.SERVICE: "SYSTEM$GET_SNOWSERVICE_STATUS", + ResourceType.JOB: "SYSTEM$GET_JOB_STATUS", +} + +PREDICT_ENDPOINT = "predict" +STAGE = "stage" +COMPUTE_POOL = "compute_pool" +IMAGE_REPO = "image_repo" +MIN_INSTANCES = "min_instances" +MAX_INSTANCES = "max_instances" +GPU_COUNT = "gpu" +OVERRIDDEN_BASE_IMAGE = "image" +ENDPOINT = "endpoint" diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py new file mode 100644 index 00000000..e86066a7 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py @@ -0,0 +1,145 @@ +import json +import time +from typing import Optional + +from snowflake.ml.model._deploy_client.utils import constants +from snowflake.snowpark import Session + + +class SnowServiceClient: + """ + SnowService client implementation: a Python wrapper for SnowService SQL queries. 
+ """ + + def __init__(self, session: Session) -> None: + """Initialization + + Args: + session: Snowpark session + """ + self.session = session + + def create_or_replace_service( + self, + service_name: str, + compute_pool: str, + spec_stage_location: str, + *, + min_instances: int = 1, + max_instances: int = 1, + ) -> None: + """Create or replace service. Since SnowService doesn't support the CREATE OR REPLACE service syntax, we will + first attempt to drop the service if it exists, and then create the service. Please note that this approach may + have side effects due to the lack of transaction support. + + Args: + service_name: Name of the service. + min_instances: Minimum number of service replicas. + max_instances: Maximum number of service replicas. + compute_pool: Name of the compute pool. + spec_stage_location: Stage path for the service spec. + """ + self._drop_service_if_exists(service_name) + sql = f""" + CREATE SERVICE {service_name} + MIN_INSTANCES={min_instances} + MAX_INSTANCES={max_instances} + COMPUTE_POOL={compute_pool} + SPEC=@{spec_stage_location} + """ + self.session.sql(sql).collect() + + def _drop_service_if_exists(self, service_name: str) -> None: + """Drop service if it already exists. + + Args: + service_name: Name of the service. + """ + self.session.sql(f"DROP SERVICE IF EXISTS {service_name}").collect() + + def create_or_replace_service_function( + self, service_func_name: str, service_name: str, *, endpoint_name: str = constants.PREDICT_ENDPOINT + ) -> None: + """Create or replace service function. + + Args: + service_func_name: Name of the service function. + service_name: Name of the service. + endpoint_name: Name of the service endpoint. + """ + sql = f""" + CREATE OR REPLACE FUNCTION {service_func_name}(input OBJECT) + RETURNS OBJECT + SERVICE={service_name} + ENDPOINT={endpoint_name} + AS '{endpoint_name}' + """ + self.session.sql(sql).collect() + + def block_until_resource_is_ready( + self, + resource_name: str, + resource_type: constants.ResourceType, + *, + max_retries: int = 60, + retry_interval_secs: int = 5, + ) -> None: + """Blocks execution until the specified resource is ready. + Note that this is a best-effort approach because when launching a service, it's possible for it to initially + fail due to a system error. However, SnowService may automatically retry and recover the service, leading to + potential false-negative information. + + Args: + resource_name: Name of the resource. + resource_type: Type of the resource. + max_retries: The maximum number of retries to check the resource readiness (default: 60). + retry_interval_secs: The number of seconds to wait between each retry (default: 5). + + Raises: + RuntimeError: If the resource received the following status [failed, not_found, internal_error, deleting] + RuntimeError: If the resource does not reach the ready/done state within the specified number of retries. 
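A usage sketch of the SnowService client above. The session is assumed to be an active Snowpark session with SnowService access; the service name, compute pool, stage path, and function name are placeholders.

from snowflake.ml.model._deploy_client.utils import constants, snowservice_client
from snowflake.snowpark import Session

def deploy_service(session: Session) -> None:
    client = snowservice_client.SnowServiceClient(session)

    # Issues DROP SERVICE IF EXISTS ... followed by CREATE SERVICE ... SPEC=@<stage path>.
    client.create_or_replace_service(
        service_name="service_0123abcd",                                 # placeholder
        compute_pool="MY_COMPUTE_POOL",                                  # placeholder
        spec_stage_location="MY_STAGE/models/0123abcd/0123abcd.yaml",    # placeholder
        min_instances=1,
        max_instances=1,
    )

    # Exposes the service's "predict" endpoint through a SQL function.
    client.create_or_replace_service_function(
        service_func_name="my_predict_func",                             # placeholder
        service_name="service_0123abcd",
    )

    # Best-effort wait until the service reports READY (or DONE).
    client.block_until_resource_is_ready("service_0123abcd", constants.ResourceType.SERVICE)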
+ """ + for _ in range(max_retries): + status = self.get_resource_status(resource_name=resource_name, resource_type=resource_type) + if status in [constants.ResourceStatus.READY, constants.ResourceStatus.DONE]: + return + elif status in [ + constants.ResourceStatus.FAILED, + constants.ResourceStatus.NOT_FOUND, + constants.ResourceStatus.INTERNAL_ERROR, + constants.ResourceStatus.DELETING, + ]: + # TODO(shchen): SNOW-830453, support GET_SNOWSERVICE_LOGS to show errors message when deployment failed + raise RuntimeError(f"{resource_type} {resource_name} failed.") + time.sleep(retry_interval_secs) + + raise RuntimeError("Resource never reached the ready/done state.") + + def get_resource_status( + self, resource_name: str, resource_type: constants.ResourceType + ) -> Optional[constants.ResourceStatus]: + """Get resource status. + + Args: + resource_name: Name of the resource. + resource_type: Type of the resource. + + Raises: + ValueError: If resource type does not have a corresponding system function for querying status. + RuntimeError: If corresponding status call failed. + + Returns: + Optional[constants.ResourceStatus]: The status of the resource, or None if the resource status is empty. + """ + if resource_type not in constants.RESOURCE_TO_STATUS_FUNCTION_MAPPING: + raise ValueError(f"Status querying is not supported for resources of type '{resource_type}'.") + status_func = constants.RESOURCE_TO_STATUS_FUNCTION_MAPPING[resource_type] + try: + row = self.session.sql(f"CALL {status_func}('{resource_name}');").collect() + except Exception as e: + raise RuntimeError(f"Error while querying the {resource_type} {resource_name} status: {str(e)}") + resource_metadata = json.loads(row[0][status_func])[0] + if resource_metadata and resource_metadata["status"]: + res: constants.ResourceStatus = resource_metadata["status"] + return res + return None diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py new file mode 100644 index 00000000..83d7c33b --- /dev/null +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py @@ -0,0 +1,164 @@ +import json +from typing import cast + +from absl.testing import absltest +from absl.testing.absltest import mock + +from snowflake import snowpark +from snowflake.ml.model._deploy_client.utils import constants +from snowflake.ml.model._deploy_client.utils.snowservice_client import SnowServiceClient +from snowflake.ml.test_utils import mock_data_frame, mock_session +from snowflake.snowpark import session + + +class SnowServiceClientTest(absltest.TestCase): + def setUp(self) -> None: + super().setUp() + self.m_session = mock_session.MockSession(conn=None, test_case=self) + self.client = SnowServiceClient(cast(session.Session, self.m_session)) + self.m_service_name = "mock_service_name" + + def test_create_or_replace_service(self) -> None: + m_min_instances = 1 + m_max_instances = 2 + m_compute_pool = "mock_compute_pool" + m_spec_storgae_location = "mock_spec_storage_location" + + self.m_session.add_mock_sql( + query="drop service if exists mock_service_name", result=mock_data_frame.MockDataFrame(collect_result=[]) + ) + + self.m_session.add_mock_sql( + query="create service mock_service_name" + " min_instances=1" + " max_instances=2" + " compute_pool=mock_compute_pool" + " spec=@mock_spec_storage_location", + result=mock_data_frame.MockDataFrame(collect_result=[]), + ) + + self.client.create_or_replace_service( + service_name=self.m_service_name, + 
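The status system function returns a JSON array of per-container records, and get_resource_status above reads the "status" field of the first record. The sketch below mirrors that parsing outside Snowflake; the sample payload is illustrative and follows the shape used in the tests.

import json

# Shape of a value returned by CALL SYSTEM$GET_SNOWSERVICE_STATUS('<service>'); sample values only.
status_func = "SYSTEM$GET_SNOWSERVICE_STATUS"
row = {
    status_func: json.dumps(
        [{"status": "READY", "message": "Running", "containerName": "inference-server"}]
    )
}

resource_metadata = json.loads(row[status_func])[0]
status = resource_metadata["status"] if resource_metadata and resource_metadata["status"] else None
print(status)  # READY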
min_instances=m_min_instances, + max_instances=m_max_instances, + compute_pool=m_compute_pool, + spec_stage_location=m_spec_storgae_location, + ) + + def test_create_service_function(self) -> None: + m_service_func_name = "mock_service_func_name" + m_service_name = "mock_service_name" + m_endpoint_name = "mock_endpoint_name" + + self.m_session.add_mock_sql( + query="create or replace function mock_service_func_name(input OBJECT)" + " returns OBJECT" + " service=mock_service_name" + " endpoint=mock_endpoint_name" + " as '/predict'", + result=mock_data_frame.MockDataFrame(collect_result=[]), + ) + + self.client.create_or_replace_service_function( + service_func_name=m_service_func_name, service_name=m_service_name, endpoint_name=m_endpoint_name + ) + + def test_get_service_status(self) -> None: + row = snowpark.Row( + **{ + "SYSTEM$GET_SNOWSERVICE_STATUS": json.dumps( + [ + { + "status": "READY", + "message": "Running", + "containerName": "inference-server", + "instanceId": "0", + "serviceName": "SERVICE_DFC46DE9CEC441B2A3185266C11E79BA", + "image": "image", + "restartCount": 0, + } + ] + ) + } + ) + self.m_session.add_mock_sql( + query="call system$get_snowservice_status('mock_service_name');", + result=mock_data_frame.MockDataFrame(collect_result=[row]), + ) + + self.assertEqual(self.client.get_resource_status(self.m_service_name, constants.ResourceType.SERVICE), "READY") + + row = snowpark.Row( + **{ + "SYSTEM$GET_SNOWSERVICE_STATUS": json.dumps( + [ + { + "status": "FAILED", + "message": "Running", + "containerName": "inference-server", + "instanceId": "0", + "serviceName": "SERVICE_DFC46DE9CEC441B2A3185266C11E79BA", + "image": "image", + "restartCount": 0, + } + ] + ) + } + ) + self.m_session.add_mock_sql( + query="call system$get_snowservice_status('mock_service_name');", + result=mock_data_frame.MockDataFrame(collect_result=[row]), + ) + + self.assertEqual(self.client.get_resource_status(self.m_service_name, constants.ResourceType.SERVICE), "FAILED") + + row = snowpark.Row( + **{ + "SYSTEM$GET_SNOWSERVICE_STATUS": json.dumps( + [ + { + "status": "", + "message": "Running", + "containerName": "inference-server", + "instanceId": "0", + "serviceName": "SERVICE_DFC46DE9CEC441B2A3185266C11E79BA", + "image": "image", + "restartCount": 0, + } + ] + ) + } + ) + self.m_session.add_mock_sql( + query="call system$get_snowservice_status('mock_service_name');", + result=mock_data_frame.MockDataFrame(collect_result=[row]), + ) + self.assertEqual(self.client.get_resource_status(self.m_service_name, constants.ResourceType.SERVICE), None) + + def test_block_until_service_is_ready_happy_path(self) -> None: + with mock.patch.object(self.client, "get_resource_status", return_value="READY"): + self.client.block_until_resource_is_ready( + self.m_service_name, constants.ResourceType.SERVICE, max_retries=1, retry_interval_secs=1 + ) + + def test_block_until_service_is_ready_timeout(self) -> None: + with self.assertRaises(RuntimeError): + with mock.patch.object(self.client, "get_resource_status", side_effect=[None, None, None, "READY"]): + self.client.block_until_resource_is_ready( + self.m_service_name, constants.ResourceType.SERVICE, max_retries=1, retry_interval_secs=1 + ) + + def test_block_until_service_is_ready_retries_and_ready(self) -> None: + # Service becomes ready on 2nd retry. 
+ with mock.patch.object(self.client, "get_resource_status", side_effect=[None, "READY"]): + self.client.block_until_resource_is_ready( + self.m_service_name, constants.ResourceType.SERVICE, max_retries=2, retry_interval_secs=1 + ) + + def test_block_until_service_is_ready_retries_and_fail(self) -> None: + # Service show failure status on 2nd retry. + with self.assertRaises(RuntimeError): + with mock.patch.object(self.client, "get_resource_status", side_effect=[None, "FAILED"]): + self.client.block_until_resource_is_ready( + self.m_service_name, constants.ResourceType.SERVICE, max_retries=2, retry_interval_secs=1 + ) diff --git a/snowflake/ml/model/_deployer.py b/snowflake/ml/model/_deployer.py index 93ba6e41..dd402268 100644 --- a/snowflake/ml/model/_deployer.py +++ b/snowflake/ml/model/_deployer.py @@ -1,8 +1,9 @@ -import os +import json from abc import ABC, abstractmethod from enum import Enum from typing import Dict, List, Optional, TypedDict, Union, overload +import numpy as np import pandas as pd from typing_extensions import Required @@ -10,7 +11,6 @@ from snowflake.ml.model import _udf_util, model_signature, type_hints as model_types from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session, functions as F from snowflake.snowpark._internal import type_utils -from snowflake.snowpark._internal.analyzer import analyzer_utils class TargetPlatform(Enum): @@ -163,33 +163,84 @@ def __init__(self, session: Session, manager: DeploymentManager) -> None: self._manager = manager self._session = session + @overload def create_deployment( self, + *, name: str, model_dir_path: str, platform: TargetPlatform, target_method: str, options: Optional[model_types.DeployOptions], ) -> Optional[Deployment]: - """Create a deployment and deploy it to remote platform. + """Create a deployment from a model in a local directory and deploy it to remote platform. Args: name: Name of the deployment for the model. + platform: Target platform to deploy the model. + target_method: The name of the target method to be deployed. model_dir_path: Directory of the model. + options: Additional options when deploying the model. + Each target platform will have their own specifications of options. + """ + ... + + @overload + def create_deployment( + self, + *, + name: str, + platform: TargetPlatform, + target_method: str, + model_stage_file_path: str, + options: Optional[model_types.DeployOptions], + ) -> Optional[Deployment]: + """Create a deployment from a model in a zip file in a stage and deploy it to remote platform. + + Args: + name: Name of the deployment for the model. + platform: Target platform to deploy the model. + target_method: The name of the target method to be deployed. + model_stage_file_path: Model file in the stage to be deployed. Must be a file with .zip extension. + options: Additional options when deploying the model. + Each target platform will have their own specifications of options. + """ + ... + + def create_deployment( + self, + *, + name: str, + platform: TargetPlatform, + target_method: str, + model_dir_path: Optional[str] = None, + model_stage_file_path: Optional[str] = None, + options: Optional[model_types.DeployOptions], + ) -> Optional[Deployment]: + """Create a deployment from a model and deploy it to remote platform. + + Args: + name: Name of the deployment for the model. platform: Target platform to deploy the model. target_method: The name of the target method to be deployed. + model_dir_path: Directory of the model. Exclusive with `model_stage_dir_path`. 
+ model_stage_file_path: Model file in the stage to be deployed. Exclusive with `model_dir_path`. + Must be a file with .zip extension. options: Additional options when deploying the model. Each target platform will have their own specifications of options. Raises: - ValueError: Raised when the target platform is unavailable. RuntimeError: Raised when running into issues when deploying. ValueError: Raised when target method does not exist in model. Returns: The deployment information. """ - model_dir_path = os.path.normpath(model_dir_path) + if not ((model_stage_file_path is None) ^ (model_dir_path is None)): + raise ValueError( + "model_dir_path and model_stage_file_path both cannot be " + + f"{'None' if model_stage_file_path is None else 'specified'} at the same time." + ) is_success = False error_msg = "" @@ -203,6 +254,7 @@ def create_deployment( meta = _udf_util._deploy_to_warehouse( self._session, model_dir_path=model_dir_path, + model_stage_file_path=model_stage_file_path, udf_name=name, target_method=target_method, **options, @@ -284,15 +336,23 @@ def predict( Raises: ValueError: Raised when the deployment does not exist. ValueError: Raised when the input is too large to use keep_order option. + NotImplementedError: FeatureGroupSpec is not supported. Returns: The output dataframe. """ + # Initialize inference d = self.get_deployment(name) if not d: raise ValueError(f"Deployment {name} does not exist.") + + # Get options + INTERMEDIATE_OBJ_NAME = "tmp_result" sig = d["signature"] keep_order = d["options"].get("keep_order", True) + output_with_input_features = d["options"].get("output_with_input_features", False) + + # Validate and prepare input if not isinstance(X, SnowparkDataFrame): df = model_signature._convert_and_validate_local_data(X, sig.inputs) s_df = self._session.create_dataframe(df) @@ -304,45 +364,55 @@ def predict( # ID is UINT64 type, this we should limit. if s_df.count() > 2**64: raise ValueError("Unable to keep order of a DataFrame with more than 2 ** 64 rows.") - s_df = s_df.with_column("_ID", F.monotonically_increasing_id()) + s_df = s_df.with_column(_udf_util._KEEP_ORDER_COL_NAME, F.monotonically_increasing_id()) - cols = [] + # Infer and get intermediate result + input_cols = [] for col_name in s_df.columns: - literal_col_name = identifier.remove_and_unescape_quote_if_quoted(col_name) - cols.extend( + literal_col_name = identifier.get_unescaped_names(col_name) + input_cols.extend( [ type_utils.ColumnOrName(F.lit(type_utils.LiteralType(literal_col_name))), type_utils.ColumnOrName(F.col(col_name)), ] ) + output_obj = F.call_udf(name, type_utils.ColumnOrLiteral(F.object_construct(*input_cols))) + if output_with_input_features: + df_res = s_df.with_column(INTERMEDIATE_OBJ_NAME, output_obj) + else: + df_res = s_df.select(output_obj.alias(INTERMEDIATE_OBJ_NAME)) + if keep_order: + df_res = df_res.order_by(F.col(INTERMEDIATE_OBJ_NAME)[_udf_util._KEEP_ORDER_COL_NAME], ascending=True) + if output_with_input_features: + df_res = df_res.drop(_udf_util._KEEP_ORDER_COL_NAME) + + # Prepare the output output_cols = [] for output_feature in sig.outputs: - # To avoid automatic upper-case convert, we quoted the result name. 
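To make the input wiring of predict() concrete: column names and values are interleaved into a single OBJECT argument for the UDF, and an extra monotonically increasing id column is added first when row order must be preserved. This is a simplified sketch; it omits the identifier unescaping the patch performs on quoted column names, and the column and UDF names are placeholders.

from snowflake.snowpark import DataFrame, functions as F

def build_udf_input(s_df: DataFrame, udf_name: str, keep_order_col: str = "_ID") -> DataFrame:
    """Sketch of the predict() input path: one OBJECT per row passed to the model UDF."""
    s_df = s_df.with_column(keep_order_col, F.monotonically_increasing_id())

    input_cols = []
    for col_name in s_df.columns:
        # The patch additionally unescapes quoted identifiers before using them as keys.
        input_cols.extend([F.lit(col_name), F.col(col_name)])

    # One OBJECT per row, e.g. {"COL_A": ..., "COL_B": ..., "_ID": ...}.
    return s_df.select(F.call_udf(udf_name, F.object_construct(*input_cols)).alias("TMP_RESULT"))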
output_cols.append( - F.parse_json(type_utils.ColumnOrName(F.col("tmp_result")[output_feature.name])) - .astype(output_feature.as_snowpark_type()) - .alias(analyzer_utils.quote_name_without_upper_casing(output_feature.name)) + F.col(INTERMEDIATE_OBJ_NAME)[output_feature.name].astype(output_feature.as_snowpark_type()) ) - df_res = s_df.select( - F.call_udf(name, type_utils.ColumnOrLiteral(F.object_construct(*cols))).alias("tmp_result") - ) - - if keep_order: - df_res = df_res.order_by(F.col("_ID"), ascending=True) - - df_res = df_res.select(*output_cols) + df_res = df_res.with_columns( + [identifier.quote_name_without_upper_casing(output_feature.name) for output_feature in sig.outputs], + output_cols, + ).drop(INTERMEDIATE_OBJ_NAME) + # Get final result if not isinstance(X, SnowparkDataFrame): dtype_map = {} for feature in sig.outputs: - if isinstance(feature, model_signature.FeatureSpec): - dtype_map[feature.name] = feature._dtype._value - elif isinstance(feature, model_signature.FeatureGroupSpec): - for ft in feature._specs: - dtype_map[ft.name] = ft._dtype._value - df_local = df_res.to_pandas().astype(dtype=dtype_map) + if isinstance(feature, model_signature.FeatureGroupSpec): + raise NotImplementedError("FeatureGroupSpec is not supported.") + assert isinstance(feature, model_signature.FeatureSpec), "Invalid feature kind." + dtype_map[feature.name] = feature.as_dtype() + df_local = df_res.to_pandas() + # This is because Array and object will generate variant type and requires an additional loads to + # get correct data otherwise it would be string. + for col_name in [col_name for col_name, col_dtype in dtype_map.items() if col_dtype == np.object0]: + df_local[col_name] = df_local[col_name].map(json.loads) + df_local = df_local.astype(dtype=dtype_map) return pd.DataFrame(df_local) else: return df_res diff --git a/snowflake/ml/model/_handlers/BUILD.bazel b/snowflake/ml/model/_handlers/BUILD.bazel index 5e098f7d..c4a67ddf 100644 --- a/snowflake/ml/model/_handlers/BUILD.bazel +++ b/snowflake/ml/model/_handlers/BUILD.bazel @@ -46,7 +46,7 @@ py_library( "//snowflake/ml/model:custom_model", "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) diff --git a/snowflake/ml/model/_handlers/snowmlmodel.py b/snowflake/ml/model/_handlers/snowmlmodel.py index a61454b3..df17354a 100644 --- a/snowflake/ml/model/_handlers/snowmlmodel.py +++ b/snowflake/ml/model/_handlers/snowmlmodel.py @@ -16,14 +16,14 @@ from snowflake.ml.model._handlers import _base if TYPE_CHECKING: - from snowflake.ml.framework.base import BaseEstimator + from snowflake.ml.sklearn.framework.base import BaseEstimator class _SnowMLModelHandler(_base._ModelHandler["BaseEstimator"]): """Handler for SnowML based model. - Currently snowflake.ml.framework.base.BaseEstimator - and snowflake.ml.framework.pipeline.Pipeline based classes are supported. + Currently snowflake.ml.sklearn.framework.base.BaseEstimator + and snowflake.ml.sklearn.framework.pipeline.Pipeline based classes are supported. 
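On the output side, the intermediate object is split back into typed columns, and any column whose target dtype is object (ARRAY/OBJECT results come back from to_pandas() as JSON strings) is decoded with json.loads before the final astype. A local-only sketch of that post-processing with made-up column names:

import json
import numpy as np
import pandas as pd

# Pretend this came back from df_res.to_pandas(): the array-valued column is a JSON string.
df_local = pd.DataFrame({"SCORE": [0.5, 0.75], "EMBEDDING": ["[1, 2]", "[3, 4]"]})
dtype_map = {"SCORE": np.float64, "EMBEDDING": np.object_}

# Variant-typed outputs need an explicit json.loads pass to become real Python objects.
for col_name in [c for c, dt in dtype_map.items() if dt == np.object_]:
    df_local[col_name] = df_local[col_name].map(json.loads)
df_local = df_local.astype(dtype=dtype_map)
print(df_local["EMBEDDING"].iloc[0])  # [1, 2]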
""" handler_type = "snowml" @@ -34,7 +34,7 @@ def can_handle( model: model_types.SupportedModelType, ) -> TypeGuard["BaseEstimator"]: return ( - type_utils.LazyType("snowflake.ml.framework.base.BaseEstimator").isinstance(model) + type_utils.LazyType("snowflake.ml.sklearn.framework.base.BaseEstimator").isinstance(model) # Pipeline is inherited from BaseEstimator, so no need to add one more check ) and any( (hasattr(model, method) and callable(getattr(model, method, None))) @@ -45,7 +45,7 @@ def can_handle( def cast_model( model: model_types.SupportedModelType, ) -> "BaseEstimator": - from snowflake.ml.framework.base import BaseEstimator + from snowflake.ml.sklearn.framework.base import BaseEstimator assert isinstance(model, BaseEstimator) # Pipeline is inherited from BaseEstimator, so no need to add one more check @@ -62,7 +62,7 @@ def _save_model( is_sub_model: Optional[bool] = False, **kwargs: Unpack[model_types.SNOWModelSaveOptions], ) -> None: - from snowflake.ml.framework.base import BaseEstimator + from snowflake.ml.sklearn.framework.base import BaseEstimator assert isinstance(model, BaseEstimator) # Pipeline is inherited from BaseEstimator, so no need to add one more check @@ -123,7 +123,7 @@ def _load_model(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: m = cloudpickle.load(f) - from snowflake.ml.framework.base import BaseEstimator + from snowflake.ml.sklearn.framework.base import BaseEstimator assert isinstance(m, BaseEstimator) return m diff --git a/snowflake/ml/model/_model.py b/snowflake/ml/model/_model.py index 06bea59b..d11b58cf 100644 --- a/snowflake/ml/model/_model.py +++ b/snowflake/ml/model/_model.py @@ -1,8 +1,10 @@ import os +import tempfile +import warnings from types import ModuleType from typing import Dict, List, Literal, Optional, Tuple, Union, overload -from snowflake.ml.framework import base +from snowflake.ml._internal import file_utils from snowflake.ml.model import ( _env, _model_handler, @@ -11,6 +13,8 @@ model_signature, type_hints as model_types, ) +from snowflake.ml.sklearn.framework import base +from snowflake.snowpark import FileOperation, Session MODEL_BLOBS_DIR = "models" @@ -19,8 +23,8 @@ def save_model( *, name: str, - model_dir_path: str, model: base.BaseEstimator, + model_dir_path: str, metadata: Optional[Dict[str, str]] = None, conda_dependencies: Optional[List[str]] = None, pip_requirements: Optional[List[str]] = None, @@ -33,8 +37,8 @@ def save_model( Args: name: Name of the model. - model_dir_path: Directory to save the model. model: SnowML modeling model object. + model_dir_path: Directory to save the model. metadata: Model metadata. conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify a dependency. It is a recommended way to specify your dependencies using conda. When channel is not @@ -55,8 +59,8 @@ def save_model( def save_model( *, name: str, - model_dir_path: str, model: model_types.SupportedLocalModelType, + model_dir_path: str, signatures: Dict[str, model_signature.ModelSignature], metadata: Optional[Dict[str, str]] = None, conda_dependencies: Optional[List[str]] = None, @@ -66,12 +70,12 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a local model under `dir_path`. + """Save a local model with user provided signatures under `dir_path`. Args: name: Name of the model. 
- model_dir_path: Directory to save the model. model: Model object. + model_dir_path: Directory to save the model. signatures: Model data signatures for inputs and output for every target methods. metadata: Model metadata. conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify @@ -93,8 +97,8 @@ def save_model( def save_model( *, name: str, - model_dir_path: str, model: model_types.SupportedLocalModelType, + model_dir_path: str, sample_input: model_types.SupportedDataType, metadata: Optional[Dict[str, str]] = None, conda_dependencies: Optional[List[str]] = None, @@ -104,12 +108,134 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save a local model under `dir_path` with signature inferred from a local sample_input_data. + """Save a local model under `dir_path` with signature inferred from a sample_input_data. Args: name: Name of the model. + model: Model object. model_dir_path: Directory to save the model. + sample_input: Sample input data to infer the model signatures from. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + """ + ... + + +@overload +def save_model( + *, + name: str, + model: base.BaseEstimator, + session: Session, + model_stage_file_path: str, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> _model_meta.ModelMetadata: + """Save a SnowML modeling model to a zip file whose path is the provided stage file path. + + Args: + name: Name of the model. + model: SnowML modeling model object. + session: Snowpark connection session. + model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. + Must be a file with .zip extension. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. 
If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + """ + ... + + +@overload +def save_model( + *, + name: str, + model: model_types.SupportedLocalModelType, + session: Session, + model_stage_file_path: str, + signatures: Dict[str, model_signature.ModelSignature], + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> _model_meta.ModelMetadata: + """Save a local model with user provided signatures to a zip file whose path is the provided stage file path. + + Args: + name: Name of the model. + model: Model object. + session: Snowpark connection session. + model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. + Must be a file with .zip extension. + signatures: Model data signatures for inputs and output for every target methods. + metadata: Model metadata. + conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify + a dependency. It is a recommended way to specify your dependencies using conda. When channel is not + specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be + replaced with the Snowflake Anaconda channel. + pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip + requirements. + python_version: A string of python version where model is run. Used for user override. If specified as None, + current version would be captured. Defaults to None. + code_paths: Directory of code to import. + ext_modules: External modules that user might want to get pickled with model object. Defaults to None. + options: Model specific kwargs. + """ + ... + + +@overload +def save_model( + *, + name: str, + model: model_types.SupportedLocalModelType, + session: Session, + model_stage_file_path: str, + sample_input: model_types.SupportedDataType, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> _model_meta.ModelMetadata: + """Save a local model to a zip file whose path is the provided stage file path with signature inferred from a + sample_input_data. + + Args: + name: Name of the model. model: Model object. + session: Snowpark connection session. + model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. + Must be a file with .zip extension. sample_input: Sample input data to infer the model signatures from. metadata: Model metadata. conda_dependencies: List of Conda package specs. 
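For a plain local model (for example a scikit-learn estimator), the overload above infers the signature from `sample_input`. A minimal sketch, where the data, session, and stage path are illustrative:

import numpy as np
import pandas as pd
from sklearn import linear_model

from snowflake.ml.model import _model as model_api

df = pd.DataFrame(np.array([[1, 2, 3], [4, 2, 5]]), columns=["c1", "c2", "c3"])
lm = linear_model.LinearRegression().fit(df[["c1", "c2"]], df["c3"])

meta = model_api.save_model(
    name="sklearn_model",
    model=lm,
    session=session,  # assumed existing Snowpark session
    model_stage_file_path='@"db"."schema"."stage"/sklearn_model.zip',
    sample_input=df[["c1", "c2"]],  # signature is inferred from this sample
)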
Use "[channel::]package [operator version]" syntax to specify @@ -130,8 +256,10 @@ def save_model( def save_model( *, name: str, - model_dir_path: str, model: model_types.SupportedModelType, + session: Optional[Session] = None, + model_stage_file_path: Optional[str] = None, + model_dir_path: Optional[str] = None, signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, sample_input: Optional[model_types.SupportedDataType] = None, metadata: Optional[Dict[str, str]] = None, @@ -142,16 +270,22 @@ def save_model( code_paths: Optional[List[str]] = None, options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: - """Save the model under `dir_path`. + """Save the model. Args: name: Name of the model. - model_dir_path: Directory to save the model. model: Model object. + model_dir_path: Directory to save the model. Exclusive with `session` and `model_stage_file_path`. + session: Snowpark connection session. Needs to present with `model_stage_file_path`. + Exclusive with `model_dir_path`. + model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. + Needs to present with `session`. Exclusive with `model_dir_path`. Must be a file with .zip extension. signatures: Model data signatures for inputs and output for every target methods. If it is None, sample_input - would be used to infer the signatures. If not None, sample_input should not be specified. Defaults to None. - sample_input: Sample input data to infer the model signatures from. If it is None, signatures must be specified. - If not None, signatures should not be specified. Defaults to None. + would be used to infer the signatures if it is a local (non-SnowML modeling model). + If not None, sample_input should not be specified. Defaults to None. + sample_input: Sample input data to infer the model signatures from. If it is None, signatures must be specified + if it is a local (non-SnowML modeling model). If not None, signatures should not be specified. + Defaults to None. metadata: Model metadata. conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify a dependency. It is a recommended way to specify your dependencies using conda. When channel is not @@ -169,30 +303,117 @@ def save_model( Model metadata. Raises: - ValueError: Raised when the signatures and sample_input specified at the same time. - TypeError: Raised if model type is not supported. + ValueError: Raised when the session and model_stage_file_path not specified or not be None at the same time. + ValueError: Raised when the model_stage_file_path and model_dir_path specified at the same time. + ValueError: Raised when the signatures and sample_input specified at the same time, or not presented when + specifying local model. + ValueError: Raised when provided model directory is not a directory. + ValueError: Raised when provided model stage path is not a zip file. """ + if (session is None) ^ (model_stage_file_path is None): + raise ValueError( + "Session and model_stage_file_path must be " + + f"{'None' if session is None else 'specified'} at the same time." + ) - model_dir_path = os.path.normpath(model_dir_path) + if not ((model_stage_file_path is None) ^ (model_dir_path is None)): + raise ValueError( + "model_dir_path and model_stage_file_path both cannot be " + + f"{'None' if model_stage_file_path is None else 'specified'} at the same time." 
+ ) if ((signatures is None) and (sample_input is None) and not isinstance(model, base.BaseEstimator)) or ( (signatures is not None) and (sample_input is not None) ): raise ValueError( "Signatures and sample_input both cannot be " - + f"{'None' if signatures is None else 'specified'} at the same time." + + f"{'None for local model' if signatures is None else 'specified'} at the same time." ) if not options: options = {} + if model_dir_path: + if os.path.exists(model_dir_path): + if not os.path.isdir(model_dir_path): + raise ValueError(f"Provided model directory {model_dir_path} is not a directory.") + if os.listdir(model_dir_path): + warnings.warn( + f"Provided model directory {model_dir_path} is not an empty directory. Files might be overwritten.", + category=UserWarning, + ) + else: + os.makedirs(model_dir_path) + return _save( + name=name, + model=model, + local_dir_path=model_dir_path, + signatures=signatures, + sample_input=sample_input, + metadata=metadata, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + python_version=python_version, + ext_modules=ext_modules, + code_paths=code_paths, + options=options, + ) + + assert session and model_stage_file_path + if os.path.splitext(model_stage_file_path)[1] != ".zip": + raise ValueError("Provided model path in the stage {model_stage_file_path} must be a path to a zip file.") + + with tempfile.TemporaryDirectory() as temp_local_model_dir_path: + meta = _save( + name=name, + model=model, + local_dir_path=temp_local_model_dir_path, + signatures=signatures, + sample_input=sample_input, + metadata=metadata, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + python_version=python_version, + ext_modules=ext_modules, + code_paths=code_paths, + options=options, + ) + with file_utils.zip_file_or_directory_to_stream( + temp_local_model_dir_path, leading_path=temp_local_model_dir_path + ) as zf: + assert session and model_stage_file_path + fo = FileOperation(session=session) + fo.put_stream( + zf, + model_stage_file_path, + auto_compress=False, + overwrite=options.get("allow_overwritten_stage_file", False), + ) + return meta + + +def _save( + *, + name: str, + model: model_types.SupportedModelType, + local_dir_path: str, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + sample_input: Optional[model_types.SupportedDataType] = None, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, +) -> _model_meta.ModelMetadata: + local_dir_path = os.path.normpath(local_dir_path) + handler = _model_handler._find_handler(model) if handler is None: raise TypeError(f"{type(model)} is not supported.") - if not os.path.exists(model_dir_path): - os.makedirs(model_dir_path) with _model_meta._create_model_metadata( - model_dir_path=model_dir_path, + model_dir_path=local_dir_path, name=name, model_type=handler.handler_type, metadata=metadata, @@ -203,7 +424,7 @@ def save_model( pip_requirements=pip_requirements, python_version=python_version, ) as meta: - model_blobs_path = os.path.join(model_dir_path, MODEL_BLOBS_DIR) + model_blobs_path = os.path.join(local_dir_path, MODEL_BLOBS_DIR) os.makedirs(model_blobs_path, exist_ok=True) model = handler.cast_model(model) handler._save_model( @@ -215,40 +436,124 @@ def 
save_model( is_sub_model=False, **options, ) + return meta -# TODO(SNOW-786570): Allows path to be stage path. @overload def load_model( - model_dir_path: str, meta_only: Optional[Literal[False]] = None + *, model_dir_path: str, meta_only: Optional[Literal[False]] = None ) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: + """Load the model into memory from directory. + + Args: + model_dir_path: Directory containing the model. + meta_only: Flag to indicate that if only load metadata. + """ ... @overload -def load_model(model_dir_path: str, meta_only: Literal[True]) -> _model_meta.ModelMetadata: +def load_model(*, model_dir_path: str, meta_only: Literal[True]) -> _model_meta.ModelMetadata: + """Load the model into memory from directory with metadata only. + + Args: + model_dir_path: Directory containing the model. + meta_only: Flag to indicate that if only load metadata. + """ ... +@overload def load_model( - model_dir_path: str, meta_only: Optional[bool] = None + *, session: Session, model_stage_file_path: str, meta_only: Optional[Literal[False]] = None +) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: + """Load the model into memory from a zip file in the stage. + + Args: + session: Snowflake connection session. + model_stage_file_path: The path to zipped model file in the stage. Must be a file with .zip extension. + meta_only: Flag to indicate that if only load metadata. + """ + ... + + +@overload +def load_model(*, session: Session, model_stage_file_path: str, meta_only: Literal[True]) -> _model_meta.ModelMetadata: + """Load the model into memory from a zip file in the stage with metadata only. + + Args: + session: Snowflake connection session. + model_stage_file_path: The path to zipped model file in the stage. Must be a file with .zip extension. + meta_only: Flag to indicate that if only load metadata. + """ + ... + + +def load_model( + *, + session: Optional[Session] = None, + model_stage_file_path: Optional[str] = None, + model_dir_path: Optional[str] = None, + meta_only: Optional[bool] = None, ) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: - """Load the model into memory from directory. + """Load the model into memory from directory or a zip file in the stage. Args: - model_dir_path: Directory containing the model. + session: Snowflake connection session. Must be specified when specifying model_stage_file_path. + Exclusive with model_dir_path. + model_stage_file_path: The path to zipped model file in the stage. Must be specified when specifying session. + Exclusive with model_dir_path. Must be a file with .zip extension. + model_dir_path: Directory containing the model. Exclusive with session and model_stage_file_path. meta_only: Flag to indicate that if only load metadata. Raises: - TypeError: Raised if model is not native format. + ValueError: Raised when the session and model_stage_file_path not specified or not be None at the same time. + ValueError: Raised when the model_stage_file_path and model_dir_path specified at the same time. + ValueError: Raised if model directory does not exist. + ValueError: Raised if model directory is not a directory. + ValueError: Raised if model provided in the stage is not a zip file. Returns: A tuple containing the model object and the model metadata. 
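The two load paths documented above mirror the save paths. A minimal sketch, with the local directory, stage path, session, and alias as illustrative assumptions:

from snowflake.ml.model import _model as model_api

# Load the model object and its metadata from a local directory.
m, meta = model_api.load_model(model_dir_path="/tmp/models/my_model")

# Load only the metadata from a zipped model file in a stage.
meta_only = model_api.load_model(
    session=session,  # assumed existing Snowpark session
    model_stage_file_path='@"db"."schema"."stage"/my_model.zip',
    meta_only=True,
)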
""" - model_dir_path = os.path.normpath(model_dir_path) + if (session is None) ^ (model_stage_file_path is None): + raise ValueError( + "Session and model_stage_file_path must be " + + f"{'None' if session is None else 'specified'} at the same time." + ) - meta = _model_meta._load_model_metadata(model_dir_path) + if not ((model_stage_file_path is None) ^ (model_dir_path is None)): + raise ValueError( + "model_dir_path and model_stage_file_path both cannot be " + + f"{'None' if model_stage_file_path is None else 'specified'} at the same time." + ) + + if model_dir_path: + if not os.path.exists(model_dir_path): + raise ValueError(f"Provided model directory {model_dir_path} does not exist.") + if not os.path.isdir(model_dir_path): + raise ValueError(f"Provided model directory {model_dir_path} is not a directory.") + + return _load(local_dir_path=model_dir_path, meta_only=meta_only) + + assert session and model_stage_file_path + if os.path.splitext(model_stage_file_path)[1] != ".zip": + raise ValueError("Provided model path in the stage {model_stage_file_path} must be a path to a zip file.") + + fo = FileOperation(session=session) + zf = fo.get_stream(model_stage_file_path) + with file_utils.unzip_stream_in_temp_dir(stream=zf) as temp_local_model_dir_path: + return _load(local_dir_path=temp_local_model_dir_path, meta_only=meta_only) + + +def _load( + *, + local_dir_path: str, + meta_only: Optional[bool] = None, +) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: + local_dir_path = os.path.normpath(local_dir_path) + meta = _model_meta._load_model_metadata(local_dir_path) if meta_only: return meta @@ -257,7 +562,7 @@ def load_model( handler = _model_handler._load_handler(meta.model_type) if handler is None: raise TypeError(f"{meta.model_type} is not supported.") - model_blobs_path = os.path.join(model_dir_path, MODEL_BLOBS_DIR) + model_blobs_path = os.path.join(local_dir_path, MODEL_BLOBS_DIR) m = handler._load_model(meta.name, meta, model_blobs_path) return m, meta diff --git a/snowflake/ml/model/_model_meta.py b/snowflake/ml/model/_model_meta.py index d879c475..b2521187 100644 --- a/snowflake/ml/model/_model_meta.py +++ b/snowflake/ml/model/_model_meta.py @@ -23,6 +23,7 @@ "pyyaml", "typing-extensions", "cloudpickle", + "packaging", "anyio", "snowflake-snowpark-python", "scikit-learn", diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py index 29668f5b..fbc7fe1d 100644 --- a/snowflake/ml/model/_model_test.py +++ b/snowflake/ml/model/_model_test.py @@ -2,6 +2,8 @@ import os import tempfile import warnings +from typing import cast +from unittest import mock import numpy as np import pandas as pd @@ -15,7 +17,9 @@ model_signature, type_hints as model_types, ) -from snowflake.ml.modeling.linear_model import LinearRegression +from snowflake.ml.sklearn.linear_model import LinearRegression +from snowflake.ml.test_utils import mock_session +from snowflake.snowpark import FileOperation, Session class DemoModelWithManyArtifacts(custom_model.CustomModel): @@ -86,6 +90,193 @@ def predict(self, input: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame({"output": input["c1"] + self.bias}) +class ModelInterfaceTest(absltest.TestCase): + def test_save_interface(self) -> None: + m_session = mock_session.MockSession(conn=None, test_case=self) + c_session = cast(Session, m_session) + + local_dir = "path/to/local/model/dir" + stage_path = '@"db"."schema"."stage"/model.zip' + + arr = np.array([[1, 2, 3], [4, 2, 5]]) + d = 
pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + + with self.assertRaisesRegex( + ValueError, "model_dir_path and model_stage_file_path both cannot be None at the same time." + ): + model_api.save_model(name="model", model=linear_model.LinearRegression()) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "Session and model_stage_file_path must be specified at the same time." + ): + model_api.save_model( + name="model", model=linear_model.LinearRegression(), session=c_session, sample_input=d + ) # type:ignore[call-overload] + + with self.assertRaisesRegex(ValueError, "Session and model_stage_file_path must be None at the same time."): + model_api.save_model( + name="model", model=linear_model.LinearRegression(), model_stage_file_path=stage_path, sample_input=d + ) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "Session and model_stage_file_path must be specified at the same time." + ): + model_api.save_model( + name="model", + model=linear_model.LinearRegression(), + session=c_session, + model_dir_path=local_dir, + sample_input=d, + ) # type:ignore[call-overload] + + with self.assertRaisesRegex(ValueError, "Session and model_stage_file_path must be None at the same time."): + model_api.save_model( + name="model", + model=linear_model.LinearRegression(), + model_stage_file_path=stage_path, + model_dir_path=local_dir, + sample_input=d, + ) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "model_dir_path and model_stage_file_path both cannot be specified at the same time." + ): + model_api.save_model( + name="model", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + model_dir_path=local_dir, + sample_input=d, + ) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "Signatures and sample_input both cannot be None for local model at the same time." + ): + model_api.save_model( + name="model1", + model_dir_path=local_dir, + model=linear_model.LinearRegression(), + ) + + with self.assertRaisesRegex( + ValueError, "Signatures and sample_input both cannot be specified at the same time." + ): + model_api.save_model( # type:ignore[call-overload] + name="model1", + model_dir_path=local_dir, + model=linear_model.LinearRegression(), + sample_input=d, + signatures={"predict": model_signature.ModelSignature(inputs=[], outputs=[])}, + ) + + with self.assertRaisesRegex( + ValueError, "Signatures and sample_input both cannot be specified at the same time." 
+ ): + model_api.save_model( # type:ignore[call-overload] + name="model1", + model_dir_path=local_dir, + model=LinearRegression(), + sample_input=d, + signatures={"predict": model_signature.ModelSignature(inputs=[], outputs=[])}, + ) + + with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + model_api.save_model( + name="model1", + model_dir_path=local_dir, + model=LinearRegression(), + ) + + with tempfile.TemporaryDirectory() as tempdir: + with open(os.path.join(tempdir, "some_file"), "w") as f: + f.write("Hi Ciyana!") + + with self.assertRaisesRegex(ValueError, "Provided model directory [^\\s]* is not a directory."): + model_api.save_model( + name="model1", + model_dir_path=os.path.join(tempdir, "some_file"), + model=linear_model.LinearRegression(), + sample_input=d, + ) + + with self.assertWarnsRegex(UserWarning, "Provided model directory [^\\s]* is not an empty directory."): + with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + model_api.save_model( + name="model1", + model_dir_path=tempdir, + model=linear_model.LinearRegression(), + sample_input=d, + ) + mock_save.assert_called_once() + + with self.assertRaisesRegex( + ValueError, "Provided model path in the stage [^\\s]* must be a path to a zip file." + ): + model_api.save_model( + name="model1", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path='@"db"."schema"."stage"/model', + sample_input=d, + ) + + with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: + model_api.save_model( + name="model1", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + sample_input=d, + ) + mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=False) + + with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: + model_api.save_model( + name="model1", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + sample_input=d, + options={"allow_overwritten_stage_file": True}, + ) + mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=True) + + def test_load_interface(self) -> None: + m_session = mock_session.MockSession(conn=None, test_case=self) + c_session = cast(Session, m_session) + + local_dir = "path/to/local/model/dir" + stage_path = '@"db"."schema"."stage"/model.zip' + + with self.assertRaisesRegex( + ValueError, "Session and model_stage_file_path must be specified at the same time." + ): + model_api.load_model(session=c_session) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "model_dir_path and model_stage_file_path both cannot be None at the same time." + ): + model_api.load_model() # type:ignore[call-overload] + + with self.assertRaisesRegex(ValueError, "Session and model_stage_file_path must be None at the same time."): + model_api.load_model(model_stage_file_path=stage_path) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "model_dir_path and model_stage_file_path both cannot be specified at the same time." 
+ ): + model_api.load_model( + session=c_session, model_stage_file_path=stage_path, model_dir_path=local_dir + ) # type:ignore[call-overload] + + with self.assertRaisesRegex( + ValueError, "Provided model path in the stage [^\\s]* must be a path to a zip file." + ): + model_api.load_model(session=c_session, model_stage_file_path='@"db"."schema"."stage"/model') + + class ModelTest(absltest.TestCase): def test_bad_save_model(self) -> None: tmpdir = self.create_tempdir() @@ -100,15 +291,6 @@ def test_bad_save_model(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) s = {"predict": model_signature.infer_signature(d, lm.predict(d))} - with self.assertRaises(ValueError): - model_api.save_model( # type:ignore[call-overload] - name="model1", - model_dir_path=os.path.join(tmpdir.full_path, "model1"), - model=lm, - signatures=s, - sample_input=d, - metadata={"author": "halu", "version": "1"}, - ) with self.assertRaises(ValueError): model_api.save_model( @@ -119,14 +301,6 @@ def test_bad_save_model(self) -> None: metadata={"author": "halu", "version": "1"}, ) - with self.assertRaises(ValueError): - model_api.save_model( # type:ignore[call-overload] - name="model1", - model_dir_path=os.path.join(tmpdir.full_path, "model1"), - model=lm, - metadata={"author": "halu", "version": "1"}, - ) - model_api.save_model( name="model1", model_dir_path=os.path.join(tmpdir.full_path, "model1"), @@ -136,10 +310,10 @@ def test_bad_save_model(self) -> None: python_version="3.5.2", ) - _ = model_api.load_model(os.path.join(tmpdir, "model1"), meta_only=True) + _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1"), meta_only=True) with self.assertRaises(RuntimeError): - m, meta = model_api.load_model(os.path.join(tmpdir, "model1")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) def test_custom_model_with_multiple_artifacts(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: @@ -166,7 +340,7 @@ def test_custom_model_with_multiple_artifacts(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, meta = model_api.load_model(os.path.join(tmpdir, "model1")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, DemoModelWithManyArtifacts) res = m.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) @@ -185,7 +359,7 @@ def test_custom_model_with_multiple_artifacts(self) -> None: metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) assert isinstance(m, DemoModelWithManyArtifacts) res = m.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) @@ -214,11 +388,11 @@ def test_model_composition(self) -> None: signatures=s, metadata={"author": "halu", "version": "1"}, ) - lm, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + lm, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(lm, ComposeModel) p3 = lm.predict(d) - m_UDF, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_UDF, _ = model_api._load_model_for_deploy(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m_UDF, ComposeModel) p4 = m_UDF.predict(d) np.testing.assert_allclose(p1, p2) @@ -248,7 +422,7 @@ async def _test(self: "ModelTest") -> None: signatures=s, metadata={"author": "halu", "version": "1"}, 
) - lm, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + lm, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(lm, AsyncComposeModel) p3 = await lm.predict(d) # type: ignore[misc] @@ -279,7 +453,7 @@ def test_custom_model_with_artifacts(self) -> None: metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(os.path.join(tmpdir, "model1")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, DemoModelWithArtifacts) res = m.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([11, 14]))) @@ -314,7 +488,7 @@ def test_skl_multiple_output_proba(self) -> None: ) m: multioutput.MultiOutputClassifier - m, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose( np.hstack(model.predict_proba(iris_X_df[-10:])), np.hstack(m.predict_proba(iris_X_df[-10:])) ) @@ -342,7 +516,7 @@ def test_skl_multiple_output_proba(self) -> None: metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose( np.hstack(model.predict_proba(iris_X_df[-10:])), np.hstack(m.predict_proba(iris_X_df[-10:])) ) @@ -387,7 +561,7 @@ def test_skl(self) -> None: warnings.simplefilter("error") m: linear_model.LinearRegression - m, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose(np.array([-0.08254936]), m.predict(iris_X_df[:1])) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) predict_method = getattr(m_udf, "predict", None) @@ -402,7 +576,7 @@ def test_skl(self) -> None: metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose(np.array([-0.08254936]), m.predict(iris_X_df[:1])) self.assertEqual(s["predict"], meta.signatures["predict"]) @@ -442,7 +616,7 @@ def test_xgb(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, xgboost.XGBClassifier) np.testing.assert_allclose(m.predict(cal_X_test), y_pred) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) @@ -458,7 +632,7 @@ def test_xgb(self) -> None: metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) assert isinstance(m, xgboost.XGBClassifier) np.testing.assert_allclose(m.predict(cal_X_test), y_pred) np.testing.assert_allclose(m.predict_proba(cal_X_test), y_pred_proba) @@ -510,7 +684,7 @@ def test_snowml(self) -> None: warnings.simplefilter("error") m: LinearRegression - m, _ = model_api.load_model(os.path.join(tmpdir, "model1")) + m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) predict_method = getattr(m_udf, "predict", None) @@ -525,7 
+699,7 @@ def test_snowml(self) -> None: metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose(np.array([[-0.08254936]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) # TODO: After model_signatures() function is updated in codegen, next line should be changed to # s = regr.model_signatures() diff --git a/snowflake/ml/model/_udf_util.py b/snowflake/ml/model/_udf_util.py index 5f0ec31c..4966ee56 100644 --- a/snowflake/ml/model/_udf_util.py +++ b/snowflake/ml/model/_udf_util.py @@ -15,7 +15,33 @@ ) from snowflake.snowpark import session as snowpark_session, types as st -_KEEP_ORDER_CODE_TEMPLATE = 'predictions_df["_ID"] = input_df["_ID"]' +_KEEP_ORDER_COL_NAME = "_ID" + +_KEEP_ORDER_CODE_TEMPLATE = f'predictions_df["{_KEEP_ORDER_COL_NAME}"] = input_df["{_KEEP_ORDER_COL_NAME}"]' + +_EXTRACT_LOCAL_MODEL_CODE = """ +model_dir_name = '{model_dir_name}' +zip_model_path = os.path.join(import_dir, '{model_dir_name}.zip') +extracted = '/tmp/models' +extracted_model_dir_path = os.path.join(extracted, model_dir_name) + +with FileLock(): + if not os.path.isdir(extracted_model_dir_path): + with zipfile.ZipFile(zip_model_path, 'r') as myzip: + myzip.extractall(extracted) +""" + +_EXTRACT_STAGE_MODEL_CODE = """ +model_dir_name = os.path.splitext('{model_stage_file_name}')[0] +zip_model_path = os.path.join(import_dir, '{model_stage_file_name}') +extracted = '/tmp/models' +extracted_model_dir_path = os.path.join(extracted, model_dir_name) + +with FileLock(): + if not os.path.isdir(extracted_model_dir_path): + with zipfile.ZipFile(zip_model_path, 'r') as myzip: + myzip.extractall(extracted_model_dir_path) +""" _SNOWML_IMPORT_CODE = """ @@ -59,15 +85,8 @@ def __exit__(self, type, value, traceback): from snowflake.ml.model._model import _load_model_for_deploy -model_dir_name = '{model_dir_name}' -zip_model_path = os.path.join(import_dir, '{model_dir_name}.zip') -extracted = '/tmp/models' -extracted_model_dir_path = os.path.join(extracted, model_dir_name) +{extract_model_code} -with FileLock(): - if not os.path.isdir(extracted_model_dir_path): - with zipfile.ZipFile(zip_model_path, 'r') as myzip: - myzip.extractall(extracted) model, meta = _load_model_for_deploy(extracted_model_dir_path) # TODO(halu): Wire `max_batch_size`. @@ -90,7 +109,8 @@ def infer(df): def _deploy_to_warehouse( session: snowpark_session.Session, *, - model_dir_path: str, + model_dir_path: Optional[str] = None, + model_stage_file_path: Optional[str] = None, udf_name: str, target_method: str, **kwargs: Unpack[model_types.WarehouseDeployOptions], @@ -99,7 +119,8 @@ def _deploy_to_warehouse( Args: session: Snowpark session. - model_dir_path: Path to model directory. + model_dir_path: Path to model directory. Exclusive with model_stage_file_path. + model_stage_file_path: Path to the stored model zip file in the stage. Exclusive with model_dir_path. udf_name: Name of the UDF. target_method: The name of the target method to be deployed. **kwargs: Options that control some features in generated udf code. @@ -111,10 +132,17 @@ def _deploy_to_warehouse( Returns: The metadata of the model deployed. 
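Both extraction templates above implement the same extract-once pattern inside the generated UDF: unzip the imported model under /tmp only if no other call on the node has already done so. A condensed sketch of that pattern, following the stage-file variant; the lock argument stands in for the template's file lock and is an assumption here.

import os
import threading
import zipfile

def extract_model_once(zip_model_path: str, extract_root: str, lock) -> str:
    """Unzip the imported model once per node; later callers reuse the extracted directory."""
    model_dir = os.path.join(extract_root, os.path.splitext(os.path.basename(zip_model_path))[0])
    with lock:
        if not os.path.isdir(model_dir):
            with zipfile.ZipFile(zip_model_path, "r") as myzip:
                myzip.extractall(model_dir)
    return model_dir

# e.g. extract_model_once("/tmp/import/my_model.zip", "/tmp/models", threading.Lock())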
""" - if not os.path.exists(model_dir_path): - raise ValueError("Model config did not exist.") - model_dir_name = os.path.basename(model_dir_path) - meta = _model.load_model(model_dir_path, meta_only=True) + if model_dir_path: + model_dir_path = os.path.normpath(model_dir_path) + model_dir_name = os.path.basename(model_dir_path) + extract_model_code = _EXTRACT_LOCAL_MODEL_CODE.format(model_dir_name=model_dir_name) + meta = _model.load_model(model_dir_path=model_dir_path, meta_only=True) + else: + assert model_stage_file_path is not None, "Unreachable assertion error." + model_stage_file_name = os.path.basename(model_stage_file_path) + extract_model_code = _EXTRACT_STAGE_MODEL_CODE.format(model_stage_file_name=model_stage_file_name) + meta = _model.load_model(session=session, model_stage_file_path=model_stage_file_path, meta_only=True) + relax_version = kwargs.get("relax_version", False) if target_method not in meta.signatures.keys(): @@ -125,9 +153,13 @@ def _deploy_to_warehouse( final_packages = _get_model_final_packages(meta, session, relax_version=relax_version) with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: - _write_UDF_py_file(f.file, model_dir_name, target_method, **kwargs) + _write_UDF_py_file(f.file, extract_model_code, target_method, **kwargs) print(f"Generated UDF file is persisted at: {f.name}") - imports = [model_dir_path] + [_snowml_wheel_path] if _snowml_wheel_path else [] + imports = ( + ([model_dir_path] if model_dir_path else []) + + ([model_stage_file_path] if model_stage_file_path else []) + + ([_snowml_wheel_path] if _snowml_wheel_path else []) + ) stage_location = kwargs.get("permanent_udf_stage_location", None) @@ -161,7 +193,7 @@ class _UDFParams(TypedDict): def _write_UDF_py_file( f: IO[str], - model_dir_name: str, + extract_model_code: str, target_method: str, **kwargs: Unpack[model_types.WarehouseDeployOptions], ) -> None: @@ -169,7 +201,7 @@ def _write_UDF_py_file( Args: f: File descriptor to write the python code. - model_dir_name: Path to model directory. + extract_model_code: Code to extract the model. target_method: The name of the target method to be deployed. **kwargs: Options that control some features in generated udf code. """ @@ -180,7 +212,7 @@ def _write_UDF_py_file( snowml_import_code = _SNOWML_IMPORT_CODE.format(snowml_filename=whl_filename) udf_code = _UDF_CODE_TEMPLATE.format( - model_dir_name=model_dir_name, + extract_model_code=extract_model_code, keep_order_code=_KEEP_ORDER_CODE_TEMPLATE if keep_order else "", target_method=target_method, snowml_import_code=snowml_import_code if snowml_wheel_path else "", @@ -244,6 +276,9 @@ def _get_model_final_packages( if final_packages is None: raise RuntimeError( "The model's dependency cannot fit into Snowflake Warehouse. " - + "Trying to set relax_version as True in the options." + + "Trying to set relax_version as True in the options. 
Required packages are:\n" + + '"' + + " ".join(map(str, meta._conda_dependencies[""])) + + '"' ) return final_packages diff --git a/snowflake/ml/model/model_signature.py b/snowflake/ml/model/model_signature.py index d2aa1d51..b1219c60 100644 --- a/snowflake/ml/model/model_signature.py +++ b/snowflake/ml/model/model_signature.py @@ -187,6 +187,12 @@ def as_snowpark_type(self) -> spt.DataType: result_type = spt.ArrayType(result_type) return result_type + def as_dtype(self) -> npt.DTypeLike: + """Convert to corresponding local Type.""" + if not self._shape: + return self._dtype._numpy_type + return np.object0 + def __eq__(self, other: object) -> bool: if isinstance(other, FeatureSpec): return self._name == other._name and self._dtype == other._dtype and self._shape == other._shape @@ -249,7 +255,7 @@ def _validate(self) -> None: if not all(s._name is not None for s in self._specs): raise ValueError("All children feature specs have to have name.") if not (all(s._shape is None for s in self._specs) or all(s._shape is not None for s in self._specs)): - raise ValueError("All children feature specs have to have same type.") + raise ValueError("All children feature specs have to have same shape.") first_type = self._specs[0]._dtype if not all(s._dtype == first_type for s in self._specs): raise ValueError("All children feature specs have to have same type.") @@ -410,7 +416,7 @@ def validate(data: model_types._DataType) -> None: @staticmethod @abstractmethod - def infer_signature(data: model_types._DataType, role: Literal["input", "output"]) -> Sequence[FeatureSpec]: + def infer_signature(data: model_types._DataType, role: Literal["input", "output"]) -> Sequence[BaseFeatureSpec]: ... @staticmethod @@ -435,16 +441,21 @@ def truncate(data: pd.DataFrame) -> pd.DataFrame: @staticmethod def validate(data: pd.DataFrame) -> None: df_cols = data.columns - if not all(hasattr(data[col], "dtype") for col in data.columns): - raise ValueError(f"Data Validation Error: Unknown column confronted in {data}.") - - if len(df_cols) == 0: - raise ValueError("Data Validation Error: Empty data is found.") if df_cols.has_duplicates: # Rule out categorical index with duplicates raise ValueError("Data Validation Error: Duplicate column index is found.") - if df_cols.dtype not in [np.int64, np.uint64, np.float64, np.object0]: + assert all(hasattr(data[col], "dtype") for col in data.columns), f"Unknown column confronted in {data}" + + if len(df_cols) == 0: + raise ValueError("Data Validation Error: Empty data is found.") + + if df_cols.dtype not in [ + np.int64, + np.uint64, + np.float64, + np.object0, + ]: # To keep compatibility with Pandas 2.x and 1.x raise ValueError("Data Validation Error: Unsupported column index type is found.") df_col_dtypes = [data[col].dtype for col in data.columns] @@ -468,7 +479,7 @@ def validate(data: pd.DataFrame) -> None: ): raise ValueError( "Data Validation Error: " - + f"Inconsistent type of object found in column data {data[df_col]}." + + f"Inconsistent type of element in object found in column data {data[df_col]}." ) elif isinstance(data[df_col][0], np.ndarray): @@ -477,17 +488,20 @@ def validate(data: pd.DataFrame) -> None: if not all(DataType.from_numpy_type(data_row.dtype) == arr_dtype for data_row in data[df_col]): raise ValueError( "Data Validation Error: " - + f"Inconsistent type of object found in column data {data[df_col]}." + + f"Inconsistent type of element in object found in column data {data[df_col]}." 
) elif not isinstance(data[df_col][0], (str, bytes)): raise ValueError(f"Data Validation Error: Unsupported type confronted in {data[df_col]}") @staticmethod - def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[FeatureSpec]: + def infer_signature(data: pd.DataFrame, role: Literal["input", "output"]) -> Sequence[BaseFeatureSpec]: feature_prefix = f"{_PandasDataFrameHandler.FEATURE_PREFIX}_" df_cols = data.columns + role_prefix = ( + _PandasDataFrameHandler.INPUT_PREFIX if role == "input" else _PandasDataFrameHandler.OUTPUT_PREFIX + ) + "_" if df_cols.dtype in [np.int64, np.uint64, np.float64]: - ft_names = [f"{feature_prefix}{i}" for i in df_cols] + ft_names = [f"{role_prefix}{feature_prefix}{i}" for i in df_cols] else: ft_names = list(map(str, data.columns.to_list())) @@ -554,16 +568,17 @@ def validate(data: model_types._SupportedNumpyArray) -> None: @staticmethod def infer_signature( data: model_types._SupportedNumpyArray, role: Literal["input", "output"] - ) -> Sequence[FeatureSpec]: + ) -> Sequence[BaseFeatureSpec]: feature_prefix = f"{_PandasDataFrameHandler.FEATURE_PREFIX}_" dtype = DataType.from_numpy_type(data.dtype) + role_prefix = (_NumpyArrayHandler.INPUT_PREFIX if role == "input" else _NumpyArrayHandler.OUTPUT_PREFIX) + "_" if len(data.shape) == 1: - return [FeatureSpec(dtype=dtype, name=f"{feature_prefix}0")] + return [FeatureSpec(dtype=dtype, name=f"{role_prefix}{feature_prefix}0")] else: # For high-dimension array, 0-axis is for batch, 1-axis is for column, further more is details of columns. features = [] n_cols = data.shape[1] - ft_names = [f"{feature_prefix}{i}" for i in range(n_cols)] + ft_names = [f"{role_prefix}{feature_prefix}{i}" for i in range(n_cols)] for col_data, ft_name in zip(data[0], ft_names): if isinstance(col_data, np.ndarray): ft_shape = np.shape(col_data) @@ -576,7 +591,12 @@ def infer_signature( def convert_to_df(data: model_types._SupportedNumpyArray) -> pd.DataFrame: if len(data.shape) == 1: data = np.expand_dims(data, axis=1) - return pd.DataFrame(data) + n_cols = data.shape[1] + if len(data.shape) == 2: + return pd.DataFrame(data={i: data[:, i] for i in range(n_cols)}) + else: + n_rows = data.shape[0] + return pd.DataFrame(data={i: [np.array(data[k, i]) for k in range(n_rows)] for i in range(n_cols)}) class _ListOfNumpyArrayHandler(_BaseDataHandler[List[model_types._SupportedNumpyArray]]): @@ -607,25 +627,29 @@ def validate(data: List[model_types._SupportedNumpyArray]) -> None: @staticmethod def infer_signature( data: List[model_types._SupportedNumpyArray], role: Literal["input", "output"] - ) -> Sequence[FeatureSpec]: - features: List[FeatureSpec] = [] + ) -> Sequence[BaseFeatureSpec]: + features: List[BaseFeatureSpec] = [] + role_prefix = ( + _ListOfNumpyArrayHandler.INPUT_PREFIX if role == "input" else _ListOfNumpyArrayHandler.OUTPUT_PREFIX + ) + "_" for i, data_col in enumerate(data): inferred_res = _NumpyArrayHandler.infer_signature(data_col, role) for ft in inferred_res: - additional_prefix = ( - _ListOfNumpyArrayHandler.OUTPUT_PREFIX - if role == "output" - else _ListOfNumpyArrayHandler.INPUT_PREFIX - ) - ft._name = f"{additional_prefix}_{i}_{ft._name}" + ft._name = f"{role_prefix}{i}_{ft._name[len(role_prefix):]}" features.extend(inferred_res) return features @staticmethod def convert_to_df(data: List[model_types._SupportedNumpyArray]) -> pd.DataFrame: - arr = np.concatenate(data, axis=1) - return pd.DataFrame(arr) + l_data = [] + for data_col in data: + if len(data_col.shape) == 1: + 
l_data.append(np.expand_dims(data_col, axis=1)) + else: + l_data.append(data_col) + arr = np.concatenate(l_data, axis=1) + return _NumpyArrayHandler.convert_to_df(arr) class _ListOfBuiltinHandler(_BaseDataHandler[model_types._SupportedBuiltinsList]): @@ -656,7 +680,7 @@ def validate(data: model_types._SupportedBuiltinsList) -> None: @staticmethod def infer_signature( data: model_types._SupportedBuiltinsList, role: Literal["input", "output"] - ) -> Sequence[FeatureSpec]: + ) -> Sequence[BaseFeatureSpec]: return _PandasDataFrameHandler.infer_signature(pd.DataFrame(data), role) @staticmethod @@ -687,11 +711,13 @@ def validate(data: snowflake.snowpark.DataFrame) -> None: ) @staticmethod - def infer_signature(data: snowflake.snowpark.DataFrame, role: Literal["input", "output"]) -> Sequence[FeatureSpec]: - features: List[FeatureSpec] = [] + def infer_signature( + data: snowflake.snowpark.DataFrame, role: Literal["input", "output"] + ) -> Sequence[BaseFeatureSpec]: + features: List[BaseFeatureSpec] = [] schema = data.schema for field in schema.fields: - name = identifier.remove_and_unescape_quote_if_quoted(field.name) + name = identifier.get_unescaped_names(field.name) features.append(FeatureSpec(name=name, dtype=DataType.from_snowpark_type(field.datatype))) return features @@ -734,7 +760,7 @@ def _truncate_data(data: model_types.SupportedDataType) -> model_types.Supported def _infer_signature( data: model_types.SupportedLocalDataType, role: Literal["input", "output"] -) -> Sequence[FeatureSpec]: +) -> Sequence[BaseFeatureSpec]: """Infer the inputs/outputs signature given a data that could be dataframe, numpy array or list. Dispatching is used to separate logic for different types. (Not using Python's singledispatch for unsupported feature of union dispatching in 3.8) @@ -789,8 +815,8 @@ def _convert_list_to_ndarray(data: List[Any]) -> npt.NDArray[Any]: def _rename_features( - features: Sequence[FeatureSpec], feature_names: Optional[List[str]] = None -) -> Sequence[FeatureSpec]: + features: Sequence[BaseFeatureSpec], feature_names: Optional[List[str]] = None +) -> Sequence[BaseFeatureSpec]: """It renames the feature in features provided optional feature names. Args: @@ -846,6 +872,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) features: A sequence of feature specifications and feature group specifications, where the dataframe should fit. Raises: + NotImplementedError: FeatureGroupSpec is not supported. ValueError: Raised when a feature cannot be found. ValueError: Raised when feature is scalar but confront list element. ValueError: Raised when feature type is not aligned in list element. @@ -855,15 +882,8 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) ValueError: Raised when feature shape is not aligned in numpy array element. ValueError: Raised when feature type is not aligned in string element. ValueError: Raised when feature type is not aligned in bytes element. - ValueError: Raised when feature type is not met. 
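A small sketch of the role-prefixed naming that the inference changes above introduce: unnamed columns now get an "input_"/"output_" prefix in front of the feature prefix. The handler call below matches how the updated unit tests exercise it.

import numpy as np

from snowflake.ml.model import model_signature

arr = np.array([[1, 2], [3, 4]])
specs = model_signature._NumpyArrayHandler.infer_signature(arr, role="input")
# Per the updated tests, this yields:
#   FeatureSpec("input_feature_0", DataType.INT64)
#   FeatureSpec("input_feature_1", DataType.INT64)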
""" - _features: List[FeatureSpec] = [] for feature in features: - if isinstance(feature, FeatureSpec): - _features.append(feature) - elif isinstance(feature, FeatureGroupSpec): - _features.extend(feature._specs) - for feature in _features: ft_name = feature.name try: data_col = data[ft_name] @@ -871,10 +891,25 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") df_col_dtype = data_col.dtype + if isinstance(feature, FeatureGroupSpec): + raise NotImplementedError("FeatureGroupSpec is not supported.") + + assert isinstance(feature, FeatureSpec), "Invalid feature kind." ft_type = feature._dtype - if df_col_dtype == np.dtype("O"): + ft_shape = feature._shape + if df_col_dtype != np.dtype("O"): + if ft_type != DataType.from_numpy_type(df_col_dtype): + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + f"Feature type {ft_type} is not met by all elements in {data_col}." + ) + elif ft_shape is not None: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a array type feature while scalar data is provided." + ) + else: if isinstance(data_col[0], list): - ft_shape = feature._shape if not ft_shape: raise ValueError( f"Data Validation Error in feature {ft_name}: " @@ -898,7 +933,6 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) + f"Feature shape {ft_shape} is not met by all elements in {data_col}." ) elif isinstance(data_col[0], np.ndarray): - ft_shape = feature._shape if not ft_shape: raise ValueError( f"Data Validation Error in feature {ft_name}: " @@ -920,38 +954,47 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[BaseFeatureSpec]) + f"Feature shape {ft_shape} is not met by all elements in {data_col}." ) elif isinstance(data_col[0], str): + if ft_shape is not None: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a array type feature while scalar data is provided." + ) if ft_type != DataType.STRING: raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by all elements in {data_col}." ) elif isinstance(data_col[0], bytes): + if ft_shape is not None: + raise ValueError( + f"Data Validation Error in feature {ft_name}: " + + "Feature is a array type feature while scalar data is provided." + ) if ft_type != DataType.BYTES: raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by all elements in {data_col}." ) - else: - if ft_type != DataType.from_numpy_type(df_col_dtype): - raise ValueError( - f"Data Validation Error in feature {ft_name}: " - + f"Feature type {ft_type} is not met by all elements in {data_col}." - ) def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequence[BaseFeatureSpec]) -> None: - _features: List[FeatureSpec] = [] - for feature in features: - if isinstance(feature, FeatureSpec): - _features.append(feature) - elif isinstance(feature, FeatureGroupSpec): - _features.extend(feature._specs) + """Validate Snowpark DataFrame as input + + Args: + data: A snowpark dataframe to be validated. + features: A sequence of feature specifications and feature group specifications, where the dataframe should fit. + + Raises: + NotImplementedError: FeatureGroupSpec is not supported. + ValueError: Raised when confronting invalid feature. + ValueError: Raised when a feature cannot be found. 
+ """ schema = data.schema - for feature in _features: + for feature in features: ft_name = feature.name found = False for field in schema.fields: - name = identifier.remove_and_unescape_quote_if_quoted(field.name) + name = identifier.get_unescaped_names(field.name) if name == ft_name: found = True if field.nullable: @@ -960,14 +1003,15 @@ def _validate_snowpark_data(data: snowflake.snowpark.DataFrame, features: Sequen + " inference might fail if there is null value.", category=RuntimeWarning, ) - + if isinstance(feature, FeatureGroupSpec): + raise NotImplementedError("FeatureGroupSpec is not supported.") + assert isinstance(feature, FeatureSpec), "Invalid feature kind." ft_type = feature._dtype if not ft_type.is_same_snowpark_type(field.datatype): raise ValueError( f"Data Validation Error in feature {ft_name}: " + f"Feature type {ft_type} is not met by column {field.name}." ) - break if not found: raise ValueError(f"Data Validation Error: feature {ft_name} does not exist in data.") @@ -992,6 +1036,7 @@ def _convert_and_validate_local_data( if handler.can_handle(data): handler.validate(data) df = handler.convert_to_df(data) + break if df is None: raise ValueError(f"Data Validation Error: Un-supported type {type(data)} provided.") assert isinstance(df, pd.DataFrame) diff --git a/snowflake/ml/model/model_signature_test.py b/snowflake/ml/model/model_signature_test.py index 8fb01100..44d42d98 100644 --- a/snowflake/ml/model/model_signature_test.py +++ b/snowflake/ml/model/model_signature_test.py @@ -32,21 +32,21 @@ def test_feature_spec(self) -> None: class FeatureGroupSpecTest(absltest.TestCase): def test_feature_group_spec(self) -> None: - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "No children feature specs."): _ = model_signature.FeatureGroupSpec(name="features", specs=[]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "All children feature specs have to have name."): ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.INT64) ft2._name = None # type: ignore[assignment] _ = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "All children feature specs have to have same type."): ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.FLOAT) _ = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "All children feature specs have to have same shape."): ft1 = model_signature.FeatureSpec(name="feature1", dtype=model_signature.DataType.INT64) ft2 = model_signature.FeatureSpec(name="feature2", dtype=model_signature.DataType.INT64, shape=(2,)) fts = model_signature.FeatureGroupSpec(name="features", specs=[ft1, ft2]) @@ -133,34 +133,77 @@ def test_2(self) -> None: class PandasDataFrameHandlerTest(absltest.TestCase): def test_validate_pd_DataFrame(self) -> None: df = pd.DataFrame([]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Empty data is found."): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) + with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): 
model_signature._PandasDataFrameHandler.validate(df) sub_df = pd.DataFrame([2.5, 6.8]) df = pd.DataFrame([[1, sub_df], [2, sub_df]], columns=["a", "b"]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Unsupported type confronted in"): model_signature._PandasDataFrameHandler.validate(df) df = pd.DataFrame( [[1, 2.0, 1, 2.0, 1, 2.0], [2, 4.0, 2, 4.0, 2, 4.0]], columns=pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), ) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, 2], [2, 4]], columns=["a", "a"]) + with self.assertRaisesRegex(ValueError, "Duplicate column index is found"): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, "Hello"], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, 2], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of object"): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, [2, [6]]], [2, [2, 6]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, [2, 6]], [2, [2, [6]]]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Ragged nested or Unsupported list-like data"): model_signature._PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, [2.5, 6.8]], [2, [2, 6]]], columns=["a", "b"]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): model_signature._PandasDataFrameHandler.validate(df) df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2, 6])]], columns=["a", "b"]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Inconsistent type of element in object found in column data"): + model_signature._PandasDataFrameHandler.validate(df) + + df = pd.DataFrame([[1, np.array([2.5, 6.8])], [2, 6]], columns=["a", "b"]) + with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in column data"): model_signature._PandasDataFrameHandler.validate(df) + def test_trunc_pd_DataFrame(self) -> None: + df = pd.DataFrame([1] * (model_signature._PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1)) + + pd.testing.assert_frame_equal( + pd.DataFrame([1] * (model_signature._PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT)), + model_signature._PandasDataFrameHandler.truncate(df), + ) + + df = pd.DataFrame([1] * (model_signature._PandasDataFrameHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)) + + pd.testing.assert_frame_equal( + df, + model_signature._PandasDataFrameHandler.truncate(df), + ) + def test_infer_signature_pd_DataFrame(self) -> None: df = pd.DataFrame([1, 2, 3, 4]) self.assertListEqual( model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], ) df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) @@ -185,8 +228,8 @@ def test_infer_signature_pd_DataFrame(self) -> None: self.assertListEqual( model_signature._PandasDataFrameHandler.infer_signature(df, role="input"), [ - 
model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.DOUBLE), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.DOUBLE), ], ) @@ -294,30 +337,78 @@ def test_infer_signature_pd_DataFrame(self) -> None: ], ) + df = pd.DataFrame([1, 2, 3, 4]) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), + [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64)], + ) + + df = pd.DataFrame([1, 2, 3, 4], columns=["a"]) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), + [model_signature.FeatureSpec("a", model_signature.DataType.INT64)], + ) + + df = pd.DataFrame(["a", "b", "c", "d"], columns=["a"]) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), + [model_signature.FeatureSpec("a", model_signature.DataType.STRING)], + ) + + df = pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), + [model_signature.FeatureSpec("a", model_signature.DataType.BYTES)], + ) + + df = pd.DataFrame([[1, 2.0], [2, 4.0]]) + self.assertListEqual( + model_signature._PandasDataFrameHandler.infer_signature(df, role="output"), + [ + model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("output_feature_1", model_signature.DataType.DOUBLE), + ], + ) + class NumpyArrayHandlerTest(absltest.TestCase): def test_validate_np_ndarray(self) -> None: arr = np.array([]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Empty data is found."): model_signature._NumpyArrayHandler.validate(arr) arr = np.array(1) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Scalar data is found."): model_signature._NumpyArrayHandler.validate(arr) + def test_trunc_np_ndarray(self) -> None: + arr = np.array([1] * (model_signature._NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1)) + + np.testing.assert_equal( + np.array([1] * (model_signature._NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)), + model_signature._NumpyArrayHandler.truncate(arr), + ) + + arr = np.array([1] * (model_signature._NumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)) + + np.testing.assert_equal( + arr, + model_signature._NumpyArrayHandler.truncate(arr), + ) + def test_infer_schema_np_ndarray(self) -> None: arr = np.array([1, 2, 3, 4]) self.assertListEqual( model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], ) arr = np.array([[1, 2], [3, 4]]) self.assertListEqual( model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), ], ) @@ -325,17 +416,76 @@ def test_infer_schema_np_ndarray(self) -> None: self.assertListEqual( model_signature._NumpyArrayHandler.infer_signature(arr, role="input"), [ 
- model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64, shape=(2,)), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2,)), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64, shape=(2,)), ], ) + arr = np.array([1, 2, 3, 4]) + self.assertListEqual( + model_signature._NumpyArrayHandler.infer_signature(arr, role="output"), + [model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64)], + ) + + arr = np.array([[1, 2], [3, 4]]) + self.assertListEqual( + model_signature._NumpyArrayHandler.infer_signature(arr, role="output"), + [ + model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64), + ], + ) + + arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + self.assertListEqual( + model_signature._NumpyArrayHandler.infer_signature(arr, role="output"), + [ + model_signature.FeatureSpec("output_feature_0", model_signature.DataType.INT64, shape=(2,)), + model_signature.FeatureSpec("output_feature_1", model_signature.DataType.INT64, shape=(2,)), + ], + ) + + def test_convert_to_df_numpy_array(self) -> None: + arr1 = np.array([1, 2, 3, 4]) + pd.testing.assert_frame_equal( + model_signature._NumpyArrayHandler.convert_to_df(arr1), + pd.DataFrame([1, 2, 3, 4]), + ) + + arr2 = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) + pd.testing.assert_frame_equal( + model_signature._NumpyArrayHandler.convert_to_df(arr2), + pd.DataFrame([[1, 1], [2, 2], [3, 3], [4, 4]]), + ) + + arr3 = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + pd.testing.assert_frame_equal( + model_signature._NumpyArrayHandler.convert_to_df(arr3), + pd.DataFrame(data={0: [np.array([1, 1]), np.array([3, 3])], 1: [np.array([2, 2]), np.array([4, 4])]}), + ) + class ListOfNumpyArrayHandlerTest(absltest.TestCase): def test_validate_list_of_numpy_array(self) -> None: lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] self.assertFalse(model_signature._ListOfNumpyArrayHandler.can_handle(lt8)) + def test_trunc_np_ndarray(self) -> None: + arrs = [np.array([1] * (model_signature._ListOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 + + for arr in model_signature._ListOfNumpyArrayHandler.truncate(arrs): + np.testing.assert_equal( + np.array([1] * (model_signature._ListOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT)), arr + ) + + arrs = [ + np.array([1]), + np.array([1] * (model_signature._ListOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT - 1)), + ] + + for arr in model_signature._ListOfNumpyArrayHandler.truncate(arrs): + np.testing.assert_equal(np.array([1]), arr) + def test_infer_signature_list_of_numpy_array(self) -> None: arr = np.array([1, 2, 3, 4]) lt = [arr, arr] @@ -359,11 +509,45 @@ def test_infer_signature_list_of_numpy_array(self) -> None: ], ) + def test_convert_to_df_list_of_numpy_array(self) -> None: + arr1 = np.array([1, 2, 3, 4]) + lt = [arr1, arr1] + pd.testing.assert_frame_equal( + model_signature._ListOfNumpyArrayHandler.convert_to_df(lt), + pd.DataFrame([[1, 1], [2, 2], [3, 3], [4, 4]]), + check_names=False, + ) + + arr2 = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) + lt = [arr1, arr2] + pd.testing.assert_frame_equal( + model_signature._ListOfNumpyArrayHandler.convert_to_df(lt), + pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]]), + ) + + arr = np.array([[[1, 1], [2, 2]], [[3, 3], [4, 4]]]) + lt = [arr, arr] + 
pd.testing.assert_frame_equal( + model_signature._ListOfNumpyArrayHandler.convert_to_df(lt), + pd.DataFrame( + data={ + 0: [np.array([1, 1]), np.array([3, 3])], + 1: [np.array([2, 2]), np.array([4, 4])], + 2: [np.array([1, 1]), np.array([3, 3])], + 3: [np.array([2, 2]), np.array([4, 4])], + } + ), + ) + class ListOfBuiltinsHandlerTest(absltest.TestCase): def test_validate_list_builtins(self) -> None: + lt6 = ["Hello", [2, 3]] + with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in data"): + model_signature._ListOfBuiltinHandler.validate(lt6) # type:ignore[arg-type] + lt7 = [[1], [2, 3]] - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Ill-shaped list data"): model_signature._ListOfBuiltinHandler.validate(lt7) lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] @@ -373,27 +557,27 @@ def test_infer_signature_list_builtins(self) -> None: lt1 = [1, 2, 3, 4] self.assertListEqual( model_signature._ListOfBuiltinHandler.infer_signature(lt1, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], ) lt2 = ["a", "b", "c", "d"] self.assertListEqual( model_signature._ListOfBuiltinHandler.infer_signature(lt2, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.STRING)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.STRING)], ) lt3 = [ele.encode() for ele in lt2] self.assertListEqual( model_signature._ListOfBuiltinHandler.infer_signature(lt3, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.BYTES)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.BYTES)], ) lt4 = [[1, 2], [3, 4]] self.assertListEqual( model_signature._ListOfBuiltinHandler.infer_signature(lt4, role="input"), [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), ], ) @@ -401,8 +585,8 @@ def test_infer_signature_list_builtins(self) -> None: self.assertListEqual( model_signature._ListOfBuiltinHandler.infer_signature(lt5, role="input"), # type:ignore[arg-type] [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.DOUBLE), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.DOUBLE), ], ) @@ -410,8 +594,8 @@ def test_infer_signature_list_builtins(self) -> None: self.assertListEqual( model_signature._ListOfBuiltinHandler.infer_signature(lt6, role="input"), [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64, shape=(2,)), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64, shape=(2,)), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64, shape=(2,)), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64, shape=(2,)), ], ) @@ -428,7 +612,7 @@ def tearDownClass(cls) -> None: def test_validate_snowpark_df(self) -> None: schema = spt.StructType([spt.StructField('"a"', spt.VariantType()), spt.StructField('"b"', spt.StringType())]) df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - with 
self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Unsupported data type"): model_signature._SnowparkDataFrameHandler.validate(df) def test_infer_schema_snowpark_df(self) -> None: @@ -442,13 +626,23 @@ def test_infer_schema_snowpark_df(self) -> None: ], ) + schema = spt.StructType([spt.StructField('"""a"""', spt.LongType()), spt.StructField('"b"', spt.StringType())]) + df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) + self.assertListEqual( + model_signature._SnowparkDataFrameHandler.infer_signature(df, role="input"), + [ + model_signature.FeatureSpec('"a"', model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.STRING), + ], + ) + def test_validate_data_with_features(self) -> None: fts = [ model_signature.FeatureSpec("a", model_signature.DataType.INT64), model_signature.FeatureSpec("b", model_signature.DataType.INT64), ] df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) - with self.assertWarns(RuntimeWarning): + with self.assertWarnsRegex(RuntimeWarning, "Nullable column [^\\s]* provided"): model_signature._validate_snowpark_data(df, fts) fts = [ @@ -461,16 +655,16 @@ def test_validate_data_with_features(self) -> None: schema = spt.StructType([spt.StructField('"a"', spt.LongType()), spt.StructField('"b"', spt.IntegerType())]) df = self._session.create_dataframe([[1, 3], [3, 9]], schema) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): model_signature._validate_snowpark_data(df, fts) schema = spt.StructType([spt.StructField('"a1"', spt.LongType()), spt.StructField('"b"', spt.StringType())]) df = self._session.create_dataframe([[1, "snow"], [3, "flake"]], schema) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): model_signature._validate_snowpark_data(df, fts) df = self._session.create_dataframe([{'"a"': 1}, {'"b"': 2}]) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by column"): model_signature._validate_snowpark_data(df, fts) @@ -501,27 +695,27 @@ def test_infer_signature(self) -> None: df = pd.DataFrame([1, 2, 3, 4]) self.assertListEqual( model_signature._infer_signature(df, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], ) arr = np.array([1, 2, 3, 4]) self.assertListEqual( model_signature._infer_signature(arr, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], ) lt1 = [1, 2, 3, 4] self.assertListEqual( model_signature._infer_signature(lt1, role="input"), - [model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64)], + [model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64)], ) lt2 = [[1, 2], [3, 4]] self.assertListEqual( model_signature._infer_signature(lt2, role="input"), [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), ], ) @@ -556,59 +750,257 @@ def test_infer_signature(self) -> None: with 
self.assertRaises(NotImplementedError): model_signature._infer_signature([], role="input") + def test_validate_pandas_df(self) -> None: + fts = [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.INT64), + ] + + model_signature._validate_pandas_df(pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), fts) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df(pd.DataFrame([[2.5, 5], [6.8, 8]], columns=["a", "b"]), fts) + + with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + model_signature._validate_pandas_df(pd.DataFrame([5, 6], columns=["a"]), fts) + + model_signature._validate_pandas_df(pd.DataFrame([5, 6], columns=["a"]), fts[:1]) + + with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): + model_signature._validate_pandas_df(pd.DataFrame([[2, 5], [6, 8]], columns=["c", "d"]), fts) + + with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while list data is provided."): + model_signature._validate_pandas_df( + pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts + ) + + fts = [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2,)), + ] + + model_signature._validate_pandas_df(pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts) + + with self.assertRaisesRegex(ValueError, "Feature is a array type feature while scalar data is provided."): + model_signature._validate_pandas_df(pd.DataFrame([[2, 2.5], [6, 6.8]], columns=["a", "b"]), fts) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, [2.5, 6.8, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df(pd.DataFrame([[1, [2, 5]], [2, [6, 8]]], columns=["a", "b"]), fts) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2, 5])], [2, np.array([6, 8])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature is a array type feature while scalar data is provided."): + model_signature._validate_pandas_df( + pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["b"]), fts[-1:] + ) + + with 
self.assertRaisesRegex(ValueError, "Feature is a array type feature while scalar data is provided."): + model_signature._validate_pandas_df(pd.DataFrame(["a", "b", "c", "d"], columns=["b"]), fts[-1:]) + + fts = [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(-1,)), + ] + + model_signature._validate_pandas_df(pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, [2.5, 6.8, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts + ) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8, 6.8]]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df(pd.DataFrame([[1, [2, 5]], [2, [6, 8]]], columns=["a", "b"]), fts) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8])]], columns=["a", "b"]), fts + ) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts + ) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8, 6.8])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2, 5])], [2, np.array([6, 8])]], columns=["a", "b"]), fts + ) + + fts = [ + model_signature.FeatureSpec("a", model_signature.DataType.INT64), + model_signature.FeatureSpec("b", model_signature.DataType.DOUBLE, shape=(2, 1)), + ] + + model_signature._validate_pandas_df( + pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8]]]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, [[2.5], [6.8]]], [2, [[2.5], [6.8], [6.8]]]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, [2.5, 6.8]], [2, [2.5, 6.8]]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, [[2], [5]]], [2, [[6], [8]]]], columns=["a", "b"]), fts + ) + + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([[2.5], [6.8]])], [2, np.array([[2.5], [6.8]])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([[2.5], [6.8]])], [2, np.array([[2.5], [6.8], [6.8]])]], columns=["a", "b"]), + fts, + ) + + with self.assertRaisesRegex(ValueError, "Feature shape [\\(\\)0-9,\\s-]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([2.5, 6.8])], [2, np.array([2.5, 6.8])]], columns=["a", "b"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([[1, np.array([[2], [5]])], [2, np.array([[6], [8]])]], columns=["a", "b"]), fts + ) + + fts = [model_signature.FeatureSpec("a", model_signature.DataType.STRING)] 
+ model_signature._validate_pandas_df(pd.DataFrame(["a", "b", "c", "d"], columns=["a"]), fts) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df( + pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while list data is provided."): + model_signature._validate_pandas_df(pd.DataFrame(data={"a": [[1, 2]]}), fts) + + with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while array data is provided."): + model_signature._validate_pandas_df(pd.DataFrame(data={"a": [np.array([1, 2])]}), fts) + + fts = [model_signature.FeatureSpec("a", model_signature.DataType.BYTES)] + model_signature._validate_pandas_df( + pd.DataFrame([ele.encode() for ele in ["a", "b", "c", "d"]], columns=["a"]), fts + ) + + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): + model_signature._validate_pandas_df(pd.DataFrame(["a", "b", "c", "d"], columns=["a"]), fts) + + with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while list data is provided."): + model_signature._validate_pandas_df(pd.DataFrame(data={"a": [[1, 2]]}), fts) + + with self.assertRaisesRegex(ValueError, "Feature is a scalar feature while array data is provided."): + model_signature._validate_pandas_df(pd.DataFrame(data={"a": [np.array([1, 2])]}), fts) + + def test_rename_pandas_df(self) -> None: + fts = [ + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), + ] + + df = pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]) + + pd.testing.assert_frame_equal(df, model_signature._rename_pandas_df(df, fts)) + + df = pd.DataFrame([[2, 5], [6, 8]]) + + pd.testing.assert_frame_equal(df, model_signature._rename_pandas_df(df, fts), check_names=False) + pd.testing.assert_index_equal( + pd.Index(["input_feature_0", "input_feature_1"]), model_signature._rename_pandas_df(df, fts).columns + ) + def test_validate_data_with_features(self) -> None: fts = [ - model_signature.FeatureSpec("feature_0", model_signature.DataType.INT64), - model_signature.FeatureSpec("feature_1", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_0", model_signature.DataType.INT64), + model_signature.FeatureSpec("input_feature_1", model_signature.DataType.INT64), ] - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Empty data is found."): model_signature._convert_and_validate_local_data(np.array([]), fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Scalar data is found."): model_signature._convert_and_validate_local_data(np.array(5), fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): model_signature._convert_and_validate_local_data(np.array([[2.5, 5], [6.8, 8]]), fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Un-supported type provided."): model_signature._convert_and_validate_local_data([], fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Inconsistent type of object found in data"): model_signature._convert_and_validate_local_data([1, [1, 1]], fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Ill-shaped list data"): 
model_signature._convert_and_validate_local_data([[1], [1, 1]], fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): model_signature._convert_and_validate_local_data([[2.1, 5.0], [6.8, 8.0]], fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Feature type [^\\s]* is not met by all elements"): model_signature._convert_and_validate_local_data(pd.DataFrame([[2.5, 5], [6.8, 8]]), fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Data does not have the same number of features as signature"): model_signature._convert_and_validate_local_data(pd.DataFrame([5, 6]), fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "Data does not have the same number of features as signature."): model_signature._convert_and_validate_local_data(np.array([5, 6]), fts) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, "feature [^\\s]* does not exist in data."): model_signature._convert_and_validate_local_data(pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), fts) df = model_signature._convert_and_validate_local_data(np.array([5, 6]), fts[:1]) - self.assertListEqual(df.columns.to_list(), ["feature_0"]) + self.assertListEqual(df.columns.to_list(), ["input_feature_0"]) df = model_signature._convert_and_validate_local_data(pd.DataFrame([5, 6]), fts[:1]) - self.assertListEqual(df.columns.to_list(), ["feature_0"]) + self.assertListEqual(df.columns.to_list(), ["input_feature_0"]) df = model_signature._convert_and_validate_local_data([5, 6], fts[:1]) - self.assertListEqual(df.columns.to_list(), ["feature_0"]) + self.assertListEqual(df.columns.to_list(), ["input_feature_0"]) df = model_signature._convert_and_validate_local_data(np.array([[2, 5], [6, 8]]), fts) - self.assertListEqual(df.columns.to_list(), ["feature_0", "feature_1"]) + self.assertListEqual(df.columns.to_list(), ["input_feature_0", "input_feature_1"]) df = model_signature._convert_and_validate_local_data(pd.DataFrame([[2, 5], [6, 8]]), fts) - self.assertListEqual(df.columns.to_list(), ["feature_0", "feature_1"]) + self.assertListEqual(df.columns.to_list(), ["input_feature_0", "input_feature_1"]) df = model_signature._convert_and_validate_local_data( pd.DataFrame([[2, 5], [6, 8]], columns=["a", "b"]), @@ -620,7 +1012,7 @@ def test_validate_data_with_features(self) -> None: self.assertListEqual(df.columns.to_list(), ["a", "b"]) df = model_signature._convert_and_validate_local_data([[2, 5], [6, 8]], fts) - self.assertListEqual(df.columns.to_list(), ["feature_0", "feature_1"]) + self.assertListEqual(df.columns.to_list(), ["input_feature_0", "input_feature_1"]) if __name__ == "__main__": diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index def902f2..4acbd920 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -4,7 +4,7 @@ import numpy.typing as npt from typing_extensions import NotRequired, TypeAlias -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base if TYPE_CHECKING: import numpy as np @@ -78,7 +78,21 @@ class DeployOptions(TypedDict): + """Common Options for deploying to Snowflake. + + output_with_input_features: Whether or not preserve the input columns in the output when predicting. + Defaults to False. + keep_order: Whether or not preserve the row order when predicting. Only available for dataframe has fewer than 2**64 + rows. 
Defaults to True. + + Internal-only options + _snowml_wheel_path: Local or in-stage path to snowml wheel file. If deployed permanently, it needs to be a stage + path where the stage is non-temporary, internal stage. + """ + _snowml_wheel_path: NotRequired[str] + output_with_input_features: NotRequired[bool] + keep_order: NotRequired[bool] class WarehouseDeployOptions(DeployOptions): @@ -87,17 +101,20 @@ class WarehouseDeployOptions(DeployOptions): permanent_udf_stage_location: A Snowflake stage option where the UDF should be persisted. If specified, the model will be deployed as a permanent UDF, otherwise temporary. relax_version: Whether or not relax the version constraints of the dependencies if unresolvable. Defaults to False. - keep_order: Whether or not preserve the row order when predicting. Only available for dataframe has fewer than 2**64 - rows. Defaults to True. """ permanent_udf_stage_location: NotRequired[str] relax_version: NotRequired[bool] - keep_order: NotRequired[bool] class ModelSaveOption(TypedDict): - ... + """Options for saving the model. + + allow_overwritten_stage_file: Flag to indicate when saving the model as a stage file, whether overwriting existed + file is allowed. Default to False. + """ + + allow_overwritten_stage_file: NotRequired[bool] class CustomModelSaveOption(TypedDict): diff --git a/snowflake/ml/registry/BUILD.bazel b/snowflake/ml/registry/BUILD.bazel index 77790306..46120bd1 100644 --- a/snowflake/ml/registry/BUILD.bazel +++ b/snowflake/ml/registry/BUILD.bazel @@ -11,11 +11,12 @@ py_library( "//snowflake/ml/_internal/utils:formatting", "//snowflake/ml/_internal/utils:query_result_checker", "//snowflake/ml/_internal/utils:uri", + "//snowflake/ml/_internal/utils:identifier", "//snowflake/ml/_internal:file_utils", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/model:_model", "//snowflake/ml/model:_deployer", - "//snowflake/ml/framework:framework" + "//snowflake/ml/sklearn/framework:framework" ], ) diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index 255ac47e..97c533ab 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -13,8 +13,12 @@ from snowflake import connector, snowpark from snowflake.ml._internal import file_utils, telemetry -from snowflake.ml._internal.utils import formatting, query_result_checker, uri -from snowflake.ml.framework import base +from snowflake.ml._internal.utils import ( + formatting, + identifier, + query_result_checker, + uri, +) from snowflake.ml.model import ( _deployer, _model as model_api, @@ -22,6 +26,7 @@ type_hints as model_types, ) from snowflake.ml.registry import _schema +from snowflake.ml.sklearn.framework import base if TYPE_CHECKING: import pandas as pd @@ -51,6 +56,7 @@ _TELEMETRY_SUBPROJECT = "ModelRegistry" +@snowpark._internal.utils.private_preview(version="0.2.0") def create_model_registry( *, session: snowpark.Session, @@ -156,7 +162,11 @@ def _create_registry_tables( metadata_table_name: Name for the metadata table used by the model registry. statement_params: Function usage statement parameters used in sql query executions. """ - fully_qualified_schema_name = f'"{database_name}"."{schema_name}"' + fully_qualified_schema_name = ( + f"{identifier.quote_name_without_upper_casing(database_name)}" + + "." 
+ + f"{identifier.quote_name_without_upper_casing(schema_name)}" + ) fully_qualified_registry_table_name = f'{fully_qualified_schema_name}."{registry_table_name}"' fully_qualified_metadata_table_name = f'{fully_qualified_schema_name}."{metadata_table_name}"' @@ -206,7 +216,11 @@ def _create_registry_views( metadata_table_name: Name for the metadata table used by the model registry. statement_params: Function usage statement parameters used in sql query executions. """ - fully_qualified_schema_name = f'"{database_name}"."{schema_name}"' + fully_qualified_schema_name = ( + f"{identifier.quote_name_without_upper_casing(database_name)}" + + "." + + f"{identifier.quote_name_without_upper_casing(schema_name)}" + ) # From the documentation: Each DDL statement executes as a separate transaction. Races should not be an issue. # https://docs.snowflake.com/en/sql-reference/transactions.html#ddl @@ -236,7 +250,9 @@ def _create_registry_views( ) session.sql(sql).collect(statement_params=statement_params) metadata_view_names.append(view_name) - metadata_select_fields.append(f'"{view_name}".{attribute_name} AS {attribute_name}') + metadata_select_fields.append( + f"{identifier.quote_name_without_upper_casing(view_name)}.{attribute_name} AS {attribute_name}" + ) # Create a special view for the registration timestamp. attribute_name = _METADATA_ATTRIBUTE_REGISTRATION @@ -253,7 +269,9 @@ def _create_registry_views( ) session.sql(create_registration_view_sql).collect(statement_params=statement_params) metadata_view_names.append(view_name) - metadata_select_fields.append(f'"{view_name}".{final_attribute_name} AS {final_attribute_name}') + metadata_select_fields.append( + f"{identifier.quote_name_without_upper_casing(view_name)}.{final_attribute_name} AS {final_attribute_name}" + ) metadata_views_join = " ".join( [ @@ -343,15 +361,31 @@ def _get_new_unique_identifier(self) -> str: def _fully_qualified_registry_table_name(self) -> str: """Get the fully qualified name to the current registry table.""" - return f'"{self._name}"."{self._schema}"."{self._registry_table}"' + return ( + f"{identifier.quote_name_without_upper_casing(self._name)}" + + "." + + f"{identifier.quote_name_without_upper_casing(self._schema)}" + + "." + + f"{identifier.quote_name_without_upper_casing(self._registry_table)}" + ) def _fully_qualified_metadata_table_name(self) -> str: """Get the fully qualified name to the current metadata table.""" - return f'"{self._name}"."{self._schema}"."{self._metadata_table}"' + return ( + f"{identifier.quote_name_without_upper_casing(self._name)}" + + "." + + f"{identifier.quote_name_without_upper_casing(self._schema)}" + + "." + + f"{identifier.quote_name_without_upper_casing(self._metadata_table)}" + ) def _fully_qualified_schema_name(self) -> str: """Get the fully qualified name to the current registry schema.""" - return f'"{self._name}"."{self._schema}"' + return ( + f"{identifier.quote_name_without_upper_casing(self._name)}" + + "." + + f"{identifier.quote_name_without_upper_casing(self._schema)}" + ) def _insert_table_entry(self, *, table: str, columns: Dict[str, Any]) -> List[snowpark.Row]: """Insert an entry into an internal Model Registry table. 
@@ -709,6 +743,68 @@ def _get_model_path( raise NotImplementedError("Restoring models consisting of multiple files is currently not supported.") return f"{self._fully_qualified_schema_name()}.{model_file_list[0].name}" + def _log_model_path( + self, + model_name: str, + model_version: str, + *, + path: str, + type: str, + description: Optional[str] = None, + tags: Optional[Dict[Any, Any]] = None, + ) -> str: + """Uploads and registers a model to the Model Registry from a local file path. + + If `path` is a directory, all files will be uploaded recursively, preserving the relative directory structure. + Symbolic links will be followed. + + NOTE: If any symlinks under `path` point to a parent directory, this can lead to infinite recursion. + + Args: + model_name: The given name for the model. + model_version: Version string to be set for the model. + path: Local file path to be uploaded. + type: Type of the model to be added. + description: A description for the model. The description can be changed later. + tags: String-to-string dictionary of tag names and values to be set for the model. + + Returns: + String of the auto-generated unique model identifier. + """ + self._model_identifier_is_nonempty_or_raise(model_name, model_version) + id = self._get_new_unique_identifier() + + # Copy model from local disk to remote stage. + fully_qualified_model_stage_name = self._prepare_model_stage(model_id=id) + + # Check if directory or file and adapt accordingly. + # TODO: Unify and be explicit about compression for both file and directory. + if os.path.isfile(path): + self._session.file.put(path, f"{fully_qualified_model_stage_name}/data") + elif os.path.isdir(path): + with file_utils.zip_file_or_directory_to_stream(path, path) as input_stream: + self._session._conn.upload_stream( + input_stream=input_stream, + stage_location=fully_qualified_model_stage_name, + dest_filename=f"{os.path.basename(path)}.zip", + dest_prefix="", + source_compression="DEFLATE", + compress_data=False, + overwrite=True, + is_in_udf=True, + ) + self._register_model_with_id( + model_name=model_name, + model_version=model_version, + model_id=id, + type=type, + uri=uri.get_uri_from_snowflake_stage_path(fully_qualified_model_stage_name), + description=description, + tags=tags, + ) + + return id + def _register_model_with_id( self, model_name: str, @@ -785,6 +881,7 @@ def _register_model_with_id( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def list_models(self) -> snowpark.DataFrame: """Lists models contained in the registry. @@ -804,6 +901,7 @@ def list_models(self) -> snowpark.DataFrame: project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def set_tag( self, model_name: str, @@ -833,6 +931,7 @@ def set_tag( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def remove_tag(self, model_name: str, model_version: str, tag_name: str) -> None: """Remove target model tag.
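The directory branch of _log_model_path above relies on file_utils.zip_file_or_directory_to_stream to turn a model directory into a single uploadable archive. A rough, self-contained sketch of that idea, not the actual helper, following symlinks as the docstring describes:

import io
import os
import zipfile


def zip_directory_to_stream(path: str) -> io.BytesIO:
    """Zip a local directory into an in-memory stream, preserving relative paths."""
    stream = io.BytesIO()
    root = os.path.abspath(path)
    with zipfile.ZipFile(stream, "w", zipfile.ZIP_DEFLATED) as zf:
        # followlinks=True mirrors "symbolic links will be followed", which is also why
        # a symlink pointing at a parent directory can recurse forever.
        for dirpath, _dirnames, filenames in os.walk(root, followlinks=True):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                zf.write(full_path, arcname=os.path.relpath(full_path, root))
    stream.seek(0)
    return stream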
@@ -860,6 +959,7 @@ def remove_tag(self, model_name: str, model_version: str, tag_name: str) -> None project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def has_tag( self, model_name: str, @@ -891,6 +991,7 @@ def has_tag( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def get_tag_value(self, model_name: str, model_version: str, tag_name: str) -> Any: """Return the value of the tag for the model. @@ -910,6 +1011,7 @@ def get_tag_value(self, model_name: str, model_version: str, tag_name: str) -> A project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def get_tags(self, model_name: str = None, model_version: str = None) -> Dict[str, Any]: """Get all tags and values stored for the target model. @@ -936,6 +1038,7 @@ def get_tags(self, model_name: str = None, model_version: str = None) -> Dict[st project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def get_model_description(self, model_name: str, model_version: str) -> Optional[str]: """Get the description of the model. @@ -955,6 +1058,7 @@ def get_model_description(self, model_name: str, model_version: str) -> Optional project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def set_model_description( self, model_name: str, @@ -976,6 +1080,7 @@ def set_model_description( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def get_history(self) -> snowpark.DataFrame: """Return a dataframe with the history of operations performed on the model registry. @@ -1003,6 +1108,7 @@ def get_history(self) -> snowpark.DataFrame: project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def get_model_history( self, model_name: str, @@ -1026,6 +1132,7 @@ def get_model_history( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def set_metric( self, model_name: str, @@ -1055,6 +1162,7 @@ def set_metric( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def remove_metric( self, model_name: str, @@ -1090,6 +1198,7 @@ def remove_metric( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def has_metric(self, model_name: str, model_version: str, metric_name: str) -> bool: """Check if a model has a metric with the given name. @@ -1108,7 +1217,8 @@ def has_metric(self, model_name: str, model_version: str, metric_name: str) -> b project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) - def get_metric_value(self, model_name: str, model_version: str, metric_name: str) -> Optional[object]: + @snowpark._internal.utils.private_preview(version="0.2.0") + def get_metric_value(self, model_name: str, model_version: str, metric_name: str) -> object: """Return the value of the given metric for the model. The returned value can be None. If the metric does not exist, KeyError will be raised. 
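The metric accessors above document a specific contract: get_metric_value returns whatever was stored, which may be None, and raises KeyError when the metric was never set. A tiny in-memory model of that contract, purely for illustration and not the registry implementation, which is backed by the registry tables:

from typing import Dict, Tuple

_metrics: Dict[Tuple[str, str], Dict[str, object]] = {}


def set_metric(model_name: str, model_version: str, metric_name: str, value: object) -> None:
    _metrics.setdefault((model_name, model_version), {})[metric_name] = value


def get_metric_value(model_name: str, model_version: str, metric_name: str) -> object:
    # KeyError propagates when the model or metric is unknown, matching the docstring above.
    return _metrics[(model_name, model_version)][metric_name]


set_metric("MY_MODEL", "1", "test_accuracy", 0.94)
assert get_metric_value("MY_MODEL", "1", "test_accuracy") == 0.94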
@@ -1127,6 +1237,7 @@ def get_metric_value(self, model_name: str, model_version: str, metric_name: str project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def get_metrics(self, model_name: str, model_version: str) -> Dict[str, object]: """Get all metrics and values stored for the given model. @@ -1154,6 +1265,7 @@ def get_metrics(self, model_name: str, model_version: str) -> Dict[str, object]: project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def log_model( self, model_name: str, @@ -1259,72 +1371,7 @@ def log_model( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) - def _log_model_path( - self, - model_name: str, - model_version: str, - *, - path: str, - type: str, - description: Optional[str] = None, - tags: Optional[Dict[Any, Any]] = None, - ) -> str: - """Uploads and register a model to the Model Registry from a local file path. - - If `path` is a directory all files will be uploaded recursively, preserving the relative directory structure. - Symbolic links will be followed. - - NOTE: If any symlinks under `path` point to a parent directory, this can lead to infinite recursion. - - Args: - model_name: The given name for the model. - model_version: Version string to be set for the model. - path: Local file path to be uploaded. - type: Type of the model to be added. - description: A desription for the model. The description can be changed later. - tags: string-to-string dictonary of tag names and values to be set for the model. - - Returns: - String of the auto-generate unique model identifier. - """ - self._model_identifier_is_nonempty_or_raise(model_name, model_version) - id = self._get_new_unique_identifier() - - # Copy model from local disk to remote stage. - fully_qualified_model_stage_name = self._prepare_model_stage(model_id=id) - - # Check if directory or file and adapt accordingly. - # TODO: Unify and explicit about compression for both file and directory. - if os.path.isfile(path): - self._session.file.put(path, f"{fully_qualified_model_stage_name}/data") - elif os.path.isdir(path): - with file_utils.zip_file_or_directory_to_stream(path, path) as input_stream: - self._session._conn.upload_stream( - input_stream=input_stream, - stage_location=fully_qualified_model_stage_name, - dest_filename=f"{os.path.basename(path)}.zip", - dest_prefix="", - source_compression="DEFLATE", - compress_data=False, - overwrite=True, - is_in_udf=True, - ) - self._register_model_with_id( - model_name=model_name, - model_version=model_version, - model_id=id, - type=type, - uri=uri.get_uri_from_snowflake_stage_path(fully_qualified_model_stage_name), - description=description, - tags=tags, - ) - - return id - - @telemetry.send_api_usage_telemetry( - project=_TELEMETRY_PROJECT, - subproject=_TELEMETRY_SUBPROJECT, - ) + @snowpark._internal.utils.private_preview(version="0.2.0") def load_model(self, model_name: str, model_version: str) -> Any: """Loads the model with the given (model_name + model_version) from the registry into memory. 
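Each public registry method above, load_model included, now gains @snowpark._internal.utils.private_preview(version="0.2.0") underneath the telemetry decorator. A hypothetical sketch of what a decorator in that style might do, namely warn callers that the API is in preview; the real behavior is defined in Snowpark, not in this patch:

import functools
import warnings


def private_preview(version: str):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(
                f"{func.__qualname__} is a private preview API (since version {version}) "
                "and may change without notice.",
                stacklevel=2,
            )
            return func(*args, **kwargs)

        return wrapper

    return decorator


@private_preview(version="0.2.0")
def list_models() -> list:
    # Stand-in function to show how the decorator stacks onto a public API.
    return []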
@@ -1347,7 +1394,7 @@ def load_model(self, model_name: str, model_version: str) -> Any: with zipfile.ZipFile(local_path, "r") as myzip: if len(myzip.namelist()) > 1: myzip.extractall(extracted_dir) - restored_model, _meta = model_api.load_model(extracted_dir) + restored_model, _meta = model_api.load_model(model_dir_path=extracted_dir) is_native_model_format = True except TypeError: pass @@ -1364,6 +1411,7 @@ def load_model(self, model_name: str, model_version: str) -> Any: project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def deploy( self, model_name: str, @@ -1416,26 +1464,7 @@ def deploy( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, ) - def predict(self, deployment_name: str, data: Any) -> "pd.DataFrame": - """Predict using the deployed model in Snowflake. - - Args: - deployment_name: name of the generated UDF. - data: Data to run predict. - - Raises: - ValueError: The deployment with given name haven't been deployed. - - Returns: - A dataframe containing the result of prediction. - """ - - di = self._deploy_api.get_deployment(name=deployment_name) - if di is None: - raise ValueError(f"The deployment with name {deployment_name} haven't been deployed") - - return self._deploy_api.predict(di["name"], data) - + @snowpark._internal.utils.private_preview(version="0.2.0") def delete_model( self, model_name: str, @@ -1628,3 +1657,28 @@ def __init__( setattr(self.__class__.__dict__[name], "__doc__", docstring) # NoQA setattr(self.__class__, "init_complete", True) # NoQA + + @telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, + ) + @snowpark._internal.utils.private_preview(version="0.2.0") + def predict(self, deployment_name: str, data: Any) -> "pd.DataFrame": + """Predict using the deployed model in Snowflake. + + Args: + deployment_name: name of the generated UDF. + data: Data to run predict. + + Raises: + ValueError: The deployment with given name haven't been deployed. + + Returns: + A dataframe containing the result of prediction. + """ + + di = self._registry._deploy_api.get_deployment(name=deployment_name) + if di is None: + raise ValueError(f"The deployment with name {deployment_name} haven't been deployed") + + return self._registry._deploy_api.predict(di["name"], data) diff --git a/snowflake/ml/registry/model_registry_test.py b/snowflake/ml/registry/model_registry_test.py index 1a470b23..dcaa9b70 100644 --- a/snowflake/ml/registry/model_registry_test.py +++ b/snowflake/ml/registry/model_registry_test.py @@ -8,7 +8,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml._internal.utils import formatting +from snowflake.ml._internal.utils import formatting, identifier from snowflake.ml.registry import model_registry from snowflake.ml.test_utils import mock_data_frame, mock_session @@ -16,7 +16,13 @@ _SCHEMA_NAME = "PUBLIC" _REGISTRY_TABLE_NAME = "MODELS" _METADATA_TABLE_NAME = "METADATA" -_FULLY_QUALIFIED_REGISTRY_TABLE_NAME = f'"{_DATABASE_NAME}"."{_SCHEMA_NAME}"."{_REGISTRY_TABLE_NAME}"' +_FULLY_QUALIFIED_REGISTRY_TABLE_NAME = ( + f"{identifier.quote_name_without_upper_casing(_DATABASE_NAME)}" + + "." + + f"{identifier.quote_name_without_upper_casing(_SCHEMA_NAME)}" + + "." 
+ + f"{identifier.quote_name_without_upper_casing(_REGISTRY_TABLE_NAME)}" +) _REGISTRY_SCHEMA_STRING = ", ".join([f"{k} {v}" for k, v in _REGISTRY_TABLE_SCHEMA.items()]) _METADATA_INSERT_COLUMNS_STRING = ",".join(filter(lambda x: x != "SEQUENCE_ID", _METADATA_TABLE_SCHEMA.keys())) _METADATA_SCHEMA_STRING = ", ".join( @@ -665,7 +671,13 @@ def test_log_model_path_file(self) -> None: mock_sp_file_operation = absltest.mock.Mock() self._session.__setattr__("file", mock_sp_file_operation) - expected_stage_path = f'"{_DATABASE_NAME}"."{_SCHEMA_NAME}".SNOWML_MODEL_{expected_stage_postfix}/data' + expected_stage_path = ( + f"{identifier.quote_name_without_upper_casing(_DATABASE_NAME)}" + + "." + + f"{identifier.quote_name_without_upper_casing(_SCHEMA_NAME)}" + + "." + + f"SNOWML_MODEL_{expected_stage_postfix}/data" + ) with absltest.mock.patch("model_registry.os.path.isfile", return_value=True) as mock_isfile: with absltest.mock.patch.object( diff --git a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb index 69c31e59..236f4571 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb @@ -168,7 +168,7 @@ "metadata": {}, "outputs": [], "source": [ - "SNOW_ML_WHEEL_LOCAL_PATH = \"~/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-0.3.2-py3-none-any.whl\"" + "SNOW_ML_WHEEL_LOCAL_PATH = \"~/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-0.3.3-py3-none-any.whl\"" ] }, { @@ -364,6 +364,17 @@ "Aso, you have to provide a sample input data so that we could infer the model signature for you, or you can specify the model signature manually." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "68705420", + "metadata": {}, + "outputs": [], + "source": [ + "SVC_MODEL_NAME=\"SIMPLE_SVC_MODEL\"\n", + "SVC_MODEL_VERSION=\"2\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -373,15 +384,15 @@ "source": [ "# A name and model tags can be added to the model at registration time.\n", "model_id = registry.log_model(\n", - " model_name=\"SIMPLE_SVC_MODEL\",\n", - " model_version=\"1\",\n", + " model_name=SVC_MODEL_NAME,\n", + " model_version=SVC_MODEL_VERSION,\n", " model=clf,\n", " tags={\"stage\": \"testing\", \"classifier_type\": \"svm.SVC\", \"svc_gamma\": svc_gamma, \"svc_C\": svc_C},\n", " sample_input_data=test_features[:10],\n", ")\n", "\n", "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, model_name=\"SIMPLE_SVC_MODEL\", model_version=\"1\")\n", + "model = model_registry.ModelReference(registry=registry, model_name=SVC_MODEL_NAME, model_version=SVC_MODEL_VERSION)\n", "print(\"Registered new model:\", model_id)" ] }, @@ -415,7 +426,7 @@ "registry = model_registry.ModelRegistry(\n", " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=\"SIMPLE_SVC_MODEL\", model_version=\"1\")\n", + "model = model_registry.ModelReference(registry=registry, model_name=SVC_MODEL_NAME, model_version=SVC_MODEL_VERSION)\n", "restored_clf = model.load_model()\n", "\n", "restored_prediction = restored_clf.predict(test_features)\n", @@ -480,7 +491,7 @@ "registry = model_registry.ModelRegistry(\n", " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", ")\n", - "model = 
model_registry.ModelReference(registry=registry, model_name=\"SIMPLE_SVC_MODEL\", model_version=\"1\")\n", + "model = model_registry.ModelReference(registry=registry, model_name=SVC_MODEL_NAME, model_version=SVC_MODEL_VERSION)\n", "model.deploy(\n", " deployment_name=\"svc_model_predict\",\n", " target_method=\"predict\",\n", @@ -495,7 +506,7 @@ "metadata": {}, "outputs": [], "source": [ - "remote_prediction = registry.predict(deployment_name=\"svc_model_predict\", data=test_features)\n", + "remote_prediction = model.predict(deployment_name=\"svc_model_predict\", data=test_features)\n", "\n", "print(\"Remote prediction:\", remote_prediction[:10])\n", "\n", @@ -532,7 +543,7 @@ "metadata": {}, "outputs": [], "source": [ - "remote_prediction_proba = registry.predict(deployment_name=\"svc_model_predict_proba\", data=test_features)\n", + "remote_prediction_proba = model.predict(deployment_name=\"svc_model_predict_proba\", data=test_features)\n", "\n", "print(\"Remote prediction:\", remote_prediction_proba[:10])\n", "\n", @@ -689,6 +700,17 @@ "Here, how to specify dependencies and model signature manually is shown." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c22ec2f", + "metadata": {}, + "outputs": [], + "source": [ + "GPT2_MODEL_NAME = \"GPT2_MODEL\"\n", + "GPT2_MODEL_VERSION = \"2\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -699,8 +721,8 @@ "from snowflake.ml.model import model_signature\n", "\n", "model_id_gpt = registry.log_model(\n", - " model_name=\"GPT2_MODEL\",\n", - " model_version=\"1\",\n", + " model_name=GPT2_MODEL_NAME,\n", + " model_version=GPT2_MODEL_VERSION,\n", " model=gpt_model,\n", " conda_dependencies=[\"tensorflow\", \"transformers\"],\n", " signatures={\n", @@ -711,7 +733,7 @@ " },\n", ")\n", "\n", - "gpt_model = model_registry.ModelReference(registry=registry, model_name=\"GPT2_MODEL\", model_version=\"1\")\n", + "gpt_model = model_registry.ModelReference(registry=registry, model_name=GPT2_MODEL_NAME, model_version=GPT2_MODEL_VERSION)\n", "print(\"Registered new model:\", model_id_gpt)" ] }, @@ -746,8 +768,8 @@ ")\n", "gpt_model = model_registry.ModelReference(\n", " registry=registry,\n", - " model_name=\"GPT2_MODEL\",\n", - " model_version=\"1\",\n", + " model_name=GPT2_MODEL_NAME,\n", + " model_version=GPT2_MODEL_VERSION,\n", ")\n", "gpt_model.deploy(\n", " deployment_name=\"gpt_model_predict\",\n", @@ -763,7 +785,7 @@ "metadata": {}, "outputs": [], "source": [ - "res = registry.predict(deployment_name=\"gpt_model_predict\", data=pd.DataFrame({\"input\":[\"Hello, are you GPT?\"]}))" + "res = gpt_model.predict(deployment_name=\"gpt_model_predict\", data=pd.DataFrame({\"input\":[\"Hello, are you GPT?\"]}))" ] }, { @@ -961,6 +983,17 @@ "### Train an XGBoost model" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea1e3bee", + "metadata": {}, + "outputs": [], + "source": [ + "XGB_MODEL_NAME = \"XGB_MODEL_KDDCUP99\"\n", + "XGB_MODEL_VERSION = \"2\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -973,11 +1006,7 @@ "regressor = xgboost.XGBClassifier(objective=\"multi:softprob\", n_estimators=500, reg_lambda=1, gamma=0, max_depth=5)\n", "kddcup99_pd_df_train = kddcup99_sp_df_train.to_pandas()\n", "regressor.fit(\n", - " kddcup99_pd_df_train.drop(\n", - " columns=[\n", - " col_name for col_name in kddcup99_pd_df_train.columns if col_name.startswith(\"labels\")\n", - " ] # Since there is a bug in OrdinalEncoder's output\n", - " ),\n", + " kddcup99_pd_df_train.drop(columns=[\"labels\"]),\n", " 
kddcup99_pd_df_train[\"labels\"],\n", ")" ] @@ -1005,16 +1034,14 @@ ")\n", "# A name and model tags can be added to the model at registration time.\n", "model_id_xgb = registry.log_model(\n", - " model_name=\"XGB_MODEL_KDDCUP99\",\n", - " model_version=\"1\",\n", + " model_name=XGB_MODEL_NAME,\n", + " model_version=XGB_MODEL_VERSION,\n", " model=regressor,\n", - " sample_input_data=kddcup99_sp_df_train.limit(10).drop(\n", - " *[col_name for col_name in kddcup99_sp_df_train.columns if col_name.startswith('\"labels')]\n", - " ),\n", + " sample_input_data=kddcup99_sp_df_train.drop('\"labels\"'),\n", ")\n", "\n", "# The object API can be used to reference a model after creation.\n", - "xgb_model = model_registry.ModelReference(registry=registry, model_name=\"XGB_MODEL_KDDCUP99\", model_version=\"1\")\n", + "xgb_model = model_registry.ModelReference(registry=registry, model_name=XGB_MODEL_NAME, model_version=XGB_MODEL_VERSION)\n", "print(\"Registered new model:\", model_id_xgb)" ] }, @@ -1039,8 +1066,8 @@ ")\n", "xgb_model = model_registry.ModelReference(\n", " registry=registry,\n", - " model_name=\"XGB_MODEL_KDDCUP99\",\n", - " model_version=\"1\",\n", + " model_name=XGB_MODEL_NAME,\n", + " model_version=XGB_MODEL_VERSION,\n", ")\n", "xgb_model.deploy(\n", " deployment_name=\"xgb_model_predict\",\n", @@ -1069,7 +1096,7 @@ "metadata": {}, "outputs": [], "source": [ - "sp_res = registry.predict(deployment_name=\"xgb_model_predict\", data=kddcup99_sp_df_test)\n", + "sp_res = xgb_model.predict(deployment_name=\"xgb_model_predict\", data=kddcup99_sp_df_test)\n", "sp_res.show()" ] }, @@ -1112,7 +1139,12 @@ "outputs": [], "source": [ "registry._session = another_session # Since permanent deployment managing has not been finished in registry.\n", - "sp_res = registry.predict(\n", + "xgb_model = model_registry.ModelReference(\n", + " registry=registry,\n", + " model_name=XGB_MODEL_NAME,\n", + " model_version=XGB_MODEL_VERSION,\n", + ")\n", + "sp_res = xgb_model.predict(\n", " deployment_name=\"xgb_model_predict\", data=another_session.create_dataframe(kddcup99_sp_df_test.to_pandas())\n", ")\n", "sp_res.show()" diff --git a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb new file mode 100644 index 00000000..3f06233d --- /dev/null +++ b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb @@ -0,0 +1,2325 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5de3eb26", + "metadata": {}, + "source": [ + "# Model Packaging Example" + ] + }, + { + "cell_type": "markdown", + "id": "197efd00", + "metadata": {}, + "source": [ + "## Before Everything" + ] + }, + { + "cell_type": "markdown", + "id": "6ce97b36", + "metadata": {}, + "source": [ + "### Install `snowflake-ml-python` locally" + ] + }, + { + "cell_type": "markdown", + "id": "1117c596", + "metadata": {}, + "source": [ + "Before `snowflake-ml-python` is publicly available, you have to install from wheel file. Once it is ready, you could install them like other packages in PIP or conda." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "da314158", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install snowflake_ml_python-0.3.2-py3-none-any.whl\n", + "\n", + "# Snowpark Connector, Snowpark Library, Session\n", + "import snowflake.connector\n", + "import snowflake.snowpark\n", + "import snowflake.ml.preprocessing as snowml\n", + "from snowflake.snowpark import Session\n", + "from snowflake.snowpark.version import VERSION\n", + "from snowflake.ml.utils import connection_params" + ] + }, + { + "cell_type": "markdown", + "id": "285c1b29", + "metadata": {}, + "source": [ + "Notice: It is suggested to use a pure-pip environment or an empty conda environment when you try this. If you want to install SnowML into a conda environment that already has other packages, it is suggested that you install all of its requirements first and then install `snowflake-ml-python` with the `--no-deps` flag." + ] + }, + { + "cell_type": "markdown", + "id": "99e58d8c", + "metadata": {}, + "source": [ + "### Setup Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "afd16ff5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d609ff44", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Scale cell width with the browser window to accommodate .show() commands for wider tables.\n", + "from IPython.display import display, HTML\n", + "\n", + "display(HTML(\"\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1ac32c6f", + "metadata": {}, + "source": [ + "### Start Snowpark Session\n", + "\n", + "To avoid exposing credentials in GitHub, we use a small utility `SnowflakeLoginOptions`. It allows you to store your default credentials in `~/.snowsql/config` in the following format:\n", + "```\n", + "[connections]\n", + "accountname = # Account identifier to connect to Snowflake.\n", + "username = # User name in the account. Optional.\n", + "password = # User password. Optional.\n", + "dbname = # Default database. Optional.\n", + "schemaname = # Default schema. Optional.\n", + "warehousename = # Default warehouse. Optional.\n", + "#rolename = # Default role. Optional.\n", + "#authenticator = # Authenticator: 'snowflake', 'externalbrowser', etc\n", + "```\n", + "Please follow [this](https://docs.snowflake.com/en/user-guide/snowsql-start.html#configuring-default-connection-settings) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b2efc0a8", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n", + "from snowflake.snowpark import Session\n", + "\n", + "session = Session.builder.configs(SnowflakeLoginOptions()).create()" + ] + }, + { + "cell_type": "markdown", + "id": "e2fcbe4a", + "metadata": {}, + "source": [ + "### Make `snowflake-ml-python` available to the models you deploy" + ] + }, + { + "cell_type": "markdown", + "id": "671a7710", + "metadata": {}, + "source": [ + "Unfortunately, since `snowflake-ml-python` is not available in the Anaconda channel yet, we have to upload it manually so it can be used when the model gets deployed to Snowflake. To avoid uploading it again and again, we can set up a temporary stage and put the wheel file there once."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5eae711f", + "metadata": {}, + "outputs": [], + "source": [ + "SNOW_ML_WHEEL_LOCAL_PATH = \"~/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-0.3.2-py3-none-any.whl\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6fcececa", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "\n", + "def upload_snowml_to_tmp_stage(session: Session, wheel_path: str) -> str:\n", + " \"\"\"Upload model module of snowml to tmp stage.\n", + "\n", + " Args:\n", + " session: Snowpark session.\n", + " wheel_path: Path to the local SnowML wheel file.\n", + "\n", + " Returns:\n", + " The stage path to uploaded snowml.zip file.\n", + " \"\"\"\n", + " tmp_stage = session.get_session_stage()\n", + " _ = session.file.put(wheel_path, tmp_stage, auto_compress=False, overwrite=True)\n", + " whl_filename = os.path.basename(wheel_path)\n", + " return f\"{tmp_stage}/{whl_filename}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "90ea99cc", + "metadata": {}, + "outputs": [], + "source": [ + "SNOW_ML_WHEEL_STAGE_PATH = upload_snowml_to_tmp_stage(session, SNOW_ML_WHEEL_LOCAL_PATH)" + ] + }, + { + "cell_type": "markdown", + "id": "dfa9ab88", + "metadata": {}, + "source": [ + "### Open/Create Model Registry" + ] + }, + { + "cell_type": "markdown", + "id": "b0a0c8a8", + "metadata": {}, + "source": [ + "A model registry needs to be created before it can be used. The creation will create a new database in the current account so the active role needs to have permissions to create a database. After the first creation, the model registry can be opened without the need to create it again." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a95e3431", + "metadata": {}, + "outputs": [], + "source": [ + "REGISTRY_DATABASE_NAME = \"TEMP\"\n", + "REGISTRY_SCHEMA_NAME = \"WZHAO\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7fff21bc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:The database TEMP already exists. Skipping creation.\n", + "WARNING:absl:The schmea \"TEMP\".\"WZHAO\" already exists. Skipping creation.\n", + "WARNING:absl:The registry table \"TEMP\".\"WZHAO\".\"MODELS\" already exists. Skipping creation.\n", + "WARNING:absl:The metadata table \"TEMP\".\"WZHAO\".\"METADATA\" already exists. 
Skipping creation.\n" + ] + } + ], + "source": [ + "from snowflake.ml.registry import model_registry\n", + "model_registry.create_model_registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)\n", + "registry = model_registry.ModelRegistry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)" + ] + }, + { + "cell_type": "markdown", + "id": "ca0f443d", + "metadata": {}, + "source": [ + "## Use with snowml model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6271c9d1", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.modeling.xgboost import XGBClassifier\n", + "from sklearn.datasets import load_iris\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "\n", + "iris = load_iris()\n", + "df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],\n", + " columns= iris['feature_names'] + ['target'])\n", + "df.columns = [s.replace(\" (CM)\", '').replace(' ', '') for s in df.columns.str.upper()]\n", + "\n", + "INPUT_COLUMNS = ['SEPALLENGTH', 'SEPALWIDTH', 'PETALLENGTH', 'PETALWIDTH']\n", + "LABEL_COLUMNS = 'TARGET'\n", + "OUTPUT_COLUMNS = 'PREDICTED_TARGET'" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4c8de352", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SEPALLENGTHSEPALWIDTHPETALLENGTHPETALWIDTHTARGET
05.13.51.40.20.0
14.93.01.40.20.0
24.73.21.30.20.0
34.63.11.50.20.0
45.03.61.40.20.0
..................
1456.73.05.22.32.0
1466.32.55.01.92.0
1476.53.05.22.02.0
1486.23.45.42.32.0
1495.93.05.11.82.0
\n", + "

150 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0\n", + ".. ... ... ... ... ...\n", + "145 6.7 3.0 5.2 2.3 2.0\n", + "146 6.3 2.5 5.0 1.9 2.0\n", + "147 6.5 3.0 5.2 2.0 2.0\n", + "148 6.2 3.4 5.4 2.3 2.0\n", + "149 5.9 3.0 5.1 1.8 2.0\n", + "\n", + "[150 rows x 5 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7ca901eb", + "metadata": {}, + "outputs": [], + "source": [ + "test_features = df[:10]\n", + "model_version = \"1_007\"" + ] + }, + { + "cell_type": "markdown", + "id": "b9441f7a", + "metadata": {}, + "source": [ + "### XGBoost model" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4ac4c21e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_xgb = XGBClassifier(input_cols=INPUT_COLUMNS,\n", + " output_cols=OUTPUT_COLUMNS,\n", + " label_cols=LABEL_COLUMNS)\n", + "\n", + "clf_xgb.fit(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "dd0ca646", + "metadata": {}, + "outputs": [], + "source": [ + "prediction = clf_xgb.predict(test_features)\n", + "prediction_proba = clf_xgb.predict_proba(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3d872431", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"SIMPLE_XGB_MODEL\"\n", + "deploy_name = \"xgb_model_predict\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "523cc249", + "metadata": { + "code_folding": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered new model: a1af4d7afbf111ed8e10ce0e8c87ef9b\n" + ] + } + ], + "source": [ + "# A name and model tags can be added to the model at registration time.\n", + "model_id = registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=clf_xgb,\n", + " tags={\"stage\": \"testing\", \"classifier_type\": \"XGBClassifier\"},\n", + " sample_input_data=test_features[:10], # this line can be removed after modelSignature\n", + ")\n", + "\n", + "# The object API can be used to reference a model after creation.\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "print(\"Registered new model:\", model_id)" + ] + }, + { + "cell_type": "markdown", + "id": "fccfb1af", + "metadata": {}, + "source": [ + "### Test on the result using load_model " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bf9a3596", + "metadata": { + "code_folding": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0\n", + "1 4.9 3.0 1.4 0.2 0.0 0\n", + "2 4.7 3.2 1.3 0.2 0.0 0\n", + "3 4.6 3.1 1.5 0.2 0.0 0\n", + "4 5.0 3.6 1.4 0.2 0.0 0\n", + "5 5.4 3.9 1.7 0.4 0.0 0\n", + "6 4.6 3.4 1.4 0.3 0.0 0\n", + "7 5.0 3.4 1.5 0.2 0.0 0\n", + "8 4.4 2.9 1.4 0.2 0.0 0\n", + "9 4.9 3.1 1.5 0.1 0.0 0\n", + "Restored prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0\n", + "1 4.9 3.0 
1.4 0.2 0.0 0\n", + "2 4.7 3.2 1.3 0.2 0.0 0\n", + "3 4.6 3.1 1.5 0.2 0.0 0\n", + "4 5.0 3.6 1.4 0.2 0.0 0\n", + "5 5.4 3.9 1.7 0.4 0.0 0\n", + "6 4.6 3.4 1.4 0.3 0.0 0\n", + "7 5.0 3.4 1.5 0.2 0.0 0\n", + "8 4.4 2.9 1.4 0.2 0.0 0\n", + "9 4.9 3.1 1.5 0.1 0.0 0\n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "restored_clf = model.load_model()\n", + "\n", + "restored_prediction = restored_clf.predict(test_features)\n", + "\n", + "print(\"Original prediction:\", prediction[:10])\n", + "print(\"Restored prediction:\", restored_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" + ] + }, + { + "cell_type": "markdown", + "id": "fe9e2081", + "metadata": {}, + "source": [ + "### Testing on deploy" + ] + }, + { + "cell_type": "markdown", + "id": "56834c5c", + "metadata": {}, + "source": [ + "#### Predict function match/mismatch? - comparsion between deploy and local" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bf55701d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmpuowyt6bq.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xgb_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6f159a5f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0\n", + "1 4.9 3.0 1.4 0.2 0.0 0\n", + "2 4.7 3.2 1.3 0.2 0.0 0\n", + "3 4.6 3.1 1.5 0.2 0.0 0\n", + "4 5.0 3.6 1.4 0.2 0.0 0\n", + "5 5.4 3.9 1.7 0.4 0.0 0\n", + "6 4.6 3.4 1.4 0.3 0.0 0\n", + "7 5.0 3.4 1.5 0.2 0.0 0\n", + "8 4.4 2.9 1.4 0.2 0.0 0\n", + "9 4.9 3.1 1.5 0.1 0.0 0\n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, remote_prediction.values))" + ] + }, + { + "cell_type": "markdown", + "id": "65af7944", + "metadata": {}, + "source": [ + "#### Predict_proba function match/mismatch? - comparsion between deploy and local" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0c77d583", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmpgltgg1aw.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xgb_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict_proba\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1216dbe8", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET \\\n", + "0 5.1 3.5 1.4 0.2 0.0 \n", + "1 4.9 3.0 1.4 0.2 0.0 \n", + "2 4.7 3.2 1.3 0.2 0.0 \n", + "3 4.6 3.1 1.5 0.2 0.0 \n", + "4 5.0 3.6 1.4 0.2 0.0 \n", + "5 5.4 3.9 1.7 0.4 0.0 \n", + "6 4.6 3.4 1.4 0.3 0.0 \n", + "7 5.0 3.4 1.5 0.2 0.0 \n", + "8 4.4 2.9 1.4 0.2 0.0 \n", + "9 4.9 3.1 1.5 0.1 0.0 \n", + "\n", + " predict_proba_0.0 predict_proba_1.0 predict_proba_2.0 \n", + "0 0.996803 0.002383 0.000814 \n", + "1 0.996362 0.002382 0.001256 \n", + "2 0.996803 0.002383 0.000814 \n", + "3 0.996795 0.002383 0.000822 \n", + "4 0.996803 0.002383 0.000814 \n", + "5 0.996803 0.002383 0.000814 \n", + "6 0.996803 0.002383 0.000814 \n", + "7 0.996803 0.002383 0.000814 \n", + "8 0.996362 0.002382 0.001256 \n", + "9 0.996795 0.002383 0.000822 \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction_proba = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.allclose(prediction_proba, remote_prediction_proba.values))" + ] + }, + { + "cell_type": "markdown", + "id": "e5f9c9b7", + "metadata": {}, + "source": [ + "### Random Forest model *from ensemble*\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "48780cb2", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.modeling.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "93d42010", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_rf = RandomForestClassifier(input_cols=INPUT_COLUMNS,\n", + " output_cols=OUTPUT_COLUMNS,\n", + " label_cols=LABEL_COLUMNS)\n", + "\n", + "clf_rf.fit(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "cfe55d82", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:910: RuntimeWarning: divide by zero encountered in log\n", + " return np.log(proba)\n" + ] + } + ], + "source": [ + "prediction = clf_rf.predict(test_features)\n", + "prediction_proba = clf_rf.predict_proba(test_features)\n", + "prediction_log_proba = clf_rf.predict_log_proba(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "4ef91e18", + "metadata": {}, + "outputs": [], + 
"source": [ + "model_name = \"SIMPLE_RF_MODEL\"\n", + "deploy_name = \"rf_model_predict\"\n", + "classifier_type = \"RFClassifier\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "f9401b46", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:910: RuntimeWarning: divide by zero encountered in log\n", + " return np.log(proba)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered new model: fe277924fbf111ed8e10ce0e8c87ef9b\n" + ] + } + ], + "source": [ + "# A name and model tags can be added to the model at registration time.\n", + "model_id = registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=clf_rf,\n", + " tags={\"stage\": \"testing\", \"classifier_type\": classifier_type},\n", + " sample_input_data=test_features, # this line can be removed after modelSignature\n", + ")\n", + "\n", + "# The object API can be used to reference a model after creation.\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "print(\"Registered new model:\", model_id)" + ] + }, + { + "cell_type": "markdown", + "id": "1c8e87fc", + "metadata": {}, + "source": [ + "#### Comparsion between load_model" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "cf1db785", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0 0.0\n", + "Restored prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0 0.0\n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "restored_clf = model.load_model()\n", + "\n", + "restored_prediction = restored_clf.predict(test_features)\n", + "\n", + "print(\"Original prediction:\", prediction[:10])\n", + "print(\"Restored prediction:\", restored_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" + ] + }, + { + "cell_type": "markdown", + "id": "cefbad30", + "metadata": {}, + "source": [ + "#### Comparsion between deploy" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "f81f663e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda 
resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmpvnzzuuxw.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rf_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "726838d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0 0.0\n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, remote_prediction.values))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "33833e23", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmp6rm6hkvn.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rf_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict_proba\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "4e5d8d89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET \\\n", + "0 5.1 3.5 1.4 0.2 0.0 \n", + "1 4.9 3.0 1.4 0.2 0.0 \n", + "2 4.7 3.2 1.3 0.2 0.0 \n", + "3 4.6 3.1 1.5 0.2 0.0 \n", + "4 5.0 3.6 1.4 0.2 0.0 \n", + "5 5.4 3.9 1.7 0.4 0.0 \n", + "6 4.6 3.4 1.4 0.3 0.0 \n", + "7 5.0 3.4 1.5 0.2 0.0 \n", + "8 4.4 2.9 1.4 0.2 0.0 \n", + "9 4.9 3.1 1.5 0.1 0.0 \n", + "\n", + " predict_proba_0.0 predict_proba_1.0 predict_proba_2.0 \n", + "0 1.0 0.0 0.0 \n", + "1 1.0 0.0 0.0 \n", + "2 1.0 0.0 0.0 \n", + "3 1.0 0.0 0.0 \n", + "4 1.0 0.0 0.0 \n", + "5 1.0 0.0 0.0 \n", + "6 1.0 0.0 0.0 \n", + "7 1.0 0.0 0.0 \n", + "8 1.0 0.0 0.0 \n", + "9 1.0 0.0 0.0 \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction_proba = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction_proba, remote_prediction_proba.values))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "8ddc04e8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmpfpyrwg0l.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rf_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict_log_proba\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cf688655", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET \\\n", + "0 5.1 3.5 1.4 0.2 0.0 \n", + "1 4.9 3.0 1.4 0.2 0.0 \n", + "2 4.7 3.2 1.3 0.2 0.0 \n", + "3 4.6 3.1 1.5 0.2 0.0 \n", + "4 5.0 3.6 1.4 0.2 0.0 \n", + "5 5.4 3.9 1.7 0.4 0.0 \n", + "6 4.6 3.4 1.4 0.3 0.0 \n", + "7 5.0 3.4 1.5 0.2 0.0 \n", + "8 4.4 2.9 1.4 0.2 0.0 \n", + "9 4.9 3.1 1.5 0.1 0.0 \n", + "\n", + " predict_log_proba_0.0 predict_log_proba_1.0 predict_log_proba_2.0 \n", + "0 0.0 -inf -inf \n", + "1 0.0 -inf -inf \n", + "2 0.0 -inf -inf \n", + "3 0.0 -inf -inf \n", + "4 0.0 -inf -inf \n", + "5 0.0 -inf -inf \n", + "6 0.0 -inf -inf \n", + "7 0.0 -inf -inf \n", + "8 0.0 -inf -inf \n", + "9 0.0 -inf -inf \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction_log_proba = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_log_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction_log_proba, remote_prediction_log_proba.values))" + ] + }, + { + "cell_type": "markdown", + "id": "eb7b90fe", + "metadata": {}, + "source": [ + "### Logistic Regression model\n", + "\n", + "The reason to test w/ LR model is because, it has all the functions such as `predict, predict_log_proba, predict_proba, decision_function`" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6b1d0b93", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.modeling.linear_model import LogisticRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "3280b02f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf_lr = LogisticRegression(input_cols=INPUT_COLUMNS,\n", + " output_cols=OUTPUT_COLUMNS,\n", + " label_cols=LABEL_COLUMNS,\n", + " max_iter=1000)\n", + "\n", + "clf_lr.fit(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a74cef89", + "metadata": {}, + "outputs": [], + "source": [ + "prediction = clf_lr.predict(test_features)\n", + "prediction_proba = clf_lr.predict_proba(test_features)\n", + "prediction_log_proba = clf_lr.predict_log_proba(test_features)\n", + "prediction_decision = clf_lr.decision_function(test_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "de6fa3a0", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"SIMPLE_LR_MODEL\"\n", + "deploy_name = \"lr_model_predict\"\n", + "classifier_type = \"LogisticRegression\"" + ] + }, + { + "cell_type": 
"code", + "execution_count": 38, + "id": "35ca8aa6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered new model: 1ae7d6b2fbf211ed8e10ce0e8c87ef9b\n" + ] + } + ], + "source": [ + "# A name and model tags can be added to the model at registration time.\n", + "model_id = registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=clf_lr,\n", + " tags={\"stage\": \"testing\", \"classifier_type\": classifier_type},\n", + " sample_input_data=test_features, # this line can be removed after modelSignature\n", + ")\n", + "\n", + "# The object API can be used to reference a model after creation.\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "print(\"Registered new model:\", model_id)" + ] + }, + { + "cell_type": "markdown", + "id": "39bdfe5a", + "metadata": {}, + "source": [ + "#### Comparison between load_model" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "b5fcc75b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0 0.0\n", + "Restored prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0 0.0\n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "restored_clf = model.load_model()\n", + "\n", + "restored_prediction = restored_clf.predict(test_features)\n", + "\n", + "print(\"Original prediction:\", prediction[:10])\n", + "print(\"Restored prediction:\", restored_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" + ] + }, + { + "cell_type": "markdown", + "id": "83ff0a1b", + "metadata": {}, + "source": [ + "#### Comparison between deploy" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "25be7377", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmpp1upu42a.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local 
environment is 3.6.2, which does not fit the criteria for the requirement anyio. Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lr_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "afd5f285", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET PREDICTED_TARGET\n", + "0 5.1 3.5 1.4 0.2 0.0 0.0\n", + "1 4.9 3.0 1.4 0.2 0.0 0.0\n", + "2 4.7 3.2 1.3 0.2 0.0 0.0\n", + "3 4.6 3.1 1.5 0.2 0.0 0.0\n", + "4 5.0 3.6 1.4 0.2 0.0 0.0\n", + "5 5.4 3.9 1.7 0.4 0.0 0.0\n", + "6 4.6 3.4 1.4 0.3 0.0 0.0\n", + "7 5.0 3.4 1.5 0.2 0.0 0.0\n", + "8 4.4 2.9 1.4 0.2 0.0 0.0\n", + "9 4.9 3.1 1.5 0.1 0.0 0.0\n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, remote_prediction.values))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "fa054c3c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmp9p9ocx8r.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lr_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict_proba\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "ec25c905", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET \\\n", + "0 5.1 3.5 1.4 0.2 0.0 \n", + "1 4.9 3.0 1.4 0.2 0.0 \n", + "2 4.7 3.2 1.3 0.2 0.0 \n", + "3 4.6 3.1 1.5 0.2 0.0 \n", + "4 5.0 3.6 1.4 0.2 0.0 \n", + "5 5.4 3.9 1.7 0.4 0.0 \n", + "6 4.6 3.4 1.4 0.3 0.0 \n", + "7 5.0 3.4 1.5 0.2 0.0 \n", + "8 4.4 2.9 1.4 0.2 0.0 \n", + "9 4.9 3.1 1.5 0.1 0.0 \n", + "\n", + " predict_proba_0.0 predict_proba_1.0 predict_proba_2.0 \n", + "0 0.981584 0.018416 1.449704e-08 \n", + "1 0.971334 0.028666 3.019028e-08 \n", + "2 0.985275 0.014725 1.233695e-08 \n", + "3 0.976064 0.023936 3.970137e-08 \n", + "4 0.985235 0.014765 1.200231e-08 \n", + "5 0.970227 0.029773 7.396811e-08 \n", + "6 0.986775 0.013225 1.997822e-08 \n", + "7 0.976148 0.023852 2.773000e-08 \n", + "8 0.979626 0.020373 3.060330e-08 \n", + "9 0.968761 0.031239 3.173039e-08 \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction_proba = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.allclose(prediction_proba, remote_prediction_proba.values))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "5a425e55", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmp4kciu22a.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lr_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict_log_proba\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ff4a4c54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET \\\n", + "0 5.1 3.5 1.4 0.2 0.0 \n", + "1 4.9 3.0 1.4 0.2 0.0 \n", + "2 4.7 3.2 1.3 0.2 0.0 \n", + "3 4.6 3.1 1.5 0.2 0.0 \n", + "4 5.0 3.6 1.4 0.2 0.0 \n", + "5 5.4 3.9 1.7 0.4 0.0 \n", + "6 4.6 3.4 1.4 0.3 0.0 \n", + "7 5.0 3.4 1.5 0.2 0.0 \n", + "8 4.4 2.9 1.4 0.2 0.0 \n", + "9 4.9 3.1 1.5 0.1 0.0 \n", + "\n", + " predict_log_proba_0.0 predict_log_proba_1.0 predict_log_proba_2.0 \n", + "0 -0.018588 -3.994513 -18.049321 \n", + "1 -0.029085 -3.552040 -17.315746 \n", + "2 -0.014834 -4.218213 -18.210667 \n", + "3 -0.024227 -3.732359 -17.041880 \n", + "4 -0.014875 -4.215482 -18.238166 \n", + "5 -0.030225 -3.514169 -16.419632 \n", + "6 -0.013313 -4.325674 -17.728623 \n", + "7 -0.024141 -3.735879 -17.400751 \n", + "8 -0.020584 -3.893520 -17.302158 \n", + "9 -0.031738 -3.466082 -17.265991 \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction_log_proba = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_log_proba[:10])\n", + "\n", + "print(\"Result comparison:\", np.allclose(prediction_log_proba, remote_prediction_log_proba.values))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "2904de8c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmpevdk6spg.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lr_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"decision_function\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "713806ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET \\\n", + "0 5.1 3.5 1.4 0.2 0.0 \n", + "1 4.9 3.0 1.4 0.2 0.0 \n", + "2 4.7 3.2 1.3 0.2 0.0 \n", + "3 4.6 3.1 1.5 0.2 0.0 \n", + "4 5.0 3.6 1.4 0.2 0.0 \n", + "5 5.4 3.9 1.7 0.4 0.0 \n", + "6 4.6 3.4 1.4 0.3 0.0 \n", + "7 5.0 3.4 1.5 0.2 0.0 \n", + "8 4.4 2.9 1.4 0.2 0.0 \n", + "9 4.9 3.1 1.5 0.1 0.0 \n", + "\n", + " decision_function_0.0 decision_function_1.0 decision_function_2.0 \n", + "0 7.335553 3.359628 -10.695181 \n", + "1 6.936539 3.413583 -10.350122 \n", + "2 7.466404 3.263025 -10.729429 \n", + "3 6.908595 3.200463 -10.109058 \n", + "4 7.474632 3.274026 -10.748659 \n", + "5 6.624451 3.140506 -9.764956 \n", + "6 7.342557 3.030196 -10.372753 \n", + "7 7.029449 3.317712 -10.347161 \n", + "8 7.051503 3.178567 -10.230071 \n", + "9 6.889533 3.455188 -10.344721 \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction_decision_function = model.predict(deployment_name=deploy_name, data=test_features)\n", + "\n", + "print(\"Remote prediction:\", remote_prediction_decision_function[:10])\n", + "\n", + "print(\"Result comparison:\", np.allclose(prediction_decision, remote_prediction_decision_function.values))" + ] + }, + { + "cell_type": "markdown", + "id": "d6930720", + "metadata": {}, + "source": [ + "### Pipeline model\n", + "\n", + "It is important to see if the whole pipeline is stored" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "846db56a", + "metadata": {}, + "outputs": [], + "source": [ + "def add_simple_category(df):\n", + " bins = (-1, 4, 5, 6, 10)\n", + " group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile']\n", + " categories = pd.cut(df.SEPALLENGTH, bins, labels=group_names)\n", + " df['SIMPLE'] = categories\n", + " return df\n", + "df_cat = add_simple_category(df)\n", + "\n", + "numeric_features=['SEPALLENGTH', 'SEPALWIDTH', 'PETALLENGTH', 'PETALWIDTH']\n", + "categorical_features = ['SIMPLE']\n", + "numeric_features_output = [x + '_O' for x in numeric_features]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "2033ef31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the Table and Cleanup Cols, have a work_schema for testing\n", + "\n", + "\n", + "############################################################################\n", + "# NOTE: \n", + "# Set work_schema variable to some schema that exists in your account.\n", + "# set data_dir to point to the directory that contains the 
diamonds.csv file.\n", + "############################################################################\n", + "work_schema = 'TEST'\n", + "demo_table = 'IRIS_UPPER'\n", + "\n", + "# write the DF to Snowflake and create a Snowflake DF\n", + "session.write_pandas(df_cat, demo_table, auto_create_table=True, table_type=\"temporary\", schema=work_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "6b150ff8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH TARGET SIMPLE\n", + "0 5.1 3.5 1.4 0.2 0.0 2_quartile\n", + "1 4.9 3.0 1.4 0.2 0.0 1_quartile\n", + "2 4.7 3.2 1.3 0.2 0.0 1_quartile\n", + "3 4.6 3.1 1.5 0.2 0.0 1_quartile\n", + "4 5.0 3.6 1.4 0.2 0.0 1_quartile\n", + "5 5.4 3.9 1.7 0.4 0.0 2_quartile\n", + "6 4.6 3.4 1.4 0.3 0.0 1_quartile\n", + "7 5.0 3.4 1.5 0.2 0.0 1_quartile\n", + "8 4.4 2.9 1.4 0.2 0.0 1_quartile\n", + "9 4.9 3.1 1.5 0.1 0.0 1_quartile\n" + ] + } + ], + "source": [ + "# Diamonds Snowflake Table\n", + "input_tbl = f\"{session.get_current_database()}.{session.get_current_schema()}.{demo_table}\"\n", + "iris_df = session.table(input_tbl)\n", + "print(iris_df.limit(10).to_pandas())" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "86f8b074", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/snowflake/snowpark/session.py:1374: UserWarning: Pandas Dataframe has non-standard index of type which will not be written. Consider changing the index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s)\n", + " success, nchunks, nrows, ci_output = write_pandas(\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from snowflake.ml.modeling.linear_model import LogisticRegression\n", + "from snowflake.ml.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder\n", + "from snowflake.ml.framework.pipeline import Pipeline\n", + "pipeline = Pipeline(\n", + " steps=[\n", + " ('OHEHOT', OneHotEncoder(input_cols=categorical_features, output_cols='cat_output', drop_input_cols=True), ),\n", + " ('SCALER', MinMaxScaler(clip=True, input_cols=numeric_features, output_cols=numeric_features_output, drop_input_cols=True), ),\n", + " ('CLASSIFIER', LogisticRegression(label_cols=LABEL_COLUMNS))\n", + " ])\n", + "pipeline.fit(iris_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "94231eb1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/snowflake/snowpark/session.py:1374: UserWarning: Pandas Dataframe has non-standard index of type which will not be written. 
Consider changing the index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s)\n", + " success, nchunks, nrows, ci_output = write_pandas(\n" + ] + } + ], + "source": [ + "iris_df_test = iris_df.limit(10)\n", + "prediction = pipeline.predict(iris_df_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "2720275f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.fit(iris_df.to_pandas())" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "a3b5159a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "prediction = pipeline.predict(iris_df_test.to_pandas())\n", + "prediction_log_proba = pipeline.predict_log_proba(iris_df_test.to_pandas())\n", + "prediction_proba = pipeline.predict_proba(iris_df_test.to_pandas())" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "85917118", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"SIMPLE_PP_MODEL\"\n", + "deploy_name = \"pp_model_predict\"\n", + "classifier_type = \"Pipeline\"\n", + "model_version = f\"{model_name}_007\"" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "735ff3ca", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + 
"/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered new model: 46bab6bafbf211ed8e10ce0e8c87ef9b\n" + ] + } + ], + "source": [ + "# A name and model tags can be added to the model at registration time.\n", + "model_id = registry.log_model(\n", + " model_name=model_name,\n", + " model_version=model_version,\n", + " model=pipeline,\n", + " tags={\"stage\": \"testing\", \"classifier_type\": classifier_type},\n", + " sample_input_data=iris_df_test.to_pandas(), # this line can be removed after modelSignature\n", + ")\n", + "\n", + "# The object API can be used to reference a model after creation.\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "print(\"Registered new model:\", model_id)" + ] + }, + { + "cell_type": "markdown", + "id": "9cd6b554", + "metadata": {}, + "source": [ + "#### Comparison between load_model" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "ade4d099", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original prediction: TARGET \"cat_output_1_quartile\" \"cat_output_2_quartile\" \\\n", + "0 0.0 0.0 1.0 \n", + "1 0.0 1.0 0.0 \n", + "2 0.0 1.0 0.0 \n", + "3 0.0 1.0 0.0 \n", + "4 0.0 1.0 0.0 \n", + "5 0.0 0.0 1.0 \n", + "6 0.0 1.0 0.0 \n", + "7 0.0 1.0 0.0 \n", + "8 0.0 1.0 0.0 \n", + "9 0.0 1.0 0.0 \n", + "\n", + " \"cat_output_3_quartile\" SEPALLENGTH_O SEPALWIDTH_O PETALLENGTH_O \\\n", + "0 0.0 0.222222 0.625000 0.067797 \n", + "1 0.0 0.166667 0.416667 0.067797 \n", + "2 0.0 0.111111 0.500000 0.050847 \n", + "3 0.0 0.083333 0.458333 0.084746 \n", + "4 0.0 0.194444 0.666667 0.067797 \n", + "5 0.0 0.305556 0.791667 0.118644 \n", + "6 0.0 0.083333 0.583333 0.067797 \n", + "7 0.0 0.194444 0.583333 0.084746 \n", + "8 0.0 0.027778 0.375000 0.067797 \n", + "9 0.0 0.166667 0.458333 0.084746 \n", + "\n", + " PETALWIDTH_O OUTPUT_TARGET \n", + "0 0.041667 0.0 \n", + "1 0.041667 0.0 \n", + "2 0.041667 0.0 \n", + "3 0.041667 0.0 \n", + "4 0.041667 0.0 \n", + "5 0.125000 0.0 \n", + "6 0.083333 0.0 \n", + "7 0.041667 0.0 \n", + "8 0.041667 0.0 \n", + "9 0.000000 0.0 \n", + "Restored prediction: TARGET \"cat_output_1_quartile\" \"cat_output_2_quartile\" \\\n", + "0 0.0 0.0 1.0 \n", + "1 0.0 1.0 0.0 \n", + "2 0.0 1.0 0.0 \n", + "3 0.0 1.0 0.0 \n", + "4 0.0 1.0 0.0 \n", + "5 0.0 0.0 1.0 \n", + "6 0.0 1.0 0.0 \n", + "7 0.0 1.0 0.0 \n", + "8 0.0 1.0 
0.0 \n", + "9 0.0 1.0 0.0 \n", + "\n", + " \"cat_output_3_quartile\" SEPALLENGTH_O SEPALWIDTH_O PETALLENGTH_O \\\n", + "0 0.0 0.222222 0.625000 0.067797 \n", + "1 0.0 0.166667 0.416667 0.067797 \n", + "2 0.0 0.111111 0.500000 0.050847 \n", + "3 0.0 0.083333 0.458333 0.084746 \n", + "4 0.0 0.194444 0.666667 0.067797 \n", + "5 0.0 0.305556 0.791667 0.118644 \n", + "6 0.0 0.083333 0.583333 0.067797 \n", + "7 0.0 0.194444 0.583333 0.084746 \n", + "8 0.0 0.027778 0.375000 0.067797 \n", + "9 0.0 0.166667 0.458333 0.084746 \n", + "\n", + " PETALWIDTH_O OUTPUT_TARGET \n", + "0 0.041667 0.0 \n", + "1 0.041667 0.0 \n", + "2 0.041667 0.0 \n", + "3 0.041667 0.0 \n", + "4 0.041667 0.0 \n", + "5 0.125000 0.0 \n", + "6 0.083333 0.0 \n", + "7 0.041667 0.0 \n", + "8 0.041667 0.0 \n", + "9 0.000000 0.0 \n", + "Result comparison: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but OneHotEncoder was fitted without feature names\n", + " warnings.warn(\n", + "/Users/xjiang/opt/anaconda3/envs/snowflake-ml-modeling/lib/python3.8/site-packages/sklearn/base.py:413: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "restored_clf = model.load_model()\n", + "\n", + "restored_prediction = restored_clf.predict(iris_df_test.to_pandas())\n", + "\n", + "print(\"Original prediction:\", prediction[:10])\n", + "print(\"Restored prediction:\", restored_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.array_equal(prediction, restored_prediction[prediction.columns]))" + ] + }, + { + "cell_type": "markdown", + "id": "2ca2e15e", + "metadata": {}, + "source": [ + "#### Comparison between deploy predict" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "7c1210c5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/xjiang/Documents/snowml/snowflake/ml/model/_udf_util.py:227: RuntimeWarning: Cannot find conda resolver, use Snowflake information schema for best-effort dependency pre-check.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated UDF file is persisted at: /var/folders/76/47j700wn3g905_97713xwpnm0000gn/T/tmp0_o73cne.py\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark.session:The version of package anyio in the local environment is 3.6.2, which does not fit the criteria for the requirement anyio. 
Your UDF might not work when the package version is different between the server and your local environment\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pp_model_predict is deployed to warehouse.\n" + ] + } + ], + "source": [ + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "model.deploy(\n", + " deployment_name=deploy_name,\n", + " target_method=\"predict\",\n", + " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "d2ff838f", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Remote prediction: TARGET \"cat_output_1_quartile\" \"cat_output_2_quartile\" \\\n", + "0 0.0 0.0 1.0 \n", + "1 0.0 1.0 0.0 \n", + "2 0.0 1.0 0.0 \n", + "3 0.0 1.0 0.0 \n", + "4 0.0 1.0 0.0 \n", + "5 0.0 0.0 1.0 \n", + "6 0.0 1.0 0.0 \n", + "7 0.0 1.0 0.0 \n", + "8 0.0 1.0 0.0 \n", + "9 0.0 1.0 0.0 \n", + "\n", + " \"cat_output_3_quartile\" SEPALLENGTH_O SEPALWIDTH_O PETALLENGTH_O \\\n", + "0 0.0 0.222222 0.625000 0.067797 \n", + "1 0.0 0.166667 0.416667 0.067797 \n", + "2 0.0 0.111111 0.500000 0.050847 \n", + "3 0.0 0.083333 0.458333 0.084746 \n", + "4 0.0 0.194444 0.666667 0.067797 \n", + "5 0.0 0.305556 0.791667 0.118644 \n", + "6 0.0 0.083333 0.583333 0.067797 \n", + "7 0.0 0.194444 0.583333 0.084746 \n", + "8 0.0 0.027778 0.375000 0.067797 \n", + "9 0.0 0.166667 0.458333 0.084746 \n", + "\n", + " PETALWIDTH_O OUTPUT_TARGET \n", + "0 0.041667 0.0 \n", + "1 0.041667 0.0 \n", + "2 0.041667 0.0 \n", + "3 0.041667 0.0 \n", + "4 0.041667 0.0 \n", + "5 0.125000 0.0 \n", + "6 0.083333 0.0 \n", + "7 0.041667 0.0 \n", + "8 0.041667 0.0 \n", + "9 0.000000 0.0 \n", + "Result comparison: True\n" + ] + } + ], + "source": [ + "remote_prediction = model.predict(deployment_name=deploy_name, data=iris_df_test.to_pandas())\n", + "\n", + "print(\"Remote prediction:\", remote_prediction[:10])\n", + "\n", + "print(\"Result comparison:\", np.allclose(prediction, remote_prediction.values))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dd77df3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "fb0a62cbfaa59af7646af5a6672c5c3e72ec75fbadf6ff0336b6769523f221a5" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snowflake/ml/modeling/calibration/BUILD.bazel b/snowflake/ml/sklearn/calibration/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/calibration/BUILD.bazel rename to 
snowflake/ml/sklearn/calibration/BUILD.bazel diff --git a/snowflake/ml/modeling/calibration/estimators_info.bzl b/snowflake/ml/sklearn/calibration/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/calibration/estimators_info.bzl rename to snowflake/ml/sklearn/calibration/estimators_info.bzl diff --git a/snowflake/ml/modeling/cluster/BUILD.bazel b/snowflake/ml/sklearn/cluster/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/cluster/BUILD.bazel rename to snowflake/ml/sklearn/cluster/BUILD.bazel diff --git a/snowflake/ml/modeling/cluster/estimators_info.bzl b/snowflake/ml/sklearn/cluster/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/cluster/estimators_info.bzl rename to snowflake/ml/sklearn/cluster/estimators_info.bzl diff --git a/snowflake/ml/modeling/compose/BUILD.bazel b/snowflake/ml/sklearn/compose/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/compose/BUILD.bazel rename to snowflake/ml/sklearn/compose/BUILD.bazel diff --git a/snowflake/ml/modeling/compose/estimators_info.bzl b/snowflake/ml/sklearn/compose/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/compose/estimators_info.bzl rename to snowflake/ml/sklearn/compose/estimators_info.bzl diff --git a/snowflake/ml/modeling/covariance/BUILD.bazel b/snowflake/ml/sklearn/covariance/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/covariance/BUILD.bazel rename to snowflake/ml/sklearn/covariance/BUILD.bazel diff --git a/snowflake/ml/modeling/covariance/estimators_info.bzl b/snowflake/ml/sklearn/covariance/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/covariance/estimators_info.bzl rename to snowflake/ml/sklearn/covariance/estimators_info.bzl diff --git a/snowflake/ml/modeling/decomposition/BUILD.bazel b/snowflake/ml/sklearn/decomposition/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/decomposition/BUILD.bazel rename to snowflake/ml/sklearn/decomposition/BUILD.bazel diff --git a/snowflake/ml/modeling/decomposition/estimators_info.bzl b/snowflake/ml/sklearn/decomposition/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/decomposition/estimators_info.bzl rename to snowflake/ml/sklearn/decomposition/estimators_info.bzl diff --git a/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel b/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/discriminant_analysis/BUILD.bazel rename to snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel diff --git a/snowflake/ml/modeling/discriminant_analysis/estimators_info.bzl b/snowflake/ml/sklearn/discriminant_analysis/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/discriminant_analysis/estimators_info.bzl rename to snowflake/ml/sklearn/discriminant_analysis/estimators_info.bzl diff --git a/snowflake/ml/modeling/ensemble/BUILD.bazel b/snowflake/ml/sklearn/ensemble/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/ensemble/BUILD.bazel rename to snowflake/ml/sklearn/ensemble/BUILD.bazel diff --git a/snowflake/ml/modeling/ensemble/estimators_info.bzl b/snowflake/ml/sklearn/ensemble/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/ensemble/estimators_info.bzl rename to snowflake/ml/sklearn/ensemble/estimators_info.bzl diff --git a/snowflake/ml/modeling/feature_selection/BUILD.bazel b/snowflake/ml/sklearn/feature_selection/BUILD.bazel similarity index 100% rename from 
snowflake/ml/modeling/feature_selection/BUILD.bazel rename to snowflake/ml/sklearn/feature_selection/BUILD.bazel diff --git a/snowflake/ml/modeling/feature_selection/estimators_info.bzl b/snowflake/ml/sklearn/feature_selection/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/feature_selection/estimators_info.bzl rename to snowflake/ml/sklearn/feature_selection/estimators_info.bzl diff --git a/snowflake/ml/framework/BUILD.bazel b/snowflake/ml/sklearn/framework/BUILD.bazel similarity index 100% rename from snowflake/ml/framework/BUILD.bazel rename to snowflake/ml/sklearn/framework/BUILD.bazel diff --git a/snowflake/ml/framework/_utils.py b/snowflake/ml/sklearn/framework/_utils.py similarity index 100% rename from snowflake/ml/framework/_utils.py rename to snowflake/ml/sklearn/framework/_utils.py diff --git a/snowflake/ml/framework/base.py b/snowflake/ml/sklearn/framework/base.py similarity index 99% rename from snowflake/ml/framework/base.py rename to snowflake/ml/sklearn/framework/base.py index 1ed16fc9..f1541a50 100644 --- a/snowflake/ml/framework/base.py +++ b/snowflake/ml/sklearn/framework/base.py @@ -15,7 +15,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils import parallelize -from snowflake.ml.framework import _utils +from snowflake.ml.sklearn.framework import _utils from snowflake.snowpark import functions as F from snowflake.snowpark._internal import type_utils diff --git a/snowflake/ml/framework/pipeline.py b/snowflake/ml/sklearn/framework/pipeline.py similarity index 99% rename from snowflake/ml/framework/pipeline.py rename to snowflake/ml/sklearn/framework/pipeline.py index e9c9c5f6..f245a156 100644 --- a/snowflake/ml/framework/pipeline.py +++ b/snowflake/ml/sklearn/framework/pipeline.py @@ -10,7 +10,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base _PROJECT = "ModelDevelopment" _SUBPROJECT = "Framework" diff --git a/snowflake/ml/modeling/gaussian_process/BUILD.bazel b/snowflake/ml/sklearn/gaussian_process/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/gaussian_process/BUILD.bazel rename to snowflake/ml/sklearn/gaussian_process/BUILD.bazel diff --git a/snowflake/ml/modeling/gaussian_process/estimators_info.bzl b/snowflake/ml/sklearn/gaussian_process/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/gaussian_process/estimators_info.bzl rename to snowflake/ml/sklearn/gaussian_process/estimators_info.bzl diff --git a/snowflake/ml/modeling/impute/BUILD.bazel b/snowflake/ml/sklearn/impute/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/impute/BUILD.bazel rename to snowflake/ml/sklearn/impute/BUILD.bazel diff --git a/snowflake/ml/modeling/impute/estimators_info.bzl b/snowflake/ml/sklearn/impute/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/impute/estimators_info.bzl rename to snowflake/ml/sklearn/impute/estimators_info.bzl diff --git a/snowflake/ml/modeling/isotonic/BUILD.bazel b/snowflake/ml/sklearn/isotonic/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/isotonic/BUILD.bazel rename to snowflake/ml/sklearn/isotonic/BUILD.bazel diff --git a/snowflake/ml/modeling/isotonic/estimators_info.bzl b/snowflake/ml/sklearn/isotonic/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/isotonic/estimators_info.bzl rename to 
snowflake/ml/sklearn/isotonic/estimators_info.bzl diff --git a/snowflake/ml/modeling/kernel_approximation/BUILD.bazel b/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/kernel_approximation/BUILD.bazel rename to snowflake/ml/sklearn/kernel_approximation/BUILD.bazel diff --git a/snowflake/ml/modeling/kernel_approximation/estimators_info.bzl b/snowflake/ml/sklearn/kernel_approximation/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/kernel_approximation/estimators_info.bzl rename to snowflake/ml/sklearn/kernel_approximation/estimators_info.bzl diff --git a/snowflake/ml/modeling/kernel_ridge/BUILD.bazel b/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/kernel_ridge/BUILD.bazel rename to snowflake/ml/sklearn/kernel_ridge/BUILD.bazel diff --git a/snowflake/ml/modeling/kernel_ridge/estimators_info.bzl b/snowflake/ml/sklearn/kernel_ridge/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/kernel_ridge/estimators_info.bzl rename to snowflake/ml/sklearn/kernel_ridge/estimators_info.bzl diff --git a/snowflake/ml/modeling/linear_model/BUILD.bazel b/snowflake/ml/sklearn/linear_model/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/linear_model/BUILD.bazel rename to snowflake/ml/sklearn/linear_model/BUILD.bazel diff --git a/snowflake/ml/modeling/linear_model/estimators_info.bzl b/snowflake/ml/sklearn/linear_model/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/linear_model/estimators_info.bzl rename to snowflake/ml/sklearn/linear_model/estimators_info.bzl diff --git a/snowflake/ml/modeling/manifold/BUILD.bazel b/snowflake/ml/sklearn/manifold/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/manifold/BUILD.bazel rename to snowflake/ml/sklearn/manifold/BUILD.bazel diff --git a/snowflake/ml/modeling/manifold/estimators_info.bzl b/snowflake/ml/sklearn/manifold/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/manifold/estimators_info.bzl rename to snowflake/ml/sklearn/manifold/estimators_info.bzl diff --git a/snowflake/ml/modeling/mixture/BUILD.bazel b/snowflake/ml/sklearn/mixture/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/mixture/BUILD.bazel rename to snowflake/ml/sklearn/mixture/BUILD.bazel diff --git a/snowflake/ml/modeling/mixture/estimators_info.bzl b/snowflake/ml/sklearn/mixture/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/mixture/estimators_info.bzl rename to snowflake/ml/sklearn/mixture/estimators_info.bzl diff --git a/snowflake/ml/modeling/model_selection/BUILD.bazel b/snowflake/ml/sklearn/model_selection/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/model_selection/BUILD.bazel rename to snowflake/ml/sklearn/model_selection/BUILD.bazel diff --git a/snowflake/ml/modeling/model_selection/estimators_info.bzl b/snowflake/ml/sklearn/model_selection/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/model_selection/estimators_info.bzl rename to snowflake/ml/sklearn/model_selection/estimators_info.bzl diff --git a/snowflake/ml/modeling/multiclass/BUILD.bazel b/snowflake/ml/sklearn/multiclass/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/multiclass/BUILD.bazel rename to snowflake/ml/sklearn/multiclass/BUILD.bazel diff --git a/snowflake/ml/modeling/multiclass/estimators_info.bzl b/snowflake/ml/sklearn/multiclass/estimators_info.bzl 
similarity index 100% rename from snowflake/ml/modeling/multiclass/estimators_info.bzl rename to snowflake/ml/sklearn/multiclass/estimators_info.bzl diff --git a/snowflake/ml/modeling/multioutput/BUILD.bazel b/snowflake/ml/sklearn/multioutput/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/multioutput/BUILD.bazel rename to snowflake/ml/sklearn/multioutput/BUILD.bazel diff --git a/snowflake/ml/modeling/multioutput/estimators_info.bzl b/snowflake/ml/sklearn/multioutput/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/multioutput/estimators_info.bzl rename to snowflake/ml/sklearn/multioutput/estimators_info.bzl diff --git a/snowflake/ml/modeling/naive_bayes/BUILD.bazel b/snowflake/ml/sklearn/naive_bayes/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/naive_bayes/BUILD.bazel rename to snowflake/ml/sklearn/naive_bayes/BUILD.bazel diff --git a/snowflake/ml/modeling/naive_bayes/estimators_info.bzl b/snowflake/ml/sklearn/naive_bayes/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/naive_bayes/estimators_info.bzl rename to snowflake/ml/sklearn/naive_bayes/estimators_info.bzl diff --git a/snowflake/ml/modeling/neighbors/BUILD.bazel b/snowflake/ml/sklearn/neighbors/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/neighbors/BUILD.bazel rename to snowflake/ml/sklearn/neighbors/BUILD.bazel diff --git a/snowflake/ml/modeling/neighbors/estimators_info.bzl b/snowflake/ml/sklearn/neighbors/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/neighbors/estimators_info.bzl rename to snowflake/ml/sklearn/neighbors/estimators_info.bzl diff --git a/snowflake/ml/modeling/neural_network/BUILD.bazel b/snowflake/ml/sklearn/neural_network/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/neural_network/BUILD.bazel rename to snowflake/ml/sklearn/neural_network/BUILD.bazel diff --git a/snowflake/ml/modeling/neural_network/estimators_info.bzl b/snowflake/ml/sklearn/neural_network/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/neural_network/estimators_info.bzl rename to snowflake/ml/sklearn/neural_network/estimators_info.bzl diff --git a/snowflake/ml/preprocessing/BUILD.bazel b/snowflake/ml/sklearn/preprocessing/BUILD.bazel similarity index 84% rename from snowflake/ml/preprocessing/BUILD.bazel rename to snowflake/ml/sklearn/preprocessing/BUILD.bazel index 052b46b5..c407482b 100644 --- a/snowflake/ml/preprocessing/BUILD.bazel +++ b/snowflake/ml/sklearn/preprocessing/BUILD.bazel @@ -21,7 +21,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -33,7 +33,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -47,7 +47,7 @@ py_library( ":ordinal_encoder", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -59,7 +59,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -71,7 +71,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -83,7 +83,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + 
"//snowflake/ml/sklearn/framework", ], ) @@ -96,7 +96,8 @@ py_library( ":init", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/framework", + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/sklearn/framework", ], ) @@ -110,7 +111,7 @@ py_library( "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal:type_utils", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -122,7 +123,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -134,7 +135,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) @@ -146,7 +147,7 @@ py_library( deps = [ ":init", "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/framework", + "//snowflake/ml/sklearn/framework", ], ) diff --git a/snowflake/ml/preprocessing/__init__.py b/snowflake/ml/sklearn/preprocessing/__init__.py similarity index 100% rename from snowflake/ml/preprocessing/__init__.py rename to snowflake/ml/sklearn/preprocessing/__init__.py diff --git a/snowflake/ml/preprocessing/binarizer.py b/snowflake/ml/sklearn/preprocessing/binarizer.py similarity index 98% rename from snowflake/ml/preprocessing/binarizer.py rename to snowflake/ml/sklearn/preprocessing/binarizer.py index e254eb85..4e272f92 100644 --- a/snowflake/ml/preprocessing/binarizer.py +++ b/snowflake/ml/sklearn/preprocessing/binarizer.py @@ -9,7 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base from snowflake.snowpark import functions as F, types as T diff --git a/snowflake/ml/preprocessing/k_bins_discretizer.py b/snowflake/ml/sklearn/preprocessing/k_bins_discretizer.py similarity index 99% rename from snowflake/ml/preprocessing/k_bins_discretizer.py rename to snowflake/ml/sklearn/preprocessing/k_bins_discretizer.py index df000e35..06a5518e 100644 --- a/snowflake/ml/preprocessing/k_bins_discretizer.py +++ b/snowflake/ml/sklearn/preprocessing/k_bins_discretizer.py @@ -15,7 +15,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils diff --git a/snowflake/ml/preprocessing/label_encoder.py b/snowflake/ml/sklearn/preprocessing/label_encoder.py similarity index 97% rename from snowflake/ml/preprocessing/label_encoder.py rename to snowflake/ml/sklearn/preprocessing/label_encoder.py index f11cd2c3..50dc4e02 100644 --- a/snowflake/ml/preprocessing/label_encoder.py +++ b/snowflake/ml/sklearn/preprocessing/label_encoder.py @@ -9,8 +9,8 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils -from snowflake.ml.framework import base -from snowflake.ml.preprocessing import ordinal_encoder +from snowflake.ml.sklearn.framework import base +from snowflake.ml.sklearn.preprocessing import ordinal_encoder class LabelEncoder(base.BaseTransformer): diff --git a/snowflake/ml/preprocessing/max_abs_scaler.py b/snowflake/ml/sklearn/preprocessing/max_abs_scaler.py similarity index 99% rename from snowflake/ml/preprocessing/max_abs_scaler.py rename to snowflake/ml/sklearn/preprocessing/max_abs_scaler.py 
index c1b42260..3bd0675d 100644 --- a/snowflake/ml/preprocessing/max_abs_scaler.py +++ b/snowflake/ml/sklearn/preprocessing/max_abs_scaler.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base class MaxAbsScaler(base.BaseTransformer): diff --git a/snowflake/ml/preprocessing/min_max_scaler.py b/snowflake/ml/sklearn/preprocessing/min_max_scaler.py similarity index 99% rename from snowflake/ml/preprocessing/min_max_scaler.py rename to snowflake/ml/sklearn/preprocessing/min_max_scaler.py index 55b8cfdc..e370ff64 100644 --- a/snowflake/ml/preprocessing/min_max_scaler.py +++ b/snowflake/ml/sklearn/preprocessing/min_max_scaler.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import _utils, base +from snowflake.ml.sklearn.framework import _utils, base from snowflake.snowpark import functions as F diff --git a/snowflake/ml/preprocessing/normalizer.py b/snowflake/ml/sklearn/preprocessing/normalizer.py similarity index 99% rename from snowflake/ml/preprocessing/normalizer.py rename to snowflake/ml/sklearn/preprocessing/normalizer.py index 04f2fd39..48feaf22 100644 --- a/snowflake/ml/preprocessing/normalizer.py +++ b/snowflake/ml/sklearn/preprocessing/normalizer.py @@ -9,7 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base from snowflake.snowpark import functions as F, types as T _VALID_NORMS = ["l1", "l2", "max"] diff --git a/snowflake/ml/preprocessing/one_hot_encoder.py b/snowflake/ml/sklearn/preprocessing/one_hot_encoder.py similarity index 99% rename from snowflake/ml/preprocessing/one_hot_encoder.py rename to snowflake/ml/sklearn/preprocessing/one_hot_encoder.py index 150c6fdf..378456d9 100644 --- a/snowflake/ml/preprocessing/one_hot_encoder.py +++ b/snowflake/ml/sklearn/preprocessing/one_hot_encoder.py @@ -15,7 +15,8 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils -from snowflake.ml.framework import _utils, base +from snowflake.ml._internal.utils import identifier +from snowflake.ml.sklearn.framework import _utils, base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils @@ -732,7 +733,9 @@ def map_encoded_value(row: pd.Series) -> List[int]: transformed_dataset = dataset for input_col in self.input_cols: - output_cols = [f'"{col}"' for col in self._dense_output_cols_mappings[input_col]] + output_cols = [ + identifier.quote_name_without_upper_casing(col) for col in self._dense_output_cols_mappings[input_col] + ] input_col_state_df = state_df.filter(F.col(_COLUMN_NAME) == input_col)[output_cols + [_CATEGORY]] # index values through a left join over the dataset and its states @@ -1258,7 +1261,11 @@ def get_output_cols(self) -> List[str]: output_cols = self.output_cols else: output_cols = ( - [f'"{col}"' for input_col in self.input_cols for col in self._dense_output_cols_mappings[input_col]] + [ + identifier.quote_name_without_upper_casing(col) + for input_col in self.input_cols + for col in self._dense_output_cols_mappings[input_col] + ] if self._dense_output_cols_mappings else [] ) diff --git a/snowflake/ml/preprocessing/ordinal_encoder.py b/snowflake/ml/sklearn/preprocessing/ordinal_encoder.py similarity index 99% rename from 
snowflake/ml/preprocessing/ordinal_encoder.py rename to snowflake/ml/sklearn/preprocessing/ordinal_encoder.py index 1a475b3f..b2f05665 100644 --- a/snowflake/ml/preprocessing/ordinal_encoder.py +++ b/snowflake/ml/sklearn/preprocessing/ordinal_encoder.py @@ -13,7 +13,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils from snowflake.ml._internal.utils import identifier -from snowflake.ml.framework import base +from snowflake.ml.sklearn.framework import base from snowflake.snowpark import functions as F, types as T _COLUMN_NAME = "_COLUMN_NAME" @@ -450,7 +450,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame ) state_df = state_df.filter(F.col(_CATEGORY).is_not_null()).union_by_name(null_category_state_df) - suffix = uuid.uuid4().hex.upper() + suffix = "_" + uuid.uuid4().hex.upper() transformed_dataset = dataset for idx, input_col in enumerate(self.input_cols): diff --git a/snowflake/ml/preprocessing/robust_scaler.py b/snowflake/ml/sklearn/preprocessing/robust_scaler.py similarity index 99% rename from snowflake/ml/preprocessing/robust_scaler.py rename to snowflake/ml/sklearn/preprocessing/robust_scaler.py index 2d6c24ad..006ad395 100644 --- a/snowflake/ml/preprocessing/robust_scaler.py +++ b/snowflake/ml/sklearn/preprocessing/robust_scaler.py @@ -12,7 +12,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import _utils, base +from snowflake.ml.sklearn.framework import _utils, base class RobustScaler(base.BaseTransformer): diff --git a/snowflake/ml/preprocessing/simple_imputer.py b/snowflake/ml/sklearn/preprocessing/simple_imputer.py similarity index 99% rename from snowflake/ml/preprocessing/simple_imputer.py rename to snowflake/ml/sklearn/preprocessing/simple_imputer.py index 2cec9b8c..1c3bd401 100644 --- a/snowflake/ml/preprocessing/simple_imputer.py +++ b/snowflake/ml/sklearn/preprocessing/simple_imputer.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import _utils, base +from snowflake.ml.sklearn.framework import _utils, base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_internal_utils diff --git a/snowflake/ml/preprocessing/standard_scaler.py b/snowflake/ml/sklearn/preprocessing/standard_scaler.py similarity index 99% rename from snowflake/ml/preprocessing/standard_scaler.py rename to snowflake/ml/sklearn/preprocessing/standard_scaler.py index c16ec99e..bab46f77 100644 --- a/snowflake/ml/preprocessing/standard_scaler.py +++ b/snowflake/ml/sklearn/preprocessing/standard_scaler.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.framework import _utils, base +from snowflake.ml.sklearn.framework import _utils, base class StandardScaler(base.BaseTransformer): diff --git a/snowflake/ml/modeling/semi_supervised/BUILD.bazel b/snowflake/ml/sklearn/semi_supervised/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/semi_supervised/BUILD.bazel rename to snowflake/ml/sklearn/semi_supervised/BUILD.bazel diff --git a/snowflake/ml/modeling/semi_supervised/estimators_info.bzl b/snowflake/ml/sklearn/semi_supervised/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/semi_supervised/estimators_info.bzl rename to snowflake/ml/sklearn/semi_supervised/estimators_info.bzl diff --git a/snowflake/ml/modeling/svm/BUILD.bazel 
b/snowflake/ml/sklearn/svm/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/svm/BUILD.bazel rename to snowflake/ml/sklearn/svm/BUILD.bazel diff --git a/snowflake/ml/modeling/svm/estimators_info.bzl b/snowflake/ml/sklearn/svm/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/svm/estimators_info.bzl rename to snowflake/ml/sklearn/svm/estimators_info.bzl diff --git a/snowflake/ml/modeling/tree/BUILD.bazel b/snowflake/ml/sklearn/tree/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/tree/BUILD.bazel rename to snowflake/ml/sklearn/tree/BUILD.bazel diff --git a/snowflake/ml/modeling/tree/estimators_info.bzl b/snowflake/ml/sklearn/tree/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/tree/estimators_info.bzl rename to snowflake/ml/sklearn/tree/estimators_info.bzl diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 5cc482e4..8c61027c 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. -VERSION = "0.3.2" +VERSION = "0.3.3" diff --git a/snowflake/ml/modeling/xgboost/BUILD.bazel b/snowflake/ml/xgboost/BUILD.bazel similarity index 100% rename from snowflake/ml/modeling/xgboost/BUILD.bazel rename to snowflake/ml/xgboost/BUILD.bazel diff --git a/snowflake/ml/modeling/xgboost/estimators_info.bzl b/snowflake/ml/xgboost/estimators_info.bzl similarity index 100% rename from snowflake/ml/modeling/xgboost/estimators_info.bzl rename to snowflake/ml/xgboost/estimators_info.bzl diff --git a/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel b/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel index 29389387..a87b01ff 100644 --- a/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel @@ -8,6 +8,6 @@ py_test( deps = [ "//snowflake/ml/_internal/utils:parallelize", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) diff --git a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel index fa0446c8..ead9b4d9 100644 --- a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel +++ b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel @@ -6,7 +6,7 @@ py_test( name = "test_column_name_inference", srcs = ["test_column_name_inference.py"], deps = [ - "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/sklearn/linear_model:linear_regression", "//snowflake/ml/utils:connection_params", ], ) @@ -15,8 +15,8 @@ py_test( name = "test_grid_search", srcs = ["test_grid_search.py"], deps = [ - "//snowflake/ml/modeling/model_selection:grid_search_cv", - "//snowflake/ml/modeling/svm:svr", + "//snowflake/ml/sklearn/model_selection:grid_search_cv", + "//snowflake/ml/sklearn/svm:svr", "//snowflake/ml/utils:connection_params", ], ) @@ -25,9 +25,9 @@ py_test( name = "test_voting_regressor", srcs = ["test_voting_regressor.py"], deps = [ - "//snowflake/ml/modeling/ensemble:voting_regressor", - "//snowflake/ml/modeling/linear_model:linear_regression", - "//snowflake/ml/modeling/linear_model:sgd_regressor", + "//snowflake/ml/sklearn/ensemble:voting_regressor", + "//snowflake/ml/sklearn/linear_model:linear_regression", + "//snowflake/ml/sklearn/linear_model:sgd_regressor", "//snowflake/ml/utils:connection_params", ], ) @@ -36,13 +36,13 @@ py_test( name="test_grid_search_on_pipeline", srcs = 
["test_grid_search_on_pipeline.py"], deps = [ - "//snowflake/ml/modeling/linear_model:logistic_regression", - "//snowflake/ml/modeling/model_selection:grid_search_cv", - "//snowflake/ml/modeling/compose:column_transformer", - "//snowflake/ml/framework:framework", - "//snowflake/ml/preprocessing:one_hot_encoder", - "//snowflake/ml/preprocessing:min_max_scaler", - "//snowflake/ml/preprocessing:label_encoder", + "//snowflake/ml/sklearn/linear_model:logistic_regression", + "//snowflake/ml/sklearn/model_selection:grid_search_cv", + "//snowflake/ml/sklearn/compose:column_transformer", + "//snowflake/ml/sklearn/framework:framework", + "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", + "//snowflake/ml/sklearn/preprocessing:min_max_scaler", + "//snowflake/ml/sklearn/preprocessing:label_encoder", "//snowflake/ml/utils:connection_params", ] ) @@ -51,8 +51,8 @@ py_test( name="test_iterative_imputer", srcs = ["test_iterative_imputer.py"], deps = [ - "//snowflake/ml/modeling/linear_model:linear_regression", - "//snowflake/ml/modeling/impute:iterative_imputer", + "//snowflake/ml/sklearn/linear_model:linear_regression", + "//snowflake/ml/sklearn/impute:iterative_imputer", "//snowflake/ml/utils:connection_params", ] ) @@ -61,10 +61,10 @@ py_test( name="test_pipeline_with_ohe_and_xgbr", srcs = ["test_pipeline_with_ohe_and_xgbr.py"], deps = [ - "//snowflake/ml/modeling/xgboost:xgb_regressor", - "//snowflake/ml/framework:framework", - "//snowflake/ml/preprocessing:one_hot_encoder", - "//snowflake/ml/preprocessing:min_max_scaler", + "//snowflake/ml/xgboost:xgb_regressor", + "//snowflake/ml/sklearn/framework:framework", + "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", + "//snowflake/ml/sklearn/preprocessing:min_max_scaler", "//snowflake/ml/utils:connection_params", ] ) @@ -73,8 +73,8 @@ py_test( name="test_randomized_search", srcs = ["test_randomized_search.py"], deps = [ - "//snowflake/ml/modeling/model_selection:randomized_search_cv", - "//snowflake/ml/modeling/ensemble:random_forest_classifier", + "//snowflake/ml/sklearn/model_selection:randomized_search_cv", + "//snowflake/ml/sklearn/ensemble:random_forest_classifier", "//snowflake/ml/utils:connection_params", ] ) diff --git a/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py b/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py index 65fefdee..88f5bac0 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py +++ b/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py @@ -8,7 +8,7 @@ from sklearn.datasets import load_diabetes from sklearn.linear_model import LinearRegression as SkLinearRegression -from snowflake.ml.modeling.linear_model import LinearRegression +from snowflake.ml.sklearn.linear_model import LinearRegression from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_grid_search.py b/tests/integ/snowflake/ml/extra_tests/test_grid_search.py index cbc820c3..c4efdd95 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_grid_search.py +++ b/tests/integ/snowflake/ml/extra_tests/test_grid_search.py @@ -9,8 +9,8 @@ from sklearn.model_selection import GridSearchCV as SkGridSearchCV from sklearn.svm import SVR as SkSVR -from snowflake.ml.modeling.model_selection import GridSearchCV -from snowflake.ml.modeling.svm import SVR +from snowflake.ml.sklearn.model_selection import GridSearchCV +from snowflake.ml.sklearn.svm import SVR from 
snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py b/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py index 7c554791..98dc201d 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py +++ b/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py @@ -2,12 +2,12 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # from absl.testing.absltest import TestCase, main +from snowflake.ml.sklearn.linear_model.logistic_regression import LogisticRegression -from snowflake.ml.framework.pipeline import Pipeline -from snowflake.ml.modeling.compose import ColumnTransformer -from snowflake.ml.modeling.linear_model.logistic_regression import LogisticRegression -from snowflake.ml.modeling.model_selection import GridSearchCV -from snowflake.ml.preprocessing import MinMaxScaler, OneHotEncoder +from snowflake.ml.sklearn.compose import ColumnTransformer +from snowflake.ml.sklearn.framework.pipeline import Pipeline +from snowflake.ml.sklearn.model_selection import GridSearchCV +from snowflake.ml.sklearn.preprocessing import MinMaxScaler, OneHotEncoder from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Column, Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py b/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py index 782697ca..fe8a2e8d 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py +++ b/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py @@ -13,8 +13,8 @@ from sklearn.impute import IterativeImputer as SkIterativeImputer from sklearn.linear_model import LinearRegression as SkLinearRegression -from snowflake.ml.modeling.impute import IterativeImputer -from snowflake.ml.modeling.linear_model import LinearRegression +from snowflake.ml.sklearn.impute import IterativeImputer +from snowflake.ml.sklearn.linear_model import LinearRegression from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py b/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py index 634c02bc..2641c9c9 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py +++ b/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py @@ -3,10 +3,10 @@ # from absl.testing import absltest -from snowflake.ml.framework.pipeline import Pipeline -from snowflake.ml.modeling.xgboost import XGBRegressor -from snowflake.ml.preprocessing import MinMaxScaler, OneHotEncoder +from snowflake.ml.sklearn.framework.pipeline import Pipeline +from snowflake.ml.sklearn.preprocessing import MinMaxScaler, OneHotEncoder from snowflake.ml.utils.connection_params import SnowflakeLoginOptions +from snowflake.ml.xgboost import XGBRegressor from snowflake.snowpark import Column, Session categorical_columns = [ diff --git a/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py b/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py index 4ae783a9..9daa4ed4 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py +++ b/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py @@ -10,8 +10,8 @@ from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier from sklearn.model_selection import 
RandomizedSearchCV as SkRandomizedSearchCV -from snowflake.ml.modeling.ensemble import RandomForestClassifier -from snowflake.ml.modeling.model_selection import RandomizedSearchCV +from snowflake.ml.sklearn.ensemble import RandomForestClassifier +from snowflake.ml.sklearn.model_selection import RandomizedSearchCV from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py b/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py index 36f76fe3..273c9fb6 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py +++ b/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py @@ -12,8 +12,8 @@ SGDRegressor as SkSGDRegressor, ) -from snowflake.ml.modeling.ensemble import VotingRegressor -from snowflake.ml.modeling.linear_model import LinearRegression, SGDRegressor +from snowflake.ml.sklearn.ensemble import VotingRegressor +from snowflake.ml.sklearn.linear_model import LinearRegression, SGDRegressor from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/modeling/lightgbm/BUILD.bazel b/tests/integ/snowflake/ml/lightgbm/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/lightgbm/BUILD.bazel rename to tests/integ/snowflake/ml/lightgbm/BUILD.bazel index 18aa456e..2fb38b2c 100644 --- a/tests/integ/snowflake/ml/modeling/lightgbm/BUILD.bazel +++ b/tests/integ/snowflake/ml/lightgbm/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/lightgbm:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/lightgbm:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "lightgbm", - module_root_dir = "snowflake/ml/modeling/lightgbm", + module_root_dir = "snowflake/ml/lightgbm", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/metrics/BUILD.bazel b/tests/integ/snowflake/ml/metrics/BUILD.bazel index 86b52bbe..522d90e5 100644 --- a/tests/integ/snowflake/ml/metrics/BUILD.bazel +++ b/tests/integ/snowflake/ml/metrics/BUILD.bazel @@ -17,7 +17,7 @@ py_test( deps = [ "//snowflake/ml/metrics", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) diff --git a/tests/integ/snowflake/ml/metrics/test_accuracy_score.py b/tests/integ/snowflake/ml/metrics/test_accuracy_score.py index a6cb9f6d..dbe058bf 100644 --- a/tests/integ/snowflake/ml/metrics/test_accuracy_score.py +++ b/tests/integ/snowflake/ml/metrics/test_accuracy_score.py @@ -8,7 +8,7 @@ from snowflake import snowpark from snowflake.ml import metrics as snowml_metrics from snowflake.ml.utils import connection_params -from tests.integ.snowflake.ml.framework import utils +from tests.integ.snowflake.ml.sklearn.framework import utils _DATA, _SCHEMA = utils.gen_fuzz_data( rows=100, diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index 0ffa1ae2..6f833ed9 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -12,7 +12,9 @@ py_test( deps = [ "//snowflake/ml/model:_deployer", "//snowflake/ml/model:_model", - "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/model:custom_model", + 
"//snowflake/ml/model:type_hints", + "//snowflake/ml/sklearn/linear_model:linear_regression", "//snowflake/ml/utils:connection_params", ], ) diff --git a/tests/integ/snowflake/ml/model/model_integ_test.py b/tests/integ/snowflake/ml/model/model_integ_test.py index 28d0904c..dfeb652c 100644 --- a/tests/integ/snowflake/ml/model/model_integ_test.py +++ b/tests/integ/snowflake/ml/model/model_integ_test.py @@ -3,6 +3,7 @@ # import asyncio +import json import os import sys import tempfile @@ -22,7 +23,7 @@ custom_model, type_hints as model_types, ) -from snowflake.ml.modeling.linear_model import LinearRegression +from snowflake.ml.sklearn.linear_model import LinearRegression from snowflake.ml.utils import connection_params from snowflake.snowpark import Session @@ -35,7 +36,7 @@ def __init__(self, context: custom_model.ModelContext) -> None: @custom_model.inference_api def predict(self, input: pd.DataFrame) -> pd.DataFrame: - return pd.DataFrame({"output": input["c1"]}, dtype=np.int64) + return pd.DataFrame({"output": input["c1"]}) class DemoModelSPQuote(custom_model.CustomModel): @@ -44,7 +45,16 @@ def __init__(self, context: custom_model.ModelContext) -> None: @custom_model.inference_api def predict(self, input: pd.DataFrame) -> pd.DataFrame: - return pd.DataFrame({'"output"': input['"c1"']}, dtype=np.int64) + return pd.DataFrame({'"output"': input['"c1"']}) + + +class DemoModelArray(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame({"output": input.values.tolist()}) class AsyncComposeModel(custom_model.CustomModel): @@ -275,7 +285,7 @@ def test_custom_demo_model_sp_quote(self) -> None: pd.testing.assert_frame_equal( res, - pd.DataFrame([1, 4], columns=['"output"']), + pd.DataFrame([1, 4], columns=['"output"'], dtype=np.int8), ) def test_custom_demo_model_sp_mix_1(self) -> None: @@ -339,7 +349,7 @@ def test_custom_demo_model_sp_mix_2(self) -> None: pd.testing.assert_frame_equal( res, - pd.DataFrame([1, 4], columns=["output"]), + pd.DataFrame([1, 4], columns=["output"], dtype=np.int8), ) def test_custom_demo_model(self) -> None: @@ -397,6 +407,237 @@ def test_custom_demo_model(self) -> None: _drop_function(self._session, f"custom_demo_model_{self.run_id}") + def test_custom_demo_model_array(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + lm = DemoModelArray(custom_model.ModelContext()) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + model_api.save_model( + name="custom_demo_model_array", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_array"), + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_array_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_array"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df) + + pd.testing.assert_frame_equal( + res, + pd.DataFrame(data={"output": [[1, 2, 3], [4, 2, 5]]}), + ) + + def test_custom_demo_model_str(self) -> None: + with tempfile.TemporaryDirectory() as 
tmpdir: + lm = DemoModel(custom_model.ModelContext()) + pd_df = pd.DataFrame( + [["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"] + ) + model_api.save_model( + name="custom_demo_model_str", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_str"), + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_str_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_str"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df) + + pd.testing.assert_frame_equal( + res, + pd.DataFrame(data={"output": ["Yogiri", "Artia"]}), + ) + + def test_custom_demo_model_array_sp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + lm = DemoModelArray(custom_model.ModelContext()) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + sp_df = self._session.create_dataframe(pd_df) + model_api.save_model( + name="custom_demo_model_array_sp", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_array_sp"), + model=lm, + sample_input=sp_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_array_sp_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_array_sp"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], sp_df) + + pd.testing.assert_frame_equal( + res.to_pandas().applymap(json.loads), + pd.DataFrame(data={"output": [[1, 2, 3], [4, 2, 5]]}), + ) + + def test_custom_demo_model_str_sp(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + lm = DemoModel(custom_model.ModelContext()) + pd_df = pd.DataFrame( + [["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"] + ) + sp_df = self._session.create_dataframe(pd_df) + model_api.save_model( + name="custom_demo_model_str_sp", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_str_sp"), + model=lm, + sample_input=sp_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_str_sp_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_str_sp"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], sp_df) + + pd.testing.assert_frame_equal( + res.to_pandas(), + pd.DataFrame(data={"output": ["Yogiri", "Artia"]}), + ) + + def test_custom_demo_model_array_str(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + lm = DemoModelArray(custom_model.ModelContext()) + pd_df = pd.DataFrame( + 
[["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"] + ) + model_api.save_model( + name="custom_demo_model_array_str", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_array_str"), + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_array_str_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_array_str"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df) + + pd.testing.assert_frame_equal( + res, + pd.DataFrame(data={"output": [["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]]}), + ) + + def test_custom_demo_model_with_input_no_keep_order(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + lm = DemoModel(custom_model.ModelContext()) + arr = np.random.randint(100, size=(10000, 3)) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + model_api.save_model( + name="custom_demo_model_with_input_no_keep_order", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_with_input_no_keep_order"), + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_with_input_no_keep_order_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_with_input_no_keep_order"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + { + "relax_version": True, + "_snowml_wheel_path": self._snowml_wheel_path, + "output_with_input_features": True, + "keep_order": False, + } + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df) + pd.testing.assert_series_equal(res["output"], res["c1"], check_dtype=False, check_names=False) + + def test_custom_demo_model_with_input(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + lm = DemoModel(custom_model.ModelContext()) + arr = np.random.randint(100, size=(10000, 3)) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + model_api.save_model( + name="custom_demo_model_with_input", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_with_input"), + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_with_input_{self.run_id}", + model_dir_path=os.path.join(tmpdir, "custom_demo_model_with_input"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + { + "relax_version": True, + "_snowml_wheel_path": self._snowml_wheel_path, + "output_with_input_features": True, + } + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df) + pd.testing.assert_series_equal(res["output"], res["c1"], check_dtype=False, check_names=False) + pd.testing.assert_frame_equal( + res, + pd.DataFrame( + np.concatenate([arr, np.expand_dims(arr[:, 0], axis=1)], axis=1), + columns=["c1", "c2", 
"c3", "output"], + ), + check_dtype=False, + ) + def test_custom_model_with_artifacts(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, "bias"), "w") as f: @@ -432,6 +673,98 @@ def test_custom_model_with_artifacts(self) -> None: pd.DataFrame([False, True], columns=["output"]), ) + def test_custom_demo_model_in_stage(self) -> None: + lm = DemoModel(custom_model.ModelContext()) + arr = np.random.randint(100, size=(10000, 3)) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + tmp_stage = self._session.get_session_stage() + model_path_in_stage = f"{tmp_stage}/custom_demo_model_in_stage_{self.run_id}.zip" + model_api.save_model( + name="custom_demo_model_in_stage", + session=self._session, + model_stage_file_path=model_path_in_stage, + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + loaded_model, _ = model_api.load_model(session=self._session, model_stage_file_path=model_path_in_stage) + assert isinstance(loaded_model, DemoModel) + + local_loaded_res = loaded_model.predict(pd_df) + pd.testing.assert_frame_equal( + local_loaded_res, + pd.DataFrame(arr[:, 0], columns=["output"]), + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_demo_model_{self.run_id}", + model_stage_file_path=model_path_in_stage, + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df) + + pd.testing.assert_frame_equal( + res, + pd.DataFrame(arr[:, 0], columns=["output"]), + ) + + self.assertTrue(deploy_info in deployer.list_deployments()) + self.assertEqual(deploy_info, deployer.get_deployment(f"custom_demo_model_{self.run_id}")) + + def test_custom_model_with_artifacts_in_stage(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "bias"), "w") as f: + f.write("10") + lm = DemoModelWithArtifacts( + custom_model.ModelContext(models={}, artifacts={"bias": os.path.join(tmpdir, "bias")}) + ) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + tmp_stage = self._session.get_session_stage() + model_path_in_stage = f"{tmp_stage}/custom_model_with_artifacts_in_stage_{self.run_id}.zip" + model_api.save_model( + name="custom_model_with_artifacts", + session=self._session, + model_stage_file_path=model_path_in_stage, + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + loaded_model, _ = model_api.load_model(session=self._session, model_stage_file_path=model_path_in_stage) + assert isinstance(loaded_model, DemoModelWithArtifacts) + + local_loaded_res = loaded_model.predict(pd_df) + pd.testing.assert_frame_equal( + local_loaded_res, + pd.DataFrame([False, True], columns=["output"]), + ) + + deployer = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + deploy_info = deployer.create_deployment( + name=f"custom_model_with_artifacts{self.run_id}", + model_stage_file_path=model_path_in_stage, + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} + ), + ) + assert deploy_info is not None + res = deployer.predict(deploy_info["name"], pd_df[["c3", "c1", 
"c2"]]) + + pd.testing.assert_frame_equal( + res, + pd.DataFrame([False, True], columns=["output"]), + ) + def test_skl_model_deploy(self) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) regr = linear_model.LinearRegression() @@ -457,7 +790,7 @@ def test_skl_model_deploy(self) -> None: assert di is not None res = dc.predict(di["name"], iris_X) - np.testing.assert_allclose(res["feature_0"].values, regr.predict(iris_X)) + np.testing.assert_allclose(res["output_feature_0"].values, regr.predict(iris_X)) def test_skl_model_proba_deploy(self) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) @@ -484,7 +817,7 @@ def test_skl_model_proba_deploy(self) -> None: ) assert di_predict is not None res = dc.predict(di_predict["name"], iris_X[:10]) - np.testing.assert_allclose(res["feature_0"].values, model.predict(iris_X[:10])) + np.testing.assert_allclose(res["output_feature_0"].values, model.predict(iris_X[:10])) di_predict_proba = dc.create_deployment( name=f"skl_model_predict_proba_{self.run_id}", diff --git a/tests/integ/snowflake/ml/modeling/calibration/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/calibration/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/calibration/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/calibration/BUILD.bazel index 39b05975..91271f7a 100644 --- a/tests/integ/snowflake/ml/modeling/calibration/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/calibration/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/calibration:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/calibration:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.calibration", - module_root_dir = "snowflake/ml/modeling/calibration", + module_root_dir = "snowflake/ml/sklearn/calibration", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/cluster/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/cluster/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/cluster/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/cluster/BUILD.bazel index adbfa784..247afc5a 100644 --- a/tests/integ/snowflake/ml/modeling/cluster/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/cluster/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/cluster:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/cluster:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.cluster", - module_root_dir = "snowflake/ml/modeling/cluster", + module_root_dir = "snowflake/ml/sklearn/cluster", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/compose/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/compose/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/compose/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/compose/BUILD.bazel index 303c831a..34f9c526 100644 --- a/tests/integ/snowflake/ml/modeling/compose/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/compose/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/compose:estimators_info.bzl", "estimator_info_list") 
+load("//snowflake/ml/sklearn/compose:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.compose", - module_root_dir = "snowflake/ml/modeling/compose", + module_root_dir = "snowflake/ml/sklearn/compose", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/covariance/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/covariance/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/covariance/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/covariance/BUILD.bazel index 5dba8854..e03c5cab 100644 --- a/tests/integ/snowflake/ml/modeling/covariance/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/covariance/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/covariance:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/covariance:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.covariance", - module_root_dir = "snowflake/ml/modeling/covariance", + module_root_dir = "snowflake/ml/sklearn/covariance", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/decomposition/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/decomposition/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/decomposition/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/decomposition/BUILD.bazel index 6a19961a..80771ce6 100644 --- a/tests/integ/snowflake/ml/modeling/decomposition/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/decomposition/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/decomposition:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/decomposition:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.decomposition", - module_root_dir = "snowflake/ml/modeling/decomposition", + module_root_dir = "snowflake/ml/sklearn/decomposition", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel similarity index 59% rename from tests/integ/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel index 2dbf8fc9..d5c0f6ac 100644 --- a/tests/integ/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/discriminant_analysis:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/discriminant_analysis:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.discriminant_analysis", - module_root_dir = "snowflake/ml/modeling/discriminant_analysis", + module_root_dir = "snowflake/ml/sklearn/discriminant_analysis", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/ensemble/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/ensemble/BUILD.bazel similarity index 62% rename from 
tests/integ/snowflake/ml/modeling/ensemble/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/ensemble/BUILD.bazel index dc15e953..bd46af45 100644 --- a/tests/integ/snowflake/ml/modeling/ensemble/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/ensemble/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/ensemble:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/ensemble:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.ensemble", - module_root_dir = "snowflake/ml/modeling/ensemble", + module_root_dir = "snowflake/ml/sklearn/ensemble", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/feature_selection/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/feature_selection/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/modeling/feature_selection/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/feature_selection/BUILD.bazel index d6533bc1..64017722 100644 --- a/tests/integ/snowflake/ml/modeling/feature_selection/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/feature_selection/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/feature_selection:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/feature_selection:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.feature_selection", - module_root_dir = "snowflake/ml/modeling/feature_selection", + module_root_dir = "snowflake/ml/sklearn/feature_selection", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/framework/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/framework/BUILD.bazel similarity index 69% rename from tests/integ/snowflake/ml/framework/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/framework/BUILD.bazel index 2e39591a..e087479e 100644 --- a/tests/integ/snowflake/ml/framework/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/framework/BUILD.bazel @@ -10,8 +10,8 @@ py_test( srcs = ["test_base.py"], deps = [ ":utils", - "//snowflake/ml/preprocessing:min_max_scaler", - "//snowflake/ml/preprocessing:standard_scaler", + "//snowflake/ml/sklearn/preprocessing:min_max_scaler", + "//snowflake/ml/sklearn/preprocessing:standard_scaler", "//snowflake/ml/utils:connection_params", ], ) @@ -23,8 +23,8 @@ py_test( timeout = TIMEOUT, deps = [ ":utils", - "//snowflake/ml/preprocessing:min_max_scaler", - "//snowflake/ml/preprocessing:standard_scaler", + "//snowflake/ml/sklearn/preprocessing:min_max_scaler", + "//snowflake/ml/sklearn/preprocessing:standard_scaler", "//snowflake/ml/utils:connection_params", ], ) diff --git a/tests/integ/snowflake/ml/framework/test_base.py b/tests/integ/snowflake/ml/sklearn/framework/test_base.py similarity index 95% rename from tests/integ/snowflake/ml/framework/test_base.py rename to tests/integ/snowflake/ml/sklearn/framework/test_base.py index 5d75c020..0ecaa649 100644 --- a/tests/integ/snowflake/ml/framework/test_base.py +++ b/tests/integ/snowflake/ml/sklearn/framework/test_base.py @@ -7,16 +7,16 @@ import pytest from absl.testing.absltest import TestCase, main -from snowflake.ml.framework.base import BaseTransformer, _process_cols -from snowflake.ml.preprocessing import ( # type: ignore[attr-defined] +from 
snowflake.ml.sklearn.framework.base import BaseTransformer, _process_cols +from snowflake.ml.sklearn.preprocessing import ( # type: ignore[attr-defined] MinMaxScaler, StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session from snowflake.snowpark.exceptions import SnowparkColumnException -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, DATA_NONE_NAN, NUMERIC_COLS, diff --git a/tests/integ/snowflake/ml/framework/test_pipeline.py b/tests/integ/snowflake/ml/sklearn/framework/test_pipeline.py similarity index 97% rename from tests/integ/snowflake/ml/framework/test_pipeline.py rename to tests/integ/snowflake/ml/sklearn/framework/test_pipeline.py index 8765d578..bc0f1971 100644 --- a/tests/integ/snowflake/ml/framework/test_pipeline.py +++ b/tests/integ/snowflake/ml/sklearn/framework/test_pipeline.py @@ -27,15 +27,15 @@ StandardScaler as SklearnStandardScaler, ) -from snowflake.ml.framework.pipeline import Pipeline -from snowflake.ml.preprocessing import ( # type: ignore[attr-defined] +from snowflake.ml.sklearn.framework.pipeline import Pipeline +from snowflake.ml.sklearn.preprocessing import ( # type: ignore[attr-defined] MinMaxScaler, StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import DataFrame, Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -208,7 +208,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.framework.pipeline"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.framework.pipeline"]) # cloudpickle pipeline_load_cloudpickle = cloudpickle.loads(pipeline_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/framework/utils.py b/tests/integ/snowflake/ml/sklearn/framework/utils.py similarity index 100% rename from tests/integ/snowflake/ml/framework/utils.py rename to tests/integ/snowflake/ml/sklearn/framework/utils.py diff --git a/tests/integ/snowflake/ml/modeling/gaussian_process/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/gaussian_process/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/modeling/gaussian_process/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/gaussian_process/BUILD.bazel index 2cc8a11f..23971ad4 100644 --- a/tests/integ/snowflake/ml/modeling/gaussian_process/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/gaussian_process/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/gaussian_process:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/gaussian_process:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.gaussian_process", - module_root_dir = "snowflake/ml/modeling/gaussian_process", + module_root_dir = "snowflake/ml/sklearn/gaussian_process", estimator_info_list=estimator_info_list ) diff --git 
a/tests/integ/snowflake/ml/modeling/impute/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/impute/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/impute/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/impute/BUILD.bazel index be952877..1b79cbbc 100644 --- a/tests/integ/snowflake/ml/modeling/impute/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/impute/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/impute:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/impute:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.impute", - module_root_dir = "snowflake/ml/modeling/impute", + module_root_dir = "snowflake/ml/sklearn/impute", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/isotonic/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/isotonic/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/isotonic/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/isotonic/BUILD.bazel index 28597688..f1e34874 100644 --- a/tests/integ/snowflake/ml/modeling/isotonic/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/isotonic/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/isotonic:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/isotonic:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.isotonic", - module_root_dir = "snowflake/ml/modeling/isotonic", + module_root_dir = "snowflake/ml/sklearn/isotonic", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/kernel_approximation/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel similarity index 59% rename from tests/integ/snowflake/ml/modeling/kernel_approximation/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel index b29935b2..880c5dd0 100644 --- a/tests/integ/snowflake/ml/modeling/kernel_approximation/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/kernel_approximation:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/kernel_approximation:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.kernel_approximation", - module_root_dir = "snowflake/ml/modeling/kernel_approximation", + module_root_dir = "snowflake/ml/sklearn/kernel_approximation", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/kernel_ridge/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/kernel_ridge/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel index efe574e8..317df396 100644 --- a/tests/integ/snowflake/ml/modeling/kernel_ridge/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/kernel_ridge:estimators_info.bzl", 
"estimator_info_list") +load("//snowflake/ml/sklearn/kernel_ridge:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.kernel_ridge", - module_root_dir = "snowflake/ml/modeling/kernel_ridge", + module_root_dir = "snowflake/ml/sklearn/kernel_ridge", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/linear_model/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/linear_model/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/linear_model/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/linear_model/BUILD.bazel index 6ed8421e..a81b02ab 100644 --- a/tests/integ/snowflake/ml/modeling/linear_model/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/linear_model/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/linear_model:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/linear_model:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.linear_model", - module_root_dir = "snowflake/ml/modeling/linear_model", + module_root_dir = "snowflake/ml/sklearn/linear_model", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/manifold/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/manifold/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/manifold/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/manifold/BUILD.bazel index e53401d2..a4109257 100644 --- a/tests/integ/snowflake/ml/modeling/manifold/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/manifold/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/manifold:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/manifold:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.manifold", - module_root_dir = "snowflake/ml/modeling/manifold", + module_root_dir = "snowflake/ml/sklearn/manifold", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/mixture/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/mixture/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/mixture/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/mixture/BUILD.bazel index 08dd1fc3..45a416f7 100644 --- a/tests/integ/snowflake/ml/modeling/mixture/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/mixture/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/mixture:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/mixture:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.mixture", - module_root_dir = "snowflake/ml/modeling/mixture", + module_root_dir = "snowflake/ml/sklearn/mixture", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/model_selection/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/model_selection/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/modeling/model_selection/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/model_selection/BUILD.bazel 
index 6c90ca2b..fd05ef10 100644 --- a/tests/integ/snowflake/ml/modeling/model_selection/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/model_selection/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/model_selection:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/model_selection:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.model_selection", - module_root_dir = "snowflake/ml/modeling/model_selection", + module_root_dir = "snowflake/ml/sklearn/model_selection", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/multiclass/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/multiclass/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/multiclass/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/multiclass/BUILD.bazel index f9ed793e..3fe9b70a 100644 --- a/tests/integ/snowflake/ml/modeling/multiclass/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/multiclass/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/multiclass:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/multiclass:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.multiclass", - module_root_dir = "snowflake/ml/modeling/multiclass", + module_root_dir = "snowflake/ml/sklearn/multiclass", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/multioutput/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/multioutput/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/multioutput/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/multioutput/BUILD.bazel index b3cc2f6e..03800288 100644 --- a/tests/integ/snowflake/ml/modeling/multioutput/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/multioutput/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/multioutput:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/multioutput:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.multioutput", - module_root_dir = "snowflake/ml/modeling/multioutput", + module_root_dir = "snowflake/ml/sklearn/multioutput", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/naive_bayes/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/naive_bayes/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/naive_bayes/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/naive_bayes/BUILD.bazel index effe5868..700afc11 100644 --- a/tests/integ/snowflake/ml/modeling/naive_bayes/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/naive_bayes/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/naive_bayes:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/naive_bayes:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.naive_bayes", - module_root_dir = "snowflake/ml/modeling/naive_bayes", + 
module_root_dir = "snowflake/ml/sklearn/naive_bayes", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/neighbors/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/neighbors/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/modeling/neighbors/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/neighbors/BUILD.bazel index c11ab353..e4fc5d3b 100644 --- a/tests/integ/snowflake/ml/modeling/neighbors/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/neighbors/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/neighbors:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/neighbors:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.neighbors", - module_root_dir = "snowflake/ml/modeling/neighbors", + module_root_dir = "snowflake/ml/sklearn/neighbors", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/neural_network/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/neural_network/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/neural_network/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/neural_network/BUILD.bazel index ae89ad6f..7ef7d44a 100644 --- a/tests/integ/snowflake/ml/modeling/neural_network/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/neural_network/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/neural_network:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/neural_network:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.neural_network", - module_root_dir = "snowflake/ml/modeling/neural_network", + module_root_dir = "snowflake/ml/sklearn/neural_network", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/preprocessing/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/preprocessing/BUILD.bazel similarity index 53% rename from tests/integ/snowflake/ml/preprocessing/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/preprocessing/BUILD.bazel index b61f06f9..93823d35 100644 --- a/tests/integ/snowflake/ml/preprocessing/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/BUILD.bazel @@ -9,9 +9,9 @@ py_test( name = "test_binarizer", srcs = ["test_binarizer.py"], deps = [ - "//snowflake/ml/preprocessing:binarizer", + "//snowflake/ml/sklearn/preprocessing:binarizer", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -21,10 +21,10 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:k_bins_discretizer", + "//snowflake/ml/sklearn/preprocessing:k_bins_discretizer", "//snowflake/ml/utils:connection_params", "//snowflake/ml/utils:sparse", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -34,9 +34,9 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:label_encoder", + "//snowflake/ml/sklearn/preprocessing:label_encoder", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -44,9 
+44,9 @@ py_test( name = "test_max_abs_scaler", srcs = ["test_max_abs_scaler.py"], deps = [ - "//snowflake/ml/preprocessing:max_abs_scaler", + "//snowflake/ml/sklearn/preprocessing:max_abs_scaler", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -54,9 +54,9 @@ py_test( name = "test_min_max_scaler", srcs = ["test_min_max_scaler.py"], deps = [ - "//snowflake/ml/preprocessing:min_max_scaler", + "//snowflake/ml/sklearn/preprocessing:min_max_scaler", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -66,9 +66,9 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:normalizer", + "//snowflake/ml/sklearn/preprocessing:normalizer", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -78,10 +78,11 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:one_hot_encoder", + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", "//snowflake/ml/utils:connection_params", "//snowflake/ml/utils:sparse", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -91,9 +92,9 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:ordinal_encoder", + "//snowflake/ml/sklearn/preprocessing:ordinal_encoder", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -103,9 +104,9 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:robust_scaler", + "//snowflake/ml/sklearn/preprocessing:robust_scaler", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -113,9 +114,9 @@ py_test( name = "test_standard_scaler", srcs = ["test_standard_scaler.py"], deps = [ - "//snowflake/ml/preprocessing:standard_scaler", + "//snowflake/ml/sklearn/preprocessing:standard_scaler", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -125,9 +126,9 @@ py_test( shard_count = SHARD_COUNT, timeout = TIMEOUT, deps = [ - "//snowflake/ml/preprocessing:simple_imputer", + "//snowflake/ml/sklearn/preprocessing:simple_imputer", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) @@ -135,17 +136,17 @@ py_test( name = "test_drop_input_cols", srcs = ["test_drop_input_cols.py"], deps = [ - "//snowflake/ml/preprocessing:binarizer", - "//snowflake/ml/preprocessing:label_encoder", - "//snowflake/ml/preprocessing:max_abs_scaler", - "//snowflake/ml/preprocessing:min_max_scaler", - "//snowflake/ml/preprocessing:normalizer", - "//snowflake/ml/preprocessing:one_hot_encoder", - "//snowflake/ml/preprocessing:ordinal_encoder", - "//snowflake/ml/preprocessing:robust_scaler", - "//snowflake/ml/preprocessing:simple_imputer", - "//snowflake/ml/preprocessing:standard_scaler", + "//snowflake/ml/sklearn/preprocessing:binarizer", + "//snowflake/ml/sklearn/preprocessing:label_encoder", + 
"//snowflake/ml/sklearn/preprocessing:max_abs_scaler", + "//snowflake/ml/sklearn/preprocessing:min_max_scaler", + "//snowflake/ml/sklearn/preprocessing:normalizer", + "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", + "//snowflake/ml/sklearn/preprocessing:ordinal_encoder", + "//snowflake/ml/sklearn/preprocessing:robust_scaler", + "//snowflake/ml/sklearn/preprocessing:simple_imputer", + "//snowflake/ml/sklearn/preprocessing:standard_scaler", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/framework:utils", + "//tests/integ/snowflake/ml/sklearn/framework:utils", ], ) diff --git a/tests/integ/snowflake/ml/preprocessing/test_binarizer.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_binarizer.py similarity index 95% rename from tests/integ/snowflake/ml/preprocessing/test_binarizer.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_binarizer.py index 860ce43a..fb2754f4 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_binarizer.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_binarizer.py @@ -14,11 +14,11 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import Binarizer as SklearnBinarizer -from snowflake.ml.preprocessing import Binarizer # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import Binarizer # type: ignore[attr-defined] from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, DATA_NONE_NAN, ID_COL, @@ -139,7 +139,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.binarizer"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.binarizer"]) # cloudpickle binarizer_load_cloudpickle = cloudpickle.loads(binarizer_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_drop_input_cols.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_drop_input_cols.py similarity index 94% rename from tests/integ/snowflake/ml/preprocessing/test_drop_input_cols.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_drop_input_cols.py index 63b65ea3..dbd34022 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_drop_input_cols.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_drop_input_cols.py @@ -8,8 +8,8 @@ import numpy as np from absl.testing.absltest import TestCase -from snowflake.ml.framework.pipeline import Pipeline -from snowflake.ml.preprocessing import ( # type: ignore[attr-defined] +from snowflake.ml.sklearn.framework.pipeline import Pipeline +from snowflake.ml.sklearn.preprocessing import ( # type: ignore[attr-defined] Binarizer, LabelEncoder, MaxAbsScaler, @@ -23,8 +23,8 @@ ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( CATEGORICAL_COLS, DATA, ID_COL, diff --git a/tests/integ/snowflake/ml/preprocessing/test_k_bins_discretizer.py 
b/tests/integ/snowflake/ml/sklearn/preprocessing/test_k_bins_discretizer.py similarity index 98% rename from tests/integ/snowflake/ml/preprocessing/test_k_bins_discretizer.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_k_bins_discretizer.py index 50dc0b15..f946df2b 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_k_bins_discretizer.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_k_bins_discretizer.py @@ -8,11 +8,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import KBinsDiscretizer as SklearnKBinsDiscretizer -from snowflake.ml.preprocessing import KBinsDiscretizer # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + KBinsDiscretizer, # type: ignore[attr-defined] +) from snowflake.ml.utils import sparse as sparse_utils from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils +from tests.integ.snowflake.ml.sklearn.framework import utils np.set_printoptions(threshold=sys.maxsize) diff --git a/tests/integ/snowflake/ml/preprocessing/test_label_encoder.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_label_encoder.py similarity index 96% rename from tests/integ/snowflake/ml/preprocessing/test_label_encoder.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_label_encoder.py index abd873cb..fbc833b0 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_label_encoder.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_label_encoder.py @@ -14,11 +14,13 @@ from absl.testing.absltest import main from sklearn.preprocessing import LabelEncoder as SklearnLabelEncoder -from snowflake.ml.preprocessing import LabelEncoder # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + LabelEncoder, # type: ignore[attr-defined] +) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, DATA_BOOLEAN, DATA_NONE_NAN, @@ -208,7 +210,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.label_encoder"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.label_encoder"]) # cloudpickle label_encoder_load_cloudpickle = cloudpickle.loads(label_encoder_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_max_abs_scaler.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_max_abs_scaler.py similarity index 95% rename from tests/integ/snowflake/ml/preprocessing/test_max_abs_scaler.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_max_abs_scaler.py index 134d3940..321b8dc6 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_max_abs_scaler.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_max_abs_scaler.py @@ -16,11 +16,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import MaxAbsScaler as SklearnMaxAbsScaler -from snowflake.ml.preprocessing import MaxAbsScaler # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + MaxAbsScaler, # type: ignore[attr-defined] +) from 
snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -152,7 +154,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.max_abs_scaler"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.max_abs_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_min_max_scaler.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_min_max_scaler.py similarity index 97% rename from tests/integ/snowflake/ml/preprocessing/test_min_max_scaler.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_min_max_scaler.py index e3bbc3b6..273d052a 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_min_max_scaler.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_min_max_scaler.py @@ -15,11 +15,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler -from snowflake.ml.preprocessing import MinMaxScaler # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + MinMaxScaler, # type: ignore[attr-defined] +) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, DATA_CLIP, ID_COL, @@ -353,7 +355,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.min_max_scaler"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.min_max_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_normalizer.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_normalizer.py similarity index 96% rename from tests/integ/snowflake/ml/preprocessing/test_normalizer.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_normalizer.py index b6a78748..b36a37b9 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_normalizer.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_normalizer.py @@ -16,12 +16,12 @@ from absl.testing.absltest import main from sklearn.preprocessing import Normalizer as SklearnNormalizer -from snowflake.ml.preprocessing import Normalizer # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import Normalizer # type: ignore[attr-defined] from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session from snowflake.snowpark.exceptions import SnowparkSQLException -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from 
tests.integ.snowflake.ml.sklearn.framework.utils import ( CATEGORICAL_COLS, DATA_NONE_NAN, ID_COL, @@ -197,7 +197,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.normalizer"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.normalizer"]) # cloudpickle normalizer_load_cloudpickle = cloudpickle.loads(normalizer_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_one_hot_encoder.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_one_hot_encoder.py similarity index 99% rename from tests/integ/snowflake/ml/preprocessing/test_one_hot_encoder.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_one_hot_encoder.py index 87120cde..ab5e161e 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_one_hot_encoder.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_one_hot_encoder.py @@ -20,12 +20,15 @@ from scipy.sparse import csr_matrix from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder -from snowflake.ml.preprocessing import OneHotEncoder # type: ignore[attr-defined] +from snowflake.ml._internal.utils import identifier as utils_identifier +from snowflake.ml.sklearn.preprocessing import ( + OneHotEncoder, # type: ignore[attr-defined] +) from snowflake.ml.utils import sparse as utils_sparse from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import DataFrame, Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( BOOLEAN_COLS, CATEGORICAL_COLS, DATA, @@ -1500,7 +1503,12 @@ def test_get_output_cols_dense(self, params: Dict[str, Any]) -> None: encoder.fit(df) expected_output_cols = [] for input_col in input_cols: - expected_output_cols.extend([f'"{col}"' for col in encoder._dense_output_cols_mappings[input_col]]) + expected_output_cols.extend( + [ + utils_identifier.quote_name_without_upper_casing(col) + for col in encoder._dense_output_cols_mappings[input_col] + ] + ) # output columns are set before fitting # fit Snowpark dataframe @@ -1560,7 +1568,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.one_hot_encoder"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.one_hot_encoder"]) # cloudpickle encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_ordinal_encoder.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_ordinal_encoder.py similarity index 99% rename from tests/integ/snowflake/ml/preprocessing/test_ordinal_encoder.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_ordinal_encoder.py index 00ca6bce..8aac2dd1 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_ordinal_encoder.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_ordinal_encoder.py @@ -18,11 +18,13 @@ from absl.testing.absltest import main from sklearn.preprocessing import OrdinalEncoder as SklearnOrdinalEncoder -from snowflake.ml.preprocessing import OrdinalEncoder # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + OrdinalEncoder, # type: ignore[attr-defined] +) from 
snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( BOOLEAN_COLS, CATEGORICAL_COLS, DATA, @@ -809,7 +811,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.ordinal_encoder"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.ordinal_encoder"]) # cloudpickle encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_robust_scaler.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_robust_scaler.py similarity index 97% rename from tests/integ/snowflake/ml/preprocessing/test_robust_scaler.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_robust_scaler.py index 0d58ce8f..5d826bd3 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_robust_scaler.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_robust_scaler.py @@ -17,11 +17,13 @@ from absl.testing.absltest import main from sklearn.preprocessing import RobustScaler as SklearnRobustScaler -from snowflake.ml.preprocessing import RobustScaler # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + RobustScaler, # type: ignore[attr-defined] +) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -271,7 +273,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.robust_scaler"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.robust_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_simple_imputer.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_simple_imputer.py similarity index 98% rename from tests/integ/snowflake/ml/preprocessing/test_simple_imputer.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_simple_imputer.py index 50e37579..936e8876 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_simple_imputer.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_simple_imputer.py @@ -15,11 +15,13 @@ from absl.testing.absltest import main from sklearn.impute import SimpleImputer as SklearnSimpleImputer -from snowflake.ml.preprocessing import SimpleImputer # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + SimpleImputer, # type: ignore[attr-defined] +) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils 
import ( CATEGORICAL_COLS, DATA, DATA_ALL_NONE, @@ -501,7 +503,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.simple_imputer"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.simple_imputer"]) # cloudpickle simple_imputer_load_cloudpickle = cloudpickle.loads(simple_imputer_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/preprocessing/test_standard_scaler.py b/tests/integ/snowflake/ml/sklearn/preprocessing/test_standard_scaler.py similarity index 98% rename from tests/integ/snowflake/ml/preprocessing/test_standard_scaler.py rename to tests/integ/snowflake/ml/sklearn/preprocessing/test_standard_scaler.py index 70dcd786..a39ed348 100644 --- a/tests/integ/snowflake/ml/preprocessing/test_standard_scaler.py +++ b/tests/integ/snowflake/ml/sklearn/preprocessing/test_standard_scaler.py @@ -15,11 +15,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import StandardScaler as SklearnStandardScaler -from snowflake.ml.preprocessing import StandardScaler # type: ignore[attr-defined] +from snowflake.ml.sklearn.preprocessing import ( + StandardScaler, # type: ignore[attr-defined] +) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.framework import utils as framework_utils -from tests.integ.snowflake.ml.framework.utils import ( +from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils +from tests.integ.snowflake.ml.sklearn.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -385,7 +387,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.preprocessing.standard_scaler"]) + importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.standard_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/modeling/semi_supervised/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/semi_supervised/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/modeling/semi_supervised/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/semi_supervised/BUILD.bazel index 54bc3a86..a3af24a6 100644 --- a/tests/integ/snowflake/ml/modeling/semi_supervised/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/semi_supervised/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/semi_supervised:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/semi_supervised:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.semi_supervised", - module_root_dir = "snowflake/ml/modeling/semi_supervised", + module_root_dir = "snowflake/ml/sklearn/semi_supervised", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/svm/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/svm/BUILD.bazel similarity index 63% rename from tests/integ/snowflake/ml/modeling/svm/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/svm/BUILD.bazel index ce9a11d3..60eaa350 100644 --- a/tests/integ/snowflake/ml/modeling/svm/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/svm/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") 
-load("//snowflake/ml/modeling/svm:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/svm:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.svm", - module_root_dir = "snowflake/ml/modeling/svm", + module_root_dir = "snowflake/ml/sklearn/svm", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/tree/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/tree/BUILD.bazel similarity index 63% rename from tests/integ/snowflake/ml/modeling/tree/BUILD.bazel rename to tests/integ/snowflake/ml/sklearn/tree/BUILD.bazel index 26ec20cd..a97cc6bf 100644 --- a/tests/integ/snowflake/ml/modeling/tree/BUILD.bazel +++ b/tests/integ/snowflake/ml/sklearn/tree/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/tree:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/sklearn/tree:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.tree", - module_root_dir = "snowflake/ml/modeling/tree", + module_root_dir = "snowflake/ml/sklearn/tree", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/xgboost/BUILD.bazel b/tests/integ/snowflake/ml/xgboost/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/modeling/xgboost/BUILD.bazel rename to tests/integ/snowflake/ml/xgboost/BUILD.bazel index c1a9221a..b1d7a563 100644 --- a/tests/integ/snowflake/ml/modeling/xgboost/BUILD.bazel +++ b/tests/integ/snowflake/ml/xgboost/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/modeling/xgboost:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/xgboost:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "xgboost", - module_root_dir = "snowflake/ml/modeling/xgboost", + module_root_dir = "snowflake/ml/xgboost", estimator_info_list=estimator_info_list )