diff --git a/CHANGELOG.md b/CHANGELOG.md index 9819eae9..945d6e4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,41 @@ # Release History -## 1.0.5 +## 1.0.6 + +### New Features +- Model Registry: Added `create_if_not_exists` parameter to the constructor. +- Model Registry: Added `get_or_create_model_registry` API. +- Model Registry: Added support for using GPU inference when deploying XGBoost (`xgboost.XGBModel` and `xgboost.Booster`), PyTorch (`torch.nn.Module` and `torch.jit.ScriptModule`) and TensorFlow (`tensorflow.Module` and `tensorflow.keras.Model`) models to Snowpark Container Services. +- Model Registry: When inferring model signature, `Sequence` of built-in types, `Sequence` of `numpy.ndarray`, `Sequence` of `torch.Tensor` and `Sequence` of `tensorflow.Tensor` can be used instead of only `List` of them. +- Model Registry: Added `get_training_dataset` API. +- Model Development: Size of metrics result can now exceed the previous 8MB limit. +- Model Registry: Added support for saving/loading/deploying HuggingFace pipeline objects (`transformers.Pipeline`) and our wrapper (`snowflake.ml.model.models.huggingface_pipeline.HuggingFacePipelineModel`) to the registry. Use the wrapper to specify configurations; the model for the pipeline will then be loaded dynamically when deploying. Currently, the following tasks can be logged without manually specifying model signatures: + - "conversational" + - "fill-mask" + - "question-answering" + - "summarization" + - "table-question-answering" + - "text2text-generation" + - "text-classification" (alias "sentiment-analysis" available) + - "text-generation" + - "token-classification" (alias "ner" available) + - "translation" + - "translation_xx_to_yy" + - "zero-shot-classification" + +### Bug Fixes +- Model Development: Fixed a bug when using the simple imputer with numpy >= 1.25. +- Model Development: Fixed a bug when inferring the type of label columns. + +### Behavior Changes +- Model Registry: `log_model()` now returns a `ModelReference` object instead of a model ID. +- Model Registry: When deploying a model with only one target method, the `target_method` argument can be omitted. +- Model Registry: When using a version of snowflake-ml-python newer than what is available in the Snowflake Anaconda Channel, the `embed_local_ml_library` option will be set to `True` automatically if not explicitly set. +- Model Registry: When deploying a model to Snowpark Container Services and using GPU, the default value of `num_workers` will be 1. +- Model Registry: `keep_order` and `output_with_input_features` in the deploy options have been removed. The behavior is now controlled by the type of the input when calling `model.predict()`; see the sketch after this list. If the input is a `pandas.DataFrame`, the behavior is the same as `keep_order=True` and `output_with_input_features=False` before. If the input is a `snowpark.DataFrame`, the behavior is the same as `keep_order=False` and `output_with_input_features=True` before. +- Model Registry: When logging and deploying PyTorch (`torch.nn.Module` and `torch.jit.ScriptModule`) and TensorFlow (`tensorflow.Module` and `tensorflow.keras.Model`) models, we no longer accept models whose input and output are each a list of tensors. Instead, we now accept models that take one or more tensors as positional arguments and return a tensor or a tuple of tensors. The input and output DataFrames used in prediction remain the same as before; that is, every column is an array feature containing a tensor.
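Editor's note: since the `keep_order`/`output_with_input_features` removal above changes call sites, here is a minimal sketch of the new input-type-driven behavior. It assumes an existing Snowpark `session` and a fitted model `clf`; the registry database and deployment names are illustrative, and the exact method signatures are not shown in this diff, so they may differ.

```python
import pandas as pd

from snowflake.ml.registry import model_registry

# Assumed context (not part of this diff): `session` is an existing
# snowpark.Session and `clf` is a fitted model of a supported type.
registry = model_registry.ModelRegistry(session=session, database_name="ML_DB")

# 1.0.6 behavior change: log_model() now returns a ModelReference, not a model ID.
model_ref = registry.log_model(model_name="my_model", model_version="v1", model=clf)

# With a single target method, target_method can now be omitted when deploying.
model_ref.deploy(deployment_name="my_deployment")

# pandas.DataFrame input: row order is preserved and only the model's output
# columns are returned (the old keep_order=True, output_with_input_features=False).
pd_out = model_ref.predict("my_deployment", pd.DataFrame({"FEATURE_1": [1.0, 2.0]}))

# snowpark.DataFrame input: row order is not guaranteed, but the input feature
# columns are carried through alongside the outputs (the old keep_order=False,
# output_with_input_features=True).
sp_out = model_ref.predict("my_deployment", session.table("MY_FEATURES"))
```

The same call site thus serves both local and warehouse-side inputs; only the ordering guarantee and the presence of input features in the result differ.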
+ +## 1.0.5 (2023-08-17) ### New Features @@ -13,7 +48,7 @@ - Model Registry: Fixed an issue where the UDF name created when deploying a model is not identical to what is provided and cannot be correctly dropped when the deployment is dropped. - connection_params.SnowflakeLoginOptions(): Added support for `private_key_path`. -## 1.0.4 +## 1.0.4 (2023-07-28) ### New Features diff --git a/bazel/environments/conda-env-build.yml b/bazel/environments/conda-env-build.yml index be19154f..6bb229d9 100644 --- a/bazel/environments/conda-env-build.yml +++ b/bazel/environments/conda-env-build.yml @@ -14,5 +14,5 @@ dependencies: - numpy==1.24.3 - packaging==23.0 - pyyaml==6.0 -- scikit-learn==1.2.2 +- scikit-learn==1.3.0 - xgboost==1.7.3 diff --git a/bazel/environments/conda-env-snowflake.yml b/bazel/environments/conda-env-snowflake.yml index 13a21ec4..277138dc 100644 --- a/bazel/environments/conda-env-snowflake.yml +++ b/bazel/environments/conda-env-snowflake.yml @@ -9,6 +9,7 @@ dependencies: - aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 +- cachetools==4.2.2 - cloudpickle==2.0.0 - conda-libmamba-solver==23.3.0 - coverage==6.3.2 @@ -23,6 +24,7 @@ dependencies: - lightgbm==3.3.5 - mlflow==2.3.1 - moto==4.0.11 +- multipledispatch==0.6.0 - mypy==0.981 - networkx==2.8.4 - numpy==1.24.3 @@ -36,7 +38,7 @@ dependencies: - requests==2.29.0 - ruamel.yaml==0.17.21 - s3fs==2022.11.0 -- scikit-learn==1.2.2 +- scikit-learn==1.3.0 - scipy==1.9.3 - snowflake-connector-python==3.0.3 - snowflake-snowpark-python==1.5.1 @@ -44,5 +46,6 @@ dependencies: - tensorflow==2.10.0 - transformers==4.29.2 - types-protobuf==4.23.0.1 +- types-requests==2.30.0.0 - typing-extensions==4.5.0 - xgboost==1.7.3 diff --git a/bazel/environments/conda-env.yml b/bazel/environments/conda-env.yml index fa6d69d8..c281fd27 100644 --- a/bazel/environments/conda-env.yml +++ b/bazel/environments/conda-env.yml @@ -9,9 +9,11 @@ dependencies: - aiohttp==3.8.3 - anyio==3.5.0 - boto3==1.24.28 +- cachetools==4.2.2 - cloudpickle==2.0.0 - conda-forge::starlette==0.27.0 - conda-forge::types-PyYAML==6.0.12 +- conda-forge::types-cachetools==4.2.2 - conda-libmamba-solver==23.3.0 - coverage==6.3.2 - cryptography==39.0.1 @@ -25,6 +27,7 @@ dependencies: - lightgbm==3.3.5 - mlflow==2.3.1 - moto==4.0.11 +- multipledispatch==0.6.0 - mypy==0.981 - networkx==2.8.4 - numpy==1.24.3 @@ -39,7 +42,7 @@ dependencies: - requests==2.29.0 - ruamel.yaml==0.17.21 - s3fs==2022.11.0 -- scikit-learn==1.2.2 +- scikit-learn==1.3.0 - scipy==1.9.3 - snowflake-connector-python==3.0.3 - snowflake-snowpark-python==1.5.1 @@ -47,5 +50,6 @@ dependencies: - tensorflow==2.10.0 - transformers==4.29.2 - types-protobuf==4.23.0.1 +- types-requests==2.30.0.0 - typing-extensions==4.5.0 - xgboost==1.7.3 diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 6b1b3b43..6b8933c0 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -17,7 +17,7 @@ build: noarch: python package: name: snowflake-ml-python - version: 1.0.5 + version: 1.0.6 requirements: build: - python @@ -34,7 +34,7 @@ requirements: - python - pyyaml>=6.0,<7 - requests - - scikit-learn>=1.2.1,<1.3 + - scikit-learn>=1.2.1,<1.4 - scipy>=1.9,<2 - snowflake-connector-python>=3.0.3,<4 - snowflake-snowpark-python>=1.5.1,<2 @@ -43,8 +43,9 @@ requirements: - xgboost>=1.7.3,<2 run_constrained: - lightgbm==3.3.5 - - mlflow>=2.1.0,<3 + - mlflow>=2.1.0,<2.4 - tensorflow>=2.9,<3 - torchdata>=0.4,<1 + - transformers>=4.29.2,<5 source: path: ../../ diff --git a/codegen/sklearn_wrapper_template.py_template
b/codegen/sklearn_wrapper_template.py_template index 4187a093..8771d318 100644 --- a/codegen/sklearn_wrapper_template.py_template +++ b/codegen/sklearn_wrapper_template.py_template @@ -25,6 +25,10 @@ from snowflake.snowpark import DataFrame, Session from snowflake.snowpark.functions import pandas_udf, sproc from snowflake.snowpark.types import PandasSeries from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type +from snowflake.snowpark._internal.utils import ( + TempObjectType, + random_name_for_temp_object, +) from snowflake.ml.model.model_signature import ( DataType, @@ -244,7 +248,7 @@ class {transform.original_class_name}(BaseTransformer): cp.dump(self._sklearn_object, local_transform_file) # Create temp stage to run fit. - transform_stage_name = "SNOWML_TRANSFORM_{{safe_id}}".format(safe_id=self._get_rand_id()) + transform_stage_name = random_name_for_temp_object(TempObjectType.STAGE) stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {{transform_stage_name}};" SqlResultValidator( session=session, @@ -258,7 +262,7 @@ class {transform.original_class_name}(BaseTransformer): stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name)) local_result_file_name = get_temp_file_path() - fit_sproc_name = "SNOWML_FIT_{{safe_id}}".format(safe_id=self._get_rand_id()) + fit_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, subproject=_SUBPROJECT, @@ -439,8 +443,7 @@ class {transform.original_class_name}(BaseTransformer): pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT) # Register vectorized UDF for batch inference - batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{{safe_id}}_{{method}}".format( - safe_id=self._get_rand_id(), method=inference_method) + batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION) # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark # will try to pickle all of self which fails. @@ -701,8 +704,17 @@ class {transform.original_class_name}(BaseTransformer): expected_type_inferred = "{transform.udf_datatype}" # when it is classifier, infer the datatype from label columns if expected_type_inferred == "" and 'predict' in self.model_signatures: + # Batch inference takes a single expected output column type. Use the first column's type for now. + # TODO: Handle varying output column types. + label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols] + if len(label_cols_signatures) == 0: + error_str = f"Output columns {{self.output_cols}} do not match model signatures {{self.model_signatures['predict'].outputs}}." + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=ValueError(error_str), + ) expected_type_inferred = convert_sp_to_sf_type( - self.model_signatures['predict'].outputs[0].as_snowpark_type() + label_cols_signatures[0].as_snowpark_type() ) output_df = self._batch_inference( @@ -955,7 +967,7 @@ class {transform.original_class_name}(BaseTransformer): cp.dump(self._sklearn_object, local_score_file) # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{{safe_id}}".format(safe_id=self._get_rand_id()) + score_stage_name = random_name_for_temp_object(TempObjectType.STAGE) session = dataset._session assert session is not None # keep mypy happy stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {{score_stage_name}};" @@ -968,7 +980,7 @@ class {transform.original_class_name}(BaseTransformer): # Use posixpath to construct stage paths stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name)) - score_sproc_name = "SNOWML_SCORE_{{safe_id}}".format(safe_id=self._get_rand_id()) + score_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) statement_params = telemetry.get_function_usage_statement_params( project=_PROJECT, subproject=_SUBPROJECT, diff --git a/requirements.yml b/requirements.yml index 3aa82dd4..d3267a52 100644 --- a/requirements.yml +++ b/requirements.yml @@ -68,6 +68,7 @@ version_requirements: ">=0.15,<2" tags: - build_essential + - deployment_core # For fsspec[http] in conda - name_conda: aiohttp dev_version_conda: "3.8.3" @@ -123,7 +124,7 @@ - build_essential - name: mlflow dev_version: "2.3.1" - version_requirements: ">=2.1.0,<3" + version_requirements: ">=2.1.0,<2.4" requirements_extra_tags: - mlflow - name: moto @@ -176,8 +177,8 @@ - name: s3fs dev_version: "2022.11.0" - name: scikit-learn - dev_version: "1.2.2" - version_requirements: ">=1.2.1,<1.3" + dev_version: "1.3.0" + version_requirements: ">=1.2.1,<1.4" tags: - build_essential - name: scipy @@ -211,6 +212,11 @@ - torch - name: transformers dev_version: "4.29.2" + version_requirements: ">=4.29.2,<5" + requirements_extra_tags: + - transformers +- name: types-requests + dev_version: "2.30.0.0" - name: types-protobuf dev_version: "4.23.0.1" - name: types-PyYAML @@ -226,3 +232,12 @@ version_requirements: ">=1.7.3,<2" tags: - build_essential +- name: types-cachetools + dev_version: "4.2.2" + from_channel: conda-forge +- name: cachetools + dev_version: "4.2.2" +# TODO: this will be a user side dep requirement +# enable when we are releasing FS. 
+- name: multipledispatch + dev_version: "0.6.0" diff --git a/snowflake/ml/_internal/env_utils.py b/snowflake/ml/_internal/env_utils.py index 1e26d3d2..777da25c 100644 --- a/snowflake/ml/_internal/env_utils.py +++ b/snowflake/ml/_internal/env_utils.py @@ -339,3 +339,142 @@ def parse_python_version_string(dep: str) -> Optional[str]: # "python" only, no specifier return "" return None + + +def _find_conda_dep_spec( + conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], pkg_name: str +) -> Optional[Tuple[str, requirements.Requirement]]: + for channel in conda_chan_deps: + spec = next(filter(lambda req: req.name == pkg_name, conda_chan_deps[channel]), None) + if spec: + return channel, spec + return None + + +def _find_pip_req_spec(pip_reqs: List[requirements.Requirement], pkg_name: str) -> Optional[requirements.Requirement]: + spec = next(filter(lambda req: req.name == pkg_name, pip_reqs), None) + return spec + + +def _find_dep_spec( + conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], + pip_reqs: List[requirements.Requirement], + conda_pkg_name: str, + pip_pkg_name: Optional[str] = None, + remove_spec: bool = False, +) -> Tuple[ + DefaultDict[str, List[requirements.Requirement]], List[requirements.Requirement], Optional[requirements.Requirement] +]: + if pip_pkg_name is None: + pip_pkg_name = conda_pkg_name + spec_conda = _find_conda_dep_spec(conda_chan_deps, conda_pkg_name) + if spec_conda: + channel, spec = spec_conda + if remove_spec: + conda_chan_deps[channel].remove(spec) + return conda_chan_deps, pip_reqs, spec + else: + spec_pip = _find_pip_req_spec(pip_reqs, pip_pkg_name) + if spec_pip: + if remove_spec: + pip_reqs.remove(spec_pip) + return conda_chan_deps, pip_reqs, spec_pip + return conda_chan_deps, pip_reqs, None + + +def generate_env_for_cuda( + conda_chan_deps: DefaultDict[str, List[requirements.Requirement]], + pip_reqs: List[requirements.Requirement], + cuda_version: str, +) -> Tuple[DefaultDict[str, List[requirements.Requirement]], List[requirements.Requirement]]: + conda_chan_deps_cuda = copy.deepcopy(conda_chan_deps) + pip_reqs_cuda = copy.deepcopy(pip_reqs) + + cuda_version_obj = version.parse(cuda_version) + cuda_version_spec_str = f"{cuda_version_obj.major}.{cuda_version_obj.minor}.*" + + try: + append_conda_dependency( + conda_chan_deps_cuda, + ("nvidia", requirements.Requirement(f"cuda=={cuda_version_spec_str}")), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + + conda_chan_deps_cuda, pip_reqs_cuda, xgboost_spec = _find_dep_spec( + conda_chan_deps_cuda, pip_reqs_cuda, conda_pkg_name="xgboost", remove_spec=True + ) + if xgboost_spec: + xgboost_spec.name = "py-xgboost-gpu" + try: + append_conda_dependency( + conda_chan_deps_cuda, + ("conda-forge", xgboost_spec), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + + conda_chan_deps_cuda, pip_reqs_cuda, pytorch_spec = _find_dep_spec( + conda_chan_deps_cuda, pip_reqs_cuda, conda_pkg_name="pytorch", pip_pkg_name="torch", remove_spec=True + ) + if pytorch_spec: + pytorch_spec.name = "pytorch" + try: + append_conda_dependency( + conda_chan_deps_cuda, + ("pytorch", pytorch_spec), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + + try: + append_conda_dependency( + conda_chan_deps_cuda, + p_chan_dep=("pytorch", requirements.Requirement(f"pytorch-cuda=={cuda_version_spec_str}")), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + +
conda_chan_deps_cuda, pip_reqs_cuda, tf_spec = _find_dep_spec( + conda_chan_deps_cuda, pip_reqs_cuda, conda_pkg_name="tensorflow", remove_spec=True + ) + if tf_spec: + tf_spec.name = "tensorflow-gpu" + try: + append_conda_dependency( + conda_chan_deps_cuda, + ("conda-forge", tf_spec), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + + conda_chan_deps_cuda, pip_reqs_cuda, transformers_spec = _find_dep_spec( + conda_chan_deps_cuda, pip_reqs_cuda, conda_pkg_name="transformers", remove_spec=False + ) + if transformers_spec: + try: + append_conda_dependency( + conda_chan_deps_cuda, + ("conda-forge", requirements.Requirement("accelerate>=0.22.0")), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + + # Required by bitsandbytes + try: + append_conda_dependency( + conda_chan_deps_cuda, + (DEFAULT_CHANNEL_NAME, get_local_installed_version_of_pip_package(requirements.Requirement("scipy"))), + ) + except (DuplicateDependencyError, DuplicateDependencyInMultipleChannelsError): + pass + + try: + append_requirement_list( + pip_reqs_cuda, + requirements.Requirement("bitsandbytes>=0.41.0"), + ) + except DuplicateDependencyError: + pass + + return conda_chan_deps_cuda, pip_reqs_cuda diff --git a/snowflake/ml/_internal/env_utils_test.py b/snowflake/ml/_internal/env_utils_test.py index 0a21e25f..56ba06fa 100644 --- a/snowflake/ml/_internal/env_utils_test.py +++ b/snowflake/ml/_internal/env_utils_test.py @@ -588,6 +588,701 @@ def test_parse_python_version_string(self) -> None: with self.assertRaises(ValueError): env_utils.parse_python_version_string("python>2.7.16") + def test_find_conda_dep_spec(self) -> None: + conda_reqs: DefaultDict[str, List[requirements.Requirement]] = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + self.assertTupleEqual( + (env_utils.DEFAULT_CHANNEL_NAME, requirements.Requirement("somepackage==1.0.0")), + env_utils._find_conda_dep_spec(conda_reqs, "somepackage"), + ) + + self.assertTupleEqual( + ("another_channel", requirements.Requirement("another_package==1.0.0")), + env_utils._find_conda_dep_spec(conda_reqs, "another_package"), + ) + + self.assertIsNone(env_utils._find_conda_dep_spec(conda_reqs, "random_package")) + + def test_find_pip_req_spec(self) -> None: + pip_reqs = [requirements.Requirement("somepackage==1.0.0")] + + self.assertEqual( + requirements.Requirement("somepackage==1.0.0"), + env_utils._find_pip_req_spec(pip_reqs, pkg_name="somepackage"), + ) + + self.assertIsNone(env_utils._find_pip_req_spec(pip_reqs, pkg_name="random_package")) + + def test_find_dep_spec(self) -> None: + conda_reqs: DefaultDict[str, List[requirements.Requirement]] = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + pip_reqs = [requirements.Requirement("pip_package==1.0.0")] + + conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + conda_reqs, pip_reqs, conda_pkg_name="somepackage" + ) + + self.assertDictEqual(conda_reqs_result, conda_reqs) + self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME:
[requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + pip_reqs = [requirements.Requirement("pip_package==1.0.0")] + + conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + conda_reqs, pip_reqs, conda_pkg_name="pip_package" + ) + + self.assertDictEqual(conda_reqs_result, conda_reqs) + self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertEqual(spec, requirements.Requirement("pip_package==1.0.0")) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + pip_reqs = [requirements.Requirement("pip_package==1.0.0")] + + conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + conda_reqs, pip_reqs, conda_pkg_name="somepackage", pip_pkg_name="pip_package" + ) + + self.assertDictEqual(conda_reqs_result, conda_reqs) + self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + pip_reqs = [requirements.Requirement("pip_package==1.0.0")] + + conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + conda_reqs, pip_reqs, conda_pkg_name="somepackage", remove_spec=True + ) + + self.assertDictEqual( + conda_reqs_result, + collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ), + ) + self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + pip_reqs = [requirements.Requirement("pip_package==1.0.0")] + + conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + conda_reqs, pip_reqs, conda_pkg_name="pip_package", remove_spec=True + ) + + self.assertDictEqual(conda_reqs_result, conda_reqs) + self.assertListEqual(pip_reqs_result, []) + self.assertEqual(spec, requirements.Requirement("pip_package==1.0.0")) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + pip_reqs = [requirements.Requirement("pip_package==1.0.0")] + + conda_reqs_result, pip_reqs_result, spec = env_utils._find_dep_spec( + conda_reqs, pip_reqs, conda_pkg_name="somepackage", pip_pkg_name="pip_package", remove_spec=True + ) + + self.assertDictEqual( + conda_reqs_result, + collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ), + ) + self.assertListEqual(pip_reqs_result, pip_reqs) + self.assertEqual(spec, requirements.Requirement("somepackage==1.0.0")) + + def test_generate_conda_env_for_cuda(self) -> None: + conda_reqs: DefaultDict[str, List[requirements.Requirement]] = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + 
"another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("somepackage==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("somepackage==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("somepackage==1.0.0")], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.8.*"), + ], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("somepackage==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.8.*"), + ], + "another_channel": [requirements.Requirement("another_package==1.0.0")], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "pytorch": [ + requirements.Requirement("pytorch==1.0.0"), + requirements.Requirement("pytorch-cuda==11.7.*"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch>=1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "pytorch": [ + requirements.Requirement("pytorch>=1.0.0"), + requirements.Requirement("pytorch-cuda==11.7.*"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch>=1.0.0")], + "pytorch": [ + requirements.Requirement("pytorch-cuda==11.8.*"), + ], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "pytorch": [ + requirements.Requirement("pytorch-cuda==11.8.*"), + requirements.Requirement("pytorch>=1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + 
{ + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("pytorch>=1.0.0")], + "pytorch": [ + requirements.Requirement("pytorch>=1.1.0"), + requirements.Requirement("pytorch-cuda==11.8.*"), + ], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "pytorch": [ + requirements.Requirement("pytorch>=1.1.0"), + requirements.Requirement("pytorch-cuda==11.8.*"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + "conda-forge": [requirements.Requirement("pytorch==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + "conda-forge": [], + "pytorch": [ + requirements.Requirement("pytorch==1.0.0"), + requirements.Requirement("pytorch-cuda==11.7.*"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( + collections.defaultdict( + list, + ), + [requirements.Requirement("torch==1.0.0")], + cuda_version="11.7", + ) + + self.assertDictEqual( + conda_reqs_result, + { + "pytorch": [ + requirements.Requirement("pytorch==1.0.0"), + requirements.Requirement("pytorch-cuda==11.7.*"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + self.assertListEqual(pip_reqs_result, []) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("tensorflow==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "conda-forge": [ + requirements.Requirement("tensorflow-gpu==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("tensorflow>=1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "conda-forge": [ + requirements.Requirement("tensorflow-gpu>=1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("tensorflow>=1.0.0")], + "conda-forge": [ + requirements.Requirement("tensorflow-gpu>=1.1.0"), + ], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "conda-forge": [ + requirements.Requirement("tensorflow-gpu>=1.1.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + "conda-forge": [requirements.Requirement("tensorflow==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + "conda-forge": [ + requirements.Requirement("tensorflow-gpu==1.0.0"), + ], + "nvidia": [ + 
requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( + collections.defaultdict( + list, + ), + [requirements.Requirement("tensorflow==1.0.0")], + cuda_version="11.7", + ) + + self.assertDictEqual( + conda_reqs_result, + { + "conda-forge": [ + requirements.Requirement("tensorflow-gpu==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + self.assertListEqual(pip_reqs_result, []) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "conda-forge": [ + requirements.Requirement("py-xgboost-gpu==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost>=1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "conda-forge": [ + requirements.Requirement("py-xgboost-gpu>=1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [requirements.Requirement("xgboost>=1.0.0")], + "conda-forge": [ + requirements.Requirement("py-xgboost-gpu>=1.1.0"), + ], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [], + "conda-forge": [ + requirements.Requirement("py-xgboost-gpu>=1.1.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs = collections.defaultdict( + list, + { + "conda-forge": [requirements.Requirement("xgboost==1.0.0")], + }, + ) + + conda_reqs_result, _ = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + "conda-forge": [ + requirements.Requirement("py-xgboost-gpu==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( + collections.defaultdict( + list, + ), + [requirements.Requirement("xgboost==1.0.0")], + cuda_version="11.7", + ) + + self.assertDictEqual( + conda_reqs_result, + { + "conda-forge": [ + requirements.Requirement("py-xgboost-gpu==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + self.assertListEqual(pip_reqs_result, []) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("transformers==1.0.0"), + requirements.Requirement("pytorch==1.0.0"), + ], + }, + ) + + conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("transformers==1.0.0"), + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy")), + ], + "pytorch": [ + 
requirements.Requirement("pytorch==1.0.0"), + requirements.Requirement("pytorch-cuda==11.7.*"), + ], + "conda-forge": [ + requirements.Requirement("accelerate>=0.22.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + self.assertListEqual(pip_reqs_result, [requirements.Requirement("bitsandbytes>=0.41.0")]) + + conda_reqs = collections.defaultdict( + list, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("transformers==1.0.0"), + requirements.Requirement("scipy==1.0.0"), + ], + "conda-forge": [ + requirements.Requirement("accelerate==1.0.0"), + ], + }, + ) + conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda( + conda_reqs, [requirements.Requirement("bitsandbytes==1.0.0")], cuda_version="11.7" + ) + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + requirements.Requirement("transformers==1.0.0"), + requirements.Requirement("scipy==1.0.0"), + ], + "conda-forge": [ + requirements.Requirement("accelerate==1.0.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + self.assertListEqual(pip_reqs_result, [requirements.Requirement("bitsandbytes==1.0.0")]) + + conda_reqs = collections.defaultdict( + list, + { + "conda-forge": [requirements.Requirement("transformers==1.0.0")], + }, + ) + + conda_reqs_result, pip_reqs_result = env_utils.generate_env_for_cuda(conda_reqs, [], cuda_version="11.7") + + self.assertDictEqual( + conda_reqs_result, + { + env_utils.DEFAULT_CHANNEL_NAME: [ + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement("scipy")), + ], + "conda-forge": [ + requirements.Requirement("transformers==1.0.0"), + requirements.Requirement("accelerate>=0.22.0"), + ], + "nvidia": [ + requirements.Requirement(requirement_string="cuda==11.7.*"), + ], + }, + ) + + self.assertListEqual(pip_reqs_result, [requirements.Requirement("bitsandbytes>=0.41.0")]) + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/_internal/exceptions/error_codes.py b/snowflake/ml/_internal/exceptions/error_codes.py index 93e2ca20..8745b4cf 100644 --- a/snowflake/ml/_internal/exceptions/error_codes.py +++ b/snowflake/ml/_internal/exceptions/error_codes.py @@ -30,6 +30,9 @@ # Indicates an internal failure raising a Snowpark ML error with an ambiguous cause, such as invoking an unexpected # private API, catching an error with an unknown cause, etc. INTERNAL_SNOWML_ERROR = "1200" +# Indicates an internal failure raising a Snowpark error with an ambiguous cause, such as invalid queries, invalid +# permission, catching an error with an unknown cause, etc. +INTERNAL_SNOWPARK_ERROR = "1300" # USER # Indicates the incompatibility of local dependency versions with the target requirements. 
For example, an API added in diff --git a/snowflake/ml/_internal/file_utils.py b/snowflake/ml/_internal/file_utils.py index 2781bed7..08308a67 100644 --- a/snowflake/ml/_internal/file_utils.py +++ b/snowflake/ml/_internal/file_utils.py @@ -5,6 +5,7 @@ import pathlib import pkgutil import shutil +import tarfile import tempfile import zipfile from typing import IO, Generator, List, Optional, Union @@ -128,29 +129,43 @@ def unzip_stream_in_temp_dir(stream: IO[bytes], temp_root: Optional[str] = None) yield tempdir -def hash_directory(directory: Union[str, pathlib.Path]) -> str: +def hash_directory( + directory: Union[str, pathlib.Path], *, ignore_hidden: bool = False, excluded_files: Optional[List[str]] = None +) -> str: """Hash the **content** of a folder recursively using SHA-1. Args: directory: The path to the directory to be hashed. + ignore_hidden: Whether to ignore hidden files. Defaults to False. + excluded_files: List of file names to be excluded from the hashing. Returns: The hexdigest form of the hash result. """ + if not excluded_files: + excluded_files = [] - def _update_hash_from_dir(directory: Union[str, pathlib.Path], hash: "hashlib._Hash") -> "hashlib._Hash": + def _update_hash_from_dir( + directory: Union[str, pathlib.Path], hash: "hashlib._Hash", *, ignore_hidden: bool, excluded_files: List[str] + ) -> "hashlib._Hash": assert pathlib.Path(directory).is_dir(), "Provided path is not a directory." for path in sorted(pathlib.Path(directory).iterdir(), key=lambda p: str(p).lower()): + if ignore_hidden and path.name.startswith("."): + continue + if path.name in excluded_files: + continue hash.update(path.name.encode()) if path.is_file(): with open(path, "rb") as f: for chunk in iter(lambda: f.read(64 * 1024), b""): hash.update(chunk) elif path.is_dir(): - hash = _update_hash_from_dir(path, hash) + hash = _update_hash_from_dir(path, hash, ignore_hidden=ignore_hidden, excluded_files=excluded_files) return hash - return _update_hash_from_dir(directory, hashlib.sha1()).hexdigest() + return _update_hash_from_dir( + directory, hashlib.sha1(), ignore_hidden=ignore_hidden, excluded_files=excluded_files + ).hexdigest() def get_all_modules(dirname: str, prefix: str = "") -> List[str]: @@ -171,3 +186,23 @@ def _able_ascii_encode(s: str) -> bool: return True except UnicodeEncodeError: return False + + +@contextlib.contextmanager +def _create_tar_gz_stream(source_dir: str, arcname: Optional[str] = None) -> Generator[io.BytesIO, None, None]: + """ + Create a compressed tarball (.tar.gz) of the source directory and return an input stream as a context + manager. + + Args: + source_dir: The path to the directory to compress. + arcname: Alternative name for a file in the archive. + + Yields: + io.BytesIO: An input stream containing the compressed tarball.
+ """ + with io.BytesIO() as output_stream: + with tarfile.open(fileobj=output_stream, mode="w:gz") as tar: + tar.add(source_dir, arcname=arcname) + output_stream.seek(0) + yield output_stream diff --git a/snowflake/ml/_internal/file_utils_test.py b/snowflake/ml/_internal/file_utils_test.py index 8e476518..5a724002 100644 --- a/snowflake/ml/_internal/file_utils_test.py +++ b/snowflake/ml/_internal/file_utils_test.py @@ -4,6 +4,7 @@ import sys import tempfile import warnings +from datetime import datetime from absl.testing import absltest @@ -154,6 +155,63 @@ def test_hash_directory(self) -> None: self.assertEqual(hash_0, hash_5) self.assertNotEqual(hash_0, hash_6) + def test_hash_directory_with_excluded_files(self) -> None: + def _populate_tmpdir(tmpdir: str) -> None: + with open(os.path.join(tmpdir, "Dockerfile"), "w", encoding="utf-8") as f: + f.write("FROM focal-cuda-11.6.2") + f.flush() + + os.mkdir(os.path.join(tmpdir, "env")) + with open(os.path.join(tmpdir, "env", "conda.yaml"), "w", encoding="utf-8") as f: + f.write("python==3.8.13") + f.flush() + + os.mkdir(os.path.join(tmpdir, "server")) + with open(os.path.join(tmpdir, "server", "main.py"), "w", encoding="utf-8") as f: + f.write("import os") + f.flush() + + with open(os.path.join(tmpdir, "model.yaml"), "w", encoding="utf-8") as f: + f.write(f"creation_timestamp: {datetime.now().time().strftime('%H:%M:%S.%f')}") + f.flush() + + with tempfile.TemporaryDirectory() as tmpdir: + _populate_tmpdir(tmpdir) + hash_0 = file_utils.hash_directory(tmpdir) + hash_0_with_exclude = file_utils.hash_directory(tmpdir, excluded_files=["model.yaml"]) + + with tempfile.TemporaryDirectory() as tmpdir: + _populate_tmpdir(tmpdir) + hash_1 = file_utils.hash_directory(tmpdir) + hash_1_with_exclude = file_utils.hash_directory(tmpdir, excluded_files=["model.yaml"]) + + self.assertNotEqual(hash_0, hash_1) + self.assertNotEqual(hash_0, hash_0_with_exclude) + self.assertEqual(hash_0_with_exclude, hash_1_with_exclude) + + def test_hash_directory_with_ignore_hidden_file(self) -> None: + def _populate_tmpdir(tmpdir: str) -> None: + with open(os.path.join(tmpdir, "Dockerfile"), "w", encoding="utf-8") as f: + f.write("FROM focal-cuda-11.6.2") + f.flush() + with open(os.path.join(tmpdir, ".DS_Store"), "w", encoding="utf-8") as f: + f.write(f"creation_timestamp: {datetime.now().time().strftime('%H:%M:%S.%f')}") + f.flush() + + with tempfile.TemporaryDirectory() as tmpdir: + _populate_tmpdir(tmpdir) + hash_0 = file_utils.hash_directory(tmpdir) + hash_0_ignore_hidden = file_utils.hash_directory(tmpdir, ignore_hidden=True) + + with tempfile.TemporaryDirectory() as tmpdir: + _populate_tmpdir(tmpdir) + hash_1 = file_utils.hash_directory(tmpdir) + hash_1_ignore_hidden = file_utils.hash_directory(tmpdir, ignore_hidden=True) + + self.assertNotEqual(hash_0, hash_1) + self.assertNotEqual(hash_0, hash_0_ignore_hidden) + self.assertEqual(hash_0_ignore_hidden, hash_1_ignore_hidden) + def test_able_ascii_encode(self) -> None: self.assertTrue(file_utils._able_ascii_encode("abc")) self.assertFalse(file_utils._able_ascii_encode("❄️")) diff --git a/snowflake/ml/_internal/telemetry.py b/snowflake/ml/_internal/telemetry.py index 1e03ed5e..dff9873f 100644 --- a/snowflake/ml/_internal/telemetry.py +++ b/snowflake/ml/_internal/telemetry.py @@ -305,9 +305,20 @@ def wrap(*args: Any, **kwargs: Any) -> _ReturnValue: res = func(*args, **kwargs) except Exception as e: if not isinstance(e, snowml_exceptions.SnowflakeMLException): - e = 
snowml_exceptions.SnowflakeMLException(error_code=error_codes.UNDEFINED, original_exception=e) + # already handled via a nested decorated function + if hasattr(e, "_snowflake_ml_handled") and e._snowflake_ml_handled: # type: ignore[attr-defined] + raise e + if isinstance(e, snowpark_exceptions.SnowparkClientException): + e = snowml_exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_SNOWPARK_ERROR, original_exception=e + ) + else: + e = snowml_exceptions.SnowflakeMLException( + error_code=error_codes.UNDEFINED, original_exception=e + ) telemetry_args["error"] = repr(e) telemetry_args["error_code"] = e.error_code + e.original_exception._snowflake_ml_handled = True # type: ignore[attr-defined] raise e.original_exception else: return res diff --git a/snowflake/ml/_internal/telemetry_test.py b/snowflake/ml/_internal/telemetry_test.py index 43dc2624..723573b9 100644 --- a/snowflake/ml/_internal/telemetry_test.py +++ b/snowflake/ml/_internal/telemetry_test.py @@ -375,6 +375,32 @@ def foo(self) -> None: test_obj.foo() self.assertIn(error_codes.INTERNAL_TEST, str(ex.exception)) + @mock.patch("snowflake.snowpark.session._get_active_sessions") + def test_snowml_nested_error(self, mock_get_active_sessions: mock.MagicMock) -> None: + mock_get_active_sessions.return_value = {self.mock_session} + + class DummyObject: + @utils_telemetry.send_api_usage_telemetry( + project=_PROJECT, + ) + def foo(self) -> None: + self.nested_foo() + + @utils_telemetry.send_api_usage_telemetry( + project=_PROJECT, + ) + def nested_foo(self) -> None: + raise exceptions.SnowflakeMLException( + error_code=error_codes.INTERNAL_TEST, + original_exception=RuntimeError("foo error"), + ) + + test_obj = DummyObject() + with self.assertRaises(RuntimeError) as ex: + test_obj.foo() + self.assertIn(error_codes.INTERNAL_TEST, str(ex.exception)) + self.assertNotIn(error_codes.UNDEFINED, str(ex.exception)) + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/_internal/utils/BUILD.bazel b/snowflake/ml/_internal/utils/BUILD.bazel index aac2947b..4e4fd4c1 100644 --- a/snowflake/ml/_internal/utils/BUILD.bazel +++ b/snowflake/ml/_internal/utils/BUILD.bazel @@ -78,6 +78,9 @@ py_test( py_library( name = "uri", srcs = ["uri.py"], + deps = [ + "//snowflake/ml/_internal/utils:identifier", + ] ) py_test( @@ -137,3 +140,33 @@ py_library( name = "parallelize", srcs = ["parallelize.py"], ) + +py_library( + name = "result", + srcs = ["result.py"], +) + +py_library( + name = "spcs_image_registry", + srcs = ["spcs_image_registry.py"], + deps = [":query_result_checker"] +) + +py_library( + name = "table_manager", + srcs = ["table_manager.py"], + deps = [ + "//snowflake/ml/_internal/utils:formatting", + "//snowflake/ml/_internal/utils:query_result_checker", + ], +) + +py_test( + name = "table_manager_test", + srcs = ["table_manager_test.py"], + deps = [ + ":table_manager", + "//snowflake/ml/test_utils:mock_data_frame", + "//snowflake/ml/test_utils:mock_session", + ], +) diff --git a/snowflake/ml/_internal/utils/identifier.py b/snowflake/ml/_internal/utils/identifier.py index 3b54d7a3..f55a47a2 100644 --- a/snowflake/ml/_internal/utils/identifier.py +++ b/snowflake/ml/_internal/utils/identifier.py @@ -8,7 +8,10 @@ _SF_UNQUOTED_CASE_SENSITIVE_IDENTIFIER = "[A-Z_][A-Z0-9_$]*" SF_QUOTED_IDENTIFIER = '"(?:[^"]|"")*"' _SF_IDENTIFIER = f"({_SF_UNQUOTED_CASE_INSENSITIVE_IDENTIFIER}|{SF_QUOTED_IDENTIFIER})" -_SF_SCHEMA_LEVEL_OBJECT = rf"{_SF_IDENTIFIER}\.{_SF_IDENTIFIER}\.{_SF_IDENTIFIER}(.*)" +SF_IDENTIFIER_RE = re.compile(_SF_IDENTIFIER)
+_SF_SCHEMA_LEVEL_OBJECT = ( + rf"(?:(?:(?P<db>{_SF_IDENTIFIER})\.)?(?P<schema>{_SF_IDENTIFIER})\.)?(?P<object>{_SF_IDENTIFIER})(?P<others>.*)" + ) _SF_SCHEMA_LEVEL_OBJECT_RE = re.compile(_SF_SCHEMA_LEVEL_OBJECT) UNQUOTED_CASE_INSENSITIVE_RE = re.compile(f"^({_SF_UNQUOTED_CASE_INSENSITIVE_IDENTIFIER})$") @@ -154,10 +157,35 @@ def parse_schema_level_object_identifier( res = _SF_SCHEMA_LEVEL_OBJECT_RE.fullmatch(path) if not res: raise ValueError(f"Invalid identifier. It should start with database.schema.stage. Getting {path}") - identifiers = res.groups() - if len(identifiers) != 4: - raise ValueError(f"Failed to parse the identifier. Identifiers parsed: {identifiers}") - return identifiers[0], identifiers[1], identifiers[2], identifiers[3] + return res.group("db"), res.group("schema"), res.group("object"), res.group("others") + + +def get_schema_level_object_identifier( + db: Optional[str], schema: Optional[str], object_name: str, others: Optional[str] = None +) -> str: + """The reverse operation of parse_schema_level_object_identifier. + + Args: + db: Database level object name. + schema: Schema level object name. + object_name: stage/table level object name. Must not be None. + others: All other parts attached. + + Returns: + A string in the format '<db>.<schema>.<object_name><others>'. + + Raises: + ValueError: If any of the identifiers is invalid. + """ + + for identifier in (db, schema, object_name): + if identifier is not None and SF_IDENTIFIER_RE.match(identifier) is None: + raise ValueError(f"Invalid identifier {identifier}") + + if others is None: + others = "" + + return ".".join(filter(None, (db, schema, object_name))) + others @overload @@ -238,3 +266,18 @@ def get_escaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[st return _get_escaped_name(ids) else: raise ValueError("Unsupported type. Only string or list of string are supported for selecting columns.") + + +def remove_prefix(s: str, prefix: str) -> str: + """Remove prefix from a string. + + Args: + s: string to remove prefix from. + prefix: prefix to match. + + Returns: + string with the prefix removed.
+ """ + if s.startswith(prefix): + return s[len(prefix) :] + return s diff --git a/snowflake/ml/_internal/utils/identifier_test.py b/snowflake/ml/_internal/utils/identifier_test.py index 78815f1f..0ce35bab 100644 --- a/snowflake/ml/_internal/utils/identifier_test.py +++ b/snowflake/ml/_internal/utils/identifier_test.py @@ -2,6 +2,39 @@ import snowflake.ml._internal.utils.identifier as identifier +SCHEMA_LEVEL_OBJECT_TEST_CASES = [ + ("foo", None, None, "foo", ""), + ("foo/", None, None, "foo", "/"), + ('"foo"', None, None, '"foo"', ""), + ('"foo"/', None, None, '"foo"', "/"), + ("foo/bar", None, None, "foo", "/bar"), + ("foo/bar.gz", None, None, "foo", "/bar.gz"), + ('"foo"/bar.gz', None, None, '"foo"', "/bar.gz"), + ("testschema.foo", None, "testschema", "foo", ""), + ('testschema."foo"', None, "testschema", '"foo"', ""), + ("testschema.foo/bar", None, "testschema", "foo", "/bar"), + ("testschema.foo/bar.gz", None, "testschema", "foo", "/bar.gz"), + ('testschema."foo"/bar.gz', None, "testschema", '"foo"', "/bar.gz"), + ('"testschema".foo', None, '"testschema"', "foo", ""), + ('"testschema"."foo"', None, '"testschema"', '"foo"', ""), + ('"testschema".foo/bar', None, '"testschema"', "foo", "/bar"), + ('"testschema".foo/bar.gz', None, '"testschema"', "foo", "/bar.gz"), + ('"testschema"."foo"/bar.gz', None, '"testschema"', '"foo"', "/bar.gz"), + ("testdb.testschema.foo", "testdb", "testschema", "foo", ""), + ("_testdb.testschema._foo/", "_testdb", "testschema", "_foo", "/"), + ('testdb$."test""s""chema"._f1oo', "testdb$", '"test""s""chema"', "_f1oo", ""), + ("test1db.test$schema.foo1/nytrain/", "test1db", "test$schema", "foo1", "/nytrain/"), + ("test_db.test_schema.foo.nytrain.1.txt", "test_db", "test_schema", "foo", ".nytrain.1.txt"), + ('test_d$b."test.schema".fo$_o/nytrain/', "test_d$b", '"test.schema"', "fo$_o", "/nytrain/"), + ( + '"идентификатор"."test schema"."f.o_o1"', + '"идентификатор"', + '"test schema"', + '"f.o_o1"', + "", + ), +] + class SnowflakeIdentifierTest(absltest.TestCase): def test_is_quote_valid(self) -> None: @@ -47,29 +80,19 @@ def test_user_specificed_quotes(self) -> None: self.assertEqual('"demo__task1"', identifier.concat_names(["demo__", '"task1"'])) def test_parse_schema_level_object_identifier(self) -> None: - """Test if the schema level identifiers could be scuuessfully parsed""" - test_cases = [ - ("testdb.testschema.foo", "testdb", "testschema", "foo", ""), - ("_testdb.testschema._foo/", "_testdb", "testschema", "_foo", "/"), - ('testdb$."test""s""chema"._f1oo', "testdb$", '"test""s""chema"', "_f1oo", ""), - ("test1db.test$schema.foo1/nytrain/", "test1db", "test$schema", "foo1", "/nytrain/"), - ("test_db.test_schema.foo.nytrain.1.txt", "test_db", "test_schema", "foo", ".nytrain.1.txt"), - ('test_d$b."test.schema".fo$_o/nytrain/', "test_d$b", '"test.schema"', "fo$_o", "/nytrain/"), - ( - '"идентификатор"."test schema"."f.o_o1"', - '"идентификатор"', - '"test schema"', - '"f.o_o1"', - "", - ), - ] + """Test if the schema level identifiers could be successfully parsed""" - for test_case in test_cases: + for test_case in SCHEMA_LEVEL_OBJECT_TEST_CASES: with self.subTest(): self.assertTupleEqual( tuple(test_case[1:]), identifier.parse_schema_level_object_identifier(test_case[0]) ) + def test_get_schema_level_object_identifier(self) -> None: + for test_case in SCHEMA_LEVEL_OBJECT_TEST_CASES: + with self.subTest(): + self.assertEqual(test_case[0], identifier.get_schema_level_object_identifier(*test_case[1:])) + if __name__ == "__main__": absltest.main() diff --git 
a/snowflake/ml/_internal/utils/result.py b/snowflake/ml/_internal/utils/result.py new file mode 100644 index 00000000..9542848d --- /dev/null +++ b/snowflake/ml/_internal/utils/result.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved. +# +import sys +from io import BytesIO +from typing import Any + +import cloudpickle + +import snowflake.snowpark._internal.utils as snowpark_utils +from snowflake import snowpark + +_RESULT_SIZE_THRESHOLD = 5 * (1024**2) # 5MB + + +class SnowflakeResult: + """ + Handles serialization, uploading, downloading, and deserialization of stored procedure results. If the results + are too large to be returned from a stored procedure, the result will be uploaded. The client can then retrieve + and deserialize the result if it was uploaded. + """ + + def __init__(self, session: snowpark.Session, result: Any) -> None: + self.result = result + self.session = session + self.result_object_filepath = None + result_bytes = cloudpickle.dumps(self.result) + if sys.getsizeof(result_bytes) > _RESULT_SIZE_THRESHOLD: + stage_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.STAGE) + session.sql(f"CREATE TEMPORARY STAGE {stage_name}").collect() + result_object_filepath = f"@{stage_name}/{snowpark_utils.generate_random_alphanumeric()}" + session.file.put_stream(BytesIO(result_bytes), result_object_filepath) + self.result_object_filepath = f"{result_object_filepath}.gz" + + def serialize(self) -> bytes: + """ + Serialize a tuple containing the result (or None) and the result object filepath + if the result was uploaded to a stage (or None). + + Returns: + Cloudpickled string of bytes of the result tuple. + """ + if self.result_object_filepath is not None: + return cloudpickle.dumps((None, self.result_object_filepath)) # type: ignore[no-any-return] + return cloudpickle.dumps((self.result, None)) # type: ignore[no-any-return] + + @staticmethod + def load_result_from_filepath(session: snowpark.Session, result_object_filepath: str) -> Any: + """ + Loads and deserializes the uploaded result. + + Args: + session: Snowpark session. + result_object_filepath: Stage filepath of the result object returned by serialize method. + + Returns: + The original serialized result (any type). + """ + result_object_bytes_io = session.file.get_stream(result_object_filepath, decompress=True) + result_bytes = result_object_bytes_io.read() + return cloudpickle.loads(result_bytes) diff --git a/snowflake/ml/_internal/utils/spcs_image_registry.py b/snowflake/ml/_internal/utils/spcs_image_registry.py new file mode 100644 index 00000000..cf898972 --- /dev/null +++ b/snowflake/ml/_internal/utils/spcs_image_registry.py @@ -0,0 +1,76 @@ +import base64 +import contextlib +import json +from typing import Generator + +from snowflake import snowpark +from snowflake.ml._internal.utils import query_result_checker + + +@contextlib.contextmanager +def generate_image_registry_credential(session: snowpark.Session) -> Generator[str, None, None]: + """Construct basic auth credential that is specific to SPCS image registry. For image registry authentication, we + will use a session token obtained from the Snowpark session object. The token authentication mechanism is + automatically used when the username is set to "0sessiontoken" according to the registry implementation. + + As a workaround for SNOW-841699: Fail to authenticate to image registry with session token generated from + Snowpark. 
We need to temporarily set the json query format in order to process GS token response. Note that we + should set the format back only after registry authentication is complete, otherwise authentication will fail. + + Args: + session: snowpark session + + Yields: + base64-encoded credentials. + """ + + query_result = ( + query_result_checker.SqlResultValidator( + session, + query="SHOW PARAMETERS LIKE 'PYTHON_CONNECTOR_QUERY_RESULT_FORMAT' IN SESSION", + ) + .has_dimensions(expected_rows=1) + .validate() + ) + prev_format = query_result[0].value + try: + session.sql("ALTER SESSION SET PYTHON_CONNECTOR_QUERY_RESULT_FORMAT = 'json'").collect() + session_token = _get_session_token(session) + yield _get_base64_encoded_credentials(username="0sessiontoken", password=json.dumps({"token": session_token})) + finally: + session.sql(f"ALTER SESSION SET PYTHON_CONNECTOR_QUERY_RESULT_FORMAT = '{prev_format}'").collect() + + +def _get_session_token(session: snowpark.Session) -> str: + """ + This function retrieves the session token from a given Snowpark session object. + + Args: + session: snowpark session. + + Returns: + The session token string value. + """ + ctx = session._conn._conn + assert ctx._rest, "SnowflakeRestful is not set in session" + token_data = ctx._rest._token_request("ISSUE") + session_token = token_data["data"]["sessionToken"] + assert session_token, "session_token is not obtained successfully from the session object" + return session_token + + +def _get_base64_encoded_credentials(username: str, password: str) -> str: + """This function returns the base64 encoded username:password, which is compatible with registry, such as + SnowService image registry, that uses Docker credential helper. + + Args: + username: username for authentication. + password: password for authentication. + + Returns: + base64 encoded credential string. + + """ + credentials = f"{username}:{password}" + encoded_credentials = base64.b64encode(credentials.encode("utf-8")).decode("utf-8") + return encoded_credentials diff --git a/snowflake/ml/_internal/utils/table_manager.py b/snowflake/ml/_internal/utils/table_manager.py new file mode 100644 index 00000000..415e9929 --- /dev/null +++ b/snowflake/ml/_internal/utils/table_manager.py @@ -0,0 +1,98 @@ +from typing import Any, Dict, List, Tuple + +from snowflake import snowpark +from snowflake.ml._internal.utils import formatting, query_result_checker + +"""Table_manager is a set of utils that helps create tables. + +TODO: We should make table manager a class and then put the following functions as public methods. + Class constructor should take the session. Potentially db, schema as well. +""" + + +def get_fully_qualified_schema_name(database_name: str, schema_name: str) -> str: + return f"{database_name}.{schema_name}" + + +def get_fully_qualified_table_name(database_name: str, schema_name: str, table_name: str) -> str: + return f"{get_fully_qualified_schema_name(database_name, schema_name)}.{table_name}" + + +def create_single_registry_table( + session: snowpark.Session, + database_name: str, + schema_name: str, + table_name: str, + table_schema: List[Tuple[str, str]], + statement_params: Dict[str, Any], +) -> str: + """Creates a single table for registry and returns the fully qualified name of the table. + + Args: + session: Session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + table_name: Name of the target table. 
+        table_schema: A list of string pairs, each pair denoting `(<column name>, <column type>)`.
+        statement_params: Function usage statement parameters used in sql query executions.
+
+    Returns:
+        A string which is the name of the created table.
+
+    Raises:
+        RuntimeError: If table creation failed.
+    """
+    fully_qualified_table_name = get_fully_qualified_table_name(database_name, schema_name, table_name)
+    table_schema_string = ", ".join([f"{k} {v}" for k, v in table_schema])
+    try:
+        session.sql(f"CREATE TABLE IF NOT EXISTS {fully_qualified_table_name} ({table_schema_string})").collect(
+            statement_params=statement_params
+        )
+    except Exception as e:
+        raise RuntimeError(f"Registry table {fully_qualified_table_name} creation failed due to {e}")
+
+    return fully_qualified_table_name
+
+
+def insert_table_entry(session: snowpark.Session, table: str, columns: Dict[str, Any]) -> List[snowpark.Row]:
+    """Insert an entry into an internal Model Registry table.
+
+    Args:
+        session: Snowpark session object to communicate with Snowflake.
+        table: Fully qualified name of the table to insert into.
+        columns: Key-value pairs of columns and values to be inserted into the table.
+
+    Returns:
+        Result of the operation as returned by the Snowpark session (snowpark.DataFrame).
+
+    Raises:
+        RuntimeError: If entry insertion failed.
+    """
+    sorted_columns = sorted(columns.items())
+    try:
+        sql = "INSERT INTO {table} ( {columns} ) SELECT {values}".format(
+            table=table,
+            columns=",".join([x[0] for x in sorted_columns]),
+            values=",".join([formatting.format_value_for_select(x[1]) for x in sorted_columns]),
+        )
+        return query_result_checker.SqlResultValidator(session, sql).insertion_success(expected_num_rows=1).validate()
+    except Exception as e:
+        raise RuntimeError(f"Table {table} entry {columns} insertion failed due to {e}")
+
+
+def validate_table_exist(session: snowpark.Session, table: str, qualified_schema_name: str) -> bool:
+    """Check if the given table exists in the target schema.
+
+    Note:
+        In case the table doesn't exist, a DataError will be raised by SqlResultValidator.
+
+    Args:
+        session: Snowpark session object to communicate with Snowflake.
+        table: Name of the target table as an identifier.
+        qualified_schema_name: Fully qualified schema name where the target table is expected to exist.
+
+    Returns:
+        A boolean indicating whether the target table already exists.
+    """
+    tables = session.sql(f"SHOW TABLES LIKE '{table}' IN {qualified_schema_name}").collect()
+    return len(tables) == 1
diff --git a/snowflake/ml/_internal/utils/table_manager_test.py b/snowflake/ml/_internal/utils/table_manager_test.py
new file mode 100644
index 00000000..80a3e594
--- /dev/null
+++ b/snowflake/ml/_internal/utils/table_manager_test.py
@@ -0,0 +1,86 @@
+from typing import List, cast
+
+from absl.testing import absltest
+
+from snowflake import snowpark
+from snowflake.ml._internal.utils import table_manager
+from snowflake.ml.test_utils import mock_data_frame, mock_session
+
+
+class TableManagerTest(absltest.TestCase):
+    """Testing table manager util functions."""
+
+    def setUp(self) -> None:
+        """Creates Snowpark environments for testing."""
+        self._session = mock_session.MockSession(conn=None, test_case=self)
+
+    def tearDown(self) -> None:
+        """Complete test case.
Ensure all expected operations have been observed.""" + self._session.finalize() + + def test_get_fully_qualified_schema_name(self) -> None: + test_cases = [ + ("testdb", "testschema", "testdb.testschema"), + ('"testdb"', '"testschema"', '"testdb"."testschema"'), + ] + for database_name, schema_name, expected_res in test_cases: + with self.subTest(): + self.assertEqual( + table_manager.get_fully_qualified_schema_name(database_name, schema_name), expected_res + ) + + def test_get_fully_qualified_table_name(self) -> None: + test_cases = [ + ("testdb", "testschema", "table", "testdb.testschema.table"), + ('"testdb"', '"testschema"', '"table"', '"testdb"."testschema"."table"'), + ("testdb", "testschema", '"table"', 'testdb.testschema."table"'), + ] + for database_name, schema_name, table_name, expected_res in test_cases: + with self.subTest(): + self.assertEqual( + table_manager.get_fully_qualified_table_name(database_name, schema_name, table_name), expected_res + ) + + def test_create_single_registry_table(self) -> None: + schema_list = [("ID", "VARCHAR"), ("TYPE", "VARCHAR")] + database_name = "testdb" + schema_name = "testschema" + table_name = "testtable" + self._session.add_mock_sql( + query=f"CREATE TABLE IF NOT EXISTS {database_name}.{schema_name}.{table_name} (ID VARCHAR, TYPE VARCHAR)", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"Table {table_name} successfully created.")], + ), + ) + table_manager.create_single_registry_table( + cast(snowpark.Session, self._session), database_name, schema_name, table_name, schema_list, {} + ) + + def test_insert_table_entry(self) -> None: + table_name = "testtable" + insert_query = f"INSERT INTO {table_name} ( ID,TYPE ) SELECT 1,'a' " + self._session.add_mock_sql( + query=insert_query, + result=mock_data_frame.MockDataFrame([snowpark.Row(**{"number of rows inserted": 1})]), + ) + table_manager.insert_table_entry(cast(snowpark.Session, self._session), table_name, {"ID": 1, "TYPE": "a"}) + + def test_validate_table_exist(self) -> None: + table_name = "testtable" + schema_name = "testschema" + empty_row_list: List[snowpark.Row] = [] + test_cases = [(empty_row_list, False), ([snowpark.Row(**{"number of rows inserted": 1})], True)] + for snowpark_res, expected_res in test_cases: + with self.subTest(): + self._session.add_mock_sql( + query=f"SHOW TABLES LIKE '{table_name}' IN {schema_name}", + result=mock_data_frame.MockDataFrame(snowpark_res), + ) + self.assertEqual( + table_manager.validate_table_exist(cast(snowpark.Session, self._session), table_name, schema_name), + expected_res, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/_internal/utils/uri.py b/snowflake/ml/_internal/utils/uri.py index ffad711c..70dac266 100644 --- a/snowflake/ml/_internal/utils/uri.py +++ b/snowflake/ml/_internal/utils/uri.py @@ -2,6 +2,8 @@ from typing import Optional from urllib.parse import ParseResult, urlparse, urlunparse +from snowflake.ml._internal.utils import identifier + _LOCAL_URI_SCHEMES = ["", "file"] _HTTP_URI_SCHEMES = ["http", "https"] _SNOWFLAKE_STAGE_URI_SCHEMES = ["sfc", "sfstage"] @@ -48,14 +50,17 @@ def get_uri_scheme(uri: str) -> str: return urlparse(uri).scheme -def get_uri_from_snowflake_stage_path(path: str) -> str: +def get_uri_from_snowflake_stage_path(stage_path: str) -> str: """Generates a URI from Snowflake stage path.""" - clean_path = path.replace('"', "").replace("'", "").lstrip("@") + assert stage_path.startswith("@") + (db, schema, stage, path) = 
identifier.parse_schema_level_object_identifier( + posixpath.normpath(identifier.remove_prefix(stage_path, "@")) + ) return urlunparse( ParseResult( scheme=_SNOWFLAKE_STAGE_URI_SCHEMES[0], - netloc="", - path=clean_path, + netloc=identifier.get_schema_level_object_identifier(db, schema, stage), + path=path, params="", query="", fragment="", diff --git a/snowflake/ml/_internal/utils/uri_test.py b/snowflake/ml/_internal/utils/uri_test.py index 0f65500d..c7b6daf6 100644 --- a/snowflake/ml/_internal/utils/uri_test.py +++ b/snowflake/ml/_internal/utils/uri_test.py @@ -26,6 +26,38 @@ def test_snowflake_stage_uris(self) -> None: uri.get_snowflake_stage_path_from_uri("sfc://SNOWFLAKE_STAGE/content"), "SNOWFLAKE_STAGE/content" ) + self.assertEqual( + uri.get_uri_from_snowflake_stage_path("@SNOWFLAKE_STAGE/content"), "sfc://SNOWFLAKE_STAGE/content" + ) + + self.assertEqual( + uri.get_snowflake_stage_path_from_uri("sfc://SNOWFLAKE_STAGE/content/"), "SNOWFLAKE_STAGE/content" + ) + + self.assertEqual( + uri.get_uri_from_snowflake_stage_path("@SNOWFLAKE_STAGE/content/"), "sfc://SNOWFLAKE_STAGE/content" + ) + + self.assertEqual(uri.get_snowflake_stage_path_from_uri("sfc://SNOWFLAKE_STAGE"), "SNOWFLAKE_STAGE") + + self.assertEqual(uri.get_uri_from_snowflake_stage_path("@SNOWFLAKE_STAGE"), "sfc://SNOWFLAKE_STAGE") + + self.assertEqual(uri.get_snowflake_stage_path_from_uri("sfc://SNOWFLAKE_STAGE/"), "SNOWFLAKE_STAGE") + + self.assertEqual(uri.get_uri_from_snowflake_stage_path("@SNOWFLAKE_STAGE/"), "sfc://SNOWFLAKE_STAGE") + + self.assertEqual( + uri.get_uri_from_snowflake_stage_path("@SNOWFLAKE_DB.SNOWFLAKE_SCHEMA.SNOWFLAKE_STAGE/content"), + "sfc://SNOWFLAKE_DB.SNOWFLAKE_SCHEMA.SNOWFLAKE_STAGE/content", + ) + + self.assertEqual( + uri.get_uri_from_snowflake_stage_path( + stage_path='@"SNOWFLAKE_DB"."SNOWFLAKE_SCHEMA".SNOWFLAKE_STAGE/content' + ), + 'sfc://"SNOWFLAKE_DB"."SNOWFLAKE_SCHEMA".SNOWFLAKE_STAGE/content', + ) + self.assertEqual(uri.get_snowflake_stage_path_from_uri("sfc://SNOWFLAKE_STAGE"), "SNOWFLAKE_STAGE") # No stage path from invalid scheme. 
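+        # Note the new URI shape exercised above: the db.schema.stage identifier now lives in the
+        # URI netloc (e.g. "sfc://DB.SCHEMA.STAGE/content"), whereas the previous format packed
+        # everything into the path ("sfc:DB.SCHEMA.STAGE/content").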
@@ -33,7 +65,7 @@ def test_snowflake_stage_uris(self) -> None:

        # Assembling URIs
        self.assertEqual(
-            uri.get_uri_from_snowflake_stage_path("SNOWFLAKE_STAGE/content"), "sfc:SNOWFLAKE_STAGE/content"
+            uri.get_uri_from_snowflake_stage_path("@SNOWFLAKE_STAGE/content"), "sfc://SNOWFLAKE_STAGE/content"
        )

    def test_non_snowflake_uris(self) -> None:
diff --git a/snowflake/ml/fileset/fileset.py b/snowflake/ml/fileset/fileset.py
index 4c5ca848..1a2cf0ff 100644
--- a/snowflake/ml/fileset/fileset.py
+++ b/snowflake/ml/fileset/fileset.py
@@ -530,6 +530,8 @@ def _validate_target_stage_loc(snowpark_session: snowpark.Session, target_stage_
        )
    try:
        db, schema, stage, _ = identifier.parse_schema_level_object_identifier(target_stage_loc[1:])
+        if db is None or schema is None:
+            raise ValueError("The stage path should be in the form '@<database>.<schema>.<stage>/*'")
        df_stages = snowpark_session.sql(f"Show stages like '{stage}' in SCHEMA {db}.{schema}")
        df_stages = df_stages.filter(functions.col('"type"').like(f"%{_FILESET_STAGE_TYPE}%"))
        valid_stage = df_stages.collect()
diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel
index 1b025d63..14ea76c3 100644
--- a/snowflake/ml/model/BUILD.bazel
+++ b/snowflake/ml/model/BUILD.bazel
@@ -95,6 +95,7 @@ py_library(
    deps = [
        ":model_signature",
        ":type_hints",
        ":deploy_platforms",
+        ":_model",
        "//snowflake/ml/_internal/utils:identifier",
        "//snowflake/ml/model/_deploy_client/snowservice:deploy",
        "//snowflake/ml/model/_deploy_client/warehouse:deploy",
@@ -122,8 +123,8 @@ py_library(
        ":custom_model",
        ":model_signature",
        ":type_hints",
+        "//snowflake/ml/_internal:env_utils",
        "//snowflake/ml/_internal:file_utils",
-        "//snowflake/ml/_internal:type_utils",
        "//snowflake/ml/model/_handlers:custom",
        "//snowflake/ml/model/_handlers:mlflow",
        "//snowflake/ml/model/_handlers:pytorch",
@@ -132,6 +133,7 @@ py_library(
        "//snowflake/ml/model/_handlers:tensorflow",
        "//snowflake/ml/model/_handlers:torchscript",
        "//snowflake/ml/model/_handlers:xgboost",
+        "//snowflake/ml/model/_handlers:huggingface_pipeline",
        "//snowflake/ml/modeling/framework",
    ],
)
@@ -184,6 +186,7 @@ py_test(
        ":custom_model",
        ":model_signature",
        ":type_hints",
+        "//snowflake/ml/_internal:env_utils",
        "//snowflake/ml/model/_signatures:pytorch_handler",
        "//snowflake/ml/model/_signatures:tensorflow_handler",
        "//snowflake/ml/model/_signatures:utils",
diff --git a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel
index e5da3b2d..1a3cbdaf 100644
--- a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel
+++ b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel
@@ -14,7 +14,27 @@ py_library(
    deps = [
        ":base_image_builder",
        ":docker_context",
+        "//snowflake/ml/model:_model_meta",
        "//snowflake/ml/_internal/utils:query_result_checker",
+        "//snowflake/ml/_internal/utils:spcs_image_registry",
+    ]
+)
+
+py_library(
+    name = "server_image_builder",
+    srcs = ["server_image_builder.py"],
+    deps = [
+        ":base_image_builder",
+        ":docker_context",
+        ":client_image_builder",
+        "//snowflake/ml/model/_deploy_client/utils:constants",
+        "//snowflake/ml/model/_deploy_client/utils:snowservice_client",
+        "//snowflake/ml/_internal/utils:identifier",
+        "//snowflake/ml/_internal:file_utils",
+    ],
+    data = [
+        "templates/image_build_job_spec_template",
+        "templates/kaniko_shell_script_template",
    ]
)

@@ -22,7 +42,8 @@ py_library(
    name = "docker_context",
    srcs = ["docker_context.py"],
    deps = [
-        "//snowflake/ml/model/_deploy_client/utils:constants"
"//snowflake/ml/model/_deploy_client/utils:constants", + "//snowflake/ml/model:_model_meta", ], data = [ "gunicorn_run.sh", @@ -40,6 +61,18 @@ py_test( ] ) +py_test( + name = "server_image_builder_test", + srcs = ["server_image_builder_test.py"], + deps = [ + ":server_image_builder", + "//snowflake/ml/test_utils:mock_session", + ], + data = [ + "test_fixtures/kaniko_shell_script_fixture.sh", + ] +) + py_test( name = "docker_context_test", srcs = ["docker_context_test.py"], @@ -49,6 +82,7 @@ py_test( ], data = [ "test_fixtures/dockerfile_test_fixture", + "test_fixtures/dockerfile_test_fixture_with_CUDA", ] ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py index 8b491b0f..4ab460f6 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py @@ -1,14 +1,15 @@ -import base64 import json import logging import os import subprocess import tempfile +import time from enum import Enum -from typing import List +from typing import List, Optional from snowflake import snowpark -from snowflake.ml._internal.utils import query_result_checker +from snowflake.ml._internal.utils import spcs_image_registry +from snowflake.ml.model import _model_meta from snowflake.ml.model._deploy_client.image_builds import ( base_image_builder, docker_context, @@ -28,48 +29,67 @@ class ClientImageBuilder(base_image_builder.ImageBuilder): Usage requirements: Requires prior installation and running of Docker with BuildKit. See installation instructions in https://docs.docker.com/engine/install/ - - """ - def __init__(self, *, id: str, image_repo: str, model_dir: str, session: snowpark.Session) -> None: + def __init__( + self, + *, + id: str, + image_repo: str, + model_meta: _model_meta.ModelMetadata, + session: snowpark.Session, + image_tag: Optional[str] = None, + ) -> None: """Initialization Args: id: A hexadecimal string used for naming the image tag. image_repo: Path to image repository. - model_dir: Local model directory, downloaded form stage and extracted. + model_meta: Model Metadata session: Snowpark session + image_tag: Optional image tag name; when not provided, will use model id as the tag name. """ - self.image_tag = "/".join([image_repo.rstrip("/"), id]) + ":latest" + self.image_tag = image_tag or "/".join([image_repo.rstrip("/"), id]) + ":latest" self.image_repo = image_repo - self.model_dir = model_dir + self.model_meta = model_meta self.session = session - def build_and_upload_image(self) -> str: - """ - Builds and uploads an image to the model registry. + def build_and_upload_image(self, image_to_pull: str = None) -> str: + """Builds and uploads an image to the model registry. + + Args: + image_to_pull: When set, skips building image locally; instead, pull image directly from public + repo. This is more of a workaround to support non-spcs-registry images. + TODO[shchen] remove such logic when first-party-image is supported on snowservice registry. + + Returns: + Snowservice registry image tag. + + Raises: + RuntimeError: Occurs when failed to build image or push to image registry. """ - def _setup_docker_config(docker_config_dir: str) -> None: - """Set up a temporary docker config, which is used for running all docker commands. + def _setup_docker_config(docker_config_dir: str, registry_cred: str) -> None: + """Set up a temporary docker config, which is used for running all docker commands. 
The format of config + is based on the format that is compatible with docker credential helper: + { + "auths": { + "https://index.docker.io/v1/": { + "auth": "" + } + } + } Args: docker_config_dir: Path to docker configuration directory, which stores the temporary session token. + registry_cred: image registry basic auth credential. """ - ctx = self.session._conn._conn - assert ctx._rest, "SnowflakeRestful is not set in session" - token_data = ctx._rest._token_request("ISSUE") - snowpark_session_token = token_data["data"]["sessionToken"] - token_obj = {"token": snowpark_session_token} - credentials = f"0sessiontoken:{json.dumps(token_obj)}" - encoded_credentials = base64.b64encode(credentials.encode("utf-8")).decode("utf-8") - content = {"auths": {self.image_tag: {"auth": encoded_credentials}}} + content = {"auths": {self.image_tag: {"auth": registry_cred}}} config_path = os.path.join(docker_config_dir, "config.json") with open(config_path, "w", encoding="utf-8") as file: json.dump(content, file) - def _cleanup_local_image() -> None: + def _cleanup_local_image(docker_config_dir: str) -> None: try: image_exist_command = f"docker image inspect {self.image_tag}" subprocess.check_call( @@ -79,40 +99,35 @@ def _cleanup_local_image() -> None: # Image does not exist, probably due to failed build step pass else: - commands = ["docker", "--config", config_dir, "rmi", self.image_tag] + commands = ["docker", "--config", docker_config_dir, "rmi", self.image_tag] logger.debug(f"Removing local image: {self.image_tag}") self._run_docker_commands(commands) self.validate_docker_client_env() - - query_result = ( - query_result_checker.SqlResultValidator( - self.session, - query="SHOW PARAMETERS LIKE 'PYTHON_CONNECTOR_QUERY_RESULT_FORMAT' IN SESSION", - ) - .has_dimensions(expected_rows=1) - .validate() - ) - prev_format = query_result[0].value - - with tempfile.TemporaryDirectory() as config_dir: + with spcs_image_registry.generate_image_registry_credential( + self.session + ) as registry_cred, tempfile.TemporaryDirectory() as docker_config_dir: try: - # Workaround for SNOW-841699: Fail to authenticate to image registry with session token generated from - # Snowpark. Need to temporarily set the json query format in order to process GS token response. 
- self.session.sql("ALTER SESSION SET PYTHON_CONNECTOR_QUERY_RESULT_FORMAT = 'json'").collect() - _setup_docker_config(config_dir) - self._build(config_dir) + _setup_docker_config(docker_config_dir=docker_config_dir, registry_cred=registry_cred) + if not image_to_pull: + start = time.time() + self._build_and_tag(docker_config_dir) + end = time.time() + logger.info(f"Time taken to build the image on the client: {end - start:.2f} seconds") + else: + self._pull_and_tag(image_to_pull=image_to_pull) except Exception as e: raise RuntimeError(f"Failed to build docker image: {str(e)}") else: try: - self._upload(config_dir) + start = time.time() + self._upload(docker_config_dir) + end = time.time() + logger.info(f"Time taken to upload the image to image registry: {end - start:.2f} seconds") except Exception as e: raise RuntimeError(f"Failed to upload docker image to registry: {str(e)}") finally: - _cleanup_local_image() - finally: - self.session.sql(f"ALTER SESSION SET PYTHON_CONNECTOR_QUERY_RESULT_FORMAT = '{prev_format}'").collect() + _cleanup_local_image(docker_config_dir) return self.image_tag def validate_docker_client_env(self) -> None: @@ -141,7 +156,7 @@ def validate_docker_client_env(self) -> None: "https://docs.docker.com/build/buildkit/#getting-started" ) - def _build(self, docker_config_dir: str) -> None: + def _build_and_tag(self, docker_config_dir: str) -> None: """Constructs the Docker context directory and then builds a Docker image based on that context. Args: @@ -149,10 +164,26 @@ def _build(self, docker_config_dir: str) -> None: """ with tempfile.TemporaryDirectory() as context_dir: - dc = docker_context.DockerContext(context_dir=context_dir, model_dir=self.model_dir) + dc = docker_context.DockerContext(context_dir=context_dir, model_meta=self.model_meta) dc.build() self._build_image_from_context(context_dir=context_dir, docker_config_dir=docker_config_dir) + def _pull_and_tag(self, image_to_pull: str, platform: Platform = Platform.LINUX_AMD64) -> None: + """Pull image from public docker hub repo. Then tag it with the specified image tag + + Args: + image_to_pull: Name of image to download. + platform: Specifies the target platform that matches the image to be downloaded + """ + + commands = ["docker", "pull", "--platform", platform.value, image_to_pull] + logger.debug(f"Running {str(commands)}") + self._run_docker_commands(commands) + + commands = ["docker", "tag", image_to_pull, self.image_tag] + logger.debug(f"Running {str(commands)}") + self._run_docker_commands(commands) + def _run_docker_commands(self, commands: List[str]) -> None: """Run docker commands in a new child process. @@ -205,12 +236,6 @@ def _upload(self, docker_config_dir: str) -> None: local image at the end of the upload operation to save up local space. Image cache is kept for more performant built experience at the cost of small storage footprint. - For image registry authentication, we will use a session token obtained from the Snowpark session object. - The token authentication mechanism is automatically used when the username is set to "0sessiontoken" according - to the registry implementation detailed in the following link: - https://github.com/snowflakedb/snowflake-image-registry/blob/277435c6fd79db2df9f863aa9d04dc875e034d85 - /AuthAdapter/src/main/java/com/snowflake/registry/service/AuthHeader.java#L122 - By default, Docker overwrites the local Docker config file "/.docker/config.json" whenever a docker login occurs. 
However, to ensure better isolation between Snowflake-managed Docker credentials and the user's own Docker credentials, we will not use the default Docker config. Instead, we will write the username and session diff --git a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py index 5b1c6fc3..bd1e4663 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/client_image_builder_test.py @@ -10,17 +10,21 @@ class ClientImageBuilderTestCase(absltest.TestCase): - def setUp(self) -> None: + @mock.patch( + "snowflake.ml.model._deploy_client.image_builds.client_image_builder._model_meta.ModelMetadata" + ) # type: ignore + def setUp(self, m_model_meta_class: mock.MagicMock) -> None: + m_model_meta = m_model_meta_class.return_value super().setUp() self.m_session = cast(snowpark.session.Session, mock_session.MockSession(conn=None, test_case=self)) self.unique_id = "mock_id" self.image_repo = "mock_image_repo" - self.model_dir = "local/dir/model.zip" + self.model_meta = m_model_meta self.client_image_builder = client_image_builder.ClientImageBuilder( id=self.unique_id, image_repo=self.image_repo, - model_dir=self.model_dir, + model_meta=self.model_meta, session=self.m_session, ) @@ -46,8 +50,8 @@ def test_build(self, m_tempdir: mock.MagicMock, m_docker_context_class: mock.Mag with mock.patch.object(m_docker_context, "build") as m_build, mock.patch.object( self.client_image_builder, "_build_image_from_context" ) as m_build_image_from_context: - self.client_image_builder._build(m_docker_config_dir) - m_docker_context_class.assert_called_once_with(context_dir=m_context_dir, model_dir=self.model_dir) + self.client_image_builder._build_and_tag(m_docker_config_dir) + m_docker_context_class.assert_called_once_with(context_dir=m_context_dir, model_meta=self.model_meta) m_build.assert_called_once() m_build_image_from_context.assert_called_once_with( context_dir=m_context_dir, docker_config_dir=m_docker_config_dir diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py index 931d4c69..809e48e0 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -3,6 +3,9 @@ import string from abc import ABC +from packaging import version + +from snowflake.ml.model import _model_meta from snowflake.ml.model._deploy_client.utils import constants @@ -11,15 +14,15 @@ class DockerContext(ABC): Constructs the Docker context directory required for image building. """ - def __init__(self, context_dir: str, model_dir: str) -> None: + def __init__(self, context_dir: str, model_meta: _model_meta.ModelMetadata) -> None: """Initialization Args: context_dir: Path to context directory. - model_dir: Path to local model directory. 
+ model_meta: Model Metadata """ self.context_dir = context_dir - self.model_dir = model_dir + self.model_meta = model_meta def build(self) -> None: """ @@ -38,9 +41,10 @@ def _copy_entrypoint_script_to_docker_context(self) -> None: shutil.copy(path, os.path.join(self.context_dir, constants.ENTRYPOINT_SCRIPT)) def _copy_model_env_dependency_to_docker_context(self) -> None: - path = os.path.join(self.model_dir, constants.MODEL_ENV_FOLDER) - assert os.path.exists(path), f"Model env folder missing at path: {path}" - shutil.copytree(path, os.path.join(self.context_dir, constants.MODEL_ENV_FOLDER)) + """ + Convert model dependencies to files from model metadata. + """ + self.model_meta.save_model_metadata(self.context_dir) def _generate_docker_file(self) -> None: """ @@ -49,6 +53,12 @@ def _generate_docker_file(self) -> None: docker_file_path = os.path.join(self.context_dir, "Dockerfile") docker_file_template = os.path.join(os.path.dirname(__file__), "templates/dockerfile_template") + if self.model_meta.cuda_version: + cuda_version_parsed = version.parse(self.model_meta.cuda_version) + cuda_version_str = f"{cuda_version_parsed.major}.{cuda_version_parsed.minor}" + else: + cuda_version_str = "" + with open(docker_file_path, "w", encoding="utf-8") as dockerfile, open( docker_file_template, encoding="utf-8" ) as template: @@ -59,6 +69,10 @@ def _generate_docker_file(self) -> None: "model_env_folder": constants.MODEL_ENV_FOLDER, "inference_server_dir": constants.INFERENCE_SERVER_DIR, "entrypoint_script": constants.ENTRYPOINT_SCRIPT, + # Instead of omitting this ENV var when no CUDA required, we explicitly set it to empty to override + # as no CUDA is detected thus it won't be affected by the existence of CUDA in base image. + # https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html + "cuda_override_env": cuda_version_str, } ) dockerfile.write(dockerfile_content) diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py index 7f9a6fc0..d3302f90 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py @@ -28,21 +28,27 @@ def setUp(self) -> None: self.context_dir = tempfile.mkdtemp() self.model_dir = tempfile.mkdtemp() - model_api.save_model( + self.model_meta = model_api._save( name="model", - model_dir_path=self.model_dir, + local_dir_path=self.model_dir, model=_get_sklearn_model(), sample_input=_IRIS_X, ) - self.docker_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir) + self.docker_context = docker_context.DockerContext(self.context_dir, model_meta=self.model_meta) def tearDown(self) -> None: shutil.rmtree(self.model_dir) shutil.rmtree(self.context_dir) def test_build_results_in_correct_docker_context_file_structure(self) -> None: - expected_files = ["Dockerfile", constants.INFERENCE_SERVER_DIR, constants.ENTRYPOINT_SCRIPT, "env"] + expected_files = [ + "Dockerfile", + constants.INFERENCE_SERVER_DIR, + constants.ENTRYPOINT_SCRIPT, + "env", + "model.yaml", + ] self.docker_context.build() generated_files = os.listdir(self.context_dir) self.assertCountEqual(expected_files, generated_files) @@ -69,12 +75,54 @@ def test_docker_file_content(self) -> None: actual = re.sub(comment_pattern, "", actual, flags=re.MULTILINE) self.assertEqual(actual, expected, "Generated dockerfile is not aligned with the docker template") - def 
test_docker_file_content_with_gpu(self) -> None: - gpu_context = docker_context.DockerContext(self.context_dir, model_dir=self.model_dir) - gpu_context.build() + +class DockerContextTestCuda(absltest.TestCase): + def setUp(self) -> None: + self.context_dir = tempfile.mkdtemp() + self.model_dir = tempfile.mkdtemp() + + self.model_meta = model_api._save( + name="model", + local_dir_path=self.model_dir, + model=_get_sklearn_model(), + sample_input=_IRIS_X, + ) + + self.model_meta.cuda_version = "11.7.1" + + self.docker_context = docker_context.DockerContext(self.context_dir, model_meta=self.model_meta) + + def tearDown(self) -> None: + shutil.rmtree(self.model_dir) + shutil.rmtree(self.context_dir) + + def test_build_results_in_correct_docker_context_file_structure(self) -> None: + expected_files = [ + "Dockerfile", + constants.INFERENCE_SERVER_DIR, + constants.ENTRYPOINT_SCRIPT, + "env", + "model.yaml", + ] + self.docker_context.build() + generated_files = os.listdir(self.context_dir) + self.assertCountEqual(expected_files, generated_files) + + actual_inference_files = os.listdir(os.path.join(self.context_dir, constants.INFERENCE_SERVER_DIR)) + self.assertCountEqual(["main.py"], actual_inference_files) + + model_env_dir = os.path.join(self.context_dir, "env") + self.assertTrue(os.path.exists(model_env_dir)) + + def test_docker_file_content(self) -> None: + self.docker_context.build() dockerfile_path = os.path.join(self.context_dir, "Dockerfile") - dockerfile_fixture_path = os.path.join(os.path.dirname(__file__), "test_fixtures", "dockerfile_test_fixture") - with open(dockerfile_path) as dockerfile, open(dockerfile_fixture_path) as expected_dockerfile: + dockerfile_fixture_path = os.path.join( + os.path.dirname(__file__), "test_fixtures", "dockerfile_test_fixture_with_CUDA" + ) + with open(dockerfile_path, encoding="utf-8") as dockerfile, open( + dockerfile_fixture_path, encoding="utf-8" + ) as expected_dockerfile: actual = dockerfile.read() expected = expected_dockerfile.read() diff --git a/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh b/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh index 81984c67..e380c51d 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +++ b/snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh @@ -33,4 +33,4 @@ echo "Setting number of workers to $FINAL_NUM_WORKERS" # Exclude preload option as it won't work with non-thread-safe model, and no easy way to detect whether model is # thread-safe or not. Defer the optimization later. 
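# Note: the 600-second worker timeout added below gives slow-loading models up to 10 minutes
# before gunicorn decides a worker is unresponsive and restarts it.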
-exec /opt/conda/bin/gunicorn -w "$FINAL_NUM_WORKERS" -k uvicorn.workers.UvicornWorker -b 0.0.0.0:5000 inference_server.main:app
+exec /opt/conda/bin/gunicorn -w "$FINAL_NUM_WORKERS" -k uvicorn.workers.UvicornWorker -b 0.0.0.0:5000 --timeout 600 inference_server.main:app
diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel
index 93d0e7e5..aec85abe 100644
--- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel
+++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel
@@ -7,7 +7,8 @@ py_library(
    srcs = ["main.py"],
    deps = [
        "//snowflake/ml/model:_model",
-        "//snowflake/ml/model:custom_model"
+        "//snowflake/ml/model:custom_model",
+        "//snowflake/ml/model:type_hints",
    ]
)
diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py
index 42135dfa..b9b45c52 100644
--- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py
+++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py
@@ -1,17 +1,60 @@
+import asyncio
+import http
 import logging
 import os
 import sys
 import tempfile
+import threading
 import zipfile
-from typing import List, cast
+from enum import Enum
+from typing import Dict, List, cast

 import pandas as pd
-from starlette import applications, requests, responses, routing
+from gunicorn import arbiter
+from starlette import applications, concurrency, requests, responses, routing
+
+
+class _ModelLoadingState(Enum):
+    """
+    Enum class to represent the various model loading states.
+    """
+
+    LOADING = "loading"
+    SUCCEEDED = "succeeded"
+    FAILED = "failed"
+
+
+class CustomThread(threading.Thread):
+    """
+    Custom Thread implementation that overrides Thread.run.
+
+    This is necessary because the default Thread implementation suppresses exceptions in child threads. The standard
+    behavior involves the Thread class catching exceptions and raising a SystemExit exception, which requires
+    Thread.join to terminate the process. To address this, we override Thread.run and use os._exit instead.
+
+    We exit with the specific error code Arbiter.APP_LOAD_ERROR so that the Gunicorn Arbiter master process is
+    killed, which then triggers the container to be marked as failed. This ensures the container only becomes ready
+    once all workers have loaded the model successfully.
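+    (os._exit is used rather than sys.exit because it terminates the process immediately,
+    without raising SystemExit or running any cleanup handlers.)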
+ """ + + def run(self) -> None: + try: + super().run() + except Exception as e: + logger.error(str(e)) + os._exit(arbiter.Arbiter.APP_LOAD_ERROR) + logger = logging.getLogger(__name__) _LOADED_MODEL = None _LOADED_META = None -MODEL_CODE_DIR = "code" +_MODEL_CODE_DIR = "code" +_MODEL_LOADING_STATE = _ModelLoadingState.LOADING +_MODEL_LOADING_EVENT = threading.Event() +_CONCURRENT_REQUESTS_MAX = None +_CONCURRENT_COUNTER = 0 +_CONCURRENT_COUNTER_LOCK = asyncio.Lock() +TARGET_METHOD = None def _run_setup() -> None: @@ -21,86 +64,109 @@ def _run_setup() -> None: logger.handlers = gunicorn_logger.handlers logger.setLevel(gunicorn_logger.level) + logger.info(f"ENV: {os.environ}") + global _LOADED_MODEL global _LOADED_META + global _MODEL_LOADING_STATE + global _MODEL_LOADING_EVENT + global _CONCURRENT_REQUESTS_MAX + global TARGET_METHOD - MODEL_ZIP_STAGE_PATH = os.getenv("MODEL_ZIP_STAGE_PATH") - assert MODEL_ZIP_STAGE_PATH, "Missing environment variable MODEL_ZIP_STAGE_PATH" + try: + MODEL_ZIP_STAGE_PATH = os.getenv("MODEL_ZIP_STAGE_PATH") + assert MODEL_ZIP_STAGE_PATH, "Missing environment variable MODEL_ZIP_STAGE_PATH" - root_path = os.path.abspath(os.sep) - model_zip_stage_path = os.path.join(root_path, MODEL_ZIP_STAGE_PATH) + TARGET_METHOD = os.getenv("TARGET_METHOD") - with tempfile.TemporaryDirectory() as tmp_dir: - if zipfile.is_zipfile(model_zip_stage_path): - extracted_dir = os.path.join(tmp_dir, "extracted_model_dir") - logger.info(f"Extracting model zip from {model_zip_stage_path} to {extracted_dir}") - with zipfile.ZipFile(model_zip_stage_path, "r") as model_zip: - if len(model_zip.namelist()) > 1: - model_zip.extractall(extracted_dir) - else: - raise RuntimeError(f"No model zip found at stage path: {model_zip_stage_path}") - logger.info(f"Loading model from {extracted_dir} into memory") + _CONCURRENT_REQUESTS_MAX = os.getenv("_CONCURRENT_REQUESTS_MAX", None) - sys.path.insert(0, os.path.join(extracted_dir, MODEL_CODE_DIR)) - from snowflake.ml.model import _model as model_api + root_path = os.path.abspath(os.sep) + model_zip_stage_path = os.path.join(root_path, MODEL_ZIP_STAGE_PATH) - _LOADED_MODEL, _LOADED_META = model_api._load_model_for_deploy(model_dir_path=extracted_dir) - logger.info("Successfully loaded model into memory") + with tempfile.TemporaryDirectory() as tmp_dir: + if zipfile.is_zipfile(model_zip_stage_path): + extracted_dir = os.path.join(tmp_dir, "extracted_model_dir") + logger.info(f"Extracting model zip from {model_zip_stage_path} to {extracted_dir}") + with zipfile.ZipFile(model_zip_stage_path, "r") as model_zip: + if len(model_zip.namelist()) > 1: + model_zip.extractall(extracted_dir) + else: + raise RuntimeError(f"No model zip found at stage path: {model_zip_stage_path}") + logger.info(f"Loading model from {extracted_dir} into memory") + sys.path.insert(0, os.path.join(extracted_dir, _MODEL_CODE_DIR)) + from snowflake.ml.model import ( + _model as model_api, + type_hints as model_types, + ) -async def ready(request: requests.Request) -> responses.JSONResponse: - """Endpoint to check if the application is ready.""" - return responses.JSONResponse({"status": "ready"}) + # Backward for <= 1.0.5 + if hasattr(model_api, "_load_model_for_deploy"): + _LOADED_MODEL, _LOADED_META = model_api._load_model_for_deploy( # type:ignore[attr-defined] + extracted_dir + ) + else: + _LOADED_MODEL, _LOADED_META = model_api._load( + local_dir_path=extracted_dir, + as_custom_model=True, + meta_only=False, + options=model_types.ModelLoadOption( + {"use_gpu": cast(bool, 
os.environ.get("SNOWML_USE_GPU", False))} + ), + ) + _MODEL_LOADING_STATE = _ModelLoadingState.SUCCEEDED + logger.info("Successfully loaded model into memory") + _MODEL_LOADING_EVENT.set() + except Exception as e: + _MODEL_LOADING_STATE = _ModelLoadingState.FAILED + raise RuntimeError(e) -async def predict(request: requests.Request) -> responses.JSONResponse: - """Endpoint to make predictions based on input data. +async def ready(request: requests.Request) -> responses.JSONResponse: + """Check if the application is ready to serve requests. + + This endpoint is used to determine the readiness of the application to handle incoming requests. It returns an HTTP + 200 status code only when the model has been successfully loaded into memory. If the model has not yet been loaded, + it responds with an HTTP 503 status code, which signals to the readiness probe to continue probing until the + application becomes ready or until the client's timeout is reached. Args: - request: The input data is expected to be in the following JSON format: - { - "data": [ - [0, {'_ID': 0, 'input_feature_0': 0.0, 'input_feature_1': 1.0}], - [1, {'_ID': 1, 'input_feature_0': 2.0, 'input_feature_1': 3.0}], - } - Each row is represented as a list, where the first element denotes the index of the row. + request: + The HTTP request object. Returns: - Two possible responses: - For success, return a JSON response - { - "data": [ - [0, {'_ID': 0, 'output': 1}], - [1, {'_ID': 1, 'output': 2}] - ] - }, - The first element of each resulting list denotes the index of the row, and the rest of the elements - represent the prediction results for that row. - For an error, return {"error": error_message, "status_code": http_response_status_code}. + A JSON response with status information: + - HTTP 200 status code and {"status": "ready"} when the model is loaded and the application is ready. + - HTTP 503 status code and {"status": "not ready"} when the model is not yet loaded. + """ - assert _LOADED_MODEL, "model is not loaded" - assert _LOADED_META, "model metadata is not loaded" + if _MODEL_LOADING_STATE == _ModelLoadingState.SUCCEEDED: + return responses.JSONResponse({"status": "ready"}) + return responses.JSONResponse({"status": "not ready"}, status_code=http.HTTPStatus.SERVICE_UNAVAILABLE) - TARGET_METHOD = os.getenv("TARGET_METHOD") - assert TARGET_METHOD, "Missing environment variable TARGET_METHOD" +def _do_predict(input_json: Dict[str, List[List[object]]]) -> responses.JSONResponse: from snowflake.ml.model.model_signature import FeatureSpec + assert _LOADED_MODEL, "model is not loaded" + assert _LOADED_META, "model metadata is not loaded" + assert TARGET_METHOD, "Missing environment variable TARGET_METHOD" + try: - input = await request.json() features = cast(List[FeatureSpec], _LOADED_META.signatures[TARGET_METHOD].inputs) dtype_map = {feature.name: feature.as_dtype() for feature in features} input_cols = [spec.name for spec in features] output_cols = [spec.name for spec in _LOADED_META.signatures[TARGET_METHOD].outputs] - assert "data" in input, "missing data field in the request input" + assert "data" in input_json, "missing data field in the request input" # The expression x[1:] is used to exclude the index of the data row. 
- input_data = [x[1] for x in input.get("data")] + input_data = [x[1] for x in input_json["data"]] df = pd.json_normalize(input_data).astype(dtype=dtype_map) x = df[input_cols] assert len(input_data) != 0 and not all(not row for row in input_data), "empty data" except Exception as e: error_message = f"Input data malformed: {str(e)}" - return responses.JSONResponse({"error": error_message}, status_code=400) + return responses.JSONResponse({"error": error_message}, status_code=http.HTTPStatus.BAD_REQUEST) try: predictions_df = getattr(_LOADED_MODEL, TARGET_METHOD)(x) @@ -113,7 +179,57 @@ async def predict(request: requests.Request) -> responses.JSONResponse: return responses.JSONResponse(response) except Exception as e: error_message = f"Prediction failed: {str(e)}" - return responses.JSONResponse({"error": error_message}, status_code=400) + return responses.JSONResponse({"error": error_message}, status_code=http.HTTPStatus.BAD_REQUEST) + + +async def predict(request: requests.Request) -> responses.JSONResponse: + """Endpoint to make predictions based on input data. + + Args: + request: The input data is expected to be in the following JSON format: + { + "data": [ + [0, {'_ID': 0, 'input_feature_0': 0.0, 'input_feature_1': 1.0}], + [1, {'_ID': 1, 'input_feature_0': 2.0, 'input_feature_1': 3.0}], + } + Each row is represented as a list, where the first element denotes the index of the row. + + Returns: + Two possible responses: + For success, return a JSON response + { + "data": [ + [0, {'_ID': 0, 'output': 1}], + [1, {'_ID': 1, 'output': 2}] + ] + }, + The first element of each resulting list denotes the index of the row, and the rest of the elements + represent the prediction results for that row. + For an error, return {"error": error_message, "status_code": http_response_status_code}. + """ + _MODEL_LOADING_EVENT.wait() # Ensure model is indeed loaded into memory + + global _CONCURRENT_COUNTER + global _CONCURRENT_COUNTER_LOCK + + input_json = await request.json() + + if _CONCURRENT_REQUESTS_MAX: + async with _CONCURRENT_COUNTER_LOCK: + if _CONCURRENT_COUNTER >= int(_CONCURRENT_REQUESTS_MAX): + return responses.JSONResponse( + {"error": "Too many requests"}, status_code=http.HTTPStatus.TOO_MANY_REQUESTS + ) + + async with _CONCURRENT_COUNTER_LOCK: + _CONCURRENT_COUNTER += 1 + + resp = await concurrency.run_in_threadpool(_do_predict, input_json) + + async with _CONCURRENT_COUNTER_LOCK: + _CONCURRENT_COUNTER -= 1 + + return resp def _in_test_mode() -> bool: @@ -134,8 +250,15 @@ def _in_test_mode() -> bool: def run_app() -> applications.Starlette: - if not _in_test_mode(): - _run_setup() + if _in_test_mode(): + _MODEL_LOADING_EVENT.set() + else: + # TODO[shchen]: SNOW-893654. Before SnowService supports Startup probe, or extends support for Readiness probe + # with configurable failureThreshold, we will have to load the model in a separate thread in order to prevent + # gunicorn worker timeout. 
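+        # (The /health readiness endpoint keeps returning 503 until this background thread
+        # flips _MODEL_LOADING_STATE to SUCCEEDED.)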
+ model_loading_worker = CustomThread(target=_run_setup) + model_loading_worker.start() + routes = [ routing.Route("/health", endpoint=ready, methods=["GET"]), routing.Route("/predict", endpoint=predict, methods=["POST"]), diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py index 87d33668..9c3b6d15 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py @@ -1,6 +1,8 @@ +import http import os from typing import Tuple +import main import pandas as pd import sklearn.datasets as datasets import sklearn.neighbors as neighbors @@ -22,9 +24,7 @@ class MainTest(absltest.TestCase): def setUp(self) -> None: super().setUp() - from main import app - - self.client = testclient.TestClient(app) + self.client = testclient.TestClient(main.app) self.loaded_sklearn_model, self.loaded_sklearn_meta = self.get_custom_sklearn_model() def get_custom_sklearn_model(self) -> Tuple[custom_model.CustomModel, _model_meta.ModelMetadata]: @@ -45,19 +45,32 @@ def predict(self, input: pd.DataFrame) -> pd.DataFrame: model = TestCustomModel(custom_model.ModelContext()) tmpdir = self.create_tempdir() model_name = "model_name" - model_api.save_model( + model_api._save( name=model_name, - model_dir_path=os.path.join(tmpdir.full_path, model_name), + local_dir_path=os.path.join(tmpdir.full_path, model_name), model=model, sample_input=x, metadata={"author": "halu", "version": "1"}, ) - return model_api._load_model_for_deploy(model_dir_path=os.path.join(tmpdir, model_name)) - - def test_ready_endpoint(self) -> None: - response = self.client.get("/health") - self.assertEqual(response.status_code, 200) - self.assertEqual(response.json(), {"status": "ready"}) + return model_api._load(local_dir_path=os.path.join(tmpdir, model_name), as_custom_model=True) + + def test_ready_endpoint_after_model_successfully_loaded(self) -> None: + with mock.patch("main._MODEL_LOADING_STATE", main._ModelLoadingState.SUCCEEDED): + response = self.client.get("/health") + self.assertEqual(response.status_code, http.HTTPStatus.OK) + self.assertEqual(response.json(), {"status": "ready"}) + + def test_ready_endpoint_during_model_loading(self) -> None: + with mock.patch("main._MODEL_LOADING_STATE", main._ModelLoadingState.LOADING): + response = self.client.get("/health") + self.assertEqual(response.status_code, http.HTTPStatus.SERVICE_UNAVAILABLE) + self.assertEqual(response.json(), {"status": "not ready"}) + + def test_ready_endpoint_after_model_loading_failed(self) -> None: + with mock.patch("main._MODEL_LOADING_STATE", main._ModelLoadingState.FAILED): + response = self.client.get("/health") + self.assertEqual(response.status_code, http.HTTPStatus.SERVICE_UNAVAILABLE) + self.assertEqual(response.json(), {"status": "not ready"}) def test_predict_endpoint_happy_path(self) -> None: loaded_model, loaded_meta = self.get_custom_sklearn_model() @@ -88,11 +101,11 @@ def test_predict_endpoint_happy_path(self) -> None: ] } - with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( - "main._LOADED_MODEL", loaded_model - ), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch("main.TARGET_METHOD", "predict"), mock.patch("main._LOADED_MODEL", loaded_model), mock.patch( + "main._LOADED_META", loaded_meta + ): response = self.client.post("/predict", json=data) - self.assertEqual(response.status_code, 
200) + self.assertEqual(response.status_code, http.HTTPStatus.OK) expected_response = { "data": [[0, {"output_feature_0": 1, "_ID": 0}], [1, {"output_feature_0": 2, "_ID": 1}]] } @@ -100,20 +113,20 @@ def test_predict_endpoint_happy_path(self) -> None: def test_predict_endpoint_with_invalid_input(self) -> None: loaded_model, loaded_meta = self.get_custom_sklearn_model() - with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( - "main._LOADED_MODEL", loaded_model - ), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch("main.TARGET_METHOD", "predict"), mock.patch("main._LOADED_MODEL", loaded_model), mock.patch( + "main._LOADED_META", loaded_meta + ): response = self.client.post("/predict", json={}) - self.assertEqual(response.status_code, 400) + self.assertEqual(response.status_code, http.HTTPStatus.BAD_REQUEST) self.assertRegex(response.text, "Input data malformed: missing data field in the request input") response = self.client.post("/predict", json={"data": []}) - self.assertEqual(response.status_code, 400) + self.assertEqual(response.status_code, http.HTTPStatus.BAD_REQUEST) self.assertRegex(response.text, "Input data malformed") # Input data with indexes only. response = self.client.post("/predict", json={"data": [[0], [1]]}) - self.assertEqual(response.status_code, 400) + self.assertEqual(response.status_code, http.HTTPStatus.BAD_REQUEST) self.assertRegex(response.text, "Input data malformed") response = self.client.post( @@ -125,7 +138,7 @@ def test_predict_endpoint_with_invalid_input(self) -> None: ] }, ) - self.assertEqual(response.status_code, 400) + self.assertEqual(response.status_code, http.HTTPStatus.BAD_REQUEST) self.assertRegex(response.text, "Input data malformed: missing data field in the request input") # @@ -155,11 +168,11 @@ def test_predict_with_misshaped_data(self) -> None: ] } - with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( - "main._LOADED_MODEL", loaded_model - ), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch("main.TARGET_METHOD", "predict"), mock.patch("main._LOADED_MODEL", loaded_model), mock.patch( + "main._LOADED_META", loaded_meta + ): response = self.client.post("/predict", json=data) - self.assertEqual(response.status_code, 400) + self.assertEqual(response.status_code, http.HTTPStatus.BAD_REQUEST) self.assertRegex(response.text, r"Input data malformed: .*dtype mappings argument.*") def test_predict_with_incorrect_data_type(self) -> None: @@ -179,11 +192,11 @@ def test_predict_with_incorrect_data_type(self) -> None: ] } - with mock.patch.dict(os.environ, {"TARGET_METHOD": "predict"}, clear=True), mock.patch( - "main._LOADED_MODEL", loaded_model - ), mock.patch("main._LOADED_META", loaded_meta): + with mock.patch("main.TARGET_METHOD", "predict"), mock.patch("main._LOADED_MODEL", loaded_model), mock.patch( + "main._LOADED_META", loaded_meta + ): response = self.client.post("/predict", json=data) - self.assertEqual(response.status_code, 400) + self.assertEqual(response.status_code, http.HTTPStatus.BAD_REQUEST) self.assertRegex(response.text, "Input data malformed: could not convert string to float") diff --git a/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py new file mode 100644 index 00000000..ac1843de --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py @@ -0,0 +1,220 @@ +import logging +import os +import posixpath +import 
tempfile +from string import Template + +import yaml + +from snowflake import snowpark +from snowflake.ml._internal import file_utils +from snowflake.ml._internal.utils import identifier +from snowflake.ml.model import _model_meta +from snowflake.ml.model._deploy_client.image_builds import ( + base_image_builder, + client_image_builder, + docker_context, +) +from snowflake.ml.model._deploy_client.utils import constants, snowservice_client + +logger = logging.getLogger(__name__) + + +class ServerImageBuilder(base_image_builder.ImageBuilder): + """ + Server-side image building and upload to model registry. + """ + + def __init__( + self, + *, + id: str, + image_repo: str, + model_meta: _model_meta.ModelMetadata, + session: snowpark.Session, + artifact_stage_location: str, + compute_pool: str, + ) -> None: + """Initialization + + Args: + id: A hexadecimal string used for naming the image tag. + image_repo: Path to image repository. + model_meta: Model Metadata. + session: Snowpark session + artifact_stage_location: Spec file and future deployment related artifacts will be stored under + {stage}/models/{model_id} + compute_pool: The compute pool used to run docker image build workload. + """ + self.model_id = id + self.image_repo = image_repo + self.image_tag = "/".join([image_repo.rstrip("/"), id]) + ":latest" + self.model_meta = model_meta + self.session = session + self.artifact_stage_location = artifact_stage_location + self.compute_pool = compute_pool + self.client = snowservice_client.SnowServiceClient(session) + + assert artifact_stage_location.startswith( + "@" + ), f"stage path should start with @, actual: {artifact_stage_location}" + + def build_and_upload_image(self) -> str: + """ + Builds and uploads an image to the model registry. + """ + logger.info("Starting server-side image build with Kaniko") + with tempfile.TemporaryDirectory() as context_dir: + dc = docker_context.DockerContext(context_dir=context_dir, model_meta=self.model_meta) + dc.build() + self._build_image_in_remote_job(context_dir) + return self.image_tag + + def _build_image_in_remote_job(self, context_dir: str) -> None: + """ + Args: + context_dir: Path to context directory. + + """ + context_tarball_stage_location = f"{self.artifact_stage_location}/{constants.CONTEXT}.tar.gz" + spec_stage_location = f"{self.artifact_stage_location}/{constants.IMAGE_BUILD_JOB_SPEC_TEMPLATE}.yaml" + kaniko_shell_script_stage_location = f"{self.artifact_stage_location}/{constants.KANIKO_SHELL_SCRIPT_NAME}" + + self._compress_and_upload_docker_context_tarball( + context_dir=context_dir, context_tarball_stage_location=context_tarball_stage_location + ) + + self._construct_and_upload_docker_entrypoint_script( + context_dir=context_dir, context_tarball_stage_location=context_tarball_stage_location + ) + + # This is more of a workaround to support non-spcs-registry images. + # TODO[shchen] remove such logic when first-party-image is supported on snowservice registry. + # The regular Kaniko image doesn't include a shell; only the debug image comes with a shell. 
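+        # (Kaniko's :debug image variant bundles a busybox shell.)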
We need a shell + # as we use an sh script to launch Kaniko + kaniko_image_tag = "/".join([self.image_repo.rstrip("/"), "kaniko-project/executor:debug"]) + image_builder_client = client_image_builder.ClientImageBuilder( + id=self.model_id, + image_repo=self.image_repo, + image_tag=kaniko_image_tag, + model_meta=self.model_meta, + session=self.session, + ) + base_image = image_builder_client.build_and_upload_image(image_to_pull="gcr.io/kaniko-project/executor:debug") + + self._construct_and_upload_job_spec( + base_image=base_image, + context_dir=context_dir, + kaniko_shell_script_stage_location=kaniko_shell_script_stage_location, + ) + self._launch_kaniko_job(spec_stage_location) + + def _construct_and_upload_docker_entrypoint_script( + self, context_dir: str, context_tarball_stage_location: str + ) -> None: + """Construct a shell script that invokes logic to uncompress the docker context tarball, then invoke Kaniko + executor to build images and push to image registry; the script will also ensure the docker credential(used to + authenticate to image registry) stays up-to-date when session token refreshes. + + Args: + context_dir: Path to context directory. + context_tarball_stage_location: Path context directory stage location. + """ + + kaniko_shell_script_template = os.path.join( + os.path.dirname(__file__), f"templates/{constants.KANIKO_SHELL_SCRIPT_TEMPLATE}" + ) + kaniko_shell_file = os.path.join(context_dir, constants.KANIKO_SHELL_SCRIPT_NAME) + + with open(kaniko_shell_script_template, encoding="utf-8") as template_file, open( + kaniko_shell_file, "w+", encoding="utf-8" + ) as script_file: + normed_artifact_stage_path = posixpath.normpath(identifier.remove_prefix(self.artifact_stage_location, "@")) + params = { + # Remove @ in the beginning, append "/" to denote root directory. + "tar_from": "/" + posixpath.normpath(identifier.remove_prefix(context_tarball_stage_location, "@")), + # Remove @ in the beginning, append "/" to denote root directory. + "tar_to": "/" + normed_artifact_stage_path, + "context_dir": f"dir:///{normed_artifact_stage_path}/{constants.CONTEXT}", + "image_repo": self.image_repo, + # All models will be sharing the same layer cache from the image_repo/cache directory. 
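+                # (Illustrative: an image_repo of "org/db/schema/repo" yields the cache repo
+                # "org/db/schema/repo/cache".)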
+ "cache_repo": f"{self.image_repo.rstrip('/')}/cache", + "image_destination": self.image_tag, + } + template = Template(template_file.read()) + script = template.safe_substitute(params) + script_file.write(script) + logger.debug(f"script content: \n\n {script}") + self.session.file.put( + local_file_name=kaniko_shell_file, + stage_location=self.artifact_stage_location, + auto_compress=False, + overwrite=True, + ) + + def _compress_and_upload_docker_context_tarball( + self, context_dir: str, context_tarball_stage_location: str + ) -> None: + try: + with file_utils._create_tar_gz_stream(source_dir=context_dir, arcname=constants.CONTEXT) as input_stream: + self.session.file.put_stream( + input_stream=input_stream, + stage_location=context_tarball_stage_location, + auto_compress=False, + overwrite=True, + ) + except Exception as e: + raise RuntimeError( + "Exception occurred when compressing docker context dir as tarball and upload to stage", e + ) + + def _construct_and_upload_job_spec( + self, base_image: str, context_dir: str, kaniko_shell_script_stage_location: str + ) -> None: + assert kaniko_shell_script_stage_location.startswith( + "@" + ), f"stage path should start with @, actual: {kaniko_shell_script_stage_location}" + spec_template_path = os.path.join( + os.path.dirname(__file__), f"templates/{constants.IMAGE_BUILD_JOB_SPEC_TEMPLATE}" + ) + spec_file_path = os.path.join(os.path.dirname(context_dir), f"{constants.IMAGE_BUILD_JOB_SPEC_TEMPLATE}.yaml") + + with open(spec_template_path, encoding="utf-8") as template_file, open( + spec_file_path, "w+", encoding="utf-8" + ) as spec_file: + assert self.artifact_stage_location.startswith("@") + normed_artifact_stage_path = posixpath.normpath(identifier.remove_prefix(self.artifact_stage_location, "@")) + (db, schema, stage, path) = identifier.parse_schema_level_object_identifier(normed_artifact_stage_path) + content = Template(template_file.read()).substitute( + { + "base_image": base_image, + "container_name": constants.KANIKO_CONTAINER_NAME, + "stage": identifier.get_schema_level_object_identifier(db, schema, stage), + # Remove @ in the beginning, append "/" to denote root directory. + "script_path": "/" + + posixpath.normpath(identifier.remove_prefix(kaniko_shell_script_stage_location, "@")), + } + ) + content_dict = yaml.safe_load(content) + yaml.dump(content_dict, spec_file) + spec_file.seek(0) + logger.debug(f"Kaniko job spec file: \n\n {spec_file.read()}") + + self.session.file.put( + local_file_name=spec_file_path, + stage_location=self.artifact_stage_location, + auto_compress=False, + overwrite=True, + ) + + def _launch_kaniko_job(self, spec_stage_location: str) -> None: + job_id = self.client.create_job(compute_pool=self.compute_pool, spec_stage_location=spec_stage_location) + logger.debug(f"Submit job for building docker image in kaniko with job id {job_id}") + # Given image build can take a while, we set a generous timeout to be 1 hour. 
+ self.client.block_until_resource_is_ready( + resource_name=job_id, + resource_type=constants.ResourceType.JOB, + container_name=constants.KANIKO_CONTAINER_NAME, + max_retries=240, + retry_interval_secs=15, + ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/server_image_builder_test.py b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder_test.py new file mode 100644 index 00000000..cbb410ca --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/server_image_builder_test.py @@ -0,0 +1,60 @@ +import os +import tempfile + +from absl.testing import absltest +from absl.testing.absltest import mock + +from snowflake.ml.model._deploy_client.image_builds import server_image_builder +from snowflake.ml.model._deploy_client.utils import constants + + +class ServerImageBuilderTestCase(absltest.TestCase): + @mock.patch( + "snowflake.ml.model._deploy_client.image_builds.client_image_builder._model_meta.ModelMetadata" + ) # type: ignore + def setUp(self, m_model_meta_class: mock.MagicMock) -> None: + m_model_meta = m_model_meta_class.return_value + super().setUp() + self.unique_id = "mock_id" + self.image_repo = "mock_image_repo" + self.model_meta = m_model_meta + self.artifact_stage_location = "@stage/models/id" + self.compute_pool = "test_pool" + self.context_tarball_stage_location = f"{self.artifact_stage_location}/context.tar.gz" + + @mock.patch("snowflake.ml.model._deploy_client.image_builds.server_image_builder.snowpark.Session") # type: ignore + def test_construct_and_upload_docker_entrypoint_script(self, m_session_class: mock.MagicMock) -> None: + m_session = m_session_class.return_value + mock_file_put = mock.MagicMock() + m_session.file.put = mock_file_put + + builder = server_image_builder.ServerImageBuilder( + id=self.unique_id, + image_repo=self.image_repo, + model_meta=self.model_meta, + session=m_session, + artifact_stage_location=self.artifact_stage_location, + compute_pool=self.compute_pool, + ) + + with tempfile.TemporaryDirectory() as context_dir: + shell_file_path = os.path.join(context_dir, constants.KANIKO_SHELL_SCRIPT_NAME) + fixture_path = os.path.join(os.path.dirname(__file__), "test_fixtures", "kaniko_shell_script_fixture.sh") + builder._construct_and_upload_docker_entrypoint_script( + context_dir=context_dir, context_tarball_stage_location=self.context_tarball_stage_location + ) + m_session.file.put.assert_called_once_with( + local_file_name=shell_file_path, + stage_location=self.artifact_stage_location, + auto_compress=False, + overwrite=True, + ) + + with open(shell_file_path, encoding="utf-8") as shell_file, open(fixture_path, encoding="utf-8") as fixture: + actual = shell_file.read() + expected = fixture.read() + self.assertEqual(actual, expected, "Generated image build shell script is not the same") + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template index b9faa3a0..a1802438 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template @@ -1,37 +1,44 @@ -FROM $base_image as build +FROM ${base_image} as build -COPY $model_env_folder/conda.yaml conda.yaml -COPY $model_env_folder/requirements.txt requirements.txt +COPY ${model_env_folder}/conda.yaml conda.yaml +COPY ${model_env_folder}/requirements.txt requirements.txt # Set MAMBA_DOCKERFILE_ACTIVATE=1 to 
activate the conda environment during build time. ARG MAMBA_DOCKERFILE_ACTIVATE=1 # The micromamba image comes with an empty environment named base. -RUN --mount=type=cache,target=/opt/conda/pkgs micromamba install -y -n base -f conda.yaml && \ +# CONDA_OVERRIDE_CUDA ref https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html +RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="${cuda_override_env}" \ + micromamba install -y -n base -f conda.yaml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ - python -m pip install -r requirements.txt - -FROM debian:buster-slim AS runtime - -ENV USER nonrootuser -ENV UID 1000 -ENV HOME /home/$USER -RUN adduser --disabled-password \ - --gecos "A non-root user for running inference server" \ - --uid $UID \ - --home $HOME \ - $USER - -COPY $inference_server_dir ./$inference_server_dir -COPY $entrypoint_script ./$entrypoint_script -RUN chmod +x /$entrypoint_script - -# The mamba root prefix by default is set to /opt/conda, in which the base conda environment is built at. -COPY --from=build /opt/conda /opt/conda + python -m pip install -r requirements.txt && \ + micromamba clean -afy + +# Bitsandbytes uses this ENVVAR to determine CUDA library location +ENV CONDA_PREFIX=/opt/conda + +COPY ${inference_server_dir} ./${inference_server_dir} +COPY ${entrypoint_script} ./${entrypoint_script} + +USER root +RUN if id mambauser >/dev/null 2>&1; then \ + echo "mambauser already exists."; \ + else \ + # Set environment variables + export USER=mambauser && \ + export UID=1000 && \ + export HOME=/home/$USER && \ + echo "Creating $USER user..." && \ + adduser --disabled-password \ + --gecos "A non-root user for running inference server" \ + --uid $UID \ + --home $HOME \ + $USER; \ + fi +RUN chmod +x ./${entrypoint_script} +USER mambauser # Expose the port on which the Starlette app will run. EXPOSE 5000 -USER nonrootuser - -CMD ["/$entrypoint_script"] +CMD ["./${entrypoint_script}"] diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template b/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template new file mode 100644 index 00000000..9f8baa08 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/image_build_job_spec_template @@ -0,0 +1,22 @@ +spec: + container: + - name: $container_name + image: $base_image + command: + - sh + args: + - -c + - >- + while [ ! -f "$script_path" ]; do sleep 1; done; + chmod +x $script_path; + sh $script_path; + volumeMounts: + - name: vol1 + mountPath: /local/user/vol1 + - name: stagemount + mountPath: /$stage + volume: + - name: vol1 + source: local # only local emptyDir volume is supported + - name: stagemount + source: "@$stage" diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template b/snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template new file mode 100644 index 00000000..0983ce21 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/templates/kaniko_shell_script_template @@ -0,0 +1,78 @@ +#!/bin/sh + +# Set the file path to monitor +REGISTRY_CRED_PATH="/kaniko/.docker/config.json" +SESSION_TOKEN_PATH="/snowflake/session/token" + +# Function to gracefully terminate the file monitoring job +cleanup() { + echo "Stopping file monitoring job..." + trap - INT TERM # Remove the signal handlers + kill -- -$$$ # Kill the entire process group. 
Extra $ to escape, the generated shell script should have two $. +} + +generate_registry_cred() { + AUTH_TOKEN=$(echo -n "0auth2accesstoken:$(cat ${SESSION_TOKEN_PATH})" | base64); + echo '{"auths":{"$image_repo":{"auth":"'"$AUTH_TOKEN"'"}}}' | tr -d '\n' > $REGISTRY_CRED_PATH; +} + +on_session_token_change() { + # Get the initial checksum of the file + CHECKSUM=$(md5sum "${SESSION_TOKEN_PATH}" | awk '{ print $1 }') + # Run the command once before the loop + echo "Monitoring session token changes in the background..." + ( + while true; do + # Get the current checksum of the file + CURRENT_CHECKSUM=$(md5sum "${SESSION_TOKEN_PATH}" | awk '{ print $1 }') + if [ "${CURRENT_CHECKSUM}" != "${CHECKSUM}" ]; then + # Session token file has changed, regenerate registry credential. + echo "Session token has changed. Regenerating registry auth credentials." + generate_registry_cred + CHECKSUM="${CURRENT_CHECKSUM}" + fi + # Wait for a short period of time before checking again + sleep 1 + done + ) +} + +run_kaniko() { + # Run the Kaniko command in the foreground + echo "Starting Kaniko command..." + + # Set cache ttl to a large value as snowservice registry doesn't support deleting cache anyway. + /kaniko/executor \ + --dockerfile Dockerfile \ + --context ${context_dir} \ + --destination=${image_destination} \ + --cache=true \ + --cache-copy-layers=false \ + --use-new-run \ + --snapshot-mode=redo \ + --cache-repo=${cache_repo} \ + --cache-run-layers=true \ + --cache-ttl=8760h \ + --push-retry=3 \ + --image-fs-extract-retry=5 \ + --log-timestamp +} + +setup() { + tar -C "${tar_to}" -xf "${tar_from}"; + generate_registry_cred + # Set up the signal handlers + trap cleanup TERM +} + +setup + +# Running kaniko job on the foreground and session token monitoring on the background. When session token changes, +# overwrite the existing registry cred file with the new session token. +on_session_token_change & +run_kaniko + +# Capture the exit code from the previous kaniko command. +KANIKO_EXIT_CODE=$? +# Exit with the same exit code as the Kaniko command. This then triggers the cleanup function. 
+exit $KANIKO_EXIT_CODE diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture index aa44a8f6..77100d65 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture @@ -3,27 +3,32 @@ FROM mambaorg/micromamba:1.4.3 as build COPY env/conda.yaml conda.yaml COPY env/requirements.txt requirements.txt ARG MAMBA_DOCKERFILE_ACTIVATE=1 -RUN --mount=type=cache,target=/opt/conda/pkgs micromamba install -y -n base -f conda.yaml && \ +RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="" \ + micromamba install -y -n base -f conda.yaml && \ python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ - python -m pip install -r requirements.txt - -FROM debian:buster-slim AS runtime - -ENV USER nonrootuser -ENV UID 1000 -ENV HOME /home/$USER -RUN adduser --disabled-password \ - --gecos "A non-root user for running inference server" \ - --uid $UID \ - --home $HOME \ - $USER + python -m pip install -r requirements.txt && \ + micromamba clean -afy +ENV CONDA_PREFIX=/opt/conda COPY inference_server ./inference_server COPY gunicorn_run.sh ./gunicorn_run.sh -RUN chmod +x /gunicorn_run.sh -COPY --from=build /opt/conda /opt/conda -EXPOSE 5000 -USER nonrootuser +USER root +RUN if id mambauser >/dev/null 2>&1; then \ + echo "mambauser already exists."; \ + else \ + export USER=mambauser && \ + export UID=1000 && \ + export HOME=/home/$USER && \ + echo "Creating $USER user..." && \ + adduser --disabled-password \ + --gecos "A non-root user for running inference server" \ + --uid $UID \ + --home $HOME \ + $USER; \ + fi +RUN chmod +x ./gunicorn_run.sh +USER mambauser +EXPOSE 5000 -CMD ["/gunicorn_run.sh"] +CMD ["./gunicorn_run.sh"] diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA new file mode 100644 index 00000000..70ede47d --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/dockerfile_test_fixture_with_CUDA @@ -0,0 +1,34 @@ +FROM mambaorg/micromamba:1.4.3 as build + +COPY env/conda.yaml conda.yaml +COPY env/requirements.txt requirements.txt +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +RUN --mount=type=cache,target=/opt/conda/pkgs CONDA_OVERRIDE_CUDA="11.7" \ + micromamba install -y -n base -f conda.yaml && \ + python -m pip install "uvicorn[standard]" gunicorn starlette==0.30.0 && \ + python -m pip install -r requirements.txt && \ + micromamba clean -afy +ENV CONDA_PREFIX=/opt/conda + +COPY inference_server ./inference_server +COPY gunicorn_run.sh ./gunicorn_run.sh + +USER root +RUN if id mambauser >/dev/null 2>&1; then \ + echo "mambauser already exists."; \ + else \ + export USER=mambauser && \ + export UID=1000 && \ + export HOME=/home/$USER && \ + echo "Creating $USER user..." 
&& \ + adduser --disabled-password \ + --gecos "A non-root user for running inference server" \ + --uid $UID \ + --home $HOME \ + $USER; \ + fi +RUN chmod +x ./gunicorn_run.sh +USER mambauser +EXPOSE 5000 + +CMD ["./gunicorn_run.sh"] diff --git a/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/kaniko_shell_script_fixture.sh b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/kaniko_shell_script_fixture.sh new file mode 100644 index 00000000..624510d7 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/test_fixtures/kaniko_shell_script_fixture.sh @@ -0,0 +1,78 @@ +#!/bin/sh + +# Set the file path to monitor +REGISTRY_CRED_PATH="/kaniko/.docker/config.json" +SESSION_TOKEN_PATH="/snowflake/session/token" + +# Function to gracefully terminate the file monitoring job +cleanup() { + echo "Stopping file monitoring job..." + trap - INT TERM # Remove the signal handlers + kill -- -$$ # Kill the entire process group. Extra $ to escape, the generated shell script should have two $. +} + +generate_registry_cred() { + AUTH_TOKEN=$(echo -n "0auth2accesstoken:$(cat ${SESSION_TOKEN_PATH})" | base64); + echo '{"auths":{"mock_image_repo":{"auth":"'"$AUTH_TOKEN"'"}}}' | tr -d '\n' > $REGISTRY_CRED_PATH; +} + +on_session_token_change() { + # Get the initial checksum of the file + CHECKSUM=$(md5sum "${SESSION_TOKEN_PATH}" | awk '{ print $1 }') + # Run the command once before the loop + echo "Monitoring session token changes in the background..." + ( + while true; do + # Get the current checksum of the file + CURRENT_CHECKSUM=$(md5sum "${SESSION_TOKEN_PATH}" | awk '{ print $1 }') + if [ "${CURRENT_CHECKSUM}" != "${CHECKSUM}" ]; then + # Session token file has changed, regenerate registry credential. + echo "Session token has changed. Regenerating registry auth credentials." + generate_registry_cred + CHECKSUM="${CURRENT_CHECKSUM}" + fi + # Wait for a short period of time before checking again + sleep 1 + done + ) +} + +run_kaniko() { + # Run the Kaniko command in the foreground + echo "Starting Kaniko command..." + + # Set cache ttl to a large value as snowservice registry doesn't support deleting cache anyway. + /kaniko/executor \ + --dockerfile Dockerfile \ + --context dir:///stage/models/id/context \ + --destination=mock_image_repo/mock_id:latest \ + --cache=true \ + --cache-copy-layers=false \ + --use-new-run \ + --snapshot-mode=redo \ + --cache-repo=mock_image_repo/cache \ + --cache-run-layers=true \ + --cache-ttl=8760h \ + --push-retry=3 \ + --image-fs-extract-retry=5 \ + --log-timestamp +} + +setup() { + tar -C "/stage/models/id" -xf "/stage/models/id/context.tar.gz"; + generate_registry_cred + # Set up the signal handlers + trap cleanup TERM +} + +setup + +# Running kaniko job on the foreground and session token monitoring on the background. When session token changes, +# overwrite the existing registry cred file with the new session token. +on_session_token_change & +run_kaniko + +# Capture the exit code from the previous kaniko command. +KANIKO_EXIT_CODE=$? +# Exit with the same exit code as the Kaniko command. This then triggers the cleanup function. 
+exit $KANIKO_EXIT_CODE diff --git a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel index 65bde879..6d913c3c 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/snowservice/BUILD.bazel @@ -14,20 +14,26 @@ py_library( name = "deploy", srcs = ["deploy.py"], deps = [ - "//snowflake/ml/model:_model", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_deploy_client/image_builds:base_image_builder", "//snowflake/ml/model/_deploy_client/image_builds:client_image_builder", - ":deploy_options", + "//snowflake/ml/model/_deploy_client/image_builds:server_image_builder", "//snowflake/ml/model/_deploy_client/utils:snowservice_client", - "//snowflake/ml/_internal:file_utils" + "//snowflake/ml/_internal/utils:identifier", + ":deploy_options", + ":instance_types" ], data = [ "templates/service_spec_template" ] ) +py_library( + name = "instance_types", + srcs = ["instance_types.py"] +) + py_test( name = "deploy_test", srcs = ["deploy_test.py"], diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy.py b/snowflake/ml/model/_deploy_client/snowservice/deploy.py index aaee5b81..d2b49a6f 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy.py @@ -1,20 +1,27 @@ +import copy import logging import os import posixpath import string import tempfile +import time from abc import ABC from typing import Any, Dict, Optional, cast import yaml from typing_extensions import Unpack -from snowflake.ml._internal import file_utils -from snowflake.ml.model import _model, _model_meta, type_hints -from snowflake.ml.model._deploy_client.image_builds import client_image_builder -from snowflake.ml.model._deploy_client.snowservice import deploy_options +from snowflake.ml._internal import env_utils +from snowflake.ml._internal.utils import identifier, query_result_checker +from snowflake.ml.model import _model_meta, type_hints +from snowflake.ml.model._deploy_client.image_builds import ( + base_image_builder, + client_image_builder, + server_image_builder, +) +from snowflake.ml.model._deploy_client.snowservice import deploy_options, instance_types from snowflake.ml.model._deploy_client.utils import constants, snowservice_client -from snowflake.snowpark import FileOperation, Session +from snowflake.snowpark import Session logger = logging.getLogger(__name__) @@ -23,18 +30,20 @@ def _deploy( session: Session, *, model_id: str, + model_meta: _model_meta.ModelMetadata, service_func_name: str, model_zip_stage_path: str, deployment_stage_path: str, target_method: str, **kwargs: Unpack[type_hints.SnowparkContainerServiceDeployOptions], -) -> _model_meta.ModelMetadata: +) -> None: """Entrypoint for model deployment to SnowService. This function will trigger a docker image build followed by workflow deployment to SnowService. Args: session: Snowpark session model_id: Unique hex string of length 32, provided by model registry. + model_meta: Model Metadata. service_func_name: The service function name in SnowService associated with the created service. model_zip_stage_path: Path to model zip file in stage. Note that this path has a "@" prefix. deployment_stage_path: Path to stage containing deployment artifacts. @@ -45,9 +54,6 @@ def _deploy( ValueError: Raised when model_id is empty. ValueError: Raised when service_func_name is empty. ValueError: Raised when model_stage_file_path is empty. 
-
-    Returns:
-        The metadata of the model that has been deployed.
     """
     snowpark_logger = logging.getLogger("snowflake.snowpark")
     snowflake_connector_logger = logging.getLogger("snowflake.connector")
@@ -79,36 +85,78 @@ def _deploy(
         assert deployment_stage_path.startswith("@"), f"stage path should start with @, actual: {deployment_stage_path}"
         options = deploy_options.SnowServiceDeployOptions.from_dict(cast(Dict[str, Any], kwargs))
 
+        model_meta_deploy = copy.deepcopy(model_meta)
+        if options.use_gpu:
+            # Make mypy happy
+            assert options.num_gpus is not None
+            if model_meta.cuda_version is None:
+                raise ValueError(
+                    "You are requesting GPUs for models that do not use a GPU or does not have CUDA version set."
+                )
+            _validate_requested_gpus(session, request_gpus=options.num_gpus, compute_pool=options.compute_pool)
+            if model_meta.cuda_version:
+                (
+                    model_meta_deploy._conda_dependencies,
+                    model_meta_deploy._pip_requirements,
+                ) = env_utils.generate_env_for_cuda(
+                    model_meta._conda_dependencies, model_meta._pip_requirements, model_meta.cuda_version
+                )
+        else:
+            # If the user does not need GPU, we set this copy's cuda_version to None; conversely, whenever the
+            # image builder gets a non-None cuda_version, it knows that GPU support is required.
+            model_meta_deploy._cuda_version = None
+
+        # Set conda-forge as backup channel for SPCS deployment
+        if "conda-forge" not in model_meta_deploy._conda_dependencies:
+            model_meta_deploy._conda_dependencies["conda-forge"] = []
+
         # TODO[shchen]: SNOW-863701, Explore ways to prevent entire model zip being downloaded during deploy step
         # (for both warehouse and snowservice deployment)
         # One alternative is for model registry to duplicate the model metadata and env dependency storage from model
        # zip so that we don't have to pull down the entire model zip.
-        fo = FileOperation(session=session)
-        zf = fo.get_stream(model_zip_stage_path)
-        with file_utils.unzip_stream_in_temp_dir(stream=zf) as temp_local_model_dir_path:
-            # Download the model zip file that is already uploaded to stage during model registry log_model step.
-            # This is needed in order to obtain the conda and requirement file inside the model zip, as well as to
-            # return the model object needed for deployment info tracking.
-            ss_deployment = SnowServiceDeployment(
-                session=session,
-                model_id=model_id,
-                service_func_name=service_func_name,
-                model_zip_stage_path=model_zip_stage_path,  # Pass down model_zip_stage_path for service spec file
-                deployment_stage_path=deployment_stage_path,
-                model_dir=temp_local_model_dir_path,
-                target_method=target_method,
-                options=options,
-            )
-            ss_deployment.deploy()
-            meta = _model.load_model(model_dir_path=temp_local_model_dir_path, meta_only=True)
-            return meta
+        ss_deployment = SnowServiceDeployment(
+            session=session,
+            model_id=model_id,
+            model_meta=model_meta_deploy,
+            service_func_name=service_func_name,
+            model_zip_stage_path=model_zip_stage_path,  # Pass down model_zip_stage_path for service spec file
+            deployment_stage_path=deployment_stage_path,
+            target_method=target_method,
+            options=options,
+        )
+        ss_deployment.deploy()
     finally:
         # Preserve the original logging level.
         snowpark_logger.setLevel(snowpark_log_level)
         snowflake_connector_logger.setLevel(snowflake_connector_log_level)
 
 
-def _get_or_create_image_repo(session: Session, *, image_repo: Optional[str]) -> str:
+def _validate_requested_gpus(session: Session, *, request_gpus: int, compute_pool: str) -> None:
+    # Strip double quotes from the fully qualified name, since quoted identifiers do not work well in the
+    # DESC COMPUTE POOL syntax.
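+    # e.g. a compute_pool of '"DB"."SCHEMA"."MY_POOL"' (an illustrative value) becomes 'DB.SCHEMA.MY_POOL'.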
+    compute_pool = compute_pool.replace('"', "")
+    sql = f"DESC COMPUTE POOL {compute_pool}"
+    result = (
+        query_result_checker.SqlResultValidator(
+            session=session,
+            query=sql,
+        )
+        .has_column("instance_family")
+        .has_dimensions(expected_rows=1)
+        .validate()
+    )
+    instance_family = result[0]["instance_family"]
+    if instance_family in instance_types.INSTANCE_TYPE_TO_GPU_COUNT:
+        gpu_capacity = instance_types.INSTANCE_TYPE_TO_GPU_COUNT[instance_family]
+        if request_gpus > gpu_capacity:
+            raise RuntimeError(
+                f"GPU request exceeds instance capability; {instance_family} instance type has a total "
+                f"capacity of {gpu_capacity} GPU(s), yet a request was made for {request_gpus} GPUs."
+            )
+    else:
+        logger.warning(f"Unknown instance type: {instance_family}, skipping GPU validation")
+
+
+def _get_or_create_image_repo(session: Session, *, service_func_name: str, image_repo: Optional[str]) -> str:
     def _sanitize_dns_url(url: str) -> str:
         # Align with existing SnowService image registry url standard.
         return url.lower()
@@ -120,15 +168,22 @@ def _sanitize_dns_url(url: str) -> str:
         conn = session._conn._conn
         org = conn.host.split(".")[1]
         account = conn.account
-        db = conn._database
-        schema = conn._schema
+        # We try to use the same db and schema in which the service function is located; we can retrieve them
+        # when the function name is fully qualified. Otherwise we fall back to the current session's db and schema.
+        (_db, _schema, _, _) = identifier.parse_schema_level_object_identifier(service_func_name)
+        db = _db if _db is not None else conn._database
+        schema = _schema if _schema is not None else conn._schema
+        if db is None or schema is None:
+            # Will be captured in L180
+            raise ValueError()
+        assert isinstance(db, str) and isinstance(schema, str)
         subdomain = constants.PROD_IMAGE_REGISTRY_SUBDOMAIN
         sanitized_url = _sanitize_dns_url(
             f"{org}-{account}.{subdomain}.{constants.PROD_IMAGE_REGISTRY_DOMAIN}/{db}/"
             f"{schema}/{constants.SNOWML_IMAGE_REPO}"
         )
         client = snowservice_client.SnowServiceClient(session)
-        client.create_image_repo(constants.SNOWML_IMAGE_REPO)
+        client.create_image_repo(identifier.get_schema_level_object_identifier(db, schema, constants.SNOWML_IMAGE_REPO))
         return sanitized_url
     except Exception:
         raise RuntimeError(
@@ -146,8 +201,8 @@ def __init__(
         self,
         session: Session,
         model_id: str,
+        model_meta: _model_meta.ModelMetadata,
         service_func_name: str,
-        model_dir: str,
         model_zip_stage_path: str,
         deployment_stage_path: str,
         target_method: str,
@@ -159,8 +214,8 @@ def __init__(
            session: Snowpark session
            model_id: Unique hex string of length 32, provided by model registry; if not provided, auto-generate one
                for resource naming.The model_id serves as an idempotent key throughout the deployment workflow.
+            model_meta: Model Metadata.
            service_func_name: The service function name in SnowService associated with the created service.
-            model_dir: Local model directory, downloaded form stage and extracted.
            model_zip_stage_path: Path to model zip file in stage.
            deployment_stage_path: Path to stage containing deployment artifacts.
            target_method: The name of the target method to be deployed.
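To make the db/schema resolution above concrete, here is a minimal, self-contained sketch (not part of the patch) of how a service function name selects the target for the image repo URL. The org-acct prefix, the domain, and the snowml_repo name are illustrative placeholders; the real code uses identifier.parse_schema_level_object_identifier plus session state rather than a bare split:

    def sketch_image_repo_url(service_func_name: str, session_db: str, session_schema: str) -> str:
        # A fully qualified name ("db.schema.func") carries its own db/schema;
        # otherwise we fall back to the session's current db/schema.
        parts = service_func_name.split(".")
        db, schema = (parts[0], parts[1]) if len(parts) == 3 else (session_db, session_schema)
        # Mirrors the sanitized (lower-cased DNS form) URL shape built in the hunk above.
        return f"org-acct.registry.snowflakecomputing.com/{db}/{schema}/snowml_repo".lower()

    # e.g. sketch_image_repo_url("another_db.another_schema.func", "DB", "SCHEMA")
    #   -> "org-acct.registry.snowflakecomputing.com/another_db/another_schema/snowml_repo"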
@@ -169,12 +224,14 @@ def __init__( self.session = session self.id = model_id + self.model_meta = model_meta self.service_func_name = service_func_name self.model_zip_stage_path = model_zip_stage_path - self.model_dir = model_dir self.options = options self.target_method = target_method - self._service_name = f"service_{model_id}" + (db, schema, _, _) = identifier.parse_schema_level_object_identifier(service_func_name) + + self._service_name = identifier.get_schema_level_object_identifier(db, schema, f"service_{model_id}") # Spec file and future deployment related artifacts will be stored under {stage}/models/{model_id} self._model_artifact_stage_location = posixpath.join(deployment_stage_path, "models", self.id) @@ -190,7 +247,10 @@ def deploy(self) -> None: "Building the Docker image and deploying to Snowpark Container Service. " "This process may take a few minutes." ) + start = time.time() image = self._build_and_upload_image() + end = time.time() + logger.info(f"Time taken to build and upload image to registry: {end-start:.2f} seconds") logger.warning( f"Image successfully built! To prevent the need for rebuilding the Docker image in future deployments, " @@ -204,13 +264,23 @@ def _build_and_upload_image(self) -> str: Returns: Path to the image in the remote image repository. """ - image_repo = _get_or_create_image_repo(self.session, image_repo=self.options.image_repo) - image_builder = client_image_builder.ClientImageBuilder( - id=self.id, - image_repo=image_repo, - model_dir=self.model_dir, - session=self.session, + image_repo = _get_or_create_image_repo( + self.session, service_func_name=self.service_func_name, image_repo=self.options.image_repo ) + image_builder: base_image_builder.ImageBuilder + if self.options.enable_remote_image_build: + image_builder = server_image_builder.ServerImageBuilder( + id=self.id, + image_repo=image_repo, + model_meta=self.model_meta, + session=self.session, + artifact_stage_location=self._model_artifact_stage_location, + compute_pool=self.options.compute_pool, + ) + else: + image_builder = client_image_builder.ClientImageBuilder( + id=self.id, image_repo=image_repo, model_meta=self.model_meta, session=self.session + ) return image_builder.build_and_upload_image() def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: @@ -227,19 +297,23 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: with open(spec_template_path, encoding="utf-8") as template, open( spec_file_path, "w+", encoding="utf-8" ) as spec_file: + assert self.model_zip_stage_path.startswith("@") + norm_stage_path = posixpath.normpath(identifier.remove_prefix(self.model_zip_stage_path, "@")) + (db, schema, stage, path) = identifier.parse_schema_level_object_identifier(norm_stage_path) content = string.Template(template.read()).substitute( { "image": image, "predict_endpoint_name": constants.PREDICT, - "model_stage": self.model_zip_stage_path[1:].split("/")[0], # Reserve only the stage name - "model_zip_stage_path": self.model_zip_stage_path[1:], # Remove the @ prefix + "model_stage": identifier.get_schema_level_object_identifier(db, schema, stage), + "model_zip_stage_path": norm_stage_path, "inference_server_container_name": constants.INFERENCE_SERVER_CONTAINER, "target_method": self.target_method, "num_workers": self.options.num_workers, + "use_gpu": self.options.use_gpu, } ) content_dict = yaml.safe_load(content) - if self.options.num_gpus is not None and self.options.num_gpus > 0: + if self.options.use_gpu: container = 
content_dict["spec"]["container"][0] # TODO[shchen]: SNOW-871538, external dependency that only single GPU is supported on SnowService. # GPU limit has to be specified in order to trigger the workload to be run on GPU in SnowService. @@ -248,6 +322,13 @@ def _prepare_and_upload_artifacts_to_stage(self, image: str) -> None: "requests": {"nvidia.com/gpu": self.options.num_gpus}, } + # Make LLM use case sequential + if any( + model_blob_meta.model_type == "huggingface_pipeline" + for model_blob_meta in self.model_meta.models.values() + ): + container["env"]["_CONCURRENT_REQUESTS_MAX"] = 1 + yaml.dump(content_dict, spec_file) spec_file.seek(0) logger.debug(f"Create service spec: \n {spec_file.read()}") @@ -284,8 +365,21 @@ def _deploy_workflow(self, image: str) -> None: client.block_until_resource_is_ready( resource_name=self._service_name, resource_type=constants.ResourceType.SERVICE ) + + # To avoid too large batch in HF LLM case + max_batch_rows = None + if self.options.use_gpu: + for model_blob_meta in self.model_meta.models.values(): + if model_blob_meta.model_type == "huggingface_pipeline": + batch_size = int(model_blob_meta.options.get("batch_size", 1)) + if max_batch_rows is None: + max_batch_rows = batch_size + else: + max_batch_rows = min(batch_size, max_batch_rows) + client.create_or_replace_service_function( service_func_name=self.service_func_name, service_name=self._service_name, endpoint_name=constants.PREDICT, + max_batch_rows=max_batch_rows, ) diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py index db35fb3f..4154bd7a 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_options.py @@ -1,4 +1,5 @@ import inspect +import logging from typing import Any, Dict, Optional from snowflake.ml.model._deploy_client.utils import constants @@ -12,10 +13,10 @@ def __init__( image_repo: Optional[str] = None, min_instances: Optional[int] = 1, max_instances: Optional[int] = 1, - endpoint: Optional[str] = constants.PREDICT, prebuilt_snowflake_image: Optional[str] = None, num_gpus: Optional[int] = 0, num_workers: Optional[int] = None, + enable_remote_image_build: Optional[bool] = False, ) -> None: """Initialization @@ -28,8 +29,6 @@ def __init__( inferred based on session information. min_instances: Minimum number of service replicas. Default to 1. max_instances: Maximum number of service replicas. Default to 1. - endpoint: The specific name of the endpoint that the service function will communicate with. This option is - useful when the service has multiple endpoints. Default to “predict”. prebuilt_snowflake_image: When provided, the image-building step is skipped, and the pre-built image from Snowflake is used as is. This option is for users who consistently use the same image for multiple use cases, allowing faster deployment. The snowflake image used for deployment is logged to the console for @@ -38,16 +37,26 @@ def __init__( num_workers: Number of workers used for model inference. Please ensure that the number of workers is set lower than the total available memory divided by the size of model to prevent memory-related issues. Default is number of CPU cores * 2 + 1. + enable_remote_image_build: When set to True, will enable image build on a remote SnowService job. + Default is False. 
""" self.compute_pool = compute_pool self.image_repo = image_repo self.min_instances = min_instances self.max_instances = max_instances - self.endpoint = endpoint self.prebuilt_snowflake_image = prebuilt_snowflake_image self.num_gpus = num_gpus self.num_workers = num_workers + self.enable_remote_image_build = enable_remote_image_build + + if self.num_workers is None and self.use_gpu: + logging.info("num_workers has been defaulted to 1 when using GPU.") + self.num_workers = 1 + + @property + def use_gpu(self) -> bool: + return self.num_gpus is not None and self.num_gpus > 0 @classmethod def from_dict(cls, options_dict: Dict[str, Any]) -> "SnowServiceDeployOptions": diff --git a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py index 6fc058d6..0aae0a41 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py +++ b/snowflake/ml/model/_deploy_client/snowservice/deploy_test.py @@ -10,8 +10,8 @@ _get_or_create_image_repo, ) from snowflake.ml.model._deploy_client.utils import constants -from snowflake.ml.test_utils import mock_session -from snowflake.snowpark import FileOperation, session +from snowflake.ml.test_utils import mock_data_frame, mock_session +from snowflake.snowpark import row, session class Connection: @@ -25,59 +25,56 @@ def __init__(self, host: str, account: str, database: str, schema: str) -> None: class DeployTestCase(absltest.TestCase): def setUp(self) -> None: super().setUp() - self.m_session = cast(session.Session, mock_session.MockSession(conn=None, test_case=self)) + self.m_session = mock_session.MockSession(conn=None, test_case=self) self.options: Dict[str, Any] = { "compute_pool": "mock_compute_pool", "image_repo": "mock_image_repo", } - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model") # type: ignore - @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.file_utils") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore - def test_deploy_with_model_id( - self, m_deployment_class: mock.MagicMock, m_file_utils_class: mock.MagicMock, m_model_class: mock.MagicMock - ) -> None: + def test_deploy_with_model_id(self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock) -> None: m_deployment = m_deployment_class.return_value - m_file_utils = m_file_utils_class.return_value + m_model_meta = m_model_meta_class.return_value - m_extracted_model_dir = "mock_extracted_model_dir" m_model_zip_stage_path = "@mock_model_zip_stage_path/model.zip" m_deployment_stage_path = "@mock_model_deployment_stage_path" - with mock.patch.object(FileOperation, "get_stream", return_value=None): - with mock.patch.object(m_file_utils, "unzip_stream_in_temp_dir", return_value=m_extracted_model_dir): - _deploy( - session=self.m_session, - model_id="provided_model_id", - service_func_name="mock_service_func", - model_zip_stage_path=m_model_zip_stage_path, - deployment_stage_path=m_deployment_stage_path, - target_method=constants.PREDICT, - **self.options, - ) - - # TODO: for some reason mock is not wired up properly - # m_model.load_model.assert_called_once_with(model_dir_path=m_extracted_model_dir, meta_only=True) - - m_deployment_class.assert_called_once_with( - session=self.m_session, - model_id="provided_model_id", - service_func_name="mock_service_func", - 
model_zip_stage_path=m_model_zip_stage_path, - deployment_stage_path=m_deployment_stage_path, - model_dir=mock.ANY, - target_method=constants.PREDICT, - options=mock.ANY, - ) - m_deployment.deploy.assert_called_once() + _deploy( + session=cast(session.Session, self.m_session), + model_id="provided_model_id", + model_meta=m_model_meta, + service_func_name="mock_service_func", + model_zip_stage_path=m_model_zip_stage_path, + deployment_stage_path=m_deployment_stage_path, + target_method=constants.PREDICT, + **self.options, + ) + m_deployment_class.assert_called_once_with( + session=self.m_session, + model_id="provided_model_id", + service_func_name="mock_service_func", + model_zip_stage_path=m_model_zip_stage_path, + deployment_stage_path=m_deployment_stage_path, + model_meta=m_model_meta, + target_method=constants.PREDICT, + options=mock.ANY, + ) + m_deployment.deploy.assert_called_once() + + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore - def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> None: + def test_deploy_with_empty_model_id( + self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock + ) -> None: + m_model_meta = m_model_meta_class.return_value with self.assertRaises(ValueError): _deploy( - session=self.m_session, + session=cast(session.Session, self.m_session), service_func_name="mock_service_func", model_id="", + model_meta=m_model_meta, model_zip_stage_path="@mock_model_zip_stage_path/model.zip", deployment_stage_path="@mock_model_deployment_stage_path", target_method=constants.PREDICT, @@ -86,14 +83,19 @@ def test_deploy_with_empty_model_id(self, m_deployment_class: mock.MagicMock) -> m_deployment_class.assert_not_called() + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore - def test_deploy_with_missing_required_options(self, m_deployment_class: mock.MagicMock) -> None: + def test_deploy_with_missing_required_options( + self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock + ) -> None: + m_model_meta = m_model_meta_class.return_value with self.assertRaisesRegex(ValueError, "compute_pool"): options: Dict[str, Any] = {} _deploy( - session=self.m_session, + session=cast(session.Session, self.m_session), service_func_name="mock_service_func", model_id="mock_model_id", + model_meta=m_model_meta, model_zip_stage_path="@mock_model_zip_stage_path/model.zip", deployment_stage_path="@mock_model_deployment_stage_path", target_method=constants.PREDICT, @@ -101,6 +103,118 @@ def test_deploy_with_missing_required_options(self, m_deployment_class: mock.Mag ) m_deployment_class.assert_not_called() + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore + def test_deploy_with_over_requested_gpus( + self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock + ) -> None: + m_model_meta = m_model_meta_class.return_value + with self.assertRaisesRegex(RuntimeError, "GPU request exceeds instance capability"): + self.m_session.add_mock_sql( + query=f"DESC COMPUTE POOL {self.options['compute_pool']}", + 
result=mock_data_frame.MockDataFrame( + [row.Row(name="MY_GPU_POOL", state="IDLE", min_nodes=1, max_nodes=1, instance_family="GPU_3")] + ), + ) + + _deploy( + session=cast(session.Session, self.m_session), + service_func_name="mock_service_func", + model_id="mock_model_id", + model_meta=m_model_meta, + model_zip_stage_path="@mock_model_zip_stage_path/model.zip", + deployment_stage_path="@mock_model_deployment_stage_path", + target_method=constants.PREDICT, + num_gpus=2, + **self.options, + ) + m_deployment_class.assert_not_called() + + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore + def test_deploy_with_over_requested_gpus_no_cuda( + self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock + ) -> None: + m_model_meta = m_model_meta_class.return_value + m_model_meta.cuda_version = None + with self.assertRaisesRegex( + ValueError, "You are requesting GPUs for models that do not use a GPU or does not have CUDA version set" + ): + self.m_session.add_mock_sql( + query=f"DESC COMPUTE POOL {self.options['compute_pool']}", + result=mock_data_frame.MockDataFrame( + [row.Row(name="MY_GPU_POOL", state="IDLE", min_nodes=1, max_nodes=1, instance_family="GPU_7")] + ), + ) + _deploy( + session=cast(session.Session, self.m_session), + service_func_name="mock_service_func", + model_id="mock_model_id", + model_meta=m_model_meta, + model_zip_stage_path="@mock_model_zip_stage_path/model.zip", + deployment_stage_path="@mock_model_deployment_stage_path", + target_method=constants.PREDICT, + num_gpus=2, + **self.options, + ) + m_deployment_class.assert_not_called() + + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.copy.deepcopy") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy.SnowServiceDeployment") # type: ignore + def test_deploy_with_gpu_validation_and_unknown_instance_type( + self, m_deployment_class: mock.MagicMock, m_model_meta_class: mock.MagicMock, m_deepcopy_func: mock.MagicMock + ) -> None: + m_deployment = m_deployment_class.return_value + m_model_meta = m_model_meta_class.return_value + m_model_meta.cuda_version = "11.7" + m_model_meta_deploy = m_deepcopy_func.return_value + m_model_zip_stage_path = "@mock_model_zip_stage_path/model.zip" + m_deployment_stage_path = "@mock_model_deployment_stage_path" + + unknown_instance_type = "GPU_UNKNOWN" + self.m_session.add_mock_sql( + query=f"DESC COMPUTE POOL {self.options['compute_pool']}", + result=mock_data_frame.MockDataFrame( + [row.Row(name="MY_GPU_POOL", state="IDLE", instance_family=unknown_instance_type)] + ), + ) + with self.assertLogs(level="INFO") as cm: + _deploy( + session=cast(session.Session, self.m_session), + model_id="provided_model_id", + model_meta=m_model_meta, + service_func_name="mock_service_func", + model_zip_stage_path=m_model_zip_stage_path, + deployment_stage_path=m_deployment_stage_path, + target_method=constants.PREDICT, + num_gpus=2, + **self.options, + ) + + self.assertListEqual( + cm.output, + [ + "INFO:root:num_workers has been defaulted to 1 when using GPU.", + ( + "WARNING:snowflake.ml.model._deploy_client.snowservice.deploy:Unknown " + "instance type: GPU_UNKNOWN, skipping GPU validation" + ), + ], + ) + + m_deployment_class.assert_called_once_with( + 
session=self.m_session, + model_id="provided_model_id", + model_meta=m_model_meta_deploy, + service_func_name="mock_service_func", + model_zip_stage_path=m_model_zip_stage_path, + deployment_stage_path=m_deployment_stage_path, + target_method=constants.PREDICT, + options=mock.ANY, + ) + m_deployment.deploy.assert_called_once() + @mock.patch( "snowflake.ml.model._deploy_client.snowservice.deploy." "snowservice_client.SnowServiceClient" ) # type: ignore @@ -108,36 +222,62 @@ def test_get_or_create_image_repo(self, m_snowservice_client_class: mock.MagicMo # Test when image repo url is provided. self.assertEqual( _get_or_create_image_repo( - self.m_session, image_repo="org-account.registry-dev.snowflakecomputing.com/DB/SCHEMA/REPO" + session=cast(session.Session, self.m_session), + service_func_name="func", + image_repo="org-account.registry-dev.snowflakecomputing.com/DB/SCHEMA/REPO", ), "org-account.registry-dev.snowflakecomputing.com/db/schema/repo", ) # Test when session is missing component(db/schema etc) in order to construct image repo url with self.assertRaises(RuntimeError): - _get_or_create_image_repo(self.m_session, image_repo=None) + _get_or_create_image_repo( + session=cast(session.Session, self.m_session), service_func_name="func", image_repo=None + ) # Test constructing image repo from session object self.m_session._conn = mock.MagicMock() self.m_session._conn._conn = Connection( host="account.org.us-west-2.aws.snowflakecomputing.com", account="account", database="DB", schema="SCHEMA" - ) # type: ignore + ) m_snowservice_client = m_snowservice_client_class.return_value expected = f"org-account.registry.snowflakecomputing.com/db/schema/{constants.SNOWML_IMAGE_REPO}" - self.assertEqual(_get_or_create_image_repo(self.m_session, image_repo=None), expected) - m_snowservice_client.create_image_repo.assert_called_with(constants.SNOWML_IMAGE_REPO) + self.assertEqual( + _get_or_create_image_repo( + session=cast(session.Session, self.m_session), service_func_name="func", image_repo=None + ), + expected, + ) + m_snowservice_client.create_image_repo.assert_called_with(f"DB.SCHEMA.{constants.SNOWML_IMAGE_REPO}") + + m_snowservice_client = m_snowservice_client_class.return_value + expected = ( + f"org-account.registry.snowflakecomputing.com/another_db/another_schema/{constants.SNOWML_IMAGE_REPO}" + ) + self.assertEqual( + _get_or_create_image_repo( + session=cast(session.Session, self.m_session), + service_func_name="another_db.another_schema.func", + image_repo=None, + ), + expected, + ) + m_snowservice_client.create_image_repo.assert_called_with( + f"another_db.another_schema.{constants.SNOWML_IMAGE_REPO}" + ) class SnowServiceDeploymentTestCase(absltest.TestCase): - def setUp(self) -> None: + @mock.patch("snowflake.ml.model._deploy_client.snowservice.deploy._model_meta.ModelMetadata") # type: ignore + def setUp(self, m_model_meta_class: mock.MagicMock) -> None: super().setUp() self.m_session = cast(session.Session, mock_session.MockSession(conn=None, test_case=self)) self.m_model_id = "provided_model_id" - self.m_service_func_name = "provided_service_func_name" + self.m_service_func_name = "mock_db.mock_schema.provided_service_func_name" self.m_model_zip_stage_path = "@provided_model_zip_stage_path/model.zip" self.m_deployment_stage_path = "@mock_model_deployment_stage_path" - self.m_model_dir = "tmp/local_model.zip" + self.m_model_meta = m_model_meta_class.return_value self.m_options = { "stage": "mock_stage", "compute_pool": "mock_compute_pool", @@ -148,13 +288,16 @@ def setUp(self) -> 
None: self.m_session, model_id=self.m_model_id, service_func_name=self.m_service_func_name, - model_dir=self.m_model_dir, + model_meta=self.m_model_meta, model_zip_stage_path=self.m_model_zip_stage_path, deployment_stage_path=self.m_deployment_stage_path, target_method=constants.PREDICT, options=deploy_options.SnowServiceDeployOptions.from_dict(self.m_options), ) + def test_service_name(self) -> None: + self.assertEqual(self.deployment._service_name, "mock_db.mock_schema.service_provided_model_id") + def test_deploy(self) -> None: with mock.patch.object( self.deployment, "_build_and_upload_image" diff --git a/snowflake/ml/model/_deploy_client/snowservice/instance_types.py b/snowflake/ml/model/_deploy_client/snowservice/instance_types.py new file mode 100644 index 00000000..11a9e09a --- /dev/null +++ b/snowflake/ml/model/_deploy_client/snowservice/instance_types.py @@ -0,0 +1,2 @@ +# Snowpark Container Service GPU instance type and corresponding GPU counts. +INSTANCE_TYPE_TO_GPU_COUNT = {"GPU_3": 1, "GPU_5": 1, "GPU_7": 4, "GPU_10": 8} diff --git a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template index 0dc9520a..cb043a2f 100644 --- a/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +++ b/snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template @@ -6,6 +6,7 @@ spec: MODEL_ZIP_STAGE_PATH: ${model_zip_stage_path} TARGET_METHOD: ${target_method} NUM_WORKERS: ${num_workers} + SNOWML_USE_GPU: ${use_gpu} readinessProbe: port: 5000 path: /health diff --git a/snowflake/ml/model/_deploy_client/utils/BUILD.bazel b/snowflake/ml/model/_deploy_client/utils/BUILD.bazel index 47207297..6271453c 100644 --- a/snowflake/ml/model/_deploy_client/utils/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/utils/BUILD.bazel @@ -15,6 +15,14 @@ py_library( ] ) +py_library( + name = "image_registry_client", + srcs = ["image_registry_client.py"], + deps = [ + "//snowflake/ml/_internal/utils:spcs_image_registry", + ] +) + py_test( name = "snowservice_client_test", srcs = ["snowservice_client_test.py"], @@ -23,3 +31,12 @@ py_test( "//snowflake/ml/test_utils:mock_session", ] ) + +py_test( + name = "image_registry_client_test", + srcs = ["image_registry_client_test.py"], + deps = [ + ":image_registry_client", + "//snowflake/ml/test_utils:mock_session", + ] +) diff --git a/snowflake/ml/model/_deploy_client/utils/constants.py b/snowflake/ml/model/_deploy_client/utils/constants.py index 1404db08..b43ed431 100644 --- a/snowflake/ml/model/_deploy_client/utils/constants.py +++ b/snowflake/ml/model/_deploy_client/utils/constants.py @@ -50,3 +50,8 @@ class ResourceStatus(Enum): DEV_IMAGE_REGISTRY_SUBDOMAIN = "registry-dev" MODEL_ENV_FOLDER = "env" CONDA_FILE = "conda.yaml" +IMAGE_BUILD_JOB_SPEC_TEMPLATE = "image_build_job_spec_template" +KANIKO_SHELL_SCRIPT_TEMPLATE = "kaniko_shell_script_template" +CONTEXT = "context" +KANIKO_SHELL_SCRIPT_NAME = "kaniko_shell_script_fixture.sh" +KANIKO_CONTAINER_NAME = "kaniko" diff --git a/snowflake/ml/model/_deploy_client/utils/image_registry_client.py b/snowflake/ml/model/_deploy_client/utils/image_registry_client.py new file mode 100644 index 00000000..6cfe85e2 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/utils/image_registry_client.py @@ -0,0 +1,109 @@ +import json +from urllib.parse import urlparse, urlunparse + +# library `requests` has known stubs but is not installed. 
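+# (Without the types-requests stubs installed, mypy reports 'Library stubs not installed for "requests"';
+# the inline ignore below silences that.)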
+import requests  # type: ignore
+
+from snowflake.ml._internal.utils import spcs_image_registry
+from snowflake.snowpark import Session
+
+
+class ImageRegistryClient:
+    """
+    A partial implementation of a simple SPCS image registry HTTP client. This client exists because a registry
+    "list image" system function is not yet available and there is no registry SDK.
+    """
+
+    def __init__(self, session: Session) -> None:
+        """Initialization
+
+        Args:
+            session: Snowpark session
+        """
+        self.session = session
+
+    def login(self, repo_url: str, registry_cred: str) -> str:
+        """Log in to the image registry.
+
+        Args:
+            repo_url: Image repo url.
+            registry_cred: Registry basic auth credential.
+
+        Returns:
+            Bearer token if login succeeded.
+
+        Raises:
+            RuntimeError: If login failed.
+        """
+        parsed_url = urlparse(repo_url)
+        scheme = parsed_url.scheme
+        host = parsed_url.netloc
+
+        login_path = "/login"  # Construct the login path
+        url_tuple = (scheme, host, login_path, "", "", "")
+        login_url = urlunparse(url_tuple)
+
+        resp = requests.get(login_url, headers={"Authorization": f"Basic {registry_cred}"})
+        if resp.status_code != 200:
+            raise RuntimeError("Failed to login to the repository", resp.text)
+
+        return str(json.loads(resp.text)["token"])
+
+    def convert_to_v2_head_manifests_url(self, full_image_name: str) -> str:
+        """Converts a full image name to a Docker Registry HTTP API V2 URL:
+        https://docs.docker.com/registry/spec/api/#existing-manifests
+
+        org-account.registry-dev.snowflakecomputing.com/db/schema/repo/image_name:tag becomes
+        https://org-account.registry-dev.snowflakecomputing.com/v2/db/schema/repo/image_name/manifests/tag
+
+        Args:
+            full_image_name: A string consisting of the image name and the image tag.
+
+        Returns:
+            Docker HTTP V2 URL for checking manifest existence.
+        """
+        scheme = "https"
+        full_image_name_parts = full_image_name.split(":")
+        assert len(full_image_name_parts) == 2, "full image name should include both image name and tag"
+
+        image_name = full_image_name_parts[0]
+        tag = full_image_name_parts[1]
+        image_name_parts = image_name.split("/")
+        domain = image_name_parts[0]
+        rest = "/".join(image_name_parts[1:])
+        path = f"/v2/{rest}/manifests/{tag}"
+        url_tuple = (scheme, domain, path, "", "", "")
+        return urlunparse(url_tuple)
+
+    def image_exists(self, full_image_name: str) -> bool:
+        """Check whether the image already exists in the registry.
+
+        Args:
+            full_image_name: Full image name, consisting of the image name and the image tag.
+
+        Returns:
+            True if the image already exists, else False.
+        """
+
+        with spcs_image_registry.generate_image_registry_credential(self.session) as registry_cred:
+            v2_api_url = self.convert_to_v2_head_manifests_url(full_image_name)
+            bearer_login = self.login(v2_api_url, registry_cred)
+
+            headers_v1 = {
+                "Authorization": f"Bearer {bearer_login}",
+                "Accept": "application/vnd.oci.image.manifest.v1+json",
+            }
+
+            headers_v2 = {
+                "Authorization": f"Bearer {bearer_login}",
+                "Accept": "application/vnd.docker.distribution.manifest.v2+json",
+            }
+            # Depending on the built image, the media type of the image manifest might be either
+            # application/vnd.oci.image.manifest.v1+json or application/vnd.docker.distribution.manifest.v2+json.
+            # Hence we need to check for both; otherwise it could result in a false negative.
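+            # (Manifest endpoints content-negotiate on the Accept header: a HEAD to /v2/<name>/manifests/<tag>
+            # returns 200 only when a manifest is available under the requested media type.)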
+ if requests.head(v2_api_url, headers=headers_v1).status_code == 200: + return True + elif requests.head(v2_api_url, headers=headers_v2).status_code == 200: + return True + return False diff --git a/snowflake/ml/model/_deploy_client/utils/image_registry_client_test.py b/snowflake/ml/model/_deploy_client/utils/image_registry_client_test.py new file mode 100644 index 00000000..8a353606 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/utils/image_registry_client_test.py @@ -0,0 +1,141 @@ +from typing import cast + +from absl.testing import absltest +from absl.testing.absltest import mock + +from snowflake.ml.model._deploy_client.utils import image_registry_client +from snowflake.ml.test_utils import mock_session +from snowflake.snowpark import session + + +class ImageRegistryClientTest(absltest.TestCase): + def setUp(self) -> None: + super().setUp() + self.m_session = mock_session.MockSession(conn=None, test_case=self) + self.client = image_registry_client.ImageRegistryClient(cast(session.Session, self.m_session)) + + @mock.patch("snowflake.ml.model._deploy_client.utils.image_registry_client.requests.get") # type: ignore + def test_successful_login(self, mock_get: mock.MagicMock) -> None: + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.text = '{"token": "dummy_token"}' + mock_get.return_value = mock_response + + repo_url = "https://org-account.registry-dev.snowflakecomputing.com/v2/db/schema/repo" + registry_cred = "dummy_credentials" + token = self.client.login(repo_url, registry_cred) + + # Assertions + self.assertEqual(token, "dummy_token") + mock_get.assert_called_once_with( + "https://org-account.registry-dev.snowflakecomputing.com/login", + headers={"Authorization": f"Basic {registry_cred}"}, + ) + + @mock.patch("snowflake.ml.model._deploy_client.utils.image_registry_client.requests.get") # type: ignore + def test_failed_login(self, mock_get: mock.MagicMock) -> None: + mock_response = mock.Mock() + mock_response.status_code = 401 + mock_response.text = "Unauthorized" + mock_get.return_value = mock_response + + repo_url = "https://org-account.registry-dev.snowflakecomputing.com/v2/db/schema/repo" + registry_cred = "dummy_credentials" + + with self.assertRaises(RuntimeError): + self.client.login(repo_url, registry_cred) + + mock_get.assert_called_once_with( + "https://org-account.registry-dev.snowflakecomputing.com/login", + headers={"Authorization": f"Basic {registry_cred}"}, + ) + + def test_convert_to_v2_head_manifests_url(self) -> None: + full_image_name = "org-account.registry-dev.snowflakecomputing.com/db/schema/repo/image:latest" + actual = self.client.convert_to_v2_head_manifests_url(full_image_name=full_image_name) + expected = "https://org-account.registry-dev.snowflakecomputing.com/v2/db/schema/repo/image/manifests/latest" + self.assertEqual(actual, expected) + + def test_convert_to_v2_head_manifests_url_with_invalid_full_image_name(self) -> None: + image_name_without_tag = "org-account.registry-dev.snowflakecomputing.com/db/schema/repo/image" + with self.assertRaises(AssertionError): + self.client.convert_to_v2_head_manifests_url(full_image_name=image_name_without_tag) + + @mock.patch( + "snowflake.ml.model._deploy_client.utils.image_registry_client." 
"ImageRegistryClient.login" + ) # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.utils.image_registry_client.requests.head") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.utils.image_registry_client.spcs_image_registry") # type: ignore + def test_image_exists( + self, mock_spcs_image_registry: mock.MagicMock, mock_head: mock.MagicMock, mock_login: mock.MagicMock + ) -> None: + mock_head_response = mock.Mock() + mock_head_response.status_code = 200 + mock_head.return_value = mock_head_response + + mock_bearer_token = "dummy_bearer_token" + mock_registry_cred = "dummy_registry_cred" + mock_login.return_value = mock_bearer_token + + with mock.patch.object(mock_spcs_image_registry, "generate_image_registry_credential") as m_generate: + m_generate.return_value.__enter__.return_value = mock_registry_cred + full_image_name = "org-account.registry-dev.snowflakecomputing.com/db/schema/repo/image:latest" + url = "https://org-account.registry-dev.snowflakecomputing.com/v2/db/schema/repo/image/manifests/latest" + self.assertEqual(self.client.image_exists(full_image_name=full_image_name), True) + mock_login.assert_called_once_with(url, mock_registry_cred) + mock_head.assert_called_once_with( + url, + headers={ + "Authorization": f"Bearer {mock_bearer_token}", + "Accept": "application/vnd.oci.image.manifest.v1+json", + }, + ) + + @mock.patch( + "snowflake.ml.model._deploy_client.utils.image_registry_client" ".ImageRegistryClient.login" + ) # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.utils.image_registry_client.requests.head") # type: ignore + @mock.patch("snowflake.ml.model._deploy_client.utils.image_registry_client.spcs_image_registry") # type: ignore + def test_image_exists_with_two_head_requests( + self, mock_spcs_image_registry: mock.MagicMock, mock_head: mock.MagicMock, mock_login: mock.MagicMock + ) -> None: + mock_head_response_success = mock.Mock() + mock_head_response_success.status_code = 200 + mock_head_response_fail = mock.Mock() + mock_head_response_fail.status_code = 404 + + # Simulate that first head request fails, but second succeeded with the different header. 
+ mock_head.side_effect = [mock_head_response_fail, mock_head_response_success] + + mock_bearer_token = "dummy_bearer_token" + mock_registry_cred = "dummy_registry_cred" + mock_login.return_value = mock_bearer_token + + with mock.patch.object(mock_spcs_image_registry, "generate_image_registry_credential") as m_generate: + m_generate.return_value.__enter__.return_value = mock_registry_cred + full_image_name = "org-account.registry-dev.snowflakecomputing.com/db/schema/repo/image:latest" + url = "https://org-account.registry-dev.snowflakecomputing.com/v2/db/schema/repo/image/manifests/latest" + self.assertEqual(self.client.image_exists(full_image_name=full_image_name), True) + mock_login.assert_called_once_with(url, mock_registry_cred) + self.assertEqual(mock_head.call_count, 2) + expected_calls = [ + mock.call( + url, + headers={ + "Authorization": f"Bearer {mock_bearer_token}", + "Accept": "application/vnd.oci.image.manifest.v1+json", + }, + ), + mock.call( + url, + headers={ + "Authorization": f"Bearer {mock_bearer_token}", + "Accept": "application/vnd.docker.distribution.manifest.v2+json", + }, + ), + ] + mock_head.assert_has_calls(expected_calls) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py index 2bf9de48..463be273 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client.py @@ -23,7 +23,7 @@ def __init__(self, session: Session) -> None: self.session = session def create_image_repo(self, repo_name: str) -> None: - self.session.sql(f"CREATE OR REPLACE IMAGE REPOSITORY {repo_name}").collect() + self.session.sql(f"CREATE IMAGE REPOSITORY IF NOT EXISTS {repo_name}").collect() def create_or_replace_service( self, @@ -57,6 +57,24 @@ def create_or_replace_service( logger.debug(f"Create service with SQL: \n {sql}") self.session.sql(sql).collect() + def create_job(self, compute_pool: str, spec_stage_location: str) -> str: + """ + Return the newly created Job ID. + + Args: + compute_pool: name of the compute pool + spec_stage_location: path to the stage location where the spec is located at. + + Returns: + job id in string format. + """ + assert spec_stage_location.startswith("@"), f"stage path should start with @, actual: {spec_stage_location}" + sql = f"execute service compute_pool={compute_pool} spec={spec_stage_location}" + logger.debug(f"Create job with SQL: \n {sql}") + res = self.session.sql(sql).collect() + job_id = res[0].status.split(" ")[-1].strip(".") + return str(job_id) + def _drop_service_if_exists(self, service_name: str) -> None: """Drop service if it already exists. @@ -72,6 +90,7 @@ def create_or_replace_service_function( *, endpoint_name: str = constants.PREDICT, path_at_service_endpoint: str = constants.PREDICT, + max_batch_rows: Optional[int] = None, ) -> None: """Create or replace service function. @@ -82,13 +101,19 @@ def create_or_replace_service_function( path_at_service_endpoint: Specify the path/route at the service endpoint. Multiple paths can exist for a given endpoint. 
For example, an inference server listening on port 5000 may have paths like "/predict" and "/monitoring + max_batch_rows: Specify the MAX_BATCH_ROWS property of the service function, if None, leave unset """ + max_batch_rows_sql = "" + if max_batch_rows: + max_batch_rows_sql = f"MAX_BATCH_ROWS = {max_batch_rows}" + sql = f""" CREATE OR REPLACE FUNCTION {service_func_name}(input OBJECT) RETURNS OBJECT SERVICE={service_name} ENDPOINT={endpoint_name} + {max_batch_rows_sql} AS '/{path_at_service_endpoint}' """ logger.debug(f"Create service function with SQL: \n {sql}") @@ -100,7 +125,8 @@ def block_until_resource_is_ready( resource_name: str, resource_type: constants.ResourceType, *, - max_retries: int = 60, + max_retries: int = 180, + container_name: str = constants.INFERENCE_SERVER_CONTAINER, retry_interval_secs: int = 10, ) -> None: """Blocks execution until the specified resource is ready. @@ -111,6 +137,7 @@ def block_until_resource_is_ready( Args: resource_name: Name of the resource. resource_type: Type of the resource. + container_name: The container to query the log from. max_retries: The maximum number of retries to check the resource readiness (default: 60). retry_interval_secs: The number of seconds to wait between each retry (default: 10). @@ -120,7 +147,15 @@ def block_until_resource_is_ready( """ for _ in range(max_retries): status = self.get_resource_status(resource_name=resource_name, resource_type=resource_type) - if status in [constants.ResourceStatus.READY, constants.ResourceStatus.DONE]: + if resource_type == constants.ResourceType.JOB and status == constants.ResourceStatus.DONE: + full_job_log = self.get_resource_log( + resource_name=resource_name, + resource_type=resource_type, + container_name=container_name, + ) + logger.debug(full_job_log) + return + elif resource_type == constants.ResourceType.SERVICE and status == constants.ResourceStatus.READY: return elif status in [ constants.ResourceStatus.FAILED, @@ -131,25 +166,31 @@ def block_until_resource_is_ready( error_log = self.get_resource_log( resource_name=resource_name, resource_type=resource_type, - container_name=constants.INFERENCE_SERVER_CONTAINER, + container_name=container_name, ) raise RuntimeError(f"{resource_type} {resource_name} failed. 
\n {error_log if error_log else ''}") time.sleep(retry_interval_secs) - raise RuntimeError("Resource never reached the ready/done state.") def get_resource_log( self, resource_name: str, resource_type: constants.ResourceType, container_name: str ) -> Optional[str]: - if resource_type != constants.ResourceType.SERVICE: + if resource_type == constants.ResourceType.SERVICE: + try: + row = self.session.sql( + f"CALL SYSTEM$GET_SNOWSERVICE_LOGS('{resource_name}', '0', '{container_name}')" + ).collect() + return str(row[0]["SYSTEM$GET_SNOWSERVICE_LOGS"]) + except Exception: + return None + elif resource_type == constants.ResourceType.JOB: + try: + row = self.session.sql(f"CALL SYSTEM$GET_JOB_LOGS('{resource_name}', '{container_name}')").collect() + return str(row[0]["SYSTEM$GET_JOB_LOGS"]) + except Exception: + return None + else: raise NotImplementedError(f"{resource_type.name} is not yet supported in get_resource_log function") - try: - row = self.session.sql( - f"CALL SYSTEM$GET_SNOWSERVICE_LOGS('{resource_name}', '0', '{container_name}')" - ).collect() - return str(row[0]["SYSTEM$GET_SNOWSERVICE_LOGS"]) - except Exception: - return None def get_resource_status( self, resource_name: str, resource_type: constants.ResourceType diff --git a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py index 93124595..58d1b8b7 100644 --- a/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py +++ b/snowflake/ml/model/_deploy_client/utils/snowservice_client_test.py @@ -71,6 +71,35 @@ def test_create_service_function(self) -> None: path_at_service_endpoint=m_path_at_endpoint, ) + def test_create_service_function_max_batch_rows(self) -> None: + m_service_func_name = "mock_service_func_name" + m_service_name = "mock_service_name" + m_endpoint_name = "mock_endpoint_name" + m_path_at_endpoint = "mock_route" + m_max_batch_rows = 1 + + m_sql = f""" + CREATE OR REPLACE FUNCTION {m_service_func_name}(input OBJECT) + RETURNS OBJECT + SERVICE={m_service_name} + ENDPOINT={m_endpoint_name} + MAX_BATCH_ROWS={m_max_batch_rows} + AS '/{m_path_at_endpoint}' + """ + + self.m_session.add_mock_sql( + query=m_sql, + result=mock_data_frame.MockDataFrame(collect_result=[]), + ) + + self.client.create_or_replace_service_function( + service_func_name=m_service_func_name, + service_name=m_service_name, + endpoint_name=m_endpoint_name, + path_at_service_endpoint=m_path_at_endpoint, + max_batch_rows=m_max_batch_rows, + ) + def test_get_service_status(self) -> None: row = snowpark.Row( **{ diff --git a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel index ee87b150..04dd00f8 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/warehouse/BUILD.bazel @@ -15,7 +15,6 @@ py_library( "//snowflake/ml/_internal:env", "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", - "//snowflake/ml/model:_model", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:type_hints", ], diff --git a/snowflake/ml/model/_deploy_client/warehouse/deploy.py b/snowflake/ml/model/_deploy_client/warehouse/deploy.py index ac174d78..66acb7c7 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/deploy.py +++ b/snowflake/ml/model/_deploy_client/warehouse/deploy.py @@ -1,4 +1,3 @@ -import os import posixpath import tempfile from types import ModuleType @@ -7,7 +6,7 @@ from typing_extensions import Unpack from 
snowflake.ml._internal import env_utils, file_utils -from snowflake.ml.model import _model, _model_meta, type_hints as model_types +from snowflake.ml.model import _model_meta, type_hints as model_types from snowflake.ml.model._deploy_client.warehouse import infer_template from snowflake.snowpark import session as snowpark_session, types as st @@ -15,18 +14,18 @@ def _deploy_to_warehouse( session: snowpark_session.Session, *, - model_dir_path: Optional[str] = None, - model_stage_file_path: Optional[str] = None, + model_stage_file_path: str, + model_meta: _model_meta.ModelMetadata, udf_name: str, target_method: str, **kwargs: Unpack[model_types.WarehouseDeployOptions], -) -> _model_meta.ModelMetadata: +) -> None: """Deploy the model to warehouse as UDF. Args: session: Snowpark session. - model_dir_path: Path to model directory. Exclusive with model_stage_file_path. - model_stage_file_path: Path to the stored model zip file in the stage. Exclusive with model_dir_path. + model_stage_file_path: Path to the stored model zip file in the stage. + model_meta: Model Metadata. udf_name: Name of the UDF. target_method: The name of the target method to be deployed. **kwargs: Options that control some features in generated udf code. @@ -37,34 +36,18 @@ def _deploy_to_warehouse( ValueError: Raised when target method does not exist in model. ValueError: Raised when confronting invalid stage location. - Returns: - The metadata of the model deployed. """ # TODO(SNOW-862576): Should remove check on ASCII encoding after SNOW-862576 fixed. - if model_dir_path: - model_dir_path = os.path.normpath(model_dir_path) - model_dir_name = os.path.basename(model_dir_path) - if not file_utils._able_ascii_encode(model_dir_name): - raise ValueError(f"Model file name {model_dir_name} cannot be encoded using ASCII. Please rename.") - extract_model_code = infer_template._EXTRACT_LOCAL_MODEL_CODE.format(model_dir_name=model_dir_name) - meta = _model.load_model(model_dir_path=model_dir_path, meta_only=True) - else: - assert model_stage_file_path is not None, "Unreachable assertion error." - model_stage_file_name = posixpath.basename(model_stage_file_path) - if not file_utils._able_ascii_encode(model_stage_file_name): - raise ValueError(f"Model file name {model_stage_file_name} cannot be encoded using ASCII. Please rename.") - - extract_model_code = infer_template._EXTRACT_STAGE_MODEL_CODE.format( - model_stage_file_name=model_stage_file_name - ) - meta = _model.load_model(session=session, model_stage_file_path=model_stage_file_path, meta_only=True) + model_stage_file_name = posixpath.basename(model_stage_file_path) + if not file_utils._able_ascii_encode(model_stage_file_name): + raise ValueError(f"Model file name {model_stage_file_name} cannot be encoded using ASCII. 
Please rename.") relax_version = kwargs.get("relax_version", False) - if target_method not in meta.signatures.keys(): + if target_method not in model_meta.signatures.keys(): raise ValueError(f"Target method {target_method} does not exist in model.") - final_packages = _get_model_final_packages(meta, session, relax_version=relax_version) + final_packages = _get_model_final_packages(model_meta, session, relax_version=relax_version) stage_location = kwargs.get("permanent_udf_stage_location", None) if stage_location: @@ -73,11 +56,8 @@ def _deploy_to_warehouse( raise ValueError(f"Invalid stage location {stage_location}.") with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f: - _write_UDF_py_file(f.file, extract_model_code, target_method, **kwargs) + _write_UDF_py_file(f.file, model_stage_file_name=model_stage_file_name, target_method=target_method, **kwargs) print(f"Generated UDF file is persisted at: {f.name}") - imports = ([model_dir_path] if model_dir_path else []) + ( - [model_stage_file_path] if model_stage_file_path else [] - ) class _UDFParams(TypedDict): file_path: str @@ -94,7 +74,7 @@ class _UDFParams(TypedDict): name=udf_name, return_type=st.PandasSeriesType(st.MapType(st.StringType(), st.VariantType())), input_types=[st.PandasDataFrameType([st.MapType()])], - imports=list(imports), + imports=[model_stage_file_path], packages=list(final_packages), ) if stage_location is None: # Temporary UDF @@ -108,12 +88,11 @@ class _UDFParams(TypedDict): ) print(f"{udf_name} is deployed to warehouse.") - return meta def _write_UDF_py_file( f: IO[str], - extract_model_code: str, + model_stage_file_name: str, target_method: str, **kwargs: Unpack[model_types.WarehouseDeployOptions], ) -> None: @@ -121,15 +100,13 @@ def _write_UDF_py_file( Args: f: File descriptor to write the python code. - extract_model_code: Code to extract the model. + model_stage_file_name: Model zip file name. target_method: The name of the target method to be deployed. **kwargs: Options that control some features in generated udf code. 
""" - keep_order = kwargs.get("keep_order", True) - udf_code = infer_template._UDF_CODE_TEMPLATE.format( - extract_model_code=extract_model_code, - keep_order_code=infer_template._KEEP_ORDER_CODE_TEMPLATE if keep_order else "", + model_stage_file_name=model_stage_file_name, + _KEEP_ORDER_COL_NAME=infer_template._KEEP_ORDER_COL_NAME, target_method=target_method, code_dir_name=_model_meta.ModelMetadata.MODEL_CODE_DIR, ) diff --git a/snowflake/ml/model/_deploy_client/warehouse/infer_template.py b/snowflake/ml/model/_deploy_client/warehouse/infer_template.py index b8669f48..5486d4e5 100644 --- a/snowflake/ml/model/_deploy_client/warehouse/infer_template.py +++ b/snowflake/ml/model/_deploy_client/warehouse/infer_template.py @@ -1,27 +1,5 @@ _KEEP_ORDER_COL_NAME = "_ID" -_KEEP_ORDER_CODE_TEMPLATE = f'predictions_df["{_KEEP_ORDER_COL_NAME}"] = input_df["{_KEEP_ORDER_COL_NAME}"]' -_EXTRACT_LOCAL_MODEL_CODE = """ -model_dir_name = '{model_dir_name}' -zip_model_path = os.path.join(import_dir, '{model_dir_name}.zip') -extracted = '/tmp/models' -extracted_model_dir_path = os.path.join(extracted, model_dir_name) -with FileLock(): - if not os.path.isdir(extracted_model_dir_path): - with zipfile.ZipFile(zip_model_path, 'r') as myzip: - myzip.extractall(extracted) -""" -_EXTRACT_STAGE_MODEL_CODE = """ -model_dir_name = os.path.splitext('{model_stage_file_name}')[0] -zip_model_path = os.path.join(import_dir, '{model_stage_file_name}') -extracted = '/tmp/models' -extracted_model_dir_path = os.path.join(extracted, model_dir_name) - -with FileLock(): - if not os.path.isdir(extracted_model_dir_path): - with zipfile.ZipFile(zip_model_path, 'r') as myzip: - myzip.extractall(extracted_model_dir_path) -""" _UDF_CODE_TEMPLATE = """ import pandas as pd import numpy as np @@ -48,11 +26,23 @@ def __exit__(self, type, value, traceback): IMPORT_DIRECTORY_NAME = "snowflake_import_directory" import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] -{extract_model_code} +model_dir_name = os.path.splitext('{model_stage_file_name}')[0] +zip_model_path = os.path.join(import_dir, '{model_stage_file_name}') +extracted = '/tmp/models' +extracted_model_dir_path = os.path.join(extracted, model_dir_name) + +with FileLock(): + if not os.path.isdir(extracted_model_dir_path): + with zipfile.ZipFile(zip_model_path, 'r') as myzip: + myzip.extractall(extracted_model_dir_path) sys.path.insert(0, os.path.join(extracted_model_dir_path, "{code_dir_name}")) from snowflake.ml.model import _model -model, meta = _model._load_model_for_deploy(extracted_model_dir_path) +# Backward for <= 1.0.5 +if hasattr(_model, "_load_model_for_deploy"): + model, meta = _model._load_model_for_deploy(extracted_model_dir_path) +else: + model, meta = _model._load(local_dir_path=extracted_model_dir_path, as_custom_model=True) features = meta.signatures["{target_method}"].inputs input_cols = [feature.name for feature in features] @@ -68,7 +58,8 @@ def infer(df): else: predictions_df = model.{target_method}(input_df[input_cols]) - {keep_order_code} + if "{_KEEP_ORDER_COL_NAME}" in input_df.columns: + predictions_df["{_KEEP_ORDER_COL_NAME}"] = input_df["{_KEEP_ORDER_COL_NAME}"] return predictions_df.to_dict("records") """ diff --git a/snowflake/ml/model/_deployer.py b/snowflake/ml/model/_deployer.py index 0a5231a8..3503d132 100644 --- a/snowflake/ml/model/_deployer.py +++ b/snowflake/ml/model/_deployer.py @@ -6,6 +6,7 @@ from snowflake.ml._internal.utils import identifier from snowflake.ml.model import ( + _model, deploy_platforms, model_signature, type_hints as model_types, 
@@ -26,12 +27,14 @@ class Deployment(TypedDict): Attributes: name: Name of the deployment. platform: Target platform to deploy the model. + target_method: Target method name. signature: The signature of the model method. options: Additional options when deploying the model. """ name: Required[str] platform: Required[deploy_platforms.TargetPlatform] + target_method: Required[str] signature: model_signature.ModelSignature options: Required[model_types.DeployOptions] @@ -42,31 +45,7 @@ def deploy( *, name: str, platform: deploy_platforms.TargetPlatform, - target_method: str, - model_dir_path: str, - options: Optional[model_types.DeployOptions], -) -> Optional[Deployment]: - """Create a deployment from a model in a local directory and deploy it to remote platform. - - Args: - session: Snowpark Connection Session. - name: Name of the deployment for the model. - platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. - model_dir_path: Directory of the model. - options: Additional options when deploying the model. - Each target platform will have their own specifications of options. - """ - ... - - -@overload -def deploy( - session: Session, - *, - name: str, - platform: deploy_platforms.TargetPlatform, - target_method: str, + target_method: Optional[str], model_stage_file_path: str, options: Optional[model_types.DeployOptions], ) -> Optional[Deployment]: @@ -76,7 +55,8 @@ def deploy( session: Snowpark Connection Session. name: Name of the deployment for the model. platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. + target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in + the model. model_stage_file_path: Model file in the stage to be deployed. Must be a file with .zip extension. options: Additional options when deploying the model. Each target platform will have their own specifications of options. @@ -91,7 +71,7 @@ def deploy( model_id: str, name: str, platform: deploy_platforms.TargetPlatform, - target_method: str, + target_method: Optional[str], model_stage_file_path: str, deployment_stage_path: str, options: Optional[model_types.DeployOptions], @@ -103,7 +83,8 @@ def deploy( model_id: Internal model ID string. name: Name of the deployment for the model. platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. + target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method in + the model. model_stage_file_path: Model file in the stage to be deployed. Must be a file with .zip extension. deployment_stage_path: Path to stage containing snowpark container service deployment artifacts. options: Additional options when deploying the model. @@ -117,9 +98,8 @@ def deploy( *, name: str, platform: deploy_platforms.TargetPlatform, - target_method: str, - model_dir_path: Optional[str] = None, - model_stage_file_path: Optional[str] = None, + model_stage_file_path: str, + target_method: Optional[str] = None, deployment_stage_path: Optional[str] = None, model_id: Optional[str] = None, options: Optional[model_types.DeployOptions], @@ -131,8 +111,8 @@ def deploy( model_id: Internal model ID string. name: Name of the deployment for the model. platform: Target platform to deploy the model. - target_method: The name of the target method to be deployed. - model_dir_path: Directory of the model. Exclusive with `model_stage_dir_path`. 
+        target_method: The name of the target method to be deployed. Can be omitted if there is only 1 target method
+            in the model.
         model_stage_file_path: Model file in the stage to be deployed. Exclusive with `model_dir_path`.
             Must be a file with .zip extension.
         deployment_stage_path: Path to stage containing deployment artifacts.
@@ -147,23 +127,26 @@ def deploy(
     Returns:
         The deployment information.
     """
-    if not ((model_stage_file_path is None) ^ (model_dir_path is None)):
-        raise ValueError(
-            "model_dir_path and model_stage_file_path both cannot be "
-            + f"{'None' if model_stage_file_path is None else 'specified'} at the same time."
-        )
 
     info = None
     if not options:
         options = {}
 
+    meta = _model.load_model(session=session, model_stage_file_path=model_stage_file_path, meta_only=True)
+
+    if target_method is None:
+        if len(meta.signatures.keys()) == 1:
+            target_method = list(meta.signatures.keys())[0]
+        else:
+            raise ValueError("target_method can only be omitted when the model has exactly one target method.")
+
     if platform == deploy_platforms.TargetPlatform.WAREHOUSE:
         try:
-            meta = warehouse_deploy._deploy_to_warehouse(
+            warehouse_deploy._deploy_to_warehouse(
                 session=session,
-                model_dir_path=model_dir_path,
                 model_stage_file_path=model_stage_file_path,
+                model_meta=meta,
                 udf_name=name,
                 target_method=target_method,
                 **options,
@@ -179,9 +162,10 @@ def deploy(
         if snowservice_constants.COMPUTE_POOL not in options:
             raise ValueError("Missing 'compute_pool' in options field for Snowpark container service deployment")
         try:
-            meta = snowservice_deploy._deploy(
+            snowservice_deploy._deploy(
                 session=session,
                 model_id=model_id,
+                model_meta=meta,
                 service_func_name=name,
                 model_zip_stage_path=model_stage_file_path,
                 deployment_stage_path=deployment_stage_path,
@@ -196,7 +180,7 @@ def deploy(
     signature = meta.signatures.get(target_method, None)
     if not signature:
         raise ValueError(f"Target method {target_method} does not exist in model.")
-    info = Deployment(name=name, platform=platform, signature=signature, options=options)
+    info = Deployment(name=name, platform=platform, target_method=target_method, signature=signature, options=options)
 
     return info
 
@@ -235,9 +219,6 @@ def predict(
         deployment: The deployment info to use for predict.
         X: The input dataframe.
 
-    Raises:
-        ValueError: Raised when the input is too large to use keep_order option.
-
     Returns:
         The output dataframe.
     """
@@ -245,26 +226,19 @@ def predict(
     # Get options
     INTERMEDIATE_OBJ_NAME = "tmp_result"
     sig = deployment["signature"]
-    keep_order = deployment["options"].get("keep_order", True)
-    output_with_input_features = deployment["options"].get("output_with_input_features", False)
 
     # Validate and prepare input
     if not isinstance(X, SnowparkDataFrame):
+        keep_order = True
+        output_with_input_features = False
         df = model_signature._convert_and_validate_local_data(X, sig.inputs)
         s_df = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(session, df, keep_order=keep_order)
     else:
+        keep_order = False
+        output_with_input_features = True
        model_signature._validate_snowpark_data(X, sig.inputs)
         s_df = X
 
-    if keep_order:
-        # ID is UINT64 type, this we should limit.
-        if s_df.count() > 2**64:
-            raise ValueError("Unable to keep order of a DataFrame with more than 2 ** 64 rows.")
-        s_df = s_df.with_column(
-            infer_template._KEEP_ORDER_COL_NAME,
-            F.monotonically_increasing_id(),
-        )
-
     # Infer and get intermediate result
     input_cols = []
     for col_name in s_df.columns:
@@ -291,8 +265,6 @@ def predict(
             F.col(INTERMEDIATE_OBJ_NAME)[infer_template._KEEP_ORDER_COL_NAME],
             ascending=True,
         )
-        if output_with_input_features:
-            df_res = df_res.drop(infer_template._KEEP_ORDER_COL_NAME)
 
     # Prepare the output
     output_cols = []
diff --git a/snowflake/ml/model/_env.py b/snowflake/ml/model/_env.py
index aa19df6c..56ac71c1 100644
--- a/snowflake/ml/model/_env.py
+++ b/snowflake/ml/model/_env.py
@@ -32,11 +32,11 @@ def save_conda_env_file(
     path = os.path.join(dir_path, _CONDA_ENV_FILE_NAME)
     env: Dict[str, Any] = dict()
     env["name"] = "snow-env"
-    env["channels"] = (
-        [_SNOWFLAKE_CONDA_CHANNEL_URL]
-        + [channel_name for channel_name, channel_deps in deps.items() if len(channel_deps) == 0]
-        + [_NODEFAULTS]
-    )
+    # Gather all channels in the dependencies, ordered by the number of packages that belong to each channel.
+    channels = list(dict(sorted(deps.items(), key=lambda item: len(item[1]), reverse=True)).keys())
+    if env_utils.DEFAULT_CHANNEL_NAME in channels:
+        channels.remove(env_utils.DEFAULT_CHANNEL_NAME)
+    env["channels"] = [_SNOWFLAKE_CONDA_CHANNEL_URL] + channels + [_NODEFAULTS]
     env["dependencies"] = [f"python=={python_version}"]
     for chan, reqs in deps.items():
         env["dependencies"].extend([f"{chan}::{str(req)}" if chan else str(req) for req in reqs])
@@ -101,7 +101,8 @@ def load_conda_env_file(path: str) -> Tuple[DefaultDict[str, List[requirements.R
 
     if len(channels) > 0:
         for channel in channels:
-            conda_dep_dict[channel] = []
+            if channel not in conda_dep_dict:
+                conda_dep_dict[channel] = []
 
     return conda_dep_dict, python_version
 
diff --git a/snowflake/ml/model/_env_test.py b/snowflake/ml/model/_env_test.py
index 3e7e7697..cb1b0b12 100644
--- a/snowflake/ml/model/_env_test.py
+++ b/snowflake/ml/model/_env_test.py
@@ -62,7 +62,7 @@ def test_conda_env_file(self) -> None:
             writed_yaml,
             {
                 "name": "snow-env",
-                "channels": ["https://repo.anaconda.com/pkgs/snowflake", "apple", "nodefaults"],
+                "channels": ["https://repo.anaconda.com/pkgs/snowflake", "conda-forge", "apple", "nodefaults"],
                 "dependencies": [
                     f"python=={snowml_env.PYTHON_VERSION}",
                     "numpy>=1.22.4",
diff --git a/snowflake/ml/model/_handlers/BUILD.bazel b/snowflake/ml/model/_handlers/BUILD.bazel
index 7fea5971..69071cab 100644
--- a/snowflake/ml/model/_handlers/BUILD.bazel
+++ b/snowflake/ml/model/_handlers/BUILD.bazel
@@ -132,3 +132,19 @@ py_library(
         "//snowflake/ml/model/_signatures:utils",
     ],
 )
+
+py_library(
+    name = "huggingface_pipeline",
+    srcs = ["huggingface_pipeline.py"],
+    deps = [
+        ":_base",
+        "//snowflake/ml/_internal:type_utils",
+        "//snowflake/ml/model:_model_meta",
+        "//snowflake/ml/model:custom_model",
+        "//snowflake/ml/model:model_signature",
+        "//snowflake/ml/model:type_hints",
+        "//snowflake/ml/model/models:huggingface_pipeline",
+        "//snowflake/ml/model/_signatures:utils",
+        "//snowflake/ml/model/_signatures:builtins_handler",
+    ],
+)
diff --git a/snowflake/ml/model/_handlers/_base.py b/snowflake/ml/model/_handlers/_base.py
index 50177ad7..1294462c 100644
--- a/snowflake/ml/model/_handlers/_base.py
+++ b/snowflake/ml/model/_handlers/_base.py
@@ -7,12 +7,22 @@
 
 class _ModelHandler(ABC, Generic[model_types._ModelType]):
-    """Provides handling for a given type of model defined by `type` class property."""
+    """
+    Provides handling for a given type of model defined by `type` class property.
+
+    handler_type: The string type that identifies the handler. Should be unique in the library.
+    MODEL_BLOB_FILE: Relative path of the model blob file in the model subdir.
+    MODEL_ARTIFACTS_DIR: Relative path of the model artifacts dir in the model subdir.
+    DEFAULT_TARGET_METHODS: Default target methods to log if none are specified for this kind of model.
+    is_auto_signature: Set to True if the handler can infer the model signature automatically, without requiring the
+        user to provide sample data or a model signature.
+    """
 
     handler_type = "_base"
     MODEL_BLOB_FILE = "model.pkl"
     MODEL_ARTIFACTS_DIR = "artifacts"
     DEFAULT_TARGET_METHODS = ["predict"]
+    is_auto_signature = False
 
     @staticmethod
     @abstractmethod
@@ -61,7 +71,10 @@ def _save_model(
     @staticmethod
     @abstractmethod
     def _load_model(
-        name: str, model_meta: _model_meta.ModelMetadata, model_blobs_dir_path: str
+        name: str,
+        model_meta: _model_meta.ModelMetadata,
+        model_blobs_dir_path: str,
+        **kwargs: Unpack[model_types.ModelLoadOption],
     ) -> model_types._ModelType:
         """Load the model into memory.
 
@@ -69,5 +82,6 @@ def _load_model(
             name: Name of the model.
             model_meta: The model metadata.
             model_blobs_dir_path: Directory path to the whole model.
+            kwargs: Options when loading the model.
         """
         ...
diff --git a/snowflake/ml/model/_handlers/custom.py b/snowflake/ml/model/_handlers/custom.py
index 4fc7c8fa..8062ff63 100644
--- a/snowflake/ml/model/_handlers/custom.py
+++ b/snowflake/ml/model/_handlers/custom.py
@@ -116,9 +116,17 @@ def get_prediction(
             },
         )
 
+        # For custom models, set cuda_version only when the user explicitly sets it.
+        cuda_version = kwargs.get("cuda_version", None)
+        if cuda_version:
+            model_meta.cuda_version = cuda_version
+
     @staticmethod
     def _load_model(
-        name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str
+        name: str,
+        model_meta: model_meta_api.ModelMetadata,
+        model_blobs_dir_path: str,
+        **kwargs: Unpack[model_types.ModelLoadOption],
     ) -> "custom_model.CustomModel":
         from snowflake.ml.model import custom_model
 
diff --git a/snowflake/ml/model/_handlers/huggingface_pipeline.py b/snowflake/ml/model/_handlers/huggingface_pipeline.py
new file mode 100644
index 00000000..f17d740a
--- /dev/null
+++ b/snowflake/ml/model/_handlers/huggingface_pipeline.py
@@ -0,0 +1,406 @@
+import json
+import os
+import warnings
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Type, Union
+
+import cloudpickle
+import numpy as np
+import pandas as pd
+from typing_extensions import TypeGuard, Unpack
+
+from snowflake.ml._internal import type_utils
+from snowflake.ml.model import (
+    _model_meta as model_meta_api,
+    custom_model,
+    model_signature,
+    type_hints as model_types,
+)
+from snowflake.ml.model._handlers import _base
+from snowflake.ml.model._signatures import (
+    builtins_handler,
+    utils as model_signature_utils,
+)
+from snowflake.ml.model.models import huggingface_pipeline
+
+if TYPE_CHECKING:
+    import transformers
+
+
+def get_requirements_from_task(task: str) -> List[model_meta_api.Dependency]:
+    # Text
+    if task in [
+        "conversational",
+        "fill-mask",
+        "ner",
+        "token-classification",
+        "question-answering",
+        "summarization",
+        "table-question-answering",
+        "text-classification",
+        "sentiment-analysis",
+        "text-generation",
+        "text2text-generation",
+        "zero-shot-classification",
+    ] or task.startswith("translation"):
+        return [model_meta_api.Dependency(conda_name="tokenizers", pip_name="tokenizers")]
+
+    return []
+
+
+class NumpyEncoder(json.JSONEncoder):
+    # A JSON encoder that keeps HuggingFace pipeline output JSON serializable by converting numpy
+    # number objects to plain Python numbers.
+    def default(self, z: object) -> object:
+        if isinstance(z, np.number):
+            if np.can_cast(z, np.int64, casting="safe"):
+                return int(z)
+            elif np.can_cast(z, np.float64, casting="safe"):
+                return z.astype(np.float64)
+        return super().default(z)
+
+
+class _HuggingFacePipelineHandler(
+    _base._ModelHandler[Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]]
+):
+    """Handler for HuggingFace pipeline objects."""
+
+    handler_type = "huggingface_pipeline"
+    MODEL_BLOB_FILE = "model"
+    ADDITIONAL_CONFIG_FILE = "pipeline_config.pt"
+    DEFAULT_TARGET_METHODS = ["__call__"]
+    is_auto_signature = True
+
+    @staticmethod
+    def can_handle(
+        model: model_types.SupportedModelType,
+    ) -> TypeGuard[Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]]:
+        if type_utils.LazyType("transformers.Pipeline").isinstance(model):
+            return True
+        if isinstance(model, huggingface_pipeline.HuggingFacePipelineModel):
+            return True
+        return False
+
+    @staticmethod
+    def cast_model(
+        model: model_types.SupportedModelType,
+    ) -> Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]:
+        try:
+            if isinstance(model, huggingface_pipeline.HuggingFacePipelineModel):
+                raise ImportError
+            else:
+                import transformers
+        except ImportError:
+            assert isinstance(model, huggingface_pipeline.HuggingFacePipelineModel)
+            return model
+        else:
+            assert isinstance(model, transformers.Pipeline)
+            return model
+
+    @staticmethod
+    def _save_model(
+        name: str,
+        model: Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"],
+        model_meta: model_meta_api.ModelMetadata,
+        model_blobs_dir_path: str,
+        sample_input: Optional[model_types.SupportedDataType] = None,
+        is_sub_model: Optional[bool] = False,
+        **kwargs: Unpack[model_types.HuggingFaceSaveOptions],
+    ) -> None:
+        if type_utils.LazyType("transformers.Pipeline").isinstance(model):
+            task = model.task  # type:ignore[attr-defined]
+            framework = model.framework  # type:ignore[attr-defined]
+            batch_size = model._batch_size  # type:ignore[attr-defined]
+        else:
+            assert isinstance(model, huggingface_pipeline.HuggingFacePipelineModel)
+            task = model.task
+            framework = getattr(model, "framework", None)
+            batch_size = getattr(model, "batch_size", None)
+
+        if type_utils.LazyType("transformers.Pipeline").isinstance(model):
+            params = {
+                **model._preprocess_params,  # type:ignore[attr-defined]
+                **model._forward_params,  # type:ignore[attr-defined]
+                **model._postprocess_params,  # type:ignore[attr-defined]
+            }
+        else:
+            assert isinstance(model, huggingface_pipeline.HuggingFacePipelineModel)
+            params = {**model.__dict__, **model.model_kwargs}
+
+        inferred_pipe_sig = model_signature_utils.huggingface_pipeline_signature_auto_infer(task, params=params)
+
+        if not is_sub_model:
+            target_methods = model_meta_api._get_target_methods(
+                model=model,
+                target_methods=kwargs.pop("target_methods", None),
+                default_target_methods=_HuggingFacePipelineHandler.DEFAULT_TARGET_METHODS,
+            )
+
+            if model_meta._signatures is not None:
+                model_meta_api._validate_target_methods(model, list(model_meta.signatures.keys()))
+            else:
+                model_meta_api._validate_target_methods(model, target_methods)
+                if sample_input is not None:
+                    warnings.warn(
+                        "Inferring model signature from sample input for huggingface pipeline is not supported.
" + + "Model signature will automatically be inferred from pipeline task. " + + "Or, you could specify model signature manually." + ) + if inferred_pipe_sig is None: + raise NotImplementedError(f"Cannot auto infer the signature of pipeline for task {task}") + + model_meta._signatures = {"__call__": inferred_pipe_sig} + + model_blob_path = os.path.join(model_blobs_dir_path, name) + os.makedirs(model_blob_path, exist_ok=True) + + if type_utils.LazyType("transformers.Pipeline").isinstance(model): + model.save_pretrained( # type:ignore[attr-defined] + os.path.join(model_blob_path, _HuggingFacePipelineHandler.MODEL_BLOB_FILE) + ) + pipeline_params = { + "_batch_size": model._batch_size, # type:ignore[attr-defined] + "_num_workers": model._num_workers, # type:ignore[attr-defined] + "_preprocess_params": model._preprocess_params, # type:ignore[attr-defined] + "_forward_params": model._forward_params, # type:ignore[attr-defined] + "_postprocess_params": model._postprocess_params, # type:ignore[attr-defined] + } + with open( + os.path.join( + model_blob_path, + _HuggingFacePipelineHandler.MODEL_BLOB_FILE, + _HuggingFacePipelineHandler.ADDITIONAL_CONFIG_FILE, + ), + "wb", + ) as f: + cloudpickle.dump(pipeline_params, f) + else: + with open(os.path.join(model_blob_path, _HuggingFacePipelineHandler.MODEL_BLOB_FILE), "wb") as f: + cloudpickle.dump(model, f) + model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + + base_meta = model_meta_api._ModelBlobMetadata( + name=name, + model_type=_HuggingFacePipelineHandler.handler_type, + path=_HuggingFacePipelineHandler.MODEL_BLOB_FILE, + options={ + "task": task, + "accelerate_mixed_precision_config": kwargs.get("accelerate_mix_precision_config", "fp16"), + "batch_size": batch_size if batch_size is not None else 1, + }, + ) + model_meta.models[name] = base_meta + + pkgs_requirements = [ + model_meta_api.Dependency(conda_name="transformers", pip_name="transformers"), + ] + get_requirements_from_task(task) + if framework is None or framework == "pt": + pkgs_requirements.append(model_meta_api.Dependency(conda_name="pytorch", pip_name="torch")) + elif framework == "tf": + pkgs_requirements.append(model_meta_api.Dependency(conda_name="tensorflow", pip_name="tensorflow")) + model_meta._include_if_absent(pkgs_requirements) + + @staticmethod + def _get_device_config(mixed_precision: str = "fp16") -> Dict[str, str]: + from accelerate import utils + + device_config = {} + utils.write_basic_config(mixed_precision=mixed_precision) + device_config["device_map"] = "auto" + + return device_config + + @staticmethod + def _load_model( + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> Union[huggingface_pipeline.HuggingFacePipelineModel, "transformers.Pipeline"]: + model_blob_path = os.path.join(model_blobs_dir_path, name) + if not hasattr(model_meta, "models"): + raise ValueError("Ill model metadata found.") + model_blobs_metadata = model_meta.models + if name not in model_blobs_metadata: + raise ValueError(f"Blob of model {name} does not exist.") + model_blob_metadata = model_blobs_metadata[name] + model_blob_filename = model_blob_metadata.path + model_blob_options = model_blob_metadata.options + + model_blob_file_or_dir_path = os.path.join(model_blob_path, model_blob_filename) + if os.path.isdir(model_blob_file_or_dir_path): + import transformers + + if "task" not in model_blob_options: + raise ValueError("`task` must be specified in options.") + 
+            with open(
+                os.path.join(model_blob_file_or_dir_path, _HuggingFacePipelineHandler.ADDITIONAL_CONFIG_FILE), "rb"
+            ) as f:
+                pipeline_params = cloudpickle.load(f)
+
+            if kwargs.get("use_gpu", False):
+                device_config = _HuggingFacePipelineHandler._get_device_config(
+                    model_blob_metadata.options["accelerate_mixed_precision_config"]
+                )
+            else:
+                device_config = {}
+
+            m = transformers.pipeline(model_blob_options["task"], model=model_blob_file_or_dir_path, **device_config)
+
+            m.__dict__.update(pipeline_params)
+
+        else:
+            assert os.path.isfile(model_blob_file_or_dir_path)
+            with open(model_blob_file_or_dir_path, "rb") as f:
+                m = cloudpickle.load(f)
+            assert isinstance(m, huggingface_pipeline.HuggingFacePipelineModel)
+            if (
+                getattr(m, "device", None) is None
+                and getattr(m, "device_map", None) is None
+                and kwargs.get("use_gpu", False)
+            ):
+                m.__dict__.update(
+                    _HuggingFacePipelineHandler._get_device_config(
+                        model_blob_metadata.options["accelerate_mixed_precision_config"]
+                    )
+                )
+
+            if getattr(m, "torch_dtype", None) is None and kwargs.get("use_gpu", False):
+                m.__dict__.update(torch_dtype="auto")
+        return m
+
+    @staticmethod
+    def _load_as_custom_model(
+        name: str,
+        model_meta: model_meta_api.ModelMetadata,
+        model_blobs_dir_path: str,
+        **kwargs: Unpack[model_types.ModelLoadOption],
+    ) -> custom_model.CustomModel:
+        """Create a custom model class wrap for unified interface when being deployed. The predict method will be
+        re-targeted based on target_method metadata.
+
+        Args:
+            name: Name of the model.
+            model_meta: The model metadata.
+            model_blobs_dir_path: Directory path to the whole model.
+            kwargs: Options when loading the model.
+
+        Returns:
+            The model object as a custom model.
+        """
+
+        import transformers
+
+        from snowflake.ml.model import custom_model
+
+        def _create_custom_model(
+            raw_model: "transformers.Pipeline",
+            model_meta: model_meta_api.ModelMetadata,
+        ) -> Type[custom_model.CustomModel]:
+            def fn_factory(
+                raw_model: "transformers.Pipeline",
+                signature: model_signature.ModelSignature,
+                target_method: str,
+            ) -> Callable[[custom_model.CustomModel, pd.DataFrame], pd.DataFrame]:
+                @custom_model.inference_api
+                def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame:
+                    # These 3 zero-shot classification pipelines cannot take a list of dicts as input like
+                    # other multi-input pipelines, so they are handled separately.
+                    if isinstance(
+                        raw_model,
+                        (
+                            transformers.ZeroShotAudioClassificationPipeline,
+                            transformers.ZeroShotClassificationPipeline,
+                            transformers.ZeroShotImageClassificationPipeline,
+                        ),
+                    ):
+                        temp_res = X.apply(
+                            lambda row: getattr(raw_model, target_method)(
+                                row[signature.inputs[0].name], row["candidate_labels"]
+                            ),
+                            axis=1,
+                        ).to_list()
+                    else:
+                        # For other pipelines, we can pass the whole dataframe as a list.
+                        # Some of them may need some conversion first.
+                        if isinstance(raw_model, transformers.ConversationalPipeline):
+                            input_data = [
+                                transformers.Conversation(
+                                    text=conv_data["user_inputs"][0],
+                                    past_user_inputs=conv_data["user_inputs"][1:],
+                                    generated_responses=conv_data["generated_responses"],
+                                )
+                                for conv_data in X.to_dict("records")
+                            ]
+                        elif len(signature.inputs) == 1:
+                            input_data = X.to_dict("list")[signature.inputs[0].name]
+                        else:
+                            if isinstance(raw_model, transformers.TableQuestionAnsweringPipeline):
+                                X["table"] = X["table"].apply(json.loads)
+
+                            input_data = X.to_dict("records")
+                        temp_res = getattr(raw_model, target_method)(input_data)
+
+                    # Some HuggingFace pipelines omit the outer list when there is only 1 input, which leaves
+                    # the output misaligned with the auto-inferred signature.
+                    # If the output is a dict, we can blindly wrap it in a list.
+                    # Otherwise, creating a pandas DataFrame from it would not succeed.
+                    if isinstance(temp_res, (dict, transformers.Conversation)) or (
+                        # Some pipelines are expected to generate a list of dicts per input.
+                        # When the outer list is omitted, the result becomes a list of dicts instead of a list
+                        # of lists of dicts. We need to distinguish them from the pipelines that are designed
+                        # to output a dict per input, so we check the pipeline type.
+                        isinstance(raw_model, (transformers.FillMaskPipeline, transformers.QuestionAnsweringPipeline))
+                        and X.shape[0] == 1
+                        and isinstance(temp_res[0], dict)
+                    ):
+                        temp_res = [temp_res]
+
+                    if len(temp_res) == 0:
+                        return pd.DataFrame()
+
+                    if isinstance(raw_model, transformers.ConversationalPipeline):
+                        temp_res = [[conv.generated_responses] for conv in temp_res]
+
+                    # Concatenate the outputs of pipelines that return a list per input.
+                    if builtins_handler.ListOfBuiltinHandler.can_handle(temp_res):
+                        res = builtins_handler.ListOfBuiltinHandler.convert_to_df(temp_res)
+                    elif isinstance(temp_res[0], dict):
+                        res = pd.DataFrame(temp_res)
+                    elif isinstance(temp_res[0], list):
+                        res = pd.DataFrame([json.dumps(output, cls=NumpyEncoder) for output in temp_res])
+                    else:
+                        raise ValueError(f"Cannot parse output {temp_res} from pipeline object")
+
+                    return model_signature_utils.rename_pandas_df(data=res, features=signature.outputs)
+
+                return fn
+
+            type_method_dict = {}
+            for target_method_name, sig in model_meta.signatures.items():
+                type_method_dict[target_method_name] = fn_factory(raw_model, sig, target_method_name)
+
+            _HFPipelineModel = type(
+                "_HFPipelineModel",
+                (custom_model.CustomModel,),
+                type_method_dict,
+            )
+
+            return _HFPipelineModel
+
+        raw_model = _HuggingFacePipelineHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs)
+        if isinstance(raw_model, huggingface_pipeline.HuggingFacePipelineModel):
+            pipe = transformers.pipeline(**raw_model.__dict__)
+        else:
+            pipe = raw_model
+
+        pipe.binary_output = False
+
+        # To enable batch_size > 1 for LLMs.
+        if hasattr(pipe, "tokenizer") and pipe.tokenizer.pad_token_id is None:
+            pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id
+
+        _HFPipelineModel = _create_custom_model(pipe, model_meta)
+        hg_pipe_model = _HFPipelineModel(custom_model.ModelContext())
+
+        return hg_pipe_model
diff --git a/snowflake/ml/model/_handlers/mlflow.py b/snowflake/ml/model/_handlers/mlflow.py
index c0393e00..bfa5156f 100644
--- a/snowflake/ml/model/_handlers/mlflow.py
+++ b/snowflake/ml/model/_handlers/mlflow.py
@@ -136,6 +136,7 @@ class _MLFlowHandler(_base._ModelHandler["mlflow.pyfunc.PyFuncModel"]):
     MODEL_BLOB_FILE = "model"
     _DEFAULT_TARGET_METHOD = "predict"
     DEFAULT_TARGET_METHODS
= [_DEFAULT_TARGET_METHOD] + is_auto_signature = True @staticmethod def can_handle( @@ -220,7 +221,10 @@ def _save_model( @staticmethod def _load_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> "mlflow.pyfunc.PyFuncModel": import mlflow @@ -252,7 +256,10 @@ def _load_model( @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -261,6 +268,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. Returns: The model object as a custom model. @@ -303,7 +311,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _MLFlowModel - raw_model = _MLFlowHandler._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _MLFlowHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _MLFlowModel = _create_custom_model(raw_model, model_meta) mlflow_model = _MLFlowModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/pytorch.py b/snowflake/ml/model/_handlers/pytorch.py index da371492..c2d5b267 100644 --- a/snowflake/ml/model/_handlers/pytorch.py +++ b/snowflake/ml/model/_handlers/pytorch.py @@ -84,7 +84,10 @@ def get_prediction( target_method = getattr(model, target_method_name, None) assert callable(target_method) with torch.no_grad(): - predictions_df = target_method(sample_input) + predictions_df = target_method(*sample_input) + + if isinstance(predictions_df, torch.Tensor): + predictions_df = [predictions_df] return predictions_df model_meta = model_meta_api._validate_signature( @@ -108,9 +111,14 @@ def get_prediction( model_meta.models[name] = base_meta model_meta._include_if_absent([model_meta_api.Dependency(conda_name="pytorch", pip_name="torch")]) + model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + @staticmethod def _load_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> "torch.nn.Module": import torch @@ -125,11 +133,18 @@ def _load_model( with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: m = torch.load(f) assert isinstance(m, torch.nn.Module) + + if kwargs.get("use_gpu", False): + m = m.cuda() + return m @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -138,6 +153,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. 
+ kwargs: Options when loading the model. Returns: The model object as a custom model. @@ -163,8 +179,15 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: raw_model.eval() t = pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(X, signature.inputs) + if kwargs.get("use_gpu", False): + t = [element.cuda() for element in t] + with torch.no_grad(): - res = getattr(raw_model, target_method)(t) + res = getattr(raw_model, target_method)(*t) + + if isinstance(res, torch.Tensor): + res = [res] + return model_signature_utils.rename_pandas_df( data=pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(res), features=signature.outputs ) @@ -183,7 +206,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _PyTorchModel - raw_model = _PyTorchHandler._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _PyTorchHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _PyTorchModel = _create_custom_model(raw_model, model_meta) pytorch_model = _PyTorchModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/sklearn.py b/snowflake/ml/model/_handlers/sklearn.py index 3e87ddd5..af76350c 100644 --- a/snowflake/ml/model/_handlers/sklearn.py +++ b/snowflake/ml/model/_handlers/sklearn.py @@ -110,7 +110,10 @@ def get_prediction( @staticmethod def _load_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> Union["sklearn.base.BaseEstimator", "sklearn.pipeline.Pipeline"]: model_blob_path = os.path.join(model_blobs_dir_path, name) if not hasattr(model_meta, "models"): @@ -131,7 +134,10 @@ def _load_model( @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -140,6 +146,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. Returns: The model object as a custom model. 
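For reference, a hedged sketch of the calling convention the PyTorch handler changes above assume: one or more tensors are passed as positional arguments, and a bare tensor result is normalized to a sequence. The TwoInputModule below is illustrative, not a module from this repository.

import torch

class TwoInputModule(torch.nn.Module):
    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return a + b

model = TwoInputModule()
t = [torch.ones(2), torch.zeros(2)]  # one tensor per input feature column
with torch.no_grad():
    res = model(*t)  # positional arguments, mirroring target_method(*t)
if isinstance(res, torch.Tensor):
    res = [res]  # wrap a bare tensor into a sequence, as the handler does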
@@ -182,7 +189,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _SKLModel - raw_model = _SKLModelHandler._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _SKLModelHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _SKLModel = _create_custom_model(raw_model, model_meta) skl_model = _SKLModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/snowmlmodel.py b/snowflake/ml/model/_handlers/snowmlmodel.py index fe8f47bf..c4dfc131 100644 --- a/snowflake/ml/model/_handlers/snowmlmodel.py +++ b/snowflake/ml/model/_handlers/snowmlmodel.py @@ -29,6 +29,7 @@ class _SnowMLModelHandler(_base._ModelHandler["BaseEstimator"]): handler_type = "snowml" DEFAULT_TARGET_METHODS = ["predict", "transform", "predict_proba", "predict_log_proba", "decision_function"] + is_auto_signature = True @staticmethod def can_handle( @@ -115,7 +116,12 @@ def get_prediction( model_meta._include_if_absent(_include_if_absent_pkgs) @staticmethod - def _load_model(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str) -> "BaseEstimator": + def _load_model( + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], + ) -> "BaseEstimator": model_blob_path = os.path.join(model_blobs_dir_path, name) if not hasattr(model_meta, "models"): raise ValueError("Ill model metadata found.") @@ -134,7 +140,10 @@ def _load_model(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -143,6 +152,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. Returns: The model object as a custom model. 
@@ -185,7 +195,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _SnowMLModel - raw_model = _SnowMLModelHandler._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _SnowMLModelHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _SnowMLModel = _create_custom_model(raw_model, model_meta) snowml_model = _SnowMLModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/tensorflow.py b/snowflake/ml/model/_handlers/tensorflow.py index a8d985cd..c67b4d05 100644 --- a/snowflake/ml/model/_handlers/tensorflow.py +++ b/snowflake/ml/model/_handlers/tensorflow.py @@ -87,7 +87,11 @@ def get_prediction( assert callable(target_method) for tensor in sample_input: tensorflow.stop_gradient(tensor) - predictions_df = target_method(sample_input) + predictions_df = target_method(*sample_input) + + if isinstance(predictions_df, (tensorflow.Tensor, tensorflow.Variable, np.ndarray)): + predictions_df = [predictions_df] + return predictions_df model_meta = model_meta_api._validate_signature( @@ -111,9 +115,14 @@ def get_prediction( model_meta.models[name] = base_meta model_meta._include_if_absent([model_meta_api.Dependency(conda_name="tensorflow", pip_name="tensorflow")]) + model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + @staticmethod def _load_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> "tensorflow.Module": import tensorflow @@ -132,7 +141,10 @@ def _load_model( @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -141,6 +153,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. Returns: The model object as a custom model. 
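The TensorFlow handler above follows the same positional-argument convention; a hedged sketch with an illustrative tf.Module (not from this repository):

import tensorflow as tf

class Adder(tf.Module):
    @tf.function
    def __call__(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
        return a + b

m = Adder()
t = [tf.constant([1.0, 2.0]), tf.constant([3.0, 4.0])]
res = m(*t)  # positional arguments, mirroring target_method(*t)
if isinstance(res, (tf.Tensor, tf.Variable)):
    res = [res]  # wrap a bare tensor into a sequence, matching the handler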
@@ -167,7 +180,11 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: for tensor in t: tensorflow.stop_gradient(tensor) - res = getattr(raw_model, target_method)(t) + res = getattr(raw_model, target_method)(*t) + + if isinstance(res, (tensorflow.Tensor, tensorflow.Variable, np.ndarray)): + res = [res] + if isinstance(res, list) and len(res) > 0 and isinstance(res[0], np.ndarray): # In case of running on CPU, it will return numpy array df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df(res) @@ -189,7 +206,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _TensorFlowModel - raw_model = _TensorFlowHandler()._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _TensorFlowHandler()._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _TensorFlowModel = _create_custom_model(raw_model, model_meta) tf_model = _TensorFlowModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/torchscript.py b/snowflake/ml/model/_handlers/torchscript.py index 27654496..e263f1e5 100644 --- a/snowflake/ml/model/_handlers/torchscript.py +++ b/snowflake/ml/model/_handlers/torchscript.py @@ -80,7 +80,11 @@ def get_prediction( target_method = getattr(model, target_method_name, None) assert callable(target_method) with torch.no_grad(): - predictions_df = target_method(sample_input) + predictions_df = target_method(*sample_input) + + if isinstance(predictions_df, torch.Tensor): + predictions_df = [predictions_df] + return predictions_df model_meta = model_meta_api._validate_signature( @@ -101,9 +105,14 @@ def get_prediction( model_meta.models[name] = base_meta model_meta._include_if_absent([model_meta_api.Dependency(conda_name="pytorch", pip_name="torch")]) + model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + @staticmethod def _load_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> "torch.jit.ScriptModule": # type:ignore[name-defined] import torch @@ -118,11 +127,18 @@ def _load_model( with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: m = torch.jit.load(f) # type:ignore[attr-defined] assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] + + if kwargs.get("use_gpu", False): + m = m.cuda() + return m @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -131,6 +147,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. Returns: The model object as a custom model. 
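For the TorchScript path, a hedged sketch of scripting a module and the optional GPU move the loader performs when the use_gpu load option is set; the Doubler module is illustrative.

import torch

class Doubler(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2

scripted = torch.jit.script(Doubler())  # produces a torch.jit.ScriptModule
if torch.cuda.is_available():  # stand-in for the use_gpu load option
    scripted = scripted.cuda()
out = scripted(torch.ones(3))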
@@ -157,8 +174,15 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: t = pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(X, signature.inputs) + if kwargs.get("use_gpu", False): + t = [element.cuda() for element in t] + with torch.no_grad(): - res = getattr(raw_model, target_method)(t) + res = getattr(raw_model, target_method)(*t) + + if isinstance(res, torch.Tensor): + res = [res] + return model_signature_utils.rename_pandas_df( data=pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(res), features=signature.outputs ) @@ -177,7 +201,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _TorchScriptModel - raw_model = _TorchScriptHandler._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _TorchScriptHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _TorchScriptModel = _create_custom_model(raw_model, model_meta) torchscript_model = _TorchScriptModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_handlers/xgboost.py b/snowflake/ml/model/_handlers/xgboost.py index d3143518..7a43bc72 100644 --- a/snowflake/ml/model/_handlers/xgboost.py +++ b/snowflake/ml/model/_handlers/xgboost.py @@ -110,9 +110,14 @@ def get_prediction( ] ) + model_meta.cuda_version = kwargs.get("cuda_version", model_meta_api._DEFAULT_CUDA_VERSION) + @staticmethod def _load_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> Union["xgboost.Booster", "xgboost.XGBModel"]: import xgboost @@ -128,14 +133,24 @@ def _load_model( if not xgb_estimator_type or not hasattr(xgboost, xgb_estimator_type): raise ValueError("Type of XGB estimator unknown or illegal.") m = getattr(xgboost, xgb_estimator_type)() + m.load_model(os.path.join(model_blob_path, model_blob_filename)) + + if kwargs.get("use_gpu", False): + gpu_params = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"} + if isinstance(m, xgboost.Booster): + m.set_param(gpu_params) + elif isinstance(m, xgboost.XGBModel): + m.set_params(**gpu_params) assert isinstance(m, xgboost.Booster) or isinstance(m, xgboost.XGBModel) - m.load_model(os.path.join(model_blob_path, model_blob_filename)) return m @staticmethod def _load_as_custom_model( - name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str + name: str, + model_meta: model_meta_api.ModelMetadata, + model_blobs_dir_path: str, + **kwargs: Unpack[model_types.ModelLoadOption], ) -> custom_model.CustomModel: """Create a custom model class wrap for unified interface when being deployed. The predict method will be re-targeted based on target_method metadata. @@ -144,6 +159,7 @@ def _load_as_custom_model( name: Name of the model. model_meta: The model metadata. model_blobs_dir_path: Directory path to the whole model. + kwargs: Options when loading the model. Returns: The model object as a custom model. 
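The GPU branch in `_load_model` above has to special-case the two XGBoost flavors because their parameter-setting APIs differ. The same switch in isolation (model construction here is illustrative only):

import xgboost

gpu_params = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}

booster = xgboost.Booster()      # native API: set_param accepts a dict
booster.set_param(gpu_params)

clf = xgboost.XGBClassifier()    # sklearn-style API: set_params takes keywords
clf.set_params(**gpu_params)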
@@ -191,7 +207,7 @@ def fn(self: custom_model.CustomModel, X: pd.DataFrame) -> pd.DataFrame: return _XGBModel - raw_model = _XGBModelHandler._load_model(name, model_meta, model_blobs_dir_path) + raw_model = _XGBModelHandler._load_model(name, model_meta, model_blobs_dir_path, **kwargs) _XGBModel = _create_custom_model(raw_model, model_meta) xgb_model = _XGBModel(custom_model.ModelContext()) diff --git a/snowflake/ml/model/_model.py b/snowflake/ml/model/_model.py index d46808c6..256a4582 100644 --- a/snowflake/ml/model/_model.py +++ b/snowflake/ml/model/_model.py @@ -1,11 +1,13 @@ import os import posixpath import tempfile -import warnings from types import ModuleType from typing import Dict, List, Literal, Optional, Tuple, Union, overload -from snowflake.ml._internal import file_utils, type_utils +from absl import logging +from packaging import requirements + +from snowflake.ml._internal import env as snowml_env, env_utils, file_utils from snowflake.ml.model import ( _env, _model_handler, @@ -19,119 +21,6 @@ MODEL_BLOBS_DIR = "models" -@overload -def save_model( - *, - name: str, - model: model_types.SupportedNoSignatureRequirementsModelType, - model_dir_path: str, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save a model that does not require a signature under `dir_path`. - - Args: - name: Name of the model. - model: Model object. - model_dir_path: Directory to save the model. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - """ - ... - - -@overload -def save_model( - *, - name: str, - model: model_types.SupportedRequireSignatureModelType, - model_dir_path: str, - signatures: Dict[str, model_signature.ModelSignature], - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save a model that requires a external signature with user provided signatures under `dir_path`. - - Args: - name: Name of the model. - model: Model object. - model_dir_path: Directory to save the model. - signatures: Model data signatures for inputs and output for every target methods. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. 
Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - """ - ... - - -@overload -def save_model( - *, - name: str, - model: model_types.SupportedRequireSignatureModelType, - model_dir_path: str, - sample_input: model_types.SupportedDataType, - metadata: Optional[Dict[str, str]] = None, - conda_dependencies: Optional[List[str]] = None, - pip_requirements: Optional[List[str]] = None, - python_version: Optional[str] = None, - ext_modules: Optional[List[ModuleType]] = None, - code_paths: Optional[List[str]] = None, - options: Optional[model_types.ModelSaveOption] = None, -) -> _model_meta.ModelMetadata: - """Save a model that requires a external signature under `dir_path` with signature - inferred from a sample_input_data. - - Args: - name: Name of the model. - model: Model object. - model_dir_path: Directory to save the model. - sample_input: Sample input data to infer the model signatures from. - metadata: Model metadata. - conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify - a dependency. It is a recommended way to specify your dependencies using conda. When channel is not - specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel would be - replaced with the Snowflake Anaconda channel. - pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip - requirements. - python_version: A string of python version where model is run. Used for user override. If specified as None, - current version would be captured. Defaults to None. - code_paths: Directory of code to import. - ext_modules: External modules that user might want to get pickled with model object. Defaults to None. - options: Model specific kwargs. - """ - ... - - @overload def save_model( *, @@ -259,9 +148,8 @@ def save_model( *, name: str, model: model_types.SupportedModelType, - session: Optional[Session] = None, - model_stage_file_path: Optional[str] = None, - model_dir_path: Optional[str] = None, + session: Session, + model_stage_file_path: str, signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, sample_input: Optional[model_types.SupportedDataType] = None, metadata: Optional[Dict[str, str]] = None, @@ -277,11 +165,9 @@ def save_model( Args: name: Name of the model. model: Model object. - model_dir_path: Directory to save the model. Exclusive with `session` and `model_stage_file_path`. - session: Snowpark connection session. Needs to present with `model_stage_file_path`. - Exclusive with `model_dir_path`. + session: Snowpark connection session. model_stage_file_path: Path to the file in Snowflake stage where the function should put the saved model. - Needs to present with `session`. Exclusive with `model_dir_path`. 
Must be a file with .zip extension. + Must be a file with .zip extension. signatures: Model data signatures for inputs and output for every target methods. If it is None, sample_input would be used to infer the signatures if it is a local (non-SnowML modeling model). If not None, sample_input should not be specified. Defaults to None. @@ -305,71 +191,39 @@ def save_model( Model metadata. Raises: - ValueError: Raised when the session and model_stage_file_path not specified or not be None at the same time. - ValueError: Raised when the model_stage_file_path and model_dir_path specified at the same time. ValueError: Raised when the signatures and sample_input specified at the same time, or not presented when specifying local model. ValueError: Raised when provided model directory is not a directory. ValueError: Raised when provided model stage path is not a zip file. """ - if (session is None) ^ (model_stage_file_path is None): - raise ValueError( - "Session and model_stage_file_path must be " - + f"{'None' if session is None else 'specified'} at the same time." - ) - if not ((model_stage_file_path is None) ^ (model_dir_path is None)): - raise ValueError( - "model_dir_path and model_stage_file_path both cannot be " - + f"{'None' if model_stage_file_path is None else 'specified'} at the same time." - ) + if (signatures is None) and (sample_input is None) and not _model_handler.is_auto_signature_model(model): + raise ValueError("Signatures and sample_input both cannot be None at the same time for this kind of model.") - if ( - (signatures is None) - and (sample_input is None) - and not ( - type_utils.LazyType("snowflake.ml.modeling.framework.base.BaseEstimator").isinstance(model) - or type_utils.LazyType("mlflow.pyfunc.PyFuncModel").isinstance(model) - ) - ) or ((signatures is not None) and (sample_input is not None)): - raise ValueError( - "Signatures and sample_input both cannot be " - + f"{'None for local model' if signatures is None else 'specified'} at the same time." - ) + if (signatures is not None) and (sample_input is not None): + raise ValueError("Signatures and sample_input both cannot be specified at the same time.") if not options: options = model_types.BaseModelSaveOption() - if model_dir_path: - if os.path.exists(model_dir_path): - if not os.path.isdir(model_dir_path): - raise ValueError(f"Provided model directory {model_dir_path} is not a directory.") - if os.listdir(model_dir_path): - warnings.warn( - f"Provided model directory {model_dir_path} is not an empty directory. 
Files might be overwritten.", - category=UserWarning, - ) - else: - os.makedirs(model_dir_path) - return _save( - name=name, - model=model, - local_dir_path=model_dir_path, - signatures=signatures, - sample_input=sample_input, - metadata=metadata, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - python_version=python_version, - ext_modules=ext_modules, - code_paths=code_paths, - options=options, - ) - assert session and model_stage_file_path if posixpath.splitext(model_stage_file_path)[1] != ".zip": raise ValueError(f"Provided model path in the stage {model_stage_file_path} must be a path to a zip file.") + snowml_server_availability = env_utils.validate_requirements_in_snowflake_conda_channel( + session=session, + reqs=[requirements.Requirement(f"snowflake-ml-python=={snowml_env.VERSION}")], + python_version=snowml_env.PYTHON_VERSION, + ) + + if snowml_server_availability is None: + if options.get("embed_local_ml_library", False) is False: + logging.info( + f"Local snowflake-ml-python library has version {snowml_env.VERSION}," + " which is not available in the Snowflake server, embedding local ML library automatically." + ) + options["embed_local_ml_library"] = True + with tempfile.TemporaryDirectory() as temp_local_model_dir_path: meta = _save( name=name, @@ -404,16 +258,19 @@ def _save( name: str, model: model_types.SupportedModelType, local_dir_path: str, - signatures: Optional[Dict[str, model_signature.ModelSignature]], - sample_input: Optional[model_types.SupportedDataType], - metadata: Optional[Dict[str, str]], - conda_dependencies: Optional[List[str]], - pip_requirements: Optional[List[str]], - python_version: Optional[str], - ext_modules: Optional[List[ModuleType]], - code_paths: Optional[List[str]], - options: model_types.ModelSaveOption, + signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, + sample_input: Optional[model_types.SupportedDataType] = None, + metadata: Optional[Dict[str, str]] = None, + conda_dependencies: Optional[List[str]] = None, + pip_requirements: Optional[List[str]] = None, + python_version: Optional[str] = None, + ext_modules: Optional[List[ModuleType]] = None, + code_paths: Optional[List[str]] = None, + options: Optional[model_types.ModelSaveOption] = None, ) -> _model_meta.ModelMetadata: + if not options: + options = model_types.BaseModelSaveOption() + local_dir_path = os.path.normpath(local_dir_path) handler = _model_handler._find_handler(model) @@ -450,31 +307,20 @@ def _save( @overload def load_model( - *, model_dir_path: str, meta_only: Optional[Literal[False]] = None + *, session: Session, model_stage_file_path: str ) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: - """Load the model into memory from directory. - - Args: - model_dir_path: Directory containing the model. - meta_only: Flag to indicate that if only load metadata. - """ - ... - - -@overload -def load_model(*, model_dir_path: str, meta_only: Literal[True]) -> _model_meta.ModelMetadata: - """Load the model into memory from directory with metadata only. + """Load the model into memory from a zip file in the stage. Args: - model_dir_path: Directory containing the model. - meta_only: Flag to indicate that if only load metadata. + session: Snowflake connection session. + model_stage_file_path: The path to zipped model file in the stage. Must be a file with .zip extension. """ ... 
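With the local-directory branch removed above, `save_model` now always writes a zip to a stage through a session, auto-enables `embed_local_ml_library` when the local library version is missing from the server channel, and keeps `signatures`/`sample_input` mutually exclusive. A sketch of the resulting call shape, with placeholder session, data, and stage names (not taken from this diff):

from sklearn import linear_model
from snowflake.ml.model import _model as model_api

# `session` is assumed to be an existing snowflake.snowpark.Session;
# `train_df` is a placeholder pandas DataFrame of sample input.
meta = model_api.save_model(
    name="my_model",
    model=linear_model.LinearRegression(),
    session=session,
    model_stage_file_path='@"db"."schema"."stage"/my_model.zip',  # must end with .zip
    sample_input=train_df,  # or signatures=..., never both
)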
@overload def load_model( - *, session: Session, model_stage_file_path: str, meta_only: Optional[Literal[False]] = None + *, session: Session, model_stage_file_path: str, meta_only: Literal[False] ) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: """Load the model into memory from a zip file in the stage. @@ -500,10 +346,9 @@ def load_model(*, session: Session, model_stage_file_path: str, meta_only: Liter def load_model( *, - session: Optional[Session] = None, - model_stage_file_path: Optional[str] = None, - model_dir_path: Optional[str] = None, - meta_only: Optional[bool] = None, + session: Session, + model_stage_file_path: str, + meta_only: bool = False, ) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: """Load the model into memory from directory or a zip file in the stage. @@ -512,38 +357,14 @@ def load_model( Exclusive with model_dir_path. model_stage_file_path: The path to zipped model file in the stage. Must be specified when specifying session. Exclusive with model_dir_path. Must be a file with .zip extension. - model_dir_path: Directory containing the model. Exclusive with session and model_stage_file_path. meta_only: Flag to indicate that if only load metadata. Raises: - ValueError: Raised when the session and model_stage_file_path not specified or not be None at the same time. - ValueError: Raised when the model_stage_file_path and model_dir_path specified at the same time. - ValueError: Raised if model directory does not exist. - ValueError: Raised if model directory is not a directory. ValueError: Raised if model provided in the stage is not a zip file. Returns: A tuple containing the model object and the model metadata. """ - if (session is None) ^ (model_stage_file_path is None): - raise ValueError( - "Session and model_stage_file_path must be " - + f"{'None' if session is None else 'specified'} at the same time." - ) - - if not ((model_stage_file_path is None) ^ (model_dir_path is None)): - raise ValueError( - "model_dir_path and model_stage_file_path both cannot be " - + f"{'None' if model_stage_file_path is None else 'specified'} at the same time." - ) - - if model_dir_path: - if not os.path.exists(model_dir_path): - raise ValueError(f"Provided model directory {model_dir_path} does not exist.") - if not os.path.isdir(model_dir_path): - raise ValueError(f"Provided model directory {model_dir_path} is not a directory.") - - return _load(local_dir_path=model_dir_path, meta_only=meta_only) assert session and model_stage_file_path if posixpath.splitext(model_stage_file_path)[1] != ".zip": @@ -552,54 +373,94 @@ def load_model( fo = FileOperation(session=session) zf = fo.get_stream(model_stage_file_path) with file_utils.unzip_stream_in_temp_dir(stream=zf) as temp_local_model_dir_path: - return _load(local_dir_path=temp_local_model_dir_path, meta_only=meta_only) + # This is to make mypy happy. 
+ if meta_only: + return _load(local_dir_path=temp_local_model_dir_path, meta_only=True) + return _load(local_dir_path=temp_local_model_dir_path) +@overload def _load( *, local_dir_path: str, - meta_only: Optional[bool] = None, -) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: - local_dir_path = os.path.normpath(local_dir_path) - meta = _model_meta._load_model_metadata(local_dir_path) - if meta_only: - return meta + meta_only: Literal[False] = False, + as_custom_model: Literal[False] = False, + options: Optional[model_types.ModelLoadOption] = None, +) -> Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]: + ... - _env.validate_py_runtime_version(meta.python_version) - handler = _model_handler._load_handler(meta.model_type) - if handler is None: - raise TypeError(f"{meta.model_type} is not supported.") - model_blobs_path = os.path.join(local_dir_path, MODEL_BLOBS_DIR) - m = handler._load_model(meta.name, meta, model_blobs_path) - return m, meta +@overload +def _load( + *, + local_dir_path: str, + meta_only: Literal[False] = False, + as_custom_model: Literal[True], + options: Optional[model_types.ModelLoadOption] = None, +) -> Tuple[custom_model.CustomModel, _model_meta.ModelMetadata]: + ... + + +@overload +def _load( + *, + local_dir_path: str, + meta_only: Literal[True], + as_custom_model: bool = False, + options: Optional[model_types.ModelLoadOption] = None, +) -> _model_meta.ModelMetadata: + ... -def _load_model_for_deploy(model_dir_path: str) -> Tuple[custom_model.CustomModel, _model_meta.ModelMetadata]: - """Load the model into memory from directory. Internal used when deploying only. - It will try to use _load_as_custom_model method in the handler if provided, otherwise, it will use _load_model. +def _load( + *, + local_dir_path: str, + meta_only: bool = False, + as_custom_model: bool = False, + options: Optional[model_types.ModelLoadOption] = None, +) -> Union[_model_meta.ModelMetadata, Tuple[model_types.SupportedModelType, _model_meta.ModelMetadata]]: + """Load the model into memory from directory. Used internally only. Args: - model_dir_path: Directory containing the model. + local_dir_path: Directory containing the model. + meta_only: Flag to indicate whether to only load metadata. + as_custom_model: When set to True, it will try to use _load_as_custom_model method in the handler if provided, + otherwise, it will use _load_model. + options: Model loading options. Raises: TypeError: Raised if model is not native format. Returns: - A tuple containing the model object as a custom model and the model metadata. + Model metadata when meta_only is True. + A tuple containing the model object as a custom model and the model metadata when as_custom_model is True. + A tuple containing the model object and the model metadata when as_custom_model is False.
""" - model_dir_path = os.path.normpath(model_dir_path) + local_dir_path = os.path.normpath(local_dir_path) + meta = _model_meta._load_model_metadata(local_dir_path) + if meta_only: + return meta + + _model_meta._load_code_path(local_dir_path) + + _env.validate_py_runtime_version(meta.python_version) - meta = _model_meta._load_model_metadata(model_dir_path) handler = _model_handler._load_handler(meta.model_type) if handler is None: raise TypeError(f"{meta.model_type} is not supported.") - model_blobs_path = os.path.join(model_dir_path, MODEL_BLOBS_DIR) - load_func = getattr(handler, "_load_as_custom_model", None) - if not callable(load_func): + model_blobs_path = os.path.join(local_dir_path, MODEL_BLOBS_DIR) + if as_custom_model: + load_func = getattr(handler, "_load_as_custom_model", None) + if not callable(load_func): + load_func = handler._load_model + else: load_func = handler._load_model - m = load_func(meta.name, meta, model_blobs_path) - assert isinstance(m, custom_model.CustomModel) + if options is None: + options = {} + + m = load_func(meta.name, meta, model_blobs_path, **options) + if as_custom_model: + assert isinstance(m, custom_model.CustomModel) return m, meta diff --git a/snowflake/ml/model/_model_handler.py b/snowflake/ml/model/_model_handler.py index 51f5957e..79a9cb5f 100644 --- a/snowflake/ml/model/_model_handler.py +++ b/snowflake/ml/model/_model_handler.py @@ -1,14 +1,16 @@ +import functools import importlib import os import pkgutil from types import ModuleType -from typing import Dict, Optional, Type +from typing import Any, Callable, Dict, Optional, Type, TypeVar, cast from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._handlers import _base _HANDLERS_BASE = "_handlers" _MODEL_HANDLER_REGISTRY: Dict[str, Type[_base._ModelHandler[model_types.SupportedModelType]]] = dict() +_IS_HANDLER_LOADED = False def _register_handlers() -> None: @@ -34,29 +36,43 @@ def _register_handlers() -> None: _MODEL_HANDLER_REGISTRY[k_class.handler_type] = k_class +F = TypeVar("F", bound=Callable[..., Any]) + + +def ensure_handlers_registration(fn: F) -> F: + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + global _IS_HANDLER_LOADED + if not _IS_HANDLER_LOADED: + _register_handlers() + _IS_HANDLER_LOADED = True + + return fn(*args, **kwargs) + + return cast(F, wrapper) + + +@ensure_handlers_registration def _find_handler( model: model_types.SupportedModelType, ) -> Optional[Type[_base._ModelHandler[model_types.SupportedModelType]]]: - retried = False - while True: - for handler in _MODEL_HANDLER_REGISTRY.values(): - if handler.can_handle(model): - return handler - if retried: - return None - else: - _register_handlers() - retried = True + for handler in _MODEL_HANDLER_REGISTRY.values(): + if handler.can_handle(model): + return handler + return None +@ensure_handlers_registration def _load_handler(target_model_type: str) -> Optional[Type[_base._ModelHandler[model_types.SupportedModelType]]]: - retried = False - while True: - for model_type, handler in _MODEL_HANDLER_REGISTRY.items(): - if target_model_type == model_type: - return handler - if retried: - return None - else: - _register_handlers() - retried = True + for model_type, handler in _MODEL_HANDLER_REGISTRY.items(): + if target_model_type == model_type: + return handler + return None + + +@ensure_handlers_registration +def is_auto_signature_model(model: model_types.SupportedModelType) -> bool: + for handler in _MODEL_HANDLER_REGISTRY.values(): + if handler.can_handle(model): + return 
handler.is_auto_signature + return False diff --git a/snowflake/ml/model/_model_meta.py b/snowflake/ml/model/_model_meta.py index 9d81e6fb..8f984492 100644 --- a/snowflake/ml/model/_model_meta.py +++ b/snowflake/ml/model/_model_meta.py @@ -27,6 +27,7 @@ _BASIC_DEPENDENCIES = _core_requirements.REQUIREMENTS _SNOWFLAKE_PKG_NAME = "snowflake" _SNOWFLAKE_ML_PKG_NAME = f"{_SNOWFLAKE_PKG_NAME}.ml" +_DEFAULT_CUDA_VERSION = "11.7" Dependency = namedtuple("Dependency", ["conda_name", "pip_name"]) @@ -153,8 +154,18 @@ def _load_model_metadata(model_dir_path: str) -> "ModelMetadata": A model metadata object. """ model_dir_path = os.path.normpath(model_dir_path) - meta = ModelMetadata.load_model_metadata(model_dir_path) + return meta + + +def _load_code_path(model_dir_path: str) -> None: + """Load custom code in the code path into memory. + + Args: + model_dir_path: Path to the directory containing the model to be loaded. + + """ + model_dir_path = os.path.normpath(model_dir_path) code_path = os.path.join(model_dir_path, ModelMetadata.MODEL_CODE_DIR) if os.path.exists(code_path): if code_path in sys.path: @@ -177,8 +188,6 @@ def _load_model_metadata(model_dir_path: str) -> "ModelMetadata": assert code_path in sys.path sys.path.remove(code_path) - return meta - class ModelMetadata: """Model metadata for Snowflake native model packaged model. @@ -188,6 +197,7 @@ class ModelMetadata: model_type: Type of the model. creation_timestamp: Unix timestamp when the model metadata is created. python_version: String 'major.minor.patchlevel' showing the python version where the model runs. + cuda_version: CUDA version to be used; if None, the model cannot be deployed to an instance with GPUs. """ MANIFEST_FILE = "MANIFEST" @@ -250,6 +260,7 @@ def __init__( self._include_if_absent( [Dependency(conda_name=dep, pip_name=dep) for dep in _BASIC_DEPENDENCIES + [env_utils._SNOWML_PKG_NAME]] ) + self._cuda_version: Optional[str] = None self.__dict__.update(kwargs) @@ -299,6 +310,22 @@ def _include_if_absent(self, pkgs: List[Dependency]) -> None: category=UserWarning, ) + @property + def cuda_version(self) -> Optional[str]: + return self._cuda_version + + @cuda_version.setter + def cuda_version(self, _cuda_version: str) -> None: + if not isinstance(_cuda_version, str): + raise ValueError("Cannot set CUDA version as a non-str object.") + if self._cuda_version is None: + self._cuda_version = _cuda_version + else: + if self._cuda_version != _cuda_version: + raise ValueError( + f"Different CUDA version {self._cuda_version} and {_cuda_version} found in the same model!" + ) + @property def signatures(self) -> Dict[str, model_signature.ModelSignature]: """Signatures of the model.
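The setter above makes `cuda_version` effectively write-once: the first assignment is recorded, re-assigning an equal value is a silent no-op, and a conflicting value raises. In short, for any `ModelMetadata` instance `meta` (this is exercised by the unit test later in this diff):

meta.cuda_version = "11.7"  # first assignment is stored
meta.cuda_version = "11.7"  # same value: accepted silently
meta.cuda_version = "12.0"  # different value: raises ValueError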
@@ -334,6 +361,7 @@ def to_dict(self) -> Dict[str, Any]: res["models"] = {name: dataclasses.asdict(blob_meta) for name, blob_meta in self._models.items()} res["pip_requirements"] = self.pip_requirements res["conda_dependencies"] = self.conda_dependencies + res["cuda_version"] = self._cuda_version return res @classmethod @@ -353,6 +381,7 @@ def from_dict(cls, model_dict: Dict[str, Any]) -> "ModelMetadata": model_dict["_models"] = { name: _ModelBlobMetadata(**blob_meta) for name, blob_meta in model_dict.pop("models").items() } + model_dict["_cuda_version"] = model_dict.pop("cuda_version", None) return cls(**model_dict) def save_model_metadata(self, path: str) -> None: diff --git a/snowflake/ml/model/_model_meta_test.py b/snowflake/ml/model/_model_meta_test.py index 69f8c4b4..c8594a09 100644 --- a/snowflake/ml/model/_model_meta_test.py +++ b/snowflake/ml/model/_model_meta_test.py @@ -273,6 +273,25 @@ def test_model_meta_check(self) -> None: with self.assertRaises(NotImplementedError): _ = _model_meta.ModelMetadata.load_model_metadata(tmpdir) + def test_model_meta_cuda(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + with _model_meta._create_model_metadata( + model_dir_path=tmpdir, name="model1", model_type="custom", signatures=_DUMMY_SIG + ) as meta: + with self.assertRaisesRegex(ValueError, "Cannot set CUDA version as a non-str object."): + meta.cuda_version = None + + meta.cuda_version = "11.7" + + meta_dict = meta.to_dict() + + loaded_meta = _model_meta.ModelMetadata.from_dict(meta_dict) + + self.assertEqual(loaded_meta.cuda_version, "11.7") + + with self.assertRaisesRegex(ValueError, "Different CUDA version .+ and .+ found in the same model!"): + loaded_meta.cuda_version = "12.0" + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py index 92561115..71802803 100644 --- a/snowflake/ml/model/_model_test.py +++ b/snowflake/ml/model/_model_test.py @@ -5,7 +5,7 @@ import tempfile import uuid import warnings -from typing import List, Tuple, cast +from typing import Tuple, cast from unittest import mock import mlflow @@ -17,6 +17,7 @@ from absl.testing import absltest from sklearn import datasets, ensemble, linear_model, model_selection, multioutput +from snowflake.ml._internal import env as snowml_env, env_utils from snowflake.ml.model import ( _model as model_api, custom_model, @@ -113,24 +114,24 @@ def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.nn.Sigmoid(), ) - def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: - return [self.model(tensors[0])] + def forward(self, tensor: torch.Tensor) -> torch.Tensor: + return self.model(tensor) # type: ignore[no-any-return] def _prepare_torch_model( dtype: torch.dtype = torch.float32, -) -> Tuple[torch.nn.Module, List[torch.Tensor], List[torch.Tensor]]: +) -> Tuple[torch.nn.Module, torch.Tensor, torch.Tensor]: n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 x = np.random.rand(batch_size, n_input) - data_x = [torch.from_numpy(x).to(dtype=dtype)] - data_y = [(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype)] + data_x = torch.from_numpy(x).to(dtype=dtype) + data_y = (torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype) model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) loss_function = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) for _epoch in range(100): pred_y = model(data_x) - loss = loss_function(pred_y[0], data_y[0]) + loss = 
loss_function(pred_y, data_y) optimizer.zero_grad() loss.backward() optimizer.step() @@ -144,8 +145,8 @@ def __init__(self, name: str = None) -> None: self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me") @tf.function # type: ignore[misc] - def __call__(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: - return [self.a_variable * tensors[0] + self.non_trainable_variable] + def __call__(self, tensor: tf.Tensor) -> tf.Tensor: + return self.a_variable * tensor + self.non_trainable_variable class KerasModel(tf.keras.Model): @@ -154,28 +155,27 @@ def __init__(self, n_hidden: int, n_out: int) -> None: self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") - def call(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: - input = tensors[0] + def call(self, tensors: tf.Tensor) -> tf.Tensor: + input = tensors x = self.fc_1(input) x = self.fc_2(x) - return [x] + return x def _prepare_keras_model( dtype: tf.dtypes.DType = tf.float32, -) -> Tuple[tf.keras.Model, List[tf.Tensor], List[tf.Tensor]]: +) -> Tuple[tf.keras.Model, tf.Tensor, tf.Tensor]: n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 x = np.random.rand(batch_size, n_input) - data_x = [tf.convert_to_tensor(x, dtype=dtype)] + data_x = tf.convert_to_tensor(x, dtype=dtype) raw_data_y = tf.random.uniform((batch_size, 1)) raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) - data_y = [tf.cast(raw_data_y, dtype=dtype)] - - def loss_fn(y_true: List[tf.Tensor], y_pred: List[tf.Tensor]) -> tf.Tensor: - return tf.keras.losses.mse(y_true[0], y_pred[0]) + data_y = tf.cast(raw_data_y, dtype=dtype) model = KerasModel(n_hidden, n_out) - model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=loss_fn) + model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=tf.keras.losses.MeanSquaredError() + ) model.fit(data_x, data_y, batch_size=batch_size, epochs=100) return model, data_x, data_y @@ -209,17 +209,16 @@ def test_model_load_hygiene(self) -> None: lm = DemoModel(context=custom_model.ModelContext(models={}, artifacts={})) arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(workspace, "model1"), + local_dir_path=os.path.join(workspace, "model1"), model=lm, sample_input=d, metadata={"author": "halu", "version": "1"}, code_paths=[os.path.join(src_path, "fake")], ) - print(list(os.walk(os.path.join(workspace, "model1")))) - _ = model_api.load_model(model_dir_path=os.path.join(workspace, "model1")) + _ = model_api._load(local_dir_path=os.path.join(workspace, "model1")) from fake.fake_module import p self.assertEqual(p.__file__, os.path.join(workspace, "model1", "code", "fake", "fake_module", "p.py")) @@ -243,9 +242,9 @@ def test_model_save_validation(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(workspace, "model1"), + local_dir_path=os.path.join(workspace, "model1"), model=lm, sample_input=d, metadata={"author": "halu", "version": "1"}, @@ -262,9 +261,9 @@ def test_model_save_validation(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) with self.assertRaises(ValueError): - 
model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(workspace, "model1"), + local_dir_path=os.path.join(workspace, "model1"), model=lm, sample_input=d, metadata={"author": "halu", "version": "1"}, @@ -277,120 +276,59 @@ def test_save_interface(self) -> None: m_session = mock_session.MockSession(conn=None, test_case=self) c_session = cast(Session, m_session) - local_dir = "path/to/local/model/dir" stage_path = '@"db"."schema"."stage"/model.zip' arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - with self.assertRaisesRegex( - ValueError, "model_dir_path and model_stage_file_path both cannot be None at the same time." - ): - model_api.save_model(name="model", model=linear_model.LinearRegression()) # type:ignore[call-overload] - - with self.assertRaisesRegex( - ValueError, "Session and model_stage_file_path must be specified at the same time." - ): - model_api.save_model( - name="model", model=linear_model.LinearRegression(), session=c_session, sample_input=d - ) # type:ignore[call-overload] - - with self.assertRaisesRegex(ValueError, "Session and model_stage_file_path must be None at the same time."): - model_api.save_model( - name="model", model=linear_model.LinearRegression(), model_stage_file_path=stage_path, sample_input=d - ) # type:ignore[call-overload] - - with self.assertRaisesRegex( - ValueError, "Session and model_stage_file_path must be specified at the same time." - ): - model_api.save_model( - name="model", - model=linear_model.LinearRegression(), - session=c_session, - model_dir_path=local_dir, - sample_input=d, - ) # type:ignore[call-overload] - - with self.assertRaisesRegex(ValueError, "Session and model_stage_file_path must be None at the same time."): - model_api.save_model( - name="model", - model=linear_model.LinearRegression(), - model_stage_file_path=stage_path, - model_dir_path=local_dir, - sample_input=d, - ) # type:ignore[call-overload] - - with self.assertRaisesRegex( - ValueError, "model_dir_path and model_stage_file_path both cannot be specified at the same time." - ): - model_api.save_model( - name="model", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path=stage_path, - model_dir_path=local_dir, - sample_input=d, - ) # type:ignore[call-overload] - - with self.assertRaisesRegex( - ValueError, "Signatures and sample_input both cannot be None for local model at the same time." - ): - model_api.save_model( - name="model1", - model_dir_path=local_dir, - model=linear_model.LinearRegression(), - ) - with self.assertRaisesRegex( ValueError, "Signatures and sample_input both cannot be specified at the same time." ): model_api.save_model( # type:ignore[call-overload] name="model1", - model_dir_path=local_dir, + session=c_session, + model_stage_file_path=stage_path, model=linear_model.LinearRegression(), sample_input=d, signatures={"predict": model_signature.ModelSignature(inputs=[], outputs=[])}, ) with self.assertRaisesRegex( - ValueError, "Signatures and sample_input both cannot be specified at the same time." + ValueError, "Signatures and sample_input both cannot be None at the same time for this kind of model." 
): - model_api.save_model( # type:ignore[call-overload] - name="model1", - model_dir_path=local_dir, - model=LinearRegression(), - sample_input=d, - signatures={"predict": model_signature.ModelSignature(inputs=[], outputs=[])}, - ) - - with mock.patch.object(model_api, "_save", return_value=None) as mock_save: model_api.save_model( name="model1", - model_dir_path=local_dir, - model=LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + model=linear_model.LinearRegression(), ) - with tempfile.TemporaryDirectory() as tempdir: - with open(os.path.join(tempdir, "some_file"), "w", encoding="utf-8") as f: - f.write("Hi Ciyana!") - - with self.assertRaisesRegex(ValueError, "Provided model directory [^\\s]* is not a directory."): - model_api.save_model( - name="model1", - model_dir_path=os.path.join(tempdir, "some_file"), - model=linear_model.LinearRegression(), - sample_input=d, - ) + with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] + ): + model_api.save_model( + name="model1", + session=c_session, + model_stage_file_path=stage_path, + model=LinearRegression(), + ) + mock_save.assert_called_once() - with self.assertWarnsRegex(UserWarning, "Provided model directory [^\\s]* is not an empty directory."): - with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] + ): model_api.save_model( name="model1", - model_dir_path=tempdir, - model=linear_model.LinearRegression(), - sample_input=d, + session=c_session, + model_stage_file_path=stage_path, + model=LinearRegression(), ) - mock_save.assert_called_once() + + mock_save.assert_called_once() with self.assertRaisesRegex( ValueError, "Provided model path in the stage [^\\s]* must be a path to a zip file." @@ -403,56 +341,63 @@ def test_save_interface(self) -> None: sample_input=d, ) - with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(model_api, "_save", return_value=None): + with mock.patch.object(FileOperation, "put_stream", return_value=None): + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=None + ): + with self.assertLogs(level="INFO") as cm: + model_api.save_model( + name="model1", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + sample_input=d, + ) + self.assertListEqual( + cm.output, + [ + ( + f"INFO:absl:Local snowflake-ml-python library has version {snowml_env.VERSION}," + " which is not available in the Snowflake server, embedding local ML " + "library automatically." 
+ ) + ], + ) + + with mock.patch.object(model_api, "_save", return_value=None): with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - model_api.save_model( - name="model1", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path=stage_path, - sample_input=d, - ) + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] + ): + model_api.save_model( + name="model1", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + sample_input=d, + ) mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=False) - with mock.patch.object(model_api, "_save", return_value=None) as mock_save: + with mock.patch.object(model_api, "_save", return_value=None): with mock.patch.object(FileOperation, "put_stream", return_value=None) as mock_put_stream: - model_api.save_model( # type:ignore[call-overload] - name="model1", - model=linear_model.LinearRegression(), - session=c_session, - model_stage_file_path=stage_path, - sample_input=d, - options={"allow_overwritten_stage_file": True}, - ) + with mock.patch.object( + env_utils, "validate_requirements_in_snowflake_conda_channel", return_value=[""] + ): + model_api.save_model( # type:ignore[call-overload] + name="model1", + model=linear_model.LinearRegression(), + session=c_session, + model_stage_file_path=stage_path, + sample_input=d, + options={"allow_overwritten_stage_file": True}, + ) mock_put_stream.assert_called_once_with(mock.ANY, stage_path, auto_compress=False, overwrite=True) def test_load_interface(self) -> None: m_session = mock_session.MockSession(conn=None, test_case=self) c_session = cast(Session, m_session) - local_dir = "path/to/local/model/dir" - stage_path = '@"db"."schema"."stage"/model.zip' - - with self.assertRaisesRegex( - ValueError, "Session and model_stage_file_path must be specified at the same time." - ): - model_api.load_model(session=c_session) # type:ignore[call-overload] - - with self.assertRaisesRegex( - ValueError, "model_dir_path and model_stage_file_path both cannot be None at the same time." - ): - model_api.load_model() # type:ignore[call-overload] - - with self.assertRaisesRegex(ValueError, "Session and model_stage_file_path must be None at the same time."): - model_api.load_model(model_stage_file_path=stage_path) # type:ignore[call-overload] - - with self.assertRaisesRegex( - ValueError, "model_dir_path and model_stage_file_path both cannot be specified at the same time." - ): - model_api.load_model( - session=c_session, model_stage_file_path=stage_path, model_dir_path=local_dir - ) # type:ignore[call-overload] - with self.assertRaisesRegex( ValueError, "Provided model path in the stage [^\\s]* must be a path to a zip file." 
): @@ -475,27 +420,27 @@ def test_bad_save_model(self) -> None: s = {"predict": model_signature.infer_signature(d, lm.predict(d))} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir.full_path, "model1"), + local_dir_path=os.path.join(tmpdir.full_path, "model1"), model=lm, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir.full_path, "model1"), + local_dir_path=os.path.join(tmpdir.full_path, "model1"), model=lm, signatures=s, metadata={"author": "halu", "version": "1"}, python_version="3.5.2", ) - _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1"), meta_only=True) + _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), meta_only=True) with self.assertRaises(RuntimeError): - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) def test_custom_model_with_multiple_artifacts(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: @@ -512,9 +457,9 @@ def test_custom_model_with_multiple_artifacts(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) s = {"predict": model_signature.infer_signature(d, lm.predict(d))} - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=lm, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -522,26 +467,26 @@ def test_custom_model_with_multiple_artifacts(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, DemoModelWithManyArtifacts) res = m.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) - m_UDF, meta = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_UDF, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) assert isinstance(m_UDF, DemoModelWithManyArtifacts) res = m_UDF.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) self.assertEqual(meta.metadata["author"] if meta.metadata else None, "halu") - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=lm, sample_input=d, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) assert isinstance(m, DemoModelWithManyArtifacts) res = m.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([94, 97]))) @@ -563,18 +508,18 @@ def test_model_composition(self) -> None: p2 = acm.predict(d) s = {"predict": model_signature.infer_signature(d, p2)} with tempfile.TemporaryDirectory() as tmpdir: - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=acm, signatures=s, metadata={"author": "halu", "version": "1"}, ) - lm, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, 
"model1")) + lm, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(lm, ComposeModel) p3 = lm.predict(d) - m_UDF, _ = model_api._load_model_for_deploy(model_dir_path=os.path.join(tmpdir, "model1")) + m_UDF, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) assert isinstance(m_UDF, ComposeModel) p4 = m_UDF.predict(d) np.testing.assert_allclose(p1, p2) @@ -597,18 +542,18 @@ async def _test(self: "ModelTest") -> None: p2 = await acm.predict(d) s = {"predict": model_signature.infer_signature(d, p2)} with tempfile.TemporaryDirectory() as tmpdir: - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=acm, signatures=s, metadata={"author": "halu", "version": "1"}, ) - lm, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + lm, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(lm, AsyncComposeModel) p3 = await lm.predict(d) # type: ignore[misc] - m_UDF, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_UDF, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) assert isinstance(m_UDF, AsyncComposeModel) p4 = await m_UDF.predict(d) np.testing.assert_allclose(p1, p2) @@ -627,15 +572,15 @@ def test_custom_model_with_artifacts(self) -> None: arr = np.array([[1, 2, 3], [4, 2, 5]]) d = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) s = {"predict": model_signature.infer_signature(d, lm.predict(d))} - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=lm, signatures=s, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, DemoModelWithArtifacts) res = m.predict(d) np.testing.assert_allclose(res["output"], pd.Series(np.array([11, 14]))) @@ -646,7 +591,7 @@ def test_custom_model_with_artifacts(self) -> None: ) as f: f.write("20") - m_UDF, meta = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_UDF, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) assert isinstance(m_UDF, DemoModelWithArtifacts) res = m_UDF.predict(d) @@ -662,9 +607,9 @@ def test_skl_multiple_output_proba(self) -> None: model.fit(iris_X_df[:-10], dual_target[:-10]) with tempfile.TemporaryDirectory() as tmpdir: s = {"predict_proba": model_signature.infer_signature(iris_X_df, model.predict_proba(iris_X_df))} - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -674,12 +619,12 @@ def test_skl_multiple_output_proba(self) -> None: orig_res = model.predict_proba(iris_X_df[-10:]) m: multioutput.MultiOutputClassifier - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) loaded_res = m.predict_proba(iris_X_df[-10:]) np.testing.assert_allclose(np.hstack(orig_res), np.hstack(loaded_res)) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) 
predict_method = getattr(m_udf, "predict_proba", None) assert callable(predict_method) udf_res = predict_method(iris_X_df[-10:]) @@ -688,31 +633,31 @@ def test_skl_multiple_output_proba(self) -> None: ) with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1_no_sig_bad", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_bad"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_bad"), model=model, sample_input=iris_X_df, metadata={"author": "halu", "version": "1"}, options=model_types.SKLModelSaveOptions({"target_methods": ["random"]}), ) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=model, sample_input=iris_X_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose( np.hstack(model.predict_proba(iris_X_df[-10:])), np.hstack(m.predict_proba(iris_X_df[-10:])) ) np.testing.assert_allclose(model.predict(iris_X_df[-10:]), m.predict(iris_X_df[-10:])) self.assertEqual(s["predict_proba"], meta.signatures["predict_proba"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict_proba", None) assert callable(predict_method) @@ -734,17 +679,17 @@ def test_skl(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(iris_X_df, regr.predict(iris_X_df))} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -754,26 +699,26 @@ def test_skl(self) -> None: warnings.simplefilter("error") m: linear_model.LinearRegression - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose(np.array([-0.08254936]), m.predict(iris_X_df[:1])) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(iris_X_df[:1])) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=iris_X_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose(np.array([-0.08254936]), m.predict(iris_X_df[:1])) self.assertEqual(s["predict"], meta.signatures["predict"]) - m_udf, _ = 
model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(iris_X_df[:1])) @@ -789,17 +734,17 @@ def test_xgb_booster(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(cal_X_test, y_pred)} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -808,28 +753,28 @@ def test_xgb_booster(self) -> None: with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, xgboost.Booster) np.testing.assert_allclose(m.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regressor, sample_input=cal_X_test, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) assert isinstance(m, xgboost.Booster) np.testing.assert_allclose(m.predict(xgboost.DMatrix(data=cal_X_test)), y_pred) self.assertEqual(s["predict"], meta.signatures["predict"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) @@ -846,17 +791,17 @@ def test_xgb(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(cal_X_test, y_pred)} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regressor, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -865,29 +810,29 @@ def test_xgb(self) -> None: with warnings.catch_warnings(): 
warnings.simplefilter("error") - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, xgboost.XGBClassifier) np.testing.assert_allclose(m.predict(cal_X_test), y_pred) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regressor, sample_input=cal_X_test, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) assert isinstance(m, xgboost.XGBClassifier) np.testing.assert_allclose(m.predict(cal_X_test), y_pred) np.testing.assert_allclose(m.predict_proba(cal_X_test), y_pred_proba) self.assertEqual(s["predict"], meta.signatures["predict"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predict_method(cal_X_test), np.expand_dims(y_pred, axis=1)) @@ -913,17 +858,17 @@ def test_snowml_all_input(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -933,27 +878,27 @@ def test_snowml_all_input(self) -> None: warnings.simplefilter("error") m: LinearRegression - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=df[INPUT_COLUMNS], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) 
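# A hedged recap of the private-API rename these hunks exercise (all names and
# arguments appear in this diff; the concrete paths and the `regr`/`s` objects
# come from the surrounding tests): `model_api._save` replaces
# `model_api.save_model` with `model_dir_path` renamed to `local_dir_path`, and
# `model_api._load(..., as_custom_model=True)` replaces
# `model_api._load_model_for_deploy`:
#
#   model_api._save(name="model1", local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s)
#   m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"))
#   m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True)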
np.testing.assert_allclose(np.array([[-0.08254936]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) s = regr.model_signatures self.assertEqual(s["predict"], meta.signatures["predict"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) @@ -975,17 +920,17 @@ def test_snowml_signature_partial_input(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -995,28 +940,28 @@ def test_snowml_signature_partial_input(self) -> None: warnings.simplefilter("error") m: LinearRegression - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose(np.array([[0.17150434]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) s = regr.model_signatures # Compare the Model Signature without indexing self.assertItemsEqual(s["predict"].to_dict(), meta.signatures["predict"].to_dict()) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[0.17150434]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) @@ -1040,17 +985,17 @@ def test_snowml_signature_drop_input_cols(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: s = {"predict": model_signature.infer_signature(df[INPUT_COLUMNS], regr.predict(df)[[OUTPUT_COLUMNS]])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures={**s, "another_predict": s["predict"]}, metadata={"author": "halu", "version": 
"1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=regr, signatures=s, metadata={"author": "halu", "version": "1"}, @@ -1060,28 +1005,28 @@ def test_snowml_signature_drop_input_cols(self) -> None: warnings.simplefilter("error") m: LinearRegression - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) np.testing.assert_allclose(predictions, m.predict(df[:1])[[OUTPUT_COLUMNS]]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(predictions, predict_method(df[:1])[[OUTPUT_COLUMNS]]) - model_api.save_model( + model_api._save( name="model1_no_sig", - model_dir_path=os.path.join(tmpdir, "model1_no_sig"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig"), model=regr, sample_input=df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig")) np.testing.assert_allclose(np.array([[-0.08254936]]), m.predict(df[:1])[[OUTPUT_COLUMNS]]) s = regr.model_signatures # Compare the Model Signature without indexing self.assertItemsEqual(s["predict"].to_dict(), meta.signatures["predict"].to_dict()) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.array([[-0.08254936]]), predict_method(df[:1])[[OUTPUT_COLUMNS]]) @@ -1089,39 +1034,43 @@ def test_snowml_signature_drop_input_cols(self) -> None: def test_pytorch(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: model, data_x, data_y = _prepare_torch_model() - s = {"forward": model_signature.infer_signature(data_x, data_y)} + s = {"forward": model_signature.infer_signature([data_x], [data_y])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures={**s, "another_forward": s["forward"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, ) model.eval() - y_pred = model.forward(data_x)[0].detach() + y_pred = model.forward(data_x).detach() x_df = model_signature_utils.rename_pandas_df( - pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False), s["forward"].inputs, ) with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x)[0], y_pred) - m_udf, _ = 
model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + torch.testing.assert_close(m.forward(data_x), y_pred) + + with self.assertRaisesRegex(AssertionError, "Torch not compiled with CUDA enabled"): + _, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), options={"use_gpu": True}) + + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( @@ -1131,20 +1080,20 @@ def test_pytorch(self) -> None: y_pred, ) - model_api.save_model( + model_api._save( name="model1_no_sig_1", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model, - sample_input=data_x, + sample_input=[data_x], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x)[0], y_pred) + torch.testing.assert_close(m.forward(data_x), y_pred) self.assertEqual(s["forward"], meta.signatures["forward"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( @@ -1159,39 +1108,43 @@ def test_torchscript(self) -> None: model_script = torch.jit.script(model) # type:ignore[attr-defined] with tempfile.TemporaryDirectory() as tmpdir: - s = {"forward": model_signature.infer_signature(data_x, data_y)} + s = {"forward": model_signature.infer_signature([data_x], [data_y])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model_script, signatures={**s, "another_forward": s["forward"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model_script, signatures=s, metadata={"author": "halu", "version": "1"}, ) model_script.eval() - y_pred = model_script.forward(data_x)[0].detach() + y_pred = model_script.forward(data_x).detach() x_df = model_signature_utils.rename_pandas_df( - pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False), s["forward"].inputs, ) with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x)[0], y_pred) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + torch.testing.assert_close(m.forward(data_x), y_pred) + + with self.assertRaisesRegex(AssertionError, "Torch not compiled with CUDA enabled"): + _, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), options={"use_gpu": True}) + + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), 
as_custom_model=True) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( @@ -1201,20 +1154,20 @@ def test_torchscript(self) -> None: y_pred, ) - model_api.save_model( + model_api._save( name="model1_no_sig_1", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model_script, - sample_input=data_x, + sample_input=[data_x], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x)[0], y_pred) + torch.testing.assert_close(m.forward(data_x), y_pred) self.assertEqual(s["forward"], meta.signatures["forward"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( @@ -1227,29 +1180,29 @@ def test_torchscript(self) -> None: def test_torch_df_sample_input(self) -> None: model, data_x, data_y = _prepare_torch_model(torch.float64) model_script = torch.jit.script(model) # type:ignore[attr-defined] - s = {"forward": model_signature.infer_signature(data_x, data_y)} + s = {"forward": model_signature.infer_signature([data_x], [data_y])} with tempfile.TemporaryDirectory() as tmpdir: model.eval() - y_pred = model.forward(data_x)[0].detach() + y_pred = model.forward(data_x).detach() x_df = model_signature_utils.rename_pandas_df( - pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False), + pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False), s["forward"].inputs, ) - model_api.save_model( + model_api._save( name="model1_no_sig_1", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model, sample_input=x_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, torch.nn.Module) - torch.testing.assert_close(m.forward(data_x)[0], y_pred) + torch.testing.assert_close(m.forward(data_x), y_pred) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( @@ -1257,21 +1210,21 @@ def test_torch_df_sample_input(self) -> None: ) model_script.eval() - y_pred = model_script.forward(data_x)[0].detach() + y_pred = model_script.forward(data_x).detach() - model_api.save_model( + model_api._save( name="model1_no_sig_2", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), model=model_script, sample_input=x_df, metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_2")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_2")) assert isinstance(m, 
torch.jit.ScriptModule) # type:ignore[attr-defined] - torch.testing.assert_close(m.forward(data_x)[0], y_pred) + torch.testing.assert_close(m.forward(data_x), y_pred) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_2")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), as_custom_model=True) predict_method = getattr(m_udf, "forward", None) assert callable(predict_method) torch.testing.assert_close( @@ -1281,114 +1234,114 @@ def test_torch_df_sample_input(self) -> None: def test_tensorflow(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: simple_module = SimpleModule(name="simple") - x = [tf.constant([[5.0], [10.0]])] + x = tf.constant([[5.0], [10.0]]) y_pred = simple_module(x) - s = {"__call__": model_signature.infer_signature(x, y_pred)} + s = {"__call__": model_signature.infer_signature([x], [y_pred])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=simple_module, signatures={**s, "another_forward": s["__call__"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=simple_module, signatures=s, metadata={"author": "halu", "version": "1"}, ) x_df = model_signature_utils.rename_pandas_df( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(x, ensure_serializable=False), + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data=[x], ensure_serializable=False), s["__call__"].inputs, ) with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert callable(m) - tf.assert_equal(m.__call__(x)[0], y_pred[0]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + tf.assert_equal(m.__call__(x), y_pred) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) assert callable(m_udf) tf.assert_equal( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[ 0 ], - y_pred[0], + y_pred, ) - model_api.save_model( + model_api._save( name="model1_no_sig_1", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=simple_module, - sample_input=x, + sample_input=[x], metadata={"author": "halu", "version": "1"}, ) - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert callable(m) - tf.assert_equal(m(x)[0], y_pred[0]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + tf.assert_equal(m(x), y_pred) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) assert callable(m_udf) tf.assert_equal( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[0], - y_pred[0], + y_pred, ) - model_api.save_model( + model_api._save( name="model1_no_sig_2", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), model=simple_module, sample_input=x_df, metadata={"author": "halu", "version": "1"}, 
) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_2")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_2"), as_custom_model=True) assert callable(m_udf) tf.assert_equal( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(m_udf(x_df), s["__call__"].outputs)[0], - y_pred[0], + y_pred, ) def test_tensorflow_keras(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: model, data_x, data_y = _prepare_keras_model() - s = {"predict": model_signature.infer_signature(data_x, data_y)} + s = {"predict": model_signature.infer_signature([data_x], [data_y])} with self.assertRaises(ValueError): - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures={**s, "another_forward": s["predict"]}, metadata={"author": "halu", "version": "1"}, ) - model_api.save_model( + model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=model, signatures=s, metadata={"author": "halu", "version": "1"}, ) - y_pred = model.predict(data_x)[0] + y_pred = model.predict(data_x) x_df = model_signature_utils.rename_pandas_df( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False), + tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False), s["predict"].inputs, ) with warnings.catch_warnings(): warnings.simplefilter("error") - m, _ = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, tf.keras.Model) - tf.debugging.assert_near(m.predict(data_x)[0], y_pred) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + tf.debugging.assert_near(m.predict(data_x), y_pred) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) tf.debugging.assert_near( @@ -1398,20 +1351,20 @@ def test_tensorflow_keras(self) -> None: y_pred, ) - model_api.save_model( + model_api._save( name="model1_no_sig_1", - model_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), + local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), model=model, - sample_input=data_x, + sample_input=[data_x], metadata={"author": "halu", "version": "1"}, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1")) assert isinstance(m, tf.keras.Model) - tf.debugging.assert_near(m.predict(data_x)[0], y_pred) + tf.debugging.assert_near(m.predict(data_x), y_pred) self.assertEqual(s["predict"], meta.signatures["predict"]) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1_no_sig_1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1_no_sig_1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) tf.debugging.assert_near( @@ -1461,9 +1414,9 @@ def test_mlflow_model(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: mlflow_pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") - saved_meta = model_api.save_model( + saved_meta = model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, 
"model1"), model=mlflow_pyfunc_model, ) @@ -1500,13 +1453,13 @@ def test_mlflow_model(self) -> None: ) self.assertIn("pip<=23.0.1", saved_meta.conda_dependencies) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, mlflow.pyfunc.PyFuncModel) self.assertNotEqual(m.metadata.run_id, run_id) - _ = model_api.save_model( + _ = model_api._save( name="model1_again", - model_dir_path=os.path.join(tmpdir, "model1_again"), + local_dir_path=os.path.join(tmpdir, "model1_again"), model=m, ) @@ -1545,7 +1498,7 @@ def test_mlflow_model(self) -> None: np.testing.assert_allclose(predictions, m.predict(X_test)) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) X_df = pd.DataFrame(X_test) @@ -1571,19 +1524,19 @@ def test_mlflow_model_df_inputs(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: mlflow_pyfunc_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") - _ = model_api.save_model( + _ = model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, mlflow.pyfunc.PyFuncModel) self.assertNotEqual(m.metadata.run_id, run_id) np.testing.assert_allclose(predictions, m.predict(X_test)) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_test).to_numpy()) @@ -1612,16 +1565,16 @@ def test_mlflow_model_bad_case(self) -> None: mlflow_pyfunc_model = mlflow.pyfunc.load_model(local_path) mlflow_pyfunc_model.metadata.run_id = uuid.uuid4().hex.lower() with self.assertRaisesRegex(ValueError, "Cannot load MLFlow model artifacts."): - _ = model_api.save_model( + _ = model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, options={"ignore_mlflow_dependencies": True}, ) - saved_meta = model_api.save_model( + saved_meta = model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, options={"model_uri": local_path, "ignore_mlflow_dependencies": True}, ) @@ -1629,34 +1582,34 @@ def test_mlflow_model_bad_case(self) -> None: self.assertEmpty(saved_meta.pip_requirements) with self.assertRaisesRegex(ValueError, "Cannot load MLFlow model dependencies."): - _ = model_api.save_model( + _ = model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=mlflow_pyfunc_model, ) - saved_meta = model_api.save_model( + saved_meta = model_api._save( name="model2", - model_dir_path=os.path.join(tmpdir, "model2"), + local_dir_path=os.path.join(tmpdir, "model2"), model=mlflow_pyfunc_model, options={"model_uri": local_path, "ignore_mlflow_metadata": True}, ) 
self.assertIsNone(saved_meta.metadata) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model2")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model2")) assert isinstance(m, mlflow.pyfunc.PyFuncModel) self.assertNotEqual(m.metadata.run_id, run_id) np.testing.assert_allclose(predictions, m.predict(X_test)) - _ = model_api.save_model( + _ = model_api._save( name="model2_again", - model_dir_path=os.path.join(tmpdir, "model2_again"), + local_dir_path=os.path.join(tmpdir, "model2_again"), model=m, ) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model2")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model2"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose(np.expand_dims(predictions, axis=1), predict_method(X_test).to_numpy()) @@ -1688,18 +1641,18 @@ def test_mlflow_model_pytorch(self) -> None: predictions = pytorch_pyfunc.predict(input_x) with tempfile.TemporaryDirectory() as tmpdir: - _ = model_api.save_model( + _ = model_api._save( name="model1", - model_dir_path=os.path.join(tmpdir, "model1"), + local_dir_path=os.path.join(tmpdir, "model1"), model=pytorch_pyfunc, ) - m, meta = model_api.load_model(model_dir_path=os.path.join(tmpdir, "model1")) + m, meta = model_api._load(local_dir_path=os.path.join(tmpdir, "model1")) assert isinstance(m, mlflow.pyfunc.PyFuncModel) np.testing.assert_allclose(predictions, m.predict(input_x)) - m_udf, _ = model_api._load_model_for_deploy(os.path.join(tmpdir, "model1")) + m_udf, _ = model_api._load(local_dir_path=os.path.join(tmpdir, "model1"), as_custom_model=True) predict_method = getattr(m_udf, "predict", None) assert callable(predict_method) np.testing.assert_allclose( diff --git a/snowflake/ml/model/_signatures/builtins_handler.py b/snowflake/ml/model/_signatures/builtins_handler.py index 76a4bc2f..d56cd503 100644 --- a/snowflake/ml/model/_signatures/builtins_handler.py +++ b/snowflake/ml/model/_signatures/builtins_handler.py @@ -1,3 +1,4 @@ +from collections import abc from typing import Literal, Sequence import pandas as pd @@ -14,11 +15,19 @@ class ListOfBuiltinHandler(base_handler.BaseDataHandler[model_types._SupportedBuiltinsList]): @staticmethod def can_handle(data: model_types.SupportedDataType) -> TypeGuard[model_types._SupportedBuiltinsList]: - return ( - isinstance(data, list) - and len(data) > 0 - and all(isinstance(data_col, (int, float, bool, str, bytes, list)) for data_col in data) - ) + if not isinstance(data, abc.Sequence) or isinstance(data, str): + return False + if len(data) == 0: + return False + can_handle = True + for element in data: + # String is a Sequence but we take them as an whole + if isinstance(element, abc.Sequence) and not isinstance(element, str): + can_handle = ListOfBuiltinHandler.can_handle(element) + elif not isinstance(element, (int, float, bool, str)): + can_handle = False + break + return can_handle @staticmethod def count(data: model_types._SupportedBuiltinsList) -> int: diff --git a/snowflake/ml/model/_signatures/builtins_test.py b/snowflake/ml/model/_signatures/builtins_test.py index ff5a1b42..b5e96fb2 100644 --- a/snowflake/ml/model/_signatures/builtins_test.py +++ b/snowflake/ml/model/_signatures/builtins_test.py @@ -6,21 +6,52 @@ class ListOfBuiltinsHandlerTest(absltest.TestCase): + def test_can_handle_list_builtins(self) -> None: + lt1 = [(2, 3), [2, 3]] + self.assertTrue(builtins_handler.ListOfBuiltinHandler.can_handle(lt1)) + + lt2 = (2, 3) + 
self.assertTrue(builtins_handler.ListOfBuiltinHandler.can_handle(lt2)) + + lt3 = ([3, 3], 3) + self.assertTrue(builtins_handler.ListOfBuiltinHandler.can_handle(lt3)) + + lt4 = ({"a": 1}, 3) + self.assertFalse(builtins_handler.ListOfBuiltinHandler.can_handle(lt4)) + + lt5 = [({"a": 1}, 3)] + self.assertFalse(builtins_handler.ListOfBuiltinHandler.can_handle(lt5)) + + lt6 = "abcd" + self.assertFalse(builtins_handler.ListOfBuiltinHandler.can_handle(lt6)) + + lt7 = ["abcd", "abcd"] + self.assertTrue(builtins_handler.ListOfBuiltinHandler.can_handle(lt7)) + + lt8 = [("ab", "ab"), "ab"] + self.assertTrue(builtins_handler.ListOfBuiltinHandler.can_handle(lt8)) + + lt9 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] + self.assertFalse(builtins_handler.ListOfBuiltinHandler.can_handle(lt9)) + def test_validate_list_builtins(self) -> None: - lt6 = ["Hello", [2, 3]] + lt1 = ["Hello", [2, 3]] with exception_utils.assert_snowml_exceptions( self, expected_original_error_type=ValueError, expected_regex="Inconsistent type of object found in data" ): - builtins_handler.ListOfBuiltinHandler.validate(lt6) # type:ignore[arg-type] + builtins_handler.ListOfBuiltinHandler.validate(lt1) # type:ignore[arg-type] - lt7 = [[1], [2, 3]] + lt2 = [[1], [2, 3]] with exception_utils.assert_snowml_exceptions( self, expected_original_error_type=ValueError, expected_regex="Ill-shaped list data" ): - builtins_handler.ListOfBuiltinHandler.validate(lt7) + builtins_handler.ListOfBuiltinHandler.validate(lt2) - lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] - self.assertFalse(builtins_handler.ListOfBuiltinHandler.can_handle(lt8)) + lt3 = [("ab", "ab"), "ab"] + with exception_utils.assert_snowml_exceptions( + self, expected_original_error_type=ValueError, expected_regex="Inconsistent type of object found in data" + ): + builtins_handler.ListOfBuiltinHandler.validate(lt3) def test_infer_signature_list_builtins(self) -> None: lt1 = [1, 2, 3, 4] diff --git a/snowflake/ml/model/_signatures/numpy_handler.py b/snowflake/ml/model/_signatures/numpy_handler.py index 73a13bf7..9144d08d 100644 --- a/snowflake/ml/model/_signatures/numpy_handler.py +++ b/snowflake/ml/model/_signatures/numpy_handler.py @@ -1,3 +1,4 @@ +from collections import abc from typing import List, Literal, Sequence import numpy as np @@ -80,7 +81,7 @@ def convert_to_df(data: model_types._SupportedNumpyArray, ensure_serializable: b class SeqOfNumpyArrayHandler(base_handler.BaseDataHandler[Sequence[model_types._SupportedNumpyArray]]): @staticmethod def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Sequence[model_types._SupportedNumpyArray]]: - if not isinstance(data, list): + if not isinstance(data, abc.Sequence): return False if len(data) == 0: return False diff --git a/snowflake/ml/model/_signatures/numpy_test.py b/snowflake/ml/model/_signatures/numpy_test.py index e0a1b904..e9614f75 100644 --- a/snowflake/ml/model/_signatures/numpy_test.py +++ b/snowflake/ml/model/_signatures/numpy_test.py @@ -105,9 +105,24 @@ def test_convert_to_df_numpy_array(self) -> None: class SeqOfNumpyArrayHandlerTest(absltest.TestCase): - def test_validate_list_of_numpy_array(self) -> None: - lt8 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] - self.assertFalse(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt8)) + def test_can_handle_list_of_numpy_array(self) -> None: + lt1 = [np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4])] + self.assertTrue(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt1)) + + lt2 = (np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4])) + 
self.assertTrue(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt2)) + + lt3 = (np.array([1, 2, 3, 4]), 3) + self.assertFalse(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt3)) + + lt4 = ({"a": np.array([1, 2, 3, 4])}, 3) + self.assertFalse(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt4)) + + lt5 = [np.array([1, 2, 3, 4]), 3] + self.assertFalse(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt5)) + + lt6 = [pd.DataFrame([1]), pd.DataFrame([2, 3])] + self.assertFalse(numpy_handler.SeqOfNumpyArrayHandler.can_handle(lt6)) def test_trunc_np_ndarray(self) -> None: arrs = [np.array([1] * (numpy_handler.SeqOfNumpyArrayHandler.SIG_INFER_ROWS_COUNT_LIMIT + 1))] * 2 diff --git a/snowflake/ml/model/_signatures/pytorch_handler.py b/snowflake/ml/model/_signatures/pytorch_handler.py index f81c917b..af8d9043 100644 --- a/snowflake/ml/model/_signatures/pytorch_handler.py +++ b/snowflake/ml/model/_signatures/pytorch_handler.py @@ -1,3 +1,4 @@ +from collections import abc from typing import TYPE_CHECKING, List, Literal, Optional, Sequence import numpy as np @@ -19,7 +20,7 @@ class SeqOfPyTorchTensorHandler(base_handler.BaseDataHandler[Sequence["torch.Tensor"]]): @staticmethod def can_handle(data: model_types.SupportedDataType) -> TypeGuard[Sequence["torch.Tensor"]]: - if not isinstance(data, list): + if not isinstance(data, abc.Sequence): return False if len(data) == 0: return False diff --git a/snowflake/ml/model/_signatures/pytorch_test.py b/snowflake/ml/model/_signatures/pytorch_test.py index c89cb30c..e875b477 100644 --- a/snowflake/ml/model/_signatures/pytorch_test.py +++ b/snowflake/ml/model/_signatures/pytorch_test.py @@ -8,6 +8,25 @@ class SeqOfPyTorchTensorHandlerTest(absltest.TestCase): + def test_can_handle_list_pytorch_tensor(self) -> None: + lt1 = [torch.Tensor([1, 2]), torch.Tensor([1, 2])] + self.assertTrue(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt1)) + + lt2 = (torch.Tensor([1, 2]), torch.Tensor([1, 2])) + self.assertTrue(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt2)) + + lt3 = (torch.Tensor([1, 2]), 3) + self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt3)) + + lt4 = ({"a": torch.Tensor([1, 2])}, 3) + self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt4)) + + lt5 = [torch.Tensor([1, 2]), 3] + self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt5)) + + lt6 = [np.array([1, 2, 3, 4]), torch.Tensor([1, 2])] + self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt6)) + def test_validate_list_of_pytorch_tensor(self) -> None: lt1 = [np.array([1, 4]), np.array([2, 3])] self.assertFalse(pytorch_handler.SeqOfPyTorchTensorHandler.can_handle(lt1)) diff --git a/snowflake/ml/model/_signatures/snowpark_handler.py b/snowflake/ml/model/_signatures/snowpark_handler.py index e7cd59cf..b656fe8f 100644 --- a/snowflake/ml/model/_signatures/snowpark_handler.py +++ b/snowflake/ml/model/_signatures/snowpark_handler.py @@ -99,7 +99,7 @@ def convert_to_df( @staticmethod def convert_from_df( - session: snowflake.snowpark.Session, df: pd.DataFrame, keep_order: bool = True + session: snowflake.snowpark.Session, df: pd.DataFrame, keep_order: bool = False ) -> snowflake.snowpark.DataFrame: # This method is necessary to create the Snowpark Dataframe in correct schema. # Snowpark ignore the schema argument when providing a pandas DataFrame. 
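The handler changes above all apply one pattern: `can_handle` now tests for `collections.abc.Sequence` instead of `list`, so tuples of arrays/tensors qualify as well, while empty sequences are still rejected; the builtins handler additionally excludes `str`, which is itself a `Sequence`. A minimal self-contained sketch of that shared shape, under the assumption that `can_handle_sequence` and `element_ok` are hypothetical names distilled from this diff rather than real library API:

    from collections import abc
    from typing import Any, Callable

    def can_handle_sequence(data: Any, element_ok: Callable[[Any], bool]) -> bool:
        # str is a Sequence too, but handlers treat strings as scalars, not containers.
        if not isinstance(data, abc.Sequence) or isinstance(data, str):
            return False
        if len(data) == 0:
            return False
        return all(element_ok(element) for element in data)

    # Tuples now qualify alongside lists, mirroring the lt2-style test cases above.
    assert can_handle_sequence((1.0, 2.0), lambda e: isinstance(e, float))
    assert not can_handle_sequence([], lambda e: True)
    assert not can_handle_sequence("abcd", lambda e: True)

In the real handlers, the element check is the per-type isinstance test (e.g. `np.ndarray`, `torch.Tensor`, `tf.Tensor`/`tf.Variable`), which is why a string input fails them even without an explicit `str` guard.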
diff --git a/snowflake/ml/model/_signatures/tensorflow_handler.py b/snowflake/ml/model/_signatures/tensorflow_handler.py index 49a8c953..a78c16e2 100644 --- a/snowflake/ml/model/_signatures/tensorflow_handler.py +++ b/snowflake/ml/model/_signatures/tensorflow_handler.py @@ -1,3 +1,4 @@ +from collections import abc from typing import TYPE_CHECKING, List, Literal, Optional, Sequence, Union import numpy as np @@ -23,7 +24,7 @@ class SeqOfTensorflowTensorHandler( def can_handle( data: model_types.SupportedDataType, ) -> TypeGuard[Sequence[Union["tensorflow.Tensor", "tensorflow.Variable"]]]: - if not isinstance(data, list): + if not isinstance(data, abc.Sequence): return False if len(data) == 0: return False diff --git a/snowflake/ml/model/_signatures/tensorflow_test.py b/snowflake/ml/model/_signatures/tensorflow_test.py index ca96f422..f990845d 100644 --- a/snowflake/ml/model/_signatures/tensorflow_test.py +++ b/snowflake/ml/model/_signatures/tensorflow_test.py @@ -8,6 +8,25 @@ class SeqOfTensorflowTensorHandlerTest(absltest.TestCase): + def test_can_handle_list_tf_tensor(self) -> None: + lt1 = [tf.constant([1, 2]), tf.constant([1, 2])] + self.assertTrue(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt1)) + + lt2 = (tf.constant([1, 2]), tf.Variable([1, 2])) + self.assertTrue(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt2)) + + lt3 = (tf.constant([1, 2]), 3) + self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt3)) + + lt4 = ({"a": tf.constant([1, 2])}, 3) + self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt4)) + + lt5 = [tf.constant([1, 2]), 3] + self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt5)) + + lt6 = [np.array([1, 2, 3, 4]), tf.constant([1, 2])] + self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt6)) + def test_validate_list_of_tf_tensor(self) -> None: lt1 = [np.array([1, 4]), np.array([2, 3])] self.assertFalse(tensorflow_handler.SeqOfTensorflowTensorHandler.can_handle(lt1)) diff --git a/snowflake/ml/model/_signatures/utils.py b/snowflake/ml/model/_signatures/utils.py index 2788acad..1335bbd0 100644 --- a/snowflake/ml/model/_signatures/utils.py +++ b/snowflake/ml/model/_signatures/utils.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence import numpy as np import numpy.typing as npt @@ -102,3 +102,189 @@ def rename_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec ) data.columns = pd.Index([feature.name for feature in features]) return data + + +def huggingface_pipeline_signature_auto_infer(task: str, params: Dict[str, Any]) -> Optional[core.ModelSignature]: + # Text + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.ConversationalPipeline + # Needs to convert to conversation object. 
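# (A hedged aside on this helper's overall contract, distilled from the branches
# that follow: given the pipeline `task` string and the pipeline's init `params`,
# it returns a prebuilt ModelSignature, or None for unrecognized tasks so the
# caller must supply signatures explicitly. Illustrative call, names as in this
# diff:
#   huggingface_pipeline_signature_auto_infer("fill-mask", params={})  # -> ModelSignature
#   huggingface_pipeline_signature_auto_infer("unknown-task", params={})  # -> None
# )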
+ if task == "conversational": + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="user_inputs", dtype=core.DataType.STRING, shape=(-1,)), + core.FeatureSpec(name="generated_responses", dtype=core.DataType.STRING, shape=(-1,)), + ], + outputs=[ + core.FeatureSpec(name="generated_responses", dtype=core.DataType.STRING, shape=(-1,)), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.FillMaskPipeline + if task == "fill-mask": + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="inputs", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="outputs", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TokenClassificationPipeline + if task == "ner" or task == "token-classification": + return core.ModelSignature( + inputs=[core.FeatureSpec(name="inputs", dtype=core.DataType.STRING)], + outputs=[ + core.FeatureSpec(name="outputs", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.QuestionAnsweringPipeline + if task == "question-answering": + # If top_k and topk are not set or are set to 1, the output is a dict per input, so it can be expanded. + if params.get("top_k", 1) == 1 and params.get("topk", 1) == 1: + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="question", dtype=core.DataType.STRING), + core.FeatureSpec(name="context", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="score", dtype=core.DataType.DOUBLE), + core.FeatureSpec(name="start", dtype=core.DataType.INT64), + core.FeatureSpec(name="end", dtype=core.DataType.INT64), + core.FeatureSpec(name="answer", dtype=core.DataType.STRING), + ], + ) + # Otherwise it is a list of dicts per input. + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="question", dtype=core.DataType.STRING), + core.FeatureSpec(name="context", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="outputs", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.SummarizationPipeline + if task == "summarization": + if params.get("return_tensors", False): + raise NotImplementedError( + f"Auto deployment for HuggingFace pipeline {task} " + "when `return_tensors` set to `True` has not been supported yet."
+ ) + # Always generate a dict per input + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="documents", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="summary_text", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TableQuestionAnsweringPipeline + if task == "table-question-answering": + # Always generate a dict per input + # Table is a JSON serialized string + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="query", dtype=core.DataType.STRING), + core.FeatureSpec(name="table", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="answer", dtype=core.DataType.STRING), + core.FeatureSpec(name="coordinates", dtype=core.DataType.INT64, shape=(-1,)), + core.FeatureSpec(name="cells", dtype=core.DataType.STRING, shape=(-1,)), + core.FeatureSpec(name="aggregator", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextClassificationPipeline + if task == "text-classification" or task == "sentiment-analysis": + # If top_k is set, return a list of dict per input + if params.get("top_k", None) is not None: + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="text", dtype=core.DataType.STRING), + core.FeatureSpec(name="text_pair", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="outputs", dtype=core.DataType.STRING), + ], + ) + # Else, return a dict per input + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="text", dtype=core.DataType.STRING), + core.FeatureSpec(name="text_pair", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="label", dtype=core.DataType.STRING), + core.FeatureSpec(name="score", dtype=core.DataType.DOUBLE), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextGenerationPipeline + if task == "text-generation": + if params.get("return_tensors", False): + raise NotImplementedError( + f"Auto deployment for HuggingFace pipeline {task} " + "when `return_tensors` set to `True` has not been supported yet." + ) + # Always generate a list of dict per input + return core.ModelSignature( + inputs=[core.FeatureSpec(name="inputs", dtype=core.DataType.STRING)], + outputs=[ + core.FeatureSpec(name="outputs", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.Text2TextGenerationPipeline + if task == "text2text-generation": + if params.get("return_tensors", False): + raise NotImplementedError( + f"Auto deployment for HuggingFace pipeline {task} " + "when `return_tensors` set to `True` has not been supported yet." + ) + # Always generate a dict per input + return core.ModelSignature( + inputs=[core.FeatureSpec(name="inputs", dtype=core.DataType.STRING)], + outputs=[ + core.FeatureSpec(name="generated_text", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TranslationPipeline + if task.startswith("translation"): + if params.get("return_tensors", False): + raise NotImplementedError( + f"Auto deployment for HuggingFace pipeline {task} " + "when `return_tensors` set to `True` has not been supported yet." 
+ ) + # Always generate a dict per input + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="inputs", dtype=core.DataType.STRING), + ], + outputs=[ + core.FeatureSpec(name="translation_text", dtype=core.DataType.STRING), + ], + ) + + # https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline + if task == "zero-shot-classification": + return core.ModelSignature( + inputs=[ + core.FeatureSpec(name="sequences", dtype=core.DataType.STRING), + core.FeatureSpec(name="candidate_labels", dtype=core.DataType.STRING, shape=(-1,)), + ], + outputs=[ + core.FeatureSpec(name="sequence", dtype=core.DataType.STRING), + core.FeatureSpec(name="labels", dtype=core.DataType.STRING, shape=(-1,)), + core.FeatureSpec(name="scores", dtype=core.DataType.DOUBLE, shape=(-1,)), + ], + ) + + return None diff --git a/snowflake/ml/model/models/BUILD.bazel b/snowflake/ml/model/models/BUILD.bazel new file mode 100644 index 00000000..25ef340d --- /dev/null +++ b/snowflake/ml/model/models/BUILD.bazel @@ -0,0 +1,8 @@ +load("//bazel:py_rules.bzl", "py_library") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "huggingface_pipeline", + srcs = ["huggingface_pipeline.py"], +) diff --git a/snowflake/ml/model/models/huggingface_pipeline.py b/snowflake/ml/model/models/huggingface_pipeline.py new file mode 100644 index 00000000..ce914f53 --- /dev/null +++ b/snowflake/ml/model/models/huggingface_pipeline.py @@ -0,0 +1,203 @@ +import warnings +from typing import Any, Dict, Optional + + +class HuggingFacePipelineModel: + def __init__( + self, + task: str = None, + model: Optional[str] = None, + *, + revision: Optional[str] = None, + token: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> None: + """ + Utility factory method to build a wrapper over transformers [`Pipeline`]. + When deploying, this wrapper will create a real pipeline object and loading tokenizers and models. + + For pipelines docs, please refer: + https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline + + Args: + task: The task that pipeline will be used. If None it would be inferred from model. + For available tasks, please refer Transformers's documentation. Defaults to None. + model: The model that will be used by the pipeline to make predictions. This can only be a model identifier + currently. If not provided, the default for the `task` will be loaded. Defaults to None. + revision: When passing a task name or a string model identifier: The specific model version to use. It can + be a branch name, a tag name, or a commit id, since we use a git-based system for storing models and + other artifacts on huggingface.co, so `revision` can be any identifier allowed by git. Defaults to None. + token: The token to use as HTTP bearer authorization for remote files. Defaults to None. + trust_remote_code: Whether or not to allow for custom code defined on the Hub in their own modeling, + configuration, tokenization or even pipeline files. This option should only be set to `True` for + repositories you trust and in which you have read the code, as it will execute code present on the Hub. + Defaults to None. + model_kwargs: Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,`. + Defaults to None. 
+ kwargs: Additional keyword arguments passed along to the specific pipeline init (see the documentation for + the corresponding pipeline class for possible values). + + Return: + A wrapper over transformers [`Pipeline`]. + + Raises: + RuntimeError: Raised when the input argument cannot determine the pipeline. + ValueError: Raised when the pipeline contains remote code but trust_remote_code is not set or False. + ValueError: Raised when having conflicting arguments. + """ + import transformers + + config = kwargs.get("config", None) + tokenizer = kwargs.get("tokenizer", None) + framework = kwargs.get("framework", None) + feature_extractor = kwargs.get("feature_extractor", None) + + # ==== Start pipeline logic from transformers ==== + if model_kwargs is None: + model_kwargs = {} + + use_auth_token = model_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + hub_kwargs = { + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, + "_commit_hash": None, + } + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model being specified. " + ) + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided" + " tokenizer may not be compatible with the default model. Please provide an identifier to a pretrained" + " model when providing tokenizer." + ) + if model is None and feature_extractor is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with feature_extractor specified but not the model as the" + "provided feature_extractor may not be compatible with the default model. Please provide an identifier" + " to a pretrained model when providing feature_extractor." + ) + + # ==== End pipeline logic from transformers ==== + + # We only support string as model argument. + + if model is not None and not isinstance(model, str): + raise RuntimeError( + "Impossible to use non-string model as input for HuggingFacePipelineModel. Use transformers.Pipeline" + " object if required." + ) + + # ==== Start pipeline logic (Config) from transformers ==== + + # Config is the primordial information item. + # Instantiate config if needed + if isinstance(config, str): + config_obj = transformers.AutoConfig.from_pretrained( + config, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config_obj._commit_hash + elif config is None: + config_obj = transformers.AutoConfig.from_pretrained( + model, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config_obj._commit_hash + # We only support string as config argument. + else: + raise RuntimeError( + "Impossible to use non-string config as input for HuggingFacePipelineModel. Use transformers.Pipeline" + " object if required." 
+ ) + + # ==== Start pipeline logic (Task) from transformers ==== + + custom_tasks = {} + if config_obj is not None and len(getattr(config_obj, "custom_pipelines", {})) > 0: + custom_tasks = config_obj.custom_pipelines + if task is None and trust_remote_code is not False: + if len(custom_tasks) == 1: + task = list(custom_tasks.keys())[0] + else: + raise RuntimeError( + "We can't infer the task automatically for this model as there are multiple tasks available. " + f"Pick one in {', '.join(custom_tasks.keys())}" + ) + + if task is None and model is not None: + if not isinstance(model, str): + raise RuntimeError( + "Inferring the task automatically requires to check the hub with a model_id defined as a `str`." + f"{model} is not a valid model_id." + ) + task = transformers.pipelines.get_task(model, token) + + # Retrieve the task + if task in custom_tasks: + normalized_task = task + targeted_task, task_options = transformers.pipelines.clean_custom_task(custom_tasks[task]) + if not trust_remote_code: + raise ValueError( + "Loading this pipeline requires you to execute the code in the pipeline file in that" + " repo on your local machine. Make sure you have read the code there to avoid malicious use, then" + " set the option `trust_remote_code=True` to remove this error." + ) + else: + normalized_task, targeted_task, task_options = transformers.pipelines.check_task(task) + + # ==== Start pipeline logic (Model) from transformers ==== + + # Use default model/config/tokenizer for the task if no model is provided + if model is None: + # At that point framework might still be undetermined + model, default_revision = transformers.pipelines.get_default_model_and_revision( + targeted_task, framework, task_options + ) + revision = revision if revision is not None else default_revision + warnings.warn( + f"No model was supplied, defaulted to {model} and revision" + f" {revision} ({transformers.pipelines.HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n" + "Using a pipeline without specifying a model name and revision in production is not recommended." + ) + if config is None and isinstance(model, str): + config_obj = transformers.AutoConfig.from_pretrained( + model, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config_obj._commit_hash + + if kwargs.get("device_map", None) is not None: + if "device_map" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those' + " arguments might conflict, use only one.)" + ) + if kwargs.get("device", None) is not None: + warnings.warn( + "Both `device` and `device_map` are specified. `device` will override `device_map`. You" + " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`." 
+ ) + + # ==== End pipeline logic from transformers ==== + + self.task = normalized_task + self.model = model + self.revision = revision + self.token = token + self.trust_remote_code = trust_remote_code + self.model_kwargs = model_kwargs + self.__dict__.update(kwargs) diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index 2ec25b49..a14b965a 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Sequence, TypedDict, TypeVar, Union import numpy.typing as npt -from typing_extensions import NotRequired, TypeAlias +from typing_extensions import NotRequired if TYPE_CHECKING: import mlflow @@ -12,9 +12,11 @@ import sklearn.pipeline import tensorflow import torch + import transformers import xgboost import snowflake.ml.model.custom_model + import snowflake.ml.model.models.huggingface_pipeline import snowflake.snowpark from snowflake.ml.modeling.framework import base # noqa: F401 @@ -60,7 +62,12 @@ "tensorflow.Module", ] -SupportedNoSignatureRequirementsModelType: TypeAlias = Union["base.BaseEstimator", "mlflow.pyfunc.PyFuncModel"] +SupportedNoSignatureRequirementsModelType = Union[ + "base.BaseEstimator", + "mlflow.pyfunc.PyFuncModel", + "transformers.Pipeline", + "snowflake.ml.model.models.huggingface_pipeline.HuggingFacePipelineModel", +] SupportedModelType = Union[ SupportedRequireSignatureModelType, @@ -80,6 +87,9 @@ | torch.nn.Module | pytroch.py | _PyTorchHandler | | torch.jit.ScriptModule | torchscript.py | _TorchScriptHandler | | tensorflow.Module | tensorflow.py | _TensorFlowHandler | +| mlflow.pyfunc.PyFuncModel | mlflow.py | _MLFlowHandler | +| transformers.Pipeline | huggingface_pipeline.py | _HuggingFacePipelineHandler | +| huggingface_pipeline.HuggingFacePipelineModel | huggingface_pipeline.py | _HuggingFacePipelineHandler | """ @@ -87,16 +97,9 @@ class DeployOptions(TypedDict): - """Common Options for deploying to Snowflake. - - keep_order: Whether or not preserve the row order when predicting. Only available for dataframe has fewer than 2**64 - rows. Defaults to True. - output_with_input_features: Whether or not preserve the input columns in the output when predicting. - Defaults to False. - """ + """Common Options for deploying to Snowflake.""" - keep_order: NotRequired[bool] - output_with_input_features: NotRequired[bool] + ... class WarehouseDeployOptions(DeployOptions): @@ -126,8 +129,6 @@ class SnowparkContainerServiceDeployOptions(DeployOptions): inferred based on session information. min_instances: Minimum number of service replicas. Default to 1. max_instances: Maximum number of service replicas. Default to 1. - endpoint: The specific name of the endpoint that the service function will communicate with. This option is - useful when the service has multiple endpoints. Default to “predict”. prebuilt_snowflake_image: When provided, the image-building step is skipped, and the pre-built image from Snowflake is used as is. This option is for users who consistently use the same image for multiple use cases, allowing faster deployment. The snowflake image used for deployment is logged to the console for @@ -136,16 +137,17 @@ class SnowparkContainerServiceDeployOptions(DeployOptions): num_workers: Number of workers used for model inference. Please ensure that the number of workers is set lower than the total available memory divided by the size of model to prevent memory-related issues. Default is number of CPU cores * 2 + 1. 
+ enable_remote_image_build: When set to True, will enable image build on a remote SnowService job. Default is False. """ compute_pool: str image_repo: NotRequired[str] min_instances: NotRequired[int] max_instances: NotRequired[int] - endpoint: NotRequired[str] prebuilt_snowflake_image: NotRequired[str] num_gpus: NotRequired[int] num_workers: NotRequired[int] + enable_remote_image_build: NotRequired[bool] class BaseModelSaveOption(TypedDict): @@ -161,7 +163,7 @@ class BaseModelSaveOption(TypedDict): class CustomModelSaveOption(BaseModelSaveOption): - ... + cuda_version: NotRequired[str] class SKLModelSaveOptions(BaseModelSaveOption): @@ -170,6 +172,7 @@ class SKLModelSaveOptions(BaseModelSaveOption): class XGBModelSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] + cuda_version: NotRequired[str] class SNOWModelSaveOptions(BaseModelSaveOption): @@ -178,14 +181,17 @@ class SNOWModelSaveOptions(BaseModelSaveOption): class PyTorchSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] + cuda_version: NotRequired[str] class TorchScriptSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] + cuda_version: NotRequired[str] class TensorflowSaveOptions(BaseModelSaveOption): target_methods: NotRequired[Sequence[str]] + cuda_version: NotRequired[str] class MLFlowSaveOptions(BaseModelSaveOption): @@ -194,6 +200,12 @@ class MLFlowSaveOptions(BaseModelSaveOption): ignore_mlflow_dependencies: NotRequired[bool] +class HuggingFaceSaveOptions(BaseModelSaveOption): + target_methods: NotRequired[Sequence[str]] + cuda_version: NotRequired[str] + accelerate_mix_precision_config: NotRequired[str] + + ModelSaveOption = Union[ BaseModelSaveOption, CustomModelSaveOption, @@ -204,4 +216,14 @@ class MLFlowSaveOptions(BaseModelSaveOption): TorchScriptSaveOptions, TensorflowSaveOptions, MLFlowSaveOptions, + HuggingFaceSaveOptions, ] + + +class ModelLoadOption(TypedDict): + """Options for loading the model. + + use_gpu: Enable GPU-specific loading logic. + """ + + use_gpu: NotRequired[bool] diff --git a/snowflake/ml/modeling/framework/base.py b/snowflake/ml/modeling/framework/base.py index c54f1adf..323f64d9 100644 --- a/snowflake/ml/modeling/framework/base.py +++ b/snowflake/ml/modeling/framework/base.py @@ -410,10 +410,6 @@ def _use_input_cols_only(self, dataset: pd.DataFrame) -> pd.DataFrame: ) return dataset[self.input_cols] - @telemetry.send_api_usage_telemetry( - project=PROJECT, - subproject=SUBPROJECT, - ) def _compute( self, dataset: snowpark.DataFrame, cols: List[str], states: List[str] ) -> Dict[str, Dict[str, Union[int, float, str]]]: diff --git a/snowflake/ml/modeling/impute/simple_imputer.py b/snowflake/ml/modeling/impute/simple_imputer.py index 793e0378..d29daa7d 100644 --- a/snowflake/ml/modeling/impute/simple_imputer.py +++ b/snowflake/ml/modeling/impute/simple_imputer.py @@ -6,7 +6,7 @@ from typing import Any, Dict, Iterable, Optional, Type, Union import numpy as np -import numpy._typing as _npt +import numpy.typing as npt import pandas as pd from sklearn import impute @@ -26,7 +26,7 @@ "most_frequent": _utils.BasicStatistics.MODE, } -SNOWFLAKE_DATATYPE_TO_NUMPY_DTYPE_MAP: Dict[Type[T.DataType], _npt._DType[Any]] = { +SNOWFLAKE_DATATYPE_TO_NUMPY_DTYPE_MAP: Dict[Type[T.DataType], npt.DTypeLike] = { T.ByteType: np.dtype("int8"), T.ShortType: np.dtype("int16"), T.IntegerType: np.dtype("int32"), @@ -288,7 +288,7 @@ def fit(self, dataset: snowpark.DataFrame) -> "SimpleImputer": # This attribute is set during `fit` by sklearn objects. 
In order to avoid fitting # the sklearn object directly when creating the sklearn simple imputer, we have to # set this property. - self._sklearn_fit_dtype = max( + self._sklearn_fit_dtype = max( # type:ignore[type-var] SNOWFLAKE_DATATYPE_TO_NUMPY_DTYPE_MAP[type(input_col_datatypes[input_col])] for input_col in self.input_cols ) diff --git a/snowflake/ml/modeling/metrics/BUILD.bazel b/snowflake/ml/modeling/metrics/BUILD.bazel index ff1670d3..e2b2d504 100644 --- a/snowflake/ml/modeling/metrics/BUILD.bazel +++ b/snowflake/ml/modeling/metrics/BUILD.bazel @@ -47,6 +47,7 @@ py_library( deps = [ ":init", ":metrics_utils", + "//snowflake/ml/_internal/utils:result", "//snowflake/ml/_internal:telemetry", ], ) @@ -59,6 +60,7 @@ py_library( deps = [ ":init", ":metrics_utils", + "//snowflake/ml/_internal/utils:result", "//snowflake/ml/_internal:telemetry", ], ) @@ -100,6 +102,7 @@ py_library( ], deps = [ "//snowflake/ml/_internal:init_utils", + "//snowflake/ml/_internal/utils:result", ], ) diff --git a/snowflake/ml/modeling/metrics/__init__.py b/snowflake/ml/modeling/metrics/__init__.py index 976948b7..376fd3db 100644 --- a/snowflake/ml/modeling/metrics/__init__.py +++ b/snowflake/ml/modeling/metrics/__init__.py @@ -1,9 +1,16 @@ import os +import cloudpickle + from snowflake.ml._internal import init_utils +from snowflake.ml._internal.utils import result pkg_dir = os.path.dirname(os.path.abspath(__file__)) pkg_name = __name__ exportable_functions = init_utils.fetch_functions_from_modules_in_pkg_dir(pkg_dir=pkg_dir, pkg_name=pkg_name) for k, v in exportable_functions.items(): globals()[k] = v + +registered_modules = cloudpickle.list_registry_pickle_by_value() +if result not in registered_modules: + cloudpickle.register_pickle_by_value(result) diff --git a/snowflake/ml/modeling/metrics/classification.py b/snowflake/ml/modeling/metrics/classification.py index e21dae7f..9e3a0338 100644 --- a/snowflake/ml/modeling/metrics/classification.py +++ b/snowflake/ml/modeling/metrics/classification.py @@ -13,7 +13,11 @@ from snowflake.ml._internal import telemetry from snowflake.ml.modeling.metrics import metrics_utils from snowflake.snowpark import functions as F, types as T -from snowflake.snowpark._internal import utils as snowpark_utils +from snowflake.snowpark._internal.utils import ( + TempObjectType, + generate_random_alphanumeric, + random_name_for_temp_object, +) _PROJECT = "ModelDevelopment" _SUBPROJECT = "Metrics" @@ -140,7 +144,7 @@ def confusion_matrix( elif df[[y_true_col_name]].join(label_df, df[y_true_col_name] == label_df[metrics_utils.LABEL]).count() == 0: raise ValueError("At least one label specified must be in the y true column") - rand = snowpark_utils.generate_random_alphanumeric() + rand = generate_random_alphanumeric() if sample_weight_col_name is None: sample_weight_col_name = f'"_SAMPLE_WEIGHT_{rand}"' df = df.with_column(sample_weight_col_name, F.lit(1)) @@ -264,6 +268,7 @@ def update_confusion_matrix(self) -> None: self._batched_rows[:, 0], ) + # TODO(SNANDAMURI): Should we convert it to temp anonymous UDTF for it to work in Sproc? confusion_matrix_computer = "ConfusionMatrixComputer_{}".format(str(uuid.uuid4()).replace("-", "_").upper()) session.udtf.register( ConfusionMatrixComputer, @@ -328,24 +333,24 @@ def f1_score( This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. 
Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: + ``'binary'`` Only report results for the class specified by ``pos_label``. This is applicable only if targets (y true, y pred) are binary. - ``'micro'``: + ``'micro'`` Calculate metrics globally by counting the total true positives, false negatives and false positives. - ``'macro'``: + ``'macro'`` Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - ``'weighted'``: + ``'weighted'`` Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. - ``'samples'``: + ``'samples'`` Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + `accuracy_score`). sample_weight_col_name: Column name representing sample weights. zero_division: "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division, i.e. when all @@ -353,7 +358,7 @@ def f1_score( but warnings are also raised. Returns: - f1_score: float or array of float, shape = [n_unique_labels] + f1_score - float or array of float, shape = [n_unique_labels] F1 score of the positive class in binary classification or weighted average of the F1 scores of each class for the multiclass task. """ @@ -414,24 +419,24 @@ def fbeta_score( This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: + ``'binary'`` Only report results for the class specified by ``pos_label``. This is applicable only if targets (y true, y pred) are binary. - ``'micro'``: + ``'micro'`` Calculate metrics globally by counting the total true positives, false negatives and false positives. - ``'macro'``: + ``'macro'`` Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - ``'weighted'``: + ``'weighted'`` Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. - ``'samples'``: + ``'samples'`` Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + `accuracy_score`). sample_weight_col_name: Column name representing sample weights. zero_division: "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division, i.e. when all @@ -439,7 +444,7 @@ def fbeta_score( but warnings are also raised. Returns: - fbeta_score: float (if average is not None) or array of float, shape = [n_unique_labels] + fbeta_score - float (if average is not None) or array of float, shape = [n_unique_labels] F-beta score of the positive class in binary classification or weighted average of the F-beta score of each class for the multiclass task.
""" @@ -508,13 +513,14 @@ def log_loss( """ session = df._session assert session is not None - sproc_name = f"log_loss_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -524,8 +530,9 @@ def log_loss( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def log_loss_sproc(session: snowpark.Session) -> float: + def log_loss_anon_sproc(session: snowpark.Session) -> float: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -541,7 +548,7 @@ def log_loss_sproc(session: snowpark.Session) -> float: labels=labels, ) - loss: float = session.call(sproc_name, statement_params=statement_params) + loss: float = log_loss_anon_sproc(session) return loss @@ -606,21 +613,21 @@ def precision_recall_fscore_support( average: {'binary', 'micro', 'macro', 'samples', 'weighted'}, default=None If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: + ``'binary'`` Only report results for the class specified by ``pos_label``. This is applicable only if targets (y true, y pred) are binary. - ``'micro'``: + ``'micro'`` Calculate metrics globally by counting the total true positives, false negatives and false positives. - ``'macro'``: + ``'macro'`` Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - ``'weighted'``: + ``'weighted'`` Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. - ``'samples'``: + ``'samples'`` Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). @@ -629,32 +636,34 @@ def precision_recall_fscore_support( sample_weight_col_name: Column name representing sample weights. zero_division: "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division: - - recall: when there are no positive labels - - precision: when there are no positive predictions - - f-score: both + * recall - when there are no positive labels + * precision - when there are no positive predictions + * f-score - both If set to "warn", this acts as 0, but warnings are also raised. Returns: - precision: float (if average is not None) or array of float, shape = [n_unique_labels] - Precision score. - recall: float (if average is not None) or array of float, shape = [n_unique_labels] - Recall score. - fbeta_score: float (if average is not None) or array of float, shape = [n_unique_labels] - F-beta score. - support: None (if average is not None) or array of int, shape = [n_unique_labels] - The number of occurrences of each label in the y true column(s). + Tuple containing following items + precision - float (if average is not None) or array of float, shape = [n_unique_labels] + Precision score. 
+ recall - float (if average is not None) or array of float, shape = [n_unique_labels] + Recall score. + fbeta_score - float (if average is not None) or array of float, shape = [n_unique_labels] + F-beta score. + support - None (if average is not None) or array of int, shape = [n_unique_labels] + The number of occurrences of each label in the y true column(s). """ metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names) session = df._session assert session is not None - sproc_name = f"precision_recall_fscore_support_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -664,8 +673,9 @@ def precision_recall_fscore_support( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def precision_recall_fscore_support_sproc(session: snowpark.Session) -> bytes: + def precision_recall_fscore_support_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -693,7 +703,7 @@ def precision_recall_fscore_support_sproc(session: snowpark.Session) -> bytes: return cloudpickle.dumps((p, r, f, s, warning)) # type: ignore[no-any-return] - loaded_data = cloudpickle.loads(session.call(sproc_name, statement_params=statement_params)) + loaded_data = cloudpickle.loads(precision_recall_fscore_support_anon_sproc(session)) res: Union[ Tuple[float, float, float, None], Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]], @@ -744,31 +754,31 @@ def precision_score( average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: + ``'binary'`` Only report results for the class specified by ``pos_label``. This is applicable only if targets (y true, y pred) are binary. - ``'micro'``: + ``'micro'`` Calculate metrics globally by counting the total true positives, false negatives and false positives. - ``'macro'``: + ``'macro'`` Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - ``'weighted'``: + ``'weighted'`` Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. - ``'samples'``: + ``'samples'`` Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + `accuracy_score`). sample_weight_col_name: Column name representing sample weights. zero_division: "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division. If set to "warn", this acts as 0, but warnings are also raised.
Returns: - precision: float (if average is not None) or array of float, shape = (n_unique_labels,) + precision - float (if average is not None) or array of float, shape = (n_unique_labels,) Precision of the positive class in binary classification or weighted average of the precision of each class for the multiclass task. """ @@ -826,32 +836,32 @@ def recall_score( This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - ``'binary'``: + ``'binary'`` Only report results for the class specified by ``pos_label``. This is applicable only if targets (y true, y pred) are binary. - ``'micro'``: + ``'micro'`` Calculate metrics globally by counting the total true positives, false negatives and false positives. - ``'macro'``: + ``'macro'`` Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - ``'weighted'``: + ``'weighted'`` Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. Weighted recall is equal to accuracy. - ``'samples'``: + ``'samples'`` Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from - :func:`accuracy_score`). + `accuracy_score`). sample_weight_col_name: Column name representing sample weights. zero_division: "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division. If set to "warn", this acts as 0, but warnings are also raised. Returns: - recall: float (if average is not None) or array of float of shape (n_unique_labels,) + recall - float (if average is not None) or array of float of shape (n_unique_labels,) Recall of the positive class in binary classification or weighted average of the recall of each class for the multiclass task.
""" diff --git a/snowflake/ml/modeling/metrics/metrics_utils.py b/snowflake/ml/modeling/metrics/metrics_utils.py index 76d5dfa6..b390bd08 100644 --- a/snowflake/ml/modeling/metrics/metrics_utils.py +++ b/snowflake/ml/modeling/metrics/metrics_utils.py @@ -8,6 +8,7 @@ import cloudpickle import numpy as np +import snowflake.snowpark._internal.utils as snowpark_utils from snowflake import snowpark from snowflake.snowpark import Session, functions as F, types as T @@ -159,7 +160,10 @@ def accumulate_batch_sum_and_dot_prod(self) -> None: rows_by_count_d = self._batched_rows / (self._count - self._ddof) self._sum_by_countd += np.sum(rows_by_count_d[0 : self._cur_count, :], axis=0) - sharded_dot_and_sum_computer = "ShardedDotAndSumComputer_{}".format(str(uuid4()).replace("-", "_").upper()) + sharded_dot_and_sum_computer = snowpark_utils.random_name_for_temp_object( + snowpark_utils.TempObjectType.TABLE_FUNCTION + ) + # TODO (SNOW-897239): make this an anonymous temp UDTF for it to work in a SPROC session.udtf.register( ShardedDotAndSumComputer, output_schema=T.StructType( diff --git a/snowflake/ml/modeling/metrics/ranking.py b/snowflake/ml/modeling/metrics/ranking.py index e3c7fb2f..c401e899 100644 --- a/snowflake/ml/modeling/metrics/ranking.py +++ b/snowflake/ml/modeling/metrics/ranking.py @@ -9,6 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.utils import result from snowflake.ml.modeling.metrics import metrics_utils from snowflake.snowpark import functions as F from snowflake.snowpark._internal import utils as snowpark_utils @@ -62,25 +63,28 @@ def precision_recall_curve( sample_weight_col_name: Column name representing sample weights. Returns: - precision: ndarray of shape (n_thresholds + 1,) - Precision values such that element i is the precision of - predictions with score >= thresholds[i] and the last element is 1. - recall: ndarray of shape (n_thresholds + 1,) - Decreasing recall values such that element i is the recall of - predictions with score >= thresholds[i] and the last element is 0. - thresholds: ndarray of shape (n_thresholds,) - Increasing thresholds on the decision function used to compute - precision and recall. + Tuple containing following items + precision - ndarray of shape (n_thresholds + 1,) + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + recall - ndarray of shape (n_thresholds + 1,) + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. + thresholds - ndarray of shape (n_thresholds,) + Increasing thresholds on the decision function used to compute + precision and recall. 
""" session = df._session assert session is not None - sproc_name = f"precision_recall_curve_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_name, probas_pred_col_name, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_snowflake_result = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -90,8 +94,9 @@ def precision_recall_curve( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def precision_recall_curve_sproc(session: snowpark.Session) -> bytes: + def precision_recall_curve_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -104,11 +109,17 @@ def precision_recall_curve_sproc(session: snowpark.Session) -> bytes: pos_label=pos_label, sample_weight=sample_weight, ) + result_module = cloudpickle.loads(pickled_snowflake_result) + result_object = result_module.SnowflakeResult(session, (precision, recall, thresholds)) - return cloudpickle.dumps((precision, recall, thresholds)) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - loaded_data = cloudpickle.loads(session.call(sproc_name, statement_params=statement_params)) - res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = loaded_data + sproc_result = precision_recall_curve_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object return res @@ -164,16 +175,16 @@ class scores must correspond to the order of ``labels``, 'weighted' averages. For multiclass targets, `average=None` is only implemented for `multi_class='ovr'` and `average='micro'` is only implemented for `multi_class='ovr'`. - ``'micro'``: + ``'micro'`` Calculate metrics globally by considering each element of the label indicator matrix as a label. - ``'macro'``: + ``'macro'`` Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - ``'weighted'``: + ``'weighted'`` Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). - ``'samples'``: + ``'samples'`` Calculate metrics for each instance, and find their average. Will be ignored when ``y_true`` is binary. sample_weight_col_name: Column name representing sample weights. @@ -186,14 +197,14 @@ class scores must correspond to the order of ``labels``, Only used for multiclass targets. Determines the type of configuration to use. The default value raises an error, so either ``'ovr'`` or ``'ovo'`` must be passed explicitly. - ``'ovr'``: + ``'ovr'`` Stands for One-vs-rest. Computes the AUC of each class against the rest [3]_ [4]_. This treats the multiclass case in the same way as the multilabel case. 
+ Sensitive to class imbalance even when ``average == 'macro'``, because class imbalance affects the composition of each of the 'rest' groupings. - ``'ovo'``: + ``'ovo'`` Stands for One-vs-one. Computes the average AUC of all possible pairwise combinations of classes [5]_. Insensitive to class imbalance when @@ -203,17 +214,19 @@ class scores must correspond to the order of ``labels``, order of the labels in ``y_true`` is used. Returns: - auc: Area Under the Curve score. + Area Under the Curve score. """ session = df._session assert session is not None - sproc_name = f"roc_auc_score_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_score_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_snowflake_result = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -223,8 +236,9 @@ class scores must correspond to the order of ``labels``, "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def roc_auc_score_sproc(session: snowpark.Session) -> bytes: + def roc_auc_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -240,12 +254,17 @@ def roc_auc_score_sproc(session: snowpark.Session) -> bytes: multi_class=multi_class, labels=labels, ) + result_module = cloudpickle.loads(pickled_snowflake_result) + result_object = result_module.SnowflakeResult(session, auc) - return cloudpickle.dumps(auc) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - auc: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, statement_params=statement_params) - ) + sproc_result = roc_auc_score_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + auc: Union[float, npt.NDArray[np.float_]] = result_object return auc @@ -282,26 +301,29 @@ def roc_curve( lighter ROC curves. Returns: - fpr: ndarray of shape (>2,) - Increasing false positive rates such that element i is the false - positive rate of predictions with score >= `thresholds[i]`. - tpr: ndarray of shape (>2,) - Increasing true positive rates such that element `i` is the true - positive rate of predictions with score >= `thresholds[i]`. - thresholds: ndarray of shape = (n_thresholds,) - Decreasing thresholds on the decision function used to compute - fpr and tpr. `thresholds[0]` represents no instances being predicted - and is arbitrarily set to `max(y_score) + 1`. + Tuple containing the following items + fpr - ndarray of shape (>2,) + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= `thresholds[i]`. + tpr - ndarray of shape (>2,) + Increasing true positive rates such that element `i` is the true + positive rate of predictions with score >= `thresholds[i]`.
+ thresholds - ndarray of shape = (n_thresholds,) + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. """ session = df._session assert session is not None - sproc_name = f"roc_curve_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_name, y_score_col_name, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_snowflake_result = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -311,8 +333,9 @@ def roc_curve( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def roc_curve_sproc(session: snowpark.Session) -> bytes: + def roc_curve_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -327,8 +350,16 @@ def roc_curve_sproc(session: snowpark.Session) -> bytes: drop_intermediate=drop_intermediate, ) - return cloudpickle.dumps((fpr, tpr, thresholds)) # type: ignore[no-any-return] + result_module = cloudpickle.loads(pickled_snowflake_result) + result_object = result_module.SnowflakeResult(session, (fpr, tpr, thresholds)) + + return result_object.serialize() # type: ignore[no-any-return] + + sproc_result = roc_curve_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = result_object - loaded_data = cloudpickle.loads(session.call(sproc_name, statement_params=statement_params)) - res: Tuple[npt.NDArray[np.float_], npt.NDArray[np.float_], npt.NDArray[np.float_]] = loaded_data return res diff --git a/snowflake/ml/modeling/metrics/regression.py b/snowflake/ml/modeling/metrics/regression.py index 8a45757a..472ab884 100644 --- a/snowflake/ml/modeling/metrics/regression.py +++ b/snowflake/ml/modeling/metrics/regression.py @@ -12,11 +12,12 @@ from packaging import version from sklearn import metrics +import snowflake.snowpark._internal.utils as snowpark_utils from snowflake import snowpark from snowflake.ml._internal import telemetry +from snowflake.ml._internal.utils import result from snowflake.ml.modeling.metrics import metrics_utils from snowflake.snowpark import functions as F -from snowflake.snowpark._internal import utils as snowpark_utils _PROJECT = "ModelDevelopment" _SUBPROJECT = "Metrics" @@ -63,13 +64,15 @@ def d2_absolute_error_score( session = df._session assert session is not None - sproc_name = f"d2_absolute_error_score_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] 
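All of the regression metrics converted below share the same round trip: the client pickles the `result` helper module by value (it is registered with cloudpickle in the metrics `__init__.py` above, presumably so the sproc does not depend on the server-side package shipping it), the anonymous temp sproc rebuilds the module, wraps the sklearn output in a `SnowflakeResult`, and returns its serialized bytes; the client then unpickles a `(payload, filepath)` pair and fetches staged bytes when the payload was too large to return inline, which is what lets metric results exceed the old 8MB limit. The class below is a minimal sketch of that contract inferred from these call sites, not the actual `snowflake.ml._internal.utils.result` implementation; the size threshold and stage path are assumptions.

```python
import io

import cloudpickle
from snowflake.snowpark import Session

_MAX_INLINE_BYTES = 5 * 1024 * 1024  # assumed margin under the sproc return-size limit


class SketchResult:
    """Illustrative stand-in for result.SnowflakeResult; not the real class."""

    def __init__(self, session: Session, payload: object) -> None:
        self._session = session
        self._payload = payload

    def serialize(self) -> bytes:
        # Small results travel inline as (payload, None); large ones are written
        # to a stage and only a (None, filepath) pointer crosses the sproc boundary.
        blob = cloudpickle.dumps(self._payload)
        if len(blob) <= _MAX_INLINE_BYTES:
            return cloudpickle.dumps((self._payload, None))
        filepath = "@~/metric_results/result.pkl"  # hypothetical stage location
        self._session.file.put_stream(io.BytesIO(blob), filepath, auto_compress=False)
        return cloudpickle.dumps((None, filepath))

    @staticmethod
    def load_result_from_filepath(session: Session, filepath: str) -> object:
        # Client side: pull the staged bytes back down and unpickle the payload.
        with session.file.get_stream(filepath) as stream:
            return cloudpickle.loads(stream.read())
```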
+ pickled_snowflake_result = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -79,8 +82,9 @@ def d2_absolute_error_score( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def d2_absolute_error_score_sproc(session: snowpark.Session) -> bytes: + def d2_absolute_error_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -94,12 +98,17 @@ def d2_absolute_error_score_sproc(session: snowpark.Session) -> bytes: sample_weight=sample_weight, multioutput=multioutput, ) + result_module = cloudpickle.loads(pickled_snowflake_result) + result_object = result_module.SnowflakeResult(session, score) - return cloudpickle.dumps(score) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - score: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, statement_params=statement_params) - ) + sproc_result = d2_absolute_error_score_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + score: Union[float, npt.NDArray[np.float_]] = result_object return score @@ -147,13 +156,15 @@ def d2_pinball_score( session = df._session assert session is not None - sproc_name = f"d2_pinball_score_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -163,8 +174,9 @@ def d2_pinball_score( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def d2_pinball_score_sproc(session: snowpark.Session) -> bytes: + def d2_pinball_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -179,12 +191,17 @@ def d2_pinball_score_sproc(session: snowpark.Session) -> bytes: alpha=alpha, multioutput=multioutput, ) + result_module = cloudpickle.loads(pickled_result_module) + result_object = result_module.SnowflakeResult(session, score) - return cloudpickle.dumps(score) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - score: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, statement_params=statement_params) - ) + sproc_result = d2_pinball_score_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + score: Union[float, npt.NDArray[np.float_]] = result_object return score @@ -248,13 +265,15 @@ def explained_variance_score( session = 
df._session assert session is not None - sproc_name = f"explained_variance_score_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -264,8 +283,9 @@ def explained_variance_score( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def explained_variance_score_sproc(session: snowpark.Session) -> bytes: + def explained_variance_score_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -280,12 +300,17 @@ def explained_variance_score_sproc(session: snowpark.Session) -> bytes: multioutput=multioutput, force_finite=force_finite, ) + result_module = cloudpickle.loads(pickled_result_module) + result_object = result_module.SnowflakeResult(session, score) - return cloudpickle.dumps(score) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - score: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, statement_params=statement_params) - ) + sproc_result = explained_variance_score_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + score: Union[float, npt.NDArray[np.float_]] = result_object return score @@ -328,13 +353,15 @@ def mean_absolute_error( session = df._session assert session is not None - sproc_name = f"mean_absolute_error_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -344,8 +371,9 @@ def mean_absolute_error( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def mean_absolute_error_sproc(session: snowpark.Session) -> bytes: + def mean_absolute_error_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -360,11 +388,17 @@ def mean_absolute_error_sproc(session: snowpark.Session) -> bytes: multioutput=multioutput, ) - return cloudpickle.dumps(loss) # type: ignore[no-any-return] + result_module = cloudpickle.loads(pickled_result_module) + result_object = result_module.SnowflakeResult(session, loss) - loss: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, 
statement_params=statement_params) - ) + return result_object.serialize() # type: ignore[no-any-return] + + sproc_result = mean_absolute_error_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + loss: Union[float, npt.NDArray[np.float_]] = result_object return loss @@ -416,13 +450,15 @@ def mean_absolute_percentage_error( session = df._session assert session is not None - sproc_name = f"mean_absolute_percentage_error_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -432,8 +468,9 @@ def mean_absolute_percentage_error( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def mean_absolute_percentage_error_sproc(session: snowpark.Session) -> bytes: + def mean_absolute_percentage_error_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -447,12 +484,17 @@ def mean_absolute_percentage_error_sproc(session: snowpark.Session) -> bytes: sample_weight=sample_weight, multioutput=multioutput, ) + result_module = cloudpickle.loads(pickled_result_module) + result_object = result_module.SnowflakeResult(session, loss) - return cloudpickle.dumps(loss) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - loss: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, statement_params=statement_params) - ) + sproc_result = mean_absolute_percentage_error_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + loss: Union[float, npt.NDArray[np.float_]] = result_object return loss @@ -493,13 +535,15 @@ def mean_squared_error( session = df._session assert session is not None - sproc_name = f"mean_squared_error_{snowpark_utils.generate_random_alphanumeric()}" + sproc_name = snowpark_utils.random_name_for_temp_object(snowpark_utils.TempObjectType.PROCEDURE) sklearn_release = version.parse(sklearn.__version__).release statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) cols = metrics_utils.flatten_cols([y_true_col_names, y_pred_col_names, sample_weight_col_name]) queries = df[cols].queries["queries"] + pickled_result_module = cloudpickle.dumps(result) @F.sproc( # type: ignore[misc] + is_permanent=False, session=session, name=sproc_name, replace=True, @@ -509,8 +553,9 @@ def mean_squared_error( "snowflake-snowpark-python", ], statement_params=statement_params, + anonymous=True, ) - def mean_squared_error_sproc(session: snowpark.Session) -> bytes: + def mean_squared_error_anon_sproc(session: snowpark.Session) -> bytes: for query in queries[:-1]: _ = 
session.sql(query).collect(statement_params=statement_params) df = session.sql(queries[-1]).to_pandas(statement_params=statement_params) @@ -525,12 +570,17 @@ def mean_squared_error_sproc(session: snowpark.Session) -> bytes: multioutput=multioutput, squared=squared, ) + result_module = cloudpickle.loads(pickled_result_module) + result_object = result_module.SnowflakeResult(session, loss) - return cloudpickle.dumps(loss) # type: ignore[no-any-return] + return result_object.serialize() # type: ignore[no-any-return] - loss: Union[float, npt.NDArray[np.float_]] = cloudpickle.loads( - session.call(sproc_name, statement_params=statement_params) - ) + sproc_result = mean_squared_error_anon_sproc(session) + result_object, result_object_filepath = cloudpickle.loads(sproc_result) + if result_object_filepath is not None: + result_object = result.SnowflakeResult.load_result_from_filepath(session, result_object_filepath) + + loss: Union[float, npt.NDArray[np.float_]] = result_object return loss diff --git a/snowflake/ml/modeling/model_selection/_internal/BUILD.bazel b/snowflake/ml/modeling/model_selection/_internal/BUILD.bazel new file mode 100644 index 00000000..f011b611 --- /dev/null +++ b/snowflake/ml/modeling/model_selection/_internal/BUILD.bazel @@ -0,0 +1,33 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "init", + srcs = [ + "__init__.py", + ], + deps = [ + "//snowflake/ml/_internal:init_utils" + ], +) + +py_library( + name = "_grid_search_cv", + srcs = ["_grid_search_cv.py"], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", + ] +) + +py_library( + name = "_randomized_search_cv", + srcs = ["_randomized_search_cv.py"], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal/exceptions:exceptions", + ] +) diff --git a/snowflake/ml/modeling/model_selection/_internal/__init__.py b/snowflake/ml/modeling/model_selection/_internal/__init__.py new file mode 100644 index 00000000..6010bc4d --- /dev/null +++ b/snowflake/ml/modeling/model_selection/_internal/__init__.py @@ -0,0 +1,9 @@ +import os + +from snowflake.ml._internal import init_utils + +pkg_dir = os.path.dirname(os.path.abspath(__file__)) +pkg_name = __name__ +exportable_classes = init_utils.fetch_classes_from_modules_in_pkg_dir(pkg_dir=pkg_dir, pkg_name=pkg_name) +for k, v in exportable_classes.items(): + globals()[k] = v diff --git a/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py new file mode 100644 index 00000000..3dc4fddf --- /dev/null +++ b/snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py @@ -0,0 +1,1369 @@ +# +# This code is auto-generated using the sklearn_wrapper_template.py_template template. +# Do not modify the auto-generated code (except automatic reformatting by pre-commit hooks). +# +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved.
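The auto-generated wrapper that follows exposes `sklearn.model_selection.GridSearchCV` behind the usual SnowML estimator surface, so a grid search can be fitted directly against a Snowpark DataFrame and executed inside Snowflake. A hypothetical usage sketch, assuming the class is surfaced through the package `__init__.py` above; the connection options, table, and column names are placeholders:

```python
from snowflake.snowpark import Session

from snowflake.ml.modeling.model_selection._internal import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBClassifier  # any SnowML estimator works

session = Session.builder.configs(
    {"account": "...", "user": "...", "password": "..."}  # fill in real credentials
).create()
df = session.table("TRAINING_DATA")  # placeholder training table

search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid={"n_estimators": [50, 100], "max_depth": [3, 5]},
    label_cols=["LABEL"],
)
search.fit(df)  # chunks the grid and dispatches the search to Snowflake
```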
+# +import copy +import inspect +import os +import posixpath +import sys +from collections import defaultdict +from math import ceil +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union +from uuid import uuid4 + +import cachetools +import cloudpickle +import cloudpickle as cp +import numpy +import numpy as np +import pandas as pd +import sklearn +import sklearn.model_selection +from sklearn.model_selection import ParameterGrid +from sklearn.utils.metaestimators import available_if +from typing_extensions import TypeGuard + +from snowflake.ml._internal import telemetry +from snowflake.ml._internal.exceptions import error_codes, exceptions +from snowflake.ml._internal.utils import identifier, pkg_version_utils +from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator +from snowflake.ml._internal.utils.temp_file_utils import ( + cleanup_temp_files, + get_temp_file_path, +) +from snowflake.ml.model._signatures import utils as model_signature_utils +from snowflake.ml.model.model_signature import ( + BaseFeatureSpec, + DataType, + FeatureSpec, + ModelSignature, + _infer_signature, +) +from snowflake.ml.modeling.framework._utils import to_native_format +from snowflake.ml.modeling.framework.base import BaseTransformer +from snowflake.snowpark import DataFrame, Session, functions as F +from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type +from snowflake.snowpark._internal.utils import ( + TempObjectType, + random_name_for_temp_object, +) +from snowflake.snowpark.functions import col, pandas_udf, sproc, udtf +from snowflake.snowpark.types import ( + BinaryType, + PandasSeries, + StringType, + StructField, + StructType, +) + +_PROJECT = "ModelDevelopment" +# Derive subproject from module name by removing "sklearn" +# and converting module name from underscore to CamelCase +# e.g. sklearn.linear_model -> LinearModel. +_SUBPROJECT = "ModelSelection" + + +# TODO: refactor all the common logic into a shared utility module. +def _original_estimator_has_callable(attr: str) -> Callable[[Any], bool]: + """Checks that the original estimator has callable `attr`. + + Args: + attr: Attribute to check for. + + Returns: + A function which checks for the existence of callable `attr` on the given object. + """ + + def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]: + """Check for the existence of callable `attr` in self. + + Args: + self: BaseTransformer object + + Returns: + True if the callable `attr` exists in self, False otherwise. + """ + return callable(getattr(self._sklearn_object, attr, None)) + + return check + + +def _gather_dependencies(obj: Any) -> Set[str]: + """Gathers dependencies from the SnowML Estimator and Transformer objects. + + Args: + obj: Source object to collect dependencies from. Source object could be of any type, for example, lists, tuples, etc.
+ + Returns: + A set of dependencies required to work with the object. + """ + + if isinstance(obj, list) or isinstance(obj, tuple): + deps: Set[str] = set() + for elem in obj: + deps = deps | set(_gather_dependencies(elem)) + return deps + elif isinstance(obj, BaseTransformer): + return set(obj._get_dependencies()) + else: + return set() + + +def _transform_snowml_obj_to_sklearn_obj(obj: Any) -> Any: + """Converts SnowML Estimator and Transformer objects to equivalent SKLearn objects. + + Args: + obj: Source object that needs to be converted. Source object could be of any type, for example, lists, tuples, etc. + + Returns: + An equivalent object with SnowML estimators and transforms replaced with equivalent SKLearn objects. + """ + + if isinstance(obj, list): + # Apply transform function to each element in the list + return list(map(_transform_snowml_obj_to_sklearn_obj, obj)) + elif isinstance(obj, tuple): + # Apply transform function to each element in the tuple + return tuple(map(_transform_snowml_obj_to_sklearn_obj, obj)) + elif isinstance(obj, BaseTransformer): + # Convert SnowML object to equivalent SKLearn object + return to_native_format(obj) + else: + # Return all other objects as is. + return obj + + +def _validate_sklearn_args(args: Dict[str, Any], klass: type) -> Dict[str, Any]: + """Validate if all the keyword args are supported by current version of SKLearn/XGBoost object. + + Args: + args: Dictionary of keyword args for the wrapper init method. + klass: Underlying SKLearn/XGBoost class object. + + Returns: + result: sklearn arguments + + Raises: + SnowflakeMLException: if a user specified arg is not supported by current version of sklearn/xgboost. + """ + result = {} + signature = inspect.signature(klass.__init__) # type: ignore + for k, v in args.items(): + if k not in signature.parameters.keys(): # Arg is not supported. + if v[2] or ( # Arg doesn't have default value in the signature. + v[0] != v[1] # Value is not same as default. + and not (isinstance(v[0], float) and np.isnan(v[0]) and np.isnan(v[1])) + ): # both are not NANs + raise exceptions.SnowflakeMLException( + error_code=error_codes.DEPENDENCY_VERSION_ERROR, + original_exception=RuntimeError(f"Arg {k} is not supported by current version of SKLearn/XGBoost."), + ) + else: + result[k] = v[0] + return result + + +class GridSearchCV(BaseTransformer): + r"""Exhaustive search over specified parameter values for an estimator + For more details on this class, see [sklearn.model_selection.GridSearchCV] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (`str`) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_parameter`); + - a callable (see :ref:`scoring`) that returns a single value. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary <n_jobs>` + for more details. + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset.
+ + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + to see how to design a custom selection strategy using a callable + via `refit`. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide <cross_validation>` for the various + cross-validation strategies that can be used here. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A str, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off.
+ However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + input_cols : Optional[Union[str, List[str]]] + A string or list of strings representing column names that contain features. + If this parameter is not specified, all columns in the input DataFrame except + the columns specified by label_cols and sample_weight_col parameters are + considered input columns. + + label_cols : Optional[Union[str, List[str]]] + A string or list of strings representing column names that contain labels. + This is a required param for estimators, as there is no way to infer these + columns. If this parameter is not specified, then object is fitted without + labels (like a transformer). + + output_cols: Optional[Union[str, List[str]]] + A string or list of strings representing column names that will store the + output of predict and transform operations. The length of output_cols must + match the expected number of output columns from the specific estimator or + transformer class used. + If this parameter is not specified, output column names are derived by + adding an OUTPUT_ prefix to the label column names. These inferred output + column names work for estimator's predict() method, but output_cols must + be set explicitly for transformers. + + sample_weight_col: Optional[str] + A string representing the column name containing the examples’ weights. + This argument is only required when working with weighted datasets. + + drop_input_cols: Optional[bool], default=False + If set, the response of predict(), transform() methods will not contain input columns. + """ + + def __init__( # type: ignore + self, + *, + estimator, + param_grid, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + input_cols: Optional[Union[str, Iterable[str]]] = None, + output_cols: Optional[Union[str, Iterable[str]]] = None, + label_cols: Optional[Union[str, Iterable[str]]] = None, + drop_input_cols: Optional[bool] = False, + sample_weight_col: Optional[str] = None, + ) -> None: + super().__init__() + deps: Set[str] = { + f"numpy=={np.__version__}", + f"scikit-learn=={sklearn.__version__}", + f"cloudpickle=={cp.__version__}", + f"cachetools=={cachetools.__version__}", # type: ignore + } + deps = deps | _gather_dependencies(estimator) + self._deps = list(deps) + estimator = _transform_snowml_obj_to_sklearn_obj(estimator) + init_args = { + "estimator": (estimator, None, True), + "param_grid": (param_grid, None, True), + "scoring": (scoring, None, False), + "n_jobs": (n_jobs, None, False), + "refit": (refit, True, False), + "cv": (cv, None, False), + "verbose": (verbose, 0, False), + "pre_dispatch": (pre_dispatch, "2*n_jobs", False), + "error_score": (error_score, np.nan, False), + "return_train_score": (return_train_score, False, False), + } + cleaned_up_init_args = _validate_sklearn_args(args=init_args, klass=sklearn.model_selection.GridSearchCV) + self._sklearn_object = sklearn.model_selection.GridSearchCV( + **cleaned_up_init_args, + ) + self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None + self.set_input_cols(input_cols) + self.set_output_cols(output_cols) + self.set_label_cols(label_cols) + self.set_drop_input_cols(drop_input_cols) + self.set_sample_weight_col(sample_weight_col) + + def _get_rand_id(self) -> str: + """ + Generate random id to be used in sproc and stage names.
+ + Returns: + Random id string usable in sproc, table, and stage names. + """ + return str(uuid4()).replace("-", "_").upper() + + def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None: + """ + Infer `self.input_cols` and `self.output_cols` if they are not explicitly set. + + Args: + dataset: Input dataset. + """ + if not self.input_cols: + cols = [c for c in dataset.columns if c not in self.get_label_cols() and c != self.sample_weight_col] + self.set_input_cols(input_cols=cols) + + if not self.output_cols: + cols = [identifier.concat_names(ids=["OUTPUT_", c]) for c in self.label_cols] + self.set_output_cols(output_cols=cols) + + def _get_active_columns(self) -> List[str]: + """ "Get the list of columns that are relevant to the transformer.""" + selected_cols = ( + self.input_cols + self.label_cols + ([self.sample_weight_col] if self.sample_weight_col is not None else []) + ) + return selected_cols + + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "GridSearchCV": + """Run fit with all sets of parameters + For more details on this function, see [sklearn.model_selection.GridSearchCV.fit] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.fit) + + + Raises: + TypeError: Supported dataset types: snowpark.DataFrame, pandas.DataFrame. + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + + Returns: + self + """ + self._infer_input_output_cols(dataset) + if isinstance(dataset, pd.DataFrame): + self._fit_pandas(dataset) + elif isinstance(dataset, DataFrame): + self._fit_snowpark(dataset) + else: + raise TypeError( + f"Unexpected dataset type: {type(dataset)}." + "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." + ) + self._is_fitted = True + self._get_model_signatures(dataset) + return self + + def _fit_snowpark(self, dataset: DataFrame) -> None: + session = dataset._session + assert session is not None # keep mypy happy + # Validate that key package version in user workspace are supported in snowflake conda channel + # If customer doesn't have package in conda channel, replace the ones have the closest versions + self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( + pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT + ) + + # Create two stages - one for data and one for estimators. + temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE) + temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};" + session.sql(temp_stage_creation_query).collect() + + # Stage data. 
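+        # A sketch of what the generated code below does: the selected columns are
+        # saved to a temp table, COPY'd into the temp stage as parquet files, and
+        # every staged file is added to the UDTF imports so that each UDTF instance
+        # can read the full training set from its local import directory.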
+ data_name = temp_stage_name + selected_cols = self._get_active_columns() + if len(selected_cols) > 0: + dataset = dataset.select(selected_cols) + # TODO: add index column to the staged data + dataset.write.save_as_table(f"{data_name}", table_type="temp") + + # TODO: explore using Fileset.make() + file_format_name = "parquet_file_format" + file_format_query = f"CREATE OR REPLACE FILE FORMAT {file_format_name} TYPE = 'PARQUET';" + session.sql(file_format_query).collect() + + file_format = f"FILE_FORMAT = (FORMAT_NAME = '{file_format_name}')" + stage_load_query = f"COPY INTO @{temp_stage_name}/{data_name} FROM {data_name} {file_format} header = true" + session.sql(stage_load_query).collect() + + imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}").collect()] + + # Create estimators with subset of param grid. + # TODO: Decide how to choose parallelization factor. + parallel_factor = 8 + + assert self._sklearn_object is not None + params_to_evaluate = list(ParameterGrid(self._sklearn_object.param_grid)) + max_params_per_estimator = ceil(len(params_to_evaluate) / parallel_factor) + param_chunks = [ + params_to_evaluate[x : x + max_params_per_estimator] + for x in range(0, len(params_to_evaluate), max_params_per_estimator) + ] + target_locations = [] + for param_chunk in param_chunks: + + param_chunk_dist: Any = defaultdict(set) + for d in param_chunk: + for k, v in d.items(): + param_chunk_dist[k].add(v) + for k, v in param_chunk_dist.items(): + param_chunk_dist[k] = list(v) + + estimator = copy.deepcopy(self._sklearn_object) + estimator.param_grid = param_chunk_dist + + # Create a temp file and dump the transform to that file. + local_transform_file_name = get_temp_file_path() + with open(local_transform_file_name, mode="w+b") as local_transform_file: + cp.dump(estimator, local_transform_file) + + # Put locally serialized transform on stage and add it to the list of imports. + # TODO: Add statement params. + put_result = session.file.put( + local_transform_file_name, + temp_stage_name, + auto_compress=False, + overwrite=True, + ) + target_location = put_result[0].target + target_locations.append(target_location) + imports.append(f"@{temp_stage_name}/{target_location}") + + input_cols = copy.deepcopy(self.input_cols) + label_cols = copy.deepcopy(self.label_cols) + + # cachetools package was not included in udtf, error raised. 
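+        # (i.e., when cachetools was not listed in the UDTF package list, the import
+        # inside the UDTF raised an error; it is therefore pinned in self._deps.)
+        # The zero-argument function below is memoized so the staged parquet files
+        # are read only once per UDTF process, even across multiple partitions.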
+        @cachetools.cached(cache={})
+        def _load_data_into_udtf() -> Tuple[pd.DataFrame, pd.DataFrame]:
+            data_files = [
+                filename
+                for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+                if filename.startswith(data_name)
+            ]
+            partial_df = [
+                pd.read_parquet(os.path.join(sys._xoptions["snowflake_import_directory"], file_name))
+                for file_name in data_files
+            ]
+            df = pd.concat(partial_df, ignore_index=True)
+
+            for column in df:
+                df[column] = pd.to_numeric(df[column])
+
+            dfx = df[input_cols]
+            dfy = df[label_cols]
+
+            return dfx, dfy
+
+        # TODO: set refit = False, and fit after retrieving the results from the udtf
+        @udtf(
+            output_schema=StructType(
+                [
+                    StructField("ESTIMATOR_LOCATION", StringType()),
+                    StructField("BEST_SCORE", StringType()),
+                    StructField("ESTIMATOR", BinaryType()),
+                ]
+            ),
+            input_types=[StringType()],
+            name="hyperparameter_tuning",
+            packages=["snowflake-snowpark-python"] + self._get_dependencies(),
+            replace=True,
+            imports=imports,
+        )
+        class SearchCV:
+            def __init__(self) -> None:
+                dfx, dfy = _load_data_into_udtf()
+                self._dfx = dfx
+                self._dfy = dfy
+
+            def process(self, estimator_location):
+                local_transform_file_path = os.path.join(
+                    sys._xoptions["snowflake_import_directory"], f"{estimator_location}"
+                )
+                with open(local_transform_file_path, mode="rb") as local_transform_file_obj:
+                    estimator = cp.load(local_transform_file_obj)
+
+                # TODO: handle sample weights.
+                fit_estimator = estimator.fit(self._dfx, self._dfy)
+                # TODO: handle the case of estimator size > maximum column size or just serialize and return the score.
+                yield (estimator_location, fit_estimator.best_score_, cloudpickle.dumps(fit_estimator))
+
+            def end_partition(self) -> None:
+                ...
+
+        # TODO: Check partitioning to ensure that partitions are uniformly distributed over UDTF instances
+        # Set parallelism to 16 and ensure that one partition goes to one instance of the UDTF
+        HP_TUNING = F.table_function("hyperparameter_tuning")
+
+        # TODO: check cv_results
+        df = session.create_dataframe(target_locations, schema=["estimator_location"])
+        results = df.select(HP_TUNING(df["estimator_location"]).over(partition_by=df["estimator_location"])).sort(
+            col("BEST_SCORE").desc()
+        )
+
+        best_estimator = cloudpickle.loads(results.select("ESTIMATOR").first()[0])
+        self._sklearn_object = best_estimator
+
+    def _fit_pandas(self, dataset: pd.DataFrame) -> None:
+        assert self._sklearn_object is not None and hasattr(self._sklearn_object, "fit")  # keep mypy happy
+        argspec = inspect.getfullargspec(self._sklearn_object.fit)
+        args = {"X": dataset[self.input_cols]}
+        if self.label_cols:
+            label_arg_name = "Y" if "Y" in argspec.args else "y"
+            args[label_arg_name] = dataset[self.label_cols].squeeze()
+
+        if self.sample_weight_col is not None and "sample_weight" in argspec.args:
+            args["sample_weight"] = dataset[self.sample_weight_col].squeeze()
+
+        self._sklearn_object.fit(**args)
+
+    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
+        if self._drop_input_cols:
+            return []
+        else:
+            return list(set(dataset.columns) - set(self.output_cols))
+
+    def _batch_inference(
+        self,
+        dataset: DataFrame,
+        inference_method: str,
+        expected_output_cols_list: List[str],
+        expected_output_cols_type: str = "",
+    ) -> DataFrame:
+        """Util method to create a UDF and run batch inference."""
+        if not self._is_fitted:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.METHOD_NOT_ALLOWED,
+                original_exception=RuntimeError(
+                    f"Estimator {self.__class__.__name__} not fitted before calling {inference_method} method."
+                ),
+            )
+
+        session = dataset._session
+        if session is None:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.NOT_FOUND,
+                original_exception=ValueError("Session must be specified for a snowpark dataset."),
+            )
+        # Validate that key package versions in the user workspace are supported in the Snowflake conda channel.
+        pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT
+        )
+
+        # Register vectorized UDF for batch inference
+        batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
+
+        # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
+        # will try to pickle all of self which fails.
+        estimator = self._sklearn_object
+
+        # Input columns for the UDF are sorted by column names.
+        # We need the actual order of input cols to reorder the dataframe before calling inference methods.
+        input_cols = self.input_cols
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), self.__class__.__name__
+            ),
+            api_calls=[pandas_udf],
+            custom_tags=dict([("autogen", True)]),
+        )
+
+        @pandas_udf(  # type: ignore
+            is_permanent=False,
+            name=batch_inference_udf_name,
+            packages=self._get_dependencies(),  # type: ignore
+            replace=True,
+            session=session,
+            statement_params=statement_params,
+        )
+        def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: ignore
+            import numpy as np
+            import pandas as pd
+
+            input_df = pd.io.json.json_normalize(ds)
+
+            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpark_df.to_pandas().
+            # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
+            # or quoted input column names saved in internal state if trained using pandas_df.
+            # The model expects exactly the same column names in the input df for the predict call.
+
+            input_df = input_df[input_cols]  # Select input columns with quoted column names.
+            if hasattr(estimator, "feature_names_in_"):
+                assert estimator is not None
+                missing_features = []
+                for i, f in enumerate(estimator.feature_names_in_):
+                    if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
+                        missing_features.append(f)
+
+                if len(missing_features) > 0:
+                    raise ValueError(
+                        "The feature names should match those that were passed during fit.\n"
+                        f"Features seen during the fit call but not present in the input: {missing_features}\n"
+                        f"Features in the input dataframe: {input_cols}\n"
+                    )
+                input_df.columns = estimator.feature_names_in_
+            else:
+                # Just rename the column names to unquoted identifiers.
+                input_df.columns = (
+                    unquoted_input_cols  # Replace the quoted column identifiers with unquoted column ids.
+                )
+            transformed_numpy_array = getattr(estimator, inference_method)(input_df)
+            if (
+                isinstance(transformed_numpy_array, list)
+                and len(transformed_numpy_array) > 0
+                and isinstance(transformed_numpy_array[0], np.ndarray)
+            ):
+                # In case of multioutput estimators, predict_proba(), decision_function(), etc. return
+                # a list of ndarrays. We need to concatenate them.
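+                # Illustrative example (hypothetical shapes): a multioutput classifier
+                # with two outputs of three classes each returns a list of two
+                # (n_samples, 3) arrays from predict_proba(); concatenating along
+                # axis=1 yields a single (n_samples, 6) array.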
+                transformed_numpy_array = np.concatenate(transformed_numpy_array, axis=1)
+
+            if len(transformed_numpy_array.shape) == 3:
+                # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
+                # when voting = "soft" and flatten_transform = False. We can't handle unflattened transforms,
+                # so we ignore the flatten_transform flag and flatten the results.
+                transformed_numpy_array = np.hstack(transformed_numpy_array)
+
+            if len(transformed_numpy_array.shape) > 1 and transformed_numpy_array.shape[1] != len(
+                expected_output_cols_list
+            ):
+                # HeterogeneousEnsemble's transform method produces results with varying shapes,
+                # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
+                # It is hard to predict the response shape without using fragile introspection logic.
+                # So, to avoid that, we pack the results into a dataframe of shape (n_samples, 1) with
+                # each element being a list.
+                if len(expected_output_cols_list) != 1:
+                    raise TypeError(
+                        "expected_output_cols_list must be the same length as the transformed array "
+                        "or should be of length 1"
+                    )
+                series = pd.Series(transformed_numpy_array.tolist())
+                transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols_list)
+            else:
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols_list)
+            return transformed_pandas_df.to_dict("records")  # type: ignore
+
+        batch_inference_table_name = f"SNOWML_BATCH_INFERENCE_INPUT_TABLE_{self._get_rand_id()}"
+
+        pass_through_columns = self._get_pass_through_columns(dataset)
+        # Run Transform
+        query_from_df = str(dataset.queries["queries"][0])
+
+        outer_select_list = pass_through_columns[:]
+        inner_select_list = pass_through_columns[:]
+
+        outer_select_list.extend(
+            [
+                "{object_name}:{column_name}{udf_datatype} as {column_name}".format(
+                    object_name=batch_inference_udf_name,
+                    column_name=c,
+                    udf_datatype=(f"::{expected_output_cols_type}" if expected_output_cols_type else ""),
+                )
+                for c in expected_output_cols_list
+            ]
+        )
+
+        inner_select_list.extend(
+            [
+                "{udf_name}(object_construct_keep_null({input_cols_dict})) AS {udf_name}".format(
+                    udf_name=batch_inference_udf_name,
+                    input_cols_dict=", ".join([f"'{c}', {c}" for c in self.input_cols]),
+                )
+            ]
+        )
+
+        sql = """WITH {input_table_name} AS ({query})
+                 SELECT
+                     {outer_select_stmt}
+                 FROM (
+                     SELECT
+                         {inner_select_stmt}
+                     FROM {input_table_name}
+                 )
+        """.format(
+            input_table_name=batch_inference_table_name,
+            query=query_from_df,
+            outer_select_stmt=", ".join(outer_select_list),
+            inner_select_stmt=", ".join(inner_select_list),
+        )
+
+        return session.sql(sql)
+
+    def _sklearn_inference(
+        self, dataset: pd.DataFrame, inference_method: str, expected_output_cols_list: List[str]
+    ) -> pd.DataFrame:
+        output_cols = expected_output_cols_list.copy()
+
+        # The model expects exactly the same column names in the input df for the predict call.
+ # Given the scenario that user use snowpark DataFrame in fit call, but pandas DataFrame in predict call + # input cols need to match unquoted / quoted + input_cols = self.input_cols + unquoted_input_cols = identifier.get_unescaped_names(self.input_cols) + quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols) + + estimator = self._sklearn_object + + assert estimator is not None + features_required_by_estimator = ( + estimator.feature_names_in_ if hasattr(estimator, "feature_names_in_") else unquoted_input_cols + ) + missing_features = [] + features_in_dataset = set(dataset.columns) + columns_to_select = [] + for i, f in enumerate(features_required_by_estimator): + if ( + i >= len(input_cols) + or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f) + or ( + input_cols[i] not in features_in_dataset + and unquoted_input_cols[i] not in features_in_dataset + and quoted_input_cols[i] not in features_in_dataset + ) + ): + missing_features.append(f) + elif input_cols[i] in features_in_dataset: + columns_to_select.append(input_cols[i]) + elif unquoted_input_cols[i] in features_in_dataset: + columns_to_select.append(unquoted_input_cols[i]) + else: + columns_to_select.append(quoted_input_cols[i]) + + if len(missing_features) > 0: + raise exceptions.SnowflakeMLException( + error_code=error_codes.NOT_FOUND, + original_exception=ValueError( + "The feature names should match with those that were passed during fit.\n" + f"Features seen during fit call but not present in the input: {missing_features}\n" + f"Features in the input dataframe : {input_cols}\n" + ), + ) + input_df = dataset[columns_to_select] + input_df.columns = features_required_by_estimator + + transformed_numpy_array = getattr(estimator, inference_method)(input_df) + + if ( + isinstance(transformed_numpy_array, list) + and len(transformed_numpy_array) > 0 + and isinstance(transformed_numpy_array[0], np.ndarray) + ): + # In case of multioutput estimators, predict_proba(), decision_function(), etc., functions return + # a list of ndarrays. We need to concatenate them. + + # First compute output column names + if len(output_cols) == len(transformed_numpy_array): + actual_output_cols = [] + for idx, np_arr in enumerate(transformed_numpy_array): + for i in range(1 if len(np_arr.shape) <= 1 else np_arr.shape[1]): + actual_output_cols.append(f"{output_cols[idx]}_{i}") + output_cols = actual_output_cols + + # Concatenate np arrays + transformed_numpy_array = np.concatenate(transformed_numpy_array, axis=1) + + if len(transformed_numpy_array.shape) == 3: + # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes) + # when voting = "soft" and flatten_transform = False. We can't handle unflatten transforms, + # so we ignore flatten_transform flag and flatten the results. 
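+            # Illustrative example (hypothetical shapes): np.hstack applied to an array
+            # of shape (n_classifiers=3, n_samples, n_classes=2) produces a 2-D array
+            # of shape (n_samples, 6).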
+ transformed_numpy_array = np.hstack(transformed_numpy_array) + + if len(transformed_numpy_array.shape) == 1: + transformed_numpy_array = np.reshape(transformed_numpy_array, (-1, 1)) + + shape = transformed_numpy_array.shape + if shape[1] != len(output_cols): + if len(output_cols) != 1: + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ARGUMENT, + original_exception=TypeError( + "expected_output_cols_list must be same length as transformed array or " "should be of length 1" + ), + ) + actual_output_cols = [] + for i in range(shape[1]): + actual_output_cols.append(f"{output_cols[0]}_{i}") + output_cols = actual_output_cols + + if self._drop_input_cols: + dataset = pd.DataFrame(data=transformed_numpy_array, columns=output_cols) + else: + dataset = dataset.copy() + dataset[output_cols] = transformed_numpy_array + return dataset + + @available_if(_original_estimator_has_callable("predict")) # type: ignore + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: + """Call predict on the estimator with the best found parameters + For more details on this function, see [sklearn.model_selection.GridSearchCV.predict] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.predict) + + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + + Returns: + Transformed dataset. + """ + super()._check_dataset_type(dataset) + if isinstance(dataset, DataFrame): + expected_type_inferred = "" + # when it is classifier, infer the datatype from label columns + if expected_type_inferred == "" and "predict" in self.model_signatures: + expected_type_inferred = convert_sp_to_sf_type( + self.model_signatures["predict"].outputs[0].as_snowpark_type() + ) + + output_df = self._batch_inference( + dataset=dataset, + inference_method="predict", + expected_output_cols_list=self.output_cols, + expected_output_cols_type=expected_type_inferred, + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="predict", + expected_output_cols_list=self.output_cols, + ) + + return output_df + + @available_if(_original_estimator_has_callable("transform")) # type: ignore + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: + """Call transform on the estimator with the best found parameters + For more details on this function, see [sklearn.model_selection.GridSearchCV.transform] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.transform) + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + + Returns: + Transformed dataset. 
+ """ + super()._check_dataset_type(dataset) + if isinstance(dataset, DataFrame): + expected_dtype = "" + if False: # is child of _BaseHeterogeneousEnsemble + # transform() method of HeterogeneousEnsemble estimators return responses of varying shapes + # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between) + # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with + # each row containing a list of values. + expected_dtype = "ARRAY" + + output_df = self._batch_inference( + dataset=dataset, + inference_method="transform", + expected_output_cols_list=self.output_cols, + expected_output_cols_type=expected_dtype, + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="transform", + expected_output_cols_list=self.output_cols, + ) + + return output_df + + def _get_output_column_names(self, output_cols_prefix: str) -> List[str]: + """Returns the list of output columns for predict_proba(), decision_function(), etc.. functions. + Returns a list with output_cols_prefix as the only element if the estimator is not a classifier. + + Args: + output_cols_prefix (str): prefix according to the function + + Returns: + List[str]: output cols with prefix + """ + if getattr(self._sklearn_object, "classes_", None) is None: + return [output_cols_prefix] + + assert self._sklearn_object is not None # keep mypy happy + classes = self._sklearn_object.classes_ + if isinstance(classes, numpy.ndarray): + return [f"{output_cols_prefix}{c}" for c in classes.tolist()] + elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray): + # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays. + output_cols = [] + for i, cl in enumerate(classes): + # For binary classification, there is only one output column for each class + # ndarray as the two classes are complementary. + if len(cl) == 2: + output_cols.append(f"{output_cols_prefix}_{i}_{cl[0]}") + else: + output_cols.extend([f"{output_cols_prefix}_{i}_{c}" for c in cl.tolist()]) + return output_cols + return [] + + @available_if(_original_estimator_has_callable("predict_proba")) # type: ignore + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def predict_proba( + self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_proba_" + ) -> Union[DataFrame, pd.DataFrame]: + """Call predict_proba on the estimator with the best found parameters + For more details on this function, see [sklearn.model_selection.GridSearchCV.predict_proba] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.predict_proba) + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + output_cols_prefix: Prefix for the response columns + + Returns: + Output dataset with probability of the sample for each class in the model. 
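+            For example (illustrative), a fitted binary classifier with classes
+            ``[0, 1]`` produces the output columns ``predict_proba_0`` and
+            ``predict_proba_1``.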
+        """
+        super()._check_dataset_type(dataset)
+        if isinstance(dataset, DataFrame):
+            output_df = self._batch_inference(
+                dataset=dataset,
+                inference_method="predict_proba",
+                expected_output_cols_list=self._get_output_column_names(output_cols_prefix),
+                expected_output_cols_type="float",
+            )
+        elif isinstance(dataset, pd.DataFrame):
+            output_df = self._sklearn_inference(
+                dataset=dataset,
+                inference_method="predict_proba",
+                expected_output_cols_list=self._get_output_column_names(output_cols_prefix),
+            )
+
+        return output_df
+
+    @available_if(_original_estimator_has_callable("predict_log_proba"))  # type: ignore
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    @telemetry.add_stmt_params_to_df(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    def predict_log_proba(
+        self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_log_proba_"
+    ) -> Union[DataFrame, pd.DataFrame]:
+        """Call predict_log_proba on the estimator with the best found parameters.
+        For more details on this function, see [sklearn.model_selection.GridSearchCV.predict_log_proba]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.predict_log_proba)
+
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+            output_cols_prefix: str
+                Prefix for the response columns.
+
+        Returns:
+            Output dataset with log probability of the sample for each class in the model.
+        """
+        super()._check_dataset_type(dataset)
+        if isinstance(dataset, DataFrame):
+            output_df = self._batch_inference(
+                dataset=dataset,
+                inference_method="predict_log_proba",
+                expected_output_cols_list=self._get_output_column_names(output_cols_prefix),
+                expected_output_cols_type="float",
+            )
+        elif isinstance(dataset, pd.DataFrame):
+            output_df = self._sklearn_inference(
+                dataset=dataset,
+                inference_method="predict_log_proba",
+                expected_output_cols_list=self._get_output_column_names(output_cols_prefix),
+            )
+
+        return output_df
+
+    @available_if(_original_estimator_has_callable("decision_function"))  # type: ignore
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    @telemetry.add_stmt_params_to_df(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    def decision_function(
+        self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "decision_function_"
+    ) -> Union[DataFrame, pd.DataFrame]:
+        """Call decision_function on the estimator with the best found parameters.
+        For more details on this function, see [sklearn.model_selection.GridSearchCV.decision_function]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.decision_function)
+
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+            output_cols_prefix: str
+                Prefix for the response columns.
+
+        Returns:
+            Output dataset with results of the decision function for the samples in the input dataset.
+        """
+        super()._check_dataset_type(dataset)
+        if isinstance(dataset, DataFrame):
+            output_df = self._batch_inference(
+                dataset=dataset,
+                inference_method="decision_function",
+                expected_output_cols_list=self._get_output_column_names(output_cols_prefix),
+                expected_output_cols_type="float",
+            )
+        elif isinstance(dataset, pd.DataFrame):
+            output_df = self._sklearn_inference(
+                dataset=dataset,
+                inference_method="decision_function",
+                expected_output_cols_list=self._get_output_column_names(output_cols_prefix),
+            )
+
+        return output_df
+
+    @available_if(_original_estimator_has_callable("score"))  # type: ignore
+    def score(self, dataset: Union[DataFrame, pd.DataFrame]) -> float:
+        """Score the given dataset using the estimator with the best found parameters.
+
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+
+        Returns:
+            Score.
+        """
+        self._infer_input_output_cols(dataset)
+        super()._check_dataset_type(dataset)
+        if isinstance(dataset, pd.DataFrame):
+            output_score = self._score_sklearn(dataset)
+        elif isinstance(dataset, DataFrame):
+            output_score = self._score_snowpark(dataset)
+        return output_score
+
+    def _score_sklearn(self, dataset: pd.DataFrame) -> float:
+        assert self._sklearn_object is not None and hasattr(self._sklearn_object, "score")  # make type checker happy
+        argspec = inspect.getfullargspec(self._sklearn_object.score)
+        if "X" in argspec.args:
+            args = {"X": dataset[self.input_cols]}
+        elif "X_test" in argspec.args:
+            args = {"X_test": dataset[self.input_cols]}
+        else:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ATTRIBUTE,
+                original_exception=RuntimeError("Neither 'X' nor 'X_test' exists in the argument list."),
+            )
+
+        if self.label_cols:
+            label_arg_name = "Y" if "Y" in argspec.args else "y"
+            args[label_arg_name] = dataset[self.label_cols].squeeze()
+
+        if self.sample_weight_col is not None and "sample_weight" in argspec.args:
+            args["sample_weight"] = dataset[self.sample_weight_col].squeeze()
+
+        score = self._sklearn_object.score(**args)
+        return score
+
+    def _score_snowpark(self, dataset: DataFrame) -> float:
+        # Specify input columns so column pruning will be enforced
+        selected_cols = self._get_active_columns()
+        if len(selected_cols) > 0:
+            dataset = dataset.select(selected_cols)
+
+        # Extract queries that generated the dataframe. We will need to pass them to the score procedure.
+        queries = dataset.queries["queries"]
+
+        # Create a temp file and dump the estimator to that file.
+        local_score_file_name = get_temp_file_path()
+        with open(local_score_file_name, mode="w+b") as local_score_file:
+            cp.dump(self._sklearn_object, local_score_file)
+
+        # Create temp stage to run score.
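+        # The sproc defined below replays the dataset queries server-side, downloads
+        # the pickled estimator from this stage, and computes score() next to the
+        # data, returning a single float to the client.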
+ score_stage_name = random_name_for_temp_object(TempObjectType.STAGE) + session = dataset._session + assert session is not None # keep mypy happy + stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};" + SqlResultValidator(session=session, query=stage_creation_query).has_dimensions( + expected_rows=1, expected_cols=1 + ).validate() + + # Use posixpath to construct stage paths + stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name)) + score_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE) + statement_params = telemetry.get_function_usage_statement_params( + project=_PROJECT, + subproject=_SUBPROJECT, + function_name=telemetry.get_statement_params_full_func_name( + inspect.currentframe(), self.__class__.__name__ + ), + api_calls=[sproc], + custom_tags=dict([("autogen", True)]), + ) + # Put locally serialized score on stage. + session.file.put( + local_score_file_name, + stage_score_file_name, + auto_compress=False, + overwrite=True, + statement_params=statement_params, + ) + + @sproc( + is_permanent=False, + name=score_sproc_name, + packages=["snowflake-snowpark-python"] + self._get_dependencies(), # type: ignore + replace=True, + session=session, + statement_params=statement_params, + anonymous=True, + ) + def score_wrapper_sproc( + session: Session, + sql_queries: List[str], + stage_score_file_name: str, + input_cols: List[str], + label_cols: List[str], + sample_weight_col: Optional[str], + statement_params: Dict[str, str], + ) -> float: + import inspect + import os + import tempfile + + import cloudpickle as cp + import numpy as np # noqa: F401 + import pandas # noqa: F401 + import sklearn # noqa: F401 + + for query in sql_queries[:-1]: + _ = session.sql(query).collect(statement_params=statement_params) + df = session.sql(sql_queries[-1]).to_pandas(statement_params=statement_params) + + local_score_file = tempfile.NamedTemporaryFile(delete=True) + local_score_file_name = local_score_file.name + local_score_file.close() + + session.file.get(stage_score_file_name, local_score_file_name, statement_params=statement_params) + + local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0]) + with open(local_score_file_name_path, mode="r+b") as local_score_file_obj: + estimator = cp.load(local_score_file_obj) + + argspec = inspect.getfullargspec(estimator.score) + if "X" in argspec.args: + args = {"X": df[input_cols]} + elif "X_test" in argspec.args: + args = {"X_test": df[input_cols]} + else: + raise RuntimeError("Neither 'X' or 'X_test' exist in argument") + + if label_cols: + label_arg_name = "Y" if "Y" in argspec.args else "y" + args[label_arg_name] = df[label_cols].squeeze() + + if sample_weight_col is not None and "sample_weight" in argspec.args: + args["sample_weight"] = df[sample_weight_col].squeeze() + + result: float = estimator.score(**args) + return result + + # Call score sproc + statement_params = telemetry.get_function_usage_statement_params( + project=_PROJECT, + subproject=_SUBPROJECT, + function_name=telemetry.get_statement_params_full_func_name( + inspect.currentframe(), self.__class__.__name__ + ), + api_calls=[Session.call], + custom_tags=dict([("autogen", True)]), + ) + score: float = score_wrapper_sproc( + session, + queries, + stage_score_file_name, + identifier.get_unescaped_names(self.input_cols), + identifier.get_unescaped_names(self.label_cols), + identifier.get_unescaped_names(self.sample_weight_col), + statement_params, + ) + + 
cleanup_temp_files([local_score_file_name]) + + return score + + def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None: + self._model_signature_dict = dict() + + PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"] + + inputs = list(_infer_signature(dataset[self.input_cols], "input")) + outputs: List[BaseFeatureSpec] = [] + if hasattr(self, "predict"): + # keep mypy happy + assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type") + # For classifier, the type of predict is the same as the type of label + if self._sklearn_object._estimator_type == "classifier": + # label columns is the desired type for output + outputs = _infer_signature(dataset[self.label_cols], "output") + # rename the output columns + outputs = model_signature_utils.rename_features(outputs, self.output_cols) + self._model_signature_dict["predict"] = ModelSignature( + inputs, ([] if self._drop_input_cols else inputs) + outputs + ) + # For regressor, the type of predict is float64 + elif self._sklearn_object._estimator_type == "regressor": + outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols] + self._model_signature_dict["predict"] = ModelSignature( + inputs, ([] if self._drop_input_cols else inputs) + outputs + ) + for prob_func in PROB_FUNCTIONS: + if hasattr(self, prob_func): + output_cols_prefix: str = f"{prob_func}_" + output_column_names = self._get_output_column_names(output_cols_prefix) + outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names] + self._model_signature_dict[prob_func] = ModelSignature( + inputs, ([] if self._drop_input_cols else inputs) + outputs + ) + + @property + def model_signatures(self) -> Dict[str, ModelSignature]: + """Returns model signature of current class. 
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, the model signature cannot be inferred.
+
+        Returns:
+            Dict[str, ModelSignature]: each method and its input/output signature
+        """
+        if self._model_signature_dict is None:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ATTRIBUTE,
+                original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
+            )
+        return self._model_signature_dict
+
+    def to_sklearn(self) -> Any:
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
diff --git a/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
new file mode 100644
index 00000000..58abc02f
--- /dev/null
+++ b/snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py
@@ -0,0 +1,1377 @@
+import copy
+import inspect
+import os
+import posixpath
+import sys
+from collections import defaultdict
+from math import ceil
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+from uuid import uuid4
+
+import cachetools
+import cloudpickle
+import cloudpickle as cp
+import numpy
+import numpy as np
+import pandas as pd
+import sklearn
+import sklearn.model_selection
+from sklearn.model_selection import ParameterSampler
+from sklearn.utils.metaestimators import available_if
+from typing_extensions import TypeGuard
+
+from snowflake.ml._internal import telemetry
+from snowflake.ml._internal.exceptions import error_codes, exceptions
+from snowflake.ml._internal.utils import identifier, pkg_version_utils
+from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
+from snowflake.ml._internal.utils.temp_file_utils import (
+    cleanup_temp_files,
+    get_temp_file_path,
+)
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+)
+from snowflake.ml.modeling.framework._utils import to_native_format
+from snowflake.ml.modeling.framework.base import BaseTransformer
+from snowflake.snowpark import DataFrame, Session, functions as F
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
+from snowflake.snowpark._internal.utils import (
+    TempObjectType,
+    random_name_for_temp_object,
+)
+from snowflake.snowpark.functions import col, pandas_udf, sproc, udtf
+from snowflake.snowpark.types import (
+    BinaryType,
+    PandasSeries,
+    StringType,
+    StructField,
+    StructType,
+)
+
+_PROJECT = "ModelDevelopment"
+# Derive subproject from module name by removing "sklearn"
+# and converting the module name from underscore to CamelCase,
+# e.g. sklearn.linear_model -> LinearModel.
+_SUBPROJECT = "ModelSelection"
+
+
+# TODO: refactor all the common logic into a shared utility module.
+def _original_estimator_has_callable(attr: str) -> Callable[[Any], bool]:
+    """Checks that the original estimator has callable `attr`.
+
+    Args:
+        attr: Attribute to check for.
+
+    Returns:
+        A function which checks for the existence of callable `attr` on the given object.
+    """
+
+    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
+        """Check for the existence of callable `attr` in self.
+
+        Args:
+            self: BaseTransformer object
+
+        Returns:
+            True if the callable `attr` exists in self, False otherwise.
+        """
+        return callable(getattr(self._sklearn_object, attr, None))
+
+    return check
+
+
+def _gather_dependencies(obj: Any) -> Set[str]:
+    """Gathers dependencies from the SnowML Estimator and Transformer objects.
+
+    Args:
+        obj: Source object to collect dependencies from. The source object can be of any type, e.g. lists, tuples, etc.
+
+    Returns:
+        A set of dependencies required to work with the object.
+    """
+
+    if isinstance(obj, list) or isinstance(obj, tuple):
+        deps: Set[str] = set()
+        for elem in obj:
+            deps = deps | set(_gather_dependencies(elem))
+        return deps
+    elif isinstance(obj, BaseTransformer):
+        return set(obj._get_dependencies())
+    else:
+        return set()
+
+
+def _transform_snowml_obj_to_sklearn_obj(obj: Any) -> Any:
+    """Converts SnowML Estimator and Transformer objects to equivalent SKLearn objects.
+
+    Args:
+        obj: Source object that needs to be converted. The source object can be of any type, e.g. lists, tuples, etc.
+
+    Returns:
+        An equivalent object with SnowML estimators and transformers replaced with equivalent SKLearn objects.
+    """
+
+    if isinstance(obj, list):
+        # Apply the transform function to each element in the list
+        return list(map(_transform_snowml_obj_to_sklearn_obj, obj))
+    elif isinstance(obj, tuple):
+        # Apply the transform function to each element in the tuple
+        return tuple(map(_transform_snowml_obj_to_sklearn_obj, obj))
+    elif isinstance(obj, BaseTransformer):
+        # Convert the SnowML object to an equivalent SKLearn object
+        return to_native_format(obj)
+    else:
+        # Return all other objects as is.
+        return obj
+
+
+def _validate_sklearn_args(args: Dict[str, Any], klass: type) -> Dict[str, Any]:
+    """Validate that all the keyword args are supported by the current version of the SKLearn/XGBoost object.
+
+    Args:
+        args: Dictionary of keyword args for the wrapper init method.
+        klass: Underlying SKLearn/XGBoost class object.
+
+    Returns:
+        result: sklearn arguments
+
+    Raises:
+        SnowflakeMLException: if a user-specified arg is not supported by the current version of sklearn/xgboost.
+    """
+    result = {}
+    signature = inspect.signature(klass.__init__)  # type: ignore
+    for k, v in args.items():
+        if k not in signature.parameters.keys():  # Arg is not supported.
+            if v[2] or (  # Arg doesn't have a default value in the signature.
+                v[0] != v[1]  # Value is not the same as the default.
+                and not (isinstance(v[0], float) and np.isnan(v[0]) and np.isnan(v[1]))
+            ):  # both are not NaNs
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.DEPENDENCY_VERSION_ERROR,
+                    original_exception=RuntimeError(f"Arg {k} is not supported by the current version of SKLearn/XGBoost."),
+                )
+        else:
+            result[k] = v[0]
+    return result
+
+
+class RandomizedSearchCV(BaseTransformer):
+    r"""Randomized search on hyper parameters.
+    For more details on this class, see [sklearn.model_selection.RandomizedSearchCV]
+    (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
+
+    Parameters
+    ----------
+    estimator : estimator object
+        An object of that type is instantiated for each grid point.
+        This is assumed to implement the scikit-learn estimator interface.
+        Either estimator needs to provide a ``score`` function,
+        or ``scoring`` must be passed.
+
+    param_distributions : dict or list of dicts
+        Dictionary with parameters names (`str`) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+        If a list of dicts is given, first a dict is sampled uniformly, and
+        then a parameter is sampled using that dict as above.
+
+    n_iter : int, default=10
+        Number of parameter settings that are sampled. n_iter trades
+        off runtime vs quality of the solution.
+
+    scoring : str, callable, list, tuple or dict, default=None
+        Strategy to evaluate the performance of the cross-validated model on
+        the test set.
+
+        If `scoring` represents a single score, one can use:
+
+        - a single string (see :ref:`scoring_parameter`);
+        - a callable (see :ref:`scoring`) that returns a single value.
+
+        If `scoring` represents multiple scores, one can use:
+
+        - a list or tuple of unique strings;
+        - a callable returning a dictionary where the keys are the metric
+          names and the values are the metric scores;
+        - a dictionary with metric names as keys and callables as values.
+
+        See :ref:`multimetric_grid_search` for an example.
+
+        If None, the estimator's score method is used.
+
+    n_jobs : int, default=None
+        Number of jobs to run in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary `
+        for more details.
+
+    refit : bool, str, or callable, default=True
+        Refit an estimator using the best found parameters on the whole
+        dataset.
+
+        For multiple metric evaluation, this needs to be a `str` denoting the
+        scorer that would be used to find the best parameters for refitting
+        the estimator at the end.
+
+        Where there are considerations other than maximum score in
+        choosing a best estimator, ``refit`` can be set to a function which
+        returns the selected ``best_index_`` given the ``cv_results_``. In that
+        case, the ``best_estimator_`` and ``best_params_`` will be set
+        according to the returned ``best_index_`` while the ``best_score_``
+        attribute will not be available.
+
+        The refitted estimator is made available at the ``best_estimator_``
+        attribute and permits using ``predict`` directly on this
+        ``RandomizedSearchCV`` instance.
+
+        Also for multiple metric evaluation, the attributes ``best_index_``,
+        ``best_score_`` and ``best_params_`` will only be available if
+        ``refit`` is set and all of them will be determined w.r.t this specific
+        scorer.
+
+        See the ``scoring`` parameter to know more about multiple metric
+        evaluation.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 5-fold cross validation,
+        - integer, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, if the estimator is a classifier and ``y`` is
+        either binary or multiclass, :class:`StratifiedKFold` is used. In all
+        other cases, :class:`KFold` is used. These splitters are instantiated
+        with `shuffle=False` so the splits will be the same across calls.
+
+        Refer :ref:`User Guide ` for the various
+        cross-validation strategies that can be used here.
+
+    verbose : int
+        Controls the verbosity: the higher, the more messages.
+
+        - >1 : the computation time for each fold and parameter candidate is
+          displayed;
+        - >2 : the score is also displayed;
+        - >3 : the fold and candidate parameter indexes are also displayed
+          together with the starting time of the computation.
+
+    pre_dispatch : int, or str, default='2*n_jobs'
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+
+        - None, in which case all the jobs are immediately
+          created and spawned. Use this for lightweight and
+          fast-running jobs, to avoid delays due to on-demand
+          spawning of the jobs
+
+        - An int, giving the exact number of total jobs that are
+          spawned
+
+        - A str, giving an expression as a function of n_jobs,
+          as in '2*n_jobs'
+
+    random_state : int, RandomState instance or None, default=None
+        Pseudo random number generator state used for random uniform sampling
+        from lists of possible values instead of scipy.stats distributions.
+        Pass an int for reproducible output across multiple
+        function calls.
+        See :term:`Glossary `.
+
+    error_score : 'raise' or numeric, default=np.nan
+        Value to assign to the score if an error occurs in estimator fitting.
+        If set to 'raise', the error is raised. If a numeric value is given,
+        FitFailedWarning is raised. This parameter does not affect the refit
+        step, which will always raise the error.
+
+    return_train_score : bool, default=False
+        If ``False``, the ``cv_results_`` attribute will not include training
+        scores.
+        Computing training scores is used to get insights on how different
+        parameter settings impact the overfitting/underfitting trade-off.
+        However, computing the scores on the training set can be computationally
+        expensive and is not strictly required to select the parameters that
+        yield the best generalization performance.
+
+    input_cols : Optional[Union[str, List[str]]]
+        A string or list of strings representing column names that contain features.
+        If this parameter is not specified, all columns in the input DataFrame except
+        the columns specified by the label_cols and sample_weight_col parameters are
+        considered input columns.
+
+    label_cols : Optional[Union[str, List[str]]]
+        A string or list of strings representing column names that contain labels.
+        This is a required param for estimators, as there is no way to infer these
+        columns. If this parameter is not specified, then the object is fitted without
+        labels (like a transformer).
+
+    output_cols: Optional[Union[str, List[str]]]
+        A string or list of strings representing column names that will store the
+        output of predict and transform operations. The length of output_cols must
+        match the expected number of output columns from the specific estimator or
+        transformer class used.
+        If this parameter is not specified, output column names are derived by
+        adding an OUTPUT_ prefix to the label column names. These inferred output
+        column names work for the estimator's predict() method, but output_cols must
+        be set explicitly for transformers.
+
+    sample_weight_col: Optional[str]
+        A string representing the column name containing the examples' weights.
+        This argument is only required when working with weighted datasets.
+
+    drop_input_cols: Optional[bool], default=False
+        If set, the response of predict(), transform() methods will not contain input columns.
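+
+    Example
+    -------
+    A minimal usage sketch (the session setup and the column names below are
+    hypothetical and must be adapted to your data; ``training_df`` is assumed
+    to be an existing Snowpark or pandas DataFrame):
+
+        >>> from scipy.stats import uniform
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> search = RandomizedSearchCV(
+        ...     estimator=LogisticRegression(),
+        ...     param_distributions={"C": uniform(loc=0, scale=4)},
+        ...     n_iter=5,
+        ...     input_cols=["F1", "F2"],  # hypothetical feature columns
+        ...     label_cols=["LABEL"],  # hypothetical label column
+        ... )
+        >>> search.fit(training_df)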
+    """
+
+    def __init__(  # type: ignore
+        self,
+        *,
+        estimator,
+        param_distributions,
+        n_iter=10,
+        scoring=None,
+        n_jobs=None,
+        refit=True,
+        cv=None,
+        verbose=0,
+        pre_dispatch="2*n_jobs",
+        random_state=None,
+        error_score=np.nan,
+        return_train_score=False,
+        input_cols: Optional[Union[str, Iterable[str]]] = None,
+        output_cols: Optional[Union[str, Iterable[str]]] = None,
+        label_cols: Optional[Union[str, Iterable[str]]] = None,
+        drop_input_cols: Optional[bool] = False,
+        sample_weight_col: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+        deps: Set[str] = {
+            f"numpy=={np.__version__}",
+            f"scikit-learn=={sklearn.__version__}",
+            f"cloudpickle=={cp.__version__}",
+            f"cachetools=={cachetools.__version__}",  # type: ignore
+        }
+        deps = deps | _gather_dependencies(estimator)
+        self._deps = list(deps)
+        estimator = _transform_snowml_obj_to_sklearn_obj(estimator)
+        init_args = {
+            "estimator": (estimator, None, True),
+            "param_distributions": (param_distributions, None, True),
+            "n_iter": (n_iter, 10, False),
+            "scoring": (scoring, None, False),
+            "n_jobs": (n_jobs, None, False),
+            "refit": (refit, True, False),
+            "cv": (cv, None, False),
+            "verbose": (verbose, 0, False),
+            "pre_dispatch": (pre_dispatch, "2*n_jobs", False),
+            "random_state": (random_state, None, False),
+            "error_score": (error_score, np.nan, False),
+            "return_train_score": (return_train_score, False, False),
+        }
+        cleaned_up_init_args = _validate_sklearn_args(args=init_args, klass=sklearn.model_selection.RandomizedSearchCV)
+        self._sklearn_object = sklearn.model_selection.RandomizedSearchCV(
+            **cleaned_up_init_args,
+        )
+        self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
+        self.set_input_cols(input_cols)
+        self.set_output_cols(output_cols)
+        self.set_label_cols(label_cols)
+        self.set_drop_input_cols(drop_input_cols)
+        self.set_sample_weight_col(sample_weight_col)
+
+    def _get_rand_id(self) -> str:
+        """
+        Generate a random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
+    def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+        """
+        Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
+
+        Args:
+            dataset: Input dataset.
+        """
+        if not self.input_cols:
+            cols = [c for c in dataset.columns if c not in self.get_label_cols() and c != self.sample_weight_col]
+            self.set_input_cols(input_cols=cols)
+
+        if not self.output_cols:
+            cols = [identifier.concat_names(ids=["OUTPUT_", c]) for c in self.label_cols]
+            self.set_output_cols(output_cols=cols)
+
+    def _get_active_columns(self) -> List[str]:
+        """Get the list of columns that are relevant to the transformer."""
+        selected_cols = (
+            self.input_cols + self.label_cols + ([self.sample_weight_col] if self.sample_weight_col is not None else [])
+        )
+        return selected_cols
+
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "RandomizedSearchCV":
+        """Run fit with all sets of parameters.
+        For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.fit]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.fit)
+
+
+        Raises:
+            TypeError: If the dataset is neither a snowpark.DataFrame nor a pandas.DataFrame.
+ + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + + Returns: + self + """ + self._infer_input_output_cols(dataset) + if isinstance(dataset, pd.DataFrame): + self._fit_pandas(dataset) + elif isinstance(dataset, DataFrame): + self._fit_snowpark(dataset) + else: + raise TypeError( + f"Unexpected dataset type: {type(dataset)}." + "Supported dataset types: snowpark.DataFrame, pandas.DataFrame." + ) + self._is_fitted = True + self._get_model_signatures(dataset) + return self + + def _fit_snowpark(self, dataset: DataFrame) -> None: + session = dataset._session + assert session is not None + # Validate that key package version in user workspace are supported in snowflake conda channel + # If customer doesn't have package in conda channel, replace the ones have the closest versions + self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( + pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT + ) + + # Create two stages - one for data and one for estimators. + temp_stage_name = random_name_for_temp_object(TempObjectType.STAGE) + temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};" + session.sql(temp_stage_creation_query).collect() + + # Stage data. + data_name = temp_stage_name + selected_cols = self._get_active_columns() + if len(selected_cols) > 0: + dataset = dataset.select(selected_cols) + # TODO: add index column to the staged data + dataset.write.save_as_table(f"{data_name}", table_type="temp") + + # TODO: explore using Fileset.make() + file_format_name = "parquet_file_format" + file_format_query = f"CREATE OR REPLACE FILE FORMAT {file_format_name} TYPE = 'PARQUET';" + session.sql(file_format_query).collect() + + file_format = f"FILE_FORMAT = (FORMAT_NAME = '{file_format_name}')" + stage_load_query = f"COPY INTO @{temp_stage_name}/{data_name} FROM {data_name} {file_format} header = true" + session.sql(stage_load_query).collect() + + imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}").collect()] + + # Create estimators with subset of param grid. + # TODO: Decide how to choose parallelization factor. + parallel_factor = 16 + + assert self._sklearn_object is not None + params_to_evaluate = list( + ParameterSampler(self._sklearn_object.param_distributions, n_iter=self._sklearn_object.n_iter) + ) + max_params_per_estimator = ceil(len(params_to_evaluate) / parallel_factor) + param_chunks = [ + params_to_evaluate[x : x + max_params_per_estimator] + for x in range(0, len(params_to_evaluate), max_params_per_estimator) + ] + target_locations = [] + for param_chunk in param_chunks: + + param_chunk_dist: Any = defaultdict(set) + for d in param_chunk: + for k, v in d.items(): + param_chunk_dist[k].add(v) + for k, v in param_chunk_dist.items(): + param_chunk_dist[k] = list(v) + + estimator = copy.deepcopy(self._sklearn_object) + estimator.param_distributions = param_chunk_dist + + # Create a temp file and dump the transform to that file. + local_transform_file_name = get_temp_file_path() + with open(local_transform_file_name, mode="w+b") as local_transform_file: + cp.dump(estimator, local_transform_file) + + # Put locally serialized transform on stage and add it to the list of imports. + # TODO: Add statement params. 
+ put_result = session.file.put( + local_transform_file_name, + temp_stage_name, + auto_compress=False, + overwrite=True, + ) + target_location = put_result[0].target + target_locations.append(target_location) + imports.append(f"@{temp_stage_name}/{target_location}") + + input_cols = copy.deepcopy(self.input_cols) + label_cols = copy.deepcopy(self.label_cols) + + @cachetools.cached(cache={}) + def _load_data_into_udtf() -> Tuple[pd.DataFrame, pd.DataFrame]: + data_files = [ + filename + for filename in os.listdir(sys._xoptions["snowflake_import_directory"]) + if filename.startswith(data_name) + ] + partial_df = [ + pd.read_parquet(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)) + for file_name in data_files + ] + df = pd.concat(partial_df, ignore_index=True) + + for column in df: + df[column] = pd.to_numeric(df[column]) + + dfx = df[input_cols] + dfy = df[label_cols] + + return dfx, dfy + + # TODO: set refit = False, and fit after retrieving the resultf from udtf + @udtf( + output_schema=StructType( + [ + StructField("ESTIMATOR_LOCATION", StringType()), + StructField("BEST_SCORE", StringType()), + StructField("ESTIMATOR", BinaryType()), + ] + ), + input_types=[StringType()], + name="hyperparameter_tuning", + packages=["snowflake-snowpark-python"] + self._get_dependencies(), + replace=True, + imports=imports, + ) + class SearchCV: + def __init__(self) -> None: + dfx, dfy = _load_data_into_udtf() + self._dfx = dfx + self._dfy = dfy + + def process(self, estimator_location): + local_transform_file_path = os.path.join( + sys._xoptions["snowflake_import_directory"], f"{estimator_location}" + ) + with open(local_transform_file_path, mode="rb") as local_transform_file_obj: + estimator = cp.load(local_transform_file_obj) + + # TODO: handle sample weights. + fit_estimator = estimator.fit(self._dfx, self._dfy) + + # TODO: handle the case of estimator size > maximum column size or to just serialize and return score. + yield (estimator_location, fit_estimator.best_score_, cloudpickle.dumps(fit_estimator)) + + def end_partition(self) -> None: + ... 
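+
+        # Note: partitioning by estimator_location yields one row per partition, so
+        # each chunk of the sampled parameter space is fitted by a single UDTF
+        # invocation; the best chunk is then selected by sorting on BEST_SCORE.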
+
+        # TODO: Check partitioning to ensure that partitions are uniformly distributed over UDTF instances
+        # Set parallelism to 16 and ensure that one partition goes to one instance of the UDTF
+        HP_TUNING = F.table_function("hyperparameter_tuning")
+
+        df = session.create_dataframe(target_locations, schema=["estimator_location"])
+        results = df.select(HP_TUNING(df["estimator_location"]).over(partition_by=df["estimator_location"])).sort(
+            col("BEST_SCORE").desc()
+        )
+        # TODO: check cv_results
+        best_estimator = cloudpickle.loads(results.select("ESTIMATOR").first()[0])
+        self._sklearn_object = best_estimator
+
+    def _fit_pandas(self, dataset: pd.DataFrame) -> None:
+        assert self._sklearn_object is not None
+        argspec = inspect.getfullargspec(self._sklearn_object.fit)
+        args = {"X": dataset[self.input_cols]}
+        if self.label_cols:
+            label_arg_name = "Y" if "Y" in argspec.args else "y"
+            args[label_arg_name] = dataset[self.label_cols].squeeze()
+
+        if self.sample_weight_col is not None and "sample_weight" in argspec.args:
+            args["sample_weight"] = dataset[self.sample_weight_col].squeeze()
+
+        self._sklearn_object.fit(**args)
+
+    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
+        if self._drop_input_cols:
+            return []
+        else:
+            return list(set(dataset.columns) - set(self.output_cols))
+
+    def _batch_inference(
+        self,
+        dataset: DataFrame,
+        inference_method: str,
+        expected_output_cols_list: List[str],
+        expected_output_cols_type: str = "",
+    ) -> DataFrame:
+        """Util method to create the UDF and run batch inference."""
+        if not self._is_fitted:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.METHOD_NOT_ALLOWED,
+                original_exception=RuntimeError(
+                    f"Estimator {self.__class__.__name__} not fitted before calling {inference_method} method."
+                ),
+            )
+
+        session = dataset._session
+        if session is None:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.NOT_FOUND,
+                original_exception=ValueError("Session must not be None for a Snowpark dataset."),
+            )
+        # Validate that key package versions in the user workspace are supported in the Snowflake conda channel.
+        pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT
+        )
+
+        # Register vectorized UDF for batch inference
+        batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
+
+        # Need to do this because if we use self._sklearn_object directly in the UDF, Snowpark
+        # will try to pickle all of self, which fails.
+        estimator = self._sklearn_object
+
+        # Input columns for the UDF are sorted by column names.
+        # We need the actual order of input cols to reorder the dataframe before calling inference methods.
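+        # For example (hypothetical names): if self.input_cols is ['"B_COL"', '"A_COL"'],
+        # the UDF receives the columns sorted as ['"A_COL"', '"B_COL"'], so the UDF body
+        # below re-selects them in the original order before calling the estimator.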
+        input_cols = self.input_cols
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), self.__class__.__name__
+            ),
+            api_calls=[pandas_udf],
+            custom_tags=dict([("autogen", True)]),
+        )
+
+        @pandas_udf(  # type: ignore
+            is_permanent=False,
+            name=batch_inference_udf_name,
+            packages=self._get_dependencies(),  # type: ignore
+            replace=True,
+            session=session,
+            statement_params=statement_params,
+        )
+        def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: ignore
+            import numpy as np
+            import pandas as pd
+
+            input_df = pd.io.json.json_normalize(ds)
+
+            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpark_df.to_pandas().
+            # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
+            # or quoted input column names saved in internal state if trained using pandas_df.
+            # The model expects exactly the same column names in the input df for the predict call.
+
+            input_df = input_df[input_cols]  # Select input columns with quoted column names.
+            if hasattr(estimator, "feature_names_in_"):
+                assert estimator is not None
+                missing_features = []
+                for i, f in enumerate(estimator.feature_names_in_):
+                    if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
+                        missing_features.append(f)
+
+                if len(missing_features) > 0:
+                    raise ValueError(
+                        "The feature names should match those that were passed during fit.\n"
+                        f"Features seen during the fit call but not present in the input: {missing_features}\n"
+                        f"Features in the input dataframe: {input_cols}\n"
+                    )
+                input_df.columns = estimator.feature_names_in_
+            else:
+                # Just rename the column names to unquoted identifiers.
+                input_df.columns = (
+                    unquoted_input_cols  # Replace the quoted column identifiers with unquoted column ids.
+                )
+            transformed_numpy_array = getattr(estimator, inference_method)(input_df)
+            if (
+                isinstance(transformed_numpy_array, list)
+                and len(transformed_numpy_array) > 0
+                and isinstance(transformed_numpy_array[0], np.ndarray)
+            ):
+                # In the case of multioutput estimators, predict_proba(), decision_function(), etc. return
+                # a list of ndarrays. We need to concatenate them.
+                transformed_numpy_array = np.concatenate(transformed_numpy_array, axis=1)
+
+            if len(transformed_numpy_array.shape) == 3:
+                # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
+                # when voting = "soft" and flatten_transform = False. We can't handle unflattened transforms,
+                # so we ignore the flatten_transform flag and flatten the results.
+                transformed_numpy_array = np.hstack(transformed_numpy_array)
+
+            if len(transformed_numpy_array.shape) > 1 and transformed_numpy_array.shape[1] != len(
+                expected_output_cols_list
+            ):
+                # HeterogeneousEnsemble's transform method produces results with varying shapes,
+                # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
+                # It is hard to predict the response shape without using fragile introspection logic.
+                # So, to avoid that, we pack the results into a dataframe of shape (n_samples, 1) with
+                # each element being a list.
+                if len(expected_output_cols_list) != 1:
+                    raise TypeError(
+                        "expected_output_cols_list must be the same length as the transformed array or of length 1"
+                    )
+                series = pd.Series(transformed_numpy_array.tolist())
+                transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols_list)
+            else:
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols_list)
+            return transformed_pandas_df.to_dict("records")  # type: ignore
+
+        batch_inference_table_name = f"SNOWML_BATCH_INFERENCE_INPUT_TABLE_{self._get_rand_id()}"
+
+        pass_through_columns = self._get_pass_through_columns(dataset)
+        # Run the transform query.
+        query_from_df = str(dataset.queries["queries"][0])
+
+        outer_select_list = pass_through_columns[:]
+        inner_select_list = pass_through_columns[:]
+
+        outer_select_list.extend(
+            [
+                "{object_name}:{column_name}{udf_datatype} as {column_name}".format(
+                    object_name=batch_inference_udf_name,
+                    column_name=c,
+                    udf_datatype=(f"::{expected_output_cols_type}" if expected_output_cols_type else ""),
+                )
+                for c in expected_output_cols_list
+            ]
+        )
+
+        inner_select_list.extend(
+            [
+                "{udf_name}(object_construct_keep_null({input_cols_dict})) AS {udf_name}".format(
+                    udf_name=batch_inference_udf_name,
+                    input_cols_dict=", ".join([f"'{c}', {c}" for c in self.input_cols]),
+                )
+            ]
+        )
+
+        sql = """WITH {input_table_name} AS ({query})
+                   SELECT
+                       {outer_select_stmt}
+                   FROM (
+                       SELECT
+                           {inner_select_stmt}
+                       FROM {input_table_name}
+                   )
+               """.format(
+            input_table_name=batch_inference_table_name,
+            query=query_from_df,
+            outer_select_stmt=", ".join(outer_select_list),
+            inner_select_stmt=", ".join(inner_select_list),
+        )
+
+        return session.sql(sql)
+
+    def _sklearn_inference(
+        self, dataset: pd.DataFrame, inference_method: str, expected_output_cols_list: List[str]
+    ) -> pd.DataFrame:
+        output_cols = expected_output_cols_list.copy()
+
+        # The model expects exactly the same column names in the input df for the predict call.
+        # Given the scenario where the user used a Snowpark DataFrame in the fit call but a pandas DataFrame in the
+        # predict call, the input cols need to match either the unquoted or the quoted names.
+        input_cols = self.input_cols
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
+
+        estimator = self._sklearn_object
+
+        assert estimator is not None
+        features_required_by_estimator = (
+            estimator.feature_names_in_ if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        )
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (
+                    input_cols[i] not in features_in_dataset
+                    and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset
+                )
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.NOT_FOUND,
+                original_exception=ValueError(
+                    "The feature names should match those that were passed during fit.\n"
+                    f"Features seen during the fit call but not present in the input: {missing_features}\n"
+                    f"Features in the input dataframe: {input_cols}\n"
+                ),
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
+
+        transformed_numpy_array = getattr(estimator, inference_method)(input_df)
+
+        if (
+            isinstance(transformed_numpy_array, list)
+            and len(transformed_numpy_array) > 0
+            and isinstance(transformed_numpy_array[0], np.ndarray)
+        ):
+            # In the case of multioutput estimators, predict_proba(), decision_function(), etc. return
+            # a list of ndarrays. We need to concatenate them.
+
+            # First compute output column names
+            if len(output_cols) == len(transformed_numpy_array):
+                actual_output_cols = []
+                for idx, np_arr in enumerate(transformed_numpy_array):
+                    for i in range(1 if len(np_arr.shape) <= 1 else np_arr.shape[1]):
+                        actual_output_cols.append(f"{output_cols[idx]}_{i}")
+                output_cols = actual_output_cols
+
+            # Concatenate np arrays
+            transformed_numpy_array = np.concatenate(transformed_numpy_array, axis=1)
+
+        if len(transformed_numpy_array.shape) == 3:
+            # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
+            # when voting = "soft" and flatten_transform = False. We can't handle unflattened transforms,
+            # so we ignore the flatten_transform flag and flatten the results.
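+            # For example (hypothetical shapes): np.hstack turns an array of shape
+            # (3, 100, 2) -- 3 classifiers, 100 samples, 2 classes -- into one of
+            # shape (100, 6), one row per sample.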
+            transformed_numpy_array = np.hstack(transformed_numpy_array)
+
+        if len(transformed_numpy_array.shape) == 1:
+            transformed_numpy_array = np.reshape(transformed_numpy_array, (-1, 1))
+
+        shape = transformed_numpy_array.shape
+        if shape[1] != len(output_cols):
+            if len(output_cols) != 1:
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_ARGUMENT,
+                    original_exception=TypeError(
+                        "expected_output_cols_list must be the same length as the transformed array or of length 1"
+                    ),
+                )
+            actual_output_cols = []
+            for i in range(shape[1]):
+                actual_output_cols.append(f"{output_cols[0]}_{i}")
+            output_cols = actual_output_cols
+
+        if self._drop_input_cols:
+            dataset = pd.DataFrame(data=transformed_numpy_array, columns=output_cols)
+        else:
+            dataset = dataset.copy()
+            dataset[output_cols] = transformed_numpy_array
+        return dataset
+
+    @available_if(_original_estimator_has_callable("predict"))  # type: ignore
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    @telemetry.add_stmt_params_to_df(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]:
+        """Call predict on the estimator with the best found parameters.
+        For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.predict]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.predict)
+
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+
+        Returns:
+            Transformed dataset.
+        """
+        super()._check_dataset_type(dataset)
+        if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is a classifier, infer the datatype from the label columns
+            if expected_type_inferred == "" and "predict" in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures["predict"].outputs[0].as_snowpark_type()
+                )
+
+            output_df = self._batch_inference(
+                dataset=dataset,
+                inference_method="predict",
+                expected_output_cols_list=self.output_cols,
+                expected_output_cols_type=expected_type_inferred,
+            )
+        elif isinstance(dataset, pd.DataFrame):
+            output_df = self._sklearn_inference(
+                dataset=dataset,
+                inference_method="predict",
+                expected_output_cols_list=self.output_cols,
+            )
+
+        return output_df
+
+    @available_if(_original_estimator_has_callable("transform"))  # type: ignore
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    @telemetry.add_stmt_params_to_df(
+        project=_PROJECT,
+        subproject=_SUBPROJECT,
+        custom_tags=dict([("autogen", True)]),
+    )
+    def transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]:
+        """Call transform on the estimator with the best found parameters.
+        For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.transform]
+        (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.transform)
+
+        Args:
+            dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame]
+                Snowpark or Pandas DataFrame.
+
+        Returns:
+            Transformed dataset.
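+
+        Example (an illustrative sketch; ``search`` is a fitted instance and ``df`` is a
+        Snowpark or pandas DataFrame containing the expected input columns):
+            >>> transformed = search.transform(df)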
+ """ + super()._check_dataset_type(dataset) + if isinstance(dataset, DataFrame): + expected_dtype = "" + if False: # is child of _BaseHeterogeneousEnsemble + # transform() method of HeterogeneousEnsemble estimators return responses of varying shapes + # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between) + # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with + # each row containing a list of values. + expected_dtype = "ARRAY" + + output_df = self._batch_inference( + dataset=dataset, + inference_method="transform", + expected_output_cols_list=self.output_cols, + expected_output_cols_type=expected_dtype, + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="transform", + expected_output_cols_list=self.output_cols, + ) + + return output_df + + def _get_output_column_names(self, output_cols_prefix: str) -> List[str]: + """Returns the list of output columns for predict_proba(), decision_function(), etc.. functions. + Returns a list with output_cols_prefix as the only element if the estimator is not a classifier. + + Args: + output_cols_prefix (str): prefix according to the function + + Returns: + List[str]: output cols with prefix + """ + if getattr(self._sklearn_object, "classes_", None) is None: + return [output_cols_prefix] + + assert self._sklearn_object is not None # keep mypy happy + classes = self._sklearn_object.classes_ + if isinstance(classes, numpy.ndarray): + return [f"{output_cols_prefix}{c}" for c in classes.tolist()] + elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray): + # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays. + output_cols = [] + for i, cl in enumerate(classes): + # For binary classification, there is only one output column for each class + # ndarray as the two classes are complementary. + if len(cl) == 2: + output_cols.append(f"{output_cols_prefix}_{i}_{cl[0]}") + else: + output_cols.extend([f"{output_cols_prefix}_{i}_{c}" for c in cl.tolist()]) + return output_cols + return [] + + @available_if(_original_estimator_has_callable("predict_proba")) # type: ignore + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def predict_proba( + self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_proba_" + ) -> Union[DataFrame, pd.DataFrame]: + """Call predict_proba on the estimator with the best found parameters + For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.predict_proba] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.predict_proba) + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + output_cols_prefix: Prefix for the response columns + + Returns: + Output dataset with probability of the sample for each class in the model. 
+ """ + super()._check_dataset_type(dataset) + if isinstance(dataset, DataFrame): + output_df = self._batch_inference( + dataset=dataset, + inference_method="predict_proba", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix), + expected_output_cols_type="float", + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="predict_proba", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix), + ) + + return output_df + + @available_if(_original_estimator_has_callable("predict_log_proba")) # type: ignore + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def predict_log_proba( + self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "predict_log_proba_" + ) -> Union[DataFrame, pd.DataFrame]: + """Call predict_proba on the estimator with the best found parameters + For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.predict_proba] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.predict_proba) + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + output_cols_prefix: str + Prefix for the response columns + + Returns: + Output dataset with log probability of the sample for each class in the model. + """ + super()._check_dataset_type(dataset) + if isinstance(dataset, DataFrame): + output_df = self._batch_inference( + dataset=dataset, + inference_method="predict_log_proba", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix), + expected_output_cols_type="float", + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="predict_log_proba", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix), + ) + + return output_df + + @available_if(_original_estimator_has_callable("decision_function")) # type: ignore + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + @telemetry.add_stmt_params_to_df( + project=_PROJECT, + subproject=_SUBPROJECT, + custom_tags=dict([("autogen", True)]), + ) + def decision_function( + self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "decision_function_" + ) -> Union[DataFrame, pd.DataFrame]: + """Call decision_function on the estimator with the best found parameters + For more details on this function, see [sklearn.model_selection.RandomizedSearchCV.decision_function] + (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV.decision_function) + + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + output_cols_prefix: str + Prefix for the response columns + + Returns: + Output dataset with results of the decision function for the samples in input dataset. 
+ """ + super()._check_dataset_type(dataset) + if isinstance(dataset, DataFrame): + output_df = self._batch_inference( + dataset=dataset, + inference_method="decision_function", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix), + expected_output_cols_type="float", + ) + elif isinstance(dataset, pd.DataFrame): + output_df = self._sklearn_inference( + dataset=dataset, + inference_method="decision_function", + expected_output_cols_list=self._get_output_column_names(output_cols_prefix), + ) + + return output_df + + @available_if(_original_estimator_has_callable("score")) # type: ignore + def score(self, dataset: Union[DataFrame, pd.DataFrame]) -> float: + """ + Args: + dataset: Union[snowflake.snowpark.DataFrame, pandas.DataFrame] + Snowpark or Pandas DataFrame. + + Returns: + Score. + """ + self._infer_input_output_cols(dataset) + super()._check_dataset_type(dataset) + if isinstance(dataset, pd.DataFrame): + output_score = self._score_sklearn(dataset) + elif isinstance(dataset, DataFrame): + output_score = self._score_snowpark(dataset) + return output_score + + def _score_sklearn(self, dataset: pd.DataFrame) -> float: + assert self._sklearn_object is not None and hasattr(self._sklearn_object, "score") # make type checker happy + argspec = inspect.getfullargspec(self._sklearn_object.score) + if "X" in argspec.args: + args = {"X": dataset[self.input_cols]} + elif "X_test" in argspec.args: + args = {"X_test": dataset[self.input_cols]} + else: + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError("Neither 'X' or 'X_test' exist in argument"), + ) + + if self.label_cols: + label_arg_name = "Y" if "Y" in argspec.args else "y" + args[label_arg_name] = dataset[self.label_cols].squeeze() + + if self.sample_weight_col is not None and "sample_weight" in argspec.args: + args["sample_weight"] = dataset[self.sample_weight_col].squeeze() + + score = self._sklearn_object.score(**args) + return score + + def _score_snowpark(self, dataset: DataFrame) -> float: + # Specify input columns so column pruing will be enforced + selected_cols = self._get_active_columns() + if len(selected_cols) > 0: + dataset = dataset.select(selected_cols) + + # Extract queries that generated the dataframe. We will need to pass it to score procedure. + queries = dataset.queries["queries"] + + # Create a temp file and dump the score to that file. + local_score_file_name = get_temp_file_path() + with open(local_score_file_name, mode="w+b") as local_score_file: + cp.dump(self._sklearn_object, local_score_file) + + # Create temp stage to run score. 
+        score_stage_name = random_name_for_temp_object(TempObjectType.STAGE)
+        session = dataset._session
+        assert session is not None  # keep mypy happy
+        stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
+        SqlResultValidator(session=session, query=stage_creation_query).has_dimensions(
+            expected_rows=1, expected_cols=1
+        ).validate()
+
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), self.__class__.__name__
+            ),
+            api_calls=[sproc],
+            custom_tags=dict([("autogen", True)]),
+        )
+        # Put the locally serialized score object on the stage.
+        session.file.put(
+            local_score_file_name,
+            stage_score_file_name,
+            auto_compress=False,
+            overwrite=True,
+            statement_params=statement_params,
+        )
+
+        @sproc(
+            is_permanent=False,
+            name=score_sproc_name,
+            packages=["snowflake-snowpark-python"] + self._get_dependencies(),  # type: ignore
+            replace=True,
+            session=session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+        def score_wrapper_sproc(
+            session: Session,
+            sql_queries: List[str],
+            stage_score_file_name: str,
+            input_cols: List[str],
+            label_cols: List[str],
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str],
+        ) -> float:
+            import inspect
+            import os
+            import tempfile
+
+            import cloudpickle as cp
+            import numpy as np  # noqa: F401
+            import pandas  # noqa: F401
+            import sklearn  # noqa: F401
+
+            for query in sql_queries[:-1]:
+                _ = session.sql(query).collect(statement_params=statement_params)
+            df = session.sql(sql_queries[-1]).to_pandas(statement_params=statement_params)
+
+            local_score_file = tempfile.NamedTemporaryFile(delete=True)
+            local_score_file_name = local_score_file.name
+            local_score_file.close()
+
+            session.file.get(stage_score_file_name, local_score_file_name, statement_params=statement_params)
+
+            local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0])
+            with open(local_score_file_name_path, mode="r+b") as local_score_file_obj:
+                estimator = cp.load(local_score_file_obj)
+
+            argspec = inspect.getfullargspec(estimator.score)
+            if "X" in argspec.args:
+                args = {"X": df[input_cols]}
+            elif "X_test" in argspec.args:
+                args = {"X_test": df[input_cols]}
+            else:
+                raise RuntimeError("Neither 'X' nor 'X_test' exists in the score method's arguments.")
+
+            if label_cols:
+                label_arg_name = "Y" if "Y" in argspec.args else "y"
+                args[label_arg_name] = df[label_cols].squeeze()
+
+            if sample_weight_col is not None and "sample_weight" in argspec.args:
+                args["sample_weight"] = df[sample_weight_col].squeeze()
+
+            result: float = estimator.score(**args)
+            return result
+
+        # Call the score sproc.
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            function_name=telemetry.get_statement_params_full_func_name(
+                inspect.currentframe(), self.__class__.__name__
+            ),
+            api_calls=[Session.call],
+            custom_tags=dict([("autogen", True)]),
+        )
+        score: float = score_wrapper_sproc(
+            session,
+            queries,
+            stage_score_file_name,
+            identifier.get_unescaped_names(self.input_cols),
+            identifier.get_unescaped_names(self.label_cols),
+            identifier.get_unescaped_names(self.sample_weight_col),
+            statement_params,
+        )
+
+        cleanup_temp_files([local_score_file_name])
+
+        return score
+
+    def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+        self._model_signature_dict = dict()
+
+        PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
+
+        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        outputs: List[BaseFeatureSpec] = []
+        if hasattr(self, "predict"):
+            # keep mypy happy
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            # For classifiers, the type of predict is the same as the type of the label columns.
+            if self._sklearn_object._estimator_type == "classifier":
+                # The label columns' type is the desired output type.
+                outputs = _infer_signature(dataset[self.label_cols], "output")
+                # rename the output columns
+                outputs = model_signature_utils.rename_features(outputs, self.output_cols)
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+            # For regressors, the type of predict is float64.
+            elif self._sklearn_object._estimator_type == "regressor":
+                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+        for prob_func in PROB_FUNCTIONS:
+            if hasattr(self, prob_func):
+                output_cols_prefix: str = f"{prob_func}_"
+                output_column_names = self._get_output_column_names(output_cols_prefix)
+                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
+    @property
+    def model_signatures(self) -> Dict[str, ModelSignature]:
+        """Returns the model signatures of the current class.
+ + Raises: + SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred + + Returns: + Dict[str, ModelSignature]: each method and its input output signature + """ + if self._model_signature_dict is None: + raise exceptions.SnowflakeMLException( + error_code=error_codes.INVALID_ATTRIBUTE, + original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"), + ) + return self._model_signature_dict + + def to_sklearn(self) -> Any: + if self._sklearn_object is None: + self._sklearn_object = self._create_sklearn_object() + return self._sklearn_object + + def _get_dependencies(self) -> List[str]: + return self._deps diff --git a/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py index 649a198c..bfaaa7c9 100644 --- a/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +++ b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py @@ -18,7 +18,10 @@ from snowflake.ml._internal.exceptions import error_codes, exceptions from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T -from snowflake.snowpark._internal import utils as snowpark_utils +from snowflake.snowpark._internal.utils import ( + TempObjectType, + random_name_for_temp_object, +) # constants used to validate the compatibility of the kwargs passed to the sklearn # transformer with the sklearn version @@ -328,17 +331,18 @@ def _handle_ordinal(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: # NB: the reason we need to generate a random UDF name each time is because the UDF registration # is centralized per database, so if there are multiple sessions with same UDF name, there might be # a conflict and some parties could fail to fetch the UDF. - udf_name = f"vec_bucketize_{snowpark_utils.generate_random_alphanumeric()}" + udf_name = random_name_for_temp_object(TempObjectType.FUNCTION) # 1. Register vec_bucketize UDF @F.pandas_udf( # type: ignore[arg-type, misc] + is_permanent=False, name=udf_name, replace=True, packages=["numpy"], session=dataset._session, statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__), ) - def vec_bucketize(x: T.PandasSeries[float], boarders: T.PandasSeries[List[float]]) -> T.PandasSeries[int]: + def vec_bucketize_temp(x: T.PandasSeries[float], boarders: T.PandasSeries[List[float]]) -> T.PandasSeries[int]: # NB: vectorized udf doesn't work well with const array arg, so we pass it in as a list via PandasSeries boarders = boarders[0] res = np.searchsorted(boarders[1:-1], x, side="right") @@ -369,16 +373,17 @@ def _handle_onehot(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: Output dataset in sparse representation. 
""" passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] - udf_name = f"vec_bucketize_sparse_{snowpark_utils.generate_random_alphanumeric()}" + udf_name = random_name_for_temp_object(TempObjectType.FUNCTION) @F.pandas_udf( # type: ignore[arg-type, misc] + is_permanent=False, name=udf_name, replace=True, packages=["numpy"], session=dataset._session, statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__), ) - def vec_bucketize_sparse_output( + def vec_bucketize_sparse_output_temp( x: T.PandasSeries[float], boarders: T.PandasSeries[List[float]] ) -> T.PandasSeries[Dict[str, int]]: res: List[Dict[str, int]] = [] @@ -416,7 +421,7 @@ def _handle_onehot_dense(self, dataset: snowpark.DataFrame) -> snowpark.DataFram origional_dataset_columns = dataset.columns[:] all_output_cols = [] - udf_name = f"vec_bucketize_dense_{snowpark_utils.generate_random_alphanumeric()}" + udf_name = random_name_for_temp_object(TempObjectType.FUNCTION) @F.pandas_udf( # type: ignore[arg-type, misc] name=udf_name, @@ -425,7 +430,7 @@ def _handle_onehot_dense(self, dataset: snowpark.DataFrame) -> snowpark.DataFram session=dataset._session, statement_params=telemetry.get_statement_params(base.PROJECT, base.SUBPROJECT, self.__class__.__name__), ) - def vec_bucketize_dense_output( + def vec_bucketize_dense_output_temp( x: T.PandasSeries[float], boarders: T.PandasSeries[List[float]] ) -> T.PandasSeries[List[int]]: res: List[npt.NDArray[np.int32]] = [] diff --git a/snowflake/ml/modeling/preprocessing/one_hot_encoder.py b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py index acf20cc8..bacc6f57 100644 --- a/snowflake/ml/modeling/preprocessing/one_hot_encoder.py +++ b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py @@ -21,7 +21,11 @@ from snowflake.ml.model import model_signature from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F, types as T -from snowflake.snowpark._internal import utils as snowpark_utils +from snowflake.snowpark._internal.utils import ( + TempObjectType, + generate_random_alphanumeric, + random_name_for_temp_object, +) _INFREQUENT_CATEGORY = "_INFREQUENT" _COLUMN_NAME = "_COLUMN_NAME" @@ -864,9 +868,11 @@ def _transform_snowpark_sparse_udf(self, dataset: snowpark.DataFrame) -> snowpar Output dataset in the sparse representation. 
""" encoder_sklearn = self.to_sklearn() + udf_name = random_name_for_temp_object(TempObjectType.FUNCTION) @F.pandas_udf( # type: ignore is_permanent=False, + name=udf_name, replace=True, return_type=T.PandasSeriesType(T.ArrayType(T.MapType(T.FloatType(), T.FloatType()))), input_types=[T.PandasDataFrameType([T.StringType() for _ in range(len(self.input_cols))])], @@ -897,7 +903,7 @@ def one_hot_encoder_sparse_transform(data: pd.DataFrame) -> List[List[Optional[D return transformed_vals # encoded column returned by `one_hot_encoder_sparse_transform` - encoded_output_col = f"'ENCODED_OUTPUT_{snowpark_utils.generate_random_alphanumeric()}'" + encoded_output_col = f"'ENCODED_OUTPUT_{generate_random_alphanumeric()}'" encoded_column = one_hot_encoder_sparse_transform(self.input_cols) # type: ignore encoded_dataset = dataset.with_column(encoded_output_col, encoded_column) diff --git a/snowflake/ml/registry/BUILD.bazel b/snowflake/ml/registry/BUILD.bazel index 240dfa9a..7094c290 100644 --- a/snowflake/ml/registry/BUILD.bazel +++ b/snowflake/ml/registry/BUILD.bazel @@ -8,15 +8,17 @@ py_library( srcs = ["model_registry.py"], deps = [ ":_schema", + ":_ml_artifact", "//snowflake/ml/_internal/utils:formatting", "//snowflake/ml/_internal/utils:query_result_checker", "//snowflake/ml/_internal/utils:uri", "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/_internal:file_utils", + "//snowflake/ml/_internal/utils:table_manager", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/model:_model", "//snowflake/ml/model:_deployer", "//snowflake/ml/model:deploy_platforms", + "//snowflake/ml/training_dataset:training_dataset", "//snowflake/ml/modeling/framework:framework" ], ) @@ -38,6 +40,27 @@ py_library( visibility = ["//visibility:private"], ) +py_library( + name = "_ml_artifact", + srcs = ["_ml_artifact.py"], + deps = [ + ":_schema", + "//snowflake/ml/_internal/utils:table_manager", + "//snowflake/ml/_internal/utils:formatting", + ], +) + +py_test( + name = "_ml_artifact_test", + srcs = ["_ml_artifact_test.py"], + deps = [ + ":_ml_artifact", + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/test_utils:mock_data_frame", + "//snowflake/ml/test_utils:mock_session", + ], +) + py_package( name = "model_registry_pkg", packages = ["snowflake.ml"], diff --git a/snowflake/ml/registry/_ml_artifact.py b/snowflake/ml/registry/_ml_artifact.py new file mode 100644 index 00000000..42e14bbc --- /dev/null +++ b/snowflake/ml/registry/_ml_artifact.py @@ -0,0 +1,214 @@ +import enum +from typing import Any, Dict, Optional, cast + +from snowflake import connector, snowpark +from snowflake.ml._internal.utils import formatting, identifier, table_manager +from snowflake.ml.registry import _schema + + +# Set of allowed artifact types. +class ArtifactType(enum.Enum): + TESTTYPE = "TESTTYPE" # A placeholder type just for unit test + TRAINING_DATASET = "TRAINING_DATASET" + + +# Default name of the artifact table +_ARTIFACT_TABLE_NAME: str = identifier.get_inferred_name("_SYSTEM_REGISTRY_ARTIFACTS") + + +def create_ml_artifact_table( + session: snowpark.Session, + database_name: str, + schema_name: str, + statement_params: Dict[str, Any], +) -> None: + """Create the ml artifact table to store immutable properties of various artifacts. + + This artifact table will follow a predefined schema detailed in `_ARTIFACT_TABLE_SCHEMA` from `_schema.py`. + + Note: + The artifact table uses (ID + TYPE) as its compound primary key, hence, it needs an out-of-line private key. 
+ + Args: + session: Snowpark session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + statement_params: Function usage statement parameters used in sql query executions. + """ + table_manager.create_single_registry_table( + session=session, + database_name=database_name, + schema_name=schema_name, + table_name=_ARTIFACT_TABLE_NAME, + table_schema=_schema._ARTIFACT_TABLE_SCHEMA, + statement_params=statement_params, + ) + + +def if_artifact_table_exists( + session: snowpark.Session, + database_name: str, + schema_name: str, +) -> bool: + """ + Verify the existence of the artifact table. + + Args: + session: Snowpark session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + + Returns: + bool: True if the artifact table exists, False otherwise. + """ + qualified_schema_name = table_manager.get_fully_qualified_schema_name(database_name, schema_name) + return table_manager.validate_table_exist(session, _ARTIFACT_TABLE_NAME, qualified_schema_name) + + +def if_artifact_exists( + session: snowpark.Session, database_name: str, schema_name: str, artifact_id: str, artifact_type: ArtifactType +) -> bool: + """Validate if a specific artifact record exists in the artifact table. + + Args: + session: Session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + artifact_id: Unique identifier of the target artifact. + artifact_type: Type of the target artifact + + Returns: + bool: True if the artifact exists, False otherwise. + """ + selected_artifact = _get_artifact(session, database_name, schema_name, artifact_id, artifact_type).collect() + + assert ( + len(selected_artifact) < 2 + ), f"Multiple records found for the specified artifact (ID: {artifact_id}, TYPE: {artifact_type.name})!" + + return len(selected_artifact) == 1 + + +def add_artifact( + session: snowpark.Session, + database_name: str, + schema_name: str, + artifact_id: str, + artifact_type: ArtifactType, + artifact_name: str, + artifact_version: Optional[str], + artifact_spec: Dict[str, Any], +) -> None: + """ + Insert a new artifact record into the designated artifact table. + + Args: + session: Session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + artifact_id: Unique identifier for the artifact. + artifact_type: Type of the artifact. + artifact_name: Name of the artifact. + artifact_version: Version of the artifact if applicable. + artifact_spec: Specifications related to the artifact. + + Raises: + TypeError: If the given artifact type isn't valid. + DataError: If the given artifact already exists in the database. + """ + if not isinstance(artifact_type, ArtifactType): + raise TypeError(f"{artifact_type} isn't a recognized artifact type.") + + if if_artifact_exists(session, database_name, schema_name, artifact_id, artifact_type): + raise connector.DataError( + f"artifact with ID {artifact_id} and TYPE {artifact_type.name} already exists. Unable to add the artifact." 
+        )
+
+    fully_qualified_table_name = table_manager.get_fully_qualified_table_name(
+        database_name, schema_name, _ARTIFACT_TABLE_NAME
+    )
+
+    new_artifact = {
+        "ID": artifact_id,
+        "TYPE": artifact_type.name,
+        "NAME": artifact_name,
+        "VERSION": artifact_version,
+        "CREATION_ROLE": session.get_current_role(),
+        "CREATION_TIME": formatting.SqlStr("CURRENT_TIMESTAMP()"),
+        "ARTIFACT_SPEC": artifact_spec,
+    }
+
+    # TODO: Consider updating the METADATA table for artifact history tracking as well.
+    table_manager.insert_table_entry(session, fully_qualified_table_name, new_artifact)
+
+
+def delete_artifact(
+    session: snowpark.Session,
+    database_name: str,
+    schema_name: str,
+    artifact_id: str,
+    artifact_type: ArtifactType,
+    error_if_not_exist: bool = False,
+) -> None:
+    """
+    Remove an artifact record from the designated artifact table.
+
+    Args:
+        session: Session object to communicate with Snowflake.
+        database_name: Desired name of the model registry database.
+        schema_name: Desired name of the schema used by this model registry inside the database.
+        artifact_id: Unique identifier for the artifact to be deleted.
+        artifact_type: Type of the artifact to be deleted.
+        error_if_not_exist: Whether to raise an error if the target entry doesn't exist. Defaults to False.
+
+    Raises:
+        DataError: If error_if_not_exist is True and the artifact doesn't exist in the database.
+        RuntimeError: If the artifact deletion failed.
+    """
+    if error_if_not_exist and not if_artifact_exists(session, database_name, schema_name, artifact_id, artifact_type):
+        raise connector.DataError(
+            f"Artifact with ID '{artifact_id}' and TYPE '{artifact_type.name}' doesn't exist. Deletion not possible."
+        )
+
+    fully_qualified_table_name = table_manager.get_fully_qualified_table_name(
+        database_name, schema_name, _ARTIFACT_TABLE_NAME
+    )
+
+    delete_query = f"DELETE FROM {fully_qualified_table_name} WHERE ID='{artifact_id}' AND TYPE='{artifact_type.name}'"
+
+    # TODO: Consider updating the METADATA table for artifact history tracking as well.
+    try:
+        session.sql(delete_query).collect()
+    except Exception as e:
+        raise RuntimeError(f"Deleting the ML artifact (ID: {artifact_id}, TYPE: {artifact_type.name}) failed due to {e}")
+
+
+def _get_artifact(
+    session: snowpark.Session, database_name: str, schema_name: str, artifact_id: str, artifact_type: ArtifactType
+) -> snowpark.DataFrame:
+    """Retrieve the Snowpark dataframe of the artifact matching the provided artifact id and type.
+
+    Given that ID and TYPE act as a compound primary key for the artifact table, the resulting dataframe should have,
+    at most, one row.
+
+    Args:
+        session: Session object to communicate with Snowflake.
+        database_name: Desired name of the model registry database.
+        schema_name: Desired name of the schema used by this model registry inside the database.
+        artifact_id: Unique identifier of the target artifact.
+        artifact_type: Type of the target artifact.
+
+    Returns:
+        A Snowpark dataframe representing the artifacts that match the given constraints.
+
+    WARNING:
+        The returned DataFrame is writable and shouldn't be made accessible to users.
+ """ + artifacts = session.sql( + "SELECT * FROM " + f"{table_manager.get_fully_qualified_table_name(database_name, schema_name, _ARTIFACT_TABLE_NAME)}" + ) + target_artifact = artifacts.filter(snowpark.Column("ID") == artifact_id).filter( + snowpark.Column("TYPE") == artifact_type.name + ) + return cast(snowpark.DataFrame, target_artifact) diff --git a/snowflake/ml/registry/_ml_artifact_test.py b/snowflake/ml/registry/_ml_artifact_test.py new file mode 100644 index 00000000..c6370939 --- /dev/null +++ b/snowflake/ml/registry/_ml_artifact_test.py @@ -0,0 +1,203 @@ +import datetime +from typing import List, cast + +from absl.testing import absltest + +from snowflake import connector, snowpark +from snowflake.ml._internal.utils import identifier, table_manager +from snowflake.ml.registry import _ml_artifact +from snowflake.ml.test_utils import mock_data_frame, mock_session + +_DATABASE_NAME = identifier.get_inferred_name("_SYSTEM_MODEL_REGISTRY") +_SCHEMA_NAME = identifier.get_inferred_name("_SYSTEM_MODEL_REGISTRY_SCHEMA") +_TABLE_NAME = identifier.get_inferred_name("_SYSTEM_REGISTRY_ARTIFACTS") +_FULLY_QUALIFIED_TABLE_NAME = table_manager.get_fully_qualified_table_name(_DATABASE_NAME, _SCHEMA_NAME, _TABLE_NAME) + + +class ArtifactTest(absltest.TestCase): + """Testing Artifact table related functions.""" + + def setUp(self) -> None: + """Creates Snowpark environemnts for testing.""" + self._session = mock_session.MockSession(conn=None, test_case=self) + + def tearDown(self) -> None: + """Complete test case. Ensure all expected operations have been observed.""" + self._session.finalize() + + def _get_show_tables_success( + self, name: str, database_name: str = _DATABASE_NAME, schema_name: str = _SCHEMA_NAME + ) -> List[snowpark.Row]: + """Helper method that returns a DataFrame that looks like the response of from a successful listing of + tables.""" + return [ + snowpark.Row( + created_on=datetime.datetime(2022, 11, 4, 17, 1, 30, 153000), + name=name, + database_name=database_name, + schema_name=schema_name, + kind="TABLE", + comment="", + cluster_by="", + rows=0, + bytes=0, + owner="OWNER_ROLE", + retention_time=1, + change_tracking="OFF", + is_external="N", + enable_schema_evolution="N", + ) + ] + + def _get_select_artifact(self) -> List[snowpark.Row]: + """Helper method that returns a DataFrame that looks like the response of from a successful listing of + tables.""" + return [ + snowpark.Row( + id="FAKE_ID", + type=_ml_artifact.ArtifactType.TESTTYPE, + name="FAKE_NAME", + version=None, + creation_time=datetime.datetime(2022, 11, 4, 17, 1, 30, 153000), + creation_role="OWNER_ROLE", + artifact_spec={}, + ) + ] + + def test_create_artifact_table(self) -> None: + expected_artifact_table_schema_query = ( + "ID VARCHAR, " + "TYPE VARCHAR, " + "NAME VARCHAR, " + "VERSION VARCHAR, " + "CREATION_ROLE VARCHAR, " + "CREATION_TIME TIMESTAMP_TZ, " + "ARTIFACT_SPEC OBJECT, " + "PRIMARY KEY (ID, TYPE) RELY" + ) + self._session.add_mock_sql( + query=f"CREATE TABLE IF NOT EXISTS {_FULLY_QUALIFIED_TABLE_NAME} ({expected_artifact_table_schema_query})", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"Table {_TABLE_NAME} successfully created.")], + ), + ) + _ml_artifact.create_ml_artifact_table(cast(snowpark.Session, self._session), _DATABASE_NAME, _SCHEMA_NAME, {}) + + def test_if_artifact_table_exists(self) -> None: + for mock_df, expected_res in [ + (mock_data_frame.MockDataFrame(self._get_show_tables_success(name=_TABLE_NAME)), True), + (mock_data_frame.MockDataFrame([]), False), + ]: + 
with self.subTest(): + self._session.add_mock_sql( + query=f"SHOW TABLES LIKE '{_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", + result=mock_df, + ) + self.assertEqual( + _ml_artifact.if_artifact_table_exists( + cast(snowpark.Session, self._session), _DATABASE_NAME, _SCHEMA_NAME + ), + expected_res, + ) + + def test_if_artifact_exists(self) -> None: + for mock_df_collect, expected_res in [ + (self._get_select_artifact(), True), + ([], False), + ]: + with self.subTest(): + artifact_id = "FAKE_ID" + artifact_type = _ml_artifact.ArtifactType.TESTTYPE + expected_df = mock_data_frame.MockDataFrame() + expected_df.add_operation("filter") + expected_df.add_operation("filter") + expected_df.add_collect_result(cast(List[snowpark.Row], mock_df_collect)) + self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) + self.assertEqual( + _ml_artifact.if_artifact_exists( + cast(snowpark.Session, self._session), + _DATABASE_NAME, + _SCHEMA_NAME, + artifact_id, + artifact_type, + ), + expected_res, + ) + + def test_add_artifact(self) -> None: + artifact_id = "FAKE_ID" + artifact_name = "FAKE_NAME" + artifact_version = "1.0.0" + artifact_spec = {"description": "mock description"} + + # Mock the get_artifact call + expected_df = mock_data_frame.MockDataFrame() + expected_df.add_operation("filter") + expected_df.add_operation("filter") + expected_df.add_collect_result([]) + self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) + + # Mock the insertion call + self._session.add_operation("get_current_role", result="current_role") + insert_query = ( + f"INSERT INTO {_FULLY_QUALIFIED_TABLE_NAME}" + " ( ARTIFACT_SPEC,CREATION_ROLE,CREATION_TIME,ID,NAME,TYPE,VERSION )" + " SELECT" + " OBJECT_CONSTRUCT('description','mock description'),'current_role',CURRENT_TIMESTAMP()," + "'FAKE_ID','FAKE_NAME','TESTTYPE','1.0.0' " + ) + self._session.add_mock_sql( + query=insert_query, + result=mock_data_frame.MockDataFrame([snowpark.Row(**{"number of rows inserted": 1})]), + ) + _ml_artifact.add_artifact( + cast(snowpark.Session, self._session), + _DATABASE_NAME, + _SCHEMA_NAME, + artifact_id, + _ml_artifact.ArtifactType.TESTTYPE, + artifact_name, + artifact_version, + artifact_spec, + ) + + def test_delete_artifact(self) -> None: + for error_if_not_exist in [True, False]: + with self.subTest(): + if error_if_not_exist: + artifact_id = "FAKE_ID" + expected_df = mock_data_frame.MockDataFrame() + expected_df.add_operation("filter") + expected_df.add_operation("filter") + expected_df.add_collect_result([]) + self._session.add_mock_sql(query=f"SELECT * FROM {_FULLY_QUALIFIED_TABLE_NAME}", result=expected_df) + with self.assertRaises(connector.DataError): + _ml_artifact.delete_artifact( + cast(snowpark.Session, self._session), + _DATABASE_NAME, + _SCHEMA_NAME, + artifact_id, + _ml_artifact.ArtifactType.TESTTYPE, + True, + ) + else: + # Mock the delete call + insert_query = ( + f"DELETE FROM {_FULLY_QUALIFIED_TABLE_NAME}" + f" WHERE ID='{artifact_id}' AND TYPE='{_ml_artifact.ArtifactType.TESTTYPE.name}'" + ) + self._session.add_mock_sql( + query=insert_query, + result=mock_data_frame.MockDataFrame([snowpark.Row(**{"number of rows deleted": 1})]), + ) + _ml_artifact.delete_artifact( + cast(snowpark.Session, self._session), + _DATABASE_NAME, + _SCHEMA_NAME, + artifact_id, + _ml_artifact.ArtifactType.TESTTYPE, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/registry/_schema.py b/snowflake/ml/registry/_schema.py 
index d0757b53..75788ce0 100644 --- a/snowflake/ml/registry/_schema.py +++ b/snowflake/ml/registry/_schema.py @@ -1,41 +1,55 @@ -from typing import Dict +from typing import List, Tuple # TODO(amauser): Move this scheme and registry creation in general into a server-side implementation. -_REGISTRY_TABLE_SCHEMA: Dict[str, str] = { - "CREATION_CONTEXT": "VARCHAR", - "CREATION_ENVIRONMENT_SPEC": "OBJECT", - "CREATION_ROLE": "VARCHAR", - "CREATION_TIME": "TIMESTAMP_TZ", - "ID": "VARCHAR PRIMARY KEY RELY", - "INPUT_SPEC": "OBJECT", - "NAME": "VARCHAR", - "OUTPUT_SPEC": "OBJECT", - "RUNTIME_ENVIRONMENT_SPEC": "OBJECT", - "TYPE": "VARCHAR", - "URI": "VARCHAR", - "VERSION": "VARCHAR", -} +_REGISTRY_TABLE_SCHEMA: List[Tuple[str, str]] = [ + ("CREATION_CONTEXT", "VARCHAR"), + ("CREATION_ENVIRONMENT_SPEC", "OBJECT"), + ("CREATION_ROLE", "VARCHAR"), + ("CREATION_TIME", "TIMESTAMP_TZ"), + ("ID", "VARCHAR PRIMARY KEY RELY"), + ("INPUT_SPEC", "OBJECT"), + ("NAME", "VARCHAR"), + ("OUTPUT_SPEC", "OBJECT"), + ("RUNTIME_ENVIRONMENT_SPEC", "OBJECT"), + ("TRAINING_DATASET_ID", "VARCHAR"), + ("TYPE", "VARCHAR"), + ("URI", "VARCHAR"), + ("VERSION", "VARCHAR"), +] -_METADATA_TABLE_SCHEMA: Dict[str, str] = { - # TODO(amauser): Generalize attribute to any column reference. - "ATTRIBUTE_NAME": "VARCHAR", - "EVENT_ID": "VARCHAR UNIQUE NOT NULL", - "EVENT_TIMESTAMP": "TIMESTAMP_TZ", - "MODEL_ID": "VARCHAR FOREIGN KEY REFERENCES {registry_table_name}(ID) RELY", - "OPERATION": "VARCHAR", - "ROLE": "VARCHAR", - "SEQUENCE_ID": "BIGINT AUTOINCREMENT START 0 INCREMENT 1 PRIMARY KEY", - "VALUE": "OBJECT", -} +# TODO(amauser): Generalize attribute to any column reference. +_METADATA_TABLE_SCHEMA: List[Tuple[str, str]] = [ + ("ATTRIBUTE_NAME", "VARCHAR"), + ("EVENT_ID", "VARCHAR UNIQUE NOT NULL"), + ("EVENT_TIMESTAMP", "TIMESTAMP_TZ"), + ("MODEL_ID", "VARCHAR FOREIGN KEY REFERENCES {registry_table_name}(ID) RELY"), + ("OPERATION", "VARCHAR"), + ("ROLE", "VARCHAR"), + ("SEQUENCE_ID", "BIGINT AUTOINCREMENT START 0 INCREMENT 1 PRIMARY KEY"), + ("VALUE", "OBJECT"), +] -_DEPLOYMENTS_TABLE_SCHEMA: Dict[str, str] = { - "CREATION_TIME": "TIMESTAMP_TZ", - "MODEL_ID": "VARCHAR FOREIGN KEY REFERENCES {registry_table_name}(ID) RELY", - "DEPLOYMENT_NAME": "VARCHAR", - "OPTIONS": "VARIANT", - "TARGET_PLATFORM": "VARCHAR", - "ROLE": "VARCHAR", - "STAGE_PATH": "VARCHAR", - "SIGNATURE": "VARIANT", - "TARGET_METHOD": "VARCHAR", -} +_DEPLOYMENTS_TABLE_SCHEMA: List[Tuple[str, str]] = [ + ("CREATION_TIME", "TIMESTAMP_TZ"), + ("MODEL_ID", "VARCHAR FOREIGN KEY REFERENCES {registry_table_name}(ID) RELY"), + ("DEPLOYMENT_NAME", "VARCHAR"), + ("OPTIONS", "VARIANT"), + ("TARGET_PLATFORM", "VARCHAR"), + ("ROLE", "VARCHAR"), + ("STAGE_PATH", "VARCHAR"), + ("SIGNATURE", "VARIANT"), + ("TARGET_METHOD", "VARCHAR"), +] + +_ARTIFACT_TABLE_SCHEMA: List[Tuple[str, str]] = [ + ("ID", "VARCHAR"), + ("TYPE", "VARCHAR"), + ("NAME", "VARCHAR"), + ("VERSION", "VARCHAR"), + ("CREATION_ROLE", "VARCHAR"), + ("CREATION_TIME", "TIMESTAMP_TZ"), + ("ARTIFACT_SPEC", "OBJECT"), + # Below is out-of-line constraints of Snowflake table. 
+ # See https://docs.snowflake.com/en/sql-reference/sql/create-table + ("PRIMARY KEY", "(ID, TYPE) RELY"), +] diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index 3e84e8b2..0f053eef 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -3,20 +3,29 @@ import os import posixpath import sys -import tempfile import types -import zipfile -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Union, + cast, +) from uuid import uuid1 from absl import logging from snowflake import connector, snowpark -from snowflake.ml._internal import file_utils, telemetry +from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils import ( formatting, identifier, query_result_checker, + table_manager, uri, ) from snowflake.ml.model import ( @@ -26,8 +35,8 @@ model_signature, type_hints as model_types, ) -from snowflake.ml.modeling.framework import base -from snowflake.ml.registry import _schema +from snowflake.ml.registry import _ml_artifact, _schema +from snowflake.ml.training_dataset import training_dataset if TYPE_CHECKING: import pandas as pd @@ -51,7 +60,7 @@ _METADATA_ATTRIBUTE_DEPLOYMENT: str = "DEPLOYMENTS" _METADATA_ATTRIBUTE_DELETION: str = "DELETION" -# Leaving out REGISTRATION/DEPLOYMENT evnts as they will be handled differently from all mutable attributes. +# Leaving out REGISTRATION/DEPLOYMENT events as they will be handled differently from all mutable attributes. _LIST_METADATA_ATTRIBUTE: List[str] = [ _METADATA_ATTRIBUTE_DESCRIPTION, _METADATA_ATTRIBUTE_METRICS, @@ -60,72 +69,7 @@ _TELEMETRY_PROJECT = "MLOps" _TELEMETRY_SUBPROJECT = "ModelRegistry" - -@telemetry.send_api_usage_telemetry( - project=_TELEMETRY_PROJECT, - subproject=_TELEMETRY_SUBPROJECT, -) -@snowpark._internal.utils.private_preview(version="0.2.0") -def create_model_registry( - *, - session: snowpark.Session, - database_name: str = _DEFAULT_REGISTRY_NAME, - schema_name: str = _DEFAULT_SCHEMA_NAME, -) -> bool: - """Setup a new model registry. This should be run once per model registry by an administrator role. - - Args: - session: Session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - - Returns: - True if the creation of the model registry internal data structures was successful, - False otherwise. - """ - # Get the db & schema of the current session - old_db = session.get_current_database() - old_schema = session.get_current_schema() - - # These might be exposed as parameters in the future. 
- database_name = identifier.get_inferred_name(database_name) - schema_name = identifier.get_inferred_name(schema_name) - registry_table_name = identifier.get_inferred_name(_MODELS_TABLE_NAME) - metadata_table_name = identifier.get_inferred_name(_METADATA_TABLE_NAME) - deployment_table_name = identifier.get_inferred_name(_DEPLOYMENT_TABLE_NAME) - statement_params = telemetry.get_function_usage_statement_params( - project=_TELEMETRY_PROJECT, - subproject=_TELEMETRY_SUBPROJECT, - function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), ""), - ) - try: - _create_registry_database(session, database_name, statement_params) - _create_registry_schema(session, database_name, schema_name, statement_params) - _create_registry_tables( - session, - database_name, - schema_name, - registry_table_name, - metadata_table_name, - deployment_table_name, - statement_params, - ) - _create_registry_views( - session, - database_name, - schema_name, - registry_table_name, - metadata_table_name, - deployment_table_name, - statement_params, - ) - finally: - # Restore the db & schema to the original ones - if old_db is not None: - session.use_database(old_db) - if old_schema is not None: - session.use_schema(old_schema) - return True +_STAGE_PREFIX = "@" def _create_registry_database( @@ -172,29 +116,16 @@ def _create_registry_schema( if len(registry_schemas) > 0: logging.warning( - f"The schema {_get_fully_qualified_schema_name(database_name, schema_name)}already exists. " + f"The schema {table_manager.get_fully_qualified_schema_name(database_name, schema_name)} already exists. " + "Skipping creation." ) return - session.sql(f"CREATE SCHEMA {_get_fully_qualified_schema_name(database_name, schema_name)}").collect( + session.sql(f"CREATE SCHEMA {table_manager.get_fully_qualified_schema_name(database_name, schema_name)}").collect( statement_params=statement_params ) -def _get_fully_qualified_schema_name(database_name: str, schema_name: str) -> str: - return ".".join([database_name, schema_name]) - - -def _get_fully_qualified_table_name(database_name: str, schema_name: str, table_name: str) -> str: - return ".".join( - [ - _get_fully_qualified_schema_name(database_name, schema_name), - table_name, - ] - ) - - def _create_registry_tables( session: snowpark.Session, database_name: str, @@ -202,6 +133,7 @@ registry_table_name: str, metadata_table_name: str, deployment_table_name: str, + artifact_table_name: str, statement_params: Dict[str, Any], ) -> None: """Private helper to create the model registry required tables. @@ -213,64 +145,54 @@ registry_table_name: Name for the main model registry table. metadata_table_name: Name for the metadata table used by the model registry. deployment_table_name: Name for the deployment event table. + artifact_table_name: Name for the artifact table. statement_params: Function usage statement parameters used in sql query executions.
""" # Create model registry table to store immutable properties of models - registry_table_schema_string = ", ".join([f"{k} {v}" for k, v in _schema._REGISTRY_TABLE_SCHEMA.items()]) - fully_qualified_registry_table_name = _create_single_registry_table( - session, database_name, schema_name, registry_table_name, registry_table_schema_string, statement_params + fully_qualified_registry_table_name = table_manager.create_single_registry_table( + session=session, + database_name=database_name, + schema_name=schema_name, + table_name=registry_table_name, + table_schema=_schema._REGISTRY_TABLE_SCHEMA, + statement_params=statement_params, ) # Create model metadata table to store mutable properties of models - metadata_table_schema_string = ", ".join( - [ - f"{k} {v.format(registry_table_name=fully_qualified_registry_table_name)}" - for k, v in _schema._METADATA_TABLE_SCHEMA.items() - ] - ) - _create_single_registry_table( - session, database_name, schema_name, metadata_table_name, metadata_table_schema_string, statement_params + metadata_table_schema = [ + (k, v.format(registry_table_name=fully_qualified_registry_table_name)) + for k, v in _schema._METADATA_TABLE_SCHEMA + ] + table_manager.create_single_registry_table( + session=session, + database_name=database_name, + schema_name=schema_name, + table_name=metadata_table_name, + table_schema=metadata_table_schema, + statement_params=statement_params, ) # Create model deployment table to store deployment events of models - deployment_table_schema_string = ", ".join( - [ - f"{k} {v.format(registry_table_name=fully_qualified_registry_table_name)}" - for k, v in _schema._DEPLOYMENTS_TABLE_SCHEMA.items() - ] + deployment_table_schema = [ + (k, v.format(registry_table_name=fully_qualified_registry_table_name)) + for k, v in _schema._DEPLOYMENTS_TABLE_SCHEMA + ] + table_manager.create_single_registry_table( + session=session, + database_name=database_name, + schema_name=schema_name, + table_name=deployment_table_name, + table_schema=deployment_table_schema, + statement_params=statement_params, ) - _create_single_registry_table( - session, database_name, schema_name, deployment_table_name, deployment_table_schema_string, statement_params - ) - - -def _create_single_registry_table( - session: snowpark.Session, - database_name: str, - schema_name: str, - table_name: str, - table_schema_string: str, - statement_params: Dict[str, Any], -) -> str: - """Creates a single table for registry and returns the fully qualified name of the table. - Args: - session: Session object to communicate with Snowflake. - database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the database. - table_name: Name of the target table. - table_schema_string: The SQL expression of the desired table schema. - statement_params: Function usage statement parameters used in sql query executions. - - Returns: - A string which is the name of the created table. 
- """ - fully_qualified_table_name = _get_fully_qualified_table_name(database_name, schema_name, table_name) - session.sql(f"CREATE TABLE IF NOT EXISTS {fully_qualified_table_name} ({table_schema_string})").collect( - statement_params=statement_params + _ml_artifact.create_ml_artifact_table( + session=session, + database_name=database_name, + schema_name=schema_name, + statement_params=statement_params, ) - return fully_qualified_table_name def _create_registry_views( @@ -280,6 +202,7 @@ def _create_registry_views( registry_table_name: str, metadata_table_name: str, deployment_table_name: str, + artifact_table_name: str, statement_params: Dict[str, Any], ) -> None: """Create views on underlying ModelRegistry tables. @@ -287,13 +210,14 @@ def _create_registry_views( Args: session: Session object to communicate with Snowflake. database_name: Desired name of the model registry database. - schema_name: Desired name of the schema used by this model registry inside the databse. + schema_name: Desired name of the schema used by this model registry inside the database. registry_table_name: Name for the main model registry table. metadata_table_name: Name for the metadata table used by the model registry. deployment_table_name: Name for the deployment event table. + artifact_table_name: Name for the artifact table. statement_params: Function usage statement parameters used in sql query executions. """ - fully_qualified_schema_name = _get_fully_qualified_schema_name(database_name, schema_name) + fully_qualified_schema_name = table_manager.get_fully_qualified_schema_name(database_name, schema_name) # From the documentation: Each DDL statement executes as a separate transaction. Races should not be an issue. # https://docs.snowflake.com/en/sql-reference/transactions.html#ddl @@ -365,6 +289,21 @@ def _create_registry_views( FROM {registry_table_name} {metadata_views_join}""" ).collect(statement_params=statement_params) + # Create artifact view. it joins artifact tables with registry table on model id. + artifact_view_name = identifier.concat_names([artifact_table_name, "_VIEW"]) + session.sql( + f"""CREATE OR REPLACE VIEW {fully_qualified_schema_name}.{artifact_view_name} COPY GRANTS AS + SELECT + {registry_table_name}.NAME AS MODEL_NAME, + {registry_table_name}.VERSION AS MODEL_VERSION, + {artifact_table_name}.* + FROM {registry_table_name} + LEFT JOIN {artifact_table_name} + ON ({registry_table_name}.TRAINING_DATASET_ID = {artifact_table_name}.ID) + WHERE {artifact_table_name}.TYPE = 'TRAINING_DATASET' + """ + ).collect(statement_params=statement_params) + def _create_active_permanent_deployment_view( session: snowpark.Session, @@ -417,6 +356,7 @@ def __init__( session: snowpark.Session, database_name: str = _DEFAULT_REGISTRY_NAME, schema_name: str = _DEFAULT_SCHEMA_NAME, + create_if_not_exists: bool = False, ) -> None: """ Opens an already-created registry. @@ -425,7 +365,11 @@ def __init__( session: Session object to communicate with Snowflake. database_name: Desired name of the model registry database. schema_name: Desired name of the schema used by this model registry inside the database. + create_if_not_exists: create model registry if it's not exists already. 
""" + if create_if_not_exists: + create_model_registry(session=session, database_name=database_name, schema_name=schema_name) + self._name = identifier.get_inferred_name(database_name) self._schema = identifier.get_inferred_name(schema_name) self._registry_table = identifier.get_inferred_name(_MODELS_TABLE_NAME) @@ -434,7 +378,8 @@ def __init__( self._deployment_table = identifier.get_inferred_name(_DEPLOYMENT_TABLE_NAME) self._permanent_deployment_view = identifier.concat_names([self._deployment_table, "_VIEW"]) self._permanent_deployment_stage = identifier.concat_names([self._deployment_table, "_STAGE"]) - + self._artifact_table = identifier.get_inferred_name(_ml_artifact._ARTIFACT_TABLE_NAME) + self._artifact_view = identifier.concat_names([self._artifact_table, "_VIEW"]) self._session = session # A in-memory deployment info cache to store information of temporary deployments @@ -458,40 +403,24 @@ def _check_access(self) -> None: query=f"SHOW SCHEMAS LIKE '{identifier.get_unescaped_names(self._schema)}' IN DATABASE {self._name}", ).has_dimensions(expected_rows=1).validate() - query_result_checker.SqlResultValidator( - self._session, - query=formatting.unwrap( - f""" - SHOW TABLES LIKE '{identifier.get_unescaped_names(self._registry_table)}' - IN {self._fully_qualified_schema_name()}""" - ), - ).has_dimensions(expected_rows=1).validate() - - self._validate_registry_table_schema(add_if_not_exists=set()) + table_manager.validate_table_exist( + self._session, identifier.get_unescaped_names(self._registry_table), self._fully_qualified_schema_name() + ) + self._validate_registry_table_schema(add_if_not_exists=["TRAINING_DATASET_ID"]) - query_result_checker.SqlResultValidator( - self._session, - query=formatting.unwrap( - f""" - SHOW TABLES LIKE '{identifier.get_unescaped_names(self._metadata_table)}' - IN {self._fully_qualified_schema_name()}""" - ), - ).has_dimensions(expected_rows=1).validate() + table_manager.validate_table_exist( + self._session, identifier.get_unescaped_names(self._metadata_table), self._fully_qualified_schema_name() + ) - query_result_checker.SqlResultValidator( - self._session, - query=formatting.unwrap( - f""" - SHOW TABLES LIKE '{identifier.get_unescaped_names(self._deployment_table)}' - IN {self._fully_qualified_schema_name()}""" - ), - ).has_dimensions(expected_rows=1).validate() + table_manager.validate_table_exist( + self._session, identifier.get_unescaped_names(self._deployment_table), self._fully_qualified_schema_name() + ) # TODO(zzhu): Also check validity of views. # TODO checks type as well. note type in _schema doesn't match with it appears in 'DESC TABLE'. # We need another layer of mapping. This function can also be extended to other tables as well. - def _validate_registry_table_schema(self, add_if_not_exists: Set[str]) -> None: + def _validate_registry_table_schema(self, add_if_not_exists: List[str]) -> None: """Validate the table schema to check for any missing columns. Args: @@ -501,15 +430,16 @@ def _validate_registry_table_schema(self, add_if_not_exists: Set[str]) -> None: TypeError: required column not exists in schema table and not defined in add_if_not_exists. 
""" + valid_cols = [t[0] for t in _schema._REGISTRY_TABLE_SCHEMA] for k in add_if_not_exists: - assert k in _schema._REGISTRY_TABLE_SCHEMA + assert k in valid_cols actual_table_rows = self._session.sql(f"DESC TABLE {self._fully_qualified_registry_table_name()}").collect() actual_schema_dict = {} for row in actual_table_rows: actual_schema_dict[row["name"]] = row["type"] - for col_name, col_type in _schema._REGISTRY_TABLE_SCHEMA.items(): + for col_name, col_type in _schema._REGISTRY_TABLE_SCHEMA: if col_name not in actual_schema_dict: if col_name not in add_if_not_exists: raise TypeError( @@ -538,50 +468,30 @@ def _get_new_unique_identifier(self) -> str: def _fully_qualified_registry_table_name(self) -> str: """Get the fully qualified name to the current registry table.""" - return _get_fully_qualified_table_name(self._name, self._schema, self._registry_table) + return table_manager.get_fully_qualified_table_name(self._name, self._schema, self._registry_table) def _fully_qualified_metadata_table_name(self) -> str: """Get the fully qualified name to the current metadata table.""" - return _get_fully_qualified_table_name(self._name, self._schema, self._metadata_table) + return table_manager.get_fully_qualified_table_name(self._name, self._schema, self._metadata_table) def _fully_qualified_deployment_table_name(self) -> str: """Get the fully qualified name to the current deployment table.""" - return _get_fully_qualified_table_name(self._name, self._schema, self._deployment_table) + return table_manager.get_fully_qualified_table_name(self._name, self._schema, self._deployment_table) def _fully_qualified_permanent_deployment_view_name(self) -> str: """Get the fully qualified name to the permanent deployment view.""" - return _get_fully_qualified_table_name(self._name, self._schema, self._permanent_deployment_view) + return table_manager.get_fully_qualified_table_name(self._name, self._schema, self._permanent_deployment_view) + + def _fully_qualified_artifact_view_name(self) -> str: + return table_manager.get_fully_qualified_table_name(self._name, self._schema, self._artifact_view) def _fully_qualified_schema_name(self) -> str: """Get the fully qualified name to the current registry schema.""" - return _get_fully_qualified_schema_name(self._name, self._schema) + return table_manager.get_fully_qualified_schema_name(self._name, self._schema) def _fully_qualified_deployment_name(self, deployment_name: str) -> str: """Get the fully qualified name to the given deployment.""" - return _get_fully_qualified_table_name(self._name, self._schema, deployment_name) - - def _insert_table_entry(self, *, table: str, columns: Dict[str, Any]) -> List[snowpark.Row]: - """Insert an entry into an internal Model Registry table. - - Args: - table: Name of the table to insert into. - columns: Key-value pairs of columns and values to be inserted into the table. - - Returns: - Result of the operation as returned by the Snowpark session (snowpark.DataFrame). 
- """ - sorted_columns = sorted(columns.items()) - - sql = "INSERT INTO {table} ( {columns} ) SELECT {values}".format( - table=table, - columns=",".join([x[0] for x in sorted_columns]), - values=",".join([formatting.format_value_for_select(x[1]) for x in sorted_columns]), - ) - return ( - query_result_checker.SqlResultValidator(self._session, sql) - .insertion_success(expected_num_rows=1) - .validate() - ) + return table_manager.get_fully_qualified_table_name(self._name, self._schema, deployment_name) def _insert_registry_entry( self, *, id: str, name: str, version: str, properties: Dict[str, Any] @@ -619,7 +529,9 @@ def _insert_registry_entry( # [CON] Code logic becomes messy depending on which fields are set. # [CON] Harder to re-use existing methods like set_model_name. # Context: https://docs.snowflake.com/en/sql-reference/sql/insert-multi-table.html - return self._insert_table_entry(table=self._fully_qualified_registry_table_name(), columns=properties) + return table_manager.insert_table_entry( + self._session, table=self._fully_qualified_registry_table_name(), columns=properties + ) def _insert_metadata_entry(self, *, id: str, attribute: str, value: Any, operation: str) -> List[snowpark.Row]: """Insert a new row into the model metadata table. @@ -648,7 +560,9 @@ def _insert_metadata_entry(self, *, id: str, attribute: str, value: Any, operati columns["ATTRIBUTE_NAME"] = attribute columns["VALUE"] = value - return self._insert_table_entry(table=self._fully_qualified_metadata_table_name(), columns=columns) + return table_manager.insert_table_entry( + self._session, table=self._fully_qualified_metadata_table_name(), columns=columns + ) def _insert_deployment_entry( self, @@ -696,7 +610,9 @@ def _insert_deployment_entry( columns["TARGET_METHOD"] = target_method columns["OPTIONS"] = options - return self._insert_table_entry(table=self._fully_qualified_deployment_table_name(), columns=columns) + return table_manager.insert_table_entry( + self._session, table=self._fully_qualified_deployment_table_name(), columns=columns + ) def _prepare_deployment_stage(self) -> str: """Create a stage in the model registry for storing all permanent deployments. @@ -716,7 +632,7 @@ def _prepare_deployment_stage(self) -> str: def _prepare_model_stage(self, model_id: str) -> str: """Create a stage in the model registry for storing the model with the given id. - Creating a permanent stage here since we do not have a way to swtich a stage from temporary to permanent. + Creating a permanent stage here since we do not have a way to switch a stage from temporary to permanent. This can result in orphaned stages in case the process fails. It might be better to try to create a temporary stage, attempt to perform all operations and convert the temp stage into permanent once the operation is complete. @@ -732,7 +648,7 @@ def _prepare_model_stage(self, model_id: str) -> str: """ schema = self._fully_qualified_schema_name() - # Uppercasing the model_stage_name to avoid having to quote the the stage name. + # Uppercase the model_stage_name to avoid having to quote the the stage name. stage_name = model_id.upper() model_stage_name = f"SNOWML_MODEL_{stage_name}" @@ -763,12 +679,11 @@ def _get_fully_qualified_stage_name_from_uri(self, model_uri: str) -> Optional[s The fully qualified Snowflake stage location encoded by the given URI. Returns None if the URI is not pointing to a Snowflake stage. 
""" - raw_stage_name = uri.get_snowflake_stage_path_from_uri(model_uri) - if not raw_stage_name: + raw_stage_path = uri.get_snowflake_stage_path_from_uri(model_uri) + if not raw_stage_path: return None - model_stage_name = raw_stage_name.split(".")[-1] - qualified_stage_path = f"{self._fully_qualified_schema_name()}.{model_stage_name}" - return qualified_stage_path + (db, schema, stage, _) = identifier.parse_schema_level_object_identifier(raw_stage_path) + return identifier.get_schema_level_object_identifier(db, schema, stage) def _list_selected_models( self, *, id: Optional[str] = None, model_name: Optional[str] = None, model_version: Optional[str] = None @@ -869,7 +784,7 @@ def _set_metadata_attribute( operation: str = _SET_METADATA_OPERATION, enable_model_presence_check: bool = True, ) -> None: - """Set the value of the given metadata attribute for targat model with given (model name + model version) or id. + """Set the value of the given metadata attribute for target model with given (model name + model version) or id. Args: attribute: Name of the attribute to set. @@ -978,64 +893,23 @@ def _log_model_path( self, model_name: str, model_version: str, - *, - path: str, - type: str, - description: Optional[str] = None, - tags: Optional[Dict[Any, Any]] = None, - ) -> str: - """Uploads and register a model to the Model Registry from a local file path. - - If `path` is a directory all files will be uploaded recursively, preserving the relative directory structure. - Symbolic links will be followed. - - NOTE: If any symlinks under `path` point to a parent directory, this can lead to infinite recursion. + ) -> Tuple[str, str]: + """Generate a path in the Model Registry to store a model. Args: model_name: The given name for the model. model_version: Version string to be set for the model. - path: Local file path to be uploaded. - type: Type of the model to be added. - description: A desription for the model. The description can be changed later. - tags: string-to-string dictonary of tag names and values to be set for the model. Returns: - String of the auto-generate unique model identifier. + String of the auto-generate unique model identifier and path to store it. """ - self._model_identifier_is_nonempty_or_raise(model_name, model_version) - id = self._get_new_unique_identifier() + model_id = self._get_new_unique_identifier() # Copy model from local disk to remote stage. # TODO(zhe): Check if we could use the same stage for multiple models. - fully_qualified_model_stage_name = self._prepare_model_stage(model_id=id) - - # Check if directory or file and adapt accordingly. - # TODO: Unify and explicit about compression for both file and directory. 
- if os.path.isfile(path): - self._session.file.put(path, posixpath.join(fully_qualified_model_stage_name, "data")) - elif os.path.isdir(path): - with file_utils.zip_file_or_directory_to_stream(path, path) as input_stream: - self._session._conn.upload_stream( - input_stream=input_stream, - stage_location=fully_qualified_model_stage_name, - dest_filename=f"{posixpath.basename(path)}.zip", - dest_prefix="", - source_compression="DEFLATE", - compress_data=False, - overwrite=True, - is_in_udf=True, - ) - self._register_model_with_id( - model_name=model_name, - model_version=model_version, - model_id=id, - type=type, - uri=uri.get_uri_from_snowflake_stage_path(fully_qualified_model_stage_name), - description=description, - tags=tags, - ) + fully_qualified_model_stage_name = self._prepare_model_stage(model_id=model_id) - return id + return model_id, fully_qualified_model_stage_name def _register_model_with_id( self, @@ -1049,6 +923,7 @@ output_spec: Optional[Dict[str, str]] = None, description: Optional[str] = None, tags: Optional[Dict[str, str]] = None, + training_dataset: Optional[training_dataset.TrainingDataset] = None, ) -> None: """Helper function to register model metadata. @@ -1065,9 +940,10 @@ expected column names and the values are the value types. output_spec: The expected output schema of the model. Dictionary where the keys are expected column names and the values are the value types. - description: A desription for the model. The description can be changed later. + description: A description for the model. The description can be changed later. tags: Key-value pairs of tags to be set for this model. Tags can be modified after model registration. + training_dataset: An object containing training dataset metadata. Raises: DataError: The given model already exists. @@ -1084,6 +960,20 @@ new_model["CREATION_TIME"] = formatting.SqlStr("CURRENT_TIMESTAMP()") new_model["CREATION_ROLE"] = self._session.get_current_role() new_model["CREATION_ENVIRONMENT_SPEC"] = {"python": ".".join(map(str, sys.version_info[:3]))} + if training_dataset is not None: + _ml_artifact.add_artifact( + session=self._session, + database_name=self._name, + schema_name=self._schema, + artifact_id=training_dataset.id(), + artifact_type=_ml_artifact.ArtifactType.TRAINING_DATASET, + artifact_name=training_dataset.id(), + artifact_version="", + artifact_spec=training_dataset.to_dict(), + ) + new_model["TRAINING_DATASET_ID"] = training_dataset.id() + else: + new_model["TRAINING_DATASET_ID"] = None existing_model_nums = self._list_selected_models(model_name=model_name, model_version=model_version).count() if existing_model_nums: @@ -1256,7 +1146,7 @@ def get_tags(self, model_name: str = None, model_version: str = None) -> Dict[st Returns: String-to-string dictionary containing all tags and values. The resulting dictionary can be empty. """ - # Snowpark snowpark.dataframes returns dictionary objects as strings. We need to convert it back to a dictionary + # Snowpark dataframes return dictionary objects as strings. We need to convert them back to a dictionary # here. result = self._get_metadata_attribute( _METADATA_ATTRIBUTE_TAGS, model_name=model_name, model_version=model_version @@ -1281,7 +1171,7 @@ def get_model_description(self, model_name: str, model_version: str) -> Optional model_version: Model Version string. Returns: - Descrption of the model or None. + Description of the model or None.
""" result = self._get_metadata_attribute( _METADATA_ATTRIBUTE_DESCRIPTION, model_name=model_name, model_version=model_version @@ -1482,7 +1372,7 @@ def get_metrics(self, model_name: str, model_version: str) -> Dict[str, object]: Returns: String-to-float dictionary containing all metrics and values. The resulting dictionary can be empty. """ - # Snowpark snowpark.dataframes returns dictionary objects as strings. We need to convert it back to a dictionary + # Snowpark snowpark.dataframe returns dictionary objects as strings. We need to convert it back to a dictionary # here. result = self._get_metadata_attribute( _METADATA_ATTRIBUTE_METRICS, model_name=model_name, model_version=model_version @@ -1512,9 +1402,10 @@ def log_model( pip_requirements: Optional[List[str]] = None, signatures: Optional[Dict[str, model_signature.ModelSignature]] = None, sample_input_data: Optional[Any] = None, + training_dataset: Optional[training_dataset.TrainingDataset] = None, code_paths: Optional[List[str]] = None, options: Optional[model_types.BaseModelSaveOption] = None, - ) -> str: + ) -> Optional["ModelReference"]: """Uploads and register a model to the Model Registry. Args: @@ -1522,8 +1413,8 @@ def log_model( model_version: Version string to be set for the model. The combination (name + version) must be unique for each model. model: Local model object in a supported format. - description: A desription for the model. The description can be changed later. - tags: string-to-string dictonary of tag names and values to be set for the model. + description: A description for the model. The description can be changed later. + tags: string-to-string dictionary of tag names and values to be set for the model. conda_dependencies: List of Conda package specs. Use "[channel::]package [operator version]" syntax to specify a dependency. It is a recommended way to specify your dependencies using conda. When channel is not specified, defaults channel will be used. When deploying to Snowflake Warehouse, defaults channel @@ -1531,73 +1422,81 @@ def log_model( pip_requirements: List of PIP package specs. Model will not be able to deploy to the warehouse if there is pip requirements. signatures: Signatures of the model, which is a mapping from target method name to signatures of input and - output, which could be inferred by calling `infer_signature` method with sample input data. + output, which could be inferred by calling `infer_signature` method with sample input data or training + dataset. sample_input_data: Sample of the input data for the model. + training_dataset: A training dataset metadata object. code_paths: Directory of code to import when loading and deploying the model. options: Additional options when saving the model. Raises: - TypeError: Raised when both signatures and sample_input_data is not presented. Will be captured locally. DataError: Raised when the given model exists. + ValueError: Raised in following cases: + 1) both sample_input_data and training_dataset are provided; + 2) signatures and sample_input_data/training_dataset are both not provided and + model is not a snowflake estimator. + Exception: Raised when there is any error raised when saving the model. Returns: - String of the auto-generate unique model identifier. None if failed. + Model Reference . None if failed. """ # Ideally, the whole operation should be a single transaction. Currently, transactions do not support stage # operations. 
self._model_identifier_is_nonempty_or_raise(model_name, model_version) + if sample_input_data is not None and training_dataset is not None: + raise ValueError("Only one of sample_input_data and training_dataset should be provided.") + + if training_dataset is not None: + sample_input_data = training_dataset.df + if training_dataset.timestamp_col is not None: + sample_input_data = sample_input_data.drop(training_dataset.timestamp_col) + if training_dataset.label_cols is not None: + sample_input_data = sample_input_data.drop(training_dataset.label_cols) + existing_model_nums = self._list_selected_models(model_name=model_name, model_version=model_version).count() if existing_model_nums: raise connector.DataError(f"Model {model_name}/{model_version} already exists. Unable to log the model.") - with tempfile.TemporaryDirectory() as tmpdir: - model = cast(model_types.SupportedModelType, model) - if signatures: - model_metadata = model_api.save_model( - name=model_name, - model_dir_path=tmpdir, - model=model, - signatures=signatures, - metadata=tags, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - code_paths=code_paths, - options=options, - ) - elif sample_input_data is not None: - model_metadata = model_api.save_model( - name=model_name, - model_dir_path=tmpdir, - model=model, - metadata=tags, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - sample_input=sample_input_data, - code_paths=code_paths, - options=options, - ) - elif isinstance(model, base.BaseEstimator): - model_metadata = model_api.save_model( - name=model_name, - model_dir_path=tmpdir, - model=model, - metadata=tags, - conda_dependencies=conda_dependencies, - pip_requirements=pip_requirements, - code_paths=code_paths, - options=options, - ) - else: - raise TypeError("Either signature or sample input data should exist for native model packaging.") - return self._log_model_path( - model_name=model_name, - model_version=model_version, - path=tmpdir, - type=model_metadata.model_type, - description=description, - tags=tags, # TODO: Inherent model type enum. + model_id, fully_qualified_model_stage_name = self._log_model_path( + model_name=model_name, + model_version=model_version, + ) + model_stage_file_path = posixpath.join(f"{_STAGE_PREFIX}{fully_qualified_model_stage_name}", f"{model_id}.zip") + model = cast(model_types.SupportedModelType, model) + try: + model_metadata = model_api.save_model( # type: ignore[call-overload, misc] + name=model_name, + session=self._session, + model_stage_file_path=model_stage_file_path, + model=model, + signatures=signatures, + metadata=tags, + conda_dependencies=conda_dependencies, + pip_requirements=pip_requirements, + sample_input=sample_input_data, + code_paths=code_paths, + options=options, ) + except Exception: + # When model saving fails, clean up the model stage. 
+ query_result_checker.SqlResultValidator( + self._session, f"DROP STAGE {fully_qualified_model_stage_name}" + ).has_dimensions(expected_rows=1, expected_cols=1).validate() + raise + + self._register_model_with_id( + model_name=model_name, + model_version=model_version, + model_id=model_id, + type=model_metadata.model_type, + uri=uri.get_uri_from_snowflake_stage_path(model_stage_file_path), + description=description, + tags=tags, + training_dataset=training_dataset, + ) + + return ModelReference(registry=self, model_name=model_name, model_version=model_version) @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, @@ -1616,15 +1515,8 @@ def load_model(self, model_name: str, model_version: str) -> Any: """ remote_model_path = self._get_model_path(model_name=model_name, model_version=model_version) restored_model = None - with tempfile.TemporaryDirectory() as local_model_directory: - self._session.file.get(remote_model_path, local_model_directory) - local_path = os.path.join(local_model_directory, posixpath.basename(remote_model_path)) - if zipfile.is_zipfile(local_path): - extracted_dir = os.path.join(local_model_directory, "extracted") - with zipfile.ZipFile(local_path, "r") as myzip: - if len(myzip.namelist()) > 1: - myzip.extractall(extracted_dir) - restored_model, _ = model_api.load_model(model_dir_path=extracted_dir) + + restored_model, _ = model_api.load_model(session=self._session, model_stage_file_path=remote_model_path) return restored_model @@ -1641,7 +1533,7 @@ def deploy( model_version: str, *, deployment_name: str, - target_method: str, + target_method: Optional[str] = None, permanent: bool = False, platform: deploy_platforms.TargetPlatform = deploy_platforms.TargetPlatform.WAREHOUSE, options: Optional[ @@ -1654,7 +1546,7 @@ model_name: Model Name string. model_version: Model Version string. deployment_name: name of the generated UDF. - target_method: The method name to use in deployment. + target_method: The method name to use in deployment. Can be omitted if the model has only 1 target method. permanent: Whether the deployment is permanent or not. Permanent deployment will generate a permanent UDF. (Only applicable for Warehouse deployment) platform: Target platform to deploy the model to. Currently supported platforms are @@ -1730,7 +1622,7 @@ platform=deployment_info["platform"].value, stage_path=deployment_stage_path, signature=deployment_info["signature"].to_dict(), - target_method=target_method, + target_method=deployment_info["target_method"], options=options, ) @@ -1761,7 +1653,7 @@ def list_deployments(self, model_name: str, model_version: str) -> snowpark.Data model_version: Model Version string. Returns: - A snowpark dataframe that contains all deployments that assoicated with the given model. + A snowpark dataframe that contains all deployments associated with the given model. """ deployments_df = ( self._session.sql(f"SELECT * FROM {self._fully_qualified_permanent_deployment_view_name()}") @@ -1782,6 +1674,24 @@ def list_deployments(self, model_name: str, model_version: str) -> snowpark.Data ) return cast(snowpark.DataFrame, res) + @snowpark._internal.utils.private_preview(version="1.0.1") + def list_artifacts(self, model_name: str, model_version: str) -> snowpark.DataFrame: + """List all artifacts associated with the given model. + + Args: + model_name: Model Name string. + model_version: Model Version string. + + Returns: + A snowpark dataframe that contains all artifacts associated with the given model.
+ """ + artifacts = ( + self._session.sql(f"SELECT * FROM {self._fully_qualified_artifact_view_name()}") + .filter(snowpark.Column("MODEL_NAME") == model_name) + .filter(snowpark.Column("MODEL_VERSION") == model_version) + ) + return cast(snowpark.DataFrame, artifacts) + @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, @@ -1798,7 +1708,7 @@ def get_deployment(self, model_name: str, model_version: str, *, deployment_name deployment_name: Deployment name string. Returns: - A snowpark dataframe that contains the information of thetarget deployment. + A snowpark dataframe that contains the information of the target deployment. Raises: KeyError: Raised if the target deployment is not found. @@ -1812,6 +1722,33 @@ def get_deployment(self, model_name: str, model_version: str, *, deployment_name ) return cast(snowpark.DataFrame, deployment) + @telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, + ) + @snowpark._internal.utils.private_preview(version="1.0.1") + def get_training_dataset(self, model_name: str, model_version: str) -> Optional[training_dataset.TrainingDataset]: + """Get training dataset of the model with the given (model name + model version). + + Args: + model_name: Model Name string. + model_version: Model Version string. + + Returns: + Training dataset of the model or none if not found. + """ + artifacts = ( + self.list_artifacts(model_name, model_version) + .filter(snowpark.Column("TYPE") == _ml_artifact.ArtifactType.TRAINING_DATASET.value) + .collect() + ) + + return ( + training_dataset.TrainingDataset.from_json(artifacts[0]["ARTIFACT_SPEC"], self._session) + if len(artifacts) != 0 + else None + ) + @telemetry.send_api_usage_telemetry( project=_TELEMETRY_PROJECT, subproject=_TELEMETRY_SUBPROJECT, @@ -1821,7 +1758,7 @@ def delete_deployment(self, model_name: str, model_version: str, *, deployment_n """Delete the target permanent deployment of the given model. Deleting temporary deployment are currently not supported. - Temporart deployment will get cleaned automatically when the current session closed. + Temporary deployment will get cleaned automatically when the current session closed. Args: model_name: Model Name string. 
@@ -2090,6 +2027,7 @@ def predict(self, deployment_name: str, data: Any) -> "pd.DataFrame": self._model_name, self._model_version, deployment_name=deployment_name ).collect()[0] platform = deploy_platforms.TargetPlatform(deployment["TARGET_PLATFORM"]) + target_method = deployment["TARGET_METHOD"] signature = model_signature.ModelSignature.from_dict(json.loads(deployment["SIGNATURE"])) options_dict = cast(Dict[str, Any], json.loads(deployment["OPTIONS"])) platform_options = { @@ -2105,9 +2043,81 @@ di = _deployer.Deployment( name=self._registry._fully_qualified_deployment_name(deployment_name), platform=platform, + target_method=target_method, signature=signature, options=options, ) return _deployer.predict(session=self._registry._session, deployment=di, X=data) except KeyError: raise ValueError(f"The deployment with name {deployment_name} hasn't been deployed") + + +@telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, +) +@snowpark._internal.utils.private_preview(version="0.2.0") +def create_model_registry( + *, + session: snowpark.Session, + database_name: str = _DEFAULT_REGISTRY_NAME, + schema_name: str = _DEFAULT_SCHEMA_NAME, +) -> bool: + """Set up a new model registry. This should be run once per model registry by an administrator role. + + Args: + session: Session object to communicate with Snowflake. + database_name: Desired name of the model registry database. + schema_name: Desired name of the schema used by this model registry inside the database. + + Returns: + True if the creation of the model registry internal data structures was successful, + False otherwise. + """ + # Get the db & schema of the current session + old_db = session.get_current_database() + old_schema = session.get_current_schema() + + # These might be exposed as parameters in the future.
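Since `create_model_registry` is now also reachable from the constructor through `create_if_not_exists` (see the `__init__` change earlier), first-time setup can collapse into a single call. A hedged sketch, assuming an existing Snowpark `session`:

from snowflake.ml.registry import model_registry

# Bootstraps the database, schema, tables, and views on first use, then
# simply opens the existing registry on subsequent runs.
registry = model_registry.ModelRegistry(
    session=session,
    database_name="MY_REGISTRY_DB",
    schema_name="MY_REGISTRY_SCHEMA",
    create_if_not_exists=True,
)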
+ database_name = identifier.get_inferred_name(database_name) + schema_name = identifier.get_inferred_name(schema_name) + registry_table_name = identifier.get_inferred_name(_MODELS_TABLE_NAME) + metadata_table_name = identifier.get_inferred_name(_METADATA_TABLE_NAME) + deployment_table_name = identifier.get_inferred_name(_DEPLOYMENT_TABLE_NAME) + artifact_table_name = identifier.get_inferred_name(_ml_artifact._ARTIFACT_TABLE_NAME) + + statement_params = telemetry.get_function_usage_statement_params( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, + function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), ""), + ) + try: + _create_registry_database(session, database_name, statement_params) + _create_registry_schema(session, database_name, schema_name, statement_params) + _create_registry_tables( + session, + database_name, + schema_name, + registry_table_name, + metadata_table_name, + deployment_table_name, + artifact_table_name, + statement_params, + ) + _create_registry_views( + session, + database_name, + schema_name, + registry_table_name, + metadata_table_name, + deployment_table_name, + artifact_table_name, + statement_params, + ) + finally: + # Restore the db & schema to the original ones + if old_db is not None: + session.use_database(old_db) + if old_schema is not None: + session.use_schema(old_schema) + return True diff --git a/snowflake/ml/registry/model_registry_test.py b/snowflake/ml/registry/model_registry_test.py index 50758cee..ca38fd2f 100644 --- a/snowflake/ml/registry/model_registry_test.py +++ b/snowflake/ml/registry/model_registry_test.py @@ -1,24 +1,28 @@ import datetime import itertools import json +import posixpath from typing import Any, Dict, List, Union, cast from _schema import ( + _ARTIFACT_TABLE_SCHEMA, _DEPLOYMENTS_TABLE_SCHEMA, _METADATA_TABLE_SCHEMA, _REGISTRY_TABLE_SCHEMA, ) from absl.testing import absltest -from snowflake import snowpark +from snowflake import connector, snowpark from snowflake.ml._internal import telemetry -from snowflake.ml._internal.utils import formatting, identifier +from snowflake.ml._internal.utils import formatting, identifier, uri +from snowflake.ml.model import _model from snowflake.ml.registry import model_registry from snowflake.ml.test_utils import mock_data_frame, mock_session _DATABASE_NAME = identifier.get_inferred_name("_SYSTEM_MODEL_REGISTRY") _SCHEMA_NAME = identifier.get_inferred_name("_SYSTEM_MODEL_REGISTRY_SCHEMA") _REGISTRY_TABLE_NAME = identifier.get_inferred_name("_SYSTEM_REGISTRY_MODELS") +_ARTIFACTS_TABLE_NAME = identifier.get_inferred_name("_SYSTEM_REGISTRY_ARTIFACTS") _METADATA_TABLE_NAME = identifier.get_inferred_name("_SYSTEM_REGISTRY_METADATA") _DEPLOYMENTS_TABLE_NAME = identifier.get_inferred_name("_SYSTEM_REGISTRY_DEPLOYMENTS") _FULLY_QUALIFIED_REGISTRY_TABLE_NAME = ".".join( @@ -28,19 +32,18 @@ _REGISTRY_TABLE_NAME, ] ) -_REGISTRY_SCHEMA_STRING = ", ".join([f"{k} {v}" for k, v in _REGISTRY_TABLE_SCHEMA.items()]) -_METADATA_INSERT_COLUMNS_STRING = ",".join(filter(lambda x: x != "SEQUENCE_ID", _METADATA_TABLE_SCHEMA.keys())) +_REGISTRY_SCHEMA_STRING = ", ".join([f"{k} {v}" for k, v in _REGISTRY_TABLE_SCHEMA]) +_METADATA_INSERT_COLUMNS_STRING = ",".join( + filter(lambda x: x != "SEQUENCE_ID", [item[0] for item in _METADATA_TABLE_SCHEMA]) +) _METADATA_SCHEMA_STRING = ", ".join( - [ - f"{k} {v.format(registry_table_name=_FULLY_QUALIFIED_REGISTRY_TABLE_NAME)}" - for k, v in _METADATA_TABLE_SCHEMA.items() - ] + [f"{k} 
{v.format(registry_table_name=_FULLY_QUALIFIED_REGISTRY_TABLE_NAME)}" for k, v in _METADATA_TABLE_SCHEMA] ) _DEPLOYMENTS_SCHEMA_STRING = ",".join( - [ - f"{k} {v.format(registry_table_name=_FULLY_QUALIFIED_REGISTRY_TABLE_NAME)}" - for k, v in _DEPLOYMENTS_TABLE_SCHEMA.items() - ] + [f"{k} {v.format(registry_table_name=_FULLY_QUALIFIED_REGISTRY_TABLE_NAME)}" for k, v in _DEPLOYMENTS_TABLE_SCHEMA] +) +_ARTIFACTS_SCHEMA_STRING = ",".join( + [f"{k} {v.format(registry_table_name=_FULLY_QUALIFIED_REGISTRY_TABLE_NAME)}" for k, v in _ARTIFACT_TABLE_SCHEMA] ) @@ -70,6 +73,140 @@ def _setup_mock_session(self) -> None: self._session.use_database = absltest.mock.MagicMock() self._session.use_schema = absltest.mock.MagicMock() + def _mock_show_database_exists(self) -> None: + self.add_session_mock_sql( + query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", + result=mock_data_frame.MockDataFrame(self.get_show_databases_success(name=_DATABASE_NAME)), + ) + + def _mock_show_database_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", + result=mock_data_frame.MockDataFrame([]).add_collect_result([], statement_params=statement_params), + ) + + def _mock_create_database_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"CREATE DATABASE IF NOT EXISTS {_DATABASE_NAME}", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status="MODEL_REGISTRY already exists, statement succeeded.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_database_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"CREATE DATABASE {_DATABASE_NAME}", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status="Database MODEL_REGISTRY successfully created.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_show_schema_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"SHOW SCHEMAS LIKE '{_SCHEMA_NAME}' IN DATABASE {_DATABASE_NAME}", + result=mock_data_frame.MockDataFrame(self.get_show_schemas_success(name=_SCHEMA_NAME)).add_collect_result( + self.get_show_schemas_success(name=_SCHEMA_NAME), + statement_params=statement_params, + ), + ) + + def _mock_show_schema_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"SHOW SCHEMAS LIKE '{_SCHEMA_NAME}' IN DATABASE {_DATABASE_NAME}", + result=mock_data_frame.MockDataFrame([]).add_collect_result([], statement_params=statement_params), + ) + + def _mock_create_schema_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"CREATE SCHEMA {_DATABASE_NAME}.{_SCHEMA_NAME}", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"SCHEMA {_SCHEMA_NAME} successfully created.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_registry_table_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_REGISTRY_TABLE_NAME} + ({_REGISTRY_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"{_REGISTRY_TABLE_NAME} already exists, statement succeeded.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_artifacts_table_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS 
{_DATABASE_NAME}.{_SCHEMA_NAME}.{_ARTIFACTS_TABLE_NAME} + ({_ARTIFACTS_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"{_ARTIFACTS_TABLE_NAME} already exists, statement succeeded.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_registry_table_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_REGISTRY_TABLE_NAME} + ({_REGISTRY_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"Table {_REGISTRY_TABLE_NAME} successfully created.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_artifacts_table_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_ARTIFACTS_TABLE_NAME} + ({_ARTIFACTS_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"Table {_ARTIFACTS_TABLE_NAME} successfully created.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_metadata_table_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_METADATA_TABLE_NAME} + ({_METADATA_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"{_METADATA_TABLE_NAME} already exists, statement succeeded.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_metadata_table_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_METADATA_TABLE_NAME} + ({_METADATA_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"Table {_METADATA_TABLE_NAME} successfully created.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_deployment_table_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_DEPLOYMENTS_TABLE_NAME} + ({_DEPLOYMENTS_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"{_DEPLOYMENTS_TABLE_NAME} already exists, statement succeeded.")], + collect_statement_params=statement_params, + ), + ) + + def _mock_create_deployment_table_not_exists(self, statement_params: Dict[str, str]) -> None: + self.add_session_mock_sql( + query=f"""CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_DEPLOYMENTS_TABLE_NAME} + ({_DEPLOYMENTS_SCHEMA_STRING})""", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"Table {_DEPLOYMENTS_TABLE_NAME} successfully created.")], + collect_statement_params=statement_params, + ), + ) + def add_session_mock_sql(self, query: str, result: Any) -> None: self._session.add_mock_sql(query=query, result=result) @@ -151,6 +288,7 @@ def get_desc_registry_table_success(self) -> List[snowpark.Row]: snowpark.Row(name="NAME", type="VARCHAR"), snowpark.Row(name="OUTPUT_SPEC", type="OBJECT"), snowpark.Row(name="RUNTIME_ENVIRONMENT_SPEC", type="OBJECT"), + snowpark.Row(name="TRAINING_DATASET_ID", type="VARCHAR"), snowpark.Row(name="TYPE", type="VARCHAR"), snowpark.Row(name="URI", type="VARCHAR"), snowpark.Row(name="VERSION", type="VARCHAR"), @@ -313,6 +451,52 @@ def setup_create_views_call(self) -> None: [snowpark.Row(status=f"View {_REGISTRY_TABLE_NAME}_VIEW 
successfully created.")] ), ) + self.add_session_mock_sql( # type: ignore + query=( + f"""CREATE OR REPLACE VIEW {_DATABASE_NAME}.{_SCHEMA_NAME}.{_ARTIFACTS_TABLE_NAME}_VIEW + COPY GRANTS AS + SELECT + {_REGISTRY_TABLE_NAME}.NAME AS MODEL_NAME, + {_REGISTRY_TABLE_NAME}.VERSION AS MODEL_VERSION, + {_ARTIFACTS_TABLE_NAME}.* + FROM {_REGISTRY_TABLE_NAME} + LEFT JOIN {_ARTIFACTS_TABLE_NAME} + ON ({_REGISTRY_TABLE_NAME}.TRAINING_DATASET_ID = {_ARTIFACTS_TABLE_NAME}.ID) + WHERE {_ARTIFACTS_TABLE_NAME}.TYPE = 'TRAINING_DATASET' + """ + ), + result=mock_data_frame.MockDataFrame( + [snowpark.Row(status=f"View {_ARTIFACTS_TABLE_NAME}_VIEW successfully created.")] + ), + ), + + def setup_open_existing(self) -> None: + self.add_session_mock_sql( + query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", + result=mock_data_frame.MockDataFrame(self.get_show_databases_success(name=_DATABASE_NAME)), + ) + self.add_session_mock_sql( + query=f"SHOW SCHEMAS LIKE '{_SCHEMA_NAME}' IN DATABASE {_DATABASE_NAME}", + result=mock_data_frame.MockDataFrame(self.get_show_schemas_success(name=_SCHEMA_NAME)), + ) + self.add_session_mock_sql( + query=f"SHOW TABLES LIKE '{_REGISTRY_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", + result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_REGISTRY_TABLE_NAME)), + ) + self.add_session_mock_sql( + query=f"DESC TABLE {_FULLY_QUALIFIED_REGISTRY_TABLE_NAME}", + result=mock_data_frame.MockDataFrame(self.get_desc_registry_table_success()).add_collect_result( + self.get_desc_registry_table_success() + ), + ) + self.add_session_mock_sql( + query=f"SHOW TABLES LIKE '{_METADATA_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", + result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_METADATA_TABLE_NAME)), + ) + self.add_session_mock_sql( + query=f"SHOW TABLES LIKE '{_DEPLOYMENTS_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", + result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_DEPLOYMENTS_TABLE_NAME)), + ) def template_test_get_attribute( self, collection_res: List[snowpark.Row], use_id: bool = False @@ -336,7 +520,14 @@ def template_test_set_attribute( if not use_id: expected_df.add_operation("filter") expected_df.add_collect_result( - [snowpark.Row(ID=self.model_id, NAME="name", VERSION="abc", URI="sfc://model_stage")] + [ + snowpark.Row( + ID=self.model_id, + NAME="name", + VERSION="abc", + URI=f"sfc://{_DATABASE_NAME}.{_SCHEMA_NAME}.model_stage", + ) + ] ) self._session.add_operation("get_current_role", result="current_role") @@ -355,21 +546,23 @@ def template_test_set_attribute( def test_create_new(self) -> None: """Verify that we can create a new ModelRegistry database with the default names.""" # "Create" calls. 
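The refactor below keeps the test exhaustive while swapping the inline mocks for the `_mock_*` helpers defined above; the enumeration pattern itself is compact enough to sketch (illustrative only):

import itertools

# One boolean per object the registry bootstraps: database, schema, and the
# registry/metadata/deployments/artifacts tables. Each combination becomes a
# subTest that registers either the "already exists" or the "successfully
# created" mock result for every statement.
existence_flags = list(itertools.product([True, False], repeat=6))
assert len(existence_flags) == 64  # one subTest per combination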
- combinations = list(itertools.product([True, False], repeat=5)) + combinations = list(itertools.product([True, False], repeat=6)) for ( database_exists, schema_exists, registry_table_exists, metadata_table_exists, deployments_table_exists, + artifacts_table_exists, ) in combinations: with self.subTest( msg=( f"database_exists={database_exists}, " f"schema_exists={schema_exists}, " f"registry_table_exists={registry_table_exists}, " - f"metadata_table_exists={metadata_table_exists}" - f"deployments_table_exists={deployments_table_exists}" + f"metadata_table_exists={metadata_table_exists}, " + f"deployments_table_exists={deployments_table_exists}, " + f"artifacts_table_exists={artifacts_table_exists}" ) ): statement_params = telemetry.get_function_usage_statement_params( @@ -378,150 +571,76 @@ def test_create_new(self) -> None: function_name="snowflake.ml.registry.model_registry.create_model_registry", ) if database_exists: - self.add_session_mock_sql( - query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", - result=mock_data_frame.MockDataFrame(self.get_show_databases_success(name=_DATABASE_NAME)), - ) + self._mock_show_database_exists() else: - self.add_session_mock_sql( - query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", - result=mock_data_frame.MockDataFrame([]).add_collect_result( - [], statement_params=statement_params - ), - ) - self.add_session_mock_sql( - query=f"CREATE DATABASE {_DATABASE_NAME}", - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status="Database MODEL_REGISTRY successfully created.")], - collect_statement_params=statement_params, - ), - ) + self._mock_show_database_not_exists(statement_params) + self._mock_create_database_not_exists(statement_params) + if schema_exists: - self.add_session_mock_sql( - query=f"SHOW SCHEMAS LIKE '{_SCHEMA_NAME}' IN DATABASE {_DATABASE_NAME}", - result=mock_data_frame.MockDataFrame( - self.get_show_schemas_success(name=_SCHEMA_NAME) - ).add_collect_result( - self.get_show_schemas_success(name=_SCHEMA_NAME), - statement_params=statement_params, - ), - ) + self._mock_show_schema_exists(statement_params) else: - self.add_session_mock_sql( - query=f"SHOW SCHEMAS LIKE '{_SCHEMA_NAME}' IN DATABASE {_DATABASE_NAME}", - result=mock_data_frame.MockDataFrame([]).add_collect_result( - [], statement_params=statement_params - ), - ) - self.add_session_mock_sql( - query=f"CREATE SCHEMA {_DATABASE_NAME}.{_SCHEMA_NAME}", - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"SCHEMA {_SCHEMA_NAME} successfully created.")], - collect_statement_params=statement_params, - ), - ) + self._mock_show_schema_not_exists(statement_params) + self._mock_create_schema_not_exists(statement_params) + if registry_table_exists: - self.add_session_mock_sql( - query=f""" - CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_REGISTRY_TABLE_NAME} - ({_REGISTRY_SCHEMA_STRING}) - """, - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"{_REGISTRY_TABLE_NAME} already exists, statement succeeded.")], - collect_statement_params=statement_params, - ), - ) + self._mock_create_registry_table_exists(statement_params) else: - self.add_session_mock_sql( - query=f""" - CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_REGISTRY_TABLE_NAME} - ({_REGISTRY_SCHEMA_STRING}) - """, - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"Table {_REGISTRY_TABLE_NAME} successfully created.")], - collect_statement_params=statement_params, - ), - ) + self._mock_create_registry_table_not_exists(statement_params) + if metadata_table_exists: - 
self.add_session_mock_sql( - query=f""" - CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_METADATA_TABLE_NAME} - ({_METADATA_SCHEMA_STRING}) - """, - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"{_METADATA_TABLE_NAME} already exists, statement succeeded.")], - collect_statement_params=statement_params, - ), - ) + self._mock_create_metadata_table_exists(statement_params) else: - self.add_session_mock_sql( - query=f""" - CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_METADATA_TABLE_NAME} - ({_METADATA_SCHEMA_STRING}) - """, - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"Table {_METADATA_TABLE_NAME} successfully created.")], - collect_statement_params=statement_params, - ), - ) + self._mock_create_metadata_table_not_exists(statement_params) + if deployments_table_exists: - self.add_session_mock_sql( - query=f""" - CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_DEPLOYMENTS_TABLE_NAME} - ({_DEPLOYMENTS_SCHEMA_STRING}) - """, - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"{_DEPLOYMENTS_TABLE_NAME} already exists, statement succeeded.")], - collect_statement_params=statement_params, - ), - ) + self._mock_create_deployment_table_exists(statement_params) + else: + self._mock_create_deployment_table_not_exists(statement_params) + + if artifacts_table_exists: + self._mock_create_artifacts_table_exists(statement_params) else: - self.add_session_mock_sql( - query=f""" - CREATE TABLE IF NOT EXISTS {_DATABASE_NAME}.{_SCHEMA_NAME}.{_DEPLOYMENTS_TABLE_NAME} - ({_DEPLOYMENTS_SCHEMA_STRING}) - """, - result=mock_data_frame.MockDataFrame( - [snowpark.Row(status=f"Table {_DEPLOYMENTS_TABLE_NAME} successfully created.")], - collect_statement_params=statement_params, - ), - ) + self._mock_create_artifacts_table_not_exists(statement_params) self.setup_create_views_call() + model_registry.create_model_registry( session=cast(snowpark.Session, self._session), database_name=_DATABASE_NAME, schema_name=_SCHEMA_NAME, ) - def test_open_existing(self) -> None: - """Verify that we can open an existing ModelRegistry database with the default names.""" - self.add_session_mock_sql( - query=f"SHOW DATABASES LIKE '{_DATABASE_NAME}'", - result=mock_data_frame.MockDataFrame(self.get_show_databases_success(name=_DATABASE_NAME)), + def test_create_if_not_exists(self) -> None: + statement_params = telemetry.get_function_usage_statement_params( + project="MLOps", + subproject="ModelRegistry", + function_name="snowflake.ml.registry.model_registry.create_model_registry", ) - self.add_session_mock_sql( - query=f"SHOW SCHEMAS LIKE '{_SCHEMA_NAME}' IN DATABASE {_DATABASE_NAME}", - result=mock_data_frame.MockDataFrame(self.get_show_schemas_success(name=_SCHEMA_NAME)), - ) - self.add_session_mock_sql( - query=f"SHOW TABLES LIKE '{_REGISTRY_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", - result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_REGISTRY_TABLE_NAME)), - ) - self.add_session_mock_sql( - query=f"DESC TABLE {_FULLY_QUALIFIED_REGISTRY_TABLE_NAME}", - result=mock_data_frame.MockDataFrame(self.get_desc_registry_table_success()).add_collect_result( - self.get_desc_registry_table_success() - ), - ) - self.add_session_mock_sql( - query=f"SHOW TABLES LIKE '{_METADATA_TABLE_NAME}' IN {_DATABASE_NAME}.{_SCHEMA_NAME}", - result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_METADATA_TABLE_NAME)), - ) - self.add_session_mock_sql( - query=f"SHOW TABLES LIKE '{_DEPLOYMENTS_TABLE_NAME}' IN 
{_DATABASE_NAME}.{_SCHEMA_NAME}", - result=mock_data_frame.MockDataFrame(self.get_show_tables_success(name=_DEPLOYMENTS_TABLE_NAME)), + # 1. SQL queries issued by create_model_registry. + self._mock_show_database_not_exists(statement_params) + self._mock_create_database_not_exists(statement_params) + self._mock_show_schema_not_exists(statement_params) + self._mock_create_schema_not_exists(statement_params) + self._mock_create_registry_table_not_exists(statement_params) + self._mock_create_metadata_table_not_exists(statement_params) + self._mock_create_deployment_table_not_exists(statement_params) + self._mock_create_artifacts_table_not_exists(statement_params) + self.setup_create_views_call() + + # 2. SQL queries issued by ModelRegistry constructor. + self.setup_open_existing() + + registry = model_registry.ModelRegistry( + session=cast(snowpark.Session, self._session), + database_name=_DATABASE_NAME, + schema_name=_SCHEMA_NAME, + create_if_not_exists=True, ) + self.assertIsNotNone(registry) + + def test_open_existing(self) -> None: + """Verify that we can open an existing ModelRegistry database with the default names.""" + self.setup_open_existing() model_registry.ModelRegistry(session=cast(snowpark.Session, self._session)) def test_list_models(self) -> None: @@ -783,8 +902,8 @@ def test_get_tags(self) -> None: self.assertEqual(tags["top_level"], "string") self.assertEqual(tags["nested"]["float"], 0.9) - def test_log_model_path_file(self) -> None: - """Test _log_model_path() when the model is a file. + def test_log_model_path(self) -> None: + """Test _log_model_path(). Validate that _log_model_path() computes the expected stage path and returns it together with the newly generated model id. @@ -803,48 +922,143 @@ ), ) - # Mock the snowpark.session.file operation - mock_sp_file_operation = absltest.mock.Mock() - self._session.__setattr__("file", mock_sp_file_operation) + expected_stage_path = ( + f"{identifier.get_inferred_name(_DATABASE_NAME)}" + + "." + + f"{identifier.get_inferred_name(_SCHEMA_NAME)}" + + "." + + f"SNOWML_MODEL_{expected_stage_postfix}" + ) + + with absltest.mock.patch.object( + model_registry, + "_get_new_unique_identifier", + return_value=self.model_id, + ): + model_id, stage_path = model_registry._log_model_path( + model_name=model_name, + model_version=model_version, + ) + self.assertEqual(model_id, self.model_id) + self.assertEqual(stage_path, expected_stage_path) + + def test_log_model(self) -> None: + """Test log_model()""" + model_registry = self.get_model_registry() + + model_name = "name" + model_version = "abc" + expected_stage_postfix = f"{self.model_id}".upper() expected_stage_path = ( f"{identifier.get_inferred_name(_DATABASE_NAME)}" + "." + f"{identifier.get_inferred_name(_SCHEMA_NAME)}" + "." 
- + f"SNOWML_MODEL_{expected_stage_postfix}/data" + + f"SNOWML_MODEL_{expected_stage_postfix}" ) + model_path = posixpath.join(f"@{expected_stage_path}", f"{self.model_id}.zip") - with absltest.mock.patch("model_registry.os.path.isfile", return_value=True) as mock_isfile: + with absltest.mock.patch.object( + model_registry, + "_list_selected_models", + return_value=absltest.mock.MagicMock(count=absltest.mock.MagicMock(return_value=0)), + ): with absltest.mock.patch.object( model_registry, - "_get_new_unique_identifier", - return_value=self.model_id, - ): + "_log_model_path", + return_value=(self.model_id, expected_stage_path), + ) as mock_path: + mock_model = absltest.mock.MagicMock() + mock_type = absltest.mock.MagicMock() + mock_metadata = absltest.mock.MagicMock(model_type=mock_type) with absltest.mock.patch.object( - model_registry, - "_register_model_with_id", - return_value=self.model_id, - ): - model_registry._log_model_path( - path="path", - type="type", - model_name=model_name, - model_version=model_version, - description="description", - ) - mock_isfile.assert_called_once_with("path") - mock_sp_file_operation.put.assert_called_with("path", expected_stage_path) - assert isinstance(model_registry._register_model_with_id, absltest.mock.Mock) - model_registry._register_model_with_id.assert_called_with( - model_name=model_name, - model_version=model_version, - model_id=self.model_id, - type="type", - uri=f"sfc:{_DATABASE_NAME}.{_SCHEMA_NAME}.SNOWML_MODEL_{expected_stage_postfix}", - description="description", - tags=None, - ) + target=_model, attribute="save_model", return_value=mock_metadata + ) as mock_save: + with absltest.mock.patch.object( + target=model_registry, attribute="_register_model_with_id", return_value=None + ) as mock_register: + with absltest.mock.patch.object(model_registry, "_get_model_id", return_value=self.model_id): + m_signatures = {"predict": None} + model_registry.log_model( + model_name=model_name, + model_version=model_version, + model=mock_model, + signatures=m_signatures, + description="description", + tags=None, + ) + mock_path.assert_called_once_with(model_name=model_name, model_version=model_version) + mock_save.assert_called_once_with( + name=model_name, + session=self._session, + model_stage_file_path=model_path, + model=mock_model, + signatures=m_signatures, + metadata=None, + conda_dependencies=None, + pip_requirements=None, + sample_input=None, + code_paths=None, + options=None, + ) + mock_register.assert_called_once_with( + model_name=model_name, + model_version=model_version, + model_id=self.model_id, + type=mock_type, + uri=uri.get_uri_from_snowflake_stage_path(model_path), + description="description", + tags=None, + training_dataset=None, + ) + + with absltest.mock.patch.object( + model_registry, + "_list_selected_models", + return_value=absltest.mock.MagicMock(count=absltest.mock.MagicMock(return_value=1)), + ): + with self.assertRaises(connector.DataError): + model_registry.log_model( + model_name=model_name, + model_version=model_version, + model=mock_model, + signatures=m_signatures, + description="description", + tags=None, + ) + + self.add_session_mock_sql( + query=f"DROP STAGE {_DATABASE_NAME}.{_SCHEMA_NAME}.SNOWML_MODEL_{expected_stage_postfix}", + result=mock_data_frame.MockDataFrame( + [snowpark.Row(**{"status": f"Stage area SNOWML_MODEL_{expected_stage_postfix} successfully dropped."})] + ), + ) + + with absltest.mock.patch.object( + model_registry, + "_list_selected_models", + 
return_value=absltest.mock.MagicMock(count=absltest.mock.MagicMock(return_value=0)), + ): + with absltest.mock.patch.object( + model_registry, + "_log_model_path", + return_value=(self.model_id, expected_stage_path), + ) as mock_path: + mock_model = absltest.mock.MagicMock() + mock_type = absltest.mock.MagicMock() + mock_metadata = absltest.mock.MagicMock(model_type=mock_type) + with absltest.mock.patch.object(target=_model, attribute="save_model") as mock_save: + mock_save.side_effect = ValueError("Mock Error") + with self.assertRaises(ValueError): + model_registry.log_model( + model_name=model_name, + model_version=model_version, + model=mock_model, + signatures=m_signatures, + description="description", + tags=None, + ) def test_delete_model_with_artifact(self) -> None: """Test deleting a model and artifact from the registry.""" @@ -852,7 +1066,14 @@ def test_delete_model_with_artifact(self) -> None: self.setup_list_model_call().add_operation(operation="filter").add_operation( operation="filter" ).add_collect_result( - [snowpark.Row(ID=self.model_id, NAME=self.model_name, VERSION=self.model_version, URI="sfc://model_stage")], + [ + snowpark.Row( + ID=self.model_id, + NAME=self.model_name, + VERSION=self.model_version, + URI=f"sfc://{_DATABASE_NAME}.{_SCHEMA_NAME}.model_stage", + ) + ], ) self.add_session_mock_sql( query=f""" @@ -862,12 +1083,14 @@ def test_delete_model_with_artifact(self) -> None: ) self.add_session_mock_sql( query=f"DROP STAGE {_DATABASE_NAME}.{_SCHEMA_NAME}.model_stage", - result=mock_data_frame.MockDataFrame([snowpark.Row(**{"status": "'model_stage' successfully dropped."})]), + result=mock_data_frame.MockDataFrame( + [snowpark.Row(**{"status": f"'{_DATABASE_NAME}.{_SCHEMA_NAME}.model_stage' successfully dropped."})] + ), ) self.template_test_set_attribute( "DELETION", { - "URI": "sfc://model_stage", + "URI": f"sfc://{_DATABASE_NAME}.{_SCHEMA_NAME}.model_stage", "delete_artifact": True, }, use_id=True, diff --git a/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb index 36417955..847c38fa 100644 --- a/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb +++ b/snowflake/ml/registry/notebooks/Deployment to Snowpark Container Service Demo.ipynb @@ -141,12 +141,11 @@ "model_name = \"snowpark_ml_logistic\"\n", "model_version = \"v1\"\n", "\n", - "registry.log_model(\n", + "model_ref = registry.log_model(\n", " model_name=model_name,\n", " model_version=model_version,\n", " model=logistic_model,\n", " sample_input_data=test_features,\n", - " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", ")" ] }, @@ -177,10 +176,6 @@ "from snowflake.ml.model import deploy_platforms\n", "from snowflake import snowpark\n", "\n", - "model_ref = model_registry.ModelReference(\n", - " registry=registry, model_name=model_name, model_version=model_version\n", - ")\n", - "\n", "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created compute pool\n", "deployment_name = \"LOGISTIC_FUNC\" # Name of the resulting UDF\n", "\n", @@ -399,21 +394,10 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "9dd84f88", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'bafae568275d11ee95175ac3f3b698e1'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from snowflake.ml.registry import 
model_registry\n", "\n", @@ -421,13 +405,11 @@ "model_name = \"cross_encoder_model\"\n", "model_version = \"v1\"\n", "\n", - "registry.log_model(\n", + "model_ref = registry.log_model(\n", " model_name=model_name,\n", " model_version=model_version,\n", " model=model,\n", - " conda_dependencies=[\"pytorch::pytorch==2.0.1\", \"conda-forge::transformers==4.18.0\"],\n", " sample_input_data=test_features,\n", - " options={\"embed_local_ml_library\": True}, # This option is enabled to pull latest dev code changes.\n", ")" ] }, @@ -441,66 +423,13 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "701152f7", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Building the Docker image and deploying to Snowpark Container Service. This process may take a few minutes.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Image successfully built! 
To prevent the need for rebuilding the Docker image in future deployments, simply specify 'prebuilt_snowflake_image': 'temptest002038-servicesnow.registry-dev.snowflakecomputing.com/inference_container_db/inference_container_schema/snowml_repo/bafae568275d11ee95175ac3f3b698e1:latest' in the options field of the deploy() function\n" - ] - } - ], + "outputs": [], "source": [ "from snowflake.ml.model import deploy_platforms\n", "\n", - "model_ref = model_registry.ModelReference(\n", - " registry=registry, model_name=model_name, model_version=model_version\n", - ")\n", - "\n", "compute_pool = \"MY_COMPUTE_POOL\" # Pre-created\n", "deployment_name = \"CROSS_ENCODER\" # Name of the resulting UDF\n", "\n", @@ -510,7 +439,7 @@ " target_method=\"predict\",\n", " options={\n", " \"compute_pool\": compute_pool,\n", - " \"use_gpu\": True\n", + " \"num_gpus\": 1\n", " }\n", ")" ] diff --git a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb index c7bb52e8..4b25a6c9 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb @@ -33,16 +33,7 @@ "id": "1117c596", "metadata": {}, "source": [ - "Please refer to our [readme file](https://docs.google.com/document/d/10DmBHYFGKINQwyvJupfuhARDk-cyG5_Fn3Uy2OQcQPk) to install `snowflake-ml-python`." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "b1b950fe", - "metadata": {}, - "source": [ - "If you are about to go over the **Use with customize model** part in this notebook, you will need `tensorflow` and `transformers`, which could be installed by following command." + "Please refer to our [landing page](https://docs.snowflake.com/en/developer-guide/snowpark-ml/index) to install `snowflake-ml-python`." 
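For quick reference next to the landing-page pointer above, a minimal install sketch (assuming a notebook environment; the package is also published to the Snowflake Anaconda channel):

# A minimal sketch: install the package from a notebook cell; drop the "%" prefix to run it in a shell instead.
%pip install snowflake-ml-python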
] }, { @@ -56,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "afd16ff5", "metadata": {}, "outputs": [], @@ -67,10 +58,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "d609ff44", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Scale cell width with the browser window to accommodate .show() commands for wider tables.\n", "from IPython.display import display, HTML\n", @@ -103,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "b2efc0a8", "metadata": {}, "outputs": [], @@ -134,13 +138,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "a95e3431", "metadata": {}, "outputs": [], "source": [ - "REGISTRY_DATABASE_NAME = \"TEMP\"\n", - "REGISTRY_SCHEMA_NAME = \"WZHAO\"" + "REGISTRY_DATABASE_NAME = \"MODEL_REGISTRY\"\n", + "REGISTRY_SCHEMA_NAME = \"PUBLIC\"" ] }, { @@ -151,8 +155,13 @@ "outputs": [], "source": [ "from snowflake.ml.registry import model_registry\n", - "model_registry.create_model_registry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)\n", - "registry = model_registry.ModelRegistry(session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME)" + "\n", + "model_registry.create_model_registry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")\n", + "registry = model_registry.ModelRegistry(\n", + " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", + ")" ] }, { @@ -184,13 +193,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "8cf44218", "metadata": {}, "outputs": [], "source": [ "from sklearn import svm\n", "from sklearn.datasets import load_digits\n", + "import numpy as np\n", "\n", "digits = load_digits()\n", "target_digit = 6\n", @@ -270,15 +280,6 @@ "Also, you have to provide sample input data so that we can infer the model signature for you, or you can specify the model signature manually (see the sketch below)." ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c52611ac", - "metadata": {}, - "source": [ - "Also, since `snowflake-ml-python` does not exist in Anaconda channel yet, we have to set `embed_local_ml_library` as `True` to embed local library into the model, it will not required when we our package into Snowflake Anaconda Channel." 
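As a companion to the signature note above: when no sample input data is provided, the signature can be spelled out by hand. A minimal sketch using the same `model_signature` API that the GPT-2 cell later in this notebook uses; the feature names and dtypes here are illustrative, not prescribed:

from snowflake.ml.model import model_signature

# One string input column and one string output column; adjust the names and
# DataType members to match the model's actual inputs and outputs.
manual_signature = model_signature.ModelSignature(
    inputs=[model_signature.FeatureSpec(name="input", dtype=model_signature.DataType.STRING)],
    outputs=[model_signature.FeatureSpec(name="output", dtype=model_signature.DataType.STRING)],
)

It would then be passed to log_model() as signatures={"predict": manual_signature} in place of sample_input_data.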
- ] - }, { "cell_type": "code", "execution_count": null, @@ -286,8 +287,8 @@ "metadata": {}, "outputs": [], "source": [ - "SVC_MODEL_NAME=\"SIMPLE_SVC_MODEL\"\n", - "SVC_MODEL_VERSION=\"2\"" + "SVC_MODEL_NAME = \"SIMPLE_SVC_MODEL\"\n", + "SVC_MODEL_VERSION = \"v1\"" ] }, { @@ -298,18 +299,13 @@ "outputs": [], "source": [ "# A name and model tags can be added to the model at registration time.\n", - "model_id = registry.log_model(\n", + "svc_model = registry.log_model(\n", " model_name=SVC_MODEL_NAME,\n", " model_version=SVC_MODEL_VERSION,\n", " model=clf,\n", " tags={\"stage\": \"testing\", \"classifier_type\": \"svm.SVC\", \"svc_gamma\": svc_gamma, \"svc_C\": svc_C},\n", " sample_input_data=test_features[:10],\n", - " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, model_name=SVC_MODEL_NAME, model_version=SVC_MODEL_VERSION)\n", - "print(\"Registered new model:\", model_id)" + ")" ] }, { @@ -339,11 +335,7 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=SVC_MODEL_NAME, model_version=SVC_MODEL_VERSION)\n", - "model.deploy(\n", + "svc_model.deploy(\n", " deployment_name=\"svc_model_predict\",\n", " target_method=\"predict\",\n", ")" @@ -356,7 +348,7 @@ "metadata": {}, "outputs": [], "source": [ - "remote_prediction = model.predict(deployment_name=\"svc_model_predict\", data=test_features)\n", + "remote_prediction = svc_model.predict(deployment_name=\"svc_model_predict\", data=test_features)\n", "\n", "print(\"Remote prediction:\", remote_prediction[:10])\n", "\n", @@ -379,7 +371,7 @@ "metadata": {}, "outputs": [], "source": [ - "model.deploy(\n", + "svc_model.deploy(\n", " deployment_name=\"svc_model_predict_proba\",\n", " target_method=\"predict_proba\",\n", ")" @@ -392,7 +384,7 @@ "metadata": {}, "outputs": [], "source": [ - "remote_prediction_proba = model.predict(deployment_name=\"svc_model_predict_proba\", data=test_features)\n", + "remote_prediction_proba = svc_model.predict(deployment_name=\"svc_model_predict_proba\", data=test_features)\n", "\n", "print(\"Remote prediction:\", remote_prediction_proba[:10])\n", "\n", @@ -414,7 +406,8 @@ "id": "f2224cc7", "metadata": {}, "source": [ - "Also with customized model, it could do much more than what shows above." + "Requirements:\n", + "- `transformers` and `tensorflow` installed locally." 
] }, { @@ -523,12 +516,17 @@ "metadata": {}, "outputs": [], "source": [ - "gpt_model = GPT2Model(custom_model.ModelContext(models={}, artifacts={\n", - " \"model\":os.path.join(ARTIFACTS_DIR, \"model\"),\n", - " \"tokenizer\":os.path.join(ARTIFACTS_DIR, \"tokenizer\")\n", - "}))\n", + "gpt_model = GPT2Model(\n", + " custom_model.ModelContext(\n", + " models={},\n", + " artifacts={\n", + " \"model\": os.path.join(ARTIFACTS_DIR, \"model\"),\n", + " \"tokenizer\": os.path.join(ARTIFACTS_DIR, \"tokenizer\"),\n", + " },\n", + " )\n", + ")\n", "\n", - "gpt_model.predict(pd.DataFrame({\"input\":[\"Hello, are you GPT?\"]}))" + "gpt_model.predict(pd.DataFrame({\"input\": [\"Hello, are you GPT?\"]}))" ] }, { @@ -557,7 +555,7 @@ "outputs": [], "source": [ "GPT2_MODEL_NAME = \"GPT2_MODEL\"\n", - "GPT2_MODEL_VERSION = \"2\"" + "GPT2_MODEL_VERSION = \"v1\"" ] }, { @@ -569,7 +567,7 @@ "source": [ "from snowflake.ml.model import model_signature\n", "\n", - "model_id_gpt = registry.log_model(\n", + "gpt_model_ref = registry.log_model(\n", " model_name=GPT2_MODEL_NAME,\n", " model_version=GPT2_MODEL_VERSION,\n", " model=gpt_model,\n", @@ -580,11 +578,7 @@ " outputs=[model_signature.FeatureSpec(name=\"output\", dtype=model_signature.DataType.STRING)],\n", " )\n", " },\n", - " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "gpt_model = model_registry.ModelReference(registry=registry, model_name=GPT2_MODEL_NAME, model_version=GPT2_MODEL_VERSION)\n", - "print(\"Registered new model:\", model_id_gpt)" + ")" ] }, { @@ -613,18 +607,8 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "gpt_model = model_registry.ModelReference(\n", - " registry=registry,\n", - " model_name=GPT2_MODEL_NAME,\n", - " model_version=GPT2_MODEL_VERSION,\n", - ")\n", - "gpt_model.deploy(\n", + "gpt_model_ref.deploy(\n", " deployment_name=\"gpt_model_predict\",\n", - " target_method=\"predict\",\n", - " options={\"relax_version\": True},\n", ")" ] }, @@ -635,17 +619,7 @@ "metadata": {}, "outputs": [], "source": [ - "res = gpt_model.predict(deployment_name=\"gpt_model_predict\", data=pd.DataFrame({\"input\":[\"Hello, are you GPT?\"]}))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e479c77d", - "metadata": {}, - "outputs": [], - "source": [ - "print(res)" + "gpt_model_ref.predict(deployment_name=\"gpt_model_predict\", data=pd.DataFrame({\"input\": [\"Hello, are you GPT?\"]}))" ] }, { @@ -764,7 +738,7 @@ ")\n", "ft_standard_scaler = ft_standard_scaler.fit(kddcup99_sp_df_train)\n", "kddcup99_sp_df_train = ft_standard_scaler.transform(kddcup99_sp_df_train)\n", - "kddcup99_sp_df_test = ft_standard_scaler.transform(kddcup99_sp_df_test)\n" + "kddcup99_sp_df_test = ft_standard_scaler.transform(kddcup99_sp_df_test)" ] }, { @@ -784,7 +758,7 @@ "outputs": [], "source": [ "XGB_MODEL_NAME = \"XGB_MODEL_KDDCUP99\"\n", - "XGB_MODEL_VERSION = \"2\"" + "XGB_MODEL_VERSION = \"v1\"" ] }, { @@ -820,23 +794,12 @@ "metadata": {}, "outputs": [], "source": [ - "from snowflake.ml.model import model_signature\n", - "\n", - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "# A name and model tags can be added to the model at registration time.\n", - "model_id_xgb = registry.log_model(\n", + "xgb_model = registry.log_model(\n", " model_name=XGB_MODEL_NAME,\n", " 
model_version=XGB_MODEL_VERSION,\n", " model=regressor,\n", " sample_input_data=kddcup99_sp_df_train.drop('\"labels\"'),\n", - " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "# The object API can be used to reference a model after creation.\n", - "xgb_model = model_registry.ModelReference(registry=registry, model_name=XGB_MODEL_NAME, model_version=XGB_MODEL_VERSION)\n", - "print(\"Registered new model:\", model_id_xgb)" + ")" ] }, { @@ -855,21 +818,8 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "xgb_model = model_registry.ModelReference(\n", - " registry=registry,\n", - " model_name=XGB_MODEL_NAME,\n", - " model_version=XGB_MODEL_VERSION,\n", - ")\n", "xgb_model.deploy(\n", - " deployment_name=\"xgb_model_predict\",\n", - " target_method=\"predict\",\n", - " permanent=True,\n", - " options={\n", - " \"relax_version\": True,\n", - " },\n", + " deployment_name=\"xgb_model_predict\", target_method=\"predict\", permanent=True, options={\"relax_version\": True}\n", ")" ] }, @@ -934,12 +884,12 @@ "another_registry = model_registry.ModelRegistry(\n", " session=another_session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", ")\n", - "xgb_model = model_registry.ModelReference(\n", + "xgb_model_ref = model_registry.ModelReference(\n", " registry=another_registry,\n", " model_name=XGB_MODEL_NAME,\n", " model_version=XGB_MODEL_VERSION,\n", ")\n", - "xgb_model.list_deployments()" + "xgb_model_ref.list_deployments().show()" ] }, { @@ -949,7 +899,7 @@ "metadata": {}, "outputs": [], "source": [ - "sp_res = xgb_model.predict(\n", + "sp_res = xgb_model_ref.predict(\n", " deployment_name=\"xgb_model_predict\", data=another_session.create_dataframe(kddcup99_sp_df_test.to_pandas())\n", ")\n", "sp_res.show()" @@ -980,7 +930,471 @@ "metadata": {}, "outputs": [], "source": [ - "xgb_model.delete_deployment(deployment_name=\"xgb_model_predict\")" + "xgb_model_ref.delete_deployment(deployment_name=\"xgb_model_predict\")" + ] + }, + { + "cell_type": "markdown", + "id": "5df0ed62", + "metadata": {}, + "source": [ + "### Deploy to SPCS using GPU for inference" + ] + }, + { + "cell_type": "markdown", + "id": "08bce3c3", + "metadata": {}, + "source": [ + "Requirements:\n", + "- `xgboost==1.7.6` installed locally.\n", + "- a SPCS compute pool with at least 1 GPU (see the creation sketch below)." 
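The deploy cell below assumes the compute pool already exists. A sketch of creating a small GPU pool first; the pool name matches the placeholder used earlier in these notebooks, while the instance family is a stand-in to be replaced with whatever GPU instance families your account actually offers:

# Hypothetical pool settings; "session" is the Snowpark session used throughout the notebook.
session.sql(
    """
    CREATE COMPUTE POOL IF NOT EXISTS MY_COMPUTE_POOL
        MIN_NODES = 1
        MAX_NODES = 1
        INSTANCE_FAMILY = GPU_NV_S
    """
).collect()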
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f67748d", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model import deploy_platforms\n", + "\n", + "xgb_model.deploy(\n", + " deployment_name=\"xgb_model_predict_spcs\",\n", + " target_method=\"predict\",\n", + " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", + " permanent=True,\n", + " options={\"compute_pool\": \"...\", \"num_gpus\": 1, \"num_workers\": 24},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bff75a87", + "metadata": {}, + "outputs": [], + "source": [ + "sp_res = xgb_model.predict(deployment_name=\"xgb_model_predict_spcs\", data=kddcup99_sp_df_test)\n", + "sp_res.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3372b222", + "metadata": {}, + "outputs": [], + "source": [ + "xgb_model.delete_deployment(deployment_name=\"xgb_model_predict_spcs\")" + ] + }, + { + "cell_type": "markdown", + "id": "2114bb8c", + "metadata": {}, + "source": [ + "## Using LLM with HuggingFace Pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "cd99cd28", + "metadata": {}, + "source": [ + "Requirements:\n", + "- `transformers>=4.31.0` and `tokenizers>=0.13.3` installed locally.\n", + "- a HuggingFace token with read access.\n", + "- a SPCS compute pool with at least 1 GPU.\n", + "- News Category Dataset from https://www.kaggle.com/datasets/rmisra/news-category-dataset" + ] + }, + { + "cell_type": "markdown", + "id": "07bb4d94", + "metadata": {}, + "source": [ + "### Preparing Data into Snowflake" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "280c8644", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "news_dataset = pd.read_json(\"News_Category_Dataset_v3.json\", lines=True).convert_dtypes()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "c8500b9a", + "metadata": {}, + "outputs": [], + "source": [ + "NEWS_DATA_TABLE_NAME = \"news_dataset\"\n", + "news_dataset_sp_df = session.create_dataframe(news_dataset)\n", + "news_dataset_sp_df.write.mode(\"overwrite\").save_as_table(NEWS_DATA_TABLE_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "533ef50c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"headline\" |\"category\" |\"short_description\" |\n", + "-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|Where Do We Come From? |WEIRD NEWS |My dear readers, Denial is not a river in Egypt. What will it take to wake up? Or have we allowed ourselves to be so disempowered that we have thrown in the towel? If so, is self destruction imminent? I would hope not. |\n", + "|Sen. 
Mike Enzi Wins Primary, Will Face Democrat In November |POLITICS | |\n", + "|Start-Art |ARTS |For too long the contemporary art world has been the exclusive redoubt of insiders, tastemakers, and a privileged elite. Gertrude has exploded this paradigm, and fashioned a conversational forum that democratizes and demystifies contemporary art. |\n", + "|Tony Wagner's The Global Achievement Gap Is More Relevant Than Ever |EDUCATION |We have always had plenty of soul-killing, drill and kill instruction. In the past, however, it was seen as education malpractice. Now, it is imposed in the name of \"reform.\" |\n", + "|Why Are Shoes So Damn Expensive? |COMEDY | |\n", + "|Create an Online Gift Registry for Your Baby and Get the Items You Need and Want |PARENTS |Online gift registries are making it easy and convenient for expectant parents to get everything on their wish lists. |\n", + "|I'm an American Citizen. If You Want to Remain a Cop, Don't Violate My Human Rights |POLITICS |This idea that cops get to say when and where constitutional rights apply is so very, deeply misguided that I am shocked anyone could type it out without coming to their senses mid-sentence. |\n", + "|Jose Antonio Vargas Among Undocumented Immigrants Making Urgent Plea To Obama |POLITICS | |\n", + "|The 1 Minute Blog. Emotions Consuming You? |GOOD NEWS | |\n", + "|Beyond the Ice-Bucket: A Deeper Challenge |IMPACT |Life is full of challenges. Some are profoundly life-changing, and some are cold and wet. The ice-bucket challenge was not only great summer fun, it was also one of the most positive and productive viral campaigns in history. |\n", + "-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "news_dataset_sp = session.table(NEWS_DATA_TABLE_NAME).select('\"headline\"','\"category\"','\"short_description\"')\n", + "\n", + "news_dataset_sp.show(max_width=600)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7d9c076b", + "metadata": {}, + "outputs": [], + "source": [ + "LLM_MODEL_NAME = \"llama-2-7b-chat\"\n", + "LLM_MODEL_VERSION = \"v1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d9d072bc", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model.models import huggingface_pipeline\n", + "\n", + "llama_model = huggingface_pipeline.HuggingFacePipelineModel(\n", + " task=\"text-generation\",\n", + " model=\"meta-llama/Llama-2-7b-chat-hf\",\n", + " token=\"...\", # Put your HuggingFace token here.\n", + " return_full_text=False,\n", + " max_new_tokens=100,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "29b00bc2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.log_model() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.list_models() is in private preview since 0.2.0. Do not use it in production. 
\n" + ] + } + ], + "source": [ + "llama_model_ref = registry.log_model(\n", + " model_name=LLM_MODEL_NAME,\n", + " model_version=LLM_MODEL_VERSION,\n", + " model=llama_model,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4d6c93ea", + "metadata": {}, + "outputs": [], + "source": [ + "DEPLOYMENT_NAME=\"llama_predict\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c464d3aa", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.ml.model import deploy_platforms\n", + "\n", + "llama_model_ref.deploy(\n", + " deployment_name=DEPLOYMENT_NAME,\n", + " platform=deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,\n", + " permanent=True,\n", + " options={\n", + " \"compute_pool\": \"...\",\n", + " \"num_gpus\": 1,\n", + " \"enable_remote_image_build\": True,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "e68f70b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"headline\" |\"category\" |\"short_description\" |\"inputs\" |\n", + "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|Where Do We Come From? |WEIRD NEWS |My dear readers, Denial is not a river in Egypt. What will it take to wake up? Or have we allowed ourselves to be so disempowered that we have thrown in the towel? If so, is self destruction imminent? I would hope not. |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. 
Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Where Do We Come From? My dear readers, Denial is not a river in Egypt. What will it take to wake up? Or have we allowed ourselves to be so disempowered that we have thrown in the towel? If so, is self destruction imminent? I would hope not. [/INST] |\n", + "|Sen. Mike Enzi Wins Primary, Will Face Democrat In November |POLITICS | |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Sen. Mike Enzi Wins Primary, Will Face Democrat In November [/INST] |\n", + "|Start-Art |ARTS |For too long the contemporary art world has been the exclusive redoubt of insiders, tastemakers, and a privileged elite. Gertrude has exploded this paradigm, and fashioned a conversational forum that democratizes and demystifies contemporary art. |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. 
The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Start-Art For too long the contemporary art world has been the exclusive redoubt of insiders, tastemakers, and a privileged elite. Gertrude has exploded this paradigm, and fashioned a conversational forum that democratizes and demystifies contemporary art. [/INST] |\n", + "|Tony Wagner's The Global Achievement Gap Is More Relevant Than Ever |EDUCATION |We have always had plenty of soul-killing, drill and kill instruction. In the past, however, it was seen as education malpractice. Now, it is imposed in the name of \"reform.\" |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Tony Wagner's The Global Achievement Gap Is More Relevant Than Ever We have always had plenty of soul-killing, drill and kill instruction. In the past, however, it was seen as education malpractice. Now, it is imposed in the name of \"reform.\" [/INST] |\n", + "|Why Are Shoes So Damn Expensive? |COMEDY | |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Why Are Shoes So Damn Expensive? 
[/INST] |\n", + "|Create an Online Gift Registry for Your Baby and Get the Items You Need and Want |PARENTS |Online gift registries are making it easy and convenient for expectant parents to get everything on their wish lists. |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Create an Online Gift Registry for Your Baby and Get the Items You Need and Want Online gift registries are making it easy and convenient for expectant parents to get everything on their wish lists. [/INST] |\n", + "|I'm an American Citizen. If You Want to Remain a Cop, Don't Violate My Human Rights |POLITICS |This idea that cops get to say when and where constitutional rights apply is so very, deeply misguided that I am shocked anyone could type it out without coming to their senses mid-sentence. |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | I'm an American Citizen. If You Want to Remain a Cop, Don't Violate My Human Rights This idea that cops get to say when and where constitutional rights apply is so very, deeply misguided that I am shocked anyone could type it out without coming to their senses mid-sentence. [/INST] |\n", + "|Jose Antonio Vargas Among Undocumented Immigrants Making Urgent Plea To Obama |POLITICS | |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. 
Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Jose Antonio Vargas Among Undocumented Immigrants Making Urgent Plea To Obama [/INST] |\n", + "|The 1 Minute Blog. Emotions Consuming You? |GOOD NEWS | |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | The 1 Minute Blog. Emotions Consuming You? [/INST] |\n", + "|Beyond the Ice-Bucket: A Deeper Challenge |IMPACT |Life is full of challenges. Some are profoundly life-changing, and some are cold and wet. The ice-bucket challenge was not only great summer fun, it was also one of the most positive and productive viral campaigns in history. |[INST] <> |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} |\n", + "| | | |<> |\n", + "| | | | Beyond the Ice-Bucket: A Deeper Challenge Life is full of challenges. Some are profoundly life-changing, and some are cold and wet. 
The ice-bucket challenge was not only great summer fun, it was also one of the most positive and productive viral campaigns in history. [/INST] |\n", + "-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "import snowflake.snowpark.functions as F\n", + "\n", + "prompt_prefix = \"\"\"[INST] <>\n", + "Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} \n", + "As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8}\n", + "<>\n", + "\"\"\"\n", + "prompt_suffix = \"[/INST]\"\n", + "\n", + "input_df = news_dataset_sp.with_column(\n", + " '\"inputs\"',\n", + " F.concat_ws(\n", + " F.lit(\" \"), F.lit(prompt_prefix), F.col('\"headline\"'), F.col('\"short_description\"'), F.lit(prompt_suffix)\n", + " ),\n", + ")\n", + "\n", + "input_df.show(max_width=600)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05aa68e8", + "metadata": {}, + "outputs": [], + "source": [ + "res = llama_model_ref.predict(\n", + " deployment_name=DEPLOYMENT_NAME,\n", + " data=input_df\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "783be746", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"headline\" |\"category\" |\"short_description\" |\"inputs\" |\"outputs\" |\"PRED_CATEGORY\" |\"PRED_KEYWORDS\" |\"PRED_IMPORTANCE\" |\n", + "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|Where Do We Come From? |WEIRD NEWS |My dear readers, Denial is not a river in Egypt. What will it take to wake up? Or have we allowed ourselves to be so disempowered that we have thrown in the towel? If so, is self destruction imminent? I would hope not. 
|[INST] <> |[{\"generated_text\": \" Here is the JSON output for the given text:\\n{\\n\\\"category\\\": \\\"Society\\\",\\n\\\"keywords\\\": [\\n\\\"denial\\\",\\n\\\"Egypt\\\",\\n\\\"self-destruction\\\"\\n],\\n\\\"importance\\\": 7\\n\\n}\\n\\nNote: The JSON output is valid according to the provided schema, and includes the required properties and values.\"}] |\"Society\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"denial\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"Egypt\", | |\n", + "| | | |<> | | | \"self-destruction\" | |\n", + "| | | | Where Do We Come From? My dear readers, Denial is not a river in Egypt. What will it take to wake up? Or have we allowed ourselves to be so disempowered that we have thrown in the towel? If so, is self destruction imminent? I would hope not. [/INST] | | |] | |\n", + "|Sen. Mike Enzi Wins Primary, Will Face Democrat In November |POLITICS | |[INST] <> |[{\"generated_text\": \" Sure, here's the JSON output for the input \\\"Sen. Mike Enzi Wins Primary, Will Face Democrat In November\\\":\\n{\\n\\\"category\\\": \\\"Politics\\\",\\n\\\"keywords\\\": [\\\"Mike Enzi\\\", \\\"primary\\\", \\\"election\\\", \\\"Democrat\\\"],\\n\\\"importance\\\": 7\\n\\n}\\n\\nNote that I've included the required fields \\\"category\\\", \\\"keywords\\\", and \\\"importance\\\" with the appropriate\"}] |\"Politics\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"Mike Enzi\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"primary\", | |\n", + "| | | |<> | | | \"election\", | |\n", + "| | | | Sen. 
Mike Enzi Wins Primary, Will Face Democrat In November [/INST] | | | \"Democrat\" | |\n", + "| | | | | | |] | |\n", + "|Start-Art |ARTS |For too long the contemporary art world has been the exclusive redoubt of insiders, tastemakers, and a privileged elite. Gertrude has exploded this paradigm, and fashioned a conversational forum that democratizes and demystifies contemporary art. |[INST] <> |[{\"generated_text\": \" Here is the JSON output for the given input:\\n{\\n\\\"category\\\": \\\"Art\\\",\\n\\\"keywords\\\": [\\n\\\"Gertrude\\\",\\n\\\"contemporary art\\\",\\n\\\"democratization\\\",\\n\\\"demystification\\\"\\n],\\n\\\"importance\\\": 9\\n\\n}\\n\\nNote that I have included the required fields \\\"category\\\", \\\"keywords\\\", and \\\"importance\\\" in the JSON output, and have also included the specified types for each\"}] |\"Art\" |[ |9 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"Gertrude\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"contemporary art\", | |\n", + "| | | |<> | | | \"democratization\", | |\n", + "| | | | Start-Art For too long the contemporary art world has been the exclusive redoubt of insiders, tastemakers, and a privileged elite. Gertrude has exploded this paradigm, and fashioned a conversational forum that democratizes and demystifies contemporary art. [/INST] | | | \"demystification\" | |\n", + "| | | | | | |] | |\n", + "|Tony Wagner's The Global Achievement Gap Is More Relevant Than Ever |EDUCATION |We have always had plenty of soul-killing, drill and kill instruction. In the past, however, it was seen as education malpractice. Now, it is imposed in the name of \"reform.\" |[INST] <> |[{\"generated_text\": \" Sure, here is the JSON output for the given input:\\n{\\n\\\"category\\\": \\\"Education\\\",\\n\\\"keywords\\\": [\\\"soul-killing\\\", \\\"drill and kill\\\", \\\"education malpractice\\\", \\\"reform\\\"],\\n\\\"importance\\\": 7\\n}\\nNote that I have included the required properties and their respective values as per the provided JSON schema.\"}] |\"Education\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. 
The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"soul-killing\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"drill and kill\", | |\n", + "| | | |<> | | | \"education malpractice\", | |\n", + "| | | | Tony Wagner's The Global Achievement Gap Is More Relevant Than Ever We have always had plenty of soul-killing, drill and kill instruction. In the past, however, it was seen as education malpractice. Now, it is imposed in the name of \"reform.\" [/INST] | | | \"reform\" | |\n", + "| | | | | | |] | |\n", + "|Why Are Shoes So Damn Expensive? |COMEDY | |[INST] <> |[{\"generated_text\": \" Here is the JSON output for the given text:\\n{\\n\\\"category\\\": \\\"Society\\\",\\n\\\"keywords\\\": [\\\"shoes\\\", \\\"expensive\\\", \\\"prices\\\", \\\"consumerism\\\"],\\n\\\"importance\\\": 7\\n}\\n\\nNote: The output is in compliance with the provided JSON schema, and includes the required properties and values.\"}] |\"Society\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"shoes\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"expensive\", | |\n", + "| | | |<> | | | \"prices\", | |\n", + "| | | | Why Are Shoes So Damn Expensive? [/INST] | | | \"consumerism\" | |\n", + "| | | | | | |] | |\n", + "|Create an Online Gift Registry for Your Baby and Get the Items You Need and Want |PARENTS |Online gift registries are making it easy and convenient for expectant parents to get everything on their wish lists. |[INST] <> |[{\"generated_text\": \" Sure! Here is the JSON output for the given input:\\n{\\n\\\"category\\\": \\\"Parenting\\\",\\n\\\"keywords\\\": [\\n\\\"gift registry\\\",\\n\\\"baby gifts\\\",\\n\\\"online registry\\\",\\n\\\"registry ideas\\\",\\n\\\"baby shower gifts\\\"\\n],\\n\\\"importance\\\": 7\\n\\n}\\n\\nNote that I have included the \\\"category\\\" and \\\"keywords\\\" properties as requested, and have also included the \\\"\"}] |\"Parenting\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. 
Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"gift registry\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"baby gifts\", | |\n", + "| | | |<> | | | \"online registry\", | |\n", + "| | | | Create an Online Gift Registry for Your Baby and Get the Items You Need and Want Online gift registries are making it easy and convenient for expectant parents to get everything on their wish lists. [/INST] | | | \"registry ideas\", | |\n", + "| | | | | | | \"baby shower gifts\" | |\n", + "| | | | | | |] | |\n", + "|I'm an American Citizen. If You Want to Remain a Cop, Don't Violate My Human Rights |POLITICS |This idea that cops get to say when and where constitutional rights apply is so very, deeply misguided that I am shocked anyone could type it out without coming to their senses mid-sentence. |[INST] <> |[{\"generated_text\": \" Sure, here is the JSON output for the given text:\\n{\\n\\\"category\\\": \\\"Society\\\",\\n\\\"keywords\\\": [\\n\\\"police\\\",\\n\\\"human rights\\\",\\n\\\"constitution\\\",\\n\\\"violation\\\"\\n],\\n\\\"importance\\\": 9\\n\\n}\\n\\nNote that I have included the required properties and values according to the JSON schema you provided.\"}] |\"Society\" |[ |9 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"police\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"human rights\", | |\n", + "| | | |<> | | | \"constitution\", | |\n", + "| | | | I'm an American Citizen. If You Want to Remain a Cop, Don't Violate My Human Rights This idea that cops get to say when and where constitutional rights apply is so very, deeply misguided that I am shocked anyone could type it out without coming to their senses mid-sentence. 
[/INST] | | | \"violation\" | |\n", + "| | | | | | |] | |\n", + "|Jose Antonio Vargas Among Undocumented Immigrants Making Urgent Plea To Obama |POLITICS | |[INST] <> |[{\"generated_text\": \" Sure, here is the JSON output for the given input:\\n{\\n\\\"category\\\": \\\"Politics\\\",\\n\\\"keywords\\\": [\\\"undocumented immigrants\\\", \\\"Barack Obama\\\", \\\"plea\\\"],\\n\\\"importance\\\": 7\\n\\n}\\n\\nPlease note that the output is in the format of a JSON object, with the properties, keywords, and importance values corresponding to the fields in the JSON schema you provided.\"}] |\"Politics\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"undocumented immigrants\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"Barack Obama\", | |\n", + "| | | |<> | | | \"plea\" | |\n", + "| | | | Jose Antonio Vargas Among Undocumented Immigrants Making Urgent Plea To Obama [/INST] | | |] | |\n", + "|The 1 Minute Blog. Emotions Consuming You? |GOOD NEWS | |[INST] <> |[{\"generated_text\": \" Sure, I can provide a valid JSON response that conforms to the schema you provided. Here's an example:\\n{\\n\\\"category\\\": \\\"Local News\\\",\\n\\\"keywords\\\": [\\n\\\"crime\\\",\\n\\\"arrest\\\",\\n\\\"suspect\\\",\\n\\\"police\\\",\\n\\\"investigation\\\"\\n],\\n\\\"importance\\\": 7\\n\\n}\\n\\nPlease let me know if you have any other questions or if you'd like me to\"}] |\"Local News\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"crime\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"arrest\", | |\n", + "| | | |<> | | | \"suspect\", | |\n", + "| | | | The 1 Minute Blog. Emotions Consuming You? 
[/INST] | | | \"police\", | |\n", + "| | | | | | | \"investigation\" | |\n", + "| | | | | | |] | |\n", + "|Beyond the Ice-Bucket: A Deeper Challenge |IMPACT |Life is full of challenges. Some are profoundly life-changing, and some are cold and wet. The ice-bucket challenge was not only great summer fun, it was also one of the most positive and productive viral campaigns in history. |[INST] <> |[{\"generated_text\": \" Here is the JSON output for the provided text:\\n{\\n\\\"category\\\": \\\"Society and Culture\\\",\\n\\\"keywords\\\": [\\n\\\"challenge\\\",\\n\\\"life\\\",\\n\\\"viral campaign\\\",\\n\\\"positive\\\",\\n\\\"productive\\\"\\n],\\n\\\"importance\\\": 7\\n\\n}\\n\\nNote: The output is formatted according to the provided JSON schema, with the required properties and values. The \\\"keywords\\\" array includes the five keywords mentioned\"}] |\"Society and Culture\" |[ |7 |\n", + "| | | |Your output will be parsed by a computer program as a JSON object. Please respond ONLY with valid json that conforms to this JSON schema: {\"properties\": {\"category\": {\"type\": \"string\",\"description\": \"The category that the news should belong to.\"},\"keywords\": {\"type\": \"array\":\"description\": \"The keywords that are mentioned in the news.\",\"items\": [{\"type\": \"string\"}]},\"importance\": {\"type\": \"number\",\"description\": \"A integer from 1 to 10 to show if the new is important. The higher the more important the news is.\"}},\"required\": [\"properties\",\"keywords\",\"importance\"]} | | | \"challenge\", | |\n", + "| | | |As an example, input \"Residents ordered to evacuate amid threat of growing wildfire in Washington state, medical facilities sheltering in place\" results in the json: {\"category\": \"Natural Disasters\",\"keywords\": [\"evacuate\", \"wildfire\", \"Washington state\", \"medical facilities\"],\"importance\": 8} | | | \"life\", | |\n", + "| | | |<> | | | \"viral campaign\", | |\n", + "| | | | Beyond the Ice-Bucket: A Deeper Challenge Life is full of challenges. Some are profoundly life-changing, and some are cold and wet. The ice-bucket challenge was not only great summer fun, it was also one of the most positive and productive viral campaigns in history. 
[/INST] | | | \"positive\", | |\n", + "| | | | | | | \"productive\" | |\n", + "| | | | | | |] | |\n", + "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "json_capture_regexp = r'[{\\[]{1}([,:{}\\[\\]0-9.\\-+Eaeflnr-u \\n\\r\\t]|\".*?\")+[}\\]]{1}'\n", + "\n", + "output_json_col = F.parse_json(\n", + " F.regexp_extract(\n", + " F.replace(F.get(F.get(F.parse_json(F.col('\"outputs\"')), 0), F.lit(\"generated_text\")), r\"\\\"\", '\"'),\n", + " json_capture_regexp,\n", + " 0,\n", + " )\n", + ")\n", + "\n", + "output_df = res.with_columns(\n", + " [\"pred_category\", \"pred_keywords\", \"pred_importance\"],\n", + " [\n", + " F.get(output_json_col, F.lit(\"category\")),\n", + " F.get(output_json_col, F.lit(\"keywords\")),\n", + " F.get(output_json_col, F.lit(\"importance\")),\n", + " ],\n", + ")\n", + "\n", + "output_df.show(max_width=600)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "9baccb60", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.delete_deployment() is in private preview since 1.0.1. Do not use it in production. \n" + ] + } + ], + "source": [ + "llama_model_ref.delete_deployment(deployment_name=DEPLOYMENT_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "968f8571", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.delete_model() is in private preview since 0.2.0. Do not use it in production. 
\n" + ] + } + ], + "source": [ + "llama_model_ref.delete_model()" ] } ], @@ -1000,7 +1414,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.9.16" }, "vscode": { "interpreter": { diff --git a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb index a79d6aec..49cc052c 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb @@ -139,8 +139,8 @@ "metadata": {}, "outputs": [], "source": [ - "REGISTRY_DATABASE_NAME = \"TEMP\"\n", - "REGISTRY_SCHEMA_NAME = \"WZHAO\"" + "REGISTRY_DATABASE_NAME = \"MODEL_REGISTRY\"\n", + "REGISTRY_SCHEMA_NAME = \"PUBLIC\"" ] }, { @@ -262,17 +262,12 @@ "outputs": [], "source": [ "# A name and model tags can be added to the model at registration time.\n", - "model_id = registry.log_model(\n", + "model = registry.log_model(\n", " model_name=model_name,\n", " model_version=model_version,\n", " model=clf_xgb,\n", " tags={\"stage\": \"testing\", \"classifier_type\": \"XGBClassifier\"},\n", - " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "print(\"Registered new model:\", model_id)" + ")" ] }, { @@ -298,14 +293,9 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", - " options={\"relax_version\": True},\n", ")" ] }, @@ -338,14 +328,9 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_proba\",\n", - " options={\"relax_version\": True},\n", ")" ] }, @@ -430,17 +415,12 @@ "outputs": [], "source": [ "# A name and model tags can be added to the model at registration time.\n", - "model_id = registry.log_model(\n", + "model = registry.log_model(\n", " model_name=model_name,\n", " model_version=model_version,\n", " model=clf_rf,\n", " tags={\"stage\": \"testing\", \"classifier_type\": classifier_type},\n", - " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "print(\"Registered new model:\", model_id)" + ")" ] }, { @@ -458,14 +438,9 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", - " 
options={\"relax_version\": True},\n", ")" ] }, @@ -490,14 +465,9 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_proba\",\n", - " options={\"relax_version\": True},\n", ")" ] }, @@ -522,14 +492,9 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_log_proba\",\n", - " options={\"relax_version\": True},\n", ")" ] }, @@ -616,17 +581,13 @@ "outputs": [], "source": [ "# A name and model tags can be added to the model at registration time.\n", - "model_id = registry.log_model(\n", + "model = registry.log_model(\n", " model_name=model_name,\n", " model_version=model_version,\n", " model=clf_lr,\n", " tags={\"stage\": \"testing\", \"classifier_type\": classifier_type},\n", " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "print(\"Registered new model:\", model_id)" + ")" ] }, { @@ -644,10 +605,6 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", @@ -676,10 +633,6 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_proba\",\n", @@ -708,10 +661,6 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_log_proba\",\n", @@ -740,10 +689,6 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"decision_function\",\n", @@ -903,17 +848,13 @@ "outputs": [], "source": [ "# A name and model tags can be added to the model at registration time.\n", - "model_id = registry.log_model(\n", + 
"model = registry.log_model(\n", " model_name=model_name,\n", " model_version=model_version,\n", " model=pipeline,\n", " tags={\"stage\": \"testing\", \"classifier_type\": classifier_type},\n", " options={\"embed_local_ml_library\": True}\n", - ")\n", - "\n", - "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", - "print(\"Registered new model:\", model_id)" + ")" ] }, { @@ -931,10 +872,6 @@ "metadata": {}, "outputs": [], "source": [ - "registry = model_registry.ModelRegistry(\n", - " session=session, database_name=REGISTRY_DATABASE_NAME, schema_name=REGISTRY_SCHEMA_NAME\n", - ")\n", - "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", @@ -957,14 +894,6 @@ "\n", "print(\"Result comparison:\", np.allclose(prediction, remote_prediction.values))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0dd77df3", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb b/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb index aedf48b4..c7904355 100644 --- a/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb +++ b/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb @@ -248,7 +248,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_id = registry.log_model(model_name=model_name, model_version=model_version, model=clf, tags={\n", + "model = registry.log_model(model_name=model_name, model_version=model_version, model=clf, tags={\n", " \"stage\": \"testing\", \"classifier_type\": \"svm.SVC\", \"svc_gamma\": svc_gamma, \"svc_C\": svc_C}, sample_input_data=train_features, options={\"embed_local_ml_library\": True})" ] }, @@ -708,7 +708,7 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_model_history().select(\"EVENT_TIMESTAMP\", \"ROLE\", \"ATTRIBUTE_NAME\",\"OPERATION\", \"VALUE[ATTRIBUTE_NAME]\").sort(\"EVENT_TIMESTAMP\", ascending=False).show()" + "registry.get_model_history(model_name=model_name, model_version=model_version).select(\"EVENT_TIMESTAMP\", \"ROLE\", \"ATTRIBUTE_NAME\",\"OPERATION\", \"VALUE[ATTRIBUTE_NAME]\").sort(\"EVENT_TIMESTAMP\", ascending=False).show()" ] }, { diff --git a/snowflake/ml/registry/registry_utils.py b/snowflake/ml/registry/registry_utils.py new file mode 100644 index 00000000..43e4f7f5 --- /dev/null +++ b/snowflake/ml/registry/registry_utils.py @@ -0,0 +1,92 @@ +from typing import Any, Dict, List, Tuple + +from snowflake import snowpark +from snowflake.ml._internal.utils import formatting, query_result_checker + + +def _get_fully_qualified_schema_name(database_name: str, schema_name: str) -> str: + return f"{database_name}.{schema_name}" + + +def _get_fully_qualified_table_name(database_name: str, schema_name: str, table_name: str) -> str: + return f"{_get_fully_qualified_schema_name(database_name, schema_name)}.{table_name}" + + +def _create_single_registry_table( + session: snowpark.Session, + database_name: str, + schema_name: str, + table_name: str, + table_schema: List[Tuple[str, str]], + statement_params: Dict[str, Any], +) -> str: + """Creates a single table for registry and returns the fully qualified name of the table. + + Args: + session: Session object to communicate with Snowflake. + database_name: Desired name of the model registry database. 
+        schema_name: Desired name of the schema used by this model registry inside the database.
+        table_name: Name of the target table.
+        table_schema: A list of pairs of strings; each pair denotes `(<column name>, <column type>)`.
+        statement_params: Function usage statement parameters used in SQL query executions.
+
+    Returns:
+        A string which is the fully qualified name of the created table.
+
+    Raises:
+        RuntimeError: If table creation failed.
+    """
+    fully_qualified_table_name = _get_fully_qualified_table_name(database_name, schema_name, table_name)
+    table_schema_string = ", ".join([f"{k} {v}" for k, v in table_schema])
+    try:
+        session.sql(f"CREATE TABLE IF NOT EXISTS {fully_qualified_table_name} ({table_schema_string})").collect(
+            statement_params=statement_params
+        )
+    except Exception as e:
+        raise RuntimeError(f"Registry table {fully_qualified_table_name} creation failed due to {e}")
+
+    return fully_qualified_table_name
+
+
+def _insert_table_entry(session: snowpark.Session, table: str, columns: Dict[str, Any]) -> List[snowpark.Row]:
+    """Insert an entry into an internal Model Registry table.
+
+    Args:
+        session: Snowpark session object to communicate with Snowflake.
+        table: Fully qualified name of the table to insert into.
+        columns: Key-value pairs of columns and values to be inserted into the table.
+
+    Returns:
+        Result of the operation as returned by the Snowpark session (a list of snowpark.Row).
+
+    Raises:
+        RuntimeError: If entry insertion failed.
+    """
+    sorted_columns = sorted(columns.items())
+    try:
+        sql = "INSERT INTO {table} ( {columns} ) SELECT {values}".format(
+            table=table,
+            columns=",".join([x[0] for x in sorted_columns]),
+            values=",".join([formatting.format_value_for_select(x[1]) for x in sorted_columns]),
+        )
+        return query_result_checker.SqlResultValidator(session, sql).insertion_success(expected_num_rows=1).validate()
+    except Exception as e:
+        raise RuntimeError(f"Table {table} entry {columns} insertion failed due to {e}")
+
+
+def _validate_table_exist(session: snowpark.Session, table: str, qualified_schema_name: str) -> bool:
+    """Check if the given table exists in the target schema.
+
+    Note:
+        In case the table doesn't exist, a DataError will be raised by SqlResultValidator.
+
+    Args:
+        session: Snowpark session object to communicate with Snowflake.
+        table: Name of the target table as an identifier.
+        qualified_schema_name: Fully qualified schema name where the target table is expected to exist.
+
+    Returns:
+        A boolean indicating whether the target table already exists.
+    """
+    tables = session.sql(f"SHOW TABLES LIKE '{table}' IN {qualified_schema_name}").collect()
+    return len(tables) == 1
diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl
index 6069e6a9..86f96cc7 100755
--- a/snowflake/ml/requirements.bzl
+++ b/snowflake/ml/requirements.bzl
@@ -1,6 +1,6 @@
 # DO NOT EDIT!
 # Generated by running 'bazel run --config=pre_build //bazel/requirements:sync_requirements'
-EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'mlflow': ['mlflow>=2.1.0,<3'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'all': ['lightgbm==3.3.5', 'mlflow>=2.1.0,<3', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1']}
+EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'mlflow': ['mlflow>=2.1.0,<2.4'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'transformers': ['transformers>=4.29.2,<5'], 'all': ['lightgbm==3.3.5', 'mlflow>=2.1.0,<2.4', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1', 'transformers>=4.29.2,<5']}

-REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<1.3', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2']
+REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<1.4', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2']
diff --git a/snowflake/ml/training_dataset/training_dataset.py b/snowflake/ml/training_dataset/training_dataset.py
index 09c584b4..ab7cce81 100644
--- a/snowflake/ml/training_dataset/training_dataset.py
+++ b/snowflake/ml/training_dataset/training_dataset.py
@@ -1,7 +1,18 @@
+import json
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

-from snowflake.snowpark import DataFrame
+from snowflake.snowpark import DataFrame, Session
+
+
+def _get_val_or_null(val: Any) -> Any:
+    return val if val is not None else "null"
+
+
+def _wrap_embedded_str(s: str) -> str:
+    s = s.replace("\\", "\\\\")
+    s = s.replace('"', '\\"')
+    return s

 @dataclass(frozen=True)
@@ -20,6 +31,33 @@ class FeatureStoreMetadata:
     connection_params: Dict[str, str]
     features: List[str]

+    def to_json(self) -> str:
+        return json.dumps(self.to_dict())
+
+    @classmethod
+    def from_json(cls, json_str: str) -> "FeatureStoreMetadata":
+        json_dict = json.loads(json_str)
+        return FeatureStoreMetadata.from_dict(json_dict)
+
+    def to_dict(self) -> Dict[str, str]:
+        return {
+            # TODO(zhe): Additional wrapping is needed because ml_.artifact.ad_artifact takes a dict
+            # but we retrieve it as an object. Snowpark serialization is inconsistent with
+            # our deserialization. A fix is to let the artifact table store strings and have
+            # callers handle both serialization and deserialization.
+            "spine_query": _wrap_embedded_str(self.spine_query),
+            "connection_params": _wrap_embedded_str(json.dumps(self.connection_params)),
+            "features": _wrap_embedded_str(json.dumps(self.features)),
+        }
+
+    @classmethod
+    def from_dict(cls, json_dict: Dict[str, str]) -> "FeatureStoreMetadata":
+        return cls(
+            spine_query=json_dict["spine_query"],
+            connection_params=json.loads(json_dict["connection_params"]),
+            features=json.loads(json_dict["features"]),
+        )
+

 @dataclass(frozen=True)
 class TrainingDataset:
@@ -29,6 +67,7 @@ class TrainingDataset:

     Properties:
         df: A dataframe object representing the training dataset generation.
materialized_table: The destination table name into which training data will be written.
+        snapshot_table: Name of the snapshot table created on the materialized table.
         timestamp_col: Name of timestamp column in spine_df that will be used to join time-series features.
             If spine_timestamp_col is not None, the input features also must have timestamp_col.
         label_cols: Name of column(s) in materialized_table that contain training labels.
@@ -38,7 +77,62 @@
     df: DataFrame
     materialized_table: Optional[str]
+    snapshot_table: Optional[str]
     timestamp_col: Optional[str]
     label_cols: Optional[List[str]]
     feature_store_metadata: Optional[FeatureStoreMetadata]
     desc: str
+
+    def to_dict(self) -> Dict[str, str]:
+        if len(self.df.queries["queries"]) != 1:
+            raise ValueError(
+                f"""df dataframe must contain only 1 query.
+Got {len(self.df.queries['queries'])}: {self.df.queries['queries']}
+"""
+            )
+
+        return {
+            "df_query": self.df.queries["queries"][0],
+            "materialized_table": _get_val_or_null(self.materialized_table),
+            "snapshot_table": _get_val_or_null(self.snapshot_table),
+            "timestamp_col": _get_val_or_null(self.timestamp_col),
+            "label_cols": _get_val_or_null(self.label_cols),
+            "feature_store_metadata": self.feature_store_metadata.to_json()
+            if self.feature_store_metadata is not None
+            else "null",
+            "desc": self.desc,
+        }
+
+    @classmethod
+    def from_dict(cls, json_dict: Dict[str, Any], session: Session) -> "TrainingDataset":
+        json_dict["df"] = session.sql(json_dict["df_query"])
+        json_dict.pop("df_query")
+
+        fs_meta_json = json_dict["feature_store_metadata"]
+        json_dict["feature_store_metadata"] = FeatureStoreMetadata.from_json(fs_meta_json)
+        return cls(**json_dict)
+
+    def to_json(self) -> str:
+        d = self.to_dict()
+        return json.dumps(d)
+
+    @classmethod
+    def from_json(cls, json_str: str, session: Session) -> "TrainingDataset":
+        json_dict = json.loads(json_str)
+        return cls.from_dict(json_dict, session)
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, TrainingDataset) and self.to_json() == other.to_json()
+
+    def id(self) -> str:
+        """Return a unique identifier of this training dataset.
+
+        Raises:
+            ValueError: when snapshot_table is None.
+
+        Returns:
+            A unique identifier string.
+        """
+        if self.snapshot_table is None:
+            raise ValueError("snapshot_table is required to generate id.")
+        return self.snapshot_table
diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl
index bcb01834..a3b92e20 100644
--- a/snowflake/ml/version.bzl
+++ b/snowflake/ml/version.bzl
@@ -1,2 +1,2 @@
 # This is parsed by regex in conda recipe meta file. Make sure not to break it.
-VERSION = "1.0.5" +VERSION = "1.0.6" diff --git a/tests/integ/snowflake/ml/_internal/BUILD.bazel b/tests/integ/snowflake/ml/_internal/BUILD.bazel index 16bac168..9bb5ebad 100644 --- a/tests/integ/snowflake/ml/_internal/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/BUILD.bazel @@ -10,3 +10,25 @@ py_test( "//snowflake/ml/utils:connection_params" ], ) + +py_test( + name = "grid_search_integ_test", + srcs = ["grid_search_integ_test.py"], + timeout = "long", + deps = [ + "//snowflake/ml/modeling/model_selection/_internal:_grid_search_cv", + "//snowflake/ml/modeling/svm:svr", + "//snowflake/ml/utils:connection_params" + ], +) + +py_test( + name = "randomized_search_integ_test", + srcs = ["randomized_search_integ_test.py"], + timeout = "long", + deps = [ + "//snowflake/ml/modeling/model_selection/_internal:_randomized_search_cv", + "//snowflake/ml/modeling/ensemble:random_forest_classifier", + "//snowflake/ml/utils:connection_params" + ], +) diff --git a/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py b/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py new file mode 100644 index 00000000..be6fd12f --- /dev/null +++ b/tests/integ/snowflake/ml/_internal/grid_search_integ_test.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import inflection +import numpy as np +import pytest +from absl.testing.absltest import TestCase, main +from sklearn.datasets import load_diabetes +from sklearn.model_selection import GridSearchCV as SkGridSearchCV +from sklearn.svm import SVR as SkSVR + +from snowflake.ml.modeling.model_selection._internal import GridSearchCV +from snowflake.ml.modeling.svm import SVR +from snowflake.ml.utils.connection_params import SnowflakeLoginOptions +from snowflake.snowpark import Session + + +@pytest.mark.pip_incompatible +class GridSearchCVTest(TestCase): + def setUp(self): + """Creates Snowpark and Snowflake environments for testing.""" + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + + def tearDown(self): + self._session.close() + + def test_fit_and_compare_results(self) -> None: + input_df_pandas = load_diabetes(as_frame=True).frame + input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] + input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] + label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")] + input_df_pandas["INDEX"] = input_df_pandas.reset_index().index + input_df = self._session.create_dataframe(input_df_pandas) + + sklearn_reg = SkGridSearchCV(estimator=SkSVR(), param_grid={"C": [1, 10], "kernel": ("linear", "rbf")}) + reg = GridSearchCV(estimator=SVR(), param_grid={"C": [1, 10], "kernel": ("linear", "rbf")}) + reg.set_input_cols(input_cols) + output_cols = ["OUTPUT_" + c for c in label_col] + reg.set_output_cols(output_cols) + reg.set_label_cols(label_col) + + reg.fit(input_df) + sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze()) + + actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].astype("float64").to_numpy() + sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) + + assert reg._sklearn_object.best_params_ == sklearn_reg.best_params_ + + np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py 
b/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py
new file mode 100644
index 00000000..ed67f3e8
--- /dev/null
+++ b/tests/integ/snowflake/ml/_internal/randomized_search_integ_test.py
@@ -0,0 +1,71 @@
+#
+# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved.
+#
+
+import inflection
+import pytest
+from absl.testing.absltest import TestCase, main
+from scipy.stats import randint
+from sklearn.datasets import load_iris
+from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier
+from sklearn.model_selection import RandomizedSearchCV as SkRandomizedSearchCV
+
+from snowflake.ml.modeling.ensemble import RandomForestClassifier
+from snowflake.ml.modeling.model_selection._internal import RandomizedSearchCV
+from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
+from snowflake.snowpark import Session
+
+
+@pytest.mark.pip_incompatible
+class RandomizedSearchCVTest(TestCase):
+    def setUp(self):
+        """Creates Snowpark and Snowflake environments for testing."""
+        self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
+
+    def tearDown(self):
+        self._session.close()
+
+    def test_fit_and_compare_results(self) -> None:
+        input_df_pandas = load_iris(as_frame=True).frame
+        input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns]
+        input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")]
+        label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")]
+        input_df_pandas["INDEX"] = input_df_pandas.reset_index().index
+        input_df = self._session.create_dataframe(input_df_pandas)
+        param_distribution = {
+            "n_estimators": randint(50, 200),
+            "max_depth": randint(3, 8),
+        }
+
+        sklearn_reg = SkRandomizedSearchCV(
+            estimator=SkRandomForestClassifier(random_state=0),
+            param_distributions=param_distribution,
+            random_state=0,
+        )
+
+        reg = RandomizedSearchCV(
+            estimator=RandomForestClassifier(random_state=0),
+            param_distributions=param_distribution,
+            random_state=0,
+        )
+        reg.set_input_cols(input_cols)
+        output_cols = ["OUTPUT_" + c for c in label_col]
+        reg.set_output_cols(output_cols)
+        reg.set_label_cols(label_col)
+
+        reg.fit(input_df)
+        sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze())
+
+        # TODO: randomized search cv results are not always the same.
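+        # (unverified hypothesis: the distributed fit may consume draws from the scipy
+        # distributions in a different order than the local sklearn fit, so a shared
+        # random_state need not yield identical parameter candidates)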
+ # check with implementation + # actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX") + # [output_cols].astype("float64").to_numpy() + # sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) + # assert reg._sklearn_object.best_score_ == sklearn_reg.best_score_ + # assert reg._sklearn_object.best_params_ == sklearn_reg.best_params_ + + # np.testing.assert_allclose(actual_arr.flatten(), sklearn_numpy_arr.flatten(), rtol=1.0e-1, atol=1.0e-2) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel index 79ead891..cd798e85 100644 --- a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel +++ b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel @@ -91,3 +91,12 @@ py_test( "//snowflake/ml/utils:connection_params", ] ) + +py_test( + name="test_non_numeric_target", + srcs = ["test_non_numeric_target.py"], + deps = [ + "//snowflake/ml/modeling/ensemble:random_forest_classifier", + "//snowflake/ml/utils:connection_params", + ] +) diff --git a/tests/integ/snowflake/ml/extra_tests/test_non_numeric_target.py b/tests/integ/snowflake/ml/extra_tests/test_non_numeric_target.py new file mode 100644 index 00000000..cc1732d0 --- /dev/null +++ b/tests/integ/snowflake/ml/extra_tests/test_non_numeric_target.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import inflection +import numpy as np +import pytest +from absl.testing.absltest import TestCase, main +from sklearn.datasets import load_iris +from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier + +from snowflake.ml.modeling.ensemble import RandomForestClassifier +from snowflake.ml.utils.connection_params import SnowflakeLoginOptions +from snowflake.snowpark import Session + + +@pytest.mark.pip_incompatible +class NonNumericTargetTest(TestCase): + def setUp(self): + """Creates Snowpark and Snowflake environments for testing.""" + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + + def tearDown(self): + self._session.close() + + def test_fit_and_compare_results(self) -> None: + data = load_iris(as_frame=True) + input_df_pandas = data.frame + + input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] + + # Coerce target column to string. 
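+        # (load_iris exposes data.target_names == ["setosa", "versicolor", "virginica"];
+        # mapping the integer classes through it gives fit() a genuinely non-numeric label column)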
+ input_df_pandas["TARGET"] = input_df_pandas["TARGET"].apply(lambda x: data.target_names[x]) + input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] + label_col = [c for c in input_df_pandas.columns if c.startswith("TARGET")] + input_df_pandas["INDEX"] = input_df_pandas.reset_index().index + input_df = self._session.create_dataframe(input_df_pandas) + + sklearn_reg = SkRandomForestClassifier(random_state=0) + + reg = RandomForestClassifier(random_state=0) + reg.set_input_cols(input_cols) + output_cols = ["OUTPUT_" + c for c in label_col] + reg.set_output_cols(output_cols) + reg.set_label_cols(label_col) + + reg.fit(input_df) + sklearn_reg.fit(X=input_df_pandas[input_cols], y=input_df_pandas[label_col].squeeze()) + + actual_arr = reg.predict(input_df).to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() + sklearn_numpy_arr = sklearn_reg.predict(input_df_pandas[input_cols]) + + np.testing.assert_equal(actual_arr.flatten(), sklearn_numpy_arr.flatten()) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index 4d69f9b6..f0368db3 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -11,6 +11,7 @@ py_library( "//snowflake/ml/model:type_hints", "//tests/integ/snowflake/ml/test_utils:db_manager", "//tests/integ/snowflake/ml/test_utils:test_env_utils", + "//snowflake/ml/model/_signatures:snowpark_handler", ], ) @@ -113,6 +114,7 @@ py_test( shard_count = 4, deps = [ ":warehouse_model_integ_test_utils", + "//snowflake/ml/_internal:env", "//snowflake/ml/model:type_hints", "//snowflake/ml/model/_signatures:numpy_handler", "//snowflake/ml/utils:connection_params", @@ -135,3 +137,16 @@ py_test( "//tests/integ/snowflake/ml/test_utils:test_env_utils", ], ) + +py_test( + name = "warehouse_huggingface_pipeline_model_integ_test", + timeout = "long", + srcs = ["warehouse_huggingface_pipeline_model_integ_test.py"], + shard_count = 8, + deps = [ + ":warehouse_model_integ_test_utils", + "//snowflake/ml/model:type_hints", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/test_utils:db_manager", + ], +) diff --git a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py index effe2bdb..52138e6a 100644 --- a/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py +++ b/tests/integ/snowflake/ml/model/deployment_to_snowservice_integ_test.py @@ -3,12 +3,11 @@ # # TODO[shchen], SNOW-889081, re-enable once server-side image build is supported. -# # -# # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
-# # + # import uuid # from unittest import SkipTest -# +# from typing import Tuple + # import pandas as pd # import pytest # import sklearn.base @@ -16,9 +15,10 @@ from absl.testing import absltest # from sklearn import neighbors -# + # from snowflake.ml.model import ( # _model as model_api, +# _model_meta, # custom_model, # type_hints as model_types, # ) @@ -27,18 +27,18 @@ # from snowflake.ml.utils import connection_params # from snowflake.snowpark import Session # from tests.integ.snowflake.ml.test_utils import db_manager -# + # _IRIS = datasets.load_iris(as_frame=True) # _IRIS_X = _IRIS.data # _IRIS_Y = _IRIS.target -# -# + + # def _get_sklearn_model() -> "sklearn.base.BaseEstimator": # knn_model = neighbors.KNeighborsClassifier() # knn_model.fit(_IRIS_X, _IRIS_Y) # return knn_model -# -# + + # @pytest.mark.pip_incompatible # class DeploymentToSnowServiceIntegTest(absltest.TestCase): # _RUN_ID = uuid.uuid4().hex[:2] @@ -50,14 +50,14 @@ # TEST_ROLE = "SYSADMIN" # TEST_COMPUTE_POOL = "MODEL_DEPLOYMENT_INTEG_TEST_POOL_STANDARD_2" # PRE-CREATED # CONNECTION_NAME = "snowservice" # PRE-CREATED AND STORED IN KEY VAULT -# + # @classmethod # def setUpClass(cls) -> None: # try: # login_options = connection_params.SnowflakeLoginOptions(connection_name=cls.CONNECTION_NAME) # except KeyError: # raise SkipTest("SnowService connection parameters not present: skipping SnowServicesIntegTest.") -# + # cls._session = Session.builder.configs( # { # **login_options, @@ -69,22 +69,24 @@ # cls._db_manager.create_stage(cls.TEST_STAGE, cls.TEST_SCHEMA, cls.TEST_DB, sse_encrypted=True) # cls._db_manager.create_image_repo(cls.TEST_IMAGE_REPO) # cls._db_manager.cleanup_databases(expire_hours=6) -# + # @classmethod # def tearDownClass(cls) -> None: # cls._db_manager.drop_image_repo(cls.TEST_IMAGE_REPO) # # Dropping the db/schema will implicitly terminate the service function and snowservice as well. # cls._db_manager.drop_database(cls.TEST_DB) # cls._session.close() -# + # def setUp(self) -> None: # # Set up a unique id for each artifact, in addition to the class-level prefix. This is particularly useful -# when differentiating artifacts generated between different test cases, such as service function names. +# # when differentiating artifacts generated between different test cases, such as service function names. 
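#         # (uuid4().hex[:4] gives a 4-character hex suffix; combined with the class-level
#         # _RUN_ID prefix it keeps per-test artifacts such as service function names unique)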
# self.uid = uuid.uuid4().hex[:4] -# -# def _save_model_to_stage(self, model: custom_model.CustomModel, sample_input: pd.DataFrame) -> str: + +# def _save_model_to_stage( +# self, model: custom_model.CustomModel, sample_input: pd.DataFrame +# ) -> Tuple[str, _model_meta.ModelMetadata]: # stage_path = f"@{self.TEST_STAGE}/{self.uid}/model.zip" -# model_api.save_model( # type: ignore[call-overload] +# meta = model_api.save_model( # type: ignore[call-overload] # name="model", # session=self._session, # model_stage_file_path=stage_path, @@ -92,10 +94,10 @@ # sample_input=sample_input, # options={"embed_local_ml_library": True}, # ) -# return stage_path -# +# return stage_path, meta + # def test_deployment_workflow(self) -> None: -# model_stage_file_path = self._save_model_to_stage(model=_get_sklearn_model(), sample_input=_IRIS_X) +# model_stage_file_path, meta = self._save_model_to_stage(model=_get_sklearn_model(), sample_input=_IRIS_X) # service_func_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( # self._RUN_ID, f"func_{self.uid}" # ) @@ -109,13 +111,14 @@ # snowservice_api._deploy( # self._session, # model_id=uuid.uuid4().hex, +# model_meta=meta, # service_func_name=service_func_name, # model_zip_stage_path=model_stage_file_path, # deployment_stage_path=model_stage_file_path, # use the same stage for testing # target_method="predict", # **deployment_options, # ) -# -# + + if __name__ == "__main__": absltest.main() diff --git a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py index a2fa1a5c..bcac645c 100644 --- a/tests/integ/snowflake/ml/model/model_badcase_integ_test.py +++ b/tests/integ/snowflake/ml/model/model_badcase_integ_test.py @@ -2,8 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# -import os -import tempfile +import posixpath import uuid import numpy as np @@ -65,121 +64,122 @@ def tearDownClass(self) -> None: self._session.close() def test_bad_model_deploy(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - lm = DemoModel(custom_model.ModelContext()) - arr = np.array([[1, 2, 3], [4, 2, 5]]) - pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - model_api.save_model( - name="custom_bad_model", - model_dir_path=os.path.join(tmpdir, "custom_bad_model"), - model=lm, - sample_input=pd_df, - metadata={"author": "halu", "version": "1"}, - conda_dependencies=["invalidnumpy==1.22.4"], - options=model_types.CustomModelSaveOption({"embed_local_ml_library": True}), - ) - function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( - self.run_id, "custom_bad_model" + lm = DemoModel(custom_model.ModelContext()) + arr = np.array([[1, 2, 3], [4, 2, 5]]) + pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + tmp_stage = self._session.get_session_stage() + model_api.save_model( + name="custom_bad_model", + session=self._session, + model_stage_file_path=posixpath.join(tmp_stage, "custom_bad_model.zip"), + model=lm, + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + conda_dependencies=["invalidnumpy==1.22.4"], + options=model_types.CustomModelSaveOption({"embed_local_ml_library": True}), + ) + function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "custom_bad_model") + with self.assertRaises(RuntimeError): + _ = _deployer.deploy( + session=self._session, + name=function_name, + model_stage_file_path=posixpath.join(tmp_stage, "custom_bad_model.zip"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions({"relax_version": False}), ) - with self.assertRaises(RuntimeError): - _ = _deployer.deploy( - session=self._session, - name=function_name, - model_dir_path=os.path.join(tmpdir, "custom_bad_model"), - platform=deploy_platforms.TargetPlatform.WAREHOUSE, - target_method="predict", - options=model_types.WarehouseDeployOptions({"relax_version": False}), - ) def test_custom_demo_model(self) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - lm = DemoModel(custom_model.ModelContext()) - arr = np.random.randint(100, size=(10000, 3)) - pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - model_api.save_model( - name="custom_demo_model", - model_dir_path=os.path.join(tmpdir, "custom_demo_model"), - model=lm, - conda_dependencies=[ - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") - ], - sample_input=pd_df, - metadata={"author": "halu", "version": "1"}, - options=model_types.CustomModelSaveOption({"embed_local_ml_library": True}), - ) - function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( - self.run_id, "custom_demo_model" - ) - with self.assertRaises(RuntimeError): - deploy_info = _deployer.deploy( - session=self._session, - name=function_name, - model_dir_path=os.path.join(tmpdir, "custom_demo_model"), - platform=deploy_platforms.TargetPlatform.WAREHOUSE, - target_method="predict", - options=model_types.WarehouseDeployOptions( - { - "relax_version": test_env_utils.is_in_pip_env(), - "permanent_udf_stage_location": f"{self.full_qual_stage}/", - # Test stage location validation - } - ), - ) + tmp_stage = self._session.get_session_stage() + lm = DemoModel(custom_model.ModelContext()) + arr = np.random.randint(100, size=(10000, 3)) + pd_df = 
pd.DataFrame(arr, columns=["c1", "c2", "c3"]) + + model_metadata = model_api.save_model( + name="custom_demo_model", + session=self._session, + model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + model=lm, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], + sample_input=pd_df, + metadata={"author": "halu", "version": "1"}, + ) + + self.assertTrue(hasattr(model_metadata, "local_ml_library_version")) + function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name(self.run_id, "custom_demo_model") + with self.assertRaises(RuntimeError): deploy_info = _deployer.deploy( session=self._session, name=function_name, - model_dir_path=os.path.join(tmpdir, "custom_demo_model", ""), # Test sanitizing user path input. + model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( { "relax_version": test_env_utils.is_in_pip_env(), - "permanent_udf_stage_location": f"@{self.full_qual_stage}/", + "permanent_udf_stage_location": f"{self.full_qual_stage}/", + # Test stage location validation } ), ) - assert deploy_info is not None - res = _deployer.predict(session=self._session, deployment=deploy_info, X=pd_df) - pd.testing.assert_frame_equal( - res, - pd.DataFrame(arr[:, 0], columns=["output"]), - ) + deploy_info = _deployer.deploy( + session=self._session, + name=function_name, + model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + { + "relax_version": test_env_utils.is_in_pip_env(), + "permanent_udf_stage_location": f"@{self.full_qual_stage}/", + } + ), + ) + assert deploy_info is not None + res = _deployer.predict(session=self._session, deployment=deploy_info, X=pd_df) - with self.assertRaises(RuntimeError): - deploy_info = _deployer.deploy( - session=self._session, - name=function_name, - model_dir_path=os.path.join(tmpdir, "custom_demo_model"), - platform=deploy_platforms.TargetPlatform.WAREHOUSE, - target_method="predict", - options=model_types.WarehouseDeployOptions( - { - "relax_version": test_env_utils.is_in_pip_env(), - "permanent_udf_stage_location": f"@{self.full_qual_stage}/", - } - ), - ) - - self._db_manager.drop_function(function_name=function_name, args=["OBJECT"]) + pd.testing.assert_frame_equal( + res, + pd.DataFrame(arr[:, 0], columns=["output"]), + ) + with self.assertRaises(RuntimeError): deploy_info = _deployer.deploy( session=self._session, name=function_name, - model_dir_path=os.path.join(tmpdir, "custom_demo_model"), + model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), platform=deploy_platforms.TargetPlatform.WAREHOUSE, target_method="predict", options=model_types.WarehouseDeployOptions( { "relax_version": test_env_utils.is_in_pip_env(), "permanent_udf_stage_location": f"@{self.full_qual_stage}/", - "replace_udf": True, } ), ) - self._db_manager.drop_function(function_name=function_name, args=["OBJECT"]) + self._db_manager.drop_function(function_name=function_name, args=["OBJECT"]) + + deploy_info = _deployer.deploy( + session=self._session, + name=function_name, + model_stage_file_path=posixpath.join(tmp_stage, "custom_demo_model.zip"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions( + 
{ + "relax_version": test_env_utils.is_in_pip_env(), + "permanent_udf_stage_location": f"@{self.full_qual_stage}/", + "replace_udf": True, + } + ), + ) + + self._db_manager.drop_function(function_name=function_name, args=["OBJECT"]) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py index 0cccfd65..eb097ede 100644 --- a/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_custom_model_integ_test.py @@ -3,7 +3,6 @@ # import asyncio -import json import os import tempfile import uuid @@ -109,7 +108,6 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -122,20 +120,13 @@ def base_test_case( sample_input=sample_input, test_input=test_input, deploy_params=deploy_params, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_async_model_composition( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -156,7 +147,7 @@ async def _test(self: "TestWarehouseCustomModelInteg") -> None: sample_input=pd_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -164,56 +155,40 @@ async def _test(self: "TestWarehouseCustomModelInteg") -> None: ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) asyncio.get_event_loop().run_until_complete(_test(self)) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) arr = [[1, 2, 3], [4, 2, 5]] sp_df = self._session.create_dataframe(arr, schema=['"c1"', '"c2"', '"c3"']) + y_df_expected = pd.DataFrame([[1, 2, 3, 1], [4, 2, 5, 4]], columns=["c1", "c2", "c3", "output"]) self.base_test_case( name="custom_demo_model_sp0", model=lm, sample_input=sp_df, test_input=sp_df, deploy_params={ - "predict": ( + "": ( {}, 
- lambda res: pd.testing.assert_frame_equal( - res.to_pandas(), - pd.DataFrame([1, 4], columns=["output"], dtype=np.int8), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected, check_dtype=False), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_sp_quote( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -227,7 +202,7 @@ def test_custom_demo_model_sp_quote( sample_input=sp_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -235,20 +210,13 @@ def test_custom_demo_model_sp_quote( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_sp_mix_1( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -256,34 +224,25 @@ def test_custom_demo_model_sp_mix_1( arr = [[1, 2, 3], [4, 2, 5]] pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) sp_df = self._session.create_dataframe(arr, schema=['"c1"', '"c2"', '"c3"']) + y_df_expected = pd.concat([pd_df, pd_df[["c1"]].rename(columns={"c1": "output"})], axis=1) self.base_test_case( name="custom_demo_model_sp1", model=lm, sample_input=pd_df, test_input=sp_df, deploy_params={ - "predict": ( + "": ( {}, - lambda res: pd.testing.assert_frame_equal( - res.to_pandas(), - pd.DataFrame([1, 4], columns=["output"], dtype=np.int8), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected, check_dtype=False), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_sp_mix_2( self, - model_in_stage: Optional[bool] = False, 
permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -297,7 +256,7 @@ def test_custom_demo_model_sp_mix_2( sample_input=sp_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -305,20 +264,13 @@ def test_custom_demo_model_sp_mix_2( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_array( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -331,7 +283,7 @@ def test_custom_demo_model_array( sample_input=pd_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -339,20 +291,13 @@ def test_custom_demo_model_array( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_str( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -364,7 +309,7 @@ def test_custom_demo_model_str( sample_input=pd_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -372,20 +317,13 @@ def test_custom_demo_model_str( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_array_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -393,68 +331,50 @@ def test_custom_demo_model_array_sp( arr = np.array([[1, 2, 3], [4, 2, 5]]) pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) sp_df = self._session.create_dataframe(pd_df) + y_df_expected = pd.concat([pd_df, 
pd.DataFrame(data={"output": [[1, 2, 3], [4, 2, 5]]})], axis=1) self.base_test_case( name="custom_demo_model_array_sp", model=lm, sample_input=sp_df, test_input=sp_df, deploy_params={ - "predict": ( + "": ( {}, - lambda res: pd.testing.assert_frame_equal( - res.to_pandas().applymap(json.loads), - pd.DataFrame(data={"output": [[1, 2, 3], [4, 2, 5]]}), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected, check_dtype=False), ) }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_str_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: lm = DemoModel(custom_model.ModelContext()) pd_df = pd.DataFrame([["Yogiri", "Civia", "Echo"], ["Artia", "Doris", "Rosalyn"]], columns=["c1", "c2", "c3"]) sp_df = self._session.create_dataframe(pd_df) + y_df_expected = pd.concat([pd_df, pd.DataFrame(data={"output": ["Yogiri", "Artia"]})], axis=1) self.base_test_case( name="custom_demo_model_str_sp", model=lm, sample_input=sp_df, test_input=sp_df, deploy_params={ - "predict": ( + "": ( {}, - lambda res: pd.testing.assert_frame_equal( - res.to_pandas(), - pd.DataFrame(data={"output": ["Yogiri", "Artia"]}), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected), ) }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_demo_model_array_str( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -466,7 +386,7 @@ def test_custom_demo_model_array_str( sample_input=pd_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -474,91 +394,13 @@ def test_custom_demo_model_array_str( ), ) }, - model_in_stage=model_in_stage, - permanent_deploy=permanent_deploy, - test_released_version=test_released_version, - ) - - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) - def 
test_custom_demo_model_with_input_no_keep_order( - self, - model_in_stage: Optional[bool] = False, - permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, - ) -> None: - lm = DemoModel(custom_model.ModelContext()) - arr = np.random.randint(100, size=(10000, 3)) - pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - self.base_test_case( - name="custom_demo_model_with_input_no_keep_order", - model=lm, - sample_input=pd_df, - test_input=pd_df, - deploy_params={ - "predict": ( - {"output_with_input_features": True, "keep_order": False}, - lambda res: pd.testing.assert_series_equal( - res["output"], res["c1"], check_dtype=False, check_names=False - ), - ) - }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) - def test_custom_demo_model_with_input( - self, - model_in_stage: Optional[bool] = False, - permanent_deploy: Optional[bool] = False, - test_released_version: Optional[str] = None, - ) -> None: - lm = DemoModel(custom_model.ModelContext()) - arr = np.random.randint(100, size=(10000, 3)) - pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) - - def check_res(res: pd.DataFrame) -> Any: - pd.testing.assert_series_equal(res["output"], res["c1"], check_dtype=False, check_names=False) - pd.testing.assert_frame_equal( - res, - pd.DataFrame( - np.concatenate([arr, np.expand_dims(arr[:, 0], axis=1)], axis=1), - columns=["c1", "c2", "c3", "output"], - ), - check_dtype=False, - ) - - self.base_test_case( - name="custom_demo_model_with_input", - model=lm, - sample_input=pd_df, - test_input=pd_df, - deploy_params={"predict": ({"output_with_input_features": True}, check_res)}, - model_in_stage=model_in_stage, - permanent_deploy=permanent_deploy, - test_released_version=test_released_version, - ) - - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_model_with_artifacts( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -576,7 +418,7 @@ def test_custom_model_with_artifacts( sample_input=pd_df, test_input=pd_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: pd.testing.assert_frame_equal( res, @@ -584,20 +426,13 @@ def test_custom_model_with_artifacts( ), ) }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, 
"permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.3"]) # type: ignore[misc] def test_custom_model_bool_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -610,21 +445,20 @@ def test_custom_model_bool_sp( arr = np.array([[1, 2, 3], [4, 2, 5]]) pd_df = pd.DataFrame(arr, columns=["c1", "c2", "c3"]) sp_df = self._session.create_dataframe(pd_df) + y_df_expected = pd.concat([pd_df, pd.DataFrame([False, True], columns=["output"])], axis=1) self.base_test_case( name="custom_model_bool_sp", model=lm, sample_input=sp_df, test_input=sp_df, deploy_params={ - "predict": ( + "": ( {}, - lambda res: pd.testing.assert_frame_equal( - res.to_pandas(), - pd.DataFrame([False, True], columns=["output"]), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res( + res, y_df_expected, check_dtype=False ), ) }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py new file mode 100644 index 00000000..98c73d4f --- /dev/null +++ b/tests/integ/snowflake/ml/model/warehouse_huggingface_pipeline_model_integ_test.py @@ -0,0 +1,642 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# + +import json +import os +import tempfile +import uuid +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import numpy as np +import pandas as pd +from absl.testing import absltest, parameterized + +from snowflake.ml.model import type_hints as model_types +from snowflake.ml.utils import connection_params +from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session +from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils +from tests.integ.snowflake.ml.test_utils import db_manager + + +class TestWarehouseHuggingFacehModelInteg(parameterized.TestCase): + @classmethod + def setUpClass(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + self._db_manager = db_manager.DBManager(self._session) + self._db_manager.cleanup_schemas() + self._db_manager.cleanup_stages() + self._db_manager.cleanup_user_functions() + + self.cache_dir = tempfile.TemporaryDirectory() + self._original_cache_dir = os.getenv("TRANSFORMERS_CACHE", None) + os.environ["TRANSFORMERS_CACHE"] = self.cache_dir.name + + # To create different UDF names among different runs + self.run_id = uuid.uuid4().hex + self._test_schema_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self.run_id, "model_deployment_huggingface_model_test_schema" + ) + self._db_manager.create_schema(self._test_schema_name) + self._db_manager.use_schema(self._test_schema_name) + + self.deploy_stage_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + self.run_id, "deployment_stage" + ) + self.full_qual_stage = self._db_manager.create_stage( + self.deploy_stage_name, schema_name=self._test_schema_name, sse_encrypted=False + ) + + @classmethod + def tearDownClass(self) -> None: + self._db_manager.drop_stage(self.deploy_stage_name, 
schema_name=self._test_schema_name) + self._db_manager.drop_schema(self._test_schema_name) + self._session.close() + if self._original_cache_dir: + os.environ["TRANSFORMERS_CACHE"] = self._original_cache_dir + self.cache_dir.cleanup() + + def base_test_case( + self, + name: str, + model: model_types.SupportedModelType, + test_input: model_types.SupportedDataType, + deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + warehouse_model_integ_test_utils.base_test_case( + self._db_manager, + run_id=self.run_id, + full_qual_stage=self.full_qual_stage, + name=name, + model=model, + sample_input=None, + test_input=test_input, + deploy_params=deploy_params, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_conversational_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + # We have to import here due to the cache location issue. + # Only by doing so can we make the cache dir setting effective. + import transformers + + model = transformers.pipeline(task="conversational", model="ToddGoldfarb/Cadet-Tiny") + + x_df = pd.DataFrame( + [ + { + "user_inputs": [ + "Do you speak French?", + "Do you know how to say Snowflake in French?", + ], + "generated_responses": ["Yes I do."], + }, + ] + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["generated_responses"])) + + for row in res["generated_responses"]: + self.assertIsInstance(row, list) + for resp in row: + self.assertIsInstance(resp, str) + + self.base_test_case( + name="huggingface_conversational_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_fill_mask_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="fill-mask", model="distilroberta-base", top_k=1) + + x_df = pd.DataFrame( + [ + ["LynYuu is the <mask> of the Grand Duchy of Yu."], + ] + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["outputs"])) + + for row in res["outputs"]: + self.assertIsInstance(row, str) + resp = json.loads(row) + self.assertIsInstance(resp, list) + self.assertIn("score", resp[0]) + self.assertIn("token", resp[0]) + self.assertIn("token_str", resp[0]) + self.assertIn("sequence", resp[0]) + + self.base_test_case( + name="huggingface_fill_mask_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_ner_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="ner", model="dslim/bert-base-NER") + + x_df = pd.DataFrame( + [ + ["My name is Izumi and I live in Tokyo,
Japan."], + ] + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["outputs"])) + + for row in res["outputs"]: + self.assertIsInstance(row, str) + resp = json.loads(row) + self.assertIsInstance(resp, list) + self.assertIn("entity", resp[0]) + self.assertIn("score", resp[0]) + self.assertIn("index", resp[0]) + self.assertIn("word", resp[0]) + self.assertIn("start", resp[0]) + self.assertIn("end", resp[0]) + + self.base_test_case( + name="huggingface_ner_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_question_answering_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="question-answering", model="deepset/tinyroberta-squad2", top_k=1) + + x_df = pd.DataFrame( + [ + { + "question": "What did Doris want to do?", + "context": ( + "Doris is a cheerful mermaid from the ocean depths. She transformed into a bipedal creature " + 'and came to see everyone because she wanted to "learn more about the world of athletics."' + " She dislikes cuisines with seafood." + ), + } + ], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["score", "start", "end", "answer"])) + + self.assertEqual(res["score"].dtype.type, np.float64) + self.assertEqual(res["start"].dtype.type, np.int64) + self.assertEqual(res["end"].dtype.type, np.int64) + self.assertEqual(res["answer"].dtype.type, np.object_) + + self.base_test_case( + name="huggingface_question_answering_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_question_answering_pipeline_multiple_output( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="question-answering", model="deepset/tinyroberta-squad2", top_k=3) + + x_df = pd.DataFrame( + [ + { + "question": "What did Doris want to do?", + "context": ( + "Doris is a cheerful mermaid from the ocean depths. She transformed into a bipedal creature " + 'and came to see everyone because she wanted to "learn more about the world of athletics."' + " She dislikes cuisines with seafood." 
+ ), + } + ], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["outputs"])) + + for row in res["outputs"]: + self.assertIsInstance(row, str) + resp = json.loads(row) + self.assertIsInstance(resp, list) + self.assertIn("score", resp[0]) + self.assertIn("start", resp[0]) + self.assertIn("end", resp[0]) + self.assertIn("answer", resp[0]) + + self.base_test_case( + name="huggingface_question_answering_pipeline_multiple_output", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_summarization_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="summarization", model="JulesBelveze/t5-small-headline-generator") + + x_df = pd.DataFrame( + [ + [ + ( + "Neuro-sama is a chatbot styled after a female VTuber that hosts live streams on the Twitch " + 'channel "vedal987". Her speech and personality are generated by an artificial intelligence' + " (AI) system which utilizes a large language model, allowing her to communicate with " + "viewers in a live chat. She was created by a computer programmer and AI-developer named " + "Jack Vedal, who decided to build upon the concept of an AI VTuber by combining interactions " + "between AI game play and a computer-generated avatar. She debuted on Twitch on December 19, " + "2022 after four years of development." + ) + ] + ], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["summary_text"])) + + self.assertEqual(res["summary_text"].dtype.type, np.object_) + + self.base_test_case( + name="huggingface_summarization_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_table_question_answering_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="table-question-answering", model="google/tapas-small-finetuned-wtq") + + x_df = pd.DataFrame( + [ + { + "query": "Which channel has the most subscribers?", + "table": json.dumps( + { + "Channel": ["A.I.Channel", "Kaguya Luna", "Mirai Akari", "Siro"], + "Subscribers": ["3,020,000", "872,000", "694,000", "660,000"], + "Videos": ["1,200", "113", "639", "1,300"], + "Created At": ["Jun 30 2016", "Dec 4 2017", "Feb 28 2014", "Jun 23 2017"], + } + ), + } + ], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["answer", "coordinates", "cells", "aggregator"])) + + self.assertEqual(res["answer"].dtype.type, np.object_) + self.assertEqual(res["coordinates"].dtype.type, np.object_) + self.assertIsInstance(res["coordinates"][0], list) + self.assertEqual(res["cells"].dtype.type, np.object_) + self.assertIsInstance(res["cells"][0], list) + self.assertEqual(res["aggregator"].dtype.type, np.object_) + + self.base_test_case( + name="huggingface_table_question_answering_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, 
+ check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_text_classification_pair_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2") + + x_df = pd.DataFrame( + [{"text": "I like you.", "text_pair": "I love you, too."}], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["label", "score"])) + + self.assertEqual(res["label"].dtype.type, np.object_) + self.assertEqual(res["score"].dtype.type, np.float64) + self.assertGreaterEqual(res["score"][0], 0.9) + + self.base_test_case( + name="huggingface_text_classification_pair_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_text_classification_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=1) + + x_df = pd.DataFrame( + [{"text": "I am wondering if I should have udon or rice for lunch", "text_pair": ""}], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["outputs"])) + + for row in res["outputs"]: + self.assertIsInstance(row, str) + resp = json.loads(row) + self.assertIsInstance(resp, list) + self.assertIn("label", resp[0]) + self.assertIn("score", resp[0]) + + self.base_test_case( + name="huggingface_text_classification_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_text_generation_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="text-generation", model="distilgpt2") + + x_df = pd.DataFrame( + [['A descendant of the Lost City of Atlantis, who swam to Earth while saying, "']], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["outputs"])) + + for row in res["outputs"]: + self.assertIsInstance(row, str) + resp = json.loads(row) + self.assertIsInstance(resp, list) + self.assertIn("generated_text", resp[0]) + + self.base_test_case( + name="huggingface_text_generation_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_text2text_generation_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="text2text-generation", 
model="google/flan-t5-small") + + x_df = pd.DataFrame( + [['A descendant of the Lost City of Atlantis, who swam to Earth while saying, "']], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["generated_text"])) + self.assertEqual(res["generated_text"].dtype.type, np.object_) + + self.base_test_case( + name="huggingface_text2text_generation_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_translation_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="translation_en_to_ja", model="t5-small") + + x_df = pd.DataFrame( + [ + [ + ( + "Snowflake's Data Cloud is powered by an advanced data platform provided as a self-managed " + "service. Snowflake enables data storage, processing, and analytic solutions that are faster, " + "easier to use, and far more flexible than traditional offerings. The Snowflake data platform " + "is not built on any existing database technology or “big data” software platforms such as " + "Hadoop. Instead, Snowflake combines a completely new SQL query engine with an innovative " + "architecture natively designed for the cloud. To the user, Snowflake provides all of the " + "functionality of an enterprise analytic database, along with many additional special features " + "and unique capabilities." + ) + ] + ], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["translation_text"])) + self.assertEqual(res["translation_text"].dtype.type, np.object_) + + self.base_test_case( + name="huggingface_translation_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + @parameterized.product(permanent_deploy=[True], test_released_version=[None]) # type: ignore[misc] + def test_zero_shot_classification_pipeline( + self, + permanent_deploy: Optional[bool] = False, + test_released_version: Optional[str] = None, + ) -> None: + import transformers + + model = transformers.pipeline(task="zero-shot-classification", model="cross-encoder/nli-distilroberta-base") + + x_df = pd.DataFrame( + [ + { + "sequences": "I have a problem with Snowflake that needs to be resolved asap!!", + "candidate_labels": ["urgent", "not urgent"], + }, + { + "sequences": "I have a problem with Snowflake that needs to be resolved asap!!", + "candidate_labels": ["English", "Japanese"], + }, + ], + ) + + def check_res(res: pd.DataFrame) -> None: + pd.testing.assert_index_equal(res.columns, pd.Index(["sequence", "labels", "scores"])) + self.assertEqual(res["sequence"].dtype.type, np.object_) + self.assertEqual(res["sequence"][0], "I have a problem with Snowflake that needs to be resolved asap!!") + self.assertEqual(res["sequence"][1], "I have a problem with Snowflake that needs to be resolved asap!!") + self.assertEqual(res["labels"].dtype.type, np.object_) + self.assertListEqual(res["labels"][0], ["urgent", "not urgent"]) + self.assertListEqual(res["labels"][1], ["English", "Japanese"]) + self.assertEqual(res["scores"].dtype.type, np.object_) + self.assertIsInstance(res["labels"][0], list) + 
self.assertIsInstance(res["labels"][1], list) + + self.base_test_case( + name="huggingface_zero_shot_classification_pipeline", + model=model, + test_input=x_df, + deploy_params={ + "": ( + {}, + check_res, + ), + }, + permanent_deploy=permanent_deploy, + test_released_version=test_released_version, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py index f81e2d47..0d3dc974 100644 --- a/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_mlflow_model_integ_test.py @@ -3,14 +3,17 @@ # import uuid +from importlib import metadata as importlib_metadata from typing import Any, Callable, Dict, Optional, Tuple, Union import mlflow import numpy as np import pandas as pd +import pytest from absl.testing import absltest, parameterized from sklearn import datasets, ensemble, model_selection +from snowflake.ml._internal import env from snowflake.ml.model import type_hints as model_types from snowflake.ml.model._signatures import numpy_handler from snowflake.ml.utils import connection_params @@ -19,6 +22,7 @@ from tests.integ.snowflake.ml.test_utils import db_manager +@pytest.mark.pip_incompatible class TestWarehouseMLFlowModelInteg(parameterized.TestCase): @classmethod def setUpClass(self) -> None: @@ -58,7 +62,6 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -71,20 +74,13 @@ def base_test_case( sample_input=sample_input, test_input=test_input, deploy_params=deploy_params, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_mlflow_model_deploy_sklearn_df( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -103,16 +99,20 @@ def test_mlflow_model_deploy_sklearn_df( signature=signature, metadata={"author": "halu", "version": "1"}, conda_env={ - "dependencies": [ - "python=3.8.13", - "mlflow==2.3.1", - "cloudpickle==2.0.0", - "numpy==1.23.4", - "psutil==5.9.0", - "scikit-learn==1.2.2", - "scipy==1.9.3", - "typing-extensions==4.5.0", - ], + "dependencies": [f"python=={env.PYTHON_VERSION}"] + + list( + map( + lambda pkg: f"{pkg}=={importlib_metadata.distribution(pkg).version}", + [ + "mlflow", + "cloudpickle", + "numpy", + "scikit-learn", + "scipy", + "typing-extensions", + ], + ) + ), "name": "mlflow-env", }, ) @@ -125,25 +125,18 @@ def test_mlflow_model_deploy_sklearn_df( sample_input=None, test_input=X_test, deploy_params={ - "predict": ( + "": ( {}, lambda res: 
np.testing.assert_allclose(np.expand_dims(predictions, axis=1), res.to_numpy()), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_mlflow_model_deploy_sklearn( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -162,16 +155,20 @@ def test_mlflow_model_deploy_sklearn( signature=signature, metadata={"author": "halu", "version": "1"}, conda_env={ - "dependencies": [ - "python=3.8.13", - "mlflow==2.3.1", - "cloudpickle==2.0.0", - "numpy==1.23.4", - "psutil==5.9.0", - "scikit-learn==1.2.2", - "scipy==1.9.3", - "typing-extensions==4.5.0", - ], + "dependencies": [f"python=={env.PYTHON_VERSION}"] + + list( + map( + lambda pkg: f"{pkg}=={importlib_metadata.distribution(pkg).version}", + [ + "mlflow", + "cloudpickle", + "numpy", + "scikit-learn", + "scipy", + "typing-extensions", + ], + ) + ), "name": "mlflow-env", }, ) @@ -186,12 +183,11 @@ def test_mlflow_model_deploy_sklearn( sample_input=None, test_input=X_test_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: np.testing.assert_allclose(np.expand_dims(predictions, axis=1), res.to_numpy()), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py index 42794415..e96bc0ae 100644 --- a/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py +++ b/tests/integ/snowflake/ml/model/warehouse_model_integ_test_utils.py @@ -2,12 +2,12 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
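# A small illustrative sketch of the dependency-pinning pattern used in the
# mlflow tests above: rather than hard-coding package versions, each dependency
# is pinned to whatever is installed in the running environment. The package
# list here is illustrative, not exhaustive.
#
#   from importlib import metadata as importlib_metadata
#
#   from snowflake.ml._internal import env
#
#   conda_env = {
#       "name": "mlflow-env",
#       "dependencies": [f"python=={env.PYTHON_VERSION}"]
#       + [
#           f"{pkg}=={importlib_metadata.distribution(pkg).version}"
#           for pkg in ["mlflow", "cloudpickle", "numpy", "scikit-learn", "scipy", "typing-extensions"]
#       ],
#   }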
# -import os import posixpath -import tempfile import unittest -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Callable, Dict, Literal, Optional, Tuple, Union +import numpy as np +import numpy.typing as npt import pandas as pd from packaging import version @@ -17,6 +17,7 @@ deploy_platforms, type_hints as model_types, ) +from snowflake.ml.model._signatures import snowpark_handler from snowflake.snowpark import DataFrame as SnowparkDataFrame from tests.integ.snowflake.ml.test_utils import db_manager, test_env_utils @@ -30,76 +31,109 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: - with tempfile.TemporaryDirectory() as tmpdir: - version_args: Dict[str, Any] = {} - tmp_stage = db._session.get_session_stage() - conda_dependencies = [ - test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-snowpark-python") - ] - # We only test when the test is added before the current version available in the server. - snowml_req_str = test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-ml-python") - - if test_released_version: - if version.parse(test_released_version) <= version.parse(snowml_req_str.split("==")[-1]): - actual_name = f"{name}_v_released" - conda_dependencies.append(snowml_req_str) - else: - raise unittest.SkipTest( - f"Skip test on released version {test_released_version} which has not been available yet." - ) + version_args: Dict[str, Any] = {} + tmp_stage = db._session.get_session_stage() + conda_dependencies = [ + test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-snowpark-python") + ] + # We only run the released-version variant of a test when that version is already available in the server. + snowml_req_str = test_env_utils.get_latest_package_versions_in_server(db._session, "snowflake-ml-python") + + if permanent_deploy: + permanent_deploy_args = {"permanent_udf_stage_location": f"@{full_qual_stage}/"} + perm_model_name = "perm" + else: + permanent_deploy_args = {} + perm_model_name = "temp" + + if test_released_version: + if version.parse(test_released_version) <= version.parse(snowml_req_str.split("==")[-1]): + actual_name = f"{name}_{perm_model_name}_released" + conda_dependencies.append(snowml_req_str) else: - actual_name = f"{name}_v_current" - version_args["options"] = {"embed_local_ml_library": True} - if model_in_stage: - actual_name = f"{actual_name}_remote" - location_args = { - "session": db._session, - "model_stage_file_path": posixpath.join(tmp_stage, f"{actual_name}_{run_id}.zip"), - } + raise unittest.SkipTest( + f"Skip test on released version {test_released_version} which is not yet available in the server."
+ ) + else: + actual_name = f"{name}_{perm_model_name}_current" + version_args["options"] = {"embed_local_ml_library": True} + + model_api.save_model( + name=actual_name, + model=model, + sample_input=sample_input, + conda_dependencies=conda_dependencies, + metadata={"author": "halu", "version": "1"}, + session=db._session, + model_stage_file_path=posixpath.join(tmp_stage, f"{actual_name}_{run_id}.zip"), + **version_args, + ) + + for target_method, (additional_deploy_options, check_func) in deploy_params.items(): + function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + run_id, f"{actual_name}_{target_method}" + ) + # This is to test the case for omitting target_method when deploying. + if target_method == "": + target_method_arg = None else: - actual_name = f"{actual_name}_local" - location_args = {"model_dir_path": os.path.join(tmpdir, actual_name)} - - model_api.save_model( # type:ignore[call-overload] - name=actual_name, - model=model, - sample_input=sample_input, - conda_dependencies=conda_dependencies, - metadata={"author": "halu", "version": "1"}, - **location_args, - **version_args, + target_method_arg = target_method + deploy_info = _deployer.deploy( + name=function_name, + session=db._session, + model_stage_file_path=posixpath.join(tmp_stage, f"{actual_name}_{run_id}.zip"), + platform=deploy_platforms.TargetPlatform.WAREHOUSE, + target_method=target_method_arg, + options={ + "relax_version": test_env_utils.is_in_pip_env(), + **permanent_deploy_args, # type: ignore[arg-type] + **additional_deploy_options, + }, # type: ignore[call-overload] ) - for target_method, (additional_deploy_options, check_func) in deploy_params.items(): - if permanent_deploy: - permanent_deploy_args = {"permanent_udf_stage_location": f"@{full_qual_stage}/"} - else: - permanent_deploy_args = {} - if "session" not in location_args: - location_args.update(session=db._session) - function_name = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( - run_id, f"{actual_name}_{target_method}" - ) - deploy_info = _deployer.deploy( - name=function_name, - **location_args, - platform=deploy_platforms.TargetPlatform.WAREHOUSE, - target_method=target_method, - options={ - "relax_version": test_env_utils.is_in_pip_env(), - **permanent_deploy_args, # type: ignore[arg-type] - **additional_deploy_options, - }, # type: ignore[call-overload] - ) + assert deploy_info is not None + res = _deployer.predict(session=db._session, deployment=deploy_info, X=test_input) + + check_func(res) + + if permanent_deploy: + db.drop_function(function_name=function_name, args=["OBJECT"]) + + +def check_sp_df_res( + res_sp_df: SnowparkDataFrame, + expected_pd_df: pd.DataFrame, + *, + check_dtype: bool = True, + check_index_type: Union[bool, Literal["equiv"]] = "equiv", + check_column_type: Union[bool, Literal["equiv"]] = "equiv", + check_frame_type: bool = True, + check_names: bool = True, +) -> None: + res_pd_df = snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res_sp_df) - assert deploy_info is not None - res = _deployer.predict(session=db._session, deployment=deploy_info, X=test_input) + def totuple(a: Union[npt.ArrayLike, Tuple[object], object]) -> Union[Tuple[object], object]: + try: + return tuple(totuple(i) for i in a) # type: ignore[union-attr] + except TypeError: + return a - check_func(res) + for df in [res_pd_df, expected_pd_df]: + for col in df.columns: + if isinstance(df[col][0], list): + df[col] = df[col].apply(tuple) + elif isinstance(df[col][0], np.ndarray): + df[col] = 
df[col].apply(totuple) - if permanent_deploy: - db.drop_function(function_name=function_name, args=["OBJECT"]) + pd.testing.assert_frame_equal( + res_pd_df.sort_values(by=res_pd_df.columns.tolist()).reset_index(drop=True), + expected_pd_df.sort_values(by=expected_pd_df.columns.tolist()).reset_index(drop=True), + check_dtype=check_dtype, + check_index_type=check_index_type, + check_column_type=check_column_type, + check_frame_type=check_frame_type, + check_names=check_names, + ) diff --git a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py index 8af2a7b3..60032cf4 100644 --- a/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_pytorch_model_integ_test.py @@ -56,7 +56,6 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -69,60 +68,46 @@ def base_test_case( sample_input=sample_input, test_input=test_input, deploy_params=deploy_params, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_pytorch_tensor_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model() - x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) - y_pred = model.forward(data_x)[0].detach() + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) + y_pred = model.forward(data_x).detach() self.base_test_case( name="pytorch_model_tensor_as_sample", model=model, - sample_input=data_x, + sample_input=[data_x], test_input=x_df, deploy_params={ - "forward": ( + "": ( {}, lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_pytorch_df_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: 
Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model(torch.float64) - x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) - y_pred = model.forward(data_x)[0].detach() + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) + y_pred = model.forward(data_x).detach() self.base_test_case( name="pytorch_model_df_as_sample", @@ -130,35 +115,31 @@ def test_pytorch_df_as_sample( sample_input=x_df, test_input=x_df, deploy_params={ - "forward": ( + "": ( {}, lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_pytorch_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_torch_model(torch.float64) - x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) x_df.columns = ["col_0"] - y_pred = model.forward(data_x)[0].detach() - x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, x_df, keep_order=True) + y_pred = model.forward(data_x) + x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, x_df) + y_pred_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([y_pred]) + y_pred_df.columns = ["output_feature_0"] + y_df_expected = pd.concat([x_df, y_pred_df], axis=1) self.base_test_case( name="pytorch_model_sp", @@ -166,72 +147,53 @@ def test_pytorch_sp( sample_input=x_df, test_input=x_df_sp, deploy_params={ - "forward": ( + "": ( {}, - lambda res: torch.testing.assert_close( - pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( - snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) - )[0], - y_pred, - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_torchscript_tensor_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, 
test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model() - x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) model_script = torch.jit.script(model) # type:ignore[attr-defined] - y_pred = model_script.forward(data_x)[0].detach() + y_pred = model_script.forward(data_x).detach() self.base_test_case( name="torch_script_model_tensor_as_sample", model=model_script, - sample_input=data_x, + sample_input=[data_x], test_input=x_df, deploy_params={ - "forward": ( + "": ( {}, lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred, check_dtype=False ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_torchscript_df_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model(torch.float64) - x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) model_script = torch.jit.script(model) # type:ignore[attr-defined] - y_pred = model_script.forward(data_x)[0].detach() + y_pred = model_script.forward(data_x).detach() self.base_test_case( name="torch_script_model_df_as_sample", @@ -239,36 +201,32 @@ def test_torchscript_df_as_sample( sample_input=x_df, test_input=x_df, deploy_params={ - "forward": ( + "": ( {}, lambda res: torch.testing.assert_close( pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df(res)[0], y_pred ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_torchscript_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_jittable_torch_model(torch.float64) - x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([data_x], ensure_serializable=False) 
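
(Note on the test refactor above: absl's `parameterized.product` builds the cross product of its keyword options and runs the test once per combination, which is why each block of four hand-enumerated `parameterized.parameters` dicts collapses into a single decorator. A minimal, self-contained sketch of the expansion, using a hypothetical test class rather than anything from this diff:

```python
from typing import Optional

from absl.testing import absltest, parameterized


class ProductExampleTest(parameterized.TestCase):
    # Expands to 2 x 2 = 4 cases:
    # (True, None), (True, "1.0.6"), (False, None), (False, "1.0.6")
    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"])
    def test_matrix(self, permanent_deploy: bool, test_released_version: Optional[str] = None) -> None:
        self.assertIsInstance(permanent_deploy, bool)


if __name__ == "__main__":
    absltest.main()
```
)
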
x_df.columns = ["col_0"] model_script = torch.jit.script(model) # type:ignore[attr-defined] - y_pred = model_script.forward(data_x)[0].detach() - x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, x_df, keep_order=True) + y_pred = model_script.forward(data_x) + x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df(self._session, x_df) + y_pred_df = pytorch_handler.SeqOfPyTorchTensorHandler.convert_to_df([y_pred]) + y_pred_df.columns = ["output_feature_0"] + y_df_expected = pd.concat([x_df, y_pred_df], axis=1) self.base_test_case( name="torch_script_model_sp", @@ -276,17 +234,11 @@ def test_torchscript_sp( sample_input=x_df, test_input=x_df_sp, deploy_params={ - "forward": ( + "": ( {}, - lambda res: torch.testing.assert_close( - pytorch_handler.SeqOfPyTorchTensorHandler.convert_from_df( - snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) - )[0], - y_pred, - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py index a98f59eb..c5f04d4c 100644 --- a/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_sklearn_xgboost_model_integ_test.py @@ -57,7 +57,6 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -70,20 +69,13 @@ def base_test_case( sample_input=sample_input, test_input=test_input, deploy_params=deploy_params, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_skl_model_deploy( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -102,20 +94,13 @@ def test_skl_model_deploy( lambda res: np.testing.assert_allclose(res["output_feature_0"].values, regr.predict(iris_X)), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, 
"1.0.6"]) # type: ignore[misc] def test_skl_model_proba_deploy( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -137,20 +122,13 @@ def test_skl_model_proba_deploy( lambda res: np.testing.assert_allclose(res.values, model.predict_proba(iris_X[:10])), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_skl_multiple_output_model_proba_deploy( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -177,20 +155,13 @@ def test_skl_multiple_output_model_proba_deploy( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_xgb( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -213,20 +184,13 @@ def test_xgb( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.3"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.3"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_xgb_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -237,6 +201,14 @@ def test_xgb_sp( cal_data_pd_df_train = cal_data_sp_df_train.to_pandas() regressor.fit(cal_data_pd_df_train.drop(columns=["target"]), cal_data_pd_df_train["target"]) cal_data_sp_df_test_X = cal_data_sp_df_test.drop('"target"') + + y_df_expected = pd.concat( + [ + cal_data_sp_df_test_X.to_pandas(), + pd.DataFrame(regressor.predict(cal_data_sp_df_test_X.to_pandas()), columns=["output_feature_0"]), + ], + axis=1, + ) self.base_test_case( name="xgb_model_sp", model=regressor, @@ -245,26 +217,16 @@ def test_xgb_sp( deploy_params={ "predict": ( {}, - lambda res: np.testing.assert_allclose( - res.to_pandas().values, - 
np.expand_dims(regressor.predict(cal_data_sp_df_test_X.to_pandas()), axis=1), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected, check_dtype=False), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_xgb_booster( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -283,23 +245,16 @@ def test_xgb_booster( deploy_params={ "predict": ( {}, - lambda res: np.testing.assert_allclose(res.values, np.expand_dims(y_pred, axis=1)), + lambda res: np.testing.assert_allclose(res.values, np.expand_dims(y_pred, axis=1), rtol=1e-6), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_xgb_booster_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -313,7 +268,16 @@ def test_xgb_booster_sp( xgboost.DMatrix(data=cal_data_pd_df_train.drop(columns=["target"]), label=cal_data_pd_df_train["target"]), ) cal_data_sp_df_test_X = cal_data_sp_df_test.drop('"target"') - y_pred = regressor.predict(xgboost.DMatrix(data=cal_data_sp_df_test_X.to_pandas())) + y_df_expected = pd.concat( + [ + cal_data_sp_df_test_X.to_pandas(), + pd.DataFrame( + regressor.predict(xgboost.DMatrix(data=cal_data_sp_df_test_X.to_pandas())), + columns=["output_feature_0"], + ), + ], + axis=1, + ) self.base_test_case( name="xgb_booster_sp", model=regressor, @@ -322,13 +286,9 @@ def test_xgb_booster_sp( deploy_params={ "predict": ( {}, - lambda res: np.testing.assert_allclose( - res.to_pandas().values, - np.expand_dims(y_pred, axis=1), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected, check_dtype=False), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py index c0cc7379..e59e2d5e 100644 --- a/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py @@ -21,6 +21,7 @@ from tests.integ.snowflake.ml.test_utils import db_manager +@pytest.mark.pip_incompatible class 
TestWarehouseSnowMLModelInteg(parameterized.TestCase): @classmethod def setUpClass(self) -> None: @@ -60,7 +61,6 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -73,21 +73,13 @@ def base_test_case( sample_input=sample_input, test_input=test_input, deploy_params=deploy_params, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @pytest.mark.pip_incompatible - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"]) # type: ignore[misc] def test_snowml_model_deploy_snowml_sklearn( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -114,21 +106,13 @@ def test_snowml_model_deploy_snowml_sklearn( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @pytest.mark.pip_incompatible - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"]) # type: ignore[misc] def test_snowml_model_deploy_xgboost( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -155,21 +139,13 @@ def test_snowml_model_deploy_xgboost( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @pytest.mark.pip_incompatible - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.5"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.5"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"]) # type: ignore[misc] def test_snowml_model_deploy_lightgbm( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -196,7 +172,6 @@ def test_snowml_model_deploy_lightgbm( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) diff --git 
a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py index 3e16cb28..78233ffd 100644 --- a/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py +++ b/tests/integ/snowflake/ml/model/warehouse_tensorflow_model_integ_test.py @@ -3,7 +3,7 @@ # import uuid -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union import numpy as np import pandas as pd @@ -11,7 +11,11 @@ from absl.testing import absltest, parameterized from snowflake.ml.model import type_hints as model_types -from snowflake.ml.model._signatures import snowpark_handler, tensorflow_handler +from snowflake.ml.model._signatures import ( + numpy_handler, + snowpark_handler, + tensorflow_handler, +) from snowflake.ml.utils import connection_params from snowflake.snowpark import DataFrame as SnowparkDataFrame, Session from tests.integ.snowflake.ml.model import warehouse_model_integ_test_utils @@ -24,22 +28,9 @@ def __init__(self, name: str = None) -> None: self.a_variable = tf.Variable(5.0, name="train_me") self.non_trainable_variable = tf.Variable(5.0, trainable=False, name="do_not_train_me") - @tf.function(input_signature=[[tf.TensorSpec(shape=(None, 1), dtype=tf.float32)]]) # type: ignore[misc] - def __call__(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: - return [self.a_variable * tensors[0] + self.non_trainable_variable] - - -class KerasModel(tf.keras.Model): - def __init__(self, n_hidden: int, n_out: int) -> None: - super().__init__() - self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") - self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") - - def call(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: - input = tensors[0] - x = self.fc_1(input) - x = self.fc_2(x) - return [x] + @tf.function(input_signature=[tf.TensorSpec(shape=(None, 1), dtype=tf.float32)]) # type: ignore[misc] + def __call__(self, tensor: tf.Tensor) -> tf.Tensor: + return self.a_variable * tensor + self.non_trainable_variable class TestWarehouseTensorflowModelInteg(parameterized.TestCase): @@ -81,7 +72,6 @@ def base_test_case( sample_input: model_types.SupportedDataType, test_input: model_types.SupportedDataType, deploy_params: Dict[str, Tuple[Dict[str, Any], Callable[[Union[pd.DataFrame, SnowparkDataFrame]], Any]]], - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: @@ -94,62 +84,48 @@ def base_test_case( sample_input=sample_input, test_input=test_input, deploy_params=deploy_params, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_tf_tensor_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") - data_x = [tf.constant([[5.0], [10.0]])] - 
x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + data_x = tf.constant([[5.0], [10.0]]) + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) y_pred = model(data_x) self.base_test_case( name="tf_model_tensor_as_sample", model=model, - sample_input=data_x, + sample_input=[data_x], test_input=x_df, deploy_params={ - "__call__": ( + "": ( {}, lambda res: np.testing.assert_allclose( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), - y_pred[0].numpy(), + y_pred.numpy(), ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_tf_df_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") - data_x = [tf.constant([[5.0], [10.0]])] - x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + data_x = tf.constant([[5.0], [10.0]]) + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) y_pred = model(data_x) self.base_test_case( @@ -158,41 +134,36 @@ def test_tf_df_as_sample( sample_input=x_df, test_input=x_df, deploy_params={ - "__call__": ( + "": ( {}, lambda res: np.testing.assert_allclose( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), - y_pred[0].numpy(), + y_pred.numpy(), ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_tf_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model = SimpleModule(name="simple") - data_x = [tf.constant([[5.0], [10.0]])] - x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + data_x = tf.constant([[5.0], [10.0]]) + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) x_df.columns = ["col_0"] y_pred = model(data_x) x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df( self._session, x_df, - keep_order=True, ) + y_pred_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([y_pred]) + y_pred_df.columns = ["output_feature_0"] + y_df_expected = pd.concat([x_df, 
y_pred_df], axis=1) self.base_test_case( name="tf_model_sp", @@ -200,44 +171,32 @@ def test_tf_sp( sample_input=x_df, test_input=x_df_sp, deploy_params={ - "__call__": ( + "": ( {}, - lambda res: np.testing.assert_allclose( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( - snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) - )[0].numpy(), - y_pred[0].numpy(), - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_keras_tensor_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() - x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) - y_pred = model.predict(data_x)[0] + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) + y_pred = model.predict(data_x) self.base_test_case( name="keras_model_tensor_as_sample", model=model, - sample_input=data_x, + sample_input=[data_x], test_input=x_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: np.testing.assert_allclose( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), @@ -246,26 +205,19 @@ def test_keras_tensor_as_sample( ), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_keras_df_as_sample( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() - x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) - y_pred = model.predict(data_x)[0] + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) + y_pred = model.predict(data_x) self.base_test_case( name="keras_model_df_as_sample", @@ -273,7 +225,7 @@ def test_keras_df_as_sample( sample_input=x_df, test_input=x_df, deploy_params={ - "predict": ( + "": ( {}, lambda res: np.testing.assert_allclose( tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df(res)[0].numpy(), @@ -282,32 +234,27 @@ def test_keras_df_as_sample( ), ), }, - 
model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) - @parameterized.parameters( # type: ignore[misc] - {"model_in_stage": True, "permanent_deploy": True, "test_released_version": None}, - {"model_in_stage": False, "permanent_deploy": False, "test_released_version": None}, - {"model_in_stage": True, "permanent_deploy": False, "test_released_version": "1.0.4"}, - {"model_in_stage": False, "permanent_deploy": True, "test_released_version": "1.0.4"}, - ) + @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.6"]) # type: ignore[misc] def test_keras_sp( self, - model_in_stage: Optional[bool] = False, permanent_deploy: Optional[bool] = False, test_released_version: Optional[str] = None, ) -> None: model, data_x, data_y = model_factory.ModelFactory.prepare_keras_model() - x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df(data_x, ensure_serializable=False) + x_df = tensorflow_handler.SeqOfTensorflowTensorHandler.convert_to_df([data_x], ensure_serializable=False) x_df.columns = ["col_0"] - y_pred = model.predict(data_x)[0] + y_pred = model.predict(data_x) x_df_sp = snowpark_handler.SnowparkDataFrameHandler.convert_from_df( self._session, x_df, - keep_order=True, ) + y_pred_df = numpy_handler.SeqOfNumpyArrayHandler.convert_to_df([y_pred]) + y_pred_df.columns = ["output_feature_0"] + y_df_expected = pd.concat([x_df, y_pred_df], axis=1) self.base_test_case( name="keras_model_sp", @@ -315,18 +262,11 @@ def test_keras_sp( sample_input=x_df, test_input=x_df_sp, deploy_params={ - "predict": ( + "": ( {}, - lambda res: np.testing.assert_allclose( - tensorflow_handler.SeqOfTensorflowTensorHandler.convert_from_df( - snowpark_handler.SnowparkDataFrameHandler.convert_to_df(res) - )[0].numpy(), - y_pred, - atol=1e-6, - ), + lambda res: warehouse_model_integ_test_utils.check_sp_df_res(res, y_df_expected), ), }, - model_in_stage=model_in_stage, permanent_deploy=permanent_deploy, test_released_version=test_released_version, ) diff --git a/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py b/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py index e56efaf3..ed87c117 100644 --- a/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py +++ b/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py @@ -486,45 +486,45 @@ def test_serde(self) -> None: simple_imputer = SimpleImputer().set_input_cols(input_cols).set_output_cols(output_cols) simple_imputer.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_simple_imputer.pkl") - self._to_be_deleted_files.append(filepath) - simple_imputer_dump_cloudpickle = cloudpickle.dumps(simple_imputer) - simple_imputer_dump_pickle = pickle.dumps(simple_imputer) - joblib.dump(simple_imputer, filepath) - - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) - - importlib.reload(sys.modules["snowflake.ml.modeling.impute.simple_imputer"]) - - # cloudpickle - simple_imputer_load_cloudpickle = cloudpickle.loads(simple_imputer_dump_cloudpickle) - transformed_df_cloudpickle = simple_imputer_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # pickle - simple_imputer_load_pickle = 
pickle.loads(simple_imputer_dump_pickle) - transformed_df_pickle = simple_imputer_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # joblib - simple_imputer_load_joblib = joblib.load(filepath) - transformed_df_joblib = simple_imputer_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - - # sklearn - simple_imputer_sklearn = SklearnSimpleImputer() - simple_imputer_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = simple_imputer_sklearn.transform(df_pandas[input_cols]) - - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + simple_imputer_dump_cloudpickle = cloudpickle.dumps(simple_imputer) + simple_imputer_dump_pickle = pickle.dumps(simple_imputer) + joblib.dump(simple_imputer, file.name) + + self._session.close() + + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) + + importlib.reload(sys.modules["snowflake.ml.modeling.impute.simple_imputer"]) + + # cloudpickle + simple_imputer_load_cloudpickle = cloudpickle.loads(simple_imputer_dump_cloudpickle) + transformed_df_cloudpickle = simple_imputer_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # pickle + simple_imputer_load_pickle = pickle.loads(simple_imputer_dump_pickle) + transformed_df_pickle = simple_imputer_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # joblib + simple_imputer_load_joblib = joblib.load(file.name) + transformed_df_joblib = simple_imputer_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + + # sklearn + simple_imputer_sklearn = SklearnSimpleImputer() + simple_imputer_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = simple_imputer_sklearn.transform(df_pandas[input_cols]) + + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_d2_absolute_error_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_d2_absolute_error_score.py index 91ab8c1e..dd220d0f 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_d2_absolute_error_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_d2_absolute_error_score.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
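
(Note on the `test_serde` rewrites above: replacing a fixed filename under `tempfile.gettempdir()` with `tempfile.NamedTemporaryFile(suffix=".pkl", delete=False)` gives each run a unique path, so concurrent test runs cannot clobber each other, while `delete=False` keeps the file alive for the reload step; cleanup still goes through `self._to_be_deleted_files`. A minimal sketch of the round-trip pattern, with a generic dict standing in for the fitted transformer:

```python
import pickle
import tempfile

import joblib

state = {"threshold": 0.5}  # stand-in for a fitted transformer

with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file:
    # delete=False: the file survives the context manager so it can be
    # reloaded later; the caller is responsible for deleting file.name.
    joblib.dump(state, file.name)

assert pickle.loads(pickle.dumps(state)) == state  # pickle round trip
assert joblib.load(file.name) == state             # joblib round trip
```
)
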
# from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -124,6 +125,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_loss, actual_loss) + @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_loss = snowml_metrics.d2_absolute_error_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + ) + sklearn_loss = sklearn_metrics.d2_absolute_error_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + ) + self.assertAlmostEqual(sklearn_loss, actual_loss) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_d2_pinball_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_d2_pinball_score.py index 84407f76..0ab730b8 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_d2_pinball_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_d2_pinball_score.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -157,6 +158,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_loss, actual_loss) + @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_loss = snowml_metrics.d2_pinball_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + ) + sklearn_loss = sklearn_metrics.d2_pinball_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + ) + self.assertAlmostEqual(sklearn_loss, actual_loss) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_explained_variance_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_explained_variance_score.py index 92217df3..932848b2 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_explained_variance_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_explained_variance_score.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -157,6 +158,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_loss, actual_loss) + @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_loss = snowml_metrics.explained_variance_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + ) + sklearn_loss = sklearn_metrics.explained_variance_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + ) + self.assertAlmostEqual(sklearn_loss, actual_loss) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_error.py b/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_error.py index 303f5baf..6fb72769 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_error.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_error.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -124,6 +125,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_loss, actual_loss) + @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_loss = snowml_metrics.mean_absolute_error( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + ) + sklearn_loss = sklearn_metrics.mean_absolute_error( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + ) + self.assertAlmostEqual(sklearn_loss, actual_loss) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_percentage_error.py b/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_percentage_error.py index affff300..a61cda86 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_percentage_error.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_mean_absolute_percentage_error.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -124,6 +125,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_loss, actual_loss) + @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_loss = snowml_metrics.mean_absolute_percentage_error( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + ) + sklearn_loss = sklearn_metrics.mean_absolute_percentage_error( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + ) + self.assertAlmostEqual(sklearn_loss, actual_loss) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_mean_squared_error.py b/tests/integ/snowflake/ml/modeling/metrics/test_mean_squared_error.py index 7f0d59dd..6eb626b9 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_mean_squared_error.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_mean_squared_error.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -157,6 +158,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_loss, actual_loss) + @mock.patch("snowflake.ml.modeling.metrics.regression.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_loss = snowml_metrics.mean_squared_error( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + ) + sklearn_loss = sklearn_metrics.mean_squared_error( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + ) + self.assertAlmostEqual(sklearn_loss, actual_loss) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_curve.py b/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_curve.py index 4af439f4..37ed333a 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_curve.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_curve.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -84,6 +85,24 @@ def test_sample_weight(self, params: Dict[str, Any]) -> None: np.testing.assert_allclose(actual_recall, sklearn_recall) np.testing.assert_allclose(actual_thresholds, sklearn_thresholds) + @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_precision, actual_recall, actual_thresholds = snowml_metrics.precision_recall_curve( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + probas_pred_col_name=_PROBAS_PRED_COL, + ) + sklearn_precision, sklearn_recall, sklearn_thresholds = sklearn_metrics.precision_recall_curve( + pandas_df[_Y_TRUE_COL], + pandas_df[_PROBAS_PRED_COL], + ) + np.testing.assert_allclose(actual_precision, sklearn_precision) + np.testing.assert_allclose(actual_recall, sklearn_recall) + np.testing.assert_allclose(actual_thresholds, sklearn_thresholds) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_roc_auc_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_roc_auc_score.py index b7e73530..fad19d9f 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_roc_auc_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_roc_auc_score.py @@ -2,6 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -222,6 +223,22 @@ def test_multilabel(self) -> None: ) self.assertAlmostEqual(sklearn_auc, actual_auc) + @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + pandas_df = pd.DataFrame(_MULTILABEL_DATA, columns=_MULTILABEL_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_auc = snowml_metrics.roc_auc_score( + df=input_df, + y_true_col_names=_MULTILABEL_Y_TRUE_COLS, + y_score_col_names=_MULTILABEL_Y_SCORE_COLS, + ) + sklearn_auc = sklearn_metrics.roc_auc_score( + pandas_df[_MULTILABEL_Y_TRUE_COLS], + pandas_df[_MULTILABEL_Y_SCORE_COLS], + ) + self.assertAlmostEqual(sklearn_auc, actual_auc) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py b/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py index ab399ef0..8fcca737 100644 --- a/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_roc_curve.py @@ -4,6 +4,7 @@ import os import tempfile from typing import Any, Dict +from unittest import mock import numpy as np import pandas as pd @@ -155,6 +156,26 @@ def test_multi_query_df(self) -> None: np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), ) + @mock.patch("snowflake.ml.modeling.metrics.ranking.result._RESULT_SIZE_THRESHOLD", 0) + def test_metric_size_threshold(self) -> None: + # TODO: somehow confirm that the stage upload code path was taken. 
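
(Note on the `test_metric_size_threshold` additions above: patching `_RESULT_SIZE_THRESHOLD` to 0 makes every metric result count as oversized, forcing the stage-upload code path introduced in this release; the TODO notes the tests do not yet prove that path actually ran. One possible way to do so, sketched under assumptions, is to spy on the helper the `result` module calls for oversized payloads. The attribute name `serialize` below is hypothetical and only for illustration; substitute whatever helper the module actually routes oversized results through:

```python
from unittest import mock

from snowflake.ml.modeling.metrics import ranking  # module already patched in the test above


def call_with_upload_spy(run_metric):
    # HYPOTHETICAL: `ranking.result.serialize` stands in for whichever helper
    # handles oversized results; side_effect keeps the real behavior intact.
    real_helper = ranking.result.serialize
    with mock.patch.object(ranking.result, "serialize", side_effect=real_helper) as spy:
        value = run_metric()
    spy.assert_called()  # fails the test if the stage-upload path was skipped
    return value
```
)
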
+ pandas_df = pd.DataFrame(_BINARY_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + actual_fpr, actual_tpr, actual_thresholds = snowml_metrics.roc_curve( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_score_col_name=_Y_SCORE_COL, + ) + sklearn_fpr, sklearn_tpr, sklearn_thresholds = sklearn_metrics.roc_curve( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_SCORE_COL], + ) + np.testing.assert_allclose( + np.array((actual_fpr, actual_tpr, actual_thresholds)), + np.array((sklearn_fpr, sklearn_tpr, sklearn_thresholds)), + ) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py index d078d3ce..ef1fd59b 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py @@ -125,45 +125,45 @@ def test_serde(self) -> None: binarizer = Binarizer(threshold=threshold).set_input_cols(input_cols).set_output_cols(output_cols) binarizer.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_serialization.pkl") - self._to_be_deleted_files.append(filepath) - binarizer_dump_cloudpickle = cloudpickle.dumps(binarizer) - binarizer_dump_pickle = pickle.dumps(binarizer) - joblib.dump(binarizer, filepath) - - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) - - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.binarizer"]) - - # cloudpickle - binarizer_load_cloudpickle = cloudpickle.loads(binarizer_dump_cloudpickle) - transformed_df_cloudpickle = binarizer_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # pickle - binarizer_load_pickle = pickle.loads(binarizer_dump_pickle) - transformed_df_pickle = binarizer_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # joblib - binarizer_load_joblib = joblib.load(filepath) - transformed_df_joblib = binarizer_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - - # sklearn - binarizer_sklearn = SklearnBinarizer(threshold=threshold) - binarizer_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = binarizer_sklearn.transform(df_pandas[input_cols]) - - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + binarizer_dump_cloudpickle = cloudpickle.dumps(binarizer) + binarizer_dump_pickle = pickle.dumps(binarizer) + joblib.dump(binarizer, file.name) + + self._session.close() + + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) + + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.binarizer"]) + + # cloudpickle + binarizer_load_cloudpickle = 
cloudpickle.loads(binarizer_dump_cloudpickle) + transformed_df_cloudpickle = binarizer_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # pickle + binarizer_load_pickle = pickle.loads(binarizer_dump_pickle) + transformed_df_pickle = binarizer_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # joblib + binarizer_load_joblib = joblib.load(file.name) + transformed_df_joblib = binarizer_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + + # sklearn + binarizer_sklearn = SklearnBinarizer(threshold=threshold) + binarizer_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = binarizer_sklearn.transform(df_pandas[input_cols]) + + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py index cb439855..530936a8 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py @@ -197,45 +197,45 @@ def test_serde(self) -> None: label_encoder = LabelEncoder().set_input_cols(input_cols).set_output_cols(output_cols) label_encoder.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_label_encoder.pkl") - self._to_be_deleted_files.append(filepath) - label_encoder_dump_cloudpickle = cloudpickle.dumps(label_encoder) - label_encoder_dump_pickle = pickle.dumps(label_encoder) - joblib.dump(label_encoder, filepath) - - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) - - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.label_encoder"]) - - # cloudpickle - label_encoder_load_cloudpickle = cloudpickle.loads(label_encoder_dump_cloudpickle) - transformed_df_cloudpickle = label_encoder_load_cloudpickle.transform(df2) - actual_arr_cloudpickle = transformed_df_cloudpickle[output_cols].to_pandas().to_numpy().flatten() - - # pickle - label_encoder_load_pickle = pickle.loads(label_encoder_dump_pickle) - transformed_df_pickle = label_encoder_load_pickle.transform(df2) - actual_arr_pickle = transformed_df_pickle[output_cols].to_pandas().to_numpy().flatten() - - # joblib - label_encoder_load_joblib = joblib.load(filepath) - transformed_df_joblib = label_encoder_load_joblib.transform(df2) - actual_arr_joblib = transformed_df_joblib[output_cols].to_pandas().to_numpy().flatten() - - # sklearn - label_encoder_sklearn = SklearnLabelEncoder() - label_encoder_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = label_encoder_sklearn.transform(df_pandas[input_cols]) - - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + 
label_encoder_dump_cloudpickle = cloudpickle.dumps(label_encoder) + label_encoder_dump_pickle = pickle.dumps(label_encoder) + joblib.dump(label_encoder, file.name) + + self._session.close() + + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) + + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.label_encoder"]) + + # cloudpickle + label_encoder_load_cloudpickle = cloudpickle.loads(label_encoder_dump_cloudpickle) + transformed_df_cloudpickle = label_encoder_load_cloudpickle.transform(df2) + actual_arr_cloudpickle = transformed_df_cloudpickle[output_cols].to_pandas().to_numpy().flatten() + + # pickle + label_encoder_load_pickle = pickle.loads(label_encoder_dump_pickle) + transformed_df_pickle = label_encoder_load_pickle.transform(df2) + actual_arr_pickle = transformed_df_pickle[output_cols].to_pandas().to_numpy().flatten() + + # joblib + label_encoder_load_joblib = joblib.load(file.name) + transformed_df_joblib = label_encoder_load_joblib.transform(df2) + actual_arr_joblib = transformed_df_joblib[output_cols].to_pandas().to_numpy().flatten() + + # sklearn + label_encoder_sklearn = SklearnLabelEncoder() + label_encoder_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = label_encoder_sklearn.transform(df_pandas[input_cols]) + + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py index 98ea4c40..d0454758 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py @@ -140,45 +140,45 @@ def test_serde(self) -> None: scaler = MaxAbsScaler().set_input_cols(input_cols).set_output_cols(output_cols) scaler.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_max_abs_scaler.pkl") - self._to_be_deleted_files.append(filepath) - scaler_dump_cloudpickle = cloudpickle.dumps(scaler) - scaler_dump_pickle = pickle.dumps(scaler) - joblib.dump(scaler, filepath) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + scaler_dump_cloudpickle = cloudpickle.dumps(scaler) + scaler_dump_pickle = pickle.dumps(scaler) + joblib.dump(scaler, file.name) - self._session.close() + self._session.close() - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.max_abs_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.max_abs_scaler"]) - # cloudpickle - scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) - transformed_df_cloudpickle = 
scaler_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # cloudpickle + scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) + transformed_df_cloudpickle = scaler_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # pickle - scaler_load_pickle = pickle.loads(scaler_dump_pickle) - transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # pickle + scaler_load_pickle = pickle.loads(scaler_dump_pickle) + transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # joblib - scaler_load_joblib = joblib.load(filepath) - transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + # joblib + scaler_load_joblib = joblib.load(file.name) + transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - # sklearn - scaler_sklearn = SklearnMaxAbsScaler() - scaler_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) + # sklearn + scaler_sklearn = SklearnMaxAbsScaler() + scaler_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py index cb753fa1..58778a8d 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py @@ -340,46 +340,45 @@ def test_serde(self) -> None: scaler = MinMaxScaler().set_input_cols(input_cols).set_output_cols(output_cols) scaler.fit(df1) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + scaler_dump_cloudpickle = cloudpickle.dumps(scaler) + scaler_dump_pickle = pickle.dumps(scaler) + joblib.dump(scaler, file.name) - filepath = os.path.join(tempfile.gettempdir(), "test_min_max_scaler.pkl") - self._to_be_deleted_files.append(filepath) - scaler_dump_cloudpickle = cloudpickle.dumps(scaler) - scaler_dump_pickle = pickle.dumps(scaler) - joblib.dump(scaler, filepath) + self._session.close() - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = 
framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.min_max_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.min_max_scaler"]) - # cloudpickle - scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) - transformed_df_cloudpickle = scaler_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # cloudpickle + scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) + transformed_df_cloudpickle = scaler_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # pickle - scaler_load_pickle = pickle.loads(scaler_dump_pickle) - transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # pickle + scaler_load_pickle = pickle.loads(scaler_dump_pickle) + transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # joblib - scaler_load_joblib = joblib.load(filepath) - transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + # joblib + scaler_load_joblib = joblib.load(file.name) + transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - # sklearn - scaler_sklearn = SklearnMinMaxScaler() - scaler_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) + # sklearn + scaler_sklearn = SklearnMinMaxScaler() + scaler_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py index 0101f374..26671344 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py @@ -183,45 +183,45 @@ def test_serde(self) -> None: normalizer = Normalizer().set_input_cols(input_cols).set_output_cols(output_cols) normalizer.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_standard_normalizer.pkl") - self._to_be_deleted_files.append(filepath) - normalizer_dump_cloudpickle = cloudpickle.dumps(normalizer) - normalizer_dump_pickle = pickle.dumps(normalizer) - joblib.dump(normalizer, filepath) - - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - 
input_cols_extended.append(id_col) - - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.normalizer"]) - - # cloudpickle - normalizer_load_cloudpickle = cloudpickle.loads(normalizer_dump_cloudpickle) - transformed_df_cloudpickle = normalizer_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # pickle - normalizer_load_pickle = pickle.loads(normalizer_dump_pickle) - transformed_df_pickle = normalizer_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # joblib - normalizer_load_joblib = joblib.load(filepath) - transformed_df_joblib = normalizer_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - - # sklearn - normalizer_sklearn = SklearnNormalizer() - normalizer_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = normalizer_sklearn.transform(df_pandas[input_cols]) - - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + normalizer_dump_cloudpickle = cloudpickle.dumps(normalizer) + normalizer_dump_pickle = pickle.dumps(normalizer) + joblib.dump(normalizer, file.name) + + self._session.close() + + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) + + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.normalizer"]) + + # cloudpickle + normalizer_load_cloudpickle = cloudpickle.loads(normalizer_dump_cloudpickle) + transformed_df_cloudpickle = normalizer_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # pickle + normalizer_load_pickle = pickle.loads(normalizer_dump_pickle) + transformed_df_pickle = normalizer_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # joblib + normalizer_load_joblib = joblib.load(file.name) + transformed_df_joblib = normalizer_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + + # sklearn + normalizer_sklearn = SklearnNormalizer() + normalizer_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = normalizer_sklearn.transform(df_pandas[input_cols]) + + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py index ebe86086..d3bab77f 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py @@ -1553,51 +1553,53 @@ def test_serde(self) -> None: sparse = False encoder = 
OneHotEncoder(sparse=sparse).set_input_cols(input_cols).set_output_cols(output_cols) encoder.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_one_hot_encoder.pkl") - self._to_be_deleted_files.append(filepath) - encoder_dump_cloudpickle = cloudpickle.dumps(encoder) - encoder_dump_pickle = pickle.dumps(encoder) - joblib.dump(encoder, filepath) - - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) - - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.one_hot_encoder"]) - - # cloudpickle - encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) - transformed_df_cloudpickle = encoder_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = ( - transformed_df_cloudpickle.sort(id_col)[encoder_load_cloudpickle.get_output_cols()].to_pandas().to_numpy() - ) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + encoder_dump_cloudpickle = cloudpickle.dumps(encoder) + encoder_dump_pickle = pickle.dumps(encoder) + joblib.dump(encoder, file.name) + + self._session.close() + + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) + + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.one_hot_encoder"]) + + # cloudpickle + encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) + transformed_df_cloudpickle = encoder_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = ( + transformed_df_cloudpickle.sort(id_col)[encoder_load_cloudpickle.get_output_cols()] + .to_pandas() + .to_numpy() + ) - # pickle - encoder_load_pickle = pickle.loads(encoder_dump_pickle) - transformed_df_pickle = encoder_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = ( - transformed_df_pickle.sort(id_col)[encoder_load_pickle.get_output_cols()].to_pandas().to_numpy() - ) + # pickle + encoder_load_pickle = pickle.loads(encoder_dump_pickle) + transformed_df_pickle = encoder_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = ( + transformed_df_pickle.sort(id_col)[encoder_load_pickle.get_output_cols()].to_pandas().to_numpy() + ) - # joblib - encoder_load_joblib = joblib.load(filepath) - transformed_df_joblib = encoder_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = ( - transformed_df_joblib.sort(id_col)[encoder_load_joblib.get_output_cols()].to_pandas().to_numpy() - ) + # joblib + encoder_load_joblib = joblib.load(file.name) + transformed_df_joblib = encoder_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = ( + transformed_df_joblib.sort(id_col)[encoder_load_joblib.get_output_cols()].to_pandas().to_numpy() + ) - # sklearn - encoder_sklearn = SklearnOneHotEncoder(sparse=sparse) - encoder_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = encoder_sklearn.transform(df_pandas[input_cols]) + # sklearn + encoder_sklearn = SklearnOneHotEncoder(sparse=sparse) + encoder_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = encoder_sklearn.transform(df_pandas[input_cols]) - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - 
np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) def test_drop_input_cols(self) -> None: df_pandas, df = framework_utils.get_df(self._session, DATA, SCHEMA, np.nan) diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py index 28d40a19..89b03b5f 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py @@ -797,45 +797,45 @@ def test_serde(self) -> None: encoder = OrdinalEncoder().set_input_cols(input_cols).set_output_cols(output_cols) encoder.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_ordinal_encoder.pkl") - self._to_be_deleted_files.append(filepath) - encoder_dump_cloudpickle = cloudpickle.dumps(encoder) - encoder_dump_pickle = pickle.dumps(encoder) - joblib.dump(encoder, filepath) - - self._session.close() - - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) - - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.ordinal_encoder"]) - - # cloudpickle - encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) - transformed_df_cloudpickle = encoder_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # pickle - encoder_load_pickle = pickle.loads(encoder_dump_pickle) - transformed_df_pickle = encoder_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - - # joblib - encoder_load_joblib = joblib.load(filepath) - transformed_df_joblib = encoder_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - - # sklearn - encoder_sklearn = SklearnOrdinalEncoder() - encoder_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = encoder_sklearn.transform(df_pandas[input_cols]) - - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr, equal_nan=True) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr, equal_nan=True) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr, equal_nan=True) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + encoder_dump_cloudpickle = cloudpickle.dumps(encoder) + encoder_dump_pickle = pickle.dumps(encoder) + joblib.dump(encoder, file.name) + + self._session.close() + + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) + + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.ordinal_encoder"]) + + # cloudpickle + encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) + transformed_df_cloudpickle = encoder_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle 
= transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # pickle + encoder_load_pickle = pickle.loads(encoder_dump_pickle) + transformed_df_pickle = encoder_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + + # joblib + encoder_load_joblib = joblib.load(file.name) + transformed_df_joblib = encoder_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + + # sklearn + encoder_sklearn = SklearnOrdinalEncoder() + encoder_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = encoder_sklearn.transform(df_pandas[input_cols]) + + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr, equal_nan=True) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr, equal_nan=True) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr, equal_nan=True) def test_same_input_output_cols(self) -> None: """ diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py index 893a961e..15c023e8 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py @@ -259,45 +259,45 @@ def test_serde(self) -> None: scaler = RobustScaler().set_input_cols(input_cols).set_output_cols(output_cols) scaler.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_robust_scaler.pkl") - self._to_be_deleted_files.append(filepath) - scaler_dump_cloudpickle = cloudpickle.dumps(scaler) - scaler_dump_pickle = pickle.dumps(scaler) - joblib.dump(scaler, filepath) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + scaler_dump_cloudpickle = cloudpickle.dumps(scaler) + scaler_dump_pickle = pickle.dumps(scaler) + joblib.dump(scaler, file.name) - self._session.close() + self._session.close() - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.robust_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.robust_scaler"]) - # cloudpickle - scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) - transformed_df_cloudpickle = scaler_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # cloudpickle + scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) + transformed_df_cloudpickle = scaler_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # pickle - scaler_load_pickle = pickle.loads(scaler_dump_pickle) - transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # pickle + 
scaler_load_pickle = pickle.loads(scaler_dump_pickle) + transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # joblib - scaler_load_joblib = joblib.load(filepath) - transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + # joblib + scaler_load_joblib = joblib.load(file.name) + transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - # sklearn - scaler_sklearn = SklearnRobustScaler() - scaler_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) + # sklearn + scaler_sklearn = SklearnRobustScaler() + scaler_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py index 0f13ad60..8b3440dd 100644 --- a/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py @@ -373,45 +373,45 @@ def test_serde(self) -> None: scaler = StandardScaler().set_input_cols(input_cols).set_output_cols(output_cols) scaler.fit(df1) - filepath = os.path.join(tempfile.gettempdir(), "test_standard_scaler.pkl") - self._to_be_deleted_files.append(filepath) - scaler_dump_cloudpickle = cloudpickle.dumps(scaler) - scaler_dump_pickle = pickle.dumps(scaler) - joblib.dump(scaler, filepath) + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file: + self._to_be_deleted_files.append(file.name) + scaler_dump_cloudpickle = cloudpickle.dumps(scaler) + scaler_dump_pickle = pickle.dumps(scaler) + joblib.dump(scaler, file.name) - self._session.close() + self._session.close() - # transform in session 2 - self._session = Session.builder.configs(SnowflakeLoginOptions()).create() - _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) - input_cols_extended = input_cols.copy() - input_cols_extended.append(id_col) + # transform in session 2 + self._session = Session.builder.configs(SnowflakeLoginOptions()).create() + _, df2 = framework_utils.get_df(self._session, data, schema, np.nan) + input_cols_extended = input_cols.copy() + input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.standard_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.standard_scaler"]) - # cloudpickle - scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) - transformed_df_cloudpickle = scaler_load_cloudpickle.transform(df2[input_cols_extended]) - actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # cloudpickle + scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) + transformed_df_cloudpickle = 
scaler_load_cloudpickle.transform(df2[input_cols_extended]) + actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # pickle - scaler_load_pickle = pickle.loads(scaler_dump_pickle) - transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) - actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() + # pickle + scaler_load_pickle = pickle.loads(scaler_dump_pickle) + transformed_df_pickle = scaler_load_pickle.transform(df2[input_cols_extended]) + actual_arr_pickle = transformed_df_pickle.sort(id_col)[output_cols].to_pandas().to_numpy() - # joblib - scaler_load_joblib = joblib.load(filepath) - transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) - actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() + # joblib + scaler_load_joblib = joblib.load(file.name) + transformed_df_joblib = scaler_load_joblib.transform(df2[input_cols_extended]) + actual_arr_joblib = transformed_df_joblib.sort(id_col)[output_cols].to_pandas().to_numpy() - # sklearn - scaler_sklearn = SklearnStandardScaler() - scaler_sklearn.fit(df_pandas[input_cols]) - sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) + # sklearn + scaler_sklearn = SklearnStandardScaler() + scaler_sklearn.fit(df_pandas[input_cols]) + sklearn_arr = scaler_sklearn.transform(df_pandas[input_cols]) - np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) - np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) + np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_pickle, sklearn_arr) + np.testing.assert_allclose(actual_arr_joblib, sklearn_arr) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/registry/BUILD.bazel b/tests/integ/snowflake/ml/registry/BUILD.bazel index 843269f9..9082421f 100644 --- a/tests/integ/snowflake/ml/registry/BUILD.bazel +++ b/tests/integ/snowflake/ml/registry/BUILD.bazel @@ -6,6 +6,7 @@ py_test( deps = [ "//tests/integ/snowflake/ml/test_utils:db_manager", "//snowflake/ml/registry:model_registry", + "//snowflake/ml/registry:_ml_artifact", "//snowflake/ml/utils:connection_params", ], ) diff --git a/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py index a4c122c9..6fa23b26 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_basic_integ_test.py @@ -1,13 +1,13 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# - +import json import uuid from typing import Optional from absl.testing import absltest, parameterized -from snowflake.ml.registry import _schema, model_registry +from snowflake.ml.registry import _ml_artifact, _schema, model_registry from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.test_utils import db_manager @@ -178,11 +178,102 @@ def test_add_new_registry_table_column_without_allowlist(self) -> None: self._db_manager.drop_database(broken_registry) raise Exception(f"Test failed with exception:{e}") - _schema._REGISTRY_TABLE_SCHEMA["new_column"] = "VARCHAR" - with self.assertRaisesRegex(TypeError, "Registry table:.* doesn't have required column:.*"): - model_registry.ModelRegistry(session=self._session, database_name=broken_registry) + try: + _schema._REGISTRY_TABLE_SCHEMA.append(("new_column", "VARCHAR")) + with self.assertRaisesRegex(TypeError, "Registry table:.* doesn't have required column:.*"): + model_registry.ModelRegistry(session=self._session, database_name=broken_registry) + finally: + _schema._REGISTRY_TABLE_SCHEMA.pop() + self._db_manager.drop_database(broken_registry) + + def test_add_and_delete_ml_artifacts(self) -> None: + """Test that add_artifact() and delete_artifact() in `_ml_artifact.py` work as expected.""" + + artifact_registry = db_manager.TestObjectNameGenerator.get_snowml_test_object_name( + _RUN_ID, "artifact_registry" + ).upper() + artifact_registry_schema = "PUBLIC" + + try: + model_registry.create_model_registry( + session=self._session, database_name=artifact_registry, schema_name=artifact_registry_schema + ) + except Exception as e: + self._db_manager.drop_database(artifact_registry) + raise Exception(f"Test failed with exception: {e}") + + artifact_id = "123" + artifact_type = _ml_artifact.ArtifactType.TESTTYPE + artifact_name = "test_artifact" + artifact_version = "test_artifact_version" + artifact_spec = {"test_property": "test_value"} - _schema._REGISTRY_TABLE_SCHEMA.pop("new_column") + try: + self.assertTrue( + _ml_artifact.if_artifact_table_exists(self._session, artifact_registry, artifact_registry_schema) + ) + + # Validate `add_artifact()` can insert an entry into the artifact table + self.assertFalse( + _ml_artifact.if_artifact_exists( + self._session, + artifact_registry, + artifact_registry_schema, + artifact_id=artifact_id, + artifact_type=artifact_type, + ) + ) + _ml_artifact.add_artifact( + self._session, + artifact_registry, + artifact_registry_schema, + artifact_id=artifact_id, + artifact_type=artifact_type, + artifact_name=artifact_name, + artifact_version=artifact_version, + artifact_spec=artifact_spec, + ) + self.assertTrue( + _ml_artifact.if_artifact_exists( + self._session, + artifact_registry, + artifact_registry_schema, + artifact_id=artifact_id, + artifact_type=artifact_type, + ) + ) + + # Validate the artifact_spec can be parsed as expected + artifact_df = _ml_artifact._get_artifact( + self._session, + artifact_registry, + artifact_registry_schema, + artifact_id=artifact_id, + artifact_type=artifact_type, + ) + actual_artifact_spec_str = artifact_df.collect()[0]["ARTIFACT_SPEC"] + actual_artifact_spec_dict = json.loads(actual_artifact_spec_str) + self.assertDictEqual(artifact_spec, actual_artifact_spec_dict) + + # Validate that `delete_artifact` can remove entries from the artifact table. 
+ _ml_artifact.delete_artifact( + self._session, + artifact_registry, + artifact_registry_schema, + artifact_id=artifact_id, + artifact_type=artifact_type, + ) + self.assertFalse( + _ml_artifact.if_artifact_exists( + self._session, + artifact_registry, + artifact_registry_schema, + artifact_id=artifact_id, + artifact_type=artifact_type, + ) + ) + finally: + self._db_manager.drop_database(artifact_registry, if_exists=True) if __name__ == "__main__": diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py index 64325431..a0402f5d 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_integ_test.py +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test.py @@ -12,7 +12,9 @@ from sklearn import metrics from snowflake import connector -from snowflake.ml.registry import model_registry +from snowflake.ml._internal.utils import identifier +from snowflake.ml.registry import _ml_artifact, model_registry +from snowflake.ml.training_dataset import training_dataset from snowflake.ml.utils import connection_params from snowflake.snowpark import Session from tests.integ.snowflake.ml.test_utils import ( @@ -61,7 +63,7 @@ def test_basic_workflow(self) -> None: registry=registry, model_name=model_name, model_version=model_version ) - model_id = registry.log_model( + model_ref = registry.log_model( model_name=model_name, model_version=model_version, model=model, @@ -154,7 +156,7 @@ def test_basic_workflow(self) -> None: # Test list models model_list = registry.list_models().to_pandas() - filtered_model_list = model_list.loc[model_list["ID"] == model_id].reset_index(drop=True) + filtered_model_list = model_list.loc[model_list["ID"] == model_ref._id].reset_index(drop=True) self.assertEqual(filtered_model_list.shape[0], 1) self.assertEqual(filtered_model_list["NAME"][0], second=model_name) @@ -263,7 +265,7 @@ def test_basic_workflow(self) -> None: registry.delete_model(model_name=model_name, model_version=model_version, delete_artifact=True) model_list = registry.list_models().to_pandas() - filtered_model_list = model_list.loc[model_list["ID"] == model_id].reset_index(drop=True) + filtered_model_list = model_list.loc[model_list["ID"] == model_ref._id].reset_index(drop=True) self.assertEqual(filtered_model_list.shape[0], 0) @pytest.mark.pip_incompatible @@ -272,7 +274,7 @@ def test_snowml_model(self) -> None: model_name = "snowml_xgb_classifier" model_version = self.run_id - model, test_features = model_factory.ModelFactory.prepare_snowml_model_xgb() + model, test_features, _ = model_factory.ModelFactory.prepare_snowml_model_xgb() local_prediction = model.predict(test_features) local_prediction_proba = model.predict_proba(test_features) @@ -348,12 +350,90 @@ def test_snowml_pipeline(self) -> None: target_method="predict", permanent=False, ) - remote_prediction_temp = model_ref.predict(temp_predict_deployment_name, test_features) + remote_prediction_temp = model_ref.predict(temp_predict_deployment_name, test_features.to_pandas()) # TODO: Remove .astype(dtype={"OUTPUT_TARGET": np.float64} after SNOW-853638 gets fixed. 
pd.testing.assert_frame_equal( - remote_prediction_temp.to_pandas(), local_prediction.to_pandas().astype(dtype={"OUTPUT_TARGET": np.float64}) + remote_prediction_temp, + local_prediction.to_pandas().astype(dtype={"OUTPUT_TARGET": np.float64}), + ) + + @pytest.mark.pip_incompatible + def test_log_model_with_training_dataset(self) -> None: + registry = model_registry.ModelRegistry(session=self._session, database_name=self.registry_name) + + model_name = "snowml_test_training_dataset" + model_version = self.run_id + model, test_features, training_data_df = model_factory.ModelFactory.prepare_snowml_model_xgb() + + database_name = identifier.get_unescaped_names(self._session.get_current_database()) + schema_name = identifier.get_unescaped_names(self._session.get_current_schema()) + dummy_materialized_table_full_path = f"{database_name}.{schema_name}.dummy_materialized_table" + dummy_snapshot_table_full_path = f"{dummy_materialized_table_full_path}_SNAPSHOT" + self._session.create_dataframe(training_data_df).write.mode("overwrite").save_as_table( + f"{dummy_snapshot_table_full_path}" + ) + + spine_query = f"SELECT * FROM {dummy_materialized_table_full_path}" + + fs_metadata = training_dataset.FeatureStoreMetadata( + spine_query=spine_query, + connection_params={ + "database": "test_db", + "schema": "test_schema", + "default_warehouse": "test_warehouse", + }, + features=[], + ) + dummy_training_dataset = training_dataset.TrainingDataset( + df=self._session.sql(spine_query), + materialized_table=dummy_materialized_table_full_path, + snapshot_table=dummy_snapshot_table_full_path, + timestamp_col="ts", + label_cols=["TARGET"], + feature_store_metadata=fs_metadata, + desc="a dummy training dataset metadata", + ) + + with self.assertRaisesRegex( + ValueError, + "Only one of sample_input_data and training_dataset should be provided.", + ): + registry.log_model( + model_name=model_name, + model_version=model_version, + model=model, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], + sample_input_data=test_features, + training_dataset=dummy_training_dataset, + options={"embed_local_ml_library": True}, + ) + + registry.log_model( + model_name=model_name, + model_version=model_version, + model=model, + conda_dependencies=[ + test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") + ], + options={"embed_local_ml_library": True}, + training_dataset=dummy_training_dataset, ) + # test deserialized training dataset from get_training_dataset + des_ds_0 = registry.get_training_dataset(model_name, model_version) + self.assertIsNotNone(des_ds_0) + self.assertEqual(des_ds_0, dummy_training_dataset) + + # test deserialized training dataset from list_artifacts + rows_list = registry.list_artifacts(model_name, model_version).collect() + self.assertEqual(len(rows_list), 1) + self.assertEqual(rows_list[0]["ID"], dummy_training_dataset.id()) + self.assertEqual(_ml_artifact.ArtifactType[rows_list[0]["TYPE"]], _ml_artifact.ArtifactType.TRAINING_DATASET) + des_ds_1 = training_dataset.TrainingDataset.from_json(rows_list[0]["ARTIFACT_SPEC"], self._session) + self.assertEqual(des_ds_1, dummy_training_dataset) + if __name__ == "__main__": absltest.main() diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py index e0c52155..4bff76bd 100644 --- 
a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_base.py @@ -69,16 +69,20 @@ def _test_snowservice_deployment( self, model_name: str, model_version: str, - prepare_model_and_feature_fn: Callable[[], Tuple[Any, Any]], + prepare_model_and_feature_fn: Callable[[], Tuple[Any, Any, Any]], deployment_options: Dict[str, Any], prediction_assert_fn: Callable[[Any, Union[pd.DataFrame, SnowparkDataFrame]], Any], pip_requirements: Optional[List[str]] = None, conda_dependencies: Optional[List[str]] = None, embed_local_ml_library: Optional[bool] = True, - ): + omit_target_method_when_deploy: bool = False, + ) -> None: model, test_features, *_ = prepare_model_and_feature_fn() - target_method = deployment_options["target_method"] + if omit_target_method_when_deploy: + target_method = deployment_options.pop("target_method") + else: + target_method = deployment_options["target_method"] if hasattr(model, "predict_with_device"): local_prediction = model.predict_with_device(test_features, model_factory.DEVICE.CPU) @@ -89,9 +93,7 @@ def _test_snowservice_deployment( # Instead we rely on snowpark version on information.schema table. Note that this will not affect end user # as by the time they use it, the latest snowpark should be available in conda already. conda_dependencies = conda_dependencies or [] - conda_dependencies.append( - test_env_utils.get_latest_package_versions_in_server(self._session, "snowflake-snowpark-python") - ) + conda_dependencies.append(test_env_utils.get_latest_package_versions_in_conda("snowflake-snowpark-python")) self.registry.log_model( model_name=model_name, @@ -109,7 +111,7 @@ def _test_snowservice_deployment( deployment_name = f"{model_name}_{model_version}_deployment" deployment_options["deployment_name"] = deployment_name - model_ref.deploy(**deployment_options) + model_ref.deploy(**deployment_options) # type: ignore[attr-defined] remote_prediction = model_ref.predict(deployment_name, test_features) prediction_assert_fn(local_prediction, remote_prediction) diff --git a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py index 3f29f0c9..c2c46e36 100644 --- a/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py +++ b/tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py @@ -31,8 +31,10 @@ def test_snowml_model_deployment_xgboost(self) -> None: "options": { "compute_pool": self._TEST_CPU_COMPUTE_POOL, "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO), + "enable_remote_image_build": True, }, }, + omit_target_method_when_deploy=True, ) diff --git a/tests/integ/snowflake/ml/test_utils/model_factory.py b/tests/integ/snowflake/ml/test_utils/model_factory.py index 159fa982..90293a85 100644 --- a/tests/integ/snowflake/ml/test_utils/model_factory.py +++ b/tests/integ/snowflake/ml/test_utils/model_factory.py @@ -2,7 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# from enum import Enum -from typing import List, Tuple +from typing import List, Tuple, cast import numpy as np import numpy.typing as npt @@ -54,7 +54,14 @@ def one_vs_all(dataset: npt.NDArray[np.float64], digit: int) -> List[bool]: return clf, test_features, test_labels @staticmethod - def prepare_snowml_model_xgb() -> Tuple[XGBClassifier, pd.DataFrame]: + def prepare_snowml_model_xgb() -> Tuple[XGBClassifier, pd.DataFrame, pd.DataFrame]: + """Prepare a SnowML XGBClassifier model. + + Returns: + an XGB classifier. + a dataframe of test features. + a dataframe of the training dataset. + """ iris = datasets.load_iris() df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] @@ -69,7 +76,7 @@ def prepare_snowml_model_xgb() -> Tuple[XGBClassifier, pd.DataFrame]: clf_xgb.fit(df) - return clf_xgb, df.drop(columns=label_cols).head(10) + return (clf_xgb, df.drop(columns=label_cols).head(10), df) @staticmethod def prepare_snowml_pipeline(session: Session) -> Tuple[Pipeline, DataFrame]: @@ -174,7 +181,7 @@ def predict(self, input_df: pd.DataFrame) -> pd.DataFrame: @staticmethod def prepare_torch_model( dtype: torch.dtype = torch.float32, force_remote_gpu_inference: bool = False - ) -> Tuple[torch.nn.Module, List[torch.Tensor], List[torch.Tensor]]: + ) -> Tuple[torch.nn.Module, torch.Tensor, torch.Tensor]: class TorchModel(torch.nn.Module): def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: super().__init__() @@ -185,39 +192,40 @@ def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.nn.Sigmoid(), ) - def forward_training(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: - return [self.model(tensors[0])] + def forward_training(self, tensor: torch.Tensor) -> torch.Tensor: + return cast(torch.Tensor, self.model(tensor)) - def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: + def forward(self, tensor: torch.Tensor) -> torch.Tensor: device = DEVICE.CUDA if force_remote_gpu_inference else DEVICE.CPU - return self.predict_with_device(tensors, device) + return self.predict_with_device(tensor, device) - def predict_with_device(self, tensors: List[torch.Tensor], device: DEVICE) -> List[torch.Tensor]: + def predict_with_device(self, tensor: torch.Tensor, device: DEVICE) -> torch.Tensor: self.model.eval() self.model.to(device.value) with torch.no_grad(): - tensors = [tensor.to(device.value) for tensor in tensors] - return [self.model(tensors[0])] + tensor = tensor.to(device.value) + return cast(torch.Tensor, self.model(tensor)) n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 x = np.random.rand(batch_size, n_input) - data_x = [torch.from_numpy(x).to(dtype=dtype)] - data_y = [(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype)] + data_x = torch.from_numpy(x).to(dtype=dtype) + data_y = (torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype) model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) loss_function = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) for _epoch in range(100): pred_y = model.forward_training(data_x) - loss = loss_function(pred_y[0], data_y[0]) + loss = loss_function(pred_y, data_y) optimizer.zero_grad() loss.backward() optimizer.step() return model, data_x, data_y + @staticmethod def prepare_jittable_torch_model( dtype: torch.dtype = torch.float32, force_remote_gpu_inference: bool 
= False - ) -> Tuple[torch.nn.Module, List[torch.Tensor], List[torch.Tensor]]: + ) -> Tuple[torch.nn.Module, torch.Tensor, torch.Tensor]: class TorchModel(torch.nn.Module): def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.float32) -> None: super().__init__() @@ -228,20 +236,20 @@ def __init__(self, n_input: int, n_hidden: int, n_out: int, dtype: torch.dtype = torch.nn.Sigmoid(), ) - def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: - return [self.model(tensors[0])] + def forward(self, tensor: torch.Tensor) -> torch.Tensor: + return self.model(tensor) # type: ignore[no-any-return] n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 x = np.random.rand(batch_size, n_input) - data_x = [torch.from_numpy(x).to(dtype=dtype)] - data_y = [(torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype)] + data_x = torch.from_numpy(x).to(dtype=dtype) + data_y = (torch.rand(size=(batch_size, 1)) < 0.5).to(dtype=dtype) model = TorchModel(n_input, n_hidden, n_out, dtype=dtype) loss_function = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) for _epoch in range(100): pred_y = model(data_x) - loss = loss_function(pred_y[0], data_y[0]) + loss = loss_function(pred_y, data_y) optimizer.zero_grad() loss.backward() optimizer.step() @@ -250,30 +258,29 @@ def forward(self, tensors: List[torch.Tensor]) -> List[torch.Tensor]: @staticmethod def prepare_keras_model( dtype: tf.dtypes.DType = tf.float32, - ) -> Tuple[tf.keras.Model, List[tf.Tensor], List[tf.Tensor]]: + ) -> Tuple[tf.keras.Model, tf.Tensor, tf.Tensor]: class KerasModel(tf.keras.Model): def __init__(self, n_hidden: int, n_out: int) -> None: super().__init__() self.fc_1 = tf.keras.layers.Dense(n_hidden, activation="relu") self.fc_2 = tf.keras.layers.Dense(n_out, activation="sigmoid") - def call(self, tensors: List[tf.Tensor]) -> List[tf.Tensor]: - input = tensors[0] + def call(self, tensor: tf.Tensor) -> tf.Tensor: + input = tensor x = self.fc_1(input) x = self.fc_2(x) - return [x] + return x n_input, n_hidden, n_out, batch_size, learning_rate = 10, 15, 1, 100, 0.01 x = np.random.rand(batch_size, n_input) - data_x = [tf.convert_to_tensor(x, dtype=dtype)] + data_x = tf.convert_to_tensor(x, dtype=dtype) raw_data_y = tf.random.uniform((batch_size, 1)) raw_data_y = tf.where(raw_data_y > 0.5, tf.ones_like(raw_data_y), tf.zeros_like(raw_data_y)) - data_y = [tf.cast(raw_data_y, dtype=dtype)] - - def loss_fn(y_true: List[tf.Tensor], y_pred: List[tf.Tensor]) -> tf.Tensor: - return tf.keras.losses.mse(y_true[0], y_pred[0]) + data_y = tf.cast(raw_data_y, dtype=dtype) model = KerasModel(n_hidden, n_out) - model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=loss_fn) + model.compile( + optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate), loss=tf.keras.losses.MeanSquaredError() + ) model.fit(data_x, data_y, batch_size=batch_size, epochs=100) return model, data_x, data_y diff --git a/tests/integ/snowflake/ml/test_utils/test_env_utils.py b/tests/integ/snowflake/ml/test_utils/test_env_utils.py index 4b846027..fafe1a44 100644 --- a/tests/integ/snowflake/ml/test_utils/test_env_utils.py +++ b/tests/integ/snowflake/ml/test_utils/test_env_utils.py @@ -6,6 +6,7 @@ import importlib import textwrap +import requests from packaging import version import snowflake.connector @@ -57,3 +58,36 @@ def get_latest_package_versions_in_server( if len(version_list) == 0: return package_name return f"{package_name}=={max(version_list)}" + + 
+@functools.lru_cache +def get_latest_package_versions_in_conda(package_name: str, python_version: str = env.PYTHON_VERSION) -> str: + repodata_url = "https://repo.anaconda.com/pkgs/snowflake/linux-64/repodata.json" + + parsed_python_version = version.Version(python_version) + python_version_build_str = f"py{parsed_python_version.major}{parsed_python_version.minor}" + + max_retry = 3 + + exc_list = [] + + while max_retry > 0: + try: + version_list = [] + repodata = requests.get(repodata_url).json() + assert isinstance(repodata, dict) + packages_info = repodata["packages"] + assert isinstance(packages_info, dict) + for package_info in packages_info.values(): + if package_info["name"] == package_name and python_version_build_str in package_info["build"]: + version_list.append(version.parse(package_info["version"])) + return f"{package_name}=={max(version_list)}" + except Exception as e: + max_retry -= 1 + exc_list.append(e) + + raise RuntimeError( + f"Failed to get latest version of package {package_name} in Snowflake Anaconda Channel. " + + "Exceptions are " + + ", ".join(map(str, exc_list)) + )
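The serde tests above all converge on one pattern: a hand-built path under tempfile.gettempdir() is replaced by tempfile.NamedTemporaryFile(suffix=".pkl", delete=False), and the fitted transformer is round-tripped through cloudpickle, pickle, and joblib before comparing against the sklearn baseline. Below is a minimal, self-contained sketch of that round-trip; it assumes a plain scikit-learn StandardScaler and a small in-memory array in place of the Snowpark-backed transformers and session-bound DataFrames used by the real tests, so treat it as an illustration of the tempfile handling rather than the actual fixtures.

import os
import pickle
import tempfile

import cloudpickle
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler

# Stand-in transformer and data (the real tests fit Snowpark-backed
# transformers against a live Snowflake session instead).
data = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
scaler = StandardScaler().fit(data)
expected = scaler.transform(data)

# delete=False keeps the file on disk when the context manager exits, so it
# can be reopened by name (joblib.load) and removed explicitly later, which
# is why the tests register file.name in self._to_be_deleted_files.
with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file:
    dump_cloudpickle = cloudpickle.dumps(scaler)
    dump_pickle = pickle.dumps(scaler)
    joblib.dump(scaler, file.name)

    # Round-trip through each serializer and check that the reloaded
    # transformer reproduces the fitted transform.
    np.testing.assert_allclose(cloudpickle.loads(dump_cloudpickle).transform(data), expected)
    np.testing.assert_allclose(pickle.loads(dump_pickle).transform(data), expected)
    np.testing.assert_allclose(joblib.load(file.name).transform(data), expected)

os.unlink(file.name)  # manual cleanup, needed because delete=False

The switch to NamedTemporaryFile also removes the fixed per-test filenames (e.g. "test_label_encoder.pkl"), so concurrent test runs no longer race on the same path in the shared temp directory.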