2 changes: 1 addition & 1 deletion ci/conda_recipe/meta.yaml
@@ -25,7 +25,7 @@ requirements:
   - cloudpickle
   - fsspec>=2022.11,<=2023.1
   - numpy>=1.23,<2
-  - packaging>=23.0,<24
+  - packaging>=20.9,<24
   - pandas>=1.0.0,<2 # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
   - pyyaml>=6.0,<7
   - scikit-learn>=1.2.1,<2
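Reviewer note on the loosened floor: `packaging>=20.9,<24` (mirrored in `snowflake/ml/BUILD.bazel` below) lets the wheel install into environments that pin an older `packaging`; the version-parsing API it relies on predates 20.9. A quick illustration of what the new specifier accepts (an illustrative check, not code from this PR):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=20.9,<24")  # the loosened constraint

assert Version("20.9") in spec      # newly allowed
assert Version("23.0") in spec      # the previous floor still satisfies it
assert Version("24.0") not in spec  # the upper bound is unchanged
```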
32 changes: 30 additions & 2 deletions ci/type_ignored_targets
@@ -1,4 +1,32 @@
 //snowflake/ml/experimental/...
-//snowflake/ml/modeling/...
+//tests/integ/snowflake/ml/_internal/...
 //tests/integ/snowflake/ml/extra_tests/...
-//tests/integ/snowflake/ml/preprocessing/...
+//tests/integ/snowflake/ml/sklearn/preprocessing/...
+
+//snowflake/ml/sklearn/linear_model/...
+//snowflake/ml/sklearn/ensemble/...
+//snowflake/ml/sklearn/svm/...
+//snowflake/ml/sklearn/neural_network/...
+//snowflake/ml/sklearn/tree/...
+//snowflake/ml/sklearn/calibration/...
+//snowflake/ml/sklearn/cluster/...
+//snowflake/ml/sklearn/compose/...
+//snowflake/ml/sklearn/covariance/...
+//snowflake/ml/sklearn/decomposition/...
+//snowflake/ml/sklearn/discriminant_analysis/...
+//snowflake/ml/sklearn/feature_selection/...
+//snowflake/ml/sklearn/gaussian_process/...
+//snowflake/ml/sklearn/impute/...
+//snowflake/ml/sklearn/isotonic/...
+//snowflake/ml/sklearn/kernel_approximation/...
+//snowflake/ml/sklearn/kernel_ridge/...
+//snowflake/ml/sklearn/manifold/...
+//snowflake/ml/sklearn/mixture/...
+//snowflake/ml/sklearn/model_selection/...
+//snowflake/ml/sklearn/multiclass/...
+//snowflake/ml/sklearn/multioutput/...
+//snowflake/ml/sklearn/naive_bayes/...
+//snowflake/ml/sklearn/neighbors/...
+//snowflake/ml/sklearn/semi_supervised/...
+//snowflake/ml/xgboost/...
+//snowflake/ml/lightgbm/...
3 changes: 2 additions & 1 deletion codegen/codegen_rules.bzl
@@ -82,12 +82,13 @@ def autogen_estimators(module, estimator_info_list):
         srcs = [":generate_{}".format(e.normalized_class_name)],
         deps = [
             ":init",
-            "//snowflake/ml/framework:framework",
+            "//snowflake/ml/sklearn/framework:framework",
             "//snowflake/ml/_internal:telemetry",
             "//snowflake/ml/_internal/utils:temp_file_utils",
             "//snowflake/ml/_internal/utils:query_result_checker",
             "//snowflake/ml/_internal/utils:pkg_version_utils",
             "//snowflake/ml/_internal/utils:identifier",
+            "//snowflake/ml/model:model_signature",
         ],
         tags = ["skip_mypy_check"],
     )
4 changes: 2 additions & 2 deletions codegen/sklearn_wrapper_generator.py
@@ -311,9 +311,9 @@ def get_snow_ml_module_name(module_name: str) -> str:
"""
tokens = module_name.split(".")
if tokens[0] == "sklearn":
return "snowflake.ml.modeling." + ".".join(module_name.split(".")[1:])
return "snowflake.ml.sklearn." + ".".join(module_name.split(".")[1:])
else:
return "snowflake.ml.modeling." + module_name
return "snowflake.ml." + module_name

@staticmethod
def can_generate_wrapper(class_object: Tuple[str, type]) -> bool:
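Restated outside the diff, the mapping now behaves as follows (a minimal runnable sketch of the function as changed above; the assertions are illustrative):

```python
def get_snow_ml_module_name(module_name: str) -> str:
    """Post-change behavior of the generator's module-name mapping."""
    tokens = module_name.split(".")
    if tokens[0] == "sklearn":
        # sklearn estimators land under the snowflake.ml.sklearn namespace.
        return "snowflake.ml.sklearn." + ".".join(tokens[1:])
    # Non-sklearn modules (e.g. xgboost, lightgbm) go directly under snowflake.ml.
    return "snowflake.ml." + module_name


assert get_snow_ml_module_name("sklearn.linear_model") == "snowflake.ml.sklearn.linear_model"
assert get_snow_ml_module_name("xgboost") == "snowflake.ml.xgboost"
```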
91 changes: 81 additions & 10 deletions codegen/sklearn_wrapper_template.py_template
@@ -12,7 +12,7 @@ import numpy as np
 {transform.estimator_imports}
 from sklearn.utils.metaestimators import available_if

-from snowflake.ml.framework.base import BaseTransformer
+from snowflake.ml.sklearn.framework.base import BaseTransformer
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
@@ -21,6 +21,14 @@ from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries

+from snowflake.ml.model.model_signature import (
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_features,
+)
+
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -116,6 +124,7 @@ class {transform.original_class_name}(BaseTransformer):
         self._sklearn_object = {transform.root_module_name}.{transform.original_class_name}(
             {transform.sklearn_init_arguments}
         )
+        self._model_signature_dict = None
         {transform.estimator_init_member_args}

     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
@@ -161,6 +170,7 @@ class {transform.original_class_name}(BaseTransformer):
"Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
)
self._is_fitted = True
self._get_model_signatures(dataset)
return self

def _fit_snowpark(self, dataset: DataFrame) -> None:
@@ -310,9 +320,9 @@ class {transform.original_class_name}(BaseTransformer):
             query,
             stage_transform_file_name,
             stage_result_file_name,
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
+            identifier.get_unescaped_names(self.input_cols),
+            identifier.get_unescaped_names(self.label_cols),
+            identifier.get_unescaped_names(self.sample_weight_col),
             statement_params=statement_params,
         )

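The verbose `get_equivalent_identifier_in_the_response_pandas_dataframe` is renamed to `identifier.get_unescaped_names` throughout this template. Judging by the old name, it maps identifiers to the column names Snowflake reports back in a result pandas DataFrame: unquoted identifiers fold to uppercase, quoted identifiers keep their inner text. A sketch of that assumed behavior (an illustration, not the library's actual implementation):

```python
import re
from typing import List, Optional, Union


def get_unescaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[str, List[str]]]:
    """Illustrative only: identifiers as they appear in a result pandas DataFrame."""

    def unescape(name: str) -> str:
        if re.fullmatch(r'"(?:[^"]|"")*"', name):
            # Quoted identifier: strip the outer quotes, unescape doubled quotes.
            return name[1:-1].replace('""', '"')
        # Unquoted identifier: Snowflake folds it to uppercase.
        return name.upper()

    if ids is None:
        return None
    if isinstance(ids, str):
        return unescape(ids)
    return [unescape(i) for i in ids]


assert get_unescaped_names('"My Col"') == "My Col"
assert get_unescaped_names(["a", '"b"']) == ["A", "b"]
```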
@@ -378,7 +388,7 @@ class {transform.original_class_name}(BaseTransformer):
         # Input columns for UDF are sorted by column names.
         # We need actual order of input cols to reorder dataframe before calling inference methods.
         input_cols = self.input_cols
-        unquoted_input_cols = identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols)
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)

         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
@@ -511,9 +521,37 @@ class {transform.original_class_name}(BaseTransformer):
         expected_output_cols_list: List[str]
     ) -> pd.DataFrame:
         output_cols = expected_output_cols_list.copy()
-        transformed_numpy_array = getattr(self._sklearn_object, inference_method)(
-            dataset[self.input_cols]
+
+        # The model expects exactly the same column names in the input df for the predict call.
+        # When the user fits with a Snowpark DataFrame but predicts with a pandas DataFrame,
+        # the input cols need to be matched whether unquoted or quoted.
+        input_cols = self.input_cols
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+
+        estimator = self._sklearn_object
+
+        input_df = dataset[input_cols]  # Select input columns with quoted column names.
+        if hasattr(estimator, "feature_names_in_"):
+            missing_features = []
+            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
+                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
+                    missing_features.append(f)
+
+            if len(missing_features) > 0:
+                raise ValueError(
+                    "The feature names should match with those that were passed during fit.\n"
+                    f"Features seen during fit call but not present in the input: {{missing_features}}\n"
+                    f"Features in the input dataframe: {{input_cols}}\n"
+                )
+            input_df.columns = getattr(estimator, "feature_names_in_")
+        else:
+            # Just rename the column names to unquoted identifiers.
+            input_df.columns = unquoted_input_cols  # Replace the quoted column identifiers with unquoted column ids.
+
+        transformed_numpy_array = getattr(estimator, inference_method)(
+            input_df
         )
+
         if (
             isinstance(transformed_numpy_array, list)
             and len(transformed_numpy_array) > 0
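The branch above mirrors scikit-learn's feature-name validation: estimators fitted on a DataFrame record `feature_names_in_` and complain when predict-time names differ. A self-contained illustration of why the renaming step matters (column names and data are invented):

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

# Fit with unquoted (uppercased) column names, as they come back from Snowflake.
train = pd.DataFrame({"COL1": [1.0, 2.0, 3.0], "COL2": [2.0, 4.0, 6.0]})
model = LinearRegression().fit(train[["COL1"]], train["COL2"])
print(model.feature_names_in_)  # ['COL1']

# At predict time, a pandas DataFrame may carry quoted Snowflake identifiers.
score = pd.DataFrame({'"COL1"': [4.0]})
input_df = score[['"COL1"']]
input_df.columns = model.feature_names_in_  # rename so the names match fit time
print(model.predict(input_df))              # ~[8.0]
```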
@@ -974,12 +1012,45 @@ class {transform.original_class_name}(BaseTransformer):
             score_sproc_name,
             query,
             stage_score_file_name,
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
+            identifier.get_unescaped_names(self.input_cols),
+            identifier.get_unescaped_names(self.label_cols),
+            identifier.get_unescaped_names(self.sample_weight_col),
             statement_params=statement_params,
         )

         cleanup_temp_files([local_score_file_name])

         return score
+
+    def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+        self._model_signature_dict: Dict[str, ModelSignature] = dict()
+
+        PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
+
+        inputs = _infer_signature(dataset[self.input_cols], "input")
+        if hasattr(self, "predict"):
+            # For a classifier, the output type of predict matches the label type.
+            if self._sklearn_object._estimator_type == 'classifier':
+                outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns give the desired output type
+                outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
+                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            # For a regressor, the output type of predict is float64.
+            elif self._sklearn_object._estimator_type == 'regressor':
+                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
+                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+
+        for prob_func in PROB_FUNCTIONS:
+            if hasattr(self, prob_func):
+                output_cols_prefix: str = f"{{prob_func}}_"
+                output_column_names = self._get_output_column_names(output_cols_prefix)
+                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
+                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+
+        # TODO: Add support for the transform method.
+
+    @property
+    def model_signatures(self) -> Dict[str, ModelSignature]:
+        if self._model_signature_dict is None:
+            raise RuntimeError("Estimator not fitted before accessing property model_signatures!")
+        return self._model_signature_dict
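Net effect: `fit()` now records a `ModelSignature` per inference method, exposed through the new `model_signatures` property. A hypothetical usage sketch (the estimator choice, constructor arguments, and toy DataFrame are assumptions based on this template, not code from the PR):

```python
import pandas as pd

from snowflake.ml.sklearn.linear_model import LinearRegression

train_df = pd.DataFrame({"X1": [1.0, 2.0], "X2": [0.5, 1.5], "Y": [3.0, 5.0]})

reg = LinearRegression(input_cols=["X1", "X2"], label_cols=["Y"], output_cols=["Y_PRED"])
reg.fit(train_df)  # fit() now ends by calling self._get_model_signatures(train_df)

# For a regressor, "predict" maps the inferred input specs to DOUBLE output specs.
print(reg.model_signatures["predict"])

# Accessing the property before fitting raises the RuntimeError defined above.
LinearRegression(input_cols=["X1"], label_cols=["Y"]).model_signatures
```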
1 change: 0 additions & 1 deletion conda-env-extended.yml
@@ -8,7 +8,6 @@
 channels:
   - conda-forge
 dependencies:
-  - moto==4.0.11 # SNOW-690705
   - torchdata==0.4.1 # SNOW-702102
   # SNOW-747683: Tensorflow is available on snowflake conda channel,
   # however, macos-arm64 is only available on conda-forge.
4 changes: 3 additions & 1 deletion conda-env-snowflake.yml
@@ -17,12 +17,14 @@ dependencies:
   - boto3==1.24.28
   - conda-libmamba-solver==23.1.0
   - coverage==6.3.2 # not a package dependency.
+  - docker-py==4.4.1
   - flask-cors==3.0.10
   - flask==2.1.3
   - fsspec==2022.10.0
   - inflection==0.5.1
   - joblib==1.1.1
   - lightgbm==3.3.5
+  - moto==4.0.11
   - networkx==2.8.4
   - numpy==1.23.4
   - packaging==23.0
@@ -38,4 +40,4 @@ dependencies:
   - sqlparse==0.4.3
   - typing-extensions==4.5.0
   - xgboost==1.7.3
-  - mypy==0.981 # not a package dependency.
\ No newline at end of file
+  - mypy==0.981 # not a package dependency.
1 change: 1 addition & 0 deletions conda-env.yml
@@ -12,6 +12,7 @@ dependencies:
   - boto3==1.24.28
   - conda-libmamba-solver==23.1.0
   - coverage==6.3.2
+  - docker-py==4.4.1
   - flask-cors==3.0.10
   - flask==2.1.3
   - fsspec==2022.10.0
58 changes: 29 additions & 29 deletions snowflake/ml/BUILD.bazel
@@ -41,7 +41,7 @@ snowml_wheel(
"cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it.
"fsspec[http]>=2022.11,<=2023.1",
"numpy>=1.23,<2",
"packaging>=23.0,<24",
"packaging>=20.9,<24",
"pandas>=1.0.0,<2", # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
"pyyaml>=6.0,<7",
"scikit-learn>=1.2.1,<2",
@@ -55,37 +55,37 @@ snowml_wheel(
     version = VERSION,
     deps = [
         "//snowflake/ml/metrics:metrics_pkg",
-        "//snowflake/ml/preprocessing:preprocessing_pkg",
+        "//snowflake/ml/sklearn/preprocessing:preprocessing_pkg",
         "//snowflake/ml/utils:utils_pkg",
         "//snowflake/ml/fileset:fileset_pkg",
         "//snowflake/ml/registry:model_registry_pkg",
         # Autogen packages
-        "//snowflake/ml/modeling/linear_model:sklearn_linear_model_pkg",
-        "//snowflake/ml/modeling/ensemble:sklearn_ensemble_pkg",
-        "//snowflake/ml/modeling/svm:sklearn_svm_pkg",
-        "//snowflake/ml/modeling/neural_network:sklearn_neural_network_pkg",
-        "//snowflake/ml/modeling/tree:sklearn_tree_pkg",
-        "//snowflake/ml/modeling/xgboost:xgboost_pkg",
-        "//snowflake/ml/modeling/calibration:sklearn_calibration_pkg",
-        "//snowflake/ml/modeling/cluster:sklearn_cluster_pkg",
-        "//snowflake/ml/modeling/compose:sklearn_compose_pkg",
-        "//snowflake/ml/modeling/covariance:sklearn_covariance_pkg",
-        "//snowflake/ml/modeling/decomposition:sklearn_decomposition_pkg",
-        "//snowflake/ml/modeling/discriminant_analysis:sklearn_discriminant_analysis_pkg",
-        "//snowflake/ml/modeling/feature_selection:sklearn_feature_selection_pkg",
-        "//snowflake/ml/modeling/gaussian_process:sklearn_gaussian_process_pkg",
-        "//snowflake/ml/modeling/impute:sklearn_impute_pkg",
-        "//snowflake/ml/modeling/isotonic:sklearn_isotonic_pkg",
-        "//snowflake/ml/modeling/kernel_approximation:sklearn_kernel_approximation_pkg",
-        "//snowflake/ml/modeling/kernel_ridge:sklearn_kernel_ridge_pkg",
-        "//snowflake/ml/modeling/manifold:sklearn_manifold_pkg",
-        "//snowflake/ml/modeling/mixture:sklearn_mixture_pkg",
-        "//snowflake/ml/modeling/model_selection:sklearn_model_selection_pkg",
-        "//snowflake/ml/modeling/multiclass:sklearn_multiclass_pkg",
-        "//snowflake/ml/modeling/multioutput:sklearn_multioutput_pkg",
-        "//snowflake/ml/modeling/naive_bayes:sklearn_naive_bayes_pkg",
-        "//snowflake/ml/modeling/neighbors:sklearn_neighbors_pkg",
-        "//snowflake/ml/modeling/semi_supervised:sklearn_semi_supervised_pkg",
-        "//snowflake/ml/modeling/lightgbm:lightgbm_pkg",
+        "//snowflake/ml/sklearn/linear_model:sklearn_linear_model_pkg",
+        "//snowflake/ml/sklearn/ensemble:sklearn_ensemble_pkg",
+        "//snowflake/ml/sklearn/svm:sklearn_svm_pkg",
+        "//snowflake/ml/sklearn/neural_network:sklearn_neural_network_pkg",
+        "//snowflake/ml/sklearn/tree:sklearn_tree_pkg",
+        "//snowflake/ml/sklearn/calibration:sklearn_calibration_pkg",
+        "//snowflake/ml/sklearn/cluster:sklearn_cluster_pkg",
+        "//snowflake/ml/sklearn/compose:sklearn_compose_pkg",
+        "//snowflake/ml/sklearn/covariance:sklearn_covariance_pkg",
+        "//snowflake/ml/sklearn/decomposition:sklearn_decomposition_pkg",
+        "//snowflake/ml/sklearn/discriminant_analysis:sklearn_discriminant_analysis_pkg",
+        "//snowflake/ml/sklearn/feature_selection:sklearn_feature_selection_pkg",
+        "//snowflake/ml/sklearn/gaussian_process:sklearn_gaussian_process_pkg",
+        "//snowflake/ml/sklearn/impute:sklearn_impute_pkg",
+        "//snowflake/ml/sklearn/isotonic:sklearn_isotonic_pkg",
+        "//snowflake/ml/sklearn/kernel_approximation:sklearn_kernel_approximation_pkg",
+        "//snowflake/ml/sklearn/kernel_ridge:sklearn_kernel_ridge_pkg",
+        "//snowflake/ml/sklearn/manifold:sklearn_manifold_pkg",
+        "//snowflake/ml/sklearn/mixture:sklearn_mixture_pkg",
+        "//snowflake/ml/sklearn/model_selection:sklearn_model_selection_pkg",
+        "//snowflake/ml/sklearn/multiclass:sklearn_multiclass_pkg",
+        "//snowflake/ml/sklearn/multioutput:sklearn_multioutput_pkg",
+        "//snowflake/ml/sklearn/naive_bayes:sklearn_naive_bayes_pkg",
+        "//snowflake/ml/sklearn/neighbors:sklearn_neighbors_pkg",
+        "//snowflake/ml/sklearn/semi_supervised:sklearn_semi_supervised_pkg",
+        "//snowflake/ml/xgboost:xgboost_pkg",
+        "//snowflake/ml/lightgbm:lightgbm_pkg",
     ],
 )
1 change: 0 additions & 1 deletion snowflake/ml/_internal/BUILD.bazel
@@ -27,7 +27,6 @@ py_library(
 py_test(
     name = "file_utils_test",
     srcs = ["file_utils_test.py"],
-    timeout = "short",
     deps = [
         ":file_utils",
     ],
21 changes: 19 additions & 2 deletions snowflake/ml/_internal/file_utils.py
@@ -2,8 +2,9 @@
 import io
 import os
 import shutil
+import tempfile
 import zipfile
-from typing import Generator, Optional
+from typing import IO, Generator, Optional

 GENERATED_PY_FILE_EXT = (".pyc", ".pyo", ".pyd", ".pyi")

@@ -69,7 +70,6 @@ def zip_file_or_directory_to_stream(

     with io.BytesIO() as input_stream:
         with zipfile.ZipFile(input_stream, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
-
             if os.path.realpath(path) != os.path.realpath(start_path):
                 cur_path = os.path.dirname(path)
                 while os.path.realpath(cur_path) != os.path.realpath(start_path):
@@ -92,3 +92,20 @@
             zf.write(path, os.path.relpath(path, start_path))

         yield input_stream
+
+
+@contextlib.contextmanager
+def unzip_stream_in_temp_dir(stream: IO[bytes], temp_root: Optional[str] = None) -> Generator[str, None, None]:
+    """Unzip an IO stream into a temporary directory.
+
+    Args:
+        stream: The input stream.
+        temp_root: The root directory where the temporary directory should be created. Defaults to None.
+
+    Yields:
+        The path to the created temporary directory.
+    """
+    with tempfile.TemporaryDirectory(dir=temp_root) as tempdir:
+        with zipfile.ZipFile(stream, mode="r", compression=zipfile.ZIP_DEFLATED) as zf:
+            zf.extractall(path=tempdir)
+        yield tempdir
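For context, the new helper is the inverse of `zip_file_or_directory_to_stream` above. A round-trip sketch, assuming that function is likewise a context manager taking `(path, start_path)` as the hunk suggests (the `my_pkg` directory is a made-up example):

```python
import os

from snowflake.ml._internal.file_utils import (
    unzip_stream_in_temp_dir,
    zip_file_or_directory_to_stream,
)

# Toy input for the example.
os.makedirs("my_pkg", exist_ok=True)
with open("my_pkg/__init__.py", "w") as f:
    f.write("VERSION = '0.0.1'\n")

# Zip the directory into an in-memory stream, then restore it under a temp dir.
with zip_file_or_directory_to_stream("my_pkg", start_path=".") as stream:
    with unzip_stream_in_temp_dir(stream) as tempdir:
        print(os.listdir(tempdir))  # ['my_pkg']
```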