2 changes: 1 addition & 1 deletion ci/conda_recipe/meta.yaml
@@ -25,7 +25,7 @@ requirements:
   - cloudpickle
   - fsspec>=2022.11,<=2023.1
   - numpy>=1.23,<2
-  - packaging>=23.0,<24
+  - packaging>=20.9,<24
   - pandas>=1.0.0,<2 # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
   - pyyaml>=6.0,<7
   - scikit-learn>=1.2.1,<2
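Reviewer note on the loosened floor: `packaging>=20.9,<24` (mirrored in `snowflake/ml/BUILD.bazel` below) lets the wheel install into environments that pin an older `packaging`; the version-parsing API it relies on predates 20.9. A quick illustration of what the new specifier accepts (an illustrative check, not code from this PR):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=20.9,<24")  # the loosened constraint

assert Version("20.9") in spec      # newly allowed
assert Version("23.0") in spec      # the previous floor still satisfies it
assert Version("24.0") not in spec  # the upper bound is unchanged
```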
32 changes: 30 additions & 2 deletions ci/type_ignored_targets
@@ -1,4 +1,32 @@
 //snowflake/ml/experimental/...
-//snowflake/ml/modeling/...
+//tests/integ/snowflake/ml/_internal/...
 //tests/integ/snowflake/ml/extra_tests/...
-//tests/integ/snowflake/ml/preprocessing/...
+//tests/integ/snowflake/ml/sklearn/preprocessing/...
+
+//snowflake/ml/sklearn/linear_model/...
+//snowflake/ml/sklearn/ensemble/...
+//snowflake/ml/sklearn/svm/...
+//snowflake/ml/sklearn/neural_network/...
+//snowflake/ml/sklearn/tree/...
+//snowflake/ml/sklearn/calibration/...
+//snowflake/ml/sklearn/cluster/...
+//snowflake/ml/sklearn/compose/...
+//snowflake/ml/sklearn/covariance/...
+//snowflake/ml/sklearn/decomposition/...
+//snowflake/ml/sklearn/discriminant_analysis/...
+//snowflake/ml/sklearn/feature_selection/...
+//snowflake/ml/sklearn/gaussian_process/...
+//snowflake/ml/sklearn/impute/...
+//snowflake/ml/sklearn/isotonic/...
+//snowflake/ml/sklearn/kernel_approximation/...
+//snowflake/ml/sklearn/kernel_ridge/...
+//snowflake/ml/sklearn/manifold/...
+//snowflake/ml/sklearn/mixture/...
+//snowflake/ml/sklearn/model_selection/...
+//snowflake/ml/sklearn/multiclass/...
+//snowflake/ml/sklearn/multioutput/...
+//snowflake/ml/sklearn/naive_bayes/...
+//snowflake/ml/sklearn/neighbors/...
+//snowflake/ml/sklearn/semi_supervised/...
+//snowflake/ml/xgboost/...
+//snowflake/ml/lightgbm/...
3 changes: 2 additions & 1 deletion codegen/codegen_rules.bzl
@@ -82,12 +82,13 @@ def autogen_estimators(module, estimator_info_list):
         srcs = [":generate_{}".format(e.normalized_class_name)],
         deps = [
             ":init",
-            "//snowflake/ml/framework:framework",
+            "//snowflake/ml/sklearn/framework:framework",
             "//snowflake/ml/_internal:telemetry",
             "//snowflake/ml/_internal/utils:temp_file_utils",
             "//snowflake/ml/_internal/utils:query_result_checker",
             "//snowflake/ml/_internal/utils:pkg_version_utils",
             "//snowflake/ml/_internal/utils:identifier",
+            "//snowflake/ml/model:model_signature",
         ],
         tags = ["skip_mypy_check"],
     )
4 changes: 2 additions & 2 deletions codegen/sklearn_wrapper_generator.py
@@ -311,9 +311,9 @@ def get_snow_ml_module_name(module_name: str) -> str:
"""
tokens = module_name.split(".")
if tokens[0] == "sklearn":
return "snowflake.ml.modeling." + ".".join(module_name.split(".")[1:])
return "snowflake.ml.sklearn." + ".".join(module_name.split(".")[1:])
else:
return "snowflake.ml.modeling." + module_name
return "snowflake.ml." + module_name

@staticmethod
def can_generate_wrapper(class_object: Tuple[str, type]) -> bool:
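Restated outside the diff, the mapping now behaves as follows (a minimal runnable sketch of the function as changed above; the assertions are illustrative):

```python
def get_snow_ml_module_name(module_name: str) -> str:
    """Post-change behavior of the generator's module-name mapping."""
    tokens = module_name.split(".")
    if tokens[0] == "sklearn":
        # sklearn estimators land under the snowflake.ml.sklearn namespace.
        return "snowflake.ml.sklearn." + ".".join(tokens[1:])
    # Non-sklearn modules (e.g. xgboost, lightgbm) go directly under snowflake.ml.
    return "snowflake.ml." + module_name


assert get_snow_ml_module_name("sklearn.linear_model") == "snowflake.ml.sklearn.linear_model"
assert get_snow_ml_module_name("xgboost") == "snowflake.ml.xgboost"
```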
91 changes: 81 additions & 10 deletions codegen/sklearn_wrapper_template.py_template
@@ -12,7 +12,7 @@ import numpy as np
 {transform.estimator_imports}
 from sklearn.utils.metaestimators import available_if

-from snowflake.ml.framework.base import BaseTransformer
+from snowflake.ml.sklearn.framework.base import BaseTransformer
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml._internal.utils import pkg_version_utils, identifier
@@ -21,6 +21,14 @@ from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries

+from snowflake.ml.model.model_signature import (
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_features,
+)
+
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -116,6 +124,7 @@ class {transform.original_class_name}(BaseTransformer):
         self._sklearn_object = {transform.root_module_name}.{transform.original_class_name}(
             {transform.sklearn_init_arguments}
         )
+        self._model_signature_dict = None
         {transform.estimator_init_member_args}

     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
@@ -161,6 +170,7 @@ class {transform.original_class_name}(BaseTransformer):
"Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
)
self._is_fitted = True
self._get_model_signatures(dataset)
return self

def _fit_snowpark(self, dataset: DataFrame) -> None:
@@ -310,9 +320,9 @@ class {transform.original_class_name}(BaseTransformer):
             query,
             stage_transform_file_name,
             stage_result_file_name,
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
+            identifier.get_unescaped_names(self.input_cols),
+            identifier.get_unescaped_names(self.label_cols),
+            identifier.get_unescaped_names(self.sample_weight_col),
             statement_params=statement_params,
         )

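The verbose `get_equivalent_identifier_in_the_response_pandas_dataframe` is renamed to `identifier.get_unescaped_names` throughout this template. Judging by the old name, it maps identifiers to the column names Snowflake reports back in a result pandas DataFrame: unquoted identifiers fold to uppercase, quoted identifiers keep their inner text. A sketch of that assumed behavior (an illustration, not the library's actual implementation):

```python
import re
from typing import List, Optional, Union


def get_unescaped_names(ids: Optional[Union[str, List[str]]]) -> Optional[Union[str, List[str]]]:
    """Illustrative only: identifiers as they appear in a result pandas DataFrame."""

    def unescape(name: str) -> str:
        if re.fullmatch(r'"(?:[^"]|"")*"', name):
            # Quoted identifier: strip the outer quotes, unescape doubled quotes.
            return name[1:-1].replace('""', '"')
        # Unquoted identifier: Snowflake folds it to uppercase.
        return name.upper()

    if ids is None:
        return None
    if isinstance(ids, str):
        return unescape(ids)
    return [unescape(i) for i in ids]


assert get_unescaped_names('"My Col"') == "My Col"
assert get_unescaped_names(["a", '"b"']) == ["A", "b"]
```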
@@ -378,7 +388,7 @@ class {transform.original_class_name}(BaseTransformer):
         # Input columns for UDF are sorted by column names.
         # We need actual order of input cols to reorder dataframe before calling inference methods.
         input_cols = self.input_cols
-        unquoted_input_cols = identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols)
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)

         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
@@ -511,9 +521,37 @@ class {transform.original_class_name}(BaseTransformer):
         expected_output_cols_list: List[str]
     ) -> pd.DataFrame:
         output_cols = expected_output_cols_list.copy()
-        transformed_numpy_array = getattr(self._sklearn_object, inference_method)(
-            dataset[self.input_cols]
+
+        # The model expects exactly the same column names in the input df for the predict call.
+        # When the user fits with a Snowpark DataFrame but predicts with a pandas DataFrame,
+        # the input cols need to be matched whether unquoted or quoted.
+        input_cols = self.input_cols
+        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+
+        estimator = self._sklearn_object
+
+        input_df = dataset[input_cols]  # Select input columns with quoted column names.
+        if hasattr(estimator, "feature_names_in_"):
+            missing_features = []
+            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
+                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
+                    missing_features.append(f)
+
+            if len(missing_features) > 0:
+                raise ValueError(
+                    "The feature names should match with those that were passed during fit.\n"
+                    f"Features seen during fit call but not present in the input: {{missing_features}}\n"
+                    f"Features in the input dataframe: {{input_cols}}\n"
+                )
+            input_df.columns = getattr(estimator, "feature_names_in_")
+        else:
+            # Just rename the column names to unquoted identifiers.
+            input_df.columns = unquoted_input_cols  # Replace the quoted column identifiers with unquoted column ids.
+
+        transformed_numpy_array = getattr(estimator, inference_method)(
+            input_df
         )
+
         if (
             isinstance(transformed_numpy_array, list)
             and len(transformed_numpy_array) > 0
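The branch above mirrors scikit-learn's feature-name validation: estimators fitted on a DataFrame record `feature_names_in_` and complain when predict-time names differ. A self-contained illustration of why the renaming step matters (column names and data are invented):

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

# Fit with unquoted (uppercased) column names, as they come back from Snowflake.
train = pd.DataFrame({"COL1": [1.0, 2.0, 3.0], "COL2": [2.0, 4.0, 6.0]})
model = LinearRegression().fit(train[["COL1"]], train["COL2"])
print(model.feature_names_in_)  # ['COL1']

# At predict time, a pandas DataFrame may carry quoted Snowflake identifiers.
score = pd.DataFrame({'"COL1"': [4.0]})
input_df = score[['"COL1"']]
input_df.columns = model.feature_names_in_  # rename so the names match fit time
print(model.predict(input_df))              # ~[8.0]
```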
@@ -974,12 +1012,45 @@ class {transform.original_class_name}(BaseTransformer):
             score_sproc_name,
             query,
             stage_score_file_name,
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.input_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.label_cols),
-            identifier.get_equivalent_identifier_in_the_response_pandas_dataframe(self.sample_weight_col),
+            identifier.get_unescaped_names(self.input_cols),
+            identifier.get_unescaped_names(self.label_cols),
+            identifier.get_unescaped_names(self.sample_weight_col),
             statement_params=statement_params,
         )

         cleanup_temp_files([local_score_file_name])

         return score
+
+    def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+        self._model_signature_dict: Dict[str, ModelSignature] = dict()
+
+        PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
+
+        inputs = _infer_signature(dataset[self.input_cols], "input")
+        if hasattr(self, "predict"):
+            # For a classifier, the output type of predict matches the label type.
+            if self._sklearn_object._estimator_type == 'classifier':
+                outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns give the desired output type
+                outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
+                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            # For a regressor, the output type of predict is float64.
+            elif self._sklearn_object._estimator_type == 'regressor':
+                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
+                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+
+        for prob_func in PROB_FUNCTIONS:
+            if hasattr(self, prob_func):
+                output_cols_prefix: str = f"{{prob_func}}_"
+                output_column_names = self._get_output_column_names(output_cols_prefix)
+                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
+                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+
+        # TODO: Add support for the transform method.
+
+    @property
+    def model_signatures(self) -> Dict[str, ModelSignature]:
+        if self._model_signature_dict is None:
+            raise RuntimeError("Estimator not fitted before accessing property model_signatures!")
+        return self._model_signature_dict
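Net effect: `fit()` now records a `ModelSignature` per inference method, exposed through the new `model_signatures` property. A hypothetical usage sketch (the estimator choice, constructor arguments, and toy DataFrame are assumptions based on this template, not code from the PR):

```python
import pandas as pd

from snowflake.ml.sklearn.linear_model import LinearRegression

train_df = pd.DataFrame({"X1": [1.0, 2.0], "X2": [0.5, 1.5], "Y": [3.0, 5.0]})

reg = LinearRegression(input_cols=["X1", "X2"], label_cols=["Y"], output_cols=["Y_PRED"])
reg.fit(train_df)  # fit() now ends by calling self._get_model_signatures(train_df)

# For a regressor, "predict" maps the inferred input specs to DOUBLE output specs.
print(reg.model_signatures["predict"])

# Accessing the property before fitting raises the RuntimeError defined above.
LinearRegression(input_cols=["X1"], label_cols=["Y"]).model_signatures
```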
1 change: 0 additions & 1 deletion conda-env-extended.yml
@@ -8,7 +8,6 @@
 channels:
   - conda-forge
 dependencies:
-  - moto==4.0.11 # SNOW-690705
   - torchdata==0.4.1 # SNOW-702102
   # SNOW-747683: Tensorflow is available on snowflake conda channel,
   # however, macos-arm64 is only available on conda-forge.
4 changes: 3 additions & 1 deletion conda-env-snowflake.yml
@@ -17,12 +17,14 @@ dependencies:
   - boto3==1.24.28
   - conda-libmamba-solver==23.1.0
   - coverage==6.3.2 # not a package dependency.
+  - docker-py==4.4.1
   - flask-cors==3.0.10
   - flask==2.1.3
   - fsspec==2022.10.0
   - inflection==0.5.1
   - joblib==1.1.1
   - lightgbm==3.3.5
+  - moto==4.0.11
   - networkx==2.8.4
   - numpy==1.23.4
   - packaging==23.0
@@ -38,4 +40,4 @@ dependencies:
   - sqlparse==0.4.3
   - typing-extensions==4.5.0
   - xgboost==1.7.3
-  - mypy==0.981 # not a package dependency.
\ No newline at end of file
+  - mypy==0.981 # not a package dependency.
1 change: 1 addition & 0 deletions conda-env.yml
@@ -12,6 +12,7 @@ dependencies:
   - boto3==1.24.28
   - conda-libmamba-solver==23.1.0
   - coverage==6.3.2
+  - docker-py==4.4.1
   - flask-cors==3.0.10
   - flask==2.1.3
   - fsspec==2022.10.0
58 changes: 29 additions & 29 deletions snowflake/ml/BUILD.bazel
@@ -41,7 +41,7 @@ snowml_wheel(
"cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it.
"fsspec[http]>=2022.11,<=2023.1",
"numpy>=1.23,<2",
"packaging>=23.0,<24",
"packaging>=20.9,<24",
"pandas>=1.0.0,<2", # Limit since 2.x is not available in Snowflake Anaconda Channel yet.
"pyyaml>=6.0,<7",
"scikit-learn>=1.2.1,<2",
@@ -55,37 +55,37 @@ snowml_wheel(
     version = VERSION,
     deps = [
         "//snowflake/ml/metrics:metrics_pkg",
-        "//snowflake/ml/preprocessing:preprocessing_pkg",
+        "//snowflake/ml/sklearn/preprocessing:preprocessing_pkg",
         "//snowflake/ml/utils:utils_pkg",
         "//snowflake/ml/fileset:fileset_pkg",
         "//snowflake/ml/registry:model_registry_pkg",
         # Autogen packages
-        "//snowflake/ml/modeling/linear_model:sklearn_linear_model_pkg",
-        "//snowflake/ml/modeling/ensemble:sklearn_ensemble_pkg",
-        "//snowflake/ml/modeling/svm:sklearn_svm_pkg",
-        "//snowflake/ml/modeling/neural_network:sklearn_neural_network_pkg",
-        "//snowflake/ml/modeling/tree:sklearn_tree_pkg",
-        "//snowflake/ml/modeling/xgboost:xgboost_pkg",
-        "//snowflake/ml/modeling/calibration:sklearn_calibration_pkg",
-        "//snowflake/ml/modeling/cluster:sklearn_cluster_pkg",
-        "//snowflake/ml/modeling/compose:sklearn_compose_pkg",
-        "//snowflake/ml/modeling/covariance:sklearn_covariance_pkg",
-        "//snowflake/ml/modeling/decomposition:sklearn_decomposition_pkg",
-        "//snowflake/ml/modeling/discriminant_analysis:sklearn_discriminant_analysis_pkg",
-        "//snowflake/ml/modeling/feature_selection:sklearn_feature_selection_pkg",
-        "//snowflake/ml/modeling/gaussian_process:sklearn_gaussian_process_pkg",
-        "//snowflake/ml/modeling/impute:sklearn_impute_pkg",
-        "//snowflake/ml/modeling/isotonic:sklearn_isotonic_pkg",
-        "//snowflake/ml/modeling/kernel_approximation:sklearn_kernel_approximation_pkg",
-        "//snowflake/ml/modeling/kernel_ridge:sklearn_kernel_ridge_pkg",
-        "//snowflake/ml/modeling/manifold:sklearn_manifold_pkg",
-        "//snowflake/ml/modeling/mixture:sklearn_mixture_pkg",
-        "//snowflake/ml/modeling/model_selection:sklearn_model_selection_pkg",
-        "//snowflake/ml/modeling/multiclass:sklearn_multiclass_pkg",
-        "//snowflake/ml/modeling/multioutput:sklearn_multioutput_pkg",
-        "//snowflake/ml/modeling/naive_bayes:sklearn_naive_bayes_pkg",
-        "//snowflake/ml/modeling/neighbors:sklearn_neighbors_pkg",
-        "//snowflake/ml/modeling/semi_supervised:sklearn_semi_supervised_pkg",
-        "//snowflake/ml/modeling/lightgbm:lightgbm_pkg",
+        "//snowflake/ml/sklearn/linear_model:sklearn_linear_model_pkg",
+        "//snowflake/ml/sklearn/ensemble:sklearn_ensemble_pkg",
+        "//snowflake/ml/sklearn/svm:sklearn_svm_pkg",
+        "//snowflake/ml/sklearn/neural_network:sklearn_neural_network_pkg",
+        "//snowflake/ml/sklearn/tree:sklearn_tree_pkg",
+        "//snowflake/ml/sklearn/calibration:sklearn_calibration_pkg",
+        "//snowflake/ml/sklearn/cluster:sklearn_cluster_pkg",
+        "//snowflake/ml/sklearn/compose:sklearn_compose_pkg",
+        "//snowflake/ml/sklearn/covariance:sklearn_covariance_pkg",
+        "//snowflake/ml/sklearn/decomposition:sklearn_decomposition_pkg",
+        "//snowflake/ml/sklearn/discriminant_analysis:sklearn_discriminant_analysis_pkg",
+        "//snowflake/ml/sklearn/feature_selection:sklearn_feature_selection_pkg",
+        "//snowflake/ml/sklearn/gaussian_process:sklearn_gaussian_process_pkg",
+        "//snowflake/ml/sklearn/impute:sklearn_impute_pkg",
+        "//snowflake/ml/sklearn/isotonic:sklearn_isotonic_pkg",
+        "//snowflake/ml/sklearn/kernel_approximation:sklearn_kernel_approximation_pkg",
+        "//snowflake/ml/sklearn/kernel_ridge:sklearn_kernel_ridge_pkg",
+        "//snowflake/ml/sklearn/manifold:sklearn_manifold_pkg",
+        "//snowflake/ml/sklearn/mixture:sklearn_mixture_pkg",
+        "//snowflake/ml/sklearn/model_selection:sklearn_model_selection_pkg",
+        "//snowflake/ml/sklearn/multiclass:sklearn_multiclass_pkg",
+        "//snowflake/ml/sklearn/multioutput:sklearn_multioutput_pkg",
+        "//snowflake/ml/sklearn/naive_bayes:sklearn_naive_bayes_pkg",
+        "//snowflake/ml/sklearn/neighbors:sklearn_neighbors_pkg",
+        "//snowflake/ml/sklearn/semi_supervised:sklearn_semi_supervised_pkg",
+        "//snowflake/ml/xgboost:xgboost_pkg",
+        "//snowflake/ml/lightgbm:lightgbm_pkg",
     ],
 )
1 change: 0 additions & 1 deletion snowflake/ml/_internal/BUILD.bazel
@@ -27,7 +27,6 @@ py_library(
 py_test(
     name = "file_utils_test",
     srcs = ["file_utils_test.py"],
-    timeout = "short",
     deps = [
         ":file_utils",
     ],
21 changes: 19 additions & 2 deletions snowflake/ml/_internal/file_utils.py
@@ -2,8 +2,9 @@
 import io
 import os
 import shutil
+import tempfile
 import zipfile
-from typing import Generator, Optional
+from typing import IO, Generator, Optional

 GENERATED_PY_FILE_EXT = (".pyc", ".pyo", ".pyd", ".pyi")

@@ -69,7 +70,6 @@ def zip_file_or_directory_to_stream(

     with io.BytesIO() as input_stream:
         with zipfile.ZipFile(input_stream, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
-
             if os.path.realpath(path) != os.path.realpath(start_path):
                 cur_path = os.path.dirname(path)
                 while os.path.realpath(cur_path) != os.path.realpath(start_path):
@@ -92,3 +92,20 @@
             zf.write(path, os.path.relpath(path, start_path))

         yield input_stream
+
+
+@contextlib.contextmanager
+def unzip_stream_in_temp_dir(stream: IO[bytes], temp_root: Optional[str] = None) -> Generator[str, None, None]:
+    """Unzip an IO stream into a temporary directory.
+
+    Args:
+        stream: The input stream.
+        temp_root: The root directory where the temporary directory should be created. Defaults to None.
+
+    Yields:
+        The path to the created temporary directory.
+    """
+    with tempfile.TemporaryDirectory(dir=temp_root) as tempdir:
+        with zipfile.ZipFile(stream, mode="r", compression=zipfile.ZIP_DEFLATED) as zf:
+            zf.extractall(path=tempdir)
+        yield tempdir
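For context, the new helper is the inverse of `zip_file_or_directory_to_stream` above. A round-trip sketch, assuming that function is likewise a context manager taking `(path, start_path)` as the hunk suggests (the `my_pkg` directory is a made-up example):

```python
import os

from snowflake.ml._internal.file_utils import (
    unzip_stream_in_temp_dir,
    zip_file_or_directory_to_stream,
)

# Toy input for the example.
os.makedirs("my_pkg", exist_ok=True)
with open("my_pkg/__init__.py", "w") as f:
    f.write("VERSION = '0.0.1'\n")

# Zip the directory into an in-memory stream, then restore it under a temp dir.
with zip_file_or_directory_to_stream("my_pkg", start_path=".") as stream:
    with unzip_stream_in_temp_dir(stream) as tempdir:
        print(os.listdir(tempdir))  # ['my_pkg']
```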