diff --git a/BUILD.bazel b/BUILD.bazel index e4ce4056..104eb38a 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -4,4 +4,5 @@ exports_files([ "conda-env.yml", "mypy.ini", "requirements.txt", + "requirements.yml", ]) diff --git a/README.md b/README.md index 9a7e5b47..0067ac97 100644 --- a/README.md +++ b/README.md @@ -142,13 +142,12 @@ Another useful command is, `bazel run`. This builds and then run the built targe ### Python dependencies To introduce a third-party Python dependency, first check if it is available as a package in the -[Snowflake conda channel](https://repo.anaconda.com/pkgs/snowflake/). If so, add the package -to [conda-env-snowflake.yml](https://github.com/snowflakedb/snowml/blob/main/conda-env-snowflake.yml), -and run the following to re-generate +[Snowflake conda channel](https://repo.anaconda.com/pkgs/snowflake/). Then modify +[requirements.yml](https://github.com/snowflakedb/snowml/blob/main/requirements.yml) following the instruction there, and run the following to re-generate all requirements files, including [conda-env.yml](https://github.com/snowflakedb/snowml/blob/main/conda-env.yml): ``` -bazel build //bazel:conda-env.yml && cp bazel-bin/bazel/conda-env.yml . +bazel run //bazel/requirements:sync_requirements ``` Then, your code can use the package as if it were "installed" in the Python environment. diff --git a/WORKSPACE b/WORKSPACE index 779eec55..776a64f9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -24,6 +24,18 @@ http_archive( load("//third_party/rules_conda:defs.bzl", "conda_create", "load_conda", "register_toolchain") +http_archive( + name = "aspect_bazel_lib", + sha256 = "e3151d87910f69cf1fc88755392d7c878034a69d6499b287bcfc00b1cf9bb415", + strip_prefix = "bazel-lib-1.32.1", + url = "https://github.com/aspect-build/bazel-lib/releases/download/v1.32.1/bazel-lib-v1.32.1.tar.gz", +) + +load("@aspect_bazel_lib//lib:repositories.bzl", "aspect_bazel_lib_dependencies", "register_yq_toolchains") + +aspect_bazel_lib_dependencies() +register_yq_toolchains() + # Below two conda environments (toolchains) are created and they require different # constraint values. Two platforms defined in bazel/platforms/BUILD provide those # constraint values. 
A toolchain matches a platform as long as the platform provides @@ -44,6 +56,7 @@ conda_create( timeout = 3600, clean = False, environment = "@//:conda-env-snowflake.yml", + coverage_tool = "@//bazel/coverage_tool:coverage_tool.py", quiet = True, ) @@ -62,6 +75,7 @@ conda_create( timeout = 3600, clean = False, environment = "@//:conda-env.yml", + coverage_tool = "@//bazel/coverage_tool:coverage_tool.py", quiet = True, ) diff --git a/bazel/BUILD.bazel b/bazel/BUILD.bazel index b79b6510..a6d81166 100644 --- a/bazel/BUILD.bazel +++ b/bazel/BUILD.bazel @@ -1,5 +1,4 @@ load("@rules_python//python:defs.bzl", native_py_test = "py_test") -load(":py_rules.bzl", "py_binary", "py_library") native_py_test( name = "repo_paths_test", @@ -8,67 +7,3 @@ native_py_test( python_version = "PY3", srcs_version = "PY3", ) - -py_library( - name = "conda_env_utils", - srcs = ["conda_env_utils.py"], -) - -py_binary( - name = "generate_conda_env", - srcs = ["generate_conda_env.py"], - deps = [":conda_env_utils"], -) - -py_binary( - name = "generate_requirements", - srcs = ["generate_requirements.py"], - deps = [":conda_env_utils"], -) - -genrule( - name = "conda_env_gen", - srcs = [ - "//:conda-env-extended.yml", - "//:conda-env-snowflake.yml", - ], - outs = ["conda-env.yml"], - cmd = "$(location :generate_conda_env) $(location //:conda-env-snowflake.yml) $(location //:conda-env-extended.yml)> $@", - tools = [":generate_conda_env"], -) - -sh_test( - name = "conda_env_test", - srcs = ["conda_env_test.sh"], - args = [ - "$(location //:conda-env.yml)", - "$(location :conda-env.yml)", - ], - data = [ - ":conda-env.yml", - "//:conda-env.yml", - ], -) - -genrule( - name = "requirements_gen", - srcs = [ - "//:conda-env.yml", - ], - outs = ["requirements.txt"], - cmd = "$(location :generate_requirements) $(location //:conda-env.yml) > $@", - tools = [":generate_requirements"], -) - -sh_test( - name = "requirements_conda_env_sync_test", - srcs = ["requirements_conda_env_test.sh"], - args = [ - "$(location //:requirements.txt)", - "$(location :requirements.txt)", - ], - data = [ - ":requirements.txt", - "//:requirements.txt", - ], -) diff --git a/bazel/conda_env_test.sh b/bazel/conda_env_test.sh deleted file mode 100755 index bc826022..00000000 --- a/bazel/conda_env_test.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# Test to make sure //conda-env.yml is the same as bazel/conda-env.yml. - -if [ $# -ne 2 ]; then - echo "must provide the two .yml files as arguments." - exit 1 -fi - -# -y: multi-column output -# -W: max column width (contents will be clipped). -diff -y -W 160 $1 $2 || \ -(echo -e "\nconda-env.yml should be updated." \ - "Please see the instructions on top of the file to re-generate it." && \ - exit 1) diff --git a/bazel/conda_env_utils.py b/bazel/conda_env_utils.py deleted file mode 100644 index 55b6529f..00000000 --- a/bazel/conda_env_utils.py +++ /dev/null @@ -1,51 +0,0 @@ -from typing import Dict, Iterable, List, Tuple - - -def merge_conda_envs(env_dicts: Iterable[Dict[str, List[str]]]) -> Dict[str, List[str]]: - channels_set = set() - channels = [] - dependencies = _Dependencies() - for env_dict in env_dicts: - this_channels, this_deps = validate_and_get_conda_env(env_dict) - # Dedup channels but keep the order as they appear. 
- for channel in this_channels: - if channel not in channels_set: - channels_set.add(channel) - channels.append(channel) - for dep in this_deps: - dependencies.add(dep) - - return {"channels": channels, "dependencies": dependencies.to_list()} - - -class _Dependencies: - def __init__(self) -> None: - # maps name to version - self._deps = {} # type: Dict[str, str] - - def add(self, dep_string: str) -> None: - name_and_version = dep_string.split("==") - if len(name_and_version) != 2: - raise ValueError(f"Invalid dependency spec. Expected == but got {dep_string}") - name, version = name_and_version - existing_version = self._deps.get(name, None) - if existing_version is not None: - raise ValueError( - f"Found duplicate package: {name}. Prefer specifying the package only" - "in conda-env-snowflake.yml, if it is available in the Snowflake " - "conda channel." - ) - self._deps[name] = version - - def to_list(self) -> List[str]: - deps = [f"{name}=={version}" for name, version in self._deps.items()] - deps.sort() - return deps - - -def validate_and_get_conda_env(env_dict: Dict[str, List[str]]) -> Tuple[List[str], List[str]]: - assert len(env_dict) == 2, "A conda env YAML must contain only two entries, 'channels' and 'dependencies'" - assert "channels" in env_dict - assert "dependencies" in env_dict - - return env_dict["channels"], env_dict["dependencies"] diff --git a/bazel/coverage_tool/BUILD.bazel b/bazel/coverage_tool/BUILD.bazel new file mode 100644 index 00000000..2c70e68e --- /dev/null +++ b/bazel/coverage_tool/BUILD.bazel @@ -0,0 +1,3 @@ +package(default_visibility = ["//visibility:public"]) + +exports_files(["coverage_tool.py"]) diff --git a/bazel/coverage_tool/coverage_tool.py b/bazel/coverage_tool/coverage_tool.py new file mode 100644 index 00000000..35410f9f --- /dev/null +++ b/bazel/coverage_tool/coverage_tool.py @@ -0,0 +1,25 @@ +"""This is a wrapper around the coverage tool. +It injects a --ignore-errors argument when called to generate a coverage report, so that bazel does not fail when the +coverage tool collects a report for a source file that does not exist, for example, a zip-imported source. +""" +import re +import sys + +try: + from coverage.cmdline import main +except ImportError as e: + raise ImportError( + f"Unable to import coverage. Make sure coverage is added to the bazel conda environment. Actual error: {e}" + ) + +if __name__ == "__main__": + if len(sys.argv) < 2: + raise ValueError("Too few arguments.") + # This line is from the original coverage entrypoint.
+ sys.argv[0] = re.sub(r"(-script\.pyw?|\.exe)?$", "", sys.argv[0]) + + action, options = sys.argv[1], sys.argv[2:] + if action in ["report", "html", "xml", "json", "lcov", "annotate"]: + options.insert(0, "--ignore-errors") + args = [action] + options + sys.exit(main(args)) diff --git a/bazel/generate_conda_env.py b/bazel/generate_conda_env.py deleted file mode 100644 index 470622f5..00000000 --- a/bazel/generate_conda_env.py +++ /dev/null @@ -1,30 +0,0 @@ -import sys -from typing import Dict, Iterable, List - -from ruamel.yaml import YAML - -from bazel import conda_env_utils - -if __name__ == "__main__": - - if len(sys.argv) <= 1: - raise ValueError("Must provide at least one conda env file.") - - yaml = YAML() - yaml.indent(mapping=2, offset=2) - - def env_dict_generator(files: Iterable[str]) -> Iterable[Dict[str, List[str]]]: - for f in files: - yield yaml.load(open(f)) - - merged_env_dict = conda_env_utils.merge_conda_envs(env_dict_generator(sys.argv[1:])) - sys.stdout.writelines( - [ - "# DO NOT EDIT!\n", - "# Generated by //bazel:merge_conda_env\n", - "# To update, run:\n", - "# bazel build //bazel:conda-env.yml && cp bazel-bin/bazel/conda-env.yml .\n", - "\n", - ] - ) - yaml.dump(merged_env_dict, sys.stdout) diff --git a/bazel/generate_requirements.py b/bazel/generate_requirements.py deleted file mode 100644 index 58cfd72d..00000000 --- a/bazel/generate_requirements.py +++ /dev/null @@ -1,25 +0,0 @@ -import sys - -from ruamel.yaml import YAML - -from bazel import conda_env_utils - -if __name__ == "__main__": - - if len(sys.argv) <= 1: - raise ValueError("Must provide at least one conda env file.") - - yaml = YAML() - conda_env = yaml.load(open(sys.argv[1])) - _, deps = conda_env_utils.validate_and_get_conda_env(conda_env) - sys.stdout.writelines( - [ - "# DO NOT EDIT!\n", - "# Generated by //bazel:generate_requirements\n", - "# To update, run:\n", - "# bazel build //bazel:requirements.txt && cp bazel-bin/bazel/requirements.txt .\n", - "\n", - ] - ) - sys.stdout.writelines("\n".join(deps)) - sys.stdout.writelines("\n") diff --git a/bazel/requirements/BUILD.bazel b/bazel/requirements/BUILD.bazel new file mode 100644 index 00000000..58d8003c --- /dev/null +++ b/bazel/requirements/BUILD.bazel @@ -0,0 +1,154 @@ +load("//bazel:py_rules.bzl", "py_binary") +load("@bazel_skylib//rules:diff_test.bzl", "diff_test") +load("@bazel_skylib//rules:write_file.bzl", "write_file") +load("@aspect_bazel_lib//lib:yq.bzl", "yq") +load("//snowflake/ml:version.bzl", "VERSION") + +package(default_visibility = ["//visibility:public"]) + +exports_files(["requirements.schema.json"]) + +py_binary( + name = "parse_and_generate_requirements", + srcs = ["parse_and_generate_requirements.py"], +) + +_SRC_REQUIREMENT_FILE = "//:requirements.yml" + +_SCHEMA_FILE = ":requirements.schema.json" + +_GENERATE_TOOL = ":parse_and_generate_requirements" + +_GENERATE_COMMAND = "$(location " + _GENERATE_TOOL + ") $(location " + _SRC_REQUIREMENT_FILE + ") --schema $(location " + _SCHEMA_FILE + ") {options} > $@" + +_TEMPLATE_FOLDER_PATH = "//bazel/requirements/templates" + +_AUTOGEN_HEADERS = """# DO NOT EDIT! 
+# Generate by running 'bazel run //bazel/requirements:sync_requirements' +""" + +_GENERATED_REQUIREMENTS_FILES = { + "requirements_txt": { + "cmd": "--mode dev_version --format text", + "generated": "requirements.txt", + "target": "//:requirements.txt", + }, + "conda_env_yml": { + "cmd": "--mode dev_version --format conda_env", + "generated": "conda-env.yml", + "target": "//:conda-env.yml", + }, + "conda_env_snowflake_yml": { + "cmd": "--mode dev_version --format conda_env --snowflake_channel_only", + "generated": "conda-env-snowflake.yml", + "target": "//:conda-env-snowflake.yml", + }, + "conda_meta": { + "cmd": "--mode version_requirements --format conda_meta --version " + VERSION, + "generated": "meta.yaml", + "target": "//ci/conda_recipe:meta.yaml", + }, + "requirements_bzl": { + "cmd": "--mode version_requirements --format bzl", + "generated": "requirements.bzl", + "target": "//snowflake/ml:requirements.bzl", + }, +} + +[ + genrule( + name = "gen_{name}_body".format(name = name), + srcs = [ + _SRC_REQUIREMENT_FILE, + _SCHEMA_FILE, + ], + outs = ["{generated}.body".format(generated = value["generated"])], + cmd = _GENERATE_COMMAND.format(options = value["cmd"]), + tools = [_GENERATE_TOOL], + ) + for name, value in _GENERATED_REQUIREMENTS_FILES.items() + if name != "conda_meta" +] + +[ + genrule( + name = "gen_{name}".format(name = name), + srcs = [ + "{generated}.body".format(generated = value["generated"]), + ], + outs = [value["generated"]], + cmd = "(echo -e \""+ _AUTOGEN_HEADERS +"\" ; cat $(location :{generated}.body) ) > $@".format( + generated = value["generated"], + ), + tools = [_GENERATE_TOOL], + ) + for name, value in _GENERATED_REQUIREMENTS_FILES.items() + if name != "conda_meta" +] + +# Generate ci/conda-recipe/meta.yaml +genrule( + name = "gen_conda_meta_body", + srcs = [ + _SRC_REQUIREMENT_FILE, + _SCHEMA_FILE, + ], + outs = ["meta.body.yaml"], + cmd = _GENERATE_COMMAND.format(options = "--mode version_requirements --format conda_meta --version " + VERSION), + tools = [_GENERATE_TOOL], +) + +yq( + name = "gen_conda_meta", + srcs = [ + ":meta.body.yaml", + "{template_folder}:meta.tpl.yaml".format(template_folder = _TEMPLATE_FOLDER_PATH), + ], + outs = ["meta.yaml"], + expression = ". as $item ireduce ({}; . * $item ) | sort_keys(..)", +) + +# Create a test target for each file that Bazel should +# write to the source tree. +[ + diff_test( + name = "check_{name}".format(name = name), + # Make it trivial for devs to understand that if + # this test fails, they just need to run the updater + # Note, you need bazel-skylib version 1.1.1 or greater + # to get the failure_message attribute + failure_message = "Please run: bazel run //bazel/requirements:sync_requirements", + file1 = ":{generated}".format(generated = value["generated"]), + file2 = value["target"], + ) + for name, value in _GENERATED_REQUIREMENTS_FILES.items() +] + +# Generate the updater script so there's only one target for devs to run, +# even if many generated files are in the source folder. +write_file( + name = "gen_sync_requirements", + out = "sync_requirements.sh", + content = [ + # This depends on bash, would need tweaks for Windows + "#!/usr/bin/env sh", + # Bazel gives us a way to access the source folder! + "cd $BUILD_WORKSPACE_DIRECTORY", + ] + [ + # Paths are now relative to the workspace. 
+ # We can copy files from bazel-bin to the sources + "cp -fv bazel-bin/bazel/requirements/{generated} {target}".format( + generated = value["generated"], + # Convert label to path + target = value["target"].lstrip("//").lstrip(":").replace(":", "/"), + ) + for name, value in _GENERATED_REQUIREMENTS_FILES.items() + ], +) + +# This is what you can `bazel run` and it can write to the source folder +sh_binary( + name = "sync_requirements", + srcs = ["sync_requirements.sh"], + data = [":{generated}".format(generated = value["generated"]) for value in _GENERATED_REQUIREMENTS_FILES.values()], +) diff --git a/bazel/requirements/parse_and_generate_requirements.py b/bazel/requirements/parse_and_generate_requirements.py new file mode 100644 index 00000000..6a21a8f7 --- /dev/null +++ b/bazel/requirements/parse_and_generate_requirements.py @@ -0,0 +1,418 @@ +import argparse +import functools +import itertools +import json +import sys +from typing import Literal, MutableMapping, Optional, Sequence, Set, TypedDict, cast + +import jsonschema +import yaml +from packaging import requirements as packaging_requirements + +SNOWFLAKE_CONDA_CHANNEL = "https://repo.anaconda.com/pkgs/snowflake" + + +class RequirementInfo(TypedDict, total=False): + """This reflect the requirements.schema.json file.""" + + name: str + name_pypi: str + name_conda: str + dev_version: str + dev_version_pypi: str + dev_version_conda: str + from_channel: str + version_requirements: str + version_requirements_pypi: str + version_requirements_conda: str + requirements_extra_tags: Sequence[str] + tags: Sequence[str] + + +def filter_by_tag( + req_info: RequirementInfo, + field: Literal["tags", "requirements_extra_tags"], + tag_filter: Optional[str] = None, +) -> bool: + """Filter the requirement by whether given tag filter appears in the given field in the requirement information. + The field is an array. + + Args: + req_info: requirement information. + field: field to filter the tag from. + tag_filter: tag to filter the requirement. Defaults to None. + + Returns: + True if tag_filter is None, or in the array of given field if presented. + """ + return tag_filter is None or tag_filter in req_info.get(field, []) + + +def filter_by_extras(req_info: RequirementInfo, extras: bool, no_extras: bool) -> bool: + """Filter the requirements by whether it contains extras. + + Args: + req_info: requirement information. + extras: if set to True, only filter those requirements are extras. + no_extras: if set to True, only filter those requirements are not extras. + + Returns: + True, for all requirements if extras and no_extras are both False; + or for all extras requirements if extras is True; + or for all non-extras requirements if no_extras is True. + """ + return ( + (not extras and not no_extras) + or (extras and len(req_info.get("requirements_extra_tags", [])) > 0) + or (no_extras and len(req_info.get("requirements_extra_tags", [])) == 0) + ) + + +def get_req_name(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> Optional[str]: + """Get the name of the requirement in the given env. + For each env, env specific name will be chosen, if not presented, common name will be chosen. + + Args: + req_info: requirement information. + env: environment indicator, choose from conda and pip. + + Raises: + ValueError: Illegal env argument. + + Returns: + The name of the requirement, if not presented, return None. 
+ """ + if env == "conda": + return req_info.get("name_conda", req_info.get("name", None)) + elif env == "pip": + return req_info.get("name_pypi", req_info.get("name", None)) + else: + raise ValueError("Unreachable") + + +def generate_dev_pinned_string(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> Optional[str]: + """Get the pinned version for dev environment of the requirement in the given env. + For each env, env specific pinned version will be chosen, if not presented, common pinned version will be chosen. + + Args: + req_info: requirement information. + env: environment indicator, choose from conda and pip. + + Raises: + ValueError: Illegal env argument. + ValueError: No pinned dev version exists, which is not allowed. + + Returns: + If the name is None, return None. + Otherwise, return name==x.y.z format string, showing the pinned version in the dev environment. + """ + name = get_req_name(req_info, env) + if name is None: + return None + if env == "conda": + version = req_info.get("dev_version_conda", req_info.get("dev_version", None)) + elif env == "pip": + version = req_info.get("dev_version_pypi", req_info.get("dev_version", None)) + else: + raise ValueError("Unreachable") + if version is None: + raise ValueError("No pinned version exists.") + return f"{name}=={version}" + + +def generate_user_requirements_string(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> Optional[str]: + """Get the user requirements version specifier string of the requirement in the given env. + For each env, env specific user requirements version will be chosen, if not presented, common one will be chosen. + + Args: + req_info: requirement information. + env: environment indicator, choose from conda and pip. + + Raises: + ValueError: Illegal env argument. + + Returns: + If the name is None, return None. + If no user requirements version, return the package name. + Otherwise, return PEP-508 compatible format string, showing requirements when users install SnowML. + """ + name = get_req_name(req_info, env) + if name is None: + return None + if env == "conda": + specifiers = req_info.get("version_requirements_conda", req_info.get("version_requirements", None)) + elif env == "pip": + specifiers = req_info.get("version_requirements_pypi", req_info.get("version_requirements", None)) + else: + raise ValueError("Unreachable") + if specifiers is None: + return None + return f"{name}{specifiers}" + + +def validate_dev_version_and_user_requirements(req_info: RequirementInfo, env: Literal["conda", "pip"]) -> None: + """Validate dev version and the user requirements version of the requirement in the given env. + Check if dev version is within the user requirements version. + + Args: + req_info: requirement information. + env: environment indicator, choose from conda and pip. + + Raises: + ValueError: Illegal env argument. + ValueError: No pinned dev version exists, which is not allowed. + ValueError: Pinned dev version does not exist in user requirements. 
+ """ + user_requirements_string = generate_user_requirements_string(req_info, env) + if user_requirements_string is None: + return + if env == "conda": + version = req_info.get("dev_version_conda", req_info.get("dev_version", None)) + elif env == "pip": + version = req_info.get("dev_version_pypi", req_info.get("dev_version", None)) + else: + raise ValueError("Unreachable") + req = packaging_requirements.Requirement(user_requirements_string) + if version is None: + raise ValueError("No pinned version exists.") + if not req.specifier.contains(version): + raise ValueError( + f"Pinned dev version {version} does not exist in user requirements {user_requirements_string}." + ) + return + + +def fold_extras_tags(extras_tags: Set[str], req_info: RequirementInfo) -> Set[str]: + """Left-fold style function to get all extras tags in all requirements. + + Args: + extras_tags: A set containing all existing extras tags. + req_info: requirement information. + + Returns: + Updated set with tags in the requirement information added. + """ + for extras_tag in req_info.get("requirements_extra_tags", []): + extras_tags.add(extras_tag) + return extras_tags + + +def fold_channel(channels: Set[str], req_info: RequirementInfo) -> Set[str]: + """Left-fold style function to get all channels in all requirements. + + Args: + channels: A set containing all existing extras channels. + req_info: requirement information. + + Returns: + Updated set with channels in the requirement information added. + """ + channel = req_info.get("from_channel", None) + if channel: + channels.add(channel) + return channels + + +def generate_requirements( + req_file_path: str, + schema_file_path: str, + mode: str, + format: str, + snowflake_channel_only: bool, + tag_filter: Optional[str] = None, + version: Optional[str] = None, +) -> None: + with open(schema_file_path) as f: + schema = json.load(f) + with open(req_file_path) as f: + requirements = yaml.safe_load(f) + + jsonschema.validate(requirements, schema=schema) + + requirements = cast(Sequence[RequirementInfo], requirements) + requirements = list(filter(lambda req_info: filter_by_tag(req_info, "tags", tag_filter), requirements)) + + for req_info in requirements: + validate_dev_version_and_user_requirements(req_info, "pip") + validate_dev_version_and_user_requirements(req_info, "conda") + + reqs_pypi = list(filter(None, map(lambda req_info: get_req_name(req_info, "pip"), requirements))) + reqs_conda = list(filter(None, map(lambda req_info: get_req_name(req_info, "conda"), requirements))) + if len(reqs_pypi) != len(set(reqs_pypi)) or len(reqs_conda) != len(set(reqs_conda)): + raise ValueError("Duplicate Requirements found!") + + if (mode, format) == ("dev_version", "text"): + results = list( + sorted( + map( + lambda s: s + "\n", + filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "pip"), requirements)), + ) + ) + ) + sys.stdout.writelines(results) + elif (mode, format) == ("version_requirements", "bzl"): + extras_requirements = list(filter(lambda req_info: filter_by_extras(req_info, True, False), requirements)) + extras_results: MutableMapping[str, Sequence[str]] = {} + all_extras_tags: Set[str] = set() + all_extras_tags = functools.reduce(fold_extras_tags, requirements, all_extras_tags) + for extras_tag in sorted(list(all_extras_tags)): + requirements_with_tag = list( + filter( + lambda req_info: filter_by_tag(req_info, "requirements_extra_tags", extras_tag), + extras_requirements, + ) + ) + extras_results[extras_tag] = list( + sorted( + filter( + None, + map( + 
lambda req_info: generate_user_requirements_string(req_info, "pip"), + requirements_with_tag, + ), + ) + ) + ) + extras_results["all"] = sorted(list(set(itertools.chain(*extras_results.values())))) + results = list( + sorted( + filter( + None, + map( + lambda req_info: generate_user_requirements_string(req_info, "pip"), + filter(lambda req_info: filter_by_extras(req_info, False, True), requirements), + ), + ) + ) + ) + sys.stdout.write( + "EXTRA_REQUIREMENTS={extra_requirements}\n\nREQUIREMENTS={requirements}\n".format( + extra_requirements=repr(extras_results), requirements=repr(results) + ) + ) + elif (mode, format) == ("version_requirements", "python"): + results = list( + sorted( + filter(None, map(lambda req_info: generate_user_requirements_string(req_info, "conda"), requirements)), + ) + ) + sys.stdout.writelines(f"REQUIREMENTS={repr(results)}\n") + elif (mode, format) == ("dev_version", "conda_env"): + if snowflake_channel_only: + results = list( + sorted( + filter( + None, + map( + lambda req_info: generate_dev_pinned_string(req_info, "conda"), + filter( + lambda req_info: req_info.get("from_channel", SNOWFLAKE_CONDA_CHANNEL) + == SNOWFLAKE_CONDA_CHANNEL, + requirements, + ), + ), + ) + ) + ) + env_result = {"channels": [SNOWFLAKE_CONDA_CHANNEL, "nodefaults"], "dependencies": results} + yaml.safe_dump(env_result, sys.stdout, default_flow_style=False) + else: + all_channels: Set[str] = {SNOWFLAKE_CONDA_CHANNEL} + all_channels = functools.reduce(fold_channel, requirements, all_channels) + results = list( + sorted(filter(None, map(lambda req_info: generate_dev_pinned_string(req_info, "conda"), requirements))) + ) + all_channels.remove(SNOWFLAKE_CONDA_CHANNEL) + env_result = { + "channels": [SNOWFLAKE_CONDA_CHANNEL] + list(sorted(all_channels)) + ["nodefaults"], + "dependencies": results, + } + yaml.safe_dump(env_result, sys.stdout, default_flow_style=False) + elif (mode, format) == ("version_requirements", "conda_meta"): + if version is None: + raise ValueError("Version must be specified when generate conda meta.") + run_results = list( + sorted( + filter( + None, + map( + lambda req_info: generate_user_requirements_string(req_info, "conda"), + filter(lambda req_info: filter_by_extras(req_info, False, True), requirements), + ), + ) + ) + ) + run_constrained_results = list( + sorted( + filter( + None, + map( + lambda req_info: generate_user_requirements_string(req_info, "conda"), + filter(lambda req_info: filter_by_extras(req_info, True, False), requirements), + ), + ) + ) + ) + meta_result = { + "package": {"version": version}, + "requirements": {"run": run_results, "run_constrained": run_constrained_results}, + } + yaml.safe_dump(meta_result, sys.stdout, default_flow_style=False) + else: + raise ValueError("Unreachable") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("requirement_file", help="Path to the requirement.yaml file", type=str) + parser.add_argument("--schema", type=str, help="Path to the json schema file.", required=True) + parser.add_argument( + "--mode", + type=str, + choices=["dev_version", "version_requirements", "version_requirements_extras"], + help="Define the mode when specifying the requirements.", + required=True, + ) + parser.add_argument( + "--format", + type=str, + choices=["text", "bzl", "python", "conda_env", "conda_meta"], + help="Define the output format.", + required=True, + ) + parser.add_argument("--filter_by_tag", type=str, default=None, help="Filter the result by tags.") + parser.add_argument("--version", type=str, 
default=None, help="Filter the result by tags.") + parser.add_argument( + "--snowflake_channel_only", + action="store_true", + default=False, + help="Flag to set if only output dependencies in Snowflake Anaconda Channel.", + ) + args = parser.parse_args() + + VALID_SETTINGS = [ + ("dev_version", "text", False), # requirements.txt + ("version_requirements", "bzl", False), # wheel rule requirements + ("version_requirements", "python", False), # model deployment core dependencies list + ("dev_version", "conda_env", False), # dev conda-env.yml file + ("dev_version", "conda_env", True), # dev conda-env-snowflake.yml file + ("version_requirements", "conda_meta", False), # conda build recipe metadata file + ] + + if (args.mode, args.format, args.snowflake_channel_only) not in VALID_SETTINGS: + raise ValueError("Invalid config combination found.") + + generate_requirements( + args.requirement_file, + args.schema, + args.mode, + args.format, + args.snowflake_channel_only, + args.filter_by_tag, + args.version, + ) + + +if __name__ == "__main__": + main() diff --git a/bazel/requirements/requirements.schema.json b/bazel/requirements/requirements.schema.json new file mode 100644 index 00000000..02fb8682 --- /dev/null +++ b/bazel/requirements/requirements.schema.json @@ -0,0 +1,115 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "array", + "items": [ + { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The name of the required packages." + }, + "name_pypi": { + "type": "string", + "description": "The name of the required packages in PyPI, set if differs." + }, + "name_conda": { + "type": "string", + "description": "The name of the required packages in conda, set if differs." + }, + "dev_version": { + "type": "string", + "description": "The version to use in the development environment.", + "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" + }, + "dev_version_pypi": { + "type": "string", + "description": "The version to use in the development environment in PyPI, set if differs.", + "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" + }, + "dev_version_conda": { + "type": "string", + "description": "The version to use in the development environment in conda, set if differs.", + "pattern": "^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?$" + }, + "from_channel": { + "type": "string", + "description": "The channel where the package come from, set if not from Snowflake Anaconda Channel.", + "default": "https://repo.anaconda.com/pkgs/snowflake" + }, + "version_requirements": { + "type": "string", + "description": "The version requirements of this package as a dependency when released.", + "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" + }, + "version_requirements_pypi": { + "type": "string", + "description": "The version requirements of this package as a dependency when released in PyPI, set if differs.", + "pattern": 
"^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" + }, + "version_requirements_conda": { + "type": "string", + "description": "The version requirements of this package as a dependency when released.", + "pattern": "^((<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)(,(<|<=|\\!=|==|>=|>)([1-9][0-9]*!)?(0|[1-9][0-9]*)(\\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\\.post(0|[1-9][0-9]*))?(\\.dev(0|[1-9][0-9]*))?)*$" + }, + "requirements_extra_tags": { + "type": "array", + "description": "The extras tags that this package belongs to as a dependency.", + "items": [ + { + "type": "string" + } + ] + }, + "tags": { + "type": "array", + "items": [ + { + "type": "string" + } + ] + } + }, + "allOf": [ + { + "anyOf": [ + { + "required": [ + "name" + ] + }, + { + "required": [ + "name_pypi" + ] + }, + { + "required": [ + "name_conda" + ] + } + ] + }, + { + "anyOf": [ + { + "required": [ + "dev_version" + ] + }, + { + "required": [ + "dev_version_pypi" + ] + }, + { + "required": [ + "dev_version_conda" + ] + } + ] + } + ] + } + ] +} diff --git a/bazel/requirements/templates/BUILD.bazel b/bazel/requirements/templates/BUILD.bazel new file mode 100644 index 00000000..a52b28fc --- /dev/null +++ b/bazel/requirements/templates/BUILD.bazel @@ -0,0 +1,3 @@ +exports_files([ + "meta.tpl.yaml" +]) diff --git a/bazel/requirements/templates/meta.tpl.yaml b/bazel/requirements/templates/meta.tpl.yaml new file mode 100644 index 00000000..260f3955 --- /dev/null +++ b/bazel/requirements/templates/meta.tpl.yaml @@ -0,0 +1,31 @@ +# DO NOT EDIT! +# Generated by //bazel/requirements:gen_conda_meta +# To update, run: +# bazel run //bazel/requirements:sync_requirements +# + +package: + name: snowflake-ml-python + +source: + path: ../../ + +build: + noarch: python + +requirements: + build: + - python + - bazel >=6.0.0 + +about: + home: https://github.com/snowflakedb/snowflake-ml-python + license: Apache-2.0 + license_family: Apache + license_file: ../../LICENSE.txt + summary: Snowflake ML Library + description: | + Snowflake ML client Library is used for interacting with Snowflake to build machine learning solutions. + Functionalities include feature engineering, modeling, model management, deployment, etc + dev_url: https://github.com/snowflakedb/snowflake-ml-python + doc_url: https://github.com/snowflakedb/snowflake-ml-python/blob/main/README.md diff --git a/bazel/requirements_conda_env_test.sh b/bazel/requirements_conda_env_test.sh deleted file mode 100755 index c6a9d65d..00000000 --- a/bazel/requirements_conda_env_test.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# Test to make sure //requirements.txt is the same as bazel/requirements.txt. - -if [ $# -ne 2 ]; then - echo "must provide the two requirements.txt files as arguments." - exit 1 -fi - -# -y: multi-column output -# -W: max column width (contents will be clipped). -diff -y -W 160 $1 $2 || \ -(echo -e "\nrequirements.txt should be updated." \ - "Please see the instructions on top of the file to re-generate it." 
&& \ - exit 1) diff --git a/ci/conda_recipe/BUILD.bazel b/ci/conda_recipe/BUILD.bazel new file mode 100644 index 00000000..882f61f3 --- /dev/null +++ b/ci/conda_recipe/BUILD.bazel @@ -0,0 +1,3 @@ +exports_files([ + "meta.yaml" +]) diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml index 238929b7..607c0485 100644 --- a/ci/conda_recipe/meta.yaml +++ b/ci/conda_recipe/meta.yaml @@ -1,59 +1,49 @@ -# TODO(SNOW-728020): Make this file a template so that it can read ground truths -# (dependencies, version number) from a common place. We also need to define that -# common place, as currently it's a BUILD rule. -# See https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html#templating-with-jinja -{% set version_match = load_file_regex(load_file='snowflake/ml/version.bzl', regex_pattern='VERSION = "(\d\.\d\.\d*)"\s.*') %} - -package: - name: snowflake-ml-python - version: {{ version_match.group(1) }} - -source: - path: ../../ - +# DO NOT EDIT! +# Generated by //bazel/requirements:gen_conda_meta +# To update, run: +# bazel run //bazel/requirements:sync_requirements +# +about: + description: | + Snowflake ML client Library is used for interacting with Snowflake to build machine learning solutions. + Functionalities include feature engineering, modeling, model management, deployment, etc + dev_url: https://github.com/snowflakedb/snowflake-ml-python + doc_url: https://github.com/snowflakedb/snowflake-ml-python/blob/main/README.md + home: https://github.com/snowflakedb/snowflake-ml-python + license: Apache-2.0 + license_family: Apache + license_file: ../../LICENSE.txt + summary: Snowflake ML Library build: noarch: python - +package: + name: snowflake-ml-python + version: 1.0.0 requirements: build: - python - bazel >=6.0.0 run: - - python - absl-py>=0.15,<2 - anyio>=3.5.0,<4 - cloudpickle + - conda-libmamba-solver>=23.1.0,<24 - fsspec>=2022.11,<=2023.1 - numpy>=1.23,<2 - packaging>=20.9,<24 - - pandas>=1.0.0,<2 # Limit since 2.x is not available in Snowflake Anaconda Channel yet. + - pandas>=1.0.0,<2 + - python - pyyaml>=6.0,<7 - scikit-learn>=1.2.1,<2 - scipy>=1.9,<2 - snowflake-connector-python - - snowflake-snowpark-python>=1.4.0,<=2 + - snowflake-snowpark-python>=1.4.0,<2 - sqlparse>=0.4,<1 - typing-extensions>=4.1.0,<5 - xgboost>=1.7.3,<2 - - # conda-libmamba-solver is conda-specific requirement, and should not appear in wheel's dependency. - - conda-libmamba-solver>=23.1.0,<24 run_constrained: - # Any dependencies required by extra should be specified here so that conda could consider the constraints when - # installing them simultaneously. This part should sync with the extra_requirements in snowml_wheel in - # snowflake/ml/BUILD.bazel file. + - lightgbm==3.3.5 - tensorflow>=2.9,<3 - torchdata>=0.4,<1 - - lightgbm==3.3.5 - -about: - home: https://github.com/snowflakedb/snowflake-ml-python - license: Apache-2.0 - license_family: Apache - license_file: ../../LICENSE.txt - summary: Snowflake ML Library - description: | - Snowflake ML client Library is used for interacting with Snowflake to build machine learning solutions. 
- Functionalities include feature engineering, modeling, model management, deployment, etc - dev_url: https://github.com/snowflakedb/snowflake-ml-python - doc_url: https://github.com/snowflakedb/snowflake-ml-python/blob/main/README.md +source: + path: ../../ diff --git a/ci/get_excluded_tests.sh b/ci/get_excluded_tests.sh new file mode 100755 index 00000000..b59057c7 --- /dev/null +++ b/ci/get_excluded_tests.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Usage +# exclude_tests.sh [-b ] [-f ] +# +# Flags +# -b: specify path to bazel +# -f: specify output file path +# +# Action +# - exclude integration tests whose dependency is not part of the wheel package. +# The missing dependency cuold happen when a new operator is being developed, but not yet released. + +set -o pipefail +set -eu + +echo "Running "$0 + +bazel="bazel" +output_path="/tmp/files_to_exclude" + +while getopts "b:f:" opt; do + case "${opt}" in + b) + bazel=${OPTARG} + ;; + f) + output_path=${OPTARG} + ;; + :) + echo "Option -[bf] requires an argument." + exit 1 + ;; + ?) + echo "Invalid option." + echo "Usage: $0 [-b ] [-f ]" + exit 1 + ;; + esac +done + +# Compute missing dependencies by subtracting deps included in wheel from deps required by tests. +# We only care about dependencies in //snowflake/ml since that's our dev directory. +${bazel} query "kind('py_library rule', deps(tests/...) except deps(snowflake/ml:wheel))" \ + | grep -w "//snowflake/ml" > /tmp/missing_deps + +# Reverse search on testing files depending on missing deps and exclude those. +files_to_exclude=$(${bazel} query \ + "kind('source file', deps(kind('py_test rule', rdeps(tests/..., set($(| ${output_path} +for f in ${files_to_exclude} +do + echo "Excluding file: "${f} + echo ${f} >> ${output_path} +done + +echo "Done running "$0 diff --git a/ci/type_ignored_targets b/ci/type_ignored_targets index d576d2a2..3123cc2d 100644 --- a/ci/type_ignored_targets +++ b/ci/type_ignored_targets @@ -1,32 +1,37 @@ //snowflake/ml/experimental/... //tests/integ/snowflake/ml/_internal/... //tests/integ/snowflake/ml/extra_tests/... -//tests/integ/snowflake/ml/sklearn/preprocessing/... +//tests/integ/snowflake/ml/modeling/impute/... +//tests/integ/snowflake/ml/modeling/metrics/... +//tests/integ/snowflake/ml/modeling/pipeline/... +//tests/integ/snowflake/ml/modeling/preprocessing/... -//snowflake/ml/sklearn/linear_model/... -//snowflake/ml/sklearn/ensemble/... -//snowflake/ml/sklearn/svm/... -//snowflake/ml/sklearn/neural_network/... -//snowflake/ml/sklearn/tree/... -//snowflake/ml/sklearn/calibration/... -//snowflake/ml/sklearn/cluster/... -//snowflake/ml/sklearn/compose/... -//snowflake/ml/sklearn/covariance/... -//snowflake/ml/sklearn/decomposition/... -//snowflake/ml/sklearn/discriminant_analysis/... -//snowflake/ml/sklearn/feature_selection/... -//snowflake/ml/sklearn/gaussian_process/... -//snowflake/ml/sklearn/impute/... -//snowflake/ml/sklearn/isotonic/... -//snowflake/ml/sklearn/kernel_approximation/... -//snowflake/ml/sklearn/kernel_ridge/... -//snowflake/ml/sklearn/manifold/... -//snowflake/ml/sklearn/mixture/... -//snowflake/ml/sklearn/model_selection/... -//snowflake/ml/sklearn/multiclass/... -//snowflake/ml/sklearn/multioutput/... -//snowflake/ml/sklearn/naive_bayes/... -//snowflake/ml/sklearn/neighbors/... -//snowflake/ml/sklearn/semi_supervised/... -//snowflake/ml/xgboost/... -//snowflake/ml/lightgbm/... +//snowflake/ml/modeling/linear_model/... +//snowflake/ml/modeling/ensemble/... +//snowflake/ml/modeling/svm/... +//snowflake/ml/modeling/neural_network/... 
+//snowflake/ml/modeling/tree/... +//snowflake/ml/modeling/calibration/... +//snowflake/ml/modeling/cluster/... +//snowflake/ml/modeling/compose/... +//snowflake/ml/modeling/covariance/... +//snowflake/ml/modeling/decomposition/... +//snowflake/ml/modeling/discriminant_analysis/... +//snowflake/ml/modeling/feature_selection/... +//snowflake/ml/modeling/gaussian_process/... +//snowflake/ml/modeling/impute:iterative_imputer +//snowflake/ml/modeling/impute:knn_imputer +//snowflake/ml/modeling/impute:missing_indicator +//snowflake/ml/modeling/isotonic/... +//snowflake/ml/modeling/kernel_approximation/... +//snowflake/ml/modeling/kernel_ridge/... +//snowflake/ml/modeling/lightgbm/... +//snowflake/ml/modeling/manifold/... +//snowflake/ml/modeling/mixture/... +//snowflake/ml/modeling/model_selection/... +//snowflake/ml/modeling/multiclass/... +//snowflake/ml/modeling/multioutput/... +//snowflake/ml/modeling/naive_bayes/... +//snowflake/ml/modeling/neighbors/... +//snowflake/ml/modeling/semi_supervised/... +//snowflake/ml/modeling/xgboost/... diff --git a/codegen/build_file_autogen.py b/codegen/build_file_autogen.py index 5d4621a6..9e484465 100644 --- a/codegen/build_file_autogen.py +++ b/codegen/build_file_autogen.py @@ -6,7 +6,7 @@ python3 snowflake/ml/experimental/amauser/transformer/build_file_autogen.py """ import os -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List import inflection @@ -17,7 +17,8 @@ @dataclass(frozen=True) class ModuleInfo: module_name: str - exclude_list: List[str] + exclude_list: List[str] = field(default_factory=list) + include_list: List[str] = field(default_factory=list) MODULES = [ @@ -30,18 +31,18 @@ class ModuleInfo: ], ), ModuleInfo("sklearn.svm", ["OneClassSVM"]), - ModuleInfo("sklearn.neural_network", []), + ModuleInfo("sklearn.neural_network"), ModuleInfo("sklearn.tree", ["BaseDecisionTree"]), # Excluded BaseDecisionTree which is a private class. # TODO(snandamuri): Implement support for XGBRanker ModuleInfo("xgboost", ["Booster", "XGBModel", "XGBRanker"]), # Excluded private classes and Ranker. ModuleInfo("sklearn.calibration", ["_SigmoidCalibration"]), # Abstract base classes. 
- ModuleInfo("sklearn.cluster", []), - ModuleInfo("sklearn.compose", []), - ModuleInfo("sklearn.covariance", []), - # ModuleInfo("sklearn.cross_decomposition", []), + ModuleInfo("sklearn.cluster"), + ModuleInfo("sklearn.compose"), + ModuleInfo("sklearn.covariance"), + # ModuleInfo("sklearn.cross_decomposition"), ModuleInfo("sklearn.decomposition", ["MiniBatchNMF", "NMF", "SparseCoder", "LatentDirichletAllocation"]), - ModuleInfo("sklearn.discriminant_analysis", []), - # ModuleInfo("sklearn.feature_extraction", []), + ModuleInfo("sklearn.discriminant_analysis"), + # ModuleInfo("sklearn.feature_extraction"), ModuleInfo( "sklearn.feature_selection", [ @@ -53,14 +54,14 @@ class ModuleInfo: "SelectFromModel", ], ), - ModuleInfo("sklearn.gaussian_process", []), + ModuleInfo("sklearn.gaussian_process"), ModuleInfo("sklearn.impute", ["SimpleImputer"]), ModuleInfo("sklearn.isotonic", ["IsotonicRegression"]), - ModuleInfo("sklearn.kernel_approximation", []), - ModuleInfo("sklearn.kernel_ridge", []), + ModuleInfo("sklearn.kernel_approximation"), + ModuleInfo("sklearn.kernel_ridge"), ModuleInfo("sklearn.manifold", ["LocallyLinearEmbedding"]), - ModuleInfo("sklearn.mixture", []), - ModuleInfo("sklearn.model_selection", []), + ModuleInfo("sklearn.mixture"), + ModuleInfo("sklearn.model_selection"), ModuleInfo("sklearn.multiclass", ["_ConstantPredictor"]), ModuleInfo( "sklearn.multioutput", @@ -91,6 +92,7 @@ class ModuleInfo: "DaskLGBMRegressor", ], ), + ModuleInfo(module_name="sklearn.preprocessing", include_list=["PolynomialFeatures"]), ] SRC_OUTPUT_PATH = "" @@ -110,11 +112,71 @@ def indent(baseString: str, spaces: int = 0) -> str: return " " * spaces + baseString +def get_src_build_file_content(module: ModuleInfo, module_root_dir: str) -> str: + """Generates the content of the BUILD.bazel file for the source directory of the given module. + + Args: + module: Module information. + module_root_dir: Relative directory path of the module source code. + + Returns: + Returns the content of the BUILD.bazel file for the module source directory. + """ + # Does the source dir have bazel rules for a native implementation of estimators or transformers? + src_build_native_file_path = os.path.join(SRC_OUTPUT_PATH, module_root_dir, "BUILD_NATIVE.bzl") + src_build_native_file_exists = os.path.isfile(src_build_native_file_path) + + # Check if an init file is already present in the source dir + src_init_file_path = os.path.join(SRC_OUTPUT_PATH, module_root_dir, "__init__.py") + src_init_file_exists = os.path.isfile(src_init_file_path) + + return ( + 'load("//codegen:codegen_rules.bzl", "autogen_estimators", "autogen_init_file_for_module")\n' + 'load(":estimators_info.bzl", "estimator_info_list")\n' + + ('load(":BUILD_NATIVE.bzl", "get_build_rules_for_native_impl")\n' if src_build_native_file_exists else "") + + 'package(default_visibility = ["//visibility:public"])\n' + + (f'\nautogen_init_file_for_module(module="{module.module_name}")' if not src_init_file_exists else "") + + f'\nautogen_estimators(module="{module.module_name}", estimator_info_list=estimator_info_list)\n' + + ("get_build_rules_for_native_impl()\n" if src_build_native_file_exists else "") + ) + + +def get_test_build_file_content(module: ModuleInfo, module_root_dir: str) -> str: + """Generates the content of the BUILD.bazel file for the test directory of the given module. + + Args: + module: Module information. + module_root_dir: Relative directory path of the module source code. + + Returns: + Returns the content of the BUILD.bazel file for the module test directory.
""" + + # Does the test dir have bazel rules for a native implementation of estimators or transformers? + test_build_native_file_path = os.path.join(TEST_OUTPUT_PATH, module_root_dir, "BUILD_NATIVE.bzl") + test_build_native_file_exists = os.path.isfile(test_build_native_file_path) + + return ( + 'load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators")\n' + f'load("//{module_root_dir}:estimators_info.bzl", "estimator_info_list")\n' + + ('load(":BUILD_NATIVE.bzl", "get_build_rules_for_native_impl")\n' if test_build_native_file_exists else "") + + 'package(default_visibility = ["//visibility:public"])\n' + "\nautogen_tests_for_estimators(\n" + f' module = "{module.module_name}",\n' + f' module_root_dir = "{module_root_dir}",\n' + " estimator_info_list=estimator_info_list\n" + ")\n" + ("get_build_rules_for_native_impl()\n" if test_build_native_file_exists else "") + ) + + def main(argv: List[str]) -> None: del argv # Unused. # For each module for module in MODULES: + if len(module.exclude_list) > 0 and len(module.include_list) > 0: + raise ValueError(f"Both include_list and exclude_list can't be specified for module {module.module_name}!") + module_root_dir = AutogenTool.module_root_dir(module.module_name) estimators_info_file_path = os.path.join(module_root_dir, "estimators_info.bzl") src_build_file_path = os.path.join(SRC_OUTPUT_PATH, module_root_dir, "BUILD.bazel") @@ -127,28 +189,13 @@ def main(argv: List[str]) -> None: # Src build file: # Contains genrules and py_library rules for all the estimator wrappers. - src_build_file_content = ( - 'load("//codegen:codegen_rules.bzl", "autogen_estimators", "autogen_init_file_for_module")\n' - f'load(":estimators_info.bzl", "estimator_info_list")\n' - 'package(default_visibility = ["//visibility:public"])\n' - f'\nautogen_init_file_for_module(module="{module.module_name}")' - f'\nautogen_estimators(module="{module.module_name}", estimator_info_list=estimator_info_list)\n' - ) + src_build_file_content = get_src_build_file_content(module, module_root_dir) os.makedirs("/".join(src_build_file_path.split("/")[:-1]), exist_ok=True) open(src_build_file_path, "w").write(src_build_file_content) # Test build file: # Contains genrules and py_test rules for all the estimator wrappers.
- test_build_file_content = ( - 'load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators")\n' - f'load("//{module_root_dir}:estimators_info.bzl", "estimator_info_list")\n' - 'package(default_visibility = ["//visibility:public"])\n' - "\nautogen_tests_for_estimators(\n" - f' module = "{module.module_name}",\n' - f' module_root_dir = "{module_root_dir}",\n' - " estimator_info_list=estimator_info_list\n" - ")\n" - ) + test_build_file_content = get_test_build_file_content(module, module_root_dir) os.makedirs("/".join(test_build_file_path.split("/")[:-1]), exist_ok=True) open(test_build_file_path, "w").write(test_build_file_content) @@ -170,7 +217,13 @@ def get_estimators_info_file_content(module: ModuleInfo) -> str: [ indent(f'struct(class_name="{c}", normalized_class_name="{inflection.underscore(c)}")', 4) for c in class_list - if c not in module.exclude_list + if ( + c not in module.exclude_list + if len(module.exclude_list) > 0 + else c in module.include_list + if len(module.include_list) > 0 + else True + ) ] ) ) diff --git a/codegen/codegen_rules.bzl b/codegen/codegen_rules.bzl index 5afd9d26..1dcd7895 100644 --- a/codegen/codegen_rules.bzl +++ b/codegen/codegen_rules.bzl @@ -35,6 +35,7 @@ def autogen_init_file_for_module(module): tools = [AUTO_GEN_TOOL_BAZEL_PATH], srcs = [INIT_TEMPLATE_BAZEL_PATH], cmd = "cat $(location {}) > $@".format(INIT_TEMPLATE_BAZEL_PATH), + tags = ["autogen_build"], ) py_library( @@ -75,6 +76,7 @@ def autogen_estimators(module, estimator_info_list): tools = [AUTO_GEN_TOOL_BAZEL_PATH], srcs = [ESTIMATOR_TEMPLATE_BAZEL_PATH], cmd = cmd.format(e.class_name), + tags = ["autogen_build"], ) py_library( @@ -82,7 +84,7 @@ def autogen_estimators(module, estimator_info_list): srcs = [":generate_{}".format(e.normalized_class_name)], deps = [ ":init", - "//snowflake/ml/sklearn/framework:framework", + "//snowflake/ml/modeling/framework:framework", "//snowflake/ml/_internal:telemetry", "//snowflake/ml/_internal/utils:temp_file_utils", "//snowflake/ml/_internal/utils:query_result_checker", @@ -125,6 +127,7 @@ def autogen_tests_for_estimators(module, module_root_dir, estimator_info_list): tools = [AUTO_GEN_TOOL_BAZEL_PATH], srcs = [ESTIMATOR_TEST_TEMPLATE_BAZEL_PATH], cmd = cmd.format(e.class_name), + tags = ["autogen_build"], ) py_test( diff --git a/codegen/sklearn_wrapper_generator.py b/codegen/sklearn_wrapper_generator.py index 38816d10..e0fb5da6 100644 --- a/codegen/sklearn_wrapper_generator.py +++ b/codegen/sklearn_wrapper_generator.py @@ -311,9 +311,9 @@ def get_snow_ml_module_name(module_name: str) -> str: """ tokens = module_name.split(".") if tokens[0] == "sklearn": - return "snowflake.ml.sklearn." + ".".join(module_name.split(".")[1:]) + return "snowflake.ml.modeling." + ".".join(module_name.split(".")[1:]) else: - return "snowflake.ml." + module_name + return "snowflake.ml.modeling." 
+ module_name @staticmethod def can_generate_wrapper(class_object: Tuple[str, type]) -> bool: @@ -473,6 +473,9 @@ def __init__(self, module_name: str, class_object: Tuple[str, type]) -> None: self.predict_udf_deps = "" self.fit_sproc_deps = "" + # Native function transform + self.supported_export_method = "" + def _format_default_value(self, default_value: Any) -> str: if isinstance(default_value, str): return f'"{default_value}"' @@ -607,17 +610,20 @@ def _populate_function_names_and_signatures(self) -> None: args_to_transform = ["steps", "transformers", "estimator", "estimators", "base_estimator", "final_estimator"] arg_transform_calls = [] + deps_gathering_calls = [] for arg_to_transform in args_to_transform: if arg_to_transform in self.original_init_signature.parameters.keys(): arg_transform_calls.append( f"{arg_to_transform} = _transform_snowml_obj_to_sklearn_obj({arg_to_transform})" ) + deps_gathering_calls.append(f"deps = deps | _gather_dependencies({arg_to_transform})") self.estimator_init_signature = ",\n ".join(signature_lines) + "," self.sklearn_init_arguments = ",\n ".join(sklearn_init_lines) + "," self.sklearn_init_args_dict = "{" + ",\n ".join(sklearn_init_args_dict_list) + ",}" self.estimator_init_member_args = "\n ".join(init_member_args) self.estimator_args_transform_calls = "\n ".join(arg_transform_calls) + self.estimator_args_gathering_calls = "\n ".join(deps_gathering_calls) # TODO(snandamuri): Implement type inference for classifiers. self.udf_datatype = "float" if self._from_data_py or self._is_regressor else "" @@ -803,10 +809,11 @@ def generate(self) -> "SklearnWrapperGenerator": self.test_estimator_input_args_list.extend(["min_samples_leaf=1", "max_leaf_nodes=100"]) # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. - self.fit_sproc_deps = self.predict_udf_deps = ( - "f'numpy=={np.__version__}', f'pandas=={pd.__version__}', f'scikit-learn=={sklearn.__version__}', " - "f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'" + self.deps = ( + "f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'" ) + self.supported_export_method = "to_sklearn" + self.unsupported_export_methods = ["to_xgboost", "to_lightgbm"] self._construct_string_from_lists() return self @@ -821,10 +828,9 @@ def generate(self) -> "XGBoostWrapperGenerator": self.test_estimator_input_args_list.extend(["random_state=0", "subsample=1.0", "colsample_bynode=1.0"]) self.fit_sproc_imports = "import xgboost" # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. - self.fit_sproc_deps = self.predict_udf_deps = ( - "f'numpy=={np.__version__}', f'pandas=={pd.__version__}', f'xgboost=={xgboost.__version__}', " - "f'cloudpickle=={cp.__version__}'" - ) + self.supported_export_method = "to_xgboost" + self.unsupported_export_methods = ["to_sklearn", "to_lightgbm"] + self.deps = "f'numpy=={np.__version__}', f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'" self._construct_string_from_lists() return self @@ -839,9 +845,8 @@ def generate(self) -> "LightGBMWrapperGenerator": self.test_estimator_input_args_list.extend(["random_state=0"]) self.fit_sproc_imports = "import lightgbm" # TODO(snandamuri): Replace cloudpickle with joblib after latest version of joblib is added to snowflake conda. 
- self.fit_sproc_deps = self.predict_udf_deps = ( - "f'numpy=={np.__version__}', f'pandas=={pd.__version__}', f'lightgbm=={lightgbm.__version__}', " - "f'cloudpickle=={cp.__version__}'" - ) + self.deps = "f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'" + self.supported_export_method = "to_lightgbm" + self.unsupported_export_methods = ["to_sklearn", "to_xgboost"] self._construct_string_from_lists() return self diff --git a/codegen/sklearn_wrapper_template.py_template b/codegen/sklearn_wrapper_template.py_template index 28d664cd..7f5d08ff 100644 --- a/codegen/sklearn_wrapper_template.py_template +++ b/codegen/sklearn_wrapper_template.py_template @@ -3,7 +3,7 @@ # import inspect import os -from typing import Iterable, Optional, Union, List, Any, Dict, Callable +from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set from uuid import uuid4 import cloudpickle as cp @@ -12,7 +12,8 @@ import numpy as np {transform.estimator_imports} from sklearn.utils.metaestimators import available_if -from snowflake.ml.sklearn.framework.base import BaseTransformer +from snowflake.ml.modeling.framework.base import BaseTransformer +from snowflake.ml.modeling.framework._utils import to_native_format from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator from snowflake.ml._internal.utils import pkg_version_utils, identifier @@ -40,7 +41,7 @@ def _original_estimator_has_callable(attr : str) -> Callable[[Any], bool]: """ Checks that the original estimator has callable `attr`. Args: - attr: Arrtibute to check for. + attr: Attribute to check for. Returns: A function which checks for the existance of callable `attr` on the given object. @@ -55,6 +56,25 @@ def _original_estimator_has_callable(attr : str) -> Callable[[Any], bool]: return check +def _gather_dependencies(obj: Any) -> Set[str]: + """ Gathers dependencies from the SnowML Estimator and Transformer objects. + + Args: + obj: Source object to collect dependencies from. The source object can be of any type, for example, lists, tuples, etc. + + Returns: + A set of dependencies required to work with the object. + """ + + if isinstance(obj, list) or isinstance(obj, tuple): + deps: Set[str] = set() + for elem in obj: + deps = deps | set(_gather_dependencies(elem)) + return deps + elif isinstance(obj, BaseTransformer): + return set(obj._get_dependencies()) + else: + return set() def _transform_snowml_obj_to_sklearn_obj(obj: Any) -> Any: """Converts SnowML Estimator and Transformer objects to equivalent SKLearn objects. @@ -74,7 +94,7 @@ def _transform_snowml_obj_to_sklearn_obj(obj: Any) -> Any: return tuple(map(_transform_snowml_obj_to_sklearn_obj, obj)) elif isinstance(obj, BaseTransformer): # Convert SnowML object to equivalent SKLearn object - return obj.get_sklearn_object() + return to_native_format(obj) else: # Return all other objects as it is.
return obj @@ -115,6 +135,9 @@ class {transform.original_class_name}(BaseTransformer): ) -> None: super().__init__() self.id = str(uuid4()).replace("-", "_").upper() + deps: Set[str] = set([{transform.deps}]) + {transform.estimator_args_gathering_calls} + self._deps = list(deps) {transform.estimator_args_transform_calls} init_args = {transform.sklearn_init_args_dict} cleaned_up_init_args = _validate_sklearn_args( @@ -176,8 +199,9 @@ class {transform.original_class_name}(BaseTransformer): def _fit_snowpark(self, dataset: DataFrame) -> None: session = dataset._session # Validate that key package version in user workspace are supported in snowflake conda channel - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( - pkg_versions=[{transform.fit_sproc_deps}], session=session, subproject=_SUBPROJECT) + # If customer doesn't have package in conda channel, replace the ones have the closest versions + self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( + pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT) # Specify input columns so column pruing will be enforced selected_cols = ( @@ -236,7 +260,7 @@ class {transform.original_class_name}(BaseTransformer): @sproc( is_permanent=False, name=fit_sproc_name, - packages=["snowflake-snowpark-python", {transform.fit_sproc_deps}], + packages=["snowflake-snowpark-python"] + self._get_dependencies(), replace=True, session=session, statement_params=statement_params, @@ -374,8 +398,8 @@ class {transform.original_class_name}(BaseTransformer): session = dataset._session # Validate that key package version in user workspace are supported in snowflake conda channel - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( - pkg_versions=[{transform.predict_udf_deps}], session=session, subproject=_SUBPROJECT) + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( + pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT) # Register vectorized UDF for batch inference batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{{safe_id}}_{{method}}".format( @@ -403,7 +427,7 @@ class {transform.original_class_name}(BaseTransformer): @pandas_udf( is_permanent=False, name=batch_inference_udf_name, - packages=[{transform.predict_udf_deps}], + packages= self._get_dependencies(), replace=True, session=session, statement_params=statement_params, @@ -947,7 +971,7 @@ class {transform.original_class_name}(BaseTransformer): @sproc( is_permanent=False, name=score_sproc_name, - packages=["snowflake-snowpark-python", {transform.fit_sproc_deps}], + packages=["snowflake-snowpark-python"] + self._get_dependencies(), replace=True, session=session, statement_params=statement_params, @@ -1054,3 +1078,17 @@ class {transform.original_class_name}(BaseTransformer): if self._model_signature_dict is None: raise RuntimeError("Estimator not fitted before accessing property model_signatures! ") return self._model_signature_dict + + def {transform.supported_export_method}(self) -> Any: + if self._sklearn_object is None: + self._sklearn_object = self._create_sklearn_object() + return self._sklearn_object + + def {transform.unsupported_export_methods[0]}(self) -> Any: + raise AttributeError("Estimator doesn't support {transform.unsupported_export_methods[0]}(). 
Please use {transform.supported_export_method}()") + + def {transform.unsupported_export_methods[1]}(self) -> Any: + raise AttributeError("Estimator doesn't support {transform.unsupported_export_methods[1]}(). Please use {transform.supported_export_method}()") + + def _get_dependencies(self) -> List[str]: + return self._deps diff --git a/conda-env-extended.yml b/conda-env-extended.yml deleted file mode 100644 index 23af7cb9..00000000 --- a/conda-env-extended.yml +++ /dev/null @@ -1,18 +0,0 @@ -# Dependencies not available in the Snowflake conda channel. -# Each dependency must be accompanied with a JIRA ticket requesting it -# to be added to the Snowflake channel. -# -# After updating this file, conda-env.yml should be updated. See instructions -# there. - -channels: - - conda-forge -dependencies: - - torchdata==0.4.1 # SNOW-702102 - # SNOW-747683: Tensorflow is available on snowflake conda channel, - # however, macos-arm64 is only available on conda-forge. - - tensorflow==2.9.1 - # For mypy on YAML. Not required to be released. - - types-PyYAML==6.0.12 - # For huggingface model creation. Not required to be released. - - transformers==4.27.1 diff --git a/conda-env-snowflake.yml b/conda-env-snowflake.yml index e18cb088..4faa40fe 100644 --- a/conda-env-snowflake.yml +++ b/conda-env-snowflake.yml @@ -1,43 +1,40 @@ -# Dependencies available in the Snowflake conda channel -# are listed here. -# DO NOT add more channels. To add a dependency that's currently -# not available in the Snowflake conda channel, change conda-env-extended.yml. -# -# After updating this file, conda-env.yml should be updated. See instructions -# there. -# -# Keep all the dependencies in alphabetical order. -# Specify version explicitly for all the dependencies. +# DO NOT EDIT! +# Generate by running 'bazel run //bazel/requirements:sync_requirements' channels: - - https://repo.anaconda.com/pkgs/snowflake +- https://repo.anaconda.com/pkgs/snowflake +- nodefaults dependencies: - - absl-py==0.15.0 - - anyio==3.5.0 - - boto3==1.24.28 - - conda-libmamba-solver==23.1.0 - - coverage==6.3.2 # not a package dependency. - - docker-py==4.4.1 - - flask-cors==3.0.10 - - flask==2.1.3 - - fsspec==2022.10.0 - - inflection==0.5.1 - - joblib==1.1.1 - - lightgbm==3.3.5 - - moto==4.0.11 - - networkx==2.8.4 - - numpy==1.23.4 - - packaging==23.0 - - pandas==1.4.4 - - pytest==7.1.2 - - python==3.8.13 - - pytorch==1.12.1 - - ruamel.yaml==0.17.21 - - s3fs==2022.10.0 - - scipy==1.9.3 - - scikit-learn==1.2.2 - - snowflake-snowpark-python==1.4.0 - - sqlparse==0.4.3 - - typing-extensions==4.5.0 - - xgboost==1.7.3 - - mypy==0.981 # not a package dependency. +- absl-py==0.15.0 +- anyio==3.5.0 +- boto3==1.24.28 +- cloudpickle==2.0.0 +- conda-libmamba-solver==23.1.0 +- coverage==6.3.2 +- docker-py==4.4.1 +- flask-cors==3.0.10 +- flask==2.1.3 +- fsspec==2022.11.0 +- httpx==0.23.0 +- inflection==0.5.1 +- joblib==1.1.1 +- jsonschema==3.2.0 +- lightgbm==3.3.5 +- mypy==0.981 +- networkx==2.8.4 +- numpy==1.23.4 +- packaging==23.0 +- pandas==1.4.4 +- pytest==7.1.2 +- python==3.8.13 +- pytorch==1.12.1 +- pyyaml==6.0 +- ruamel.yaml==0.17.21 +- s3fs==2022.11.0 +- scikit-learn==1.2.2 +- scipy==1.9.3 +- snowflake-connector-python==3.0.3 +- snowflake-snowpark-python==1.4.0 +- sqlparse==0.4.3 +- typing-extensions==4.5.0 +- xgboost==1.7.3 diff --git a/conda-env.yml b/conda-env.yml index e4f76a04..3155cbd5 100644 --- a/conda-env.yml +++ b/conda-env.yml @@ -1,42 +1,47 @@ # DO NOT EDIT! 
-# Generated by //bazel:merge_conda_env -# To update, run: -# bazel build //bazel:conda-env.yml && cp bazel-bin/bazel/conda-env.yml . +# Generate by running 'bazel run //bazel/requirements:sync_requirements' channels: - - https://repo.anaconda.com/pkgs/snowflake - - conda-forge +- https://repo.anaconda.com/pkgs/snowflake +- conda-forge +- nodefaults dependencies: - - absl-py==0.15.0 - - anyio==3.5.0 - - boto3==1.24.28 - - conda-libmamba-solver==23.1.0 - - coverage==6.3.2 - - docker-py==4.4.1 - - flask-cors==3.0.10 - - flask==2.1.3 - - fsspec==2022.10.0 - - inflection==0.5.1 - - joblib==1.1.1 - - lightgbm==3.3.5 - - moto==4.0.11 - - mypy==0.981 - - networkx==2.8.4 - - numpy==1.23.4 - - packaging==23.0 - - pandas==1.4.4 - - pytest==7.1.2 - - python==3.8.13 - - pytorch==1.12.1 - - ruamel.yaml==0.17.21 - - s3fs==2022.10.0 - - scikit-learn==1.2.2 - - scipy==1.9.3 - - snowflake-snowpark-python==1.4.0 - - sqlparse==0.4.3 - - tensorflow==2.9.1 - - torchdata==0.4.1 - - transformers==4.27.1 - - types-PyYAML==6.0.12 - - typing-extensions==4.5.0 - - xgboost==1.7.3 +- absl-py==0.15.0 +- anyio==3.5.0 +- boto3==1.24.28 +- cloudpickle==2.0.0 +- conda-libmamba-solver==23.1.0 +- coverage==6.3.2 +- docker-py==4.4.1 +- flask-cors==3.0.10 +- flask==2.1.3 +- fsspec==2022.11.0 +- httpx==0.23.0 +- inflection==0.5.1 +- joblib==1.1.1 +- jsonschema==3.2.0 +- lightgbm==3.3.5 +- moto==4.0.11 +- mypy==0.981 +- networkx==2.8.4 +- numpy==1.23.4 +- packaging==23.0 +- pandas==1.4.4 +- pytest==7.1.2 +- python==3.8.13 +- pytorch==1.12.1 +- pyyaml==6.0 +- ruamel.yaml==0.17.21 +- s3fs==2022.11.0 +- scikit-learn==1.2.2 +- scipy==1.9.3 +- snowflake-connector-python==3.0.3 +- snowflake-snowpark-python==1.4.0 +- sqlparse==0.4.3 +- starlette==0.27.0 +- tensorflow==2.9.1 +- torchdata==0.4.1 +- transformers==4.27.1 +- types-PyYAML==6.0.12 +- typing-extensions==4.5.0 +- xgboost==1.7.3 diff --git a/requirements.yml b/requirements.yml new file mode 100644 index 00000000..29b50472 --- /dev/null +++ b/requirements.yml @@ -0,0 +1,172 @@ +# Add requirements information here and use `bazel run //bazel/requirements:sync_requirements` +# to generate all other requirements files. +# Fields: +# name: The name of the package. Set if it is available with the same name and required both in PyPI and conda. +# name_pypi: The name of the package in PyPI. Set this only to indicate it is a requirements available in PyPI only, +# or set this with name_conda to indicates that it has different name in PyPI and conda. +# name_conda: The name of the package in conda. Set this only to indicate it is a requirements available in conda only, +# or set this with name_pypi to indicates that it has different name in PyPI and conda. +# At least 1 of these 3 fields should be set. +# +# dev_version: The version of the package to be pinned in the dev environment. +# Set if it is available with the same version and required both in PyPI and conda. +# dev_version_pypi: The version from PyPI to be pinned in the dev environment. Set this only to indicate +# it is a requirements available in PyPI only, or set this with dev_version_conda to indicates that +# it has different version in PyPI and conda. +# dev_version_conda: The version from conda to be pinned in the dev environment. Set this only to indicate +# it is a requirements available in conda only, or set this with dev_version_pypi to indicates that +# it has different version in PyPI and conda. 
+# from_channel: Set this if the package is not available in Snowflake Anaconda Channel +# (https://repo.anaconda.com/pkgs/snowflake). Each dependency must be accompanied with a JIRA ticket requesting it +# to be added to the Snowflake channel. +# At least 1 of these 3 fields should be set. +# +# version_requirements: The version requirements specifiers when this requirement is a dependency of SnowML release. +# Set if it is available with the same name and required both in PyPI and conda. +# version_requirements_pypi: The version requirements specifiers when this requirement is a dependency of +# SnowML release via PyPI. Set this only to indicate it is a requirements required by PyPI release only, +# or set this with version_requirements_conda to indicates that it has different version in PyPI and conda. +# version_requirements_conda: The version requirements specifiers when this requirement is a dependency of +# SnowML release via conda. Set this only to indicate it is a requirements required by conda release only, +# or set this with version_requirements_pypi to indicates that it has different version in PyPI and conda. +# At least 1 of these 3 fields but be set to indicate that this package is a dependency of release. +# If you don't want to constrain version, set the field to empty string. +# +# requirements_extra_tags: PyPI release only. Set this to indicate the package is a extras dependency of the SnowML. +# This requirements will be then added to all extras tags set here, and an all extras tag will be auto +# generated to include all extras requirements. All extras requirements will be labeled as run_constrained in conda +# meta.yaml. +# tags: Set tag to to filter some of the requirements in some cases. + +- name: absl-py + dev_version: "0.15.0" + version_requirements: ">=0.15,<2" +- name: anyio + dev_version: "3.5.0" + version_requirements: ">=3.5.0,<4" + tags: + - deployment_core +- name: boto3 + dev_version: "1.24.28" +- name_conda: conda-libmamba-solver + dev_version_conda: "23.1.0" + version_requirements_conda: ">=23.1.0,<24" +- name: cloudpickle + dev_version: "2.0.0" + version_requirements: "" + tags: + - deployment_core +- name: coverage + dev_version: "6.3.2" +- name_conda: docker-py + name_pypi: docker + dev_version: "4.4.1" +- name: flask-cors + dev_version: "3.0.10" +- name: flask + dev_version: "2.1.3" +- name_pypi: fsspec[http] + name_conda: fsspec + dev_version: "2022.11.0" + version_requirements: ">=2022.11,<=2023.1" +- name: httpx + dev_version: "0.23.0" +- name: inflection + dev_version: "0.5.1" +- name: jsonschema + dev_version: "3.2.0" +- name: joblib + dev_version: "1.1.1" +- name: lightgbm + dev_version: "3.3.5" + version_requirements: "==3.3.5" + requirements_extra_tags: + - lightgbm +- name: moto + dev_version: "4.0.11" + from_channel: conda-forge +- name: mypy + dev_version: "0.981" +- name: networkx + dev_version: "2.8.4" +- name: numpy + dev_version: "1.23.4" + version_requirements: ">=1.23,<2" + tags: + - deployment_core +- name: packaging + dev_version: "23.0" + version_requirements: ">=20.9,<24" + tags: + - deployment_core +- name: pandas + dev_version: "1.4.4" + version_requirements: ">=1.0.0,<2" + tags: + - deployment_core +- name: pytest + dev_version: "7.1.2" +- name_conda: python + dev_version_conda: "3.8.13" + version_requirements_conda: "" +- name_pypi: torch + name_conda: pytorch + dev_version: "1.12.1" +- name: pyyaml + dev_version: "6.0" + version_requirements: ">=6.0,<7" + tags: + - deployment_core +- name: ruamel.yaml + dev_version: 
"0.17.21" +- name: s3fs + dev_version: "2022.11.0" +- name: scikit-learn + dev_version: "1.2.2" + version_requirements: ">=1.2.1,<2" + tags: + - deployment_core +- name: scipy + dev_version: "1.9.3" + version_requirements: ">=1.9,<2" +- name_conda: snowflake-connector-python + name_pypi: snowflake-connector-python[pandas] + dev_version: "3.0.3" + version_requirements: "" +- name: snowflake-snowpark-python + dev_version: "1.4.0" + version_requirements: ">=1.4.0,<2" + tags: + - deployment_core +- name: starlette + dev_version: "0.27.0" + from_channel: conda-forge +- name: sqlparse + dev_version: "0.4.3" + version_requirements: ">=0.4,<1" +- name: tensorflow + dev_version: "2.9.1" + from_channel: conda-forge + version_requirements: ">=2.9,<3" + requirements_extra_tags: + - tensorflow +- name: torchdata + dev_version: "0.4.1" + from_channel: conda-forge + version_requirements: ">=0.4,<1" + requirements_extra_tags: + - torch +- name: transformers + dev_version: "4.27.1" + from_channel: conda-forge +- name: types-PyYAML + dev_version: "6.0.12" + from_channel: conda-forge +- name: typing-extensions + dev_version: "4.5.0" + version_requirements: ">=4.1.0,<5" + tags: + - deployment_core +- name: xgboost + dev_version: "1.7.3" + version_requirements: ">=1.7.3,<2" diff --git a/snowflake/ml/BUILD.bazel b/snowflake/ml/BUILD.bazel index f0e35b67..6438523e 100644 --- a/snowflake/ml/BUILD.bazel +++ b/snowflake/ml/BUILD.bazel @@ -1,15 +1,10 @@ load("//bazel:py_rules.bzl", "py_library", "snowml_wheel") load(":version.bzl", "VERSION") +load(":requirements.bzl", "EXTRA_REQUIREMENTS", "REQUIREMENTS") -package(default_visibility = ["//visibility:public"]) - -_TENSORFLOW_REQUIRES = ["tensorflow>=2.9,<3"] - -_PYTORCH_REQUIRES = ["torchdata>=0.4,<1"] +exports_files(["requirements.bzl"]) -_LIGHTGBM_REQUIRES = ["lightgbm==3.3.5"] - -_ALL_REQUIRES = _TENSORFLOW_REQUIRES + _PYTORCH_REQUIRES + _LIGHTGBM_REQUIRES +package(default_visibility = ["//visibility:public"]) genrule( name = "generate_version", @@ -27,65 +22,45 @@ snowml_wheel( name = "wheel", compatible_with_snowpark = False, development_status = "PrPr", - extra_requires = { - "tensorflow": _TENSORFLOW_REQUIRES, - "pytorch": _PYTORCH_REQUIRES, - "lightgbm": _LIGHTGBM_REQUIRES, - "all": _ALL_REQUIRES, - }, - # TODO(zhuo): consider adding a check to make sure what's listed - # here is a subset that is compatible with what is specified in conda-env.yml. - requires = [ - "absl-py>=0.15,<2", - "anyio>=3.5.0,<4", - "cloudpickle", # Version range is specified by snowpark. We are implicitly depending on it. - "fsspec[http]>=2022.11,<=2023.1", - "numpy>=1.23,<2", - "packaging>=20.9,<24", - "pandas>=1.0.0,<2", # Limit since 2.x is not available in Snowflake Anaconda Channel yet. 
- "pyyaml>=6.0,<7", - "scikit-learn>=1.2.1,<2", - "scipy>=1.9,<2", - "snowflake-connector-python[pandas]", - "snowflake-snowpark-python>=1.4.0,<2", - "sqlparse>=0.4,<1", - "typing-extensions>=4.1.0,<5", - "xgboost>=1.7.3,<2", - ], + extra_requires = EXTRA_REQUIREMENTS, + requires = REQUIREMENTS, version = VERSION, deps = [ - "//snowflake/ml/metrics:metrics_pkg", - "//snowflake/ml/sklearn/preprocessing:preprocessing_pkg", + "//snowflake/ml/modeling/impute:impute_pkg", + "//snowflake/ml/modeling/metrics:metrics_pkg", + "//snowflake/ml/modeling/pipeline:pipeline_pkg", + "//snowflake/ml/modeling/preprocessing:preprocessing_pkg", "//snowflake/ml/utils:utils_pkg", "//snowflake/ml/fileset:fileset_pkg", "//snowflake/ml/registry:model_registry_pkg", # Auotgen packages - "//snowflake/ml/sklearn/linear_model:sklearn_linear_model_pkg", - "//snowflake/ml/sklearn/ensemble:sklearn_ensemble_pkg", - "//snowflake/ml/sklearn/svm:sklearn_svm_pkg", - "//snowflake/ml/sklearn/neural_network:sklearn_neural_network_pkg", - "//snowflake/ml/sklearn/tree:sklearn_tree_pkg", - "//snowflake/ml/sklearn/calibration:sklearn_calibration_pkg", - "//snowflake/ml/sklearn/cluster:sklearn_cluster_pkg", - "//snowflake/ml/sklearn/compose:sklearn_compose_pkg", - "//snowflake/ml/sklearn/covariance:sklearn_covariance_pkg", - "//snowflake/ml/sklearn/decomposition:sklearn_decomposition_pkg", - "//snowflake/ml/sklearn/discriminant_analysis:sklearn_discriminant_analysis_pkg", - "//snowflake/ml/sklearn/feature_selection:sklearn_feature_selection_pkg", - "//snowflake/ml/sklearn/gaussian_process:sklearn_gaussian_process_pkg", - "//snowflake/ml/sklearn/impute:sklearn_impute_pkg", - "//snowflake/ml/sklearn/isotonic:sklearn_isotonic_pkg", - "//snowflake/ml/sklearn/kernel_approximation:sklearn_kernel_approximation_pkg", - "//snowflake/ml/sklearn/kernel_ridge:sklearn_kernel_ridge_pkg", - "//snowflake/ml/sklearn/manifold:sklearn_manifold_pkg", - "//snowflake/ml/sklearn/mixture:sklearn_mixture_pkg", - "//snowflake/ml/sklearn/model_selection:sklearn_model_selection_pkg", - "//snowflake/ml/sklearn/multiclass:sklearn_multiclass_pkg", - "//snowflake/ml/sklearn/multioutput:sklearn_multioutput_pkg", - "//snowflake/ml/sklearn/naive_bayes:sklearn_naive_bayes_pkg", - "//snowflake/ml/sklearn/neighbors:sklearn_neighbors_pkg", - "//snowflake/ml/sklearn/semi_supervised:sklearn_semi_supervised_pkg", - "//snowflake/ml/xgboost:xgboost_pkg", - "//snowflake/ml/lightgbm:lightgbm_pkg", + "//snowflake/ml/modeling/linear_model:sklearn_linear_model_pkg", + "//snowflake/ml/modeling/ensemble:sklearn_ensemble_pkg", + "//snowflake/ml/modeling/svm:sklearn_svm_pkg", + "//snowflake/ml/modeling/neural_network:sklearn_neural_network_pkg", + "//snowflake/ml/modeling/tree:sklearn_tree_pkg", + "//snowflake/ml/modeling/calibration:sklearn_calibration_pkg", + "//snowflake/ml/modeling/cluster:sklearn_cluster_pkg", + "//snowflake/ml/modeling/compose:sklearn_compose_pkg", + "//snowflake/ml/modeling/covariance:sklearn_covariance_pkg", + "//snowflake/ml/modeling/decomposition:sklearn_decomposition_pkg", + "//snowflake/ml/modeling/discriminant_analysis:sklearn_discriminant_analysis_pkg", + "//snowflake/ml/modeling/feature_selection:sklearn_feature_selection_pkg", + "//snowflake/ml/modeling/gaussian_process:sklearn_gaussian_process_pkg", + "//snowflake/ml/modeling/impute:sklearn_impute_pkg", + "//snowflake/ml/modeling/isotonic:sklearn_isotonic_pkg", + "//snowflake/ml/modeling/kernel_approximation:sklearn_kernel_approximation_pkg", + 
"//snowflake/ml/modeling/kernel_ridge:sklearn_kernel_ridge_pkg", + "//snowflake/ml/modeling/lightgbm:lightgbm_pkg", + "//snowflake/ml/modeling/manifold:sklearn_manifold_pkg", + "//snowflake/ml/modeling/mixture:sklearn_mixture_pkg", + "//snowflake/ml/modeling/model_selection:sklearn_model_selection_pkg", + "//snowflake/ml/modeling/multiclass:sklearn_multiclass_pkg", + "//snowflake/ml/modeling/multioutput:sklearn_multioutput_pkg", + "//snowflake/ml/modeling/naive_bayes:sklearn_naive_bayes_pkg", + "//snowflake/ml/modeling/neighbors:sklearn_neighbors_pkg", + "//snowflake/ml/modeling/preprocessing:sklearn_preprocessing_pkg", + "//snowflake/ml/modeling/semi_supervised:sklearn_semi_supervised_pkg", + "//snowflake/ml/modeling/xgboost:xgboost_pkg", ], ) diff --git a/snowflake/ml/_internal/BUILD.bazel b/snowflake/ml/_internal/BUILD.bazel index 6a98bd1f..2e84a63f 100644 --- a/snowflake/ml/_internal/BUILD.bazel +++ b/snowflake/ml/_internal/BUILD.bazel @@ -29,6 +29,7 @@ py_test( srcs = ["file_utils_test.py"], deps = [ ":file_utils", + "//snowflake/ml/test_utils:mock_session", ], ) @@ -37,6 +38,7 @@ py_library( srcs = ["env_utils.py"], deps = [ "//snowflake/ml/_internal/utils:query_result_checker", + ":env", ], ) diff --git a/snowflake/ml/_internal/env_utils.py b/snowflake/ml/_internal/env_utils.py index b25214e2..d844564a 100644 --- a/snowflake/ml/_internal/env_utils.py +++ b/snowflake/ml/_internal/env_utils.py @@ -8,9 +8,11 @@ from packaging import requirements, specifiers, utils as packaging_utils, version import snowflake.connector +from snowflake.ml._internal import env as snowml_env from snowflake.ml._internal.utils import query_result_checker from snowflake.snowpark import session +_SNOWML_PKG_NAME = "snowflake-ml-python" _INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION: Optional[bool] = None _SNOWFLAKE_CONDA_PACKAGE_CACHE: Dict[str, List[version.Version]] = {} @@ -192,17 +194,20 @@ def get_local_installed_version_of_pip_package(pip_req: requirements.Requirement try: local_dist = importlib_metadata.distribution(pip_req.name) local_dist_version = local_dist.version - if pip_req.specifier.contains(local_dist_version): - new_pip_req = copy.deepcopy(pip_req) - new_pip_req.specifier = specifiers.SpecifierSet(specifiers=f"=={local_dist_version}") - return new_pip_req + except importlib_metadata.PackageNotFoundError: + if pip_req.name == _SNOWML_PKG_NAME: + local_dist_version = snowml_env.VERSION else: - warnings.warn( - f"Package requirement {str(pip_req)} specified, while version {local_dist_version} is installed.", - category=UserWarning, - ) return pip_req - except importlib_metadata.PackageNotFoundError: + if pip_req.specifier.contains(local_dist_version): + new_pip_req = copy.deepcopy(pip_req) + new_pip_req.specifier = specifiers.SpecifierSet(specifiers=f"=={local_dist_version}") + return new_pip_req + else: + warnings.warn( + f"Package requirement {str(pip_req)} specified, while version {local_dist_version} is installed.", + category=UserWarning, + ) return pip_req diff --git a/snowflake/ml/_internal/env_utils_test.py b/snowflake/ml/_internal/env_utils_test.py index 2aacd2cd..aeb4bd06 100644 --- a/snowflake/ml/_internal/env_utils_test.py +++ b/snowflake/ml/_internal/env_utils_test.py @@ -243,6 +243,12 @@ def test_get_local_installed_version_of_pip_package(self) -> None: env_utils.get_local_installed_version_of_pip_package(r), ) + r = requirements.Requirement(env_utils._SNOWML_PKG_NAME) + self.assertEqual( + requirements.Requirement(f"{env_utils._SNOWML_PKG_NAME}=={snowml_env.VERSION}"), + 
env_utils.get_local_installed_version_of_pip_package(r), + ) + r = requirements.Requirement("python-package") self.assertIs( r, diff --git a/snowflake/ml/_internal/file_utils.py b/snowflake/ml/_internal/file_utils.py index 11960346..58bd0720 100644 --- a/snowflake/ml/_internal/file_utils.py +++ b/snowflake/ml/_internal/file_utils.py @@ -1,10 +1,15 @@ import contextlib +import hashlib +import importlib import io import os +import pathlib import shutil import tempfile import zipfile -from typing import IO, Generator, Optional +from typing import IO, Generator, Optional, Tuple, Union + +from snowflake.snowpark import session as snowpark_session GENERATED_PY_FILE_EXT = (".pyc", ".pyo", ".pyd", ".pyi") @@ -77,6 +82,7 @@ def zip_file_or_directory_to_stream( cur_path = os.path.dirname(cur_path) if os.path.isdir(path): + zf.writestr(f"{os.path.relpath(path, start_path)}/", "") for dirname, _, files in os.walk(path): # ignore __pycache__ if ignore_generated_py_file and "__pycache__" in dirname: @@ -109,3 +115,61 @@ def unzip_stream_in_temp_dir(stream: IO[bytes], temp_root: Optional[str] = None) with zipfile.ZipFile(stream, mode="r", compression=zipfile.ZIP_DEFLATED) as zf: zf.extractall(path=tempdir) yield tempdir + + +@contextlib.contextmanager +def zip_snowml() -> Generator[Tuple[io.BytesIO, str], None, None]: + """Zip the snowflake-ml source code as a zip-file for import. + + Yields: + A bytes IO stream containing the zip file. + """ + snowml_path = list(importlib.import_module("snowflake.ml").__path__)[0] + root_path = os.path.normpath(os.path.join(snowml_path, os.pardir, os.pardir)) + with zip_file_or_directory_to_stream(snowml_path, root_path) as stream: + yield stream, hash_directory(snowml_path) + + +def hash_directory(directory: Union[str, pathlib.Path]) -> str: + """Hash the **content** of a folder recursively using SHA-1. + + Args: + directory: The path to the directory to be hashed. + + Returns: + The hexdigest form of the hash result. + """ + + def _update_hash_from_dir(directory: Union[str, pathlib.Path], hash: "hashlib._Hash") -> "hashlib._Hash": + assert pathlib.Path(directory).is_dir(), "Provided path is not a directory." + for path in sorted(pathlib.Path(directory).iterdir(), key=lambda p: str(p).lower()): + hash.update(path.name.encode()) + if path.is_file(): + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(64 * 1024), b""): + hash.update(chunk) + elif path.is_dir(): + hash = _update_hash_from_dir(path, hash) + return hash + + return _update_hash_from_dir(directory, hashlib.sha1()).hexdigest() + + +def upload_snowml(session: snowpark_session.Session, stage_location: Optional[str] = None) -> str: + """Upload the SnowML local code into a stage if provided, or a session stage. + It will label the file name using the SHA-1 of the snowflake.ml folder, so that if the source code does not change, + it won't reupload. Any changes will, however, result a new zip file. + + Args: + session: Snowpark connection session. + stage_location: The path to the stage location where the uploaded SnowML should be. Defaults to None. + + Returns: + The path to the uploaded SnowML zip file. 
+ """ + with zip_snowml() as (stream, hash_str): + if stage_location is None: + stage_location = session.get_session_stage() + file_location = os.path.join(stage_location, f"snowml_{hash_str}.zip") + session.file.put_stream(stream, stage_location=file_location, auto_compress=False, overwrite=False) + return file_location diff --git a/snowflake/ml/_internal/file_utils_test.py b/snowflake/ml/_internal/file_utils_test.py index d46aa956..568c1a7d 100644 --- a/snowflake/ml/_internal/file_utils_test.py +++ b/snowflake/ml/_internal/file_utils_test.py @@ -1,12 +1,17 @@ -# import importlib +import importlib import os - -# import sys +import shutil +import sys import tempfile +import zipimport +from typing import cast +from unittest import mock from absl.testing import absltest from snowflake.ml._internal import file_utils +from snowflake.ml.test_utils import mock_session +from snowflake.snowpark import session PY_SRC = """\ def get_name(): @@ -23,31 +28,30 @@ def test_zip_file_or_directory_to_stream(self) -> None: fake_mod_dirpath = os.path.join(leading_path, "snowflake", "fake", "fake_module") os.makedirs(fake_mod_dirpath) - # TODO(SNOW-831507): Test disabled because it breaks the coverage - # py_file_path = os.path.join(fake_mod_dirpath, "p.py") - # with open(py_file_path, "w") as f: - # f.write(PY_SRC) + py_file_path = os.path.join(fake_mod_dirpath, "p.py") + with open(py_file_path, "w") as f: + f.write(PY_SRC) zip_module_filename = os.path.join(tmpdir, "fake_module.zip") - # with file_utils.zip_file_or_directory_to_stream(py_file_path, leading_path) as input_stream: - # with open(zip_module_filename, "wb") as f: - # f.write(input_stream.getbuffer()) + with file_utils.zip_file_or_directory_to_stream(py_file_path, leading_path) as input_stream: + with open(zip_module_filename, "wb") as f: + f.write(input_stream.getbuffer()) - # sys.path.insert(0, os.path.abspath(zip_module_filename)) + sys.path.insert(0, os.path.abspath(zip_module_filename)) - # importlib.import_module("snowflake.fake.fake_module.p") + importlib.import_module("snowflake.fake.fake_module.p") - # sys.path.remove(os.path.abspath(zip_module_filename)) + sys.path.remove(os.path.abspath(zip_module_filename)) - # with file_utils.zip_file_or_directory_to_stream(fake_mod_dirpath, leading_path) as input_stream: - # with open(zip_module_filename, "wb") as f: - # f.write(input_stream.getbuffer()) + with file_utils.zip_file_or_directory_to_stream(fake_mod_dirpath, leading_path) as input_stream: + with open(zip_module_filename, "wb") as f: + f.write(input_stream.getbuffer()) - # sys.path.insert(0, os.path.abspath(zip_module_filename)) + sys.path.insert(0, os.path.abspath(zip_module_filename)) - # importlib.import_module("snowflake.fake.fake_module.p") + importlib.import_module("snowflake.fake.fake_module.p") - # sys.path.remove(os.path.abspath(zip_module_filename)) + sys.path.remove(os.path.abspath(zip_module_filename)) with file_utils.zip_file_or_directory_to_stream(fake_mod_dirpath, fake_mod_dirpath) as input_stream: with open(zip_module_filename, "wb") as f: @@ -68,6 +72,119 @@ def test_unzip_stream_in_temp_dir(self) -> None: with open(os.path.join(sub_tempdir, "snowflake", "fake", "fake_module", "p.py")) as f: self.assertEqual(f.read(), PY_SRC) + def test_zip_snowml(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + zip_module_filename = os.path.join(tmpdir, "snowml.zip") + with file_utils.zip_snowml() as (input_stream, _): + with open(zip_module_filename, "wb") as f: + f.write(input_stream.getbuffer()) + sys.path.insert(0, 
os.path.abspath(zip_module_filename)) + + mod = importlib.import_module("snowflake.ml._internal.file_utils_test") + self.assertIsInstance(mod.__loader__, zipimport.zipimporter) + + sys.path.remove(os.path.abspath(zip_module_filename)) + + def test_hash_directory(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + os.mkdir(os.path.join(tmpdir, "test")) + with open(os.path.join(tmpdir, "test", "snowflake"), "w") as f: + f.write("Hello Snowflake!") + f.flush() + hash_0 = file_utils.hash_directory(os.path.join(tmpdir, "test")) + hash_1 = file_utils.hash_directory(os.path.join(tmpdir, "test")) + shutil.rmtree(os.path.join(tmpdir, "test")) + + os.mkdir(os.path.join(tmpdir, "test")) + with open(os.path.join(tmpdir, "test", "snowflake"), "w") as f: + f.write("Hello Snowflake!") + f.flush() + hash_2 = file_utils.hash_directory(os.path.join(tmpdir, "test")) + shutil.rmtree(os.path.join(tmpdir, "test")) + + os.mkdir(os.path.join(tmpdir, "test")) + with open(os.path.join(tmpdir, "test", "snowflake"), "w") as f: + f.write("Hello Taffy!") + f.flush() + hash_3 = file_utils.hash_directory(os.path.join(tmpdir, "test")) + shutil.rmtree(os.path.join(tmpdir, "test")) + + os.mkdir(os.path.join(tmpdir, "test")) + with open(os.path.join(tmpdir, "test", "snow"), "w") as f: + f.write("Hello Snowflake!") + f.flush() + hash_4 = file_utils.hash_directory(os.path.join(tmpdir, "test")) + shutil.rmtree(os.path.join(tmpdir, "test")) + + os.mkdir(os.path.join(tmpdir, "not-a-test")) + with open(os.path.join(tmpdir, "not-a-test", "snowflake"), "w") as f: + f.write("Hello Snowflake!") + f.flush() + hash_5 = file_utils.hash_directory(os.path.join(tmpdir, "not-a-test")) + shutil.rmtree(os.path.join(tmpdir, "not-a-test")) + + os.makedirs(os.path.join(tmpdir, "test", "test")) + with open(os.path.join(tmpdir, "test", "test", "snowflake"), "w") as f: + f.write("Hello Snowflake!") + f.flush() + hash_6 = file_utils.hash_directory(os.path.join(tmpdir, "test")) + shutil.rmtree(os.path.join(tmpdir, "test")) + + self.assertEqual(hash_0, hash_1) + self.assertEqual(hash_0, hash_2) + self.assertNotEqual(hash_0, hash_3) + self.assertNotEqual(hash_0, hash_4) + self.assertEqual(hash_0, hash_5) + self.assertNotEqual(hash_0, hash_6) + + _MOCK_SHA1_RESULT = "10757e7c6da427f7eca0646fcd2a3883" + + def test_upload_snowml_session_stage(self) -> None: + m_session = mock_session.MockSession(conn=None, test_case=self) + c_session = cast(session.Session, m_session) + with mock.patch.object( + c_session, "get_session_stage", create=True, return_value="@mock_session_stage" + ) as mock_session_stage: + with mock.patch.object(c_session, "file", create=True) as mock_file_object: + with mock.patch.object(mock_file_object, "put_stream") as mock_put_stream: + with mock.patch.object( + file_utils, "hash_directory", return_value=FileUtilsTest._MOCK_SHA1_RESULT + ) as mock_hash_directory: + file_location = file_utils.upload_snowml(c_session) + mock_session_stage.assert_called_once_with() + mock_hash_directory.assert_called_once() + mock_put_stream.assert_called_once_with( + mock.ANY, + stage_location=f"@mock_session_stage/snowml_{FileUtilsTest._MOCK_SHA1_RESULT}.zip", + auto_compress=False, + overwrite=False, + ) + self.assertEqual( + file_location, f"@mock_session_stage/snowml_{FileUtilsTest._MOCK_SHA1_RESULT}.zip" + ) + + def test_upload_snowml_provided_stage(self) -> None: + m_session = mock_session.MockSession(conn=None, test_case=self) + c_session = cast(session.Session, m_session) + with mock.patch.object( + c_session, "get_session_stage", 
create=True, return_value="@mock_session_stage" + ) as mock_session_stage: + with mock.patch.object(c_session, "file", create=True) as mock_file_object: + with mock.patch.object(mock_file_object, "put_stream") as mock_put_stream: + with mock.patch.object( + file_utils, "hash_directory", return_value=FileUtilsTest._MOCK_SHA1_RESULT + ) as mock_hash_directory: + file_location = file_utils.upload_snowml(c_session, stage_location="@mystage") + mock_session_stage.assert_not_called() + mock_hash_directory.assert_called_once() + mock_put_stream.assert_called_once_with( + mock.ANY, + stage_location=f"@mystage/snowml_{FileUtilsTest._MOCK_SHA1_RESULT}.zip", + auto_compress=False, + overwrite=False, + ) + self.assertEqual(file_location, f"@mystage/snowml_{FileUtilsTest._MOCK_SHA1_RESULT}.zip") + if __name__ == "__main__": absltest.main() diff --git a/snowflake/ml/_internal/init_utils.py b/snowflake/ml/_internal/init_utils.py index ec6d836c..ce870de7 100644 --- a/snowflake/ml/_internal/init_utils.py +++ b/snowflake/ml/_internal/init_utils.py @@ -9,7 +9,7 @@ def fetch_classes_from_modules_in_pkg_dir(pkg_dir: str, pkg_name: str) -> Dict[s Args: pkg_dir: Path of the package directory. - pkg_name: Package name. Example, "snowflake.ml.sklearn.preprocessing". + pkg_name: Package name. Example, "snowflake.ml.modeling.preprocessing". Returns: A dict with class_name as key and class object as value. diff --git a/snowflake/ml/_internal/utils/disable_package_version_enforcement.py b/snowflake/ml/_internal/utils/disable_package_version_enforcement.py new file mode 100644 index 00000000..8ed2826a --- /dev/null +++ b/snowflake/ml/_internal/utils/disable_package_version_enforcement.py @@ -0,0 +1,17 @@ +""" +Enables relax version: + +The API and results of this might lead to more issues caused by +the different versioning from packages, such as sklearn, pandas, ... 
+ +Importing this file dynamically sets _relax_version = True + + >>> # explicitly import this package + >>> from snowflake.ml._internal.utils import disable_package_version_enforcement # noqa + >>> # now you can import other package normally without any version errors + >>> from snowflake.ml.modeling.linear_model import LogisticRegression +""" + +from snowflake.ml._internal.utils import pkg_version_utils + +pkg_version_utils._relax_version = True diff --git a/snowflake/ml/_internal/utils/pkg_version_utils.py b/snowflake/ml/_internal/utils/pkg_version_utils.py index 7c9190bc..242bdd22 100644 --- a/snowflake/ml/_internal/utils/pkg_version_utils.py +++ b/snowflake/ml/_internal/utils/pkg_version_utils.py @@ -1,34 +1,59 @@ -import inspect import sys +import warnings from typing import Dict, List, Optional +from packaging.version import Version + from snowflake.ml._internal import telemetry -from snowflake.snowpark import DataFrame, Session +from snowflake.snowpark import Session -cache: Dict[str, bool] = {} +cache: Dict[str, Optional[str]] = {} _PROJECT = "ModelDevelopment" _SUBPROJECT = "utils" _RUNTIME_VERSION = f"{sys.version_info.major}.{sys.version_info.minor}" +_relax_version: bool = False + + +def is_relaxed() -> bool: + return _relax_version -def validate_pkg_versions_supported_in_snowflake_conda_channel( + +def get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions: List[str], session: Session, subproject: Optional[str] = None -) -> None: +) -> List[Optional[str]]: + pkg_version_conda_list: List[Optional[str]] = [] + pkg_version_warning_list: List[List[str]] = [] for pkg_version in pkg_versions: - if not _validate_pkg_version_supported_in_snowflake_conda_channel( + conda_pkg_version = _validate_pkg_version_supported_in_snowflake_conda_channel( pkg_version=pkg_version, session=session, subproject=subproject - ): - raise RuntimeError( - f"Package {pkg_version} is not supported in snowflake conda channel for " - f"python runtime {_RUNTIME_VERSION}." - ) + ) + if not conda_pkg_version: + if _relax_version: + pkg_version_warning_list.append([pkg_version, _RUNTIME_VERSION]) + else: + raise RuntimeError( + f"Package {pkg_version} is not supported in snowflake conda channel for " + f"python runtime {_RUNTIME_VERSION}." + ) + else: + tokens = pkg_version.split("==") + pkg_name = tokens[0] + pkg_version_conda_list.append(f"{pkg_name}=={conda_pkg_version}") + if pkg_version_warning_list: + warnings.warn( + f"Package {', '.join([pkg[0] for pkg in pkg_version_warning_list])} is not supported " + f"in snowflake conda channel for python runtime " + f"{', '.join([pkg[1] for pkg in pkg_version_warning_list])}." + ) + return pkg_version_conda_list def _validate_pkg_version_supported_in_snowflake_conda_channel( pkg_version: str, session: Session, subproject: Optional[str] = None -) -> bool: +) -> Optional[str]: if pkg_version not in cache: tokens = pkg_version.split("==") if len(tokens) != 2: @@ -37,25 +62,34 @@ def _validate_pkg_version_supported_in_snowflake_conda_channel( f"'==', but found {pkg_version}" ) pkg_name = tokens[0] - version = tokens[1] + version = Version(tokens[1]) + major_version, minor_version, micro_version = version.major, version.minor, version.micro - sql = f"""SELECT * - FROM information_schema.packages - WHERE package_name = '{pkg_name}' - AND version = '{version}'""" + # relax version control - only major_version.minor_version.* will be enforced. + # the result would be ordered by, the version that closest to user's version, and the latest. 
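The ordering that the query below implements can be sketched in plain Python: keep candidates that share the requested major.minor, prefer the closest micro version, and break ties toward the newer one. The candidate list here is made up purely for illustration:

    # Illustration of the relaxed-version ranking mirrored by the SQL below.
    def micro(v: str) -> int:
        return int(v.split(".")[2])

    requested_micro = 3                        # e.g. the user pinned xgboost==1.7.3
    candidates = ["1.7.1", "1.7.4", "1.7.6"]   # hypothetical rows for xgboost 1.7.*
    best = min(candidates, key=lambda v: (abs(micro(v) - requested_micro), -micro(v)))
    # best == "1.7.4": closest micro to the requested one; newer wins on ties.
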
+ sql = f""" + SELECT PACKAGE_NAME, VERSION, LANGUAGE + FROM ( + SELECT *, + SUBSTRING(VERSION, LEN(VERSION) - CHARINDEX('.', REVERSE(VERSION)) + 2, LEN(VERSION)) as micro_version + FROM information_schema.packages + WHERE package_name = '{pkg_name}' + AND version LIKE '{major_version}.{minor_version}.%' + ORDER BY abs({micro_version}-micro_version), -micro_version + ) + """ result_df = session.sql(sql) # TODO(snandamuri): Move this filter into main SQL query after BCR 7.19 is completed. if "RUNTIME_VERSION" in result_df.columns: result_df = result_df.filter(f"RUNTIME_VERSION = {_RUNTIME_VERSION}") - num_rows = result_df.count( - statement_params=telemetry.get_function_usage_statement_params( - project=_PROJECT, - subproject=subproject or _SUBPROJECT, - function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe()), - api_calls=[DataFrame.count], - ) + pkg_version_list = result_df.collect( + statement_params=telemetry.get_statement_params(_PROJECT, subproject or _SUBPROJECT) ) - cache[pkg_version] = num_rows >= 1 + + if len(pkg_version_list) >= 1: + cache[pkg_version] = pkg_version_list[0]["VERSION"] + else: + cache[pkg_version] = None return cache[pkg_version] diff --git a/snowflake/ml/_internal/utils/pkg_version_utils_test.py b/snowflake/ml/_internal/utils/pkg_version_utils_test.py index 6231f395..729e6f75 100644 --- a/snowflake/ml/_internal/utils/pkg_version_utils_test.py +++ b/snowflake/ml/_internal/utils/pkg_version_utils_test.py @@ -5,40 +5,61 @@ from snowflake.ml._internal.utils import pkg_version_utils from snowflake.ml.test_utils import mock_data_frame, mock_session -from snowflake.snowpark import session +from snowflake.snowpark import Row, session _RUNTIME_VERSION = f"{sys.version_info.major}.{sys.version_info.minor}" class PackageVersionUtilsTest(absltest.TestCase): def test_happy_case(self) -> None: - query = """SELECT * + pkg_name = "xgboost" + major_version, minor_version, micro_version = 1, 7, 3 + query = f""" + SELECT PACKAGE_NAME, VERSION, LANGUAGE + FROM ( + SELECT *, + SUBSTRING(VERSION, LEN(VERSION) - CHARINDEX('.', REVERSE(VERSION)) + 2, LEN(VERSION)) as micro_version FROM information_schema.packages - WHERE package_name = 'xgboost' - AND version = '1.7.3'""" + WHERE package_name = '{pkg_name}' + AND version LIKE '{major_version}.{minor_version}.%' + ORDER BY abs({micro_version}-micro_version), -micro_version + ) + """ m_session = mock_session.MockSession(conn=None, test_case=self) m_session.add_mock_sql( query=query, - result=mock_data_frame.MockDataFrame(count_result=1, columns=["PACKAGE_NAME", "VERSION", "LANGUAGE"]), + result=mock_data_frame.MockDataFrame( + collect_result=[Row(PACKAGE_NAME="xgboost", VERSION="1.7.3", LANGUAGE="python")], + columns=["PACKAGE_NAME", "VERSION", "LANGUAGE"], + ), ) c_session = cast(session.Session, m_session) # Test - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.7.3"], session=c_session ) # Test subsequent calls are served through cache. 
- pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.7.3"], session=c_session ) def test_happy_case_with_runtime_version_column(self) -> None: - query = """SELECT * + pkg_name = "xgboost" + major_version, minor_version, micro_version = 1, 7, 3 + query = f""" + SELECT PACKAGE_NAME, VERSION, LANGUAGE + FROM ( + SELECT *, + SUBSTRING(VERSION, LEN(VERSION) - CHARINDEX('.', REVERSE(VERSION)) + 2, LEN(VERSION)) as micro_version FROM information_schema.packages - WHERE package_name = 'xgboost' - AND version = '1.7.3'""" + WHERE package_name = '{pkg_name}' + AND version LIKE '{major_version}.{minor_version}.%' + ORDER BY abs({micro_version}-micro_version), -micro_version + ) + """ m_session = mock_session.MockSession(conn=None, test_case=self) mock_df = mock_data_frame.MockDataFrame(columns=["PACKAGE_NAME", "VERSION", "LANGUAGE", "RUNTIME_VERSION"]) @@ -49,45 +70,59 @@ def test_happy_case_with_runtime_version_column(self) -> None: c_session = cast(session.Session, m_session) # Test - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.7.3"], session=c_session ) # Test subsequent calls are served through cache. - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.7.3"], session=c_session ) def test_unsupported_version(self) -> None: - query = """SELECT * + pkg_name = "xgboost" + major_version, minor_version, micro_version = 1, 0, 0 + query = f""" + SELECT PACKAGE_NAME, VERSION, LANGUAGE + FROM ( + SELECT *, + SUBSTRING(VERSION, LEN(VERSION) - CHARINDEX('.', REVERSE(VERSION)) + 2, LEN(VERSION)) as micro_version FROM information_schema.packages - WHERE package_name = 'xgboost' - AND version = '1.0.0'""" + WHERE package_name = '{pkg_name}' + AND version LIKE '{major_version}.{minor_version}.%' + ORDER BY abs({micro_version}-micro_version), -micro_version + ) + """ m_session = mock_session.MockSession(conn=None, test_case=self) m_session.add_mock_sql( query=query, - result=mock_data_frame.MockDataFrame(count_result=0, columns=["PACKAGE_NAME", "VERSION", "LANGUAGE"]), + result=mock_data_frame.MockDataFrame(collect_result=[], columns=["PACKAGE_NAME", "VERSION", "LANGUAGE"]), ) c_session = cast(session.Session, m_session) # Test with self.assertRaises(RuntimeError): - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.0.0"], session=c_session ) # Test subsequent calls are served through cache. 
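For contrast with the strict behavior exercised in this test, the relaxed mode introduced above downgrades the error to a warning; a hedged sketch (no real session is constructed here):

    # Illustration only: importing disable_package_version_enforcement flips the
    # module-level _relax_version flag, so an unsupported pin is dropped from the
    # returned dependency list with a warning instead of raising RuntimeError.
    from snowflake.ml._internal.utils import disable_package_version_enforcement  # noqa: F401
    from snowflake.ml._internal.utils import pkg_version_utils

    assert pkg_version_utils.is_relaxed()
    # With a real session this call would warn rather than raise:
    # pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
    #     pkg_versions=["xgboost==1.0.0"], session=session)
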
with self.assertRaises(RuntimeError): - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.0.0"], session=c_session ) def test_unsupported_version_with_runtime_version_column(self) -> None: - query = """SELECT * + query = """SELECT PACKAGE_NAME, VERSION, LANGUAGE + FROM ( + SELECT *, + SUBSTRING(VERSION, LEN(VERSION) - CHARINDEX('.', REVERSE(VERSION)) + 2, LEN(VERSION)) as micro_version FROM information_schema.packages WHERE package_name = 'xgboost' - AND version = '1.0.0'""" + AND version LIKE '1.0.%' + ORDER BY abs(0-micro_version), -micro_version + )""" m_session = mock_session.MockSession(conn=None, test_case=self) mock_df = mock_data_frame.MockDataFrame(columns=["PACKAGE_NAME", "VERSION", "LANGUAGE", "RUNTIME_VERSION"]) @@ -99,13 +134,13 @@ def test_unsupported_version_with_runtime_version_column(self) -> None: # Test with self.assertRaises(RuntimeError): - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.0.0"], session=c_session ) # Test subsequent calls are served through cache. with self.assertRaises(RuntimeError): - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost==1.0.0"], session=c_session ) @@ -113,7 +148,7 @@ def test_invalid_package_name(self) -> None: m_session = mock_session.MockSession(conn=None, test_case=self) c_session = cast(session.Session, m_session) with self.assertRaises(RuntimeError): - pkg_version_utils.validate_pkg_versions_supported_in_snowflake_conda_channel( + pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel( pkg_versions=["xgboost"], session=c_session ) diff --git a/snowflake/ml/fileset/fileset.py b/snowflake/ml/fileset/fileset.py index 8d7663a2..ba5ffc64 100644 --- a/snowflake/ml/fileset/fileset.py +++ b/snowflake/ml/fileset/fileset.py @@ -254,6 +254,7 @@ def _fileset_absolute_path(self) -> str: @telemetry.send_api_usage_telemetry( project=_PROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") @_raise_if_deleted def files(self) -> List[str]: """Get the list of stage file paths in the current FileSet. @@ -279,6 +280,10 @@ def files(self) -> List[str]: self._files = [f"sfc://{file}" for file in files] return self._files + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + ) + @snowpark._internal.utils.private_preview(version="0.2.0") @_raise_if_deleted def fileset_stage_location(self) -> str: """Get the stage path to the current FileSet in sfc protocol. @@ -300,6 +305,7 @@ def fileset_stage_location(self) -> str: project=_PROJECT, func_params_to_log=["batch_size", "shuffle", "drop_last_batch"], ) + @snowpark._internal.utils.private_preview(version="0.2.0") @_raise_if_deleted def to_torch_datapipe(self, *, batch_size: int, shuffle: bool = False, drop_last_batch: bool = True) -> Any: """Transform the Snowflake data into a ready-to-use Pytorch datapipe. 
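The fileset changes above only add private_preview decorators and do not alter the FileSet API itself; for readers unfamiliar with the method shown just above, a hedged usage sketch (assumes an already-materialized FileSet instance `fs`):

    # Illustration only: consuming a FileSet as a PyTorch datapipe.
    import torch.utils.data as torch_data

    pipe = fs.to_torch_datapipe(batch_size=32, shuffle=True, drop_last_batch=True)
    for batch in torch_data.DataLoader(pipe, batch_size=None, num_workers=0):
        ...  # each batch is typically a dict keyed by column name
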
@@ -341,6 +347,7 @@ def to_torch_datapipe(self, *, batch_size: int, shuffle: bool = False, drop_last project=_PROJECT, func_params_to_log=["batch_size", "shuffle", "drop_last_batch"], ) + @snowpark._internal.utils.private_preview(version="0.2.0") @_raise_if_deleted def to_tf_dataset(self, *, batch_size: int, shuffle: bool = False, drop_last_batch: bool = True) -> Any: """Transform the Snowflake data into a ready-to-use TensorFlow tf.data.Dataset. @@ -377,6 +384,7 @@ def to_tf_dataset(self, *, batch_size: int, shuffle: bool = False, drop_last_bat @telemetry.send_api_usage_telemetry( project=_PROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") @_raise_if_deleted def to_snowpark_dataframe(self) -> snowpark.DataFrame: """Convert the fileset to a snowpark dataframe. @@ -398,6 +406,10 @@ def to_snowpark_dataframe(self) -> snowpark.DataFrame: assert isinstance(df, snowpark.DataFrame) return df + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + ) + @snowpark._internal.utils.private_preview(version="0.2.0") @_raise_if_deleted def delete(self) -> None: """Delete the FileSet directory and all the stage files in it. diff --git a/snowflake/ml/fileset/sfcfs.py b/snowflake/ml/fileset/sfcfs.py index 6cdf7247..3fa54f8f 100644 --- a/snowflake/ml/fileset/sfcfs.py +++ b/snowflake/ml/fileset/sfcfs.py @@ -107,6 +107,7 @@ def _get_stage_fs(self, sf_file_path: _SFFilePath) -> stage_fs.SFStageFileSystem func_params_to_log=["detail"], conn_attr_name="_conn", ) + @snowpark._internal.utils.private_preview(version="0.2.0") def ls(self, path: str, detail: bool = False, **kwargs: Any) -> Union[List[str], List[Dict[str, Any]]]: """Override fsspec `ls` method. List single "directory" with or without details. @@ -133,6 +134,11 @@ def ls(self, path: str, detail: bool = False, **kwargs: Any) -> Union[List[str], stage_path_list = cast(List[Dict[str, Any]], stage_path_list) return self._decorate_ls_res(stage_fs, stage_path_list, detail) + @telemetry.send_api_usage_telemetry( + project=_PROJECT, + conn_attr_name="_conn", + ) + @snowpark._internal.utils.private_preview(version="0.2.0") def optimize_read(self, files: Optional[List[str]] = None) -> None: """Prefetch and cache the presigned urls for all the given files to speed up the file opening. @@ -156,6 +162,7 @@ def optimize_read(self, files: Optional[List[str]] = None) -> None: project=_PROJECT, conn_attr_name="_conn", ) + @snowpark._internal.utils.private_preview(version="0.2.0") def _open(self, path: str, **kwargs: Any) -> fsspec.spec.AbstractBufferedFile: """Override fsspec `_open` method. Open a file for reading in 'rb' mode. @@ -181,6 +188,7 @@ def _open(self, path: str, **kwargs: Any) -> fsspec.spec.AbstractBufferedFile: project=_PROJECT, conn_attr_name="_conn", ) + @snowpark._internal.utils.private_preview(version="0.2.0") def info(self, path: str, **kwargs: Any) -> Dict[str, Any]: """Override fsspec `info` method. Give details of entry at path.""" file_path = _parse_sfc_file_path(path) diff --git a/snowflake/ml/fileset/stage_fs.py b/snowflake/ml/fileset/stage_fs.py index 28e6b11d..a7a495b8 100644 --- a/snowflake/ml/fileset/stage_fs.py +++ b/snowflake/ml/fileset/stage_fs.py @@ -130,6 +130,7 @@ def stage_name(self) -> str: project=_PROJECT, func_params_to_log=["detail"], ) + @snowpark._internal.utils.private_preview(version="0.2.0") def ls(self, path: str, detail: bool = False) -> Union[List[str], List[Dict[str, Any]]]: """Override fsspec `ls` method. List single "directory" with or without details. 
@@ -167,6 +168,7 @@ def ls(self, path: str, detail: bool = False) -> Union[List[str], List[Dict[str, @telemetry.send_api_usage_telemetry( project=_PROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def optimize_read(self, files: Optional[List[str]] = None) -> None: """Prefetch and cache the presigned urls for all the given files to speed up the read performance. @@ -193,6 +195,7 @@ def optimize_read(self, files: Optional[List[str]] = None) -> None: @telemetry.send_api_usage_telemetry( project=_PROJECT, ) + @snowpark._internal.utils.private_preview(version="0.2.0") def _open(self, path: str, mode: str = "rb", **kwargs: Any) -> fsspec.spec.AbstractBufferedFile: """Override fsspec `_open` method. Open a file for reading. diff --git a/snowflake/ml/metrics/__init__.py b/snowflake/ml/metrics/__init__.py deleted file mode 100644 index 562d1900..00000000 --- a/snowflake/ml/metrics/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .accuracy_score import accuracy_score -from .correlation import correlation -from .covariance import covariance - -__all__ = [ - "accuracy_score", - "correlation", - "covariance", -] diff --git a/snowflake/ml/model/BUILD.bazel b/snowflake/ml/model/BUILD.bazel index 8ce82f4d..0af9fce7 100644 --- a/snowflake/ml/model/BUILD.bazel +++ b/snowflake/ml/model/BUILD.bazel @@ -2,12 +2,28 @@ load("//bazel:py_rules.bzl", "py_library", "py_test") package(default_visibility = ["//visibility:public"]) +genrule( + name = "gen_core_requirements", + srcs = [ + "//:requirements.yml", + "//bazel/requirements:requirements.schema.json", + ], + outs = ["_core_requirements.py"], + cmd = "$(location //bazel/requirements:parse_and_generate_requirements) $(location //:requirements.yml) --schema $(location //bazel/requirements:requirements.schema.json) --mode version_requirements --format python --filter_by_tag deployment_core > $@", + tools = ["//bazel/requirements:parse_and_generate_requirements"], +) + +py_library( + name = "_core_requirements", + srcs = [":gen_core_requirements"], +) + py_library( name = "type_hints", srcs = ["type_hints.py"], deps = [ - "//snowflake/ml/sklearn/framework:framework" - ] + "//snowflake/ml/modeling/framework", + ], ) py_library( @@ -43,8 +59,9 @@ py_library( srcs = ["_model_meta.py"], deps = [ ":_env", - ":type_hints", ":model_signature", + ":type_hints", + ":_core_requirements", "//snowflake/ml/_internal:env", "//snowflake/ml/_internal:env_utils", "//snowflake/ml/_internal:file_utils", @@ -60,6 +77,8 @@ py_library( ":_model_meta", ":type_hints", "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:env", + "//snowflake/ml/_internal:file_utils", ], ) @@ -68,8 +87,8 @@ py_library( srcs = ["_deployer.py"], deps = [ ":_udf_util", - ":type_hints", ":model_signature", + ":type_hints", "//snowflake/ml/_internal/utils:identifier", ], ) @@ -98,7 +117,7 @@ py_library( "//snowflake/ml/model/_handlers:sklearn", "//snowflake/ml/model/_handlers:snowmlmodel", "//snowflake/ml/model/_handlers:xgboost", - "//snowflake/ml/sklearn/framework:framework" + "//snowflake/ml/modeling/framework", ], ) @@ -127,6 +146,7 @@ py_test( ":_udf_util", ":model_signature", "//snowflake/ml/_internal:env_utils", + "//snowflake/ml/_internal:env", "//snowflake/ml/test_utils:mock_data_frame", "//snowflake/ml/test_utils:mock_session", ], @@ -147,6 +167,7 @@ py_test( deps = [ ":_model_meta", ":model_signature", + "//snowflake/ml/_internal:env_utils", ], ) @@ -162,7 +183,7 @@ py_test( ":custom_model", ":model_signature", ":type_hints", + 
"//snowflake/ml/modeling/linear_model:linear_regression", "//snowflake/ml/test_utils:mock_session", - "//snowflake/ml/sklearn/linear_model:linear_regression", ], ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel index e5f6f6b4..1f12034c 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel +++ b/snowflake/ml/model/_deploy_client/image_builds/BUILD.bazel @@ -38,6 +38,13 @@ py_test( ], data = [ "templates/dockerfile_template", - "templates/app_template", + ":inference_server" + ] +) + +filegroup( + name = "inference_server", + srcs = [ + "//snowflake/ml/model/_deploy_client/image_builds/inference_server:main" ] ) diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py index 8c564936..d7998b19 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context.py @@ -47,10 +47,7 @@ def _generate_inference_code(self) -> None: Generates inference code based on the app template and creates a folder named 'server' to house the inference server code. """ - server_dir = os.path.join(self.context_dir, "server") - os.makedirs(server_dir, exist_ok=True) - - app_file_path = os.path.join(server_dir, "app.py") - app_file_template = os.path.join(os.path.dirname(__file__), "templates/app_template") - with open(app_file_path, "w") as app_file, open(app_file_template) as template: - app_file.write(template.read()) + inference_server_folder_path = os.path.join(os.path.dirname(__file__), "inference_server") + destination_folder_path = os.path.join(self.context_dir, "inference_server") + ignore_patterns = shutil.ignore_patterns("BUILD.bazel", "*test.py", "*.\\.*") + shutil.copytree(inference_server_folder_path, destination_folder_path, ignore=ignore_patterns) diff --git a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py index bbbdabcb..c7a4d68a 100644 --- a/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py +++ b/snowflake/ml/model/_deploy_client/image_builds/docker_context_test.py @@ -1,7 +1,6 @@ import os import shutil import tempfile -import unittest from absl.testing import absltest @@ -20,14 +19,14 @@ def tearDown(self) -> None: shutil.rmtree(self.context_dir) def test_build(self) -> None: - expected_files = [os.path.basename(self.model_dir), "Dockerfile", "server"] + expected_files = [os.path.basename(self.model_dir), "Dockerfile", "inference_server"] self.docker_context.build() generated_files = os.listdir(self.context_dir) self.assertCountEqual(expected_files, generated_files) - actual_inference_files = os.listdir(os.path.join(self.context_dir, "server")) - self.assertCountEqual(["app.py"], actual_inference_files) + actual_inference_files = os.listdir(os.path.join(self.context_dir, "inference_server")) + self.assertCountEqual(["main.py"], actual_inference_files) if __name__ == "__main__": - unittest.main() + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel new file mode 100644 index 00000000..113c6be1 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/BUILD.bazel @@ -0,0 +1,20 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = 
["//visibility:public"]) + +py_library( + name = "main", + srcs = ["main.py"], + deps = [ + "//snowflake/ml/model:_model", + "//snowflake/ml/model:custom_model" + ] +) + +py_test( + name = "main_test", + srcs = ["main_test.py"], + deps = [ + ":main" + ] +) diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py new file mode 100644 index 00000000..69b6b69c --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py @@ -0,0 +1,103 @@ +import logging +import os + +import pandas as pd +from starlette import applications, requests, responses, routing + +logger = logging.getLogger(__name__) +loaded_model = None + + +def _run_setup() -> None: + """Set up logging and load model into memory.""" + # Align the application logger's handler with Gunicorn's to capture logs from all processes. + gunicorn_logger = logging.getLogger("gunicorn.error") + logger.handlers = gunicorn_logger.handlers + logger.setLevel(gunicorn_logger.level) + + from snowflake.ml.model import _model as model_api + + global loaded_model + model_dir = os.getenv("MODEL_DIR") + logger.info(f"Loading model from {model_dir} into memory") + assert model_dir, "Environment variable 'model_dir' is not set" + loaded_model, _ = model_api._load_model_for_deploy(model_dir_path=model_dir) + logger.info("Successfully loaded model into memory") + + +async def ready(request: requests.Request) -> responses.JSONResponse: + """Endpoint to check if the application is ready.""" + return responses.JSONResponse({"status": "ready"}) + + +async def predict(request: requests.Request) -> responses.JSONResponse: + """Endpoint to make predictions based on input data. + + Args: + request: The input data is expected to be in the following JSON format: + { + "data": [ + [0, 5.1, 3.5, 4.2, 1.3], + [1, 4.7, 3.2, 4.1, 4.2] + } + Each row is represented as a list, where the first element denotes the index of the row. + + Returns: + Two possible responses: + For success, return a JSON response {"data": [[0, 1], [1, 2]]}, where the first element of each resulting list + denotes the index of the row, and the rest of the elements represent the prediction results for that row. + For an error, return {"error": error_message, "status_code": http_response_status_code}. + """ + try: + input = await request.json() + assert "data" in input, "missing data field in the request input" + # The expression x[1:] is used to exclude the index of the data row. + input_data = [x[1:] for x in input.get("data")] + x = pd.DataFrame(input_data) + assert len(input_data) != 0 and not all(not row for row in input_data), "empty data" + except Exception as e: + error_message = f"Input data malformed: {str(e)}" + return responses.JSONResponse({"error": error_message}, status_code=400) + + assert loaded_model + + try: + # TODO(shchen): SNOW-835369, Support target method in inference server (Multi-task model). + # Mypy ignore will be fixed along with the above ticket. + predictions = loaded_model.predict(x) # type: ignore[attr-defined] + result = predictions.to_records(index=True).tolist() + response = {"data": result} + return responses.JSONResponse(response) + except Exception as e: + error_message = f"Prediction failed: {str(e)}" + return responses.JSONResponse({"error": error_message}, status_code=400) + + +def _in_test_mode() -> bool: + """Check if the code is running in test mode. 
+ + Specifically, it checks for the presence of + - "PYTEST_CURRENT_TEST" environment variable, which is automatically set by Pytest when running tests, and + - "TEST_WORKSPACE" environment variable, which is set by Bazel test, and + - "TEST_SRCDIR" environment variable, which is set by the Absl test. + + Returns: + True if in test mode; otherwise, returns False + """ + is_running_under_py_test = "PYTEST_CURRENT_TEST" in os.environ + is_running_under_bazel_test = "TEST_WORKSPACE" in os.environ + is_running_under_absl_test = "TEST_SRCDIR" in os.environ + return is_running_under_py_test or is_running_under_bazel_test or is_running_under_absl_test + + +def run_app() -> applications.Starlette: + if not _in_test_mode(): + _run_setup() + routes = [ + routing.Route("/health", endpoint=ready, methods=["GET"]), + routing.Route("/predict", endpoint=predict, methods=["POST"]), + ] + return applications.Starlette(routes=routes) + + +app = run_app() diff --git a/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py new file mode 100644 index 00000000..853f1a41 --- /dev/null +++ b/snowflake/ml/model/_deploy_client/image_builds/inference_server/main_test.py @@ -0,0 +1,116 @@ +import pandas as pd +import sklearn.datasets as datasets +import sklearn.neighbors as neighbors +from absl.testing import absltest +from absl.testing.absltest import mock +from starlette import testclient + +from snowflake.ml.model import custom_model + + +class MainTest(absltest.TestCase): + """ + This test utilizes TestClient, powered by httpx, to send requests to the Starlette application. It optionally skips + the model loading step in the inference code, which is irrelevant for route testing and challenging to mock due to + gunicorn's preload option when loading the Starlette Python app. This skipping is achieved by checking the presence + of the 'PYTEST_CURRENT_TEST' environment variable during pytest execution, the 'TEST_WORKSPACE' variable during + bazel test execution, or the 'TEST_SRCDIR' variable during Absl test execution. 
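For context on the new inference server above: main.py exposes GET /health and POST /predict, with the request/response contract documented in the predict docstring. Below is a minimal client-side sketch of that contract; the base URL and feature values are illustrative assumptions (the app is expected to sit behind Gunicorn in the deployment image, but no host or port is defined in this change).

```
# Hypothetical client call; the URL and feature values are made up for illustration.
# Mirrors the JSON contract in main.predict: each row starts with its index.
import httpx

payload = {
    "data": [
        [0, 5.1, 3.5, 4.2, 1.3],
        [1, 4.7, 3.2, 4.1, 4.2],
    ]
}

base_url = "http://localhost:8000"  # assumption: wherever the app happens to be served

assert httpx.get(f"{base_url}/health").json() == {"status": "ready"}

resp = httpx.post(f"{base_url}/predict", json=payload)
resp.raise_for_status()
print(resp.json())  # e.g. {"data": [[0, 1], [1, 2]]}: row index first, then predictions
```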
+ """ + + def setUp(self) -> None: + super().setUp() + + from main import app + + self.client = testclient.TestClient(app) + + self.loaded_model = self.get_custom_model() + + def get_custom_model(self) -> custom_model.CustomModel: + # Set up a mock model + iris = datasets.load_iris(as_frame=True) + x = iris.data + y = iris.target + knn_model = neighbors.KNeighborsClassifier() + knn_model.fit(x, y) + + class TestCustomModel(custom_model.CustomModel): + def __init__(self, context: custom_model.ModelContext) -> None: + super().__init__(context) + + @custom_model.inference_api + def predict(self, input: pd.DataFrame) -> pd.DataFrame: + return pd.DataFrame(knn_model.predict(input)) + + return TestCustomModel(custom_model.ModelContext()) + + def test_ready_endpoint(self) -> None: + with mock.patch("main.loaded_model", return_value=self.loaded_model): + response = self.client.get("/health") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json(), {"status": "ready"}) + + def test_predict_endpoint_happy_path(self) -> None: + data = { + "data": [[0, 5.1, 3.5, 4.2, 1.3], [1, 4.7, 3.2, 4.1, 4.2], [2, 5.1, 3.5, 4.2, 4.6], [3, 4.7, 3.2, 4.1, 5.1]] + } + + with mock.patch("main.loaded_model", self.loaded_model): + response = self.client.post("/predict", json=data) + + self.assertEqual(response.status_code, 200) + expected_response = {"data": [[0, 1], [1, 2], [2, 2], [3, 2]]} + self.assertEqual(response.json(), expected_response) + + def test_predict_endpoint_with_invalid_input(self) -> None: + response = self.client.post("/predict", json={}) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed: missing data field in the request input") + + response = self.client.post("/predict", json={"data": []}) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed: empty data") + + # Input data with indexes only. 
+ response = self.client.post("/predict", json={"data": [[0], [1]]}) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed: empty data") + + response = self.client.post( + "/predict", + json={ + "foo": [ + [1, 2], + [2, 3], + ] + }, + ) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Input data malformed: missing data field in the request input") + + def test_predict_with_misshaped_data(self) -> None: + data = {"data": [[0, 5.1, 3.5, 4.2], [1, 4.7, 3.2, 4.1], [2, 5.1, 3.5, 4.2], [3, 4.7, 3.2, 4.1]]} + + with mock.patch("main.loaded_model", self.loaded_model): + response = self.client.post("/predict", json=data) + self.assertEqual(response.status_code, 400) + self.assertRegex( + response.text, + "Prediction failed: X has 3 features, but KNeighborsClassifier is " "expecting 4 features as input", + ) + + def test_predict_with_incorrect_data_type(self) -> None: + data = { + "data": [ + [0, "a", "b", "c", "d"], + ] + } + + with mock.patch("main.loaded_model", self.loaded_model): + response = self.client.post("/predict", json=data) + self.assertEqual(response.status_code, 400) + self.assertRegex(response.text, "Prediction failed: could not convert string to float") + + +if __name__ == "__main__": + absltest.main() diff --git a/snowflake/ml/model/_deploy_client/image_builds/templates/app_template b/snowflake/ml/model/_deploy_client/image_builds/templates/app_template deleted file mode 100644 index 7872234c..00000000 --- a/snowflake/ml/model/_deploy_client/image_builds/templates/app_template +++ /dev/null @@ -1 +0,0 @@ -# TODO(shchen), SNOW-825996, Define inference server code template used for model deployment diff --git a/snowflake/ml/model/_handlers/BUILD.bazel b/snowflake/ml/model/_handlers/BUILD.bazel index c4a67ddf..369f09f8 100644 --- a/snowflake/ml/model/_handlers/BUILD.bazel +++ b/snowflake/ml/model/_handlers/BUILD.bazel @@ -44,9 +44,8 @@ py_library( "//snowflake/ml/_internal:type_utils", "//snowflake/ml/model:_model_meta", "//snowflake/ml/model:custom_model", - "//snowflake/ml/model:model_signature", "//snowflake/ml/model:type_hints", - "//snowflake/ml/sklearn/framework", + "//snowflake/ml/modeling/framework", ], ) diff --git a/snowflake/ml/model/_handlers/snowmlmodel.py b/snowflake/ml/model/_handlers/snowmlmodel.py index df17354a..794dad4c 100644 --- a/snowflake/ml/model/_handlers/snowmlmodel.py +++ b/snowflake/ml/model/_handlers/snowmlmodel.py @@ -10,20 +10,19 @@ from snowflake.ml.model import ( _model_meta as model_meta_api, custom_model, - model_signature, type_hints as model_types, ) from snowflake.ml.model._handlers import _base if TYPE_CHECKING: - from snowflake.ml.sklearn.framework.base import BaseEstimator + from snowflake.ml.modeling.framework.base import BaseEstimator class _SnowMLModelHandler(_base._ModelHandler["BaseEstimator"]): """Handler for SnowML based model. - Currently snowflake.ml.sklearn.framework.base.BaseEstimator - and snowflake.ml.sklearn.framework.pipeline.Pipeline based classes are supported. + Currently snowflake.ml.modeling.framework.base.BaseEstimator + and snowflake.ml.modeling.pipeline.Pipeline based classes are supported. 
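A note on the handler's type check (see can_handle below): it matches models by the dotted class path, so the handler module does not need to import snowflake.ml.modeling eagerly. The snippet below is a rough sketch of what a LazyType-style check does; it is not the repo's type_utils implementation, and the helper name is invented.

```
# Sketch only: approximates a LazyType-style isinstance check by qualified name.
# `lazy_isinstance` is a hypothetical helper, not part of this PR.
import importlib
from typing import Any


def lazy_isinstance(obj: Any, qualified_name: str) -> bool:
    module_path, _, class_name = qualified_name.rpartition(".")
    try:
        cls = getattr(importlib.import_module(module_path), class_name)
    except (ImportError, AttributeError):
        return False  # dependency not installed or class not found
    return isinstance(obj, cls)


# e.g. lazy_isinstance(model, "snowflake.ml.modeling.framework.base.BaseEstimator")
```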
""" handler_type = "snowml" @@ -34,7 +33,7 @@ def can_handle( model: model_types.SupportedModelType, ) -> TypeGuard["BaseEstimator"]: return ( - type_utils.LazyType("snowflake.ml.sklearn.framework.base.BaseEstimator").isinstance(model) + type_utils.LazyType("snowflake.ml.modeling.framework.base.BaseEstimator").isinstance(model) # Pipeline is inherited from BaseEstimator, so no need to add one more check ) and any( (hasattr(model, method) and callable(getattr(model, method, None))) @@ -45,7 +44,7 @@ def can_handle( def cast_model( model: model_types.SupportedModelType, ) -> "BaseEstimator": - from snowflake.ml.sklearn.framework.base import BaseEstimator + from snowflake.ml.modeling.framework.base import BaseEstimator assert isinstance(model, BaseEstimator) # Pipeline is inherited from BaseEstimator, so no need to add one more check @@ -62,41 +61,38 @@ def _save_model( is_sub_model: Optional[bool] = False, **kwargs: Unpack[model_types.SNOWModelSaveOptions], ) -> None: - from snowflake.ml.sklearn.framework.base import BaseEstimator + from snowflake.ml.modeling.framework.base import BaseEstimator assert isinstance(model, BaseEstimator) # Pipeline is inherited from BaseEstimator, so no need to add one more check if not is_sub_model: # TODO(xjiang): get model signature from modeling. - if model_meta._signatures is None: - # In this case sample_input should be available, because of the check in save_model. - assert sample_input is not None - target_methods = kwargs.pop("target_methods", None) - if target_methods is None: - target_methods = [ - method - for method in _SnowMLModelHandler.DEFAULT_TARGET_METHODS - if hasattr(model, method) and callable(getattr(model, method, None)) - ] - else: - for method_name in target_methods: - if not callable(getattr(model, method_name, None)): - raise ValueError(f"Target method {method_name} is not callable.") - if method_name not in _SnowMLModelHandler.DEFAULT_TARGET_METHODS: - raise ValueError(f"Target method {method_name} is not supported.") - - model_meta._signatures = {} - for method_name in target_methods: - target_method = getattr(model, method_name) - sig = model_signature.infer_signature(sample_input, target_method(sample_input)) - model_meta._signatures[method_name] = sig + if model_meta._signatures is None and sample_input is None: + assert hasattr(model, "model_signatures") + model_meta._signatures = getattr(model, "model_signatures", {}) else: - for method_name in model_meta._signatures.keys(): - if not callable(getattr(model, method_name, None)): - raise ValueError(f"Target method {method_name} is not callable.") - if method_name not in _SnowMLModelHandler.DEFAULT_TARGET_METHODS: - raise ValueError(f"Target method {method_name} is not supported.") + target_methods = model_meta_api._get_target_methods( + model=model, + target_methods=kwargs.pop("target_methods", None), + default_target_methods=_SnowMLModelHandler.DEFAULT_TARGET_METHODS, + ) + + def get_prediction( + target_method_name: str, sample_input: model_types.SupportedLocalDataType + ) -> model_types.SupportedLocalDataType: + target_method = getattr(model, target_method_name, None) + assert callable(target_method) + predictions_df = target_method(sample_input) + return predictions_df + + model_meta = model_meta_api._validate_signature( + model=model, + model_meta=model_meta, + target_methods=target_methods, + sample_input=sample_input, + get_prediction_fn=get_prediction, + ) model_blob_path = os.path.join(model_blobs_dir_path, name) os.makedirs(model_blob_path, exist_ok=True) @@ -106,9 
+102,12 @@ def _save_model( name=name, model_type=_SnowMLModelHandler.handler_type, path=_SnowMLModelHandler.MODEL_BLOB_FILE ) model_meta.models[name] = base_meta - model_meta._include_if_absent( - [("scikit-learn", "scikit-learn"), ("xgboost", "xgboost"), ("lightgbm", "lightgbm"), ("joblib", "joblib")] - ) + _include_if_absent_pkgs = [ + ("scikit-learn", "scikit-learn"), + ("xgboost", "xgboost"), + ("lightgbm", "lightgbm"), + ] + model_meta._include_if_absent(_include_if_absent_pkgs) @staticmethod def _load_model(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs_dir_path: str) -> "BaseEstimator": @@ -123,7 +122,7 @@ def _load_model(name: str, model_meta: model_meta_api.ModelMetadata, model_blobs with open(os.path.join(model_blob_path, model_blob_filename), "rb") as f: m = cloudpickle.load(f) - from snowflake.ml.sklearn.framework.base import BaseEstimator + from snowflake.ml.modeling.framework.base import BaseEstimator assert isinstance(m, BaseEstimator) return m diff --git a/snowflake/ml/model/_model.py b/snowflake/ml/model/_model.py index d11b58cf..e74a659c 100644 --- a/snowflake/ml/model/_model.py +++ b/snowflake/ml/model/_model.py @@ -13,7 +13,7 @@ model_signature, type_hints as model_types, ) -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base from snowflake.snowpark import FileOperation, Session MODEL_BLOBS_DIR = "models" diff --git a/snowflake/ml/model/_model_meta.py b/snowflake/ml/model/_model_meta.py index b2521187..27ab1ef6 100644 --- a/snowflake/ml/model/_model_meta.py +++ b/snowflake/ml/model/_model_meta.py @@ -13,21 +13,18 @@ from packaging import version from snowflake.ml._internal import env as snowml_env, env_utils, file_utils -from snowflake.ml.model import _env, model_signature, type_hints as model_types +from snowflake.ml.model import ( + _core_requirements, + _env, + model_signature, + type_hints as model_types, +) from snowflake.snowpark import DataFrame as SnowparkDataFrame MODEL_METADATA_VERSION = 1 +_BASIC_DEPENDENCIES = _core_requirements.REQUIREMENTS -_BASIC_DEPENDENCIES = [ - "pandas", - "pyyaml", - "typing-extensions", - "cloudpickle", - "packaging", - "anyio", - "snowflake-snowpark-python", - "scikit-learn", -] +_BASIC_DEPENDENCIES.append(env_utils._SNOWML_PKG_NAME) @dataclasses.dataclass @@ -229,17 +226,18 @@ def conda_dependencies(self) -> List[str]: ) def _include_if_absent(self, pkgs: List[Tuple[str, str]]) -> None: - conda_names, pip_names = tuple(zip(*pkgs)) - pip_reqs = env_utils.validate_pip_requirement_string_list(list(pip_names)) + conda_reqs_str, pip_reqs_str = tuple(zip(*pkgs)) + pip_reqs = env_utils.validate_pip_requirement_string_list(list(pip_reqs_str)) + conda_reqs = env_utils.validate_conda_dependency_string_list(list(conda_reqs_str)) - for conda_name, pip_req in zip(conda_names, pip_reqs): + for conda_req, pip_req in zip(conda_reqs[""], pip_reqs): req_to_add = env_utils.get_local_installed_version_of_pip_package(pip_req) - req_to_add.name = conda_name + req_to_add.name = conda_req.name for added_pip_req in self._pip_requirements: if added_pip_req.name == pip_req.name: warnings.warn( ( - f"Basic dependency {conda_name} specified from PIP requirements." + f"Basic dependency {conda_req} specified from PIP requirements." + " This may prevent model deploying to Snowflake Warehouse." 
), category=UserWarning, @@ -251,7 +249,7 @@ def _include_if_absent(self, pkgs: List[Tuple[str, str]]) -> None: except env_utils.DuplicateDependencyInMultipleChannelsError: warnings.warn( ( - f"Basic dependency {conda_name} specified from non-Snowflake channel." + f"Basic dependency {conda_req.name} specified from non-Snowflake channel." + " This may prevent model deploying to Snowflake Warehouse." ), category=UserWarning, diff --git a/snowflake/ml/model/_model_meta_test.py b/snowflake/ml/model/_model_meta_test.py index 79071a6e..2d594277 100644 --- a/snowflake/ml/model/_model_meta_test.py +++ b/snowflake/ml/model/_model_meta_test.py @@ -4,7 +4,9 @@ import yaml from absl.testing import absltest +from packaging import requirements +from snowflake.ml._internal import env_utils from snowflake.ml.model import _model_meta, model_signature _DUMMY_SIG = { @@ -17,7 +19,12 @@ } _BASIC_DEPENDENCIES_TARGET = list( - sorted(map(lambda x: f"{x}=={importlib_metadata.version(x)}", _model_meta._BASIC_DEPENDENCIES)) + sorted( + map( + lambda x: str(env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x))), + _model_meta._BASIC_DEPENDENCIES, + ) + ) ) diff --git a/snowflake/ml/model/_model_test.py b/snowflake/ml/model/_model_test.py index fbc7fe1d..ab4e541d 100644 --- a/snowflake/ml/model/_model_test.py +++ b/snowflake/ml/model/_model_test.py @@ -17,7 +17,7 @@ model_signature, type_hints as model_types, ) -from snowflake.ml.sklearn.linear_model import LinearRegression +from snowflake.ml.modeling.linear_model import LinearRegression from snowflake.ml.test_utils import mock_session from snowflake.snowpark import FileOperation, Session diff --git a/snowflake/ml/model/_udf_util.py b/snowflake/ml/model/_udf_util.py index 4966ee56..d1b81a45 100644 --- a/snowflake/ml/model/_udf_util.py +++ b/snowflake/ml/model/_udf_util.py @@ -6,7 +6,7 @@ from typing_extensions import Unpack -from snowflake.ml._internal import env_utils +from snowflake.ml._internal import env as snowml_env, env_utils, file_utils from snowflake.ml.model import ( _env as model_env, _model, @@ -43,18 +43,6 @@ myzip.extractall(extracted_model_dir_path) """ -_SNOWML_IMPORT_CODE = """ - -snowml_filename = '{snowml_filename}' -snowml_path = import_dir + snowml_filename -snowml_extracted = '/tmp/' + snowml_filename -with FileLock(): - if not os.path.isdir(snowml_extracted): - with zipfile.ZipFile(snowml_path, 'r') as myzip: - myzip.extractall(snowml_extracted) -sys.path.insert(0, snowml_extracted) -""" - _UDF_CODE_TEMPLATE = """ import pandas as pd import numpy as np @@ -81,13 +69,11 @@ def __exit__(self, type, value, traceback): IMPORT_DIRECTORY_NAME = "snowflake_import_directory" import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] -{snowml_import_code} - -from snowflake.ml.model._model import _load_model_for_deploy +from snowflake.ml.model import _model {extract_model_code} -model, meta = _load_model_for_deploy(extracted_model_dir_path) +model, meta = _model._load_model_for_deploy(extracted_model_dir_path) # TODO(halu): Wire `max_batch_size`. # TODO(halu): Avoid per batch async detection branching. 
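The removed _SNOWML_IMPORT_CODE above (and the retained {extract_model_code} block) rely on an extract-once-under-a-lock pattern so concurrent UDF worker processes do not race while unzipping into /tmp. Below is a rough, self-contained sketch of that pattern on a POSIX host; the FileLock class here is a stand-in for the helper defined in the UDF template, and the paths are illustrative assumptions.

```
# Sketch of the extract-once pattern used by the generated UDF code; the paths
# and lock-file location are assumptions for illustration.
import fcntl
import os
import zipfile


class FileLock:
    """Cross-process lock built on flock; a stand-in for the template's helper."""

    def __init__(self, path: str = "/tmp/extract.lock") -> None:
        self._path = path

    def __enter__(self) -> "FileLock":
        self._fd = os.open(self._path, os.O_CREAT | os.O_RDWR)
        fcntl.flock(self._fd, fcntl.LOCK_EX)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        fcntl.flock(self._fd, fcntl.LOCK_UN)
        os.close(self._fd)


def extract_once(zip_path: str, target_dir: str) -> str:
    # Only the first process to take the lock extracts; the rest see the directory.
    with FileLock():
        if not os.path.isdir(target_dir):
            with zipfile.ZipFile(zip_path, "r") as archive:
                archive.extractall(target_dir)
    return target_dir
```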
@@ -148,9 +134,16 @@ def _deploy_to_warehouse( if target_method not in meta.signatures.keys(): raise ValueError(f"Target method {target_method} does not exist in model.") - _snowml_wheel_path = kwargs.get("_snowml_wheel_path", None) + _use_local_snowml = kwargs.get("_use_local_snowml", False) - final_packages = _get_model_final_packages(meta, session, relax_version=relax_version) + final_packages = _get_model_final_packages( + meta, session, relax_version=relax_version, _use_local_snowml=_use_local_snowml + ) + + stage_location = kwargs.get("permanent_udf_stage_location", None) + _snowml_wheel_path = None + if _use_local_snowml: + _snowml_wheel_path = file_utils.upload_snowml(session, stage_location=stage_location) with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: _write_UDF_py_file(f.file, extract_model_code, target_method, **kwargs) @@ -161,8 +154,6 @@ def _deploy_to_warehouse( + ([_snowml_wheel_path] if _snowml_wheel_path else []) ) - stage_location = kwargs.get("permanent_udf_stage_location", None) - class _UDFParams(TypedDict): file_path: str func_name: str @@ -206,23 +197,21 @@ def _write_UDF_py_file( **kwargs: Options that control some features in generated udf code. """ keep_order = kwargs.get("keep_order", True) - snowml_wheel_path = kwargs.get("_snowml_wheel_path", None) - if snowml_wheel_path: - whl_filename = os.path.basename(snowml_wheel_path) - snowml_import_code = _SNOWML_IMPORT_CODE.format(snowml_filename=whl_filename) udf_code = _UDF_CODE_TEMPLATE.format( extract_model_code=extract_model_code, keep_order_code=_KEEP_ORDER_CODE_TEMPLATE if keep_order else "", target_method=target_method, - snowml_import_code=snowml_import_code if snowml_wheel_path else "", ) f.write(udf_code) f.flush() def _get_model_final_packages( - meta: _model_meta.ModelMetadata, session: snowpark_session.Session, relax_version: Optional[bool] = False + meta: _model_meta.ModelMetadata, + session: snowpark_session.Session, + relax_version: Optional[bool] = False, + _use_local_snowml: Optional[bool] = False, ) -> List[str]: """Generate final packages list of dependency of a model to be deployed to warehouse. @@ -231,6 +220,7 @@ def _get_model_final_packages( session: Snowpark connection session. relax_version: Whether or not relax the version restriction when fail to resolve dependencies. Defaults to False. + _use_local_snowml: Flag to indicate if using local SnowML code as execution library Raises: RuntimeError: Raised when PIP requirements and dependencies from non-Snowflake anaconda channel found. @@ -245,13 +235,26 @@ def _get_model_final_packages( or meta.pip_requirements ): raise RuntimeError("PIP requirements and dependencies from non-Snowflake anaconda channel is not supported.") + + deps = meta._conda_dependencies[""] + if _use_local_snowml: + local_snowml_version = snowml_env.VERSION + snowml_dept = next((dep for dep in deps if dep.name == env_utils._SNOWML_PKG_NAME), None) + if snowml_dept: + if not snowml_dept.specifier.contains(local_snowml_version) and not relax_version: + raise RuntimeError( + "Incompatible snowflake-ml-python-version is found. " + + f"Require {snowml_dept.specifier}, got {local_snowml_version}." 
+ ) + deps.remove(snowml_dept) + try: final_packages = env_utils.resolve_conda_environment( - meta._conda_dependencies[""], [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], python_version=meta.python_version + deps, [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], python_version=meta.python_version ) if final_packages is None and relax_version: final_packages = env_utils.resolve_conda_environment( - list(map(env_utils.relax_requirement_version, meta._conda_dependencies[""])), + list(map(env_utils.relax_requirement_version, deps)), [model_env._SNOWFLAKE_CONDA_CHANNEL_URL], python_version=meta.python_version, ) @@ -262,13 +265,13 @@ def _get_model_final_packages( ) final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( session=session, - reqs=meta._conda_dependencies[""], + reqs=deps, python_version=meta.python_version, ) if final_packages is None and relax_version: final_packages = env_utils.validate_requirements_in_snowflake_conda_channel( session=session, - reqs=list(map(env_utils.relax_requirement_version, meta._conda_dependencies[""])), + reqs=list(map(env_utils.relax_requirement_version, deps)), python_version=meta.python_version, ) diff --git a/snowflake/ml/model/_udf_util_test.py b/snowflake/ml/model/_udf_util_test.py index 53d5ed47..63d1bfd1 100644 --- a/snowflake/ml/model/_udf_util_test.py +++ b/snowflake/ml/model/_udf_util_test.py @@ -4,8 +4,9 @@ from typing import Dict, List, cast from absl.testing import absltest +from packaging import requirements -from snowflake.ml._internal import env_utils +from snowflake.ml._internal import env as snowml_env, env_utils from snowflake.ml.model import _model_meta, _udf_util, model_signature from snowflake.ml.test_utils import mock_data_frame, mock_session from snowflake.snowpark import row, session @@ -20,18 +21,24 @@ } _BASIC_DEPENDENCIES_FINAL_PACKAGES = list( - sorted(map(lambda x: f"{x}=={importlib_metadata.version(x)}", _model_meta._BASIC_DEPENDENCIES)) + sorted( + map( + lambda x: env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(x)), + _model_meta._BASIC_DEPENDENCIES, + ), + key=lambda x: x.name, + ) ) -class TestFinalPackagesWithoutConda(absltest.TestCase): +class TestFinalPackagesWithoutCondaWithSnowML(absltest.TestCase): @classmethod def setUpClass(cls) -> None: cls._temp_conda = None if sys.modules.get("conda"): cls._temp_conda = sys.modules["conda"] sys.modules["conda"] = None # type: ignore[assignment] - + env_utils._INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION = None cls.m_session = mock_session.MockSession(conn=None, test_case=None) cls.m_session.add_mock_sql( query=textwrap.dedent( @@ -46,7 +53,14 @@ def setUpClass(cls) -> None: def setUp(self) -> None: self.add_packages( - {basic_dep: [importlib_metadata.version(basic_dep)] for basic_dep in _model_meta._BASIC_DEPENDENCIES} + { + **{ + basic_dep.name: [importlib_metadata.version(basic_dep.name)] + for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES + if basic_dep.name != env_utils._SNOWML_PKG_NAME + }, + env_utils._SNOWML_PKG_NAME: [snowml_env.VERSION], + } ) @classmethod @@ -80,9 +94,12 @@ def test_get_model_final_packages(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} meta = _model_meta.ModelMetadata(name="model1", model_type="custom", signatures=_DUMMY_SIG) c_session = cast(session.Session, self.m_session) - with self.assertWarns(RuntimeWarning): + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): final_packages = _udf_util._get_model_final_packages(meta, c_session) - 
self.assertListEqual(final_packages, _BASIC_DEPENDENCIES_FINAL_PACKAGES) + self.assertListEqual(final_packages, list(map(str, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) def test_get_model_final_packages_no_relax(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -90,7 +107,10 @@ def test_get_model_final_packages_no_relax(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] ) c_session = cast(session.Session, self.m_session) - with self.assertWarns(RuntimeWarning): + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): with self.assertRaises(RuntimeError): _udf_util._get_model_final_packages(meta, c_session) @@ -100,9 +120,14 @@ def test_get_model_final_packages_relax(self) -> None: name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] ) c_session = cast(session.Session, self.m_session) - with self.assertWarns(RuntimeWarning): + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): final_packages = _udf_util._get_model_final_packages(meta, c_session, relax_version=True) - self.assertListEqual(final_packages, sorted(_model_meta._BASIC_DEPENDENCIES)) + self.assertListEqual( + final_packages, sorted(list(map(lambda x: x.name, _BASIC_DEPENDENCIES_FINAL_PACKAGES))) + ) def test_get_model_final_packages_with_pip(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} @@ -127,19 +152,216 @@ def test_get_model_final_packages_with_other_channel(self) -> None: def test_get_model_final_packages_with_non_exist_package(self) -> None: env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} - d = {basic_dep: [importlib_metadata.version(basic_dep)] for basic_dep in _model_meta._BASIC_DEPENDENCIES} + d = { + **{ + basic_dep.name: [importlib_metadata.version(basic_dep.name)] + for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES + if basic_dep.name != env_utils._SNOWML_PKG_NAME + }, + env_utils._SNOWML_PKG_NAME: [snowml_env.VERSION], + } d["python-package"] = [] + self.m_session = mock_session.MockSession(conn=None, test_case=self) self.add_packages(d) meta = _model_meta.ModelMetadata( name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["python-package"] ) c_session = cast(session.Session, self.m_session) - with self.assertWarns(RuntimeWarning): + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): with self.assertRaises(RuntimeError): _udf_util._get_model_final_packages(meta, c_session) -class TestFinalPackagesWithConda(absltest.TestCase): +_BASIC_DEPENDENCIES_FINAL_PACKAGES_NO_SNOWML = list( + sorted( + ( + env_utils.get_local_installed_version_of_pip_package(requirements.Requirement(dep)) + for dep in _model_meta._BASIC_DEPENDENCIES + if dep != env_utils._SNOWML_PKG_NAME + ), + key=lambda x: x.name, + ) +) + + +class TestFinalPackagesWithoutCondaWithoutSnowML(absltest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls._temp_conda = None + if sys.modules.get("conda"): + cls._temp_conda = sys.modules["conda"] + sys.modules["conda"] = None # type: ignore[assignment] + env_utils._INFO_SCHEMA_PACKAGES_HAS_RUNTIME_VERSION = None + cls.m_session = mock_session.MockSession(conn=None, test_case=None) + cls.m_session.add_mock_sql( + query=textwrap.dedent( + """ + SHOW COLUMNS + LIKE 'runtime_version' + IN TABLE information_schema.packages; + """ + ), + result=mock_data_frame.MockDataFrame(count_result=0), + ) + + def setUp(self) -> None: + self.add_packages( + { + basic_dep.name: 
[importlib_metadata.version(basic_dep.name)] + for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES_NO_SNOWML + if basic_dep.name != env_utils._SNOWML_PKG_NAME + } + ) + + @classmethod + def tearDownClass(cls) -> None: + if cls._temp_conda: + sys.modules["conda"] = cls._temp_conda + else: + del sys.modules["conda"] + + def add_packages(self, packages_dicts: Dict[str, List[str]]) -> None: + pkg_names_str = " OR ".join(f"package_name = '{pkg}'" for pkg in sorted(packages_dicts.keys())) + query = textwrap.dedent( + f""" + SELECT PACKAGE_NAME, VERSION + FROM information_schema.packages + WHERE ({pkg_names_str}) + AND language = 'python'; + """ + ) + sql_result = [ + row.Row(PACKAGE_NAME=pkg, VERSION=pkg_ver) + for pkg, pkg_vers in packages_dicts.items() + for pkg_ver in pkg_vers + ] + if len(sql_result) == 0: + sql_result = [row.Row()] + + self.m_session.add_mock_sql(query=query, result=mock_data_frame.MockDataFrame(sql_result)) + + def test_get_model_final_packages(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata(name="model1", model_type="custom", signatures=_DUMMY_SIG) + c_session = cast(session.Session, self.m_session) + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): + final_packages = _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) + self.assertListEqual(final_packages, list(map(str, _BASIC_DEPENDENCIES_FINAL_PACKAGES_NO_SNOWML))) + + def test_get_model_final_packages_no_relax(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata( + name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] + ) + c_session = cast(session.Session, self.m_session) + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): + with self.assertRaises(RuntimeError): + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) + + def test_get_model_final_packages_relax(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata( + name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] + ) + c_session = cast(session.Session, self.m_session) + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): + final_packages = _udf_util._get_model_final_packages( + meta, c_session, relax_version=True, _use_local_snowml=True + ) + self.assertListEqual( + final_packages, + sorted(map(lambda x: x.name, _BASIC_DEPENDENCIES_FINAL_PACKAGES_NO_SNOWML)), + ) + + def test_get_model_final_packages_with_pip(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata( + name="model1", model_type="custom", signatures=_DUMMY_SIG, pip_requirements=["python-package"] + ) + c_session = cast(session.Session, self.m_session) + with self.assertRaises(RuntimeError): + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) + + def test_get_model_final_packages_with_other_channel(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata( + name="model1", + model_type="custom", + signatures=_DUMMY_SIG, + conda_dependencies=["conda-forge::python_package"], + ) + c_session = cast(session.Session, self.m_session) + with self.assertRaises(RuntimeError): + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) + + def test_get_model_final_packages_with_non_exist_package(self) -> None: + 
env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + d = { + basic_dep.name: [importlib_metadata.version(basic_dep.name)] + for basic_dep in _BASIC_DEPENDENCIES_FINAL_PACKAGES_NO_SNOWML + if basic_dep.name != env_utils._SNOWML_PKG_NAME + } + d["python-package"] = [] + self.m_session = mock_session.MockSession(conn=None, test_case=self) + self.add_packages(d) + meta = _model_meta.ModelMetadata( + name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["python-package"] + ) + c_session = cast(session.Session, self.m_session) + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): + with self.assertRaises(RuntimeError): + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) + + def test_get_model_final_packages_failed_snowml(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata(name="model1", model_type="custom", signatures=_DUMMY_SIG) + c_session = cast(session.Session, self.m_session) + with self.assertRaises(RuntimeError): + original_version = snowml_env.VERSION + snowml_env.VERSION = "0.0.0" + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) + snowml_env.VERSION = original_version + + def test_get_model_final_packages_relax_failed_snowml(self) -> None: + env_utils._SNOWFLAKE_CONDA_PACKAGE_CACHE = {} + meta = _model_meta.ModelMetadata( + name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas==1.0.*"] + ) + c_session = cast(session.Session, self.m_session) + with self.assertWarnsRegex( + RuntimeWarning, + "Cannot find conda resolver", + ): + original_version = snowml_env.VERSION + snowml_env.VERSION = "0.0.0" + final_packages = _udf_util._get_model_final_packages( + meta, + c_session, + relax_version=True, + _use_local_snowml=True, + ) + self.assertListEqual( + final_packages, sorted(list(map(lambda x: x.name, _BASIC_DEPENDENCIES_FINAL_PACKAGES_NO_SNOWML))) + ) + snowml_env.VERSION = original_version + + +class TestFinalPackagesWithCondaWithoutSnowML(absltest.TestCase): + def setUp(self) -> None: self.m_session = mock_session.MockSession(conn=None, test_case=self) @@ -149,7 +371,9 @@ def tearDown(self) -> None: def test_get_model_final_packages(self) -> None: meta = _model_meta.ModelMetadata(name="model1", model_type="custom", signatures=_DUMMY_SIG) c_session = cast(session.Session, self.m_session) - final_packages = _udf_util._get_model_final_packages(meta, c_session, relax_version=True) + final_packages = _udf_util._get_model_final_packages( + meta, c_session, relax_version=True, _use_local_snowml=True + ) self.assertIsNotNone(final_packages) def test_get_model_final_packages_no_relax(self) -> None: @@ -158,14 +382,16 @@ def test_get_model_final_packages_no_relax(self) -> None: ) c_session = cast(session.Session, self.m_session) with self.assertRaises(RuntimeError): - _udf_util._get_model_final_packages(meta, c_session) + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) def test_get_model_final_packages_relax(self) -> None: meta = _model_meta.ModelMetadata( name="model1", model_type="custom", signatures=_DUMMY_SIG, conda_dependencies=["pandas<1"] ) c_session = cast(session.Session, self.m_session) - final_packages = _udf_util._get_model_final_packages(meta, c_session, relax_version=True) + final_packages = _udf_util._get_model_final_packages( + meta, c_session, relax_version=True, _use_local_snowml=True + ) self.assertIsNotNone(final_packages) def test_get_model_final_packages_with_pip(self)
-> None: @@ -174,7 +400,7 @@ def test_get_model_final_packages_with_pip(self) -> None: ) c_session = cast(session.Session, self.m_session) with self.assertRaises(RuntimeError): - _udf_util._get_model_final_packages(meta, c_session) + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) def test_get_model_final_packages_with_other_channel(self) -> None: meta = _model_meta.ModelMetadata( @@ -185,7 +411,7 @@ def test_get_model_final_packages_with_other_channel(self) -> None: ) c_session = cast(session.Session, self.m_session) with self.assertRaises(RuntimeError): - _udf_util._get_model_final_packages(meta, c_session) + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) def test_get_model_final_packages_with_non_exist_package(self) -> None: meta = _model_meta.ModelMetadata( @@ -193,7 +419,7 @@ def test_get_model_final_packages_with_non_exist_package(self) -> None: ) c_session = cast(session.Session, self.m_session) with self.assertRaises(RuntimeError): - _udf_util._get_model_final_packages(meta, c_session) + _udf_util._get_model_final_packages(meta, c_session, _use_local_snowml=True) if __name__ == "__main__": diff --git a/snowflake/ml/model/type_hints.py b/snowflake/ml/model/type_hints.py index 4acbd920..6608967d 100644 --- a/snowflake/ml/model/type_hints.py +++ b/snowflake/ml/model/type_hints.py @@ -4,7 +4,7 @@ import numpy.typing as npt from typing_extensions import NotRequired, TypeAlias -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base if TYPE_CHECKING: import numpy as np @@ -86,11 +86,13 @@ class DeployOptions(TypedDict): rows. Defaults to True. Internal-only options - _snowml_wheel_path: Local or in-stage path to snowml wheel file. If deployed permanently, it needs to be a stage - path where the stage is non-temporary, internal stage. + _use_local_snowml: Use local SnowML as the execution library of the deployment. If set to True, local SnowML + would be packed and uploaded to 1) session stage, if it is a temporary deployment, or 2) the provided stage path + if it is a permanent deployment. It should be set to True before SnowML is available in the Snowflake Anaconda Channel. + Defaults to False.
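When _use_local_snowml is set, _get_model_final_packages (above) first checks that the locally installed snowflake-ml-python satisfies the model's pinned requirement before dropping it from conda resolution. Below is a small sketch of that kind of specifier check with packaging; the requirement and version strings are made-up examples.

```
# Illustration only: the requirement and version strings below are invented.
from packaging import requirements

model_req = requirements.Requirement("snowflake-ml-python==1.0.*")
local_version = "1.0.2"

if not model_req.specifier.contains(local_version):
    raise RuntimeError(
        f"Incompatible snowflake-ml-python version. Require {model_req.specifier}, got {local_version}."
    )
```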
""" - _snowml_wheel_path: NotRequired[str] + _use_local_snowml: NotRequired[bool] output_with_input_features: NotRequired[bool] keep_order: NotRequired[bool] diff --git a/snowflake/ml/sklearn/calibration/BUILD.bazel b/snowflake/ml/modeling/calibration/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/calibration/BUILD.bazel rename to snowflake/ml/modeling/calibration/BUILD.bazel diff --git a/snowflake/ml/sklearn/calibration/estimators_info.bzl b/snowflake/ml/modeling/calibration/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/calibration/estimators_info.bzl rename to snowflake/ml/modeling/calibration/estimators_info.bzl diff --git a/snowflake/ml/sklearn/cluster/BUILD.bazel b/snowflake/ml/modeling/cluster/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/cluster/BUILD.bazel rename to snowflake/ml/modeling/cluster/BUILD.bazel diff --git a/snowflake/ml/sklearn/cluster/estimators_info.bzl b/snowflake/ml/modeling/cluster/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/cluster/estimators_info.bzl rename to snowflake/ml/modeling/cluster/estimators_info.bzl diff --git a/snowflake/ml/sklearn/compose/BUILD.bazel b/snowflake/ml/modeling/compose/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/compose/BUILD.bazel rename to snowflake/ml/modeling/compose/BUILD.bazel diff --git a/snowflake/ml/sklearn/compose/estimators_info.bzl b/snowflake/ml/modeling/compose/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/compose/estimators_info.bzl rename to snowflake/ml/modeling/compose/estimators_info.bzl diff --git a/snowflake/ml/sklearn/covariance/BUILD.bazel b/snowflake/ml/modeling/covariance/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/covariance/BUILD.bazel rename to snowflake/ml/modeling/covariance/BUILD.bazel diff --git a/snowflake/ml/sklearn/covariance/estimators_info.bzl b/snowflake/ml/modeling/covariance/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/covariance/estimators_info.bzl rename to snowflake/ml/modeling/covariance/estimators_info.bzl diff --git a/snowflake/ml/sklearn/decomposition/BUILD.bazel b/snowflake/ml/modeling/decomposition/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/decomposition/BUILD.bazel rename to snowflake/ml/modeling/decomposition/BUILD.bazel diff --git a/snowflake/ml/sklearn/decomposition/estimators_info.bzl b/snowflake/ml/modeling/decomposition/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/decomposition/estimators_info.bzl rename to snowflake/ml/modeling/decomposition/estimators_info.bzl diff --git a/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel b/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel rename to snowflake/ml/modeling/discriminant_analysis/BUILD.bazel diff --git a/snowflake/ml/sklearn/discriminant_analysis/estimators_info.bzl b/snowflake/ml/modeling/discriminant_analysis/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/discriminant_analysis/estimators_info.bzl rename to snowflake/ml/modeling/discriminant_analysis/estimators_info.bzl diff --git a/snowflake/ml/sklearn/ensemble/BUILD.bazel b/snowflake/ml/modeling/ensemble/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/ensemble/BUILD.bazel rename to snowflake/ml/modeling/ensemble/BUILD.bazel diff --git 
a/snowflake/ml/sklearn/ensemble/estimators_info.bzl b/snowflake/ml/modeling/ensemble/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/ensemble/estimators_info.bzl rename to snowflake/ml/modeling/ensemble/estimators_info.bzl diff --git a/snowflake/ml/sklearn/feature_selection/BUILD.bazel b/snowflake/ml/modeling/feature_selection/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/feature_selection/BUILD.bazel rename to snowflake/ml/modeling/feature_selection/BUILD.bazel diff --git a/snowflake/ml/sklearn/feature_selection/estimators_info.bzl b/snowflake/ml/modeling/feature_selection/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/feature_selection/estimators_info.bzl rename to snowflake/ml/modeling/feature_selection/estimators_info.bzl diff --git a/snowflake/ml/sklearn/framework/BUILD.bazel b/snowflake/ml/modeling/framework/BUILD.bazel similarity index 93% rename from snowflake/ml/sklearn/framework/BUILD.bazel rename to snowflake/ml/modeling/framework/BUILD.bazel index 6fe6fe45..8fcbd293 100644 --- a/snowflake/ml/sklearn/framework/BUILD.bazel +++ b/snowflake/ml/modeling/framework/BUILD.bazel @@ -6,7 +6,6 @@ py_library( name = "framework", srcs = [ "base.py", - "pipeline.py", "_utils.py" ], deps = [ diff --git a/snowflake/ml/sklearn/framework/_utils.py b/snowflake/ml/modeling/framework/_utils.py similarity index 94% rename from snowflake/ml/sklearn/framework/_utils.py rename to snowflake/ml/modeling/framework/_utils.py index 14340434..056dd369 100644 --- a/snowflake/ml/sklearn/framework/_utils.py +++ b/snowflake/ml/modeling/framework/_utils.py @@ -195,3 +195,25 @@ def get_filtered_valid_sklearn_args( raise ImportError(msg) return sklearn_args + + +def str_to_bool(value: str) -> Union[bool, None]: + if value is None: + return None + elif value.lower() == "true": + return True + elif value.lower() == "false": + return False + return None + + +def _handle_str_bool_type() -> np.vectorize: + return np.vectorize(str_to_bool) + + +def to_native_format(obj: Any) -> Any: + if "XGB" in obj.__class__.__name__: + return obj.to_xgboost() + elif "LGBM" in obj.__class__.__name__: + return obj.to_lightgbm() + return obj.to_sklearn() diff --git a/snowflake/ml/sklearn/framework/base.py b/snowflake/ml/modeling/framework/base.py similarity index 96% rename from snowflake/ml/sklearn/framework/base.py rename to snowflake/ml/modeling/framework/base.py index f1541a50..130190c2 100644 --- a/snowflake/ml/sklearn/framework/base.py +++ b/snowflake/ml/modeling/framework/base.py @@ -15,7 +15,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry from snowflake.ml._internal.utils import parallelize -from snowflake.ml.sklearn.framework import _utils +from snowflake.ml.modeling.framework import _utils from snowflake.snowpark import functions as F from snowflake.snowpark._internal import type_utils @@ -342,6 +342,15 @@ def set_sample_weight_col(self, sample_weight_col: Optional[str]) -> "Base": self.sample_weight_col = sample_weight_col return self + def _get_dependencies(self) -> List[str]: + """ + Return the list of conda dependencies required to work with the object. + + Returns: + List of dependencies. 
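To make the renamed accessors concrete: to_native_format (added above) dispatches to to_xgboost, to_lightgbm, or to_sklearn based on the wrapper's class name, and the accessor methods themselves are added to base.py just below. A hedged usage sketch follows; the estimator choice, column names, and the snowpark DataFrame argument are assumptions for illustration, not code from this PR.

```
# Hypothetical usage; the estimator and column names are illustrative assumptions.
from typing import Any

from snowflake import snowpark
from snowflake.ml.modeling.framework import _utils
from snowflake.ml.modeling.linear_model import LinearRegression


def fit_and_export(training_df: snowpark.DataFrame) -> Any:
    reg = LinearRegression(
        input_cols=["SEPAL_LENGTH", "SEPAL_WIDTH"],
        label_cols=["TARGET"],
        output_cols=["PREDICTION"],
    )
    reg.fit(training_df)
    # No "XGB"/"LGBM" in the class name, so this resolves to reg.to_sklearn().
    return _utils.to_native_format(reg)
```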
+ """ + return [] + @abstractmethod def fit(self, dataset: snowpark.DataFrame) -> "BaseEstimator": raise NotImplementedError() @@ -449,11 +458,19 @@ def enforce_fit(self) -> None: def set_drop_input_cols(self, drop_input_cols: Optional[bool] = False) -> None: self._drop_input_cols = drop_input_cols - def get_sklearn_object(self) -> Any: + def to_sklearn(self) -> Any: if self._sklearn_object is None: self._sklearn_object = self._create_sklearn_object() return self._sklearn_object + # to_xgboost would be only used in XGB estimators + # to_lightgbm would be only used in LightGBM estimators, but they function the same + def to_xgboost(self) -> Any: + raise AttributeError("Object doesn't support to_xgboost. Please use to_sklearn()") + + def to_lightgbm(self) -> Any: + raise AttributeError("Object doesn't support to_lightgbm. Please use to_sklearn()") + def _reset(self) -> None: self._sklearn_object = None self._is_fitted = False @@ -508,7 +525,7 @@ def _transform_sklearn(self, dataset: pd.DataFrame) -> pd.DataFrame: """ self.enforce_fit() dataset = dataset.copy() - sklearn_transform = self.get_sklearn_object() + sklearn_transform = self.to_sklearn() transformed_data = sklearn_transform.transform(dataset[self.input_cols]) shape = transformed_data.shape if (len(shape) == 1 and len(self.output_cols) != 1) or (len(shape) > 1 and shape[1] != len(self.output_cols)): diff --git a/snowflake/ml/sklearn/gaussian_process/BUILD.bazel b/snowflake/ml/modeling/gaussian_process/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/gaussian_process/BUILD.bazel rename to snowflake/ml/modeling/gaussian_process/BUILD.bazel diff --git a/snowflake/ml/sklearn/gaussian_process/estimators_info.bzl b/snowflake/ml/modeling/gaussian_process/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/gaussian_process/estimators_info.bzl rename to snowflake/ml/modeling/gaussian_process/estimators_info.bzl diff --git a/snowflake/ml/sklearn/impute/BUILD.bazel b/snowflake/ml/modeling/impute/BUILD.bazel similarity index 74% rename from snowflake/ml/sklearn/impute/BUILD.bazel rename to snowflake/ml/modeling/impute/BUILD.bazel index e91a8253..7e825459 100644 --- a/snowflake/ml/sklearn/impute/BUILD.bazel +++ b/snowflake/ml/modeling/impute/BUILD.bazel @@ -1,6 +1,7 @@ load("//codegen:codegen_rules.bzl", "autogen_estimators", "autogen_init_file_for_module") load(":estimators_info.bzl", "estimator_info_list") +load(":BUILD_NATIVE.bzl", "get_build_rules_for_native_impl") package(default_visibility = ["//visibility:public"]) -autogen_init_file_for_module(module="sklearn.impute") autogen_estimators(module="sklearn.impute", estimator_info_list=estimator_info_list) +get_build_rules_for_native_impl() diff --git a/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl b/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl new file mode 100644 index 00000000..e9caa22f --- /dev/null +++ b/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl @@ -0,0 +1,33 @@ +load("//bazel:py_rules.bzl", "py_library") +load("@rules_python//python:packaging.bzl", "py_package") + +def get_build_rules_for_native_impl(): + py_library( + name = "init", + srcs = [ + "__init__.py", + ], + deps = [ + "//snowflake/ml/_internal:init_utils", + ], + ) + + py_library( + name = "simple_imputer", + srcs = [ + "simple_imputer.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_package( + name = "impute_pkg", + packages = ["snowflake.ml"], + deps = [ + ":simple_imputer", + ], + ) diff --git 
a/snowflake/ml/sklearn/preprocessing/__init__.py b/snowflake/ml/modeling/impute/__init__.py similarity index 100% rename from snowflake/ml/sklearn/preprocessing/__init__.py rename to snowflake/ml/modeling/impute/__init__.py diff --git a/snowflake/ml/sklearn/impute/estimators_info.bzl b/snowflake/ml/modeling/impute/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/impute/estimators_info.bzl rename to snowflake/ml/modeling/impute/estimators_info.bzl diff --git a/snowflake/ml/sklearn/preprocessing/simple_imputer.py b/snowflake/ml/modeling/impute/simple_imputer.py similarity index 97% rename from snowflake/ml/sklearn/preprocessing/simple_imputer.py rename to snowflake/ml/modeling/impute/simple_imputer.py index 1c3bd401..1c579df1 100644 --- a/snowflake/ml/sklearn/preprocessing/simple_imputer.py +++ b/snowflake/ml/modeling/impute/simple_imputer.py @@ -11,9 +11,9 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import _utils, base +from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F, types as T -from snowflake.snowpark._internal import utils as snowpark_internal_utils +from snowflake.snowpark._internal import utils as snowpark_utils STRATEGY_TO_STATE_DICT = { "constant": None, @@ -323,6 +323,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Returns: Output dataset. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] output_columns = [F.col(input_col) for input_col in self.input_cols] transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) @@ -334,7 +335,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame temp_input_cols = [] for input_col, output_col in zip(self.input_cols, self.output_cols): if input_col != output_col: - temp_input_col = f"{input_col}_{snowpark_internal_utils.generate_random_alphanumeric()}" + temp_input_col = f"{input_col}_{snowpark_utils.generate_random_alphanumeric()}" transformed_dataset = transformed_dataset.with_column_renamed(input_col, temp_input_col) temp_input_cols.append(temp_input_col) @@ -372,6 +373,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame temp_input_col = temp_input_cols.pop(0) transformed_dataset = transformed_dataset.with_column_renamed(temp_input_col, input_col) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
+ transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_sklearn_object(self) -> impute.SimpleImputer: diff --git a/snowflake/ml/sklearn/isotonic/BUILD.bazel b/snowflake/ml/modeling/isotonic/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/isotonic/BUILD.bazel rename to snowflake/ml/modeling/isotonic/BUILD.bazel diff --git a/snowflake/ml/sklearn/isotonic/estimators_info.bzl b/snowflake/ml/modeling/isotonic/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/isotonic/estimators_info.bzl rename to snowflake/ml/modeling/isotonic/estimators_info.bzl diff --git a/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel b/snowflake/ml/modeling/kernel_approximation/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/kernel_approximation/BUILD.bazel rename to snowflake/ml/modeling/kernel_approximation/BUILD.bazel diff --git a/snowflake/ml/sklearn/kernel_approximation/estimators_info.bzl b/snowflake/ml/modeling/kernel_approximation/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/kernel_approximation/estimators_info.bzl rename to snowflake/ml/modeling/kernel_approximation/estimators_info.bzl diff --git a/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel b/snowflake/ml/modeling/kernel_ridge/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/kernel_ridge/BUILD.bazel rename to snowflake/ml/modeling/kernel_ridge/BUILD.bazel diff --git a/snowflake/ml/sklearn/kernel_ridge/estimators_info.bzl b/snowflake/ml/modeling/kernel_ridge/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/kernel_ridge/estimators_info.bzl rename to snowflake/ml/modeling/kernel_ridge/estimators_info.bzl diff --git a/snowflake/ml/lightgbm/BUILD.bazel b/snowflake/ml/modeling/lightgbm/BUILD.bazel similarity index 100% rename from snowflake/ml/lightgbm/BUILD.bazel rename to snowflake/ml/modeling/lightgbm/BUILD.bazel diff --git a/snowflake/ml/lightgbm/estimators_info.bzl b/snowflake/ml/modeling/lightgbm/estimators_info.bzl similarity index 100% rename from snowflake/ml/lightgbm/estimators_info.bzl rename to snowflake/ml/modeling/lightgbm/estimators_info.bzl diff --git a/snowflake/ml/sklearn/linear_model/BUILD.bazel b/snowflake/ml/modeling/linear_model/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/linear_model/BUILD.bazel rename to snowflake/ml/modeling/linear_model/BUILD.bazel diff --git a/snowflake/ml/sklearn/linear_model/estimators_info.bzl b/snowflake/ml/modeling/linear_model/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/linear_model/estimators_info.bzl rename to snowflake/ml/modeling/linear_model/estimators_info.bzl diff --git a/snowflake/ml/sklearn/manifold/BUILD.bazel b/snowflake/ml/modeling/manifold/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/manifold/BUILD.bazel rename to snowflake/ml/modeling/manifold/BUILD.bazel diff --git a/snowflake/ml/sklearn/manifold/estimators_info.bzl b/snowflake/ml/modeling/manifold/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/manifold/estimators_info.bzl rename to snowflake/ml/modeling/manifold/estimators_info.bzl diff --git a/snowflake/ml/metrics/BUILD.bazel b/snowflake/ml/modeling/metrics/BUILD.bazel similarity index 77% rename from snowflake/ml/metrics/BUILD.bazel rename to snowflake/ml/modeling/metrics/BUILD.bazel index 58f9d1d4..d5bd8657 100644 --- a/snowflake/ml/metrics/BUILD.bazel +++ 
b/snowflake/ml/modeling/metrics/BUILD.bazel @@ -7,13 +7,16 @@ py_library( name = "metrics", srcs = [ "accuracy_score.py", + "confusion_matrix.py", "correlation.py", "covariance.py", + "precision_recall_fscore_support.py", + "precision_score.py", "regression.py", ], deps = [ ":init", - ":utils", + ":metrics_utils", "//snowflake/ml/_internal:telemetry", ], ) @@ -32,8 +35,8 @@ py_library( ) py_library( - name = "utils", - srcs = ["_utils.py"], + name = "metrics_utils", + srcs = ["metrics_utils.py"], deps = [ "//snowflake/ml/_internal:telemetry", ], diff --git a/snowflake/ml/modeling/metrics/__init__.py b/snowflake/ml/modeling/metrics/__init__.py new file mode 100644 index 00000000..ed8be701 --- /dev/null +++ b/snowflake/ml/modeling/metrics/__init__.py @@ -0,0 +1,15 @@ +from .accuracy_score import accuracy_score +from .confusion_matrix import confusion_matrix +from .correlation import correlation +from .covariance import covariance +from .precision_recall_fscore_support import precision_recall_fscore_support +from .precision_score import precision_score + +__all__ = [ + "accuracy_score", + "confusion_matrix", + "correlation", + "covariance", + "precision_recall_fscore_support", + "precision_score", +] diff --git a/snowflake/ml/metrics/accuracy_score.py b/snowflake/ml/modeling/metrics/accuracy_score.py similarity index 55% rename from snowflake/ml/metrics/accuracy_score.py rename to snowflake/ml/modeling/metrics/accuracy_score.py index b98bce27..e0ea5e70 100644 --- a/snowflake/ml/metrics/accuracy_score.py +++ b/snowflake/ml/modeling/metrics/accuracy_score.py @@ -1,8 +1,8 @@ -from typing import Optional +from typing import List, Optional, Union from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.metrics import _utils +from snowflake.ml.modeling.metrics import metrics_utils from snowflake.snowpark import functions as F _PROJECT = "ModelDevelopment" @@ -13,20 +13,22 @@ def accuracy_score( *, df: snowpark.DataFrame, - y_true_col_name: str, - y_pred_col_name: str, + y_true_col_names: Union[str, List[str]], + y_pred_col_names: Union[str, List[str]], normalize: bool = True, sample_weight_col_name: Optional[str] = None, ) -> float: """ Accuracy classification score. - Note: Currently multilabel classification is not supported. + In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in the y true columns. Args: df: Input dataframe. - y_true_col_name: Column name representing actual values. - y_pred_col_name: Column name representing predicted values. + y_true_col_names: Column name(s) representing actual values. + y_pred_col_names: Column name(s) representing predicted values. normalize: If ``False``, return the number of correctly classified samples. Otherwise, return the fraction of correctly classified samples. sample_weight_col_name: Column name representing sample weights. @@ -39,9 +41,15 @@ def accuracy_score( The best performance is 1 with ``normalize == True`` and the number of samples with ``normalize == False``. """ - # TODO: Support multilabel classification. 
- score_column = F.iff(df[y_true_col_name] == df[y_pred_col_name], 1, 0) # type: ignore[arg-type] - return _utils.weighted_sum( + metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names) + + if isinstance(y_true_col_names, str) or (len(y_true_col_names) == 1): + score_column = F.iff(df[y_true_col_names] == df[y_pred_col_names], 1, 0) # type: ignore[arg-type] + # multilabel + else: + expr = " and ".join([f"({y_true_col_names[i]} = {y_pred_col_names[i]})" for i in range(len(y_true_col_names))]) + score_column = F.iff(expr, 1, 0) # type: ignore[arg-type] + return metrics_utils.weighted_sum( df=df, sample_score_column=score_column, sample_weight_column=df[sample_weight_col_name] if sample_weight_col_name else None, diff --git a/snowflake/ml/modeling/metrics/confusion_matrix.py b/snowflake/ml/modeling/metrics/confusion_matrix.py new file mode 100644 index 00000000..1c530fb8 --- /dev/null +++ b/snowflake/ml/modeling/metrics/confusion_matrix.py @@ -0,0 +1,224 @@ +import uuid +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import cloudpickle +import numpy as np +import numpy.typing as npt + +from snowflake import snowpark +from snowflake.ml._internal import telemetry +from snowflake.ml.modeling.metrics import metrics_utils +from snowflake.snowpark import functions as F, types as T +from snowflake.snowpark._internal import utils as snowpark_utils + +_PROJECT = "ModelDevelopment" +_SUBPROJECT = "Metrics" + + +@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT) +def confusion_matrix( + *, + df: snowpark.DataFrame, + y_true_col_name: str, + y_pred_col_name: str, + labels: Optional[npt.ArrayLike] = None, + sample_weight_col_name: Optional[str] = None, + normalize: Optional[str] = None, +) -> Union[npt.NDArray[np.int_], npt.NDArray[np.float_]]: + """ + Compute confusion matrix to evaluate the accuracy of a classification. + + By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` + is equal to the number of observations known to be in group :math:`i` and + predicted to be in group :math:`j`. + + Thus in binary classification, the count of true negatives is + :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is + :math:`C_{1,1}` and false positives is :math:`C_{0,1}`. + + Args: + df: Input dataframe. + y_true_col_name: Column name representing actual values. + y_pred_col_name: Column name representing predicted values. + labels: List of labels to index the matrix. This may be used to + reorder or select a subset of labels. + If ``None`` is given, those that appear at least once in the + y true or y pred column are used in sorted order. + sample_weight_col_name: Column name representing sample weights. + normalize: {'true', 'pred', 'all'}, default=None + Normalizes confusion matrix over the true (rows), predicted (columns) + conditions or all the population. If None, confusion matrix will not be + normalized. + + Returns: + Confusion matrix whose i-th row and j-th column entry indicates the number of + samples with true label being i-th class and predicted label being j-th class. + + Raises: + ValueError: The given ``labels`` is empty. + ValueError: No label specified in the given ``labels`` is in the y true column. + ValueError: ``normalize`` is not one of {'true', 'pred', 'all', None}. + """ + assert df._session is not None + session = df._session + + # Get a label df with columns: [LABEL, INDEX]. 
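+    # (Illustrative example: for labels {"cat", "dog", "fish"} appearing in the y true / y pred columns, the
+    # label dataframe built below would hold LABEL = ["cat", "dog", "fish"] with INDEX = [0, 1, 2]; labels are
+    # dense-ranked in sorted order so the index can be used directly as a row/column position in the matrix.)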
+ if labels is None: + label_df = metrics_utils.unique_labels(df=df, columns=[df[y_true_col_name], df[y_pred_col_name]]) + else: + _labels = np.array(labels) + label_data = np.vstack((_labels, np.arange(_labels.size))).T.tolist() + label_df = session.create_dataframe(label_data, schema=[metrics_utils.LABEL, metrics_utils.INDEX]) + + n_labels = label_df.count() + if labels is not None: + if n_labels == 0: + raise ValueError("'labels' should contains at least one label.") + elif df[[y_true_col_name]].filter(~F.is_null(df[y_true_col_name])).count() == 0: + return np.zeros((n_labels, n_labels), dtype=int) + elif df[[y_true_col_name]].join(label_df, df[y_true_col_name] == label_df[metrics_utils.LABEL]).count() == 0: + raise ValueError("At least one label specified must be in the y true column") + + rand = snowpark_utils.generate_random_alphanumeric() + if sample_weight_col_name is None: + sample_weight_col_name = f'"_SAMPLE_WEIGHT_{rand}"' + df = df.with_column(sample_weight_col_name, F.lit(1)) # type: ignore[arg-type] + + if normalize not in ["true", "pred", "all", None]: + raise ValueError("normalize must be one of {'true', 'pred', 'all', None}") + + # Get indices of true and pred data. + label_col = f'"_LABEL_{rand}"' + y_true_index_col = f'"_Y_TRUE_INDEX_{rand}"' + y_pred_index_col = f'"_Y_PRED_INDEX_{rand}"' + label_df = label_df.with_column_renamed(metrics_utils.LABEL, label_col) + ind_df = ( + df.join( + label_df.with_column_renamed(metrics_utils.INDEX, y_true_index_col), + df[y_true_col_name] == label_df[label_col], + ) + .drop(label_col) + .join( + label_df.with_column_renamed(metrics_utils.INDEX, y_pred_index_col), + df[y_pred_col_name] == label_df[label_col], + ) + .drop(label_col) + ) + + # Register UDTFs. + statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + confusion_matrix_computer = _register_confusion_matrix_computer(session=session, statement_params=statement_params) + confusion_matrix_computer_udtf = F.table_function(confusion_matrix_computer) + accumulator = metrics_utils.register_accumulator_udtf(session=session, statement_params=statement_params) + accumulator_udtf = F.table_function(accumulator) + + # Compute the confusion matrix. + temp_df1 = ind_df.select( + F.array_construct(sample_weight_col_name, y_true_index_col, y_pred_index_col).alias( # type: ignore[arg-type] + "ARR_COL" + ) + ) + temp_df2 = temp_df1.select( + confusion_matrix_computer_udtf(F.col("ARR_COL"), F.lit(n_labels)) # type: ignore[arg-type] + ).with_column_renamed("RESULT", "RES") + res_df = temp_df2.select(accumulator_udtf(F.col("RES")).over(partition_by="PART"), F.col("PART")) + results = res_df.collect(statement_params=statement_params) + + cm = np.zeros((n_labels, n_labels)) + for i in range(len(results)): + row = int(results[i][1].strip("row_")) + cm[row, :] = cloudpickle.loads(results[i][0]) + + with np.errstate(all="ignore"): + if normalize == "true": + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == "pred": + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == "all": + cm = cm / cm.sum() + cm = np.nan_to_num(cm) + + return cm + + +def _register_confusion_matrix_computer(*, session: snowpark.Session, statement_params: Dict[str, Any]) -> str: + """Registers confusion matrix computation UDTF in Snowflake and returns the name of the UDTF. + + Args: + session: Snowpark session. + statement_params: Dictionary used for tagging queries for tracking purposes. + + Returns: + Name of the UDTF. 
+ """ + + class ConfusionMatrixComputer: + BATCH_SIZE = 1000 + + def __init__(self) -> None: + self._initialized = False + self._confusion_matrix = np.zeros((1, 1)) + # 2d array containing a batch of input rows. A batch contains self.BATCH_SIZE rows. + # [sample_weight, y_true, y_pred] + self._batched_rows = np.zeros((self.BATCH_SIZE, 1)) + # Number of columns in the dataset. + self._n_cols = -1 + # Running count of number of rows added to self._batched_rows. + self._cur_count = 0 + # Number of labels. + self._n_label = 0 + + def process(self, input_row: List[float], n_label: int) -> None: + """Computes confusion matrix. + + Args: + input_row: List of floats: [sample_weight, y_true, y_pred]. + n_label: Number of labels. + """ + # 1. Initialize variables. + if not self._initialized: + self._n_cols = len(input_row) + self._batched_rows = np.zeros((self.BATCH_SIZE, self._n_cols)) + self._n_label = n_label + self._confusion_matrix = np.zeros((self._n_label, self._n_label)) + self._initialized = True + + self._batched_rows[self._cur_count, :] = input_row + self._cur_count += 1 + + # 2. Update the confusion matrix with the batched rows once the batch is full. + if self._cur_count >= self.BATCH_SIZE: + self.update_confusion_matrix() + self._cur_count = 0 + + def end_partition(self) -> Iterable[Tuple[bytes, str]]: + # 3. Add any remaining rows in the batch to the confusion matrix. + if self._cur_count > 0: + self.update_confusion_matrix() + for i in range(self._n_label): + yield cloudpickle.dumps(self._confusion_matrix[i, :]), "row_" + str(i) + + def update_confusion_matrix(self) -> None: + np.add.at( + self._confusion_matrix, + (self._batched_rows[:, 1].astype(int), self._batched_rows[:, 2].astype(int)), + self._batched_rows[:, 0], + ) + + confusion_matrix_computer = "ConfusionMatrixComputer_{}".format(str(uuid.uuid4()).replace("-", "_").upper()) + session.udtf.register( + ConfusionMatrixComputer, + output_schema=T.StructType( + [ + T.StructField("result", T.BinaryType()), + T.StructField("part", T.StringType()), + ] + ), + input_types=[T.ArrayType(), T.IntegerType()], + packages=["numpy", "cloudpickle"], + name=confusion_matrix_computer, + is_permanent=False, + replace=True, + statement_params=statement_params, + ) + return confusion_matrix_computer diff --git a/snowflake/ml/metrics/correlation.py b/snowflake/ml/modeling/metrics/correlation.py similarity index 92% rename from snowflake/ml/metrics/correlation.py rename to snowflake/ml/modeling/metrics/correlation.py index 413616d3..808501f3 100644 --- a/snowflake/ml/metrics/correlation.py +++ b/snowflake/ml/modeling/metrics/correlation.py @@ -9,7 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.metrics import _utils +from snowflake.ml.modeling.metrics import metrics_utils from snowflake.snowpark import functions as F _PROJECT = "ModelDevelopment" @@ -50,15 +50,15 @@ def correlation(*, df: snowpark.DataFrame, columns: Optional[Collection[str]] = session = df._session statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) - input_df, columns = _utils.validate_and_return_dataframe_and_columns(df=df, columns=columns) + input_df, columns = metrics_utils.validate_and_return_dataframe_and_columns(df=df, columns=columns) count = input_df.count(statement_params=statement_params) # Register UDTFs.
- sharded_dot_and_sum_computer = _utils.register_sharded_dot_sum_computer( + sharded_dot_and_sum_computer = metrics_utils.register_sharded_dot_sum_computer( session=session, statement_params=statement_params ) sharded_dot_and_sum_computer_udtf = F.table_function(sharded_dot_and_sum_computer) - accumulator = _utils.register_accumulator_udtf(session=session, statement_params=statement_params) + accumulator = metrics_utils.register_accumulator_udtf(session=session, statement_params=statement_params) accumulator_udtf = F.table_function(accumulator) # Compute the confusion matrix. diff --git a/snowflake/ml/metrics/covariance.py b/snowflake/ml/modeling/metrics/covariance.py similarity index 92% rename from snowflake/ml/metrics/covariance.py rename to snowflake/ml/modeling/metrics/covariance.py index 8d5e11bb..2569f8e4 100644 --- a/snowflake/ml/metrics/covariance.py +++ b/snowflake/ml/modeling/metrics/covariance.py @@ -8,7 +8,7 @@ import pandas as pd from snowflake.ml._internal import telemetry -from snowflake.ml.metrics import _utils +from snowflake.ml.modeling.metrics import metrics_utils from snowflake.snowpark import DataFrame, functions as F _PROJECT = "ModelDevelopment" @@ -52,15 +52,15 @@ def covariance(*, df: DataFrame, columns: Optional[Collection[str]] = None, ddof session = df._session statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) - input_df, columns = _utils.validate_and_return_dataframe_and_columns(df=df, columns=columns) + input_df, columns = metrics_utils.validate_and_return_dataframe_and_columns(df=df, columns=columns) count = input_df.count(statement_params=statement_params) # Register UDTFs. - sharded_dot_and_sum_computer = _utils.register_sharded_dot_sum_computer( + sharded_dot_and_sum_computer = metrics_utils.register_sharded_dot_sum_computer( session=session, statement_params=statement_params ) sharded_dot_and_sum_computer_udtf = F.table_function(sharded_dot_and_sum_computer) - accumulator = _utils.register_accumulator_udtf(session=session, statement_params=statement_params) + accumulator = metrics_utils.register_accumulator_udtf(session=session, statement_params=statement_params) accumulator_udtf = F.table_function(accumulator) # Compute the confusion matrix. diff --git a/snowflake/ml/metrics/_utils.py b/snowflake/ml/modeling/metrics/metrics_utils.py similarity index 76% rename from snowflake/ml/metrics/_utils.py rename to snowflake/ml/modeling/metrics/metrics_utils.py index e99d8942..db07e737 100644 --- a/snowflake/ml/metrics/_utils.py +++ b/snowflake/ml/modeling/metrics/metrics_utils.py @@ -2,7 +2,7 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # import math -from typing import Collection, Dict, Iterable, List, Optional, Tuple +from typing import Any, Collection, Dict, Iterable, List, Optional, Tuple, Union from uuid import uuid4 import cloudpickle @@ -11,19 +11,22 @@ from snowflake import snowpark from snowflake.snowpark import Session, functions as F, types as T +LABEL = "LABEL" +INDEX = "INDEX" -def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, str]) -> str: + +def register_accumulator_udtf(*, session: Session, statement_params: Dict[str, Any]) -> str: """Registers accumulator UDTF in Snowflake and returns the name of the UDTF. Args: - session (Session): Snowpark session. - statement_params (Dict[str, str]): Dictionary used for tagging queries for tracking purposes. + session: Snowpark session. + statement_params: Dictionary used for tagging queries for tracking purposes. 
Returns: Name of the UDTF. """ - class DotAndSumAccumulator: + class Accumulator: """This class is registered as a UDTF. It accumulates all the rows passed to the UDTF.""" def __init__(self) -> None: @@ -44,9 +47,9 @@ def process(self, input_row: bytes) -> None: def end_partition(self) -> Iterable[Tuple[bytes]]: yield (cloudpickle.dumps(self._accumulated_row),) - dot_and_sum_accumulator = "DotAndSumAccumulator_{}".format(str(uuid4()).replace("-", "_").upper()) + accumulator = "Accumulator_{}".format(str(uuid4()).replace("-", "_").upper()) session.udtf.register( - DotAndSumAccumulator, + Accumulator, output_schema=T.StructType( [ T.StructField("result", T.BinaryType()), @@ -54,20 +57,20 @@ def end_partition(self) -> Iterable[Tuple[bytes]]: ), input_types=[T.BinaryType()], packages=["numpy", "cloudpickle"], - name=dot_and_sum_accumulator, + name=accumulator, is_permanent=False, replace=True, statement_params=statement_params, ) - return dot_and_sum_accumulator + return accumulator -def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dict[str, str]) -> str: +def register_sharded_dot_sum_computer(*, session: Session, statement_params: Dict[str, Any]) -> str: """Registers dot and sum computation UDTF in Snowflake and returns the name of the UDTF. Args: - session (Session): Snowpark session. - statement_params (Dict[str, str]): Dictionary used for tagging queries for tracking purposes. + session: Snowpark session. + statement_params: Dictionary used for tagging queries for tracking purposes. Returns: Name of the UDTF. @@ -181,8 +184,8 @@ def validate_and_return_dataframe_and_columns( """Validates that the columns are all numeric and returns a dataframe with those columns. Args: - df (snowpark.DataFrame): Input snowpark dataframe. - columns (Optional[Collection[str]]): Columns that need to be validated. + df: Input snowpark dataframe. + columns: Columns that need to be validated. Returns: Tuple with snowpark dataframe and list of columns. @@ -203,6 +206,33 @@ def validate_and_return_dataframe_and_columns( return (input_df, columns) +def check_label_columns( + y_true_col_names: Union[str, List[str]], + y_pred_col_names: Union[str, List[str]], +) -> None: + """Check y true and y pred columns. + + Args: + y_true_col_names: Column name(s) representing actual values. + y_pred_col_names: Column name(s) representing predicted values. + + Raises: + TypeError: `y_true_col_names` and `y_pred_col_names` are of different types. + ValueError: Multilabel `y_true_col_names` and `y_pred_col_names` are of different lengths. + """ + if type(y_true_col_names) != type(y_pred_col_names): + raise TypeError( + "Label columns should be of the same type." + f"Got y_true_col_names={type(y_true_col_names)} vs y_pred_col_names={type(y_pred_col_names)}." + ) + if isinstance(y_true_col_names, list) and (len(y_true_col_names) != len(y_pred_col_names)): + raise ValueError( + "Length of multilabel label columns should be of the same between y_true_col_names and y_pred_col_names." + f"Got y_true_col_names={y_true_col_names} (length: {len(y_true_col_names)}) vs " + f"y_pred_col_names={y_pred_col_names} (length: {len(y_pred_col_names)})." 
+ ) + + def weighted_sum( *, df: snowpark.DataFrame, @@ -238,3 +268,35 @@ def weighted_sum( res = F.sum(sample_score_column) # type: ignore[arg-type] return float(df.select(res).collect(statement_params=statement_params)[0][0]) + + +def unique_labels( + *, + df: snowpark.DataFrame, + columns: List[snowpark.Column], +) -> snowpark.DataFrame: + """Extract indexed ordered unique labels as a dataframe. + + Args: + df: Input dataframe. + columns: Columns to extract labels from. + + Returns: + Dataframe with ordered unique labels and indices. + Columns: [LABEL, INDEX]. + """ + union_df = None + for col in columns: + temp_df = df.select(col.alias(LABEL)) + if union_df: + # uniqueness guaranteed through `DataFrame.union` + union_df = union_df.union(temp_df) + else: + union_df = temp_df + + # append an index column dense ranking labels + assert union_df is not None + res: snowpark.DataFrame = union_df.with_column( + INDEX, F.dense_rank().over(snowpark.Window.order_by(LABEL)) - 1 # type: ignore[arg-type, operator] + ) + return res diff --git a/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py b/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py new file mode 100644 index 00000000..ecd5504f --- /dev/null +++ b/snowflake/ml/modeling/metrics/precision_recall_fscore_support.py @@ -0,0 +1,162 @@ +import warnings +from typing import List, Optional, Set, Tuple, Union + +import cloudpickle +import numpy.typing as npt +from sklearn import exceptions, metrics + +from snowflake import snowpark +from snowflake.ml._internal import telemetry +from snowflake.ml.modeling.metrics import metrics_utils +from snowflake.snowpark import functions as F +from snowflake.snowpark._internal import utils as snowpark_utils + +_PROJECT = "ModelDevelopment" +_SUBPROJECT = "Metrics" + + +@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT) +def precision_recall_fscore_support( + *, + df: snowpark.DataFrame, + y_true_col_names: Union[str, List[str]], + y_pred_col_names: Union[str, List[str]], + beta: float = 1.0, + labels: Optional[npt.ArrayLike] = None, + pos_label: Union[str, int] = 1, + average: Optional[str] = None, + warn_for: Union[Tuple[str, ...], Set[str]] = ("precision", "recall", "f-score"), + sample_weight_col_name: Optional[str] = None, + zero_division: Union[str, int] = "warn", +) -> Union[Tuple[float, float, float, None], Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]]: + """ + Compute precision, recall, F-measure and support for each class. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label a negative sample as + positive. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The F-beta score can be interpreted as a weighted harmonic mean of + the precision and recall, where an F-beta score reaches its best + value at 1 and worst score at 0. + + The F-beta score weights recall more than precision by a factor of + ``beta``. ``beta == 1.0`` means recall and precision are equally important. + + The support is the number of occurrences of each class in the y true column(s). 
+ + If ``pos_label is None`` and in binary classification, this function + returns the average precision, recall and F-measure if ``average`` + is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. + + Args: + df: Input dataframe. + y_true_col_names: Column name(s) representing actual values. + y_pred_col_names: Column name(s) representing predicted values. + beta: The strength of recall versus precision in the F-score. + labels: The set of labels to include when ``average != 'binary'``, and + their order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. By default, all labels in the y true and + y pred columns are used in sorted order. + pos_label: The class to report if ``average='binary'`` and the data is + binary. If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + average: {'binary', 'micro', 'macro', 'samples', 'weighted'}, default=None + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (y true, y pred) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + warn_for: This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + sample_weight_col_name: Column name representing sample weights. + zero_division: "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division: + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + If set to "warn", this acts as 0, but warnings are also raised. + + Returns: + precision: float (if average is not None) or array of float, shape = [n_unique_labels] + Precision score. + recall: float (if average is not None) or array of float, shape = [n_unique_labels] + Recall score. + fbeta_score: float (if average is not None) or array of float, shape = [n_unique_labels] + F-beta score. + support: None (if average is not None) or array of int, shape = [n_unique_labels] + The number of occurrences of each label in the y true column(s). 
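+    Example:
+        An illustrative call, assuming a Snowpark DataFrame ``df`` with placeholder label columns "Y_TRUE" and "Y_PRED":
+            p, r, f, _ = precision_recall_fscore_support(df=df, y_true_col_names="Y_TRUE", y_pred_col_names="Y_PRED", average="micro")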
+ """ + metrics_utils.check_label_columns(y_true_col_names, y_pred_col_names) + + session = df._session + assert session is not None + query = df.queries["queries"][-1] + sproc_name = f"precision_recall_fscore_support_{snowpark_utils.generate_random_alphanumeric()}" + statement_params = telemetry.get_statement_params(_PROJECT, _SUBPROJECT) + + @F.sproc( # type: ignore[misc] + session=session, + name=sproc_name, + replace=True, + packages=["cloudpickle", "scikit-learn", "snowflake-snowpark-python"], + statement_params=statement_params, + ) + def precision_recall_fscore_support_sproc(session: snowpark.Session) -> bytes: + df = session.sql(query).to_pandas(statement_params=statement_params) + y_true = df[y_true_col_names] + y_pred = df[y_pred_col_names] + sample_weight = df[sample_weight_col_name] if sample_weight_col_name else None + + with warnings.catch_warnings(record=True) as w: + p, r, f, s = metrics.precision_recall_fscore_support( + y_true, + y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=warn_for, + sample_weight=sample_weight, + zero_division=zero_division, + ) + + # handle zero_division warnings + warning = None + if len(w) > 0 and issubclass(w[-1].category, exceptions.UndefinedMetricWarning): + warning = w[-1] + + return cloudpickle.dumps((p, r, f, s, warning)) # type: ignore[no-any-return] + + loaded_data = cloudpickle.loads(session.call(sproc_name)) + res: Union[ + Tuple[float, float, float, None], Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike, npt.ArrayLike] + ] = loaded_data[:4] + warning = loaded_data[-1] + if warning: + warnings.warn(warning.message, category=warning.category) + return res diff --git a/snowflake/ml/modeling/metrics/precision_score.py b/snowflake/ml/modeling/metrics/precision_score.py new file mode 100644 index 00000000..45dec487 --- /dev/null +++ b/snowflake/ml/modeling/metrics/precision_score.py @@ -0,0 +1,92 @@ +from typing import List, Optional, Union + +import numpy.typing as npt + +from snowflake import snowpark +from snowflake.ml._internal import telemetry +from snowflake.ml.modeling import metrics + +_PROJECT = "ModelDevelopment" +_SUBPROJECT = "Metrics" + + +@telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT) +def precision_score( + *, + df: snowpark.DataFrame, + y_true_col_names: Union[str, List[str]], + y_pred_col_names: Union[str, List[str]], + labels: Optional[npt.ArrayLike] = None, + pos_label: Union[str, int] = 1, + average: Optional[str] = "binary", + sample_weight_col_name: Optional[str] = None, + zero_division: Union[str, int] = "warn", +) -> Union[float, npt.ArrayLike]: + """ + Compute the precision. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The best value is 1 and the worst value is 0. + + Args: + df: Input dataframe. + y_true_col_names: Column name(s) representing actual values. + y_pred_col_names: Column name(s) representing predicted values. + labels: The set of labels to include when ``average != 'binary'``, and + their order if ``average is None``. Labels present in the data can be + excluded, for example to calculate a multiclass average ignoring a + majority negative class, while labels not present in the data will + result in 0 components in a macro average. For multilabel targets, + labels are column indices. 
By default, all labels in the y true and + y pred columns are used in sorted order. + pos_label: The class to report if ``average='binary'`` and the data is + binary. If the data are multiclass or multilabel, this will be ignored; + setting ``labels=[pos_label]`` and ``average != 'binary'`` will report + scores for that label only. + average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary' + If ``None``, the scores for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (y true, y pred) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + sample_weight_col_name: Column name representing sample weights. + zero_division: "warn", 0 or 1, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + Returns: + precision: float (if average is not None) or array of float, shape = (n_unique_labels,) + Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. 
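+    Example:
+        An illustrative call, assuming a Snowpark DataFrame ``df`` with placeholder label columns "Y_TRUE" and "Y_PRED":
+            precision_score(df=df, y_true_col_names="Y_TRUE", y_pred_col_names="Y_PRED", average="micro")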
+ """ + p, _, _, _ = metrics.precision_recall_fscore_support( + df=df, + y_true_col_names=y_true_col_names, + y_pred_col_names=y_pred_col_names, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("precision",), + sample_weight_col_name=sample_weight_col_name, + zero_division=zero_division, + ) + return p diff --git a/snowflake/ml/metrics/regression.py b/snowflake/ml/modeling/metrics/regression.py similarity index 100% rename from snowflake/ml/metrics/regression.py rename to snowflake/ml/modeling/metrics/regression.py diff --git a/snowflake/ml/sklearn/mixture/BUILD.bazel b/snowflake/ml/modeling/mixture/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/mixture/BUILD.bazel rename to snowflake/ml/modeling/mixture/BUILD.bazel diff --git a/snowflake/ml/sklearn/mixture/estimators_info.bzl b/snowflake/ml/modeling/mixture/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/mixture/estimators_info.bzl rename to snowflake/ml/modeling/mixture/estimators_info.bzl diff --git a/snowflake/ml/sklearn/model_selection/BUILD.bazel b/snowflake/ml/modeling/model_selection/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/model_selection/BUILD.bazel rename to snowflake/ml/modeling/model_selection/BUILD.bazel diff --git a/snowflake/ml/sklearn/model_selection/estimators_info.bzl b/snowflake/ml/modeling/model_selection/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/model_selection/estimators_info.bzl rename to snowflake/ml/modeling/model_selection/estimators_info.bzl diff --git a/snowflake/ml/sklearn/multiclass/BUILD.bazel b/snowflake/ml/modeling/multiclass/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/multiclass/BUILD.bazel rename to snowflake/ml/modeling/multiclass/BUILD.bazel diff --git a/snowflake/ml/sklearn/multiclass/estimators_info.bzl b/snowflake/ml/modeling/multiclass/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/multiclass/estimators_info.bzl rename to snowflake/ml/modeling/multiclass/estimators_info.bzl diff --git a/snowflake/ml/sklearn/multioutput/BUILD.bazel b/snowflake/ml/modeling/multioutput/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/multioutput/BUILD.bazel rename to snowflake/ml/modeling/multioutput/BUILD.bazel diff --git a/snowflake/ml/sklearn/multioutput/estimators_info.bzl b/snowflake/ml/modeling/multioutput/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/multioutput/estimators_info.bzl rename to snowflake/ml/modeling/multioutput/estimators_info.bzl diff --git a/snowflake/ml/sklearn/naive_bayes/BUILD.bazel b/snowflake/ml/modeling/naive_bayes/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/naive_bayes/BUILD.bazel rename to snowflake/ml/modeling/naive_bayes/BUILD.bazel diff --git a/snowflake/ml/sklearn/naive_bayes/estimators_info.bzl b/snowflake/ml/modeling/naive_bayes/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/naive_bayes/estimators_info.bzl rename to snowflake/ml/modeling/naive_bayes/estimators_info.bzl diff --git a/snowflake/ml/sklearn/neighbors/BUILD.bazel b/snowflake/ml/modeling/neighbors/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/neighbors/BUILD.bazel rename to snowflake/ml/modeling/neighbors/BUILD.bazel diff --git a/snowflake/ml/sklearn/neighbors/estimators_info.bzl b/snowflake/ml/modeling/neighbors/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/neighbors/estimators_info.bzl rename to 
snowflake/ml/modeling/neighbors/estimators_info.bzl diff --git a/snowflake/ml/sklearn/neural_network/BUILD.bazel b/snowflake/ml/modeling/neural_network/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/neural_network/BUILD.bazel rename to snowflake/ml/modeling/neural_network/BUILD.bazel diff --git a/snowflake/ml/sklearn/neural_network/estimators_info.bzl b/snowflake/ml/modeling/neural_network/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/neural_network/estimators_info.bzl rename to snowflake/ml/modeling/neural_network/estimators_info.bzl diff --git a/snowflake/ml/modeling/pipeline/BUILD.bazel b/snowflake/ml/modeling/pipeline/BUILD.bazel new file mode 100644 index 00000000..8ccf8c36 --- /dev/null +++ b/snowflake/ml/modeling/pipeline/BUILD.bazel @@ -0,0 +1,33 @@ +load("//bazel:py_rules.bzl", "py_library") +load("@rules_python//python:packaging.bzl", "py_package") + +package(default_visibility = ["//visibility:public"]) + +py_library( + name = "init", + srcs = [ + "__init__.py", + ], + deps = [ + "//snowflake/ml/_internal:init_utils" + ], +) + +py_library( + name = "pipeline", + srcs = [ + "pipeline.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + ], +) + +py_package( + name = "pipeline_pkg", + packages = ["snowflake.ml"], + deps = [ + ":pipeline", + ], +) diff --git a/snowflake/ml/modeling/pipeline/__init__.py b/snowflake/ml/modeling/pipeline/__init__.py new file mode 100644 index 00000000..6010bc4d --- /dev/null +++ b/snowflake/ml/modeling/pipeline/__init__.py @@ -0,0 +1,9 @@ +import os + +from snowflake.ml._internal import init_utils + +pkg_dir = os.path.dirname(os.path.abspath(__file__)) +pkg_name = __name__ +exportable_classes = init_utils.fetch_classes_from_modules_in_pkg_dir(pkg_dir=pkg_dir, pkg_name=pkg_name) +for k, v in exportable_classes.items(): + globals()[k] = v diff --git a/snowflake/ml/sklearn/framework/pipeline.py b/snowflake/ml/modeling/pipeline/pipeline.py similarity index 56% rename from snowflake/ml/sklearn/framework/pipeline.py rename to snowflake/ml/modeling/pipeline/pipeline.py index f245a156..0cd419a0 100644 --- a/snowflake/ml/sklearn/framework/pipeline.py +++ b/snowflake/ml/modeling/pipeline/pipeline.py @@ -2,15 +2,19 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. # -from typing import Any, Callable, List, Optional, Tuple, Union +from itertools import chain +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +import numpy as np import pandas as pd -from sklearn import pipeline +from sklearn import __version__ as skversion, pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import FunctionTransformer from sklearn.utils import metaestimators from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import _utils, base _PROJECT = "ModelDevelopment" _SUBPROJECT = "Framework" @@ -41,6 +45,35 @@ def has_callable_attr(obj: object, attr: str) -> bool: return callable(getattr(obj, attr, None)) + +def _get_column_indices(all_columns: List[str], target_columns: List[str]) -> List[int]: + """ + Extract the indices of the target_columns from all_columns. + + Args: + all_columns: List of all the columns in a dataframe. + target_columns: List of target column names to be extracted. + + Returns: + Returns the list of indices of the target columns in the original column array.
+ + Raises: + ValueError: If the target column is not present in the original column array. + """ + column_indices = [] + for col in target_columns: + found = False + for i, c in enumerate(all_columns): + if c == col: + column_indices.append(i) + found = True + break + if not found: + raise ValueError( + f"Selected column {col} is not found in the input dataframe. Columns in the input df: {all_columns}" + ) + return column_indices + + class Pipeline(base.BaseTransformer): def __init__(self, steps: List[Tuple[str, Any]]) -> None: """ @@ -65,6 +98,16 @@ def __init__(self, steps: List[Tuple[str, Any]]) -> None: self.steps = steps self._is_final_step_estimator = Pipeline._is_estimator(steps[-1][1]) self._is_fitted = False + self._feature_names_in: List[np.ndarray[Any, np.dtype[Any]]] = [] + self._n_features_in: List[int] = [] + self._transformers_to_input_indices: Dict[str, List[int]] = {} + self._is_convertable_to_sklearn = True + + deps: Set[str] = {f"pandas=={pd.__version__}", f"scikit-learn=={skversion}"} + for _, obj in steps: + if isinstance(obj, base.BaseTransformer): + deps = deps | set(obj._get_dependencies()) + self._deps = list(deps) @staticmethod def _is_estimator(obj: object) -> bool: @@ -90,6 +133,61 @@ def _validate_steps(self) -> None: f"{name} (type {type(t)}) doesn't." ) + def _reset(self) -> None: + super()._reset() + self._feature_names_in = [] + self._n_features_in = [] + self._transformers_to_input_indices = {} + + def _is_pipeline_modifying_label_or_sample_weight(self) -> bool: + """ + Checks if pipeline is modifying label or sample_weight columns. + + Returns: + True if pipeline is processing label or sample_weight columns, False otherwise. + """ + estimator_step = self._get_estimator() + if not estimator_step: + return False + + target_cols = set( + estimator_step[1].get_label_cols() + + ([] if not estimator_step[1].get_sample_weight_col() else [estimator_step[1].get_sample_weight_col()]) + ) + processed_cols = set(chain.from_iterable([trans.get_input_cols() for (_, trans) in self._get_transformers()])) + return len(target_cols & processed_cols) > 0 + + def _get_sanitized_list_of_columns(self, columns: List[str]) -> List[str]: + """ + Removes the label and sample_weight columns from the input list of columns and returns the results for the + purpose of computing column indices for SKLearn ColumnTransformer objects. + + Args: + columns: List of input columns for a transformer step. + + Returns: + Returns a list of columns without label and sample_weight columns.
+ """ + estimator_step = self._get_estimator() + if not estimator_step: + return columns + + target_cols = set( + estimator_step[1].get_label_cols() + + ([] if not estimator_step[1].get_sample_weight_col() else [estimator_step[1].get_sample_weight_col()]) + ) + + return [c for c in columns if c not in target_cols] + + def _append_step_feature_consumption_info(self, step_name: str, all_cols: List[str], input_cols: List[str]) -> None: + if self._is_convertable_to_sklearn: + all_cols = self._get_sanitized_list_of_columns(all_cols) + self._feature_names_in.append(np.asarray(all_cols, dtype=object)) + self._n_features_in.append(len(all_cols)) + self._transformers_to_input_indices[step_name] = _get_column_indices( + all_columns=all_cols, target_columns=input_cols + ) + def _transform_dataset( self, dataset: Union[snowpark.DataFrame, pd.DataFrame] ) -> Union[snowpark.DataFrame, pd.DataFrame]: @@ -101,8 +199,13 @@ def _transform_dataset( def _fit_transform_dataset( self, dataset: Union[snowpark.DataFrame, pd.DataFrame] ) -> Union[snowpark.DataFrame, pd.DataFrame]: + self._reset() + self._is_convertable_to_sklearn = not self._is_pipeline_modifying_label_or_sample_weight() transformed_dataset = dataset - for _, trans in self._get_transformers(): + for name, trans in self._get_transformers(): + self._append_step_feature_consumption_info( + step_name=name, all_cols=transformed_dataset.columns[:], input_cols=trans.get_input_cols() + ) if has_callable_attr(trans, "fit_transform"): transformed_dataset = trans.fit_transform(transformed_dataset) else: @@ -131,8 +234,13 @@ def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "Pipeline": estimator = self._get_estimator() if estimator: + all_cols = transformed_dataset.columns[:] estimator[1].fit(transformed_dataset) + self._append_step_feature_consumption_info( + step_name=estimator[0], all_cols=all_cols, input_cols=estimator[1].get_input_cols() + ) + self._is_fitted = True return self @@ -343,10 +451,111 @@ def _create_unfitted_sklearn_object(self) -> pipeline.Pipeline: sksteps = [] for step in self.steps: if isinstance(step[1], base.BaseTransformer): - sksteps.append(tuple([step[0], step[1].get_sklearn_object()])) + sksteps.append(tuple([step[0], _utils.to_native_format(step[1])])) else: sksteps.append(tuple([step[0], step[1]])) return pipeline.Pipeline(steps=sksteps) + def _construct_fitted_column_transformer_object( + self, + step_name_in_pipeline: str, + step_index_in_pipeline: int, + step_name_in_ct: str, + step_transformer_obj: Any, + remainder_action: str, + ) -> ColumnTransformer: + """ + Constructs a fitted column transformer object with one step. + + Args: + step_name_in_pipeline: Name of the step in origional pipeline. + step_index_in_pipeline: Index of the step in the original pipeline. + step_name_in_ct: Name of the step in column transformer. + step_transformer_obj: SKLearn object for the transformer or "passthrough". + remainder_action: Action to take on the remainder of input. Possible options "drop" or "passthrough". + + Returns: + Returns a fitted column transformer object. + """ + input_col_indices = self._transformers_to_input_indices[step_name_in_pipeline] + ct = ColumnTransformer( + transformers=[(step_name_in_ct, step_transformer_obj, input_col_indices)], remainder="passthrough" + ) + if step_index_in_pipeline == 0: + # Add column name check for only first transformer. Everything else works with ndarrays as input. 
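+            # (Only this first ColumnTransformer in the converted pipeline receives the original pandas DataFrame,
+            # so it is the only step where column-name metadata such as feature_names_in_ is meaningful; every
+            # later step receives the ndarray produced by the previous ColumnTransformer.)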
+ ct.feature_names_in_ = self._feature_names_in[step_index_in_pipeline] + ct.n_features_in_ = self._n_features_in[step_index_in_pipeline] + ct._columns = [input_col_indices] + ct._n_features = self._n_features_in[step_index_in_pipeline] + remaining = sorted(set(range(self._n_features_in[step_index_in_pipeline])) - set(input_col_indices)) + ct._remainder = ("remainder", remainder_action, remaining) + ct._transformer_to_input_indices = {step_name_in_ct: input_col_indices, "remainder": remaining} + ct.transformers_ = [ + (step_name_in_ct, step_transformer_obj, input_col_indices), + ("remainder", remainder_action, remaining), + ] + ct.sparse_output_ = False + + # ColumnTransformer internally replaces the "passthrough" string in the "remainder" step with a + # fitted FunctionTransformer, saved in the _name_to_fitted_passthrough dict, during the transform() + # call. So we need to populate the _name_to_fitted_passthrough dict with a fitted FunctionTransformer so + # that the replacement works correctly during the transform() call. + ft = FunctionTransformer( + accept_sparse=True, + check_inverse=False, + feature_names_out="one-to-one", + ) + + if remainder_action == "passthrough": + ft.n_features_in_ = len(remaining) + ct._name_to_fitted_passthrough = {"remainder": ft} + elif step_transformer_obj == "passthrough": + ft.n_features_in_ = self._n_features_in[step_index_in_pipeline] + ct._name_to_fitted_passthrough = {step_name_in_ct: ft} + return ct + def _create_sklearn_object(self) -> pipeline.Pipeline: - return self._create_unfitted_sklearn_object() + if not self._is_fitted: + return self._create_unfitted_sklearn_object() + + if not self._is_convertable_to_sklearn: + raise ValueError( + "The pipeline can't be converted to an SKLearn equivalent because it processes label or sample_weight " + "columns as part of its preprocessing steps, which is not allowed in SKLearn." + ) + + # Create a fitted sklearn pipeline object by translating each non-estimator step in the pipeline into + # a fitted column transformer.
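+        # An illustrative sketch (step names are hypothetical) of the object produced below for a fitted
+        # pipeline [("scaler", MinMaxScaler(...)), ("clf", <estimator>)]:
+        #     pipeline.Pipeline(steps=[
+        #         ("scaler", ColumnTransformer([("scaler", <fitted sklearn scaler>, [scaler input indices])], remainder="passthrough")),
+        #         ("filter_input_cols_for_estimator", ColumnTransformer([("filter_input_cols_for_estimator", "passthrough", [estimator input indices])], remainder="drop")),
+        #         ("clf", <fitted sklearn estimator>),
+        #     ])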
+ sksteps = [] + for i, (name, trans) in enumerate(self._get_transformers()): + if isinstance(trans, base.BaseTransformer): + trans = self._construct_fitted_column_transformer_object( + step_name_in_pipeline=name, + step_index_in_pipeline=i, + step_name_in_ct=name, + step_transformer_obj=_utils.to_native_format(trans), + remainder_action="passthrough", + ) + + sksteps.append(tuple([name, trans])) + + estimator_step = self._get_estimator() + if estimator_step: + if isinstance(estimator_step[1], base.BaseTransformer): + ct = self._construct_fitted_column_transformer_object( + step_name_in_pipeline=estimator_step[0], + step_index_in_pipeline=i, + step_name_in_ct="filter_input_cols_for_estimator", + step_transformer_obj="passthrough", + remainder_action="drop", + ) + + sksteps.append(tuple(["filter_input_cols_for_estimator", ct])) + sksteps.append(tuple([estimator_step[0], _utils.to_native_format(estimator_step[1])])) + else: + sksteps.append(estimator_step) + + return pipeline.Pipeline(steps=sksteps) + + def _get_dependencies(self) -> List[str]: + return self._deps diff --git a/snowflake/ml/modeling/preprocessing/BUILD.bazel b/snowflake/ml/modeling/preprocessing/BUILD.bazel new file mode 100644 index 00000000..eaab4887 --- /dev/null +++ b/snowflake/ml/modeling/preprocessing/BUILD.bazel @@ -0,0 +1,7 @@ +load("//codegen:codegen_rules.bzl", "autogen_estimators", "autogen_init_file_for_module") +load(":estimators_info.bzl", "estimator_info_list") +load(":BUILD_NATIVE.bzl", "get_build_rules_for_native_impl") +package(default_visibility = ["//visibility:public"]) + +autogen_estimators(module="sklearn.preprocessing", estimator_info_list=estimator_info_list) +get_build_rules_for_native_impl() diff --git a/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl b/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl new file mode 100644 index 00000000..66c3c093 --- /dev/null +++ b/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl @@ -0,0 +1,157 @@ +load("//bazel:py_rules.bzl", "py_library") +load("@rules_python//python:packaging.bzl", "py_package") + +def get_build_rules_for_native_impl(): + py_library( + name = "init", + srcs = [ + "__init__.py", + ], + deps = [ + "//snowflake/ml/_internal:init_utils", + ], + ) + + py_library( + name = "binarizer", + srcs = [ + "binarizer.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "k_bins_discretizer", + srcs = [ + "k_bins_discretizer.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "label_encoder", + srcs = [ + "label_encoder.py", + ], + deps = [ + ":init", + ":ordinal_encoder", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "max_abs_scaler", + srcs = [ + "max_abs_scaler.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "min_max_scaler", + srcs = [ + "min_max_scaler.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "normalizer", + srcs = [ + "normalizer.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "one_hot_encoder", + srcs = [ + "one_hot_encoder.py", + ], + deps = [ + ":init", + 
"//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/model:model_signature", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "ordinal_encoder", + srcs = [ + "ordinal_encoder.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/_internal:type_utils", + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "robust_scaler", + srcs = [ + "robust_scaler.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_library( + name = "standard_scaler", + srcs = [ + "standard_scaler.py", + ], + deps = [ + ":init", + "//snowflake/ml/_internal:telemetry", + "//snowflake/ml/modeling/framework", + ], + ) + + py_package( + name = "preprocessing_pkg", + packages = ["snowflake.ml"], + deps = [ + ":binarizer", + ":k_bins_discretizer", + ":label_encoder", + ":max_abs_scaler", + ":min_max_scaler", + ":normalizer", + ":one_hot_encoder", + ":ordinal_encoder", + ":robust_scaler", + ":standard_scaler", + ], + ) diff --git a/snowflake/ml/modeling/preprocessing/__init__.py b/snowflake/ml/modeling/preprocessing/__init__.py new file mode 100644 index 00000000..6010bc4d --- /dev/null +++ b/snowflake/ml/modeling/preprocessing/__init__.py @@ -0,0 +1,9 @@ +import os + +from snowflake.ml._internal import init_utils + +pkg_dir = os.path.dirname(os.path.abspath(__file__)) +pkg_name = __name__ +exportable_classes = init_utils.fetch_classes_from_modules_in_pkg_dir(pkg_dir=pkg_dir, pkg_name=pkg_name) +for k, v in exportable_classes.items(): + globals()[k] = v diff --git a/snowflake/ml/sklearn/preprocessing/binarizer.py b/snowflake/ml/modeling/preprocessing/binarizer.py similarity index 93% rename from snowflake/ml/sklearn/preprocessing/binarizer.py rename to snowflake/ml/modeling/preprocessing/binarizer.py index 4e272f92..d2e6fd8f 100644 --- a/snowflake/ml/sklearn/preprocessing/binarizer.py +++ b/snowflake/ml/modeling/preprocessing/binarizer.py @@ -9,7 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T @@ -110,6 +110,7 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] self._validate_data_has_no_nulls(dataset) output_columns = [] for input_col in self.input_cols: @@ -117,6 +118,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame output_columns.append(col) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
+ transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.Binarizer: diff --git a/snowflake/ml/modeling/preprocessing/estimators_info.bzl b/snowflake/ml/modeling/preprocessing/estimators_info.bzl new file mode 100644 index 00000000..b791d55d --- /dev/null +++ b/snowflake/ml/modeling/preprocessing/estimators_info.bzl @@ -0,0 +1,3 @@ +estimator_info_list = [ + struct(class_name="PolynomialFeatures", normalized_class_name="polynomial_features") +] diff --git a/snowflake/ml/sklearn/preprocessing/k_bins_discretizer.py b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py similarity index 93% rename from snowflake/ml/sklearn/preprocessing/k_bins_discretizer.py rename to snowflake/ml/modeling/preprocessing/k_bins_discretizer.py index 06a5518e..87fa662a 100644 --- a/snowflake/ml/sklearn/preprocessing/k_bins_discretizer.py +++ b/snowflake/ml/modeling/preprocessing/k_bins_discretizer.py @@ -15,7 +15,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils @@ -322,6 +322,7 @@ def _handle_ordinal(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: Returns: Output dataset with ordinal encoding. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] # NB: the reason we need to generate a random UDF name each time is because the UDF registration # is centralized per database, so if there are multiple sessions with same UDF name, there might be # a conflict and some parties could fail to fetch the UDF. @@ -347,10 +348,12 @@ def vec_bucketize(x: T.PandasSeries[float], boarders: T.PandasSeries[List[float] boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] # type: ignore[arg-type, index] dataset = dataset.select( *dataset.columns, - F.call_udf( - f"{udf_name}", F.col(input_col), F.array_construct(*boarders) # type: ignore[arg-type] - ).alias(output_col), + F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias( # type: ignore[arg-type] + output_col + ), ) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. + dataset = dataset[self.output_cols + passthrough_columns] return dataset def _handle_onehot(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: @@ -364,6 +367,7 @@ def _handle_onehot(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: Returns: Output dataset in sparse representation. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] udf_name = f"vec_bucketize_sparse_{snowpark_utils.generate_random_alphanumeric()}" @F.pandas_udf( # type: ignore[arg-type, misc] @@ -390,10 +394,10 @@ def vec_bucketize_sparse_output( boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] # type: ignore[arg-type, index] dataset = dataset.select( *dataset.columns, - F.call_udf(f"{udf_name}", F.col(input_col), F.array_construct(*boarders)).alias( # type: ignore - output_col - ), + F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), # type: ignore ) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
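+        # (For example, with output_cols = ["A_BUCKET", "B_BUCKET"] and a remaining column "LABEL", the line below
+        # yields the column order ["A_BUCKET", "B_BUCKET", "LABEL"]; the column names here are placeholders.)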
+ dataset = dataset[self.output_cols + passthrough_columns] return dataset def _handle_onehot_dense(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: @@ -407,6 +411,9 @@ def _handle_onehot_dense(self, dataset: snowpark.DataFrame) -> snowpark.DataFram Returns: Output dataset in dense representation. """ + origional_dataset_columns = dataset.columns[:] + all_output_cols = [] + udf_name = f"vec_bucketize_dense_{snowpark_utils.generate_random_alphanumeric()}" @F.pandas_udf( # type: ignore[arg-type, misc] @@ -434,15 +441,16 @@ def vec_bucketize_dense_output( boarders = [F.lit(float(x)) for x in self.bin_edges_[idx]] # type: ignore[arg-type, index] dataset = dataset.select( *dataset.columns, - F.call_udf(f"{udf_name}", F.col(input_col), F.array_construct(*boarders)).alias( # type: ignore - output_col - ), + F.call_udf(udf_name, F.col(input_col), F.array_construct(*boarders)).alias(output_col), # type: ignore ) dataset = dataset.with_columns( [f"{output_col}_{i}" for i in range(len(boarders) - 1)], [F.col(output_col)[i].cast(T.IntegerType()) for i in range(len(boarders) - 1)], ).drop(output_col) + all_output_cols += [f"{output_col}_{i}" for i in range(len(boarders) - 1)] + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. + dataset = dataset[all_output_cols + origional_dataset_columns] return dataset def _transform_sklearn(self, dataset: pd.DataFrame) -> Union[pd.DataFrame, sparse.csr_matrix]: @@ -457,7 +465,7 @@ def _transform_sklearn(self, dataset: pd.DataFrame) -> Union[pd.DataFrame, spars Output dataset. """ self.enforce_fit() - encoder_sklearn = self.get_sklearn_object() + encoder_sklearn = self.to_sklearn() transformed_dataset = encoder_sklearn.transform(dataset[self.input_cols]) diff --git a/snowflake/ml/sklearn/preprocessing/label_encoder.py b/snowflake/ml/modeling/preprocessing/label_encoder.py similarity index 97% rename from snowflake/ml/sklearn/preprocessing/label_encoder.py rename to snowflake/ml/modeling/preprocessing/label_encoder.py index 50dc4e02..372c6f66 100644 --- a/snowflake/ml/sklearn/preprocessing/label_encoder.py +++ b/snowflake/ml/modeling/preprocessing/label_encoder.py @@ -9,8 +9,8 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils -from snowflake.ml.sklearn.framework import base -from snowflake.ml.sklearn.preprocessing import ordinal_encoder +from snowflake.ml.modeling.framework import base +from snowflake.ml.modeling.preprocessing import ordinal_encoder class LabelEncoder(base.BaseTransformer): diff --git a/snowflake/ml/sklearn/preprocessing/max_abs_scaler.py b/snowflake/ml/modeling/preprocessing/max_abs_scaler.py similarity index 95% rename from snowflake/ml/sklearn/preprocessing/max_abs_scaler.py rename to snowflake/ml/modeling/preprocessing/max_abs_scaler.py index 3bd0675d..22c96389 100644 --- a/snowflake/ml/sklearn/preprocessing/max_abs_scaler.py +++ b/snowflake/ml/modeling/preprocessing/max_abs_scaler.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base class MaxAbsScaler(base.BaseTransformer): @@ -163,6 +163,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Returns: Output dataset. 
""" + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] output_columns = [] for _, input_col in enumerate(self.input_cols): col = dataset[input_col] @@ -170,6 +171,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame output_columns.append(col) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. + transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.MaxAbsScaler: diff --git a/snowflake/ml/sklearn/preprocessing/min_max_scaler.py b/snowflake/ml/modeling/preprocessing/min_max_scaler.py similarity index 96% rename from snowflake/ml/sklearn/preprocessing/min_max_scaler.py rename to snowflake/ml/modeling/preprocessing/min_max_scaler.py index e370ff64..ea96063c 100644 --- a/snowflake/ml/sklearn/preprocessing/min_max_scaler.py +++ b/snowflake/ml/modeling/preprocessing/min_max_scaler.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import _utils, base +from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F @@ -185,6 +185,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Returns: Output dataset. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] output_columns = [] for _, input_col in enumerate(self.input_cols): output_column = dataset[input_col] * self.scale_[input_col] + self.min_[input_col] @@ -202,6 +203,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame output_columns.append(output_column) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
+ transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.MinMaxScaler: diff --git a/snowflake/ml/sklearn/preprocessing/normalizer.py b/snowflake/ml/modeling/preprocessing/normalizer.py similarity index 94% rename from snowflake/ml/sklearn/preprocessing/normalizer.py rename to snowflake/ml/modeling/preprocessing/normalizer.py index 48feaf22..b3bf8b0a 100644 --- a/snowflake/ml/sklearn/preprocessing/normalizer.py +++ b/snowflake/ml/modeling/preprocessing/normalizer.py @@ -9,7 +9,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T _VALID_NORMS = ["l1", "l2", "max"] @@ -112,6 +112,7 @@ def transform(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> Union[s return self._drop_input_columns(output_df) if self._drop_input_cols is True else output_df def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] self._validate_data_has_no_nulls(dataset) if len(self.input_cols) == 0: raise ValueError("Found array with 0 columns, but a minimum of 1 is required.") @@ -143,6 +144,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame output_columns.append(output_column) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. + transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.Normalizer: diff --git a/snowflake/ml/sklearn/preprocessing/one_hot_encoder.py b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py similarity index 95% rename from snowflake/ml/sklearn/preprocessing/one_hot_encoder.py rename to snowflake/ml/modeling/preprocessing/one_hot_encoder.py index 378456d9..88a9d0ef 100644 --- a/snowflake/ml/sklearn/preprocessing/one_hot_encoder.py +++ b/snowflake/ml/modeling/preprocessing/one_hot_encoder.py @@ -16,7 +16,8 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils from snowflake.ml._internal.utils import identifier -from snowflake.ml.sklearn.framework import _utils, base +from snowflake.ml.model import model_signature +from snowflake.ml.modeling.framework import _utils, base from snowflake.snowpark import functions as F, types as T from snowflake.snowpark._internal import utils as snowpark_utils @@ -293,6 +294,8 @@ def _fit_sklearn(self, dataset: pd.DataFrame) -> None: self._update_categories_state() def _fit_snowpark(self, dataset: snowpark.DataFrame) -> None: + # StructType[[StructField(COLUMN, TYPE, nullable=True), ...] + self._dataset_schema = dataset.schema fit_results = self._fit_category_state(dataset, return_counts=self._infrequent_enabled) if self._infrequent_enabled: self._fit_infrequent_category_mapping(fit_results["n_samples"], fit_results["category_counts"]) @@ -476,6 +479,27 @@ def _assign_categories(self, state_object_pandas: pd.DataFrame) -> None: categories: Dict[str, type_utils.LiteralNDArrayType] = categories_pandas.set_index(_COLUMN_NAME).to_dict()[ categories_col ] + # Giving the original type back to categories. 
+ for k, v in categories.items(): + snowml_type = model_signature.DataType.from_snowpark_type(self._dataset_schema[k].datatype) + # Don't convert the boolean type, numpy is unable to switch from string to boolean. + # Boolean types would be treated as string + if snowml_type not in [model_signature.DataType.BOOL]: + # If the category contains None values - None stays None; other values are converted. Type unchanged + if pd.isnull(v).any(): + categories[k] = np.where(pd.isnull(v), v, v.astype(snowml_type._numpy_type)) + # Otherwise, convert the whole array, changing the array type. + else: + categories[k] = v.astype(snowml_type._numpy_type) + else: + # Custom function to convert string to bool + # Vectorize the function to work with arrays + vectorized_func = _utils._handle_str_bool_type() + if pd.isnull(v).any(): + categories[k] = np.where(pd.isnull(v), v, vectorized_func(v)) + # Otherwise, convert the whole array, changing the array type. + else: + categories[k] = vectorized_func(v) self.categories_ = categories else: self.categories_ = self.categories @@ -545,6 +569,15 @@ def map_encoding(row: pd.Series) -> int: has_infrequent_categories = self._infrequent_enabled and self.infrequent_categories_[col_idx] is not None cat = row[_CATEGORY] + if hasattr(self, "_dataset_schema") and not pd.isnull(cat): # Do not convert when it is null + row_element = np.array([row[_CATEGORY]]) + snowml_type = model_signature.DataType.from_snowpark_type(self._dataset_schema[input_col].datatype) + # Don't convert the boolean type, it would be treated as string + if snowml_type not in [model_signature.DataType.BOOL]: + cat = row_element.astype(snowml_type._numpy_type)[0] + else: + if not pd.isnull(cat) and isinstance(cat, str): + cat = _utils.str_to_bool(cat) # np.isnan cannot be applied to object or string dtypes, use pd.isnull instead cat_idx = ( np.where(pd.isnull(self.categories_[input_col]))[0][0] @@ -672,8 +705,11 @@ def map_encoded_value(row: pd.Series) -> Dict[str, Any]: state_df = dataset._session.create_dataframe(state_pandas) transformed_dataset = dataset + origional_dataset_columns = transformed_dataset.columns[:] + all_output_cols = [] for idx, input_col in enumerate(self.input_cols): output_col = self.output_cols[idx] + all_output_cols += [output_col] input_col_state_df = state_df.filter(F.col(_COLUMN_NAME) == input_col)[ [_CATEGORY, _ENCODED_VALUE] ].with_column_renamed(_ENCODED_VALUE, output_col) @@ -686,6 +722,8 @@ def map_encoded_value(row: pd.Series) -> Dict[str, Any]: )[transformed_dataset.columns + [output_col]] transformed_dataset = self._handle_unknown_in_transform(transformed_dataset) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
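Editor's note: the `_assign_categories` change above converts the categories fetched from Snowflake (which arrive as strings) back to each column's original dtype while leaving nulls untouched, and `map_encoding` applies the same conversion to individual category values. The sketch below isolates that null-preserving conversion with hypothetical values; the real code derives the target type from the saved Snowpark schema via `model_signature.DataType`, and booleans take a separate string-to-bool path not shown here.

```
import numpy as np
import pandas as pd


def restore_dtype(values: np.ndarray, numpy_type: type) -> np.ndarray:
    # Convert only the non-null entries; None/NaN categories stay as-is.
    mask = pd.isnull(values)
    if mask.any():
        out = values.astype(object)
        out[~mask] = values[~mask].astype(numpy_type)
        return out
    return values.astype(numpy_type)


cats = np.array(["1", "3", None], dtype=object)  # categories read back as strings
print(restore_dtype(cats, np.int64))             # [1 3 None]
```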
+ transformed_dataset = transformed_dataset[all_output_cols + origional_dataset_columns] return transformed_dataset def _transform_snowpark_dense(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: @@ -732,10 +770,13 @@ def map_encoded_value(row: pd.Series) -> List[int]: state_df = dataset._session.create_dataframe(state_pandas) transformed_dataset = dataset + origional_dataset_columns = transformed_dataset.columns[:] + all_output_cols = [] for input_col in self.input_cols: output_cols = [ identifier.quote_name_without_upper_casing(col) for col in self._dense_output_cols_mappings[input_col] ] + all_output_cols += output_cols input_col_state_df = state_df.filter(F.col(_COLUMN_NAME) == input_col)[output_cols + [_CATEGORY]] # index values through a left join over the dataset and its states @@ -746,6 +787,8 @@ def map_encoded_value(row: pd.Series) -> List[int]: )[transformed_dataset.columns + output_cols] transformed_dataset = self._handle_unknown_in_transform(transformed_dataset) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. + transformed_dataset = transformed_dataset[all_output_cols + origional_dataset_columns] return transformed_dataset def _transform_snowpark_sparse_udf(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame: @@ -763,7 +806,7 @@ def _transform_snowpark_sparse_udf(self, dataset: snowpark.DataFrame) -> snowpar Returns: Output dataset in the sparse representation. """ - encoder_sklearn = self.get_sklearn_object() + encoder_sklearn = self.to_sklearn() @F.pandas_udf( # type: ignore is_permanent=False, @@ -821,7 +864,7 @@ def _transform_sklearn(self, dataset: pd.DataFrame) -> Union[pd.DataFrame, spars Returns: Output dataset. """ - encoder_sklearn = self.get_sklearn_object() + encoder_sklearn = self.to_sklearn() transformed_dataset = encoder_sklearn.transform(dataset[self.input_cols]) diff --git a/snowflake/ml/sklearn/preprocessing/ordinal_encoder.py b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py similarity index 98% rename from snowflake/ml/sklearn/preprocessing/ordinal_encoder.py rename to snowflake/ml/modeling/preprocessing/ordinal_encoder.py index b2f05665..6c09d141 100644 --- a/snowflake/ml/sklearn/preprocessing/ordinal_encoder.py +++ b/snowflake/ml/modeling/preprocessing/ordinal_encoder.py @@ -13,7 +13,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry, type_utils from snowflake.ml._internal.utils import identifier -from snowflake.ml.sklearn.framework import base +from snowflake.ml.modeling.framework import base from snowflake.snowpark import functions as F, types as T _COLUMN_NAME = "_COLUMN_NAME" @@ -437,6 +437,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Returns: Output dataset. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] assert dataset._session is not None, "dataset._session cannot be None" state_df = ( dataset._session.table(self._vocab_table_name) @@ -482,6 +483,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame transformed_dataset = transformed_dataset.with_column_renamed(F.col(_CATEGORY + suffix), _CATEGORY) transformed_dataset = self._handle_unknown_in_transform(transformed_dataset) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
+ transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.OrdinalEncoder: diff --git a/snowflake/ml/sklearn/preprocessing/robust_scaler.py b/snowflake/ml/modeling/preprocessing/robust_scaler.py similarity index 96% rename from snowflake/ml/sklearn/preprocessing/robust_scaler.py rename to snowflake/ml/modeling/preprocessing/robust_scaler.py index 006ad395..755d2856 100644 --- a/snowflake/ml/sklearn/preprocessing/robust_scaler.py +++ b/snowflake/ml/modeling/preprocessing/robust_scaler.py @@ -12,7 +12,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import _utils, base +from snowflake.ml.modeling.framework import _utils, base class RobustScaler(base.BaseTransformer): @@ -213,6 +213,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Returns: Output dataset. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] output_columns = [] for _, input_col in enumerate(self.input_cols): col = dataset[input_col] @@ -223,6 +224,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame output_columns.append(col) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. + transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.RobustScaler: diff --git a/snowflake/ml/sklearn/preprocessing/standard_scaler.py b/snowflake/ml/modeling/preprocessing/standard_scaler.py similarity index 96% rename from snowflake/ml/sklearn/preprocessing/standard_scaler.py rename to snowflake/ml/modeling/preprocessing/standard_scaler.py index bab46f77..992abb3a 100644 --- a/snowflake/ml/sklearn/preprocessing/standard_scaler.py +++ b/snowflake/ml/modeling/preprocessing/standard_scaler.py @@ -11,7 +11,7 @@ from snowflake import snowpark from snowflake.ml._internal import telemetry -from snowflake.ml.sklearn.framework import _utils, base +from snowflake.ml.modeling.framework import _utils, base class StandardScaler(base.BaseTransformer): @@ -198,6 +198,7 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame Returns: Output dataset. """ + passthrough_columns = [c for c in dataset.columns if c not in self.output_cols] output_columns = [] for _, input_col in enumerate(self.input_cols): output_column = dataset[input_col] @@ -210,6 +211,8 @@ def _transform_snowpark(self, dataset: snowpark.DataFrame) -> snowpark.DataFrame output_columns.append(output_column) transformed_dataset: snowpark.DataFrame = dataset.with_columns(self.output_cols, output_columns) + # Reorder columns. Passthrough columns are added at the right to the output of the transformers. 
+ transformed_dataset = transformed_dataset[self.output_cols + passthrough_columns] return transformed_dataset def _create_unfitted_sklearn_object(self) -> preprocessing.StandardScaler: diff --git a/snowflake/ml/sklearn/semi_supervised/BUILD.bazel b/snowflake/ml/modeling/semi_supervised/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/semi_supervised/BUILD.bazel rename to snowflake/ml/modeling/semi_supervised/BUILD.bazel diff --git a/snowflake/ml/sklearn/semi_supervised/estimators_info.bzl b/snowflake/ml/modeling/semi_supervised/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/semi_supervised/estimators_info.bzl rename to snowflake/ml/modeling/semi_supervised/estimators_info.bzl diff --git a/snowflake/ml/sklearn/svm/BUILD.bazel b/snowflake/ml/modeling/svm/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/svm/BUILD.bazel rename to snowflake/ml/modeling/svm/BUILD.bazel diff --git a/snowflake/ml/sklearn/svm/estimators_info.bzl b/snowflake/ml/modeling/svm/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/svm/estimators_info.bzl rename to snowflake/ml/modeling/svm/estimators_info.bzl diff --git a/snowflake/ml/sklearn/tree/BUILD.bazel b/snowflake/ml/modeling/tree/BUILD.bazel similarity index 100% rename from snowflake/ml/sklearn/tree/BUILD.bazel rename to snowflake/ml/modeling/tree/BUILD.bazel diff --git a/snowflake/ml/sklearn/tree/estimators_info.bzl b/snowflake/ml/modeling/tree/estimators_info.bzl similarity index 100% rename from snowflake/ml/sklearn/tree/estimators_info.bzl rename to snowflake/ml/modeling/tree/estimators_info.bzl diff --git a/snowflake/ml/xgboost/BUILD.bazel b/snowflake/ml/modeling/xgboost/BUILD.bazel similarity index 100% rename from snowflake/ml/xgboost/BUILD.bazel rename to snowflake/ml/modeling/xgboost/BUILD.bazel diff --git a/snowflake/ml/xgboost/estimators_info.bzl b/snowflake/ml/modeling/xgboost/estimators_info.bzl similarity index 100% rename from snowflake/ml/xgboost/estimators_info.bzl rename to snowflake/ml/modeling/xgboost/estimators_info.bzl diff --git a/snowflake/ml/registry/BUILD.bazel b/snowflake/ml/registry/BUILD.bazel index 46120bd1..8b4a2630 100644 --- a/snowflake/ml/registry/BUILD.bazel +++ b/snowflake/ml/registry/BUILD.bazel @@ -16,7 +16,7 @@ py_library( "//snowflake/ml/_internal:telemetry", "//snowflake/ml/model:_model", "//snowflake/ml/model:_deployer", - "//snowflake/ml/sklearn/framework:framework" + "//snowflake/ml/modeling/framework:framework" ], ) diff --git a/snowflake/ml/registry/model_registry.py b/snowflake/ml/registry/model_registry.py index 97c533ab..308be158 100644 --- a/snowflake/ml/registry/model_registry.py +++ b/snowflake/ml/registry/model_registry.py @@ -25,8 +25,8 @@ model_signature, type_hints as model_types, ) +from snowflake.ml.modeling.framework import base from snowflake.ml.registry import _schema -from snowflake.ml.sklearn.framework import base if TYPE_CHECKING: import pandas as pd @@ -56,6 +56,10 @@ _TELEMETRY_SUBPROJECT = "ModelRegistry" +@telemetry.send_api_usage_telemetry( + project=_TELEMETRY_PROJECT, + subproject=_TELEMETRY_SUBPROJECT, +) @snowpark._internal.utils.private_preview(version="0.2.0") def create_model_registry( *, @@ -437,8 +441,8 @@ def _insert_registry_entry( if v and v != properties[k]: raise connector.DataError( formatting.unwrap( - f"""Parameter '{k.lower()}' is given and parameter 'properties' has the field '{k}' set but the values - do not match: {k.lower()}=="{v}" properties['{k}']=="{properties[k]}".""" + f"""Parameter 
'{k.lower()}' is given and parameter 'properties' has the field '{k}' set but + the values do not match: {k.lower()}=="{v}" properties['{k}']=="{properties[k]}".""" ) ) # Could do a multi-table insert here with some pros and cons: @@ -551,20 +555,18 @@ def _list_selected_models( Returns: A Snowpark dataframe representing the models that match the given constraints. - - Raises: - DataError: Model ID or (Model Name + Model Version) is not given. """ - if not (id or (model_name and model_version)): - raise connector.DataError("Either (Model Name + Model Version) or Model ID is required, but none is given.") - models = self.list_models() if id: filtered_models = models.filter(snowpark.Column("ID") == id) else: + self._model_identifier_is_nonempty_or_raise(model_name, model_version) + + # The following two asserts is to satisfy mypy. assert model_name assert model_version + filtered_models = models.filter(snowpark.Column("NAME") == model_name).filter( snowpark.Column("VERSION") == model_version ) @@ -672,7 +674,7 @@ def _set_metadata_attribute( except connector.DataError: raise connector.DataError(f"Setting model name for mode id {id} failed.") - def _model_identifier_is_nonempty_or_raise(self, model_name: str, model_version: str) -> None: + def _model_identifier_is_nonempty_or_raise(self, model_name: Optional[str], model_version: Optional[str]) -> None: """Validate model_name and model_version are non-empty strings. Args: @@ -1300,6 +1302,7 @@ def log_model( Raises: TypeError: Raised when both signatures and sample_input_data is not presented. Will be captured locally. + DataError: Raised when the given model exists. Returns: String of the auto-generate unique model identifier. None if failed. @@ -1309,6 +1312,9 @@ def log_model( self._model_identifier_is_nonempty_or_raise(model_name, model_version) + existing_model_nums = self._list_selected_models(model_name=model_name, model_version=model_version).count() + if existing_model_nums: + raise connector.DataError(f"Model {model_name}/{model_version} already exists. 
Unable to log the model.") with tempfile.TemporaryDirectory() as tmpdir: model = cast(model_types.SupportedModelType, model) try: @@ -1597,10 +1603,9 @@ def __init__( registry: ModelRegistry, model_name: str, model_version: str, - id: Optional[str] = None, ) -> None: self._registry = registry - self._id = id if id else registry._get_model_id(model_name=model_name, model_version=model_version) + self._id = registry._get_model_id(model_name=model_name, model_version=model_version) self._model_name = model_name self._model_version = model_version diff --git a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb index 236f4571..e4640eaa 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging Example.ipynb @@ -43,7 +43,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install snowflake_ml_python-0.3.2-py3-none-any.whl" + "%pip install snowflake_ml_python-0.3.3-py3-none-any.whl" ] }, { @@ -71,7 +71,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install snowflake_ml_python-0.3.2-py3-none-any.whl[tensorflow] transformers==4.24.0" + "%pip install snowflake_ml_python-0.3.3-py3-none-any.whl[tensorflow] transformers==4.24.0" ] }, { @@ -143,71 +143,6 @@ "session = Session.builder.configs(SnowflakeLoginOptions()).create()" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "e2fcbe4a", - "metadata": {}, - "source": [ - "### Let `snowflake-ml-python` available for your models to be deployed" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "671a7710", - "metadata": {}, - "source": [ - "Unfortunately, since `snowflake-ml-python` does not exist in Anaconda channel yet, we have to import them manually to use it when the model get deployed to Snowflake. To avoid upload them again and again, we could set up a temporary stage and upload the wheel file there." 
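Editor's note: taken together, the model_registry.py changes above mean `log_model` now fails fast when the same model name and version already exist, and `ModelReference` no longer accepts an `id` argument (the id is looked up from name plus version). A hedged usage sketch follows; `registry`, `clf`, and `train_features` are assumed to exist from earlier setup, and the registry construction itself is not part of this diff.

```
from snowflake import connector
from snowflake.ml.registry import model_registry

# registry: an existing ModelRegistry; clf / train_features: a fitted model and sample data.
model_id = registry.log_model(
    model_name="my_model", model_version="107.2", model=clf, sample_input_data=train_features
)

# Re-logging the same name/version now raises instead of silently adding a second entry.
try:
    registry.log_model(
        model_name="my_model", model_version="107.2", model=clf, sample_input_data=train_features
    )
except connector.DataError as err:
    print(err)  # Model my_model/107.2 already exists. Unable to log the model.

# ModelReference resolves the id internally from name + version; no id argument anymore.
model = model_registry.ModelReference(registry=registry, model_name="my_model", model_version="107.2")
```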
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eae711f", - "metadata": {}, - "outputs": [], - "source": [ - "SNOW_ML_WHEEL_LOCAL_PATH = \"~/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-0.3.3-py3-none-any.whl\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fcececa", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from typing import Optional\n", - "\n", - "def upload_snowml_to_tmp_stage(session: Session, wheel_path: str, stage_name: Optional[str] = None) -> str:\n", - " \"\"\"Upload model module of snowml to tmp stage.\n", - "\n", - " Args:\n", - " session: Snowpark session.\n", - " wheel_path: Path to the local SnowML wheel file.\n", - "\n", - " Returns:\n", - " The stage path to uploaded snowml.zip file.\n", - " \"\"\"\n", - " if stage_name is None:\n", - " stage_name = session.get_session_stage()\n", - " _ = session.file.put(wheel_path, stage_name, auto_compress=False, overwrite=True)\n", - " whl_filename = os.path.basename(wheel_path)\n", - " return f\"{stage_name}/{whl_filename}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90ea99cc", - "metadata": {}, - "outputs": [], - "source": [ - "SNOW_ML_WHEEL_STAGE_PATH = upload_snowml_to_tmp_stage(session, SNOW_ML_WHEEL_LOCAL_PATH)" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -478,7 +413,7 @@ "id": "c52611ac", "metadata": {}, "source": [ - "Also, since `snowflake-ml-python` does not exist in Anaconda channel yet, we have to import them manually in the options when deploying, it will not required when we our package into Snowflake Anaconda Channel." + "Also, since `snowflake-ml-python` does not exist in Anaconda channel yet, we have to set `_use_local_snowml` as `True` to use local SnowML, it will not required when we our package into Snowflake Anaconda Channel." ] }, { @@ -495,7 +430,7 @@ "model.deploy(\n", " deployment_name=\"svc_model_predict\",\n", " target_method=\"predict\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH},\n", + " options={\"_use_local_snowml\": True},\n", ")" ] }, @@ -532,7 +467,7 @@ "model.deploy(\n", " deployment_name=\"svc_model_predict_proba\",\n", " target_method=\"predict_proba\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH},\n", + " options={\"_use_local_snowml\": True},\n", ")" ] }, @@ -774,7 +709,7 @@ "gpt_model.deploy(\n", " deployment_name=\"gpt_model_predict\",\n", " target_method=\"predict\",\n", - " options={\"relax_version\": True, \"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH},\n", + " options={\"relax_version\": True, \"_use_local_snowml\": True},\n", ")" ] }, @@ -845,25 +780,6 @@ "session.sql(f\"CREATE OR REPLACE STAGE {PERMANENT_UDF_STAGE_NAME}\").collect()" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3203f803", - "metadata": {}, - "source": [ - "To make the deployment permanent, any dependency must be put into the a permanent stage as well. Of course, this will no longer be necessary after `snowflake-ml-python` gets available in Snowflake Anaconda channel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a25b641a", - "metadata": {}, - "outputs": [], - "source": [ - "SNOW_ML_WHEEL_STAGE_PATH = upload_snowml_to_tmp_stage(session, SNOW_ML_WHEEL_LOCAL_PATH, f\"@{PERMANENT_UDF_STAGE_NAME}\")" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -1075,7 +991,7 @@ " options={\n", " \"relax_version\": True,\n", " \"permanent_udf_stage_location\": f\"@{PERMANENT_UDF_STAGE_NAME}\",\n", - " \"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH,\n", + " \"_use_local_snowml\": True,\n", " },\n", ")" ] diff --git a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb index 3f06233d..7a910925 100644 --- a/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb +++ b/snowflake/ml/registry/notebooks/Model Packaging SnowML Examples.ipynb @@ -39,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install snowflake_ml_python-0.3.2-py3-none-any.whl\n", + "%pip install snowflake_ml_python-0.3.3-py3-none-any.whl\n", "\n", "# Snowpark Connector, Snowpark Library, Session\n", "import snowflake.connector\n", @@ -138,68 +138,6 @@ "session = Session.builder.configs(SnowflakeLoginOptions()).create()" ] }, - { - "cell_type": "markdown", - "id": "e2fcbe4a", - "metadata": {}, - "source": [ - "### Let `snowflake-ml-python` available for your models to be deployed" - ] - }, - { - "cell_type": "markdown", - "id": "671a7710", - "metadata": {}, - "source": [ - "Unfortunately, since `snowflake-ml-python` does not exist in Anaconda channel yet, we have to import them manually to use it when the model get deployed to Snowflake. To avoid upload them again and again, we could set up a temporary stage and upload the wheel file there." 
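Editor's note: the notebook edits above and below replace the staged-wheel option (`_snowml_wheel_path`) with a local-package flag. For reference, a hedged sketch of the two deployment variants as they look after this change; `model` is a logged ModelReference, the first deployment name comes from the notebook, and the second deployment name is hypothetical.

```
# Temporary deployment: serve from the session stage, using the locally installed snowflake-ml-python.
model.deploy(
    deployment_name="svc_model_predict",
    target_method="predict",
    options={"_use_local_snowml": True},
)

# Permanent deployment: additionally pin the UDF to a permanent stage (stage name assumed from earlier cells).
model.deploy(
    deployment_name="svc_model_predict_permanent",
    target_method="predict",
    options={
        "relax_version": True,
        "permanent_udf_stage_location": f"@{PERMANENT_UDF_STAGE_NAME}",
        "_use_local_snowml": True,
    },
)
```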
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5eae711f", - "metadata": {}, - "outputs": [], - "source": [ - "SNOW_ML_WHEEL_LOCAL_PATH = \"~/snowml/bazel-bin/snowflake/ml/snowflake_ml_python-0.3.2-py3-none-any.whl\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6fcececa", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "\n", - "def upload_snowml_to_tmp_stage(session: Session, wheel_path: str) -> str:\n", - " \"\"\"Upload model module of snowml to tmp stage.\n", - "\n", - " Args:\n", - " session: Snowpark session.\n", - " wheel_path: Path to the local SnowML wheel file.\n", - "\n", - " Returns:\n", - " The stage path to uploaded snowml.zip file.\n", - " \"\"\"\n", - " tmp_stage = session.get_session_stage()\n", - " _ = session.file.put(wheel_path, tmp_stage, auto_compress=False, overwrite=True)\n", - " whl_filename = os.path.basename(wheel_path)\n", - " return f\"{tmp_stage}/{whl_filename}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "90ea99cc", - "metadata": {}, - "outputs": [], - "source": [ - "SNOW_ML_WHEEL_STAGE_PATH = upload_snowml_to_tmp_stage(session, SNOW_ML_WHEEL_LOCAL_PATH)" - ] - }, { "cell_type": "markdown", "id": "dfa9ab88", @@ -653,7 +591,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -742,7 +680,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_proba\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -1017,7 +955,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -1098,7 +1036,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_proba\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -1191,7 +1129,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_log_proba\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -1451,7 +1389,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -1532,7 +1470,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_proba\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -1625,7 +1563,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict_log_proba\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ 
-1718,7 +1656,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"decision_function\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, @@ -2210,7 +2148,7 @@ "model.deploy(\n", " deployment_name=deploy_name,\n", " target_method=\"predict\",\n", - " options={\"_snowml_wheel_path\": SNOW_ML_WHEEL_STAGE_PATH, \"relax_version\": True},\n", + " options={\"_use_local_snowml\": True, \"relax_version\": True},\n", ")" ] }, diff --git a/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb b/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb index d6af459b..e89bf2ca 100644 --- a/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb +++ b/snowflake/ml/registry/notebooks/Model Registry Demo.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "5de3eb26", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "99e58d8c", "metadata": {}, @@ -70,6 +72,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c592d46c", "metadata": {}, @@ -78,6 +81,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bec73215", "metadata": {}, @@ -118,6 +122,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "20eef3b6", "metadata": {}, @@ -162,6 +167,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "dfa9ab88", "metadata": {}, @@ -170,6 +176,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "676b28b3", "metadata": {}, @@ -202,6 +209,7 @@ "name": "stderr", "output_type": "stream", "text": [ + "WARNING:snowflake.snowpark:create_model_registry() is in private preview since 0.2.0. Do not use it in production. \n", "WARNING:absl:The database model_registry_zzhu already exists. Skipping creation.\n", "WARNING:absl:The schmea \"model_registry_zzhu\".\"PUBLIC\" already exists. Skipping creation.\n", "WARNING:absl:The registry table \"model_registry_zzhu\".\"PUBLIC\".\"MODELS\" already exists. 
Skipping creation.\n", @@ -225,6 +233,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5d6a85b3", "metadata": {}, @@ -238,6 +247,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "317e7843", "metadata": {}, @@ -246,6 +256,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3bdda91a", "metadata": {}, @@ -262,21 +273,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "9d8ad06e", "metadata": {}, "outputs": [], "source": [ "# A name and model tags can be added to the model at registration time.\n", "model_name = \"my_model\"\n", - "model_version = \"107.1\"\n", + "model_version = \"107.2\"\n", "model_id = registry.log_model(model_name=model_name, model_version=model_version, model=clf, tags={\n", " \"stage\": \"testing\", \"classifier_type\": \"svm.SVC\", \"svc_gamma\": svc_gamma, \"svc_C\": svc_C}, sample_input_data=train_features)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "id": "b463bad9", "metadata": {}, "outputs": [ @@ -284,17 +295,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Registered new model: 9a72b5dcf68e11ed89d2acde48001122\n" + "Registered new model id: 0d67062c04c411ee8bddacde48001122\n" ] } ], "source": [ "# The object API can be used to reference a model after creation.\n", - "model = model_registry.ModelReference(registry=registry, id=model_id, model_name=model_name, model_version=model_version)\n", - "print(\"Registered new model:\", model_id)" + "model = model_registry.ModelReference(registry=registry, model_name=model_name, model_version=model_version)\n", + "print(\"Registered new model id:\", model_id)" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "27d1158d", "metadata": {}, @@ -303,6 +315,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d6035ca5", "metadata": {}, @@ -312,10 +325,18 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "c2b0cdbd", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.set_metric() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.get_metrics() is in private preview since 0.2.0. Do not use it in production. 
\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -352,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "id": "45b81834", "metadata": {}, "outputs": [ @@ -365,7 +386,7 @@ " 'test_accuracy': 0.97}" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -377,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "id": "9a2627c5", "metadata": {}, "outputs": [ @@ -390,7 +411,7 @@ " 'test_accuracy': 0.97}" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -401,6 +422,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "98164cb7", "metadata": {}, @@ -409,6 +431,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "67eac368", "metadata": {}, @@ -418,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "dc82b541", "metadata": {}, "outputs": [ @@ -429,7 +452,7 @@ "-------------------------------------------------------------------------------------------\n", "|\"NAME\" |\"VERSION\" |\"TAGS\" |\"METRICS\" |\n", "-------------------------------------------------------------------------------------------\n", - "|my_model |107.1 |{ |{ |\n", + "|my_model |107.2 |{ |{ |\n", "| | | \"classifier_type\": \"svm.SVC\", | \"confusion_matrix\": [ |\n", "| | | \"stage\": \"testing\", | [ |\n", "| | | \"svc_C\": 10, | 90, |\n", @@ -476,6 +499,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "88707ecd", "metadata": {}, @@ -485,18 +509,65 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "f80f78da", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.get_tags() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.set_tag() is in private preview since 0.2.0. Do not use it in production. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Old tags: {'classifier_type': 'svm.SVC', 'stage': 'testing', 'svc_C': 10, 'svc_gamma': 0.001}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.remove_tag() is in private preview since 0.2.0. Do not use it in production. \n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Old tags: {'classifier_type': 'svm.SVC', 'stage': 'testing', 'svc_C': 10, 'svc_gamma': 0.001}\n", "Added tag: {'classifier_type': 'svm.SVC', 'minor_version': '23', 'stage': 'testing', 'svc_C': 10, 'svc_gamma': 0.001}\n", - "Removed tag {'classifier_type': 'svm.SVC', 'stage': 'testing', 'svc_C': 10, 'svc_gamma': 0.001}\n", - "Updated tag: {'classifier_type': 'svm.SVC', 'stage': 'production', 'svc_C': 10, 'svc_gamma': 0.001}\n", + "Removed tag {'classifier_type': 'svm.SVC', 'stage': 'testing', 'svc_C': 10, 'svc_gamma': 0.001}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.set_model_description() is in private preview since 0.2.0. Do not use it in production. 
\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated tag: {'classifier_type': 'svm.SVC', 'stage': 'production', 'svc_C': 10, 'svc_gamma': 0.001}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.get_model_description() is in private preview since 0.2.0. Do not use it in production. \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Added description: \"My model is better than talkgpt-5!\"\n" ] } @@ -518,6 +589,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "47e80e1e", "metadata": {}, @@ -527,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "7905d9c9", "metadata": {}, "outputs": [ @@ -560,6 +632,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "72ade02b", "metadata": {}, @@ -568,6 +641,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "645df90e", "metadata": {}, @@ -577,7 +651,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "eef6965d", "metadata": { "scrolled": true @@ -590,19 +664,19 @@ "--------------------------------------------------------------------------------------------------------------------------------\n", "|\"ID\" |\"NAME\" |\"VERSION\" |\"CREATION_TIME\" |\"TAGS\" |\n", "--------------------------------------------------------------------------------------------------------------------------------\n", - "|9a72b5dcf68e11ed89d2acde48001122 |my_model |107.1 |2023-05-19 14:46:27.717000-07:00 |{ |\n", + "|0d67062c04c411ee8bddacde48001122 |my_model |107.2 |2023-06-06 16:44:19.333000-07:00 |{ |\n", "| | | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | | \"stage\": \"production\", |\n", "| | | | | \"svc_C\": 10, |\n", "| | | | | \"svc_gamma\": 0.001 |\n", "| | | | |} |\n", - "|6e09ee48f5da11edaa85acde48001122 |my_model |107 |2023-05-18 17:17:22.291000-07:00 |{ |\n", + "|9a72b5dcf68e11ed89d2acde48001122 |my_model |107.1 |2023-05-19 14:46:27.717000-07:00 |{ |\n", "| | | | | \"classifier_type\": \"svm.SVC\", |\n", - "| | | | | \"stage\": \"testing\", |\n", + "| | | | | \"stage\": \"production\", |\n", "| | | | | \"svc_C\": 10, |\n", "| | | | | \"svc_gamma\": 0.001 |\n", "| | | | |} |\n", - "|45b51536f5d911eda1b8acde48001122 |my_model |106 |2023-05-18 17:08:25.500000-07:00 |{ |\n", + "|6e09ee48f5da11edaa85acde48001122 |my_model |107 |2023-05-18 17:17:22.291000-07:00 |{ |\n", "| | | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | | \"stage\": \"testing\", |\n", "| | | | | \"svc_C\": 10, |\n", @@ -618,6 +692,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b2a42a8f", "metadata": {}, @@ -626,6 +701,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "833bfd54", "metadata": {}, @@ -635,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 20, "id": "6df2eafc", "metadata": { "scrolled": false @@ -648,7 +724,7 @@ "------------------------------------------------------------------------------------------------------------------------------\n", "|\"ID\" |\"NAME\" |\"VERSION\" |\"TAGS\" |\"METRICS\" |\n", "------------------------------------------------------------------------------------------------------------------------------\n", - "|8aa8fac2f03911edb94aacde48001122 |my_model |103 |{ |{ |\n", + "|ae1a8938efc811edb049acde48001122 |my_model |1 |{ |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", | \"confusion_matrix\": [ |\n", "| | | | \"stage\": \"production\", | [ 
|\n", "| | | | \"svc_C\": 10, | 90, |\n", @@ -682,7 +758,7 @@ "| | | | | \"num_training_examples\": 10, |\n", "| | | | | \"test_accuracy\": 0.97 |\n", "| | | | |} |\n", - "|ae1a8938efc811edb049acde48001122 |my_model |1 |{ |{ |\n", + "|9a72b5dcf68e11ed89d2acde48001122 |my_model |107.1 |{ |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", | \"confusion_matrix\": [ |\n", "| | | | \"stage\": \"production\", | [ |\n", "| | | | \"svc_C\": 10, | 90, |\n", @@ -711,6 +787,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f444385f", "metadata": {}, @@ -719,6 +796,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2045ef23", "metadata": {}, @@ -727,6 +805,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fb221c55", "metadata": {}, @@ -736,12 +815,20 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 21, "id": "aed50394", "metadata": { "scrolled": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.get_model_history() is in private preview since 0.2.0. Do not use it in production. \n", + "WARNING:snowflake.snowpark:ModelRegistry.get_history() is in private preview since 0.2.0. Do not use it in production. \n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -749,39 +836,39 @@ "-----------------------------------------------------------------------------------------------------------------------------------\n", "|\"EVENT_TIMESTAMP\" |\"ROLE\" |\"ATTRIBUTE_NAME\" |\"VALUE[ATTRIBUTE_NAME]\" |\n", "-----------------------------------------------------------------------------------------------------------------------------------\n", - "|2023-05-19 14:46:29.334000-07:00 |\"ENG_ML_MODELING_RL\" |REGISTRATION |{ |\n", + "|2023-06-06 16:44:21.119000-07:00 |\"ENG_ML_MODELING_RL\" |REGISTRATION |{ |\n", "| | | | \"CREATION_ENVIRONMENT_SPEC\": { |\n", "| | | | \"python\": \"3.8.16\" |\n", "| | | | }, |\n", "| | | | \"CREATION_ROLE\": \"\\\"ENG_ML_MODELING_RL\\\"\", |\n", - "| | | | \"CREATION_TIME\": \"2023-05-19 14:46:29.334 -07... |\n", - "| | | | \"ID\": \"9a72b5dcf68e11ed89d2acde48001122\", |\n", + "| | | | \"CREATION_TIME\": \"2023-06-06 16:44:21.119 -07... |\n", + "| | | | \"ID\": \"0d67062c04c411ee8bddacde48001122\", |\n", "| | | | \"NAME\": \"my_model\", |\n", "| | | | \"TYPE\": \"snowflake_native\", |\n", "| | | | \"URI\": \"sfc:model_registry_zzhu.PUBLIC.SNOWML... 
|\n", - "| | | | \"VERSION\": \"107.1\" |\n", + "| | | | \"VERSION\": \"107.2\" |\n", "| | | |} |\n", - "|2023-05-19 14:46:30.462000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:44:22.556000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"stage\": \"testing\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:46:45.751000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:10.417000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:46:47.522000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:12.037000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"num_training_examples\": 10, |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:46:49.228000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:14.065000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"dataset_test\": { |\n", "| | | | \"accuracy\": 0.97 |\n", "| | | | }, |\n", "| | | | \"num_training_examples\": 10, |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:46:50.837000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:15.570000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"confusion_matrix\": [ |\n", "| | | | [ |\n", "| | | | 90, |\n", @@ -798,26 +885,26 @@ "| | | | \"num_training_examples\": 10, |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:47:04.772000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:45:40.634000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"minor_version\": \"23\", |\n", "| | | | \"stage\": \"testing\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:47:06.814000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:45:43.082000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"stage\": \"testing\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:47:08.923000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:45:45.092000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"stage\": \"production\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:47:10.910000-07:00 |\"ENG_ML_MODELING_RL\" |DESCRIPTION |\"My model is better than talkgpt-5!\" |\n", + "|2023-06-06 16:45:46.624000-07:00 |\"ENG_ML_MODELING_RL\" |DESCRIPTION |\"My model is better than talkgpt-5!\" |\n", "-----------------------------------------------------------------------------------------------------------------------------------\n", "\n" ] @@ -828,6 +915,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "9810d4f4", "metadata": {}, @@ -837,7 +925,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "id": "21f7e0b5", "metadata": { "scrolled": false @@ -850,39 +938,39 @@ "-----------------------------------------------------------------------------------------------------------------------------------\n", "|\"EVENT_TIMESTAMP\" |\"ROLE\" |\"ATTRIBUTE_NAME\" |\"VALUE[ATTRIBUTE_NAME]\" |\n", 
"-----------------------------------------------------------------------------------------------------------------------------------\n", - "|2023-05-19 14:46:29.334000-07:00 |\"ENG_ML_MODELING_RL\" |REGISTRATION |{ |\n", + "|2023-06-06 16:44:21.119000-07:00 |\"ENG_ML_MODELING_RL\" |REGISTRATION |{ |\n", "| | | | \"CREATION_ENVIRONMENT_SPEC\": { |\n", "| | | | \"python\": \"3.8.16\" |\n", "| | | | }, |\n", "| | | | \"CREATION_ROLE\": \"\\\"ENG_ML_MODELING_RL\\\"\", |\n", - "| | | | \"CREATION_TIME\": \"2023-05-19 14:46:29.334 -07... |\n", - "| | | | \"ID\": \"9a72b5dcf68e11ed89d2acde48001122\", |\n", + "| | | | \"CREATION_TIME\": \"2023-06-06 16:44:21.119 -07... |\n", + "| | | | \"ID\": \"0d67062c04c411ee8bddacde48001122\", |\n", "| | | | \"NAME\": \"my_model\", |\n", "| | | | \"TYPE\": \"snowflake_native\", |\n", "| | | | \"URI\": \"sfc:model_registry_zzhu.PUBLIC.SNOWML... |\n", - "| | | | \"VERSION\": \"107.1\" |\n", + "| | | | \"VERSION\": \"107.2\" |\n", "| | | |} |\n", - "|2023-05-19 14:46:30.462000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:44:22.556000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"stage\": \"testing\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:46:45.751000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:10.417000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:46:47.522000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:12.037000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"num_training_examples\": 10, |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:46:49.228000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:14.065000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"dataset_test\": { |\n", "| | | | \"accuracy\": 0.97 |\n", "| | | | }, |\n", "| | | | \"num_training_examples\": 10, |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:46:50.837000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", + "|2023-06-06 16:45:15.570000-07:00 |\"ENG_ML_MODELING_RL\" |METRICS |{ |\n", "| | | | \"confusion_matrix\": [ |\n", "| | | | [ |\n", "| | | | 90, |\n", @@ -899,26 +987,26 @@ "| | | | \"num_training_examples\": 10, |\n", "| | | | \"test_accuracy\": 0.97 |\n", "| | | |} |\n", - "|2023-05-19 14:47:04.772000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:45:40.634000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"minor_version\": \"23\", |\n", "| | | | \"stage\": \"testing\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:47:06.814000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:45:43.082000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"stage\": \"testing\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:47:08.923000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", + "|2023-06-06 16:45:45.092000-07:00 |\"ENG_ML_MODELING_RL\" |TAGS |{ |\n", "| | | | \"classifier_type\": \"svm.SVC\", |\n", "| | | | \"stage\": \"production\", |\n", "| | | | \"svc_C\": 10, |\n", "| | | | \"svc_gamma\": 0.001 |\n", "| | | |} |\n", - "|2023-05-19 14:47:10.910000-07:00 
|\"ENG_ML_MODELING_RL\" |DESCRIPTION |\"My model is better than talkgpt-5!\" |\n", + "|2023-06-06 16:45:46.624000-07:00 |\"ENG_ML_MODELING_RL\" |DESCRIPTION |\"My model is better than talkgpt-5!\" |\n", "-----------------------------------------------------------------------------------------------------------------------------------\n", "\n" ] @@ -929,6 +1017,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "735f0ac3", "metadata": {}, @@ -937,6 +1026,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0a43d2b6", "metadata": {}, @@ -945,6 +1035,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5e065521", "metadata": {}, @@ -954,10 +1045,17 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "cc0512e1", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:snowflake.snowpark:ModelRegistry.load_model() is in private preview since 0.2.0. Do not use it in production. \n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -979,6 +1077,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4fbc0793", "metadata": {}, @@ -988,7 +1087,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "2796f2e0", "metadata": {}, "outputs": [ diff --git a/snowflake/ml/requirements.bzl b/snowflake/ml/requirements.bzl new file mode 100755 index 00000000..bd77ddda --- /dev/null +++ b/snowflake/ml/requirements.bzl @@ -0,0 +1,6 @@ +# DO NOT EDIT! +# Generate by running 'bazel run //bazel/requirements:sync_requirements' + +EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'all': ['lightgbm==3.3.5', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1']} + +REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<2', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]', 'snowflake-snowpark-python>=1.4.0,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2'] diff --git a/snowflake/ml/sklearn/preprocessing/BUILD.bazel b/snowflake/ml/sklearn/preprocessing/BUILD.bazel deleted file mode 100644 index c407482b..00000000 --- a/snowflake/ml/sklearn/preprocessing/BUILD.bazel +++ /dev/null @@ -1,171 +0,0 @@ -load("//bazel:py_rules.bzl", "py_library") -load("@rules_python//python:packaging.bzl", "py_package") - -package(default_visibility = ["//visibility:public"]) - -py_library( - name = "init", - srcs = [ - "__init__.py", - ], - deps = [ - "//snowflake/ml/_internal:init_utils" - ], -) - -py_library( - name = "binarizer", - srcs = [ - "binarizer.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "k_bins_discretizer", - srcs = [ - "k_bins_discretizer.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "label_encoder", - srcs = [ - "label_encoder.py", - ], - deps = [ - ":init", - ":ordinal_encoder", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "max_abs_scaler", - srcs = [ - "max_abs_scaler.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "min_max_scaler", - srcs = 
[ - "min_max_scaler.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "normalizer", - srcs = [ - "normalizer.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "one_hot_encoder", - srcs = [ - "one_hot_encoder.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "ordinal_encoder", - srcs = [ - "ordinal_encoder.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/_internal:type_utils", - "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "robust_scaler", - srcs = [ - "robust_scaler.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "simple_imputer", - srcs = [ - "simple_imputer.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - -py_library( - name = "standard_scaler", - srcs = [ - "standard_scaler.py", - ], - deps = [ - ":init", - "//snowflake/ml/_internal:telemetry", - "//snowflake/ml/sklearn/framework", - ], -) - - -py_package( - name = "preprocessing_pkg", - packages = ["snowflake.ml"], - deps = [ - ":binarizer", - ":k_bins_discretizer", - ":label_encoder", - ":max_abs_scaler", - ":min_max_scaler", - ":normalizer", - ":one_hot_encoder", - ":ordinal_encoder", - ":robust_scaler", - ":simple_imputer", - ":standard_scaler", - ], -) diff --git a/snowflake/ml/version.bzl b/snowflake/ml/version.bzl index 8c61027c..bf8a1125 100644 --- a/snowflake/ml/version.bzl +++ b/snowflake/ml/version.bzl @@ -1,2 +1,2 @@ # This is parsed by regex in conda reciper meta file. Make sure not to break it. 
-VERSION = "0.3.3" +VERSION = "1.0.0" diff --git a/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel b/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel index a87b01ff..b6108a9e 100644 --- a/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel +++ b/tests/integ/snowflake/ml/_internal/utils/BUILD.bazel @@ -8,6 +8,6 @@ py_test( deps = [ "//snowflake/ml/_internal/utils:parallelize", "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", + "//tests/integ/snowflake/ml/modeling/framework:utils", ], ) diff --git a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel index ead9b4d9..5b5c7ed1 100644 --- a/tests/integ/snowflake/ml/extra_tests/BUILD.bazel +++ b/tests/integ/snowflake/ml/extra_tests/BUILD.bazel @@ -6,7 +6,7 @@ py_test( name = "test_column_name_inference", srcs = ["test_column_name_inference.py"], deps = [ - "//snowflake/ml/sklearn/linear_model:linear_regression", + "//snowflake/ml/modeling/linear_model:linear_regression", "//snowflake/ml/utils:connection_params", ], ) @@ -15,8 +15,8 @@ py_test( name = "test_grid_search", srcs = ["test_grid_search.py"], deps = [ - "//snowflake/ml/sklearn/model_selection:grid_search_cv", - "//snowflake/ml/sklearn/svm:svr", + "//snowflake/ml/modeling/model_selection:grid_search_cv", + "//snowflake/ml/modeling/svm:svr", "//snowflake/ml/utils:connection_params", ], ) @@ -25,9 +25,9 @@ py_test( name = "test_voting_regressor", srcs = ["test_voting_regressor.py"], deps = [ - "//snowflake/ml/sklearn/ensemble:voting_regressor", - "//snowflake/ml/sklearn/linear_model:linear_regression", - "//snowflake/ml/sklearn/linear_model:sgd_regressor", + "//snowflake/ml/modeling/ensemble:voting_regressor", + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/modeling/linear_model:sgd_regressor", "//snowflake/ml/utils:connection_params", ], ) @@ -36,13 +36,13 @@ py_test( name="test_grid_search_on_pipeline", srcs = ["test_grid_search_on_pipeline.py"], deps = [ - "//snowflake/ml/sklearn/linear_model:logistic_regression", - "//snowflake/ml/sklearn/model_selection:grid_search_cv", - "//snowflake/ml/sklearn/compose:column_transformer", - "//snowflake/ml/sklearn/framework:framework", - "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", - "//snowflake/ml/sklearn/preprocessing:min_max_scaler", - "//snowflake/ml/sklearn/preprocessing:label_encoder", + "//snowflake/ml/modeling/linear_model:logistic_regression", + "//snowflake/ml/modeling/model_selection:grid_search_cv", + "//snowflake/ml/modeling/compose:column_transformer", + "//snowflake/ml/modeling/pipeline:pipeline", + "//snowflake/ml/modeling/preprocessing:one_hot_encoder", + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/preprocessing:label_encoder", "//snowflake/ml/utils:connection_params", ] ) @@ -51,8 +51,8 @@ py_test( name="test_iterative_imputer", srcs = ["test_iterative_imputer.py"], deps = [ - "//snowflake/ml/sklearn/linear_model:linear_regression", - "//snowflake/ml/sklearn/impute:iterative_imputer", + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/modeling/impute:iterative_imputer", "//snowflake/ml/utils:connection_params", ] ) @@ -61,10 +61,12 @@ py_test( name="test_pipeline_with_ohe_and_xgbr", srcs = ["test_pipeline_with_ohe_and_xgbr.py"], deps = [ - "//snowflake/ml/xgboost:xgb_regressor", - "//snowflake/ml/sklearn/framework:framework", - "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", - 
"//snowflake/ml/sklearn/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/pipeline:pipeline", + "//snowflake/ml/modeling/preprocessing:one_hot_encoder", + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/xgboost:xgb_regressor", + "//snowflake/ml/modeling/framework:framework", + "//snowflake/ml/modeling/preprocessing:standard_scaler", "//snowflake/ml/utils:connection_params", ] ) @@ -73,8 +75,8 @@ py_test( name="test_randomized_search", srcs = ["test_randomized_search.py"], deps = [ - "//snowflake/ml/sklearn/model_selection:randomized_search_cv", - "//snowflake/ml/sklearn/ensemble:random_forest_classifier", + "//snowflake/ml/modeling/model_selection:randomized_search_cv", + "//snowflake/ml/modeling/ensemble:random_forest_classifier", "//snowflake/ml/utils:connection_params", ] ) diff --git a/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py b/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py index 88f5bac0..65fefdee 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py +++ b/tests/integ/snowflake/ml/extra_tests/test_column_name_inference.py @@ -8,7 +8,7 @@ from sklearn.datasets import load_diabetes from sklearn.linear_model import LinearRegression as SkLinearRegression -from snowflake.ml.sklearn.linear_model import LinearRegression +from snowflake.ml.modeling.linear_model import LinearRegression from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_grid_search.py b/tests/integ/snowflake/ml/extra_tests/test_grid_search.py index c4efdd95..cbc820c3 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_grid_search.py +++ b/tests/integ/snowflake/ml/extra_tests/test_grid_search.py @@ -9,8 +9,8 @@ from sklearn.model_selection import GridSearchCV as SkGridSearchCV from sklearn.svm import SVR as SkSVR -from snowflake.ml.sklearn.model_selection import GridSearchCV -from snowflake.ml.sklearn.svm import SVR +from snowflake.ml.modeling.model_selection import GridSearchCV +from snowflake.ml.modeling.svm import SVR from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py b/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py index 98dc201d..ec8d6054 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py +++ b/tests/integ/snowflake/ml/extra_tests/test_grid_search_on_pipeline.py @@ -2,12 +2,12 @@ # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# from absl.testing.absltest import TestCase, main -from snowflake.ml.sklearn.linear_model.logistic_regression import LogisticRegression +from snowflake.ml.modeling.linear_model.logistic_regression import LogisticRegression -from snowflake.ml.sklearn.compose import ColumnTransformer -from snowflake.ml.sklearn.framework.pipeline import Pipeline -from snowflake.ml.sklearn.model_selection import GridSearchCV -from snowflake.ml.sklearn.preprocessing import MinMaxScaler, OneHotEncoder +from snowflake.ml.modeling.compose import ColumnTransformer +from snowflake.ml.modeling.model_selection import GridSearchCV +from snowflake.ml.modeling.pipeline import Pipeline +from snowflake.ml.modeling.preprocessing import MinMaxScaler, OneHotEncoder from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Column, Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py b/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py index fe8a2e8d..782697ca 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py +++ b/tests/integ/snowflake/ml/extra_tests/test_iterative_imputer.py @@ -13,8 +13,8 @@ from sklearn.impute import IterativeImputer as SkIterativeImputer from sklearn.linear_model import LinearRegression as SkLinearRegression -from snowflake.ml.sklearn.impute import IterativeImputer -from snowflake.ml.sklearn.linear_model import LinearRegression +from snowflake.ml.modeling.impute import IterativeImputer +from snowflake.ml.modeling.linear_model import LinearRegression from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py b/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py index 2641c9c9..fde30aeb 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py +++ b/tests/integ/snowflake/ml/extra_tests/test_pipeline_with_ohe_and_xgbr.py @@ -1,13 +1,18 @@ # # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
# +import numpy as np from absl.testing import absltest -from snowflake.ml.sklearn.framework.pipeline import Pipeline -from snowflake.ml.sklearn.preprocessing import MinMaxScaler, OneHotEncoder +from snowflake.ml.modeling.pipeline import Pipeline +from snowflake.ml.modeling.preprocessing import ( + MinMaxScaler, + OneHotEncoder, + StandardScaler, +) +from snowflake.ml.modeling.xgboost import XGBRegressor from snowflake.ml.utils.connection_params import SnowflakeLoginOptions -from snowflake.ml.xgboost import XGBRegressor -from snowflake.snowpark import Column, Session +from snowflake.snowpark import Column, Session, functions as F categorical_columns = [ "AGE", @@ -106,6 +111,86 @@ def test_fit_and_compare_results_pandas_dataframe(self) -> None: pipeline.fit(raw_data_pandas) pipeline.predict(raw_data_pandas) + def test_pipeline_export(self) -> None: + snow_df = ( + self._session.sql( + """SELECT *, IFF(Y = 'yes', 1.0, 0.0) as LABEL + FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS + LIMIT 2000""" + ) + .drop("Y") + .withColumn("ROW_INDEX", F.monotonically_increasing_id()) + ) + pd_df = snow_df.to_pandas().sort_values(by=["ROW_INDEX"]).drop("LABEL", axis=1) + + pipeline = Pipeline( + steps=[ + ( + "OHE", + OneHotEncoder( + input_cols=categorical_columns, output_cols=categorical_columns, drop_input_cols=True + ), + ), + ( + "MMS", + MinMaxScaler( + clip=True, + input_cols=numerical_columns, + output_cols=numerical_columns, + ), + ), + ( + "SS", + StandardScaler(input_cols=(numerical_columns[0:2]), output_cols=(numerical_columns[0:2])), + ), + ("regression", XGBRegressor(label_cols=label_column)), + ] + ) + + pipeline.fit(snow_df) + snow_results = pipeline.predict(snow_df).to_pandas().sort_values(by=["ROW_INDEX"])["OUTPUT_LABEL"].to_numpy() + + sk_pipeline = pipeline.to_sklearn() + sk_results = sk_pipeline.predict(pd_df) + np.testing.assert_allclose(snow_results.flatten(), sk_results.flatten(), rtol=1.0e-1, atol=1.0e-2) + + def test_pipeline_with_limitted_number_of_columns_in_estimator_export(self) -> None: + snow_df = ( + self._session.sql( + """SELECT *, IFF(Y = 'yes', 1.0, 0.0) as LABEL + FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS + LIMIT 2000""" + ) + .drop("Y", "DEFAULT") + .withColumn("ROW_INDEX", F.monotonically_increasing_id()) + ) + pd_df = snow_df.to_pandas().sort_values(by=["ROW_INDEX"]).drop("LABEL", axis=1) + + pipeline = Pipeline( + steps=[ + ( + "MMS", + MinMaxScaler( + clip=True, + input_cols=numerical_columns, + output_cols=numerical_columns, + ), + ), + ( + "SS", + StandardScaler(input_cols=(numerical_columns[0:2]), output_cols=(numerical_columns[0:2])), + ), + ("regression", XGBRegressor(input_cols=numerical_columns, label_cols=label_column)), + ] + ) + + pipeline.fit(snow_df) + snow_results = pipeline.predict(snow_df).to_pandas().sort_values(by=["ROW_INDEX"])["OUTPUT_LABEL"].to_numpy() + + sk_pipeline = pipeline.to_sklearn() + sk_results = sk_pipeline.predict(pd_df) + np.testing.assert_allclose(snow_results.flatten(), sk_results.flatten(), rtol=1.0e-1, atol=1.0e-2) + if __name__ == "__main__": absltest.main() diff --git a/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py b/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py index 9daa4ed4..4ae783a9 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py +++ b/tests/integ/snowflake/ml/extra_tests/test_randomized_search.py @@ -10,8 +10,8 @@ from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier from sklearn.model_selection import 
RandomizedSearchCV as SkRandomizedSearchCV -from snowflake.ml.sklearn.ensemble import RandomForestClassifier -from snowflake.ml.sklearn.model_selection import RandomizedSearchCV +from snowflake.ml.modeling.ensemble import RandomForestClassifier +from snowflake.ml.modeling.model_selection import RandomizedSearchCV from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py b/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py index 273c9fb6..36f76fe3 100644 --- a/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py +++ b/tests/integ/snowflake/ml/extra_tests/test_voting_regressor.py @@ -12,8 +12,8 @@ SGDRegressor as SkSGDRegressor, ) -from snowflake.ml.sklearn.ensemble import VotingRegressor -from snowflake.ml.sklearn.linear_model import LinearRegression, SGDRegressor +from snowflake.ml.modeling.ensemble import VotingRegressor +from snowflake.ml.modeling.linear_model import LinearRegression, SGDRegressor from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session diff --git a/tests/integ/snowflake/ml/metrics/BUILD.bazel b/tests/integ/snowflake/ml/metrics/BUILD.bazel deleted file mode 100644 index 522d90e5..00000000 --- a/tests/integ/snowflake/ml/metrics/BUILD.bazel +++ /dev/null @@ -1,42 +0,0 @@ -load("//bazel:py_rules.bzl", "py_test") - -package(default_visibility = ["//visibility:public"]) - -py_test( - name = "test_r2_score", - srcs = ["test_r2_score.py"], - deps = [ - "//snowflake/ml/metrics", - "//snowflake/ml/utils:connection_params", - ], -) - -py_test( - name = "test_accuracy_score", - srcs = ["test_accuracy_score.py"], - deps = [ - "//snowflake/ml/metrics", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_correlation", - timeout = "long", - srcs = ["test_correlation.py"], - deps = [ - "//snowflake/ml/metrics", - "//snowflake/ml/utils:connection_params", - ], -) - -py_test( - name = "test_covariance", - timeout = "long", - srcs = ["test_covariance.py"], - deps = [ - "//snowflake/ml/metrics", - "//snowflake/ml/utils:connection_params", - ], -) diff --git a/tests/integ/snowflake/ml/metrics/test_accuracy_score.py b/tests/integ/snowflake/ml/metrics/test_accuracy_score.py deleted file mode 100644 index dbe058bf..00000000 --- a/tests/integ/snowflake/ml/metrics/test_accuracy_score.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
-# -import numpy as np -from absl.testing.absltest import TestCase, main -from sklearn import metrics as sklearn_metrics - -from snowflake import snowpark -from snowflake.ml import metrics as snowml_metrics -from snowflake.ml.utils import connection_params -from tests.integ.snowflake.ml.sklearn.framework import utils - -_DATA, _SCHEMA = utils.gen_fuzz_data( - rows=100, - types=[utils.DataType.INTEGER, utils.DataType.INTEGER, utils.DataType.FLOAT], - low=0, - high=20, -) - - -class AccuracyScoreTest(TestCase): - """Test accuracy score.""" - - def setUp(self) -> None: - """Creates Snowpark and Snowflake environments for testing.""" - self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() - - def tearDown(self) -> None: - self._session.close() - - def test_accuracy_score(self) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() - - score = snowml_metrics.accuracy_score( - df=input_df, y_true_col_name=_SCHEMA[1], y_pred_col_name=_SCHEMA[2], normalize=False - ) - score_sklearn = sklearn_metrics.accuracy_score(pandas_df[_SCHEMA[1]], pandas_df[_SCHEMA[2]], normalize=False) - np.testing.assert_allclose(score, score_sklearn) - - def test_accuracy_score_sample_weight(self) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() - - score = snowml_metrics.accuracy_score( - df=input_df, - y_true_col_name=_SCHEMA[1], - y_pred_col_name=_SCHEMA[2], - sample_weight_col_name=_SCHEMA[3], - normalize=False, - ) - score_sklearn = sklearn_metrics.accuracy_score( - pandas_df[_SCHEMA[1]], - pandas_df[_SCHEMA[2]], - sample_weight=pandas_df[_SCHEMA[3]].to_numpy(), - normalize=False, - ) - np.testing.assert_allclose(score, score_sklearn) - - def test_accuracy_score_normalized(self) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() - - score = snowml_metrics.accuracy_score( - df=input_df, y_true_col_name=_SCHEMA[1], y_pred_col_name=_SCHEMA[2], normalize=True - ) - score_sklearn = sklearn_metrics.accuracy_score(pandas_df[_SCHEMA[1]], pandas_df[_SCHEMA[2]], normalize=True) - np.testing.assert_allclose(score, score_sklearn) - - def test_accuracy_score_sample_weight_normalized(self) -> None: - input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) - pandas_df = input_df.to_pandas() - - score = snowml_metrics.accuracy_score( - df=input_df, - y_true_col_name=_SCHEMA[1], - y_pred_col_name=_SCHEMA[2], - sample_weight_col_name=_SCHEMA[3], - normalize=True, - ) - score_sklearn = sklearn_metrics.accuracy_score( - pandas_df[_SCHEMA[1]], pandas_df[_SCHEMA[2]], sample_weight=pandas_df[_SCHEMA[3]].to_numpy(), normalize=True - ) - np.testing.assert_allclose(score, score_sklearn) - - -if __name__ == "__main__": - main() diff --git a/tests/integ/snowflake/ml/model/BUILD.bazel b/tests/integ/snowflake/ml/model/BUILD.bazel index 6f833ed9..31d444ec 100644 --- a/tests/integ/snowflake/ml/model/BUILD.bazel +++ b/tests/integ/snowflake/ml/model/BUILD.bazel @@ -4,17 +4,16 @@ py_test( name = "model_integ_test", timeout = "long", srcs = ["model_integ_test.py"], - data = [ - "//snowflake/ml:wheel", - ], - shard_count = 6, + shard_count = 8, tags = ["skip_merge_gates"], deps = [ "//snowflake/ml/model:_deployer", "//snowflake/ml/model:_model", "//snowflake/ml/model:custom_model", "//snowflake/ml/model:type_hints", - "//snowflake/ml/sklearn/linear_model:linear_regression", + 
"//snowflake/ml/modeling/lightgbm:lgbm_regressor", + "//snowflake/ml/modeling/linear_model:logistic_regression", + "//snowflake/ml/modeling/xgboost:xgb_regressor", "//snowflake/ml/utils:connection_params", ], ) diff --git a/tests/integ/snowflake/ml/model/model_integ_test.py b/tests/integ/snowflake/ml/model/model_integ_test.py index dfeb652c..859a5396 100644 --- a/tests/integ/snowflake/ml/model/model_integ_test.py +++ b/tests/integ/snowflake/ml/model/model_integ_test.py @@ -14,7 +14,6 @@ import xgboost from absl import flags from absl.testing import absltest -from packaging import utils as packaging_utils from sklearn import datasets, ensemble, linear_model, model_selection, multioutput from snowflake.ml.model import ( @@ -23,7 +22,9 @@ custom_model, type_hints as model_types, ) -from snowflake.ml.sklearn.linear_model import LinearRegression +from snowflake.ml.modeling.lightgbm import LGBMRegressor +from snowflake.ml.modeling.linear_model import LogisticRegression +from snowflake.ml.modeling.xgboost import XGBRegressor from snowflake.ml.utils import connection_params from snowflake.snowpark import Session @@ -80,44 +81,6 @@ def predict(self, input: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame({"output": (input["c1"] + self.bias) > 12}) -def _upload_snowml_to_tmp_stage( - session: Session, -) -> str: - """Upload model module of snowml to tmp stage. - - Args: - session: Snowpark session. - - Returns: - The stage path to uploaded snowml.zip file. - """ - root_paths = [ - os.path.join(absltest.TEST_SRCDIR.value, "SnowML", "snowflake", "ml"), # Test using bazel - os.path.join(absltest.TEST_SRCDIR.value, "bazel-bin", "snowflake", "ml"), # Test using pytest - os.path.join(absltest.TEST_SRCDIR.value), # Test in Jenkins Wheel build and test pipeline. - ] - whl_filename = None - for root_path in root_paths: - if not os.path.exists(root_path): - continue - for filename in os.listdir(root_path): - if os.path.splitext(filename)[-1] == ".whl": - try: - packaging_utils.parse_wheel_filename(filename=filename) - whl_filename = filename - break - except packaging_utils.InvalidWheelFilename: - continue - if whl_filename: - break - if whl_filename is None: - raise RuntimeError("Cannot file wheel file. 
Have it been built?") - whl_path = os.path.join(root_path, whl_filename) - tmp_stage = session.get_session_stage() - _ = session.file.put(whl_path, tmp_stage, auto_compress=False, overwrite=True) - return f"{tmp_stage}/{whl_filename}" - - def _create_stage(session: Session, stage_qual_name: str) -> None: sql = f"CREATE STAGE {stage_qual_name}" session.sql(sql).collect() @@ -140,7 +103,6 @@ def setUpClass(self) -> None: self._session = Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() # To create different UDF names among different runs self.run_id = str(uuid4()).replace("-", "_") - self._snowml_wheel_path = _upload_snowml_to_tmp_stage(self._session) db = self._session.get_current_database() schema = self._session.get_current_schema() @@ -178,9 +140,7 @@ async def _test(self: "TestModelInteg") -> None: model_dir_path=os.path.join(tmpdir, "async_model_composition"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None @@ -217,9 +177,7 @@ def test_bad_model_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_bad_model"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": False, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": False, "_use_local_snowml": True}), ) with self.assertRaises(ValueError): @@ -244,9 +202,7 @@ def test_custom_demo_model_sp(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_sp0"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], sp_df) @@ -276,9 +232,7 @@ def test_custom_demo_model_sp_quote(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_sp_good"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df) @@ -308,9 +262,7 @@ def test_custom_demo_model_sp_mix_1(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_sp1"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], sp_df) @@ -340,9 +292,7 @@ def test_custom_demo_model_sp_mix_2(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_sp2"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + 
options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df) @@ -374,7 +324,7 @@ def test_custom_demo_model(self) -> None: options=model_types.WarehouseDeployOptions( { "relax_version": True, - "_snowml_wheel_path": self._snowml_wheel_path, + "_use_local_snowml": True, "permanent_udf_stage_location": f"@{self.stage_qual_name}/", } ), @@ -399,7 +349,7 @@ def test_custom_demo_model(self) -> None: options=model_types.WarehouseDeployOptions( { "relax_version": True, - "_snowml_wheel_path": self._snowml_wheel_path, + "_use_local_snowml": True, "permanent_udf_stage_location": f"@{self.stage_qual_name}/", } ), @@ -426,9 +376,7 @@ def test_custom_demo_model_array(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_array"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df) @@ -458,9 +406,7 @@ def test_custom_demo_model_str(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_str"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df) @@ -490,9 +436,7 @@ def test_custom_demo_model_array_sp(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_array_sp"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], sp_df) @@ -523,9 +467,7 @@ def test_custom_demo_model_str_sp(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_str_sp"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], sp_df) @@ -555,9 +497,7 @@ def test_custom_demo_model_array_str(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_demo_model_array_str"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df) @@ -589,7 +529,7 @@ def test_custom_demo_model_with_input_no_keep_order(self) -> None: options=model_types.WarehouseDeployOptions( { "relax_version": True, - "_snowml_wheel_path": self._snowml_wheel_path, + "_use_local_snowml": True, "output_with_input_features": True, 
"keep_order": False, } @@ -621,7 +561,7 @@ def test_custom_demo_model_with_input(self) -> None: options=model_types.WarehouseDeployOptions( { "relax_version": True, - "_snowml_wheel_path": self._snowml_wheel_path, + "_use_local_snowml": True, "output_with_input_features": True, } ), @@ -661,9 +601,7 @@ def test_custom_model_with_artifacts(self) -> None: model_dir_path=os.path.join(tmpdir, "custom_model_with_artifacts"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df[["c3", "c1", "c2"]]) @@ -703,9 +641,7 @@ def test_custom_demo_model_in_stage(self) -> None: model_stage_file_path=model_path_in_stage, platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df) @@ -753,9 +689,7 @@ def test_custom_model_with_artifacts_in_stage(self) -> None: model_stage_file_path=model_path_in_stage, platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert deploy_info is not None res = deployer.predict(deploy_info["name"], pd_df[["c3", "c1", "c2"]]) @@ -767,8 +701,9 @@ def test_custom_model_with_artifacts_in_stage(self) -> None: def test_skl_model_deploy(self) -> None: iris_X, iris_y = datasets.load_iris(return_X_y=True) - regr = linear_model.LinearRegression() - regr.fit(iris_X[:10], iris_y[:10]) + # LogisticRegression is for classfication task, such as iris + regr = linear_model.LogisticRegression() + regr.fit(iris_X, iris_y) with tempfile.TemporaryDirectory() as tmpdir: model_api.save_model( name="skl_model", @@ -783,9 +718,7 @@ def test_skl_model_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "skl_model"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di is not None @@ -811,9 +744,7 @@ def test_skl_model_proba_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "skl_model_proba"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di_predict is not None res = dc.predict(di_predict["name"], iris_X[:10]) @@ -824,9 +755,7 @@ def test_skl_model_proba_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "skl_model_proba"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict_proba", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + 
options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di_predict_proba is not None res = dc.predict(di_predict_proba["name"], iris_X[:10]) @@ -853,9 +782,7 @@ def test_skl_multiple_output_model_proba_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "skl_multiple_output_model_proba"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di_predict is not None res = dc.predict(di_predict["name"], iris_X[-10:]) @@ -866,9 +793,7 @@ def test_skl_multiple_output_model_proba_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "skl_multiple_output_model_proba"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict_proba", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di_predict_proba is not None res = dc.predict(di_predict_proba["name"], iris_X[-10:]) @@ -895,9 +820,7 @@ def test_xgb(self) -> None: model_dir_path=os.path.join(tmpdir, "xgb_model"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di_predict is not None res = dc.predict(di_predict["name"], cal_X_test) @@ -925,9 +848,7 @@ def test_xgb_sp(self) -> None: model_dir_path=os.path.join(tmpdir, "xgb_model"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di_predict is not None cal_data_sp_df_test_X = cal_data_sp_df_test.drop('"target"') @@ -936,24 +857,23 @@ def test_xgb_sp(self) -> None: res.to_pandas().values, np.expand_dims(regressor.predict(cal_data_sp_df_test_X.to_pandas()), axis=1) ) - def test_snowml_model_deploy(self) -> None: - iris = datasets.load_iris() - df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=iris["feature_names"] + ["target"]) - df.columns = [s.replace(" (CM)", "").replace(" ", "") for s in df.columns.str.upper()] + def test_snowml_model_deploy_snowml_sklearn(self) -> None: + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] LABEL_COLUMNS = "TARGET" OUTPUT_COLUMNS = "PREDICTED_TARGET" - regr = LinearRegression(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) - test_features = df[:10] + regr = LogisticRegression(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + test_features = iris_X regr.fit(test_features) + # no sample input because snowml can infer the model signature itself with tempfile.TemporaryDirectory() as tmpdir: model_api.save_model( name="snowml_model", model_dir_path=os.path.join(tmpdir, "snowml_model"), model=regr, - sample_input=test_features, 
metadata={"author": "xjiang", "version": "1"}, ) dc = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) @@ -962,9 +882,71 @@ def test_snowml_model_deploy(self) -> None: model_dir_path=os.path.join(tmpdir, "snowml_model"), platform=_deployer.TargetPlatform.WAREHOUSE, target_method="predict", - options=model_types.WarehouseDeployOptions( - {"relax_version": True, "_snowml_wheel_path": self._snowml_wheel_path} - ), + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), + ) + + assert di is not None + res = dc.predict(di["name"], test_features) + np.testing.assert_allclose(res[OUTPUT_COLUMNS].values, regr.predict(test_features)[OUTPUT_COLUMNS].values) + + def test_snowml_model_deploy_xgboost(self) -> None: + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = XGBRegressor(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + test_features = iris_X[:10] + regr.fit(test_features) + + # no sample input because snowml can infer the model signature itself + with tempfile.TemporaryDirectory() as tmpdir: + model_api.save_model( + name="snowml_model", + model_dir_path=os.path.join(tmpdir, "snowml_model"), + model=regr, + metadata={"author": "xjiang", "version": "1"}, + ) + dc = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + di = dc.create_deployment( + name=f"snowml_model{self.run_id}", + model_dir_path=os.path.join(tmpdir, "snowml_model"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), + ) + + assert di is not None + res = dc.predict(di["name"], test_features) + np.testing.assert_allclose(res[OUTPUT_COLUMNS].values, regr.predict(test_features)[OUTPUT_COLUMNS].values) + + def test_snowml_model_deploy_lightgbm(self) -> None: + iris_X = datasets.load_iris(as_frame=True).frame + iris_X.columns = [s.replace(" (CM)", "").replace(" ", "") for s in iris_X.columns.str.upper()] + + INPUT_COLUMNS = ["SEPALLENGTH", "SEPALWIDTH", "PETALLENGTH", "PETALWIDTH"] + LABEL_COLUMNS = "TARGET" + OUTPUT_COLUMNS = "PREDICTED_TARGET" + regr = LGBMRegressor(input_cols=INPUT_COLUMNS, output_cols=OUTPUT_COLUMNS, label_cols=LABEL_COLUMNS) + test_features = iris_X[:10] + regr.fit(test_features) + + # no sample input because snowml can infer the model signature itself + with tempfile.TemporaryDirectory() as tmpdir: + model_api.save_model( + name="snowml_model", + model_dir_path=os.path.join(tmpdir, "snowml_model"), + model=regr, + metadata={"author": "xjiang", "version": "1"}, + ) + dc = _deployer.Deployer(self._session, _deployer.LocalDeploymentManager()) + di = dc.create_deployment( + name=f"snowml_model{self.run_id}", + model_dir_path=os.path.join(tmpdir, "snowml_model"), + platform=_deployer.TargetPlatform.WAREHOUSE, + target_method="predict", + options=model_types.WarehouseDeployOptions({"relax_version": True, "_use_local_snowml": True}), ) assert di is not None diff --git a/tests/integ/snowflake/ml/sklearn/calibration/BUILD.bazel b/tests/integ/snowflake/ml/modeling/calibration/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/calibration/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/calibration/BUILD.bazel 
index 91271f7a..39b05975 100644 --- a/tests/integ/snowflake/ml/sklearn/calibration/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/calibration/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/calibration:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/calibration:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.calibration", - module_root_dir = "snowflake/ml/sklearn/calibration", + module_root_dir = "snowflake/ml/modeling/calibration", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/cluster/BUILD.bazel b/tests/integ/snowflake/ml/modeling/cluster/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/cluster/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/cluster/BUILD.bazel index 247afc5a..adbfa784 100644 --- a/tests/integ/snowflake/ml/sklearn/cluster/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/cluster/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/cluster:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/cluster:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.cluster", - module_root_dir = "snowflake/ml/sklearn/cluster", + module_root_dir = "snowflake/ml/modeling/cluster", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/compose/BUILD.bazel b/tests/integ/snowflake/ml/modeling/compose/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/compose/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/compose/BUILD.bazel index 34f9c526..303c831a 100644 --- a/tests/integ/snowflake/ml/sklearn/compose/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/compose/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/compose:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/compose:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.compose", - module_root_dir = "snowflake/ml/sklearn/compose", + module_root_dir = "snowflake/ml/modeling/compose", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/covariance/BUILD.bazel b/tests/integ/snowflake/ml/modeling/covariance/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/covariance/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/covariance/BUILD.bazel index e03c5cab..5dba8854 100644 --- a/tests/integ/snowflake/ml/sklearn/covariance/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/covariance/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/covariance:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/covariance:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.covariance", - module_root_dir = "snowflake/ml/sklearn/covariance", + module_root_dir = "snowflake/ml/modeling/covariance", estimator_info_list=estimator_info_list ) diff --git 
a/tests/integ/snowflake/ml/sklearn/decomposition/BUILD.bazel b/tests/integ/snowflake/ml/modeling/decomposition/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/decomposition/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/decomposition/BUILD.bazel index 80771ce6..6a19961a 100644 --- a/tests/integ/snowflake/ml/sklearn/decomposition/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/decomposition/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/decomposition:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/decomposition:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.decomposition", - module_root_dir = "snowflake/ml/sklearn/decomposition", + module_root_dir = "snowflake/ml/modeling/decomposition", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel b/tests/integ/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel similarity index 59% rename from tests/integ/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel index d5c0f6ac..2dbf8fc9 100644 --- a/tests/integ/snowflake/ml/sklearn/discriminant_analysis/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/discriminant_analysis/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/discriminant_analysis:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/discriminant_analysis:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.discriminant_analysis", - module_root_dir = "snowflake/ml/sklearn/discriminant_analysis", + module_root_dir = "snowflake/ml/modeling/discriminant_analysis", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/ensemble/BUILD.bazel b/tests/integ/snowflake/ml/modeling/ensemble/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/ensemble/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/ensemble/BUILD.bazel index bd46af45..dc15e953 100644 --- a/tests/integ/snowflake/ml/sklearn/ensemble/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/ensemble/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/ensemble:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/ensemble:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.ensemble", - module_root_dir = "snowflake/ml/sklearn/ensemble", + module_root_dir = "snowflake/ml/modeling/ensemble", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/feature_selection/BUILD.bazel b/tests/integ/snowflake/ml/modeling/feature_selection/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/sklearn/feature_selection/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/feature_selection/BUILD.bazel index 64017722..d6533bc1 100644 --- a/tests/integ/snowflake/ml/sklearn/feature_selection/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/feature_selection/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", 
"autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/feature_selection:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/feature_selection:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.feature_selection", - module_root_dir = "snowflake/ml/sklearn/feature_selection", + module_root_dir = "snowflake/ml/modeling/feature_selection", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel b/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel new file mode 100644 index 00000000..c0257bfa --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/framework/BUILD.bazel @@ -0,0 +1,19 @@ +load("//bazel:py_rules.bzl", "py_library", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_test( + name = "test_base", + srcs = ["test_base.py"], + deps = [ + ":utils", + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/preprocessing:standard_scaler", + "//snowflake/ml/utils:connection_params", + ], +) + +py_library( + name = "utils", + srcs = ["utils.py"], +) diff --git a/tests/integ/snowflake/ml/sklearn/framework/test_base.py b/tests/integ/snowflake/ml/modeling/framework/test_base.py similarity index 93% rename from tests/integ/snowflake/ml/sklearn/framework/test_base.py rename to tests/integ/snowflake/ml/modeling/framework/test_base.py index 0ecaa649..7f86b9b7 100644 --- a/tests/integ/snowflake/ml/sklearn/framework/test_base.py +++ b/tests/integ/snowflake/ml/modeling/framework/test_base.py @@ -7,16 +7,16 @@ import pytest from absl.testing.absltest import TestCase, main -from snowflake.ml.sklearn.framework.base import BaseTransformer, _process_cols -from snowflake.ml.sklearn.preprocessing import ( # type: ignore[attr-defined] +from snowflake.ml.modeling.framework.base import BaseTransformer, _process_cols +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] MinMaxScaler, StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session from snowflake.snowpark.exceptions import SnowparkColumnException -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, DATA_NONE_NAN, NUMERIC_COLS, @@ -105,7 +105,7 @@ def test_transform_output_cols_check(self) -> None: scaler.transform(df) assert "Size mismatch" in excinfo.value.args[0] - def test_get_sklearn_object(self) -> None: + def test_to_sklearn(self) -> None: input_cols = ["F"] output_cols = ["F"] pandas_df_1 = pd.DataFrame(data={"F": [1, 2, 3]}) @@ -116,10 +116,10 @@ def test_get_sklearn_object(self) -> None: scaler = StandardScaler(input_cols=input_cols, output_cols=output_cols) scaler.fit(snow_df_1) - sk_object_1 = scaler.get_sklearn_object() + sk_object_1 = scaler.to_sklearn() assert np.allclose([sk_object_1.mean_], [2.0]) scaler.fit(snow_df_2) - sk_object_2 = scaler.get_sklearn_object() + sk_object_2 = scaler.to_sklearn() assert np.allclose([sk_object_2.mean_], [5.0]) def test_validate_data_has_no_nulls(self) -> None: diff --git a/tests/integ/snowflake/ml/sklearn/framework/utils.py b/tests/integ/snowflake/ml/modeling/framework/utils.py similarity index 100% rename from 
tests/integ/snowflake/ml/sklearn/framework/utils.py rename to tests/integ/snowflake/ml/modeling/framework/utils.py diff --git a/tests/integ/snowflake/ml/sklearn/gaussian_process/BUILD.bazel b/tests/integ/snowflake/ml/modeling/gaussian_process/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/sklearn/gaussian_process/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/gaussian_process/BUILD.bazel index 23971ad4..2cc8a11f 100644 --- a/tests/integ/snowflake/ml/sklearn/gaussian_process/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/gaussian_process/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/gaussian_process:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/gaussian_process:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.gaussian_process", - module_root_dir = "snowflake/ml/sklearn/gaussian_process", + module_root_dir = "snowflake/ml/modeling/gaussian_process", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/impute/BUILD.bazel b/tests/integ/snowflake/ml/modeling/impute/BUILD.bazel new file mode 100644 index 00000000..6abe5d6a --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/impute/BUILD.bazel @@ -0,0 +1,11 @@ +load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") +load("//snowflake/ml/modeling/impute:estimators_info.bzl", "estimator_info_list") +load(":BUILD_NATIVE.bzl", "get_build_rules_for_native_impl") +package(default_visibility = ["//visibility:public"]) + +autogen_tests_for_estimators( + module = "sklearn.impute", + module_root_dir = "snowflake/ml/modeling/impute", + estimator_info_list=estimator_info_list +) +get_build_rules_for_native_impl() diff --git a/tests/integ/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl b/tests/integ/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl new file mode 100644 index 00000000..cfc93454 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/impute/BUILD_NATIVE.bzl @@ -0,0 +1,17 @@ +load("//bazel:py_rules.bzl", "py_test") + +def get_build_rules_for_native_impl(): + SHARD_COUNT = 5 + TIMEOUT = "long" # 900s + + py_test( + name = "test_simple_imputer", + srcs = ["test_simple_imputer.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/impute:simple_imputer", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_simple_imputer.py b/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py similarity index 98% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_simple_imputer.py rename to tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py index 936e8876..2c96785e 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_simple_imputer.py +++ b/tests/integ/snowflake/ml/modeling/impute/test_simple_imputer.py @@ -15,13 +15,11 @@ from absl.testing.absltest import main from sklearn.impute import SimpleImputer as SklearnSimpleImputer -from snowflake.ml.sklearn.preprocessing import ( - SimpleImputer, # type: ignore[attr-defined] -) +from snowflake.ml.modeling.impute import SimpleImputer # type: ignore[attr-defined] from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils 
as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( CATEGORICAL_COLS, DATA, DATA_ALL_NONE, @@ -503,7 +501,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.simple_imputer"]) + importlib.reload(sys.modules["snowflake.ml.modeling.impute.simple_imputer"]) # cloudpickle simple_imputer_load_cloudpickle = cloudpickle.loads(simple_imputer_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/isotonic/BUILD.bazel b/tests/integ/snowflake/ml/modeling/isotonic/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/isotonic/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/isotonic/BUILD.bazel index f1e34874..28597688 100644 --- a/tests/integ/snowflake/ml/sklearn/isotonic/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/isotonic/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/isotonic:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/isotonic:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.isotonic", - module_root_dir = "snowflake/ml/sklearn/isotonic", + module_root_dir = "snowflake/ml/modeling/isotonic", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel b/tests/integ/snowflake/ml/modeling/kernel_approximation/BUILD.bazel similarity index 59% rename from tests/integ/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/kernel_approximation/BUILD.bazel index 880c5dd0..b29935b2 100644 --- a/tests/integ/snowflake/ml/sklearn/kernel_approximation/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/kernel_approximation/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/kernel_approximation:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/kernel_approximation:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.kernel_approximation", - module_root_dir = "snowflake/ml/sklearn/kernel_approximation", + module_root_dir = "snowflake/ml/modeling/kernel_approximation", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel b/tests/integ/snowflake/ml/modeling/kernel_ridge/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/kernel_ridge/BUILD.bazel index 317df396..efe574e8 100644 --- a/tests/integ/snowflake/ml/sklearn/kernel_ridge/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/kernel_ridge/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/kernel_ridge:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/kernel_ridge:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.kernel_ridge", - module_root_dir = 
"snowflake/ml/sklearn/kernel_ridge", + module_root_dir = "snowflake/ml/modeling/kernel_ridge", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/lightgbm/BUILD.bazel b/tests/integ/snowflake/ml/modeling/lightgbm/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/lightgbm/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/lightgbm/BUILD.bazel index 2fb38b2c..18aa456e 100644 --- a/tests/integ/snowflake/ml/lightgbm/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/lightgbm/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/lightgbm:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/lightgbm:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "lightgbm", - module_root_dir = "snowflake/ml/lightgbm", + module_root_dir = "snowflake/ml/modeling/lightgbm", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/linear_model/BUILD.bazel b/tests/integ/snowflake/ml/modeling/linear_model/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/linear_model/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/linear_model/BUILD.bazel index a81b02ab..6ed8421e 100644 --- a/tests/integ/snowflake/ml/sklearn/linear_model/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/linear_model/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/linear_model:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/linear_model:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.linear_model", - module_root_dir = "snowflake/ml/sklearn/linear_model", + module_root_dir = "snowflake/ml/modeling/linear_model", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/manifold/BUILD.bazel b/tests/integ/snowflake/ml/modeling/manifold/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/manifold/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/manifold/BUILD.bazel index a4109257..e53401d2 100644 --- a/tests/integ/snowflake/ml/sklearn/manifold/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/manifold/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/manifold:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/manifold:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.manifold", - module_root_dir = "snowflake/ml/sklearn/manifold", + module_root_dir = "snowflake/ml/modeling/manifold", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel b/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel new file mode 100644 index 00000000..d4befcbf --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/BUILD.bazel @@ -0,0 +1,75 @@ +load("//bazel:py_rules.bzl", "py_test") + +package(default_visibility = ["//visibility:public"]) + +py_test( + name = "test_r2_score", + srcs = ["test_r2_score.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + ], +) + +py_test( + name = "test_accuracy_score", + srcs = 
["test_accuracy_score.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) + +py_test( + name = "test_confusion_matrix", + timeout = "long", + srcs = ["test_confusion_matrix.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) + +py_test( + name = "test_correlation", + timeout = "long", + srcs = ["test_correlation.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + ], +) + +py_test( + name = "test_covariance", + timeout = "long", + srcs = ["test_covariance.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + ], +) + +py_test( + name = "test_precision_recall_fscore_support", + timeout = "long", + srcs = ["test_precision_recall_fscore_support.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) + +py_test( + name = "test_precision_score", + timeout = "long", + srcs = ["test_precision_score.py"], + deps = [ + "//snowflake/ml/modeling/metrics", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py new file mode 100644 index 00000000..5f114412 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/test_accuracy_score.py @@ -0,0 +1,138 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# +from typing import Any, Dict + +import numpy as np +from absl.testing import parameterized +from absl.testing.absltest import main +from sklearn import metrics as sklearn_metrics + +from snowflake import snowpark +from snowflake.ml.modeling import metrics as snowml_metrics +from snowflake.ml.utils import connection_params +from tests.integ.snowflake.ml.modeling.framework import utils + +_BINARY_DATA, _SCHEMA = utils.gen_fuzz_data( + rows=100, + types=[utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT], + low=0, + high=2, +) +_MULTICLASS_DATA, _ = utils.gen_fuzz_data( + rows=100, + types=[utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT], + low=0, + high=5, +) +_Y_TRUE_COL = _SCHEMA[1] +_Y_PRED_COL = _SCHEMA[2] +_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] +_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SCHEMA[5] + + +class AccuracyScoreTest(parameterized.TestCase): + """Test accuracy score.""" + + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + self._binary_input_df = self._session.create_dataframe(_BINARY_DATA, schema=_SCHEMA) + self._binary_pandas_df = self._binary_input_df.to_pandas() + self._multiclass_input_df = self._session.create_dataframe(_MULTICLASS_DATA, schema=_SCHEMA) + self._multiclass_pandas_df = self._multiclass_input_df.to_pandas() + + def tearDown(self) -> None: + self._session.close() + + @parameterized.parameters( # type: ignore[misc] + {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, + ) + def test_accuracy_score(self, params: Dict[str, Any]) -> None: + for i in range(len(params["y_true_col_names"])): + 
y_true_col_names = params["y_true_col_names"][i] + y_pred_col_names = params["y_pred_col_names"][i] + input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df + pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df + + score = snowml_metrics.accuracy_score( + df=input_df, y_true_col_names=y_true_col_names, y_pred_col_names=y_pred_col_names, normalize=False + ) + score_sklearn = sklearn_metrics.accuracy_score( + pandas_df[y_true_col_names], pandas_df[y_pred_col_names], normalize=False + ) + np.testing.assert_allclose(score, score_sklearn) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, + ) + def test_accuracy_score_sample_weight(self, params: Dict[str, Any]) -> None: + for i in range(len(params["y_true_col_names"])): + y_true_col_names = params["y_true_col_names"][i] + y_pred_col_names = params["y_pred_col_names"][i] + input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df + pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df + + score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=y_true_col_names, + y_pred_col_names=y_pred_col_names, + sample_weight_col_name=_SAMPLE_WEIGHT_COL, + normalize=False, + ) + score_sklearn = sklearn_metrics.accuracy_score( + pandas_df[y_true_col_names], + pandas_df[y_pred_col_names], + sample_weight=pandas_df[_SAMPLE_WEIGHT_COL].to_numpy(), + normalize=False, + ) + np.testing.assert_allclose(score, score_sklearn) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, + ) + def test_accuracy_score_normalized(self, params: Dict[str, Any]) -> None: + for i in range(len(params["y_true_col_names"])): + y_true_col_names = params["y_true_col_names"][i] + y_pred_col_names = params["y_pred_col_names"][i] + input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df + pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df + + score = snowml_metrics.accuracy_score( + df=input_df, y_true_col_names=y_true_col_names, y_pred_col_names=y_pred_col_names, normalize=True + ) + score_sklearn = sklearn_metrics.accuracy_score( + pandas_df[y_true_col_names], pandas_df[y_pred_col_names], normalize=True + ) + np.testing.assert_allclose(score, score_sklearn) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"y_true_col_names": [_Y_TRUE_COL, _Y_TRUE_COLS], "y_pred_col_names": [_Y_PRED_COL, _Y_PRED_COLS]}}, + ) + def test_accuracy_score_sample_weight_normalized(self, params: Dict[str, Any]) -> None: + for i in range(len(params["y_true_col_names"])): + y_true_col_names = params["y_true_col_names"][i] + y_pred_col_names = params["y_pred_col_names"][i] + input_df = self._multiclass_input_df if isinstance(y_true_col_names, str) else self._binary_input_df + pandas_df = self._multiclass_pandas_df if isinstance(y_true_col_names, str) else self._binary_pandas_df + + score = snowml_metrics.accuracy_score( + df=input_df, + y_true_col_names=y_true_col_names, + y_pred_col_names=y_pred_col_names, + sample_weight_col_name=_SAMPLE_WEIGHT_COL, + normalize=True, + ) + score_sklearn = sklearn_metrics.accuracy_score( + pandas_df[y_true_col_names], + 
pandas_df[y_pred_col_names], + sample_weight=pandas_df[_SAMPLE_WEIGHT_COL].to_numpy(), + normalize=True, + ) + np.testing.assert_allclose(score, score_sklearn) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_confusion_matrix.py b/tests/integ/snowflake/ml/modeling/metrics/test_confusion_matrix.py new file mode 100644 index 00000000..4ef2c7b3 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/test_confusion_matrix.py @@ -0,0 +1,129 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# +from typing import Any, Dict + +import numpy as np +from absl.testing import parameterized +from absl.testing.absltest import main +from sklearn import metrics as sklearn_metrics + +from snowflake import snowpark +from snowflake.ml.modeling import metrics as snowml_metrics +from snowflake.ml.utils import connection_params +from tests.integ.snowflake.ml.modeling.framework import utils + +_DATA, _SCHEMA = utils.gen_fuzz_data( + rows=100, + types=[utils.DataType.INTEGER] * 2 + [utils.DataType.FLOAT], + low=-1, + high=5, +) +_Y_TRUE_COL = _SCHEMA[1] +_Y_PRED_COL = _SCHEMA[2] +_SAMPLE_WEIGHT_COL = _SCHEMA[3] + + +class ConfusionMatrixTest(parameterized.TestCase): + """Test confusion matrix.""" + + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + def tearDown(self) -> None: + self._session.close() + + @parameterized.parameters( # type: ignore[misc] + {"params": {"labels": [None, [2, 0, 4]]}}, + ) + def test_confusion_matrix_labels(self, params: Dict[str, Any]) -> None: + input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) + pandas_df = input_df.to_pandas() + + for labels in params["labels"]: + actual_cm = snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + labels=labels, + ) + sklearn_cm = sklearn_metrics.confusion_matrix( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + labels=labels, + ) + np.testing.assert_allclose(actual_cm, sklearn_cm) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + ) + def test_confusion_matrix_sample_weight(self, params: Dict[str, Any]) -> None: + input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) + pandas_df = input_df.to_pandas() + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_cm = snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_cm = sklearn_metrics.confusion_matrix( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + sample_weight=sample_weight, + ) + np.testing.assert_allclose(actual_cm, sklearn_cm) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"normalize": ["true", "pred", "all", None]}}, + ) + def test_confusion_matrix_normalize(self, params: Dict[str, Any]) -> None: + input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) + pandas_df = input_df.to_pandas() + + for normalize in params["normalize"]: + actual_cm = snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + normalize=normalize, + ) + sklearn_cm = sklearn_metrics.confusion_matrix( + 
pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + normalize=normalize, + ) + np.testing.assert_allclose(actual_cm, sklearn_cm) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"labels": []}}, + {"params": {"labels": [100, -10]}}, + {"params": {"normalize": "invalid"}}, + ) + def test_confusion_matrix_invalid_params(self, params: Dict[str, Any]) -> None: + input_df = self._session.create_dataframe(_DATA, schema=_SCHEMA) + + if "labels" in params: + with self.assertRaises(ValueError): + snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + labels=params["labels"], + ) + + if "normalize" in params: + with self.assertRaises(ValueError): + snowml_metrics.confusion_matrix( + df=input_df, + y_true_col_name=_Y_TRUE_COL, + y_pred_col_name=_Y_PRED_COL, + normalize=params["normalize"], + ) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/metrics/test_correlation.py b/tests/integ/snowflake/ml/modeling/metrics/test_correlation.py similarity index 98% rename from tests/integ/snowflake/ml/metrics/test_correlation.py rename to tests/integ/snowflake/ml/modeling/metrics/test_correlation.py index b8aa9f83..fa0bfae1 100644 --- a/tests/integ/snowflake/ml/metrics/test_correlation.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_correlation.py @@ -7,7 +7,7 @@ import pandas as pd from absl.testing.absltest import TestCase, main -from snowflake.ml.metrics.correlation import correlation +from snowflake.ml.modeling.metrics.correlation import correlation from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Row, Session diff --git a/tests/integ/snowflake/ml/metrics/test_covariance.py b/tests/integ/snowflake/ml/modeling/metrics/test_covariance.py similarity index 98% rename from tests/integ/snowflake/ml/metrics/test_covariance.py rename to tests/integ/snowflake/ml/modeling/metrics/test_covariance.py index caab4c63..c1e2b648 100644 --- a/tests/integ/snowflake/ml/metrics/test_covariance.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_covariance.py @@ -7,7 +7,7 @@ import pandas as pd from absl.testing.absltest import TestCase, main -from snowflake.ml.metrics.covariance import covariance +from snowflake.ml.modeling.metrics.covariance import covariance from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Row, Session diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py b/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py new file mode 100644 index 00000000..6272ea87 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/test_precision_recall_fscore_support.py @@ -0,0 +1,213 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. 
+# +from typing import Any, Dict + +import numpy as np +import pandas as pd +from absl.testing import parameterized +from absl.testing.absltest import main +from sklearn import exceptions, metrics as sklearn_metrics + +from snowflake import snowpark +from snowflake.ml.modeling import metrics as snowml_metrics +from snowflake.ml.utils import connection_params +from tests.integ.snowflake.ml.modeling.framework import utils + +_ROWS = 100 +_TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] +_DATA, _SCHEMA = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=5, +) +_Y_TRUE_COL = _SCHEMA[1] +_Y_PRED_COL = _SCHEMA[2] +_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] +_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SCHEMA[5] + + +class PrecisionRecallFscoreSupportTest(parameterized.TestCase): + """Test precision_recall_fscore_support.""" + + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + def tearDown(self) -> None: + self._session.close() + + @parameterized.parameters( # type: ignore[misc] + {"params": {"labels": [None, [2, 0, 4]]}}, + ) + def test_precision_recall_fscore_support_labels(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for labels in params["labels"]: + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + labels=labels, + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + labels=labels, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s)), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s)), + ) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + ) + def test_precision_recall_fscore_support_sample_weight(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + sample_weight=sample_weight, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s)), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s)), + ) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"average": [None, "binary", "micro", "macro", "samples", "weighted"]}}, + ) + def test_precision_recall_fscore_support_average(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for average in params["average"]: + if average == "binary" or average == "samples": + continue + + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + 
df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + average=average, + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + average=average, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s), dtype=np.float_), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), + ) + + data, _ = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=2, + ) + binary_pandas_df = pd.DataFrame(data, columns=_SCHEMA) + binary_input_df = self._session.create_dataframe(binary_pandas_df) + + # binary + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=binary_input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + average="binary", + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + binary_pandas_df[_Y_TRUE_COL], + binary_pandas_df[_Y_PRED_COL], + average="binary", + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s), dtype=np.float_), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), + ) + + # samples + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=binary_input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + average="samples", + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + binary_pandas_df[_Y_TRUE_COLS], + binary_pandas_df[_Y_PRED_COLS], + average="samples", + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s), dtype=np.float_), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s), dtype=np.float_), + ) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"zero_division": ["warn", 0, 1]}}, + ) + def test_precision_recall_fscore_support_zero_division(self, params: Dict[str, Any]) -> None: + data = [ + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + ] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for zero_division in params["zero_division"]: + if zero_division == "warn": + continue + + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + zero_division=zero_division, + ) + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + zero_division=zero_division, + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s)), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s)), + ) + + # warn + sklearn_p, sklearn_r, sklearn_f, sklearn_s = sklearn_metrics.precision_recall_fscore_support( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + zero_division="warn", + ) + + with self.assertWarns(exceptions.UndefinedMetricWarning): + actual_p, actual_r, actual_f, actual_s = snowml_metrics.precision_recall_fscore_support( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + zero_division="warn", + ) + np.testing.assert_allclose( + np.array((actual_p, actual_r, actual_f, actual_s)), + np.array((sklearn_p, sklearn_r, sklearn_f, sklearn_s)), + ) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py 
b/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py new file mode 100644 index 00000000..e45cd16c --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/metrics/test_precision_score.py @@ -0,0 +1,200 @@ +# +# Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved. +# +from typing import Any, Dict + +import numpy as np +import pandas as pd +from absl.testing import parameterized +from absl.testing.absltest import main +from sklearn import exceptions, metrics as sklearn_metrics + +from snowflake import snowpark +from snowflake.ml.modeling import metrics as snowml_metrics +from snowflake.ml.utils import connection_params +from tests.integ.snowflake.ml.modeling.framework import utils + +_ROWS = 100 +_TYPES = [utils.DataType.INTEGER] * 4 + [utils.DataType.FLOAT] +_DATA, _SCHEMA = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=2, +) +_Y_TRUE_COL = _SCHEMA[1] +_Y_PRED_COL = _SCHEMA[2] +_Y_TRUE_COLS = [_SCHEMA[1], _SCHEMA[2]] +_Y_PRED_COLS = [_SCHEMA[3], _SCHEMA[4]] +_SAMPLE_WEIGHT_COL = _SCHEMA[5] + + +class PrecisionScoreTest(parameterized.TestCase): + """Test precision score.""" + + def setUp(self) -> None: + """Creates Snowpark and Snowflake environments for testing.""" + self._session = snowpark.Session.builder.configs(connection_params.SnowflakeLoginOptions()).create() + + def tearDown(self) -> None: + self._session.close() + + @parameterized.parameters( # type: ignore[misc] + {"params": {"labels": [None, [2, 0, 4]]}}, + ) + def test_precision_score_labels(self, params: Dict[str, Any]) -> None: + data, _ = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=5, + ) + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for labels in params["labels"]: + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + labels=labels, + average=None, + ) + sklearn_p = sklearn_metrics.precision_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + labels=labels, + average=None, + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"sample_weight_col_name": [None, _SAMPLE_WEIGHT_COL]}}, + ) + def test_precision_score_sample_weight(self, params: Dict[str, Any]) -> None: + pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for sample_weight_col_name in params["sample_weight_col_name"]: + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + sample_weight_col_name=sample_weight_col_name, + ) + sample_weight = pandas_df[sample_weight_col_name].to_numpy() if sample_weight_col_name else None + sklearn_p = sklearn_metrics.precision_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + sample_weight=sample_weight, + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"average": [None, "binary", "micro", "macro", "samples", "weighted"]}}, + ) + def test_precision_score_average(self, params: Dict[str, Any]) -> None: + data, _ = utils.gen_fuzz_data( + rows=_ROWS, + types=_TYPES, + low=0, + high=5, + ) + multiclass_pandas_df = pd.DataFrame(data, columns=_SCHEMA) + multiclass_input_df = self._session.create_dataframe(multiclass_pandas_df) + + for average in params["average"]: + if average == "binary" or average == "samples": + continue + + actual_p = 
snowml_metrics.precision_score( + df=multiclass_input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + average=average, + ) + sklearn_p = sklearn_metrics.precision_score( + multiclass_pandas_df[_Y_TRUE_COL], + multiclass_pandas_df[_Y_PRED_COL], + average=average, + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + pandas_df = pd.DataFrame(_DATA, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + # binary + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + average="binary", + ) + sklearn_p = sklearn_metrics.precision_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + average="binary", + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + # samples + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=_Y_TRUE_COLS, + y_pred_col_names=_Y_PRED_COLS, + average="samples", + ) + sklearn_p = sklearn_metrics.precision_score( + pandas_df[_Y_TRUE_COLS], + pandas_df[_Y_PRED_COLS], + average="samples", + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + @parameterized.parameters( # type: ignore[misc] + {"params": {"zero_division": ["warn", 0, 1]}}, + ) + def test_precision_score_zero_division(self, params: Dict[str, Any]) -> None: + data = [ + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + ] + pandas_df = pd.DataFrame(data, columns=_SCHEMA) + input_df = self._session.create_dataframe(pandas_df) + + for zero_division in params["zero_division"]: + if zero_division == "warn": + continue + + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + zero_division=zero_division, + ) + sklearn_p = sklearn_metrics.precision_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + zero_division=zero_division, + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + # warn + sklearn_p = sklearn_metrics.precision_score( + pandas_df[_Y_TRUE_COL], + pandas_df[_Y_PRED_COL], + zero_division="warn", + ) + + with self.assertWarns(exceptions.UndefinedMetricWarning): + actual_p = snowml_metrics.precision_score( + df=input_df, + y_true_col_names=_Y_TRUE_COL, + y_pred_col_names=_Y_PRED_COL, + zero_division="warn", + ) + np.testing.assert_allclose(actual_p, sklearn_p) + + +if __name__ == "__main__": + main() diff --git a/tests/integ/snowflake/ml/metrics/test_r2_score.py b/tests/integ/snowflake/ml/modeling/metrics/test_r2_score.py similarity index 95% rename from tests/integ/snowflake/ml/metrics/test_r2_score.py rename to tests/integ/snowflake/ml/modeling/metrics/test_r2_score.py index f0832972..4e45372e 100644 --- a/tests/integ/snowflake/ml/metrics/test_r2_score.py +++ b/tests/integ/snowflake/ml/modeling/metrics/test_r2_score.py @@ -7,7 +7,7 @@ from absl.testing.absltest import TestCase, main from sklearn.metrics import r2_score as SKr2_score -from snowflake.ml.metrics import regression +from snowflake.ml.modeling.metrics import regression from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Row, Session diff --git a/tests/integ/snowflake/ml/sklearn/mixture/BUILD.bazel b/tests/integ/snowflake/ml/modeling/mixture/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/mixture/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/mixture/BUILD.bazel index 45a416f7..08dd1fc3 100644 --- a/tests/integ/snowflake/ml/sklearn/mixture/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/mixture/BUILD.bazel @@ -1,9 +1,9 @@ 
load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/mixture:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/mixture:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.mixture", - module_root_dir = "snowflake/ml/sklearn/mixture", + module_root_dir = "snowflake/ml/modeling/mixture", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/model_selection/BUILD.bazel b/tests/integ/snowflake/ml/modeling/model_selection/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/sklearn/model_selection/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/model_selection/BUILD.bazel index fd05ef10..6c90ca2b 100644 --- a/tests/integ/snowflake/ml/sklearn/model_selection/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/model_selection/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/model_selection:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/model_selection:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.model_selection", - module_root_dir = "snowflake/ml/sklearn/model_selection", + module_root_dir = "snowflake/ml/modeling/model_selection", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/multiclass/BUILD.bazel b/tests/integ/snowflake/ml/modeling/multiclass/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/multiclass/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/multiclass/BUILD.bazel index 3fe9b70a..f9ed793e 100644 --- a/tests/integ/snowflake/ml/sklearn/multiclass/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/multiclass/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/multiclass:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/multiclass:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.multiclass", - module_root_dir = "snowflake/ml/sklearn/multiclass", + module_root_dir = "snowflake/ml/modeling/multiclass", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/multioutput/BUILD.bazel b/tests/integ/snowflake/ml/modeling/multioutput/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/multioutput/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/multioutput/BUILD.bazel index 03800288..b3cc2f6e 100644 --- a/tests/integ/snowflake/ml/sklearn/multioutput/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/multioutput/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/multioutput:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/multioutput:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.multioutput", - module_root_dir = "snowflake/ml/sklearn/multioutput", + module_root_dir = "snowflake/ml/modeling/multioutput", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/naive_bayes/BUILD.bazel 
b/tests/integ/snowflake/ml/modeling/naive_bayes/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/naive_bayes/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/naive_bayes/BUILD.bazel index 700afc11..effe5868 100644 --- a/tests/integ/snowflake/ml/sklearn/naive_bayes/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/naive_bayes/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/naive_bayes:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/naive_bayes:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.naive_bayes", - module_root_dir = "snowflake/ml/sklearn/naive_bayes", + module_root_dir = "snowflake/ml/modeling/naive_bayes", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/neighbors/BUILD.bazel b/tests/integ/snowflake/ml/modeling/neighbors/BUILD.bazel similarity index 62% rename from tests/integ/snowflake/ml/sklearn/neighbors/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/neighbors/BUILD.bazel index e4fc5d3b..c11ab353 100644 --- a/tests/integ/snowflake/ml/sklearn/neighbors/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/neighbors/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/neighbors:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/neighbors:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.neighbors", - module_root_dir = "snowflake/ml/sklearn/neighbors", + module_root_dir = "snowflake/ml/modeling/neighbors", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/neural_network/BUILD.bazel b/tests/integ/snowflake/ml/modeling/neural_network/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/sklearn/neural_network/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/neural_network/BUILD.bazel index 7ef7d44a..ae89ad6f 100644 --- a/tests/integ/snowflake/ml/sklearn/neural_network/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/neural_network/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/neural_network:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/neural_network:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.neural_network", - module_root_dir = "snowflake/ml/sklearn/neural_network", + module_root_dir = "snowflake/ml/modeling/neural_network", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/modeling/pipeline/BUILD.bazel b/tests/integ/snowflake/ml/modeling/pipeline/BUILD.bazel new file mode 100644 index 00000000..ee875cac --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/pipeline/BUILD.bazel @@ -0,0 +1,22 @@ +load("//bazel:py_rules.bzl", "py_test") + +package(default_visibility = ["//visibility:public"]) + +SHARD_COUNT = 3 +TIMEOUT = "long" # 900s + +py_test( + name = "test_pipeline", + srcs = ["test_pipeline.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/linear_model:linear_regression", + "//snowflake/ml/modeling/linear_model:logistic_regression", + 
"//snowflake/ml/modeling/pipeline:pipeline", + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/preprocessing:standard_scaler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], +) diff --git a/tests/integ/snowflake/ml/sklearn/framework/test_pipeline.py b/tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py similarity index 67% rename from tests/integ/snowflake/ml/sklearn/framework/test_pipeline.py rename to tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py index bc0f1971..f43cd475 100644 --- a/tests/integ/snowflake/ml/sklearn/framework/test_pipeline.py +++ b/tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py @@ -7,19 +7,18 @@ import pickle import sys import tempfile -from typing import List, Union +from typing import List import cloudpickle import inflection import joblib import numpy as np -import pandas as pd from absl.testing.absltest import TestCase, main from sklearn.compose import ColumnTransformer as SkColumnTransformer from sklearn.datasets import load_diabetes, load_iris from sklearn.linear_model import ( LinearRegression as SklearnLinearRegression, - SGDClassifier as SklearnSGDClassifier, + LogisticRegression as SklearnLogisticRegression, ) from sklearn.pipeline import Pipeline as SkPipeline from sklearn.preprocessing import ( @@ -27,15 +26,19 @@ StandardScaler as SklearnStandardScaler, ) -from snowflake.ml.sklearn.framework.pipeline import Pipeline -from snowflake.ml.sklearn.preprocessing import ( # type: ignore[attr-defined] +from snowflake.ml.modeling import pipeline as snowml_pipeline +from snowflake.ml.modeling.linear_model import ( + LinearRegression as SnowmlLinearRegression, + LogisticRegression as SnowmlLogisticRegression, +) +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] MinMaxScaler, StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions -from snowflake.snowpark import DataFrame, Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from snowflake.snowpark import Session +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -44,80 +47,6 @@ ) -# TODO(snandamuri): Replace these dummy classes with actual estimators when estimator are landed. 
-class SnowmlLinearRegression: - def __init__(self, input_cols: List[str], output_cols: List[str], label_cols: List[str], session: Session) -> None: - self.model = SklearnLinearRegression() - self.input_cols = input_cols - self.output_cols = output_cols - self.label_cols = label_cols - self._session = session - - def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SnowmlLinearRegression": - if isinstance(dataset, DataFrame): - pandas_df = dataset.to_pandas() - else: - pandas_df = dataset - self.model.fit(pandas_df[self.input_cols], pandas_df[self.label_cols]) - return self - - def predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[DataFrame, pd.DataFrame]: - if isinstance(dataset, DataFrame): - pandas_df = dataset.to_pandas() - result = self.model.predict(pandas_df[self.input_cols]) - pandas_df[self.output_cols] = result.reshape(pandas_df.shape[0], len(self.output_cols)) - return self._session.create_dataframe(pandas_df) - else: - pandas_df = dataset - result = self.model.predict(pandas_df[self.input_cols]) - pandas_df[self.output_cols] = result.reshape(pandas_df.shape[0], len(self.output_cols)) - return pandas_df - - -class SnowmlSGDClassifier: - def __init__( - self, - input_cols: List[str], - output_cols: List[str], - label_cols: List[str], - session: Session, - random_state: int = 0, - ) -> None: - self.model = SklearnSGDClassifier(random_state=random_state, loss="log_loss") - self.input_cols = input_cols - self.output_cols = output_cols - self.label_cols = label_cols - self._session = session - - def fit(self, dataset: DataFrame) -> "SnowmlSGDClassifier": - pandas_df = dataset.to_pandas() - self.model.fit(pandas_df[self.input_cols], pandas_df[self.label_cols]) - return self - - def predict(self, dataset: DataFrame) -> DataFrame: - pandas_df = dataset.to_pandas() - result = self.model.predict(pandas_df[self.input_cols]) - pandas_df[self.output_cols] = result.reshape(pandas_df.shape[0], len(self.output_cols)) - return self._session.create_dataframe(pandas_df) - - def predict_proba(self, dataset: DataFrame) -> DataFrame: - pandas_df = dataset.to_pandas() - proba = self.model.predict_proba(pandas_df[self.input_cols]) - columns = [f"CLASS_{c}" for c in range(0, proba.shape[1])] - return self._session.create_dataframe(pd.DataFrame(proba, columns=columns)) - - def predict_log_proba(self, dataset: DataFrame) -> DataFrame: - pandas_df = dataset.to_pandas() - log_proba = self.model.predict_log_proba(pandas_df[self.input_cols]) - columns = [f"CLASS_{c}" for c in range(0, log_proba.shape[1])] - return self._session.create_dataframe(pd.DataFrame(log_proba, columns=columns)) - - def score(self, dataset: DataFrame) -> DataFrame: - pandas_df = dataset.to_pandas() - score = self.model.score(pandas_df[self.input_cols], pandas_df[self.label_cols]) - return self._session.create_dataframe(pd.DataFrame({"score": [score]})) - - class TestPipeline(TestCase): """Test Pipeline.""" @@ -145,7 +74,7 @@ def test_single_step(self) -> None: _, df = framework_utils.get_df(self._session, DATA, SCHEMA, np.nan) scaler = MinMaxScaler().set_input_cols(input_col).set_output_cols(output_col) - pipeline = Pipeline([("scaler", scaler)]) + pipeline = snowml_pipeline.Pipeline([("scaler", scaler)]) pipeline.fit(df) transformed_df = pipeline.transform(df) @@ -161,12 +90,12 @@ def test_multiple_steps(self) -> None: AssertionError If the queries of the transformed dataframes with and without Pipeline are not identical. 
""" - input_col, output_col1, output_col2 = NUMERIC_COLS[0], "output1", "output2" + input_col, output_col1, output_col2 = NUMERIC_COLS[0], "OUTPUT1", "OUTPUT2" _, df = framework_utils.get_df(self._session, DATA, SCHEMA, np.nan) mms = MinMaxScaler().set_input_cols(input_col).set_output_cols(output_col1) ss = StandardScaler().set_input_cols(output_col1).set_output_cols(output_col2) - pipeline = Pipeline([("mms", mms), ("ss", ss)]) + pipeline = snowml_pipeline.Pipeline([("mms", mms), ("ss", ss)]) pipeline.fit(df) transformed_df = pipeline.transform(df) @@ -192,7 +121,7 @@ def test_serde(self) -> None: ss = StandardScaler(input_cols=input_cols, output_cols=output_cols) mms = MinMaxScaler(input_cols=output_cols, output_cols=pipeline_output_cols) - pipeline = Pipeline([("ss", ss), ("mms", mms)]) + pipeline = snowml_pipeline.Pipeline([("ss", ss), ("mms", mms)]) pipeline.fit(df1) filepath = os.path.join(tempfile.gettempdir(), "test_pipeline.pkl") self._to_be_deleted_files.append(filepath) @@ -208,7 +137,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.framework.pipeline"]) + importlib.reload(sys.modules["snowflake.ml.modeling.pipeline"]) # cloudpickle pipeline_load_cloudpickle = cloudpickle.loads(pipeline_dump_cloudpickle) @@ -238,10 +167,11 @@ def test_pipeline_with_regression_estimators(self) -> None: input_df_pandas = load_diabetes(as_frame=True).frame # Normalize column names input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] + input_df_pandas["INDEX"] = input_df_pandas.reset_index().index input_df = self._session.create_dataframe(input_df_pandas) - input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] + input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET") and not c.startswith("INDEX")] label_cols = ["TARGET"] output_cols = ["OUTPUT"] @@ -252,11 +182,9 @@ def test_pipeline_with_regression_estimators(self) -> None: ss.set_input_cols(["AGE"]) ss.set_output_cols(["AGE"]) - estimator = SnowmlLinearRegression( - input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, session=self._session - ) + estimator = SnowmlLinearRegression(input_cols=input_cols, output_cols=output_cols, label_cols=label_cols) - pipeline = Pipeline(steps=[("mms", mms), ("ss", ss), ("estimator", estimator)]) + pipeline = snowml_pipeline.Pipeline(steps=[("mms", mms), ("ss", ss), ("estimator", estimator)]) assert not hasattr(pipeline, "transform") assert not hasattr(pipeline, "fit_transform") @@ -264,12 +192,12 @@ def test_pipeline_with_regression_estimators(self) -> None: assert hasattr(pipeline, "fit_predict") assert not hasattr(pipeline, "predict_proba") assert not hasattr(pipeline, "predict_log_proba") - assert not hasattr(pipeline, "score") + assert hasattr(pipeline, "score") # fit and predict pipeline.fit(input_df) output_df = pipeline.predict(input_df) - actual_results = output_df.to_pandas()[output_cols].to_numpy() + actual_results = output_df.to_pandas().sort_values(by="INDEX")[output_cols].to_numpy() # Do the same with SKLearn age_col_transform = SkPipeline(steps=[("mms", SklearnMinMaxScaler()), ("ss", SklearnStandardScaler())]) @@ -288,24 +216,25 @@ def test_pipeline_with_regression_estimators(self) -> None: skpipeline.fit(input_df_pandas[input_cols], input_df_pandas[label_cols]) sk_predict_results = skpipeline.predict(input_df_pandas[input_cols]) - assert np.allclose(actual_results, 
sk_predict_results) + np.testing.assert_allclose(actual_results, sk_predict_results, rtol=1.0e-1, atol=1.0e-2) def test_pipeline_with_classifier_estimators(self) -> None: input_df_pandas = load_iris(as_frame=True).frame # Normalize column names input_df_pandas.columns = [inflection.parameterize(c, "_").upper() for c in input_df_pandas.columns] + input_df_pandas["INDEX"] = input_df_pandas.reset_index().index input_df = self._session.create_dataframe(input_df_pandas) - input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET")] + input_cols = [c for c in input_df_pandas.columns if not c.startswith("TARGET") and not c.startswith("INDEX")] label_cols = ["TARGET"] output_cols = ["OUTPUT"] - estimator = SnowmlSGDClassifier( - input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, session=self._session, random_state=0 + estimator = SnowmlLogisticRegression( + input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, random_state=0 ) - pipeline = Pipeline(steps=[("estimator", estimator)]) + pipeline = snowml_pipeline.Pipeline(steps=[("estimator", estimator)]) assert not hasattr(pipeline, "transform") assert not hasattr(pipeline, "fit_transform") @@ -318,13 +247,17 @@ def test_pipeline_with_classifier_estimators(self) -> None: # fit and predict pipeline.fit(input_df) output_df = pipeline.predict(input_df) - actual_results = output_df.to_pandas()[output_cols].to_numpy().flatten() - actual_proba = pipeline.predict_proba(input_df).to_pandas().to_numpy() - actual_log_proba = pipeline.predict_log_proba(input_df).to_pandas().to_numpy() - actual_score = pipeline.score(input_df).to_pandas().iat[0, 0] + actual_results = output_df.to_pandas().sort_values(by="INDEX")[output_cols].astype(float).to_numpy().flatten() + actual_proba = pipeline.predict_proba(input_df).to_pandas().sort_values(by="INDEX") + actual_proba = actual_proba[[c for c in actual_proba.columns if c.find("PREDICT_PROBA_") >= 0]].to_numpy() + actual_log_proba = pipeline.predict_log_proba(input_df).to_pandas().sort_values(by="INDEX") + actual_log_proba = actual_log_proba[ + [c for c in actual_log_proba.columns if c.find("PREDICT_LOG_PROBA_") >= 0] + ].to_numpy() + actual_score = pipeline.score(input_df) # Do the same with SKLearn - skpipeline = SkPipeline(steps=[("estimator", SklearnSGDClassifier(random_state=0, loss="log_loss"))]) + skpipeline = SkPipeline(steps=[("estimator", SklearnLogisticRegression(random_state=0))]) skpipeline.fit(input_df_pandas[input_cols], input_df_pandas[label_cols]) sk_predict_results = skpipeline.predict(input_df_pandas[input_cols]) @@ -332,10 +265,10 @@ def test_pipeline_with_classifier_estimators(self) -> None: sk_log_proba = skpipeline.predict_log_proba(input_df_pandas[input_cols]) sk_score = skpipeline.score(input_df_pandas[input_cols], input_df_pandas[label_cols]) - assert np.allclose(actual_results, sk_predict_results) - assert np.allclose(actual_proba, sk_proba) - assert np.allclose(actual_log_proba, sk_log_proba) - assert np.allclose(actual_score, sk_score) + np.testing.assert_allclose(actual_results, sk_predict_results) + np.testing.assert_allclose(actual_proba, sk_proba, rtol=1.0e-1, atol=1.0e-2) + np.testing.assert_allclose(actual_log_proba, sk_log_proba, rtol=1.0e-1, atol=1.0e-2) + np.testing.assert_allclose(actual_score, sk_score) def test_pipeline_transform_with_pandas_dataframe(self) -> None: input_df_pandas = load_diabetes(as_frame=True).frame @@ -348,7 +281,7 @@ def test_pipeline_transform_with_pandas_dataframe(self) -> None: mms = 
MinMaxScaler(input_cols=input_cols, output_cols=input_cols) ss = StandardScaler(input_cols=input_cols, output_cols=input_cols) - pipeline = Pipeline(steps=[("mms", mms), ("ss", ss)]) + pipeline = snowml_pipeline.Pipeline(steps=[("mms", mms), ("ss", ss)]) pipeline.fit(input_df) @@ -356,13 +289,15 @@ def test_pipeline_transform_with_pandas_dataframe(self) -> None: pandas_df_output = pipeline.transform(input_df_pandas) assert pandas_df_output.columns.shape == snow_df_output.columns.shape - assert np.allclose(snow_df_output[pandas_df_output.columns].to_numpy(), pandas_df_output.to_numpy()) + np.testing.assert_allclose(snow_df_output[pandas_df_output.columns].to_numpy(), pandas_df_output.to_numpy()) snow_df_output_2 = pipeline.transform(input_df[input_cols]).to_pandas() pandas_df_output_2 = pipeline.transform(input_df_pandas[input_cols]) assert pandas_df_output_2.columns.shape == snow_df_output_2.columns.shape - assert np.allclose(snow_df_output_2[pandas_df_output_2.columns].to_numpy(), pandas_df_output_2.to_numpy()) + np.testing.assert_allclose( + snow_df_output_2[pandas_df_output_2.columns].to_numpy(), pandas_df_output_2.to_numpy() + ) def test_pipeline_with_regression_estimators_pandas_dataframe(self) -> None: input_df_pandas = load_diabetes(as_frame=True).frame @@ -380,11 +315,9 @@ def test_pipeline_with_regression_estimators_pandas_dataframe(self) -> None: ss.set_input_cols(["AGE"]) ss.set_output_cols(["AGE"]) - estimator = SnowmlLinearRegression( - input_cols=input_cols, output_cols=output_cols, label_cols=label_cols, session=self._session - ) + estimator = SnowmlLinearRegression(input_cols=input_cols, output_cols=output_cols, label_cols=label_cols) - pipeline = Pipeline(steps=[("mms", mms), ("ss", ss), ("estimator", estimator)]) + pipeline = snowml_pipeline.Pipeline(steps=[("mms", mms), ("ss", ss), ("estimator", estimator)]) self.assertFalse(hasattr(pipeline, "transform")) self.assertFalse(hasattr(pipeline, "fit_transform")) @@ -392,7 +325,7 @@ def test_pipeline_with_regression_estimators_pandas_dataframe(self) -> None: self.assertTrue(hasattr(pipeline, "fit_predict")) self.assertFalse(hasattr(pipeline, "predict_proba")) self.assertFalse(hasattr(pipeline, "predict_log_proba")) - self.assertFalse(hasattr(pipeline, "score")) + self.assertTrue(hasattr(pipeline, "score")) # fit and predict pipeline.fit(input_df_pandas) diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/BUILD.bazel b/tests/integ/snowflake/ml/modeling/preprocessing/BUILD.bazel new file mode 100644 index 00000000..9141a991 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/preprocessing/BUILD.bazel @@ -0,0 +1,11 @@ +load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") +load("//snowflake/ml/modeling/preprocessing:estimators_info.bzl", "estimator_info_list") +load(":BUILD_NATIVE.bzl", "get_build_rules_for_native_impl") +package(default_visibility = ["//visibility:public"]) + +autogen_tests_for_estimators( + module = "sklearn.preprocessing", + module_root_dir = "snowflake/ml/modeling/preprocessing", + estimator_info_list=estimator_info_list +) +get_build_rules_for_native_impl() diff --git a/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl b/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl new file mode 100644 index 00000000..abc73fa7 --- /dev/null +++ b/tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl @@ -0,0 +1,140 @@ +load("//bazel:py_rules.bzl", "py_test") + +def get_build_rules_for_native_impl(): + SHARD_COUNT = 5 + TIMEOUT = "long" # 900s + + py_test( 
+ name = "test_binarizer", + srcs = ["test_binarizer.py"], + deps = [ + "//snowflake/ml/modeling/preprocessing:binarizer", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_k_bins_discretizer", + srcs = ["test_k_bins_discretizer.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/preprocessing:k_bins_discretizer", + "//snowflake/ml/utils:connection_params", + "//snowflake/ml/utils:sparse", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_label_encoder", + srcs = ["test_label_encoder.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/preprocessing:label_encoder", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_max_abs_scaler", + srcs = ["test_max_abs_scaler.py"], + deps = [ + "//snowflake/ml/modeling/preprocessing:max_abs_scaler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_min_max_scaler", + srcs = ["test_min_max_scaler.py"], + deps = [ + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_normalizer", + srcs = ["test_normalizer.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/preprocessing:normalizer", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_one_hot_encoder", + srcs = ["test_one_hot_encoder.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/_internal/utils:identifier", + "//snowflake/ml/modeling/preprocessing:one_hot_encoder", + "//snowflake/ml/utils:connection_params", + "//snowflake/ml/utils:sparse", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_ordinal_encoder", + srcs = ["test_ordinal_encoder.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/preprocessing:ordinal_encoder", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_robust_scaler", + srcs = ["test_robust_scaler.py"], + shard_count = SHARD_COUNT, + timeout = TIMEOUT, + deps = [ + "//snowflake/ml/modeling/preprocessing:robust_scaler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_standard_scaler", + srcs = ["test_standard_scaler.py"], + deps = [ + "//snowflake/ml/modeling/preprocessing:standard_scaler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) + + py_test( + name = "test_drop_input_cols", + srcs = ["test_drop_input_cols.py"], + deps = [ + "//snowflake/ml/modeling/impute:simple_imputer", + "//snowflake/ml/modeling/pipeline:pipeline", + "//snowflake/ml/modeling/preprocessing:binarizer", + "//snowflake/ml/modeling/preprocessing:label_encoder", + "//snowflake/ml/modeling/preprocessing:max_abs_scaler", + "//snowflake/ml/modeling/preprocessing:min_max_scaler", + "//snowflake/ml/modeling/preprocessing:normalizer", + "//snowflake/ml/modeling/preprocessing:one_hot_encoder", + 
"//snowflake/ml/modeling/preprocessing:ordinal_encoder", + "//snowflake/ml/modeling/preprocessing:robust_scaler", + "//snowflake/ml/modeling/preprocessing:standard_scaler", + "//snowflake/ml/utils:connection_params", + "//tests/integ/snowflake/ml/modeling/framework:utils", + ], + ) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_binarizer.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py similarity index 94% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_binarizer.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py index fb2754f4..d078d3ce 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_binarizer.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_binarizer.py @@ -14,11 +14,11 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import Binarizer as SklearnBinarizer -from snowflake.ml.sklearn.preprocessing import Binarizer # type: ignore[attr-defined] +from snowflake.ml.modeling.preprocessing import Binarizer # type: ignore[attr-defined] from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, DATA_NONE_NAN, ID_COL, @@ -139,7 +139,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.binarizer"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.binarizer"]) # cloudpickle binarizer_load_cloudpickle = cloudpickle.loads(binarizer_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_drop_input_cols.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_drop_input_cols.py similarity index 93% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_drop_input_cols.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_drop_input_cols.py index dbd34022..c55a4c5a 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_drop_input_cols.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_drop_input_cols.py @@ -8,8 +8,9 @@ import numpy as np from absl.testing.absltest import TestCase -from snowflake.ml.sklearn.framework.pipeline import Pipeline -from snowflake.ml.sklearn.preprocessing import ( # type: ignore[attr-defined] +from snowflake.ml.modeling.impute import SimpleImputer # type: ignore[attr-defined] +from snowflake.ml.modeling.pipeline import Pipeline +from snowflake.ml.modeling.preprocessing import ( # type: ignore[attr-defined] Binarizer, LabelEncoder, MaxAbsScaler, @@ -18,13 +19,12 @@ OneHotEncoder, OrdinalEncoder, RobustScaler, - SimpleImputer, StandardScaler, ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( CATEGORICAL_COLS, DATA, ID_COL, diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_k_bins_discretizer.py 
b/tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py similarity index 99% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_k_bins_discretizer.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py index f946df2b..f20f1743 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_k_bins_discretizer.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_k_bins_discretizer.py @@ -8,13 +8,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import KBinsDiscretizer as SklearnKBinsDiscretizer -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( KBinsDiscretizer, # type: ignore[attr-defined] ) from snowflake.ml.utils import sparse as sparse_utils from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils +from tests.integ.snowflake.ml.modeling.framework import utils np.set_printoptions(threshold=sys.maxsize) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_label_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py similarity index 96% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_label_encoder.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py index fbc833b0..c7417925 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_label_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_label_encoder.py @@ -14,13 +14,13 @@ from absl.testing.absltest import main from sklearn.preprocessing import LabelEncoder as SklearnLabelEncoder -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( LabelEncoder, # type: ignore[attr-defined] ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, DATA_BOOLEAN, DATA_NONE_NAN, @@ -210,7 +210,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.label_encoder"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.label_encoder"]) # cloudpickle label_encoder_load_cloudpickle = cloudpickle.loads(label_encoder_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_max_abs_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py similarity index 95% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_max_abs_scaler.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py index 321b8dc6..98ea4c40 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_max_abs_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_max_abs_scaler.py @@ -16,13 +16,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import MaxAbsScaler as SklearnMaxAbsScaler -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( MaxAbsScaler, # type: ignore[attr-defined] ) from 
snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -154,7 +154,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.max_abs_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.max_abs_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_min_max_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py similarity index 98% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_min_max_scaler.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py index 273d052a..cb753fa1 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_min_max_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_min_max_scaler.py @@ -15,13 +15,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import MinMaxScaler as SklearnMinMaxScaler -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( MinMaxScaler, # type: ignore[attr-defined] ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, DATA_CLIP, ID_COL, @@ -355,7 +355,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.min_max_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.min_max_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_normalizer.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py similarity index 96% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_normalizer.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py index b36a37b9..0101f374 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_normalizer.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_normalizer.py @@ -16,12 +16,12 @@ from absl.testing.absltest import main from sklearn.preprocessing import Normalizer as SklearnNormalizer -from snowflake.ml.sklearn.preprocessing import Normalizer # type: ignore[attr-defined] +from snowflake.ml.modeling.preprocessing import Normalizer # type: ignore[attr-defined] from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session from snowflake.snowpark.exceptions import SnowparkSQLException -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from 
tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( CATEGORICAL_COLS, DATA_NONE_NAN, ID_COL, @@ -197,7 +197,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.normalizer"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.normalizer"]) # cloudpickle normalizer_load_cloudpickle = cloudpickle.loads(normalizer_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_one_hot_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py similarity index 97% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_one_hot_encoder.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py index ab5e161e..5c1cb206 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_one_hot_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_one_hot_encoder.py @@ -21,14 +21,14 @@ from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder from snowflake.ml._internal.utils import identifier as utils_identifier -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( OneHotEncoder, # type: ignore[attr-defined] ) from snowflake.ml.utils import sparse as utils_sparse from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import DataFrame, Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( BOOLEAN_COLS, CATEGORICAL_COLS, DATA, @@ -1568,7 +1568,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.one_hot_encoder"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.one_hot_encoder"]) # cloudpickle encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) @@ -1619,6 +1619,49 @@ def test_fit_empty(self) -> None: encoder.fit(df) self.assertIn("Empty data while a minimum of 1 sample is required.", str(ex.exception)) + def test_fit_snowpark_transform_numeric_data(self) -> None: + snow_df = self._session.sql( + """SELECT *, IFF(Y = 'yes', 1.0, 0.0) as LABEL + FROM ML_DATASETS.PUBLIC.UCI_BANK_MARKETING_20COLUMNS + LIMIT 2000""" + ).drop("Y") + input_cols = [c for c in snow_df.columns if c != "LABEL"] + # contains dtype as int, object, float. 
+ output_cols = [f"OHE_{c}" for c in input_cols] + + snow_df_single_feature = snow_df[input_cols] + ohe = OneHotEncoder(input_cols=input_cols, output_cols=output_cols) + ohe.fit(snow_df_single_feature) + ohe.transform(snow_df_single_feature.to_pandas()) + + def test_fit_snowpark_transform_everydtypes(self) -> None: + x = np.ones( + (10,), + dtype=[ + ("X", np.uint8), + ("Y", np.float64), + ("Z", np.str_), + ("A", np.bool8), + ("B", np.bytes0), + ("C", np.object0), + ], + ) + pd_df = pd.DataFrame(x) + df = self._session.create_dataframe(pd_df) + input_cols = ["A", "B", "C", "X", "Y", "Z"] + output_cols = [f"OHE_{c}" for c in input_cols] + + ohe = OneHotEncoder(input_cols=input_cols, output_cols=output_cols) + ohe.fit(df) + actual_arr = ohe.transform(pd_df)[ohe.get_output_cols()].to_numpy() + + # sklearn + encoder_sklearn = SklearnOneHotEncoder() + encoder_sklearn.fit(pd_df[input_cols]) + sklearn_arr = encoder_sklearn.transform(pd_df[input_cols]) + + np.testing.assert_allclose(actual_arr, sklearn_arr.toarray()) + if __name__ == "__main__": main() diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_ordinal_encoder.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py similarity index 99% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_ordinal_encoder.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py index 8aac2dd1..28d40a19 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_ordinal_encoder.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_ordinal_encoder.py @@ -18,13 +18,13 @@ from absl.testing.absltest import main from sklearn.preprocessing import OrdinalEncoder as SklearnOrdinalEncoder -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( OrdinalEncoder, # type: ignore[attr-defined] ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( BOOLEAN_COLS, CATEGORICAL_COLS, DATA, @@ -811,7 +811,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.ordinal_encoder"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.ordinal_encoder"]) # cloudpickle encoder_load_cloudpickle = cloudpickle.loads(encoder_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_robust_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py similarity index 97% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_robust_scaler.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py index 5d826bd3..c3cfd309 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_robust_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_robust_scaler.py @@ -17,13 +17,13 @@ from absl.testing.absltest import main from sklearn.preprocessing import RobustScaler as SklearnRobustScaler -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( RobustScaler, # type: ignore[attr-defined] ) from snowflake.ml.utils.connection_params import 
SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -273,7 +273,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.robust_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.robust_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/test_standard_scaler.py b/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py similarity index 98% rename from tests/integ/snowflake/ml/sklearn/preprocessing/test_standard_scaler.py rename to tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py index a39ed348..1f1522a0 100644 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/test_standard_scaler.py +++ b/tests/integ/snowflake/ml/modeling/preprocessing/test_standard_scaler.py @@ -15,13 +15,13 @@ from absl.testing.absltest import TestCase, main from sklearn.preprocessing import StandardScaler as SklearnStandardScaler -from snowflake.ml.sklearn.preprocessing import ( +from snowflake.ml.modeling.preprocessing import ( StandardScaler, # type: ignore[attr-defined] ) from snowflake.ml.utils.connection_params import SnowflakeLoginOptions from snowflake.snowpark import Session -from tests.integ.snowflake.ml.sklearn.framework import utils as framework_utils -from tests.integ.snowflake.ml.sklearn.framework.utils import ( +from tests.integ.snowflake.ml.modeling.framework import utils as framework_utils +from tests.integ.snowflake.ml.modeling.framework.utils import ( DATA, ID_COL, NUMERIC_COLS, @@ -387,7 +387,7 @@ def test_serde(self) -> None: input_cols_extended = input_cols.copy() input_cols_extended.append(id_col) - importlib.reload(sys.modules["snowflake.ml.sklearn.preprocessing.standard_scaler"]) + importlib.reload(sys.modules["snowflake.ml.modeling.preprocessing.standard_scaler"]) # cloudpickle scaler_load_cloudpickle = cloudpickle.loads(scaler_dump_cloudpickle) diff --git a/tests/integ/snowflake/ml/sklearn/semi_supervised/BUILD.bazel b/tests/integ/snowflake/ml/modeling/semi_supervised/BUILD.bazel similarity index 60% rename from tests/integ/snowflake/ml/sklearn/semi_supervised/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/semi_supervised/BUILD.bazel index a3af24a6..54bc3a86 100644 --- a/tests/integ/snowflake/ml/sklearn/semi_supervised/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/semi_supervised/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/semi_supervised:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/semi_supervised:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.semi_supervised", - module_root_dir = "snowflake/ml/sklearn/semi_supervised", + module_root_dir = "snowflake/ml/modeling/semi_supervised", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/svm/BUILD.bazel b/tests/integ/snowflake/ml/modeling/svm/BUILD.bazel similarity index 63% rename 
from tests/integ/snowflake/ml/sklearn/svm/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/svm/BUILD.bazel index 60eaa350..ce9a11d3 100644 --- a/tests/integ/snowflake/ml/sklearn/svm/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/svm/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/svm:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/svm:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.svm", - module_root_dir = "snowflake/ml/sklearn/svm", + module_root_dir = "snowflake/ml/modeling/svm", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/tree/BUILD.bazel b/tests/integ/snowflake/ml/modeling/tree/BUILD.bazel similarity index 63% rename from tests/integ/snowflake/ml/sklearn/tree/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/tree/BUILD.bazel index a97cc6bf..26ec20cd 100644 --- a/tests/integ/snowflake/ml/sklearn/tree/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/tree/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/tree:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/tree:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "sklearn.tree", - module_root_dir = "snowflake/ml/sklearn/tree", + module_root_dir = "snowflake/ml/modeling/tree", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/xgboost/BUILD.bazel b/tests/integ/snowflake/ml/modeling/xgboost/BUILD.bazel similarity index 61% rename from tests/integ/snowflake/ml/xgboost/BUILD.bazel rename to tests/integ/snowflake/ml/modeling/xgboost/BUILD.bazel index b1d7a563..c1a9221a 100644 --- a/tests/integ/snowflake/ml/xgboost/BUILD.bazel +++ b/tests/integ/snowflake/ml/modeling/xgboost/BUILD.bazel @@ -1,9 +1,9 @@ load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/xgboost:estimators_info.bzl", "estimator_info_list") +load("//snowflake/ml/modeling/xgboost:estimators_info.bzl", "estimator_info_list") package(default_visibility = ["//visibility:public"]) autogen_tests_for_estimators( module = "xgboost", - module_root_dir = "snowflake/ml/xgboost", + module_root_dir = "snowflake/ml/modeling/xgboost", estimator_info_list=estimator_info_list ) diff --git a/tests/integ/snowflake/ml/sklearn/framework/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/framework/BUILD.bazel deleted file mode 100644 index e087479e..00000000 --- a/tests/integ/snowflake/ml/sklearn/framework/BUILD.bazel +++ /dev/null @@ -1,35 +0,0 @@ -load("//bazel:py_rules.bzl", "py_library", "py_test") - -package(default_visibility = ["//visibility:public"]) - -SHARD_COUNT = 3 -TIMEOUT = "long" # 900s - -py_test( - name = "test_base", - srcs = ["test_base.py"], - deps = [ - ":utils", - "//snowflake/ml/sklearn/preprocessing:min_max_scaler", - "//snowflake/ml/sklearn/preprocessing:standard_scaler", - "//snowflake/ml/utils:connection_params", - ], -) - -py_test( - name = "test_pipeline", - srcs = ["test_pipeline.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - ":utils", - "//snowflake/ml/sklearn/preprocessing:min_max_scaler", - "//snowflake/ml/sklearn/preprocessing:standard_scaler", - "//snowflake/ml/utils:connection_params", - ], -) - -py_library( - name = "utils", - srcs 
= ["utils.py"], -) diff --git a/tests/integ/snowflake/ml/sklearn/impute/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/impute/BUILD.bazel deleted file mode 100644 index 1b79cbbc..00000000 --- a/tests/integ/snowflake/ml/sklearn/impute/BUILD.bazel +++ /dev/null @@ -1,9 +0,0 @@ -load("//codegen:codegen_rules.bzl", "autogen_tests_for_estimators") -load("//snowflake/ml/sklearn/impute:estimators_info.bzl", "estimator_info_list") -package(default_visibility = ["//visibility:public"]) - -autogen_tests_for_estimators( - module = "sklearn.impute", - module_root_dir = "snowflake/ml/sklearn/impute", - estimator_info_list=estimator_info_list -) diff --git a/tests/integ/snowflake/ml/sklearn/preprocessing/BUILD.bazel b/tests/integ/snowflake/ml/sklearn/preprocessing/BUILD.bazel deleted file mode 100644 index 93823d35..00000000 --- a/tests/integ/snowflake/ml/sklearn/preprocessing/BUILD.bazel +++ /dev/null @@ -1,152 +0,0 @@ -load("//bazel:py_rules.bzl", "py_test") - -package(default_visibility = ["//visibility:public"]) - -SHARD_COUNT = 5 -TIMEOUT = "long" # 900s - -py_test( - name = "test_binarizer", - srcs = ["test_binarizer.py"], - deps = [ - "//snowflake/ml/sklearn/preprocessing:binarizer", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_k_bins_discretizer", - srcs = ["test_k_bins_discretizer.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/sklearn/preprocessing:k_bins_discretizer", - "//snowflake/ml/utils:connection_params", - "//snowflake/ml/utils:sparse", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_label_encoder", - srcs = ["test_label_encoder.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/sklearn/preprocessing:label_encoder", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_max_abs_scaler", - srcs = ["test_max_abs_scaler.py"], - deps = [ - "//snowflake/ml/sklearn/preprocessing:max_abs_scaler", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_min_max_scaler", - srcs = ["test_min_max_scaler.py"], - deps = [ - "//snowflake/ml/sklearn/preprocessing:min_max_scaler", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_normalizer", - srcs = ["test_normalizer.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/sklearn/preprocessing:normalizer", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_one_hot_encoder", - srcs = ["test_one_hot_encoder.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/_internal/utils:identifier", - "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", - "//snowflake/ml/utils:connection_params", - "//snowflake/ml/utils:sparse", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_ordinal_encoder", - srcs = ["test_ordinal_encoder.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/sklearn/preprocessing:ordinal_encoder", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_robust_scaler", - srcs = ["test_robust_scaler.py"], - 
shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/sklearn/preprocessing:robust_scaler", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_standard_scaler", - srcs = ["test_standard_scaler.py"], - deps = [ - "//snowflake/ml/sklearn/preprocessing:standard_scaler", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_simple_imputer", - srcs = ["test_simple_imputer.py"], - shard_count = SHARD_COUNT, - timeout = TIMEOUT, - deps = [ - "//snowflake/ml/sklearn/preprocessing:simple_imputer", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) - -py_test( - name = "test_drop_input_cols", - srcs = ["test_drop_input_cols.py"], - deps = [ - "//snowflake/ml/sklearn/preprocessing:binarizer", - "//snowflake/ml/sklearn/preprocessing:label_encoder", - "//snowflake/ml/sklearn/preprocessing:max_abs_scaler", - "//snowflake/ml/sklearn/preprocessing:min_max_scaler", - "//snowflake/ml/sklearn/preprocessing:normalizer", - "//snowflake/ml/sklearn/preprocessing:one_hot_encoder", - "//snowflake/ml/sklearn/preprocessing:ordinal_encoder", - "//snowflake/ml/sklearn/preprocessing:robust_scaler", - "//snowflake/ml/sklearn/preprocessing:simple_imputer", - "//snowflake/ml/sklearn/preprocessing:standard_scaler", - "//snowflake/ml/utils:connection_params", - "//tests/integ/snowflake/ml/sklearn/framework:utils", - ], -) diff --git a/third_party/rules_conda/env.bzl b/third_party/rules_conda/env.bzl index d553a9e4..8203a272 100644 --- a/third_party/rules_conda/env.bzl +++ b/third_party/rules_conda/env.bzl @@ -5,7 +5,6 @@ load(":utils.bzl", "CONDA_EXT_MAP", "EXECUTE_TIMEOUT", "PYTHON_EXT_MAP", "get_os BUILD_FILE_CONTENT = """# This file was automatically generated by rules_conda package(default_visibility = ["//visibility:public"]) - load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair") py_runtime( @@ -17,8 +16,8 @@ py_runtime( ) filegroup( - name = "coverage_tool", - srcs = [{coverage_tool_path}], + name={coverage_tool_target}, + srcs=["{coverage_tool_path}"] ) """ @@ -103,14 +102,21 @@ def _create_env_build_file(rctx, env_name): if py_major != 3: fail("Only Python 3 is supported. Your Python version is: {}.".format(py_version)) - coverage_tool_path = "{}/bin/coverage".format(env_name) + # We replace the coverage tool with our own wrapper, which injects an --ignore-errors argument when it is invoked to + # generate a coverage report. This keeps bazel from failing when the coverage tool tries to collect a report for a + # source file that does not exist, for example, a zip-imported source. + # The original coverage tool is {env_name}/bin/coverage. + coverage_tool_path = "{}/bin/coverage_tool.py".format(env_name) coverage_tool_target = "\"coverage_tool\"" - coverage_tool_path_fmt = "\"{}\"".format(coverage_tool_path) - ls_result = rctx.execute(["ls", rctx.path(coverage_tool_path)]) - if ls_result.return_code != 0: - print("Coverage tool not found in {}.
Coverage support disabled.".format(env_name)) - coverage_tool_target = "None" - coverage_tool_path_fmt = "" + + coverage_tool_label = rctx.attr.coverage_tool + coverage_tool_content = rctx.read(coverage_tool_label) + + coverage_tool_header = "#!{interpreter_path}\n\n".format(interpreter_path = rctx.path("{}/{}".format(env_name, interpreter_path))) + rctx.file( + coverage_tool_path, + content = coverage_tool_header + coverage_tool_content, + ) rctx.file( "BUILD", @@ -120,7 +126,7 @@ def _create_env_build_file(rctx, env_name): interpreter_path = interpreter_path, py_major = py_major, coverage_tool_target = coverage_tool_target, - coverage_tool_path = coverage_tool_path_fmt, + coverage_tool_path = coverage_tool_path, ), ) @@ -136,6 +142,10 @@ conda_create_rule = repository_rule( attrs = { "conda_repo": attr.string(mandatory = True), "conda_dir": attr.string(mandatory = True), + "coverage_tool": attr.label( + allow_single_file = True, + doc = "The label of the coverage_tool.py file.", + ), "environment": attr.label( mandatory = True, allow_single_file = True,
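
For reference, the `coverage_tool` attribute added above points at `bazel/coverage_tool/coverage_tool.py`, whose contents the repository rule reads and writes into the conda environment as `{env_name}/bin/coverage_tool.py`, prepending a shebang for that environment's interpreter. The wrapper itself is not part of this diff; the sketch below is only a guess at its shape, assuming it forwards to coverage.py's command-line entry point and appends `--ignore-errors` to report-generating subcommands. The real file may differ.

```
# Hypothetical sketch of bazel/coverage_tool/coverage_tool.py (not shown in this diff).
# Forwards all arguments to coverage.py, adding --ignore-errors to report-generating
# subcommands so that report collection does not abort on source files that cannot
# be read from disk (for example, zip-imported sources).
import sys

from coverage.cmdline import main

_REPORT_COMMANDS = {"report", "html", "xml", "json", "lcov", "annotate"}


def _patched_argv(argv):
    # argv excludes the program name; argv[0], when present, is the coverage subcommand.
    if argv and argv[0] in _REPORT_COMMANDS and "--ignore-errors" not in argv:
        return [argv[0], "--ignore-errors"] + argv[1:]
    return argv


if __name__ == "__main__":
    sys.exit(main(_patched_argv(sys.argv[1:])))
```

Because env.bzl prepends a shebang pointing at the environment's own interpreter when it writes the file, the wrapper can be executed directly inside the environment and exposed through the `coverage_tool` filegroup in the generated BUILD file.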