From 1fc159608053b4c98db50308d62ec7162ac329c0 Mon Sep 17 00:00:00 2001 From: William Shin Date: Thu, 30 Mar 2023 18:00:04 -0700 Subject: [PATCH 01/96] [MAINTENANCE] SqlAlchemy2 Compatibility - `Row.keys()` (#7520) --- .../multicolumn_map_condition_auxilliary_methods.py | 2 +- .../expectations/metrics/query_metrics/query_column.py | 2 +- .../expectations/metrics/query_metrics/query_column_pair.py | 2 +- .../metrics/query_metrics/query_multiple_columns.py | 2 +- .../expectations/metrics/query_metrics/query_table.py | 3 +-- .../metrics/query_metrics/query_template_values.py | 2 +- pyproject.toml | 2 -- 7 files changed, 6 insertions(+), 9 deletions(-) diff --git a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py index e554b14eb1e8..1e55283e243c 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py @@ -160,7 +160,7 @@ def _sqlalchemy_multicolumn_map_condition_values( if result_format["result_format"] != "COMPLETE": query = query.limit(result_format["partial_unexpected_count"]) - return [dict(val) for val in execution_engine.engine.execute(query).fetchall()] + return [val._asdict() for val in execution_engine.engine.execute(query).fetchall()] def _sqlalchemy_multicolumn_map_condition_filtered_row_count( diff --git a/great_expectations/expectations/metrics/query_metrics/query_column.py b/great_expectations/expectations/metrics/query_metrics/query_column.py index 9169dbe01cb5..a04490efed70 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_column.py +++ b/great_expectations/expectations/metrics/query_metrics/query_column.py @@ -65,7 +65,7 @@ def _sqlalchemy( engine: sqlalchemy_engine_Engine = execution_engine.engine result: List[sqlalchemy_engine_Row] = engine.execute(sa.text(query)).fetchall() - return [dict(element) for element in result] + return [element._asdict() for element in result] @metric_value(engine=SparkDFExecutionEngine) def _spark( diff --git a/great_expectations/expectations/metrics/query_metrics/query_column_pair.py b/great_expectations/expectations/metrics/query_metrics/query_column_pair.py index 40061871cc9d..3bf5ff04e588 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_column_pair.py +++ b/great_expectations/expectations/metrics/query_metrics/query_column_pair.py @@ -74,7 +74,7 @@ def _sqlalchemy( engine: sqlalchemy_engine_Engine = execution_engine.engine result: List[sqlalchemy_engine_Row] = engine.execute(sa.text(query)).fetchall() - return [dict(element) for element in result] + return [element._asdict() for element in result] @metric_value(engine=SparkDFExecutionEngine) def _spark( diff --git a/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py b/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py index 6eabb80ddfd9..f65d513f3fe5 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py +++ b/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py @@ -81,7 +81,7 @@ def _sqlalchemy( engine: sqlalchemy_engine_Engine = execution_engine.engine result: List[sqlalchemy_engine_Row] = engine.execute(sa.text(query)).fetchall() - return [dict(element) for element in result] + 
return [element._asdict() for element in result] @metric_value(engine=SparkDFExecutionEngine) def _spark( diff --git a/great_expectations/expectations/metrics/query_metrics/query_table.py b/great_expectations/expectations/metrics/query_metrics/query_table.py index d9f4a1d6024a..427667d92f93 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_table.py +++ b/great_expectations/expectations/metrics/query_metrics/query_table.py @@ -60,8 +60,7 @@ def _sqlalchemy( engine: sqlalchemy_engine_Engine = execution_engine.engine result: List[sqlalchemy_engine_Row] = engine.execute(sa.text(query)).fetchall() - - return [dict(element) for element in result] + return [element._asdict() for element in result] # @metric_value(engine=SparkDFExecutionEngine) diff --git a/great_expectations/expectations/metrics/query_metrics/query_template_values.py b/great_expectations/expectations/metrics/query_metrics/query_template_values.py index b2fd7849f023..17f4a2c78327 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_template_values.py +++ b/great_expectations/expectations/metrics/query_metrics/query_template_values.py @@ -96,7 +96,7 @@ def _sqlalchemy( e._query_id = None raise e - return [dict(element) for element in result] + return [element._asdict() for element in result] @metric_value(engine=SparkDFExecutionEngine) def _spark( diff --git a/pyproject.toml b/pyproject.toml index c43f9e541c14..ec7f52988129 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -468,8 +468,6 @@ filterwarnings = [ # To get SQLAlchemy 2.x supported, remove one of these ignores and then fix the resulting errors. 'ignore: The Engine.execute\(\) method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0. All statement execution in SQLAlchemy 2.0 is performed by the Connection.execute\(\) method of Connection, or in the ORM by the Session.execute\(\) method of Session.:DeprecationWarning', # Example Actual Warning: Found by running pytest tests/test_definitions/test_expectations_v2_api.py (delete with v2 api code if this warning doesn't appear elsewhere). - # sqlalchemy.exc.RemovedIn20Warning: The Row.keys() method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0. Use the namedtuple standard accessor Row._fields, or for full mapping behavior use row._mapping.keys() (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: The Row.keys\(\) method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0.:DeprecationWarning', # Example Actual Warning: Found by running pytest tests/test_definitions/test_expectations_v2_api.py (delete with v2 api code if this warning doesn't appear elsewhere). # sqlalchemy.exc.RemovedIn20Warning: Using non-integer/slice indices on Row is deprecated and will be removed in version 2.0; please use row._mapping[], or the mappings() accessor on the Result object. 
(Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) 'ignore: Using non-integer\/slice indices on Row is deprecated and will be removed in version 2.0:DeprecationWarning', From 44b78bc9125d35cce611df69bfbcd8f41bebfc5c Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Thu, 30 Mar 2023 19:52:43 -0700 Subject: [PATCH 02/96] [MAINTENANCE] Refactoring of CapitalOne Metrics and Profiler-Based DataAssistant for Enhanced Code Elegance (#7522) --- .../metrics/__init__.py | 20 +- .../metrics/data_profiler_metrics/__init__.py | 30 ++- .../data_profiler_column_profiler_report.py | 28 +-- .../data_profiler_profile_diff.py | 5 +- .../data_profiler_profile_numeric_columns.py | 5 +- .../data_profiler_profile_percent_diff.py | 5 +- .../data_profiler_profile_report.py | 5 +- .../data_profiler_table_column_infos.py | 51 +++++ .../data_profiler_table_column_list.py | 68 ++++++ ...data_profiler_structured_data_assistant.py | 35 --- .../data_profiler_column_domain_builder.py | 8 +- .../tests/conftest.py | 55 +++-- ...yellow_tripdata_5_lines_sample_2019-01.csv | 6 + .../tests/expectations/metrics/test_core.py | 201 ++++++++++++++++-- ...data_profiler_structured_data_assistant.py | 12 +- ...est_data_profiler_column_domain_builder.py | 66 +++--- great_expectations/util.py | 2 +- 17 files changed, 442 insertions(+), 160 deletions(-) create mode 100644 contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_infos.py create mode 100644 contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py create mode 100644 contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/data_profiler_files/taxi_yellow_tripdata_samples/yellow_tripdata_5_lines_sample_2019-01.csv diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/__init__.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/__init__.py index 65ebd4718bb8..a37b77e2b729 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/__init__.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/__init__.py @@ -1,17 +1,25 @@ # Make sure to include any Metrics your want exported below! 
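The Row-handling change applied throughout PATCH 01 above replaces the legacy `dict(row)` call, which depends on the deprecated `Row.keys()` behavior, with the named-tuple accessor `Row._asdict()`. A minimal standalone sketch of the difference, not part of any patch; the in-memory SQLite engine and the toy query are assumptions used purely for illustration:

```python
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
with engine.connect() as connection:
    rows = connection.execute(sa.text("SELECT 1 AS a, 2 AS b")).fetchall()

# Legacy 1.x style removed by the patch; relies on Row.keys() / mapping behavior:
# rows_as_dicts = [dict(row) for row in rows]

# 2.0-compatible: Row acts like a named tuple, so use _asdict(), or go through
# the explicit ._mapping view.
rows_as_dicts = [row._asdict() for row in rows]
assert rows_as_dicts == [dict(row._mapping) for row in rows] == [{"a": 1, "b": 2}]
```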
-from .data_profiler_metrics.data_profiler_column_profiler_report import ( +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_column_profiler_report import ( DataProfilerColumnProfileReport, ) -from .data_profiler_metrics.data_profiler_profile_diff import DataProfilerProfileDiff -from .data_profiler_metrics.data_profiler_profile_metric_provider import ( +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_diff import ( + DataProfilerProfileDiff, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( DataProfilerProfileMetricProvider, ) -from .data_profiler_metrics.data_profiler_profile_numeric_columns import ( +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_numeric_columns import ( DataProfilerProfileNumericColumns, ) -from .data_profiler_metrics.data_profiler_profile_percent_diff import ( +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_percent_diff import ( DataProfilerProfilePercentDiff, ) -from .data_profiler_metrics.data_profiler_profile_report import ( +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_report import ( DataProfilerProfileReport, ) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_table_column_infos import ( + DataProfilerTableColumnInfos, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_table_column_list import ( + DataProfilerTableColumnList, +) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/__init__.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/__init__.py index 6bf6176396a1..272308f8c6a0 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/__init__.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/__init__.py @@ -1,6 +1,24 @@ -from .data_profiler_column_profiler_report import DataProfilerColumnProfileReport -from .data_profiler_profile_diff import DataProfilerProfileDiff -from .data_profiler_profile_metric_provider import DataProfilerProfileMetricProvider -from .data_profiler_profile_numeric_columns import DataProfilerProfileNumericColumns -from .data_profiler_profile_percent_diff import DataProfilerProfilePercentDiff -from .data_profiler_profile_report import DataProfilerProfileReport +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_column_profiler_report import ( + DataProfilerColumnProfileReport, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_diff import ( + DataProfilerProfileDiff, +) +from 
contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_numeric_columns import ( + DataProfilerProfileNumericColumns, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_percent_diff import ( + DataProfilerProfilePercentDiff, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_report import ( + DataProfilerProfileReport, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_table_column_infos import ( + DataProfilerTableColumnInfos, +) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_table_column_list import ( + DataProfilerTableColumnList, +) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_column_profiler_report.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_column_profiler_report.py index e336b1732aa2..0c62e07ca189 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_column_profiler_report.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_column_profiler_report.py @@ -1,5 +1,8 @@ from typing import Optional +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) from great_expectations.core import ExpectationConfiguration from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine @@ -9,8 +12,6 @@ ) from great_expectations.validator.metric_configuration import MetricConfiguration -from .data_profiler_profile_metric_provider import DataProfilerProfileMetricProvider - class DataProfilerColumnProfileReport(DataProfilerProfileMetricProvider): metric_name = "data_profiler.column_profile_report" @@ -37,10 +38,9 @@ def _pandas( batch_columns_list=metrics["table.columns"], ) - profile_report: dict = metrics["data_profiler.profile_report"] - profile_report_column_data_stats: dict = { - element["column_name"]: element for element in profile_report["data_stats"] - } + profile_report_column_data_stats: dict = metrics[ + "data_profiler.table_column_infos" + ] return profile_report_column_data_stats[column_name] @classmethod @@ -60,26 +60,14 @@ def _get_evaluation_dependencies( table_domain_kwargs: dict = { k: v for k, v in metric.metric_domain_kwargs.items() if k != "column" } - dependencies["data_profiler.profile_report"] = MetricConfiguration( - metric_name="data_profiler.profile_report", + dependencies["data_profiler.table_column_infos"] = MetricConfiguration( + metric_name="data_profiler.table_column_infos", metric_domain_kwargs={}, metric_value_kwargs=metric.metric_value_kwargs, ) - 
dependencies["table.column_types"] = MetricConfiguration( - metric_name="table.column_types", - metric_domain_kwargs=table_domain_kwargs, - metric_value_kwargs={ - "include_nested": True, - }, - ) dependencies["table.columns"] = MetricConfiguration( metric_name="table.columns", metric_domain_kwargs=table_domain_kwargs, metric_value_kwargs=None, ) - dependencies["table.row_count"] = MetricConfiguration( - metric_name="table.row_count", - metric_domain_kwargs=table_domain_kwargs, - metric_value_kwargs=None, - ) return dependencies diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_diff.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_diff.py index b891c6ca15bb..69199ab41836 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_diff.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_diff.py @@ -1,11 +1,12 @@ import dataprofiler as dp +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.execution_engine import PandasExecutionEngine from great_expectations.expectations.metrics.metric_provider import metric_value -from .data_profiler_profile_metric_provider import DataProfilerProfileMetricProvider - class DataProfilerProfileDiff(DataProfilerProfileMetricProvider): metric_name = "data_profiler.profile_diff" diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_numeric_columns.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_numeric_columns.py index 2c01619443af..ea1cd35e7cfd 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_numeric_columns.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_numeric_columns.py @@ -1,12 +1,13 @@ from typing import Optional +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.validator.metric_configuration import MetricConfiguration -from .data_profiler_profile_metric_provider import DataProfilerProfileMetricProvider - class DataProfilerProfileNumericColumns(DataProfilerProfileMetricProvider): metric_name = "data_profiler.profile_numeric_columns" diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_percent_diff.py 
b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_percent_diff.py index 74cfa564650c..ab1065fdb0c1 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_percent_diff.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_percent_diff.py @@ -1,13 +1,14 @@ import copy from typing import Optional +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.validator.metric_configuration import MetricConfiguration -from .data_profiler_profile_metric_provider import DataProfilerProfileMetricProvider - class DataProfilerProfilePercentDiff(DataProfilerProfileMetricProvider): metric_name = "data_profiler.profile_percent_diff" diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_report.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_report.py index 82a8ea1c2c59..e2fc96bb9538 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_report.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_profile_report.py @@ -1,11 +1,12 @@ import dataprofiler as dp import great_expectations.exceptions as gx_exceptions +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) from great_expectations.execution_engine import PandasExecutionEngine from great_expectations.expectations.metrics.metric_provider import metric_value -from .data_profiler_profile_metric_provider import DataProfilerProfileMetricProvider - class DataProfilerProfileReport(DataProfilerProfileMetricProvider): metric_name = "data_profiler.profile_report" diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_infos.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_infos.py new file mode 100644 index 000000000000..310c58460338 --- /dev/null +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_infos.py @@ -0,0 +1,51 @@ +from typing import Optional + +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) +from great_expectations.core import ExpectationConfiguration +from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine 
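The file being added here defines the new `data_profiler.table_column_infos` metric, which boils the DataProfiler report down to an index of per-column statistics keyed by column name. The core of it is a single dict comprehension; a standalone sketch with made-up stats (the report structure mirrors the mock used in `conftest.py`; the `data_type` values are invented for illustration):

```python
profile_report = {
    "global_stats": {"profile_schema": {}},
    "data_stats": [
        {"column_name": "vendor_id", "data_type": "int"},
        {"column_name": "total_amount", "data_type": "float"},
    ],
}

# Same comprehension the metric's _pandas() implementation uses.
profile_report_column_data_stats = {
    element["column_name"]: element for element in profile_report["data_stats"]
}
assert profile_report_column_data_stats["total_amount"]["data_type"] == "float"
```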
+from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.validator.metric_configuration import MetricConfiguration + + +class DataProfilerTableColumnInfos(DataProfilerProfileMetricProvider): + metric_name = "data_profiler.table_column_infos" + + value_keys = ("profile_path",) + + @metric_value(engine=PandasExecutionEngine) + def _pandas( + cls, + execution_engine, + metric_domain_kwargs, + metric_value_kwargs, + metrics, + runtime_configuration, + ): + profile_report: dict = metrics["data_profiler.profile_report"] + profile_report_column_data_stats: dict = { + element["column_name"]: element for element in profile_report["data_stats"] + } + return profile_report_column_data_stats + + @classmethod + def _get_evaluation_dependencies( + cls, + metric: MetricConfiguration, + configuration: Optional[ExpectationConfiguration] = None, + execution_engine: Optional[ExecutionEngine] = None, + runtime_configuration: Optional[dict] = None, + ): + dependencies: dict = super()._get_evaluation_dependencies( + metric=metric, + configuration=configuration, + execution_engine=execution_engine, + runtime_configuration=runtime_configuration, + ) + dependencies["data_profiler.profile_report"] = MetricConfiguration( + metric_name="data_profiler.profile_report", + metric_domain_kwargs={}, + metric_value_kwargs=metric.metric_value_kwargs, + ) + return dependencies diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py new file mode 100644 index 000000000000..b278c29812c5 --- /dev/null +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/metrics/data_profiler_metrics/data_profiler_table_column_list.py @@ -0,0 +1,68 @@ +from typing import List, Optional + +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics.data_profiler_metrics.data_profiler_profile_metric_provider import ( + DataProfilerProfileMetricProvider, +) +from great_expectations.core import ExpectationConfiguration +from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.expectations.metrics.util import ( + get_dbms_compatible_column_names, +) +from great_expectations.validator.metric_configuration import MetricConfiguration + + +class DataProfilerTableColumnList(DataProfilerProfileMetricProvider): + metric_name = "data_profiler.table_column_list" + + value_keys = ("profile_path",) + + @metric_value(engine=PandasExecutionEngine) + def _pandas( + cls, + execution_engine, + metric_domain_kwargs, + metric_value_kwargs, + metrics, + runtime_configuration, + ): + profile_report_column_data_stats: dict = metrics[ + "data_profiler.table_column_infos" + ] + profile_report_column_names: List[str] = list( + profile_report_column_data_stats.keys() + ) + profile_report_column_names = get_dbms_compatible_column_names( + column_names=profile_report_column_names, + batch_columns_list=metrics["table.columns"], + ) + return profile_report_column_names + + @classmethod + def _get_evaluation_dependencies( + cls, + metric: MetricConfiguration, + configuration: Optional[ExpectationConfiguration] = None, + execution_engine: Optional[ExecutionEngine] = None, + 
runtime_configuration: Optional[dict] = None, + ): + dependencies: dict = super()._get_evaluation_dependencies( + metric=metric, + configuration=configuration, + execution_engine=execution_engine, + runtime_configuration=runtime_configuration, + ) + table_domain_kwargs: dict = { + k: v for k, v in metric.metric_domain_kwargs.items() if k != "column" + } + dependencies["data_profiler.table_column_infos"] = MetricConfiguration( + metric_name="data_profiler.table_column_infos", + metric_domain_kwargs={}, + metric_value_kwargs=metric.metric_value_kwargs, + ) + dependencies["table.columns"] = MetricConfiguration( + metric_name="table.columns", + metric_domain_kwargs=table_domain_kwargs, + metric_value_kwargs=None, + ) + return dependencies diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py index e02b8af03e1c..817cf0401522 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py @@ -12,13 +12,6 @@ DataAssistantResult, ) from great_expectations.rule_based_profiler.domain_builder import ( - # TODO: - # """ - # Leaving example of general GreatExpectations "ColumnDomainBuilder" (commented out) for reference. - # Subject to inclusion/exclusion arguments, "ColumnDomainBuilder" emits "Domain" object for every column in table. - # """ - # ColumnDomainBuilder, - # TODO: DomainBuilder, ) from great_expectations.rule_based_profiler.expectation_configuration_builder import ( @@ -98,24 +91,6 @@ def _build_numeric_rule() -> Rule: rule. """ - # TODO: - """ - Leaving example of general GreatExpectations "ColumnDomainBuilder" (commented out) for reference. - Subject to inclusion/exclusion arguments, "ColumnDomainBuilder" emits "Domain" object for every column in table. - """ - # column_domain_builder: DomainBuilder = ColumnDomainBuilder( - # include_column_names=None, - # exclude_column_names=None, - # include_column_name_suffixes=None, - # exclude_column_name_suffixes=None, - # semantic_type_filter_module_name=None, - # semantic_type_filter_class_name=None, - # include_semantic_types=None, - # exclude_semantic_types=None, - # data_context=None, - # ) - # TODO: - # TODO: """ Subject to inclusion/exclusion arguments, "DataProfilerColumnDomainBuilder" emits "Domain" object for every column name in profiler report; GreatExpectations "table.columns" metric is used to validate column existence. @@ -134,7 +109,6 @@ def _build_numeric_rule() -> Rule: data_context=None, ) ) - # TODO: data_profiler_profile_report_metric_single_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.build_metric_single_batch_parameter_builder( metric_name="data_profiler.column_profile_report", @@ -226,16 +200,7 @@ def _build_numeric_rule() -> Rule: rule = Rule( name="numeric_rule", variables=variables, - # TODO: - # """ - # Leaving example of general GreatExpectations "ColumnDomainBuilder" (commented out) for reference. - # Subject to inclusion/exclusion arguments, "ColumnDomainBuilder" emits "Domain" object for every column in table. 
- # """ - # domain_builder=column_domain_builder, - # TODO: - # TODO: domain_builder=data_profiler_column_domain_builder, - # TODO: parameter_builders=parameter_builders, expectation_configuration_builders=expectation_configuration_builders, ) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py index 2b3a1c51424e..989d27f50b39 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py @@ -115,9 +115,9 @@ def _get_domains( parameters=None, ) - profile_report: dict = validator.get_metric( # type: ignore[assignment] # could be None + profile_report_column_names: List[str] = validator.get_metric( # type: ignore[assignment] # could be None metric=MetricConfiguration( - metric_name="data_profiler.profile_report", + metric_name="data_profiler.table_column_list", metric_domain_kwargs={}, metric_value_kwargs={ "profile_path": profile_path, @@ -125,10 +125,6 @@ def _get_domains( ) ) - profile_report_column_names: List[str] = [ - element["column_name"] for element in profile_report["data_stats"] - ] - if not (profile_report_column_names and table_column_names): raise gx_exceptions.ProfilerExecutionError( message=f"Error: List of available table columns in {self.__class__.__name__} must not be empty." diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py index 9c6a0261be68..73cb910c897f 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/conftest.py @@ -1,27 +1,20 @@ +from __future__ import annotations + import os import shutil import sys import pytest -from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import ( # registers these metrics - DataProfilerColumnProfileReport, - DataProfilerProfileReport, -) from great_expectations import get_context from great_expectations.data_context import FileDataContext from great_expectations.data_context.types.base import AnonymizedUsageStatisticsConfig from great_expectations.data_context.util import file_relative_path from great_expectations.self_check.util import build_test_backends_list -from tests.conftest import ( - set_consistent_seed_within_numeric_metric_range_multi_batch_parameter_builder, # implicitly used fixture +from tests.conftest import ( # noqa: F401,F403,F811 # registers implicitly used fixture and prevents removal of "unused" import + set_consistent_seed_within_numeric_metric_range_multi_batch_parameter_builder, ) -_ = DataProfilerProfileReport # prevents removal of "unused" import -_ = DataProfilerColumnProfileReport # prevents removal of "unused" import - -_ = set_consistent_seed_within_numeric_metric_range_multi_batch_parameter_builder # prevents removal of "unused" import - sys.path.insert(0, os.path.abspath("../..")) # noqa: PTH100 test_root_path: str 
= os.path.dirname( # noqa: PTH120 @@ -29,6 +22,39 @@ ) +class BaseProfiler: + """ + This class should ideally be named "MockBaseProfiler"; however, it has to be called "BaseProfiler", because its + "load()" method returns "BaseProfiler" type, which is type of class itself (using "fluent" programming style). + """ + + # noinspection PyMethodMayBeStatic,PyMethodParameters + def load(cls, filepath: str) -> BaseProfiler: + return cls + + # noinspection PyMethodMayBeStatic + def report(self, report_options: dict = None) -> dict: + return { + "global_stats": { + "profile_schema": {}, + }, + "data_stats": [ + { + "column_name": "vendor_id", + }, + { + "column_name": "passenger_count", + }, + { + "column_name": "total_amount", + }, + { + "column_name": "congestion_surcharge", + }, + ], + } + + def pytest_addoption(parser): # note: --no-spark will be deprecated in favor of --spark parser.addoption( @@ -128,9 +154,14 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("test_backends", [test_backends], scope="module") +@pytest.fixture(scope="function") +def mock_base_data_profiler() -> BaseProfiler: + return BaseProfiler() + + @pytest.fixture(scope="function") def bobby_columnar_table_multi_batch_deterministic_data_context( - set_consistent_seed_within_numeric_metric_range_multi_batch_parameter_builder, + set_consistent_seed_within_numeric_metric_range_multi_batch_parameter_builder, # noqa: F401,F403,F811 tmp_path_factory, monkeypatch, ) -> FileDataContext: diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/data_profiler_files/taxi_yellow_tripdata_samples/yellow_tripdata_5_lines_sample_2019-01.csv b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/data_profiler_files/taxi_yellow_tripdata_samples/yellow_tripdata_5_lines_sample_2019-01.csv new file mode 100644 index 000000000000..120115abc108 --- /dev/null +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/data_profiler_files/taxi_yellow_tripdata_samples/yellow_tripdata_5_lines_sample_2019-01.csv @@ -0,0 +1,6 @@ +vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge +1,2019-01-03 08:49:57,2019-01-03 09:09:10,1,7.0,1,N,236,87,1,22.0,0.0,0.5,2.0,0.0,0.3,24.8, +2,2019-01-31 11:46:01,2019-01-31 12:04:54,1,0.66,1,N,230,237,1,12.0,0.0,0.5,2.56,0.0,0.3,15.36,0.0 +2,2019-01-22 17:13:18,2019-01-22 17:26:40,1,2.44,1,N,166,236,1,11.0,1.0,0.5,2.56,0.0,0.3,15.36,0.0 +2,2019-01-04 09:08:34,2019-01-04 09:18:30,1,0.86,1,N,186,170,1,7.5,0.0,0.5,1.66,0.0,0.3,9.96, +2,2019-01-30 09:44:48,2019-01-30 09:59:07,1,1.58,1,N,142,237,2,10.0,0.0,0.5,0.0,0.0,0.3,10.8,0.0 diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/expectations/metrics/test_core.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/expectations/metrics/test_core.py index 4f75a2a55528..e2fdd4d705ec 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/expectations/metrics/test_core.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/expectations/metrics/test_core.py @@ -1,18 +1,21 @@ +from __future__ import annotations + import os +from typing import TYPE_CHECKING +from unittest import 
mock import dataprofiler as dp import pandas as pd -from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import ( # registers these metrics - DataProfilerColumnProfileReport, - DataProfilerProfileReport, -) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import * # noqa: F401,F403 from great_expectations.self_check.util import build_pandas_engine from great_expectations.validator.metric_configuration import MetricConfiguration from tests.expectations.test_util import get_table_columns_metric -_ = DataProfilerColumnProfileReport # prevents removal of "unused" import -_ = DataProfilerProfileReport # prevents removal of "unused" import +if TYPE_CHECKING: + from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.tests.conftest import ( + BaseProfiler, + ) test_root_path: str = os.path.dirname( # noqa: PTH120 @@ -21,6 +24,9 @@ def test_data_profiler_column_profile_report_metric_pd(): + """ + This test verifies that "data_profiler.column_profile_report" metric returns correct column statistics (from Profile report). + """ engine = build_pandas_engine( pd.DataFrame( { @@ -64,39 +70,202 @@ def test_data_profiler_column_profile_report_metric_pd(): ) metrics.update(results) + data_profiler_table_column_infos_metric: MetricConfiguration = MetricConfiguration( + metric_name="data_profiler.table_column_infos", + metric_domain_kwargs={}, + metric_value_kwargs=None, + ) + data_profiler_table_column_infos_metric.metric_dependencies = { + "table.columns": table_columns_metric, + "data_profiler.profile_report": data_profiler_profile_report_metric, + } + results = engine.resolve_metrics( + metrics_to_resolve=(data_profiler_table_column_infos_metric,), metrics=metrics + ) + metrics.update(results) + desired_metric = MetricConfiguration( metric_name="data_profiler.column_profile_report", metric_domain_kwargs={"column": "vendor_id"}, - metric_value_kwargs={ - "profile_path": profile_path, - }, + metric_value_kwargs=None, ) desired_metric.metric_dependencies = { "table.columns": table_columns_metric, - "data_profiler.profile_report": data_profiler_profile_report_metric, + "data_profiler.table_column_infos": data_profiler_table_column_infos_metric, } results = engine.resolve_metrics( metrics_to_resolve=(desired_metric,), metrics=metrics ) metrics.update(results) - profile = dp.Profiler.load(profile_path) + + profile: dp.profilers.profile_builder.BaseProfiler = dp.Profiler.load(profile_path) assert ( - results[desired_metric.id]["column_name"] + metrics[desired_metric.id]["column_name"] == profile.report()["data_stats"][0]["column_name"] ) assert ( - results[desired_metric.id]["data_type"] + metrics[desired_metric.id]["data_type"] == profile.report()["data_stats"][0]["data_type"] ) assert ( - results[desired_metric.id]["categorical"] + metrics[desired_metric.id]["categorical"] == profile.report()["data_stats"][0]["categorical"] ) assert ( - results[desired_metric.id]["order"] + metrics[desired_metric.id]["order"] == profile.report()["data_stats"][0]["order"] ) assert ( - results[desired_metric.id]["samples"] + metrics[desired_metric.id]["samples"] == profile.report()["data_stats"][0]["samples"] ) + + +def test_data_profiler_column_list_metric_same_as_in_batch_table_pd(): + """ + This test verifies that "data_profiler.table_column_list" metric returns correct column list (same as in Batch). 
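The test above resolves the new metrics by hand, which makes the dependency chain introduced in this patch explicit. Summarized as a comment-only sketch (all names come from the patch itself):

```python
# data_profiler.profile_report            -- loads "profile.pkl" via dp.Profiler.load()
#   -> data_profiler.table_column_infos   -- indexes report["data_stats"] by column_name
#        -> data_profiler.table_column_list     -- list of profiled column names
#        -> data_profiler.column_profile_report -- stats dict for one requested column
#
# "table.columns" (the Batch's own columns) is a sibling dependency, used to make the
# profiled column names DBMS-compatible before they are returned.
```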
+ """ + csv_file_path: str = os.path.join( # noqa: PTH118 + test_root_path, + "data_profiler_files", + "taxi_yellow_tripdata_samples", + "yellow_tripdata_5_lines_sample_2019-01.csv", + ) + engine = build_pandas_engine(pd.read_csv(filepath_or_buffer=csv_file_path)) + + profile_path = os.path.join( # noqa: PTH118 + test_root_path, + "data_profiler_files", + "profile.pkl", + ) + + metrics: dict = {} + + table_columns_metric: MetricConfiguration + results: dict + + table_columns_metric, results = get_table_columns_metric(engine=engine) + metrics.update(results) + + data_profiler_profile_report_metric: MetricConfiguration = MetricConfiguration( + metric_name="data_profiler.profile_report", + metric_domain_kwargs={}, + metric_value_kwargs={ + "profile_path": profile_path, + }, + ) + results = engine.resolve_metrics( + metrics_to_resolve=(data_profiler_profile_report_metric,), metrics=metrics + ) + metrics.update(results) + + data_profiler_table_column_infos_metric: MetricConfiguration = MetricConfiguration( + metric_name="data_profiler.table_column_infos", + metric_domain_kwargs={}, + metric_value_kwargs=None, + ) + data_profiler_table_column_infos_metric.metric_dependencies = { + "data_profiler.profile_report": data_profiler_profile_report_metric, + } + results = engine.resolve_metrics( + metrics_to_resolve=(data_profiler_table_column_infos_metric,), metrics=metrics + ) + metrics.update(results) + + desired_metric = MetricConfiguration( + metric_name="data_profiler.table_column_list", + metric_domain_kwargs={}, + metric_value_kwargs=None, + ) + desired_metric.metric_dependencies = { + "table.columns": table_columns_metric, + "data_profiler.table_column_infos": data_profiler_table_column_infos_metric, + } + results = engine.resolve_metrics( + metrics_to_resolve=(desired_metric,), metrics=metrics + ) + metrics.update(results) + + assert metrics[desired_metric.id] == metrics[table_columns_metric.id] + + +def test_data_profiler_column_list_metric_same_as_profile_report_pd( + mock_base_data_profiler: BaseProfiler, +): + """ + This test verifies that "data_profiler.table_column_list" metric returns correct column list (as in Profile report). 
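The test continuing below never touches a real `profile.pkl`: it patches `dataprofiler.profilers.profile_builder.BaseProfiler.load` to return the `mock_base_data_profiler` fixture instead. The same stubbing pattern, as a minimal self-contained sketch (the `FakeProfiler` class and its single-column report are invented for the example):

```python
from unittest import mock

from dataprofiler.profilers.profile_builder import BaseProfiler


class FakeProfiler:
    """Stand-in returned instead of a profile loaded from disk."""

    def report(self, report_options: dict = None) -> dict:
        return {"data_stats": [{"column_name": "vendor_id"}]}


# Any code path that loads a profile through BaseProfiler.load() now receives the
# stub, so no pickled profile has to exist on disk.
with mock.patch(
    "dataprofiler.profilers.profile_builder.BaseProfiler.load",
    return_value=FakeProfiler(),
):
    profile = BaseProfiler.load("ignored.pkl")
    assert profile.report()["data_stats"][0]["column_name"] == "vendor_id"
```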
+ """ + csv_file_path: str = os.path.join( # noqa: PTH118 + test_root_path, + "data_profiler_files", + "taxi_yellow_tripdata_samples", + "yellow_tripdata_5_lines_sample_2019-01.csv", + ) + engine = build_pandas_engine(pd.read_csv(filepath_or_buffer=csv_file_path)) + + profile_path = os.path.join( # noqa: PTH118 + test_root_path, + "data_profiler_files", + "profile.pkl", + ) + + metrics: dict = {} + + table_columns_metric: MetricConfiguration + results: dict + + table_columns_metric, results = get_table_columns_metric(engine=engine) + metrics.update(results) + + with mock.patch( + "dataprofiler.profilers.profile_builder.BaseProfiler.load", + return_value=mock_base_data_profiler, + ): + data_profiler_profile_report_metric: MetricConfiguration = MetricConfiguration( + metric_name="data_profiler.profile_report", + metric_domain_kwargs={}, + metric_value_kwargs={ + "profile_path": profile_path, + }, + ) + + results = engine.resolve_metrics( + metrics_to_resolve=(data_profiler_profile_report_metric,), metrics=metrics + ) + metrics.update(results) + + data_profiler_table_column_infos_metric: MetricConfiguration = ( + MetricConfiguration( + metric_name="data_profiler.table_column_infos", + metric_domain_kwargs={}, + metric_value_kwargs=None, + ) + ) + data_profiler_table_column_infos_metric.metric_dependencies = { + "data_profiler.profile_report": data_profiler_profile_report_metric, + } + results = engine.resolve_metrics( + metrics_to_resolve=(data_profiler_table_column_infos_metric,), + metrics=metrics, + ) + metrics.update(results) + + desired_metric = MetricConfiguration( + metric_name="data_profiler.table_column_list", + metric_domain_kwargs={}, + metric_value_kwargs=None, + ) + desired_metric.metric_dependencies = { + "table.columns": table_columns_metric, + "data_profiler.table_column_infos": data_profiler_table_column_infos_metric, + } + results = engine.resolve_metrics( + metrics_to_resolve=(desired_metric,), metrics=metrics + ) + metrics.update(results) + + element: dict + assert metrics[desired_metric.id] == [ + element["column_name"] + for element in mock_base_data_profiler.report()["data_stats"] + ] diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py index 63f3f9f790c3..a8f3bb466e4a 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/data_assistant/test_data_profiler_structured_data_assistant.py @@ -7,11 +7,8 @@ import pytest -from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import ( # registers these metrics - DataProfilerColumnProfileReport, - DataProfilerProfileReport, -) -from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.rule_based_profiler.data_assistant.data_profiler_structured_data_assistant import ( # registers this DataAssistant +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import * # noqa: F401,F403 +from 
contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.rule_based_profiler.data_assistant.data_profiler_structured_data_assistant import ( # noqa: F401,F403 # registers this DataAssistant and prevents removal of "unused" import DataProfilerStructuredDataAssistant, ) from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.rule_based_profiler.data_assistant_result import ( @@ -33,11 +30,6 @@ from great_expectations.data_context import FileDataContext -_ = DataProfilerColumnProfileReport # prevents removal of "unused" import -_ = DataProfilerProfileReport # prevents removal of "unused" import -_ = DataProfilerStructuredDataAssistant # prevents removal of "unused" import - - test_root_path: str = os.path.dirname( # noqa: PTH120 os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # noqa: PTH120 ) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py index d8d618d0fa9a..ac07d32b0066 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py @@ -1,14 +1,12 @@ from __future__ import annotations import os -from typing import List +from typing import TYPE_CHECKING, List from unittest import mock import pytest -from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import ( # registers this metric - DataProfilerProfileReport, -) +from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.metrics import * # noqa: F401,F403 from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.rule_based_profiler.domain_builder.data_profiler_column_domain_builder import ( DataProfilerColumnDomainBuilder, ) @@ -26,19 +24,26 @@ build_parameter_container_for_variables, ) +if TYPE_CHECKING: + from contrib.capitalone_dataprofiler_expectations.capitalone_dataprofiler_expectations.tests.conftest import ( + BaseProfiler, + ) + + test_root_path: str = os.path.dirname( # noqa: PTH120 os.path.dirname(os.path.dirname(os.path.realpath(__file__))) # noqa: PTH120 ) -_ = DataProfilerProfileReport # prevents removal of "unused" import - - @pytest.mark.integration @pytest.mark.slow # 1.21s def test_data_profiler_column_domain_builder_with_profile_path_as_value( bobby_columnar_table_multi_batch_deterministic_data_context: FileDataContext, ): + """ + This test verifies that "Domain" objects corresponding to full list of columns in Profiler Report (same as in Batch) + are emitted when path to "profile.pkl" is specified explicitly. 
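Throughout these test modules, the named metric imports are replaced by `from ...metrics import *  # noqa: F401,F403`. The import looks unused, but it is what causes the metric classes to be registered; the `noqa` comment just stops linters from deleting it. A toy sketch of that register-on-import idea (the decorator-based registry below is a simplified stand-in, not Great Expectations' actual registration machinery):

```python
# All in one file for illustration; in practice the registry, the metric module,
# and the test module are separate.
METRIC_REGISTRY: dict = {}


def register(cls):
    """Simplified stand-in for what happens when a metric provider class is defined."""
    METRIC_REGISTRY[cls.metric_name] = cls
    return cls


@register
class DataProfilerTableColumnList:
    metric_name = "data_profiler.table_column_list"


# A test module would do `from ...metrics import *  # noqa: F401,F403`; without the
# import the class body above never runs and nothing lands in the registry.
assert "data_profiler.table_column_list" in METRIC_REGISTRY
```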
+ """ data_context: FileDataContext = ( bobby_columnar_table_multi_batch_deterministic_data_context ) @@ -301,6 +306,10 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value( def test_data_profiler_column_domain_builder_with_profile_path_as_reference( bobby_columnar_table_multi_batch_deterministic_data_context: FileDataContext, ): + """ + This test verifies that "Domain" objects corresponding to full list of columns in Profiler Report (same as in Batch) + are emitted when path to "profile.pkl" is specified as Rule variable (implicitly). + """ data_context: FileDataContext = ( bobby_columnar_table_multi_batch_deterministic_data_context ) @@ -564,6 +573,10 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference( def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with_exclude_column_names_with_exclude_column_name_suffixes( bobby_columnar_table_multi_batch_deterministic_data_context: FileDataContext, ): + """ + This test verifies that "Domain" objects corresponding to partial list of columns in Profiler Report under exclusion + directives (as subset of columns in Batch) are emitted when path to "profile.pkl" is specified as Rule variable (implicitly). + """ data_context: FileDataContext = ( bobby_columnar_table_multi_batch_deterministic_data_context ) @@ -761,39 +774,12 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with @pytest.mark.slow # 1.21s def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with_partial_column_list_in_profiler_report( bobby_columnar_table_multi_batch_deterministic_data_context: FileDataContext, + mock_base_data_profiler: BaseProfiler, ): - class BaseProfiler: - """ - This class should ideally be named "MockBaseProfiler"; however, it has to be called "BaseProfiler", because its - "load()" method returns "BaseProfiler" type, which is type of class itself (using "fluent" programming style). - """ - - # noinspection PyMethodMayBeStatic,PyMethodParameters - def load(cls, filepath: str) -> BaseProfiler: - return cls - - # noinspection PyMethodMayBeStatic - def report(self, report_options: dict = None) -> dict: - return { - "global_stats": { - "profile_schema": {}, - }, - "data_stats": [ - { - "column_name": "vendor_id", - }, - { - "column_name": "passenger_count", - }, - { - "column_name": "total_amount", - }, - { - "column_name": "congestion_surcharge", - }, - ], - } - + """ + This test verifies that "Domain" objects corresponding to partial list of columns in Profiler Report (as subset of + columns in Batch) are emitted when path to "profile.pkl" is specified as Rule variable (implicitly). 
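The exclusion test above exercises `DataProfilerColumnDomainBuilder` with `exclude_column_names` and `exclude_column_name_suffixes`. The net effect of those directives on a column list is roughly the following (a sketch of the concept only, using column names from the sample CSV; it is not the builder's actual implementation):

```python
columns = [
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "total_amount",
]
exclude_column_names = ["vendor_id"]
exclude_column_name_suffixes = ["_datetime"]

included = [
    name
    for name in columns
    if name not in exclude_column_names
    and not any(name.endswith(suffix) for suffix in exclude_column_name_suffixes)
]
assert included == ["passenger_count", "total_amount"]
```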
+ """ data_context: FileDataContext = ( bobby_columnar_table_multi_batch_deterministic_data_context ) @@ -827,7 +813,7 @@ def report(self, report_options: dict = None) -> dict: ) with mock.patch( "dataprofiler.profilers.profile_builder.BaseProfiler.load", - return_value=BaseProfiler(), + return_value=mock_base_data_profiler, ): domains: List[Domain] = domain_builder.get_domains( rule_name="my_rule", diff --git a/great_expectations/util.py b/great_expectations/util.py index 1efe8bca0dc5..a3d523aa8b4c 100644 --- a/great_expectations/util.py +++ b/great_expectations/util.py @@ -52,7 +52,6 @@ import pandas as pd from dateutil.parser import parse from packaging import version -from pkg_resources import Distribution from typing_extensions import Literal, TypeGuard import great_expectations.exceptions as gx_exceptions @@ -97,6 +96,7 @@ if TYPE_CHECKING: # needed until numpy min version 1.20 import numpy.typing as npt + from pkg_resources import Distribution from great_expectations.alias_types import PathStr from great_expectations.data_context import FileDataContext From d5a8dd4d42d2da88f220329e20fe76911bc7539a Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Fri, 31 Mar 2023 07:39:09 -0500 Subject: [PATCH 03/96] [DOCS] doc 508 Updates footer links on docs pages (#7521) --- docs/docusaurus/docusaurus.config.js | 40 +++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/docs/docusaurus/docusaurus.config.js b/docs/docusaurus/docusaurus.config.js index a00e02047b37..591cb413752b 100644 --- a/docs/docusaurus/docusaurus.config.js +++ b/docs/docusaurus/docusaurus.config.js @@ -19,9 +19,9 @@ module.exports = { [ require.resolve('docusaurus-gtm-plugin'), { - id: 'GTM-K63L45F', // GTM Container ID + id: 'GTM-K63L45F' // GTM Container ID } - ], + ] ], themeConfig: { @@ -36,7 +36,7 @@ module.exports = { searchPagePath: 'search', // schedule is in UTC - schedule: 'every 1 day at 5:00 pm', + schedule: 'every 1 day at 5:00 pm' // Optional: see doc section below // contextualSearch: true, @@ -68,7 +68,7 @@ module.exports = { // textColor: '#ffffff', // Defaults to `#000`. // isCloseable: false // Defaults to `true`. // }, - image:'img/gx-preview.png', + image: 'img/gx-preview.png', navbar: { logo: { alt: 'Great Expectations', @@ -89,7 +89,7 @@ module.exports = { label: '0.13.x and earlier' } ], - dropdownActiveClassDisabled: true, + dropdownActiveClassDisabled: true }, { label: 'Product', @@ -182,8 +182,8 @@ module.exports = { alt: 'Great Expectations', src: 'img/gx-logo-dark.svg', href: 'https://greatexpectations.io', - width: "100%", - height: "auto", + width: '100%', + height: 'auto' }, links: [ { @@ -204,23 +204,27 @@ module.exports = { items: [ { label: 'Careers', - to: 'https://jobs.superconductive.com/' + to: 'https://jobs.greatexpectations.io/' }, - { + { label: 'DPA', - to: 'https://greatexpectations.io/dpa' + to: 'https://greatexpectations.io/pdf/dpa' }, - { - label: 'Terms of Service', - to: 'https://greatexpectations.io/terms' + { + label: 'Master Subscription Agreement', + to: 'https://greatexpectations.io/pdf/msa' }, + { + label: 'Privacy Policy', + to: 'https://greatexpectations.io/privacy-policy' + } ] }, { title: 'Check Us Out', items: [ { - html: ` + html: ` @@ -230,13 +234,13 @@ module.exports = { - + - `, - }, + ` + } ] - }, + } ], copyright: `Copyright © ${new Date().getFullYear()} Great Expectations. 
All Rights Reserved.` } From 970e60dc24446bf1fbdc0f24f40baacba2daf588 Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Fri, 31 Mar 2023 10:00:36 -0500 Subject: [PATCH 04/96] [DOCS] DSB-64 removes outdated v2/v3 references from the docs (#7519) --- docs/docusaurus/docs/contributing/contributing_test.md | 2 +- .../how_to_use_great_expectations_with_airflow.md | 4 ++-- ...ctations_with_google_cloud_platform_and_bigquery.md | 9 --------- ...orking_with_a_single_or_multiple_batches_of_data.md | 2 +- .../how_to_choose_which_dataconnector_to_use.md | 2 +- .../how_to_configure_a_runtimedataconnector.md | 4 ++-- ...igure_an_expectation_store_in_azure_blob_storage.md | 4 ---- .../_preface.mdx | 6 ------ .../_steps_for_checkpoints_.mdx | 2 +- ...onfigure_a_new_checkpoint_using_test_yaml_config.md | 4 ++-- .../docs/integrations/integration_datahub.md | 5 +---- .../reference/expectations/implemented_expectations.md | 2 +- docs/docusaurus/docs/terms/data_connector.md | 8 -------- docs_rtd/_templates/layout.html | 10 +++++----- 14 files changed, 17 insertions(+), 47 deletions(-) diff --git a/docs/docusaurus/docs/contributing/contributing_test.md b/docs/docusaurus/docs/contributing/contributing_test.md index 70617c7687ad..574bb767b06f 100644 --- a/docs/docusaurus/docs/contributing/contributing_test.md +++ b/docs/docusaurus/docs/contributing/contributing_test.md @@ -144,7 +144,7 @@ The test fixture files are stored in subdirectories of `tests/test_definitions/` * multicolumn_map_expectations * other_expectations -By convention, the name of the the file is the name of the Expectation, with a .json suffix. Creating a new json file will automatically add the new Expectation tests to the test suite. +By convention, the name of the file is the name of the Expectation, with a .json suffix. Creating a new json file will automatically add the new Expectation tests to the test suite. Note: If you are implementing a new Expectation, but don’t plan to immediately implement it for all execution environments, you should add the new test to the appropriate list(s) in the `candidate_test_is_on_temporary_notimplemented_list_v2_api` method within `tests/test_utils.py`. Often, we see Expectations developed first for pandas, then later extended to SqlAlchemy and Spark. diff --git a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md index 9ac42b01dc1d..5aa178961177 100644 --- a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md +++ b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md @@ -32,9 +32,9 @@ To import the GreatExpectationsOperator in your Airflow project, run the followi pip install airflow-provider-great-expectations==0.1.1 ``` -It’s recommended to specify a version when installing the package. To make use of the latest Great Expectations V3 API, you need to specify a version >= `0.1.0`. +It’s recommended to specify a version when installing the package. To make use of the latest Great Expectations provider for Airflow, you need to specify a version >= `0.1.0`. -> *The Great Expectations V3 API requires Airflow 2.1+. If you're still running Airflow 1.x, you need to upgrade to at least 2.1 before using v0.1.0+ of the GreatExpectationsOperator.* +> *The current Great Expectations release requires Airflow 2.1+. 
If you're still running Airflow 1.x, you need to upgrade to at least 2.1 before using v0.1.0+ of the GreatExpectationsOperator.* ## Using the `GreatExpectationsOperator` diff --git a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_google_cloud_platform_and_bigquery.md b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_google_cloud_platform_and_bigquery.md index 8f26880b01f6..7e73962bfcb6 100644 --- a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_google_cloud_platform_and_bigquery.md +++ b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_google_cloud_platform_and_bigquery.md @@ -52,15 +52,6 @@ Relevant documentation for the components can also be found here: - [How to host and share Data Docs on GCS](../guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_gcs.md) - Optionally, you can also use a [Secret Manager for GCP Credentials](../guides/setup/configuring_data_contexts/how_to_configure_credentials.md) -:::note Note on V3 Expectations for BigQuery - - A small number of V3 Expectations have not been migrated to BigQuery, and will be very soon. These include: - - - `expect_column_quantile_values_to_be_between` - - `expect_column_kl_divergence_to_be_less_than` - -::: - ## Part 1: Local Configuration of Great Expectations that connects to Google Cloud Platform ### 1. If necessary, upgrade your Great Expectations version diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_between_working_with_a_single_or_multiple_batches_of_data.md b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_between_working_with_a_single_or_multiple_batches_of_data.md index 8b19d8dc6abc..a22ed73688a9 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_between_working_with_a_single_or_multiple_batches_of_data.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_between_working_with_a_single_or_multiple_batches_of_data.md @@ -12,7 +12,7 @@ By the end of this guide, you will know when it will be most beneficial to be wo -- [Understand the basics of Datasources in the V3 (Batch Request) API](../../terms/datasource.md) +- [Understand the basics of Datasources](../../terms/datasource.md) - [Understand the basics of requesting data with a Batch Requests](./how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md index e3deec76a136..9c217f883d9d 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md @@ -10,7 +10,7 @@ This guide demonstrates how to choose which -- [Understand the basics of Datasources in the V3 (Batch Request) API](../../terms/datasource.md) +- [Understand the basics of Datasources](../../terms/datasource.md) - Learned how to configure a [Data Context using test_yaml_config](../setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_configure_a_runtimedataconnector.md b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_configure_a_runtimedataconnector.md index 1cfc513664dd..f28c3f73d453 100644 --- 
a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_configure_a_runtimedataconnector.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_configure_a_runtimedataconnector.md @@ -6,11 +6,11 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -This guide demonstrates how to configure a RuntimeDataConnector and only applies to the V3 (Batch Request) API. A `RuntimeDataConnector` allows you to specify a using a Runtime , which is used to create a Validator. A is the key object used to create and datasets. +This guide demonstrates how to configure a RuntimeDataConnector. A `RuntimeDataConnector` allows you to specify a using a Runtime , which is used to create a Validator. A is the key object used to create and datasets. -- [Understand the basics of Datasources in the V3 (Batch Request) API](../../terms/datasource.md) +- [Understand the basics of Datasources](../../terms/datasource.md) - Learned how to configure a [Data Context using test_yaml_config](../setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md) diff --git a/docs/docusaurus/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md b/docs/docusaurus/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md index 9b32be024113..785e87f9f653 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md +++ b/docs/docusaurus/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md @@ -117,8 +117,4 @@ If you followed Step 4, the output should include the Expectation we copied to A ```bash great_expectations suite list - -Using v2 (Batch Kwargs) API -1 Expectation Suite found: -- exp1 ``` diff --git a/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_preface.mdx b/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_preface.mdx index 266b4e6cabc1..29d3fb8cf985 100644 --- a/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_preface.mdx +++ b/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_preface.mdx @@ -2,9 +2,3 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; This guide will help you create a new , which allows you to couple an with a data set to . - -:::note - -As of Great Expectations version 0.13.7, we have updated and improved the Checkpoints feature. You can continue to use your existing legacy Checkpoint workflows if you’re working with concepts from the Batch Kwargs (v2) API. If you’re using concepts from the BatchRequest (v3) API, please refer to the new Checkpoints guides. 
- -::: \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx b/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx index 78c9620c23b5..f0a691560182 100644 --- a/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx +++ b/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx @@ -4,7 +4,7 @@ This how-to guide assumes you have already: * [Set up a working deployment of Great Expectations](../../../../tutorials/getting_started/tutorial_overview.md) -* [Configured a Datasource using the BatchRequest (v3) API](../../../../tutorials/getting_started/tutorial_connect_to_data.md) +* [Configured a Datasource](../../../../tutorials/getting_started/tutorial_connect_to_data.md) * [Created an Expectation Suite](../../../../tutorials/getting_started/tutorial_create_expectations.md) ::: diff --git a/docs/docusaurus/docs/guides/validation/checkpoints/how_to_configure_a_new_checkpoint_using_test_yaml_config.md b/docs/docusaurus/docs/guides/validation/checkpoints/how_to_configure_a_new_checkpoint_using_test_yaml_config.md index 3e0b7cce7fb5..462c95a785ce 100644 --- a/docs/docusaurus/docs/guides/validation/checkpoints/how_to_configure_a_new_checkpoint_using_test_yaml_config.md +++ b/docs/docusaurus/docs/guides/validation/checkpoints/how_to_configure_a_new_checkpoint_using_test_yaml_config.md @@ -11,7 +11,7 @@ This how-to guide demonstrates advanced examples for configuring a - [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) -- [Configured a Datasource using the v3 API](../../../tutorials/getting_started/tutorial_connect_to_data.md) +- [Configured a Datasource](../../../tutorials/getting_started/tutorial_connect_to_data.md) - [Created an Expectation Suite](../../../tutorials/getting_started/tutorial_create_expectations.md) @@ -57,7 +57,7 @@ From here you can continue to edit your Checkpoint. After each change you should ### 5. Save your edited Checkpoint -Once you have made all of the changes you planned to implement and your last `test_yaml_config()` check passed, you are ready to save the Checkpoint you've created. At this point, run the remaining cells in your Jupyter Notebook. +Once you have made all the changes you planned to implement and your last `test_yaml_config()` check passed, you are ready to save the Checkpoint you've created. At this point, run the remaining cells in your Jupyter Notebook. Your checkpoint will be saved by the cell that contains the command: diff --git a/docs/docusaurus/docs/integrations/integration_datahub.md b/docs/docusaurus/docs/integrations/integration_datahub.md index dbbe94f167b9..38d13677028c 100644 --- a/docs/docusaurus/docs/integrations/integration_datahub.md +++ b/docs/docusaurus/docs/integrations/integration_datahub.md @@ -93,12 +93,9 @@ The Validation Results would show up in Validation tab on Dataset page in DataHu ## Further discussion ### Things to consider -Currently this integration only supports v3 API Datasources using `SqlAlchemyExecutionEngine`. 
- This integration does not support -- v2 Datasources such as `SqlAlchemyDataset` -- v3 Datasources using an Execution Engine other than `SqlAlchemyExecutionEngine` (Spark, Pandas) +- Datasources using an Execution Engine other than `SqlAlchemyExecutionEngine` (Spark, Pandas) - Cross-dataset Expectations (those involving > 1 table) ### When things don't work diff --git a/docs/docusaurus/docs/reference/expectations/implemented_expectations.md b/docs/docusaurus/docs/reference/expectations/implemented_expectations.md index 09cd5014d493..3b5f8494ad5d 100644 --- a/docs/docusaurus/docs/reference/expectations/implemented_expectations.md +++ b/docs/docusaurus/docs/reference/expectations/implemented_expectations.md @@ -62,4 +62,4 @@ out the missing implementations! |`expect_column_pair_cramers_phi_value_to_be_less_than` * | Y | N | N | |`expect_multicolumn_sum_to_equal` | Y | Y | Y | -`*` This Expectation has not yet been migrated to the v3 (Batch Request) API. +`*` This Expectation has not yet been migrated versions above GX v 0.13.x. diff --git a/docs/docusaurus/docs/terms/data_connector.md b/docs/docusaurus/docs/terms/data_connector.md index 3c663d234ee8..a3db9773769c 100644 --- a/docs/docusaurus/docs/terms/data_connector.md +++ b/docs/docusaurus/docs/terms/data_connector.md @@ -149,14 +149,6 @@ A **Batch Spec** is an Execution Engine-specific description of the Batch define A Data Connector is responsible for working with an Execution Engine to translate Batch Definitions into a Batch Spec that enables Great Expectations to access the data using that Execution Engine. - - -## API basics - -:::info API note -In the updated V3 Great Expectations API, Data Connectors replace the Batch Kwargs Generators from the V2 Great Expectations API. -::: - ### How to access Other than specifying a Data Connector when you configure a Datasource, you will not need to directly interact with one. Great Expectations will handle using them behind the scenes. diff --git a/docs_rtd/_templates/layout.html b/docs_rtd/_templates/layout.html index ad40f10ec304..33020c17d1ee 100644 --- a/docs_rtd/_templates/layout.html +++ b/docs_rtd/_templates/layout.html @@ -211,7 +211,7 @@ INTEGRATIONS - +
  • DOCUMENTATIONCAREERS -
  • + @@ -254,14 +254,14 @@ width: 100%; " > - NOTE: This is Legacy V2 documentation. + NOTE: This is a legacy site for documentation from Great Expectations version 0.13.0 and earlier. - See New Documentation + See the new documentation - for V3 and newer + for the more recent and current versions of GX. From a0d88d83b5497cf1b1f436c971f7bec2e1dbf884 Mon Sep 17 00:00:00 2001 From: Don Heppner Date: Fri, 31 Mar 2023 11:46:50 -0400 Subject: [PATCH 05/96] [DOCS] Update CODEOWNERS (#7528) --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 48d65c9ac563..1f1fee52b8fb 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,5 @@ # Each line is a file pattern followed by one or more owners. -sidebars.js @donaldheppner @Rachel-Reverie +sidebars.js @donaldheppner @Rachel-Reverie @tjholsman great_expectations/core/usage_statistics/schemas.py @tannerbeam From 5b736dac99a4ae1ed9292062ad484fe4dd531220 Mon Sep 17 00:00:00 2001 From: William Shin Date: Fri, 31 Mar 2023 09:51:36 -0700 Subject: [PATCH 06/96] [MAINTENANCE] SqlAlchemy 2 Compatibility - Autoload Parameter deprecation (#7526) --- .../data_context/store/database_store_backend.py | 2 +- pyproject.toml | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/great_expectations/data_context/store/database_store_backend.py b/great_expectations/data_context/store/database_store_backend.py index 2b2d7d08df45..3291ad5a6c0d 100644 --- a/great_expectations/data_context/store/database_store_backend.py +++ b/great_expectations/data_context/store/database_store_backend.py @@ -103,7 +103,7 @@ def __init__( # noqa: C901 - 16 cols.append(Column(column_, String, primary_key=True)) cols.append(Column("value", String)) try: - table = Table(table_name, meta, autoload=True, autoload_with=self.engine) + table = Table(table_name, meta, autoload_with=self.engine) # We do a "light" check: if the columns' names match, we will proceed, otherwise, create the table if {str(col.name).lower() for col in table.columns} != ( set(key_columns) | {"value"} diff --git a/pyproject.toml b/pyproject.toml index ec7f52988129..adcd5f843b3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -474,12 +474,6 @@ filterwarnings = [ # Example Actual Warning: Found by running setup of test_validate_dataset[sqlite] # sqlalchemy.exc.RemovedIn20Warning: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) 'ignore: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0.:DeprecationWarning', - # Example Actual Warning: Found in setup of test_database_evaluation_parameter_store_basics[param_store0-test_backends0] - # sqlalchemy.exc.RemovedIn20Warning: The autoload parameter is deprecated and will be removed in version 2.0. Please use the autoload_with parameter, passing an engine or connection. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: The autoload parameter is deprecated and will be removed in version 2.0.:DeprecationWarning', - # Example Actual Warning: Found in test_sample_using_limit_builds_correct_query_where_clause_none[test_backends0-hive] - # sqlalchemy.exc.RemovedIn20Warning: The `database` package is deprecated and will be removed in v2.0 of sqlalchemy. Use the `dialects` package instead. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: The `database` package is deprecated and will be removed in v2.0 of sqlalchemy. 
Use the `dialects` package instead.:DeprecationWarning', # Example Actual Warning: Found in mysql test_table_column_reflection_fallback[test_backends0] # sqlalchemy.exc.RemovedIn20Warning: The .close() method on a so-called 'branched' connection is deprecated as of 1.4, as are 'branched' connections overall, and will be removed in a future release. If this is a default-handling function, don't close the connection. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) 'ignore: The .close\(\) method on a so-called:DeprecationWarning', From a8c4b9d475315a501c6468eb207168a154dff015 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 31 Mar 2023 17:12:32 +0000 Subject: [PATCH 07/96] [MAINTENANCE] Bump notebook from 6.4.1 to 6.4.12 in /docs_rtd (#7511) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs_rtd/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs_rtd/requirements.txt b/docs_rtd/requirements.txt index 82f6ce3d39c6..f6736e4cb8f7 100644 --- a/docs_rtd/requirements.txt +++ b/docs_rtd/requirements.txt @@ -40,7 +40,7 @@ MarkupSafe==1.1.1 mistune==0.8.4 nbconvert==6.5.1 nbformat==5.0.7 -notebook==6.4.1 +notebook==6.4.12 numpy==1.22.0 packaging==20.4 pandas==1.0.5 From 2b1dba7af91d5923733285bca4c2f561312dba83 Mon Sep 17 00:00:00 2001 From: Bill Dirks Date: Fri, 31 Mar 2023 13:00:36 -0700 Subject: [PATCH 08/96] [MAINTENANCE] Break out unit tests to own stage. (#7530) --- ci/azure-pipelines-dev.yml | 51 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/ci/azure-pipelines-dev.yml b/ci/azure-pipelines-dev.yml index 8ec008d7f782..72171c72a582 100644 --- a/ci/azure-pipelines-dev.yml +++ b/ci/azure-pipelines-dev.yml @@ -109,6 +109,34 @@ stages: invoke lint || EXIT_STATUS=$? 
exit $EXIT_STATUS + - stage: unit_tests + dependsOn: scope_check + pool: + vmImage: 'ubuntu-latest' + + jobs: + - job: run_unit_tests + condition: eq(stageDependencies.scope_check.changes.outputs['CheckChanges.GXChanged'], true) + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + displayName: 'Use Python 3.7' + + - bash: python -m pip install --upgrade pip==21.3.1 + displayName: 'Update pip' + + - script: | + pip install --constraint constraints-dev.txt ".[test]" pytest-azurepipelines + displayName: 'Install dependencies' + + - script: | + invoke tests --cloud --unit --timeout=3.0 + displayName: 'Unit Tests' + env: + GE_USAGE_STATISTICS_URL: ${{ variables.GE_USAGE_STATISTICS_URL }} + SQLALCHEMY_WARN_20: true + - stage: custom_checks dependsOn: scope_check pool: @@ -195,7 +223,7 @@ stages: displayName: 'Import Great Expectations' - stage: required - dependsOn: [scope_check, lint, import_ge, custom_checks] + dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests] pool: vmImage: 'ubuntu-20.04' @@ -212,10 +240,8 @@ stages: matrix: standard: pytest_args: 'tests --ignore "tests/rule_based_profiler" --ignore "tests/integration"' - run_unit_tests: true slow: pytest_args: 'tests/rule_based_profiler tests/integration' - run_unit_tests: false steps: - task: UsePythonVersion@0 @@ -230,17 +256,6 @@ stages: pip install --constraint constraints-dev.txt ".[test]" pytest-azurepipelines displayName: 'Install dependencies' - - script: | - # Run unit-tests - if $(run_unit_tests); then - invoke tests --ci --cloud --timeout=3.0 - fi - - displayName: 'Unit Tests' - env: - GE_USAGE_STATISTICS_URL: ${{ variables.GE_USAGE_STATISTICS_URL }} - SQLALCHEMY_WARN_20: true - - script: | # Run pytest pytest $(pytest_args) \ @@ -336,7 +351,7 @@ stages: reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov' - stage: usage_stats_integration - dependsOn: [scope_check, lint, import_ge, custom_checks] + dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests] pool: vmImage: 'ubuntu-latest' @@ -371,7 +386,7 @@ stages: pool: vmImage: 'ubuntu-latest' - dependsOn: [scope_check, lint, import_ge, custom_checks] + dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests] jobs: - job: mysql @@ -511,7 +526,7 @@ stages: SQLALCHEMY_WARN_20: true - stage: cli_integration - dependsOn: [scope_check, lint, import_ge, custom_checks] + dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests] pool: vmImage: 'ubuntu-latest' @@ -548,7 +563,7 @@ stages: SQLALCHEMY_WARN_20: true - stage: airflow_provider - dependsOn: [scope_check, lint, import_ge, custom_checks] + dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests] pool: vmImage: 'ubuntu-latest' From 8e9d438e6afe74b70688be409535b36cd417c4d9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 31 Mar 2023 20:18:06 +0000 Subject: [PATCH 09/96] [MAINTENANCE] Bump wheel from 0.37.1 to 0.38.1 in /contrib/cli (#7493) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- contrib/cli/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/cli/requirements.txt b/contrib/cli/requirements.txt index c0efaeee987a..2507c93945f5 100644 --- a/contrib/cli/requirements.txt +++ b/contrib/cli/requirements.txt @@ -6,4 +6,4 @@ pydantic>=1.0,<2.0 # Needed for mypy plugin pytest>=5.3.5 # Test framework ruff==0.0.255 # Linting / code style twine==3.7.1 # Packaging 
-wheel==0.37.1 # Packaging +wheel==0.38.1 # Packaging From fe0620750bcdabf4fe51f7f4de078919a35771e1 Mon Sep 17 00:00:00 2001 From: Bill Dirks Date: Fri, 31 Mar 2023 14:42:21 -0700 Subject: [PATCH 10/96] [FEATURE] Add batch metadata to sql datasources. (#7499) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../datasource/fluent/sources.py | 2 +- .../datasource/fluent/sql_datasource.py | 10 +++- .../datasource/fluent/sqlite_datasource.py | 18 +++++- .../test_integration_datasource.py | 36 ++++++++++++ .../fluent/test_postgres_datasource.py | 57 ++++++++++++++++++- 5 files changed, 117 insertions(+), 6 deletions(-) diff --git a/great_expectations/datasource/fluent/sources.py b/great_expectations/datasource/fluent/sources.py index e84e884a38b1..10231352678e 100644 --- a/great_expectations/datasource/fluent/sources.py +++ b/great_expectations/datasource/fluent/sources.py @@ -267,7 +267,7 @@ def _bind_asset_factory_method_if_not_present( asset_type_name: str, ): add_asset_factory_method_name = f"add_{asset_type_name}_asset" - asset_factory_defined: bool = add_asset_factory_method_name in ds_type.__dict__ + asset_factory_defined: bool = hasattr(ds_type, add_asset_factory_method_name) if not asset_factory_defined: logger.debug( diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index 207ed03b4916..27b3a2f74123 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -30,6 +30,7 @@ ) from great_expectations.datasource.fluent.interfaces import ( Batch, + BatchMetadata, BatchRequest, BatchRequestOptions, DataAsset, @@ -575,7 +576,8 @@ def get_batch_list_from_batch_request( splitter = self.splitter batch_spec_kwargs: dict[str, str | dict | None] for request in self._fully_specified_batch_requests(batch_request): - batch_metadata = copy.deepcopy(request.options) + batch_metadata = copy.deepcopy(self.batch_metadata) + batch_metadata.update(copy.deepcopy(request.options)) batch_spec_kwargs = self._create_batch_spec_kwargs() if splitter: batch_spec_kwargs["splitter_method"] = splitter.method_name @@ -906,6 +908,7 @@ def add_table_asset( table_name: str, schema_name: Optional[str] = None, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> TableAsset: """Adds a table asset to this datasource. @@ -914,6 +917,7 @@ def add_table_asset( table_name: The table where the data resides. schema_name: The schema that holds the table. order_by: A list of Sorters or Sorter strings. + batch_metadata: BatchMetadata we want to associate with this DataAsset and all batches derived from it. Returns: The table asset that is added to the datasource. @@ -926,6 +930,7 @@ def add_table_asset( table_name=table_name, schema_name=schema_name, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) return self._add_asset(asset) @@ -935,6 +940,7 @@ def add_query_asset( name: str, query: str, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> QueryAsset: """Adds a query asset to this datasource. @@ -942,6 +948,7 @@ def add_query_asset( name: The name of this table asset. query: The SELECT query to selects the data to validate. It must begin with the "SELECT". order_by: A list of Sorters or Sorter strings. + batch_metadata: BatchMetadata we want to associate with this DataAsset and all batches derived from it. 
Returns: The query asset that is added to the datasource. @@ -953,5 +960,6 @@ def add_query_asset( name=name, query=query, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) return self._add_asset(asset) diff --git a/great_expectations/datasource/fluent/sqlite_datasource.py b/great_expectations/datasource/fluent/sqlite_datasource.py index d8a506a1cc81..06028d694806 100644 --- a/great_expectations/datasource/fluent/sqlite_datasource.py +++ b/great_expectations/datasource/fluent/sqlite_datasource.py @@ -24,6 +24,7 @@ from typing_extensions import Self from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, BatchRequestOptions, DataAsset, SortersDefinition, @@ -183,10 +184,17 @@ def add_table_asset( table_name: str, schema_name: Optional[str] = None, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> SqliteTableAsset: return cast( SqliteTableAsset, - super().add_table_asset(name, table_name, schema_name, order_by), + super().add_table_asset( + name=name, + table_name=table_name, + schema_name=schema_name, + order_by=order_by, + batch_metadata=batch_metadata, + ), ) add_table_asset.__doc__ = SQLDatasource.add_table_asset.__doc__ @@ -197,8 +205,14 @@ def add_query_asset( name: str, query: str, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> SqliteQueryAsset: - return cast(SqliteQueryAsset, super().add_query_asset(name, query, order_by)) + return cast( + SqliteQueryAsset, + super().add_query_asset( + name=name, query=query, order_by=order_by, batch_metadata=batch_metadata + ), + ) add_query_asset.__doc__ = SQLDatasource.add_query_asset.__doc__ diff --git a/tests/datasource/fluent/integration/test_integration_datasource.py b/tests/datasource/fluent/integration/test_integration_datasource.py index 6495d7265cd8..018a8c5538dd 100644 --- a/tests/datasource/fluent/integration/test_integration_datasource.py +++ b/tests/datasource/fluent/integration/test_integration_datasource.py @@ -418,3 +418,39 @@ def test_checkpoint_run_with_nonstring_path_option(empty_data_context): result = checkpoint.run() assert result["success"] assert result["checkpoint_config"]["class_name"] == "Checkpoint" + + +@pytest.mark.parametrize( + ["add_asset_method", "add_asset_kwarg"], + [ + pytest.param( + "add_table_asset", + {"table_name": "yellow_tripdata_sample_2019_02"}, + id="table_asset", + ), + pytest.param( + "add_query_asset", + {"query": "select * from yellow_tripdata_sample_2019_02"}, + id="query_asset", + ), + ], +) +@pytest.mark.integration +def test_asset_specified_metadata( + empty_data_context, add_asset_method, add_asset_kwarg +): + context = empty_data_context + datasource = sqlite_datasource(context, "yellow_tripdata.db") + asset_specified_metadata = {"pipeline_name": "my_pipeline"} + asset = getattr(datasource, add_asset_method)( + name="asset", + batch_metadata=asset_specified_metadata, + **add_asset_kwarg, + ) + asset.add_splitter_year_and_month(column_name="pickup_datetime") + asset.add_sorters(["year", "month"]) + # Test getting all batches + batches = asset.get_batch_list_from_batch_request(asset.build_batch_request()) + assert len(batches) == 1 + # Update the batch_metadata from the request with the metadata inherited from the asset + assert batches[0].metadata == {**asset_specified_metadata, "year": 2019, "month": 2} diff --git a/tests/datasource/fluent/test_postgres_datasource.py b/tests/datasource/fluent/test_postgres_datasource.py index 
9876f237cb67..673f35d4796e 100644 --- a/tests/datasource/fluent/test_postgres_datasource.py +++ b/tests/datasource/fluent/test_postgres_datasource.py @@ -1186,10 +1186,9 @@ def test_sorting_none_in_metadata( # We use a query asset because then we don't have to mock out db connection tests # in this unit test. asset = source.add_query_asset( - name="my_asset", query="select * from table", order_by=["year"] + name="my_asset", query="select * from table", order_by=["-year"] ) asset.add_splitter_year(column_name="my_col") - asset.add_sorters(["-year"]) batches = source.get_batch_list_from_batch_request(asset.build_batch_request()) assert len(batches) == len(years) assert batches[-1].metadata["year"] is None @@ -1206,3 +1205,57 @@ def test_create_temp_table(create_source): asset = source.add_query_asset(name="query_asset", query="SELECT * from table") _ = asset.get_batch_list_from_batch_request(asset.build_batch_request()) assert source._execution_engine._create_temp_table is False + + +@pytest.mark.unit +def test_add_postgres_query_asset_with_batch_metadata( + create_source: CreateSourceFixture, +): + years = [2021, 2022] + asset_specified_metadata = {"pipeline_name": "my_pipeline"} + + with create_source( + validate_batch_spec=lambda _: None, + dialect="postgresql", + splitter_query_response=[{"year": year} for year in years], + ) as source: + asset = source.add_query_asset( + name="query_asset", + query="SELECT * FROM my_table", + batch_metadata=asset_specified_metadata, + order_by=["year"], + ) + assert asset.batch_metadata == asset_specified_metadata + asset.add_splitter_year(column_name="col") + batches = source.get_batch_list_from_batch_request(asset.build_batch_request()) + assert len(batches) == len(years) + for i, year in enumerate(years): + assert batches[i].metadata == {"pipeline_name": "my_pipeline", "year": year} + + +@pytest.mark.unit +def test_add_postgres_table_asset_with_batch_metadata( + create_source: CreateSourceFixture, monkeypatch +): + monkeypatch.setattr(TableAsset, "test_connection", lambda _: None) + monkeypatch.setattr(TableAsset, "test_splitter_connection", lambda _: None) + years = [2021, 2022] + asset_specified_metadata = {"pipeline_name": "my_pipeline"} + + with create_source( + validate_batch_spec=lambda _: None, + dialect="postgresql", + splitter_query_response=[{"year": year} for year in years], + ) as source: + asset = source.add_table_asset( + name="query_asset", + table_name="my_table", + batch_metadata=asset_specified_metadata, + order_by=["year"], + ) + assert asset.batch_metadata == asset_specified_metadata + asset.add_splitter_year(column_name="my_col") + batches = source.get_batch_list_from_batch_request(asset.build_batch_request()) + assert len(batches) == len(years) + for i, year in enumerate(years): + assert batches[i].metadata == {"pipeline_name": "my_pipeline", "year": year} From 1e982c50db0f9041fec2ce651f069061562c02c1 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 31 Mar 2023 15:46:40 -0700 Subject: [PATCH 11/96] [MAINTENANCE] Simplifying CapitalOne DataProfilerColumnDomainBuilder Using Default "profile_path" Argument (#7535) --- ...data_profiler_structured_data_assistant.py | 13 +- .../data_profiler_column_domain_builder.py | 5 +- ...est_data_profiler_column_domain_builder.py | 278 +++++++++++++++++- 3 files changed, 276 insertions(+), 20 deletions(-) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py 
b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py index 817cf0401522..8b594fb5aefa 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/data_assistant/data_profiler_structured_data_assistant.py @@ -96,18 +96,7 @@ def _build_numeric_rule() -> Rule: column name in profiler report; GreatExpectations "table.columns" metric is used to validate column existence. """ data_profiler_column_domain_builder: DomainBuilder = ( - DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}profile_path", - include_column_names=None, - exclude_column_names=None, - include_column_name_suffixes=None, - exclude_column_name_suffixes=None, - semantic_type_filter_module_name=None, - semantic_type_filter_class_name=None, - include_semantic_types=None, - exclude_semantic_types=None, - data_context=None, - ) + DataProfilerColumnDomainBuilder() ) data_profiler_profile_report_metric_single_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.build_metric_single_batch_parameter_builder( diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py index 989d27f50b39..6695e598d2fc 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/rule_based_profiler/domain_builder/data_profiler_column_domain_builder.py @@ -13,6 +13,7 @@ get_parameter_value_and_validate_return_type, ) from great_expectations.rule_based_profiler.parameter_container import ( + VARIABLES_KEY, # noqa: TCH001 ParameterContainer, # noqa: TCH001 ) from great_expectations.util import is_candidate_subset_of_target @@ -32,7 +33,7 @@ class DataProfilerColumnDomainBuilder(ColumnDomainBuilder): def __init__( self, - profile_path: str, + profile_path: str = f"{VARIABLES_KEY}profile_path", include_column_names: Optional[Union[str, Optional[List[str]]]] = None, exclude_column_names: Optional[Union[str, Optional[List[str]]]] = None, include_column_name_suffixes: Optional[Union[str, Iterable, List[str]]] = None, @@ -49,7 +50,7 @@ def __init__( ) -> None: """ Args: - profile_path: path to output (in ".pkl" format) of CapitalOne DataProfiler + profile_path: path to output (in ".pkl" format) of CapitalOne DataProfiler (default references "variables"). include_column_names: Explicitly specified desired columns (if None, it is computed based on active Batch). exclude_column_names: If provided, these columns are pre-filtered and excluded from consideration. include_column_name_suffixes: Explicitly specified desired suffixes for corresponding columns to match. 
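For context on the hunk above: with `profile_path` now defaulting to a `f"{VARIABLES_KEY}profile_path"` reference, the domain builder can be constructed with no explicit path and resolve it from the rule variables at run time. The sketch below condenses the pattern exercised by the test module diffed next; it is not part of the patch. The contrib import path is inferred from the file path shown in this diff, and `data_context` / `batch_request` stand in for the fixtures used in those tests.

```python
# Minimal sketch, assuming a configured FileDataContext (`data_context`) and a
# batch_request dict like the one built in the tests below.
from capitalone_dataprofiler_expectations.rule_based_profiler.domain_builder.data_profiler_column_domain_builder import (
    DataProfilerColumnDomainBuilder,
)
from great_expectations.rule_based_profiler.parameter_container import (
    build_parameter_container_for_variables,
)

# "profile_path" is picked up from the rule variables because the constructor's
# default argument now references f"{VARIABLES_KEY}profile_path".
variables = build_parameter_container_for_variables(
    variables_configs={"profile_path": "/path/to/profile.pkl"}  # placeholder path
)

domain_builder = DataProfilerColumnDomainBuilder(data_context=data_context)
domains = domain_builder.get_domains(
    rule_name="my_rule",
    variables=variables,
    batch_request=batch_request,
)
```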
diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py index ac07d32b0066..0e6c6656ed96 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/tests/rule_based_profiler/domain_builder/test_data_profiler_column_domain_builder.py @@ -303,7 +303,7 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_value( @pytest.mark.integration @pytest.mark.slow # 1.21s -def test_data_profiler_column_domain_builder_with_profile_path_as_reference( +def test_data_profiler_column_domain_builder_with_profile_path_as_default_reference( bobby_columnar_table_multi_batch_deterministic_data_context: FileDataContext, ): """ @@ -338,7 +338,273 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference( } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}profile_path", + data_context=data_context, + ) + domains: List[Domain] = domain_builder.get_domains( + rule_name="my_rule", + variables=variables, + batch_request=batch_request, + ) + + assert len(domains) == 18 + assert domains == [ + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "vendor_id", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "vendor_id": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "pickup_datetime", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "pickup_datetime": SemanticDomainTypes.TEXT.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "dropoff_datetime", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "dropoff_datetime": SemanticDomainTypes.TEXT.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "passenger_count", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "passenger_count": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "trip_distance", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "trip_distance": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "rate_code_id", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "rate_code_id": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "store_and_fwd_flag", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "store_and_fwd_flag": SemanticDomainTypes.TEXT.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "pickup_location_id", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "pickup_location_id": 
SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "dropoff_location_id", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "dropoff_location_id": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "payment_type", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "payment_type": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "fare_amount", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "fare_amount": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "extra", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "extra": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "mta_tax", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "mta_tax": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "tip_amount", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "tip_amount": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "tolls_amount", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "tolls_amount": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "improvement_surcharge", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "improvement_surcharge": SemanticDomainTypes.NUMERIC.value, + } + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "total_amount", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "total_amount": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + { + "rule_name": "my_rule", + "domain_type": MetricDomainTypes.COLUMN.value, + "domain_kwargs": { + "column": "congestion_surcharge", + }, + "details": { + INFERRED_SEMANTIC_TYPE_KEY: { + "congestion_surcharge": SemanticDomainTypes.NUMERIC.value, + }, + }, + }, + ] + + +@pytest.mark.integration +@pytest.mark.slow # 1.21s +def test_data_profiler_column_domain_builder_with_profile_path_as_reference( + bobby_columnar_table_multi_batch_deterministic_data_context: FileDataContext, +): + """ + This test verifies that "Domain" objects corresponding to full list of columns in Profiler Report (same as in Batch) + are emitted when path to "profile.pkl" is specified as Rule variable (implicitly). 
+ """ + data_context: FileDataContext = ( + bobby_columnar_table_multi_batch_deterministic_data_context + ) + + profile_path = os.path.join( # noqa: PTH118 + test_root_path, + "data_profiler_files", + "profile.pkl", + ) + + variables_configs: dict = { + "my_profile_path": profile_path, + "estimator": "quantiles", + "false_positive_rate": 1.0e-2, + "mostly": 1.0, + } + variables: ParameterContainer = build_parameter_container_for_variables( + variables_configs=variables_configs + ) + + batch_request: dict = { + "datasource_name": "taxi_pandas", + "data_connector_name": "monthly", + "data_asset_name": "my_reports", + "data_connector_query": {"index": -1}, + } + + domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( + profile_path=f"{VARIABLES_KEY}my_profile_path", data_context=data_context, ) domains: List[Domain] = domain_builder.get_domains( @@ -588,7 +854,7 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with ) variables_configs: dict = { - "profile_path": profile_path, + "my_profile_path": profile_path, "estimator": "quantiles", "false_positive_rate": 1.0e-2, "mostly": 1.0, @@ -605,7 +871,7 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}profile_path", + profile_path=f"{VARIABLES_KEY}my_profile_path", exclude_column_names=[ "store_and_fwd_flag", "congestion_surcharge", @@ -791,7 +1057,7 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with ) variables_configs: dict = { - "profile_path": profile_path, + "my_profile_path": profile_path, "estimator": "quantiles", "false_positive_rate": 1.0e-2, "mostly": 1.0, @@ -808,7 +1074,7 @@ def test_data_profiler_column_domain_builder_with_profile_path_as_reference_with } domain_builder: DomainBuilder = DataProfilerColumnDomainBuilder( - profile_path=f"{VARIABLES_KEY}profile_path", + profile_path=f"{VARIABLES_KEY}my_profile_path", data_context=data_context, ) with mock.patch( From 4288e3aade27bdcfc46c6e577b919adefb88a9ec Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Sat, 1 Apr 2023 16:09:46 -0500 Subject: [PATCH 12/96] [MAINTENANCE]: Clean up ununsed imports (#7537) --- .../regex_based_column_map_expectation_template.py | 7 ------- .../expect_column_values_to_only_contain_vowels.py | 7 ------- .../regex_based_column_map_expectation_template.py | 7 ------- 3 files changed, 21 deletions(-) diff --git a/examples/expectations/regex_based_column_map_expectation_template.py b/examples/expectations/regex_based_column_map_expectation_template.py index 2976e1ae6f9c..5176482bbcd4 100644 --- a/examples/expectations/regex_based_column_map_expectation_template.py +++ b/examples/expectations/regex_based_column_map_expectation_template.py @@ -4,15 +4,8 @@ https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations """ -from typing import Dict, Optional - -from great_expectations.core.expectation_configuration import ExpectationConfiguration -from great_expectations.exceptions.exceptions import ( - InvalidExpectationConfigurationError, -) from great_expectations.expectations.regex_based_column_map_expectation import ( RegexBasedColumnMapExpectation, - RegexColumnMapMetricProvider, ) diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py 
b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py index 07f0b853ca13..42605d2120f8 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py @@ -1,12 +1,5 @@ -from typing import Dict, Optional - -from great_expectations.core.expectation_configuration import ExpectationConfiguration -from great_expectations.exceptions.exceptions import ( - InvalidExpectationConfigurationError, -) from great_expectations.expectations.regex_based_column_map_expectation import ( RegexBasedColumnMapExpectation, - RegexColumnMapMetricProvider, ) diff --git a/tests/integration/docusaurus/expectations/examples/regex_based_column_map_expectation_template.py b/tests/integration/docusaurus/expectations/examples/regex_based_column_map_expectation_template.py index 5291217c154b..af69babd534b 100644 --- a/tests/integration/docusaurus/expectations/examples/regex_based_column_map_expectation_template.py +++ b/tests/integration/docusaurus/expectations/examples/regex_based_column_map_expectation_template.py @@ -4,15 +4,8 @@ https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations """ -from typing import Dict, Optional - -from great_expectations.core.expectation_configuration import ExpectationConfiguration -from great_expectations.exceptions.exceptions import ( - InvalidExpectationConfigurationError, -) from great_expectations.expectations.regex_based_column_map_expectation import ( RegexBasedColumnMapExpectation, - RegexColumnMapMetricProvider, ) From 424a75e74b1c9638a08fe868a3cb05b7da114767 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Sat, 1 Apr 2023 17:44:12 -0400 Subject: [PATCH 13/96] [BUGFIX] Fix issue running quickstart (#7539) --- .../compatibility/sqlalchemy_and_pandas.py | 16 +++++----------- great_expectations/df_to_database_loader.py | 7 ++++--- great_expectations/util.py | 1 - 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/great_expectations/compatibility/sqlalchemy_and_pandas.py b/great_expectations/compatibility/sqlalchemy_and_pandas.py index fc9b62b60209..3d194522ef75 100644 --- a/great_expectations/compatibility/sqlalchemy_and_pandas.py +++ b/great_expectations/compatibility/sqlalchemy_and_pandas.py @@ -6,7 +6,6 @@ import pandas as pd from great_expectations.optional_imports import ( - SQLALCHEMY_NOT_IMPORTED, is_version_greater_or_equal, is_version_less_than, sqlalchemy, @@ -15,12 +14,6 @@ warn_pandas_less_than_2_0_and_sqlalchemy_greater_than_or_equal_2_0, ) -try: - from sqlalchemy.exc import RemovedIn20Warning - -except ImportError: - RemovedIn20Warning = SQLALCHEMY_NOT_IMPORTED - def execute_pandas_reader_fn( reader_fn: Callable, reader_options: dict @@ -39,12 +32,13 @@ def execute_pandas_reader_fn( dataframe or list of dataframes """ if is_version_less_than(pd.__version__, "2.0.0"): - if sqlalchemy != SQLALCHEMY_NOT_IMPORTED and is_version_greater_or_equal( - sqlalchemy.__version__, "2.0.0" - ): + if sqlalchemy and is_version_greater_or_equal(sqlalchemy.__version__, "2.0.0"): warn_pandas_less_than_2_0_and_sqlalchemy_greater_than_or_equal_2_0() with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", category=RemovedIn20Warning) + # Note that RemovedIn20Warning is the warning class that we see from sqlalchemy + # but 
using the base class here since sqlalchemy is an optional dependency and this + # warning type only exists in sqlalchemy < 2.0. + warnings.filterwarnings(action="ignore", category=DeprecationWarning) reader_fn_result: pd.DataFrame | list[pd.DataFrame] = reader_fn( **reader_options ) diff --git a/great_expectations/df_to_database_loader.py b/great_expectations/df_to_database_loader.py index 020e0c24cf51..c6c5d8ed6e62 100644 --- a/great_expectations/df_to_database_loader.py +++ b/great_expectations/df_to_database_loader.py @@ -12,7 +12,6 @@ import sqlalchemy as sa from sqlalchemy import Table from sqlalchemy.engine import reflection - from sqlalchemy.exc import RemovedIn20Warning from sqlalchemy.sql import Select except ImportError: @@ -23,7 +22,6 @@ reflection = None Table = None Select = None - RemovedIn20Warning = None def add_dataframe_to_db( @@ -77,7 +75,10 @@ def add_dataframe_to_db( if isinstance(con, sa.engine.Engine): con = con.connect() with warnings.catch_warnings(): - warnings.filterwarnings(action="ignore", category=RemovedIn20Warning) + # Note that RemovedIn20Warning is the warning class that we see from sqlalchemy + # but using the base class here since sqlalchemy is an optional dependency and this + # warning type only exists in sqlalchemy < 2.0. + warnings.filterwarnings(action="ignore", category=DeprecationWarning) df.to_sql( name=name, con=con, diff --git a/great_expectations/util.py b/great_expectations/util.py index a3d523aa8b4c..6c83d630002d 100644 --- a/great_expectations/util.py +++ b/great_expectations/util.py @@ -90,7 +90,6 @@ reflection = None Table = None Select = None - RemovedIn20Warning = None if TYPE_CHECKING: From 2c96ca67c930e2750e4185bcc2bafd43e45dd35b Mon Sep 17 00:00:00 2001 From: Gabriel Date: Sat, 1 Apr 2023 18:18:30 -0400 Subject: [PATCH 14/96] [MAINTENANCE] Fix Type-Checking steps (#7536) --- ci/azure-pipelines-dev.yml | 4 +++- .../compatibility/sqlalchemy_and_pandas.py | 4 +--- great_expectations/self_check/util.py | 6 ++--- tests/datasource/fluent/test_config.py | 24 +++++++++---------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ci/azure-pipelines-dev.yml b/ci/azure-pipelines-dev.yml index 72171c72a582..e62e0c4345d9 100644 --- a/ci/azure-pipelines-dev.yml +++ b/ci/azure-pipelines-dev.yml @@ -152,9 +152,11 @@ stages: - script: | pip install --requirement requirements-types.txt invoke type-check --ci --pretty + name: StaticTypeCheck + - script: | # initial run doesn't check `.pyi` source files invoke type-check --ci --pretty --check-stub-sources - name: StaticTypeCheck + name: TypeCheckStubSourceFiles - job: docstring_linter steps: diff --git a/great_expectations/compatibility/sqlalchemy_and_pandas.py b/great_expectations/compatibility/sqlalchemy_and_pandas.py index 3d194522ef75..3fa80da0bb5b 100644 --- a/great_expectations/compatibility/sqlalchemy_and_pandas.py +++ b/great_expectations/compatibility/sqlalchemy_and_pandas.py @@ -43,7 +43,5 @@ def execute_pandas_reader_fn( **reader_options ) else: - reader_fn_result: pd.DataFrame | list[pd.DataFrame] = reader_fn( - **reader_options - ) + reader_fn_result = reader_fn(**reader_options) return reader_fn_result diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index bfe19160ec75..3c409f9e040f 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -2036,7 +2036,7 @@ def generate_expectation_tests( # noqa: C901 - 43 validator_with_data = datasets[0] else: dataset_name = 
generate_dataset_name_from_expectation_name( - dataset=d, + dataset=d, # type: ignore[arg-type] # should be dict but got ExpectationTestDataCases expectation_type=expectation_type, index=i, ) @@ -2765,11 +2765,11 @@ def generate_dataset_name_from_expectation_name( dataset_name: str if not sub_index: - dataset_name: str = dataset.get( + dataset_name = dataset.get( "dataset_name", f"{expectation_type}_dataset_{index}" ) else: - dataset_name: str = dataset.get( + dataset_name = dataset.get( "dataset_name", f"{expectation_type}_dataset_{index}_{sub_index}" ) return dataset_name diff --git a/tests/datasource/fluent/test_config.py b/tests/datasource/fluent/test_config.py index ec7d9e3e0e5a..620a7d504b2a 100644 --- a/tests/datasource/fluent/test_config.py +++ b/tests/datasource/fluent/test_config.py @@ -9,7 +9,7 @@ import uuid from pprint import pformat as pf from pprint import pprint as pp -from typing import TYPE_CHECKING, Callable, List +from typing import TYPE_CHECKING, Callable, List, cast # TODO: revert use of cast import pydantic import pytest @@ -785,9 +785,9 @@ def test_config_substitution_retains_original_value_on_save( file_dc_config_file_with_substitutions: pathlib.Path, sqlite_database_path: pathlib.Path, ): - original: dict = yaml.load(file_dc_config_file_with_substitutions.read_text())[ - "fluent_datasources" - ]["my_sqlite_ds_w_subs"] + original: dict = cast( + dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) + )["fluent_datasources"]["my_sqlite_ds_w_subs"] from great_expectations import get_context @@ -815,9 +815,9 @@ def test_config_substitution_retains_original_value_on_save( context._save_project_config() - round_tripped = yaml.load(file_dc_config_file_with_substitutions.read_text())[ - "fluent_datasources" - ]["my_sqlite_ds_w_subs"] + round_tripped = cast( + dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) + )["fluent_datasources"]["my_sqlite_ds_w_subs"] # FIXME: serialized items should not have name round_tripped.pop("name") @@ -835,9 +835,9 @@ def test_config_substitution_retains_original_value_on_save_w_run_time_mods( my_conn_str = f"sqlite:///{sqlite_database_path}" monkeypatch.setenv("MY_CONN_STR", my_conn_str) - original: dict = yaml.load(file_dc_config_file_with_substitutions.read_text())[ - "fluent_datasources" - ] + original: dict = cast( + dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) + )["fluent_datasources"] assert original.get("my_sqlite_ds_w_subs") # will be modified assert original.get("my_pg_ds") # will be deleted assert not original.get("my_sqlite") # will be added @@ -868,8 +868,8 @@ def test_config_substitution_retains_original_value_on_save_w_run_time_mods( context._save_project_config() - round_tripped_datasources = yaml.load( - file_dc_config_file_with_substitutions.read_text() + round_tripped_datasources = cast( + dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) )["fluent_datasources"] assert round_tripped_datasources["my_new_one"] From 98f26a563c1267972c0c245fcec3b52669529a8b Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 1 Apr 2023 15:29:12 -0700 Subject: [PATCH 15/96] [MAINTENANCE] Disable UserConfigurableProfiler tests relying on deprecated V2 functionality (#7541) --- great_expectations/expectations/metrics/metric_provider.py | 2 +- tests/profile/test_basic_suite_builder_profiler.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/great_expectations/expectations/metrics/metric_provider.py 
b/great_expectations/expectations/metrics/metric_provider.py index d95a10e9913e..319321a720ea 100644 --- a/great_expectations/expectations/metrics/metric_provider.py +++ b/great_expectations/expectations/metrics/metric_provider.py @@ -189,7 +189,7 @@ def _register_metric_functions(cls) -> None: To instruct "ExecutionEngine" accordingly, original metric is registered with its "declared" name, but with "metric_provider" function omitted (set to "None"), and additional "AGGREGATE_FN" metric, with its "metric_provider" set to (decorated) implementation function, defined in metric class, is registered. - Then "AGGREGATE_FN" metric can specified with key "metric_partial_fn" as evaluation metric dependency. + Then "AGGREGATE_FN" metric is specified with key "metric_partial_fn" as evaluation metric dependency. By convention, aggregate partial metric implementation functions return three-valued tuple, containing deferred execution metric implementation function of corresponding "ExecutionEngine" backend (called "metric_aggregate") as well as "compute_domain_kwargs" and "accessor_domain_kwargs", which are relevant diff --git a/tests/profile/test_basic_suite_builder_profiler.py b/tests/profile/test_basic_suite_builder_profiler.py index 0af659993366..4b1bbdaba447 100644 --- a/tests/profile/test_basic_suite_builder_profiler.py +++ b/tests/profile/test_basic_suite_builder_profiler.py @@ -78,6 +78,9 @@ def datetime_dataset(test_backend): is_library_loadable(library_name="trino"), reason="datetime doesnt exist in Trino", ) +@pytest.mark.xfail( + reason='Utility methods "get_dataset()" is part of deprecated GX-V2 functionality (it must no longer be used).' +) def test__find_next_datetime_column(datetime_dataset, numeric_high_card_dataset): columns = datetime_dataset.get_table_columns() column_cache = {} @@ -113,6 +116,9 @@ def test__find_next_datetime_column(datetime_dataset, numeric_high_card_dataset) is_library_loadable(library_name="trino"), reason="datetime doesnt exist in Trino", ) +@pytest.mark.xfail( + reason='Utility methods "get_dataset()" is part of deprecated GX-V2 functionality (it must no longer be used).' 
+) def test__create_expectations_for_datetime_column(datetime_dataset): column = "datetime" From 0d0b33663bcae4685f2c07113914c51adb060f06 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Sat, 1 Apr 2023 17:29:45 -0500 Subject: [PATCH 16/96] [MAINTENANCE]: replace ColumnMetricProvider with ColumnAggregateMetricProvider (#7538) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../expect_column_discrete_entropy_to_be_between.py | 4 ++-- .../expect_column_distribution_to_match_benfords_law.py | 4 ++-- .../expectations/expect_column_kurtosis_to_be_between.py | 4 ++-- .../expectations/expect_column_skew_to_be_between.py | 4 ++-- .../expect_column_values_to_be_normally_distributed.py | 4 ++-- .../expect_column_wasserstein_distance_to_be_less_than.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py index 776a550d9d7b..91b5b7185ca1 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py @@ -20,7 +20,7 @@ render_evaluation_parameter_string, ) from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnMetricProvider, + ColumnAggregateMetricProvider, column_aggregate_value, ) from great_expectations.expectations.metrics.metric_provider import metric_value @@ -34,7 +34,7 @@ from great_expectations.validator.validation_graph import MetricConfiguration -class ColumnDiscreteEntropy(ColumnMetricProvider): +class ColumnDiscreteEntropy(ColumnAggregateMetricProvider): """MetricProvider Class for Discrete Entropy MetricProvider""" metric_name = "column.discrete.entropy" diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py index 66f1c4cae785..3d851a4ef6b6 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py @@ -8,7 +8,7 @@ ) from great_expectations.expectations.expectation import ColumnExpectation from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnMetricProvider, + ColumnAggregateMetricProvider, column_aggregate_value, ) from great_expectations.validator.validation_graph import MetricConfiguration @@ -36,7 +36,7 @@ def matchFirstDigit(value, digit): return 0.0 -class ColumnDistributionMatchesBenfordsLaw(ColumnMetricProvider): +class ColumnDistributionMatchesBenfordsLaw(ColumnAggregateMetricProvider): """ MetricProvider tests whether data matches Benford's Law Fraud Detection Algorithm. 
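For context, the rename in this patch is mechanical: contrib metrics keep their structure and only swap the imported base class name. A minimal sketch of a column aggregate metric written against the renamed base class, assuming the usual Pandas-only contrib pattern; the metric name and computation below are invented for illustration and do not come from any file in this patch:

    from great_expectations.execution_engine import PandasExecutionEngine
    from great_expectations.expectations.metrics.column_aggregate_metric import (
        ColumnAggregateMetricProvider,  # formerly ColumnMetricProvider
        column_aggregate_value,
    )


    class ColumnCustomMedianAbsoluteDeviation(ColumnAggregateMetricProvider):
        """Illustrative aggregate metric; only the base class name changes in this patch."""

        metric_name = "column.custom.median_abs_deviation"

        @column_aggregate_value(engine=PandasExecutionEngine)
        def _pandas(cls, column, **kwargs):
            # "column" is the pandas Series for the resolved column domain
            return float((column - column.median()).abs().median())
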
diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py index 9ce5605236e8..780db599dc39 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py @@ -11,13 +11,13 @@ from great_expectations.expectations.expectation import ColumnExpectation from great_expectations.expectations.metrics import column_aggregate_partial from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnMetricProvider, + ColumnAggregateMetricProvider, column_aggregate_value, ) from great_expectations.expectations.metrics.import_manager import F -class ColumnKurtosis(ColumnMetricProvider): +class ColumnKurtosis(ColumnAggregateMetricProvider): """MetricProvider Class for Aggregate Mean MetricProvider""" metric_name = "column.custom.kurtosis" diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py index 606fe88d4689..861887ce63a4 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py @@ -17,7 +17,7 @@ ) from great_expectations.expectations.expectation import ColumnExpectation from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnMetricProvider, + ColumnAggregateMetricProvider, ) from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( column_aggregate_partial, @@ -53,7 +53,7 @@ Row = None -class ColumnSkew(ColumnMetricProvider): +class ColumnSkew(ColumnAggregateMetricProvider): """MetricProvider Class for Aggregate Mean MetricProvider""" metric_name = "column.custom.skew" diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py index 2f1a55c62ac3..abd53a54d77a 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py @@ -6,12 +6,12 @@ from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.expectation import ColumnExpectation from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnMetricProvider, + ColumnAggregateMetricProvider, column_aggregate_value, ) -class ColumnNormallyDistributed(ColumnMetricProvider): +class ColumnNormallyDistributed(ColumnAggregateMetricProvider): """MetricProvider Class for Aggregate Mean MetricProvider""" metric_name = "column.custom.normally_distributed" diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py index 2012278132c7..c6b88a85ffb7 100644 --- 
a/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py @@ -6,12 +6,12 @@ from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.expectation import ColumnExpectation from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnMetricProvider, + ColumnAggregateMetricProvider, column_aggregate_value, ) -class ColumnWassersteinDistance(ColumnMetricProvider): +class ColumnWassersteinDistance(ColumnAggregateMetricProvider): """MetricProvider Class for Wasserstein Distance MetricProvider""" metric_name = "column.custom.wasserstein" From 9cb65349413a85eed40f8959dc664b84315e5693 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Sat, 1 Apr 2023 19:02:38 -0400 Subject: [PATCH 17/96] Only run unit tests and expectation tests --- ci/azure-pipelines-sqlalchemy-compatibility.yml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/ci/azure-pipelines-sqlalchemy-compatibility.yml b/ci/azure-pipelines-sqlalchemy-compatibility.yml index 998ba4a996ad..b05c8d8cae9c 100644 --- a/ci/azure-pipelines-sqlalchemy-compatibility.yml +++ b/ci/azure-pipelines-sqlalchemy-compatibility.yml @@ -136,18 +136,12 @@ stages: pip install --constraint constraints-dev-temp.txt ".[dev]" pytest-azurepipelines displayName: 'Install dependencies using SQLAlchemy base version $(sqlalchemy_base_version)' + # TODO: Currently the below test only runs expectations tests for postgresql. We should figure out what the + # TODO: best way to test for sqlalchemy version compatibility and implement that here. - script: | # Run pytest - pytest \ - --postgresql \ - --ignore 'tests/cli' \ - --ignore 'tests/integration/usage_statistics' \ - --napoleon-docstrings \ - --junitxml=junit/test-results.xml \ - --cov=. 
\ - --cov-report=xml \ - --cov-report=html \ - -m 'not unit and not e2e' + pytest --postgresql tests/test_definitions/test_expectations_v3_api.py + pytest --postgresql -m unit displayName: 'pytest' env: From dd63701e8d7b1f278bdad051b9359cdb84ff4609 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Sat, 1 Apr 2023 19:46:02 -0400 Subject: [PATCH 18/96] [MAINTENANCE] Exclude files from deprecation warning check (#7544) --- tests/test_deprecation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py index 9ec9bddb1b27..06fc01e915f2 100644 --- a/tests/test_deprecation.py +++ b/tests/test_deprecation.py @@ -19,6 +19,13 @@ def regex_for_deprecation_comments() -> Pattern: @pytest.fixture def files_with_deprecation_warnings() -> List[str]: files: List[str] = glob.glob("great_expectations/**/*.py", recursive=True) + files_to_exclude = [ + "great_expectations/df_to_database_loader.py", + "great_expectations/compatibility/sqlalchemy_and_pandas.py", + ] + for file_to_exclude in files_to_exclude: + if file_to_exclude in files: + files.remove(file_to_exclude) return files From b68fa9ab6409a6dcc2613e9be68621e625bc25ae Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Sat, 1 Apr 2023 19:54:39 -0400 Subject: [PATCH 19/96] [DOCS] Quickstart code under test (#7542) --- ci/azure-pipelines-dev.yml | 27 +++++++++++++++++- .../public_api_report.py | 2 +- .../tutorials/quickstart/__init__.py | 0 .../tutorials/quickstart/quickstart.py | 28 +++++++++++++++++++ tests/integration/test_script_runner.py | 10 ++++++- 5 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 tests/integration/docusaurus/tutorials/quickstart/__init__.py create mode 100644 tests/integration/docusaurus/tutorials/quickstart/quickstart.py diff --git a/ci/azure-pipelines-dev.yml b/ci/azure-pipelines-dev.yml index e62e0c4345d9..bb0cdb02e268 100644 --- a/ci/azure-pipelines-dev.yml +++ b/ci/azure-pipelines-dev.yml @@ -224,8 +224,33 @@ stages: python -c "import great_expectations as gx; print('Successfully imported GX Version:', gx.__version__)" displayName: 'Import Great Expectations' + - stage: quickstart + dependsOn: scope_check + pool: + vmImage: 'ubuntu-20.04' + + jobs: + - job: quickstart + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + displayName: 'Use Python 3.7' + + - script: | + pip install --constraint constraints-dev.txt ".[test]" + pip uninstall -y sqlalchemy + displayName: 'Install GX and required dependencies (i.e. not sqlalchemy)' + + - script: | + pytest -v --no-sqlalchemy --docs-tests -m integration -k "test_docs[quickstart]" tests/integration/test_script_runner.py + displayName: 'Run Quickstart' + + + - stage: required - dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests] + dependsOn: [scope_check, lint, import_ge, custom_checks, unit_tests, quickstart] pool: vmImage: 'ubuntu-20.04' diff --git a/docs/sphinx_api_docs_source/public_api_report.py b/docs/sphinx_api_docs_source/public_api_report.py index d9e39be16fc0..68056ae47c35 100755 --- a/docs/sphinx_api_docs_source/public_api_report.py +++ b/docs/sphinx_api_docs_source/public_api_report.py @@ -1859,7 +1859,7 @@ def main(): # any methods or classes you are adding to documentation with the @public_api # decorator and any relevant "new" or "deprecated" public api decorators. # If the actual is lower than the threshold, please reduce the threshold. 
- PUBLIC_API_MISSING_THRESHOLD = 96 # TODO: reduce this number again once this works for the Fluent DS dynamic methods + PUBLIC_API_MISSING_THRESHOLD = 97 # TODO: reduce this number again once this works for the Fluent DS dynamic methods if len(printable_definitions) != PUBLIC_API_MISSING_THRESHOLD: error_msg_prefix = f"There are {len(printable_definitions)} items missing from the public API, we currently allow {PUBLIC_API_MISSING_THRESHOLD}." if len(printable_definitions) > PUBLIC_API_MISSING_THRESHOLD: diff --git a/tests/integration/docusaurus/tutorials/quickstart/__init__.py b/tests/integration/docusaurus/tutorials/quickstart/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/docusaurus/tutorials/quickstart/quickstart.py b/tests/integration/docusaurus/tutorials/quickstart/quickstart.py new file mode 100644 index 000000000000..a953f9f1d2d8 --- /dev/null +++ b/tests/integration/docusaurus/tutorials/quickstart/quickstart.py @@ -0,0 +1,28 @@ +import great_expectations as gx + +# Set up +context = gx.get_context() + +# Connect to data +validator = context.sources.pandas_default.read_csv( + "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" +) + +# Create Expectations +validator.expect_column_values_to_not_be_null("pickup_datetime") +validator.expect_column_values_to_be_between("passenger_count", auto=True) + +# Validate data +checkpoint = gx.checkpoint.SimpleCheckpoint( + name="my_quickstart_checkpoint", + data_context=context, + validator=validator, +) +checkpoint_result = checkpoint.run() + +# View results +validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0] +context.open_data_docs(resource_identifier=validation_result_identifier) + +# Save the Data Context for future use +context.convert_to_file_context() diff --git a/tests/integration/test_script_runner.py b/tests/integration/test_script_runner.py index 977286e83a5b..bd76240eebb4 100644 --- a/tests/integration/test_script_runner.py +++ b/tests/integration/test_script_runner.py @@ -1,7 +1,7 @@ """Run integration and docs tests. Individual tests can be run by setting the '-k' flag and referencing the name of test, like the following example: - pytest -v --docs-tests -m integration -k "test_docs[migration_guide_spark_v2_api]" tests/integration/test_script_runner.py + pytest -v --docs-tests -m integration -k "test_docs[quickstart]" tests/integration/test_script_runner.py """ import importlib.machinery @@ -240,9 +240,17 @@ ), ] +quickstart = [ + IntegrationTestFixture( + name="quickstart", + user_flow_script="tests/integration/docusaurus/tutorials/quickstart/quickstart.py", + ), +] + # populate docs_test_matrix with sub-lists docs_test_matrix += local_tests +docs_test_matrix += quickstart docs_test_matrix += spark_integration_tests docs_test_matrix += sqlite_integration_tests docs_test_matrix += mysql_integration_tests From 8a416810a8632cfe721629dc8e43d766b0742cf8 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Sat, 1 Apr 2023 22:39:57 -0700 Subject: [PATCH 20/96] release prep for 0.16.5 (#7545) Co-authored-by: Gabriel All suggestions have been incorporated. We can force-merge this pull request, because only the version number has changed (plus the release notes in the changelog files). 
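One practical takeaway from the quickstart-under-test wiring in PATCH 19 above: a docs script is put under test by declaring an IntegrationTestFixture and appending it to docs_test_matrix, after which it can be selected by name with pytest's -k filter. A sketch of how an additional tutorial might be registered, where the fixture name and script path are hypothetical:

    another_tutorial = [
        IntegrationTestFixture(
            name="my_new_tutorial",  # selectable via: pytest ... -k "test_docs[my_new_tutorial]"
            user_flow_script="tests/integration/docusaurus/tutorials/my_new_tutorial/example.py",
        ),
    ]

    docs_test_matrix += another_tutorial
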
--- docs/docusaurus/docs/changelog.md | 20 ++++++++++++++++++++ docs_rtd/changelog.rst | 21 +++++++++++++++++++++ great_expectations/deployment_version | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/docs/docusaurus/docs/changelog.md b/docs/docusaurus/docs/changelog.md index 8405ac9612cc..7b4cf129ce6a 100644 --- a/docs/docusaurus/docs/changelog.md +++ b/docs/docusaurus/docs/changelog.md @@ -2,6 +2,26 @@ title: Changelog --- +### 0.16.5 +* [FEATURE] Add batch metadata to sql datasources. ([#7499](https://github.com/great-expectations/great_expectations/pull/7499)) +* [BUGFIX] Fix issue running quickstart ([#7539](https://github.com/great-expectations/great_expectations/pull/7539)) +* [DOCS] doc 508 Updates footer links on docs pages ([#7521](https://github.com/great-expectations/great_expectations/pull/7521)) +* [DOCS] DSB-64 removes outdated v2/v3 references from the docs ([#7519](https://github.com/great-expectations/great_expectations/pull/7519)) +* [DOCS] Update CODEOWNERS ([#7528](https://github.com/great-expectations/great_expectations/pull/7528)) +* [DOCS] Quickstart code under test ([#7542](https://github.com/great-expectations/great_expectations/pull/7542)) +* [MAINTENANCE] SqlAlchemy2 Compatibility - `Row.keys()` ([#7520](https://github.com/great-expectations/great_expectations/pull/7520)) +* [MAINTENANCE] Refactoring of CapitalOne Metrics and Profiler-Based DataAssistant for Enhanced Code Elegance ([#7522](https://github.com/great-expectations/great_expectations/pull/7522)) +* [MAINTENANCE] SqlAlchemy 2 Compatibility - Autoload Parameter deprecation ([#7526](https://github.com/great-expectations/great_expectations/pull/7526)) +* [MAINTENANCE] Bump notebook from 6.4.1 to 6.4.12 in /docs_rtd ([#7511](https://github.com/great-expectations/great_expectations/pull/7511)) +* [MAINTENANCE] Break out unit tests to own stage. 
([#7530](https://github.com/great-expectations/great_expectations/pull/7530)) +* [MAINTENANCE] Bump wheel from 0.37.1 to 0.38.1 in /contrib/cli ([#7493](https://github.com/great-expectations/great_expectations/pull/7493)) +* [MAINTENANCE] Simplifying CapitalOne DataProfilerColumnDomainBuilder Using Default "profile_path" Argument ([#7535](https://github.com/great-expectations/great_expectations/pull/7535)) +* [MAINTENANCE] : Clean up ununsed imports ([#7537](https://github.com/great-expectations/great_expectations/pull/7537)) +* [MAINTENANCE] Fix Type-Checking steps ([#7536](https://github.com/great-expectations/great_expectations/pull/7536)) +* [MAINTENANCE] Disable UserConfigurableProfiler tests relying on deprecated V2 functionality ([#7541](https://github.com/great-expectations/great_expectations/pull/7541)) +* [MAINTENANCE] : replace ColumnMetricProvider with ColumnAggregateMetricProvider ([#7538](https://github.com/great-expectations/great_expectations/pull/7538)) +* [MAINTENANCE] Exclude files from deprecation warning check ([#7544](https://github.com/great-expectations/great_expectations/pull/7544)) + ### 0.16.4 * [FEATURE] Add package, contributors and metrics filter in Algolia script for expectation ([#7000](https://github.com/great-expectations/great_expectations/pull/7000)) (thanks @kod-er) * [FEATURE] `BatchMetadata` for all fluent `DataAsset`s ([#7392](https://github.com/great-expectations/great_expectations/pull/7392)) diff --git a/docs_rtd/changelog.rst b/docs_rtd/changelog.rst index 88bca0bc0d5b..a910219529ee 100644 --- a/docs_rtd/changelog.rst +++ b/docs_rtd/changelog.rst @@ -4,6 +4,27 @@ Changelog ######### +0.16.5 +----------------- +* [FEATURE] Add batch metadata to sql datasources. ([#7499](https://github.com/great-expectations/great_expectations/pull/7499)) +* [BUGFIX] Fix issue running quickstart ([#7539](https://github.com/great-expectations/great_expectations/pull/7539)) +* [DOCS] doc 508 Updates footer links on docs pages ([#7521](https://github.com/great-expectations/great_expectations/pull/7521)) +* [DOCS] DSB-64 removes outdated v2/v3 references from the docs ([#7519](https://github.com/great-expectations/great_expectations/pull/7519)) +* [DOCS] Update CODEOWNERS ([#7528](https://github.com/great-expectations/great_expectations/pull/7528)) +* [DOCS] Quickstart code under test ([#7542](https://github.com/great-expectations/great_expectations/pull/7542)) +* [MAINTENANCE] SqlAlchemy2 Compatibility - `Row.keys()` ([#7520](https://github.com/great-expectations/great_expectations/pull/7520)) +* [MAINTENANCE] Refactoring of CapitalOne Metrics and Profiler-Based DataAssistant for Enhanced Code Elegance ([#7522](https://github.com/great-expectations/great_expectations/pull/7522)) +* [MAINTENANCE] SqlAlchemy 2 Compatibility - Autoload Parameter deprecation ([#7526](https://github.com/great-expectations/great_expectations/pull/7526)) +* [MAINTENANCE] Bump notebook from 6.4.1 to 6.4.12 in /docs_rtd ([#7511](https://github.com/great-expectations/great_expectations/pull/7511)) +* [MAINTENANCE] Break out unit tests to own stage. 
([#7530](https://github.com/great-expectations/great_expectations/pull/7530)) +* [MAINTENANCE] Bump wheel from 0.37.1 to 0.38.1 in /contrib/cli ([#7493](https://github.com/great-expectations/great_expectations/pull/7493)) +* [MAINTENANCE] Simplifying CapitalOne DataProfilerColumnDomainBuilder Using Default "profile_path" Argument ([#7535](https://github.com/great-expectations/great_expectations/pull/7535)) +* [MAINTENANCE] : Clean up ununsed imports ([#7537](https://github.com/great-expectations/great_expectations/pull/7537)) +* [MAINTENANCE] Fix Type-Checking steps ([#7536](https://github.com/great-expectations/great_expectations/pull/7536)) +* [MAINTENANCE] Disable UserConfigurableProfiler tests relying on deprecated V2 functionality ([#7541](https://github.com/great-expectations/great_expectations/pull/7541)) +* [MAINTENANCE] : replace ColumnMetricProvider with ColumnAggregateMetricProvider ([#7538](https://github.com/great-expectations/great_expectations/pull/7538)) +* [MAINTENANCE] Exclude files from deprecation warning check ([#7544](https://github.com/great-expectations/great_expectations/pull/7544)) + 0.16.4 ----------------- * [FEATURE] Add package, contributors and metrics filter in Algolia script for expectation ([#7000](https://github.com/great-expectations/great_expectations/pull/7000)) (thanks @kod-er) diff --git a/great_expectations/deployment_version b/great_expectations/deployment_version index 5f2491c5adca..19270385eaf7 100644 --- a/great_expectations/deployment_version +++ b/great_expectations/deployment_version @@ -1 +1 @@ -0.16.4 +0.16.5 From 776784a94c45935619a339fb564cf7976ab090a2 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Mon, 3 Apr 2023 09:59:15 -0400 Subject: [PATCH 21/96] [BUGFIX] Fluent Datasource load from config fixes for remaining Pandas Datasources (#7442) --- .../_abs_fluent_data_asset_config_keys.mdx | 4 +- ...data_on_azure_blob_storage_using_pandas.md | 4 +- ..._to_connect_to_data_on_gcs_using_pandas.md | 4 +- ...w_to_connect_to_data_on_s3_using_pandas.md | 4 +- .../public_api_report.py | 2 +- .../azure_blob_storage_data_connector.py | 17 +- .../google_cloud_storage_data_connector.py | 17 +- .../data_connector/s3_data_connector.py | 17 +- .../datasource/fluent/file_path_data_asset.py | 19 +- .../datasource/fluent/interfaces.py | 21 +- .../pandas_azure_blob_storage_datasource.py | 265 +++--------------- .../pandas_azure_blob_storage_datasource.pyi | 125 ++++++--- .../fluent/pandas_dbfs_datasource.py | 221 ++------------- .../fluent/pandas_dbfs_datasource.pyi | 8 +- .../fluent/pandas_filesystem_datasource.pyi | 8 +- .../pandas_google_cloud_storage_datasource.py | 252 ++--------------- ...pandas_google_cloud_storage_datasource.pyi | 110 ++++++-- .../datasource/fluent/pandas_s3_datasource.py | 247 +++------------- .../fluent/pandas_s3_datasource.pyi | 122 ++++++-- .../datasource/fluent/sources.py | 8 +- .../spark_azure_blob_storage_datasource.py | 9 +- .../fluent/spark_dbfs_datasource.py | 9 +- .../fluent/spark_filesystem_datasource.py | 12 +- .../spark_google_cloud_storage_datasource.py | 9 +- .../datasource/fluent/spark_s3_datasource.py | 12 +- tests/datasource/fluent/conftest.py | 56 ++++ .../datasource/fluent/great_expectations.yml | 43 +++ .../test_integration_datasource.py | 5 +- tests/datasource/fluent/test_config.py | 2 + ...st_pandas_azure_blob_storage_datasource.py | 14 +- .../fluent/test_pandas_s3_datasource.py | 107 +++++++ .../fluent/test_spark_s3_datasource.py | 2 +- .../datasource/fluent/test_viral_snippets.py | 26 +- 33 files changed, 724 
insertions(+), 1057 deletions(-) diff --git a/docs/docusaurus/docs/components/connect_to_data/cloud/_abs_fluent_data_asset_config_keys.mdx b/docs/docusaurus/docs/components/connect_to_data/cloud/_abs_fluent_data_asset_config_keys.mdx index e8edb51c85b5..0b0d4017e96f 100644 --- a/docs/docusaurus/docs/components/connect_to_data/cloud/_abs_fluent_data_asset_config_keys.mdx +++ b/docs/docusaurus/docs/components/connect_to_data/cloud/_abs_fluent_data_asset_config_keys.mdx @@ -9,6 +9,6 @@ To specify data to connect to you will need the following elements: ```python title="Python code" asset_name = "my_taxi_data_asset" batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P\d{4})-(?P\d{2})\.csv" -container = "my_container" -name_starts_with = "data/taxi_yellow_tripdata_samples/" +abs_container = "my_container" +abs_name_starts_with = "data/taxi_yellow_tripdata_samples/" ``` \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md index ec37202a49d6..0ac76fa47f76 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md @@ -69,8 +69,8 @@ Once these values have been defined, we will define our Data Asset with the code data_asset = datasource.add_csv_asset( name=asset_name, batching_regex=batching_regex, - container=container, - name_starts_with=name_starts_with, + abs_container=abs_container, + abs_name_starts_with=abs_name_starts_with, ) ``` diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md index 2402a273fcfd..49b0b7bd6e00 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md @@ -62,9 +62,9 @@ datasource = context.sources.add_pandas_gcs( ```python title = "Python code" batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P\d{4})-(?P\d{2})\.csv" -prefix = "data/taxi_yellow_tripdata_samples/" +gcs_prefix = "data/taxi_yellow_tripdata_samples/" data_asset = datasource.add_csv_asset( - name="my_taxi_data_asset", batching_regex=batching_regex, prefix=prefix + name="my_taxi_data_asset", batching_regex=batching_regex, gcs_prefix=gcs_prefix ) ``` diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md index 2223c5611843..62691eb3aa18 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md @@ -70,9 +70,9 @@ datasource = context.sources.add_pandas_s3( ```python title = "Python code" batching_regex = r"data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_(?P\d{4})-(?P\d{2})\.csv" -prefix = 
"data/taxi_yellow_tripdata_samples/" +s3_prefix = "data/taxi_yellow_tripdata_samples/" data_asset = datasource.add_csv_asset( - name="my_taxi_data_asset", batching_regex=batching_regex, prefix=prefix + name="my_taxi_data_asset", batching_regex=batching_regex, s3_prefix=s3_prefix ) ``` diff --git a/docs/sphinx_api_docs_source/public_api_report.py b/docs/sphinx_api_docs_source/public_api_report.py index 68056ae47c35..01d375c2d47e 100755 --- a/docs/sphinx_api_docs_source/public_api_report.py +++ b/docs/sphinx_api_docs_source/public_api_report.py @@ -1859,7 +1859,7 @@ def main(): # any methods or classes you are adding to documentation with the @public_api # decorator and any relevant "new" or "deprecated" public api decorators. # If the actual is lower than the threshold, please reduce the threshold. - PUBLIC_API_MISSING_THRESHOLD = 97 # TODO: reduce this number again once this works for the Fluent DS dynamic methods + PUBLIC_API_MISSING_THRESHOLD = 94 # TODO: reduce this number again once this works for the Fluent DS dynamic methods if len(printable_definitions) != PUBLIC_API_MISSING_THRESHOLD: error_msg_prefix = f"There are {len(printable_definitions)} items missing from the public API, we currently allow {PUBLIC_API_MISSING_THRESHOLD}." if len(printable_definitions) > PUBLIC_API_MISSING_THRESHOLD: diff --git a/great_expectations/datasource/fluent/data_asset/data_connector/azure_blob_storage_data_connector.py b/great_expectations/datasource/fluent/data_asset/data_connector/azure_blob_storage_data_connector.py index 0fe76976e9ef..82f103a3e283 100644 --- a/great_expectations/datasource/fluent/data_asset/data_connector/azure_blob_storage_data_connector.py +++ b/great_expectations/datasource/fluent/data_asset/data_connector/azure_blob_storage_data_connector.py @@ -2,7 +2,9 @@ import logging import re -from typing import TYPE_CHECKING, Callable, List, Optional +from typing import TYPE_CHECKING, Callable, ClassVar, List, Optional, Type + +import pydantic from great_expectations.core.batch_spec import AzureBatchSpec, PathBatchSpec from great_expectations.datasource.data_connector.util import ( @@ -22,6 +24,12 @@ logger = logging.getLogger(__name__) +class _AzureOptions(pydantic.BaseModel): + abs_container: str + abs_name_starts_with: str = "" + abs_delimiter: str = "/" + + class AzureBlobStorageDataConnector(FilePathDataConnector): """Extension of FilePathDataConnector used to connect to Microsoft Azure Blob Storage (ABS). 
@@ -41,6 +49,13 @@ class AzureBlobStorageDataConnector(FilePathDataConnector): file_path_template_map_fn: Format function mapping path to fully-qualified resource on ABS """ + asset_level_option_keys: ClassVar[tuple[str, ...]] = ( + "abs_container", + "abs_name_starts_with", + "abs_delimiter", + ) + asset_options_type: ClassVar[Type[_AzureOptions]] = _AzureOptions + def __init__( self, datasource_name: str, diff --git a/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py b/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py index b8a866dc7845..03b3f9475ebc 100644 --- a/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py +++ b/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py @@ -2,7 +2,9 @@ import logging import re -from typing import TYPE_CHECKING, Callable, List, Optional +from typing import TYPE_CHECKING, Callable, ClassVar, List, Optional, Type + +import pydantic from great_expectations.core.batch_spec import GCSBatchSpec, PathBatchSpec from great_expectations.datasource.data_connector.util import ( @@ -21,6 +23,12 @@ logger = logging.getLogger(__name__) +class _GCSOptions(pydantic.BaseModel): + gcs_prefix: str = "" + gcs_delimiter: str = "/" + gcs_max_results: int = 1000 + + class GoogleCloudStorageDataConnector(FilePathDataConnector): """Extension of FilePathDataConnector used to connect to Google Cloud Storage (GCS). @@ -40,6 +48,13 @@ class GoogleCloudStorageDataConnector(FilePathDataConnector): file_path_template_map_fn: Format function mapping path to fully-qualified resource on GCS """ + asset_level_option_keys: ClassVar[tuple[str, ...]] = ( + "gcs_prefix", + "gcs_delimiter", + "gcs_max_results", + ) + asset_options_type: ClassVar[Type[_GCSOptions]] = _GCSOptions + def __init__( self, datasource_name: str, diff --git a/great_expectations/datasource/fluent/data_asset/data_connector/s3_data_connector.py b/great_expectations/datasource/fluent/data_asset/data_connector/s3_data_connector.py index 083205f523d1..3ba9892a239f 100644 --- a/great_expectations/datasource/fluent/data_asset/data_connector/s3_data_connector.py +++ b/great_expectations/datasource/fluent/data_asset/data_connector/s3_data_connector.py @@ -2,7 +2,9 @@ import logging import re -from typing import TYPE_CHECKING, Callable, List, Optional +from typing import TYPE_CHECKING, Callable, ClassVar, List, Optional, Type + +import pydantic from great_expectations.core.batch_spec import PathBatchSpec, S3BatchSpec from great_expectations.datasource.data_connector.util import ( @@ -22,6 +24,12 @@ logger = logging.getLogger(__name__) +class _S3Options(pydantic.BaseModel): + s3_prefix: str = "" + s3_delimiter: str = "/" + s3_max_keys: int = 1000 + + class S3DataConnector(FilePathDataConnector): """Extension of FilePathDataConnector used to connect to S3. 
@@ -42,6 +50,13 @@ class S3DataConnector(FilePathDataConnector): file_path_template_map_fn: Format function mapping path to fully-qualified resource on S3 """ + asset_level_option_keys: ClassVar[tuple[str, ...]] = ( + "s3_prefix", + "s3_delimiter", + "s3_max_keys", + ) + asset_options_type: ClassVar[Type[_S3Options]] = _S3Options + def __init__( self, datasource_name: str, diff --git a/great_expectations/datasource/fluent/file_path_data_asset.py b/great_expectations/datasource/fluent/file_path_data_asset.py index 84eaf0d5d35c..c68e39f0f1b7 100644 --- a/great_expectations/datasource/fluent/file_path_data_asset.py +++ b/great_expectations/datasource/fluent/file_path_data_asset.py @@ -3,7 +3,6 @@ import copy import dataclasses import logging -import re from pprint import pformat as pf from typing import ( TYPE_CHECKING, @@ -15,7 +14,6 @@ Optional, Pattern, Set, - Union, ) import pydantic @@ -33,7 +31,6 @@ BatchRequest, BatchRequestOptions, DataAsset, - Datasource, TestConnectionError, ) @@ -111,12 +108,6 @@ def __init__(self, **data): ) self._all_group_names = self._regex_parser.get_all_group_names() - @pydantic.validator("batching_regex", pre=True) - def _parse_batching_regex_string( - cls, batching_regex: Optional[Union[re.Pattern, str]] = None - ) -> re.Pattern: - return Datasource.parse_batching_regex_string(batching_regex=batching_regex) - @property def batch_request_options( self, @@ -266,8 +257,14 @@ def test_connection(self) -> None: Raises: TestConnectionError: If the connection test fails. """ - if not self._data_connector.test_connection(): - raise TestConnectionError(self._test_connection_error_message) + try: + if self._data_connector.test_connection(): + return None + except Exception as e: + raise TestConnectionError( + f"Could not connect to asset using {type(self._data_connector).__name__}: Got {type(e).__name__}" + ) from e + raise TestConnectionError(self._test_connection_error_message) def _get_reader_method(self) -> str: raise NotImplementedError( diff --git a/great_expectations/datasource/fluent/interfaces.py b/great_expectations/datasource/fluent/interfaces.py index c057e17e6e5f..3e7c8e87a8a8 100644 --- a/great_expectations/datasource/fluent/interfaces.py +++ b/great_expectations/datasource/fluent/interfaces.py @@ -3,7 +3,6 @@ import dataclasses import functools import logging -import re import uuid from pprint import pformat as pf from typing import ( @@ -30,9 +29,6 @@ from typing_extensions import TypeAlias, TypeGuard from great_expectations.core.id_dict import BatchSpec # noqa: TCH001 -from great_expectations.datasource.fluent.constants import ( - MATCH_ALL_PATTERN, -) from great_expectations.datasource.fluent.fluent_base_model import ( FluentBaseModel, ) @@ -409,7 +405,7 @@ def _load_asset_subtype( If a more specific subtype is needed the `data_asset` will be converted to a more specific `DataAsset`. 
""" - logger.info(f"Loading '{data_asset.name}' asset ->\n{pf(data_asset, depth=4)}") + logger.debug(f"Loading '{data_asset.name}' asset ->\n{pf(data_asset, depth=4)}") asset_type_name: str = data_asset.type asset_type: Type[_DataAssetT] = cls._type_lookup[asset_type_name] @@ -518,21 +514,6 @@ def parse_order_by_sorters( order_by_sorters.append(sorter) return order_by_sorters - @staticmethod - def parse_batching_regex_string( - batching_regex: Optional[Union[re.Pattern, str]] = None - ) -> re.Pattern: - pattern: re.Pattern - if not batching_regex: - pattern = MATCH_ALL_PATTERN - elif isinstance(batching_regex, str): - pattern = re.compile(batching_regex) - elif isinstance(batching_regex, re.Pattern): - pattern = batching_regex - else: - raise ValueError('"batching_regex" must be either re.Pattern, str, or None') - return pattern - # Abstract Methods @property def execution_engine_type(self) -> Type[_ExecutionEngineT]: diff --git a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py index 8b42e8b64eeb..f5453c2c307b 100644 --- a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py @@ -2,10 +2,10 @@ import logging import re -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Type, Union import pydantic -from typing_extensions import Literal +from typing_extensions import Final, Literal from great_expectations.core.util import AzureUrl from great_expectations.datasource.fluent import _PandasFilePathDatasource @@ -16,21 +16,12 @@ from great_expectations.datasource.fluent.pandas_datasource import ( PandasDatasourceError, ) -from great_expectations.datasource.fluent.pandas_file_path_datasource import ( - CSVAsset, - ExcelAsset, - JSONAsset, - ParquetAsset, -) -from great_expectations.datasource.fluent.signatures import _merge_signatures if TYPE_CHECKING: - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.file_path_data_asset import ( + _FilePathDataAsset, ) - logger = logging.getLogger(__name__) @@ -44,12 +35,19 @@ except ImportError: pass +_MISSING: Final = object() + class PandasAzureBlobStorageDatasourceError(PandasDatasourceError): pass class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[AzureBlobStorageDataConnector] + ] = AzureBlobStorageDataConnector + # instance attributes type: Literal["pandas_abs"] = "pandas_abs" @@ -122,231 +120,44 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in self.assets.values(): asset.test_connection() - def add_csv_asset( + def _build_data_connector( self, - name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", - order_by: Optional[SortersDefinition] = None, + data_asset: _FilePathDataAsset, + abs_container: str = _MISSING, # type: ignore[assignment] # _MISSING is used as sentinel value + abs_name_starts_with: str = "", + abs_delimiter: str = "/", **kwargs, - ) -> CSVAsset: # type: ignore[valid-type] - """Adds a CSV DataAsset to the present "PandasAzureBlobStorageDatasource" object. 
- - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches CSV filenames that is used to label the batches - container: container name for Microsoft Azure Blob Storage - name_starts_with: Microsoft Azure Blob Storage object name prefix - delimiter: Microsoft Azure Blob Storage object name delimiter - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - - asset = CSVAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = AzureBlobStorageDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - azure_client=self._get_azure_client(), - batching_regex=batching_regex_pattern, - account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, - file_path_template_map_fn=AzureUrl.AZURE_BLOB_STORAGE_HTTPS_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - AzureBlobStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, + ) -> None: + """Builds and attaches the `AzureBlobStorageDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" ) - ) - return self._add_asset(asset=asset) - - def add_excel_asset( - self, - name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ExcelAsset: # type: ignore[valid-type] - """Adds an Excel DataAsset to the present "PandasAzureBlobStorageDatasource" object. 
- - Args: - name: The name of the Excel asset - batching_regex: regex pattern that matches Excel filenames that is used to label the batches - container: container name for Microsoft Azure Blob Storage - name_starts_with: Microsoft Azure Blob Storage object name prefix - delimiter: Microsoft Azure Blob Storage object name delimiter - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - - asset = ExcelAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = AzureBlobStorageDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - azure_client=self._get_azure_client(), - batching_regex=batching_regex_pattern, - account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, - file_path_template_map_fn=AzureUrl.AZURE_BLOB_STORAGE_HTTPS_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - AzureBlobStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, + if abs_container is _MISSING: + raise TypeError( + f"'{data_asset.name}' is missing required argument 'abs_container'" ) - ) - return self._add_asset(asset=asset) - - def add_json_asset( - self, - name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> JSONAsset: # type: ignore[valid-type] - """Adds a JSON DataAsset to the present "PandasAzureBlobStorageDatasource" object. 
- Args: - name: The name of the JSON asset - batching_regex: regex pattern that matches JSON filenames that is used to label the batches - container: container name for Microsoft Azure Blob Storage - name_starts_with: Microsoft Azure Blob Storage object name prefix - delimiter: Microsoft Azure Blob Storage object name delimiter - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - - asset = JSONAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = AzureBlobStorageDataConnector.build_data_connector( + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, + data_asset_name=data_asset.name, azure_client=self._get_azure_client(), - batching_regex=batching_regex_pattern, + batching_regex=data_asset.batching_regex, account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, + container=abs_container, + name_starts_with=abs_name_starts_with, + delimiter=abs_delimiter, file_path_template_map_fn=AzureUrl.AZURE_BLOB_STORAGE_HTTPS_URL_TEMPLATE.format, ) - asset._test_connection_error_message = ( - AzureBlobStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, - ) - ) - return self._add_asset(asset=asset) - - def add_parquet_asset( - self, - name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ParquetAsset: # type: ignore[valid-type] - """Adds a Parquet DataAsset to the present "PandasAzureBlobStorageDatasource" object. 
- Args: - name: The name of the Parquet asset - batching_regex: regex pattern that matches Parquet filenames that is used to label the batches - container: container name for Microsoft Azure Blob Storage - name_starts_with: Microsoft Azure Blob Storage object name prefix - delimiter: Microsoft Azure Blob Storage object name delimiter - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = ParquetAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = AzureBlobStorageDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - azure_client=self._get_azure_client(), - batching_regex=batching_regex_pattern, - account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, - file_path_template_map_fn=AzureUrl.AZURE_BLOB_STORAGE_HTTPS_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - AzureBlobStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, + container=abs_container, + name_starts_with=abs_name_starts_with, + delimiter=abs_delimiter, ) ) - return self._add_asset(asset=asset) - - # attr-defined issue - # https://github.com/python/mypy/issues/12472 - add_csv_asset.__signature__ = _merge_signatures(add_csv_asset, CSVAsset, exclude={"type"}) # type: ignore[attr-defined] - add_excel_asset.__signature__ = _merge_signatures(add_excel_asset, ExcelAsset, exclude={"type"}) # type: ignore[attr-defined] - add_json_asset.__signature__ = _merge_signatures(add_json_asset, JSONAsset, exclude={"type"}) # type: ignore[attr-defined] - add_parquet_asset.__signature__ = _merge_signatures(add_parquet_asset, ParquetAsset, exclude={"type"}) # type: ignore[attr-defined] diff --git a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.pyi b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.pyi index 0da1334a5f6f..31abfaf80ba5 100644 --- a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.pyi @@ -16,7 +16,7 @@ from typing_extensions import Literal from great_expectations.core._docs_decorators import public_api as public_api from great_expectations.core.util import AzureUrl as AzureUrl -from great_expectations.datasource.fluent import Sorter, _PandasFilePathDatasource +from great_expectations.datasource.fluent import _PandasFilePathDatasource from great_expectations.datasource.fluent.data_asset.data_connector import ( AzureBlobStorageDataConnector as AzureBlobStorageDataConnector, ) @@ -53,6 +53,7 @@ if TYPE_CHECKING: IndexLabel, StorageOptions, ) + from great_expectations.datasource.fluent.interfaces import BatchMetadata from 
great_expectations.datasource.fluent.pandas_file_path_datasource import ( CSVAsset, ExcelAsset, @@ -84,12 +85,15 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_csv_asset( self, name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", # FIXME: this conflicts with the read_csv delimiter + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", sep: typing.Union[str, None] = ..., + delimiter: typing.Union[str, None] = ..., header: Union[int, Sequence[int], None, Literal["infer"]] = "infer", names: Union[Sequence[Hashable], None] = ..., index_col: Union[IndexLabel, Literal[False], None] = ..., @@ -142,11 +146,13 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_excel_asset( self, name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", sheet_name: typing.Union[str, int, None] = 0, header: Union[int, Sequence[int], None] = 0, names: typing.Union[typing.List[str], None] = ..., @@ -174,8 +180,13 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_feather_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", columns: Union[Sequence[Hashable], None] = ..., use_threads: bool = ..., storage_options: StorageOptions = ..., @@ -183,8 +194,13 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_hdf_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", key: typing.Any = ..., mode: str = "r", errors: str = "strict", @@ -199,8 +215,13 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_html_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", match: Union[str, typing.Pattern] = ".+", flavor: typing.Union[str, None] = ..., header: Union[int, Sequence[int], None] = ..., @@ -219,11 +240,13 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_json_asset( self, name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + 
abs_name_starts_with: str = "", + abs_delimiter: str = "/", orient: typing.Union[str, None] = ..., dtype: typing.Union[dict, None] = ..., convert_axes: typing.Any = ..., @@ -243,19 +266,26 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_orc_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", columns: typing.Union[typing.List[str], None] = ..., kwargs: typing.Union[dict, None] = ..., ) -> ORCAsset: ... def add_parquet_asset( self, name: str, - batching_regex: Union[re.Pattern, str], - container: str, - name_starts_with: str = "", - delimiter: str = "/", + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", engine: str = "auto", columns: typing.Union[typing.List[str], None] = ..., storage_options: StorageOptions = ..., @@ -265,16 +295,26 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_pickle_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", compression: CompressionOptions = "infer", storage_options: StorageOptions = ..., ) -> PickleAsset: ... def add_sas_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", format: typing.Union[str, None] = ..., index: Union[Hashable, None] = ..., encoding: typing.Union[str, None] = ..., @@ -285,16 +325,26 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_spss_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", usecols: typing.Union[int, str, typing.Sequence[int], None] = ..., convert_categoricals: bool = ..., ) -> SPSSAsset: ... 
def add_stata_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", convert_dates: bool = ..., convert_categoricals: bool = ..., index_col: typing.Union[str, None] = ..., @@ -310,8 +360,13 @@ class PandasAzureBlobStorageDatasource(_PandasFilePathDatasource): def add_xml_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", xpath: str = "./*", namespaces: typing.Union[typing.Dict[str, str], None] = ..., elems_only: bool = ..., diff --git a/great_expectations/datasource/fluent/pandas_dbfs_datasource.py b/great_expectations/datasource/fluent/pandas_dbfs_datasource.py index ba6fbc67f2c9..a9f82e66e301 100644 --- a/great_expectations/datasource/fluent/pandas_dbfs_datasource.py +++ b/great_expectations/datasource/fluent/pandas_dbfs_datasource.py @@ -1,8 +1,7 @@ from __future__ import annotations import logging -import re -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, ClassVar, Type from typing_extensions import Literal @@ -12,18 +11,10 @@ from great_expectations.datasource.fluent.data_asset.data_connector import ( DBFSDataConnector, ) -from great_expectations.datasource.fluent.pandas_file_path_datasource import ( - CSVAsset, - ExcelAsset, - JSONAsset, - ParquetAsset, -) -from great_expectations.datasource.fluent.signatures import _merge_signatures if TYPE_CHECKING: - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.file_path_data_asset import ( + _FilePathDataAsset, ) logger = logging.getLogger(__name__) @@ -33,207 +24,37 @@ class PandasDBFSDatasource(PandasFilesystemDatasource): """Pandas based Datasource for DataBricks File System (DBFS) based data assets.""" + # class attributes + data_connector_type: ClassVar[Type[DBFSDataConnector]] = DBFSDataConnector + # instance attributes # overridden from base `Literal['pandas_filesystem']` type: Literal["pandas_dbfs"] = "pandas_dbfs" # type: ignore[assignment] # base class has different type - @public_api - def add_csv_asset( - self, - name: str, - batching_regex: Optional[Union[re.Pattern, str]] = None, - glob_directive: str = "**/*", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> CSVAsset: # type: ignore[valid-type] - """Adds a CSV DataAsset to the present "PandasDBFSDatasource" object. - - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches CSV filenames that is used to label the batches - glob_directive: glob for selecting files in directory (defaults to `**/*`) or nested directories (e.g. 
`*/*/*.csv`) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = DBFSDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - batching_regex=batching_regex_pattern, - base_directory=self.base_directory, - glob_directive=glob_directive, - data_context_root_directory=self.data_context_root_directory, - file_path_template_map_fn=DBFSPath.convert_to_file_semantics_version, - ) - asset._test_connection_error_message = ( - DBFSDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - glob_directive=glob_directive, - base_directory=self.base_directory, - ) - ) - return self._add_asset(asset=asset) - - @public_api - def add_excel_asset( - self, - name: str, - batching_regex: Optional[Union[str, re.Pattern]] = None, - glob_directive: str = "**/*", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ExcelAsset: # type: ignore[valid-type] - """Adds an Excel DataAsset to the present "PandasDBFSDatasource" object. - - Args: - name: The name of the Excel asset - batching_regex: regex pattern that matches Excel filenames that is used to label the batches - glob_directive: glob for selecting files in directory (defaults to `**/*`) or nested directories (e.g. `*/*/*.csv`) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_excel`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - - asset = ExcelAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = DBFSDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - batching_regex=batching_regex_pattern, - base_directory=self.base_directory, - glob_directive=glob_directive, - data_context_root_directory=self.data_context_root_directory, - file_path_template_map_fn=DBFSPath.convert_to_file_semantics_version, - ) - asset._test_connection_error_message = ( - DBFSDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - glob_directive=glob_directive, - base_directory=self.base_directory, + def _build_data_connector( + self, data_asset: _FilePathDataAsset, glob_directive: str = "**/*", **kwargs + ) -> None: + """Builds and attaches the `DBFSDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" ) - ) - return self._add_asset(asset=asset) - - @public_api - def add_json_asset( - self, - name: str, - batching_regex: Optional[Union[str, re.Pattern]] = None, - glob_directive: str = "**/*", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> JSONAsset: # type: ignore[valid-type] - """Adds a JSON DataAsset to the present 
"PandasDBFSDatasource" object. - - Args: - name: The name of the JSON asset - batching_regex: regex pattern that matches JSON filenames that is used to label the batches - glob_directive: glob for selecting files in directory (defaults to `**/*`) or nested directories (e.g. `*/*/*.csv`) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_json`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - - asset = JSONAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = DBFSDataConnector.build_data_connector( + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, - batching_regex=batching_regex_pattern, + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, base_directory=self.base_directory, glob_directive=glob_directive, data_context_root_directory=self.data_context_root_directory, file_path_template_map_fn=DBFSPath.convert_to_file_semantics_version, ) - asset._test_connection_error_message = ( - DBFSDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - glob_directive=glob_directive, - base_directory=self.base_directory, - ) - ) - return self._add_asset(asset=asset) - - @public_api - def add_parquet_asset( - self, - name: str, - batching_regex: Optional[Union[str, re.Pattern]] = None, - glob_directive: str = "**/*", - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ParquetAsset: # type: ignore[valid-type] - """Adds a Parquet DataAsset to the present "PandasDBFSDatasource" object. - - Args: - name: The name of the Parquet asset - batching_regex: regex pattern that matches Parquet filenames that is used to label the batches - glob_directive: glob for selecting files in directory (defaults to `**/*`) or nested directories (e.g. 
`*/*/*.csv`) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_parquet`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = ParquetAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = DBFSDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - batching_regex=batching_regex_pattern, - base_directory=self.base_directory, - glob_directive=glob_directive, - data_context_root_directory=self.data_context_root_directory, - file_path_template_map_fn=DBFSPath.convert_to_file_semantics_version, - ) - asset._test_connection_error_message = ( - DBFSDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, glob_directive=glob_directive, base_directory=self.base_directory, ) ) - return self._add_asset(asset=asset) - - # attr-defined issue - # https://github.com/python/mypy/issues/12472 - add_csv_asset.__signature__ = _merge_signatures(add_csv_asset, CSVAsset, exclude={"type"}) # type: ignore[attr-defined] - add_excel_asset.__signature__ = _merge_signatures(add_excel_asset, ExcelAsset, exclude={"type"}) # type: ignore[attr-defined] - add_json_asset.__signature__ = _merge_signatures(add_json_asset, JSONAsset, exclude={"type"}) # type: ignore[attr-defined] - add_parquet_asset.__signature__ = _merge_signatures(add_parquet_asset, ParquetAsset, exclude={"type"}) # type: ignore[attr-defined] diff --git a/great_expectations/datasource/fluent/pandas_dbfs_datasource.pyi b/great_expectations/datasource/fluent/pandas_dbfs_datasource.pyi index 06874ddb0ace..fe5fe72a3161 100644 --- a/great_expectations/datasource/fluent/pandas_dbfs_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_dbfs_datasource.pyi @@ -51,7 +51,7 @@ class PandasDBFSDatasource(PandasFilesystemDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., @@ -110,7 +110,7 @@ class PandasDBFSDatasource(PandasFilesystemDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., @@ -196,7 +196,7 @@ class PandasDBFSDatasource(PandasFilesystemDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., @@ -231,7 +231,7 @@ class PandasDBFSDatasource(PandasFilesystemDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: 
Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., diff --git a/great_expectations/datasource/fluent/pandas_filesystem_datasource.pyi b/great_expectations/datasource/fluent/pandas_filesystem_datasource.pyi index 6710e3d7bdfc..d405fcc472a8 100644 --- a/great_expectations/datasource/fluent/pandas_filesystem_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_filesystem_datasource.pyi @@ -54,7 +54,7 @@ class PandasFilesystemDatasource(_PandasFilePathDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., @@ -113,7 +113,7 @@ class PandasFilesystemDatasource(_PandasFilePathDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., @@ -199,7 +199,7 @@ class PandasFilesystemDatasource(_PandasFilePathDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., @@ -234,7 +234,7 @@ class PandasFilesystemDatasource(_PandasFilePathDatasource): self, name: str, *, - batching_regex: Optional[Union[str, re.Pattern]] = ..., + batching_regex: Union[re.Pattern, str] = ..., glob_directive: str = ..., order_by: Optional[SortersDefinition] = ..., batch_metadata: Optional[BatchMetadata] = ..., diff --git a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py index bb38b5dad2fb..7e4a05fb1ce5 100644 --- a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py @@ -1,8 +1,7 @@ from __future__ import annotations import logging -import re -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Type, Union import pydantic from typing_extensions import Literal @@ -19,13 +18,6 @@ from great_expectations.datasource.fluent.pandas_datasource import ( PandasDatasourceError, ) -from great_expectations.datasource.fluent.pandas_file_path_datasource import ( - CSVAsset, - ExcelAsset, - JSONAsset, - ParquetAsset, -) -from great_expectations.datasource.fluent.signatures import _merge_signatures if TYPE_CHECKING: from google.cloud.storage.client import Client as GoogleCloudStorageClient @@ -33,12 +25,10 @@ Credentials as GoogleServiceAccountCredentials, ) - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.file_path_data_asset import ( + _FilePathDataAsset, ) - logger = logging.getLogger(__name__) @@ -57,6 +47,11 @@ class PandasGoogleCloudStorageDatasourceError(PandasDatasourceError): class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[GoogleCloudStorageDataConnector] + ] = GoogleCloudStorageDataConnector + # instance attributes type: Literal["pandas_gcs"] = "pandas_gcs" @@ -130,223 +125,38 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in 
self.assets.values(): asset.test_connection() - def add_csv_asset( + def _build_data_connector( self, - name: str, - batching_regex: Union[re.Pattern, str], - prefix: str = "", - delimiter: str = "/", - max_results: int = 1000, - order_by: Optional[SortersDefinition] = None, + data_asset: _FilePathDataAsset, + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, **kwargs, - ) -> CSVAsset: # type: ignore[valid-type] - """Adds a CSV DataAsset to the present "PandasGoogleCloudStorageDatasource" object. - - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches CSV filenames that is used to label the batches - prefix (str): Google Cloud Storage object name prefix - delimiter (str): Google Cloud Storage object name delimiter - max_results (int): Google Cloud Storage max_results (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - - asset = CSVAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - - asset._data_connector = GoogleCloudStorageDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - gcs_client=self._get_gcs_client(), - batching_regex=batching_regex_pattern, - bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - max_results=max_results, - file_path_template_map_fn=GCSUrl.OBJECT_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - GoogleCloudStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, + ) -> None: + """Builds and attaches the `GoogleCloudStorageDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" ) - ) - return self._add_asset(asset=asset) - - def add_excel_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern], - prefix: str = "", - delimiter: str = "/", - max_results: int = 1000, - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ExcelAsset: # type: ignore[valid-type] - """Adds an Excel DataAsset to the present "PandasGoogleCloudStorageDatasource" object. 
- - Args: - name: The name of the Excel asset - batching_regex: regex pattern that matches Excel filenames that is used to label the batches - prefix (str): Google Cloud Storage object name prefix - delimiter (str): Google Cloud Storage object name delimiter - max_results (int): Google Cloud Storage max_results (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_excel`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = ExcelAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = GoogleCloudStorageDataConnector.build_data_connector( + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, + data_asset_name=data_asset.name, gcs_client=self._get_gcs_client(), - batching_regex=batching_regex_pattern, + batching_regex=data_asset.batching_regex, bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - max_results=max_results, + prefix=gcs_prefix, + delimiter=gcs_delimiter, + max_results=gcs_max_results, file_path_template_map_fn=GCSUrl.OBJECT_URL_TEMPLATE.format, ) - asset._test_connection_error_message = ( - GoogleCloudStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - ) - ) - return self._add_asset(asset=asset) - def add_json_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern], - prefix: str = "", - delimiter: str = "/", - max_results: int = 1000, - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> JSONAsset: # type: ignore[valid-type] - """Adds a JSON DataAsset to the present "PandasGoogleCloudStorageDatasource" object. 
- - Args: - name: The name of the JSON asset - batching_regex: regex pattern that matches JSON filenames that is used to label the batches - prefix (str): Google Cloud Storage object name prefix - delimiter (str): Google Cloud Storage object name delimiter - max_results (int): Google Cloud Storage max_results (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_json`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = JSONAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = GoogleCloudStorageDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - gcs_client=self._get_gcs_client(), - batching_regex=batching_regex_pattern, - bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - max_results=max_results, - file_path_template_map_fn=GCSUrl.OBJECT_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - GoogleCloudStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - ) - ) - return self._add_asset(asset=asset) - - def add_parquet_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern], - prefix: str = "", - delimiter: str = "/", - max_results: int = 1000, - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ParquetAsset: # type: ignore[valid-type] - """Adds a Parquet DataAsset to the present "PandasGoogleCloudStorageDatasource" object. 
- - Args: - name: The name of the Parquet asset - batching_regex: regex pattern that matches Parquet filenames that is used to label the batches - prefix (str): Google Cloud Storage object name prefix - delimiter (str): Google Cloud Storage object name delimiter - max_results (int): Google Cloud Storage max_results (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_parquet`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = ParquetAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = GoogleCloudStorageDataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - gcs_client=self._get_gcs_client(), - batching_regex=batching_regex_pattern, - bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - max_results=max_results, - file_path_template_map_fn=GCSUrl.OBJECT_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - GoogleCloudStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, + prefix=gcs_prefix, + delimiter=gcs_delimiter, ) ) - return self._add_asset(asset=asset) - - # attr-defined issue - # https://github.com/python/mypy/issues/12472 - add_csv_asset.__signature__ = _merge_signatures(add_csv_asset, CSVAsset, exclude={"type"}) # type: ignore[attr-defined] - add_excel_asset.__signature__ = _merge_signatures(add_excel_asset, ExcelAsset, exclude={"type"}) # type: ignore[attr-defined] - add_json_asset.__signature__ = _merge_signatures(add_json_asset, JSONAsset, exclude={"type"}) # type: ignore[attr-defined] - add_parquet_asset.__signature__ = _merge_signatures(add_parquet_asset, ParquetAsset, exclude={"type"}) # type: ignore[attr-defined] diff --git a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.pyi b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.pyi index 77d390e9ebb2..f62755a623a9 100644 --- a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.pyi @@ -16,7 +16,7 @@ from typing_extensions import Literal from great_expectations.core._docs_decorators import public_api as public_api from great_expectations.core.util import GCSUrl as GCSUrl -from great_expectations.datasource.fluent import Sorter, _PandasFilePathDatasource +from great_expectations.datasource.fluent import _PandasFilePathDatasource from great_expectations.datasource.fluent.data_asset.data_connector import ( FilesystemDataConnector as FilesystemDataConnector, ) @@ -88,10 +88,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_csv_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., - order_by: Optional[SortersDefinition] = ..., + *, 
batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, sep: typing.Union[str, None] = ..., delimiter: typing.Union[str, None] = ..., header: Union[int, Sequence[int], None, Literal["infer"]] = "infer", @@ -146,10 +149,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_excel_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., - order_by: Optional[SortersDefinition] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, sheet_name: typing.Union[str, int, None] = 0, header: Union[int, Sequence[int], None] = 0, names: typing.Union[typing.List[str], None] = ..., @@ -177,9 +183,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_feather_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, columns: Union[Sequence[Hashable], None] = ..., use_threads: bool = ..., storage_options: StorageOptions = ..., @@ -187,9 +197,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_hdf_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, key: typing.Any = ..., mode: str = "r", errors: str = "strict", @@ -204,9 +218,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_html_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, match: Union[str, typing.Pattern] = ".+", flavor: typing.Union[str, None] = ..., header: Union[int, Sequence[int], None] = ..., @@ -225,10 +243,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_json_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., - order_by: Optional[SortersDefinition] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, orient: typing.Union[str, None] = ..., dtype: typing.Union[dict, None] = ..., convert_axes: typing.Any = ..., @@ -248,19 +269,26 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_orc_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: 
Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, columns: typing.Union[typing.List[str], None] = ..., kwargs: typing.Union[dict, None] = ..., ) -> ORCAsset: ... def add_parquet_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., - order_by: Optional[SortersDefinition] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, engine: str = "auto", columns: typing.Union[typing.List[str], None] = ..., storage_options: StorageOptions = ..., @@ -270,18 +298,26 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_pickle_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, compression: CompressionOptions = "infer", storage_options: StorageOptions = ..., ) -> PickleAsset: ... def add_sas_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, format: typing.Union[str, None] = ..., index: Union[Hashable, None] = ..., encoding: typing.Union[str, None] = ..., @@ -292,18 +328,26 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_spss_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, usecols: typing.Union[int, str, typing.Sequence[int], None] = ..., convert_categoricals: bool = ..., ) -> SPSSAsset: ... 
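
With the GCS datasource, the per-asset `add_*_asset` bodies are gone; the shared asset factory (see the `sources.py` hunk further down) filters the caller's kwargs against `data_connector_type.asset_level_option_keys` and hands the matching keys to `_build_data_connector`, while everything else stays with the pandas reader. A simplified sketch of that split, with a dummy connector type standing in for `GoogleCloudStorageDataConnector`; this is illustrative only, not the literal implementation:

from typing import Any, Dict, Tuple, Type

def _split_connect_options(
    data_connector_type: Type, kwargs: Dict[str, Any]
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Separate connector-level connect options from reader kwargs (simplified)."""
    option_keys = data_connector_type.asset_level_option_keys
    connect_options = {k: v for k, v in kwargs.items() if k in option_keys}
    reader_kwargs = {k: v for k, v in kwargs.items() if k not in option_keys}
    return connect_options, reader_kwargs

class _DummyGCSConnector:
    # assumed to mirror the keys the real connector declares
    asset_level_option_keys = ("gcs_prefix", "gcs_delimiter", "gcs_max_results")

connect_options, reader_kwargs = _split_connect_options(
    _DummyGCSConnector, {"gcs_prefix": "data/", "gcs_max_results": 99, "sep": ","}
)
assert connect_options == {"gcs_prefix": "data/", "gcs_max_results": 99}
assert reader_kwargs == {"sep": ","}
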
def add_stata_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, convert_dates: bool = ..., convert_categoricals: bool = ..., index_col: typing.Union[str, None] = ..., @@ -319,9 +363,13 @@ class PandasGoogleCloudStorageDatasource(_PandasFilePathDatasource): def add_xml_asset( self, name: str, - batching_regex: typing.Pattern = ..., - order_by: typing.List[Sorter] = ..., + *, batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, xpath: str = "./*", namespaces: typing.Union[typing.Dict[str, str], None] = ..., elems_only: bool = ..., diff --git a/great_expectations/datasource/fluent/pandas_s3_datasource.py b/great_expectations/datasource/fluent/pandas_s3_datasource.py index 55378eebaa3d..83a7aef07aaf 100644 --- a/great_expectations/datasource/fluent/pandas_s3_datasource.py +++ b/great_expectations/datasource/fluent/pandas_s3_datasource.py @@ -1,8 +1,7 @@ from __future__ import annotations import logging -import re -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Type, Union import pydantic from typing_extensions import Literal @@ -19,20 +18,12 @@ from great_expectations.datasource.fluent.pandas_datasource import ( PandasDatasourceError, ) -from great_expectations.datasource.fluent.pandas_file_path_datasource import ( - CSVAsset, - ExcelAsset, - JSONAsset, - ParquetAsset, -) -from great_expectations.datasource.fluent.signatures import _merge_signatures if TYPE_CHECKING: from botocore.client import BaseClient - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.file_path_data_asset import ( + _FilePathDataAsset, ) @@ -53,6 +44,9 @@ class PandasS3DatasourceError(PandasDatasourceError): class PandasS3Datasource(_PandasFilePathDatasource): + # class attributes + data_connector_type: ClassVar[Type[S3DataConnector]] = S3DataConnector + # instance attributes type: Literal["pandas_s3"] = "pandas_s3" @@ -104,221 +98,44 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in self.assets.values(): asset.test_connection() - def add_csv_asset( + def _build_data_connector( self, - name: str, - batching_regex: Union[re.Pattern, str], - prefix: str = "", - delimiter: str = "/", - max_keys: int = 1000, - order_by: Optional[SortersDefinition] = None, + data_asset: _FilePathDataAsset, + s3_prefix: str = "", + s3_delimiter: str = "/", # TODO: delimiter conflicts with csv asset args + s3_max_keys: int = 1000, **kwargs, - ) -> CSVAsset: # type: ignore[valid-type] - """Adds a CSV DataAsset to the present "PandasS3Datasource" object. 
- - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches CSV filenames that is used to label the batches - prefix: S3 object name prefix - delimiter: S3 object name delimiter - max_keys: S3 max_keys (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_csv`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = S3DataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - s3_client=self._get_s3_client(), - batching_regex=batching_regex_pattern, - bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - max_keys=max_keys, - file_path_template_map_fn=S3Url.OBJECT_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - S3DataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, + ) -> None: + """Builds and attaches the `S3DataConnector` to the asset.""" + # TODO: use the `asset_options_type` for validation and defaults + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" ) - ) - return self._add_asset(asset=asset) - - def add_excel_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern], - prefix: str = "", - delimiter: str = "/", - max_keys: int = 1000, - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ExcelAsset: # type: ignore[valid-type] - """Adds an Excel DataAsset to the present "PandasS3Datasource" object. 
- Args: - name: The name of the Excel asset - batching_regex: regex pattern that matches Excel filenames that is used to label the batches - prefix: S3 object name prefix - delimiter: S3 object name delimiter - max_keys: S3 object name max_keys (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_excel`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = ExcelAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = S3DataConnector.build_data_connector( + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, + data_asset_name=data_asset.name, s3_client=self._get_s3_client(), - batching_regex=batching_regex_pattern, + batching_regex=data_asset.batching_regex, bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - max_keys=max_keys, + prefix=s3_prefix, + delimiter=s3_delimiter, + max_keys=s3_max_keys, file_path_template_map_fn=S3Url.OBJECT_URL_TEMPLATE.format, ) - asset._test_connection_error_message = ( - S3DataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - ) - ) - return self._add_asset(asset=asset) - def add_json_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern], - prefix: str = "", - delimiter: str = "/", - max_keys: int = 1000, - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> JSONAsset: # type: ignore[valid-type] - """Adds a JSON DataAsset to the present "PandasS3Datasource" object. 
- - Args: - name: The name of the JSON asset - batching_regex: regex pattern that matches JSON filenames that is used to label the batches - prefix: S3 object name prefix - delimiter: S3 object name delimiter - max_keys: S3 object name max_keys (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_json`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = JSONAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = S3DataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - s3_client=self._get_s3_client(), - batching_regex=batching_regex_pattern, - bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - max_keys=max_keys, - file_path_template_map_fn=S3Url.OBJECT_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - S3DataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, + prefix=s3_prefix, + delimiter=s3_delimiter, ) ) - return self._add_asset(asset=asset) - - def add_parquet_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern], - prefix: str = "", - delimiter: str = "/", - max_keys: int = 1000, - order_by: Optional[SortersDefinition] = None, - **kwargs, - ) -> ParquetAsset: # type: ignore[valid-type] - """Adds a Parquet DataAsset to the present "PandasS3Datasource" object. 
- Args: - name: The name of the Parquet asset - batching_regex: regex pattern that matches Parquet filenames that is used to label the batches - prefix: S3 object name prefix - delimiter: S3 object name delimiter - max_keys: S3 object name max_keys (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - kwargs: Extra keyword arguments should correspond to ``pandas.read_parquet`` keyword args - """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = ParquetAsset( - name=name, - batching_regex=batching_regex_pattern, - order_by=order_by_sorters, - **kwargs, - ) - asset._data_connector = S3DataConnector.build_data_connector( - datasource_name=self.name, - data_asset_name=name, - s3_client=self._get_s3_client(), - batching_regex=batching_regex_pattern, - bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - max_keys=max_keys, - file_path_template_map_fn=S3Url.OBJECT_URL_TEMPLATE.format, - ) - asset._test_connection_error_message = ( - S3DataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=batching_regex_pattern, - bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - ) + logger.info( + f"{self.data_connector_type.__name__} created for '{data_asset.name}'" ) - return self._add_asset(asset=asset) - - # attr-defined issue - # https://github.com/python/mypy/issues/12472 - add_csv_asset.__signature__ = _merge_signatures(add_csv_asset, CSVAsset, exclude={"type"}) # type: ignore[attr-defined] - add_excel_asset.__signature__ = _merge_signatures(add_excel_asset, ExcelAsset, exclude={"type"}) # type: ignore[attr-defined] - add_json_asset.__signature__ = _merge_signatures(add_json_asset, JSONAsset, exclude={"type"}) # type: ignore[attr-defined] - add_parquet_asset.__signature__ = _merge_signatures(add_parquet_asset, ParquetAsset, exclude={"type"}) # type: ignore[attr-defined] diff --git a/great_expectations/datasource/fluent/pandas_s3_datasource.pyi b/great_expectations/datasource/fluent/pandas_s3_datasource.pyi index 096232fc03d4..f3f11675eebd 100644 --- a/great_expectations/datasource/fluent/pandas_s3_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_s3_datasource.pyi @@ -1,3 +1,5 @@ +from __future__ import annotations + import re import typing from logging import Logger @@ -17,7 +19,8 @@ from typing_extensions import Literal from great_expectations.core._docs_decorators import public_api as public_api from great_expectations.core.util import S3Url as S3Url -from great_expectations.datasource.fluent import Sorter, _PandasFilePathDatasource +from great_expectations.datasource.fluent import _PandasFilePathDatasource +from great_expectations.datasource.fluent.config_str import ConfigStr from great_expectations.datasource.fluent.data_asset.data_connector import ( FilesystemDataConnector as FilesystemDataConnector, ) @@ -47,6 +50,7 @@ from great_expectations.datasource.fluent.pandas_file_path_datasource import ( ) if TYPE_CHECKING: + from great_expectations.datasource.fluent.dynamic_pandas import ( CompressionOptions, CSVEngine, @@ -54,6 +58,7 @@ if TYPE_CHECKING: IndexLabel, StorageOptions, ) + from great_expectations.datasource.fluent.interfaces import BatchMetadata from great_expectations.datasource.fluent.pandas_file_path_datasource import ( CSVAsset, ExcelAsset, @@ -78,14 +83,18 @@ class 
PandasS3DatasourceError(PandasDatasourceError): ... class PandasS3Datasource(_PandasFilePathDatasource): type: Literal["pandas_s3"] bucket: str - boto3_options: Dict[str, Any] + boto3_options: Dict[str, ConfigStr | Any] def test_connection(self, test_assets: bool = ...) -> None: ... def add_csv_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, sep: typing.Union[str, None] = ..., delimiter: typing.Union[str, None] = ..., header: Union[int, Sequence[int], None, Literal["infer"]] = "infer", @@ -140,9 +149,13 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_excel_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, sheet_name: typing.Union[str, int, None] = 0, header: Union[int, Sequence[int], None] = 0, names: typing.Union[typing.List[str], None] = ..., @@ -170,8 +183,13 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_feather_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, columns: Union[Sequence[Hashable], None] = ..., use_threads: bool = ..., storage_options: StorageOptions = ..., @@ -179,8 +197,13 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_hdf_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, key: typing.Any = ..., mode: str = "r", errors: str = "strict", @@ -195,8 +218,13 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_html_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, match: Union[str, typing.Pattern] = ".+", flavor: typing.Union[str, None] = ..., header: Union[int, Sequence[int], None] = ..., @@ -215,9 +243,13 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_json_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, orient: typing.Union[str, None] = ..., dtype: typing.Union[dict, None] = ..., convert_axes: typing.Any = ..., @@ -237,17 +269,26 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_orc_asset( self, name: str, - order_by: 
typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, columns: typing.Union[typing.List[str], None] = ..., kwargs: typing.Union[dict, None] = ..., ) -> ORCAsset: ... def add_parquet_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = ..., - glob_directive: str = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, engine: str = "auto", columns: typing.Union[typing.List[str], None] = ..., storage_options: StorageOptions = ..., @@ -257,16 +298,26 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_pickle_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, compression: CompressionOptions = "infer", storage_options: StorageOptions = ..., ) -> PickleAsset: ... def add_sas_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, format: typing.Union[str, None] = ..., index: Union[Hashable, None] = ..., encoding: typing.Union[str, None] = ..., @@ -277,16 +328,26 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_spss_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, usecols: typing.Union[int, str, typing.Sequence[int], None] = ..., convert_categoricals: bool = ..., ) -> SPSSAsset: ... 
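
The S3 variant follows the same pattern: `s3_prefix`, `s3_delimiter`, and `s3_max_keys` become keyword-only connect options (the prefix avoids the clash with the CSV reader's own `delimiter`), and `_build_data_connector` raises a TypeError on anything it does not recognize rather than silently dropping it. A usage sketch mirroring the `my_pandas_s3_ds` fixture added to `great_expectations.yml` later in this patch; the `add_pandas_s3` factory call is an assumption for illustration:

import great_expectations as gx

context = gx.get_context()

# Hypothetical datasource creation (assumed factory name); bucket matches the YAML fixture.
datasource = context.sources.add_pandas_s3(name="my_pandas_s3_ds", bucket="test_bucket")

asset = datasource.add_csv_asset(
    name="my_csv_asset_w_custom_connect_options",
    batching_regex=r".*\.csv",
    s3_prefix="",        # connector option: S3 object name prefix
    s3_delimiter="/",    # connector option: S3 object name delimiter
    s3_max_keys=99,      # connector option: S3 max_keys
    delimiter=",",       # pandas.read_csv kwarg; distinct from s3_delimiter
)
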
def add_stata_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, convert_dates: bool = ..., convert_categoricals: bool = ..., index_col: typing.Union[str, None] = ..., @@ -302,8 +363,13 @@ class PandasS3Datasource(_PandasFilePathDatasource): def add_xml_asset( self, name: str, - order_by: typing.List[Sorter] = ..., - batching_regex: typing.Pattern = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: Union[re.Pattern, str] = ..., + order_by: Optional[SortersDefinition] = ..., + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, xpath: str = "./*", namespaces: typing.Union[typing.Dict[str, str], None] = ..., elems_only: bool = ..., diff --git a/great_expectations/datasource/fluent/sources.py b/great_expectations/datasource/fluent/sources.py index 10231352678e..bf83a1528b9c 100644 --- a/great_expectations/datasource/fluent/sources.py +++ b/great_expectations/datasource/fluent/sources.py @@ -282,12 +282,18 @@ def _add_asset_factory( # asset level attributes needed by the data_connector # push them to `connect_options` field if self.data_connector_type: + logger.info( + f"'{self.name}' {type(self).__name__} uses {self.data_connector_type.__name__}" + ) connect_options = { k: v for (k, v) in kwargs.items() - if k in self.data_connector_type.asset_level_option_keys and v + if k in self.data_connector_type.asset_level_option_keys } if connect_options: + logger.info( + f"{self.data_connector_type.__name__} connect_options provided -> {list(connect_options.keys())}" + ) for k in connect_options: # TODO: avoid this extra loop kwargs.pop(k) kwargs["connect_options"] = connect_options diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py index 0b84270a3b37..c1bac87764c2 100644 --- a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py @@ -141,13 +141,10 @@ def add_csv_asset( delimiter: Microsoft Azure Blob Storage object name delimiter order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( name=name, - batching_regex=batching_regex_pattern, + batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern header=header, inferSchema=infer_schema, order_by=order_by_sorters, @@ -156,7 +153,7 @@ def add_csv_asset( datasource_name=self.name, data_asset_name=name, azure_client=self._get_azure_client(), - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, account_name=self._account_name, container=container, name_starts_with=name_starts_with, @@ -166,7 +163,7 @@ def add_csv_asset( asset._test_connection_error_message = ( AzureBlobStorageDataConnector.build_test_connection_error_message( data_asset_name=name, - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, account_name=self._account_name, container=container, 
name_starts_with=name_starts_with, diff --git a/great_expectations/datasource/fluent/spark_dbfs_datasource.py b/great_expectations/datasource/fluent/spark_dbfs_datasource.py index ecaf3d4c562c..03afbb60b26f 100644 --- a/great_expectations/datasource/fluent/spark_dbfs_datasource.py +++ b/great_expectations/datasource/fluent/spark_dbfs_datasource.py @@ -53,13 +53,10 @@ def add_csv_asset( infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( name=name, - batching_regex=batching_regex_pattern, + batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern header=header, inferSchema=infer_schema, order_by=order_by_sorters, @@ -67,7 +64,7 @@ def add_csv_asset( asset._data_connector = DBFSDataConnector.build_data_connector( datasource_name=self.name, data_asset_name=name, - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, base_directory=self.base_directory, glob_directive=glob_directive, data_context_root_directory=self.data_context_root_directory, @@ -76,7 +73,7 @@ def add_csv_asset( asset._test_connection_error_message = ( DBFSDataConnector.build_test_connection_error_message( data_asset_name=name, - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, glob_directive=glob_directive, base_directory=self.base_directory, ) diff --git a/great_expectations/datasource/fluent/spark_filesystem_datasource.py b/great_expectations/datasource/fluent/spark_filesystem_datasource.py index 9ca85cbf0bbe..edd88978524d 100644 --- a/great_expectations/datasource/fluent/spark_filesystem_datasource.py +++ b/great_expectations/datasource/fluent/spark_filesystem_datasource.py @@ -8,6 +8,7 @@ from typing_extensions import Literal from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent.constants import MATCH_ALL_PATTERN from great_expectations.datasource.fluent.data_asset.data_connector import ( FilesystemDataConnector, ) @@ -55,7 +56,7 @@ def test_connection(self, test_assets: bool = True) -> None: def add_csv_asset( self, name: str, - batching_regex: Optional[Union[re.Pattern, str]] = None, + batching_regex: Union[str, re.Pattern] = MATCH_ALL_PATTERN, glob_directive: str = "**/*", header: bool = False, infer_schema: bool = False, @@ -71,13 +72,10 @@ def add_csv_asset( infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( name=name, - batching_regex=batching_regex_pattern, + batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern header=header, inferSchema=infer_schema, order_by=order_by_sorters, @@ -85,7 +83,7 @@ def add_csv_asset( asset._data_connector = FilesystemDataConnector.build_data_connector( datasource_name=self.name, data_asset_name=name, - batching_regex=batching_regex_pattern, + 
batching_regex=asset.batching_regex, base_directory=self.base_directory, glob_directive=glob_directive, data_context_root_directory=self.data_context_root_directory, @@ -93,7 +91,7 @@ def add_csv_asset( asset._test_connection_error_message = ( FilesystemDataConnector.build_test_connection_error_message( data_asset_name=name, - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, glob_directive=glob_directive, base_directory=self.base_directory, ) diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py index f41d1f311dab..357782fa736b 100644 --- a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py @@ -149,13 +149,10 @@ def add_csv_asset( max_results (int): Google Cloud Storage max_results (default is 1000) order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( name=name, - batching_regex=batching_regex_pattern, + batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern header=header, inferSchema=infer_schema, order_by=order_by_sorters, @@ -164,7 +161,7 @@ def add_csv_asset( datasource_name=self.name, data_asset_name=name, gcs_client=self._get_gcs_client(), - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, bucket_or_name=self.bucket_or_name, prefix=prefix, delimiter=delimiter, @@ -174,7 +171,7 @@ def add_csv_asset( asset._test_connection_error_message = ( GoogleCloudStorageDataConnector.build_test_connection_error_message( data_asset_name=name, - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, bucket_or_name=self.bucket_or_name, prefix=prefix, delimiter=delimiter, diff --git a/great_expectations/datasource/fluent/spark_s3_datasource.py b/great_expectations/datasource/fluent/spark_s3_datasource.py index 4b229e71df09..316942cbfbbf 100644 --- a/great_expectations/datasource/fluent/spark_s3_datasource.py +++ b/great_expectations/datasource/fluent/spark_s3_datasource.py @@ -12,6 +12,7 @@ from great_expectations.datasource.fluent.config_str import ( ConfigStr, # noqa: TCH001 # needed at runtime ) +from great_expectations.datasource.fluent.constants import MATCH_ALL_PATTERN from great_expectations.datasource.fluent.data_asset.data_connector import ( S3DataConnector, ) @@ -103,7 +104,7 @@ def test_connection(self, test_assets: bool = True) -> None: def add_csv_asset( self, name: str, - batching_regex: Optional[Union[str, re.Pattern]] = None, + batching_regex: Union[str, re.Pattern] = MATCH_ALL_PATTERN, header: bool = False, infer_schema: bool = False, prefix: str = "", @@ -123,13 +124,10 @@ def add_csv_asset( max_keys: S3 max_keys (default is 1000) order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default """ - batching_regex_pattern: re.Pattern = self.parse_batching_regex_string( - batching_regex=batching_regex - ) order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( name=name, - batching_regex=batching_regex_pattern, + batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str 
to Pattern header=header, inferSchema=infer_schema, order_by=order_by_sorters, @@ -138,7 +136,7 @@ def add_csv_asset( datasource_name=self.name, data_asset_name=name, s3_client=self._get_s3_client(), - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, bucket=self.bucket, prefix=prefix, delimiter=delimiter, @@ -148,7 +146,7 @@ def add_csv_asset( asset._test_connection_error_message = ( S3DataConnector.build_test_connection_error_message( data_asset_name=name, - batching_regex=batching_regex_pattern, + batching_regex=asset.batching_regex, bucket=self.bucket, prefix=prefix, delimiter=delimiter, diff --git a/tests/datasource/fluent/conftest.py b/tests/datasource/fluent/conftest.py index 29a8d473c5d5..f9fd16f2803c 100644 --- a/tests/datasource/fluent/conftest.py +++ b/tests/datasource/fluent/conftest.py @@ -2,6 +2,7 @@ import logging import pathlib +from pprint import pformat as pf from typing import Any, Callable, Dict, Generator, List, Optional, Type, Union import pytest @@ -14,6 +15,10 @@ SqlAlchemyDatasourceBatchSpec, ) from great_expectations.data_context import FileDataContext +from great_expectations.datasource.fluent import ( + PandasAzureBlobStorageDatasource, + PandasGoogleCloudStorageDatasource, +) from great_expectations.datasource.fluent.interfaces import Datasource from great_expectations.datasource.fluent.sources import _SourceFactories from great_expectations.execution_engine import ( @@ -132,3 +137,54 @@ def fluent_gx_config_yml() -> pathlib.Path: @pytest.fixture(scope="session") def fluent_gx_config_yml_str(fluent_gx_config_yml: pathlib.Path) -> str: return fluent_gx_config_yml.read_text() + + +class _TestClientDummy: + pass + + +_CLIENT_DUMMY = _TestClientDummy() + + +def _get_test_client_dummy(*args, **kwargs) -> _TestClientDummy: + logger.debug( + f"_get_test_client_dummy() called with \nargs: {pf(args)}\nkwargs: {pf(kwargs)}" + ) + return _CLIENT_DUMMY + + +@pytest.fixture +def gcs_get_client_dummy(monkeypatch: MonkeyPatch): + monkeypatch.setattr( + PandasGoogleCloudStorageDatasource, + "_get_gcs_client", + _get_test_client_dummy, + raising=True, + ) + + +@pytest.fixture +def azure_get_client_dummy(monkeypatch: MonkeyPatch): + monkeypatch.setattr( + PandasAzureBlobStorageDatasource, + "_get_azure_client", + _get_test_client_dummy, + raising=True, + ) + + +@pytest.fixture +def cloud_storage_get_client_doubles( + gcs_get_client_dummy, + azure_get_client_dummy, +): + """ + Patches Datasources that rely on a private _get_*_client() method to return test doubles instead. 
+ + gcs + azure + """ + # TODO: patch Spark datasources as needed + logger.warning( + "Patching cloud storage _get_*_client() methods to return client test doubles" + ) diff --git a/tests/datasource/fluent/great_expectations.yml b/tests/datasource/fluent/great_expectations.yml index a47dc6abfdab..a8b113c6f379 100644 --- a/tests/datasource/fluent/great_expectations.yml +++ b/tests/datasource/fluent/great_expectations.yml @@ -55,3 +55,46 @@ fluent_datasources: my_csv_asset_with_default_connect_options: type: csv sep: "," + my_pandas_s3_ds: + type: pandas_s3 + bucket: "test_bucket" + assets: + my_csv_asset_w_custom_connect_options: + type: csv + delimiter: "," + connect_options: + s3_delimiter: "/" + s3_prefix: "" + s3_max_keys: 99 + my_csv_asset_with_default_connect_options: + type: csv + delimiter: "," + my_pandas_gcs_ds: + type: pandas_gcs + bucket_or_name: "test_bucket" + assets: + my_csv_asset_w_custom_connect_options: + type: csv + delimiter: "," + connect_options: + gcs_delimiter: "/" + gcs_prefix: "" + gcs_max_results: 99 + my_csv_asset_with_default_connect_options: + type: csv + delimiter: "," + my_pandas_abs_ds: + type: pandas_abs + assets: + my_csv_asset_w_custom_connect_options: + type: csv + delimiter: "," + connect_options: + abs_container: "test" + abs_name_starts_with: "" + abs_delimiter: "/" + my_csv_asset_with_default_connect_options: + type: csv + delimiter: "," + connect_options: + abs_container: "this_is_always_required" diff --git a/tests/datasource/fluent/integration/test_integration_datasource.py b/tests/datasource/fluent/integration/test_integration_datasource.py index 018a8c5538dd..506b923aa228 100644 --- a/tests/datasource/fluent/integration/test_integration_datasource.py +++ b/tests/datasource/fluent/integration/test_integration_datasource.py @@ -10,6 +10,7 @@ PandasFilesystemDatasource, SparkFilesystemDatasource, ) +from great_expectations.datasource.fluent.constants import MATCH_ALL_PATTERN from great_expectations.datasource.fluent.interfaces import ( BatchRequest, DataAsset, @@ -168,7 +169,7 @@ def test_sql_query_data_asset(empty_data_context): "..", "..", "..", "test_sets", "taxi_yellow_tripdata_samples" ) ), - None, + MATCH_ALL_PATTERN, False, id="default regex", ), @@ -177,7 +178,7 @@ def test_sql_query_data_asset(empty_data_context): def test_filesystem_data_asset_batching_regex( filesystem_datasource: PandasFilesystemDatasource | SparkFilesystemDatasource, base_directory: pathlib.Path, - batching_regex: str | None, + batching_regex: str, raises_test_connection_error: bool, ): filesystem_datasource.base_directory = base_directory diff --git a/tests/datasource/fluent/test_config.py b/tests/datasource/fluent/test_config.py index 620a7d504b2a..a24050f59b10 100644 --- a/tests/datasource/fluent/test_config.py +++ b/tests/datasource/fluent/test_config.py @@ -784,6 +784,7 @@ def test_config_substitution_retains_original_value_on_save( monkeypatch: pytest.MonkeyPatch, file_dc_config_file_with_substitutions: pathlib.Path, sqlite_database_path: pathlib.Path, + cloud_storage_get_client_doubles, ): original: dict = cast( dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) @@ -830,6 +831,7 @@ def test_config_substitution_retains_original_value_on_save_w_run_time_mods( monkeypatch: pytest.MonkeyPatch, sqlite_database_path: pathlib.Path, file_dc_config_file_with_substitutions: pathlib.Path, + cloud_storage_get_client_doubles, ): # inject env variable my_conn_str = f"sqlite:///{sqlite_database_path}" diff --git 
a/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py b/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py index 1e69382824c9..4f25e8a05589 100644 --- a/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py @@ -118,7 +118,7 @@ def csv_asset( asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) return asset @@ -249,7 +249,7 @@ def test_add_csv_asset_to_datasource( asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", - container="my_container", + abs_container="my_container", ) assert asset.name == "csv_asset" assert asset.batching_regex.match("random string") is None @@ -301,7 +301,7 @@ def test_csv_asset_with_batching_regex_unnamed_parameters( asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", - container="my_container", + abs_container="my_container", ) options = asset.batch_request_options assert options == ( @@ -331,7 +331,7 @@ def test_csv_asset_with_batching_regex_named_parameters( asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) options = asset.batch_request_options assert options == ( @@ -361,7 +361,7 @@ def test_csv_asset_with_some_batching_regex_named_parameters( asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) options = asset.batch_request_options assert options == ( @@ -391,7 +391,7 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) with pytest.raises(ge_exceptions.InvalidBatchRequestError): # price is an int which will raise an error @@ -426,7 +426,7 @@ def instantiate_azure_client_spy(self) -> None: asset = pandas_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) request = asset.build_batch_request( diff --git a/tests/datasource/fluent/test_pandas_s3_datasource.py b/tests/datasource/fluent/test_pandas_s3_datasource.py index e98f7963c73b..bed6a0924673 100644 --- a/tests/datasource/fluent/test_pandas_s3_datasource.py +++ b/tests/datasource/fluent/test_pandas_s3_datasource.py @@ -3,11 +3,14 @@ import logging import os import re +from pprint import pformat as pf from typing import TYPE_CHECKING, List, cast import pandas as pd +import pydantic import pytest from moto import mock_s3 +from pytest import param import great_expectations.exceptions as ge_exceptions from great_expectations.core.util import S3Url @@ -160,6 +163,110 @@ def test_construct_csv_asset_directly(): assert m1 is not None +@pytest.mark.unit +def test_invalid_connect_options(pandas_s3_datasource: PandasS3Datasource): + with pytest.raises(pydantic.ValidationError) as exc_info: + pandas_s3_datasource.add_csv_asset( # type: ignore[call-arg] + name="csv_asset", + batching_regex=r"(.+)_(.+)_(\d{4})\.csv", + extra_field="invalid", + ) + + error_dicts = exc_info.value.errors() + print(pf(error_dicts)) + assert [ + { + "loc": ("extra_field",), + 
"msg": "extra fields not permitted", + "type": "value_error.extra", + } + ] == error_dicts + + +@pytest.mark.unit +@pytest.mark.parametrize( + ["connect_option_kwargs", "expected_error_dicts"], + [ + param( + {"my_prefix": "/"}, + [ + { + "loc": ("my_prefix",), + "msg": "extra fields not permitted", + "type": "value_error.extra", + } + ], + id="extra_fields", + ), + param( + {"s3_delimiter": ["/only", "/one_delimiter"]}, + [ + { + "loc": ("s3_delimiter",), + "msg": "str type expected", + "type": "type_error.str", + }, + ], + id="wrong_type", + ), + ], +) +def test_invalid_connect_options_value( + pandas_s3_datasource: PandasS3Datasource, + connect_option_kwargs: dict, + expected_error_dicts: list[dict], +): + with pytest.raises(pydantic.ValidationError) as exc_info: + pandas_s3_datasource.add_csv_asset( + name="csv_asset", + batching_regex=r"(.+)_(.+)_(\d{4})\.csv", + **connect_option_kwargs, + ) + + print(f"Exception raised:\n\t{repr(exc_info.value)}") + error_dicts = exc_info.value.errors() + print(pf(error_dicts)) + assert expected_error_dicts == error_dicts + + +@pytest.mark.unit +@pytest.mark.parametrize( + "connect_options", + [ + param({}, id="default connect options"), + param({"s3_prefix": ""}, id="prefix ''"), + param({"s3_delimiter": "/"}, id="s3_delimiter '/'"), + # param({"s3_prefix": "non_default"}, id="s3_prefix 'non_default'"), # TODO: what prefix should I test? + param( + {"s3_prefix": "", "s3_delimiter": "/", "s3_max_keys": 20}, + id="all options", + ), + ], +) +def test_asset_connect_options_in_repr( + pandas_s3_datasource: PandasS3Datasource, connect_options: dict +): + print(f"connect_options\n{pf(connect_options)}\n") + + asset = pandas_s3_datasource.add_csv_asset( + name="csv_asset", + batching_regex=r"(.+)_(.+)_(\d{4})\.csv", + **connect_options, + ) + + print(f"__repr__\n{repr(asset)}\n") + asset_as_str = str(asset) + print(f"__str__\n{asset_as_str}\n") + + for option_name, option_value in connect_options.items(): + assert option_name in asset_as_str + assert str(option_value) in asset_as_str + if not connect_options: + # if no connect options are provided the defaults should be used and should not + # be part of any serialization. 
str(asset) == asset.yaml() + assert "connect_options" not in asset_as_str + + @pytest.mark.integration def test_csv_asset_with_batching_regex_unnamed_parameters( pandas_s3_datasource: PandasS3Datasource, diff --git a/tests/datasource/fluent/test_spark_s3_datasource.py b/tests/datasource/fluent/test_spark_s3_datasource.py index 1af7694f7f4b..4f42864538a5 100644 --- a/tests/datasource/fluent/test_spark_s3_datasource.py +++ b/tests/datasource/fluent/test_spark_s3_datasource.py @@ -19,7 +19,7 @@ _FilePathDataAsset, ) from great_expectations.datasource.fluent.interfaces import TestConnectionError -from great_expectations.datasource.fluent.spark_s3_datasource import CSVAsset +from great_expectations.datasource.fluent.spark_file_path_datasource import CSVAsset if TYPE_CHECKING: from botocore.client import BaseClient diff --git a/tests/datasource/fluent/test_viral_snippets.py b/tests/datasource/fluent/test_viral_snippets.py index 42008512454e..28f1f123d283 100644 --- a/tests/datasource/fluent/test_viral_snippets.py +++ b/tests/datasource/fluent/test_viral_snippets.py @@ -15,7 +15,9 @@ from great_expectations import get_context from great_expectations.data_context import FileDataContext from great_expectations.datasource.fluent.config import GxConfig -from great_expectations.datasource.fluent.interfaces import Datasource +from great_expectations.datasource.fluent.interfaces import ( + Datasource, +) logger = logging.getLogger(__file__) @@ -67,7 +69,10 @@ def fluent_yaml_config_file( @pytest.fixture @functools.lru_cache(maxsize=1) -def fluent_file_context(fluent_yaml_config_file: pathlib.Path) -> FileDataContext: +def fluent_file_context( + cloud_storage_get_client_doubles, + fluent_yaml_config_file: pathlib.Path, +) -> FileDataContext: context = get_context( context_root_dir=fluent_yaml_config_file.parent, cloud_mode=False ) @@ -76,7 +81,9 @@ def fluent_file_context(fluent_yaml_config_file: pathlib.Path) -> FileDataContex def test_load_an_existing_config( - fluent_yaml_config_file: pathlib.Path, fluent_only_config: GxConfig + cloud_storage_get_client_doubles, + fluent_yaml_config_file: pathlib.Path, + fluent_only_config: GxConfig, ): context = get_context( context_root_dir=fluent_yaml_config_file.parent, cloud_mode=False @@ -85,7 +92,10 @@ def test_load_an_existing_config( assert context.fluent_config == fluent_only_config -def test_serialize_fluent_config(fluent_file_context: FileDataContext): +def test_serialize_fluent_config( + cloud_storage_get_client_doubles, + fluent_file_context: FileDataContext, +): dumped_yaml: str = fluent_file_context.fluent_config.yaml() print(f" Dumped Config\n\n{dumped_yaml}\n") @@ -99,19 +109,23 @@ def test_serialize_fluent_config(fluent_file_context: FileDataContext): def test_data_connectors_are_built_on_config_load(fluent_file_context: FileDataContext): + """ + Ensure that all Datasources that require data_connectors have their data_connectors + created when loaded from config. 
+ """ dc_datasources: dict[str, list[str]] = defaultdict(list) for datasource in fluent_file_context.fluent_datasources.values(): if datasource.data_connector_type: print(f"class: {datasource.__class__.__name__}") print(f"type: {datasource.type}") + print(f"data_connector: {datasource.data_connector_type.__name__}") print(f"name: {datasource.name}", end="\n\n") dc_datasources[datasource.type].append(datasource.name) for asset in datasource.assets.values(): - asset.test_connection() - print(f"✅ '{asset.name}' connected with {type(asset._data_connector)}") + assert isinstance(asset._data_connector, datasource.data_connector_type) print() print(f"Datasources with DataConnectors\n{pf(dict(dc_datasources))}") From 6788ab342d873867bc070000d6941933ac1383c0 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Mon, 3 Apr 2023 10:36:26 -0400 Subject: [PATCH 22/96] [DOCS] Update GX version in `_data.jsx` component (#7549) --- docs/docusaurus/docs/components/_data.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docs/components/_data.jsx b/docs/docusaurus/docs/components/_data.jsx index 2053fffb64d3..e7cf96ab976d 100644 --- a/docs/docusaurus/docs/components/_data.jsx +++ b/docs/docusaurus/docs/components/_data.jsx @@ -1,5 +1,5 @@ export default { - release_version: 'great_expectations, version 0.15.50', + release_version: 'great_expectations, version 0.16.5', min_python: '3.7', max_python: '3.10' } From f7655ccf9d2752842eb700dc25cb6df1b9ef7c54 Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Mon, 3 Apr 2023 12:47:58 -0500 Subject: [PATCH 23/96] [DOCS] Adds guides on using Ephemeral Data Contexts and updates Quickstart Next Steps (#7500) --- .../_connecting_to_data_fluently.md | 55 +++++++++ .../_if_you_still_need_to_setup_gx.md | 5 + .../_admonition_convert_to_file_context.md | 7 ++ ...data_context_initialize_instatiate_save.md | 34 ++++++ .../setup/link_lists/_setup_and_install_gx.md | 57 +++++++++ .../setup/link_lists/_setup_configurations.md | 77 +++++++++++++ ...nize_batches_in_a_file_based_data_asset.md | 4 +- ...ta_context_to_a_filesystem_data_context.md | 109 ++++++++++++++++++ ...y_instantiate_an_ephemeral_data_context.md | 93 +++++++++++++++ ...w_to_quickly_instantiate_a_data_context.md | 8 ++ .../docs/tutorials/quickstart/quickstart.md | 36 ++---- docs/docusaurus/sidebars.js | 4 +- 12 files changed, 457 insertions(+), 32 deletions(-) create mode 100644 docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md create mode 100644 docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md create mode 100644 docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md create mode 100644 docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md create mode 100644 docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_gx.md create mode 100644 docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md create mode 100644 docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md create mode 100644 docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md diff --git a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md 
b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md new file mode 100644 index 000000000000..74df936f01ba --- /dev/null +++ b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md @@ -0,0 +1,55 @@ +
    + + +#### Connecting GX to filesystem source data + + + +**Local Filesystems** +- [How to quickly connect to a single file using Pandas](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_quickly_connect_to_a_single_file_with_pandas.md) +- [How to connect to one or more files using Pandas](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_pandas.md) +- [How to connect to one or more files using Spark](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark.md) + +**Google Cloud Storage** +- [How to connect to data on GCS using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md) +- [How to connect to data on GCS using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md) + +**Azure Blob Storage** +- [How to connect to data on Azure Blob Storage using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md) +- [How to connect to data on Azure Blob Storage using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md) + +**Amazon Web Services** +- [How to connect to data on S3 using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md) +- [How to connect to data on S3 using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md) + +
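For reviewers cross-referencing these guides with the code changes earlier in this patch series, here is a minimal, hedged sketch of the filesystem case; the datasource name, folder, and regex are illustrative placeholders rather than values taken from the patch:

```python
import great_expectations as gx

context = gx.get_context()

# Fluent-style filesystem Datasource pointing at a local folder of CSV files.
datasource = context.sources.add_pandas_filesystem(
    name="my_filesystem_datasource",
    base_directory="./data",
)

# batching_regex may be passed as a plain string; per the change above, pydantic
# compiles it to a re.Pattern, and a match-all pattern is used when it is omitted.
asset = datasource.add_csv_asset(
    name="taxi_csv",
    batching_regex=r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv",
)
```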
    + +
    + + + +#### Connecting GX to in-memory source data + + + +- [How to connect to in-memory data using Pandas](docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas.md) + +
    + + +
    + + + +#### Connecting GX to SQL source data + + + +**General SQL Datasources** +- [How to connect to SQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md) + +**Specific SQL dialects** +- [How to connect to PostgreSQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md) +- [How to connect to SQLite data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md) + +
    \ No newline at end of file diff --git a/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md b/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md new file mode 100644 index 000000000000..93d1ffb03382 --- /dev/null +++ b/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md @@ -0,0 +1,5 @@ +Please reference the appropriate one of these guides: +- [How to install GX locally](docs/guides/setup/installation/local.md) +- [How to set up GX to work with data on AWS S3](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3.md) +- [How to set up GX to work with data in Azure Blob Storage](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs.md) +- [How to set up GX to work with data on GCS](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs.md) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md b/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md new file mode 100644 index 000000000000..c908817420ec --- /dev/null +++ b/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md @@ -0,0 +1,7 @@ +An Ephemeral Data Context is an in-memory Data Context that is not intended to persist beyond the current Python session. However, if you decide that you would like to save its contents for future use you can do so by converting it to a Filesystem Data Context: + +```python title="Python code" +context = context.convert_to_file_context() +``` + +This method will initialize a Filesystem Data Context in the current working directory of the Python process that contains the Ephemeral Data Context. For more detailed explanation of this method, please see our guide on [how to convert an ephemeral data context to a filesystem data context](docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md b/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md new file mode 100644 index 000000000000..cd65cab1139a --- /dev/null +++ b/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md @@ -0,0 +1,34 @@ +
    + + +#### Getting a Data Context + + + +**Quickstart Data Context** +- [How to quickly instantiate a Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md) + +**Filesystem Data Contexts** +- [How to initialize a new Data Context with the CLI](docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli.md) +- [How to initialize a filesystem Data Context in Python](docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md) +- [How to instantiate a specific Filesystem Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_instantiate_a_specific_filesystem_data_context.md) + +**In-memory Data Contexts** +- [How to explicitly instantiate an Ephemeral Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md) +- [How to instantiate a Data Context without a yml file](docs/guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file.md) + +
    + +
    + + +#### Saving a Data Context + + + +Filesystem and Cloud Data Contexts automatically save any changes as they are made. The only type of Data Context that does not immediately save changes in a persisting way is the Ephemeral Data Context, which is an in-memory Data Context that will not persist beyond the current Python session. However, an Ephemeral Data Context can be converted to a Filesystem Data Context if you wish to save its contents for future use. + +For more information, please see: +- [How to convert an Ephemeral Data Context to a Filesystem Data Context](docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md) + +
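A small editorial sketch of the lifecycle this component describes, assuming no Filesystem Data Context has been initialized yet in the working directory:

```python
import great_expectations as gx

context = gx.get_context()  # may fall back to an Ephemeral Data Context

# ... add Datasources, Expectation Suites, and Checkpoints here; with an
# Ephemeral Data Context these exist only in memory for this Python session ...

# Persist everything by converting to a Filesystem Data Context.
context = context.convert_to_file_context()
```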
    \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_gx.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_gx.md new file mode 100644 index 000000000000..c8af7223a137 --- /dev/null +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_gx.md @@ -0,0 +1,57 @@ +import SetupAndInstallForSqlData from '/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md' +import SetupAndInstallForFilesystemData from '/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md' +import SetupAndInstallForHostedData from '/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md' +import SetupAndInstallForCloudData from '/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md' + +
    + + +#### Setup and installation of GX for local filesystems + + + +For more details on installing GX for use with local filesystems, please see: + + + +
    + + +
    + + +#### Setup and installation of GX for cloud storage systems + + + +For guides on installing GX for use with cloud storage systems, please reference: + + + +
    + +
    + + +#### Setup and installation of GX for SQL databases + + + +For information on installing GX for use with SQL databases, see: + + + +
    + +
    + + +#### Setup and installation of GX for hosted data systems + + + +For instructions on installing GX for use with hosted data systems, read: + + + +
    \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md new file mode 100644 index 000000000000..8af0c338efec --- /dev/null +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md @@ -0,0 +1,77 @@ +
    + + +#### Configuring credentials + + + +While some source data systems provide their own means of configuring credentials through environment variables, you can also configure GX to populate credentials from either a YAML file or a secret manager. For more information, please see: +- [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) + +
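As a hedged illustration of the substitution mechanism this component links to, using the fluent API touched elsewhere in this patch series; the `MY_PG_CONN` variable and datasource name are placeholders, with the secret value expected to come from an environment variable or `config_variables.yml`:

```python
import great_expectations as gx

context = gx.get_context()

# "${MY_PG_CONN}" is resolved through config substitution at runtime, so the
# literal credential string is never written into the saved configuration.
datasource = context.sources.add_postgres(
    name="my_pg_datasource",
    connection_string="${MY_PG_CONN}",
)
```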
    + +
    + + +#### Configuring Expectation Stores + + + +- [How to configure an Expectation Store to use Amazon S3](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_amazon_s3.md) +- [How to configure an Expectation Store to use Azure Blob Storage](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md) +- [How to configure an Expectation Store to use GCS](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_gcs.md) +- [How to configure an Expectation Store on a filesystem](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_on_a_filesystem.md) +- [How to configure an Expectation Store to use PostgreSQL](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_to_postgresql.md) + +
    + +
    + + +#### Configuring Validation Results Stores + + + +- [How to configure a Validation Result Store in Amazon S3](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_amazon_s3.md) +- [How to configure a Validation Result Store in Azure Blob Storage](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_azure_blob_storage.md) +- [How to configure a Validation Result Store in GCS](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_gcs.md) +- [How to configure a Validation Result Store on a filesystem](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_on_a_filesystem.md) +- [How to configure a Validation Result Store to use PostgreSQL](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_to_postgresql.md) + +
    + +
    + + +#### Configuring Metric Stores + + + +- [How to configure and use a Metric Store](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_metricsstore.md) + +
    + +
    + + +#### Configuring Data Docs + + + +- [How to host and share Data Docs on Amazon S3](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_amazon_s3.md) +- [How to host and share Data Docs on Azure Blob Storage](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md) +- [How to host and share Data Docs on GCS](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_gcs.md) +- [How to host and share Data Docs on a filesystem](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_a_filesystem.md) + +
    + +
    + + +#### Validating Data Context configurations + + + +- [How to configure DataContext components using `test_yaml_config()`](docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md) + +
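For readers following the `test_yaml_config()` link, a minimal block-config sketch of that workflow; the datasource YAML below is illustrative and not part of this patch:

```python
import great_expectations as gx

context = gx.get_context()

datasource_yaml = """
name: my_block_config_datasource
class_name: Datasource
execution_engine:
  class_name: PandasExecutionEngine
data_connectors:
  default_runtime_data_connector:
    class_name: RuntimeDataConnector
    batch_identifiers:
      - default_identifier_name
"""

# Prints a diagnostic report and returns the instantiated component without
# saving it to the Data Context's configuration.
context.test_yaml_config(yaml_config=datasource_yaml)
```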
    \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md index 9b26b1d7890e..86ad4b0e1955 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md @@ -2,7 +2,7 @@ title: How to organize Batches in a file-based Data Asset tag: [how-to, connect to data] description: A technical guide demonstrating how to organize Batches of data in a file-based Data Asset. -keywords: [Great Expectations, Data Asset, Batch Request, fluent configuration method, GCS, Google Cloud Server, AWS S3, Amazon Web Services S3, Azure Blob Storage, Local Filesystem] +keywords: [Great Expectations, Data Asset, Batch Request, fluent configuration method, GCS, Google Cloud Storage, AWS S3, Amazon Web Services S3, Azure Blob Storage, Local Filesystem] --- import TechnicalTag from '/docs/term_tags/_tag.mdx'; @@ -65,7 +65,7 @@ Please reference the appropriate one of these guides: - [How to connect to one or more files using Pandas](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_pandas.md) - [How to connect to one or more files using Spark](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark.md) -#### Google Cloud Server +#### Google Cloud Storage - [How to connect to data on GCS using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md) - [How to connect to data on GCS using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md new file mode 100644 index 000000000000..8d61e24e6cd4 --- /dev/null +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md @@ -0,0 +1,109 @@ +--- +title: How to convert an Ephemeral Data Context to a Filesystem Data Context +tag: [how-to, setup] +keywords: [Great Expectations, Ephemeral Data Context, Filesystem Data Context] +--- + +import Prerequisites from '/docs/components/_prerequisites.jsx' +import IfYouStillNeedToSetupGx from '/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md' +import ConnectingToDataFluently from '/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md' +import SetupConfigurations from '/docs/components/setup/link_lists/_setup_configurations.md' + +## Introduction + +An Ephemeral Data Context is a temporary, in-memory Data Context that will not persist beyond the current Python session. However, if you decide you would like to save the contents of an Ephemeral Data Context for future use you can do so by converting it to a Filesystem Data Context. + +## Prerequisites + + + +- A working installation of Great Expectations +- An Ephemeral Data Context instance +- A passion for Data Quality + + + +
    + + +### If you still need to set up and install GX... + + + + + +
    + + +
    + + +### If you still need to create a Data Context... + + + +The `get_context()` method will return an Ephemeral Data Context if your system is not set up to work with GX Cloud and a Filesystem Data Context cannot be found. For more information, see: +- [How to quickly instantiate a Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md) + +You can also explicitly instantiate an Ephemeral Data Context (for those occasions when your system is set up to work with GX Cloud or you do have a previously initialized Filesystem Data Context). For more information, see: +- [How to explicitly instantiate an Ephemeral Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md) + +
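A quick editorial sketch of the fallback behavior described above:

```python
import great_expectations as gx

# Returns a Cloud Data Context if GX Cloud is configured, otherwise the most
# recently used Filesystem Data Context, otherwise an Ephemeral Data Context.
context = gx.get_context()
print(type(context).__name__)
```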
    + +
+ + + +### If you aren't certain that your Data Context is Ephemeral... + + + +You can easily check to see if you are working with an Ephemeral Data Context with the following code (in this example, we are assuming your Data Context is stored in the variable `context`): + +```python title="Python code" +from great_expectations.data_context import EphemeralDataContext + +# ... + +if isinstance(context, EphemeralDataContext): + print("It's Ephemeral!") +``` + +
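Ahead of the steps below, a hedged sketch of how the verify-then-convert sequence might look in code; the `pathlib` check simply mirrors Step 1 and is an editorial illustration, not part of the patch:

```python
import pathlib

# `context` is assumed to hold the Ephemeral Data Context discussed above.
# Step 1: convert_to_file_context() fails if a Filesystem Data Context already
# exists, so check the current working directory for great_expectations.yml.
if not (pathlib.Path.cwd() / "great_expectations.yml").exists():
    # Step 2: initialize a Filesystem Data Context with the Ephemeral contents.
    context = context.convert_to_file_context()
else:
    print("A Filesystem Data Context already exists in this directory.")
```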
    + +## Steps + +### 1. Verify that your current working directory does not already contain a GX Filesystem Data Context + +The method for converting an Ephemeral Data Context to a Filesystem Data Context initializes the new Filesystem Data Context in the current working directory of the Python process that is being executed. If a Filesystem Data Context already exists at that location, the process will fail. + +You can determine if your current working directory already has a Filesystem Data Context by looking for a `great_expectations.yml` file. The presence of that file indicates that a Filesystem Data Context has already been initialized in the corresponding directory. + +### 2. Convert the Ephemeral Data Context into a Filesystem Data Context + +Converting an Ephemeral Data Context into a Filesystem Data Context can be done with one line of code: + +```python title="Python code" +context = context.convert_to_file_context() +``` + +:::info Replacing the Ephemeral Data Context + +The `convert_to_file_context()` method does not change the Ephemeral Data Context itself. Rather, it initializes a new Filesystem Data Context with the contents of the Ephemeral Data Context and then returns an instance of the new Filesystem Data Context. If you do not replace the Ephemeral Data Context instance with the Filesystem Data Context instance, it will be possible for you to continue using the Ephemeral Data Context. + +If you do this, it is important to note that changes to the Ephemeral Data Context **will not be reflected** in the Filesystem Data Context. Moreover, `convert_to_file_context()` does not support merge operations. This means you will not be able to save any additional changes you have made to the content of the Ephemeral Data Context. Neither will you be able to use `convert_to_file_context()` to replace the Filesystem Data Context you had previously created: `convert_to_file_context()` will fail if a Filesystem Data Context already exists in the current working directory. + +For these reasons, it is strongly advised that once you have converted your Ephemeral Data Context to a Filesystem Data Context you cease working with the Ephemeral Data Context instance and begin working with the Filesystem Data Context instance instead. 
+ +::: + + +## Next steps + +### Customizing configurations in a Data Context + + + +### Connecting GX to source data systems + + \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md new file mode 100644 index 000000000000..905ec6520f28 --- /dev/null +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md @@ -0,0 +1,93 @@ +--- +title: How to explicitly instantiate an Ephemeral Data Context +tag: [how-to, setup] +keywords: [Great Expectations, Ephemeral Data Context] +--- + +import Prerequisites from '/docs/components/_prerequisites.jsx' +import IfYouStillNeedToSetupGx from '/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md' +import ConnectingToDataFluently from '/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md' +import AdmonitionConvertToFileContext from '/docs/components/setup/data_context/_admonition_convert_to_file_context.md' + +## Introduction + +An Ephemeral Data Context is a temporary, in-memory Data Context. They are ideal for doing data exploration and initial analysis when you do not want to save anything to an existing project, or for when you need to work in a hosted environment such as an EMR Spark Cluster. + +## Prerequisites + + + +- A working installation of Great Expectations +- A passion for Data Quality + + + +
    + + +### If you still need to set up and install GX... + + + + + +
    + +## Steps + +### 1. Import necessary classes for instantiating an Ephemeral Data Context + +To create our Data Context, we will create a configuration that uses in-memory Metadata Stores. This will require two classes from the Great Expectations module: the `DataContextConfig` class and the `InMemoryStoreBackendDefaults` class. These can be imported with the code: + +```python title="Python code" +from great_expectations.data_context.types.base import ( + DataContextConfig, + InMemoryStoreBackendDefaults + ) +``` + +We will also need to import the `EphemeralDataContext` class that we will be creating an instance of: + +```python title="Python code" +from great_expectations.data_context import EphemeralDataContext +``` + +### 2. Create the Data Context configuration + +To create a Data Context configuration that specifies the use of in-memory Metadata Stores we will pass in an instance of the `InMemoryStoreBackendDefaults` class as a parameter when initializing an instance of the `DataContextConfig` class: + +```python title="Python code" +project_config = DataContextConfig( + store_backend_defaults=InMemoryStoreBackendDefaults() +) +``` + +### 3. Instantiate an Ephemeral Data Context + +To create our Ephemeral Data Context instance, we initialize the `EphemeralDataContext` class while passing in the `DataContextConfig` instance we previously created as the value of the `project_config` parameter. + +```python title="Python code" +context = EphemeralDataContext(project_config=project_config) +``` + +We now have an Ephemeral Data Context to use for the rest of this Python session. + +:::info Saving the contents of an Ephemeral Data Context for future use + + + +::: + +## Next steps + +### Connecting GX to source data systems + +Now that you have an Ephemeral Data Context you will want to connect GX to your data. For this, please see the appropriate guides from the following: + + + +### Preserving the contents of an Ephemeral Data Context + +An Ephemeral Data Context is a temporary, in-memory object. It will not persist beyond the current Python session. If you decide that you would like to keep the contents of your Ephemeral Data Context for future use, please see: + +- [How to convert an Ephemeral Data Context to a Filesystem Data Context](docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md index a17cd1391f08..3736032982b9 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md @@ -14,6 +14,8 @@ import GxImport from '/docs/components/setup/python_environment/_gx_import.md' import DataContextVerifyContents from '/docs/components/setup/data_context/_data_context_verify_contents.md' +import AdmonitionConvertToFileContext from '/docs/components/setup/data_context/_admonition_convert_to_file_context.md' + ## Introduction A contains the configurations for , , , , and all things related to working with Great Expectations. 
This guide will demonstrate how to instantiate an existing Filesystem Data Context so that you can continue working with previously defined GX configurations. @@ -45,6 +47,12 @@ This functions as a convenience method for initializing, instantiating, and retu If you have GX Cloud configured on your system, `get_context()` will instantiate and return a Cloud Data Context. Otherwise, `get_context()` will attempt to instantiate and return the last accessed Filesystem Data Context. Finally, if a previously initialized Filesystem Data Context cannot be found, `get_context()` will initialize, instantiate, and return a temporary in-memory Ephemeral Data Context. +:::info Saving the contents of an Ephemeral Data Context for future use + + + +::: + ### 3. Verify the content of the returned Data Context diff --git a/docs/docusaurus/docs/tutorials/quickstart/quickstart.md b/docs/docusaurus/docs/tutorials/quickstart/quickstart.md index 173bbe79e7ea..bad490e9e428 100644 --- a/docs/docusaurus/docs/tutorials/quickstart/quickstart.md +++ b/docs/docusaurus/docs/tutorials/quickstart/quickstart.md @@ -5,10 +5,8 @@ tag: [tutorial, getting started] # Quickstart with Great Expectations import Prerequisites from '/docs/components/_prerequisites.jsx' -import SetupAndInstallForSqlData from '/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md' -import SetupAndInstallForFilesystemData from '/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md' -import SetupAndInstallForHostedData from '/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md' -import SetupAndInstallForCloudData from '/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md' +import SetupAndInstallGx from '/docs/components/setup/link_lists/_setup_and_install_gx.md' +import DataContextInitializeInstantiateSave from '/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md' ## Introduction @@ -60,9 +58,6 @@ checkpoint_result = checkpoint.run() # View results validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0] context.open_data_docs(resource_identifier=validation_result_identifier) - -# Save the Data Context for future use -context.convert_to_file_context() ``` In the following steps we'll break down exactly what is happening here so that you can follow along and perform a Validation yourself. @@ -160,15 +155,6 @@ validation_result_identifier = checkpoint_result.list_validation_result_identifi context.open_data_docs(resource_identifier=validation_result_identifier) ``` -#### 4.3 Save the Data Context for future use -Because we did not previously initialize a Filesystem Data Context or specify a path at which to create one, the Data Context we recieved from `gx.get_context()` was a temporary, in-memory Ephemeral Data Context. To save this Data Context for future use, we will convert it to a Filesystem Data Context: - -```python title="Python code" -context = context.convert_to_file_context() -``` - -You can provide the path to a specific folder when you convert your Ephemeral Data Context to a Filesystem Data Context. If you do, your Filesystem Data Context will be initialized at that location. If you do not, your new Filesystem Data Context will be initialized in the folder that your script is executed in. - ### 5. 
(Optional) Great Expectations Cloud By completing the Quickstart guide, you now have the opportunity to join the Cloud Early Access program and explore how Great Expectations Cloud visualizes and creates shareable links for anyone on your team. The GX Cloud interface significantly simplifies collaboration between data teams and domain experts. @@ -179,20 +165,12 @@ To access GX Cloud, you need to join our Cloud Early Access program. During this ## Next Steps -Now that you've seen how easy it is to implement the GX workflow, it is time to customize that workflow to suit your specific use cases! To help with this we have prepared some more detailed installation and setting up guides tailored to specific environments and resources. - -For more details on installing GX for use with local filesystems, please see: - - - -For guides on installing GX for use with cloud storage systems, please reference: - - +Now that you've seen how easy it is to implement the GX workflow, it is time to customize that workflow to suit your specific use cases! To help with this we have prepared some more detailed guides on setting up and installing GX and getting an initial Data Context that are tailored to specific environments and resources. -For information on installing GX for use with SQL databases, see: +### Installing GX for specific environments and source data systems - + -And for instructions on installing GX for use with hosted data systems, read: +### Initializing, instantiating, and saving a Data Context - + \ No newline at end of file diff --git a/docs/docusaurus/sidebars.js b/docs/docusaurus/sidebars.js index 49a88acbebd3..814c05cf5af9 100644 --- a/docs/docusaurus/sidebars.js +++ b/docs/docusaurus/sidebars.js @@ -88,6 +88,8 @@ module.exports = { value: '

    In-memory Data Contexts

    ', defaultStyle: true }, + 'guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context', + 'guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context', 'guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file', { type: 'html', @@ -165,7 +167,7 @@ module.exports = { 'guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark', { type: 'html', - value: '

    Google Cloud Server

    ', + value: '

    Google Cloud Storage

    ', defaultStyle: true }, 'guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas', From 30d1cb8c95f896a7ec7f26d42fde255ca9055baf Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Tue, 4 Apr 2023 07:43:01 -0500 Subject: [PATCH 24/96] [DOCS] Fixes broken code block and incorrectly numbered steps in "How to organize Batches in a SQL-based Data Asset" (#7533) --- .../how_to_organize_batches_in_a_sql_based_data_asset.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md index efeef25acf9f..b0450d097451 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md @@ -85,7 +85,7 @@ If you are using a Datasource that was created with the advanced block-config me -### 1. Retrieve a SQL Datasource and Data Asset +### 2. Retrieve a SQL Datasource and Data Asset For this guide, we will use a previously defined SQL Datasource named `"my_datasource"` with a Table Data Asset called `"my_asset"` which points to a table with taxi data. @@ -94,8 +94,9 @@ To retrieve this Datasource, we will supply the `get_datasource(...)` method of ```python title="Python code" my_datasource = context.get_datasource("my_datasource") my_asset = my_datasource.get_asset("my_asset") +``` -### 2. Add a Splitter to the Data Asset +### 3. Add a Splitter to the Data Asset Our table has a datetime column called "`pickup_datetime`" which we will use to split our TableAsset into Batches. @@ -103,7 +104,7 @@ Our table has a datetime column called "`pickup_datetime`" which we will use to table_asset.add_year_and_month_splitter(column_name="pickup_datetime") ``` -### 3. (Optional) Add Batch Sorters to the Data Asset +### 4. (Optional) Add Batch Sorters to the Data Asset We will now add a Batch Sorter to our Data Asset. This will allow us to explicitly state the order in which our Batches are returned when we request data from the Data Asset. To do this, we will pass a list of sorters to the `add_sorters(...)` method of our Data Asset. @@ -117,7 +118,7 @@ However, in this example we only have one named group, `"year"`, so our list of my_asset.add_sorters(["+year"]) ``` -### 4. Use a Batch Request to verify the Data Asset works as desired +### 5. Use a Batch Request to verify the Data Asset works as desired To verify that our Data Asset will return the desired files as Batches, we will define a quick Batch Request that will include all the Batches available in the Data asset. Then we will use that Batch Request to get a list of the returned Batches. 
From 2cffdb1b8a8d82bc351a747522bab1da085df196 Mon Sep 17 00:00:00 2001 From: Kyle Eaton Date: Tue, 4 Apr 2023 09:48:31 -0400 Subject: [PATCH 25/96] [DOCS] Update nav to match gx.io site (#7557) --- docs/docusaurus/docusaurus.config.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docusaurus.config.js b/docs/docusaurus/docusaurus.config.js index 591cb413752b..a20ef4a09e51 100644 --- a/docs/docusaurus/docusaurus.config.js +++ b/docs/docusaurus/docusaurus.config.js @@ -169,7 +169,7 @@ module.exports = { }, { to: 'https://greatexpectations.io/gx-cloud', - label: 'Cloud early access', + label: 'GX Cloud', position: 'right', className: 'header-cloud-link', 'aria-label': 'Early cloud access' From 9968e85ce9f83a5edd2a83e51820a3f7b4a041a1 Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Tue, 4 Apr 2023 09:38:07 -0500 Subject: [PATCH 26/96] [DOCS] Corrects step numbers in "How to organize Batches in a file-based Data Asset" (#7559) --- ...w_to_organize_batches_in_a_file_based_data_asset.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md index 86ad4b0e1955..d149afd29d2c 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md @@ -93,7 +93,7 @@ If you are using a Datasource that was created with the advanced block-config me -### 1. Retrieve a file-based Datasource +### 2. Retrieve a file-based Datasource For this guide, we will use a previously defined Datasource named `"my_datasource"`. For purposes of our demonstration, this Datasource is a Pandas Filesystem Datasource that uses a folder named "data" as its `base_folder`. @@ -103,7 +103,7 @@ To retrieve this Datasource, we will supply the `get_datasource(...)` method of my_datasource = context.get_datasource("my_datasource") ``` -### 1. Create a `batching_regex` +### 3. Create a `batching_regex` In a file-based Data Asset, any file that matches a provided regular expression (the `batching_regex` parameter) will be included as a Batch in the Data Asset. Therefore, to organize multiple files into Batches in a single Data Asset we must define a regular expression that will match one or more of our source data files. @@ -132,7 +132,7 @@ For more information, please see: [How to request data from a Data Asset](docs/g For more information on how to format regular expressions, we recommend referencing [Python's official how-to guide for working with regular expressions](https://docs.python.org/3/howto/regex.html). -### 2. Add a Data Asset using the `batching_regex` +### 4. Add a Data Asset using the `batching_regex` Now that we have put together a regular expression that will match one or more of the files in our Datasource's `base_folder`, we can use it to create our Data Asset. Since the files in this particular Datasource's `base_folder` are csv files, we will use the `add_pandas_csv(...)` method of our Datasource to create the new Data Asset: @@ -146,7 +146,7 @@ If you choose to omit the `batching_regex` parameter, your Data Asset will autom ::: -### 3. 
(Optional) Add Batch Sorters to the Data Asset +### 5. (Optional) Add Batch Sorters to the Data Asset We will now add a Batch Sorter to our Data Asset. This will allow us to explicitly state the order in which our Batches are returned when we request data from the Data Asset. To do this, we will pass a list of sorters to the `add_sorters(...)` method of our Data Asset. @@ -160,7 +160,7 @@ However, in this example we only have one named group, `"year"`, so our list of my_asset.add_sorters(["+year"]) ``` -### 4. Use a Batch Request to verify the Data Asset works as desired +### 6. Use a Batch Request to verify the Data Asset works as desired To verify that our Data Asset will return the desired files as Batches, we will define a quick Batch Request that will include all the Batches available in the Data asset. Then we will use that Batch Request to get a list of the returned Batches. From 035bf8f8748d6413c84d28b9d8783be5cb6ebe8a Mon Sep 17 00:00:00 2001 From: Kyle Eaton Date: Tue, 4 Apr 2023 17:34:39 -0400 Subject: [PATCH 27/96] [DOCS] Delete SLACK_GUIDELINES.md (#7566) --- CODE_OF_CONDUCT.md | 2 +- SLACK_GUIDELINES.md | 66 ------------------------------- ci/checks/check_repo_root_size.sh | 2 +- docs/docusaurus/docs/intro.md | 2 +- 4 files changed, 3 insertions(+), 69 deletions(-) delete mode 100644 SLACK_GUIDELINES.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index aadc911d801d..2bafb50d4f04 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -67,7 +67,7 @@ Expectations project, both in-person and virtual. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting Kyle Eaton on the Great Expectations Developer Relations team at kyle@superconductive.com. All +reported by contacting Josh Zheng on the Great Expectations Developer Relations team at josh.zheng@greatexpectations.io. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The Great Expectations core team is obligated to maintain confidentiality with regard to the reporter of an incident. diff --git a/SLACK_GUIDELINES.md b/SLACK_GUIDELINES.md deleted file mode 100644 index 08877083c691..000000000000 --- a/SLACK_GUIDELINES.md +++ /dev/null @@ -1,66 +0,0 @@ -# Slack Guidelines: - -## Be Nice. Be Respectful. -We cannot stress enough that we want this to be a safe, comfortable and inclusive environment. Please read our [code of conduct](https://github.com/great-expectations/great_expectations/blob/develop/CODE_OF_CONDUCT.md) if you need more information on this guideline. - -## Keep timezones in mind and be respectful of peoples’ time. -People on Slack are distributed and might be in a very different time zone from you, so don't use @channel @here (this is reserved for admins anyways). Before you @-mention someone, think about what timezone they are in and if you are likely to disturb them. You can check someone's timezone in their profile. As of today, the core GX team is based solely in the United States but the community is world wide. - -If you post in off hours be patient, Someone will get back to you once the sun comes up. - -## Asking for help - -- Do your best to try and solve the problem first as your efforts will help us more easily answer the question. 
-- [Read "How to write a good question in Slack"](https://github.com/great-expectations/great_expectations/discussions/4951) -- Head over to our [Documentation](https://docs.greatexpectations.io/en/latest/) -- Checkout [GitHub Discussions](https://github.com/great-expectations/great_expectations/discussions) this is where we want most of our problem solving, discussion, updates, etc to go because it helps keep a more visible record for GX users. - -#### Asking your question in Slack - -**Know your support channel:** -
      -
    • #support: Having trouble with customizing your Expectations, an integration or anything else .beyond just getting started? Post here.
    • -
    • #feature-requests: Have a good idea for an Expectation or a feature? Post it here.
    • -
    • #contributors-contributing: For previous, current and prospective contributors to talk about potential contributions and to help each other with contributions.
    • -
    -## Use Public Channels, Not Private Groups -Slack is about communication, so use public channels whenever possible. This is good for transparency, but also keeps the community vibrant and alive. You also have an opportunity to help someone else who might find your conversation helpful or encouraging. - - -## Promoting Products or Services -Great Expectations is a piece of the puzzle when it comes to being a data practitioner and so we want our community to be aware of the other pieces that make up the whole picture. If you plan on sharing an awesome tool or service you have no association with jump into #tools_and_services. If you are promoting your own please abide by the following rules. - -1. You're allowed to; - 1. Promote your tool ONLY in #tool_and_service or when you introduce yourself in #introductions. - 2. Promote your tool in the tools channel once. Beyond answering questions about the tool multiple postings may be viewed as excessive and receive a warning. - 3. Mention your tool if someone mentions or is organic to a solution. -2. If you see a partnership opportunity with Great Expectations please DM @Kyle Eaton and @Abe. We would love to create collaborative content and work with you if possible in promoting your product/service.  -3. We will warn you if your promotion becomes excessive or not aligned with our code of conduct. If continued the result could end in you being removed from the slack community. -4. **No Soliciting. Do not cold DM our users to promote your product. If we receive a valid soliciting complaint about you, we will remove you from the community.  -5. Have any questions about our promotional rules or want to run what you plan to share by us? DM @Kyle Eaton - -## All Channels - -**\#contributors-contributing**
    -For previous, current and prospective contributors to talk about potential contributions and to help each other with contributions. - -**\#job-openings**
    -Looking to hire someone in the community? Post your job here: - -**\#feature-requests**
    -Have a good idea for an expectation? Post it here. - -**\#support**
    -Looking for support for great expectations? Post your questions here. - -**\#announcements**
    -Any announcements related to the Great Expectations and the community will be posted here exclusively by admins (have something you think is announcement worthy? DM @Kyle-Eaton). - -**\#general**
    -A place to talk shop about data news, articles, conferences, data horror stories and really just anything in the data world. - -**\#documentation**
    -Have questions or suggestions for our documentation? Post here. - - -*If you have any questions or comments about the following feel free to comment here or dm @Kyle Eaton in the [slack channel](https://greatexpectations.io/slack). diff --git a/ci/checks/check_repo_root_size.sh b/ci/checks/check_repo_root_size.sh index 829013e568f2..f2a91118381b 100755 --- a/ci/checks/check_repo_root_size.sh +++ b/ci/checks/check_repo_root_size.sh @@ -5,7 +5,7 @@ # Please take care to only add files or directories to the repo root unless they are # required to be in the repo root, otherwise please find a more appropriate location. -NUM_ITEMS_SHOULD_BE=40 +NUM_ITEMS_SHOULD_BE=39 NUM_ITEMS=$(ls -la | wc -l) echo "Items found in repo root:" diff --git a/docs/docusaurus/docs/intro.md b/docs/docusaurus/docs/intro.md index de44b65c874e..f3e0a40729df 100755 --- a/docs/docusaurus/docs/intro.md +++ b/docs/docusaurus/docs/intro.md @@ -92,7 +92,7 @@ We’re committed to supporting and growing the community around Great Expectati Open source doesn’t always have the best reputation for being friendly and welcoming, and that makes us sad. Everyone belongs in open source, and Great Expectations is dedicated to making you feel welcome. #### Get in touch with the Great Expectations team -Join our public slack channel here: [join Slack](https://greatexpectations.io/slack). We’re not always available, but we’re there and responsive an awful lot of the time. Please read our [Slack Guidelines](https://github.com/great-expectations/great_expectations/blob/develop/SLACK_GUIDELINES.md). +Join our public slack channel here: [join Slack](https://greatexpectations.io/slack). We’re not always available, but we’re there and responsive an awful lot of the time. Please read our [Slack Guidelines](https://discourse.greatexpectations.io/t/slack-guidelines/1195). #### Ask a question Slack is good for that, too: [join Slack](https://greatexpectations.io/slack) and read [How to write a good question in Slack](https://github.com/great-expectations/great_expectations/discussions/4951). You can also use [GitHub Discussions](https://github.com/great-expectations/great_expectations/discussions/4951). 
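Context for the `check_repo_root_size.sh` change above: deleting `SLACK_GUIDELINES.md` removes one entry from the repository root, so the expected item count drops from 40 to 39. The hunk only shows the `NUM_ITEMS_SHOULD_BE` constant, the `ls -la | wc -l` count, and the echo; the comparison and failure step below is a hedged sketch of how such a CI guard typically concludes, not the script's verbatim logic.

```bash
# Sketch only: the comparison/exit logic is assumed, as it is not shown in the hunk above.
NUM_ITEMS_SHOULD_BE=39        # expected number of lines reported by `ls -la` in the repo root
NUM_ITEMS=$(ls -la | wc -l)   # current count (includes the "total" line plus . and ..)

echo "Items found in repo root:"
ls -la

if [ "$NUM_ITEMS" -gt "$NUM_ITEMS_SHOULD_BE" ]; then
  echo "Expected at most $NUM_ITEMS_SHOULD_BE items in the repo root, found $NUM_ITEMS."
  echo "Move new files to a more appropriate location, or update NUM_ITEMS_SHOULD_BE."
  exit 1
fi
```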
From c27f5ba0a5c778ee99656c5aee54688c03737df0 Mon Sep 17 00:00:00 2001 From: William Shin Date: Tue, 4 Apr 2023 17:06:07 -0700 Subject: [PATCH 28/96] [BUGFIX] Address `pandas==2.0.0` test failures (#7553) Co-authored-by: Gabriel --- .../compatibility/pandas_compatibility.py | 89 +++++++++++++++++++ .../fluent/pandas_s3_datasource.pyi | 2 +- great_expectations/self_check/util.py | 11 ++- tests/cli/test_suite.py | 12 ++- tests/expectations/metrics/test_core.py | 8 +- .../test_expectation_atomic_renderers.py | 13 ++- 6 files changed, 122 insertions(+), 13 deletions(-) create mode 100644 great_expectations/compatibility/pandas_compatibility.py diff --git a/great_expectations/compatibility/pandas_compatibility.py b/great_expectations/compatibility/pandas_compatibility.py new file mode 100644 index 000000000000..a9c10abc1b95 --- /dev/null +++ b/great_expectations/compatibility/pandas_compatibility.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import pandas as pd +from typing_extensions import Literal + +from great_expectations.optional_imports import ( + is_version_less_than, +) + + +def execute_pandas_to_datetime( + arg, + errors: Literal["raise", "coerce", "ignore"] = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool | None = None, + format: str | None = None, + exact: bool = True, + unit: str | None = None, + infer_datetime_format: bool = False, + origin="unix", + cache: bool = True, +): + """Wrapper method for calling Pandas `to_datetime()` for either 2.0.0 and above, or below. + + Args: + arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like + The object to convert to a datetime. + errors (strs): ignore, raise or coerce. + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as `NaT`. + - If 'ignore', then invalid parsing will return the input. + dayfirst (bool): Prefer to parse with dayfirst? Default + yearfirst (bool): Prefer to parse with yearfirst? + utc (bool): Control timezone-related parsing, localization and conversion. Default False. + format (str | None): The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. Default None. + exact (bool): How is `format` used? If True, then we require an exact match. Default True. + unit (str): Default unit since epoch. Default is 'ns'. + infer_datetime_format (bool): whether to infer datetime. Deprecated in pandas 2.0.0 + origin (str): reference date. Default is `unix`. + cache (bool): If true, then use a cache of unique, converted dates to apply the datetime conversion. Default is True. + + Returns: + Datetime converted output. + """ + if is_version_less_than(pd.__version__, "2.0.0"): + return pd.to_datetime( + arg=arg, + errors=errors, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + format=format, + exact=exact, + unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin, + cache=cache, + ) + else: + # pandas is 2.0.0 or greater + if format is None: + format = "mixed" + # format = `mixed` or `ISO8601` cannot be used in combination with `exact` parameter. 
+ # infer_datetime_format is deprecated as of 2.0.0 + return pd.to_datetime( + arg=arg, + errors=errors, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + format=format, + unit=unit, + origin=origin, + cache=cache, + ) + else: + return pd.to_datetime( + arg=arg, + errors=errors, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + format=format, + exact=exact, + unit=unit, + origin=origin, + cache=cache, + ) diff --git a/great_expectations/datasource/fluent/pandas_s3_datasource.pyi b/great_expectations/datasource/fluent/pandas_s3_datasource.pyi index f3f11675eebd..0476ef8e30b7 100644 --- a/great_expectations/datasource/fluent/pandas_s3_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_s3_datasource.pyi @@ -20,7 +20,6 @@ from typing_extensions import Literal from great_expectations.core._docs_decorators import public_api as public_api from great_expectations.core.util import S3Url as S3Url from great_expectations.datasource.fluent import _PandasFilePathDatasource -from great_expectations.datasource.fluent.config_str import ConfigStr from great_expectations.datasource.fluent.data_asset.data_connector import ( FilesystemDataConnector as FilesystemDataConnector, ) @@ -51,6 +50,7 @@ from great_expectations.datasource.fluent.pandas_file_path_datasource import ( if TYPE_CHECKING: + from great_expectations.datasource.fluent.config_str import ConfigStr from great_expectations.datasource.fluent.dynamic_pandas import ( CompressionOptions, CSVEngine, diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index 3c409f9e040f..00e819d5200e 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -31,6 +31,9 @@ import pandas as pd from dateutil.parser import parse +from great_expectations.compatibility.pandas_compatibility import ( + execute_pandas_to_datetime, +) from great_expectations.core import ( ExpectationConfigurationSchema, ExpectationSuite, @@ -737,13 +740,13 @@ def _get_test_validator_with_data_pandas( # We will use timestamp for timezone-aware (UTC only) dates in our tests if value.lower() in ["timestamp", "datetime64[ns, tz]"]: - df[key] = pd.to_datetime(df[key], utc=True) + df[key] = execute_pandas_to_datetime(df[key], utc=True) continue elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]: - df[key] = pd.to_datetime(df[key]) + df[key] = execute_pandas_to_datetime(df[key]) continue elif value.lower() in ["date"]: - df[key] = pd.to_datetime(df[key]).dt.date + df[key] = execute_pandas_to_datetime(df[key]).dt.date value = "object" try: type_ = np.dtype(value) @@ -1103,7 +1106,7 @@ def build_sa_validator_with_data( # noqa: C901 - 39 "TIMESTAMP_LTZ", "TIMESTAMP_TZ", ]: - df[col] = pd.to_datetime(df[col]) + df[col] = execute_pandas_to_datetime(df[col]) elif type_ in ["VARCHAR", "STRING"]: df[col] = df[col].apply(str) diff --git a/tests/cli/test_suite.py b/tests/cli/test_suite.py index a85aa2ef756c..99f620c8a422 100644 --- a/tests/cli/test_suite.py +++ b/tests/cli/test_suite.py @@ -4,6 +4,7 @@ from unittest import mock import pytest +import pandas as pd from _pytest.capture import CaptureResult from click.testing import CliRunner, Result @@ -24,6 +25,8 @@ from great_expectations.data_context.data_context.file_data_context import ( FileDataContext, ) +from great_expectations.optional_imports import is_version_greater_or_equal + from great_expectations.util import ( deep_filter_properties_iterable, get_context, @@ -1722,6 +1725,10 @@ def 
test_suite_edit_multiple_datasources_with_sql_with_no_additional_args_withou - NOT open Data Docs - open jupyter """ + + if is_version_greater_or_equal(pd.__version__, "2.0.0"): + pytest.xfail(reason="Test is currently not compatible with pandas 2.0.0") + context = titanic_v013_multi_datasource_multi_execution_engine_data_context_with_checkpoints_v1_with_empty_store_stats_enabled monkeypatch.chdir(os.path.dirname(context.root_directory)) @@ -1975,8 +1982,11 @@ def test_suite_edit_multiple_datasources_with_sql_with_no_additional_args_with_c The command should: - NOT open Data Docs - - NOT open jupyter """ + + if is_version_greater_or_equal(pd.__version__, "2.0.0"): + pytest.xfail(reason="Test is currently not compatible with pandas 2.0.0") + context = titanic_v013_multi_datasource_multi_execution_engine_data_context_with_checkpoints_v1_with_empty_store_stats_enabled monkeypatch.chdir(os.path.dirname(context.root_directory)) diff --git a/tests/expectations/metrics/test_core.py b/tests/expectations/metrics/test_core.py index 4e9bf144760e..ddf8f3105631 100644 --- a/tests/expectations/metrics/test_core.py +++ b/tests/expectations/metrics/test_core.py @@ -1991,9 +1991,7 @@ def test_map_column_values_increasing_pd(): ) metrics.update(results) - assert metrics[unexpected_rows_metric.id]["a"].index == pd.Int64Index( - [4], dtype="int64" - ) + assert metrics[unexpected_rows_metric.id]["a"].index == pd.Index([4], dtype="int64") assert metrics[unexpected_rows_metric.id]["a"].values == ["2021-02-21"] @@ -2180,9 +2178,7 @@ def test_map_column_values_decreasing_pd(): ) metrics.update(results) - assert metrics[unexpected_rows_metric.id]["a"].index == pd.Int64Index( - [3], dtype="int64" - ) + assert metrics[unexpected_rows_metric.id]["a"].index == pd.Index([3], dtype="int64") assert metrics[unexpected_rows_metric.id]["a"].values == ["2021-03-20"] diff --git a/tests/expectations/test_expectation_atomic_renderers.py b/tests/expectations/test_expectation_atomic_renderers.py index f45a2fbfd344..c93df876d490 100644 --- a/tests/expectations/test_expectation_atomic_renderers.py +++ b/tests/expectations/test_expectation_atomic_renderers.py @@ -2,11 +2,13 @@ from pprint import pprint from typing import Callable, Dict, Union +import pandas as pd import pytest from great_expectations.core import ExpectationValidationResult from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.expectations.registry import get_renderer_impl +from great_expectations.optional_imports import is_version_greater_or_equal from great_expectations.render import RenderedAtomicContent @@ -164,6 +166,9 @@ def test_atomic_prescriptive_summary_expect_column_kl_divergence_to_be_less_than snapshot, get_prescriptive_rendered_content, ): + if is_version_greater_or_equal(pd.__version__, "2.0.0"): + pytest.xfail(reason="Altair is currently not compatible with pandas 2.0.0") + update_dict = { "expectation_type": "expect_column_kl_divergence_to_be_less_than", "kwargs": { @@ -191,6 +196,9 @@ def test_atomic_prescriptive_summary_expect_column_kl_divergence_to_be_less_than def test_atomic_diagnostic_observed_value_expect_column_kl_divergence_to_be_less_than( snapshot, get_diagnostic_rendered_content ): + if is_version_greater_or_equal(pd.__version__, "2.0.0"): + pytest.xfail(reason="Altair is currently not compatible with pandas 2.0.0") + # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()` # As such, the specific expectation_type used here 
is irrelevant and is simply used to trigger the parent class. expectation_config = { @@ -234,9 +242,12 @@ def test_atomic_diagnostic_observed_value_expect_column_kl_divergence_to_be_less snapshot.assert_match(res) -def test_atomic_diagnostic_observed_value_with_boolean_columun_expect_column_kl_divergence_to_be_less_than( +def test_atomic_diagnostic_observed_value_with_boolean_column_expect_column_kl_divergence_to_be_less_than( snapshot, get_diagnostic_rendered_content ): + if is_version_greater_or_equal(pd.__version__, "2.0.0"): + pytest.xfail(reason="Altair is currently not compatible with pandas 2.0.0") + # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()` # As such, the specific expectation_type used here is irrelevant and is simply used to trigger the parent class. expectation_config = { From 3e3d841d5795903f8097321c010648aad14df0eb Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Tue, 4 Apr 2023 20:52:13 -0400 Subject: [PATCH 29/96] [DOCS] Update syntax highlighting of code blocks in GX Cloud Getting Started guide (#7563) --- .../getting_started/getting_started_with_gx_cloud.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md b/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md index b3f2a6993761..ecf6236df271 100644 --- a/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md +++ b/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md @@ -23,19 +23,19 @@ Welcome to Great Expectations Cloud! This tutorial will help you onboard with GX ### Step 2: Generate User Token -- Go to [“Settings” > “Tokens”](https://app.greatexpectations.io/tokens) in the navigation panel. In this tutorial, we’ll create a User Token, but GX Cloud also supports Organization tokens, e.g. for use in shared execution environments. These tokens are see-once and stored as a hash in Great Expectation Cloud's backend database. Once a user copies their API key, the Cloud UI will never show the token value again. +- Go to [“Settings” > “Tokens”](https://app.greatexpectations.io/tokens) in the navigation panel. In this tutorial, we’ll create a User Token, but GX Cloud also supports Organization tokens, e.g. for use in shared execution environments. These tokens are see-once and stored as a hash in Great Expectation Cloud's backend database. Once a user copies their API key, the Cloud UI will never show the token value again. ### Step 3: Set tokens and Create - Open Jupyter Notebook -:::tip +:::tip Any Python Interpreter or script file will work for the remaining steps in the guide, we recommend using a Jupyter Notebook, since they are included in the OSS GX installation and give the best experience of both composing a script file and running code in a live interpreter. 
::: - Set environment variables in the notebook (alternatively, add these as [Data Context config variables](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md)) -```console +```python import os os.environ["GX_CLOUD_ORGANIZATION_ID"] = "" @@ -44,7 +44,7 @@ os.environ["GX_CLOUD_ACCESS_TOKEN"] = " Date: Tue, 4 Apr 2023 21:52:52 -0400 Subject: [PATCH 30/96] [MAINTENANCE] Test Pandas 2.0 prerelease in CI/CD (#7343) Co-authored-by: Nathan Farmer --- ci/azure-pipelines.yml | 15 +++++++++------ ci/constraints-test/pandas2-min-install.txt | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) create mode 100644 ci/constraints-test/pandas2-min-install.txt diff --git a/ci/azure-pipelines.yml b/ci/azure-pipelines.yml index ffe833a0c0ec..ce18a02632e1 100644 --- a/ci/azure-pipelines.yml +++ b/ci/azure-pipelines.yml @@ -426,23 +426,26 @@ stages: eval ${PYTEST} displayName: 'Pytest cli' - - job: test_min_versions + - job: test_dependency_versions dependsOn: [make_suffix, build_push] condition: succeeded() strategy: matrix: Python37: python.version: '3.7' - min_constraints: 'ci/constraints-test/py37-min-install.txt' + constraints: 'ci/constraints-test/py37-min-install.txt' Python38: python.version: '3.8' - min_constraints: 'ci/constraints-test/py38-min-install.txt' + constraints: 'ci/constraints-test/py38-min-install.txt' Python39: python.version: '3.9' - min_constraints: 'ci/constraints-test/py39-min-install.txt' + constraints: 'ci/constraints-test/py39-min-install.txt' Python310: python.version: '3.10' - min_constraints: 'ci/constraints-test/py310-min-install.txt' + constraints: 'ci/constraints-test/py310-min-install.txt' + Pandas20: + python.version: '3.9' + constraints: 'ci/constraints-test/pandas2-min-install.txt' variables: IMAGE_SUFFIX: $[ dependencies.make_suffix.outputs['suffix.IMAGE_SUFFIX'] ] steps: @@ -453,7 +456,7 @@ stages: PYTEST+="--network=host greatexpectations/test:${DOCKER_TAG} bash -c \"" PYTEST+="pip install --no-cache-dir --requirement requirements-dev.txt " PYTEST+="--requirement reqs/requirements-dev-test.txt --constraint constraints-dev.txt " - PYTEST+="--constraint $(min_constraints) && " + PYTEST+="--constraint $(constraints) && " PYTEST+="pytest --no-sqlalchemy --random-order --ignore=tests/cli --ignore=tests/integration/usage_statistic " PYTEST+="--ignore=tests/rule_based_profiler/data_assistant/test_onboarding_data_assistant.py\"" echo ${PYTEST} diff --git a/ci/constraints-test/pandas2-min-install.txt b/ci/constraints-test/pandas2-min-install.txt new file mode 100644 index 000000000000..5e9186c019ab --- /dev/null +++ b/ci/constraints-test/pandas2-min-install.txt @@ -0,0 +1 @@ +pandas>=2.0.0rc1 From e55b3484a86f654e8b819041dd6cc73730e01a8f Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Tue, 4 Apr 2023 22:49:12 -0400 Subject: [PATCH 31/96] [MAINTENANCE] Add noqa directives for existing sqlalchemy imports (#7564) --- great_expectations/cli/batch_request.py | 4 +-- great_expectations/cli/checkpoint.py | 4 +-- great_expectations/cli/datasource.py | 2 +- great_expectations/cli/init.py | 2 +- great_expectations/cli/suite.py | 2 +- .../data_context/abstract_data_context.py | 2 +- .../store/database_store_backend.py | 10 +++++-- .../data_context/store/query_store.py | 6 ++-- great_expectations/data_context/util.py | 2 +- great_expectations/dataset/dataset.py | 2 +- great_expectations/dataset/util.py | 6 ++-- .../query_batch_kwargs_generator.py | 6 ++-- .../table_batch_kwargs_generator.py | 6 ++-- .../configured_asset_sql_data_connector.py | 4 +-- 
.../inferred_asset_sql_data_connector.py | 8 +++--- .../datasource/fluent/pandas_datasource.pyi | 2 +- .../datasource/fluent/sql_datasource.py | 2 +- .../simple_sqlalchemy_datasource.py | 2 +- great_expectations/df_to_database_loader.py | 8 +++--- .../execution_engine/execution_engine.py | 2 +- .../sqlalchemy_data_splitter.py | 18 +++++++----- .../execution_engine/sqlalchemy_batch_data.py | 10 +++---- .../sqlalchemy_execution_engine.py | 16 +++++------ great_expectations/execution_engine/util.py | 2 +- .../core/expect_column_max_to_be_between.py | 2 +- ...pect_column_value_lengths_to_be_between.py | 2 +- .../core/expect_column_values_to_be_in_set.py | 2 +- ...pect_column_values_to_be_json_parseable.py | 2 +- .../expect_column_values_to_be_of_type.py | 4 +-- .../core/expect_column_values_to_be_unique.py | 2 +- ...pect_column_values_to_match_json_schema.py | 2 +- ...ect_column_values_to_match_like_pattern.py | 2 +- ...expect_column_values_to_not_match_regex.py | 2 +- .../column_quantile_values.py | 12 ++++---- .../column_values_in_set.py | 2 +- .../expectations/metrics/import_manager.py | 18 ++++++------ .../table_metrics/table_column_types.py | 2 +- .../expectations/metrics/util.py | 20 ++++++------- .../expectations/row_conditions.py | 4 +-- .../profile/basic_dataset_profiler.py | 2 +- .../attributed_resolved_metrics.py | 4 +-- .../sqlalchemy_connection_manager.py | 8 +++--- great_expectations/self_check/util.py | 28 ++++++++++--------- great_expectations/util.py | 12 ++++---- 44 files changed, 135 insertions(+), 125 deletions(-) diff --git a/great_expectations/cli/batch_request.py b/great_expectations/cli/batch_request.py index 713ec27f6408..29454cf2bf7e 100644 --- a/great_expectations/cli/batch_request.py +++ b/great_expectations/cli/batch_request.py @@ -32,8 +32,8 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy - from sqlalchemy.engine.reflection import Inspector + import sqlalchemy # noqa: TID251 + from sqlalchemy.engine.reflection import Inspector # noqa: TID251 except ImportError: logger.debug( "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" diff --git a/great_expectations/cli/checkpoint.py b/great_expectations/cli/checkpoint.py index d3f8f7ad4105..0f67f62dee0e 100644 --- a/great_expectations/cli/checkpoint.py +++ b/great_expectations/cli/checkpoint.py @@ -20,13 +20,13 @@ from great_expectations.util import lint_code try: - from sqlalchemy.exc import SQLAlchemyError + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: SQLAlchemyError = RuntimeError try: - from sqlalchemy.exc import SQLAlchemyError + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: SQLAlchemyError = RuntimeError diff --git a/great_expectations/cli/datasource.py b/great_expectations/cli/datasource.py index b013e3113448..ad3e90dcc65a 100644 --- a/great_expectations/cli/datasource.py +++ b/great_expectations/cli/datasource.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy + import sqlalchemy # noqa: TID251 except ImportError: logger.debug( "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" diff --git a/great_expectations/cli/init.py b/great_expectations/cli/init.py index 2ce154b8cd80..f05d261c9595 100644 --- a/great_expectations/cli/init.py +++ b/great_expectations/cli/init.py @@ -29,7 +29,7 @@ ) try: - from sqlalchemy.exc import SQLAlchemyError + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: # We'll redefine 
this error in code below to catch ProfilerError, which is caught above, so SA errors will # just fall through diff --git a/great_expectations/cli/suite.py b/great_expectations/cli/suite.py index 02944a630c0b..eec9df4a2d40 100644 --- a/great_expectations/cli/suite.py +++ b/great_expectations/cli/suite.py @@ -33,7 +33,7 @@ ) try: - from sqlalchemy.exc import SQLAlchemyError + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: # We'll redefine this error in code below to catch ProfilerError, which is caught above, so SA errors will # just fall through diff --git a/great_expectations/data_context/data_context/abstract_data_context.py b/great_expectations/data_context/data_context/abstract_data_context.py index 604ca2b14702..cf8dd0b98bb5 100644 --- a/great_expectations/data_context/data_context/abstract_data_context.py +++ b/great_expectations/data_context/data_context/abstract_data_context.py @@ -133,7 +133,7 @@ ) try: - from sqlalchemy.exc import SQLAlchemyError + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: # We'll redefine this error in code below to catch ProfilerError, which is caught above, so SA errors will # just fall through diff --git a/great_expectations/data_context/store/database_store_backend.py b/great_expectations/data_context/store/database_store_backend.py index 3291ad5a6c0d..40f4309cdd40 100644 --- a/great_expectations/data_context/store/database_store_backend.py +++ b/great_expectations/data_context/store/database_store_backend.py @@ -14,9 +14,13 @@ ) try: - from sqlalchemy import Column, MetaData, String, Table, and_, column - from sqlalchemy.engine.url import URL - from sqlalchemy.exc import IntegrityError, NoSuchTableError, SQLAlchemyError + from sqlalchemy import Column, MetaData, String, Table, and_, column # noqa: TID251 + from sqlalchemy.engine.url import URL # noqa: TID251 + from sqlalchemy.exc import ( # noqa: TID251 + IntegrityError, + NoSuchTableError, + SQLAlchemyError, + ) make_url = import_make_url() except ImportError: diff --git a/great_expectations/data_context/store/query_store.py b/great_expectations/data_context/store/query_store.py index eae3555995f2..d9e25d177b3c 100644 --- a/great_expectations/data_context/store/query_store.py +++ b/great_expectations/data_context/store/query_store.py @@ -8,9 +8,9 @@ from great_expectations.util import filter_properties_dict try: - import sqlalchemy as sa - from sqlalchemy import create_engine - from sqlalchemy.engine.url import URL + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy import create_engine # noqa: TID251 + from sqlalchemy.engine.url import URL # noqa: TID251 if is_version_greater_or_equal(sa.__version__, "1.4.0"): url_create_fn = URL.create diff --git a/great_expectations/data_context/util.py b/great_expectations/data_context/util.py index aa241661a310..b0c119f8f851 100644 --- a/great_expectations/data_context/util.py +++ b/great_expectations/data_context/util.py @@ -17,7 +17,7 @@ from great_expectations.util import load_class, verify_dynamic_loading_support try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: sa = None diff --git a/great_expectations/dataset/dataset.py b/great_expectations/dataset/dataset.py index 176e0a725d33..7cf17ed86194 100644 --- a/great_expectations/dataset/dataset.py +++ b/great_expectations/dataset/dataset.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) try: - from sqlalchemy.sql import quoted_name + from sqlalchemy.sql import quoted_name # noqa: TID251 except: 
logger.debug( diff --git a/great_expectations/dataset/util.py b/great_expectations/dataset/util.py index 8ba890343f7e..00bcb06d30ef 100644 --- a/great_expectations/dataset/util.py +++ b/great_expectations/dataset/util.py @@ -11,9 +11,9 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy # noqa: F401 - from sqlalchemy.engine.default import DefaultDialect - from sqlalchemy.sql.elements import WithinGroup + import sqlalchemy # noqa: F401, TID251 + from sqlalchemy.engine.default import DefaultDialect # noqa: TID251 + from sqlalchemy.sql.elements import WithinGroup # noqa: TID251 except ImportError: logger.debug("Unable to load SqlAlchemy or one of its subclasses.") DefaultDialect = None diff --git a/great_expectations/datasource/batch_kwargs_generator/query_batch_kwargs_generator.py b/great_expectations/datasource/batch_kwargs_generator/query_batch_kwargs_generator.py index d02c12863aa4..eb2ecddc8938 100644 --- a/great_expectations/datasource/batch_kwargs_generator/query_batch_kwargs_generator.py +++ b/great_expectations/datasource/batch_kwargs_generator/query_batch_kwargs_generator.py @@ -11,9 +11,9 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy - from sqlalchemy import create_engine - from sqlalchemy.engine import reflection + import sqlalchemy # noqa: TID251 + from sqlalchemy import create_engine # noqa: TID251 + from sqlalchemy.engine import reflection # noqa: TID251 except ImportError: sqlalchemy = None create_engine = None diff --git a/great_expectations/datasource/batch_kwargs_generator/table_batch_kwargs_generator.py b/great_expectations/datasource/batch_kwargs_generator/table_batch_kwargs_generator.py index b120f6367ac8..1bd9d59ea359 100644 --- a/great_expectations/datasource/batch_kwargs_generator/table_batch_kwargs_generator.py +++ b/great_expectations/datasource/batch_kwargs_generator/table_batch_kwargs_generator.py @@ -13,9 +13,9 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy - from sqlalchemy import create_engine - from sqlalchemy.engine import reflection + import sqlalchemy # noqa: TID251 + from sqlalchemy import create_engine # noqa: TID251 + from sqlalchemy.engine import reflection # noqa: TID251 except ImportError: sqlalchemy = None create_engine = None diff --git a/great_expectations/datasource/data_connector/configured_asset_sql_data_connector.py b/great_expectations/datasource/data_connector/configured_asset_sql_data_connector.py index 19513e9728e4..41107a204bee 100644 --- a/great_expectations/datasource/data_connector/configured_asset_sql_data_connector.py +++ b/great_expectations/datasource/data_connector/configured_asset_sql_data_connector.py @@ -36,12 +36,12 @@ from great_expectations.util import deep_filter_properties_iterable try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: sa = None try: - from sqlalchemy.sql import Selectable + from sqlalchemy.sql import Selectable # noqa: TID251 except ImportError: Selectable = None diff --git a/great_expectations/datasource/data_connector/inferred_asset_sql_data_connector.py b/great_expectations/datasource/data_connector/inferred_asset_sql_data_connector.py index 3922a0d21ba3..a0e331b3dc9e 100644 --- a/great_expectations/datasource/data_connector/inferred_asset_sql_data_connector.py +++ b/great_expectations/datasource/data_connector/inferred_asset_sql_data_connector.py @@ -9,10 +9,10 @@ from great_expectations.util import deep_filter_properties_iterable try: - import sqlalchemy as sa - from sqlalchemy.engine import Engine - from 
sqlalchemy.engine.reflection import Inspector - from sqlalchemy.exc import OperationalError + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy.engine import Engine # noqa: TID251 + from sqlalchemy.engine.reflection import Inspector # noqa: TID251 + from sqlalchemy.exc import OperationalError # noqa: TID251 except ImportError: sa = None Engine = None diff --git a/great_expectations/datasource/fluent/pandas_datasource.pyi b/great_expectations/datasource/fluent/pandas_datasource.pyi index d7e414d6e4ce..07972231b06e 100644 --- a/great_expectations/datasource/fluent/pandas_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_datasource.pyi @@ -23,7 +23,7 @@ from typing import ( import pandas as pd import pydantic -import sqlalchemy +import sqlalchemy # noqa: TID251 from typing_extensions import Literal from great_expectations.datasource.fluent.interfaces import ( diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index 27b3a2f74123..ff2eb171c70e 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -48,7 +48,7 @@ if TYPE_CHECKING: # min version of typing_extension missing `Self`, so it can't be imported at runtime - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 from typing_extensions import Self diff --git a/great_expectations/datasource/simple_sqlalchemy_datasource.py b/great_expectations/datasource/simple_sqlalchemy_datasource.py index 25424aadbb03..d983eaed2aed 100644 --- a/great_expectations/datasource/simple_sqlalchemy_datasource.py +++ b/great_expectations/datasource/simple_sqlalchemy_datasource.py @@ -10,7 +10,7 @@ from great_expectations.datasource.new_datasource import BaseDatasource if TYPE_CHECKING: - from sqlalchemy.engine import Engine as SaEngine + from sqlalchemy.engine import Engine as SaEngine # noqa: TID251 from great_expectations.execution_engine import SqlAlchemyExecutionEngine diff --git a/great_expectations/df_to_database_loader.py b/great_expectations/df_to_database_loader.py index c6c5d8ed6e62..8c1ce7e33e98 100644 --- a/great_expectations/df_to_database_loader.py +++ b/great_expectations/df_to_database_loader.py @@ -9,10 +9,10 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sa - from sqlalchemy import Table - from sqlalchemy.engine import reflection - from sqlalchemy.sql import Select + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy import Table # noqa: TID251 + from sqlalchemy.engine import reflection # noqa: TID251 + from sqlalchemy.sql import Select # noqa: TID251 except ImportError: logger.debug( diff --git a/great_expectations/execution_engine/execution_engine.py b/great_expectations/execution_engine/execution_engine.py index 27c1392abcd4..db926c5cf273 100644 --- a/great_expectations/execution_engine/execution_engine.py +++ b/great_expectations/execution_engine/execution_engine.py @@ -37,7 +37,7 @@ if TYPE_CHECKING: # noinspection PyPep8Naming import pyspark.sql.functions as F - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 from great_expectations.core.batch import ( BatchData, diff --git a/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_splitter.py b/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_splitter.py index 795658d7a098..a2c578e5c9ae 100644 --- a/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_splitter.py +++ 
b/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_splitter.py @@ -27,15 +27,19 @@ from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: sa = None try: - import sqlalchemy.sql.functions.concat as concat - from sqlalchemy.engine import LegacyRow - from sqlalchemy.sql import Selectable - from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList, Label + import sqlalchemy.sql.functions.concat as concat # noqa: TID251 + from sqlalchemy.engine import LegacyRow # noqa: TID251 + from sqlalchemy.sql import Selectable # noqa: TID251 + from sqlalchemy.sql.elements import ( # noqa: TID251 + BinaryExpression, + BooleanClauseList, + Label, + ) except ImportError: LegacyRow = None Selectable = None @@ -45,8 +49,8 @@ concat = None if TYPE_CHECKING: - from sqlalchemy.sql import Selectable - from sqlalchemy.sql.expression import Cast, ColumnOperators + from sqlalchemy.sql import Selectable # noqa: TID251 + from sqlalchemy.sql.expression import Cast, ColumnOperators # noqa: TID251 from great_expectations.execution_engine.sqlalchemy_execution_engine import ( SqlAlchemyExecutionEngine, diff --git a/great_expectations/execution_engine/sqlalchemy_batch_data.py b/great_expectations/execution_engine/sqlalchemy_batch_data.py index a1b369d36096..7f3a1fefda43 100644 --- a/great_expectations/execution_engine/sqlalchemy_batch_data.py +++ b/great_expectations/execution_engine/sqlalchemy_batch_data.py @@ -6,11 +6,11 @@ from great_expectations.util import generate_temporary_table_name try: - import sqlalchemy as sa - from sqlalchemy.engine import Engine - from sqlalchemy.engine.default import DefaultDialect - from sqlalchemy.exc import DatabaseError - from sqlalchemy.sql.elements import quoted_name + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy.engine import Engine # noqa: TID251 + from sqlalchemy.engine.default import DefaultDialect # noqa: TID251 + from sqlalchemy.exc import DatabaseError # noqa: TID251 + from sqlalchemy.sql.elements import quoted_name # noqa: TID251 except ImportError: sa = None quoted_name = None diff --git a/great_expectations/execution_engine/sqlalchemy_execution_engine.py b/great_expectations/execution_engine/sqlalchemy_execution_engine.py index 6704b62a3069..60cd5be41e22 100644 --- a/great_expectations/execution_engine/sqlalchemy_execution_engine.py +++ b/great_expectations/execution_engine/sqlalchemy_execution_engine.py @@ -89,7 +89,7 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 sqlalchemy_version_check(sa.__version__) @@ -98,16 +98,16 @@ sa = None try: - from sqlalchemy.engine import Dialect, Engine, Row - from sqlalchemy.exc import OperationalError - from sqlalchemy.sql import Selectable - from sqlalchemy.sql.elements import ( + from sqlalchemy.engine import Dialect, Engine, Row # noqa: TID251 + from sqlalchemy.exc import OperationalError # noqa: TID251 + from sqlalchemy.sql import Selectable # noqa: TID251 + from sqlalchemy.sql.elements import ( # noqa: TID251 BooleanClauseList, Label, TextClause, quoted_name, ) - from sqlalchemy.sql.selectable import Select, TextualSelect + from sqlalchemy.sql.selectable import Select, TextualSelect # noqa: TID251 except ImportError: Engine = None BooleanClauseList = None @@ -126,7 +126,7 @@ try: import psycopg2 # noqa: F401 - import sqlalchemy.dialects.postgresql.psycopg2 as sqlalchemy_psycopg2 # noqa: F401 + import 
sqlalchemy.dialects.postgresql.psycopg2 as sqlalchemy_psycopg2 # noqa: F401, TID251 except (ImportError, KeyError): sqlalchemy_psycopg2 = None @@ -214,7 +214,7 @@ trinotypes = None if TYPE_CHECKING: - from sqlalchemy.engine import Engine as SaEngine + from sqlalchemy.engine import Engine as SaEngine # noqa: TID251 def _get_dialect_type_module(dialect): diff --git a/great_expectations/execution_engine/util.py b/great_expectations/execution_engine/util.py index b81dfd52ca24..4f8b383bdb38 100644 --- a/great_expectations/execution_engine/util.py +++ b/great_expectations/execution_engine/util.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy # noqa: F401 + import sqlalchemy # noqa: F401, TID251 except ImportError: logger.debug("Unable to load SqlAlchemy or one of its subclasses.") diff --git a/great_expectations/expectations/core/expect_column_max_to_be_between.py b/great_expectations/expectations/core/expect_column_max_to_be_between.py index 9f9127be632c..161eee8a7a27 100644 --- a/great_expectations/expectations/core/expect_column_max_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_max_to_be_between.py @@ -37,7 +37,7 @@ ) try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/core/expect_column_value_lengths_to_be_between.py b/great_expectations/expectations/core/expect_column_value_lengths_to_be_between.py index 763d682bdf18..b4ab0edb08aa 100644 --- a/great_expectations/expectations/core/expect_column_value_lengths_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_value_lengths_to_be_between.py @@ -42,7 +42,7 @@ ) try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/core/expect_column_values_to_be_in_set.py b/great_expectations/expectations/core/expect_column_values_to_be_in_set.py index 0d1267db1c3e..4f2b98ffab6b 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_in_set.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_in_set.py @@ -40,7 +40,7 @@ ) try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass from great_expectations.expectations.expectation import ( diff --git a/great_expectations/expectations/core/expect_column_values_to_be_json_parseable.py b/great_expectations/expectations/core/expect_column_values_to_be_json_parseable.py index 9b6874ad2b93..9ad20b8e8024 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_json_parseable.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_json_parseable.py @@ -25,7 +25,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py index 12c85087c2af..5f360c1fea3f 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py @@ -55,8 +55,8 @@ ) try: - import sqlalchemy as sa - from sqlalchemy.dialects import registry + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy.dialects import registry # noqa: 
TID251 except ImportError: logger.debug( diff --git a/great_expectations/expectations/core/expect_column_values_to_be_unique.py b/great_expectations/expectations/core/expect_column_values_to_be_unique.py index 7f11f43cdd0d..392186dcd45f 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_unique.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_unique.py @@ -23,7 +23,7 @@ ) try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/core/expect_column_values_to_match_json_schema.py b/great_expectations/expectations/core/expect_column_values_to_match_json_schema.py index b38e59aac5cf..d796f83ee126 100644 --- a/great_expectations/expectations/core/expect_column_values_to_match_json_schema.py +++ b/great_expectations/expectations/core/expect_column_values_to_match_json_schema.py @@ -25,7 +25,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/core/expect_column_values_to_match_like_pattern.py b/great_expectations/expectations/core/expect_column_values_to_match_like_pattern.py index c1c76582a6bd..9d25332af7d8 100644 --- a/great_expectations/expectations/core/expect_column_values_to_match_like_pattern.py +++ b/great_expectations/expectations/core/expect_column_values_to_match_like_pattern.py @@ -14,7 +14,7 @@ from great_expectations.render.util import substitute_none_for_missing try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/core/expect_column_values_to_not_match_regex.py b/great_expectations/expectations/core/expect_column_values_to_not_match_regex.py index f170802ba01c..9e40a7d3807d 100644 --- a/great_expectations/expectations/core/expect_column_values_to_not_match_regex.py +++ b/great_expectations/expectations/core/expect_column_values_to_not_match_regex.py @@ -41,7 +41,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs try: - import sqlalchemy as sa # noqa: F401 + import sqlalchemy as sa # noqa: F401, TID251 except ImportError: pass diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py index 4ef1352a7216..03dd8406e2da 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py @@ -31,10 +31,10 @@ TrinoUserError = None try: - from sqlalchemy.exc import ProgrammingError - from sqlalchemy.sql import Select - from sqlalchemy.sql.elements import Label, TextClause, WithinGroup - from sqlalchemy.sql.selectable import CTE + from sqlalchemy.exc import ProgrammingError # noqa: TID251 + from sqlalchemy.sql import Select # noqa: TID251 + from sqlalchemy.sql.elements import Label, TextClause, WithinGroup # noqa: TID251 + from sqlalchemy.sql.selectable import CTE # noqa: TID251 except ImportError: logger.debug( "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" @@ -47,10 +47,10 @@ CTE = None try: - from sqlalchemy.engine.row import Row + from sqlalchemy.engine.row import Row # noqa: TID251 except ImportError: try: - from 
sqlalchemy.engine.row import RowProxy + from sqlalchemy.engine.row import RowProxy # noqa: TID251 Row = RowProxy except ImportError: diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py index 3cd7dba51184..c382241284d4 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py @@ -15,7 +15,7 @@ from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: sa = None diff --git a/great_expectations/expectations/metrics/import_manager.py b/great_expectations/expectations/metrics/import_manager.py index 4bcc75458da3..6ba45648c36e 100644 --- a/great_expectations/expectations/metrics/import_manager.py +++ b/great_expectations/expectations/metrics/import_manager.py @@ -8,15 +8,15 @@ spark_import_warning_required = False try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: logger.debug("No SqlAlchemy module available.") sa = None try: - from sqlalchemy.engine import Engine as sqlalchemy_engine_Engine - from sqlalchemy.engine import Row as sqlalchemy_engine_Row - from sqlalchemy.engine import reflection + from sqlalchemy.engine import Engine as sqlalchemy_engine_Engine # noqa: TID251 + from sqlalchemy.engine import Row as sqlalchemy_engine_Row # noqa: TID251 + from sqlalchemy.engine import reflection # noqa: TID251 except ImportError: logger.debug("No SqlAlchemy.engine module available.") reflection = None @@ -24,15 +24,15 @@ sqlalchemy_engine_Row = None try: - import sqlalchemy.func.count as sa_func_count + import sqlalchemy.func.count as sa_func_count # noqa: TID251 except ImportError: logger.debug("No SqlAlchemy.func module available.") sa_func_count = None try: - import sqlalchemy.sql.expression.ColumnClause as sa_sql_expression_ColumnClause - import sqlalchemy.sql.expression.Select as sa_sql_expression_Select - import sqlalchemy.sql.expression.Selectable as sa_sql_expression_Selectable + import sqlalchemy.sql.expression.ColumnClause as sa_sql_expression_ColumnClause # noqa: TID251 + import sqlalchemy.sql.expression.Select as sa_sql_expression_Select # noqa: TID251 + import sqlalchemy.sql.expression.Selectable as sa_sql_expression_Selectable # noqa: TID251 except ImportError: logger.debug("No SqlAlchemy.sql.expression module available.") sa_sql_expression_ColumnClause = None @@ -40,7 +40,7 @@ sa_sql_expression_Selectable = None try: - from sqlalchemy.sql.elements import quoted_name + from sqlalchemy.sql.elements import quoted_name # noqa: TID251 except ImportError: logger.debug("No SqlAlchemy.sql.elements module available.") quoted_name = None diff --git a/great_expectations/expectations/metrics/table_metrics/table_column_types.py b/great_expectations/expectations/metrics/table_metrics/table_column_types.py index 9a0235d74cc6..47f834accdd1 100644 --- a/great_expectations/expectations/metrics/table_metrics/table_column_types.py +++ b/great_expectations/expectations/metrics/table_metrics/table_column_types.py @@ -18,7 +18,7 @@ from great_expectations.expectations.metrics.util import get_sqlalchemy_column_metadata try: - from sqlalchemy.sql.elements import TextClause + from sqlalchemy.sql.elements import TextClause # noqa: TID251 except ImportError: TextClause = None diff --git 
a/great_expectations/expectations/metrics/util.py b/great_expectations/expectations/metrics/util.py index 5c8e9bac2dcc..2d28460655b2 100644 --- a/great_expectations/expectations/metrics/util.py +++ b/great_expectations/expectations/metrics/util.py @@ -24,7 +24,7 @@ try: import psycopg2 # noqa: F401 - import sqlalchemy.dialects.postgresql.psycopg2 as sqlalchemy_psycopg2 + import sqlalchemy.dialects.postgresql.psycopg2 as sqlalchemy_psycopg2 # noqa: TID251 except (ImportError, KeyError): sqlalchemy_psycopg2 = None @@ -34,13 +34,13 @@ snowflake = None try: - import sqlalchemy as sa - from sqlalchemy.dialects import registry - from sqlalchemy.engine import Connection, Engine, reflection - from sqlalchemy.engine.interfaces import Dialect - from sqlalchemy.exc import OperationalError - from sqlalchemy.sql import Insert, Select, TableClause - from sqlalchemy.sql.elements import ( + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy.dialects import registry # noqa: TID251 + from sqlalchemy.engine import Connection, Engine, reflection # noqa: TID251 + from sqlalchemy.engine.interfaces import Dialect # noqa: TID251 + from sqlalchemy.exc import OperationalError # noqa: TID251 + from sqlalchemy.sql import Insert, Select, TableClause # noqa: TID251 + from sqlalchemy.sql.elements import ( # noqa: TID251 BinaryExpression, ColumnElement, Label, @@ -48,7 +48,7 @@ literal, quoted_name, ) - from sqlalchemy.sql.operators import custom_op + from sqlalchemy.sql.operators import custom_op # noqa: TID251 except ImportError: sa = None registry = None @@ -133,7 +133,7 @@ teradatatypes = None if TYPE_CHECKING: - import sqlalchemy + import sqlalchemy # noqa: TID251 def get_dialect_regex_expression( # noqa: C901 - 36 diff --git a/great_expectations/expectations/row_conditions.py b/great_expectations/expectations/row_conditions.py index 085ff76d7d0f..16f0dc3a0dc0 100644 --- a/great_expectations/expectations/row_conditions.py +++ b/great_expectations/expectations/row_conditions.py @@ -28,13 +28,13 @@ F = None try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: sa = None if TYPE_CHECKING: import pyspark.sql - from sqlalchemy.sql.expression import ColumnElement + from sqlalchemy.sql.expression import ColumnElement # noqa: TID251 def _set_notnull(s, l, t) -> None: # noqa: E741 # ambiguous name `l` diff --git a/great_expectations/profile/basic_dataset_profiler.py b/great_expectations/profile/basic_dataset_profiler.py index 8a1f09decb4c..d571d08356db 100644 --- a/great_expectations/profile/basic_dataset_profiler.py +++ b/great_expectations/profile/basic_dataset_profiler.py @@ -8,7 +8,7 @@ ) try: - from sqlalchemy.exc import OperationalError + from sqlalchemy.exc import OperationalError # noqa: TID251 except ModuleNotFoundError: OperationalError = RuntimeError diff --git a/great_expectations/rule_based_profiler/attributed_resolved_metrics.py b/great_expectations/rule_based_profiler/attributed_resolved_metrics.py index 2cecf0622369..e76831cc3efe 100644 --- a/great_expectations/rule_based_profiler/attributed_resolved_metrics.py +++ b/great_expectations/rule_based_profiler/attributed_resolved_metrics.py @@ -17,13 +17,13 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sa + import sqlalchemy as sa # noqa: TID251 except ImportError: logger.debug("No SqlAlchemy module available.") sa = None try: - from sqlalchemy.engine import Row as sqlalchemy_engine_Row + from sqlalchemy.engine import Row as sqlalchemy_engine_Row # noqa: TID251 except ImportError: logger.debug("No 
SqlAlchemy.engine module available.") sqlalchemy_engine_Row = None diff --git a/great_expectations/self_check/sqlalchemy_connection_manager.py b/great_expectations/self_check/sqlalchemy_connection_manager.py index 6f2347be5f04..266133f0e086 100644 --- a/great_expectations/self_check/sqlalchemy_connection_manager.py +++ b/great_expectations/self_check/sqlalchemy_connection_manager.py @@ -5,10 +5,10 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sqlalchemy - from sqlalchemy import create_engine - from sqlalchemy.engine import Connection, Engine - from sqlalchemy.exc import SQLAlchemyError + import sqlalchemy as sqlalchemy # noqa: TID251 + from sqlalchemy import create_engine # noqa: TID251 + from sqlalchemy.engine import Connection, Engine # noqa: TID251 + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: sqlalchemy = None create_engine = None diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index 00e819d5200e..f21293171163 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -95,10 +95,10 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sqlalchemy - from sqlalchemy import create_engine - from sqlalchemy.engine import Engine - from sqlalchemy.exc import SQLAlchemyError + import sqlalchemy as sqlalchemy # noqa: TID251 + from sqlalchemy import create_engine # noqa: TID251 + from sqlalchemy.engine import Engine # noqa: TID251 + from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 except ImportError: sqlalchemy = None create_engine = None @@ -121,10 +121,10 @@ spark_DataFrame = type(None) # type: ignore[assignment,misc] try: - import sqlalchemy.dialects.sqlite as sqlitetypes + import sqlalchemy.dialects.sqlite as sqlitetypes # noqa: TID251 # noinspection PyPep8Naming - from sqlalchemy.dialects.sqlite import dialect as sqliteDialect + from sqlalchemy.dialects.sqlite import dialect as sqliteDialect # noqa: TID251 SQLITE_TYPES = { "VARCHAR": sqlitetypes.VARCHAR, @@ -229,8 +229,10 @@ try: - import sqlalchemy.dialects.postgresql as postgresqltypes - from sqlalchemy.dialects.postgresql import dialect as postgresqlDialect + import sqlalchemy.dialects.postgresql as postgresqltypes # noqa: TID251 + from sqlalchemy.dialects.postgresql import ( # noqa: TID251 + dialect as postgresqlDialect, + ) POSTGRESQL_TYPES = { "TEXT": postgresqltypes.TEXT, @@ -250,10 +252,10 @@ POSTGRESQL_TYPES = {} try: - import sqlalchemy.dialects.mysql as mysqltypes + import sqlalchemy.dialects.mysql as mysqltypes # noqa: TID251 # noinspection PyPep8Naming - from sqlalchemy.dialects.mysql import dialect as mysqlDialect + from sqlalchemy.dialects.mysql import dialect as mysqlDialect # noqa: TID251 MYSQL_TYPES = { "TEXT": mysqltypes.TEXT, @@ -277,10 +279,10 @@ try: # SQLAlchemy does not export the "INT" type for the MS SQL Server dialect; however "INT" is supported by the engine. # Since SQLAlchemy exports the "INTEGER" type for the MS SQL Server dialect, alias "INT" to the "INTEGER" type. 
- import sqlalchemy.dialects.mssql as mssqltypes + import sqlalchemy.dialects.mssql as mssqltypes # noqa: TID251 # noinspection PyPep8Naming - from sqlalchemy.dialects.mssql import dialect as mssqlDialect + from sqlalchemy.dialects.mssql import dialect as mssqlDialect # noqa: TID251 try: getattr(mssqltypes, "INT") @@ -2807,7 +2809,7 @@ def _create_trino_engine( engine = create_engine( _get_trino_connection_string(hostname=hostname, schema_name=schema_name) ) - from sqlalchemy import text + from sqlalchemy import text # noqa: TID251 from trino.exceptions import TrinoUserError with engine.begin() as conn: diff --git a/great_expectations/util.py b/great_expectations/util.py index 6c83d630002d..66671f67cbee 100644 --- a/great_expectations/util.py +++ b/great_expectations/util.py @@ -77,10 +77,10 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sa - from sqlalchemy import Table - from sqlalchemy.engine import reflection - from sqlalchemy.sql import Select + import sqlalchemy as sa # noqa: TID251 + from sqlalchemy import Table # noqa: TID251 + from sqlalchemy.engine import reflection # noqa: TID251 + from sqlalchemy.sql import Select # noqa: TID251 except ImportError: logger.debug( @@ -2086,9 +2086,9 @@ def import_make_url(): still be accessed from sqlalchemy.engine.url to avoid import errors. """ if version.parse(sa.__version__) < version.parse("1.4"): - from sqlalchemy.engine.url import make_url + from sqlalchemy.engine.url import make_url # noqa: TID251 else: - from sqlalchemy.engine import make_url + from sqlalchemy.engine import make_url # noqa: TID251 return make_url From 3ad5cffc1cca0881cc1e56ef71c567215eb8f16a Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Wed, 5 Apr 2023 09:29:23 -0400 Subject: [PATCH 32/96] [DOCS] Fix code snippets for earlier versions (#7554) --- docs/build_docs | 54 +++++++++++--- docs/docusaurus/docusaurus.config.js | 4 +- docs/docusaurus/package.json | 3 +- .../scripts/remark-named-snippets/index.js | 15 +++- .../scripts/remark-named-snippets/snippet.js | 14 +++- docs/docusaurus/yarn.lock | 25 ++++++- docs/prepare_prior_versions.py | 73 +++++++++++++++++++ docs/yarn.lock | 4 - 8 files changed, 172 insertions(+), 20 deletions(-) create mode 100644 docs/prepare_prior_versions.py delete mode 100644 docs/yarn.lock diff --git a/docs/build_docs b/docs/build_docs index c1c3dbec0192..bbd85461e31f 100755 --- a/docs/build_docs +++ b/docs/build_docs @@ -10,6 +10,26 @@ CURRENT_COMMIT=$(git rev-parse HEAD) # git pull to get the latest tags git pull +echo -e "${ORANGE}Copying previous versioned docs${NC}" +curl "https://superconductive-public.s3.us-east-2.amazonaws.com/oss_docs_versions_20230404.zip" -o "oss_docs_versions.zip" +unzip -oq oss_docs_versions.zip -d . + +# Move versions.json outside of the repo so there are no conflicts when checking out earlier versions +VERSIONS_JSON_PATH=../../../versions.json +mv versions.json $VERSIONS_JSON_PATH + +for version in $(jq -r '.[]' $VERSIONS_JSON_PATH); do + echo -e "${ORANGE}Copying code referenced in docs from $version and writing to versioned_code/version-$version${NC}" + + git checkout "$version" + git pull + mkdir -p versioned_code/version-"$version" + cp -r ../../tests versioned_code/version-"$version" + cp -r ../../examples versioned_code/version-"$version" + cp -r ../../great_expectations versioned_code/version-"$version" + +done + # Get latest released version from tag, check out to build API docs. # Only if not PR deploy preview. 
if [ "$PULL_REQUEST" == "false" ] @@ -20,27 +40,39 @@ then git pull else echo -e "${ORANGE}Building from within a pull request, using the latest commit to build API docs so changes can be viewed in the Netlify deploy preview.${NC}" + git checkout "$CURRENT_COMMIT" + git pull fi +# Update versioned code and docs + +echo -e "${ORANGE}Updating filepath in versioned docs${NC}" +# This is done in prepare_prior_versions.py +# Update filepath in versioned docs if they are using the old linenumber style of file=L +# by adding the correct versioned_code filepath e.g. versioned_code/version-0.14.13/ + +echo -e "${ORANGE}Updating snippet names in versioned docs and code${NC}" +# This is done in prepare_prior_versions.py +# Update snippet names in versioned docs if they are using the style of name="" +# by prepending the version e.g. name="version-0.15.50 " +# This is done in the docs and code so that the snippet processing tool can match up the correct snippet +# based on the version of the code file that existed when the document was released. +cd ../ +python prepare_prior_versions.py +cd docusaurus + +# Build current docs echo -e "${ORANGE}Installing Great Expectations library dev dependencies.${NC}" (cd ../../; pip install -c constraints-dev.txt -e ".[test]") echo -e "${ORANGE}Installing api docs dependencies.${NC}" (cd ../sphinx_api_docs_source; pip install -r requirements-dev-api-docs.txt) -echo -e "${ORANGE}Building API docs.${NC}" +echo -e "${ORANGE}Building API docs for current version.${NC}" (cd ../../; invoke docs) -if [ "$PULL_REQUEST" == "false" ] -then - echo -e "${ORANGE}Not in a pull request. Checking back out current commit ${CURRENT_COMMIT} to build the rest of the docs.${NC}" - git checkout "$CURRENT_COMMIT" - git pull -fi - -echo -e "${ORANGE}Copying previous versions${NC}" -curl "https://superconductive-public.s3.us-east-2.amazonaws.com/oss_docs_versions.zip" -o "oss_docs_versions.zip" -unzip -oq oss_docs_versions.zip -d . +# Move versions.json back from outside of the repo +mv $VERSIONS_JSON_PATH versions.json echo -e "${ORANGE}Building docusaurus docs.${NC}" yarn build diff --git a/docs/docusaurus/docusaurus.config.js b/docs/docusaurus/docusaurus.config.js index a20ef4a09e51..4fb8298bdbb7 100644 --- a/docs/docusaurus/docusaurus.config.js +++ b/docs/docusaurus/docusaurus.config.js @@ -1,6 +1,7 @@ /** @type {import('@docusaurus/types').DocusaurusConfig} */ const remarkNamedSnippets = require('./scripts/remark-named-snippets/index') +const remarkCodeImport = require('remark-code-import') module.exports = { title: 'Great Expectations', @@ -253,7 +254,8 @@ module.exports = { { docs: { sidebarPath: require.resolve('./sidebars.js'), - remarkPlugins: [remarkNamedSnippets], + // Note: remarkCodeImport is included to handle earlier versions with line number references (e.g. 
v0.14.13) + remarkPlugins: [remarkNamedSnippets, remarkCodeImport], lastVersion: 'current', versions: { current: { diff --git a/docs/docusaurus/package.json b/docs/docusaurus/package.json index ac3819e5b07f..ebe855f835c4 100644 --- a/docs/docusaurus/package.json +++ b/docs/docusaurus/package.json @@ -13,7 +13,7 @@ "write-translations": "docusaurus write-translations", "write-heading-ids": "docusaurus write-heading-ids", "lint": "standard --fix", - "snippet-check": "node docs/scripts/remark-named-snippets/snippet.js" + "snippet-check": "node scripts/remark-named-snippets/snippet.js" }, "dependencies": { "@cmfcmf/docusaurus-search-local": "^0.11.0", @@ -32,6 +32,7 @@ "react": "^17.0.1", "react-dom": "^17.0.1", "react-select": "^4.3.0", + "remark-code-import": "^0.3.0", "sass": "^1.56.1", "search-insights": "^2.2.3", "standard": "^16.0.3" diff --git a/docs/docusaurus/scripts/remark-named-snippets/index.js b/docs/docusaurus/scripts/remark-named-snippets/index.js index 2d6a3c968deb..0da657f2d076 100644 --- a/docs/docusaurus/scripts/remark-named-snippets/index.js +++ b/docs/docusaurus/scripts/remark-named-snippets/index.js @@ -19,11 +19,24 @@ Named snippets are defined with the following syntax: ``` */ const visit = require('unist-util-visit') +const glob = require('glob') const constructSnippetMap = require('./snippet') +function getDirs () { + // Get all directories that should be processed + const manualDirs = ['../../great_expectations', '../../tests'] + const versionDirs = glob.sync('versioned_code/*/') + // remove v0.14.13 from processing since it does not use named snippets + const index = versionDirs.indexOf('versioned_code/version-0.14.13/') + if (index !== -1) { + versionDirs.splice(index, 1) + } + return manualDirs.concat(versionDirs) +} + function codeImport () { // Instantiated within the import so it can be hot-reloaded - const snippetMap = constructSnippetMap(['../../great_expectations', '../../tests']) + const snippetMap = constructSnippetMap(getDirs()) return function transformer (tree, file) { const codes = [] diff --git a/docs/docusaurus/scripts/remark-named-snippets/snippet.js b/docs/docusaurus/scripts/remark-named-snippets/snippet.js index 532fe8c03911..15e6a985972c 100644 --- a/docs/docusaurus/scripts/remark-named-snippets/snippet.js +++ b/docs/docusaurus/scripts/remark-named-snippets/snippet.js @@ -161,8 +161,20 @@ function sanitizeText (text) { * Note that this is what is run if this file is invoked by Node. * An alias `yarn snippet-check` is defined in `package.json` for convenience. 
*/ +function getDirs () { + // Get all directories that should be processed + const manualDirs = ['../../great_expectations', '../../tests'] + const versionDirs = glob.sync('versioned_code/*/') + // remove v0.14.13 from processing since it does not use named snippets + const index = versionDirs.indexOf('versioned_code/version-0.14.13/') + if (index !== -1) { + versionDirs.splice(index, 1) + } + return manualDirs.concat(versionDirs) +} + function main () { - const snippets = parseSourceDirectories(['../../great_expectations', '../../tests']) + const snippets = parseSourceDirectories(getDirs()) const targetFiles = process.argv.slice(2) const out = {} diff --git a/docs/docusaurus/yarn.lock b/docs/docusaurus/yarn.lock index 9f7b53c5379e..6c137340644e 100644 --- a/docs/docusaurus/yarn.lock +++ b/docs/docusaurus/yarn.lock @@ -7943,6 +7943,14 @@ relateurl@^0.2.7: resolved "https://registry.yarnpkg.com/relateurl/-/relateurl-0.2.7.tgz#54dbf377e51440aca90a4cd274600d3ff2d888a9" integrity sha1-VNvzd+UUQKypCkzSdGANP/LYiKk= +remark-code-import@^0.3.0: + version "0.3.0" + resolved "https://registry.yarnpkg.com/remark-code-import/-/remark-code-import-0.3.0.tgz#adc5b407e98ba50ad633b696a7843268cb227430" + integrity sha512-OAidTyShEroWMVP/WDEeth+DtbpnfCiOA03sDK86/EH+tukTxZaKakzSM5YlU9pb38v9NAX6FztZTazXunSKjQ== + dependencies: + to-gatsby-remark-plugin "^0.1.0" + unist-util-visit "^2.0.1" + remark-emoji@^2.2.0: version "2.2.0" resolved "https://registry.yarnpkg.com/remark-emoji/-/remark-emoji-2.2.0.tgz#1c702090a1525da5b80e15a8f963ef2c8236cac7" @@ -8863,6 +8871,13 @@ to-fast-properties@^2.0.0: resolved "https://registry.yarnpkg.com/to-fast-properties/-/to-fast-properties-2.0.0.tgz#dc5e698cbd079265bc73e0377681a4e4e83f616e" integrity sha1-3F5pjL0HkmW8c+A3doGk5Og/YW4= +to-gatsby-remark-plugin@^0.1.0: + version "0.1.0" + resolved "https://registry.yarnpkg.com/to-gatsby-remark-plugin/-/to-gatsby-remark-plugin-0.1.0.tgz#34167b2c3cf3209745cf97e5a488042586f9990d" + integrity sha512-blmhJ/gIrytWnWLgPSRCkhCPeki6UBK2daa3k9mGahN7GjwHu8KrS7F70MvwlsG7IE794JLgwAdCbi4hU4faFQ== + dependencies: + to-vfile "^6.1.0" + to-readable-stream@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/to-readable-stream/-/to-readable-stream-1.0.0.tgz#ce0aa0c2f3df6adf852efb404a783e77c0475771" @@ -8875,6 +8890,14 @@ to-regex-range@^5.0.1: dependencies: is-number "^7.0.0" +to-vfile@^6.1.0: + version "6.1.0" + resolved "https://registry.yarnpkg.com/to-vfile/-/to-vfile-6.1.0.tgz#5f7a3f65813c2c4e34ee1f7643a5646344627699" + integrity sha512-BxX8EkCxOAZe+D/ToHdDsJcVI4HqQfmw0tCkp31zf3dNP/XWIAjU4CmeuSwsSoOzOTqHPOL0KUzyZqJplkD0Qw== + dependencies: + is-buffer "^2.0.0" + vfile "^4.0.0" + toidentifier@1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/toidentifier/-/toidentifier-1.0.1.tgz#3be34321a88a820ed1bd80dfaa33e479fbb8dd35" @@ -9098,7 +9121,7 @@ unist-util-visit-parents@^3.0.0: "@types/unist" "^2.0.0" unist-util-is "^4.0.0" -unist-util-visit@2.0.3, unist-util-visit@^2.0.0, unist-util-visit@^2.0.3: +unist-util-visit@2.0.3, unist-util-visit@^2.0.0, unist-util-visit@^2.0.1, unist-util-visit@^2.0.3: version "2.0.3" resolved "https://registry.yarnpkg.com/unist-util-visit/-/unist-util-visit-2.0.3.tgz#c3703893146df47203bb8a9795af47d7b971208c" integrity sha512-iJ4/RczbJMkD0712mGktuGpm/U4By4FfDonL7N/9tATGIF4imikjOuagyMY53tnZq3NP6BcmlrHhEKAfGWjh7Q== diff --git a/docs/prepare_prior_versions.py b/docs/prepare_prior_versions.py new file mode 100644 index 000000000000..1b5040a62775 --- /dev/null +++ b/docs/prepare_prior_versions.py @@ -0,0 
+1,73 @@ +"""Prepare prior docs versions of GX for inclusion into the latest docs under the version dropdown. + +There are changes to paths that need to be made to prior versions of docs. +""" +from __future__ import annotations + +import glob +import pathlib +import re + + +def _docs_dir() -> pathlib.Path: + """Base directory for docs (contains docusaurus folder).""" + return pathlib.Path().absolute() + + +def change_paths_for_docs_file_references(): + """Change file= style references to use versioned_docs paths. + + This is used in v0.14 docs like v0.14.13 since we moved to using named + snippets only for v0.15.50 and later. + """ + path = _docs_dir() / "docusaurus/versioned_docs/version-0.14.13/" + files = glob.glob(f"{path}/**/*.md", recursive=True) + pattern = re.compile(r"((.*)(file *= *)((../)*))(.*)") + path_to_insert = "versioned_code/version-0.14.13/" + + for file_path in files: + with open(file_path, "r+") as f: + contents = f.read() + contents = re.sub(pattern, rf"\1{path_to_insert}\6", contents) + f.seek(0) + f.truncate() + f.write(contents) + print(f"processed {file_path}") + + +def _paths_to_versioned_docs() -> list[pathlib.Path]: + data_path = _docs_dir() / "docusaurus/versioned_docs" + paths = [f for f in data_path.iterdir() if f.is_dir() and "0.14.13" not in str(f)] + return paths + + +def _paths_to_versioned_code() -> list[pathlib.Path]: + data_path = _docs_dir() / "docusaurus/versioned_code" + paths = [f for f in data_path.iterdir() if f.is_dir() and "0.14.13" not in str(f)] + return paths + + +def prepend_version_info_to_name_for_snippet_by_name_references(): + """Prepend version info e.g. name="snippet_name" -> name="version-0.15.50 snippet_name" """ + + pattern = re.compile(r"((.*)(name *= *\"))(.*)") + paths = _paths_to_versioned_docs() + _paths_to_versioned_code() + + for path in paths: + version = path.name + files = [] + for extension in (".md", ".mdx", ".py", ".yml", ".yaml"): + files.extend(glob.glob(f"{path}/**/*{extension}", recursive=True)) + for file_path in files: + with open(file_path, "r+") as f: + contents = f.read() + contents = re.sub(pattern, rf"\1{version} \4", contents) + f.seek(0) + f.truncate() + f.write(contents) + print(f"processed {file_path}") + + +if __name__ == "__main__": + change_paths_for_docs_file_references() + prepend_version_info_to_name_for_snippet_by_name_references() diff --git a/docs/yarn.lock b/docs/yarn.lock deleted file mode 100644 index fb57ccd13afb..000000000000 --- a/docs/yarn.lock +++ /dev/null @@ -1,4 +0,0 @@ -# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
-# yarn lockfile v1 - - From 6ffff828ecb212e4538021329d6e18445e89049b Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Wed, 5 Apr 2023 09:44:38 -0400 Subject: [PATCH 33/96] [MAINTENANCE] Add ruff rule for sqlalchemy imports (#7562) Co-authored-by: Gabriel Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .git-blame-ignore-revs | 5 ++++- contrib/ruff.toml | 2 ++ pyproject.toml | 3 +-- tests/ruff.toml | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 78167b53ecfb..ae201d0ec35a 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -6,6 +6,9 @@ # https://black.readthedocs.io/en/stable/guides/introducing_black_to_your_project.html#avoiding-ruining-git-blame # Apply noqa markers for all TCH001 violations f5e7df1846102d9a62cc9b9110387925ffae60cc -# Apply noqa markes for all PTH (use-pathlib) violations +# Apply noqa markers for all PTH (use-pathlib) violations # https://github.com/great-expectations/great_expectations/pull/7290 597b2b625569b6f5f110f8230ac26ab405167da6 +# Apply noqa markers for TID251 (sqlalchemy) violations +# https://github.com/great-expectations/great_expectations/pull/7564 +e55b3484a86f654e8b819041dd6cc73730e01a8f diff --git a/contrib/ruff.toml b/contrib/ruff.toml index adfe58009d07..51aa132690d9 100644 --- a/contrib/ruff.toml +++ b/contrib/ruff.toml @@ -7,6 +7,8 @@ extend-ignore = [ # https://github.com/charliermarsh/ruff#flake8-type-checking-tch # This is likely to be a high-touch rule that most contribs don't need to care about. "TCH001", + # https://beta.ruff.rs/docs/rules/#flake8-tidy-imports-tid + "TID251", # banned-api - is used to prevent opaque attribute errors caused by missing optional dependencies ] [isort] diff --git a/pyproject.toml b/pyproject.toml index adcd5f843b3c..6fc4db8fae8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -295,8 +295,7 @@ convention = "google" [tool.ruff.flake8-tidy-imports] [tool.ruff.flake8-tidy-imports.banned-api] -# Uncomment this to prevent importing sqlalchemy everywhere but optional_imports.py -# "sqlalchemy".msg = "Don't import sqlalchemy directly, please use the great_expectations.import_manager." +"sqlalchemy".msg = "Don't import sqlalchemy directly, please use the great_expectations.optional_imports." [tool.pytest.ini_options] filterwarnings = [ diff --git a/tests/ruff.toml b/tests/ruff.toml index aef98628a5c0..a8987368acd5 100644 --- a/tests/ruff.toml +++ b/tests/ruff.toml @@ -8,6 +8,8 @@ extend-ignore = [ # This is likely to be a high-touch rule. Doing this in `tests` doesn't help circular imports. 
# Let's differ this for tests until there are auto-fixes "TCH001", + # https://beta.ruff.rs/docs/rules/#flake8-tidy-imports-tid + "TID251", # banned-api - is used to prevent opaque attribute errors caused by missing optional dependencies ] [isort] From 2eff18a4febeef1051926182934f335122d878d3 Mon Sep 17 00:00:00 2001 From: Kyle Eaton Date: Wed, 5 Apr 2023 10:18:35 -0400 Subject: [PATCH 34/96] [MAINTENANCE] adding a footer to data docs with a link to the cloud page (#7532) --- .../styles/data_docs_default_styles.css | 37 +++++++++++++++++++ .../render/view/templates/cloud-footer.j2 | 4 ++ .../render/view/templates/index_page.j2 | 1 + .../render/view/templates/page.j2 | 1 + 4 files changed, 43 insertions(+) create mode 100644 great_expectations/render/view/templates/cloud-footer.j2 diff --git a/great_expectations/render/view/static/styles/data_docs_default_styles.css b/great_expectations/render/view/static/styles/data_docs_default_styles.css index e83c4c8bf654..21aa65c600c7 100644 --- a/great_expectations/render/view/static/styles/data_docs_default_styles.css +++ b/great_expectations/render/view/static/styles/data_docs_default_styles.css @@ -127,6 +127,31 @@ td .show-scrollbars { top: 0; } +footer { + position: fixed; + border-top: 1px solid #98989861; + bottom: 0; + left: 0; + right: 0; + height: 32px; + padding: 4px; + font-size: 14px; + text-align: right; + width: 100%; + background: white; + z-index: 100000; +} +footer a { + padding-right: 8px; + color: #ff6210; + font-weight: 600; +} + +footer a:hover { + color: #bc490d; + text-decoration: underline; +} + /* some css overrides for dark mode*/ @media (prefers-color-scheme: dark) { .table { @@ -170,4 +195,16 @@ td .show-scrollbars { .navbar-brand a img { visibility:hidden } + footer { + border-top: 1px solid #ffffff61; + background: black; + z-index: 100000; + } + footer a { + color: #ff6210; + } + + footer a:hover { + color: #ff6210; + } } \ No newline at end of file diff --git a/great_expectations/render/view/templates/cloud-footer.j2 b/great_expectations/render/view/templates/cloud-footer.j2 new file mode 100644 index 000000000000..b57fd0d0234d --- /dev/null +++ b/great_expectations/render/view/templates/cloud-footer.j2 @@ -0,0 +1,4 @@ + +
+<footer>
+    Explore how Great Expectations Cloud visualizes and creates shareable links for anyone on your team. Check out GX Cloud.
+</footer>
+
    diff --git a/great_expectations/render/view/templates/index_page.j2 b/great_expectations/render/view/templates/index_page.j2 index c6ef8fbd0de4..f83a17dfe03b 100644 --- a/great_expectations/render/view/templates/index_page.j2 +++ b/great_expectations/render/view/templates/index_page.j2 @@ -53,6 +53,7 @@ + {% include 'cloud-footer.j2' %} {% if cta_footer %} {% include 'cta_footer.j2' %} diff --git a/great_expectations/render/view/templates/page.j2 b/great_expectations/render/view/templates/page.j2 index 4dbf966965fb..a97113f36aa9 100644 --- a/great_expectations/render/view/templates/page.j2 +++ b/great_expectations/render/view/templates/page.j2 @@ -56,5 +56,6 @@ + {% include 'cloud-footer.j2' %} From b451226e0c07782a1195346f07384cb11bf7748a Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 5 Apr 2023 09:13:00 -0600 Subject: [PATCH 35/96] [DOCS] Fix typo in docs (#7568) --- .../how_to_create_custom_set_based_column_map_expectations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_set_based_column_map_expectations.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_set_based_column_map_expectations.md index e03c3ba9214d..da0321df5e4e 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_set_based_column_map_expectations.md +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_set_based_column_map_expectations.md @@ -4,7 +4,7 @@ title: How to create a Custom Set-Based Column Map Expectation import Prerequisites from '../creating_custom_expectations/components/prerequisites.jsx' import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -**`SetBasedColumnMapExpectations`** are a sub-type of . They are evaluated for a single column and ask whether each row in that column belongs to the specified set. +**`SetBasedColumnMapExpectations`** are a sub-type of . They are evaluated for a single column and ask whether each row in that column belongs to the specified set. Based on the result, they then calculate the percentage of rows that gave a positive answer. If that percentage meets a specified threshold (100% by default), the Expectation considers that data valid. This threshold is configured via the `mostly` parameter, which can be passed as input to your Custom `SetBasedColumnMapExpectation` as a `float` between 0 and 1. 
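The `mostly` threshold described in the doc change above can be illustrated with a minimal sketch; the validator, expectation name, and column below are placeholders for illustration only and are not defined anywhere in this patch:

```python
# Minimal sketch of passing `mostly` to a custom SetBasedColumnMapExpectation.
# Assumes `validator` is an existing Great Expectations Validator and
# `expect_column_values_to_be_in_solfege_scale_set` is a registered custom
# Expectation (both hypothetical here).
result = validator.expect_column_values_to_be_in_solfege_scale_set(
    column="note",  # placeholder column name
    mostly=0.95,    # validation passes if at least 95% of rows belong to the set
)
print(result.success)
```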
From cba3db448686aa03d709b26a3e54f9e50322135a Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 5 Apr 2023 09:13:59 -0600 Subject: [PATCH 36/96] [DOCS] Moar typo fix (#7569) Co-authored-by: Anthony Burdi --- ...ow_to_create_custom_regex_based_column_map_expectations.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations.md index 83816c359529..80f88203cf99 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations.md +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations.md @@ -4,7 +4,7 @@ title: How to create a Custom Regex-Based Column Map Expectation import Prerequisites from '../creating_custom_expectations/components/prerequisites.jsx' import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -**`RegexBasedColumnMapExpectations`** are a sub-type of that allow for highly-extensible, regex-powered validation of your data. +**`RegexBasedColumnMapExpectations`** are a sub-type of that allow for highly-extensible, regex-powered validation of your data. They are evaluated for a single column and ask a yes/no, regex-based question for every row in that column. Based on the result, they then calculate the percentage of rows that gave a positive answer. If that percentage meets a specified threshold (100% by default), the Expectation considers that data valid. This threshold is configured via the `mostly` parameter, which can be passed as input to your Custom `RegexBasedColumnMapExpectation` as a `float` between 0 and 1. 
@@ -298,4 +298,4 @@ For more information on our code standards and contribution, see our guide on [L To view the full script used in this page, see it on GitHub: - [expect_column_values_to_only_contain_vowels.py](https://github.com/great-expectations/great_expectations/blob/develop/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py) -::: \ No newline at end of file +::: From 6ac7c16a25c4b03932592ca6dae30ebb39f3eae0 Mon Sep 17 00:00:00 2001 From: Nathan Farmer Date: Wed, 5 Apr 2023 12:24:39 -0400 Subject: [PATCH 37/96] [FEATURE] Fluent `DataAsset` `batch_metadata` config variables (#7513) --- .../data_context/abstract_data_context.py | 6 +- .../datasource/fluent/file_path_data_asset.py | 8 +- .../datasource/fluent/interfaces.py | 31 +++- .../datasource/fluent/pandas_datasource.py | 12 +- .../datasource/fluent/pandas_datasource.pyi | 3 +- .../datasource/fluent/spark_datasource.py | 6 +- .../datasource/fluent/sql_datasource.py | 8 +- .../datasource/fluent/integration/conftest.py | 2 + tests/datasource/fluent/test_config.py | 3 + .../fluent/test_pandas_datasource.py | 32 ++++- .../fluent/test_pandas_dbfs_datasource.py | 9 +- .../test_pandas_filesystem_datasource.py | 54 ++++++- .../fluent/test_pandas_s3_datasource.py | 9 +- .../fluent/test_postgres_datasource.py | 133 +++++++++++++++--- .../fluent/test_spark_datasource.py | 64 +++++++++ .../test_spark_filesystem_datasource.py | 8 +- .../fluent/test_sqlite_datasource.py | 28 ++-- 17 files changed, 351 insertions(+), 65 deletions(-) diff --git a/great_expectations/data_context/data_context/abstract_data_context.py b/great_expectations/data_context/data_context/abstract_data_context.py index cf8dd0b98bb5..0b0b633301ef 100644 --- a/great_expectations/data_context/data_context/abstract_data_context.py +++ b/great_expectations/data_context/data_context/abstract_data_context.py @@ -1443,6 +1443,7 @@ def get_datasource( ) if datasource_name in self._cached_datasources: + self._cached_datasources[datasource_name]._data_context = self return self._cached_datasources[datasource_name] datasource_config: DatasourceConfig = self._datasource_store.retrieve_by_name( @@ -1455,12 +1456,13 @@ def get_datasource( substituted_config = self.config_provider.substitute_config(raw_config_dict) # Instantiate the datasource and add to our in-memory cache of datasources, this does not persist: - datasource_config = datasourceConfigSchema.load(substituted_config) datasource: Union[ - LegacyDatasource, BaseDatasource + LegacyDatasource, BaseDatasource, FluentDatasource ] = self._instantiate_datasource_from_config( raw_config=raw_config, substituted_config=substituted_config ) + if isinstance(datasource, FluentDatasource): + datasource._data_context = self self._cached_datasources[datasource_name] = datasource return datasource diff --git a/great_expectations/datasource/fluent/file_path_data_asset.py b/great_expectations/datasource/fluent/file_path_data_asset.py index c68e39f0f1b7..51739df1b346 100644 --- a/great_expectations/datasource/fluent/file_path_data_asset.py +++ b/great_expectations/datasource/fluent/file_path_data_asset.py @@ -40,6 +40,7 @@ from great_expectations.datasource.fluent.data_asset.data_connector import ( DataConnector, ) + from great_expectations.datasource.fluent.interfaces import BatchMetadata from great_expectations.execution_engine import ( PandasExecutionEngine, SparkDFExecutionEngine, @@ -55,6 +56,7 @@ class _FilePathDataAsset(DataAsset): "order_by", "batching_regex", # file_path 
argument "kwargs", # kwargs need to be unpacked and passed separately + "batch_metadata", } # General file-path DataAsset pertaining attributes. @@ -199,7 +201,7 @@ def get_batch_list_from_batch_request( batch_spec_options: dict batch_data: Any batch_markers: BatchMarkers - batch_metadata: BatchRequestOptions + batch_metadata: BatchMetadata batch: Batch for batch_definition in batch_definition_list: batch_spec = self._data_connector.build_batch_spec( @@ -226,7 +228,9 @@ def get_batch_list_from_batch_request( batch_definition.batch_identifiers ) - batch_metadata = copy.deepcopy(fully_specified_batch_request.options) + batch_metadata = self._get_batch_metadata_from_batch_request( + batch_request=fully_specified_batch_request + ) # Some pydantic annotations are postponed due to circular imports. # Batch.update_forward_refs() will set the annotations before we diff --git a/great_expectations/datasource/fluent/interfaces.py b/great_expectations/datasource/fluent/interfaces.py index 3e7c8e87a8a8..070d71741f07 100644 --- a/great_expectations/datasource/fluent/interfaces.py +++ b/great_expectations/datasource/fluent/interfaces.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import dataclasses import functools import logging @@ -28,6 +29,7 @@ from pydantic import dataclasses as pydantic_dc from typing_extensions import TypeAlias, TypeGuard +from great_expectations.core.config_substitutor import _ConfigurationSubstitutor from great_expectations.core.id_dict import BatchSpec # noqa: TCH001 from great_expectations.datasource.fluent.fluent_base_model import ( FluentBaseModel, @@ -143,7 +145,7 @@ def _sorter_from_str(sort_key: str) -> Sorter: # It would be best to bind this to Datasource, but we can't now due to circular dependencies -_DatasourceT = TypeVar("_DatasourceT") +_DatasourceT = TypeVar("_DatasourceT", bound=MetaDatasource) class DataAsset(FluentBaseModel, Generic[_DatasourceT]): @@ -204,8 +206,6 @@ def get_batch_list_from_batch_request( ) -> List[Batch]: raise NotImplementedError - # End Abstract Methods - def build_batch_request( self, options: Optional[BatchRequestOptions] = None ) -> BatchRequest: @@ -224,9 +224,6 @@ def build_batch_request( """One must implement "build_batch_request" on a DataAsset subclass.""" ) - def _valid_batch_request_options(self, options: BatchRequestOptions) -> bool: - return set(options.keys()).issubset(set(self.batch_request_options)) - def _validate_batch_request(self, batch_request: BatchRequest) -> None: """Validates the batch_request has the correct form. @@ -237,6 +234,25 @@ def _validate_batch_request(self, batch_request: BatchRequest) -> None: """One must implement "_validate_batch_request" on a DataAsset subclass.""" ) + # End Abstract Methods + + def _valid_batch_request_options(self, options: BatchRequestOptions) -> bool: + return set(options.keys()).issubset(set(self.batch_request_options)) + + def _get_batch_metadata_from_batch_request( + self, batch_request: BatchRequest + ) -> BatchMetadata: + """Performs config variable substitution and populates batch request options for + Batch.metadata at runtime. 
+ """ + batch_metadata = copy.deepcopy(self.batch_metadata) + config_variables = self._datasource._data_context.config_variables # type: ignore[attr-defined] + batch_metadata = _ConfigurationSubstitutor().substitute_all_config_variables( + data=batch_metadata, replace_variables_dict=config_variables + ) + batch_metadata.update(copy.deepcopy(batch_request.options)) + return batch_metadata + # Sorter methods @pydantic.validator("order_by", pre=True) def _parse_order_by_sorters( @@ -458,6 +474,7 @@ def get_asset(self, asset_name: str) -> _DataAssetT: """Returns the DataAsset referred to by name""" # This default implementation will be used if protocol is inherited try: + self.assets[asset_name]._datasource = self return self.assets[asset_name] except KeyError as exc: raise LookupError( @@ -576,7 +593,7 @@ class Batch(FluentBaseModel): id: str = "" # metadata is any arbitrary data one wants to associate with a batch. GX will add arbitrary metadata # to a batch so developers may want to namespace any custom metadata they add. - metadata: Dict[str, Any] = {} + metadata: Dict[str, Any] = Field(default_factory=dict, allow_mutation=True) # TODO: These legacy fields are currently required. They are only used in usage stats so we # should figure out a better way to anonymize and delete them. diff --git a/great_expectations/datasource/fluent/pandas_datasource.py b/great_expectations/datasource/fluent/pandas_datasource.py index 27f6427dc040..750e7c9a2cb2 100644 --- a/great_expectations/datasource/fluent/pandas_datasource.py +++ b/great_expectations/datasource/fluent/pandas_datasource.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import dataclasses import logging import sqlite3 @@ -63,6 +62,7 @@ AbstractSetIntStr = AbstractSet[Union[int, str]] from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, BatchRequestOptions, ) from great_expectations.execution_engine import PandasExecutionEngine @@ -146,8 +146,9 @@ def get_batch_list_from_batch_request( batch_spec_passthrough=None, ) - batch_metadata = copy.deepcopy(self.batch_metadata) or {} - batch_metadata.update(copy.deepcopy(batch_request.options)) + batch_metadata: BatchMetadata = self._get_batch_metadata_from_batch_request( + batch_request=batch_request + ) # Some pydantic annotations are postponed due to circular imports. # Batch.update_forward_refs() will set the annotations before we @@ -374,8 +375,9 @@ def get_batch_list_from_batch_request( batch_spec_passthrough=None, ) - batch_metadata = copy.deepcopy(self.batch_metadata) or {} - batch_metadata.update(copy.deepcopy(batch_request.options)) + batch_metadata: BatchMetadata = self._get_batch_metadata_from_batch_request( + batch_request=batch_request + ) # Some pydantic annotations are postponed due to circular imports. # Batch.update_forward_refs() will set the annotations before we diff --git a/great_expectations/datasource/fluent/pandas_datasource.pyi b/great_expectations/datasource/fluent/pandas_datasource.pyi index 07972231b06e..023fbeadb1dd 100644 --- a/great_expectations/datasource/fluent/pandas_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_datasource.pyi @@ -143,12 +143,11 @@ class PandasDatasource(_PandasDatasource): type: Literal["pandas"] assets: Dict[str, _PandasDataAsset] def test_connection(self, test_assets: bool = ...) -> None: ... - def _get_validator(self, asset: _PandasDataAssetT) -> Validator: ... 
def add_dataframe_asset( self, name: str, - *, dataframe: pd.DataFrame, + *, batch_metadata: Optional[BatchMetadata] = ..., ) -> DataFrameAsset: ... def read_dataframe( diff --git a/great_expectations/datasource/fluent/spark_datasource.py b/great_expectations/datasource/fluent/spark_datasource.py index 7f79e3af7466..adc676db8528 100644 --- a/great_expectations/datasource/fluent/spark_datasource.py +++ b/great_expectations/datasource/fluent/spark_datasource.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import logging from typing import ( TYPE_CHECKING, @@ -28,6 +27,7 @@ from great_expectations.optional_imports import SPARK_NOT_IMPORTED, pyspark if TYPE_CHECKING: + from great_expectations.datasource.fluent.interfaces import BatchMetadata from great_expectations.execution_engine import SparkDFExecutionEngine @@ -132,7 +132,9 @@ def get_batch_list_from_batch_request( batch_spec_passthrough=None, ) - batch_metadata = copy.deepcopy(batch_request.options) + batch_metadata: BatchMetadata = self._get_batch_metadata_from_batch_request( + batch_request=batch_request + ) # Some pydantic annotations are postponed due to circular imports. # Batch.update_forward_refs() will set the annotations before we diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index ff2eb171c70e..0ac628cdec4a 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -30,7 +30,6 @@ ) from great_expectations.datasource.fluent.interfaces import ( Batch, - BatchMetadata, BatchRequest, BatchRequestOptions, DataAsset, @@ -51,6 +50,8 @@ import sqlalchemy as sa # noqa: TID251 from typing_extensions import Self + from great_expectations.datasource.fluent.interfaces import BatchMetadata + class SQLDatasourceError(Exception): pass @@ -576,8 +577,9 @@ def get_batch_list_from_batch_request( splitter = self.splitter batch_spec_kwargs: dict[str, str | dict | None] for request in self._fully_specified_batch_requests(batch_request): - batch_metadata = copy.deepcopy(self.batch_metadata) - batch_metadata.update(copy.deepcopy(request.options)) + batch_metadata: BatchMetadata = self._get_batch_metadata_from_batch_request( + batch_request=request + ) batch_spec_kwargs = self._create_batch_spec_kwargs() if splitter: batch_spec_kwargs["splitter_method"] = splitter.method_name diff --git a/tests/datasource/fluent/integration/conftest.py b/tests/datasource/fluent/integration/conftest.py index 5fc56f749ff8..85ddc0401ed2 100644 --- a/tests/datasource/fluent/integration/conftest.py +++ b/tests/datasource/fluent/integration/conftest.py @@ -84,11 +84,13 @@ def pandas_filesystem_datasource( def pandas_data( context: AbstractDataContext, ) -> tuple[AbstractDataContext, PandasFilesystemDatasource, DataAsset, BatchRequest]: + context.config_variables.update({"pipeline_filename": __file__}) pandas_ds = pandas_filesystem_datasource(context=context) asset = pandas_ds.add_csv_asset( name="csv_asset", batching_regex=r"yellow_tripdata_sample_(?P\d{4})-(?P\d{2})\.csv", order_by=["year", "month"], + batch_metadata={"my_pipeline": "${pipeline_filename}"}, ) batch_request = asset.build_batch_request({"year": "2019", "month": "01"}) return context, pandas_ds, asset, batch_request diff --git a/tests/datasource/fluent/test_config.py b/tests/datasource/fluent/test_config.py index a24050f59b10..04f6a642cd3b 100644 --- a/tests/datasource/fluent/test_config.py +++ b/tests/datasource/fluent/test_config.py @@ -99,6 
+99,9 @@ "batching_regex": r"yellow_tripdata_sample_(?P\d{4})-(?P\d{2}).csv", "sep": "|", "names": ["col1", "col2"], + "batch_metadata": { + "pipeline_filename": "${pipeline_filename}", + }, }, "my_json_asset": { "type": "json", diff --git a/tests/datasource/fluent/test_pandas_datasource.py b/tests/datasource/fluent/test_pandas_datasource.py index 96bb6d15c3a9..4288591ac631 100644 --- a/tests/datasource/fluent/test_pandas_datasource.py +++ b/tests/datasource/fluent/test_pandas_datasource.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import inspect import logging import pathlib @@ -473,22 +474,41 @@ def test_dataframe_asset(empty_data_context: AbstractDataContext, test_df_pandas ) -def test_dynamic_pandas_batch_metadata( +def test_pandas_data_asset_batch_metadata( empty_data_context: AbstractDataContext, valid_file_path: pathlib.Path ): + my_config_variables = {"pipeline_filename": __file__} + empty_data_context.config_variables.update(my_config_variables) + pandas_datasource = empty_data_context.sources.pandas_default batch_metadata = { - "pipeline_filename": "my_data_pipeline.ipynb", + "no_curly_pipeline_filename": "$pipeline_filename", + "curly_pipeline_filename": "${pipeline_filename}", "pipeline_step": "transform_3", } - csv_asset_name = "my_csv_asset" - csv_asset = pandas_datasource.add_csv_asset( - name=csv_asset_name, + name="my_csv_asset", filepath_or_buffer=valid_file_path, batch_metadata=batch_metadata, ) - assert csv_asset assert csv_asset.batch_metadata == batch_metadata + + batch_list = csv_asset.get_batch_list_from_batch_request( + csv_asset.build_batch_request() + ) + assert len(batch_list) == 1 + + # allow mutation of this attribute + batch_list[0].metadata["also_this_one"] = "other_batch-level_value" + + substituted_batch_metadata = copy.deepcopy(batch_metadata) + substituted_batch_metadata.update( + { + "no_curly_pipeline_filename": __file__, + "curly_pipeline_filename": __file__, + "also_this_one": "other_batch-level_value", + } + ) + assert batch_list[0].metadata == substituted_batch_metadata diff --git a/tests/datasource/fluent/test_pandas_dbfs_datasource.py b/tests/datasource/fluent/test_pandas_dbfs_datasource.py index 43838d5f3a77..ab975bfd18b4 100644 --- a/tests/datasource/fluent/test_pandas_dbfs_datasource.py +++ b/tests/datasource/fluent/test_pandas_dbfs_datasource.py @@ -41,7 +41,9 @@ @pytest.fixture -def pandas_dbfs_datasource(fs: FakeFilesystem) -> PandasDBFSDatasource: +def pandas_dbfs_datasource( + empty_data_context, fs: FakeFilesystem +) -> PandasDBFSDatasource: # Copy boto modules into fake filesystem (see https://github.com/spulec/moto/issues/1682#issuecomment-645016188) for module in [boto3, botocore]: module_dir = pathlib.Path(module.__file__).parent @@ -71,10 +73,13 @@ def pandas_dbfs_datasource(fs: FakeFilesystem) -> PandasDBFSDatasource: ], ) - return PandasDBFSDatasource( # type: ignore[call-arg] + pandas_dbfs_datasource = PandasDBFSDatasource( # type: ignore[call-arg] name="pandas_dbfs_datasource", base_directory=pathlib.Path(base_directory), ) + pandas_dbfs_datasource._data_context = empty_data_context + + return pandas_dbfs_datasource @pytest.fixture diff --git a/tests/datasource/fluent/test_pandas_filesystem_datasource.py b/tests/datasource/fluent/test_pandas_filesystem_datasource.py index a65a667def1a..3ae3b603077d 100644 --- a/tests/datasource/fluent/test_pandas_filesystem_datasource.py +++ b/tests/datasource/fluent/test_pandas_filesystem_datasource.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import 
inspect import logging import pathlib @@ -31,6 +32,7 @@ from great_expectations.alias_types import PathStr from great_expectations.data_context import AbstractDataContext from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, SortersDefinition, ) @@ -45,7 +47,7 @@ @pytest.fixture -def pandas_filesystem_datasource() -> PandasFilesystemDatasource: +def pandas_filesystem_datasource(empty_data_context) -> PandasFilesystemDatasource: base_directory_rel_path = pathlib.Path( "..", "..", "test_sets", "taxi_yellow_tripdata_samples" ) @@ -54,10 +56,12 @@ def pandas_filesystem_datasource() -> PandasFilesystemDatasource: .parent.joinpath(base_directory_rel_path) .resolve(strict=True) ) - return PandasFilesystemDatasource( # type: ignore[call-arg] + pandas_filesystem_datasource = PandasFilesystemDatasource( # type: ignore[call-arg] name="pandas_filesystem_datasource", base_directory=base_directory_abs_path, ) + pandas_filesystem_datasource._data_context = empty_data_context + return pandas_filesystem_datasource @pytest.fixture @@ -726,3 +730,49 @@ def test_test_connection_failures( with pytest.raises(type(test_connection_error)) as e: pandas_filesystem_datasource.test_connection() assert str(e.value) == str(test_connection_error) + + +@pytest.mark.unit +def test_csv_asset_batch_metadata( + pandas_filesystem_datasource: PandasFilesystemDatasource, +): + my_config_variables = {"pipeline_filename": __file__} + pandas_filesystem_datasource._data_context.config_variables.update( + my_config_variables + ) + + asset_specified_metadata = { + "pipeline_name": "my_pipeline", + "no_curly_pipeline_filename": "$pipeline_filename", + "curly_pipeline_filename": "${pipeline_filename}", + } + + asset = pandas_filesystem_datasource.add_csv_asset( + name="csv_asset", + batching_regex=r"yellow_tripdata_sample_\d{4}-(?P\d{2})\.csv", + batch_metadata=asset_specified_metadata, + ) + assert asset.batch_metadata == asset_specified_metadata + + batch_request = asset.build_batch_request() + + batches = pandas_filesystem_datasource.get_batch_list_from_batch_request( + batch_request + ) + + substituted_batch_metadata: BatchMetadata = copy.deepcopy(asset_specified_metadata) + substituted_batch_metadata.update( + { + "no_curly_pipeline_filename": __file__, + "curly_pipeline_filename": __file__, + } + ) + + months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] + + for i, month in enumerate(months): + substituted_batch_metadata["month"] = month + actual_metadata = copy.deepcopy(batches[i].metadata) + # not testing path for the purposes of this test + actual_metadata.pop("path") + assert actual_metadata == substituted_batch_metadata diff --git a/tests/datasource/fluent/test_pandas_s3_datasource.py b/tests/datasource/fluent/test_pandas_s3_datasource.py index bed6a0924673..de02a75a5e35 100644 --- a/tests/datasource/fluent/test_pandas_s3_datasource.py +++ b/tests/datasource/fluent/test_pandas_s3_datasource.py @@ -83,7 +83,9 @@ def s3_bucket(s3_mock: BaseClient, aws_s3_bucket_name: str) -> str: @pytest.fixture -def pandas_s3_datasource(s3_mock, s3_bucket: str) -> PandasS3Datasource: +def pandas_s3_datasource( + empty_data_context, s3_mock, s3_bucket: str +) -> PandasS3Datasource: test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) keys: List[str] = [ @@ -106,10 +108,13 @@ def pandas_s3_datasource(s3_mock, s3_bucket: str) -> PandasS3Datasource: Key=key, ) - return PandasS3Datasource( # type: ignore[call-arg] + pandas_s3_datasource = PandasS3Datasource( # type: 
ignore[call-arg] name="pandas_s3_datasource", bucket=s3_bucket, ) + pandas_s3_datasource._data_context = empty_data_context + + return pandas_s3_datasource @pytest.fixture diff --git a/tests/datasource/fluent/test_postgres_datasource.py b/tests/datasource/fluent/test_postgres_datasource.py index 673f35d4796e..902375df3beb 100644 --- a/tests/datasource/fluent/test_postgres_datasource.py +++ b/tests/datasource/fluent/test_postgres_datasource.py @@ -1,8 +1,19 @@ from __future__ import annotations +import copy from contextlib import contextmanager from pprint import pprint -from typing import Any, Callable, ContextManager, Dict, Generator, List, Optional, Tuple +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ContextManager, + Dict, + Generator, + List, + Optional, + Tuple, +) import pytest from pydantic import ValidationError @@ -28,6 +39,10 @@ from tests.datasource.fluent.conftest import sqlachemy_execution_engine_mock_cls from tests.sqlalchemy_test_doubles import Dialect, MockSaEngine, MockSaInspector +if TYPE_CHECKING: + from great_expectations.data_context import AbstractDataContext + from great_expectations.datasource.fluent.interfaces import BatchMetadata + # We set a default time range that we use for testing. _DEFAULT_TEST_YEARS = list(range(2021, 2022 + 1)) _DEFAULT_TEST_MONTHS = list(range(1, 13)) @@ -38,6 +53,7 @@ def _source( validate_batch_spec: Callable[[SqlAlchemyDatasourceBatchSpec], None], dialect: str, connection_string: str = "postgresql+psycopg2://postgres:@localhost/test_ci", + data_context: Optional[AbstractDataContext] = None, splitter_query_response: Optional[List[Dict[str, Any]]] = None, create_temp_table: bool = True, ) -> Generator[PostgresDatasource, None, None]: @@ -57,11 +73,14 @@ def _source( original_override = PostgresDatasource.execution_engine_override # type: ignore[misc] try: PostgresDatasource.execution_engine_override = execution_eng_cls # type: ignore[misc] - yield PostgresDatasource( + postgres_datasource = PostgresDatasource( name="my_datasource", connection_string=connection_string, # type: ignore[arg-type] # coerced create_temp_table=create_temp_table, ) + if data_context: + postgres_datasource._data_context = data_context + yield postgres_datasource finally: PostgresDatasource.execution_engine_override = original_override # type: ignore[misc] @@ -241,7 +260,7 @@ def test_construct_table_asset_directly_with_splitter(create_source): @pytest.mark.unit -def test_datasource_gets_batch_list_no_splitter(create_source): +def test_datasource_gets_batch_list_no_splitter(empty_data_context, create_source): def validate_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: assert spec == { "batch_identifiers": {}, @@ -252,7 +271,9 @@ def validate_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: } with create_source( - validate_batch_spec=validate_batch_spec, dialect="postgresql" + validate_batch_spec=validate_batch_spec, + dialect="postgresql", + data_context=empty_data_context, ) as source: source, asset = create_and_add_table_asset_without_testing_connection( source=source, name="my_asset", table_name="my_table" @@ -290,6 +311,7 @@ def assert_batches_correct_with_year_month_splitter_defaults(batches): @pytest.mark.unit def test_datasource_gets_batch_list_splitter_with_unspecified_batch_request_options( + empty_data_context, create_source: CreateSourceFixture, ): batch_specs = [] @@ -298,7 +320,9 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: batch_specs.append(spec) with create_source( - 
validate_batch_spec=collect_batch_spec, dialect="postgresql" + validate_batch_spec=collect_batch_spec, + dialect="postgresql", + data_context=empty_data_context, ) as source: source, asset = create_and_add_table_asset_without_testing_connection( source=source, name="my_asset", table_name="my_table" @@ -313,6 +337,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: @pytest.mark.unit def test_datasource_gets_batch_list_splitter_with_batch_request_options_set_to_none( + empty_data_context, create_source: CreateSourceFixture, ): batch_specs = [] @@ -321,7 +346,9 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: batch_specs.append(spec) with create_source( - validate_batch_spec=collect_batch_spec, dialect="postgresql" + validate_batch_spec=collect_batch_spec, + dialect="postgresql", + data_context=empty_data_context, ) as source: source, asset = create_and_add_table_asset_without_testing_connection( source=source, name="my_asset", table_name="my_table" @@ -340,6 +367,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: @pytest.mark.unit def test_datasource_gets_batch_list_splitter_with_partially_specified_batch_request_options( + empty_data_context, create_source: CreateSourceFixture, ): batch_specs = [] @@ -351,6 +379,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: with create_source( validate_batch_spec=collect_batch_spec, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[ {"year": year, "month": month} for month in list(range(1, 13)) ], @@ -384,6 +413,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: @pytest.mark.unit def test_datasource_gets_batch_list_with_fully_specified_batch_request_options( + empty_data_context, create_source: CreateSourceFixture, ): year = 2022 @@ -403,6 +433,7 @@ def validate_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: with create_source( validate_batch_spec=validate_batch_spec, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[{"month": month, "year": year}], ) as source: source, asset = create_and_add_table_asset_without_testing_connection( @@ -504,10 +535,14 @@ def test_bad_batch_request_passed_into_get_batch_list_from_batch_request( [{}, {"year": 2021}, {"year": 2021, "month": 10}, {"year": None, "month": 10}], ) def test_get_batch_list_from_batch_request_with_good_batch_request( - create_source: CreateSourceFixture, batch_request_options + empty_data_context, + create_source: CreateSourceFixture, + batch_request_options, ): with create_source( - validate_batch_spec=lambda _: None, dialect="postgresql" + validate_batch_spec=lambda _: None, + dialect="postgresql", + data_context=empty_data_context, ) as source: source, asset = create_and_add_table_asset_without_testing_connection( source=source, name="my_asset", table_name="my_table" @@ -620,10 +655,14 @@ def test_get_bad_batch_request(create_source: CreateSourceFixture): ), ], ) -def test_sort_batch_list_by_metadata(sort_info, create_source: CreateSourceFixture): +def test_sort_batch_list_by_metadata( + empty_data_context, sort_info, create_source: CreateSourceFixture +): sort_keys, sort_values = sort_info with create_source( - validate_batch_spec=lambda _: None, dialect="postgresql" + validate_batch_spec=lambda _: None, + dialect="postgresql", + data_context=empty_data_context, ) as source: source, asset = create_and_add_table_asset_without_testing_connection( source=source, name="my_asset", table_name="my_table" @@ -650,9 
+689,13 @@ def test_sort_batch_list_by_metadata(sort_info, create_source: CreateSourceFixtu @pytest.mark.unit -def test_sort_batch_list_by_unknown_key(create_source: CreateSourceFixture): +def test_sort_batch_list_by_unknown_key( + empty_data_context, create_source: CreateSourceFixture +): with create_source( - validate_batch_spec=lambda _: None, dialect="postgresql" + validate_batch_spec=lambda _: None, + dialect="postgresql", + data_context=empty_data_context, ) as source: source, asset = create_and_add_table_asset_without_testing_connection( source=source, name="my_asset", table_name="my_table" @@ -873,7 +916,7 @@ def test_test_connection_failures( @pytest.mark.unit -def test_query_data_asset(create_source): +def test_query_data_asset(empty_data_context, create_source): query = "SELECT * FROM my_table" def validate_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: @@ -885,7 +928,9 @@ def validate_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: } with create_source( - validate_batch_spec=validate_batch_spec, dialect="postgresql" + validate_batch_spec=validate_batch_spec, + dialect="postgresql", + data_context=empty_data_context, ) as source: asset = source.add_query_asset( name="query_asset", query="SELECT * FROM my_table" @@ -906,6 +951,7 @@ def test_non_select_query_data_asset(create_source): @pytest.mark.unit def test_splitter_year( + empty_data_context, create_source: CreateSourceFixture, ): years = [2020, 2021] @@ -917,6 +963,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: with create_source( validate_batch_spec=collect_batch_spec, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[{"year": year} for year in years], ) as source: # We use a query asset because then we don't have to mock out db connection tests @@ -939,6 +986,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: @pytest.mark.unit def test_splitter_year_and_month( + empty_data_context, create_source: CreateSourceFixture, ): years = [2020, 2021] @@ -951,6 +999,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: with create_source( validate_batch_spec=collect_batch_spec, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[ {"year": year, "month": month} for year in years for month in months ], @@ -979,6 +1028,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: @pytest.mark.unit def test_splitter_year_and_month_and_day( + empty_data_context, create_source: CreateSourceFixture, ): years = [2020, 2021] @@ -992,6 +1042,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: with create_source( validate_batch_spec=collect_batch_spec, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[ {"year": year, "month": month, "day": day} for year in years @@ -1141,6 +1192,7 @@ def collect_batch_spec(spec: SqlAlchemyDatasourceBatchSpec) -> None: ], ) def test_splitter( + empty_data_context, create_source: CreateSourceFixture, add_splitter_method, splitter_kwargs, @@ -1154,6 +1206,7 @@ def test_splitter( with create_source( validate_batch_spec=lambda _: None, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[response for response in splitter_query_responses], ) as source: asset = source.add_query_asset(name="query_asset", query="SELECT * from table") @@ -1174,6 +1227,7 @@ def test_splitter( @pytest.mark.unit def test_sorting_none_in_metadata( + empty_data_context, create_source: 
CreateSourceFixture, ): years = [None, 2020, 2021] @@ -1181,6 +1235,7 @@ def test_sorting_none_in_metadata( with create_source( validate_batch_spec=lambda _: None, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[{"year": year} for year in years], ) as source: # We use a query asset because then we don't have to mock out db connection tests @@ -1195,10 +1250,11 @@ def test_sorting_none_in_metadata( @pytest.mark.unit -def test_create_temp_table(create_source): +def test_create_temp_table(empty_data_context, create_source): with create_source( validate_batch_spec=lambda _: None, dialect="postgresql", + data_context=empty_data_context, create_temp_table=False, ) as source: assert source.create_temp_table is False @@ -1209,14 +1265,23 @@ def test_create_temp_table(create_source): @pytest.mark.unit def test_add_postgres_query_asset_with_batch_metadata( + empty_data_context, create_source: CreateSourceFixture, ): + my_config_variables = {"pipeline_filename": __file__} + empty_data_context.config_variables.update(my_config_variables) + years = [2021, 2022] - asset_specified_metadata = {"pipeline_name": "my_pipeline"} + asset_specified_metadata = { + "pipeline_name": "my_pipeline", + "no_curly_pipeline_filename": "$pipeline_filename", + "curly_pipeline_filename": "${pipeline_filename}", + } with create_source( validate_batch_spec=lambda _: None, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[{"year": year} for year in years], ) as source: asset = source.add_query_asset( @@ -1229,22 +1294,40 @@ def test_add_postgres_query_asset_with_batch_metadata( asset.add_splitter_year(column_name="col") batches = source.get_batch_list_from_batch_request(asset.build_batch_request()) assert len(batches) == len(years) + substituted_batch_metadata: BatchMetadata = copy.deepcopy( + asset_specified_metadata + ) + substituted_batch_metadata.update( + { + "no_curly_pipeline_filename": __file__, + "curly_pipeline_filename": __file__, + } + ) for i, year in enumerate(years): - assert batches[i].metadata == {"pipeline_name": "my_pipeline", "year": year} + substituted_batch_metadata["year"] = year + assert batches[i].metadata == substituted_batch_metadata @pytest.mark.unit def test_add_postgres_table_asset_with_batch_metadata( - create_source: CreateSourceFixture, monkeypatch + empty_data_context, create_source: CreateSourceFixture, monkeypatch ): + my_config_variables = {"pipeline_filename": __file__} + empty_data_context.config_variables.update(my_config_variables) + monkeypatch.setattr(TableAsset, "test_connection", lambda _: None) monkeypatch.setattr(TableAsset, "test_splitter_connection", lambda _: None) years = [2021, 2022] - asset_specified_metadata = {"pipeline_name": "my_pipeline"} + asset_specified_metadata = { + "pipeline_name": "my_pipeline", + "no_curly_pipeline_filename": "$pipeline_filename", + "curly_pipeline_filename": "${pipeline_filename}", + } with create_source( validate_batch_spec=lambda _: None, dialect="postgresql", + data_context=empty_data_context, splitter_query_response=[{"year": year} for year in years], ) as source: asset = source.add_table_asset( @@ -1257,5 +1340,15 @@ def test_add_postgres_table_asset_with_batch_metadata( asset.add_splitter_year(column_name="my_col") batches = source.get_batch_list_from_batch_request(asset.build_batch_request()) assert len(batches) == len(years) + substituted_batch_metadata: BatchMetadata = copy.deepcopy( + asset_specified_metadata + ) + substituted_batch_metadata.update( + { + 
"no_curly_pipeline_filename": __file__, + "curly_pipeline_filename": __file__, + } + ) for i, year in enumerate(years): - assert batches[i].metadata == {"pipeline_name": "my_pipeline", "year": year} + substituted_batch_metadata["year"] = year + assert batches[i].metadata == substituted_batch_metadata diff --git a/tests/datasource/fluent/test_spark_datasource.py b/tests/datasource/fluent/test_spark_datasource.py index 326756dce853..c04bcb2974a9 100644 --- a/tests/datasource/fluent/test_spark_datasource.py +++ b/tests/datasource/fluent/test_spark_datasource.py @@ -1,8 +1,11 @@ from __future__ import annotations +import copy import logging +import pathlib from typing import TYPE_CHECKING +import pandas as pd import pydantic import pytest @@ -17,6 +20,22 @@ logger = logging.getLogger(__file__) +@pytest.fixture +def csv_path() -> pathlib.Path: + relative_path = pathlib.Path( + "..", "..", "test_sets", "taxi_yellow_tripdata_samples" + ) + abs_csv_path = ( + pathlib.Path(__file__).parent.joinpath(relative_path).resolve(strict=True) + ) + return abs_csv_path + + +@pytest.fixture +def valid_file_path(csv_path: pathlib.Path) -> pathlib.Path: + return csv_path / "yellow_tripdata_sample_2018-03.csv" + + def test_dataframe_asset( empty_data_context: AbstractDataContext, spark_session, @@ -53,3 +72,48 @@ def test_dataframe_asset( for asset in datasource.assets.values() ] ) + + +@pytest.mark.xfail( + strict=True, + reason="this will fail until we add batch_metadata to Spark add_*_asset methods", +) +def test_spark_data_asset_batch_metadata( + empty_data_context: AbstractDataContext, + valid_file_path: pathlib.Path, + test_df_pandas: pd.DataFrame, + spark_session, + spark_df_from_pandas_df, +): + my_config_variables = {"pipeline_filename": __file__} + empty_data_context.config_variables.update(my_config_variables) + + spark_df = spark_df_from_pandas_df(spark_session, test_df_pandas) + + spark_datasource = empty_data_context.sources.add_spark("my_spark_datasource") + + batch_metadata = { + "no_curly_pipeline_filename": "$pipeline_filename", + "curly_pipeline_filename": "${pipeline_filename}", + "pipeline_step": "transform_3", + } + + dataframe_asset = spark_datasource.add_dataframe_asset( + name="my_dataframe_asset", + dataframe=spark_df, + batch_metadata=batch_metadata, + ) + assert dataframe_asset.batch_metadata == batch_metadata + + batch_list = dataframe_asset.get_batch_list_from_batch_request( + dataframe_asset.build_batch_request() + ) + assert len(batch_list) == 1 + substituted_batch_metadata = copy.deepcopy(batch_metadata) + substituted_batch_metadata.update( + { + "no_curly_pipeline_filename": __file__, + "curly_pipeline_filename": __file__, + } + ) + assert batch_list[0].metadata == substituted_batch_metadata diff --git a/tests/datasource/fluent/test_spark_filesystem_datasource.py b/tests/datasource/fluent/test_spark_filesystem_datasource.py index 6a2ef2462fda..e29ac57f2d9d 100644 --- a/tests/datasource/fluent/test_spark_filesystem_datasource.py +++ b/tests/datasource/fluent/test_spark_filesystem_datasource.py @@ -29,7 +29,9 @@ @pytest.fixture -def spark_filesystem_datasource(test_backends) -> SparkFilesystemDatasource: +def spark_filesystem_datasource( + empty_data_context, test_backends +) -> SparkFilesystemDatasource: if "SparkDFDataset" not in test_backends: pytest.skip("No spark backend selected.") @@ -41,10 +43,12 @@ def spark_filesystem_datasource(test_backends) -> SparkFilesystemDatasource: .parent.joinpath(base_directory_rel_path) .resolve(strict=True) ) - return 
SparkFilesystemDatasource( + spark_filesystem_datasource = SparkFilesystemDatasource( name="spark_filesystem_datasource", base_directory=base_directory_abs_path, ) + spark_filesystem_datasource._data_context = empty_data_context + return spark_filesystem_datasource @pytest.fixture diff --git a/tests/datasource/fluent/test_sqlite_datasource.py b/tests/datasource/fluent/test_sqlite_datasource.py index 3dcc4e925acc..3d959d6d2973 100644 --- a/tests/datasource/fluent/test_sqlite_datasource.py +++ b/tests/datasource/fluent/test_sqlite_datasource.py @@ -2,7 +2,7 @@ import pathlib from contextlib import _GeneratorContextManager, contextmanager -from typing import Any, Callable, Generator, Optional +from typing import TYPE_CHECKING, Any, Callable, Generator, Optional import pytest from pydantic import ValidationError @@ -10,6 +10,9 @@ from great_expectations.datasource.fluent import SqliteDatasource from tests.datasource.fluent.conftest import sqlachemy_execution_engine_mock_cls +if TYPE_CHECKING: + from great_expectations.data_context import AbstractDataContext + @pytest.fixture def sqlite_datasource_name() -> str: @@ -30,13 +33,14 @@ def sqlite_database_path() -> pathlib.Path: @pytest.fixture -def sqlite_datasource(sqlite_database_path, sqlite_datasource_name) -> SqliteDatasource: +def sqlite_datasource( + empty_data_context, sqlite_database_path, sqlite_datasource_name +) -> SqliteDatasource: connection_string = f"sqlite:///{sqlite_database_path}" - datasource = SqliteDatasource( + return SqliteDatasource( name=sqlite_datasource_name, connection_string=connection_string, # type: ignore[arg-type] # pydantic will coerce ) - return datasource @pytest.mark.unit @@ -79,6 +83,7 @@ def test_non_select_query_asset(sqlite_datasource): # Test double used to return canned responses for splitter queries. 
@contextmanager def _create_sqlite_source( + data_context: Optional[AbstractDataContext] = None, splitter_query_response: Optional[list[tuple[str]]] = None, create_temp_table: bool = True, ) -> Generator[Any, Any, Any]: @@ -93,18 +98,21 @@ def _create_sqlite_source( original_override = SqliteDatasource.execution_engine_override # type: ignore[misc] try: SqliteDatasource.execution_engine_override = execution_eng_cls # type: ignore[misc] - yield SqliteDatasource( + sqlite_datasource = SqliteDatasource( name="sqlite_datasource", connection_string="sqlite://", # type: ignore[arg-type] # pydantic will coerce create_temp_table=create_temp_table, ) + if data_context: + sqlite_datasource._data_context = data_context + yield sqlite_datasource finally: SqliteDatasource.execution_engine_override = original_override # type: ignore[misc] @pytest.fixture def create_sqlite_source() -> Callable[ - [list[tuple[str]]], _GeneratorContextManager[Any] + [Optional[AbstractDataContext], list[tuple[str]]], _GeneratorContextManager[Any] ]: return _create_sqlite_source @@ -147,6 +155,7 @@ def create_sqlite_source() -> Callable[ ], ) def test_sqlite_specific_splitter( + empty_data_context, create_sqlite_source, add_splitter_method_name, splitter_kwargs, @@ -158,6 +167,7 @@ def test_sqlite_specific_splitter( last_specified_batch_metadata, ): with create_sqlite_source( + data_context=empty_data_context, splitter_query_response=[response for response in splitter_query_responses], ) as source: asset = source.add_query_asset(name="query_asset", query="SELECT * from table") @@ -177,8 +187,10 @@ def test_sqlite_specific_splitter( @pytest.mark.unit -def test_create_temp_table(create_sqlite_source): - with create_sqlite_source(create_temp_table=False) as source: +def test_create_temp_table(empty_data_context, create_sqlite_source): + with create_sqlite_source( + data_context=empty_data_context, create_temp_table=False + ) as source: assert source.create_temp_table is False asset = source.add_query_asset(name="query_asset", query="SELECT * from table") _ = asset.get_batch_list_from_batch_request(asset.build_batch_request()) From 2145622ed3a2dc0a5bb873a64339eef012e11262 Mon Sep 17 00:00:00 2001 From: Nathan Farmer Date: Wed, 5 Apr 2023 13:17:31 -0400 Subject: [PATCH 38/96] [MAINTENANCE] Harden tests for `CloudDataContext` always `include_rendered_content` (#7558) --- .../test_include_rendered_content.py | 54 +++++-------------- tests/data_context/test_get_data_context.py | 19 +++++++ 2 files changed, 32 insertions(+), 41 deletions(-) diff --git a/tests/data_context/cloud_data_context/test_include_rendered_content.py b/tests/data_context/cloud_data_context/test_include_rendered_content.py index 6f24e2570287..46a5741dff63 100644 --- a/tests/data_context/cloud_data_context/test_include_rendered_content.py +++ b/tests/data_context/cloud_data_context/test_include_rendered_content.py @@ -96,49 +96,21 @@ def test_cloud_backed_data_context_save_expectation_suite_include_rendered_conte ) -# TODO: ACB - Enable this test after merging fixes in PRs 5778 and 5763 @pytest.mark.cloud @pytest.mark.integration -@pytest.mark.xfail(strict=True, reason="Remove xfail on merge of PRs 5778 and 5763") -@pytest.mark.parametrize( - "data_context_fixture_name", - [ - # In order to leverage existing fixtures in parametrization, we provide - # their string names and dynamically retrieve them using pytest's built-in - # `request` fixture. 
- # Source: https://stackoverflow.com/a/64348247 - pytest.param( - "cloud_base_data_context_in_cloud_mode_with_datasource_pandas_engine", - id="BaseDataContext", - ), - pytest.param( - "cloud_data_context_in_cloud_mode_with_datasource_pandas_engine", - id="DataContext", - ), - pytest.param( - "cloud_data_context_with_datasource_pandas_engine", - id="CloudDataContext", - ), - ], -) def test_cloud_backed_data_context_expectation_validation_result_include_rendered_content( - data_context_fixture_name: str, - request, + empty_cloud_data_context: CloudDataContext, ) -> None: """ - All Cloud-backed contexts (DataContext, BaseDataContext, and CloudDataContext) should save an ExpectationValidationResult - with rendered_content by default. + All CloudDataContexts should save an ExpectationValidationResult with rendered_content by default. """ - context = request.getfixturevalue(data_context_fixture_name) + context = empty_cloud_data_context df = pd.DataFrame([1, 2, 3, 4, 5]) - batch_request = RuntimeBatchRequest( - datasource_name="my_datasource", - data_connector_name="default_runtime_data_connector_name", - data_asset_name="my_data_asset", - runtime_parameters={"batch_data": df}, - batch_identifiers={"default_identifier_name": "my_id"}, + data_asset = context.sources.pandas_default.add_dataframe_asset( + name="my_dataframe_asset", + dataframe=df, ) with mock.patch( @@ -147,7 +119,7 @@ def test_cloud_backed_data_context_expectation_validation_result_include_rendere "great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend._set" ): validator: Validator = context.get_validator( - batch_request=batch_request, + batch_request=data_asset.build_batch_request(), create_expectation_suite_with_name="test_suite", ) @@ -155,10 +127,10 @@ def test_cloud_backed_data_context_expectation_validation_result_include_rendere validator.expect_table_row_count_to_equal(value=10) ) - for result in expectation_validation_result.results: - for rendered_content in result.rendered_content: - assert isinstance(rendered_content, RenderedAtomicContent) + for rendered_content in expectation_validation_result.rendered_content: + assert isinstance(rendered_content, RenderedAtomicContent) - for expectation_configuration in expectation_validation_result.expectation_config: - for rendered_content in expectation_configuration.rendered_content: - assert isinstance(rendered_content, RenderedAtomicContent) + for ( + rendered_content + ) in expectation_validation_result.expectation_config.rendered_content: + assert isinstance(rendered_content, RenderedAtomicContent) diff --git a/tests/data_context/test_get_data_context.py b/tests/data_context/test_get_data_context.py index 79637f5be890..5b6cd976b772 100644 --- a/tests/data_context/test_get_data_context.py +++ b/tests/data_context/test_get_data_context.py @@ -204,6 +204,7 @@ def test_cloud_context_with_in_memory_config_overrides( ) assert isinstance(context, CloudDataContext) assert context.expectations_store_name == "default_expectations_store" + assert context.variables.include_rendered_content.globally config: DataContextConfig = DataContextConfig( config_version=3.0, @@ -245,3 +246,21 @@ def test_get_context_with_no_arguments_returns_ephemeral_with_sensible_defaults( defaults = InMemoryStoreBackendDefaults(init_temp_docs_sites=True) assert context.config.stores == defaults.stores + + +@pytest.mark.parametrize("ge_cloud_mode", [True, None]) +@pytest.mark.cloud +def test_cloud_context_include_rendered_content( + set_up_cloud_envs, 
empty_ge_cloud_data_context_config, ge_cloud_mode +): + with mock.patch.object( + CloudDataContext, + "retrieve_data_context_config_from_cloud", + return_value=empty_ge_cloud_data_context_config, + ): + context = gx.get_context(cloud_mode=ge_cloud_mode) + assert isinstance( + context, + CloudDataContext, + ) + assert context.variables.include_rendered_content.globally From 1de8e63309871e12d47f83eb112198c4db5a5c62 Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Wed, 5 Apr 2023 13:08:59 -0500 Subject: [PATCH 39/96] [DOCS] removes the original getting started tutorial pages and redirects to the quickstart guide (#7548) Co-authored-by: Will Shin --- ci/azure-pipelines-docs.yml | 2 +- .../_connecting_to_data_fluently.md | 26 +-- ...necting_to_sql_datasources_block_config.md | 16 +- ..._connecting_to_sql_datasources_fluently.md | 6 +- .../_after_create_and_configure_data_asset.md | 6 +- .../_after_create_in_memory_data_asset.md | 6 +- .../_after_create_non_sql_datasource.md | 8 +- .../_after_create_sql_data_asset.md | 2 +- .../_after_create_sql_datasource.md | 4 +- .../next_steps/_after_create_validator.md | 4 +- .../_after_request_data_from_a_data_asset.md | 6 +- .../_if_you_still_need_to_setup_gx.md | 8 +- .../prerequisites/_quickstart_completed.mdx | 1 + .../_admonition_convert_to_file_context.md | 2 +- ...s_configure_credentials_in_data_context.md | 2 +- ...ure_credentials_in_config_variables_yml.md | 2 +- .../_sql_generic_configure_credentials.md | 2 +- ...data_context_initialize_instatiate_save.md | 14 +- .../_setup_and_install_for_cloud_data.md | 6 +- .../_setup_and_install_for_filesystem_data.md | 2 +- .../_setup_and_install_for_hosted_data.md | 4 +- .../_setup_and_install_for_sql_data.md | 2 +- .../setup/link_lists/_setup_configurations.md | 34 ++-- .../next_steps/_links_after_installing_gx.md | 8 +- ..._storage_configurations_to_data_context.md | 6 +- ...stgresql_configurations_to_data_context.md | 4 +- .../contributing/components/prerequisites.jsx | 2 +- .../deployment_pattern_prerequisites.jsx | 2 +- ...iate_a_data_context_hosted_environments.md | 2 +- .../how_to_use_great_expectations_in_flyte.md | 6 +- ..._to_use_great_expectations_with_airflow.md | 4 +- ..._to_use_great_expectations_with_prefect.md | 6 +- ...nd_sampling_a_file_system_or_blob_store.md | 2 +- ...or_splitting_and_sampling_tables_in_sql.md | 2 +- .../components/prerequisites.jsx | 2 +- .../connect_to_data_overview.md | 4 +- .../database/athena.md | 2 +- .../_part_base_directory_for_filesystem.mdx | 2 +- .../_part_asset_keys_overview_configured.mdx | 2 +- .../_part_asset_keys_overview_inferred.mdx | 2 +- .../how_to_request_data_from_a_data_asset.md | 6 +- ...data_on_azure_blob_storage_using_pandas.md | 4 +- ..._data_on_azure_blob_storage_using_spark.md | 4 +- ..._to_connect_to_data_on_gcs_using_pandas.md | 4 +- ...w_to_connect_to_data_on_gcs_using_spark.md | 4 +- ...w_to_connect_to_data_on_s3_using_pandas.md | 4 +- ...ow_to_connect_to_data_on_s3_using_spark.md | 4 +- ...nize_batches_in_a_file_based_data_asset.md | 36 ++--- ...anize_batches_in_a_sql_based_data_asset.md | 2 +- .../database/how_to_connect_to_a_sql_table.md | 12 +- .../how_to_connect_to_postgresql_data.md | 10 +- .../database/how_to_connect_to_sql_data.md | 22 +-- ...ow_to_connect_to_sql_data_using_a_query.md | 12 +- .../database/how_to_connect_to_sqlite_data.md | 10 +- ...ow_to_choose_which_dataconnector_to_use.md | 2 +- ...d_partition_a_file_system_or_blob_store.md | 2 +- 
..._introspect_and_partition_tables_in_sql.md | 2 +- ..._an_in_memory_spark_or_pandas_dataframe.md | 8 +- ...es_of_data_from_a_configured_datasource.md | 2 +- ...ectations_and_display_them_in_data_docs.md | 6 +- ...ite_by_profiling_from_a_jsonschema_file.md | 2 +- ...d_evaluation_parameters_from_a_database.md | 6 +- .../_preface.mdx | 4 +- .../create_expectations_overview.md | 6 +- .../components/prerequisites.jsx | 2 +- ...uite_with_the_onboarding_data_assistant.md | 2 +- ...wledge_without_inspecting_data_directly.md | 2 +- ...e_and_edit_expectations_with_a_profiler.md | 4 +- ...w_to_use_auto_initializing_expectations.md | 6 +- .../components/defaultPrerequisiteItems.jsx | 2 +- .../setup/components/install_prereq.jsx | 2 +- .../_preface.mdx | 2 +- .../how_to_configure_credentials.md | 2 +- ...ntext_components_using_test_yaml_config.md | 2 +- ...ta_context_to_a_filesystem_data_context.md | 4 +- ...ize_a_filesystem_data_context_in_python.md | 16 +- ...y_instantiate_an_ephemeral_data_context.md | 2 +- ...iate_a_specific_filesystem_data_context.md | 18 +-- ...w_to_quickly_instantiate_a_data_context.md | 14 +- .../_preface.mdx | 2 +- ...ost_and_share_data_docs_on_a_filesystem.md | 2 +- ...d_share_data_docs_on_azure_blob_storage.md | 2 +- ...ts_store_has_been_correctly_configured.mdx | 2 +- .../_preface.mdx | 6 +- .../_preface.mdx | 4 +- ...tion_result_store_in_azure_blob_storage.md | 8 +- ...figure_a_validation_result_store_in_gcs.md | 8 +- ...validation_result_store_on_a_filesystem.md | 6 +- ...a_validation_result_store_to_postgresql.md | 8 +- ...expectation_store_in_azure_blob_storage.md | 4 +- ...o_configure_an_expectation_store_in_gcs.md | 4 +- ...re_an_expectation_store_on_a_filesystem.md | 4 +- ...gure_an_expectation_store_to_postgresql.md | 4 +- ..._to_setup_gx_to_work_with_sql_databases.md | 4 +- .../docs/guides/setup/setup_overview.md | 4 +- ...deploy_a_scheduled_checkpoint_with_cron.md | 2 +- ...docs_urls_for_custom_validation_actions.md | 2 +- ...idate_data_with_an_in_memory_checkpoint.md | 2 +- .../_steps_for_checkpoints_.mdx | 6 +- ...idations_data_or_suites_to_a_checkpoint.md | 4 +- ...a_new_checkpoint_using_test_yaml_config.md | 6 +- ..._an_in_memory_dataframe_to_a_checkpoint.md | 2 +- ...o_validate_data_by_running_a_checkpoint.md | 6 +- .../validation/validate_data_overview.md | 6 +- ...ie_notifications_as_a_validation_action.md | 6 +- .../getting_started_with_gx_cloud.md | 4 +- .../integrations/components/prerequisites.jsx | 2 +- docs/docusaurus/docs/intro.md | 2 +- docs/docusaurus/docs/terms/data_context.md | 2 +- .../docs/terms/evaluation_parameter.md | 2 +- docs/docusaurus/docs/terms/profiler.md | 2 +- .../tutorial_connect_to_data.md | 115 -------------- .../tutorial_create_expectations.md | 150 ------------------ .../getting_started/tutorial_overview.md | 112 ------------- .../getting_started/tutorial_review.md | 86 ---------- .../getting_started/tutorial_setup.md | 142 ----------------- .../getting_started/tutorial_validate_data.md | 66 -------- docs/docusaurus/static/_redirects | 11 +- 118 files changed, 311 insertions(+), 976 deletions(-) create mode 100644 docs/docusaurus/docs/components/prerequisites/_quickstart_completed.mdx delete mode 100644 docs/docusaurus/docs/tutorials/getting_started/tutorial_connect_to_data.md delete mode 100644 docs/docusaurus/docs/tutorials/getting_started/tutorial_create_expectations.md delete mode 100644 docs/docusaurus/docs/tutorials/getting_started/tutorial_overview.md delete mode 100644 
docs/docusaurus/docs/tutorials/getting_started/tutorial_review.md delete mode 100644 docs/docusaurus/docs/tutorials/getting_started/tutorial_setup.md delete mode 100644 docs/docusaurus/docs/tutorials/getting_started/tutorial_validate_data.md diff --git a/ci/azure-pipelines-docs.yml b/ci/azure-pipelines-docs.yml index fa42e5ae2f9f..a82e2ee7770d 100644 --- a/ci/azure-pipelines-docs.yml +++ b/ci/azure-pipelines-docs.yml @@ -39,7 +39,7 @@ stages: - job: link_checker condition: or(eq(stageDependencies.scope_check.changes.outputs['CheckDocsChanges.DocsChanged'], true), eq(variables.isDevelop, true), eq(variables.isManual, true)) steps: - - bash: python docs/checks/docs_link_checker.py -p docs/docusaurus/docs -r docs/docusaurus/ -s docs --skip-external + - bash: python docs/checks/docs_link_checker.py -p docs/docusaurus/docs -r docs/docusaurus/docs -s docs --skip-external name: LinkChecker - job: docs_snippet_checker diff --git a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md index 74df936f01ba..38937e2ac6bd 100644 --- a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md +++ b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_data_fluently.md @@ -6,21 +6,21 @@ **Local Filesystems** -- [How to quickly connect to a single file using Pandas](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_quickly_connect_to_a_single_file_with_pandas.md) -- [How to connect to one or more files using Pandas](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_pandas.md) -- [How to connect to one or more files using Spark](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark.md) +- [How to quickly connect to a single file using Pandas](/docs/guides/connecting_to_your_data/fluent/filesystem/how_to_quickly_connect_to_a_single_file_with_pandas) +- [How to connect to one or more files using Pandas](/docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_pandas) +- [How to connect to one or more files using Spark](/docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark) **Google Cloud Storage** -- [How to connect to data on GCS using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md) -- [How to connect to data on GCS using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md) +- [How to connect to data on GCS using Pandas](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas) +- [How to connect to data on GCS using Spark](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark) **Azure Blob Storage** -- [How to connect to data on Azure Blob Storage using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md) -- [How to connect to data on Azure Blob Storage using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md) +- [How to connect to data on Azure Blob Storage using Pandas](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas) +- [How to connect to data on Azure Blob Storage using 
Spark](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark) **Amazon Web Services** -- [How to connect to data on S3 using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md) -- [How to connect to data on S3 using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md) +- [How to connect to data on S3 using Pandas](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas) +- [How to connect to data on S3 using Spark](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark) @@ -32,7 +32,7 @@ -- [How to connect to in-memory data using Pandas](docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas.md) +- [How to connect to in-memory data using Pandas](/docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas) @@ -46,10 +46,10 @@ **General SQL Datasources** -- [How to connect to SQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md) +- [How to connect to SQL data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data) **Specific SQL dialects** -- [How to connect to PostgreSQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md) -- [How to connect to SQLite data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md) +- [How to connect to PostgreSQL data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data) +- [How to connect to SQLite data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_block_config.md b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_block_config.md index 1deb21873b50..03f1b153e86c 100644 --- a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_block_config.md +++ b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_block_config.md @@ -1,8 +1,8 @@ -- [How to connect to an Athena database with the block-config method](docs/guides/connecting_to_your_data/database/athena.md) -- [How to connect to a BigQuery database with the block-config method](docs/guides/connecting_to_your_data/database/bigquery.md) -- [How to connect to an MSSQL database with the block-config method](docs/guides/connecting_to_your_data/database/mssql.md) -- [How to connect to a MySQL database with the block-config method](docs/guides/connecting_to_your_data/database/mysql.md) -- [How to connect to a Redshift database with the block-config method](docs/guides/connecting_to_your_data/database/redshift.md) -- [How to connect to a Snowflake database with the block-config method](docs/guides/connecting_to_your_data/database/snowflake.md) -- [How to connect to a SQLite database with the block-config method](docs/guides/connecting_to_your_data/database/sqlite.md) -- [How to connect to a Trino database with the block-config method](docs/guides/connecting_to_your_data/database/trino.md) \ No newline at end of file +- [How to connect to an Athena database with the block-config method](/docs/guides/connecting_to_your_data/database/athena) +- [How to connect to a 
BigQuery database with the block-config method](/docs/guides/connecting_to_your_data/database/bigquery) +- [How to connect to an MSSQL database with the block-config method](/docs/guides/connecting_to_your_data/database/mssql) +- [How to connect to a MySQL database with the block-config method](/docs/guides/connecting_to_your_data/database/mysql) +- [How to connect to a Redshift database with the block-config method](/docs/guides/connecting_to_your_data/database/redshift) +- [How to connect to a Snowflake database with the block-config method](/docs/guides/connecting_to_your_data/database/snowflake) +- [How to connect to a SQLite database with the block-config method](/docs/guides/connecting_to_your_data/database/sqlite) +- [How to connect to a Trino database with the block-config method](/docs/guides/connecting_to_your_data/database/trino) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_fluently.md b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_fluently.md index 1b26838e033f..f9c93f517ca5 100644 --- a/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_fluently.md +++ b/docs/docusaurus/docs/components/connect_to_data/link_lists/_connecting_to_sql_datasources_fluently.md @@ -1,3 +1,3 @@ -- [How to connect to SQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md) -- [How to connect to PostgreSQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md) -- [How to connect to SQLite data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md) \ No newline at end of file +- [How to connect to SQL data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data) +- [How to connect to PostgreSQL data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data) +- [How to connect to SQLite data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_and_configure_data_asset.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_and_configure_data_asset.md index 1348d893c1c3..3f41097f157d 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_and_configure_data_asset.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_and_configure_data_asset.md @@ -1,3 +1,3 @@ -- [Learn more about requesting data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) \ No newline at end of file +- [Learn more about requesting data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) +- [Use a Data Asset to create Expectations while interactively evaluating a set of 
data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_in_memory_data_asset.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_in_memory_data_asset.md index 9a2691072103..22d23f189830 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_in_memory_data_asset.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_in_memory_data_asset.md @@ -1,3 +1,3 @@ -- [How to request Data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) -- [How to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [How to use the Onboarding Data Assistant to evaluate data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) \ No newline at end of file +- [How to request Data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) +- [How to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [How to use the Onboarding Data Assistant to evaluate data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_non_sql_datasource.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_non_sql_datasource.md index 854fe1760e6c..ba50236f368a 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_non_sql_datasource.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_non_sql_datasource.md @@ -1,4 +1,4 @@ -- [How to organize Batches in a file-based Data Asset](docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md) -- [How to request Data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) -- [How to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [How to use the Onboarding Data Assistant to evaluate data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) \ No newline at end of file +- [How to organize Batches in a file-based Data Asset](/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset) +- [How to request Data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) +- [How to create Expectations while interactively evaluating a set of 
data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [How to use the Onboarding Data Assistant to evaluate data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_data_asset.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_data_asset.md index 9841f30e9278..3904ca4c579d 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_data_asset.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_data_asset.md @@ -1 +1 @@ -- [Organize a SQL Data Asset into multiple Batches](docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md) \ No newline at end of file +- [Organize a SQL Data Asset into multiple Batches](/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_datasource.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_datasource.md index b2131072d060..6dd667af30b3 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_datasource.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_sql_datasource.md @@ -1,2 +1,2 @@ -- [Use a Data Asset to connect to the data in a SQL table](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table.md) -- [Use a Data Asset to connect to the results of a SQL query](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query.md) \ No newline at end of file +- [Use a Data Asset to connect to the data in a SQL table](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table) +- [Use a Data Asset to connect to the results of a SQL query](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_validator.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_validator.md index c405afe6c4b1..1234e3667357 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_validator.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_create_validator.md @@ -1,2 +1,2 @@ -- [Create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) \ No newline at end of file +- [Create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) \ No newline at end of file diff --git 
a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_request_data_from_a_data_asset.md b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_request_data_from_a_data_asset.md index fc3c8189951f..4d58cec543f1 100644 --- a/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_request_data_from_a_data_asset.md +++ b/docs/docusaurus/docs/components/connect_to_data/next_steps/_after_request_data_from_a_data_asset.md @@ -1,6 +1,6 @@ ### Requesting Data from a Data Asset -- [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +- [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ### Using Data Assets to create Expectations -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) \ No newline at end of file +- [Use a Data Asset to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md b/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md index 93d1ffb03382..ca75d8dd00b1 100644 --- a/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md +++ b/docs/docusaurus/docs/components/prerequisites/_if_you_still_need_to_setup_gx.md @@ -1,5 +1,5 @@ Please reference the appropriate one of these guides: -- [How to install GX locally](docs/guides/setup/installation/local.md) -- [How to set up GX to work with data on AWS S3](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3.md) -- [How to set up GX to work with data in Azure Blob Storage](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs.md) -- [How to set up GX to work with data on GCS](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs.md) \ No newline at end of file +- [How to install GX locally](/docs/guides/setup/installation/local) +- [How to set up GX to work with data on AWS S3](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3) +- [How to set up GX to work with data in Azure Blob Storage](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs) +- [How to set up GX to work with data on GCS](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/prerequisites/_quickstart_completed.mdx b/docs/docusaurus/docs/components/prerequisites/_quickstart_completed.mdx new file mode 100644 index 000000000000..c5f78c9e6b93 --- /dev/null +++ 
b/docs/docusaurus/docs/components/prerequisites/_quickstart_completed.mdx @@ -0,0 +1 @@ +Completed the Quickstart guide \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md b/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md index c908817420ec..21cab4082dc5 100644 --- a/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md +++ b/docs/docusaurus/docs/components/setup/data_context/_admonition_convert_to_file_context.md @@ -4,4 +4,4 @@ An Ephemeral Data Context is an in-memory Data Context that is not intended to p context = context.convert_to_file_context() ``` -This method will initialize a Filesystem Data Context in the current working directory of the Python process that contains the Ephemeral Data Context. For more detailed explanation of this method, please see our guide on [how to convert an ephemeral data context to a filesystem data context](docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md) \ No newline at end of file +This method will initialize a Filesystem Data Context in the current working directory of the Python process that contains the Ephemeral Data Context. For more detailed explanation of this method, please see our guide on [how to convert an ephemeral data context to a filesystem data context](/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/dependencies/_abs_configure_credentials_in_data_context.md b/docs/docusaurus/docs/components/setup/dependencies/_abs_configure_credentials_in_data_context.md index 7ece0e20bdfe..9c4f0d6826bd 100644 --- a/docs/docusaurus/docs/components/setup/dependencies/_abs_configure_credentials_in_data_context.md +++ b/docs/docusaurus/docs/components/setup/dependencies/_abs_configure_credentials_in_data_context.md @@ -1,4 +1,4 @@ -We recommend that Azure Storage credentials be stored in the ``config_variables.yml`` file, which is located in the ``uncommitted/`` folder by default, and is not part of source control. The following lines add Azure Storage credentials under the key ``AZURE_STORAGE_CONNECTION_STRING``. Additional options for configuring the ``config_variables.yml`` file or additional environment variables can be found [here](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md). +We recommend that Azure Storage credentials be stored in the ``config_variables.yml`` file, which is located in the ``uncommitted/`` folder by default, and is not part of source control. The following lines add Azure Storage credentials under the key ``AZURE_STORAGE_CONNECTION_STRING``. Additional options for configuring the ``config_variables.yml`` file or additional environment variables can be found [here](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials). 
```yaml AZURE_STORAGE_CONNECTION_STRING: "DefaultEndpointsProtocol=https;EndpointSuffix=core.windows.net;AccountName=;AccountKey=" diff --git a/docs/docusaurus/docs/components/setup/dependencies/_postgresql_configure_credentials_in_config_variables_yml.md b/docs/docusaurus/docs/components/setup/dependencies/_postgresql_configure_credentials_in_config_variables_yml.md index ac4f91eeaf81..7e7a853f3701 100644 --- a/docs/docusaurus/docs/components/setup/dependencies/_postgresql_configure_credentials_in_config_variables_yml.md +++ b/docs/docusaurus/docs/components/setup/dependencies/_postgresql_configure_credentials_in_config_variables_yml.md @@ -10,4 +10,4 @@ db_creds: database: '' ``` -For additional options on configuring the `config_variables.yml` file or additional environment variables, please see our guide on [how to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md). \ No newline at end of file +For additional options on configuring the `config_variables.yml` file or additional environment variables, please see our guide on [how to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials). \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/dependencies/_sql_generic_configure_credentials.md b/docs/docusaurus/docs/components/setup/dependencies/_sql_generic_configure_credentials.md index 6dcfe4368738..0182709c7016 100644 --- a/docs/docusaurus/docs/components/setup/dependencies/_sql_generic_configure_credentials.md +++ b/docs/docusaurus/docs/components/setup/dependencies/_sql_generic_configure_credentials.md @@ -15,4 +15,4 @@ my_connection_string = "${credentials}" ``` -For additional options on configuring the `config_variables.yml` file or additional environment variables, please see our guide on [how to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md). \ No newline at end of file +For additional options on configuring the `config_variables.yml` file or additional environment variables, please see our guide on [how to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials).
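As an aside on the credentials docs touched above: values stored in `config_variables.yml` are substituted wherever a `${...}` reference appears in a later configuration. A minimal sketch of that pattern, assuming a fluent `add_postgres()` call and a hypothetical variable name (neither appears in this patch):

```python
import great_expectations as gx

context = gx.get_context()

# Hypothetical entry in uncommitted/config_variables.yml:
#   my_postgres_connection_string: postgresql+psycopg2://<username>:<password>@<host>:<port>/<database>
#
# The "${...}" placeholder below is resolved from config_variables.yml (or an
# environment variable of the same name) when the connection is actually used.
pg_datasource = context.sources.add_postgres(
    name="my_postgres_datasource",
    connection_string="${my_postgres_connection_string}",
)
```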
\ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md b/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md index cd65cab1139a..a2d6fb6f897e 100644 --- a/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md +++ b/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_instatiate_save.md @@ -6,16 +6,16 @@ **Quickstart Data Context** -- [How to quickly instantiate a Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md) +- [How to quickly instantiate a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) **Filesystem Data Contexts** -- [How to initialize a new Data Context with the CLI](docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli.md) -- [How to initialize a filesystem Data Context in Python](docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md) -- [How to instantiate a specific Filesystem Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_instantiate_a_specific_filesystem_data_context.md) +- [How to initialize a new Data Context with the CLI](/docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli) +- [How to initialize a filesystem Data Context in Python](/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python) +- [How to instantiate a specific Filesystem Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_instantiate_a_specific_filesystem_data_context) **In-memory Data Contexts** -- [How to explicitly instantiate an Ephemeral Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md) -- [How to instantiate a Data Context without a yml file](docs/guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file.md) +- [How to explicitly instantiate an Ephemeral Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context) +- [How to instantiate a Data Context without a yml file](/docs/guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file) @@ -29,6 +29,6 @@ Filesystem and Cloud Data Contexts automatically save any changes as they are made. The only type of Data Context that does not immediately save changes in a persisting way is the Ephemeral Data Context, which is an in-memory Data Context that will not persist beyond the current Python session. However, an Ephemeral Data Context can be converted to a Filesystem Data Context if you wish to save its contents for future use. 
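A minimal sketch of the Ephemeral-to-Filesystem conversion described above, using only the `gx.get_context()` and `convert_to_file_context()` calls referenced elsewhere in this patch:

```python
import great_expectations as gx

# get_context() with no arguments returns an Ephemeral Data Context:
# its configuration lives only in memory for the current Python session.
context = gx.get_context()

# Convert it to a Filesystem Data Context to persist that configuration;
# this initializes a project in the current working directory.
context = context.convert_to_file_context()
```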
For more information, please see: -- [How to convert an Ephemeral Data Context to a Filesystem Data Context](docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md) +- [How to convert an Ephemeral Data Context to a Filesystem Data Context](/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md index ce08ac5f788b..c673dddef8b4 100644 --- a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md @@ -1,3 +1,3 @@ -- [How to set up GX to work with data on AWS S3](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3.md) -- [How to set up GX to work with data in Azure Blob Storage](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs.md) -- [How to set up GX to work with data on GCS](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs.md) +- [How to set up GX to work with data on AWS S3](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3) +- [How to set up GX to work with data in Azure Blob Storage](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs) +- [How to set up GX to work with data on GCS](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs) diff --git a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md index bdbc23e41cc6..4c2c9e3a6993 100644 --- a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md @@ -1 +1 @@ -- [How to install Great Expectations for use with local filesystem data](docs/guides/setup/installation/local.md) \ No newline at end of file +- [How to install Great Expectations for use with local filesystem data](/docs/guides/setup/installation/local) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md index ae716adb281c..be4590a70ec8 100644 --- a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md @@ -1,2 +1,2 @@ -- [How to instantiate a Data Context on an EMR Spark Cluster](docs/deployment_patterns/how_to_instantiate_a_data_context_on_an_emr_spark_cluster.md) -- [How to use Great Expectations in Databricks](docs/deployment_patterns/how_to_use_great_expectations_in_databricks.md) \ No newline at end of file +- [How to instantiate a Data Context on an EMR Spark Cluster](/docs/deployment_patterns/how_to_instantiate_a_data_context_on_an_emr_spark_cluster) +- [How to use Great Expectations in Databricks](/docs/deployment_patterns/how_to_use_great_expectations_in_databricks) \ No newline at end of file diff --git 
a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md index 099e3577a272..581d732e24cb 100644 --- a/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md @@ -1 +1 @@ -- [How to set up GX to work with SQL databases](docs/guides/setup/optional_dependencies/sql_databases/how_to_setup_gx_to_work_with_sql_databases.md) \ No newline at end of file +- [How to set up GX to work with SQL databases](/docs/guides/setup/optional_dependencies/sql_databases/how_to_setup_gx_to_work_with_sql_databases) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md b/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md index 8af0c338efec..78ccf6d86cb1 100644 --- a/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md +++ b/docs/docusaurus/docs/components/setup/link_lists/_setup_configurations.md @@ -6,7 +6,7 @@ While some source data systems provide their own means of configuring credentials through environment variables, you can also configure GX to populate credentials from either a YAML file or a secret manager. For more information, please see: -- [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) +- [How to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) @@ -17,11 +17,11 @@ While some source data systems provide their own means of configuring credential -- [How to configure an Expectation Store to use Amazon S3](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_amazon_s3.md) -- [How to configure an Expectation Store to use Azure Blob Storage](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md) -- [How to configure an Expectation Store to use GCS](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_gcs.md) -- [How to configure an Expectation Store on a filesystem](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_on_a_filesystem.md) -- [How to configure an Expectation Store to use PostgreSQL](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_to_postgresql.md) +- [How to configure an Expectation Store to use Amazon S3](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_amazon_s3) +- [How to configure an Expectation Store to use Azure Blob Storage](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage) +- [How to configure an Expectation Store to use GCS](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_gcs) +- [How to configure an Expectation Store on a filesystem](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_on_a_filesystem) +- [How to configure an Expectation Store to use PostgreSQL](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_to_postgresql) @@ -32,11 +32,11 @@ While some source data systems provide their own means of configuring credential -- [How to configure a Validation Result Store in Amazon 
S3](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_amazon_s3.md) -- [How to configure a Validation Result Store in Azure Blob Storage](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_azure_blob_storage.md) -- [How to configure a Validation Result Store in GCS](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_gcs.md) -- [How to configure a Validation Result Store on a filesystem](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_on_a_filesystem.md) -- [How to configure a Validation Result Store to use PostgreSQL](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_to_postgresql.md) +- [How to configure a Validation Result Store in Amazon S3](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_amazon_s3) +- [How to configure a Validation Result Store in Azure Blob Storage](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_azure_blob_storage) +- [How to configure a Validation Result Store in GCS](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_gcs) +- [How to configure a Validation Result Store on a filesystem](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_on_a_filesystem) +- [How to configure a Validation Result Store to use PostgreSQL](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_to_postgresql) @@ -47,7 +47,7 @@ While some source data systems provide their own means of configuring credential -- [How to configure and use a Metric Store](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_metricsstore.md) +- [How to configure and use a Metric Store](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_metricsstore) @@ -58,10 +58,10 @@ While some source data systems provide their own means of configuring credential -- [How to host and share Data Docs on Amazon S3](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_amazon_s3.md) -- [How to host and share Data Docs on Azure Blob Storage](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md) -- [How to host and share Data Docs on GCS](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_gcs.md) -- [How to host and share Data Docs on a filesystem](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_a_filesystem.md) +- [How to host and share Data Docs on Amazon S3](/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_amazon_s3) +- [How to host and share Data Docs on Azure Blob Storage](/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage) +- [How to host and share Data Docs on GCS](/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_gcs) +- [How to host and share Data Docs on a filesystem](/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_a_filesystem) @@ -72,6 +72,6 @@ While some source data systems provide their own means of configuring credential -- [How to configure DataContext components using `test_yaml_config()`](docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md) +- [How to configure DataContext components using 
`test_yaml_config()`](/docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/next_steps/_links_after_installing_gx.md b/docs/docusaurus/docs/components/setup/next_steps/_links_after_installing_gx.md index e9ec1d9c3349..8e21ac21a02d 100644 --- a/docs/docusaurus/docs/components/setup/next_steps/_links_after_installing_gx.md +++ b/docs/docusaurus/docs/components/setup/next_steps/_links_after_installing_gx.md @@ -1,9 +1,9 @@ To quickly create a Data Context and dive into working with GX, please see: -- [How to quickly instantiate a Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md) +- [How to quickly instantiate a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) To initialize a Data Context on your filesystem, please reference: -- [How to initialize a Filesystem Data Context in Python](docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md) -- [How to initialize new Data Context from the CLI](docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli.md) +- [How to initialize a Filesystem Data Context in Python](/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python) +- [How to initialize new Data Context from the CLI](/docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli) To explicitly work with a temporary, in-memory Data Context, see: -- [How to instantiate a Data Context without a yml file](docs/guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file.md) \ No newline at end of file +- [How to instantiate a Data Context without a yml file](/docs/guides/setup/configuring_data_contexts/how_to_instantiate_a_data_context_without_a_yml_file) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_azure_blob_storage_configurations_to_data_context.md b/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_azure_blob_storage_configurations_to_data_context.md index e4226f4c7cbd..2bf7a850fd78 100644 --- a/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_azure_blob_storage_configurations_to_data_context.md +++ b/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_azure_blob_storage_configurations_to_data_context.md @@ -1,4 +1,4 @@ To continue configuring your Data Context to use Azure Blob Storage, please see: -- [How to configure an Expectation Store in Azure Blob Storage](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage.md) -- [How to configure a Validation Results Store in Azure Blob Storage](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_azure_blob_storage.md) -- [How to host and share Data Docs on Azure Blob Storage](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md) \ No newline at end of file +- [How to configure an Expectation Store in Azure Blob Storage](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_in_azure_blob_storage) +- [How to configure a Validation Results Store in 
Azure Blob Storage](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_in_azure_blob_storage) +- [How to host and share Data Docs on Azure Blob Storage](/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage) \ No newline at end of file diff --git a/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_postgresql_configurations_to_data_context.md b/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_postgresql_configurations_to_data_context.md index 87bd8627bcf9..0a2f705c6b22 100644 --- a/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_postgresql_configurations_to_data_context.md +++ b/docs/docusaurus/docs/components/setup/next_steps/_links_for_adding_postgresql_configurations_to_data_context.md @@ -1,3 +1,3 @@ To continue configuring your Data Context to use PostgreSQL, please see: -- [How to configure an Expectation Store to use PostgreSQL](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_to_postgresql.md) -- [How to configure a Validation Results Store to use PostgreSQL](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_to_postgresql.md) +- [How to configure an Expectation Store to use PostgreSQL](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_to_postgresql) +- [How to configure a Validation Results Store to use PostgreSQL](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_to_postgresql) diff --git a/docs/docusaurus/docs/contributing/components/prerequisites.jsx b/docs/docusaurus/docs/contributing/components/prerequisites.jsx index b4e6fb69d55b..cf3d8379339a 100644 --- a/docs/docusaurus/docs/contributing/components/prerequisites.jsx +++ b/docs/docusaurus/docs/contributing/components/prerequisites.jsx @@ -33,7 +33,7 @@ export default class Prerequisites extends React.Component { defaultPrerequisiteItems () { return [
  • -
  • Completed the Getting Started Tutorial
  • +
  • Completed the Quickstart guide
  • Set up your dev environment
  • Created a Custom Expectation
  • ] diff --git a/docs/docusaurus/docs/deployment_patterns/components/deployment_pattern_prerequisites.jsx b/docs/docusaurus/docs/deployment_patterns/components/deployment_pattern_prerequisites.jsx index 42214e8d115c..b02e294b5dc1 100644 --- a/docs/docusaurus/docs/deployment_patterns/components/deployment_pattern_prerequisites.jsx +++ b/docs/docusaurus/docs/deployment_patterns/components/deployment_pattern_prerequisites.jsx @@ -32,7 +32,7 @@ export default class Prerequisites extends React.Component { defaultPrerequisiteItems () { return [ -
  • Completed the Getting Started Tutorial
  • +
  • Completed the Quickstart guide
  • ] } diff --git a/docs/docusaurus/docs/deployment_patterns/how_to_instantiate_a_data_context_hosted_environments.md b/docs/docusaurus/docs/deployment_patterns/how_to_instantiate_a_data_context_hosted_environments.md index 9f9b3b690238..f0d7de112132 100644 --- a/docs/docusaurus/docs/deployment_patterns/how_to_instantiate_a_data_context_hosted_environments.md +++ b/docs/docusaurus/docs/deployment_patterns/how_to_instantiate_a_data_context_hosted_environments.md @@ -2,7 +2,7 @@ title: Deploying Great Expectations in a hosted environment without file system or CLI --- -If you follow the steps of the [Getting Started](../tutorials/getting_started/tutorial_overview.md) tutorial, you create a standard deployment of Great Expectations. By default, this relies on two components: +By default, creating a standard deployment of Great Expectations relies on two components: 1. The Great Expectations [CLI](../guides/miscellaneous/how_to_use_the_great_expectations_cli.md) to initialize a Data Context, create Expectation Suites, add Datasources, etc. 2. The ``great_expectations.yml`` file to configure your Data Context, e.g. to point at different Stores for Validation Results, etc. diff --git a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_in_flyte.md b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_in_flyte.md index 746863349edc..14636c4545dc 100644 --- a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_in_flyte.md +++ b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_in_flyte.md @@ -9,9 +9,9 @@ This guide will help you run a Great Expectations in [Flyte](https://flyte.org/) -- [Set up a working deployment of Great Expectations](../tutorials/getting_started/tutorial_overview.md) -- [Created an Expectation Suite](../tutorials/getting_started/tutorial_create_expectations.md) -- [Connecting to Data](../tutorials/getting_started/tutorial_connect_to_data.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) +- [Connecting to Data](/docs/guides/connecting_to_your_data/connect_to_data_overview) +- [Created an Expectation Suite](/docs/guides/expectations/create_expectations_overview) - Flyte [Getting Started Guide](https://docs.flyte.org/en/latest/getting_started.html) diff --git a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md index 5aa178961177..d80826bddeab 100644 --- a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md +++ b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_airflow.md @@ -7,8 +7,8 @@ This guide will help you run a Great Expectations checkpoint in Apache Airflow, -- [Set up a working deployment of Great Expectations](../tutorials/getting_started/tutorial_overview.md) -- [Created an Expectation Suite](../tutorials/getting_started/tutorial_create_expectations.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) +- [Created an Expectation Suite](/docs/guides/expectations/create_expectations_overview) - [Created a checkpoint for that Expectation Suite and a data asset](../guides/validation/checkpoints/how_to_create_a_new_checkpoint.md) - Created an Airflow DAG file diff --git a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_prefect.md 
b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_prefect.md index a78da3b1f37e..9e038bcfa5e9 100644 --- a/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_prefect.md +++ b/docs/docusaurus/docs/deployment_patterns/how_to_use_great_expectations_with_prefect.md @@ -8,9 +8,9 @@ This guide will help you run a Great Expectations with [Prefect](https://prefect -- [Set up a working deployment of Great Expectations](../tutorials/getting_started/tutorial_overview.md) -- [Created an Expectation Suite](../tutorials/getting_started/tutorial_create_expectations.md) -- [Connecting to Data](../tutorials/getting_started/tutorial_connect_to_data.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) +- [Connecting to Data](/docs/guides/connecting_to_your_data/connect_to_data_overview) +- [Created an Expectation Suite](/docs/guides/expectations/create_expectations_overview) - [Prefect Quick Start guide](https://docs.prefect.io/core/getting_started/quick-start.html) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md index a917f722b0a7..b5a3c119bfa0 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md @@ -15,7 +15,7 @@ your data at various levels of granularity: -- [Configured and loaded a Data Context](../../../tutorials/getting_started/tutorial_setup.md) +- [Configured and loaded a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) - [Configured a Datasource and Data Connector](../../../terms/datasource.md) - Reviewed [How to configure a DataConnector to introspect and partition a file system or blob store](../how_to_configure_a_dataconnector_to_introspect_and_partition_a_file_system_or_blob_store.md) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md index f919b883a920..ecec3148d988 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md @@ -20,7 +20,7 @@ your data at various levels of granularity: -- [Configured and loaded a Data Context](../../../tutorials/getting_started/tutorial_setup.md) +- [Configured and loaded a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) - [Configured a Datasource and Data Connector](../../../terms/datasource.md) - Reviewed [How to configure a DataConnector to introspect and partition tables in SQL](../how_to_configure_a_dataconnector_to_introspect_and_partition_tables_in_sql.md) diff --git 
a/docs/docusaurus/docs/guides/connecting_to_your_data/components/prerequisites.jsx b/docs/docusaurus/docs/guides/connecting_to_your_data/components/prerequisites.jsx index f5585e95a8b8..76e1b4c6d5ea 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/components/prerequisites.jsx +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/components/prerequisites.jsx @@ -32,7 +32,7 @@ export default class Prerequisites extends React.Component { defaultPrerequisiteItems () { return [ -
  • Completed the Getting Started Tutorial
  • , +
  • Completed the Quickstart guide
  • ,
  • A working installation of Great Expectations
  • ] } diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/connect_to_data_overview.md b/docs/docusaurus/docs/guides/connecting_to_your_data/connect_to_data_overview.md index d38978558f39..a36ba38db79c 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/connect_to_data_overview.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/connect_to_data_overview.md @@ -15,10 +15,10 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; :::note Prerequisites -- Completing [Step 2: Connect to data](../../tutorials/getting_started/tutorial_connect_to_data.md) of the Getting Started tutorial is recommended. +- Completing the [Quickstart guide](tutorials/quickstart/quickstart.md) is recommended. ::: -Connecting to your data in Great Expectations is designed to be a painless process. Once you have performed this step, you will have a consistent API for accessing and validating data on all kinds of source data systems: SQL-type data sources, local and remote file stores, in-memory data frames, and more. +Connecting to your data in Great Expectations is designed to be a painless process. Once you have defined your Datasources and Data Assets, you will have a consistent API for accessing and validating data on all kinds of source data systems such as SQL-type data sources, local and remote file stores, and in-memory data frames. ## The connect to data process diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/database/athena.md b/docs/docusaurus/docs/guides/connecting_to_your_data/database/athena.md index 57a09d7781b8..39c1c2483a64 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/database/athena.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/database/athena.md @@ -9,7 +9,7 @@ This guide will help you add an Athena instance (or a database) as a - - [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) + - [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) - Installed the pyathena package for the Athena SQLAlchemy dialect (``pip install "pyathena[SQLAlchemy]"``)
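Once the prerequisites above are in place, a minimal sketch of registering Athena as a Datasource could look like the following. This sketch uses the fluent `context.sources.add_sql()` factory rather than the YAML block-config approach shown in the guide itself, and the region, database name, and S3 staging bucket are placeholder assumptions:

```python
import great_expectations as gx

context = gx.get_context()

# Placeholder region, database name, and staging bucket -- substitute your own values.
# Credentials are resolved by boto3 from your environment, so none appear in the string.
connection_string = (
    "awsathena+rest://@athena.us-east-1.amazonaws.com/my_database"
    "?s3_staging_dir=s3://my-athena-results-bucket/output/"
)

# A generic fluent SQL Datasource works for any SQLAlchemy dialect GX supports.
datasource = context.sources.add_sql(
    name="my_athena_datasource",
    connection_string=connection_string,
)
```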
    diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/components/_part_base_directory_for_filesystem.mdx b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/components/_part_base_directory_for_filesystem.mdx index cb34b230cc6b..fab2cd2a0a2e 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/components/_part_base_directory_for_filesystem.mdx +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/components/_part_base_directory_for_filesystem.mdx @@ -1,4 +1,4 @@ -For the base directory, you will want to put the relative path of your data from the folder that contains your Data Context. In this example we will use the same path that was used in the [Getting Started Tutorial, Step 2: Connect to Data](../../../../tutorials/getting_started/tutorial_connect_to_data.md). Since we are manually entering this value rather than letting the CLI generate it, the key/value pair will look like: +For the base directory, you will want to put the relative path of your data from the folder that contains your Data Context. Since we are manually entering this value rather than letting the CLI generate it, the key/value pair will look like: ```python name="inferred data connector add base_directory" "base_directory": "../data", diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_configured.mdx b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_configured.mdx index bcc6ef746124..053fd4036eae 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_configured.mdx +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_configured.mdx @@ -12,7 +12,7 @@ You may provide the following key/value pairs in your Data Asset configuration t - **`schema_name`:** An optional string that defines the `schema` for the Data Asset. - **`include_schema_name`:** A boolean value that determines whether the `schema_name` should be included as a prefix to the Data Asset's name. -For example, imagine that you have a copy of the NYC taxi data from the getting started tutorial in a table called `yellow_tripdata_sample_2020`, along with a public schema. You could access this data by defining an entry in the `assets` dictionary like: +For example, imagine that you have a copy of the 2020 NYC taxi data in a table called `yellow_tripdata_sample_2020`, along with a public schema. 
You could access this data by defining an entry in the `assets` dictionary like: ```python name="configured sql data asset single batch" ``` diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_inferred.mdx b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_inferred.mdx index 3547c6578dc5..ad71a07d7520 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_inferred.mdx +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_part_asset_keys_overview_inferred.mdx @@ -14,7 +14,7 @@ Next is the matter of how (or even if) your Data Connector splits Data Assets in -For example, imagine that you have one or more tables containing the NYC taxi data from the getting started tutorial in your database. You could instruct your Data Connector to infer Data Assets that return each table as a single Batch by simply not including a `splitter_method`. Such a configuration would be identical to the data connector `name_of_my_inferred_data_connector` that was defined in the example at the end of step 7, so let's rename that `data_connector` entry `inferred_data_connector_single_batch_asset` since that is more meaningful. Your configuration for a single Batch Data Asset would now look like: +For example, imagine that you have one or more tables containing the NYC taxi data in your database. You could instruct your Data Connector to infer Data Assets that return each table as a single Batch by simply not including a `splitter_method`. Such a configuration would be identical to the data connector `name_of_my_inferred_data_connector` that was defined in the example at the end of step 7, so let's rename that `data_connector` entry `inferred_data_connector_single_batch_asset` since that is more meaningful. Your configuration for a single Batch Data Asset would now look like: ```python name="inferred sql data asset single batch" ``` diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md index ea4056ebea8d..71e7b4da822f 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md @@ -13,7 +13,7 @@ import Prerequisites from '/docs/components/_prerequisites.jsx' In this guide we will demonstrate the process of requesting data from a Datasource that has been defined using the `context.sources.add_*` method. 
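As a rough sketch of that workflow (the Datasource and Data Asset names below are illustrative assumptions rather than values taken from this guide):

```python
import great_expectations as gx

context = gx.get_context()

# Retrieve a fluent Datasource that was previously created with a
# context.sources.add_* method, then retrieve one of its Data Assets.
datasource = context.get_datasource("my_datasource")
data_asset = datasource.get_asset("my_data_asset")

# Build a Batch Request for the Data Asset and fetch the matching Batches.
batch_request = data_asset.build_batch_request()
batches = data_asset.get_batch_list_from_batch_request(batch_request)

for batch in batches:
    print(batch)  # Inspect what was returned.
```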
If you are using a Datasource that was created by using the block-config method of directly building the Datasource's yaml or Python dictionary configuration, please see: -- [How to request data from a block-config style Datasource](docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md) +- [How to request data from a block-config style Datasource](/docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource) ## Prerequisites @@ -80,7 +80,7 @@ for batch in batches: ## Next steps Now that you have a retrieved data from a Data Asset, you may be interested in creating Expectations about your data: -- [How to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [How to use the Onboarding Data Assistant to evaluate data](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) +- [How to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [How to use the Onboarding Data Assistant to evaluate data](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md index 0ac76fa47f76..176c48a7a16a 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md @@ -56,7 +56,7 @@ datasource = context.sources.add_pandas_abs( ``` :::tip Where did that connection string come from? -In the above example, the value for `account_url` will be substituted for the contents of the `AZURE_STORAGE_CONNECTION_STRING` key you configured when you [installed GX and set up your Azure Blob Storage dependancies](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs.md). +In the above example, the value for `account_url` will be substituted for the contents of the `AZURE_STORAGE_CONNECTION_STRING` key you configured when you [installed GX and set up your Azure Blob Storage dependancies](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs). ::: ### 3. 
Add ABS data to the Datasource as a Data Asset @@ -84,4 +84,4 @@ data_asset = datasource.add_csv_asset( ### Related reading -For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) \ No newline at end of file +For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md index 6ea8ae8c6d9a..55872bb807aa 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md @@ -56,7 +56,7 @@ datasource = context.sources.add_spark_abs( ``` :::tip Where did that connection string come from? -In the above example, the value for `account_url` will be substituted for the contents of the `AZURE_STORAGE_CONNECTION_STRING` key you configured when you [installed GX and set up your Azure Blob Storage dependancies](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs.md). +In the above example, the value for `account_url` will be substituted for the contents of the `AZURE_STORAGE_CONNECTION_STRING` key you configured when you [installed GX and set up your Azure Blob Storage dependancies](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs). ::: ### 3. 
Add ABS data to the Datasource as a Data Asset @@ -85,4 +85,4 @@ data_asset = datasource.add_csv_asset( ### Related reading -For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) \ No newline at end of file +For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md index 49b0b7bd6e00..f67b24837c76 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md @@ -25,7 +25,7 @@ In this guide we will demonstrate how to use Pandas to connect to data stored on -- [An installation of GX set up to work with GCS](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs.md) +- [An installation of GX set up to work with GCS](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs) - Access to data on a GCS bucket - A passion for data quality @@ -99,4 +99,4 @@ For more information on Google Cloud and authentication, please visit the follow ### Related reading -For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) \ No newline at end of file +For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md index 722f4007f5c4..1284eca96d5a 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md @@ -25,7 +25,7 @@ In this guide we will demonstrate how to use Spark to connect to data stored on -- [An installation of GX set up to work with GCS](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs.md) +- [An installation of GX set up to work with GCS](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs) - Access to data on a GCS bucket - A passion for data quality @@ -109,4 +109,4 @@ For more information on Google Cloud and authentication, please visit the follow ### Related reading -For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) \ No newline at end of file +For more details regarding storing credentials for use with GX, please see our guide: [How to configure 
credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md index 62691eb3aa18..d0a5dc5ff898 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md @@ -25,7 +25,7 @@ In this guide we will demonstrate how to use Pandas to connect to data stored on -- [An installation of GX set up to work with S3](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3.md) +- [An installation of GX set up to work with S3](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3) - Access to data on a S3 bucket - A passion for data quality @@ -101,4 +101,4 @@ To see the full source code used for the examples in this guide, please referenc ### Related reading -For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) \ No newline at end of file +For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md index 5075ff9a8461..abae72664909 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md @@ -26,7 +26,7 @@ In this guide we will demonstrate how to use Spark to connect to data stored on -- [An installation of GX set up to work with S3](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3.md) +- [An installation of GX set up to work with S3](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3) - Access to data on a S3 bucket - A passion for data quality @@ -106,4 +106,4 @@ To see the full source code used for the examples in this guide, please referenc ### Related reading -For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md) \ No newline at end of file +For more details regarding storing credentials for use with GX, please see our guide: [How to configure credentials](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md index d149afd29d2c..500f3d4744f7 100644 --- 
a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_file_based_data_asset.md @@ -45,10 +45,10 @@ In this guide we will demonstrate the ways in which Batches can be organized in Please reference the appropriate one of these guides: -- [How to install GX locally](docs/guides/setup/installation/local.md) -- [How to set up GX to work with data on AWS S3](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3.md) -- [How to set up GX to work with data in Azure Blob Storage](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs.md) -- [How to set up GX to work with data on GCS](docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs.md) +- [How to install GX locally](/docs/guides/setup/installation/local) +- [How to set up GX to work with data on AWS S3](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_aws_s3) +- [How to set up GX to work with data in Azure Blob Storage](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_in_abs) +- [How to set up GX to work with data on GCS](/docs/guides/setup/optional_dependencies/cloud/how_to_set_up_gx_to_work_with_data_on_gcs) @@ -62,28 +62,28 @@ Please reference the appropriate one of these guides: Please reference the appropriate one of these guides: #### Local Filesystems -- [How to connect to one or more files using Pandas](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_pandas.md) -- [How to connect to one or more files using Spark](docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark.md) +- [How to connect to one or more files using Pandas](/docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_pandas) +- [How to connect to one or more files using Spark](/docs/guides/connecting_to_your_data/fluent/filesystem/how_to_connect_to_one_or_more_files_using_spark) #### Google Cloud Storage -- [How to connect to data on GCS using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas.md) -- [How to connect to data on GCS using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md) +- [How to connect to data on GCS using Pandas](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_pandas) +- [How to connect to data on GCS using Spark](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark) #### Azure Blob Storage -- [How to connect to data in Azure Blob Storage using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas.md) -- [How to connect to data in Azure Blob Storage using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md) +- [How to connect to data in Azure Blob Storage using Pandas](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_pandas) +- [How to connect to data in Azure Blob Storage using Spark](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark) #### Amazon Web Services S3 -- [How 
to connect to data on Amazon Web Services S3 using Pandas](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas.md) -- [How to connect to data on Amazon Web Services S3 using Spark](docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md) +- [How to connect to data on Amazon Web Services S3 using Pandas](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_pandas) +- [How to connect to data on Amazon Web Services S3 using Spark](/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark) :::caution Datasources defined with the block-config method If you are using a Datasource that was created with the advanced block-config method please follow the appropriate guide from: -- [how to configure a Spark Datasource with the block-config method](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_spark_datasource.md) -- [how to configure a Pandas Datasource with the block-config method](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_pandas_datasource.md) +- [how to configure a Spark Datasource with the block-config method](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_spark_datasource) +- [how to configure a Pandas Datasource with the block-config method](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_pandas_datasource) ::: @@ -124,7 +124,7 @@ In the above example, the named group "`year`" will match any four numeric chara By naming the group in your `batching_regex` you make it something you can reference in the future. When requesting data from this Data Asset, you can use the names of your regular expression groups to limit the Batches that are returned. 
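A minimal sketch of how a named group in a `batching_regex` can then be used to limit the Batches that are returned might look like this (the Datasource name, Asset name, base directory, and file pattern are assumptions for illustration):

```python
import great_expectations as gx

context = gx.get_context()

# Illustrative names and paths -- adjust them to your own environment.
datasource = context.sources.add_pandas_filesystem(
    name="my_filesystem_datasource", base_directory="./data"
)

# The named groups "year" and "month" become Batch Request options.
data_asset = datasource.add_csv_asset(
    name="my_taxi_data_asset",
    batching_regex=r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv",
)

# Request only the Batches whose "year" group matched "2020".
batch_request = data_asset.build_batch_request(options={"year": "2020"})
batches = data_asset.get_batch_list_from_batch_request(batch_request)
```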
-For more information, please see: [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +For more information, please see: [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ::: @@ -181,11 +181,11 @@ for batch in batches: Now that you have further configured a file-based Data Asset, you may want to look into: ### Requesting Data from a Data Asset -- [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +- [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ### Using Data Assets to create Expectations -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) +- [Use a Data Asset to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md index b0450d097451..50233b3b0d7a 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md @@ -75,7 +75,7 @@ Or, for guides on using the block-config method for advanced SQL Datasource conf :::caution Datasources defined with the block-config method -If you are using a Datasource that was created with the advanced block-config method please follow our guide on [how to configure a SQL Datasource with the block-config method](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource.md), instead. +If you are using a Datasource that was created with the advanced block-config method please follow our guide on [how to configure a SQL Datasource with the block-config method](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource), instead. 
::: diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table.md index 64c926f523ae..094555d87788 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table.md @@ -20,7 +20,7 @@ import ConnectingToSqlDatasourcesBlockConfig from '/docs/components/connect_to_d In this guide we will demonstrate how to connect Great Expectations to a generic SQL table. GX uses SQLAlchemy to connect to SQL data, and therefore supports most SQL dialects that SQLAlchemy does. For more information on the SQL dialects supported by SQLAlchemy, please see [SQLAlchemy's official documentation on dialects](https://docs.sqlalchemy.org/en/20/dialects/index.html). -If you would like to connect to the results of a SQL query instead of the contents of a SQL table, please see [our guide on how to connect to SQL data using a query](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query.md), instead. +If you would like to connect to the results of a SQL query instead of the contents of a SQL table, please see [our guide on how to connect to SQL data using a query](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query), instead. ## Prerequisites @@ -66,7 +66,7 @@ Or, for guides on using the block-config method for advanced SQL Datasource conf :::caution Datasources defined with the block-config method -If you are using a Datasource that was created with the advanced block-config method please follow our guide on [how to configure a SQL Datasource with the block-config method](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource.md), instead. +If you are using a Datasource that was created with the advanced block-config method please follow our guide on [how to configure a SQL Datasource with the block-config method](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource), instead. 
::: @@ -103,12 +103,12 @@ If you wish to connect to additional tables in the same SQL Database, simply rep Now that you have connected to a SQL table, you may want to look into: ### Configuring SQL Data Assets further -- [How to organize Batches in a SQL based Data Asset](docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md) +- [How to organize Batches in a SQL based Data Asset](/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset) ### Requesting Data from a Data Asset -- [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +- [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ### Using Data Assets to create Expectations -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) +- [Use a Data Asset to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md index 20c77cb5ffb0..e57812760305 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md @@ -18,7 +18,7 @@ In this guide we will demonstrate how to connect Great Expectations to data in a -- [An installation of GX set up to work with PostgreSQL](docs/guides/setup/optional_dependencies/sql_databases/how_to_setup_gx_to_work_with_sql_databases.md) +- [An installation of GX set up to work with PostgreSQL](/docs/guides/setup/optional_dependencies/sql_databases/how_to_setup_gx_to_work_with_sql_databases) - Source data stored in a PostgreSQL database - A passion for data quality @@ -114,14 +114,14 @@ If you wish to connect to additional tables or queries in the same PostgreSQL Da Now that you have connected to a PostgreSQL database and created a Data Asset, you may want to look into: ### Configuring SQL Data Assets further -- [How to organize Batches in a SQL based Data Asset](docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md) +- [How to organize Batches in a SQL based Data Asset](/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset) ### Requesting Data from a Data Asset -- [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +- [How to request data from a Data 
Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ### Using Data Assets to create Expectations -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) +- [Use a Data Asset to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md index 682c7346795e..a7d78dbdeb03 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data.md @@ -42,19 +42,19 @@ GX supports a variety of different SQL source data systems. However, most SQL d Here are some examples of connection strings for various SQL dialects. GX also has dialect-specific guides on setting up any extra dependencies, configuring credentials, and using the advanced block-config method of connecting to these particular SQL database types. These guides are included as the links in the following list of connection string formats. 
-- [AWS Athena](docs/guides/connecting_to_your_data/database/athena.md): `awsathena+rest://@athena..amazonaws.com/?s3_staging_dir=` -- [BigQuery](docs/guides/connecting_to_your_data/database/bigquery.md): `bigquery:///` -- [MSSQL](docs/guides/connecting_to_your_data/database/mssql.md): `mssql+pyodbc://:@:/?driver=&charset=utf&autocommit=true` -- [MySQL](docs/guides/connecting_to_your_data/database/mysql.md): `mysql+pymysql://:@:/` +- [AWS Athena](/docs/guides/connecting_to_your_data/database/athena): `awsathena+rest://@athena..amazonaws.com/?s3_staging_dir=` +- [BigQuery](/docs/guides/connecting_to_your_data/database/bigquery): `bigquery:///` +- [MSSQL](/docs/guides/connecting_to_your_data/database/mssql): `mssql+pyodbc://:@:/?driver=&charset=utf&autocommit=true` +- [MySQL](/docs/guides/connecting_to_your_data/database/mysql): `mysql+pymysql://:@:/` - PostgreSQL: `postgresql+psycopg2://:@:/` - - [Connecting to PostgreSQL data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data.md) - - [Defining advanced PostgreSQL Datasource configurations with the block-config method](docs/guides/connecting_to_your_data/database/postgres.md) -- [Redshift](docs/guides/connecting_to_your_data/database/redshift.md): `postgresql+psycopg2://:@:/?sslmode=` -- [Snowflake](docs/guides/connecting_to_your_data/database/snowflake.md): `snowflake://:@//?warehouse=&role=&application=great_expectations_oss` + - [Connecting to PostgreSQL data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_postgresql_data) + - [Defining advanced PostgreSQL Datasource configurations with the block-config method](/docs/guides/connecting_to_your_data/database/postgres) +- [Redshift](/docs/guides/connecting_to_your_data/database/redshift): `postgresql+psycopg2://:@:/?sslmode=` +- [Snowflake](/docs/guides/connecting_to_your_data/database/snowflake): `snowflake://:@//?warehouse=&role=&application=great_expectations_oss` - SQLite: `sqlite:///` - - [Connecting to SQLite data](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md) - - [Defining advanced SQLite Datasource configurations with the block-config method](docs/guides/connecting_to_your_data/database/sqlite.md) -- [Trino](docs/guides/connecting_to_your_data/database/trino.md): `trino://:@://` + - [Connecting to SQLite data](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data) + - [Defining advanced SQLite Datasource configurations with the block-config method](/docs/guides/connecting_to_your_data/database/sqlite) +- [Trino](/docs/guides/connecting_to_your_data/database/trino): `trino://:@://` ::: diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query.md index a22715a4e084..ee2f47664166 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sql_data_using_a_query.md @@ -17,7 +17,7 @@ import ConnectingToSqlDatasourcesBlockConfig from '/docs/components/connect_to_d In this guide we will demonstrate how to connect Great Expectations to the data returned by a query in a generic SQL database. GX uses SQLAlchemy to connect to SQL data, and therefore supports most SQL dialects that SQLAlchemy does. 
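As a minimal sketch of how a connection string in one of the formats listed above is typically handed to a fluent SQL Datasource (the PostgreSQL credentials, Datasource name, and table name below are placeholder assumptions):

```python
import great_expectations as gx

context = gx.get_context()

# Placeholder credentials following the PostgreSQL format listed above.
connection_string = "postgresql+psycopg2://my_user:my_password@localhost:5432/my_database"

datasource = context.sources.add_postgres(
    name="my_postgres_datasource",
    connection_string=connection_string,
)

# Register a table from the database as a Data Asset (the table name is illustrative).
table_asset = datasource.add_table_asset(
    name="my_table_asset", table_name="yellow_tripdata_sample_2020"
)
```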
For more information on the SQL dialects supported by SQLAlchemy, please see [SQLAlchemy's official documentation on dialects](https://docs.sqlalchemy.org/en/20/dialects/index.html). -If you would like to connect to the contents of a SQL table instead of the results of a SQL query, please see [our guide on how to connect to a SQL table](docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table.md), instead. +If you would like to connect to the contents of a SQL table instead of the results of a SQL query, please see [our guide on how to connect to a SQL table](/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_a_sql_table), instead. ## Prerequisites @@ -62,7 +62,7 @@ Or, for guides on using the block-config method for advanced SQL Datasource conf :::caution Datasources defined with the block-config method -If you are using a Datasource that was created with the advanced block-config method please follow our guide on [how to configure a SQL Datasource with the block-config method](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource.md), instead. +If you are using a Datasource that was created with the advanced block-config method please follow our guide on [how to configure a SQL Datasource with the block-config method](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource), instead. ::: @@ -103,14 +103,14 @@ If you wish to connect to the contents of additional queries in the same SQL Dat Now that you have connected to the data returned by a SQL query, you may want to look into: ### Configuring SQL Data Assets further -- [How to organize Batches in a SQL based Data Asset](docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md) +- [How to organize Batches in a SQL based Data Asset](/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset) ### Requesting Data from a Data Asset -- [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +- [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ### Using Data Assets to create Expectations -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) +- [Use a Data Asset to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md index 5245317c0bf2..d3e120ebd513 100644 --- 
a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/database/how_to_connect_to_sqlite_data.md @@ -16,7 +16,7 @@ In this guide we will demonstrate how to connect Great Expectations to data in a -- [An installation of GX set up to work with SQL](docs/guides/setup/optional_dependencies/sql_databases/how_to_setup_gx_to_work_with_sql_databases.md) +- [An installation of GX set up to work with SQL](/docs/guides/setup/optional_dependencies/sql_databases/how_to_setup_gx_to_work_with_sql_databases) - Source data stored in a SQLite database - A passion for data quality @@ -104,13 +104,13 @@ If you wish to connect to additional tables or queries in the same PostgreSQL Da Now that you have connected to a SQLite Database and created a Data Asset, you may want to look into: ### Configuring SQL Data Assets further -- [How to organize Batches in a SQL based Data Asset](docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset.md) +- [How to organize Batches in a SQL based Data Asset](/docs/guides/connecting_to_your_data/fluent/data_assets/how_to_organize_batches_in_a_sql_based_data_asset) ### Requesting Data from a Data Asset -- [How to request data from a Data Asset](docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset.md) +- [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset) ### Using Data Assets to create Expectations -- [Use a Data Asset to create Expectations while interactively evaluating a set of data](docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) -- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md) +- [Use a Data Asset to create Expectations while interactively evaluating a set of data](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) +- [Use the Onboarding Data Assistant to evaluate one or more Batches of data and create Expectations](/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md index 9c217f883d9d..8b099e2284ec 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_choose_which_dataconnector_to_use.md @@ -36,7 +36,7 @@ The third type of `DataConnector` class is for providing a knows how to -- [Configured and loaded a Data Context](../../tutorials/getting_started/tutorial_setup.md) +- [Configured and loaded a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) - [Configured a Datasource and Data Connector](../../terms/datasource.md) diff --git a/docs/docusaurus/docs/guides/expectations/advanced/how_to_add_comments_to_expectations_and_display_them_in_data_docs.md 
b/docs/docusaurus/docs/guides/expectations/advanced/how_to_add_comments_to_expectations_and_display_them_in_data_docs.md index a401c135d164..e857105eca78 100644 --- a/docs/docusaurus/docs/guides/expectations/advanced/how_to_add_comments_to_expectations_and_display_them_in_data_docs.md +++ b/docs/docusaurus/docs/guides/expectations/advanced/how_to_add_comments_to_expectations_and_display_them_in_data_docs.md @@ -8,9 +8,9 @@ This guide will help you add descriptive comments (or notes, here used interchan -- [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) -- [Configured a Data Context](../../../tutorials/getting_started/tutorial_setup.md). -- [Configured an Expectations Suite](../../../tutorials/getting_started/tutorial_create_expectations.md). +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) +- [Created a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) +- [Configured an Expectations Suite](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) diff --git a/docs/docusaurus/docs/guides/expectations/advanced/how_to_create_a_new_expectation_suite_by_profiling_from_a_jsonschema_file.md b/docs/docusaurus/docs/guides/expectations/advanced/how_to_create_a_new_expectation_suite_by_profiling_from_a_jsonschema_file.md index 697ac556d12d..d5559b93e730 100644 --- a/docs/docusaurus/docs/guides/expectations/advanced/how_to_create_a_new_expectation_suite_by_profiling_from_a_jsonschema_file.md +++ b/docs/docusaurus/docs/guides/expectations/advanced/how_to_create_a_new_expectation_suite_by_profiling_from_a_jsonschema_file.md @@ -8,7 +8,7 @@ The ``JsonSchemaProfiler`` helps you quickly create -- [Configured a Data Context](../../../tutorials/getting_started/tutorial_setup.md). -- [Created a Datasource](../../../tutorials/getting_started/tutorial_connect_to_data.md). +- [Configured a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context). +- [Created a Datasource](/docs/guides/connecting_to_your_data/connect_to_data_overview). diff --git a/docs/docusaurus/docs/guides/expectations/create_expectations_overview.md b/docs/docusaurus/docs/guides/expectations/create_expectations_overview.md index 7690e42f6daa..c8f384f02c2b 100644 --- a/docs/docusaurus/docs/guides/expectations/create_expectations_overview.md +++ b/docs/docusaurus/docs/guides/expectations/create_expectations_overview.md @@ -11,14 +11,12 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; :::note Prerequisites -- Completing [Step 3: Create Expectations](../../tutorials/getting_started/tutorial_create_expectations.md) of the Getting Started tutorial is recommended. +- Completing the [Quickstart guide](tutorials/quickstart/quickstart.md) is recommended. ::: -Creating is an integral part of Great Expectations. By the end of this step, you will have created an containing one or more Expectations which you will use when you data. - ## The Create Expectations process -There are a few workflows you can potentially follow when creating Expectations. These workflows represent various ways of creating Expectations, although they converge in the end when you will save and test those Expectations. +Although creating is an integral part of Great Expectations, there are a few workflows you can potentially follow to do so. 
These workflows represent various ways of creating Expectations, although they converge in the end when you will save and test those Expectations. ![Where do Expectations come from?](../../images/universal_map/overviews/where_expectations_come_from.png) diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/components/prerequisites.jsx b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/components/prerequisites.jsx index a651f7eadcc8..da060ae13a2e 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/components/prerequisites.jsx +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/components/prerequisites.jsx @@ -33,7 +33,7 @@ export default class Prerequisites extends React.Component { defaultPrerequisiteItems () { return [
-  • Completed the Getting Started Tutorial
+  • Completed the Quickstart guide
  • Set up your dev environment
  • ] } diff --git a/docs/docusaurus/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md b/docs/docusaurus/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md index ea4e1efdcf83..d01181c46410 100644 --- a/docs/docusaurus/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md +++ b/docs/docusaurus/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant.md @@ -19,7 +19,7 @@ great_expectations suite new --profile -- A [configured Data Context](../../../tutorials/getting_started/tutorial_setup.md). +- A [configured Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context). - The knowledge to [configure and save a Datasource](../../connecting_to_your_data/connect_to_data_overview.md). - The knowledge to [configure and save a Batch Request](../../connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md). diff --git a/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_based_on_domain_knowledge_without_inspecting_data_directly.md b/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_based_on_domain_knowledge_without_inspecting_data_directly.md index 0cdd959328e2..74e709734246 100644 --- a/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_based_on_domain_knowledge_without_inspecting_data_directly.md +++ b/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_based_on_domain_knowledge_without_inspecting_data_directly.md @@ -19,7 +19,7 @@ If you have a use case we have not considered, please [contact us on Slack](http -- [Configured a Data Context](../../tutorials/getting_started/tutorial_setup.md). +- [Configured a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context). - Have your configured to save Expectations to your filesystem (please see [How to configure an Expectation store to use a filesystem](../../guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_on_a_filesystem.md)) or another if you are in a hosted environment. diff --git a/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_with_a_profiler.md b/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_with_a_profiler.md index 90541f963737..cd1488958773 100644 --- a/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_with_a_profiler.md +++ b/docs/docusaurus/docs/guides/expectations/how_to_create_and_edit_expectations_with_a_profiler.md @@ -9,8 +9,8 @@ This guide will help you create a new -- [Configured a Data Context](../../tutorials/getting_started/tutorial_setup.md). -- Configured a [Datasource](../../tutorials/getting_started/tutorial_connect_to_data.md) +- [Configured a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context). 
+- [Configured a Datasource](/docs/guides/connecting_to_your_data/connect_to_data_overview) diff --git a/docs/docusaurus/docs/guides/expectations/how_to_use_auto_initializing_expectations.md b/docs/docusaurus/docs/guides/expectations/how_to_use_auto_initializing_expectations.md index 839492b84ddb..32812b3065aa 100644 --- a/docs/docusaurus/docs/guides/expectations/how_to_use_auto_initializing_expectations.md +++ b/docs/docusaurus/docs/guides/expectations/how_to_use_auto_initializing_expectations.md @@ -8,9 +8,9 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; This guide will walk you through the process of using a auto-initializing to automate parameter estimation when you are creating Expectations interactively by using a or Batches that have been loaded into a . :::note PREREQUISITES: THIS HOW-TO GUIDE ASSUMES YOU HAVE: -- Completed the [Getting started tutorial](../../tutorials/getting_started/tutorial_overview.md) -- [Configured a Data Context](../../tutorials/getting_started/tutorial_setup.md). -- [Configured a Data Source](../../tutorials/getting_started/tutorial_connect_to_data.md) +- Completed the [Quickstart guide](tutorials/quickstart/quickstart.md) +- [Configured a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) +- [Configured a Datasource](/docs/guides/connecting_to_your_data/connect_to_data_overview) - [An understanding of how to configure a BatchRequest](../../guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md) - [An understanding of how to create and edit expectations with instant feedback from a sample batch of data](./how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data.md) ::: diff --git a/docs/docusaurus/docs/guides/setup/components/defaultPrerequisiteItems.jsx b/docs/docusaurus/docs/guides/setup/components/defaultPrerequisiteItems.jsx index 0c043d9cb022..a8dd1efe75bc 100644 --- a/docs/docusaurus/docs/guides/setup/components/defaultPrerequisiteItems.jsx +++ b/docs/docusaurus/docs/guides/setup/components/defaultPrerequisiteItems.jsx @@ -27,7 +27,7 @@ export default class Prerequisites extends React.Component { defaultPrerequisiteItems () { return [ -
  • Completed the Getting Started Tutorial
  • +
  • Completed the Quickstart guide
  • ] } diff --git a/docs/docusaurus/docs/guides/setup/components/install_prereq.jsx b/docs/docusaurus/docs/guides/setup/components/install_prereq.jsx index 03d4fede482f..b4c7e0ddb663 100644 --- a/docs/docusaurus/docs/guides/setup/components/install_prereq.jsx +++ b/docs/docusaurus/docs/guides/setup/components/install_prereq.jsx @@ -5,7 +5,7 @@ import Prerequisites from './defaultPrerequisiteItems.jsx' export default class InsPrerequisites extends Prerequisites { defaultPrerequisiteItems () { return [ -
  • Completed the Getting Started Tutorial
  • +
  • Completed the Quickstart guide
  • ] } } diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/components_how_to_configure_a_new_data_context_with_the_cli/_preface.mdx b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/components_how_to_configure_a_new_data_context_with_the_cli/_preface.mdx index 2e943364fd55..154924c69553 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/components_how_to_configure_a_new_data_context_with_the_cli/_preface.mdx +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/components_how_to_configure_a_new_data_context_with_the_cli/_preface.mdx @@ -14,7 +14,7 @@ import Prerequisites from '../../../connecting_to_your_data/components/prerequis -- [Configured a Data Context](../../../../tutorials/getting_started/tutorial_setup.md) +- [Configured a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md index b6849455f7b1..71f898da8a54 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md @@ -249,7 +249,7 @@ This guide will explain how to configure your ``great_expectations.yml`` project -- [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) - Configured a secret manager and secrets in the cloud with [Azure Key Vault](https://docs.microsoft.com/en-us/azure/key-vault/general/overview) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md index f56e6515176f..5c11862a1e27 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_configure_datacontext_components_using_test_yaml_config.md @@ -8,7 +8,7 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -- [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md index 8d61e24e6cd4..51eacee3066e 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/how_to_convert_an_ephemeral_data_context_to_a_filesystem_data_context.md @@ -43,10 +43,10 @@ An Ephemeral Data Context is a temporary, in-memory Data Context that will not p The `get_context()` method will return an Ephemeral Data Context if your system is not set up to work with GX Cloud and a Filesystem Data Context cannot be found. 
For more information, see: -- [How to quickly instantiate a Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context.md) +- [How to quickly instantiate a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) You can also explicitly instantiate an Ephemeral Data Context (for those occasions when your system is set up to work with GX Cloud or you do have a previously initialized Filesystem Data Context). For more information, see: -- [How to explicitly instantiate an Ephemeral Data Context](docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context.md) +- [How to explicitly instantiate an Ephemeral Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_explicitly_instantiate_an_ephemeral_data_context) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md index f822dffd38c5..4144c99e901f 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md @@ -66,22 +66,22 @@ If a Data Context already exists at the provided `path`, the `get_context(...)` ## Next steps For guidance on further customizing your Data Context's configurations for and , please see: -- [How to configure an Expectation Store on a filesystem](docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_on_a_filesystem.md) -- [How to configure a Validation Result Store on a filesystem](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_on_a_filesystem.md) -- [How to configure and use a Metric Store](docs/guides/setup/configuring_metadata_stores/how_to_configure_a_metricsstore.md) -- [How to host and share Data Docs on a filesystem](docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_a_filesystem.md) +- [How to configure an Expectation Store on a filesystem](/docs/guides/setup/configuring_metadata_stores/how_to_configure_an_expectation_store_on_a_filesystem) +- [How to configure a Validation Result Store on a filesystem](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_validation_result_store_on_a_filesystem) +- [How to configure and use a Metric Store](/docs/guides/setup/configuring_metadata_stores/how_to_configure_a_metricsstore) +- [How to host and share Data Docs on a filesystem](/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_a_filesystem) If you are content with the default configuration of your Data Context, you can move on to connecting GX to your source data: -- [How to configure a Pandas Datasource](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_pandas_datasource.md) -- [How to configure a Spark Datasource](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_spark_datasource.md) -- [How to configure a SQL 
Datasource](docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource.md) +- [How to configure a Pandas Datasource](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_pandas_datasource) +- [How to configure a Spark Datasource](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_spark_datasource) +- [How to configure a SQL Datasource](/docs/guides/connecting_to_your_data/datasource_configuration/how_to_configure_a_sql_datasource) ## Additional information ### Related guides To initialize a Filesystem Data Context from the terminal, please see: -- [How to initialize a new Data Context with the CLI](docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli.md) +- [How to initialize a new Data Context with the CLI](/docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli) :::note Prerequisites -- Completing [Step 1: Setup](../../tutorials/getting_started/tutorial_setup.md) of the Getting Started tutorial is recommended. +- Completing the [Quickstart guide](tutorials/quickstart/quickstart.md) is recommended. ::: -Getting started with Great Expectations is quick and easy. Once you have completed setup for your production deployment, you will have access to all of the features of Great Expectations from a single entry point: Your . You will also have your and configured in the manner most suitable for your project's purposes. +Getting started with Great Expectations is quick and easy. Once you have completed setup for your production deployment, you will have access to all the features of Great Expectations from a single entry point: Your . You will also have your and configured in the manner most suitable for your project's purposes. ### The alternative to manual Setup diff --git a/docs/docusaurus/docs/guides/validation/advanced/how_to_deploy_a_scheduled_checkpoint_with_cron.md b/docs/docusaurus/docs/guides/validation/advanced/how_to_deploy_a_scheduled_checkpoint_with_cron.md index 9adaa641c8cf..af981c83bb16 100644 --- a/docs/docusaurus/docs/guides/validation/advanced/how_to_deploy_a_scheduled_checkpoint_with_cron.md +++ b/docs/docusaurus/docs/guides/validation/advanced/how_to_deploy_a_scheduled_checkpoint_with_cron.md @@ -8,7 +8,7 @@ This guide will help you deploy a scheduled -- [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) - You have created a Checkpoint.
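A minimal sketch of how the scheduled Checkpoint described in this guide might be wired up; the script path, the schedule, and the Checkpoint name `my_checkpoint` are placeholders rather than values taken from the guide:

```python
# run_my_checkpoint.py -- a small script for cron to invoke on a schedule.
import sys

import great_expectations as gx

context = gx.get_context()

# "my_checkpoint" is a placeholder for a Checkpoint you have already created.
result = context.run_checkpoint(checkpoint_name="my_checkpoint")

# An illustrative crontab entry that would run this script nightly at 02:00:
#   0 2 * * * /usr/bin/python /path/to/run_my_checkpoint.py
sys.exit(0 if result.success else 1)
```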
    diff --git a/docs/docusaurus/docs/guides/validation/advanced/how_to_get_data_docs_urls_for_custom_validation_actions.md b/docs/docusaurus/docs/guides/validation/advanced/how_to_get_data_docs_urls_for_custom_validation_actions.md index 32b8be26c6b7..007212258b66 100644 --- a/docs/docusaurus/docs/guides/validation/advanced/how_to_get_data_docs_urls_for_custom_validation_actions.md +++ b/docs/docusaurus/docs/guides/validation/advanced/how_to_get_data_docs_urls_for_custom_validation_actions.md @@ -11,7 +11,7 @@ This will work to get the URLs for any type of Data Docs site setup, e.g. S3 or - - [Created an Expectation Suite to use for validation](../../../tutorials/getting_started/tutorial_create_expectations.md) + - [Created an Expectation Suite to use for Validation](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) - [Reviewed our guidance on Validation Actions](../../../terms/action.md) diff --git a/docs/docusaurus/docs/guides/validation/advanced/how_to_validate_data_with_an_in_memory_checkpoint.md b/docs/docusaurus/docs/guides/validation/advanced/how_to_validate_data_with_an_in_memory_checkpoint.md index 86ad20d8f228..f8ed5589b0f9 100644 --- a/docs/docusaurus/docs/guides/validation/advanced/how_to_validate_data_with_an_in_memory_checkpoint.md +++ b/docs/docusaurus/docs/guides/validation/advanced/how_to_validate_data_with_an_in_memory_checkpoint.md @@ -56,7 +56,7 @@ Checkpoints require a Data Context in order to access necessary Stores from whic ### 3. Define your Checkpoint configuration -In addition to a Data Context, you will need a configuration with which to initialize your Checkpoint. This configuration can be in the form of a YAML string or a Python dictionary, The following examples show configurations that are equivalent to the one used by the Getting Started Tutorial. +In addition to a Data Context, you will need a configuration with which to initialize your Checkpoint. This configuration can be in the form of a YAML string or a Python dictionary. Normally, a Checkpoint configuration will include the keys `class_name` and `module_name`. These are used by Great Expectations to identify the class of Checkpoint that should be initialized with a given configuration. Since we are initializing an instance of the `Checkpoint` class directly we don't need the configuration to indicate the class of Checkpoint to be initialized. Therefore, these two keys will be left out of our configuration. 
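For illustration, a configuration along these lines could be passed straight to the `Checkpoint` class; the Datasource, Data Connector, Data Asset, and Expectation Suite names below are placeholders, not values from this guide:

```python
import great_expectations as gx
from great_expectations.checkpoint import Checkpoint

context = gx.get_context()

# "class_name" and "module_name" are omitted because Checkpoint is instantiated directly.
checkpoint_config = {
    "name": "my_in_memory_checkpoint",
    "config_version": 1,
    "run_name_template": "%Y%m%d-%H%M%S-in-memory-run",
    "validations": [
        {
            "batch_request": {
                "datasource_name": "my_datasource",
                "data_connector_name": "default_inferred_data_connector_name",
                "data_asset_name": "my_data_asset",
            },
            "expectation_suite_name": "my_expectation_suite",
        }
    ],
}

checkpoint = Checkpoint(data_context=context, **checkpoint_config)
result = checkpoint.run()
```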
diff --git a/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx b/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx index f0a691560182..73ee806e1d08 100644 --- a/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx +++ b/docs/docusaurus/docs/guides/validation/checkpoints/components_how_to_create_a_new_checkpoint/_steps_for_checkpoints_.mdx @@ -3,8 +3,8 @@ This how-to guide assumes you have already: -* [Set up a working deployment of Great Expectations](../../../../tutorials/getting_started/tutorial_overview.md) -* [Configured a Datasource](../../../../tutorials/getting_started/tutorial_connect_to_data.md) -* [Created an Expectation Suite](../../../../tutorials/getting_started/tutorial_create_expectations.md) +- [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) +- [Connected to Data](/docs/guides/connecting_to_your_data/connect_to_data_overview) +- [Created an Expectation Suite](/docs/guides/expectations/create_expectations_overview) ::: diff --git a/docs/docusaurus/docs/guides/validation/checkpoints/how_to_add_validations_data_or_suites_to_a_checkpoint.md b/docs/docusaurus/docs/guides/validation/checkpoints/how_to_add_validations_data_or_suites_to_a_checkpoint.md index 6a09dcebe7bf..3444805387fb 100644 --- a/docs/docusaurus/docs/guides/validation/checkpoints/how_to_add_validations_data_or_suites_to_a_checkpoint.md +++ b/docs/docusaurus/docs/guides/validation/checkpoints/how_to_add_validations_data_or_suites_to_a_checkpoint.md @@ -10,8 +10,8 @@ This guide will help you add validation data or your data by running a . -As stated in the Getting Started Tutorial [Step 4: Validate data](../../tutorials/getting_started/tutorial_validate_data.md), the best way to Validate data in production with Great Expectations is using a . The advantage of using a Checkpoint is ease of use, due to its principal capability of combining the existing configuration in order to set up and perform the Validation: +The best way to Validate data in production with Great Expectations is using a . The advantage of using a Checkpoint is ease of use, due to its principal capability of combining the existing configuration in order to set up and perform the Validation: - - - @@ -19,8 +19,8 @@ Otherwise, configuring these validation parameters would have to be done via the -- [Configured a Data Context](../../tutorials/getting_started/tutorial_setup.md#create-a-data-context). -- [Configured an Expectations Suite](../../tutorials/getting_started/tutorial_create_expectations.md). 
+- [Configured a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) +- [Configured an Expectations Suite](/docs/guides/expectations/how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data) - [Configured a Checkpoint](./checkpoints/how_to_create_a_new_checkpoint.md) diff --git a/docs/docusaurus/docs/guides/validation/validate_data_overview.md b/docs/docusaurus/docs/guides/validation/validate_data_overview.md index 163651ece989..fc0a71e32489 100644 --- a/docs/docusaurus/docs/guides/validation/validate_data_overview.md +++ b/docs/docusaurus/docs/guides/validation/validate_data_overview.md @@ -11,14 +11,12 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; :::note Prerequisites -- Completing [Step 4: Validate data](../../tutorials/getting_started/tutorial_validate_data.md) of the Getting Started tutorial is recommended. +- Completing the [Quickstart guide](tutorials/quickstart/quickstart.md) is recommended. ::: -When you complete this step for the first time, you will have created and run a . This Checkpoint can then be reused to data in the future, and you can also create and configure additional Checkpoints to cover different use cases, should you have them. - ## The Validate Data process -The recommended workflow for validating data is through **the use of Checkpoints.** Checkpoints handle the rest of the Validation process for you: They will Validate data, save , run any you have specified, and finally create with their results. +The recommended workflow for validating data is through **the use of Checkpoints.** Checkpoints handle the rest of the Validation process for you: They will Validate data, save , run any you have specified, and finally create with their results. A Checkpoint can then be reused to data in the future, and you can also create and configure additional Checkpoints to cover different use cases, should you have them. ![How a Checkpoint works](../../images/universal_map/overviews/how_a_checkpoint_works.png) diff --git a/docs/docusaurus/docs/guides/validation/validation_actions/how_to_trigger_opsgenie_notifications_as_a_validation_action.md b/docs/docusaurus/docs/guides/validation/validation_actions/how_to_trigger_opsgenie_notifications_as_a_validation_action.md index 4638e7aa3e5c..06b6e5725cfe 100644 --- a/docs/docusaurus/docs/guides/validation/validation_actions/how_to_trigger_opsgenie_notifications_as_a_validation_action.md +++ b/docs/docusaurus/docs/guides/validation/validation_actions/how_to_trigger_opsgenie_notifications_as_a_validation_action.md @@ -8,9 +8,9 @@ This guide will help you set up Opsgenie alert notifications when running Great - - [Set up a working deployment of Great Expectations](../../../tutorials/getting_started/tutorial_overview.md) - - You already have an Opsgenie account - - You have created a that will be configured with the notification Action. + - [Set up a working deployment of Great Expectations](/docs/guides/setup/setup_overview) + - An Opsgenie account + - Created a that will be configured with the notification Action. 
diff --git a/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md b/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md index ecf6236df271..25ac569dd824 100644 --- a/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md +++ b/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md @@ -10,7 +10,7 @@ import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; Welcome to Great Expectations Cloud! This tutorial will help you onboard with GX Cloud and get ready to connect to your data. :::note Prerequisites -- This tutorial assumes you have Great Expectations OSS installed on your machine. If that's not the case please complete [Step 1: OSS Setup](docs/tutorials/getting_started/tutorial_setup.md) first. +- This tutorial assumes you have Great Expectations OSS installed on your machine. If that's not the case please complete [OSS Setup](/docs/guides/setup/installation/local) first. ::: ## Steps @@ -33,7 +33,7 @@ Welcome to Great Expectations Cloud! This tutorial will help you onboard with GX Any Python Interpreter or script file will work for the remaining steps in the guide, we recommend using a Jupyter Notebook, since they are included in the OSS GX installation and give the best experience of both composing a script file and running code in a live interpreter. ::: -- Set environment variables in the notebook (alternatively, add these as [Data Context config variables](docs/guides/setup/configuring_data_contexts/how_to_configure_credentials.md)) +- Set environment variables in the notebook (alternatively, add these as [Data Context config variables](/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials)) ```python import os diff --git a/docs/docusaurus/docs/integrations/components/prerequisites.jsx b/docs/docusaurus/docs/integrations/components/prerequisites.jsx index b4e6fb69d55b..cf3d8379339a 100644 --- a/docs/docusaurus/docs/integrations/components/prerequisites.jsx +++ b/docs/docusaurus/docs/integrations/components/prerequisites.jsx @@ -33,7 +33,7 @@ export default class Prerequisites extends React.Component { defaultPrerequisiteItems () { return [
-  • Completed the Getting Started Tutorial
+  • Completed the Quickstart guide
  • Set up your dev environment
  • Created a Custom Expectation
  • ] diff --git a/docs/docusaurus/docs/intro.md b/docs/docusaurus/docs/intro.md index f3e0a40729df..a86912cb49c1 100755 --- a/docs/docusaurus/docs/intro.md +++ b/docs/docusaurus/docs/intro.md @@ -5,7 +5,7 @@ slug: / Welcome to Great Expectations! -Great Expectations is the leading tool for [validating](./terms/validation.md), [documenting](./terms/data_docs.md), and [profiling](./terms/profiler.md) your data to maintain quality and improve communication between teams. Head over to our [getting started](./tutorials/getting_started/tutorial_overview.md) tutorial. +Great Expectations is the leading tool for [validating](./terms/validation.md), [documenting](./terms/data_docs.md), and [profiling](./terms/profiler.md) your data to maintain quality and improve communication between teams. Head over to our [Quickstart guide](tutorials/quickstart/quickstart.md). Software developers have long known that automated testing is essential for managing complex codebases. Great Expectations brings the same discipline, confidence, and acceleration to data science and data engineering teams. diff --git a/docs/docusaurus/docs/terms/data_context.md b/docs/docusaurus/docs/terms/data_context.md index 21008aea3a11..498a3316c52c 100644 --- a/docs/docusaurus/docs/terms/data_context.md +++ b/docs/docusaurus/docs/terms/data_context.md @@ -65,7 +65,7 @@ The Data Context makes it easy to manage configuration of its own top-level comp The Data Context doesn't just give you convenient ways to access and configure components. It also provides the ability to *create* top-level components such as Datasources, Checkpoints, and Expectation Suites and manage where the information about those components is stored. -In the Getting Started Tutorial, everything was created locally and stored. This is a simple way to get started with Great Expectations. For production deployments, however, you'll probably want to swap out some of the components that were used in the Getting Started Tutorial for others that correspond to your source data systems and production environment. This may include storing information about those components in something other than your local environment. You can see several soup-to-nuts examples of how to do this for specific environments and source data systems in the [Reference Architecture guides](../deployment_patterns/index.md). +For production deployments you will want to define these components according to your source data systems and production environment. This may include storing information about those components in something other than your local environment. You can see several soup-to-nuts examples of how to do this for specific environments and source data systems in the [Reference Architecture guides](../deployment_patterns/index.md). If the exact deployment pattern you want to follow isn't documented in a Reference Architecture, you can see details for configuring specific components that component's related how-to guides. diff --git a/docs/docusaurus/docs/terms/evaluation_parameter.md b/docs/docusaurus/docs/terms/evaluation_parameter.md index a8be7687cc59..0898ddf3e6d9 100644 --- a/docs/docusaurus/docs/terms/evaluation_parameter.md +++ b/docs/docusaurus/docs/terms/evaluation_parameter.md @@ -45,7 +45,7 @@ The core of this is a `$PARAMETER : URN` pair. When Great Expectations encounter If you do not have a previous Expectation Suite's Validation Results to reference, however, you can instead provide Evaluation Parameters with a temporary initial value. 
For example, the interactive method of creating Expectations is based on Validating Expectations against a previous run of the same Expectation Suite. Since a previous run has not been performed when Expectations are being created, Evaluation Parameters cannot reference a past Validation and will require a temporary value instead. This will allow you to test Expectations that are meant to rely on values from previous Validation runs before you have actually used them to Validate data. -Say you are creating additional expectations for the data that you used in the [Getting Started Tutorial](../tutorials/getting_started/tutorial_overview.md). (You have completed the Getting Started Tutorial, right?) You want to create an expression that asserts that the row count for each Validation remains the same as the previous `upstream_row_count`, but since there is no previous `upstream_row_count` you need to provide a value that matches what the Expectation you are creating will find. +Say you are creating additional Expectations for the data that you used in the [Quickstart guide](tutorials/quickstart/quickstart.md). You want to create an expression that asserts that the row count for each Validation remains the same as the previous `upstream_row_count`, but since there is no previous `upstream_row_count` you need to provide a value that matches what the Expectation you are creating will find. To do so, you would first edit your existing (or create a new) Expectation Suite using the CLI. This will open a Jupyter Notebook. After running the first cell, you will have access to a Validator object named `validator` that you can use to add new Expectations to the Expectation Suite. diff --git a/docs/docusaurus/docs/terms/profiler.md b/docs/docusaurus/docs/terms/profiler.md index e17a6ac2f311..2354f8bf840d 100644 --- a/docs/docusaurus/docs/terms/profiler.md +++ b/docs/docusaurus/docs/terms/profiler.md @@ -17,7 +17,7 @@ A Profiler generates ### Features and promises -A Profiler creates a starting point for quickly generating Expectations. For example, during the [Getting Started Tutorial](../tutorials/getting_started/tutorial_overview.md), Great Expectations uses the `UserConfigurableProfiler` to demonstrate important features of Expectations by creating and validating an that has several kinds of Expectations built from a small sample of data. +A Profiler creates a starting point for quickly generating Expectations. There are several Profilers included with Great Expectations; conceptually, each Profiler is a checklist of questions which will generate an Expectation Suite when asked of a Batch of data. diff --git a/docs/docusaurus/docs/tutorials/getting_started/tutorial_connect_to_data.md b/docs/docusaurus/docs/tutorials/getting_started/tutorial_connect_to_data.md deleted file mode 100644 index aa77e2bb1ac6..000000000000 --- a/docs/docusaurus/docs/tutorials/getting_started/tutorial_connect_to_data.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -title: 'Tutorial, Step 2: Connect to data' ---- -import UniversalMap from '/docs/images/universal_map/_universal_map.mdx'; -import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; - - - -:::note Prerequisites - -- Completed [Step 1: Setup](./tutorial_setup.md) of this tutorial. - -::: - -In Step 1: Setup, we created a . Now that we have that Data Context, you'll want to connect to your actual data. In Great Expectations, simplify these connections by managing and providing a consistent, cross-platform API for referencing data. 
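Tying this back to the Evaluation Parameter passage above: a hedged sketch of supplying a temporary initial value from inside that notebook might look like the following, where `10000` is only an illustrative stand-in for whatever row count your data actually has and `validator` is the object provided by the notebook's first cell:

```python
# `validator` is assumed to be the Validator created by the suite-editing notebook.
validator.expect_table_row_count_to_equal(
    value={
        "$PARAMETER": "upstream_row_count",      # reference a prior run once one exists
        "$PARAMETER.upstream_row_count": 10000,  # temporary initial value for now
    }
)
```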
- -### Create a Datasource with the CLI -Let's create and configure your first Datasource: a connection to the data directory we've provided in the repo. This could also be a database connection, but because our tutorial data consists of .CSV files we're just using a simple file store. - -Start by using the to run the following command from your `gx_tutorials` directory: - -````console -great_expectations datasource new -```` - -You will then be presented with a choice that looks like this: - -````console -What data would you like Great Expectations to connect to? - 1. Files on a filesystem (for processing with Pandas or Spark) - 2. Relational database (SQL) -:1 -```` - -The only difference is that we've included a "1" after the colon and you haven't typed anything in answer to the prompt, yet. - -As we've noted before, we're working with .CSV files. So you'll want to answer with `1` and hit enter. - -The next prompt you see will look like this: -````console -What are you processing your files with? - 1. Pandas - 2. PySpark -:1 -```` - -For this tutorial we will use Pandas to process our files, so again answer with `1` and press enter to continue. - -:::note -When you select `1. Pandas` from the above list, you are specifying your Datasource's . Although the tutorial uses Pandas, Spark and SqlAlchemy are also supported as Execution Engines. -::: - -We're almost done with the CLI! You'll be prompted once more, this time for the path of the directory where the data files are located. The prompt will look like: - -````console -Enter the path of the root directory where the data files are stored. If files are on local disk -enter a path relative to your current working directory or an absolute path. -:data -```` - -The data that this tutorial uses is stored in `gx_tutorials/data`. Since we are working from the `gx_tutorials` directory, you only need to enter `data` and hit return to continue. - -This will now **open up a new Jupyter Notebook** to complete the Datasource configuration. Your console will display a series of messages as the Jupyter Notebook is loaded, but you can disregard them. The rest of the Datasource setup takes place in the Jupyter Notebook and we won't return to the terminal until that is done. - -### The ```datasource new``` notebook - -The Jupyter Notebook contains some boilerplate code to configure your new Datasource. You can run the entire notebook as-is, but we recommend changing at least the Datasource name to something more specific. - -Edit the second code cell as follows: - -````console -datasource_name = "getting_started_datasource" -```` - -Then **execute all cells in the notebook** in order to save the new Datasource. If successful, the last cell will print a list of all Datasources, including the one you just created. - -**Before continuing, let’s stop and unpack what just happened.** - -### Configuring Datasources - -When you completed those last few steps, you told Great Expectations that: - -+ You want to create a new Datasource called `getting_started_datasource` (or whatever custom name you chose above). -+ You want to use Pandas to read the data from CSV. 
- -Based on that information, the CLI added the following entry into your ```great_expectations.yml``` file, under the `datasources` header: - -```yaml name="tests/integration/docusaurus/tutorials/getting-started/getting_started.py datasource_yaml" -``` - -Please note that due to how data is serialized, the entry in your ```great_expectations.yml``` file may not have these key/value pairs in the same order as the above example. However, they will all have been added. - -
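As a rough, version-dependent sketch of what that `datasources` entry tends to look like (the key names, connector names, and regex below are illustrative assumptions, not the exact output of the CLI), such a configuration can also be pasted into `test_yaml_config` to check it without saving anything:

```python
import great_expectations as gx

context = gx.get_context()

# Approximate shape of the entry the CLI writes for a Pandas filesystem Datasource.
datasource_yaml = """
name: getting_started_datasource
class_name: Datasource
execution_engine:
  class_name: PandasExecutionEngine
data_connectors:
  default_inferred_data_connector_name:
    class_name: InferredAssetFilesystemDataConnector
    base_directory: ../data
    default_regex:
      group_names:
        - data_asset_name
      pattern: (.*)
  default_runtime_data_connector_name:
    class_name: RuntimeDataConnector
    batch_identifiers:
      - default_identifier_name
"""

# test_yaml_config checks the configuration and prints a summary without persisting it.
context.test_yaml_config(yaml_config=datasource_yaml)
```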
    - What does the configuration contain? -
    -

    - -**ExecutionEngine** : The provides backend-specific computing resources that are used to read-in and perform validation on data. For more information on ExecutionEngines, please refer to the following Core Concepts document on ExecutionEngines - -

    -

    - -**DataConnectors** : facilitate access to external data stores, such as filesystems, databases, and cloud storage. The current configuration contains both an InferredAssetFilesystemDataConnector, which allows you to retrieve a batch of data by naming a data asset (which is the filename in our case), and a RuntimeDataConnector, which allows you to retrieve a batch of data by defining a filepath. In this tutorial we will only be using the InferredAssetFilesystemDataConnector. For more information on DataConnectors, please refer to the Core Concepts document on Datasources. - -

    -

    - This Datasource does not require any credentials. However, if you were to connect to a database that requires connection credentials, those would be stored in great_expectations/uncommitted/config_variables.yml. -

    -
    -
    - -In the future, you can modify or delete your configuration by editing your ```great_expectations.yml``` and ```config_variables.yml``` files directly. - -For now, let’s move on to [Step 3: Create Expectations.](./tutorial_create_expectations.md) - diff --git a/docs/docusaurus/docs/tutorials/getting_started/tutorial_create_expectations.md b/docs/docusaurus/docs/tutorials/getting_started/tutorial_create_expectations.md deleted file mode 100644 index 905da3cfd7d7..000000000000 --- a/docs/docusaurus/docs/tutorials/getting_started/tutorial_create_expectations.md +++ /dev/null @@ -1,150 +0,0 @@ ---- -title: 'Tutorial, Step 3: Create Expectations' ---- -import UniversalMap from '/docs/images/universal_map/_universal_map.mdx'; -import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; - - - -:::note Prerequisites - -- Completed [Step 2: Connect to Data](./tutorial_connect_to_data.md) of this tutorial. - -::: - - are the workhorse abstraction in Great Expectations. Each Expectation is a declarative, machine-verifiable assertion about the expected format, content, or behavior of your data. Great Expectations comes with dozens of built-in Expectations, and it’s possible to develop your own custom Expectations, too. - -The will help you create your first . Suites are simply collections of Expectations. In order to create a new suite, we will use the built-in Data Assistant to automatically create an Expectation Suite called `getting_started_expectation_suite_taxi.demo`. - -### Create an Expectation Suite using the CLI - -Since we are using the CLI, you will want to return to your console and the `gx_tutorials` folder. Remember: This is where we initialized our , and the Data Context is our access point to everything else in Great Expectations! - -From the `gx_tutorials` folder, type the following into your terminal: - -```console -great_expectations suite new -``` - -This will bring up the following prompt: - -```console -How would you like to create your Expectation Suite? - 1. Manually, without interacting with a sample Batch of data (default) - 2. Interactively, with a sample Batch of data - 3. Automatically, using a Data Assistant -: 3 -``` - -In this tutorial we will be using a Data Assistant to populate the Expectation Suite, so go ahead and enter `3` and hit enter to continue to the next prompt. - -```console -A batch of data is required to edit the suite - let's help you to specify it. - - -Which data asset (accessible by data connector "default_inferred_data_connector_name") would you like to use? - 1. yellow_tripdata_sample_2019-01.csv - 2. yellow_tripdata_sample_2019-02.csv -: 1 -``` -The Data Assistant will require a to analyze. This prompt is asking us which data to use for that. As you can see, the prompt it is giving corresponds to the .CSV files in our `data` folder. These are the very same ones we configured our to connect to back in Step 2: Connect to Data. - -We're going to choose the first file. If you're wondering why, here's an explanation: Recall that our data directory contains two CSV files: `yellow_tripdata_sample_2019-01` and `yellow_tripdata_sample_2019-02`. - - - `yellow_tripdata_sample_2019-01` contains the January 2019 taxi data. Since we want to build an Expectation Suite based on what we know about our taxi data from the January 2019 data set, we want to use it for profiling. - - `yellow_tripdata_sample_2019-02` contains the February 2019 data, which we consider the “new” data set that we want to validate before using in production. 
We’ll use it later when showing you how to validate data. - -Makes sense, right? - -Go ahead and answer `1` and hit enter now. That will bring up the next prompt. - -```console -Name the new Expectation Suite [yellow_tripdata_sample_2019-01.csv.warning]: getting_started_expectation_suite_taxi.demo -``` - -This prompt is asking for a name for our new Expectation Suite. You can name it whatever you would like, but since this is the Getting Started Tutorial, we're demonstrating how to create an expectation suite, and we're using NYC taxi data we've used `getting_started_expectation_suite_taxi.demo` as the provided name. - -Once you've provided a name for your Expectation Suite and hit enter, you will receive one more prompt. This one will ask if you want to proceed with creating the Expectation Suite as you've specified so far: - -```console -Great Expectations will create a notebook, containing code cells that select from available columns in your dataset and -generate expectations about them to demonstrate some examples of assertions you can make about your data. - -When you run this notebook, Great Expectations will store these expectations in a new Expectation Suite "Name the new Expectation Suite [yellow_tripdata_sample_2019-01.csv.warning]: getting_started_expectation_suite_taxi.demo" here: - - /gx_tutorials/great_expectations/expectations/Name the new Expectation Suite [yellow_tripdata_sample_2019-01/csv/warning]: getting_started_expectation_suite_taxi/demo.json - -Would you like to proceed? [Y/n]: Y -``` - -When you answer with `Y` (or just press enter) Great Expectations will **open a Jupyter Notebook** that helps you populate the new suite. - -### Creating Expectations in Jupyter Notebooks - -Notebooks are a simple way of interacting with the Great Expectations Python API. You could also just write all this in plain Python code, but for convenience, Great Expectations provides you some boilerplate code in notebooks. - -Since notebooks are often less permanent, creating Expectations in a notebook also helps reinforce that the source of truth about Expectations is the Expectation Suite, not the code that generates the Expectations. - -### Let’s take a look through the notebook and see what’s happening in each cell - -**Cell 1** -![Cell1](../../images/getting_started_tutorial/tutorial_create_expectations_cell1.png) - -1. The first cell does several things: It imports all the relevant libraries, loads a Data Context, and creates a `Validator`, which combines a to define your batch of data, and an Expectation Suite. - -**Cell 2** -![Cell2](../../images/getting_started_tutorial/tutorial_create_expectations_cell2.png) - -2. The second cell allows you to specify which columns you want to **ignore** when creating Expectations. For our tutorial, we're going to ensure that the number of passengers recorded in our data is reasonable. To do this, we'll want our Data Assistant to examine the `passenger_count` column and determine just what a reasonable range _is_ based on our January data. **Let’s comment just this one line to include it**: - -```python name="tests/integration/docusaurus/tutorials/getting-started/getting_started.py exclude_column_names no comment" -``` - -**Cell 3** -![Cell3](../../images/getting_started_tutorial/tutorial_create_expectations_cell3_onboarding_assistant.png) - -3. Cell 3 is where you run a Data Assistant. 
In this case, the assistant being used is the Onboarding Assistant, which will analyze the data provided by your Batch Request and create the relevant Expectations to add to your `taxi.demo` suite. - -**Cell 4** -![Cell4](../../images/getting_started_tutorial/tutorial_create_expectations_cell4.png) - -4. The last cell does several things again: It saves the Expectation Suite to disk, runs the against the loaded data batch, and then builds and opens Data Docs, so you can look at the Validation Results. *We will explain the validation step later in the next step, [Step 4: Validate Data](./tutorial_validate_data.md).* - -For purposes of this tutorial, the default values in all of these cells (except for the second one, which we changed to include the `passenger_count` field) provide the configurations and execute the steps that we need them to. So as long as you've made that one change, you're ready to continue. - -**Let’s execute all the cells** and wait for Great Expectations to open a browser window with Data Docs, which will then let us see and edit the Expectations that were composed for us by the Data Assistant. - -### Viewing your Expectations in Data Docs - -Once the Data Assistant is done executing it will open up in your browser automatically. - -Data Docs translate Expectations, , and other metadata into clean, human-readable documentation. Automatically compiling your data documentation from your data tests in the form of Data Docs guarantees that your documentation will never go stale. Feel free to take a moment to browse through the Expectations that the Data Assistant put together from the data that we provided it. - -In particular, take a look at the Expectations that were created for the `passenger_count` field. These are the rules that we will be comparing the February data against when we validate it in step four of this tutorial. - -### How did we get those Expectations? - -You can create and edit Expectations using several different workflows. Using a Data Assistant as we just did is one of the quickest options to get started with an Expectation Suite. - -This Data Assistant connected to your data (using the Datasource you configured in the previous step), took a quick look at the contents of the data, and produced an initial set of Expectations. The Data Assistant considers the following properties, amongst others: - - - the data type of the column - - - simple statistics like column min, max, mean - - - the number of times values occur - - - the number of `NULL` values - -These Expectations are not intended to be very smart. Instead, the goal is to quickly provide some good examples, so that you’re not starting from a blank slate. - -
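For readers who prefer plain Python over the notebook, here is a hedged approximation of what those cells do; the names follow the tutorial text, but the exact signatures may differ between releases:

```python
import great_expectations as gx
from great_expectations.core.batch import BatchRequest

context = gx.get_context()

batch_request = BatchRequest(
    datasource_name="getting_started_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019-01.csv",
)

# Run the Onboarding Data Assistant against the January batch; leaving
# "passenger_count" out of exclude_column_names keeps it in scope, as in Cell 2.
result = context.assistants.onboarding.run(
    batch_request=batch_request,
    exclude_column_names=[],
)

# Turn the assistant's findings into the Expectation Suite used by the tutorial.
# (Newer releases may prefer context.add_or_update_expectation_suite.)
suite = result.get_expectation_suite(
    expectation_suite_name="getting_started_expectation_suite_taxi.demo"
)
context.save_expectation_suite(expectation_suite=suite)
```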
    - Creating Custom Expectations -
    -

    - Later, you should also take a look at other workflows for creating Custom Expectations. Creating Custom Expectations is an active area of work in the Great Expectations community. Stay tuned for improvements over time. -

    -
    -
    - -For the purposes of this tutorial, the Expectations created by the Data Assistant are all we need. On to [Step 4: Validate your data](./tutorial_validate_data.md)! diff --git a/docs/docusaurus/docs/tutorials/getting_started/tutorial_overview.md b/docs/docusaurus/docs/tutorials/getting_started/tutorial_overview.md deleted file mode 100644 index c63769511c15..000000000000 --- a/docs/docusaurus/docs/tutorials/getting_started/tutorial_overview.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -title: Getting started with Great Expectations ---- -import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -import UniversalMap from '/docs/images/universal_map/_universal_map.mdx'; - -Welcome to the Great Expectations getting started tutorial! This tutorial will help you set up your first local deployment of Great Expectations. This deployment will contain a small that we will use to some sample data. We'll also introduce important concepts, with links to detailed material you can dig into later. - -:::tip -The steps described in this tutorial assume you are installing Great Expectations version 0.13.8 or above. - -For a tutorial for older versions of Great Expectations, please see older versions of this documentation, which can be found [here](https://docs.greatexpectations.io/en/latest/guides/tutorials.html). -::: - -### This tutorial will walk you through the following steps - - - - - - - - - - - - - - - - - - -
    - Setup - -

    Setup

    -

    - -First, we will make sure you have Great Expectations installed and show you how to initialize a . - -

    -
    - Connect to Data - -

    Connect to Data

    -

    - -Then you will learn how to configure a to connect to your data. - -

    -
    - Create Expectations - -

    Create Expectations

    -

    - -You will then create your first Expectation Suite using the built-in automated . You'll also take your first look at , where you will be able to see the that were created. - -

    -
    - Validate Data - -

    Validate Data

    -

    - -Finally, we will show you how to use this Expectation Suite to Validate a new batch of data, and take a deeper look at the Data Docs which will show your . - -

    -
    - -But before we dive into the first step, let's bring you up to speed on the problem we are going to address in this tutorial, and the data that we'll be using to illustrate it. - -### The data problem we're solving in this tutorial - -In this tutorial we will be looking at two sets of data representing the same information over different periods of time. We will use the values of the first set of data to populate the rules that we expect this data to follow in the future. We will then use these Expectations to determine if there is a problem with the second set of data. - -The data we're going to use for this tutorial is the [NYC taxi data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). This is an open data set which is updated every month. Each record in the data corresponds to one taxi ride and contains information such as the pick-up and drop-off location, the payment amount, and the number of passengers, among others. - -In this tutorial, we provide two CSV files, each with a 10,000 row sample of the Yellow Taxi Trip Records set: - -- **yellow_tripdata_sample_2019-01.csv**: a sample of the January 2019 taxi data - -- **yellow_tripdata_sample_2019-02.csv**: a sample of the February 2019 taxi data - -For purposes of this tutorial, we are treating the January 2019 taxi data as our "current" data, and the February 2019 taxi data as "future" data that we have not yet looked at. We will use Great Expectations to build a profile of the January data and then use that profile to check for any unexpected data quality issues in the February data. In a real-life scenario, this would ensure that any problems with the February data would be caught (so it could be dealt with) before the February data is used in a production application! - -It should be noted that in the tutorial we only have one month's worth of "current" data. However, you can use Multi-Batch Profilers to build profiles of multiple past or current sets of data. Doing so will generally result in a more accurate data profile but for this small example a single set of "current" data will suffice. - -### Getting started with the Getting Started Tutorial - -Now that you have the background for the data we're using and what we want to do with it, we're ready to start the tutorial in earnest. - -Remember the icons for the four steps we'll be going through? - - - -Great! You should know: The icon associated with each of these steps will also be displayed on any related documentation. So if you do follow links into more detailed discussions of anything we introduce you to, you will be able to find your way back to the step you were on with ease. - -And now it looks like you're ready to move on to [Step 1: Setup.](./tutorial_setup.md) diff --git a/docs/docusaurus/docs/tutorials/getting_started/tutorial_review.md b/docs/docusaurus/docs/tutorials/getting_started/tutorial_review.md deleted file mode 100644 index 97a33c9e45be..000000000000 --- a/docs/docusaurus/docs/tutorials/getting_started/tutorial_review.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -title: 'Review and next steps' ---- -import UniversalMap from '/docs/images/universal_map/_universal_map.mdx'; -import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; - - - -:::note Prerequisites - -- Completed [Step 4: Validate Data](./tutorial_validate_data.md) of this tutorial. - -::: - -### Review -In this tutorial we've taken you through the four steps you need to be able to perform to use Great Expectations. 
- -Let's review each of these steps and take a look at the important concepts and features we used.
-
-**Step 1: Setup**
-
-You installed Great Expectations and initialized your Data Context.
-
-- **Data Context**: The folder structure that contains the entirety of your Great Expectations project. It is also the entry point for accessing all the primary methods for creating elements of your project, configuring those elements, and working with the metadata for your project.
-- **CLI**: The Command Line Interface for Great Expectations. The CLI provides helpful utilities for deploying and configuring Data Contexts, as well as a few other convenience methods.
-**Step 2: Connect to Data**
-
-You created and configured your Datasource.
-
-- **Datasource**: An object that brings together a way of interacting with data (an Execution Engine) and a way of accessing that data (a Data Connector). Datasources are used to obtain Batches for Validators, Expectation Suites, and Profilers.
-- **Jupyter Notebooks**: These notebooks are launched by some processes in the CLI. They provide useful boilerplate code for everything from configuring a new Datasource to building an Expectation Suite to running a Checkpoint.
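For a concrete picture of this step, here is a minimal sketch of connecting to data from Python. It assumes the fluent `pandas_default` datasource used by the Quickstart touched later in this patch series, rather than the CLI-configured Datasource the tutorial itself walks through, and it points at the same sample taxi CSV:

```python
import great_expectations as gx

context = gx.get_context()

# Assumes the fluent `pandas_default` datasource; the tutorial instead configures
# a Datasource through the CLI and Jupyter Notebook boilerplate.
validator = context.sources.pandas_default.read_csv(
    "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv"
)
```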
-**Step 3: Create Expectations**
-
-You used the automatic Profiler to build an Expectation Suite.
-
-- **Expectation Suite**: A collection of Expectations.
-- **Expectation**: A verifiable assertion about data. Great Expectations is a framework for defining Expectations and running them against your data. In the tutorial's example, we asserted that NYC taxi rides should have a minimum of one passenger. When we ran that Expectation against our second set of data, Great Expectations reported back that some records in the new data indicated a ride with zero passengers, which failed to meet this Expectation.
-- **Profiler**: A tool that automatically generates Expectations from a Batch of data.
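As a minimal sketch, the one-passenger-minimum assertion described above looks like this when expressed against a Validator (assuming `validator` already wraps a Batch of the January taxi data):

```python
# Assumes `validator` already wraps a Batch of the January taxi data.
validator.expect_column_values_to_not_be_null("passenger_count")
validator.expect_column_values_to_be_between("passenger_count", min_value=1)
```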
-**Step 4: Validate Data**
-
-You created a Checkpoint which you used to validate new data. You then viewed the Validation Results in Data Docs.
-
    - -- ****: An object that uses a Validator to run an Expectation Suite against a batch of data. Running a Checkpoint produces Validation Results for the data it was run on. -- ****: A report generated from an Expectation Suite being run against a batch of data. The Validation Result itself is in JSON and is rendered as Data Docs. -- ****: Human readable documentation that describes Expectations for data and its Validation Results. Data docs can be generated both from Expectation Suites (describing our Expectations for the data) and also from Validation Results (describing if the data meets those Expectations). - -### Going forward - -Your specific use case will no doubt differ from that of our tutorial. However, the four steps you'll need to perform in order to get Great Expectations working for you will be the same. Setup, connect to data, create Expectations, and validate data. That's all there is to it! As long as you can perform these four steps you can have Great Expectations working to validate data for you. - -For those who only need to know the basics in order to make Great Expectations work our documentation include an Overview reference for each step. - -For those who prefer working from examples, we have "How to" guides which show working examples of how to configure objects from Great Expectations according to specific use cases. You can find these in the table of contents under the category that corresponds to when you would need to do so. Or, if you want a broad overview of the options for customizing your deployment we also provide a [reference document on ways to customize your deployment](../../reference/customize_your_deployment.md). - - diff --git a/docs/docusaurus/docs/tutorials/getting_started/tutorial_setup.md b/docs/docusaurus/docs/tutorials/getting_started/tutorial_setup.md deleted file mode 100644 index 2eaac1355973..000000000000 --- a/docs/docusaurus/docs/tutorials/getting_started/tutorial_setup.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -title: 'Tutorial, Step 1: Setup' ---- -import UniversalMap from '/docs/images/universal_map/_universal_map.mdx'; -import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -import VersionSnippet from '/docs/components/versions/_gx_version_code_box.mdx' - - - -:::note Prerequisites -In order to work with Great Expectations, you will need: - -- A working Python install (3.7 to 3.10) -- The ability to pip install for Python - - Note: A best practice would be to do this in a virtual environment! -- A working Git install -- A working internet browser install (for viewing Data Docs in steps 3 and 4). - -If you need assistance with setting up any of these utilities, we have links to their documentation on our page for . -::: - -### Setting up the tutorial data - -The first thing we'll need is a copy of the data that this tutorial will work with. Fortunately, we've already put that data into a convenient repository that you can clone to your machine. - -Clone the [gx_tutorials](https://github.com/great-expectations/gx_tutorials) repository to download the data. This repository also contains directories with the final versions of the tutorial, which you can use for reference. 
- -To clone the repository and go into the directory you'll be working from, start from your working directory and enter the following commands into your terminal: - -```console -git clone https://github.com/great-expectations/gx_tutorials -cd gx_tutorials -``` - -The repository you cloned contains several directories with final versions for this and our other tutorials. The final version for this tutorial is located in the `getting_started_tutorial_final_v3_api` folder. You can use the final version as a reference or to explore a complete deployment of Great Expectations, but **you do not need it for this tutorial**. - -### Install Great Expectations and dependencies - -Great Expectations requires Python 3 and can be installed using pip. If you haven’t already, install Great Expectations by running: - -```bash -pip install great_expectations -``` - -You can confirm that installation worked by running - -```bash -great_expectations --version -``` - -This should return something like: - - - -For detailed installation instructions, see [How to install Great Expectations locally](../../guides/setup/installation/local.md). - -
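The same check can also be made from Python, as a quick sketch (assuming the install above succeeded):

```python
# Assumes `pip install great_expectations` completed successfully.
import great_expectations as gx

print(gx.__version__)
```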
-**Other deployment patterns**
-
-This tutorial deploys Great Expectations locally. Note that other options (e.g. running Great Expectations on an EMR Cluster) are also available. You can find more information in the [Reference Architectures](../../deployment_patterns/index.md) section of the documentation.
-
    - -### Create a Data Context - -In Great Expectations, your manages your project configuration, so let’s go and create a Data Context for our tutorial project! - -When you installed Great Expectations, you also installed the Great Expectations command line interface (). It provides helpful utilities for deploying and configuring Data Contexts, plus a few other convenience methods. - -To initialize your Great Expectations deployment for the project, run this command in the terminal from the `gx_tutorials` directory: - -```console -great_expectations init -``` - -You should see this: -```console - ___ _ ___ _ _ _ - / __|_ _ ___ __ _| |_ | __|_ ___ __ ___ __| |_ __ _| |_(_)___ _ _ ___ -| (_ | '_/ -_) _` | _| | _|\ \ / '_ \/ -_) _| _/ _` | _| / _ \ ' \(_-< - \___|_| \___\__,_|\__| |___/_\_\ .__/\___\__|\__\__,_|\__|_\___/_||_/__/ - |_| - ~ Always know what to expect from your data ~ - -Let's create a new Data Context to hold your project configuration. - -Great Expectations will create a new directory with the following structure: - - great_expectations - |-- great_expectations.yml - |-- expectations - |-- checkpoints - |-- plugins - |-- .gitignore - |-- uncommitted - |-- config_variables.yml - |-- data_docs - |-- validations - -OK to proceed? [Y/n]: -``` - -When you see the prompt, press enter to continue. Great Expectations will build out the directory structure and configuration files it needs for you to proceed. All of these together are your Data Context. - -:::note - -Your Data Context will contain the entirety of your Great Expectations project. It is also the entry point for accessing all of the primary methods for creating elements of your project, configuring those elements, and working with the metadata for your project. That is why the first thing you do when working with Great Expectations is to initialize a Data Context! - -[You can follow this link to read more about Data Contexts.](../../terms/data_context.md) - -::: - -
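As a quick sketch, the Data Context you just initialized can also be loaded from Python when working inside the project directory:

```python
import great_expectations as gx

# Assumes this is run from inside the directory where `great_expectations init` was executed.
context = gx.get_context()
print(context.root_directory)
```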
-**About the great_expectations directory structure**
-
-After running the init command, your great_expectations directory will contain all of the important components of a local Great Expectations deployment. This is what the directory structure looks like:
-
-- great_expectations.yml contains the main configuration of your deployment.
-- The `expectations` directory stores all your Expectations as JSON files. If you want to store them somewhere else, you can change that later.
-- The plugins/ directory holds code for any custom plugins you develop as part of your deployment.
-- The uncommitted/ directory contains files that shouldn’t live in version control. It has a .gitignore configured to exclude all its contents from version control. The main contents of the directory are:
-  - uncommitted/config_variables.yml, which holds sensitive information, such as database credentials and other secrets.
-  - uncommitted/data_docs, which contains Data Docs generated from Expectations, Validation Results, and other metadata.
-  - uncommitted/validations, which holds Validation Results generated by Great Expectations.
-
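If you want to double-check that layout, here is a small sketch (assuming your current working directory is the one where you ran `init`):

```python
from pathlib import Path

# Assumes the current working directory is where `great_expectations init` was run.
root = Path("great_expectations")
for item in ("great_expectations.yml", "expectations", "checkpoints", "plugins", "uncommitted"):
    print(item, "present" if (root / item).exists() else "missing")
```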
    - -Congratulations, that's all there is to Step 1: Setup with Great Expectations. You've finished the first step! Let's move on to [Step 2: Connect to Data](./tutorial_connect_to_data.md) diff --git a/docs/docusaurus/docs/tutorials/getting_started/tutorial_validate_data.md b/docs/docusaurus/docs/tutorials/getting_started/tutorial_validate_data.md deleted file mode 100644 index 9489dd82c067..000000000000 --- a/docs/docusaurus/docs/tutorials/getting_started/tutorial_validate_data.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -title: 'Tutorial, Step 4: Validate data' ---- -import UniversalMap from '/docs/images/universal_map/_universal_map.mdx'; -import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; - - - -:::note Prerequisites - -- Completed [Step 3: Create Expectations](./tutorial_create_expectations.md) of this tutorial. - -::: - -### Set up a Checkpoint - -Let’s set up our first ! - -A Checkpoint runs an against a (or ). Running a Checkpoint produces . Checkpoints can also be configured to perform additional . - -For the purposes of this tutorial, the Checkpoint we create will run the Expectation Suite we previously configured against the data we provide. We will use it to verify that there are no unexpected changes in the February NYC taxi data compared to what our observed in the January NYC taxi data. - -**Go back to your terminal** and shut down the Jupyter Notebook, if you haven’t yet. Then run the following command: - -```console -great_expectations checkpoint new getting_started_checkpoint -``` - -This will **open a Jupyter Notebook** that will allow you to complete the configuration of your Checkpoint. - -The Jupyter Notebook contains some boilerplate code that allows you to configure a new Checkpoint. The second code cell is pre-populated with an arbitrarily chosen Batch Request and Expectation Suite to get you started. Edit the `data_asset_name` to reference the data we want to validate (the February data), as follows: - -```python name="tests/integration/docusaurus/tutorials/getting-started/getting_started.py checkpoint_yaml_config" -``` - -You can then execute all cells in the notebook in order to store the Checkpoint to your Data Context. - -#### What just happened? - -- `getting_started_checkpoint` is the name of your new Checkpoint. - -- The Checkpoint uses `getting_started_expectation_suite_taxi.demo` as its primary Expectation Suite. - -- You configured the Checkpoint to validate the `yellow_tripdata_sample_2019-02.csv` (i.e. our February data) file. - -### How to run validation and inspect your Validation Results - -In order to build and get your results in a nice, human-readable format, you can simply uncomment and run the last cell in the notebook. This will open Data Docs, where you can click on the latest run to see the Validation Results page for this Checkpoint run. - -![data_docs_failed_validation1](../../images/data_docs_taxi_failed_validation01.png) - -You’ll see that the test suite failed when you ran it against the February data. - -#### What just happened? Why did it fail?? Help!? - -We ran the Checkpoint and it successfully failed! **Wait - what?** Yes, that’s correct, this indicates that the February data has data quality issues, which means we want the Validation to fail. - -Click on the highlighted row to access the Validation Results page, which will tell us specifically what is wrong with the February data. 
- -![data_docs_failed_validation2](../../images/data_docs_taxi_failed_validation02.png) - -On the Validation Results page, you will see that the Validation of the staging data *failed* because the set of *Observed Values* in the `passenger_count` column contained the value `0`! This violates our Expectation, which makes the validation fail. - -**And this is it!** - -We have successfully created an Expectation Suite based on historical data, and used it to detect an issue with our new data. **Congratulations! You have now completed the “Getting started with Great Expectations” tutorial.** diff --git a/docs/docusaurus/static/_redirects b/docs/docusaurus/static/_redirects index 6f56a4e86eb2..67f730ac33f3 100644 --- a/docs/docusaurus/static/_redirects +++ b/docs/docusaurus/static/_redirects @@ -231,4 +231,13 @@ /docs/api_docs/methods/great_expectations-data_context-data_context-data_context-datacontext-create /docs/reference/api/great_expectations.util # Redirects for renamed reference docs -/docs/reference/anonymous_usage_statistics /docs/reference/usage_statistics \ No newline at end of file +/docs/reference/anonymous_usage_statistics /docs/reference/usage_statistics + +# Redirect from Getting Started Tutorial to Quickstart +/docs/docusaurus/docs/tutorials/getting_started/ /docs/tutorials/quickstart +/docs/tutorials/getting_started/tutorial_connect_to_data /docs/tutorials/quickstart +/docs/tutorials/getting_started/tutorial_create_expectations /docs/tutorials/quickstart +/docs/tutorials/getting_started/tutorial_overview /docs/tutorials/quickstart +/docs/tutorials/getting_started/tutorial_review /docs/tutorials/quickstart +/docs/tutorials/getting_started/tutorial_setup /docs/tutorials/quickstart +/docs/tutorials/getting_started/tutorial_validate_data /docs/tutorials/quickstart \ No newline at end of file From 81a1249ca784409189b279795183f52c32647cca Mon Sep 17 00:00:00 2001 From: Bill Dirks Date: Wed, 5 Apr 2023 12:13:59 -0700 Subject: [PATCH 40/96] [FEATURE] Add batch metadata to spark add_*_asset methods (#7534) --- .../datasource/fluent/file_path_data_asset.py | 2 +- .../spark_azure_blob_storage_datasource.py | 9 ++- .../datasource/fluent/spark_datasource.py | 63 ++++++++++++++++++- .../fluent/spark_dbfs_datasource.py | 5 ++ .../fluent/spark_dbfs_datasource.pyi | 4 ++ .../fluent/spark_filesystem_datasource.py | 5 ++ .../spark_google_cloud_storage_datasource.py | 9 ++- .../datasource/fluent/spark_s3_datasource.py | 9 ++- ...est_spark_azure_blob_storage_datasource.py | 6 ++ .../fluent/test_spark_datasource.py | 4 -- .../fluent/test_spark_dbfs_datasource.py | 6 ++ .../test_spark_filesystem_datasource.py | 23 +++++++ ...t_spark_google_cloud_storage_datasource.py | 6 ++ .../fluent/test_spark_s3_datasource.py | 6 ++ 14 files changed, 148 insertions(+), 9 deletions(-) diff --git a/great_expectations/datasource/fluent/file_path_data_asset.py b/great_expectations/datasource/fluent/file_path_data_asset.py index 51739df1b346..4950fe079962 100644 --- a/great_expectations/datasource/fluent/file_path_data_asset.py +++ b/great_expectations/datasource/fluent/file_path_data_asset.py @@ -54,6 +54,7 @@ class _FilePathDataAsset(DataAsset): "type", "name", "order_by", + "batch_metadata", "batching_regex", # file_path argument "kwargs", # kwargs need to be unpacked and passed separately "batch_metadata", @@ -227,7 +228,6 @@ def get_batch_list_from_batch_request( fully_specified_batch_request.options.update( batch_definition.batch_identifiers ) - batch_metadata = self._get_batch_metadata_from_batch_request( 
batch_request=fully_specified_batch_request ) diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py index c1bac87764c2..e867a8db4738 100644 --- a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py @@ -12,7 +12,10 @@ from great_expectations.datasource.fluent.data_asset.data_connector import ( AzureBlobStorageDataConnector, ) -from great_expectations.datasource.fluent.interfaces import TestConnectionError +from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, + TestConnectionError, +) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) @@ -128,6 +131,7 @@ def add_csv_asset( name_starts_with: str = "", delimiter: str = "/", order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> CSVAsset: """Adds a CSV DataAsset to the present "SparkAzureBlobStorageDatasource" object. @@ -140,6 +144,8 @@ def add_csv_asset( name_starts_with: Microsoft Azure Blob Storage object name prefix delimiter: Microsoft Azure Blob Storage object name delimiter order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default + batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any + batches created from the asset. """ order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( @@ -148,6 +154,7 @@ def add_csv_asset( header=header, inferSchema=infer_schema, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) asset._data_connector = AzureBlobStorageDataConnector.build_data_connector( datasource_name=self.name, diff --git a/great_expectations/datasource/fluent/spark_datasource.py b/great_expectations/datasource/fluent/spark_datasource.py index adc676db8528..c314d1cafc20 100644 --- a/great_expectations/datasource/fluent/spark_datasource.py +++ b/great_expectations/datasource/fluent/spark_datasource.py @@ -1,12 +1,15 @@ from __future__ import annotations +import dataclasses import logging +from pprint import pformat as pf from typing import ( TYPE_CHECKING, ClassVar, Dict, Generic, List, + Optional, Type, TypeVar, ) @@ -14,6 +17,7 @@ import pydantic from typing_extensions import Literal +import great_expectations.exceptions as gx_exceptions from great_expectations.core.batch_spec import RuntimeDataBatchSpec from great_expectations.datasource.fluent.constants import ( _DATA_CONNECTOR_NAME, @@ -21,6 +25,7 @@ from great_expectations.datasource.fluent.interfaces import ( Batch, BatchRequest, + BatchRequestOptions, DataAsset, Datasource, ) @@ -104,6 +109,45 @@ def _get_reader_options_include(self) -> set[str] | None: """Spark DataFrameAsset does not implement "_get_reader_options_include()" method, because DataFrame is already available.""" ) + def build_batch_request( + self, options: Optional[BatchRequestOptions] = None + ) -> BatchRequest: + if options: + actual_keys = set(options.keys()) + raise gx_exceptions.InvalidBatchRequestError( + "Data Assets associated with SparkDatasource can only contain a single batch,\n" + "therefore BatchRequest options cannot be supplied. 
BatchRequest options with keys:\n" + f"{actual_keys}\nwere passed.\n" + ) + + return BatchRequest( + datasource_name=self.datasource.name, + data_asset_name=self.name, + options={}, + ) + + def _validate_batch_request(self, batch_request: BatchRequest) -> None: + """Validates the batch_request has the correct form. + + Args: + batch_request: A batch request object to be validated. + """ + if not ( + batch_request.datasource_name == self.datasource.name + and batch_request.data_asset_name == self.name + and not batch_request.options + ): + expect_batch_request_form = BatchRequest( + datasource_name=self.datasource.name, + data_asset_name=self.name, + options={}, + ) + raise gx_exceptions.InvalidBatchRequestError( + "BatchRequest should have form:\n" + f"{pf(dataclasses.asdict(expect_batch_request_form))}\n" + f"but actually has form:\n{pf(dataclasses.asdict(batch_request))}\n" + ) + def get_batch_list_from_batch_request( self, batch_request: BatchRequest ) -> list[Batch]: @@ -167,9 +211,26 @@ class SparkDatasource(_SparkDatasource): def test_connection(self, test_assets: bool = True) -> None: ... - def add_dataframe_asset(self, name: str, dataframe: DataFrame) -> DataFrameAsset: + def add_dataframe_asset( + self, + name: str, + dataframe: DataFrame, + batch_metadata: Optional[BatchMetadata] = None, + ) -> DataFrameAsset: + """Adds a Dataframe DataAsset to this SparkDatasource object. + + Args: + name: The name of the Dataframe asset. This can be any arbitrary string. + dataframe: The Dataframe containing the data for this data asset. + batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any + batches created from the asset. + + Returns: + The DataFameAsset that has been added to this datasource. + """ asset = DataFrameAsset( name=name, dataframe=dataframe, + batch_metadata=batch_metadata or {}, ) return self._add_asset(asset=asset) diff --git a/great_expectations/datasource/fluent/spark_dbfs_datasource.py b/great_expectations/datasource/fluent/spark_dbfs_datasource.py index 03afbb60b26f..4a754543f5e3 100644 --- a/great_expectations/datasource/fluent/spark_dbfs_datasource.py +++ b/great_expectations/datasource/fluent/spark_dbfs_datasource.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, Sorter, SortersDefinition, ) @@ -42,6 +43,7 @@ def add_csv_asset( header: bool = False, infer_schema: bool = False, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> CSVAsset: """Adds a CSV DataAsset to the present "SparkDBFSDatasource" object. @@ -52,6 +54,8 @@ def add_csv_asset( header: boolean (default False) indicating whether or not first line of CSV file is header line infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default + batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any + batches created from the asset. 
""" order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( @@ -60,6 +64,7 @@ def add_csv_asset( header=header, inferSchema=infer_schema, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) asset._data_connector = DBFSDataConnector.build_data_connector( datasource_name=self.name, diff --git a/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi b/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi index 5936a8072308..ea4e8422a311 100644 --- a/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi @@ -9,6 +9,9 @@ from great_expectations.datasource.fluent import SparkFilesystemDatasource from great_expectations.datasource.fluent.data_asset.data_connector import ( DBFSDataConnector as DBFSDataConnector, ) +from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, +) from great_expectations.datasource.fluent.interfaces import ( SortersDefinition as SortersDefinition, ) @@ -34,4 +37,5 @@ class SparkDBFSDatasource(SparkFilesystemDatasource): header: bool = ..., infer_schema: bool = ..., order_by: Optional[SortersDefinition] = ..., + batch_metadata: Optional[BatchMetadata] = ..., ) -> CSVAsset: ... diff --git a/great_expectations/datasource/fluent/spark_filesystem_datasource.py b/great_expectations/datasource/fluent/spark_filesystem_datasource.py index edd88978524d..28aec3f39893 100644 --- a/great_expectations/datasource/fluent/spark_filesystem_datasource.py +++ b/great_expectations/datasource/fluent/spark_filesystem_datasource.py @@ -13,6 +13,7 @@ FilesystemDataConnector, ) from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, TestConnectionError, ) from great_expectations.datasource.fluent.spark_file_path_datasource import ( @@ -61,6 +62,7 @@ def add_csv_asset( header: bool = False, infer_schema: bool = False, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> CSVAsset: """Adds a CSV DataAsset to the present "SparkFilesystemDatasource" object. @@ -71,6 +73,8 @@ def add_csv_asset( header: boolean (default False) indicating whether or not first line of CSV file is header line infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default + batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any + batches created from the asset. 
""" order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( @@ -79,6 +83,7 @@ def add_csv_asset( header=header, inferSchema=infer_schema, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) asset._data_connector = FilesystemDataConnector.build_data_connector( datasource_name=self.name, diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py index 357782fa736b..10554f0b18ef 100644 --- a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py @@ -15,7 +15,10 @@ from great_expectations.datasource.fluent.data_asset.data_connector import ( GoogleCloudStorageDataConnector, ) -from great_expectations.datasource.fluent.interfaces import TestConnectionError +from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, + TestConnectionError, +) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) @@ -136,6 +139,7 @@ def add_csv_asset( delimiter: str = "/", max_results: int = 1000, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> CSVAsset: """Adds a CSV DataAsset to the present "SparkGoogleCloudStorageDatasource" object. @@ -148,6 +152,8 @@ def add_csv_asset( delimiter (str): Google Cloud Storage object name delimiter max_results (int): Google Cloud Storage max_results (default is 1000) order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default + batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any + batches created from the asset. """ order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( @@ -156,6 +162,7 @@ def add_csv_asset( header=header, inferSchema=infer_schema, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) asset._data_connector = GoogleCloudStorageDataConnector.build_data_connector( datasource_name=self.name, diff --git a/great_expectations/datasource/fluent/spark_s3_datasource.py b/great_expectations/datasource/fluent/spark_s3_datasource.py index 316942cbfbbf..0e308d8d2713 100644 --- a/great_expectations/datasource/fluent/spark_s3_datasource.py +++ b/great_expectations/datasource/fluent/spark_s3_datasource.py @@ -16,7 +16,10 @@ from great_expectations.datasource.fluent.data_asset.data_connector import ( S3DataConnector, ) -from great_expectations.datasource.fluent.interfaces import TestConnectionError +from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, + TestConnectionError, +) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) @@ -111,6 +114,7 @@ def add_csv_asset( delimiter: str = "/", max_keys: int = 1000, order_by: Optional[SortersDefinition] = None, + batch_metadata: Optional[BatchMetadata] = None, ) -> CSVAsset: """Adds a CSV DataAsset to the present "SparkS3Datasource" object. @@ -123,6 +127,8 @@ def add_csv_asset( delimiter: S3 delimiter max_keys: S3 max_keys (default is 1000) order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default + batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any + batches created from the asset. 
""" order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) asset = CSVAsset( @@ -131,6 +137,7 @@ def add_csv_asset( header=header, inferSchema=infer_schema, order_by=order_by_sorters, + batch_metadata=batch_metadata or {}, ) asset._data_connector = S3DataConnector.build_data_connector( datasource_name=self.name, diff --git a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py index d4e89ab6bc90..2e8a3c747db9 100644 --- a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py @@ -237,16 +237,19 @@ def test_add_csv_asset_to_datasource( spark_abs_datasource: SparkAzureBlobStorageDatasource, ): mock_list_keys.return_value = object_keys + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", container="my_container", + batch_metadata=asset_specified_metadata, ) assert asset.name == "csv_asset" assert asset.batching_regex.match("random string") is None assert asset.batching_regex.match("alex_20200819_13D0.csv") is None m1 = asset.batching_regex.match("alex_20200819_1300.csv") assert m1 is not None + assert asset.batch_metadata == asset_specified_metadata @pytest.mark.integration @@ -414,10 +417,12 @@ def instantiate_azure_client_spy(self) -> None: instantiate_azure_client_spy, raising=True, ) + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", container="my_container", + batch_metadata=asset_specified_metadata, ) request = asset.build_batch_request( @@ -439,6 +444,7 @@ def instantiate_azure_client_spy(self) -> None: "name": "alex", "timestamp": "20200819", "price": "1300", + **asset_specified_metadata, } assert ( batch.id diff --git a/tests/datasource/fluent/test_spark_datasource.py b/tests/datasource/fluent/test_spark_datasource.py index c04bcb2974a9..8d8f607fd0ce 100644 --- a/tests/datasource/fluent/test_spark_datasource.py +++ b/tests/datasource/fluent/test_spark_datasource.py @@ -74,10 +74,6 @@ def test_dataframe_asset( ) -@pytest.mark.xfail( - strict=True, - reason="this will fail until we add batch_metadata to Spark add_*_asset methods", -) def test_spark_data_asset_batch_metadata( empty_data_context: AbstractDataContext, valid_file_path: pathlib.Path, diff --git a/tests/datasource/fluent/test_spark_dbfs_datasource.py b/tests/datasource/fluent/test_spark_dbfs_datasource.py index c7856221db12..b738715fb5f0 100644 --- a/tests/datasource/fluent/test_spark_dbfs_datasource.py +++ b/tests/datasource/fluent/test_spark_dbfs_datasource.py @@ -99,15 +99,18 @@ def test_construct_spark_dbfs_datasource(spark_dbfs_datasource: SparkDBFSDatasou @pytest.mark.integration def test_add_csv_asset_to_datasource(spark_dbfs_datasource: SparkDBFSDatasource): + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_dbfs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", + batch_metadata=asset_specified_metadata, ) assert asset.name == "csv_asset" assert asset.batching_regex.match("random string") is None assert asset.batching_regex.match("alex_20200819_13D0.csv") is None m1 = asset.batching_regex.match("alex_20200819_1300.csv") assert m1 is not None + assert asset.batch_metadata == asset_specified_metadata 
@pytest.mark.integration @@ -131,9 +134,11 @@ def test_construct_csv_asset_directly(): def test_get_batch_list_from_fully_specified_batch_request( spark_dbfs_datasource: SparkDBFSDatasource, ): + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_dbfs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", + batch_metadata=asset_specified_metadata, ) request = asset.build_batch_request( @@ -155,6 +160,7 @@ def test_get_batch_list_from_fully_specified_batch_request( "name": "alex", "timestamp": "20200819", "price": "1300", + **asset_specified_metadata, } assert ( batch.id diff --git a/tests/datasource/fluent/test_spark_filesystem_datasource.py b/tests/datasource/fluent/test_spark_filesystem_datasource.py index e29ac57f2d9d..bf4bedf13e30 100644 --- a/tests/datasource/fluent/test_spark_filesystem_datasource.py +++ b/tests/datasource/fluent/test_spark_filesystem_datasource.py @@ -403,3 +403,26 @@ def test_get_batch_list_from_batch_request_does_not_modify_input_batch_request( assert request == request_before_call # We get all 12 batches, one for each month of 2018. assert len(batches) == 12 + + +@pytest.mark.unit +def test_add_csv_asset_with_batch_metadata( + spark_filesystem_datasource: SparkFilesystemDatasource, +): + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} + asset = spark_filesystem_datasource.add_csv_asset( + name="csv_asset", + batching_regex=r"yellow_tripdata_sample_(?P\d{4})-(?P\d{2})\.csv", + header=True, + infer_schema=True, + batch_metadata=asset_specified_metadata, + ) + batch_options = {"year": "2018", "month": "05"} + request = asset.build_batch_request(batch_options) + batches = asset.get_batch_list_from_batch_request(request) + assert len(batches) == 1 + assert batches[0].metadata == { + "path": "yellow_tripdata_sample_2018-05.csv", + **batch_options, + **asset_specified_metadata, + } diff --git a/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py b/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py index d856520d652d..fa61caa32184 100644 --- a/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py @@ -206,15 +206,18 @@ def test_add_csv_asset_to_datasource( spark_gcs_datasource: SparkGoogleCloudStorageDatasource, ): mock_list_keys.return_value = object_keys + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_gcs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", + batch_metadata=asset_specified_metadata, ) assert asset.name == "csv_asset" assert asset.batching_regex.match("random string") is None assert asset.batching_regex.match("alex_20200819_13D0.csv") is None m1 = asset.batching_regex.match("alex_20200819_1300.csv") assert m1 is not None + assert asset.batch_metadata == asset_specified_metadata @pytest.mark.integration @@ -372,9 +375,11 @@ def instantiate_gcs_client_spy(self) -> None: instantiate_gcs_client_spy, raising=True, ) + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_gcs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", + batch_metadata=asset_specified_metadata, ) request = asset.build_batch_request( @@ -396,6 +401,7 @@ def instantiate_gcs_client_spy(self) -> None: "name": "alex", "timestamp": "20200819", "price": "1300", + **asset_specified_metadata, } assert ( batch.id diff --git 
a/tests/datasource/fluent/test_spark_s3_datasource.py b/tests/datasource/fluent/test_spark_s3_datasource.py index 4f42864538a5..ed1a9786af68 100644 --- a/tests/datasource/fluent/test_spark_s3_datasource.py +++ b/tests/datasource/fluent/test_spark_s3_datasource.py @@ -126,17 +126,20 @@ def test_construct_spark_s3_datasource(spark_s3_datasource: SparkS3Datasource): @pytest.mark.integration def test_add_csv_asset_to_datasource(spark_s3_datasource: SparkS3Datasource): + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_s3_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", header=True, infer_schema=True, + batch_metadata=asset_specified_metadata, ) assert asset.name == "csv_asset" assert asset.batching_regex.match("random string") is None assert asset.batching_regex.match("alex_20200819_13D0.csv") is None m1 = asset.batching_regex.match("alex_20200819_1300.csv") assert m1 is not None + assert asset.batch_metadata == asset_specified_metadata @pytest.mark.integration @@ -234,11 +237,13 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( def test_get_batch_list_from_fully_specified_batch_request( spark_s3_datasource: SparkS3Datasource, ): + asset_specified_metadata = {"asset_level_metadata": "my_metadata"} asset = spark_s3_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", header=True, infer_schema=True, + batch_metadata=asset_specified_metadata, ) request = asset.build_batch_request( @@ -260,6 +265,7 @@ def test_get_batch_list_from_fully_specified_batch_request( "name": "alex", "timestamp": "20200819", "price": "1300", + **asset_specified_metadata, } assert ( batch.id From fa1b6a11876accc6bb53f677660c91c1a2a158f5 Mon Sep 17 00:00:00 2001 From: William Shin Date: Wed, 5 Apr 2023 13:08:49 -0700 Subject: [PATCH 41/96] [MAINTENANCE] FluentDatasources - Quickstart Snippets converted to Named Snippets (#7550) --- .../docs/tutorials/quickstart/quickstart.md | 60 ++++--------------- .../tutorials/quickstart/quickstart.py | 21 ++++++- 2 files changed, 29 insertions(+), 52 deletions(-) diff --git a/docs/docusaurus/docs/tutorials/quickstart/quickstart.md b/docs/docusaurus/docs/tutorials/quickstart/quickstart.md index bad490e9e428..c81c8a406640 100644 --- a/docs/docusaurus/docs/tutorials/quickstart/quickstart.md +++ b/docs/docusaurus/docs/tutorials/quickstart/quickstart.md @@ -32,32 +32,7 @@ With GX you can get up and running with just a few lines of code. 
The full proc pip install great_expectations ``` -```python title="Python code" -import great_expectations as gx - -# Set up -context = gx.get_context() - -# Connect to data -validator = context.sources.pandas_default.read_csv( - "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" -) - -# Create Expectations -validator.expect_column_values_to_not_be_null("pickup_datetime") -validator.expect_column_values_to_be_between("passenger_count", auto=True) - -# Validate data -checkpoint = gx.checkpoint.SimpleCheckpoint( - name="my_quickstart_checkpoint", - data_context=context, - validator=validator, -) -checkpoint_result = checkpoint.run() - -# View results -validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0] -context.open_data_docs(resource_identifier=validation_result_identifier) +```python name="tutorials/quickstart/quickstart.py all" ``` In the following steps we'll break down exactly what is happening here so that you can follow along and perform a Validation yourself. @@ -83,16 +58,14 @@ For the rest of this tutorial we will be working with Python code in a Jupyter N The code to import the `great_expectations` module is: -```python title="Python code" -import great_expectations as gx +```python name="tutorials/quickstart/quickstart.py import_gx" ``` #### 1.3 Instantiate a Data Context We will get a `DataContext` object with the following code: -```python title="Python code" -context = gx.get_context() +```python name="tutorials/quickstart/quickstart.py get_context" ``` The Data Context will provide you with access to a variety of utility and convenience methods. It is the entry point for using the GX Python API. @@ -101,10 +74,8 @@ The Data Context will provide you with access to a variety of utility and conven For the purpose of this guide, we will connect to `.csv` data stored in our GitHub repo: -```python title="Python code" -validator = context.sources.pandas_default.read_csv( - "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" -) + +```python name="tutorials/quickstart/quickstart.py connect_to_data" ``` The above code uses our Data Context's default Datasource for Pandas to access the `.csv` data in the file at the provided `path`. @@ -117,9 +88,7 @@ In this guide, we will define two Expectations, one based on our domain knowledg The code we will use for this is: -```python title="Python code" -validator.expect_column_values_to_not_be_null("pickup_datetime") -validator.expect_column_values_to_be_between("passenger_count", auto=True) +```python name="tutorials/quickstart/quickstart.py create_expectation" ``` With the Expectation defined above, we are stating that we _expect_ the column `pickup_datetime` to always be populated. That is: none of the column's values should be null. @@ -133,28 +102,21 @@ In the future, you may define numerous Expectations about a Validator's associat Now that we have defined our Expectations it is time for GX to introspect our data and see if it corresponds to what we told GX to expect. To do this, we define a Checkpoint (which will allow us to repeat the Validation in the future). 
-```python title="Python code" -checkpoint = gx.checkpoint.SimpleCheckpoint( - name="my_quickstart_checkpoint", - data_context=context, - validator=validator, -) +```python name="tutorials/quickstart/quickstart.py create_checkpoint" ``` Once we have created the Checkpoint, we will run it and get back the results from our Validation. -```python title="Python code" -checkpoint_result = checkpoint.run() +```python name="tutorials/quickstart/quickstart.py run_checkpoint" ``` #### 4.2 Review your results Great Expectations provides a friendly, human-readable way to view the results of Validations: Data Docs. Our Checkpoint will have automatically compiled new Data Docs to include the results of the Validation we ran, so we can view them immediately: -```python title="Python code" -validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0] -context.open_data_docs(resource_identifier=validation_result_identifier) +```python name="tutorials/quickstart/quickstart.py view_results" ``` + ### 5. (Optional) Great Expectations Cloud By completing the Quickstart guide, you now have the opportunity to join the Cloud Early Access program and explore how Great Expectations Cloud visualizes and creates shareable links for anyone on your team. The GX Cloud interface significantly simplifies collaboration between data teams and domain experts. @@ -173,4 +135,4 @@ Now that you've seen how easy it is to implement the GX workflow, it is time to ### Initializing, instantiating, and saving a Data Context - \ No newline at end of file + diff --git a/tests/integration/docusaurus/tutorials/quickstart/quickstart.py b/tests/integration/docusaurus/tutorials/quickstart/quickstart.py index a953f9f1d2d8..20a78285d5ef 100644 --- a/tests/integration/docusaurus/tutorials/quickstart/quickstart.py +++ b/tests/integration/docusaurus/tutorials/quickstart/quickstart.py @@ -1,28 +1,43 @@ +# +# import great_expectations as gx +# + # Set up +# context = gx.get_context() +# # Connect to data +# validator = context.sources.pandas_default.read_csv( "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" ) +# # Create Expectations +# validator.expect_column_values_to_not_be_null("pickup_datetime") validator.expect_column_values_to_be_between("passenger_count", auto=True) +# # Validate data +# checkpoint = gx.checkpoint.SimpleCheckpoint( name="my_quickstart_checkpoint", data_context=context, validator=validator, ) +# + +# checkpoint_result = checkpoint.run() +# # View results +# validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0] context.open_data_docs(resource_identifier=validation_result_identifier) - -# Save the Data Context for future use -context.convert_to_file_context() +# +# From 7caa1fddb4588908e835d2a2ee0b03cb456fd445 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Wed, 5 Apr 2023 17:02:25 -0400 Subject: [PATCH 42/96] [MAINTENANCE] Simplify `GXCloudStoreBackend._has_key` check (#7561) --- .../store/gx_cloud_store_backend.py | 29 +++++++++---------- great_expectations/exceptions/exceptions.py | 6 ++++ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/great_expectations/data_context/store/gx_cloud_store_backend.py b/great_expectations/data_context/store/gx_cloud_store_backend.py index f6c9e0735785..3b3eedfb9026 100644 --- a/great_expectations/data_context/store/gx_cloud_store_backend.py +++ b/great_expectations/data_context/store/gx_cloud_store_backend.py @@ -16,7 +16,7 @@ from 
great_expectations.data_context.store.store_backend import StoreBackend from great_expectations.data_context.types.refs import GXCloudResourceRef from great_expectations.data_context.types.resource_identifiers import GXCloudIdentifier -from great_expectations.exceptions import StoreBackendError +from great_expectations.exceptions import StoreBackendError, StoreBackendTransientError from great_expectations.util import bidict, filter_properties_dict, hyphen logger = logging.getLogger(__name__) @@ -245,7 +245,7 @@ def _get(self, key: Tuple[str, ...]) -> ResponsePayload: # type: ignore[overrid ) except requests.Timeout as timeout_exc: logger.exception(timeout_exc) - raise StoreBackendError( + raise StoreBackendTransientError( "Unable to get object in GX Cloud Store Backend: This is likely a transient error. Please try again." ) @@ -298,7 +298,7 @@ def _update(self, id: str, value: Any) -> bool: ) except requests.Timeout as timeout_exc: logger.exception(timeout_exc) - raise StoreBackendError( + raise StoreBackendTransientError( "Unable to update object in GX Cloud Store Backend: This is likely a transient error. Please try again." ) except Exception as e: @@ -380,7 +380,7 @@ def _set( # type: ignore[override] ) except requests.Timeout as timeout_exc: logger.exception(timeout_exc) - raise StoreBackendError( + raise StoreBackendTransientError( "Unable to set object in GX Cloud Store Backend: This is likely a transient error. Please try again." ) except Exception as e: @@ -493,7 +493,7 @@ def remove_key(self, key): ) except requests.Timeout as timeout_exc: logger.exception(timeout_exc) - raise StoreBackendError( + raise StoreBackendTransientError( "Unable to delete object in GX Cloud Store Backend: This is likely a transient error. Please try again." ) except Exception as e: @@ -503,17 +503,14 @@ def remove_key(self, key): ) def _has_key(self, key: Tuple[str, ...]) -> bool: - # Due to list_keys being inconsistently sized (due to the possible of resource names), - # we remove any resource names and assert against key ids. 
- - def _shorten_key(key) -> Tuple[str, str]: - if len(key) > 2: - key = key[:2] - return key - - key = _shorten_key(key) - all_keys = set(map(_shorten_key, self.list_keys())) - return key in all_keys + try: + _ = self._get(key) + except StoreBackendTransientError: + raise + except StoreBackendError as e: + logger.info(f"Could not find object associated with key {key}: {e}") + return False + return True @property def config(self) -> dict: diff --git a/great_expectations/exceptions/exceptions.py b/great_expectations/exceptions/exceptions.py index bc68aabc218b..a1bcc809ac99 100644 --- a/great_expectations/exceptions/exceptions.py +++ b/great_expectations/exceptions/exceptions.py @@ -52,6 +52,12 @@ class StoreBackendError(DataContextError): pass +class StoreBackendTransientError(StoreBackendError): + """The result of a timeout or other networking issues""" + + pass + + class ParserError(GreatExpectationsError): pass From 7e50af19c6486f2453d596e411172191fd3f4e59 Mon Sep 17 00:00:00 2001 From: William Shin Date: Wed, 5 Apr 2023 14:55:50 -0700 Subject: [PATCH 43/96] [MAINTENANCE] Temporarily Pin `pandas<2.0.0` for compatibility (#7571) --- great_expectations/self_check/util.py | 8 ++++---- requirements.txt | 7 ++++--- tests/cli/test_suite.py | 7 ------- .../expectations/test_expectation_atomic_renderers.py | 10 ---------- tests/test_definitions/test_expectations_v3_api.py | 10 +++++----- 5 files changed, 13 insertions(+), 29 deletions(-) diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index f21293171163..a1d6e111591b 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -230,9 +230,9 @@ try: import sqlalchemy.dialects.postgresql as postgresqltypes # noqa: TID251 - from sqlalchemy.dialects.postgresql import ( # noqa: TID251 - dialect as postgresqlDialect, - ) + + # noinspection PyPep8Naming + from sqlalchemy.dialects.postgresql import dialect as pgDialect # noqa: TID251 POSTGRESQL_TYPES = { "TEXT": postgresqltypes.TEXT, @@ -248,7 +248,7 @@ } except (ImportError, KeyError): postgresqltypes = None - postgresqlDialect = None + pgDialect = None POSTGRESQL_TYPES = {} try: diff --git a/requirements.txt b/requirements.txt index 9127d5657bdf..032c902e2cf9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,9 +18,10 @@ numpy>=1.19.5; python_version == "3.8" or python_version == "3.9" numpy>=1.23.0; python_version >= "3.10" packaging # Note: pip install pandas==1.3.5 locally for pydantic schema validation. 
-pandas>=1.1.0; python_version <= "3.8" -pandas>=1.1.3; python_version == "3.9" -pandas>=1.3.0; python_version >= "3.10" +# Note: Pandas set to less than 2.0.0 as of 2023-04-05 due to incompatibilities with Altair and Metrics +pandas>=1.1.0,<2.0.0; python_version <= "3.8" +pandas>=1.1.3,<2.0.0; python_version == "3.9" +pandas>=1.3.0,<2.0.0; python_version >= "3.10" # patch version updates `typing_extensions` to the needed version pydantic>=1.9.2,<2.0 pyparsing>=2.4 diff --git a/tests/cli/test_suite.py b/tests/cli/test_suite.py index 99f620c8a422..df44a1475365 100644 --- a/tests/cli/test_suite.py +++ b/tests/cli/test_suite.py @@ -25,7 +25,6 @@ from great_expectations.data_context.data_context.file_data_context import ( FileDataContext, ) -from great_expectations.optional_imports import is_version_greater_or_equal from great_expectations.util import ( deep_filter_properties_iterable, @@ -1726,9 +1725,6 @@ def test_suite_edit_multiple_datasources_with_sql_with_no_additional_args_withou - open jupyter """ - if is_version_greater_or_equal(pd.__version__, "2.0.0"): - pytest.xfail(reason="Test is currently not compatible with pandas 2.0.0") - context = titanic_v013_multi_datasource_multi_execution_engine_data_context_with_checkpoints_v1_with_empty_store_stats_enabled monkeypatch.chdir(os.path.dirname(context.root_directory)) @@ -1984,9 +1980,6 @@ def test_suite_edit_multiple_datasources_with_sql_with_no_additional_args_with_c - NOT open Data Docs """ - if is_version_greater_or_equal(pd.__version__, "2.0.0"): - pytest.xfail(reason="Test is currently not compatible with pandas 2.0.0") - context = titanic_v013_multi_datasource_multi_execution_engine_data_context_with_checkpoints_v1_with_empty_store_stats_enabled monkeypatch.chdir(os.path.dirname(context.root_directory)) diff --git a/tests/expectations/test_expectation_atomic_renderers.py b/tests/expectations/test_expectation_atomic_renderers.py index c93df876d490..fd70621223e7 100644 --- a/tests/expectations/test_expectation_atomic_renderers.py +++ b/tests/expectations/test_expectation_atomic_renderers.py @@ -8,7 +8,6 @@ from great_expectations.core import ExpectationValidationResult from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.expectations.registry import get_renderer_impl -from great_expectations.optional_imports import is_version_greater_or_equal from great_expectations.render import RenderedAtomicContent @@ -166,9 +165,6 @@ def test_atomic_prescriptive_summary_expect_column_kl_divergence_to_be_less_than snapshot, get_prescriptive_rendered_content, ): - if is_version_greater_or_equal(pd.__version__, "2.0.0"): - pytest.xfail(reason="Altair is currently not compatible with pandas 2.0.0") - update_dict = { "expectation_type": "expect_column_kl_divergence_to_be_less_than", "kwargs": { @@ -196,9 +192,6 @@ def test_atomic_prescriptive_summary_expect_column_kl_divergence_to_be_less_than def test_atomic_diagnostic_observed_value_expect_column_kl_divergence_to_be_less_than( snapshot, get_diagnostic_rendered_content ): - if is_version_greater_or_equal(pd.__version__, "2.0.0"): - pytest.xfail(reason="Altair is currently not compatible with pandas 2.0.0") - # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()` # As such, the specific expectation_type used here is irrelevant and is simply used to trigger the parent class. 
expectation_config = { @@ -245,9 +238,6 @@ def test_atomic_diagnostic_observed_value_expect_column_kl_divergence_to_be_less def test_atomic_diagnostic_observed_value_with_boolean_column_expect_column_kl_divergence_to_be_less_than( snapshot, get_diagnostic_rendered_content ): - if is_version_greater_or_equal(pd.__version__, "2.0.0"): - pytest.xfail(reason="Altair is currently not compatible with pandas 2.0.0") - # Please note that the vast majority of Expectations are calling `Expectation._atomic_diagnostic_observed_value()` # As such, the specific expectation_type used here is irrelevant and is simply used to trigger the parent class. expectation_config = { diff --git a/tests/test_definitions/test_expectations_v3_api.py b/tests/test_definitions/test_expectations_v3_api.py index 8f46d84e19dd..a84d4dae97ef 100644 --- a/tests/test_definitions/test_expectations_v3_api.py +++ b/tests/test_definitions/test_expectations_v3_api.py @@ -20,7 +20,7 @@ get_test_validator_with_data, mssqlDialect, mysqlDialect, - postgresqlDialect, + pgDialect, snowflakeDialect, sqliteDialect, trinoDialect, @@ -144,10 +144,10 @@ def pytest_generate_tests(metafunc): # noqa C901 - 35 generate_test = True elif ( "postgresql" in only_for - and postgresqlDialect is not None + and pgDialect is not None and isinstance( validator_with_data.active_batch_data.sql_engine_dialect, - postgresqlDialect, + pgDialect, ) ): generate_test = True @@ -279,7 +279,7 @@ def pytest_generate_tests(metafunc): # noqa C901 - 35 ) or ( "postgresql" in suppress_test_for - and postgresqlDialect is not None + and pgDialect is not None and validator_with_data and isinstance( validator_with_data.active_batch_data, @@ -287,7 +287,7 @@ def pytest_generate_tests(metafunc): # noqa C901 - 35 ) and isinstance( validator_with_data.active_batch_data.sql_engine_dialect, - postgresqlDialect, + pgDialect, ) ) or ( From 7fb701cbef2867129f9392a2d9591fcb07d1d3b2 Mon Sep 17 00:00:00 2001 From: William Shin Date: Wed, 5 Apr 2023 16:05:42 -0700 Subject: [PATCH 44/96] [MAINTENANCE] SqlAlchemy 2.0 Compatibility - branched connection + `bind` argument now required (#7529) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../sqlalchemy_compatibility_wrappers.py} | 54 ++++++++++++++++++- .../fluent/pandas_s3_datasource.pyi | 1 - .../fluent/spark_dbfs_datasource.pyi | 6 +-- .../metrics/table_metrics/table_head.py | 7 ++- great_expectations/self_check/util.py | 4 +- pyproject.toml | 6 --- tests/conftest.py | 10 ++-- .../test_data_context_test_yaml_config.py | 4 +- .../datasource/fluent/integration/conftest.py | 5 +- ...st_sqlalchemy_execution_engine_sampling.py | 4 +- .../test_sqlalchemy_batch_data.py | 6 ++- .../test_sqlalchemy_execution_engine.py | 4 +- tests/expectations/metrics/test_core.py | 6 ++- tests/expectations/test_null_filters.py | 7 ++- ..._configurable_profiler_v3_batch_request.py | 4 +- ...t_onboarding_data_assistant_happy_paths.py | 6 ++- tests/test_deprecation.py | 2 +- tests/test_utils.py | 6 ++- 18 files changed, 114 insertions(+), 28 deletions(-) rename great_expectations/{df_to_database_loader.py => compatibility/sqlalchemy_compatibility_wrappers.py} (60%) diff --git a/great_expectations/df_to_database_loader.py b/great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py similarity index 60% rename from great_expectations/df_to_database_loader.py rename to great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py index 8c1ce7e33e98..f904c80cad12 100644 --- 
a/great_expectations/df_to_database_loader.py +++ b/great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py @@ -2,7 +2,7 @@ import logging import warnings -from typing import Callable +from typing import Callable, Iterator, Sequence import pandas as pd @@ -24,6 +24,58 @@ Select = None +def read_sql_table_as_df( + table_name, + con, + schema=None, + index_col: str | Sequence[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + columns=None, + chunksize: int | None = None, +) -> pd.DataFrame | Iterator[pd.DataFrame]: + """Read SQL table as DataFrame. + + Wrapper for `read_sql_table()` method in Pandas. Created as part of the effort to allow GX to be compatible + with SqlAlchemy 2, and is used to suppress warnings that arise from implicit auto-commits. + + Args: + table_name (str): name of SQL Table. + con (sqlalchemy engine or connection): sqlalchemy.engine or sqlite3.Connection + schema (str | None): Specify the schema (if database flavor supports this). If None, use + default schema. Defaults to None. + index_col (str | Sequence[str] | None): Column(s) to set as index(MultiIndex). + coerce_float (bool): If True, method to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Can result in loss of Precision. + parse_dates (List or Dict): list or dict, default None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns: List of column names to select from SQL table. + chunksize: If specified, returns an iterator where `chunksize` is the number of + rows to include in each chunk. 
+ """ + if isinstance(con, sa.engine.Engine): + con = con.connect() + with warnings.catch_warnings(): + warnings.filterwarnings(action="ignore", category=DeprecationWarning) + return pd.read_sql_table( + table_name=table_name, + con=con, + schema=schema, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) + + def add_dataframe_to_db( df: pd.DataFrame, name: str, diff --git a/great_expectations/datasource/fluent/pandas_s3_datasource.pyi b/great_expectations/datasource/fluent/pandas_s3_datasource.pyi index 0476ef8e30b7..ac98b39ccffb 100644 --- a/great_expectations/datasource/fluent/pandas_s3_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_s3_datasource.pyi @@ -49,7 +49,6 @@ from great_expectations.datasource.fluent.pandas_file_path_datasource import ( ) if TYPE_CHECKING: - from great_expectations.datasource.fluent.config_str import ConfigStr from great_expectations.datasource.fluent.dynamic_pandas import ( CompressionOptions, diff --git a/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi b/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi index ea4e8422a311..83acba7e502d 100644 --- a/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi @@ -9,9 +9,6 @@ from great_expectations.datasource.fluent import SparkFilesystemDatasource from great_expectations.datasource.fluent.data_asset.data_connector import ( DBFSDataConnector as DBFSDataConnector, ) -from great_expectations.datasource.fluent.interfaces import ( - BatchMetadata, -) from great_expectations.datasource.fluent.interfaces import ( SortersDefinition as SortersDefinition, ) @@ -20,6 +17,9 @@ from great_expectations.datasource.fluent.interfaces import ( ) if TYPE_CHECKING: + from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, + ) from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) diff --git a/great_expectations/expectations/metrics/table_metrics/table_head.py b/great_expectations/expectations/metrics/table_metrics/table_head.py index 00d09858282f..0a2ce1880a7f 100644 --- a/great_expectations/expectations/metrics/table_metrics/table_head.py +++ b/great_expectations/expectations/metrics/table_metrics/table_head.py @@ -4,6 +4,9 @@ import pandas as pd +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + read_sql_table_as_df, +) from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.execution_engine import ( PandasExecutionEngine, @@ -103,14 +106,14 @@ def _sqlalchemy( # noqa: C901 - 16 else: try: if metric_value_kwargs["fetch_all"]: - df = pd.read_sql_table( + df = read_sql_table_as_df( table_name=getattr(selectable, "name", None), schema=getattr(selectable, "schema", None), con=execution_engine.engine, ) else: # passing chunksize causes the Iterator to be returned - df_chunk_iterator = pd.read_sql_table( + df_chunk_iterator = read_sql_table_as_df( table_name=getattr(selectable, "name", None), schema=getattr(selectable, "schema", None), con=execution_engine.engine, diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index a1d6e111591b..402c7f39f7b6 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -34,6 +34,9 @@ from great_expectations.compatibility.pandas_compatibility import ( execute_pandas_to_datetime, ) +from 
great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core import ( ExpectationConfigurationSchema, ExpectationSuite, @@ -50,7 +53,6 @@ from great_expectations.dataset import PandasDataset from great_expectations.datasource import Datasource from great_expectations.datasource.data_connector import ConfiguredAssetSqlDataConnector -from great_expectations.df_to_database_loader import add_dataframe_to_db from great_expectations.exceptions.exceptions import ( ExecutionEngineError, InvalidExpectationConfigurationError, diff --git a/pyproject.toml b/pyproject.toml index 6fc4db8fae8e..dc94bef711a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -473,12 +473,6 @@ filterwarnings = [ # Example Actual Warning: Found by running setup of test_validate_dataset[sqlite] # sqlalchemy.exc.RemovedIn20Warning: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) 'ignore: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0.:DeprecationWarning', - # Example Actual Warning: Found in mysql test_table_column_reflection_fallback[test_backends0] - # sqlalchemy.exc.RemovedIn20Warning: The .close() method on a so-called 'branched' connection is deprecated as of 1.4, as are 'branched' connections overall, and will be removed in a future release. If this is a default-handling function, don't close the connection. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: The .close\(\) method on a so-called:DeprecationWarning', - # Example Actual Warning: Found in setup of test_validate_dataset[sqlite] - # sqlalchemy.exc.RemovedIn20Warning: The ``bind`` argument for schema methods that invoke SQL against an engine or connection will be required in SQLAlchemy 2.0. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: The ``bind`` argument for schema methods that invoke SQL against an engine or connection will be required in SQLAlchemy 2.0.:DeprecationWarning', # Example Actual Warning: sqlalchemy.exc.RemovedIn20Warning: Deprecated API features detected! These feature(s) are not compatible with SQLAlchemy 2.0. To prevent incompatible upgrades prior to updating applications, ensure requirements files are pinned to "sqlalchemy<2.0". Set environment variable SQLALCHEMY_WARN_20=1 to show all deprecation warnings. Set environment variable SQLALCHEMY_SILENCE_UBER_WARNING=1 to silence this message. 
(Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) # Found so far in test_cli_datasource_list 'ignore: Deprecated API features detected!:DeprecationWarning', diff --git a/tests/conftest.py b/tests/conftest.py index cc57d0389af4..c63726b8e90e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,6 +19,9 @@ from freezegun import freeze_time import great_expectations as gx +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core import ExpectationConfiguration from great_expectations.core.domain import ( INFERRED_SEMANTIC_TYPE_KEY, @@ -77,7 +80,6 @@ get_filesystem_one_level_directory_glob_path_list, ) from great_expectations.datasource.new_datasource import BaseDatasource, Datasource -from great_expectations.df_to_database_loader import add_dataframe_to_db from great_expectations.render.renderer_configuration import MetaNotesFormat from great_expectations.rule_based_profiler.config import RuleBasedProfilerConfig from great_expectations.rule_based_profiler.config.base import ( @@ -2324,8 +2326,10 @@ def test_db_connection_string(tmp_path_factory, test_backends): basepath = str(tmp_path_factory.mktemp("db_context")) path = os.path.join(basepath, "test.db") # noqa: PTH118 engine = sa.create_engine("sqlite:///" + str(path)) - df1.to_sql(name="table_1", con=engine, index=True) - df2.to_sql(name="table_2", con=engine, index=True, schema="main") + add_dataframe_to_db(df=df1, name="table_1", con=engine, index=True) + add_dataframe_to_db( + df=df2, name="table_2", con=engine, index=True, schema="main" + ) # Return a connection string to this newly-created db return "sqlite:///" + str(path) diff --git a/tests/data_context/test_data_context_test_yaml_config.py b/tests/data_context/test_data_context_test_yaml_config.py index 55f7398edf3c..b4686747202d 100644 --- a/tests/data_context/test_data_context_test_yaml_config.py +++ b/tests/data_context/test_data_context_test_yaml_config.py @@ -15,7 +15,9 @@ ) from great_expectations.data_context.store import CheckpointStore from great_expectations.data_context.util import file_relative_path -from great_expectations.df_to_database_loader import add_dataframe_to_db +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.rule_based_profiler.rule_based_profiler import RuleBasedProfiler from great_expectations.util import get_sqlalchemy_url, load_class from tests.core.usage_statistics.util import ( diff --git a/tests/datasource/fluent/integration/conftest.py b/tests/datasource/fluent/integration/conftest.py index 85ddc0401ed2..49430646497a 100644 --- a/tests/datasource/fluent/integration/conftest.py +++ b/tests/datasource/fluent/integration/conftest.py @@ -8,6 +8,9 @@ import pytest import sqlalchemy +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.data_context import AbstractDataContext from great_expectations.datasource.fluent import ( PandasFilesystemDatasource, @@ -54,7 +57,7 @@ def pandas_sql_data( } ) con = sqlalchemy.create_engine("sqlite://") - df.to_sql("my_table", con=con) + add_dataframe_to_db(df=df, name="my_table", con=con) pandas_ds = context.sources.add_pandas("my_pandas") pandas_ds.read_sql( sql=sqlalchemy.sql.text("SELECT * FROM my_table"), diff --git a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py 
b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py index 9eba4af1410b..d8dd38d177e1 100644 --- a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py +++ b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py @@ -19,7 +19,9 @@ from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect from great_expectations.self_check.util import build_sa_engine from great_expectations.util import import_library_module -from great_expectations.df_to_database_loader import add_dataframe_to_db +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) try: sqlalchemy = pytest.importorskip("sqlalchemy") diff --git a/tests/execution_engine/test_sqlalchemy_batch_data.py b/tests/execution_engine/test_sqlalchemy_batch_data.py index 68f4c8350d1b..31ea607c9bd0 100644 --- a/tests/execution_engine/test_sqlalchemy_batch_data.py +++ b/tests/execution_engine/test_sqlalchemy_batch_data.py @@ -6,6 +6,10 @@ from great_expectations.execution_engine import SqlAlchemyExecutionEngine from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) + try: sqlalchemy = pytest.importorskip("sqlalchemy") except ImportError: @@ -43,7 +47,7 @@ def test_instantiation_with_table_name(sqlite_view_engine): def test_instantiation_with_query(sqlite_view_engine, test_df): - test_df.to_sql("test_table_0", con=sqlite_view_engine) + add_dataframe_to_db(df=test_df, name="test_table_0", con=sqlite_view_engine) query: str = "SELECT * FROM test_table_0" # If create_temp_table=False, a new temp table should NOT be created diff --git a/tests/execution_engine/test_sqlalchemy_execution_engine.py b/tests/execution_engine/test_sqlalchemy_execution_engine.py index 56411907e024..ba170cfc58c2 100644 --- a/tests/execution_engine/test_sqlalchemy_execution_engine.py +++ b/tests/execution_engine/test_sqlalchemy_execution_engine.py @@ -32,7 +32,9 @@ ) from great_expectations.self_check.util import build_sa_engine from great_expectations.util import get_sqlalchemy_domain_data -from great_expectations.df_to_database_loader import add_dataframe_to_db +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.validator.computed_metric import MetricValue from great_expectations.validator.metric_configuration import MetricConfiguration diff --git a/tests/expectations/metrics/test_core.py b/tests/expectations/metrics/test_core.py index ddf8f3105631..9fc69b48e5d7 100644 --- a/tests/expectations/metrics/test_core.py +++ b/tests/expectations/metrics/test_core.py @@ -40,6 +40,10 @@ from great_expectations.validator.metric_configuration import MetricConfiguration from tests.expectations.test_util import get_table_columns_metric +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) + def test_metric_loads_pd(): assert get_metric_provider("column.max", PandasExecutionEngine()) is not None @@ -1729,7 +1733,7 @@ def test_map_value_set_sa(sa): def test_map_of_type_sa(sa): eng = sa.create_engine("sqlite://") df = pd.DataFrame({"a": [1, 2, 3, 3, None]}) - df.to_sql(name="test", con=eng, index=False) + add_dataframe_to_db(df=df, name="test", con=eng, index=False) batch_data = SqlAlchemyBatchData( execution_engine=eng, table_name="test", source_table_name="test" ) 
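Context for the test migrations above: direct calls to pandas `DataFrame.to_sql` are being routed through `add_dataframe_to_db` so that the SQLAlchemy 2.0 deprecation warnings emitted under 1.4.x are handled in a single place. A minimal sketch of the pattern, using an illustrative in-memory SQLite engine and table name (not taken from the patch itself):

import pandas as pd
import sqlalchemy as sa

from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import (
    add_dataframe_to_db,
)

# Illustrative engine and frame; any engine/DataFrame pair works the same way.
engine = sa.create_engine("sqlite://")
df = pd.DataFrame({"a": [1, 2, 3]})

# Previously: df.to_sql(name="example_table", con=engine, index=False)
# The wrapper forwards the same keyword arguments to DataFrame.to_sql and is
# intended to filter the "RemovedIn20"-style DeprecationWarnings raised while
# running against SQLAlchemy 1.4.x during the 2.0 transition.
add_dataframe_to_db(df=df, name="example_table", con=engine, index=False)
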
diff --git a/tests/expectations/test_null_filters.py b/tests/expectations/test_null_filters.py index ff3dcd109458..79b073370e9d 100644 --- a/tests/expectations/test_null_filters.py +++ b/tests/expectations/test_null_filters.py @@ -1,3 +1,8 @@ +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) + + def test_spark_null_filters(spark_session): import pandas as pd import pyspark @@ -60,6 +65,6 @@ def test_sa_null_filters(sa): eng = sa.create_engine("sqlite://") # Demonstrate that spark's max aggregate function can tolerate null values df = pd.DataFrame({"a": [1, 2, 3, None, None, 4]}) - df.to_sql(name="test", con=eng, index=False) + add_dataframe_to_db(df=df, name="test", con=eng, index=False) assert eng.execute(sa.text(f"SELECT MAX(a) FROM test;")).fetchone()[0] == 4 diff --git a/tests/profile/test_user_configurable_profiler_v3_batch_request.py b/tests/profile/test_user_configurable_profiler_v3_batch_request.py index d3ecbfc44f76..953f8315b05d 100644 --- a/tests/profile/test_user_configurable_profiler_v3_batch_request.py +++ b/tests/profile/test_user_configurable_profiler_v3_batch_request.py @@ -9,11 +9,13 @@ import pytest import great_expectations as gx +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core.batch import Batch, RuntimeBatchRequest from great_expectations.core.util import get_or_create_spark_application from great_expectations.data_context.types.base import ProgressBarsConfig from great_expectations.data_context.util import file_relative_path -from great_expectations.df_to_database_loader import add_dataframe_to_db from great_expectations.execution_engine import SqlAlchemyExecutionEngine from great_expectations.execution_engine.sqlalchemy_batch_data import ( SqlAlchemyBatchData, diff --git a/tests/rule_based_profiler/data_assistant/test_onboarding_data_assistant_happy_paths.py b/tests/rule_based_profiler/data_assistant/test_onboarding_data_assistant_happy_paths.py index 2f8dd9a4fc7b..caf22c6d8e58 100644 --- a/tests/rule_based_profiler/data_assistant/test_onboarding_data_assistant_happy_paths.py +++ b/tests/rule_based_profiler/data_assistant/test_onboarding_data_assistant_happy_paths.py @@ -8,6 +8,9 @@ import great_expectations as gx from great_expectations import DataContext +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core import ExpectationSuite from great_expectations.core.batch import BatchRequest from great_expectations.core.yaml_handler import YAMLHandler @@ -420,7 +423,8 @@ def test_sql_happy_path_onboarding_data_assistant_mixed_decimal_float_and_boolea df["test_bool"] = df.apply( lambda row: True if row["test_bool"] == "t" else False, axis=1 ) - df.to_sql( + add_dataframe_to_db( + df=df, name=table_name, con=postgresql_engine, schema="public", diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py index 06fc01e915f2..3f9cdea40e94 100644 --- a/tests/test_deprecation.py +++ b/tests/test_deprecation.py @@ -20,7 +20,7 @@ def regex_for_deprecation_comments() -> Pattern: def files_with_deprecation_warnings() -> List[str]: files: List[str] = glob.glob("great_expectations/**/*.py", recursive=True) files_to_exclude = [ - "great_expectations/df_to_database_loader.py", + "great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py", "great_expectations/compatibility/sqlalchemy_and_pandas.py", ] for file_to_exclude in files_to_exclude: 
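The read counterpart introduced in this patch, `read_sql_table_as_df`, mirrors `pandas.read_sql_table`: when given an `Engine` it opens a connection first and suppresses the deprecation warnings that arise from implicit auto-commits. A small illustrative sketch (the SQLite path and table name are placeholders), including the chunked form that the `table.head` metric change above relies on:

import sqlalchemy as sa

from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import (
    read_sql_table_as_df,
)

# Placeholder connection string; any SQLAlchemy engine or connection is accepted.
engine = sa.create_engine("sqlite:///example.db")

# Full read: behaves like pd.read_sql_table and returns a single DataFrame.
df = read_sql_table_as_df(table_name="example_table", con=engine)

# Chunked read: passing chunksize returns an iterator of DataFrames, which lets
# callers such as the table.head metric pull only the first rows of a table.
df_chunk_iterator = read_sql_table_as_df(
    table_name="example_table", con=engine, chunksize=5
)
first_chunk = next(df_chunk_iterator)
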
diff --git a/tests/test_utils.py b/tests/test_utils.py index b05d712c553c..31e00872ee96 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,6 +13,9 @@ import great_expectations.exceptions as gx_exceptions from great_expectations.alias_types import PathStr +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core.yaml_handler import YAMLHandler from great_expectations.data_context.store import ( CheckpointStore, @@ -691,7 +694,8 @@ def load_data_into_test_database( f"Adding to existing table {table_name} and adding data from {csv_paths}" ) - all_dfs_concatenated.to_sql( + add_dataframe_to_db( + df=all_dfs_concatenated, name=table_name, con=engine, schema=schema_name, From eaac0c0833b55b5433d2d2eb2bc270bbfcb2ac36 Mon Sep 17 00:00:00 2001 From: Nathan Farmer Date: Wed, 5 Apr 2023 22:12:23 -0400 Subject: [PATCH 45/96] [BUGFIX] Render prescriptive `ExpectationConfiguration`s with evaluation parameters inline (#7552) --- .../render/renderer_configuration.py | 66 ++++++++++------ .../test_expectation_suite_crud.py | 75 ++++++++++++++++++- tests/validator/test_validator.py | 74 ++++++++++-------- 3 files changed, 159 insertions(+), 56 deletions(-) diff --git a/great_expectations/render/renderer_configuration.py b/great_expectations/render/renderer_configuration.py index 39c3d975cf7a..b6dd94f98ed3 100644 --- a/great_expectations/render/renderer_configuration.py +++ b/great_expectations/render/renderer_configuration.py @@ -173,12 +173,17 @@ def __init__(self, **values) -> None: values["params"] = _RendererValueBase() super().__init__(**values) - class _RendererParamArgs(TypedDict): + class _RequiredRendererParamArgs(TypedDict): """Used for building up a dictionary that is unpacked into RendererParams upon initialization.""" schema: RendererSchema value: Any + class _RendererParamArgs(_RequiredRendererParamArgs, total=False): + """Used for building up a dictionary that is unpacked into RendererParams upon initialization.""" + + evaluation_parameter: Dict[str, Any] + class _RendererParamBase(_RendererValueBase): """ _RendererParamBase is the base for a param that is added to RendererParams. 
It contains the validation logic, @@ -187,6 +192,7 @@ class _RendererParamBase(_RendererValueBase): renderer_schema: RendererSchema = Field(alias="schema") value: Any + evaluation_parameter: Optional[Dict[str, Any]] class Config: validate_assignment = True @@ -256,17 +262,16 @@ def _get_renderer_value_base_model_type( def _get_evaluation_parameter_params_from_raw_kwargs( raw_kwargs: Dict[str, Any] ) -> Dict[str, RendererConfiguration._RendererParamArgs]: - evaluation_parameter_count = 0 renderer_params_args = {} - for key, value in raw_kwargs.items(): - evaluation_parameter_name = f"eval_param__{evaluation_parameter_count}" - renderer_params_args[ - evaluation_parameter_name - ] = RendererConfiguration._RendererParamArgs( - schema=RendererSchema(type=RendererValueType.STRING), - value=f'{key}: {value["$PARAMETER"]}', + for kwarg_name, value in raw_kwargs.items(): + renderer_params_args[kwarg_name] = RendererConfiguration._RendererParamArgs( + schema=RendererSchema(type=RendererValueType.OBJECT), + value=None, + evaluation_parameter={ + "schema": RendererSchema(type=RendererValueType.OBJECT), + "value": value, + }, ) - evaluation_parameter_count += 1 return renderer_params_args @root_validator() @@ -463,11 +468,6 @@ def _set_template_str(cls, v: str, values: dict) -> str: ) v = f"{row_condition_str}, then {v}" - if "_raw_kwargs" in values and values["_raw_kwargs"]: - v += " " - for evaluation_parameter_count in range(len(values["_raw_kwargs"])): - v += f" $eval_param__{evaluation_parameter_count}," - v = v[:-1] return v @staticmethod @@ -487,7 +487,7 @@ def _choose_param_type_for_value( pass raise RendererConfigurationError( - f"None of the param_types: {param_types} match the value: {value}" + f"None of the param_types: {[param_type.value for param_type in param_types]} match the value: {value}" ) def add_param( @@ -534,7 +534,9 @@ def add_param( if value is None: value = self.kwargs.get(name) - if isinstance(param_type, list) and value is not None: + if isinstance(value, dict) and "$PARAMETER" in value: + param_type = RendererValueType.OBJECT + elif isinstance(param_type, list) and value is not None: param_type = RendererConfiguration._choose_param_type_for_value( param_types=param_type, value=value ) @@ -547,11 +549,29 @@ def add_param( } else: assert isinstance(param_type, RendererValueType) - renderer_params_args = { - **self.params.dict(exclude_none=False), - name: renderer_param( - schema=RendererSchema(type=param_type), value=value - ), - } + renderer_params_args = self.params.dict(exclude_none=False) + # if we already moved the evaluation parameter raw_kwargs to a param, + # we need to combine the param passed to add_param() with those existing raw_kwargs + if ( + name in renderer_params_args + and renderer_params_args[name]["evaluation_parameter"] + ): + new_args = { + name: renderer_param( + schema=RendererSchema(type=param_type), + value=value, + evaluation_parameter=renderer_params_args[name][ + "evaluation_parameter" + ], + ) + } + else: + new_args = { + name: renderer_param( + schema=RendererSchema(type=param_type), + value=value, + ) + } + renderer_params_args.update(new_args) self.params = cast(RendererParams, renderer_params(**renderer_params_args)) diff --git a/tests/data_context/cloud_data_context/test_expectation_suite_crud.py b/tests/data_context/cloud_data_context/test_expectation_suite_crud.py index 39b7b3849525..3a5608401678 100644 --- a/tests/data_context/cloud_data_context/test_expectation_suite_crud.py +++ 
b/tests/data_context/cloud_data_context/test_expectation_suite_crud.py @@ -3,7 +3,10 @@ import pytest -from great_expectations.core.expectation_suite import ExpectationSuite +from great_expectations.core.expectation_suite import ( + ExpectationConfiguration, + ExpectationSuite, +) from great_expectations.data_context.cloud_constants import GXCloudRESTResource from great_expectations.data_context.data_context.cloud_data_context import ( CloudDataContext, @@ -14,6 +17,7 @@ from great_expectations.data_context.types.base import DataContextConfig, GXCloudConfig from great_expectations.data_context.types.resource_identifiers import GXCloudIdentifier from great_expectations.exceptions.exceptions import DataContextError, StoreBackendError +from great_expectations.render import RenderedAtomicContent, RenderedAtomicValue from great_expectations.util import get_context from tests.data_context.conftest import MockResponse @@ -607,3 +611,72 @@ def test_add_or_update_expectation_suite_updates_existing_obj( context.add_or_update_expectation_suite(expectation_suite=suite) mock_update.assert_called_once() + + +@pytest.mark.integration +def test_get_expectation_suite_include_rendered_content_prescriptive( + empty_data_context, +): + context = empty_data_context + + expectation_suite_name = "validating_taxi_data" + + expectation_configuration = ExpectationConfiguration( + expectation_type="expect_column_max_to_be_between", + kwargs={ + "column": "passenger_count", + "min_value": {"$PARAMETER": "upstream_column_min"}, + "max_value": {"$PARAMETER": "upstream_column_max"}, + }, + ) + + context.add_expectation_suite( + expectation_suite_name=expectation_suite_name, + expectations=[expectation_configuration], + ) + + expectation_suite_exclude_rendered_content: ExpectationSuite = ( + context.get_expectation_suite( + expectation_suite_name=expectation_suite_name, + ) + ) + assert ( + expectation_suite_exclude_rendered_content.expectations[0].rendered_content + is None + ) + + expected_expectation_configuration_prescriptive_rendered_content = [ + RenderedAtomicContent( + value_type="StringValueType", + value=RenderedAtomicValue( + schema={"type": "com.superconductive.rendered.string"}, + template="$column maximum value must be greater than or equal to $min_value and less than or equal to $max_value.", + params={ + "column": { + "schema": {"type": "string"}, + "value": "passenger_count", + }, + "min_value": { + "schema": {"type": "object"}, + "value": {"$PARAMETER": "upstream_column_min"}, + }, + "max_value": { + "schema": {"type": "object"}, + "value": {"$PARAMETER": "upstream_column_max"}, + }, + }, + ), + name="atomic.prescriptive.summary", + ) + ] + + expectation_suite_include_rendered_content: ExpectationSuite = ( + context.get_expectation_suite( + expectation_suite_name=expectation_suite_name, + include_rendered_content=True, + ) + ) + assert ( + expectation_suite_include_rendered_content.expectations[0].rendered_content + == expected_expectation_configuration_prescriptive_rendered_content + ) diff --git a/tests/validator/test_validator.py b/tests/validator/test_validator.py index 5daeff022f6c..8b8bc8ce5cab 100644 --- a/tests/validator/test_validator.py +++ b/tests/validator/test_validator.py @@ -942,7 +942,7 @@ def test_validator_docstrings(multi_batch_taxi_validator): @pytest.mark.integration -def test_validator_include_rendered_content( +def test_validator_include_rendered_content_diagnostic( yellow_trip_pandas_data_context, ): context = yellow_trip_pandas_data_context @@ -1007,45 +1007,53 @@ def 
test_validator_include_rendered_content( ) ) - expected_expectation_validation_result_rendered_content = RenderedAtomicContent( - name="atomic.diagnostic.observed_value", - value=RenderedAtomicValue( - schema={"type": "com.superconductive.rendered.string"}, - params={}, - template="6", - ), - value_type="StringValueType", + expected_expectation_validation_result_diagnostic_rendered_content = ( + RenderedAtomicContent( + name="atomic.diagnostic.observed_value", + value=RenderedAtomicValue( + schema={"type": "com.superconductive.rendered.string"}, + params={}, + template="6", + ), + value_type="StringValueType", + ) ) assert ( - expected_expectation_validation_result_rendered_content + expected_expectation_validation_result_diagnostic_rendered_content in validation_result.rendered_content ) - expected_expectation_configuration_rendered_content = RenderedAtomicContent( + expected_expectation_configuration_diagnostic_rendered_content = RenderedAtomicContent( name="atomic.prescriptive.summary", value=RenderedAtomicValue( schema={"type": "com.superconductive.rendered.string"}, params={ "column": {"schema": {"type": "string"}, "value": "passenger_count"}, - "min_value": {"schema": {"type": "number"}, "value": 1}, - "max_value": {"schema": {"type": "number"}, "value": 8}, - "eval_param__0": { - "schema": {"type": "string"}, - "value": "min_value: upstream_column_min", + "min_value": { + "schema": {"type": "number"}, + "value": 1, + "evaluation_parameter": { + "schema": {"type": "object"}, + "value": {"$PARAMETER": "upstream_column_min"}, + }, }, - "eval_param__1": { - "schema": {"type": "string"}, - "value": "max_value: upstream_column_max", + "max_value": { + "schema": {"type": "number"}, + "value": 8, + "evaluation_parameter": { + "schema": {"type": "object"}, + "value": {"$PARAMETER": "upstream_column_max"}, + }, }, }, - template="$column maximum value must be greater than or equal to $min_value and less than or equal to $max_value. 
$eval_param__0, $eval_param__1", + template="$column maximum value must be greater than or equal to $min_value and less than or equal to $max_value.", ), value_type="StringValueType", ) assert ( - expected_expectation_configuration_rendered_content + expected_expectation_configuration_diagnostic_rendered_content in validation_result.expectation_config.rendered_content ) @@ -1060,22 +1068,24 @@ def test_validator_include_rendered_content( ) ) - expected_expectation_validation_result_rendered_content = RenderedAtomicContent( - name="atomic.diagnostic.observed_value", - value=RenderedAtomicValue( - schema={"type": "com.superconductive.rendered.string"}, - params={}, - template="0", - ), - value_type="StringValueType", + expected_expectation_validation_result_diagnostic_rendered_content = ( + RenderedAtomicContent( + name="atomic.diagnostic.observed_value", + value=RenderedAtomicValue( + schema={"type": "com.superconductive.rendered.string"}, + params={}, + template="0", + ), + value_type="StringValueType", + ) ) assert ( - expected_expectation_validation_result_rendered_content + expected_expectation_validation_result_diagnostic_rendered_content in validation_result.rendered_content ) - expected_expectation_configuration_rendered_content = RenderedAtomicContent( + expected_expectation_configuration_diagnostic_rendered_content = RenderedAtomicContent( name="atomic.prescriptive.summary", value=RenderedAtomicValue( schema={"type": "com.superconductive.rendered.string"}, @@ -1094,7 +1104,7 @@ def test_validator_include_rendered_content( ) assert ( - expected_expectation_configuration_rendered_content + expected_expectation_configuration_diagnostic_rendered_content in validation_result.expectation_config.rendered_content ) From 357d1077a9e9aaa247e1c7437698859c9121062a Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Thu, 6 Apr 2023 07:17:06 -0600 Subject: [PATCH 46/96] [DOCS] Fix integral typo (#7578) --- ...or_for_splitting_and_sampling_a_file_system_or_blob_store.md | 2 +- ..._a_dataconnector_for_splitting_and_sampling_tables_in_sql.md | 2 +- .../sql_components/_table_splitting_methods.mdx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md index b5a3c119bfa0..8cd69b88a347 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_a_file_system_or_blob_store.md @@ -113,7 +113,7 @@ Note: Splitter methods can be specified with or without a preceding underscore. | split_on_year_and_month | `column_name='col'` | Rows where the year and month of a datetime column are equal to the specified value | | split_on_year_and_month_and_day | `column_name='col'` | Rows where the year, month and day of a datetime column are equal to the specified value | | split_on_date_parts | `column_name='col', date_parts=''` | Rows where the date parts of a datetime column are equal to the specified value. Date parts can be specified as DatePart objects or as their string equivalent e.g. 
"year", "month", "week", "day", "hour", "minute", or "second" | -| split_on_divided_integer | `column_name='col', divisor=, batch_identifiers={ 'col': matching_divisor }` | Rows where value of column_name divided (using integral division) by the given divisor are equal to matching_divisor provided for the column_name specified | +| split_on_divided_integer | `column_name='col', divisor=, batch_identifiers={ 'col': matching_divisor }` | Rows where value of column_name divided (using integer division) by the given divisor are equal to matching_divisor provided for the column_name specified | | split_on_mod_integer | `column_name='col', mod=, batch_identifiers={ 'col': matching_mod_value }` | Rows where value of column_name divided (using modular division) by the given mod are equal to matching_mod_value provided for the column_name specified | | split_on_multi_column_values | `column_names='', batch_identifiers={ 'col_0': value_0, 'col_1': value_1, 'col_2': value_2, ... }` | Rows where values of column_names are equal to values corresponding to each column name as specified | | split_on_converted_datetime | `column_name='col', date_format_string=<'%Y-%m-%d'>, batch_identifiers={ 'col': matching_string }` | Rows where value of column_name converted to datetime using the given date_format_string are equal to matching string provided for the column_name specified | diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md index ecec3148d988..bc84610ed034 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/advanced/how_to_configure_a_dataconnector_for_splitting_and_sampling_tables_in_sql.md @@ -99,7 +99,7 @@ Note: Splitter methods can be specified with or without a preceding underscore. | `split_on_year_and_month` | `table_name='table', column_name='col'` | Rows where the year and month of a datetime column are the same | | `split_on_year_and_month_and_day` | `table_name='table', column_name='col'` | Rows where the year, month and day of a datetime column are the same | | `split_on_date_parts` | `table_name='table', column_name='col', date_parts=''` | Rows where the date parts of a datetime column are the same. Date parts can be specified as DatePart objects or as their string equivalent e.g. 
"year", "month", "week", "day", "hour", "minute", or "second" | -| `split_on_divided_integer` | `table_name='table', column_name='col', divisor=` | Rows where value of column_name divided (using integral division) by the given divisor are same | +| `split_on_divided_integer` | `table_name='table', column_name='col', divisor=` | Rows where value of column_name divided (using integer division) by the given divisor are same | | `split_on_mod_integer` | `table_name='table', column_name='col', mod=` | Rows where value of column_name divided (using modular division) by the given mod are same | | `split_on_multi_column_values` | `table_name='table', column_names=''` | Rows where values of column_names are same | | `split_on_converted_datetime` | `table_name='table', column_name='col', date_format_string=<'%Y-%m-%d'>` | Rows where value of column_name converted to datetime using the given date_format_string are same | diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_table_splitting_methods.mdx b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_table_splitting_methods.mdx index 6019c32a6d35..5e1aeb96b2cb 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_table_splitting_methods.mdx +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/datasource_configuration/sql_components/_table_splitting_methods.mdx @@ -15,7 +15,7 @@ | `split_on_year_and_month` | `column_name='col'` | Rows where the year and month of a datetime column are the same | | `split_on_year_and_month_and_day` | `column_name='col'` | Rows where the year, month and day of a datetime column are the same | | `split_on_date_parts` | `column_name='col', date_parts=''` | Rows where the date parts of a datetime column are the same. Date parts can be specified as DatePart objects or as their string equivalent e.g. "year", "month", "week", "day", "hour", "minute", or "second" | -| `split_on_divided_integer` | `column_name='col', divisor=` | Rows where value of column_name divided (using integral division) by the given divisor are same | +| `split_on_divided_integer` | `column_name='col', divisor=` | Rows where value of column_name divided (using integer division) by the given divisor are same | | `split_on_mod_integer` | `column_name='col', mod=` | Rows where value of column_name divided (using modular division) by the given mod are same | | `split_on_multi_column_values` | `column_names=''` | Rows where values of column_names are same | | `split_on_converted_datetime` | `column_name='col', date_format_string=<'%Y-%m-%d'>` | Rows where value of column_name converted to datetime using the given date_format_string are same | From 9305e9bbe9f786dbf685a49398c2617fe5f0dd4e Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Thu, 6 Apr 2023 09:34:24 -0400 Subject: [PATCH 47/96] [DOCS] Prepare earlier versions using develop (#7567) --- docs/build_docs | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/docs/build_docs b/docs/build_docs index bbd85461e31f..7ce53c98fab6 100755 --- a/docs/build_docs +++ b/docs/build_docs @@ -30,19 +30,9 @@ for version in $(jq -r '.[]' $VERSIONS_JSON_PATH); do done -# Get latest released version from tag, check out to build API docs. -# Only if not PR deploy preview. 
-if [ "$PULL_REQUEST" == "false" ] -then - GX_LATEST=$(git tag | grep -E "(^[0-9]{1,}\.)+[0-9]{1,}" | sort -V | tail -1) - echo -e "${ORANGE}Not in a pull request. Using latest released version ${GX_LATEST} at $(git rev-parse HEAD) to build API docs.${NC}" - git checkout "$GX_LATEST" - git pull -else - echo -e "${ORANGE}Building from within a pull request, using the latest commit to build API docs so changes can be viewed in the Netlify deploy preview.${NC}" - git checkout "$CURRENT_COMMIT" - git pull -fi +echo -e "${ORANGE}Prepare prior versions using the current commit (e.g. proposed commit if in a PR or develop if not).${NC}" +git checkout "$CURRENT_COMMIT" +git pull # Update versioned code and docs @@ -61,6 +51,20 @@ cd ../ python prepare_prior_versions.py cd docusaurus +# Get latest released version from tag, check out to build API docs. +# Only if not PR deploy preview. +if [ "$PULL_REQUEST" == "false" ] +then + GX_LATEST=$(git tag | grep -E "(^[0-9]{1,}\.)+[0-9]{1,}" | sort -V | tail -1) + echo -e "${ORANGE}Not in a pull request. Using latest released version ${GX_LATEST} at $(git rev-parse HEAD) to build API docs.${NC}" + git checkout "$GX_LATEST" + git pull +else + echo -e "${ORANGE}Building from within a pull request, using the latest commit to build API docs so changes can be viewed in the Netlify deploy preview.${NC}" + git checkout "$CURRENT_COMMIT" + git pull +fi + # Build current docs echo -e "${ORANGE}Installing Great Expectations library dev dependencies.${NC}" (cd ../../; pip install -c constraints-dev.txt -e ".[test]") @@ -71,6 +75,10 @@ echo -e "${ORANGE}Installing api docs dependencies.${NC}" echo -e "${ORANGE}Building API docs for current version.${NC}" (cd ../../; invoke docs) +echo -e "${ORANGE}Check back out current commit before building the rest of the docs.${NC}" +git checkout "$CURRENT_COMMIT" +git pull + # Move versions.json back from outside of the repo mv $VERSIONS_JSON_PATH versions.json From 2ffa8de3dd337a81a0f8ef632264e87094170511 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Thu, 6 Apr 2023 08:25:59 -0600 Subject: [PATCH 48/96] [MAINTENANCE] Add missing docstrings to fluent `sql_datasource` splitter methods. (#7577) --- .../datasource/fluent/sql_datasource.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index 0ac628cdec4a..328d45847e61 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -470,6 +470,13 @@ def add_splitter_year_and_month_and_day( def add_splitter_datetime_part( self: Self, column_name: str, datetime_parts: List[str] ) -> Self: + """Associates a datetime part splitter with this sql asset. + Args: + column_name: Name of the date column where parts will be parsed out. + datetime_parts: A list of datetime parts to split on, specified as DatePart objects or as their string equivalent e.g. "year", "month", "week", "day", "hour", "minute", or "second" + Returns: + This sql asset so we can use this method fluently. + """ return self._add_splitter( SplitterDatetimePart( method_name="split_on_date_parts", @@ -479,6 +486,12 @@ def add_splitter_datetime_part( ) def add_splitter_column_value(self: Self, column_name: str) -> Self: + """Associates a column value splitter with this sql asset. + Args: + column_name: A column name of the column to split on. + Returns: + This sql asset so we can use this method fluently. 
+ """ return self._add_splitter( SplitterColumnValue( method_name="split_on_column_value", @@ -489,6 +502,13 @@ def add_splitter_column_value(self: Self, column_name: str) -> Self: def add_splitter_divided_integer( self: Self, column_name: str, divisor: int ) -> Self: + """Associates a divided integer splitter with this sql asset. + Args: + column_name: A column name of the column to split on. + divisor: The divisor to use when splitting. + Returns: + This sql asset so we can use this method fluently. + """ return self._add_splitter( SplitterDividedInteger( method_name="split_on_divided_integer", @@ -498,6 +518,13 @@ def add_splitter_divided_integer( ) def add_splitter_mod_integer(self: Self, column_name: str, mod: int) -> Self: + """Associates a mod integer splitter with this sql asset. + Args: + column_name: A column name of the column to split on. + mod: The mod to use when splitting. + Returns: + This sql asset so we can use this method fluently. + """ return self._add_splitter( SplitterModInteger( method_name="split_on_mod_integer", @@ -507,6 +534,12 @@ def add_splitter_mod_integer(self: Self, column_name: str, mod: int) -> Self: ) def add_splitter_multi_column_values(self: Self, column_names: list[str]) -> Self: + """Associates a multi column value splitter with this sql asset. + Args: + column_names: A list of column names to split on. + Returns: + This sql asset so we can use this method fluently. + """ return self._add_splitter( SplitterMultiColumnValue( column_names=column_names, method_name="split_on_multi_column_values" From 1642f0fdf5eb550569ffe55a285e199531e737b7 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Thu, 6 Apr 2023 10:40:36 -0400 Subject: [PATCH 49/96] [DOCS] Use orange in docs logs (#7579) --- docs/build_docs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/build_docs b/docs/build_docs index 7ce53c98fab6..a5afe32453a0 100755 --- a/docs/build_docs +++ b/docs/build_docs @@ -3,7 +3,7 @@ # Build API docs then build docusaurus docs. # Currently used in our netlify pipeline. 
-ORANGE='\033[0;33m' +ORANGE='\033[38;5;208m' NC='\033[0m' # No Color CURRENT_COMMIT=$(git rev-parse HEAD) From 0d4e54e47bdfca4ad2c67350784e4fe7da4c8d25 Mon Sep 17 00:00:00 2001 From: Rob Lim Date: Thu, 6 Apr 2023 08:31:32 -0700 Subject: [PATCH 50/96] [DOCS] Add GX Cloud Onboarding Script (#7517) Co-authored-by: Gabriel --- .../experimental/onboarding_script.py | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 assets/scripts/gx_cloud/experimental/onboarding_script.py diff --git a/assets/scripts/gx_cloud/experimental/onboarding_script.py b/assets/scripts/gx_cloud/experimental/onboarding_script.py new file mode 100644 index 000000000000..fcf024e99503 --- /dev/null +++ b/assets/scripts/gx_cloud/experimental/onboarding_script.py @@ -0,0 +1,182 @@ +import pprint + +import great_expectations as gx +from great_expectations.checkpoint import Checkpoint +from great_expectations.core.batch import RuntimeBatchRequest +from great_expectations.core.expectation_suite import ExpectationSuite +from great_expectations.data_context import CloudDataContext +from great_expectations.datasource import BaseDatasource +from great_expectations.exceptions import StoreBackendError +from great_expectations.validator.validator import Validator + +import pandas as pd + + +# Create a GX Data Context +# Make sure GX_CLOUD_ACCESS_TOKEN and GX_CLOUD_ORGANIZATION_ID +# are set in your environment or config_variables.yml +context: CloudDataContext = gx.get_context( + cloud_mode=True, +) + +# Set variables for creating a Datasource +datasource_name = None +data_connector_name = ( + "default_runtime_data_connector_name" # Optional: Set your own data_connector_name +) +assert datasource_name, "Please set datasource_name." + +# Set variable for creating an Expectation Suite +expectation_suite_name = None +assert expectation_suite_name, "Please set expectation_suite_name." + +# Set variables for connecting a Validator to a Data Asset, along with a Batch of data +data_asset_name = None +assert data_asset_name, "Please set data_asset_name." +path_to_validator_batch = None # e.g. "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" +assert ( + path_to_validator_batch +), "Please set path_to_validator_batch. This can be a local filepath or a remote URL." + +# Set variable for creating a Checkpoint +checkpoint_name = None +assert checkpoint_name, "Please set checkpoint_name." + +# Set variable to get a Batch of data to validate against the new Checkpoint +path_to_batch_to_validate = None # e.g. "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" +assert ( + path_to_batch_to_validate +), "Please set path_to_batch_to_validate. This can be a local filepath or a remote URL." 
+ +# Create Datasource +# For simplicity, this script creates a Datasource with a PandasExecutionEngine and a RuntimeDataConnector +try: + datasource: BaseDatasource = context.get_datasource(datasource_name=datasource_name) +except ValueError: + datasource_yaml = f""" + name: {datasource_name} + class_name: Datasource + execution_engine: + class_name: PandasExecutionEngine + data_connectors: + {data_connector_name}: + class_name: RuntimeDataConnector + batch_identifiers: + - path + """ + # Test your configuration: + datasource: BaseDatasource = context.test_yaml_config(datasource_yaml) + + # Save your Datasource: + datasource: BaseDatasource = context.add_or_update_datasource(datasource=datasource) + +print(f"\n{20*'='}\nDatasource Config\n{20*'='}\n") +pprint.pprint(datasource.config) + +# Create a new Expectation Suite +try: + expectation_suite: ExpectationSuite = context.get_expectation_suite( + expectation_suite_name=expectation_suite_name + ) + expectation_suite_ge_cloud_id = expectation_suite.ge_cloud_id +except StoreBackendError: + expectation_suite: ExpectationSuite = context.add_or_update_expectation_suite( + expectation_suite_name=expectation_suite_name + ) + expectation_suite_ge_cloud_id = expectation_suite.ge_cloud_id + +# Connect a Batch of data to a Validator to add Expectations interactively +batch_df: pd.DataFrame = pd.read_csv(path_to_validator_batch) + +batch_request = RuntimeBatchRequest( + runtime_parameters={"batch_data": batch_df}, + batch_identifiers={"path": path_to_validator_batch}, + datasource_name=datasource_name, + data_connector_name=data_connector_name, + data_asset_name=data_asset_name, +) +validator: Validator = context.get_validator( + expectation_suite_name=expectation_suite_name, batch_request=batch_request +) + +# Add Expectations interactively using tab completion +validator.expect_column_to_exist(column="") + +# Save Expectation Suite +validator.save_expectation_suite(discard_failed_expectations=False) +expectation_suite: ExpectationSuite = context.get_expectation_suite( + expectation_suite_name=expectation_suite_name +) +print(f"\n{20*'='}\nExpectation Suite\n{20*'='}\n") +pprint.pprint(expectation_suite) + +# Create a new Checkpoint +try: + checkpoint: Checkpoint = context.get_checkpoint(checkpoint_name) + checkpoint_id = checkpoint.ge_cloud_id +except StoreBackendError: + checkpoint_config = { + "name": checkpoint_name, + "validations": [ + { + "expectation_suite_name": expectation_suite_name, + "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id, + "batch_request": { + "datasource_name": datasource_name, + "data_connector_name": data_connector_name, + "data_asset_name": data_asset_name, + }, + } + ], + "config_version": 1, + "class_name": "Checkpoint", + } + + context.add_or_update_checkpoint(**checkpoint_config) + checkpoint: Checkpoint = context.get_checkpoint(checkpoint_name) + checkpoint_id = checkpoint.ge_cloud_id + +print(f"\n{20*'='}\nCheckpoint Config\n{20*'='}\n") +pprint.pprint(checkpoint) + +# Get a Checkpoint snippet to use in a CI script +run_checkpoint_snippet = f"""\ +import pprint + +import great_expectations as gx +import pandas as pd + +path_to_batch_to_validate = None +assert path_to_batch_to_validate is not None, "Please set path_to_batch_to_validate. This can be a local filepath or a remote URL." 
+validation_df = pd.read_csv(path_to_batch_to_validate) + +result = context.run_checkpoint( + ge_cloud_id="{checkpoint_id}", + batch_request={{ + "runtime_parameters": {{ + "batch_data": validation_df + }}, + "batch_identifiers": {{ + "path": path_to_batch_to_validate + }}, + }} +) +ppint.pprint(result) +""" + +print(f"\n{20*'='}\nCheckpoint Snippet\n{20*'='}\n") +print(run_checkpoint_snippet) + +# Run the Checkpoint: +validation_df: pd.DataFrame = pd.read_csv(path_to_batch_to_validate) + +result = context.run_checkpoint( + ge_cloud_id=checkpoint_id, + batch_request={ + "runtime_parameters": {"batch_data": validation_df}, + "batch_identifiers": {"path": path_to_batch_to_validate}, + }, +) + +print(f"\n{20*'='}\nValidation Result\n{20*'='}\n") +pprint.pprint(result) From b0f8a62c068599fede6ae36cb036eebb502f8c99 Mon Sep 17 00:00:00 2001 From: William Shin Date: Thu, 6 Apr 2023 10:01:43 -0700 Subject: [PATCH 51/96] [BUGFIX] Release Pipeline Fix (#7575) Co-authored-by: Chetan Kini --- ci/azure-pipelines.yml | 6 +++--- .../test_include_rendered_content.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/azure-pipelines.yml b/ci/azure-pipelines.yml index ce18a02632e1..b34537ed172f 100644 --- a/ci/azure-pipelines.yml +++ b/ci/azure-pipelines.yml @@ -443,9 +443,9 @@ stages: Python310: python.version: '3.10' constraints: 'ci/constraints-test/py310-min-install.txt' - Pandas20: - python.version: '3.9' - constraints: 'ci/constraints-test/pandas2-min-install.txt' + # Pandas20: + # python.version: '3.9' + # constraints: 'ci/constraints-test/pandas2-min-install.txt' variables: IMAGE_SUFFIX: $[ dependencies.make_suffix.outputs['suffix.IMAGE_SUFFIX'] ] steps: diff --git a/tests/data_context/cloud_data_context/test_include_rendered_content.py b/tests/data_context/cloud_data_context/test_include_rendered_content.py index 46a5741dff63..9a5f80226b98 100644 --- a/tests/data_context/cloud_data_context/test_include_rendered_content.py +++ b/tests/data_context/cloud_data_context/test_include_rendered_content.py @@ -8,12 +8,11 @@ ExpectationSuite, ExpectationValidationResult, ) -from great_expectations.core.batch import RuntimeBatchRequest +from great_expectations.data_context import CloudDataContext from great_expectations.data_context.cloud_constants import GXCloudRESTResource from great_expectations.data_context.types.refs import GXCloudResourceRef from great_expectations.render import RenderedAtomicContent from great_expectations.validator.validator import Validator -from great_expectations.data_context import CloudDataContext @pytest.mark.cloud @@ -34,12 +33,12 @@ def test_cloud_backed_data_context_save_expectation_suite_include_rendered_conte ) with mock.patch( - "great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend.list_keys" + "great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend.has_key" ), mock.patch( "great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend._set", return_value=cloud_ref, ): - expectation_suite: ExpectationSuite = context.add_expectation_suite( + expectation_suite: ExpectationSuite = context.add_or_update_expectation_suite( "test_suite" ) expectation_suite.expectations.append( @@ -114,7 +113,8 @@ def test_cloud_backed_data_context_expectation_validation_result_include_rendere ) with mock.patch( - "great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend.list_keys" + 
"great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend.has_key", + return_value=False, ), mock.patch( "great_expectations.data_context.store.gx_cloud_store_backend.GXCloudStoreBackend._set" ): From a6059648515fe64db88b6b857b47f4ac834916ff Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Thu, 6 Apr 2023 12:35:40 -0700 Subject: [PATCH 52/96] [RELEASE] 0.16.6 (#7582) These changes are only focused on the 0.16.6 release; hence, they were deemed by the team as safe to merge without running all checks. --- docs/docusaurus/docs/changelog.md | 34 ++++++++++++++++++++++ docs/docusaurus/docs/components/_data.jsx | 2 +- docs_rtd/changelog.rst | 35 +++++++++++++++++++++++ great_expectations/deployment_version | 2 +- 4 files changed, 71 insertions(+), 2 deletions(-) diff --git a/docs/docusaurus/docs/changelog.md b/docs/docusaurus/docs/changelog.md index 7b4cf129ce6a..9602c6d9dfe7 100644 --- a/docs/docusaurus/docs/changelog.md +++ b/docs/docusaurus/docs/changelog.md @@ -2,6 +2,40 @@ title: Changelog --- +### 0.16.6 +* [FEATURE] Fluent `DataAsset` `batch_metadata` config variables ([#7513](https://github.com/great-expectations/great_expectations/pull/7513)) +* [FEATURE] Add batch metadata to spark add_*_asset methods ([#7534](https://github.com/great-expectations/great_expectations/pull/7534)) +* [BUGFIX] Fluent Datasource load from config fixes for remaining Pandas Datasources ([#7442](https://github.com/great-expectations/great_expectations/pull/7442)) +* [BUGFIX] Address `pandas==2.0.0` test failures ([#7553](https://github.com/great-expectations/great_expectations/pull/7553)) +* [BUGFIX] Render prescriptive `ExpectationConfiguration`s with evaluation parameters inline ([#7552](https://github.com/great-expectations/great_expectations/pull/7552)) +* [BUGFIX] Release Pipeline Fix ([#7575](https://github.com/great-expectations/great_expectations/pull/7575)) +* [DOCS] Update GX version in `_data.jsx` component ([#7549](https://github.com/great-expectations/great_expectations/pull/7549)) +* [DOCS] Adds guides on using Ephemeral Data Contexts and updates Quickstart Next Steps ([#7500](https://github.com/great-expectations/great_expectations/pull/7500)) +* [DOCS] Fixes broken code block and incorrectly numbered steps in "How to organize Batches in a SQL-based Data Asset" ([#7533](https://github.com/great-expectations/great_expectations/pull/7533)) +* [DOCS] Update nav to match gx.io site ([#7557](https://github.com/great-expectations/great_expectations/pull/7557)) +* [DOCS] Corrects step numbers in "How to organize Batches in a file-based Data Asset" ([#7559](https://github.com/great-expectations/great_expectations/pull/7559)) +* [DOCS] Delete SLACK_GUIDELINES.md ([#7566](https://github.com/great-expectations/great_expectations/pull/7566)) +* [DOCS] Update syntax highlighting of code blocks in GX Cloud Getting Started guide ([#7563](https://github.com/great-expectations/great_expectations/pull/7563)) +* [DOCS] Fix code snippets for earlier versions ([#7554](https://github.com/great-expectations/great_expectations/pull/7554)) +* [DOCS] Fix typo in docs ([#7568](https://github.com/great-expectations/great_expectations/pull/7568)) +* [DOCS] Moar typo fix ([#7569](https://github.com/great-expectations/great_expectations/pull/7569)) +* [DOCS] removes the original getting started tutorial pages and redirects to the quickstart guide ([#7548](https://github.com/great-expectations/great_expectations/pull/7548)) +* [DOCS] Fix integral typo 
([#7578](https://github.com/great-expectations/great_expectations/pull/7578)) +* [DOCS] Prepare earlier versions using develop ([#7567](https://github.com/great-expectations/great_expectations/pull/7567)) +* [DOCS] Use orange in docs logs ([#7579](https://github.com/great-expectations/great_expectations/pull/7579)) +* [DOCS] Add GX Cloud Onboarding Script ([#7517](https://github.com/great-expectations/great_expectations/pull/7517)) +* [MAINTENANCE] release prep for 0.16.5 ([#7545](https://github.com/great-expectations/great_expectations/pull/7545)) +* [MAINTENANCE] Test Pandas 2.0 prerelease in CI/CD ([#7343](https://github.com/great-expectations/great_expectations/pull/7343)) +* [MAINTENANCE] Add noqa directives for existing sqlalchemy imports ([#7564](https://github.com/great-expectations/great_expectations/pull/7564)) +* [MAINTENANCE] Add ruff rule for sqlalchemy imports ([#7562](https://github.com/great-expectations/great_expectations/pull/7562)) +* [MAINTENANCE] adding a footer to data docs with a link to the cloud page ([#7532](https://github.com/great-expectations/great_expectations/pull/7532)) +* [MAINTENANCE] Harden tests for `CloudDataContext` always `include_rendered_content` ([#7558](https://github.com/great-expectations/great_expectations/pull/7558)) +* [MAINTENANCE] FluentDatasources - Quickstart Snippets converted to Named Snippets ([#7550](https://github.com/great-expectations/great_expectations/pull/7550)) +* [MAINTENANCE] Simplify `GXCloudStoreBackend._has_key` check ([#7561](https://github.com/great-expectations/great_expectations/pull/7561)) +* [MAINTENANCE] Temporarily Pin `pandas<2.0.0` for compatibility ([#7571](https://github.com/great-expectations/great_expectations/pull/7571)) +* [MAINTENANCE] SqlAlchemy 2.0 Compatibility - branched connection + `bind` argument now required ([#7529](https://github.com/great-expectations/great_expectations/pull/7529)) +* [MAINTENANCE] Add missing docstrings to fluent `sql_datasource` splitter methods. ([#7577](https://github.com/great-expectations/great_expectations/pull/7577)) + ### 0.16.5 * [FEATURE] Add batch metadata to sql datasources. 
([#7499](https://github.com/great-expectations/great_expectations/pull/7499)) * [BUGFIX] Fix issue running quickstart ([#7539](https://github.com/great-expectations/great_expectations/pull/7539)) diff --git a/docs/docusaurus/docs/components/_data.jsx b/docs/docusaurus/docs/components/_data.jsx index e7cf96ab976d..ae23f999e80d 100644 --- a/docs/docusaurus/docs/components/_data.jsx +++ b/docs/docusaurus/docs/components/_data.jsx @@ -1,5 +1,5 @@ export default { - release_version: 'great_expectations, version 0.16.5', + release_version: 'great_expectations, version 0.16.6', min_python: '3.7', max_python: '3.10' } diff --git a/docs_rtd/changelog.rst b/docs_rtd/changelog.rst index a910219529ee..2a97fd800217 100644 --- a/docs_rtd/changelog.rst +++ b/docs_rtd/changelog.rst @@ -4,6 +4,41 @@ Changelog ######### +0.16.6 +----------------- +* [FEATURE] Fluent `DataAsset` `batch_metadata` config variables ([#7513](https://github.com/great-expectations/great_expectations/pull/7513)) +* [FEATURE] Add batch metadata to spark add_*_asset methods ([#7534](https://github.com/great-expectations/great_expectations/pull/7534)) +* [BUGFIX] Fluent Datasource load from config fixes for remaining Pandas Datasources ([#7442](https://github.com/great-expectations/great_expectations/pull/7442)) +* [BUGFIX] Address `pandas==2.0.0` test failures ([#7553](https://github.com/great-expectations/great_expectations/pull/7553)) +* [BUGFIX] Render prescriptive `ExpectationConfiguration`s with evaluation parameters inline ([#7552](https://github.com/great-expectations/great_expectations/pull/7552)) +* [BUGFIX] Release Pipeline Fix ([#7575](https://github.com/great-expectations/great_expectations/pull/7575)) +* [DOCS] Update GX version in `_data.jsx` component ([#7549](https://github.com/great-expectations/great_expectations/pull/7549)) +* [DOCS] Adds guides on using Ephemeral Data Contexts and updates Quickstart Next Steps ([#7500](https://github.com/great-expectations/great_expectations/pull/7500)) +* [DOCS] Fixes broken code block and incorrectly numbered steps in "How to organize Batches in a SQL-based Data Asset" ([#7533](https://github.com/great-expectations/great_expectations/pull/7533)) +* [DOCS] Update nav to match gx.io site ([#7557](https://github.com/great-expectations/great_expectations/pull/7557)) +* [DOCS] Corrects step numbers in "How to organize Batches in a file-based Data Asset" ([#7559](https://github.com/great-expectations/great_expectations/pull/7559)) +* [DOCS] Delete SLACK_GUIDELINES.md ([#7566](https://github.com/great-expectations/great_expectations/pull/7566)) +* [DOCS] Update syntax highlighting of code blocks in GX Cloud Getting Started guide ([#7563](https://github.com/great-expectations/great_expectations/pull/7563)) +* [DOCS] Fix code snippets for earlier versions ([#7554](https://github.com/great-expectations/great_expectations/pull/7554)) +* [DOCS] Fix typo in docs ([#7568](https://github.com/great-expectations/great_expectations/pull/7568)) +* [DOCS] Moar typo fix ([#7569](https://github.com/great-expectations/great_expectations/pull/7569)) +* [DOCS] removes the original getting started tutorial pages and redirects to the quickstart guide ([#7548](https://github.com/great-expectations/great_expectations/pull/7548)) +* [DOCS] Fix integral typo ([#7578](https://github.com/great-expectations/great_expectations/pull/7578)) +* [DOCS] Prepare earlier versions using develop ([#7567](https://github.com/great-expectations/great_expectations/pull/7567)) +* [DOCS] Use orange in docs logs 
([#7579](https://github.com/great-expectations/great_expectations/pull/7579)) +* [DOCS] Add GX Cloud Onboarding Script ([#7517](https://github.com/great-expectations/great_expectations/pull/7517)) +* [MAINTENANCE] release prep for 0.16.5 ([#7545](https://github.com/great-expectations/great_expectations/pull/7545)) +* [MAINTENANCE] Test Pandas 2.0 prerelease in CI/CD ([#7343](https://github.com/great-expectations/great_expectations/pull/7343)) +* [MAINTENANCE] Add noqa directives for existing sqlalchemy imports ([#7564](https://github.com/great-expectations/great_expectations/pull/7564)) +* [MAINTENANCE] Add ruff rule for sqlalchemy imports ([#7562](https://github.com/great-expectations/great_expectations/pull/7562)) +* [MAINTENANCE] adding a footer to data docs with a link to the cloud page ([#7532](https://github.com/great-expectations/great_expectations/pull/7532)) +* [MAINTENANCE] Harden tests for `CloudDataContext` always `include_rendered_content` ([#7558](https://github.com/great-expectations/great_expectations/pull/7558)) +* [MAINTENANCE] FluentDatasources - Quickstart Snippets converted to Named Snippets ([#7550](https://github.com/great-expectations/great_expectations/pull/7550)) +* [MAINTENANCE] Simplify `GXCloudStoreBackend._has_key` check ([#7561](https://github.com/great-expectations/great_expectations/pull/7561)) +* [MAINTENANCE] Temporarily Pin `pandas<2.0.0` for compatibility ([#7571](https://github.com/great-expectations/great_expectations/pull/7571)) +* [MAINTENANCE] SqlAlchemy 2.0 Compatibility - branched connection + `bind` argument now required ([#7529](https://github.com/great-expectations/great_expectations/pull/7529)) +* [MAINTENANCE] Add missing docstrings to fluent `sql_datasource` splitter methods. ([#7577](https://github.com/great-expectations/great_expectations/pull/7577)) + 0.16.5 ----------------- * [FEATURE] Add batch metadata to sql datasources. ([#7499](https://github.com/great-expectations/great_expectations/pull/7499)) diff --git a/great_expectations/deployment_version b/great_expectations/deployment_version index 19270385eaf7..c3f65805f7b7 100644 --- a/great_expectations/deployment_version +++ b/great_expectations/deployment_version @@ -1 +1 @@ -0.16.5 +0.16.6 From 83deebc0e28e2fffef3b53d6510176f8ff5797b6 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Thu, 6 Apr 2023 16:52:55 -0400 Subject: [PATCH 53/96] [MAINTENANCE] Warning non integer slice on row for SQLAlchemy 2.0 Compatibility (#7501) --- pyproject.toml | 4 ---- ...st_sqlalchemy_execution_engine_sampling.py | 11 +++++++--- ...t_sqlalchemy_execution_engine_splitting.py | 21 +++++++++++++------ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dc94bef711a1..1a3dd875dc7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -466,10 +466,6 @@ filterwarnings = [ # SQLAlchemy 2.x support warnings. These warnings should be ignored until sqlalchemy 2.x is fully supported. # To get SQLAlchemy 2.x supported, remove one of these ignores and then fix the resulting errors. 'ignore: The Engine.execute\(\) method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0. All statement execution in SQLAlchemy 2.0 is performed by the Connection.execute\(\) method of Connection, or in the ORM by the Session.execute\(\) method of Session.:DeprecationWarning', - # Example Actual Warning: Found by running pytest tests/test_definitions/test_expectations_v2_api.py (delete with v2 api code if this warning doesn't appear elsewhere). 
- # Example Actual Warning: Found by running pytest tests/test_definitions/test_expectations_v2_api.py (delete with v2 api code if this warning doesn't appear elsewhere). - # sqlalchemy.exc.RemovedIn20Warning: Using non-integer/slice indices on Row is deprecated and will be removed in version 2.0; please use row._mapping[], or the mappings() accessor on the Result object. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: Using non-integer\/slice indices on Row is deprecated and will be removed in version 2.0:DeprecationWarning', # Example Actual Warning: Found by running setup of test_validate_dataset[sqlite] # sqlalchemy.exc.RemovedIn20Warning: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) 'ignore: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0.:DeprecationWarning', diff --git a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py index d8dd38d177e1..3e5dd81331ef 100644 --- a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py +++ b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py @@ -1,3 +1,4 @@ +from __future__ import annotations import datetime import os from typing import List @@ -279,9 +280,13 @@ def test_sqlite_sample_using_limit(sa): assert num_rows == n # Right rows? - rows: sa.Row = batch_data.execution_engine.engine.execute( - sa.select(sa.text("*")).select_from(batch_data.selectable) - ).fetchall() + rows: list[sa.RowMapping] = ( + batch_data.execution_engine.engine.execute( + sa.select(sa.text("*")).select_from(batch_data.selectable) + ) + .mappings() + .fetchall() + ) row_dates: List[datetime.datetime] = [parse(row["pickup_datetime"]) for row in rows] for row_date in row_dates: diff --git a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py index dc5cde4cd199..27ce15860028 100644 --- a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py +++ b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py @@ -1,3 +1,4 @@ +from __future__ import annotations import datetime import os from typing import List @@ -594,9 +595,13 @@ def test_sqlite_split_on_year( assert num_rows == n # Right rows? - rows: sa.Row = batch_data.execution_engine.engine.execute( - sa.select(sa.text("*")).select_from(batch_data.selectable) - ).fetchall() + rows: list[sa.RowMapping] = ( + batch_data.execution_engine.engine.execute( + sa.select(sa.text("*")).select_from(batch_data.selectable) + ) + .mappings() + .fetchall() + ) row_dates: List[datetime.datetime] = [parse(row["pickup_datetime"]) for row in rows] for row_date in row_dates: @@ -636,9 +641,13 @@ def test_sqlite_split_and_sample_using_limit( assert num_rows == n # Right rows? 
- rows: sa.Row = batch_data.execution_engine.engine.execute( - sa.select(sa.text("*")).select_from(batch_data.selectable) - ).fetchall() + rows: list[sa.RowMapping] = ( + batch_data.execution_engine.engine.execute( + sa.select(sa.text("*")).select_from(batch_data.selectable) + ) + .mappings() + .fetchall() + ) row_dates: List[datetime.datetime] = [parse(row["pickup_datetime"]) for row in rows] for row_date in row_dates: From 5db16376fccb0db45bcc0365395d1a40d1f00939 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Thu, 6 Apr 2023 16:57:00 -0400 Subject: [PATCH 54/96] [DOCS] Use current minor version number in drop down instead of "Current" (#7581) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/docusaurus/docusaurus.config.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docusaurus.config.js b/docs/docusaurus/docusaurus.config.js index 4fb8298bdbb7..baf5620c2317 100644 --- a/docs/docusaurus/docusaurus.config.js +++ b/docs/docusaurus/docusaurus.config.js @@ -259,7 +259,7 @@ module.exports = { lastVersion: 'current', versions: { current: { - label: 'Current', + label: '0.16.x', path: '' } } From 3386038eb38701d7841aabfada99cb65a747b1fe Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 6 Apr 2023 21:17:36 -0400 Subject: [PATCH 55/96] [BUGFIX] Fix Fluent Spark `DataConnectors` on config load (#7560) --- ..._data_on_azure_blob_storage_using_spark.md | 4 +- ...w_to_connect_to_data_on_gcs_using_spark.md | 2 +- ...ow_to_connect_to_data_on_s3_using_spark.md | 2 +- .../public_api_report.py | 2 +- .../datasource/fluent/__init__.py | 1 + .../SparkAzureBlobStorageDatasource.json | 14 +- .../CSVAsset.json | 11 ++ .../schemas/SparkDBFSDatasource/CSVAsset.json | 11 ++ .../schemas/SparkFilesystemDatasource.json | 2 +- .../SparkFilesystemDatasource/CSVAsset.json | 11 ++ .../SparkGoogleCloudStorageDatasource.json | 2 +- .../CSVAsset.json | 11 ++ .../fluent/schemas/SparkS3Datasource.json | 2 +- .../schemas/SparkS3Datasource/CSVAsset.json | 11 ++ .../spark_azure_blob_storage_datasource.py | 135 +++++++++--------- .../spark_azure_blob_storage_datasource.pyi | 58 ++++++++ .../fluent/spark_dbfs_datasource.py | 76 ++++------ .../fluent/spark_dbfs_datasource.pyi | 11 +- .../fluent/spark_file_path_datasource.py | 10 +- .../fluent/spark_filesystem_datasource.py | 77 ++++------ .../fluent/spark_filesystem_datasource.pyi | 47 ++++++ .../spark_google_cloud_storage_datasource.py | 98 ++++++------- .../spark_google_cloud_storage_datasource.pyi | 56 ++++++++ .../datasource/fluent/spark_s3_datasource.py | 90 +++++------- .../datasource/fluent/spark_s3_datasource.pyi | 50 +++++++ pyproject.toml | 2 - tests/datasource/fluent/conftest.py | 15 +- .../datasource/fluent/great_expectations.yml | 49 +++++++ .../integration/integration_test_utils.py | 26 ++-- ...est_spark_azure_blob_storage_datasource.py | 14 +- 30 files changed, 579 insertions(+), 321 deletions(-) create mode 100644 great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi create mode 100644 great_expectations/datasource/fluent/spark_filesystem_datasource.pyi create mode 100644 great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi create mode 100644 great_expectations/datasource/fluent/spark_s3_datasource.pyi diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md 
b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md index 55872bb807aa..58172815b72e 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_azure_blob_storage_using_spark.md @@ -70,8 +70,8 @@ Once these values have been defined, we will define our Data Asset with the code data_asset = datasource.add_csv_asset( name=asset_name, batching_regex=batching_regex, - container=container, - name_starts_with=name_starts_with, + abs_container=abs_container, + abs_name_starts_with=abs_name_starts_with, ) ``` diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md index 1284eca96d5a..3a4d20fed00c 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_gcs_using_spark.md @@ -68,7 +68,7 @@ data_asset = datasource.add_csv_asset( batching_regex=batching_regex, header=True, infer_schema=True, - prefix=prefix, + gcs_prefix=prefix, ) ``` diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md index abae72664909..84c2e0d44df8 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/cloud/how_to_connect_to_data_on_s3_using_spark.md @@ -77,7 +77,7 @@ data_asset = datasource.add_csv_asset( batching_regex=batching_regex, header=True, infer_schema=True, - prefix=prefix, + s3_prefix=prefix, ) ``` diff --git a/docs/sphinx_api_docs_source/public_api_report.py b/docs/sphinx_api_docs_source/public_api_report.py index 01d375c2d47e..5365d5c91b68 100755 --- a/docs/sphinx_api_docs_source/public_api_report.py +++ b/docs/sphinx_api_docs_source/public_api_report.py @@ -1859,7 +1859,7 @@ def main(): # any methods or classes you are adding to documentation with the @public_api # decorator and any relevant "new" or "deprecated" public api decorators. # If the actual is lower than the threshold, please reduce the threshold. - PUBLIC_API_MISSING_THRESHOLD = 94 # TODO: reduce this number again once this works for the Fluent DS dynamic methods + PUBLIC_API_MISSING_THRESHOLD = 90 # TODO: reduce this number again once this works for the Fluent DS dynamic methods if len(printable_definitions) != PUBLIC_API_MISSING_THRESHOLD: error_msg_prefix = f"There are {len(printable_definitions)} items missing from the public API, we currently allow {PUBLIC_API_MISSING_THRESHOLD}." 
if len(printable_definitions) > PUBLIC_API_MISSING_THRESHOLD: diff --git a/great_expectations/datasource/fluent/__init__.py b/great_expectations/datasource/fluent/__init__.py index de59469285d5..39abdbe5b429 100644 --- a/great_expectations/datasource/fluent/__init__.py +++ b/great_expectations/datasource/fluent/__init__.py @@ -6,6 +6,7 @@ DataAsset, Datasource, Sorter, + BatchMetadata, ) from great_expectations.datasource.fluent.pandas_datasource import ( PandasDatasource, diff --git a/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json b/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json index 6cc5aac0d09b..8ac66d9c372e 100644 --- a/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json @@ -1,6 +1,6 @@ { "title": "SparkAzureBlobStorageDatasource", - "description": "Base model for most fluent datasource related pydantic models.\n\nAdds yaml dumping and parsing methods.\n\nExtra fields are not allowed.\n\nSerialization methods default to `exclude_unset = True` to prevent serializing\nconfigs full of mostly unset default values.\nAlso prevents passing along unset kwargs to BatchSpec.\nhttps://docs.pydantic.dev/usage/exporting_models/", + "description": "--Public API--", "type": "object", "properties": { "type": { @@ -32,7 +32,17 @@ "azure_options": { "title": "Azure Options", "default": {}, - "type": "object" + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string", + "writeOnly": true, + "format": "password" + }, + {} + ] + } } }, "required": [ diff --git a/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource/CSVAsset.json b/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource/CSVAsset.json index 7a2f10cfca60..39ca990c87a7 100644 --- a/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource/CSVAsset.json +++ b/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource/CSVAsset.json @@ -42,11 +42,22 @@ "title": "Connect Options", "description": "Optional filesystem specific advanced parameters for connecting to data assets", "type": "object" + }, + "header": { + "title": "Header", + "default": false, + "type": "boolean" + }, + "InferSchema": { + "title": "Inferschema", + "default": false, + "type": "boolean" } }, "required": [ "name" ], + "additionalProperties": false, "definitions": { "Sorter": { "title": "Sorter", diff --git a/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource/CSVAsset.json b/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource/CSVAsset.json index 7a2f10cfca60..39ca990c87a7 100644 --- a/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource/CSVAsset.json +++ b/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource/CSVAsset.json @@ -42,11 +42,22 @@ "title": "Connect Options", "description": "Optional filesystem specific advanced parameters for connecting to data assets", "type": "object" + }, + "header": { + "title": "Header", + "default": false, + "type": "boolean" + }, + "InferSchema": { + "title": "Inferschema", + "default": false, + "type": "boolean" } }, "required": [ "name" ], + "additionalProperties": false, "definitions": { "Sorter": { "title": "Sorter", diff --git a/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json b/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json index 
e355bedbc73a..c1ac1808a24e 100644 --- a/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json @@ -1,6 +1,6 @@ { "title": "SparkFilesystemDatasource", - "description": "Base model for most fluent datasource related pydantic models.\n\nAdds yaml dumping and parsing methods.\n\nExtra fields are not allowed.\n\nSerialization methods default to `exclude_unset = True` to prevent serializing\nconfigs full of mostly unset default values.\nAlso prevents passing along unset kwargs to BatchSpec.\nhttps://docs.pydantic.dev/usage/exporting_models/", + "description": "--Public API--", "type": "object", "properties": { "type": { diff --git a/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource/CSVAsset.json b/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource/CSVAsset.json index 7a2f10cfca60..39ca990c87a7 100644 --- a/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource/CSVAsset.json +++ b/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource/CSVAsset.json @@ -42,11 +42,22 @@ "title": "Connect Options", "description": "Optional filesystem specific advanced parameters for connecting to data assets", "type": "object" + }, + "header": { + "title": "Header", + "default": false, + "type": "boolean" + }, + "InferSchema": { + "title": "Inferschema", + "default": false, + "type": "boolean" } }, "required": [ "name" ], + "additionalProperties": false, "definitions": { "Sorter": { "title": "Sorter", diff --git a/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json b/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json index a44a38b0f15c..2965d75b1c28 100644 --- a/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json @@ -1,6 +1,6 @@ { "title": "SparkGoogleCloudStorageDatasource", - "description": "Base model for most fluent datasource related pydantic models.\n\nAdds yaml dumping and parsing methods.\n\nExtra fields are not allowed.\n\nSerialization methods default to `exclude_unset = True` to prevent serializing\nconfigs full of mostly unset default values.\nAlso prevents passing along unset kwargs to BatchSpec.\nhttps://docs.pydantic.dev/usage/exporting_models/", + "description": "--Public API--", "type": "object", "properties": { "type": { diff --git a/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource/CSVAsset.json b/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource/CSVAsset.json index 7a2f10cfca60..39ca990c87a7 100644 --- a/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource/CSVAsset.json +++ b/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource/CSVAsset.json @@ -42,11 +42,22 @@ "title": "Connect Options", "description": "Optional filesystem specific advanced parameters for connecting to data assets", "type": "object" + }, + "header": { + "title": "Header", + "default": false, + "type": "boolean" + }, + "InferSchema": { + "title": "Inferschema", + "default": false, + "type": "boolean" } }, "required": [ "name" ], + "additionalProperties": false, "definitions": { "Sorter": { "title": "Sorter", diff --git a/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json b/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json index 
337ac7bb122a..b0cddc81ac3c 100644 --- a/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json @@ -1,6 +1,6 @@ { "title": "SparkS3Datasource", - "description": "Base model for most fluent datasource related pydantic models.\n\nAdds yaml dumping and parsing methods.\n\nExtra fields are not allowed.\n\nSerialization methods default to `exclude_unset = True` to prevent serializing\nconfigs full of mostly unset default values.\nAlso prevents passing along unset kwargs to BatchSpec.\nhttps://docs.pydantic.dev/usage/exporting_models/", + "description": "--Public API--", "type": "object", "properties": { "type": { diff --git a/great_expectations/datasource/fluent/schemas/SparkS3Datasource/CSVAsset.json b/great_expectations/datasource/fluent/schemas/SparkS3Datasource/CSVAsset.json index 7a2f10cfca60..39ca990c87a7 100644 --- a/great_expectations/datasource/fluent/schemas/SparkS3Datasource/CSVAsset.json +++ b/great_expectations/datasource/fluent/schemas/SparkS3Datasource/CSVAsset.json @@ -42,11 +42,22 @@ "title": "Connect Options", "description": "Optional filesystem specific advanced parameters for connecting to data assets", "type": "object" + }, + "header": { + "title": "Header", + "default": false, + "type": "boolean" + }, + "InferSchema": { + "title": "Inferschema", + "default": false, + "type": "boolean" } }, "required": [ "name" ], + "additionalProperties": false, "definitions": { "Sorter": { "title": "Sorter", diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py index e867a8db4738..dc2b043d7244 100644 --- a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py @@ -2,33 +2,26 @@ import logging import re -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Type, Union import pydantic -from typing_extensions import Literal +from typing_extensions import Final, Literal +from great_expectations.core._docs_decorators import public_api from great_expectations.core.util import AzureUrl from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent.config_str import ( + ConfigStr, # noqa: TCH001 # needed at runtime +) from great_expectations.datasource.fluent.data_asset.data_connector import ( AzureBlobStorageDataConnector, ) from great_expectations.datasource.fluent.interfaces import ( - BatchMetadata, TestConnectionError, ) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) -from great_expectations.datasource.fluent.spark_file_path_datasource import ( - CSVAsset, -) - -if TYPE_CHECKING: - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, - ) - logger = logging.getLogger(__name__) @@ -43,17 +36,30 @@ except ImportError: pass +_MISSING: Final = object() + +if TYPE_CHECKING: + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, + ) + class SparkAzureBlobStorageDatasourceError(SparkDatasourceError): pass +@public_api class SparkAzureBlobStorageDatasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[AzureBlobStorageDataConnector] + ] = AzureBlobStorageDataConnector + # instance attributes type: Literal["spark_abs"] = 
"spark_abs" # Azure Blob Storage specific attributes - azure_options: Dict[str, Any] = {} + azure_options: Dict[str, Union[ConfigStr, Any]] = {} _account_name: str = pydantic.PrivateAttr(default="") _azure_client: Union[BlobServiceClient, None] = pydantic.PrivateAttr(default=None) @@ -64,8 +70,8 @@ def _get_azure_client(self) -> BlobServiceClient: # Thanks to schema validation, we are guaranteed to have one of `conn_str` or `account_url` to # use in authentication (but not both). If the format or content of the provided keys is invalid, # the assignment of `self._account_name` and `self._azure_client` will fail and an error will be raised. - conn_str: str | None = self.azure_options.get("conn_str") - account_url: str | None = self.azure_options.get("account_url") + conn_str: ConfigStr | str | None = self.azure_options.get("conn_str") + account_url: ConfigStr | str | None = self.azure_options.get("account_url") if not bool(conn_str) ^ bool(account_url): raise SparkAzureBlobStorageDatasourceError( "You must provide one of `conn_str` or `account_url` to the `azure_options` key in your config (but not both)" @@ -75,16 +81,21 @@ def _get_azure_client(self) -> BlobServiceClient: if ABS_IMPORTED: try: if conn_str is not None: - self._account_name = re.search( # type: ignore[union-attr] - r".*?AccountName=(.+?);.*?", conn_str - ).group(1) + self._account_name = re.search( # type: ignore[union-attr] # re.search could return None + r".*?AccountName=(.+?);.*?", str(conn_str) + ).group( + 1 + ) azure_client = BlobServiceClient.from_connection_string( **self.azure_options ) elif account_url is not None: - self._account_name = re.search( # type: ignore[union-attr] - r"(?:https?://)?(.+?).blob.core.windows.net", account_url - ).group(1) + self._account_name = re.search( # type: ignore[union-attr] # re.search could return None + r"(?:https?://)?(.+?).blob.core.windows.net", + str(account_url), + ).group( + 1 + ) azure_client = BlobServiceClient(**self.azure_options) except Exception as e: # Failure to create "azure_client" is most likely due invalid "azure_options" dictionary. @@ -121,60 +132,44 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in self.assets.values(): asset.test_connection() - def add_csv_asset( + def _build_data_connector( self, - name: str, - batching_regex: Union[re.Pattern, str], - container: str, - header: bool = False, - infer_schema: bool = False, - name_starts_with: str = "", - delimiter: str = "/", - order_by: Optional[SortersDefinition] = None, - batch_metadata: Optional[BatchMetadata] = None, - ) -> CSVAsset: - """Adds a CSV DataAsset to the present "SparkAzureBlobStorageDatasource" object. 
+ data_asset: CSVAsset, + abs_container: str = _MISSING, # type: ignore[assignment] # _MISSING is used as sentinel value + abs_name_starts_with: str = "", + abs_delimiter: str = "/", + **kwargs, + ) -> None: + """Builds and attaches the `AzureBlobStorageDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" + ) + if abs_container is _MISSING: + raise TypeError( + f"'{data_asset.name}' is missing required argument 'abs_container'" + ) - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches csv filenames that is used to label the batches - container: container name for Microsoft Azure Blob Storage - header: boolean (default False) indicating whether or not first line of CSV file is header line - infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically - name_starts_with: Microsoft Azure Blob Storage object name prefix - delimiter: Microsoft Azure Blob Storage object name delimiter - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any - batches created from the asset. - """ - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern - header=header, - inferSchema=infer_schema, - order_by=order_by_sorters, - batch_metadata=batch_metadata or {}, - ) - asset._data_connector = AzureBlobStorageDataConnector.build_data_connector( + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, + data_asset_name=data_asset.name, azure_client=self._get_azure_client(), - batching_regex=asset.batching_regex, + batching_regex=data_asset.batching_regex, account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, - file_path_template_map_fn=AzureUrl.AZURE_BLOB_STORAGE_WASBS_URL_TEMPLATE.format, + container=abs_container, + name_starts_with=abs_name_starts_with, + delimiter=abs_delimiter, + file_path_template_map_fn=AzureUrl.AZURE_BLOB_STORAGE_HTTPS_URL_TEMPLATE.format, ) - asset._test_connection_error_message = ( - AzureBlobStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=asset.batching_regex, + + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, account_name=self._account_name, - container=container, - name_starts_with=name_starts_with, - delimiter=delimiter, + container=abs_container, + name_starts_with=abs_name_starts_with, + delimiter=abs_delimiter, ) ) - return self._add_asset(asset=asset) diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi new file mode 100644 index 000000000000..de0167d5f6ca --- /dev/null +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi @@ -0,0 +1,58 @@ +from __future__ import annotations + +import re +from logging import Logger +from typing import TYPE_CHECKING, Any, ClassVar, Optional, Type 
+ +from typing_extensions import Literal + +from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent.config_str import ( + ConfigStr, # noqa: TCH001 # needed at runtime +) +from great_expectations.datasource.fluent.data_asset.data_connector import ( + S3DataConnector, +) +from great_expectations.datasource.fluent.interfaces import ( + SortersDefinition, +) +from great_expectations.datasource.fluent.spark_datasource import ( + SparkDatasourceError, +) + +if TYPE_CHECKING: + from azure.storage.blob import BlobServiceClient + + from great_expectations.datasource.fluent.interfaces import BatchMetadata + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, + ) + +logger: Logger + +class SparkAzureBlobStorageDatasourceError(SparkDatasourceError): ... + +class SparkAzureBlobStorageDatasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[Type[S3DataConnector]] = S3DataConnector + + # instance attributes + type: Literal["spark_abs"] = "spark_abs" + + # Azure Blob Storage specific attributes + azure_options: dict[str, ConfigStr | Any] = {} + # private + _azure_client: BlobServiceClient | None + def add_csv_asset( + self, + name: str, + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: re.Pattern | str = r".*", + abs_container: str = ..., + abs_name_starts_with: str = "", + abs_delimiter: str = "/", + header: bool = ..., + infer_schema: bool = ..., + order_by: Optional[SortersDefinition] = ..., + ) -> CSVAsset: ... diff --git a/great_expectations/datasource/fluent/spark_dbfs_datasource.py b/great_expectations/datasource/fluent/spark_dbfs_datasource.py index 4a754543f5e3..b62653b12ab7 100644 --- a/great_expectations/datasource/fluent/spark_dbfs_datasource.py +++ b/great_expectations/datasource/fluent/spark_dbfs_datasource.py @@ -1,26 +1,22 @@ from __future__ import annotations import logging -import re -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, ClassVar, Type from typing_extensions import Literal from great_expectations.core._docs_decorators import public_api from great_expectations.core.util import DBFSPath -from great_expectations.datasource.fluent import SparkFilesystemDatasource +from great_expectations.datasource.fluent import ( + SparkFilesystemDatasource, +) from great_expectations.datasource.fluent.data_asset.data_connector import ( DBFSDataConnector, ) -from great_expectations.datasource.fluent.spark_file_path_datasource import ( - CSVAsset, -) if TYPE_CHECKING: - from great_expectations.datasource.fluent.interfaces import ( - BatchMetadata, - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, ) logger = logging.getLogger(__name__) @@ -30,57 +26,37 @@ class SparkDBFSDatasource(SparkFilesystemDatasource): """Spark based Datasource for DataBricks File System (DBFS) based data assets.""" + # class attributes + data_connector_type: ClassVar[Type[DBFSDataConnector]] = DBFSDataConnector + # instance attributes # overridden from base `Literal['spark_filesystem']` type: Literal["spark_dbfs"] = "spark_dbfs" # type: ignore[assignment] # base class has different type - @public_api - def add_csv_asset( - self, - name: str, - batching_regex: Optional[Union[re.Pattern, str]] = None, - glob_directive: str = "**/*", - header: bool = False, - infer_schema: bool = False, - order_by: Optional[SortersDefinition] = None, - batch_metadata: 
Optional[BatchMetadata] = None, - ) -> CSVAsset: - """Adds a CSV DataAsset to the present "SparkDBFSDatasource" object. - - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches csv filenames that is used to label the batches - glob_directive: glob for selecting files in DBFS directory (defaults to `**/*`) or nested directories (e.g. `*/*/*.csv`) - header: boolean (default False) indicating whether or not first line of CSV file is header line - infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any - batches created from the asset. - """ - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern - header=header, - inferSchema=infer_schema, - order_by=order_by_sorters, - batch_metadata=batch_metadata or {}, - ) - asset._data_connector = DBFSDataConnector.build_data_connector( + def _build_data_connector( + self, data_asset: CSVAsset, glob_directive: str = "**/*", **kwargs + ) -> None: + """Builds and attaches the `DBFSDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" + ) + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, - batching_regex=asset.batching_regex, + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, base_directory=self.base_directory, glob_directive=glob_directive, data_context_root_directory=self.data_context_root_directory, file_path_template_map_fn=DBFSPath.convert_to_protocol_version, ) - asset._test_connection_error_message = ( - DBFSDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=asset.batching_regex, + + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, glob_directive=glob_directive, base_directory=self.base_directory, ) ) - return self._add_asset(asset=asset) diff --git a/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi b/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi index 83acba7e502d..af72af1a437f 100644 --- a/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_dbfs_datasource.pyi @@ -1,6 +1,8 @@ +from __future__ import annotations + import re from logging import Logger -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional from typing_extensions import Literal @@ -32,10 +34,11 @@ class SparkDBFSDatasource(SparkFilesystemDatasource): def add_csv_asset( self, name: str, - batching_regex: Optional[Union[re.Pattern, str]] = ..., - glob_directive: str = ..., + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: re.Pattern | str = r".*", + glob_directive: str = "**/*", header: bool = ..., infer_schema: bool = ..., order_by: Optional[SortersDefinition] = ..., - batch_metadata: Optional[BatchMetadata] = ..., ) -> CSVAsset: ... 
diff --git a/great_expectations/datasource/fluent/spark_file_path_datasource.py b/great_expectations/datasource/fluent/spark_file_path_datasource.py index 8d03ce964bf6..2a3a12c2fd9f 100644 --- a/great_expectations/datasource/fluent/spark_file_path_datasource.py +++ b/great_expectations/datasource/fluent/spark_file_path_datasource.py @@ -3,6 +3,8 @@ import logging from typing import TYPE_CHECKING, ClassVar, Dict, List, Type +import pydantic +from pydantic import Field from typing_extensions import Literal from great_expectations.datasource.fluent import _SparkDatasource @@ -20,12 +22,18 @@ class CSVAsset(_FilePathDataAsset): # Overridden inherited instance fields type: Literal["csv"] = "csv" + header: bool = False + infer_schema: bool = Field(False, alias="InferSchema") + + class Config: + extra = pydantic.Extra.forbid + allow_population_by_field_name = True def _get_reader_method(self) -> str: return self.type def _get_reader_options_include(self) -> set[str] | None: - return {"header", "inferSchema"} + return {"header", "infer_schema"} class _SparkFilePathDatasource(_SparkDatasource): diff --git a/great_expectations/datasource/fluent/spark_filesystem_datasource.py b/great_expectations/datasource/fluent/spark_filesystem_datasource.py index 28aec3f39893..3dbb214af08c 100644 --- a/great_expectations/datasource/fluent/spark_filesystem_datasource.py +++ b/great_expectations/datasource/fluent/spark_filesystem_datasource.py @@ -2,34 +2,35 @@ import logging import pathlib -import re -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, ClassVar, Optional, Type from typing_extensions import Literal +from great_expectations.core._docs_decorators import public_api from great_expectations.datasource.fluent import _SparkFilePathDatasource -from great_expectations.datasource.fluent.constants import MATCH_ALL_PATTERN from great_expectations.datasource.fluent.data_asset.data_connector import ( FilesystemDataConnector, ) from great_expectations.datasource.fluent.interfaces import ( - BatchMetadata, TestConnectionError, ) -from great_expectations.datasource.fluent.spark_file_path_datasource import ( - CSVAsset, -) if TYPE_CHECKING: - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, ) logger = logging.getLogger(__name__) +@public_api class SparkFilesystemDatasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[FilesystemDataConnector] + ] = FilesystemDataConnector + # instance attributes type: Literal["spark_filesystem"] = "spark_filesystem" @@ -54,51 +55,29 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in self.assets.values(): asset.test_connection() - def add_csv_asset( - self, - name: str, - batching_regex: Union[str, re.Pattern] = MATCH_ALL_PATTERN, - glob_directive: str = "**/*", - header: bool = False, - infer_schema: bool = False, - order_by: Optional[SortersDefinition] = None, - batch_metadata: Optional[BatchMetadata] = None, - ) -> CSVAsset: - """Adds a CSV DataAsset to the present "SparkFilesystemDatasource" object. - - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches csv filenames that is used to label the batches - glob_directive: glob for selecting files in directory (defaults to `**/*`) or nested directories (e.g. 
`*/*/*.csv`) - header: boolean (default False) indicating whether or not first line of CSV file is header line - infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any - batches created from the asset. - """ - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern - header=header, - inferSchema=infer_schema, - order_by=order_by_sorters, - batch_metadata=batch_metadata or {}, - ) - asset._data_connector = FilesystemDataConnector.build_data_connector( + def _build_data_connector( + self, data_asset: CSVAsset, glob_directive: str = "**/*", **kwargs + ) -> None: + """Builds and attaches the `FilesystemDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" + ) + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, - batching_regex=asset.batching_regex, + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, base_directory=self.base_directory, glob_directive=glob_directive, data_context_root_directory=self.data_context_root_directory, ) - asset._test_connection_error_message = ( - FilesystemDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=asset.batching_regex, + + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, glob_directive=glob_directive, base_directory=self.base_directory, ) ) - return self._add_asset(asset=asset) diff --git a/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi b/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi new file mode 100644 index 000000000000..6c97afb25ada --- /dev/null +++ b/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi @@ -0,0 +1,47 @@ +from __future__ import annotations + +import pathlib +import re +from logging import Logger +from typing import TYPE_CHECKING, ClassVar, Optional, Type + +from typing_extensions import Literal + +from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent.data_asset.data_connector import ( + FilesystemDataConnector, +) +from great_expectations.datasource.fluent.interfaces import ( + SortersDefinition, +) + +if TYPE_CHECKING: + from great_expectations.datasource.fluent import BatchMetadata + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, + ) + +logger: Logger + +class SparkFilesystemDatasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[FilesystemDataConnector] + ] = FilesystemDataConnector + + # instance attributes + type: Literal["spark_filesystem"] = "spark_filesystem" + + base_directory: pathlib.Path + data_context_root_directory: Optional[pathlib.Path] = None + def add_csv_asset( + self, + name: str, + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: re.Pattern | str 
= r".*", + glob_directive: str = "**/*", + header: bool = ..., + infer_schema: bool = ..., + order_by: Optional[SortersDefinition] = ..., + ) -> CSVAsset: ... diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py index 10554f0b18ef..ad5275aa9d92 100644 --- a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py @@ -1,30 +1,28 @@ from __future__ import annotations import logging -import re -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Type, Union import pydantic from typing_extensions import Literal +from great_expectations.core._docs_decorators import public_api from great_expectations.core.util import GCSUrl -from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent import ( + _SparkFilePathDatasource, +) from great_expectations.datasource.fluent.config_str import ( - ConfigStr, # noqa: TCH001 # needed at runtime + ConfigStr, # noqa: TCH001 # needed at runtime # noqa: TCH001 # needed at runtime ) from great_expectations.datasource.fluent.data_asset.data_connector import ( GoogleCloudStorageDataConnector, ) from great_expectations.datasource.fluent.interfaces import ( - BatchMetadata, TestConnectionError, ) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) -from great_expectations.datasource.fluent.spark_file_path_datasource import ( - CSVAsset, -) if TYPE_CHECKING: from google.cloud.storage.client import Client as GoogleCloudStorageClient @@ -32,9 +30,8 @@ Credentials as GoogleServiceAccountCredentials, ) - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, ) @@ -55,7 +52,13 @@ class SparkGoogleCloudStorageDatasourceError(SparkDatasourceError): pass +@public_api class SparkGoogleCloudStorageDatasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[GoogleCloudStorageDataConnector] + ] = GoogleCloudStorageDataConnector + # instance attributes type: Literal["spark_gcs"] = "spark_gcs" @@ -129,59 +132,38 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in self.assets.values(): asset.test_connection() - def add_csv_asset( + def _build_data_connector( self, - name: str, - batching_regex: Union[re.Pattern, str], - header: bool = False, - infer_schema: bool = False, - prefix: str = "", - delimiter: str = "/", - max_results: int = 1000, - order_by: Optional[SortersDefinition] = None, - batch_metadata: Optional[BatchMetadata] = None, - ) -> CSVAsset: - """Adds a CSV DataAsset to the present "SparkGoogleCloudStorageDatasource" object. 
- - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches csv filenames that is used to label the batches - header: boolean (default False) indicating whether or not first line of CSV file is header line - infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically - prefix (str): Google Cloud Storage object name prefix - delimiter (str): Google Cloud Storage object name delimiter - max_results (int): Google Cloud Storage max_results (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any - batches created from the asset. - """ - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern - header=header, - inferSchema=infer_schema, - order_by=order_by_sorters, - batch_metadata=batch_metadata or {}, - ) - asset._data_connector = GoogleCloudStorageDataConnector.build_data_connector( + data_asset: CSVAsset, + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, + **kwargs, + ) -> None: + """Builds and attaches the `GoogleCloudStorageDataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" + ) + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, + data_asset_name=data_asset.name, gcs_client=self._get_gcs_client(), - batching_regex=asset.batching_regex, + batching_regex=data_asset.batching_regex, bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, - max_results=max_results, + prefix=gcs_prefix, + delimiter=gcs_delimiter, + max_results=gcs_max_results, file_path_template_map_fn=GCSUrl.OBJECT_URL_TEMPLATE.format, ) - asset._test_connection_error_message = ( - GoogleCloudStorageDataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=asset.batching_regex, + + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, bucket_or_name=self.bucket_or_name, - prefix=prefix, - delimiter=delimiter, + prefix=gcs_prefix, + delimiter=gcs_delimiter, ) ) - return self._add_asset(asset=asset) diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi new file mode 100644 index 000000000000..87b453d064a9 --- /dev/null +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi @@ -0,0 +1,56 @@ +from __future__ import annotations + +import re +from logging import Logger +from typing import TYPE_CHECKING, Any, ClassVar, Optional, Type + +from typing_extensions import Literal + +from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent.config_str import ( + ConfigStr, # noqa: TCH001 # needed at runtime +) +from great_expectations.datasource.fluent.data_asset.data_connector import ( + GoogleCloudStorageDataConnector, +) +from great_expectations.datasource.fluent.interfaces import ( + 
SortersDefinition, +) + +if TYPE_CHECKING: + from google.cloud.storage.client import Client as GoogleCloudStorageClient + + from great_expectations.datasource.fluent import BatchMetadata + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, + ) + +logger: Logger + +class SparkGoogleCloudStorageDatasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[ + Type[GoogleCloudStorageDataConnector] + ] = GoogleCloudStorageDataConnector + + # instance attributes + type: Literal["spark_gcs"] = "spark_gcs" + + # GCS specific attributes + bucket_or_name: str + gcs_options: dict[str, ConfigStr | Any] = {} + + _gcs_client: GoogleCloudStorageClient | None + def add_csv_asset( + self, + name: str, + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: re.Pattern | str = r".*", + gcs_prefix: str = "", + gcs_delimiter: str = "/", + gcs_max_results: int = 1000, + header: bool = ..., + infer_schema: bool = ..., + order_by: Optional[SortersDefinition] = ..., + ) -> CSVAsset: ... diff --git a/great_expectations/datasource/fluent/spark_s3_datasource.py b/great_expectations/datasource/fluent/spark_s3_datasource.py index 0e308d8d2713..86f48c7e59f6 100644 --- a/great_expectations/datasource/fluent/spark_s3_datasource.py +++ b/great_expectations/datasource/fluent/spark_s3_datasource.py @@ -1,38 +1,32 @@ from __future__ import annotations import logging -import re -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Type, Union import pydantic from typing_extensions import Literal +from great_expectations.core._docs_decorators import public_api from great_expectations.core.util import S3Url from great_expectations.datasource.fluent import _SparkFilePathDatasource from great_expectations.datasource.fluent.config_str import ( ConfigStr, # noqa: TCH001 # needed at runtime ) -from great_expectations.datasource.fluent.constants import MATCH_ALL_PATTERN from great_expectations.datasource.fluent.data_asset.data_connector import ( S3DataConnector, ) from great_expectations.datasource.fluent.interfaces import ( - BatchMetadata, TestConnectionError, ) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) -from great_expectations.datasource.fluent.spark_file_path_datasource import ( - CSVAsset, -) if TYPE_CHECKING: from botocore.client import BaseClient - from great_expectations.datasource.fluent.interfaces import ( - Sorter, - SortersDefinition, + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, ) @@ -52,7 +46,11 @@ class SparkS3DatasourceError(SparkDatasourceError): pass +@public_api class SparkS3Datasource(_SparkFilePathDatasource): + # class attributes + data_connector_type: ClassVar[Type[S3DataConnector]] = S3DataConnector + # instance attributes type: Literal["spark_s3"] = "spark_s3" @@ -104,59 +102,39 @@ def test_connection(self, test_assets: bool = True) -> None: for asset in self.assets.values(): asset.test_connection() - def add_csv_asset( + def _build_data_connector( self, - name: str, - batching_regex: Union[str, re.Pattern] = MATCH_ALL_PATTERN, - header: bool = False, - infer_schema: bool = False, - prefix: str = "", - delimiter: str = "/", - max_keys: int = 1000, - order_by: Optional[SortersDefinition] = None, - batch_metadata: Optional[BatchMetadata] = None, - ) -> CSVAsset: - """Adds a CSV DataAsset to the present "SparkS3Datasource" object. 
+ data_asset: CSVAsset, + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, + **kwargs, + ) -> None: + """Builds and attaches the `S3DataConnector` to the asset.""" + if kwargs: + raise TypeError( + f"_build_data_connector() got unexpected keyword arguments {list(kwargs.keys())}" + ) - Args: - name: The name of the CSV asset - batching_regex: regex pattern that matches CSV filenames that is used to label the batches - header: boolean (default False) indicating whether or not first line of CSV file is header line - infer_schema: boolean (default False) instructing Spark to attempt to infer schema of CSV file heuristically - prefix: S3 prefix - delimiter: S3 delimiter - max_keys: S3 max_keys (default is 1000) - order_by: sorting directive via either list[Sorter] or "+/- key" syntax: +/- (a/de)scending; + default - batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any - batches created from the asset. - """ - order_by_sorters: list[Sorter] = self.parse_order_by_sorters(order_by=order_by) - asset = CSVAsset( - name=name, - batching_regex=batching_regex, # type: ignore[arg-type] # pydantic will compile regex str to Pattern - header=header, - inferSchema=infer_schema, - order_by=order_by_sorters, - batch_metadata=batch_metadata or {}, - ) - asset._data_connector = S3DataConnector.build_data_connector( + data_asset._data_connector = self.data_connector_type.build_data_connector( datasource_name=self.name, - data_asset_name=name, + data_asset_name=data_asset.name, s3_client=self._get_s3_client(), - batching_regex=asset.batching_regex, + batching_regex=data_asset.batching_regex, bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, - max_keys=max_keys, + prefix=s3_prefix, + delimiter=s3_delimiter, + max_keys=s3_max_keys, file_path_template_map_fn=S3Url.OBJECT_URL_TEMPLATE.format, ) - asset._test_connection_error_message = ( - S3DataConnector.build_test_connection_error_message( - data_asset_name=name, - batching_regex=asset.batching_regex, + + # build a more specific `_test_connection_error_message` + data_asset._test_connection_error_message = ( + self.data_connector_type.build_test_connection_error_message( + data_asset_name=data_asset.name, + batching_regex=data_asset.batching_regex, bucket=self.bucket, - prefix=prefix, - delimiter=delimiter, + prefix=s3_prefix, + delimiter=s3_delimiter, ) ) - return self._add_asset(asset=asset) diff --git a/great_expectations/datasource/fluent/spark_s3_datasource.pyi b/great_expectations/datasource/fluent/spark_s3_datasource.pyi new file mode 100644 index 000000000000..53d22c44fd95 --- /dev/null +++ b/great_expectations/datasource/fluent/spark_s3_datasource.pyi @@ -0,0 +1,50 @@ +from __future__ import annotations + +import re +from logging import Logger +from typing import TYPE_CHECKING, Any, ClassVar, Optional, Type + +from typing_extensions import Literal + +from great_expectations.datasource.fluent import _SparkFilePathDatasource +from great_expectations.datasource.fluent.config_str import ( + ConfigStr, # noqa: TCH001 # needed at runtime +) +from great_expectations.datasource.fluent.data_asset.data_connector import ( + S3DataConnector, +) +from great_expectations.datasource.fluent.interfaces import ( + SortersDefinition, +) + +if TYPE_CHECKING: + from great_expectations.datasource.fluent import BatchMetadata + from great_expectations.datasource.fluent.spark_file_path_datasource import ( + CSVAsset, + ) + +logger: Logger + +class SparkS3Datasource(_SparkFilePathDatasource): + # 
class attributes + data_connector_type: ClassVar[Type[S3DataConnector]] = S3DataConnector + + # instance attributes + type: Literal["spark_s3"] = "spark_s3" + + # S3 specific attributes + bucket: str + boto3_options: dict[str, ConfigStr | Any] = {} + def add_csv_asset( + self, + name: str, + *, + batch_metadata: Optional[BatchMetadata] = ..., + batching_regex: re.Pattern | str = r".*", + s3_prefix: str = "", + s3_delimiter: str = "/", + s3_max_keys: int = 1000, + header: bool = ..., + infer_schema: bool = ..., + order_by: Optional[SortersDefinition] = ..., + ) -> CSVAsset: ... diff --git a/pyproject.toml b/pyproject.toml index 1a3dd875dc7e..707685999703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -196,8 +196,6 @@ exclude = [ 'validator/validator\.py', # 54 # tests 'tests/datasource/fluent/tasks\.py', - 'tests/datasource/fluent/integration/integration_test_utils\.py', - 'tests/datasource/fluent/test_spark_datasource\.py', ] [tool.pydantic-mypy] diff --git a/tests/datasource/fluent/conftest.py b/tests/datasource/fluent/conftest.py index f9fd16f2803c..24b4d3b808d1 100644 --- a/tests/datasource/fluent/conftest.py +++ b/tests/datasource/fluent/conftest.py @@ -18,6 +18,8 @@ from great_expectations.datasource.fluent import ( PandasAzureBlobStorageDatasource, PandasGoogleCloudStorageDatasource, + SparkAzureBlobStorageDatasource, + SparkGoogleCloudStorageDatasource, ) from great_expectations.datasource.fluent.interfaces import Datasource from great_expectations.datasource.fluent.sources import _SourceFactories @@ -161,6 +163,12 @@ def gcs_get_client_dummy(monkeypatch: MonkeyPatch): _get_test_client_dummy, raising=True, ) + monkeypatch.setattr( + SparkGoogleCloudStorageDatasource, + "_get_gcs_client", + _get_test_client_dummy, + raising=True, + ) @pytest.fixture @@ -171,6 +179,12 @@ def azure_get_client_dummy(monkeypatch: MonkeyPatch): _get_test_client_dummy, raising=True, ) + monkeypatch.setattr( + SparkAzureBlobStorageDatasource, + "_get_azure_client", + _get_test_client_dummy, + raising=True, + ) @pytest.fixture @@ -184,7 +198,6 @@ def cloud_storage_get_client_doubles( gcs azure """ - # TODO: patch Spark datasources as needed logger.warning( "Patching cloud storage _get_*_client() methods to return client test doubles" ) diff --git a/tests/datasource/fluent/great_expectations.yml b/tests/datasource/fluent/great_expectations.yml index a8b113c6f379..535669153f82 100644 --- a/tests/datasource/fluent/great_expectations.yml +++ b/tests/datasource/fluent/great_expectations.yml @@ -98,3 +98,52 @@ fluent_datasources: delimiter: "," connect_options: abs_container: "this_is_always_required" + my_spark_fs_ds: + type: spark_filesystem + base_directory: "." 
+ assets: + my_csv_asset_w_custom_connect_options: + type: csv + InferSchema: True + connect_options: + glob_directive: "**/*.csv" + my_csv_asset_with_default_connect_options: + type: csv + infer_schema: False + my_spark_s3_ds: + type: spark_s3 + bucket: "test_bucket" + assets: + my_csv_asset_w_custom_connect_options: + type: csv + connect_options: + s3_delimiter: "/" + s3_prefix: "" + s3_max_keys: 99 + my_csv_asset_with_default_connect_options: + type: csv + my_spark_gcs_ds: + type: spark_gcs + bucket_or_name: "test_bucket" + assets: + my_csv_asset_w_custom_connect_options: + type: csv + connect_options: + gcs_delimiter: "/" + gcs_prefix: "" + gcs_max_results: 99 + my_csv_asset_with_default_connect_options: + type: csv + my_spark_abs_ds: + type: spark_abs + assets: + my_csv_asset_w_custom_connect_options: + type: csv + connect_options: + abs_container: "test" + abs_name_starts_with: "" + abs_delimiter: "/" + my_csv_asset_with_default_connect_options: + type: csv + connect_options: + abs_container: "this_is_always_required" diff --git a/tests/datasource/fluent/integration/integration_test_utils.py b/tests/datasource/fluent/integration/integration_test_utils.py index acac6edb701a..2e64b099bcbc 100644 --- a/tests/datasource/fluent/integration/integration_test_utils.py +++ b/tests/datasource/fluent/integration/integration_test_utils.py @@ -52,7 +52,7 @@ def run_checkpoint_and_data_doc( context.add_expectation_suite(expectation_suite_name=suite_name) # noinspection PyTypeChecker validator = context.get_validator( - batch_request=batch_request, + batch_request=batch_request, # type: ignore[arg-type] # expected BatchRequestBase got BatchRequest expectation_suite_name=suite_name, ) validator.expect_table_row_count_to_be_between(0, 10000) @@ -71,7 +71,7 @@ def run_checkpoint_and_data_doc( {"batch_request": batch_request, "expectation_suite_name": suite_name} ], } - metadata = validator.active_batch.metadata + metadata = validator.active_batch.metadata # type: ignore[union-attr] # active_batch could be None if isinstance(datasource, PandasDatasource): checkpoint_name = "single_batch_checkpoint" else: @@ -81,7 +81,7 @@ def run_checkpoint_and_data_doc( checkpoint = SimpleCheckpoint( checkpoint_name, context, - **checkpoint_config, + **checkpoint_config, # type: ignore[arg-type] ) checkpoint_result = checkpoint.run() @@ -179,13 +179,13 @@ def run_data_assistant_and_checkpoint( context, batch_request ) batch_num = len( - data_assistant_result._batch_id_to_batch_identifier_display_name_map + data_assistant_result._batch_id_to_batch_identifier_display_name_map # type: ignore[arg-type] # could be None ) assert batch_num == 1, f"Only expected 1 batch but found {batch_num}" # We assert the data assistant successfully generated expectations. # We don't care about the exact number since that may change as data assistants evolve. 
- expectation_num = len(data_assistant_result.expectation_configurations) + expectation_num = len(data_assistant_result.expectation_configurations) # type: ignore[arg-type] # could be None assert checkpoint_result.success, "Running expectation suite failed" # Verify that the number of checkpoint validations is the number of expectations generated by the data assistant assert ( @@ -209,11 +209,11 @@ def run_multibatch_data_assistant_and_checkpoint( ) # Assert multiple batches were processed batch_num = len( - data_assistant_result._batch_id_to_batch_identifier_display_name_map + data_assistant_result._batch_id_to_batch_identifier_display_name_map # type: ignore[arg-type] # could be None ) assert batch_num == 12, f"Expected exactly 12 batches but found {batch_num}" - expectation_num = len(data_assistant_result.expectation_configurations) + expectation_num = len(data_assistant_result.expectation_configurations) # type: ignore[arg-type] # could be None """ Exact number of "ExpectationConfiguration" objects, emitted by "DataAssistant" implementation depends on "Batch" schema (including, significantly, column types). Different "ExecutionEngine" backends (even different versions @@ -225,7 +225,7 @@ def run_multibatch_data_assistant_and_checkpoint( while 111 "ExpectationConfiguration" objects for other PySpark versions. Using "SqlAlchemyExecutionEngine" yields 111 "ExpectationConfiguration" objects for all SQLAlchemy environments in test suite. Hence, assertion is ">= 109". """ - assert expectation_num >= 109 + assert expectation_num >= 109, f"{expectation_num} >= 109" assert checkpoint_result.success, "Running expectation suite failed" # Verify that the number of checkpoint validations is the number of expectations generated by the data assistant @@ -260,7 +260,7 @@ def run_batch_head( assert isinstance(fetch_all, bool) execution_engine: ExecutionEngine = batch.data.execution_engine - execution_engine.batch_manager.load_batch_list(batch_list=[batch]) + execution_engine.batch_manager.load_batch_list(batch_list=[batch]) # type: ignore[list-item] # expect core.Batch got interfaces.Batch metrics: Dict[Tuple[str, str, str], MetricValue] = {} @@ -321,11 +321,11 @@ def run_batch_head( assert isinstance(head_data, HeadData) assert len(head_data.data.index) == 5 - assert set(metrics[table_columns_metric.id]) == expected_columns + assert set(metrics[table_columns_metric.id]) == expected_columns # type: ignore[arg-type] else: with pytest.raises(ValidationError) as e: - batch.head(n_rows=n_rows, fetch_all=fetch_all) + batch.head(n_rows=n_rows, fetch_all=fetch_all) # type: ignore[arg-type] n_rows_validation_error = ( "1 validation error for Head\n" "n_rows\n" @@ -347,7 +347,7 @@ def _configure_and_run_data_assistant( ) -> tuple[DataAssistantResult, CheckpointResult]: expectation_suite_name = "my_onboarding_assistant_suite" context.add_expectation_suite(expectation_suite_name=expectation_suite_name) - data_assistant_result = context.assistants.onboarding.run( + data_assistant_result = context.assistants.onboarding.run( # type: ignore[attr-defined] # no attribute .run batch_request=batch_request, numeric_columns_rule={ "estimator": "exact", @@ -375,7 +375,7 @@ def _configure_and_run_data_assistant( checkpoint = SimpleCheckpoint( f"yellow_tripdata_sample_{expectation_suite_name}", context, - **checkpoint_config, + **checkpoint_config, # type: ignore[arg-type] ) checkpoint_result = checkpoint.run() diff --git a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py 
b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py index 2e8a3c747db9..d4b140d4fba0 100644 --- a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py @@ -109,7 +109,7 @@ def csv_asset( asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) return asset @@ -241,7 +241,7 @@ def test_add_csv_asset_to_datasource( asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", - container="my_container", + abs_container="my_container", batch_metadata=asset_specified_metadata, ) assert asset.name == "csv_asset" @@ -295,7 +295,7 @@ def test_csv_asset_with_batching_regex_unnamed_parameters( asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(\d{4})\.csv", - container="my_container", + abs_container="my_container", ) options = asset.batch_request_options assert options == ( @@ -325,7 +325,7 @@ def test_csv_asset_with_batching_regex_named_parameters( asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) options = asset.batch_request_options assert options == ( @@ -355,7 +355,7 @@ def test_csv_asset_with_some_batching_regex_named_parameters( asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) options = asset.batch_request_options assert options == ( @@ -385,7 +385,7 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(.+)_(.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", ) with pytest.raises(ge_exceptions.InvalidBatchRequestError): # price is an int which will raise an error @@ -421,7 +421,7 @@ def instantiate_azure_client_spy(self) -> None: asset = spark_abs_datasource.add_csv_asset( name="csv_asset", batching_regex=r"(?P.+)_(?P.+)_(?P\d{4})\.csv", - container="my_container", + abs_container="my_container", batch_metadata=asset_specified_metadata, ) From 280242f66235a5448b26de4d09ffa447d5509472 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Thu, 6 Apr 2023 22:19:26 -0400 Subject: [PATCH 56/96] [MAINTENANCE] Warning MetaData.bind argument deprecated for SQLAlchemy 2.0 Compatibility (#7502) --- .../compatibility/sqlalchemy_and_pandas.py | 72 ++++++++++++++++++- .../map_condition_auxilliary_methods.py | 3 +- .../metrics/table_metrics/table_head.py | 12 ++-- pyproject.toml | 3 - tests/expectations/metrics/test_map_metric.py | 6 +- 5 files changed, 86 insertions(+), 10 deletions(-) diff --git a/great_expectations/compatibility/sqlalchemy_and_pandas.py b/great_expectations/compatibility/sqlalchemy_and_pandas.py index 3fa80da0bb5b..014b18720ab6 100644 --- a/great_expectations/compatibility/sqlalchemy_and_pandas.py +++ b/great_expectations/compatibility/sqlalchemy_and_pandas.py @@ -20,7 +20,7 @@ def execute_pandas_reader_fn( ) -> pd.DataFrame | list[pd.DataFrame]: """Suppress warnings while executing the pandas reader functions. - If pandas version is below 2.0 and sqlalchemy installed then suppress + If pandas version is below 2.0 and sqlalchemy installed then we suppress the sqlalchemy 2.0 warning and raise our own warning. 
pandas does not support sqlalchemy 2.0 until version 2.0 (see https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#other-enhancements) @@ -45,3 +45,73 @@ def execute_pandas_reader_fn( else: reader_fn_result = reader_fn(**reader_options) return reader_fn_result + + +def pandas_read_sql(sql, con, **kwargs) -> pd.DataFrame: + """Suppress deprecation warnings while executing the pandas read_sql function. + + Note this only passes params straight to pandas read_sql method, please + see the pandas documentation + (currently https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html) + for more information on this method. + + If pandas version is below 2.0 and sqlalchemy installed then we suppress + the sqlalchemy 2.0 warning and raise our own warning. pandas does not + support sqlalchemy 2.0 until version 2.0 (see https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#other-enhancements) + + Args: + sql: str or SQLAlchemy Selectable (select or text object) + con: SQLAlchemy connectable, str, or sqlite3 connection + **kwargs: Other keyword arguments, not enumerated here since they differ + between pandas versions. + + Returns: + dataframe + """ + if is_version_less_than(pd.__version__, "2.0.0"): + if sqlalchemy and is_version_greater_or_equal(sqlalchemy.__version__, "2.0.0"): + warn_pandas_less_than_2_0_and_sqlalchemy_greater_than_or_equal_2_0() + with warnings.catch_warnings(): + # Note that RemovedIn20Warning is the warning class that we see from sqlalchemy + # but using the base class here since sqlalchemy is an optional dependency and this + # warning type only exists in sqlalchemy < 2.0. + warnings.filterwarnings(action="ignore", category=DeprecationWarning) + return_value = pd.read_sql(sql=sql, con=con, **kwargs) + else: + return_value = pd.read_sql(sql=sql, con=con, **kwargs) + return return_value + + +def pandas_read_sql_query(sql, con, **kwargs) -> pd.DataFrame: + """Suppress deprecation warnings while executing the pandas read_sql_query function. + + Note this only passes params straight to pandas read_sql_query method, please + see the pandas documentation + (currently https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html) + for more information on this method. + + If pandas version is below 2.0 and sqlalchemy installed then we suppress + the sqlalchemy 2.0 warning and raise our own warning. pandas does not + support sqlalchemy 2.0 until version 2.0 (see https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#other-enhancements) + + Args: + sql: str or SQLAlchemy Selectable (select or text object) + con: SQLAlchemy connectable, str, or sqlite3 connection + **kwargs: Other keyword arguments, not enumerated here since they differ + between pandas versions. + + Returns: + dataframe + """ + if is_version_less_than(pd.__version__, "2.0.0"): + if sqlalchemy and is_version_greater_or_equal(sqlalchemy.__version__, "2.0.0"): + warn_pandas_less_than_2_0_and_sqlalchemy_greater_than_or_equal_2_0() + with warnings.catch_warnings(): + # Note that RemovedIn20Warning is the warning class that we see from sqlalchemy + # but using the base class here since sqlalchemy is an optional dependency and this + # warning type only exists in sqlalchemy < 2.0. 
+ warnings.filterwarnings(action="ignore", category=DeprecationWarning) + return_value = pd.read_sql_query(sql=sql, con=con, **kwargs) + else: + return_value = pd.read_sql_query(sql=sql, con=con, **kwargs) + return return_value diff --git a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py index 11420fd0808f..fe32406bcffe 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py @@ -334,7 +334,8 @@ def _sqlalchemy_map_condition_unexpected_count_value( ) with execution_engine.engine.begin(): - metadata: sa.MetaData = sa.MetaData(execution_engine.engine) + metadata: sa.MetaData = sa.MetaData() + metadata.reflect(bind=execution_engine.engine) temp_table_obj: sa.Table = sa.Table( temp_table_name, metadata, diff --git a/great_expectations/expectations/metrics/table_metrics/table_head.py b/great_expectations/expectations/metrics/table_metrics/table_head.py index 0a2ce1880a7f..f55e4227f6b4 100644 --- a/great_expectations/expectations/metrics/table_metrics/table_head.py +++ b/great_expectations/expectations/metrics/table_metrics/table_head.py @@ -4,6 +4,10 @@ import pandas as pd +from great_expectations.compatibility.sqlalchemy_and_pandas import ( + pandas_read_sql, + pandas_read_sql_query, +) from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( read_sql_table_as_df, ) @@ -78,13 +82,13 @@ def _sqlalchemy( # noqa: C901 - 16 # if a custom query was passed try: if metric_value_kwargs["fetch_all"]: - df = pd.read_sql_query( + df = pandas_read_sql_query( sql=selectable, con=execution_engine.engine, ) else: # passing chunksize causes the Iterator to be returned - df_chunk_iterator = pd.read_sql_query( + df_chunk_iterator = pandas_read_sql_query( sql=selectable, con=execution_engine.engine, chunksize=abs(n_rows), @@ -164,14 +168,14 @@ def _sqlalchemy( # noqa: C901 - 16 # if read_sql_query or read_sql_table failed, we try to use the read_sql convenience method if n_rows <= 0 and not fetch_all: - df_chunk_iterator = pd.read_sql( + df_chunk_iterator = pandas_read_sql( sql=sql, con=execution_engine.engine, chunksize=abs(n_rows) ) df = TableHead._get_head_df_from_df_iterator( df_chunk_iterator=df_chunk_iterator, n_rows=n_rows ) else: - df = pd.read_sql(sql=sql, con=execution_engine.engine) + df = pandas_read_sql(sql=sql, con=execution_engine.engine) return df diff --git a/pyproject.toml b/pyproject.toml index 707685999703..a12b0e00df3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -464,9 +464,6 @@ filterwarnings = [ # SQLAlchemy 2.x support warnings. These warnings should be ignored until sqlalchemy 2.x is fully supported. # To get SQLAlchemy 2.x supported, remove one of these ignores and then fix the resulting errors. 'ignore: The Engine.execute\(\) method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0. All statement execution in SQLAlchemy 2.0 is performed by the Connection.execute\(\) method of Connection, or in the ORM by the Session.execute\(\) method of Session.:DeprecationWarning', - # Example Actual Warning: Found by running setup of test_validate_dataset[sqlite] - # sqlalchemy.exc.RemovedIn20Warning: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0. 
(Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - 'ignore: The MetaData.bind argument is deprecated and will be removed in SQLAlchemy 2.0.:DeprecationWarning', # Example Actual Warning: sqlalchemy.exc.RemovedIn20Warning: Deprecated API features detected! These feature(s) are not compatible with SQLAlchemy 2.0. To prevent incompatible upgrades prior to updating applications, ensure requirements files are pinned to "sqlalchemy<2.0". Set environment variable SQLALCHEMY_WARN_20=1 to show all deprecation warnings. Set environment variable SQLALCHEMY_SILENCE_UBER_WARNING=1 to silence this message. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) # Found so far in test_cli_datasource_list 'ignore: Deprecated API features detected!:DeprecationWarning', diff --git a/tests/expectations/metrics/test_map_metric.py b/tests/expectations/metrics/test_map_metric.py index 57c37c8b76b0..ba1257edf1ed 100644 --- a/tests/expectations/metrics/test_map_metric.py +++ b/tests/expectations/metrics/test_map_metric.py @@ -1,6 +1,9 @@ import pandas as pd import pytest +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core import ( ExpectationConfiguration, ExpectationValidationResult, @@ -66,7 +69,8 @@ def sqlite_table_for_unexpected_rows_with_index( # use try-except block to ensure we don't keep modifying the database # adapted from https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html try: - df.to_sql( + add_dataframe_to_db( + df=df, name="animal_names", con=sqlite_engine, index=False, From 583de5a9721f6fbe3633d6601f625be26e5e5a8c Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Fri, 7 Apr 2023 09:05:55 -0500 Subject: [PATCH 57/96] [DOCS] Adds deprecation policy to changelog page (#7585) --- docs/docusaurus/docs/changelog.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/docusaurus/docs/changelog.md b/docs/docusaurus/docs/changelog.md index 9602c6d9dfe7..7b73b2a150f6 100644 --- a/docs/docusaurus/docs/changelog.md +++ b/docs/docusaurus/docs/changelog.md @@ -2,6 +2,14 @@ title: Changelog --- +### Deprecation policy + +- Deprecation warnings and the supporting code are maintained for two minor versions. For example, v0.12 deprecations will only be removed as part of a v0.15 release. + - This means we have three supported minor versions in the release at any time. For example: in v0.15 we support v0.15, v0.14, and v0.13. When v0.16 ships we will support v0.16, v0.15, and v0.14 and will remove support for v0.13. +- Deprecation warnings include (in the warning message) the version that they were introduced. For example: "deprecated as of v0.13" +- Deprecation warnings are accompanied by a moniker (as a code comment) indicating when they were deprecated. For example: `# deprecated-v0.13` +- Changes to methods and parameters due to deprecation are also noted in the relevant docstrings. 
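To make the warning-message and moniker conventions above concrete, here is a minimal, hypothetical sketch of a deprecation that follows this policy; the function name and version numbers are invented for illustration and are not taken from the Great Expectations codebase:

```python
import warnings


def old_helper() -> None:
    """Do something useful.

    .. deprecated:: 0.13
        ``old_helper`` will be removed in v0.16; use ``new_helper`` instead.
    """
    # deprecated-v0.13
    warnings.warn(
        "old_helper is deprecated as of v0.13 and will be removed in a "
        "future release; use new_helper instead.",
        DeprecationWarning,
    )
```

Per the policy, the message states the version in which the deprecation was introduced, the `# deprecated-v0.13` comment marks the code for removal two minor versions later, and the docstring records the change.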
+ ### 0.16.6 * [FEATURE] Fluent `DataAsset` `batch_metadata` config variables ([#7513](https://github.com/great-expectations/great_expectations/pull/7513)) * [FEATURE] Add batch metadata to spark add_*_asset methods ([#7534](https://github.com/great-expectations/great_expectations/pull/7534)) From 4728c1525e856b7a088688b5a392b16dd5cc5f79 Mon Sep 17 00:00:00 2001 From: Tobias Bruckert <62531735+tb102122@users.noreply.github.com> Date: Sat, 8 Apr 2023 00:23:51 +1000 Subject: [PATCH 58/96] [CONTRIB] add check to calculate difference between 2 dates in month (#7576) Co-authored-by: Anthony Burdi --- ...lticolumn_datetime_difference_in_months.py | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py new file mode 100644 index 000000000000..2351d81fa8cc --- /dev/null +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py @@ -0,0 +1,232 @@ +from typing import Optional + +from pandas import to_datetime + +from great_expectations.core.expectation_configuration import ExpectationConfiguration +from great_expectations.exceptions import InvalidExpectationConfigurationError +from great_expectations.execution_engine import ( + PandasExecutionEngine, +) +from great_expectations.expectations.expectation import MulticolumnMapExpectation +from great_expectations.expectations.metrics.map_metric_provider import ( + MulticolumnMapMetricProvider, + multicolumn_condition_partial, +) + + +# This class defines a Metric to support your Expectation. 
+class ColumnDatetimeDifferenceInMonths(MulticolumnMapMetricProvider): + + condition_metric_name = "multicolumn_values.column_datetime_difference_in_months" + # These point your metric at the provided keys to facilitate calculation + condition_domain_keys = ( + "batch_id", + "table", + "column_list", + "row_condition", + "condition_parser", + "ignore_row_if", + ) + condition_value_keys = ( + "start_datetime", + "end_datetime", + "gap", + ) + + # @multicolumn_condition_partial(engine=SqlAlchemyExecutionEngine) + # def _sqlalchemy(cls, dataframe, start_datetime, end_datetime, gap, **kwargs): + # print(dataframe) + + # def date_diff_in_months(df): + # col_start = column(start_datetime) + # col_end = column(end_datetime) + # col_gap = column(gap) + # if col_start is None or col_end is None or col_gap is None: + # return None + + # # Calculate the difference in months between the start_datetime and end_datetime columns + # diff_months = cast( + # (func.strftime("%Y", col_end) - func.strftime("%Y", col_start)) * 12 + # + (func.strftime("%m", col_end) - func.strftime("%m", col_start)), + # Integer, + # ) + # return col_gap == diff_months + + # return date_diff_in_months(dataframe) + + # @multicolumn_condition_partial(engine=SparkDFExecutionEngine) + # def _spark(cls, dataframe, start_datetime, end_datetime, gap, **kwargs): + # def date_diff_in_months(row): + # col_start = row(start_datetime) + # col_end = row[end_datetime] + # col_gap = row[gap] + + # if col_start is None or col_end is None or col_gap is None: + # return None + + # # Convert start_datetime and end_datetime columns to date format + # col_start = to_date(col_start) + # col_end = to_date(col_end) + + # # Calculate the difference in months between the start_datetime and end_datetime columns + # diff_months = floor(months_between(col_end, col_start)) + + # return col(col_gap) == diff_months + + # return functools.reduce(operator.add, date_diff_in_months(col("*"))) + + @multicolumn_condition_partial(engine=PandasExecutionEngine) + def _pandas(cls, dataframe, start_datetime, end_datetime, gap, **kwargs): + def date_diff_in_months(row): + col_start = to_datetime(row[start_datetime]) + col_end = to_datetime(row[end_datetime]) + col_gap = row[gap] + if col_start is None or col_end is None or col_gap is None: + return None + + diff_months = (col_end.year - col_start.year) * 12 + ( + col_end.month - col_start.month + ) + return col_gap == diff_months + + return dataframe.apply(lambda row: date_diff_in_months(row), axis=1) + + +# This class defines the Expectation itself +class ExpectColumnDatetimeDifferenceInMonths(MulticolumnMapExpectation): + + """Expect the difference of 2 datetime columns is equal to another column in month. + + This means that for each row, we expect end_datetime - start_datetime = gap (in months) + + Args: + + start_datetime (datetime): The first datetime column to compare. + end_datetime (datetime): The second datetime column to compare. + gap (int): The number of months that the difference between start_datetime and end_datetime should be. 
+ + """ + + examples = [ + { + "dataset_name": "test", + "data": { + "start_datetime": [ + "2022-03-22 10:00:00", + "2022-03-22 10:00:00", + "2022-03-22 10:00:00", + ], + "end_datetime": [ + "2022-04-22 11:00:00", + "2022-04-22 11:00:00", + "2022-04-22 11:00:00", + ], + "gap_pass": [1, 1, 1], + "gap_fail": [0, 1, 1], + }, + "tests": [ + { + "title": "passed test cases", + "exact_match_out": False, + "include_in_gallery": True, + "in": { + "column_list": ["start_datetime", "end_datetime", "gap_pass"], + "start_datetime": "start_datetime", + "end_datetime": "end_datetime", + "gap": "gap_pass", + }, + "out": { + "success": True, + }, + }, + { + "title": "failed test cases", + "exact_match_out": False, + "include_in_gallery": True, + "in": { + "column_list": ["start_datetime", "end_datetime", "gap_fail"], + "start_datetime": "start_datetime", + "end_datetime": "end_datetime", + "gap": "gap_fail", + }, + "out": { + "success": False, + }, + }, + ], + "test_backends": [ + { + "backend": "pandas", + "dialects": None, + }, + # { + # "backend": "sqlalchemy", + # "dialects": ["sqlite"], # , "postgresql" + # }, + # { + # "backend": "spark", + # "dialects": None, + # }, + ], + }, + ] + + # This is the id string of the Metric used by this Expectation. + map_metric = "multicolumn_values.column_datetime_difference_in_months" + + # This is a list of parameter names that can affect whether the Expectation evaluates to True or False + success_keys = ("column_list", "start_datetime", "end_datetime", "gap") + # This dictionary contains default values for any parameters that should have default values + default_kwarg_values = { + "result_format": "BASIC", + "include_config": True, + "catch_exceptions": False, + "base": 2, + } + + def validate_configuration( + self, configuration: Optional[ExpectationConfiguration] + ) -> None: + """ + Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that + necessary configuration arguments have been provided for the validation of the expectation. + Args: + configuration (OPTIONAL[ExpectationConfiguration]): \ + An optional Expectation Configuration entry that will be used to configure the expectation + Returns: + None. 
Raises InvalidExpectationConfigurationError if the config is not validated successfully + """ + + super().validate_configuration(configuration) + configuration = configuration or self.configuration + + start_datetime = configuration.kwargs["start_datetime"] + end_datetime = configuration.kwargs["end_datetime"] + gap = configuration.kwargs["gap"] + column_list = configuration.kwargs["column_list"] + # # Check other things in configuration.kwargs and raise Exceptions if needed + try: + # parameter cannot be less than zero, + assert start_datetime is None or isinstance(start_datetime, str) + assert end_datetime is None or isinstance(end_datetime, str) + assert gap is None or isinstance(gap, str) + assert gap in column_list + assert start_datetime in column_list + assert end_datetime in column_list + + except AssertionError as e: + raise InvalidExpectationConfigurationError(str(e)) + + # This object contains metadata for display in the public Gallery + library_metadata = { + "maturity": "beta", # "experimental", "beta", or "production" + "tags": [ + "multi-column expectation", + "multi-column column datetime difference in months", + ], + "contributors": ["@tb102122"], + } + + +if __name__ == "__main__": + ExpectColumnDatetimeDifferenceInMonths().print_diagnostic_checklist() From 0d0b98ee99e83e27417f2b58186528d6bbf7d6b1 Mon Sep 17 00:00:00 2001 From: Nathan Farmer Date: Fri, 7 Apr 2023 17:10:09 -0400 Subject: [PATCH 59/96] [MAINTENANCE] Capitalize "If" in rendering of conditional Expectations (#7588) --- great_expectations/render/renderer_configuration.py | 4 ++-- tests/validator/test_validator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/great_expectations/render/renderer_configuration.py b/great_expectations/render/renderer_configuration.py index b6dd94f98ed3..14bffd6679d2 100644 --- a/great_expectations/render/renderer_configuration.py +++ b/great_expectations/render/renderer_configuration.py @@ -458,7 +458,7 @@ def _get_row_condition_string(row_condition_str: str) -> str: condition, f"$row_condition__{str(idx)}" ) row_condition_str = row_condition_str.lower() - return f"if {row_condition_str}" + return f"If {row_condition_str}, then " @validator("template_str") def _set_template_str(cls, v: str, values: dict) -> str: @@ -466,7 +466,7 @@ def _set_template_str(cls, v: str, values: dict) -> str: row_condition_str: str = RendererConfiguration._get_row_condition_string( row_condition_str=values["_row_condition"] ) - v = f"{row_condition_str}, then {v}" + v = row_condition_str + v return v diff --git a/tests/validator/test_validator.py b/tests/validator/test_validator.py index 8b8bc8ce5cab..18924313a618 100644 --- a/tests/validator/test_validator.py +++ b/tests/validator/test_validator.py @@ -1098,7 +1098,7 @@ def test_validator_include_rendered_content_diagnostic( "value": "passenger_count>0", }, }, - template="if $row_condition__0, then $column minimum value must be greater than or equal to $min_value and less than or equal to $max_value.", + template="If $row_condition__0, then $column minimum value must be greater than or equal to $min_value and less than or equal to $max_value.", ), value_type="StringValueType", ) From 30bea1baf699a2ce71c6b6350ea0908d09f60053 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Fri, 7 Apr 2023 17:25:36 -0400 Subject: [PATCH 60/96] [DOCS] Use the actual version after release (#7583) --- docs/docusaurus/docusaurus.config.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docusaurus.config.js 
b/docs/docusaurus/docusaurus.config.js index baf5620c2317..0cc1a946d625 100644 --- a/docs/docusaurus/docusaurus.config.js +++ b/docs/docusaurus/docusaurus.config.js @@ -259,7 +259,7 @@ module.exports = { lastVersion: 'current', versions: { current: { - label: '0.16.x', + label: '0.16.6', path: '' } } From 38f1404c05f234b1558d990ed47cd2e4cba341c1 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Fri, 7 Apr 2023 18:26:28 -0400 Subject: [PATCH 61/96] [MAINTENANCE] Remove pip pins in CI and in contributing_setup.md (#7587) --- ci/azure-pipelines-cloud-integration.yml | 6 +++--- ci/azure-pipelines-contrib.yml | 4 ++-- ci/azure-pipelines-dev.yml | 18 +++++++++--------- ci/azure-pipelines-os-integration.yml | 8 ++++---- ci/dev-install-matrix.yml | 4 ++-- ci/user-install-matrix.yml | 2 +- .../docs/contributing/contributing_setup.md | 2 +- .../azure-pipelines-expectation-gallery.yml | 2 +- ...e-pipelines-manual-staging-json-to-prod.yml | 2 +- 9 files changed, 24 insertions(+), 24 deletions(-) diff --git a/ci/azure-pipelines-cloud-integration.yml b/ci/azure-pipelines-cloud-integration.yml index db096e6e27f1..891d1b263b8f 100644 --- a/ci/azure-pipelines-cloud-integration.yml +++ b/ci/azure-pipelines-cloud-integration.yml @@ -22,7 +22,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' # includes explicit install of chardet, which was causing errors in pipeline @@ -84,7 +84,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' # includes explicit install of chardet, which was causing errors in pipeline @@ -126,7 +126,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' # includes explicit install of grpcio-status and chardet, which was causing errors in pipeline diff --git a/ci/azure-pipelines-contrib.yml b/ci/azure-pipelines-contrib.yml index 923a1c6ff276..303a20b04fc9 100644 --- a/ci/azure-pipelines-contrib.yml +++ b/ci/azure-pipelines-contrib.yml @@ -65,7 +65,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - job: deploy_experimental @@ -78,7 +78,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | diff --git a/ci/azure-pipelines-dev.yml b/ci/azure-pipelines-dev.yml index bb0cdb02e268..41446b4988c7 100644 --- a/ci/azure-pipelines-dev.yml +++ b/ci/azure-pipelines-dev.yml @@ -123,7 +123,7 @@ stages: versionSpec: '3.7' displayName: 'Use Python 3.7' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -213,7 +213,7 @@ stages: versionSpec: '3.7' displayName: 'Use Python 3.7' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -276,7 +276,7 @@ stages: versionSpec: '3.7' displayName: 'Use Python 3.7' - - bash: 
python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -339,7 +339,7 @@ stages: versionSpec: '3.7' displayName: 'Use Python 3.7' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -394,7 +394,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -431,7 +431,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -481,7 +481,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -521,7 +521,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -573,7 +573,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | diff --git a/ci/azure-pipelines-os-integration.yml b/ci/azure-pipelines-os-integration.yml index 8930b298e9e7..b057110b5e30 100644 --- a/ci/azure-pipelines-os-integration.yml +++ b/ci/azure-pipelines-os-integration.yml @@ -141,7 +141,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -182,7 +182,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -213,7 +213,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -241,7 +241,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==20.2.4 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | diff --git a/ci/dev-install-matrix.yml b/ci/dev-install-matrix.yml index f07640438886..8f70fcb875b8 100644 --- a/ci/dev-install-matrix.yml +++ b/ci/dev-install-matrix.yml @@ -13,7 +13,7 @@ jobs: versionSpec: ${{ pythonVersion }} displayName: Use Python ${{ pythonVersion }} - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | @@ -56,7 +56,7 @@ jobs: versionSpec: ${{ pythonVersion }} displayName: Use Python ${{ pythonVersion }} - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | diff --git a/ci/user-install-matrix.yml b/ci/user-install-matrix.yml index 6b2abc39b7a4..123aae690ca5 100644 --- a/ci/user-install-matrix.yml +++ 
b/ci/user-install-matrix.yml @@ -13,7 +13,7 @@ jobs: displayName: Use Python ${{ pythonVersion }} - script: | - python -m pip install --upgrade pip==21.3.1 + python -m pip install --upgrade pip pip install great_expectations displayName: 'Install Great Expectations' diff --git a/docs/docusaurus/docs/contributing/contributing_setup.md b/docs/docusaurus/docs/contributing/contributing_setup.md index cafbbd4e5c61..fc62aac13fd7 100644 --- a/docs/docusaurus/docs/contributing/contributing_setup.md +++ b/docs/docusaurus/docs/contributing/contributing_setup.md @@ -56,7 +56,7 @@ python3 -m venv gx_dev source gx_dev/bin/activate -pip install --upgrade pip==21.3.1 +pip install --upgrade pip pip install -c constraints-dev.txt -e ".[test]" ``` diff --git a/docs/expectation_gallery/azure-pipelines-expectation-gallery.yml b/docs/expectation_gallery/azure-pipelines-expectation-gallery.yml index 1221ca90247e..93efb878fae3 100644 --- a/docs/expectation_gallery/azure-pipelines-expectation-gallery.yml +++ b/docs/expectation_gallery/azure-pipelines-expectation-gallery.yml @@ -100,7 +100,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==21.3.1 + - bash: python -m pip install --upgrade pip displayName: 'Update pip' - script: | diff --git a/docs/expectation_gallery/azure-pipelines-manual-staging-json-to-prod.yml b/docs/expectation_gallery/azure-pipelines-manual-staging-json-to-prod.yml index e223f558fe7e..889742f6478e 100644 --- a/docs/expectation_gallery/azure-pipelines-manual-staging-json-to-prod.yml +++ b/docs/expectation_gallery/azure-pipelines-manual-staging-json-to-prod.yml @@ -26,7 +26,7 @@ stages: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - - bash: python -m pip install --upgrade pip==21.3.1 awscli + - bash: python -m pip install --upgrade pip awscli displayName: 'Update pip and install awscli' - bash: bash ./copy_s3_staging_to_prod.sh From 21942259987aaa2add0bfefff7b5e235d2298b4b Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Fri, 7 Apr 2023 19:18:56 -0400 Subject: [PATCH 62/96] [MAINTENANCE] Remove ignore of warning deprecated api features detected sqlalchemy 2 (#7584) --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a12b0e00df3e..4b0824e98b8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -464,9 +464,6 @@ filterwarnings = [ # SQLAlchemy 2.x support warnings. These warnings should be ignored until sqlalchemy 2.x is fully supported. # To get SQLAlchemy 2.x supported, remove one of these ignores and then fix the resulting errors. 'ignore: The Engine.execute\(\) method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0. All statement execution in SQLAlchemy 2.0 is performed by the Connection.execute\(\) method of Connection, or in the ORM by the Session.execute\(\) method of Session.:DeprecationWarning', - # Example Actual Warning: sqlalchemy.exc.RemovedIn20Warning: Deprecated API features detected! These feature(s) are not compatible with SQLAlchemy 2.0. To prevent incompatible upgrades prior to updating applications, ensure requirements files are pinned to "sqlalchemy<2.0". Set environment variable SQLALCHEMY_WARN_20=1 to show all deprecation warnings. Set environment variable SQLALCHEMY_SILENCE_UBER_WARNING=1 to silence this message. 
(Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) - # Found so far in test_cli_datasource_list - 'ignore: Deprecated API features detected!:DeprecationWarning', # --------------------------------------- TEMPORARY IGNORES -------------------------------------------------------- ] junit_family="xunit2" From c8f45d5a01025f739e667b0968bacdd88d31b2ec Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Sat, 8 Apr 2023 13:30:06 -0400 Subject: [PATCH 63/96] [MAINTENANCE] Fix sqlalchemy 2.0 incompatible warnings (#7589) --- tests/cli/test_checkpoint.py | 9 ++++++--- tests/conftest.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/cli/test_checkpoint.py b/tests/cli/test_checkpoint.py index 7e6266f296a2..8b55d4aea13c 100644 --- a/tests/cli/test_checkpoint.py +++ b/tests/cli/test_checkpoint.py @@ -14,6 +14,9 @@ from nbformat import NotebookNode from great_expectations.cli import cli +from great_expectations.compatibility.sqlalchemy_compatibility_wrappers import ( + add_dataframe_to_db, +) from great_expectations.core import ExpectationSuite from great_expectations.core.usage_statistics.anonymizers.types.base import ( GETTING_STARTED_DATASOURCE_NAME, @@ -55,10 +58,10 @@ def titanic_data_context_with_sql_datasource( __file__, os.path.join("..", "test_sets", "Titanic.csv") ) df: pd.DataFrame = pd.read_csv(filepath_or_buffer=csv_path) - df.to_sql(name="titanic", con=conn) + add_dataframe_to_db(df=df, name="titanic", con=conn) df = df.sample(frac=0.5, replace=True, random_state=1) - df.to_sql(name="incomplete", con=conn) - test_df.to_sql(name="wrong", con=conn) + add_dataframe_to_db(df=df, name="incomplete", con=conn) + add_dataframe_to_db(df=test_df, name="wrong", con=conn) except ValueError as ve: logger.warning(f"Unable to store information into database: {str(ve)}") diff --git a/tests/conftest.py b/tests/conftest.py index c63726b8e90e..19cd3d69f70c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1761,7 +1761,8 @@ def empty_sqlite_db(sa): from sqlalchemy import create_engine engine = create_engine("sqlite://") - assert engine.execute(sa.text("select 1")).fetchall()[0] == (1,) + with engine.connect() as connection: + assert connection.execute(sa.text("select 1")).fetchall()[0] == (1,) return engine except ImportError: raise ValueError("sqlite tests require sqlalchemy to be installed") From b23b7ceccd0750187758ceb7dc09129725b6b1f8 Mon Sep 17 00:00:00 2001 From: kenwade4 <95714847+kenwade4@users.noreply.github.com> Date: Mon, 10 Apr 2023 07:29:03 -0500 Subject: [PATCH 64/96] [DOCS] Update some docs_rtd requirements so the venv can be created successfully (#7580) --- docs_rtd/requirements.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs_rtd/requirements.txt b/docs_rtd/requirements.txt index f6736e4cb8f7..f6e30612b0d3 100644 --- a/docs_rtd/requirements.txt +++ b/docs_rtd/requirements.txt @@ -25,9 +25,9 @@ idna==2.10 imagesize==1.2.0 importlib-metadata==1.7.0 ipykernel==5.3.4 -ipython==8.10.0 +ipython==8.12.0 ipython-genutils==0.2.0 -ipywidgets==7.5.1 +ipywidgets==7.6.6 jedi==0.17.2 Jinja2==2.11.3 jsonpatch==1.26 @@ -38,9 +38,9 @@ jupyter-core==4.11.2 lazy-object-proxy==1.4.3 MarkupSafe==1.1.1 mistune==0.8.4 -nbconvert==6.5.1 -nbformat==5.0.7 -notebook==6.4.12 +nbconvert==6.4.4 +nbformat==5.8.0 +notebook==6.5.4 numpy==1.22.0 packaging==20.4 pandas==1.0.5 @@ -50,7 +50,7 @@ pathspec==0.8.0 pexpect==4.8.0 pickleshare==0.7.5 prometheus-client==0.8.0 -prompt-toolkit==3.0.6 +prompt-toolkit==3.0.38 ptyprocess==0.6.0 
pycparser==2.20 Pygments==2.7.4 @@ -65,7 +65,7 @@ requests==2.25.1 ruamel.yaml==0.16.10 ruamel.yaml.clib==0.2.0 scipy==1.5.2 -Send2Trash==1.5.0 +Send2Trash==1.8.0 six==1.15.0 smmap==3.0.4 snowballstemmer==2.0.0 @@ -87,7 +87,7 @@ testpath==0.4.4 toml==0.10.1 toolz==0.10.0 tornado==6.1.0 -traitlets==4.3.3 +traitlets==5.9.0 typed-ast==1.4.1 tzlocal==2.1 Unidecode==1.1.1 From 912d74c777d816e44afe828cd71d9c3d1be32013 Mon Sep 17 00:00:00 2001 From: Elena <26873786+elenajdanova@users.noreply.github.com> Date: Mon, 10 Apr 2023 13:20:33 -0400 Subject: [PATCH 65/96] [DOCS] Add Cloud quickstart (#7441) Co-authored-by: Gabriel Co-authored-by: Rob Lim --- .../getting_started_with_gx_cloud.md | 239 ++++++++++++++++-- 1 file changed, 212 insertions(+), 27 deletions(-) diff --git a/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md b/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md index 25ac569dd824..1310222eabe9 100644 --- a/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md +++ b/docs/docusaurus/docs/gx_cloud/tutorials/getting_started/getting_started_with_gx_cloud.md @@ -1,57 +1,242 @@ --- -title: Getting started with Great Expectations Cloud -tag: [tutorial, getting started, cloud, onboarding] -description: This tutorial will help you onboard with GX Cloud and get ready to connect to your data. -keywords: [tutorial, getting started, cloud, onboarding] +title: Quickstart with GX Cloud +tag: [tutorial, getting started, quickstart, cloud] --- +# Quickstart with Great Expectations Cloud import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; +import SetupAndInstallForSqlData from '/docs/components/setup/link_lists/_setup_and_install_for_sql_data.md' +import SetupAndInstallForFilesystemData from '/docs/components/setup/link_lists/_setup_and_install_for_filesystem_data.md' +import SetupAndInstallForHostedData from '/docs/components/setup/link_lists/_setup_and_install_for_hosted_data.md' +import SetupAndInstallForCloudData from '/docs/components/setup/link_lists/_setup_and_install_for_cloud_data.md' +import Prerequisites from '/docs/components/_prerequisites.jsx' -Welcome to Great Expectations Cloud! This tutorial will help you onboard with GX Cloud and get ready to connect to your data. +## Introduction -:::note Prerequisites -- This tutorial assumes you have Great Expectations OSS installed on your machine. If that's not the case please complete [OSS Setup](/docs/guides/setup/installation/local) first. -::: +Few things are as daunting as taking your first steps with a new piece of software. This guide will introduce you to GX Cloud and demonstrate the ease with which you can implement the basic GX workflow. We will walk you through the entire process of connecting to your data, building your first Expectation based off of an initial Batch of that data, validating your data with that Expectation, and finally reviewing the results of your validation. -## Steps +Once you have completed this guide you will have a foundation in the basics of using GX Cloud. In the future you will be able to adapt GX to suit your specific needs by customizing the execution of the individual steps you will learn here. -### Step 1: Login +## Prerequisites -- First of all, you'll need to go to [https://app.greatexpectations.io](https://app.greatexpectations.io) -- You should have received GX Cloud invitation email. Follow the link in the email to set the password. -- Login into the app with your email and password. 
+ -### Step 2: Generate User Token +- Installed Great Expectations OSS on your machine. +- Followed invitation email instructions from the GX team after signing up for Early Access. +- Successfully logged in to GX Cloud at [https://app.greatexpectations.io](https://app.greatexpectations.io). +- A passion for data quality. -- Go to [“Settings” > “Tokens”](https://app.greatexpectations.io/tokens) in the navigation panel. In this tutorial, we’ll create a User Token, but GX Cloud also supports Organization tokens, e.g. for use in shared execution environments. These tokens are see-once and stored as a hash in Great Expectation Cloud's backend database. Once a user copies their API key, the Cloud UI will never show the token value again. + + +## Steps -### Step 3: Set tokens and Create +### 1. Setup -- Open Jupyter Notebook +#### 1.1 Generate User Token + +Go to [“Settings” > “Tokens”](https://app.greatexpectations.io/tokens) in the navigation panel and generate a User Token. Both `admin` and `editor` roles will suffice for this guide. +These tokens are view-once and stored as a hash in Great Expectation Cloud's backend database. Once you copy the API key and close the dialog, the Cloud UI will never show the token value again. + +#### 1.2 Import modules :::tip Any Python Interpreter or script file will work for the remaining steps in the guide, we recommend using a Jupyter Notebook, since they are included in the OSS GX installation and give the best experience of both composing a script file and running code in a live interpreter. ::: -- Set environment variables in the notebook (alternatively, add these as [Data Context config variables(/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials)) +Switch to Jupyter Notebook and import modules we're going to use in this tutorial. -```python +```python title="Jupyter Notebook" +import great_expectations as gx +import pandas as pd import os +``` + +#### 1.3 Create Data Context +Paste this snippet into the next notebook cell to instantiate Cloud . + +:::caution +Please note that GX Cloud API tokens are sensitive information and should not be committed to version control software. Alternatively, add these as [Data Context config variables](https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials/) +::: + +```python title="Jupyter Notebook" +os.environ["GX_CLOUD_ACCESS_TOKEN"] = "" +# your organization_id is indicated on https://app.greatexpectations.io/tokens page os.environ["GX_CLOUD_ORGANIZATION_ID"] = "" -os.environ["GX_CLOUD_ACCESS_TOKEN"] = "" + +context = gx.get_context() ``` -- Set Cloud data context in the notebook +### 2. Create Datasource -```python -import great_expectations as gx +Modify the following snippet code to connect to your . +In case you don't have some data handy to test in this guide, we can use the [NYC taxi data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). This is an open data set which is updated every month. Each record in the data corresponds to one taxi ride. You can find a link to it in the sniipet below. -context = gx.get_context() +:::caution +Please note you should not include sensitive info/credentials directly in the config while connecting to your Datasource, since this would be persisted in plain text in the database and presented in Cloud UI. 
If credentials/full connection string is required, you should use a [config variables file](https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials/). +::: + +```python title="Jupyter Notebook" +datasource_name = None +assert datasource_name is not None, "Please set datasource_name." +batch_identifier_name = None # batch_identifier_name is intended to help identify batches of data passed in directly through dataframes +assert batch_identifier_name is not None, "Please set batch_identifier_name." +data_connector_name = None +assert data_connector_name is not None, "Please set data_connector_name." + +datasource_yaml = f""" + name: {datasource_name} + class_name: Datasource + execution_engine: + class_name: PandasExecutionEngine + data_connectors: + {data_connector_name}: + class_name: RuntimeDataConnector + batch_identifiers: + - {batch_identifier_name} +""" + +path_to_data = None +# to use sample data uncomment next line +# path_to_data = "https://raw.githubusercontent.com/great-expectations/gx_tutorials/main/data/yellow_tripdata_sample_2019-01.csv" +assert path_to_data is not None, "Please set path_to_data. This can be a local filepath or a remote URL." +df = pd.read_csv(path_to_data) +batch_identifier_value = None +assert batch_identifier_value is not None, "Please set batch_identifier." + +batch_request = { + "runtime_parameters": { + "batch_data": df + }, + "batch_identifiers": { + batch_identifier_name: batch_identifier_value + } +} + +# Test your configuration: +datasource = context.test_yaml_config(datasource_yaml) + +# Save your datasource: +datasource = context.save_datasource(datasource) +``` + +In case you need more details on how to connect to your specific data system, we have step by step how-to guides that cover many common cases. [Start here](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview) + +### 3. Create Expectations + +#### 3.1 Create Expectation Suite + +An is a collection of verifiable assertions about data. Run this snippet to create a new, empty : + +```python title="Jupyter Notebook" +expectation_suite_name = None +assert expectation_suite_name is not None, "Please set expectation_suite_name." + +expectation_suite = context.create_expectation_suite( + expectation_suite_name=expectation_suite_name +) +``` + +#### 3.2 Add Expectation + +Modify and run this snippet to add an to the you just created: + +```python title="Jupyter Notebook" +# Get an existing Expectation Suite +expectation_suite_id = expectation_suite.ge_cloud_id +expectation_suite = context.get_expectation_suite(ge_cloud_id=expectation_suite_id) +column_name = None # set column name you want to test here +assert column_name is not None, "Please set column_name." + +# Look up all expectations types here - https://greatexpectations.io/expectations/ +expectation_configuration = gx.core.ExpectationConfiguration(**{ + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": column_name, + }, + "meta":{} +}) + +expectation_suite.add_expectation( + expectation_configuration=expectation_configuration +) +print(expectation_suite) + +# Save the Expectation Suite +context.save_expectation_suite(expectation_suite) ``` -## Next Steps +With the Expectation defined above, we are stating that we _expect_ the column of your choice to always be populated. That is: none of the column's values should be null. + + +### 4. 
Validate data + +#### 4.1 Create Checkpoint + +Now that we have connected to data and defined an , it is time to validate whether our data meets the Expectation. To do this, we define a , which will allow us to repeat the in the future. + +```python title="Jupyter Notebook" +checkpoint_name = None # name your checkpoint here +assert checkpoint_name is not None, "Please set checkpoint_name." +data_asset_name = None # name your table here +assert data_asset_name is not None, "Please set data_asset_name." + +checkpoint_config = { + "name": checkpoint_name, + "validations": [{ + "expectation_suite_name": expectation_suite_name, + "expectation_suite_ge_cloud_id": expectation_suite.ge_cloud_id, + "batch_request": { + "datasource_name": datasource_name, + "data_connector_name": data_connector_name, + "data_asset_name": data_asset_name, + }, + }], + "config_version": 1, + "class_name": "Checkpoint" +} + +context.add_or_update_checkpoint(**checkpoint_config) +checkpoint = context.get_checkpoint(checkpoint_name) +print(checkpoint) +``` + +#### 4.2 Run Checkpoint + +Once we have created the , we will run it and get back the results from our . + +```python title="Jupyter Notebook" +result = context.run_checkpoint(ge_cloud_id=checkpoint.ge_cloud_id, batch_request=batch_request) +print(result) +``` + +#### 4.3 Review your results + +After you run the , you should see a `validation_result_url` in the result, that takes you directly to GX Cloud, so you can see your and in the GX Cloud UI. + +Alternatively, you can visit the [Checkpoints page](https://app.greatexpectations.io/checkpoints) and filter by the Checkpoint, Expectation Suite, or Data Asset you want to see the results for. + + +## Next Steps + +Now that you've seen how to implement the GX workflow, it is time to customize the workflow to suit your specific use cases! To help with this we have prepared more detailed guides tailored to specific environments and resources. + +To get all the snippets above in one script, visit [GX OSS repository](https://github.com/great-expectations/great_expectations/blob/develop/assets/scripts/gx_cloud/experimental/onboarding_script.py) + +To invite additional team members to the app visit [“Settings” > “Users”](https://app.greatexpectations.io/users). + +For more details on installing GX for use with local filesystems, please see: + + + +For guides on installing GX for use with cloud storage systems, please reference: + + + +For information on installing GX for use with SQL databases, see: + + -Follow in-app snippets to create a , define an , configure and run a and view . +And for instructions on installing GX for use with hosted data systems, read: -You can also go to [“Settings” > “Users”](https://app.greatexpectations.io/users) in the navigation panel to invite your team members to the app. + From 7791f6d5fb0ceaaf8f3954a154824183b1405abb Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Mon, 10 Apr 2023 12:36:13 -0500 Subject: [PATCH 66/96] [DOCS] Updates how the GX Cloud Beta is referenced in the Quickstart guide. 
(#7594) Co-authored-by: Erin <118856238+Erin-GX@users.noreply.github.com> --- .../docs/tutorials/quickstart/quickstart.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/docusaurus/docs/tutorials/quickstart/quickstart.md b/docs/docusaurus/docs/tutorials/quickstart/quickstart.md index c81c8a406640..a88d5c98e8ee 100644 --- a/docs/docusaurus/docs/tutorials/quickstart/quickstart.md +++ b/docs/docusaurus/docs/tutorials/quickstart/quickstart.md @@ -14,6 +14,14 @@ Few things are as daunting as taking your first steps with a new piece of softwa Once you have completed this guide you will have a foundation in the basics of using GX. In the future you will be able to adapt GX to suit your specific needs by customizing the execution of the individual steps you will learn here. +:::info Great Expectations Cloud +This guide is intended to introduce you to the open source Python and command line use of Great Expectations. GX also offers an online interface, currently in Beta. The GX Cloud interface significantly simplifies collaboration between data teams and domain experts. + +If you are interested in GX Cloud, you should join the GX Cloud Beta. During this program limited seats are available, but signing up will keep you informed of the product's process. + +**[Sign up for the GX Cloud Beta!](https://greatexpectations.io/cloud)** +::: + ## Prerequisites @@ -117,14 +125,6 @@ Great Expectations provides a friendly, human-readable way to view the results o ``` -### 5. (Optional) Great Expectations Cloud - -By completing the Quickstart guide, you now have the opportunity to join the Cloud Early Access program and explore how Great Expectations Cloud visualizes and creates shareable links for anyone on your team. The GX Cloud interface significantly simplifies collaboration between data teams and domain experts. - -To access GX Cloud, you need to join our Cloud Early Access program. During this program, limited seats are available, but joining the queue will keep you informed of the product's progress. - -**[Sign up for the Cloud Early Access program!](https://greatexpectations.io/cloud)** - ## Next Steps Now that you've seen how easy it is to implement the GX workflow, it is time to customize that workflow to suit your specific use cases! To help with this we have prepared some more detailed guides on setting up and installing GX and getting an initial Data Context that are tailored to specific environments and resources. From eeaf6ee5a6460d8613997b43ff43c8ea61928677 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Mon, 10 Apr 2023 11:29:25 -0700 Subject: [PATCH 67/96] [MAINTENANCE] Increase minimum scipy version package to 1.6.0 to take advantage of available capabilities. 
(#7591) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 032c902e2cf9..6a56dab8e730 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ python-dateutil>=2.8.1 pytz>=2021.3 requests>=2.20 ruamel.yaml>=0.16,<0.17.18 # See https://github.com/great-expectations/great_expectations/issues/4152 -scipy>=0.19.0 +scipy>=1.6.0 tqdm>=4.59.0 typing-extensions>=3.10.0.0 # Leverage type annotations from recent Python releases tzlocal>=1.2 From ad8b7617b88135ecb440f34e5f276e427ef89770 Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Mon, 10 Apr 2023 16:14:36 -0500 Subject: [PATCH 68/96] [DOCS] Corrects typo in code block within in-memory Pandas guide (#7600) --- .../in_memory/how_to_connect_to_in_memory_data_using_pandas.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas.md b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas.md index f8487d7d417d..5c213d00f504 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/fluent/in_memory/how_to_connect_to_in_memory_data_using_pandas.md @@ -68,7 +68,7 @@ name = "taxi_dataframe" Now that we have the `name` and `dataframe` for our Data Asset, we can create the Data Asset with the code: ```python title="Python code" -data_asset = datasource.add_dataframe(name=name, dataframe=dataframe) +data_asset = datasource.add_dataframe_asset(name=name, dataframe=dataframe) ``` ## Next steps From c52758426ff98bb9cca45fa2d850b6327632c133 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Mon, 10 Apr 2023 18:37:25 -0400 Subject: [PATCH 69/96] [MAINTENANCE] Remove s3fs dependency and upper bound for boto3 (#7598) --- constraints-dev.txt | 8 - .../usage_statistics/package_dependencies.py | 2 - .../batch_kwargs_generator/__init__.py | 1 - ...s3_subdir_reader_batch_kwargs_generator.py | 332 ------------------ reqs/requirements-dev-lite.txt | 3 +- .../test_s3_subdir_reader_generator.py | 140 -------- 6 files changed, 1 insertion(+), 485 deletions(-) delete mode 100644 great_expectations/datasource/batch_kwargs_generator/s3_subdir_reader_batch_kwargs_generator.py delete mode 100644 tests/datasource/batch_kwarg_generator/test_s3_subdir_reader_generator.py diff --git a/constraints-dev.txt b/constraints-dev.txt index 3f9e433b264e..ddf875711896 100644 --- a/constraints-dev.txt +++ b/constraints-dev.txt @@ -6,14 +6,6 @@ # To install dev dependencies using the new pip resolver (recommended) please use the following syntax: # `python -m pip install -r requirements-dev.txt -c constraints-dev.txt` -boto3==1.17.106 # from botocore==1.20.106 dependency -# NOTE - 20210114 -# aiobotocore is a dependency of the s3fs package -# The latest version of aiobotocore (v1.4.0) is only compatible with botocore up to 1.20.106 -# botocore==1.20.106 is compatible with boto3==1.17.106 -botocore==1.20.106 # From aiobotocore v1.4.0 dependencies https://pypi.org/project/aiobotocore/ -# END NOTE - # Several capitalone_dataprofiler_expectations that use tensorflow started # dying with the following error from site-packates/google/protobuf/descriptor.py # TypeError: Descriptors cannot not be created directly. 
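For context on the `add_dataframe_asset` correction in PATCH 68 above, here is a minimal sketch of how the corrected call fits into a fluent-datasource workflow. It assumes the `context.sources.add_pandas` API from this release line; the datasource and asset names are illustrative placeholders rather than values taken from the guide.

```python
# Minimal sketch (assumed fluent Pandas datasource API; names are illustrative).
import great_expectations as gx
import pandas as pd

context = gx.get_context()
datasource = context.sources.add_pandas(name="my_pandas_datasource")

# Any in-memory DataFrame works here; this one stands in for the taxi data used in the guide.
dataframe = pd.DataFrame({"passenger_count": [1, 2, 3]})

# The guide previously showed `datasource.add_dataframe(...)`, which is not a valid method;
# `add_dataframe_asset` is the corrected name applied by the patch above.
data_asset = datasource.add_dataframe_asset(name="taxi_dataframe", dataframe=dataframe)
```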
diff --git a/great_expectations/core/usage_statistics/package_dependencies.py b/great_expectations/core/usage_statistics/package_dependencies.py index 23fb10911654..d879fd464246 100644 --- a/great_expectations/core/usage_statistics/package_dependencies.py +++ b/great_expectations/core/usage_statistics/package_dependencies.py @@ -105,7 +105,6 @@ class GXDependencies: "pytest-random-order", "pytest-timeout", "requirements-parser", - "s3fs", "snapshottest", "snowflake-connector-python", "snowflake-sqlalchemy", @@ -146,7 +145,6 @@ class GXDependencies: "pytest-icdiff", "pytest-timeout", "requirements-parser", - "s3fs", "snapshottest", # "sqlalchemy", # Not excluded from tracking "trino", diff --git a/great_expectations/datasource/batch_kwargs_generator/__init__.py b/great_expectations/datasource/batch_kwargs_generator/__init__.py index 5127300e9a0c..7b202f8ed8a6 100644 --- a/great_expectations/datasource/batch_kwargs_generator/__init__.py +++ b/great_expectations/datasource/batch_kwargs_generator/__init__.py @@ -3,6 +3,5 @@ from .manual_batch_kwargs_generator import ManualBatchKwargsGenerator from .query_batch_kwargs_generator import QueryBatchKwargsGenerator from .s3_batch_kwargs_generator import S3GlobReaderBatchKwargsGenerator -from .s3_subdir_reader_batch_kwargs_generator import S3SubdirReaderBatchKwargsGenerator from .subdir_reader_batch_kwargs_generator import SubdirReaderBatchKwargsGenerator from .table_batch_kwargs_generator import TableBatchKwargsGenerator diff --git a/great_expectations/datasource/batch_kwargs_generator/s3_subdir_reader_batch_kwargs_generator.py b/great_expectations/datasource/batch_kwargs_generator/s3_subdir_reader_batch_kwargs_generator.py deleted file mode 100644 index 7139f206436f..000000000000 --- a/great_expectations/datasource/batch_kwargs_generator/s3_subdir_reader_batch_kwargs_generator.py +++ /dev/null @@ -1,332 +0,0 @@ -import logging -import os -from pathlib import Path -from typing import Dict, Iterable -from urllib.parse import urlparse, urlunparse - -try: - import s3fs -except ImportError: - s3fs = None - -from great_expectations.datasource.batch_kwargs_generator.batch_kwargs_generator import ( - BatchKwargsGenerator, -) -from great_expectations.datasource.types import PathBatchKwargs, S3BatchKwargs -from great_expectations.exceptions import BatchKwargsError - -logger = logging.getLogger(__name__) - -KNOWN_EXTENSIONS = [ - ".csv", - ".tsv", - ".parquet", - ".pqt", - ".parq", - ".xls", - ".xlsx", - ".json", - ".csv.gz", - ".tsv.gz", - ".feather", - ".pkl", -] - - -class S3SubdirReaderBatchKwargsGenerator(BatchKwargsGenerator): - """The SubdirReaderBatchKwargsGenerator inspects a filesystem and produces path-based batch_kwargs. - - SubdirReaderBatchKwargsGenerator recognizes data assets using two criteria: - - for files directly in 'base_directory' with recognized extensions (.csv, .tsv, .parquet, .xls, .xlsx, .json - .csv.gz, tsv.gz, .feather, .pkl), it uses the name of the file without the extension - - for other files or directories in 'base_directory', is uses the file or directory name - - SubdirReaderBatchKwargsGenerator sees all files inside a directory of base_directory as batches of one datasource. - - SubdirReaderBatchKwargsGenerator can also include configured reader_options which will be added to batch_kwargs generated - by this generator. 
- """ - - _default_reader_options: Dict = {} - recognized_batch_parameters = {"data_asset_name", "partition_id"} - - def __init__( - self, - name="default", - datasource=None, - bucket=None, - boto3_options=None, - base_directory="/data", - reader_options=None, - known_extensions=None, - reader_method=None, - ) -> None: - super().__init__(name, datasource=datasource) - - if not s3fs: - raise ImportError("ModuleNotFoundError: No module named 's3fs'") - - if reader_options is None: - reader_options = self._default_reader_options - - if known_extensions is None: - known_extensions = KNOWN_EXTENSIONS - - self._known_extensions = known_extensions - self._reader_options = reader_options - self._reader_method = reader_method - self._base_directory = base_directory - if boto3_options is None: - boto3_options = {} - # s3fs can read credentials from ~/.aws/credentials, same as boto3 - client_kwargs = {} - if boto3_options.get("endpoint_url"): - client_kwargs["endpoint_url"] = boto3_options.get("endpoint_url") - self.fs = s3fs.S3FileSystem(anon=False, client_kwargs=client_kwargs) - - @property - def reader_options(self): - return self._reader_options - - @property - def known_extensions(self): - return self._known_extensions - - @property - def reader_method(self): - return self._reader_method - - @property - def base_directory(self): - # A base path for S3 - s3a://bucket/prefix/files - return self._base_directory - - def get_available_data_asset_names(self): - if not self.fs.isdir(self.base_directory): - return {"names": [], "is_complete_list": True} - known_assets = self._get_valid_file_options(base_directory=self.base_directory) - return {"names": known_assets, "is_complete_list": True} - - def get_available_partition_ids(self, data_asset_name=None): - # If the asset names a single known *file*, return ONLY that - for extension in self.known_extensions: - if self.fs.isfile( - os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name + extension - ) - ): - return [data_asset_name] - if self.fs.isfile( - os.path.join(self.base_directory, data_asset_name) # noqa: PTH118 - ): - return [data_asset_name] - - # Otherwise, subdir files are partition ids - return [ - path - for (path, type) in self._get_valid_file_options( - base_directory=os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name - ) - ) - ] - - def _build_batch_kwargs(self, batch_parameters): - """ - - Args: - batch_parameters: - - Returns: - batch_kwargs - - """ - try: - data_asset_name = batch_parameters.pop("data_asset_name") - except KeyError: - raise BatchKwargsError( - "Unable to build BatchKwargs: no name provided in batch_parameters.", - batch_kwargs=batch_parameters, - ) - - if "partition_id" in batch_parameters: - partition_id = batch_parameters.pop("partition_id") - # Find the path - path = None - for extension in self.known_extensions: - if self.fs.isfile( - os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name, partition_id + extension - ) - ): - path = os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name, partition_id + extension - ) - - if path is None: - logger.warning( - "Unable to find path with the provided partition; searching for asset-name partitions." 
- ) - # Fall through to this case in the event that there is not a subdir available, or if partition_id was - # not provided - if self.fs.isfile( - os.path.join(self.base_directory, data_asset_name) # noqa: PTH118 - ): - path = os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name - ) - - for extension in self.known_extensions: - if self.fs.isfile( - os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name + extension - ) - ): - path = os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name + extension - ) - - if path is None: - raise BatchKwargsError( - f"Unable to build batch kwargs from for asset '{data_asset_name}'", - batch_parameters, - ) - return self._build_batch_kwargs_from_path(path, **batch_parameters) - - else: - return self.yield_batch_kwargs( - data_asset_name=data_asset_name, **batch_parameters - ) - - def _get_valid_file_options(self, base_directory=None): - valid_options = [] - if base_directory is None: - base_directory = self.base_directory - file_options = self.fs.listdir(base_directory) - for file_option in file_options: - file_option = file_option["Key"] - file_option = file_option.split("/")[ - -1 - ] # fs.listdir with return full path unlike os.listdir - for extension in self.known_extensions: - if ( - file_option.endswith(extension) - and not file_option.startswith(".") - and (file_option[: -len(extension)], "file") not in valid_options - ): - valid_options.append((file_option[: -len(extension)], "file")) - elif self.fs.isdir( - os.path.join(self.base_directory, file_option) # noqa: PTH118 - ): - # Make sure there's at least one valid file inside the subdir - subdir_options = self._get_valid_file_options( - base_directory=os.path.join( # noqa: PTH118 - base_directory, file_option - ) - ) - if ( - len(subdir_options) > 0 - and (file_option, "directory") not in valid_options - ): - valid_options.append((file_option, "directory")) - return valid_options - - def _get_iterator(self, data_asset_name, reader_options=None, limit=None): - logger.debug( - f"Beginning SubdirReaderBatchKwargsGenerator _get_iterator for data_asset_name: {data_asset_name}" - ) - # If the data asset is a file, then return the path. - # Otherwise, use files in a subdir as batches - if self.fs.isdir( - os.path.join(self.base_directory, data_asset_name) # noqa: PTH118 - ): - subdir_options = os.listdir( - os.path.join(self.base_directory, data_asset_name) # noqa: PTH118 - ) - batches = [] - for file_option in subdir_options: - for extension in self.known_extensions: - if file_option.endswith(extension) and not file_option.startswith( - "." 
- ): - batches.append( - os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name, file_option - ) - ) - - return self._build_batch_kwargs_path_iter( - batches, reader_options=reader_options, limit=limit - ) - else: - for extension in self.known_extensions: - path = os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name + extension - ) - path = self._window_to_s3_path(path) - if self.fs.isfile(path): - return iter( - [ - self._build_batch_kwargs_from_path( - path, reader_options=reader_options, limit=limit - ) - ] - ) - # If we haven't returned yet, raise - raise BatchKwargsError( - "No valid files found when searching {:s} using configured known_extensions: " - "{:s} ".format( - os.path.join(self.base_directory, data_asset_name), # noqa: PTH118 - ", ".join(map(str, self.known_extensions)), - ), - batch_kwargs=PathBatchKwargs( - path=os.path.join( # noqa: PTH118 - self.base_directory, data_asset_name - ) - ), - ) - - def _build_batch_kwargs_path_iter( - self, path_list, reader_options=None, limit=None - ) -> Iterable[S3BatchKwargs]: - for path in path_list: - yield self._build_batch_kwargs_from_path( - path, reader_options=reader_options, limit=limit - ) - - def _build_batch_kwargs_from_path( - self, path, reader_method=None, reader_options=None, limit=None - ) -> S3BatchKwargs: - batch_kwargs = self._datasource.process_batch_parameters( - reader_method=reader_method or self.reader_method, - reader_options=reader_options or self.reader_options, - limit=limit, - ) - if "s3a://" not in path: - path = f"s3a://{path}" - batch_kwargs["s3"] = path - batch_kwargs["datasource"] = self._datasource.name - - return S3BatchKwargs(batch_kwargs) - - def _window_to_s3_path(self, path: str): - """ - To handle window "\" path. "s3://bucket\\prefix" => "s3://bucket/prefix" - >>> path = os.path.join("s3://bucket", "prefix") - >>> window_to_s3_path(path) - >>> - """ - - s3_url = urlparse(path) - s3_path = Path(s3_url.path) - s3_new_url = urlunparse( - ( - s3_url.scheme, - s3_url.netloc, - s3_path.as_posix(), - s3_url.params, - s3_url.query, - s3_url.fragment, - ) - ) - return s3_new_url diff --git a/reqs/requirements-dev-lite.txt b/reqs/requirements-dev-lite.txt index 6f5ddf06a672..2d50b30d6b74 100644 --- a/reqs/requirements-dev-lite.txt +++ b/reqs/requirements-dev-lite.txt @@ -1,4 +1,4 @@ -boto3==1.17.106 # This should match the version in constraints-dev.txt +boto3>=1.17.106 flask>=1.0.0 # for s3 test only (with moto) freezegun>=0.3.15 ipykernel<=6.17.1 # Newest version (6.19.0 released on 12/7/2022) causes "WARNING traitlets:client.py:1181 No handler found for comm target 'comm'" by "https://github.com/jupyter/nbclient/blob/main/nbclient/client.py" (version 0.7.2) to be emitted. 
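The deleted `S3SubdirReaderBatchKwargsGenerator` used `s3fs` only to enumerate and read keys beneath a base prefix; with `s3fs` dropped from the dev requirements, the same enumeration can be done with `boto3`, which this patch keeps. A rough sketch, reusing the bucket, prefix, and file names from the removed test fixture (the extension filter mirrors a subset of the generator's `KNOWN_EXTENSIONS`):

```python
# Rough boto3 equivalent of the listing the removed s3fs-based generator performed.
import boto3

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")

known_extensions = (".csv", ".tsv", ".parquet", ".json")
keys = []
# Bucket and prefix mirror the removed test fixture ("test_bucket", "data/for/").
for page in paginator.paginate(Bucket="test_bucket", Prefix="data/for/"):
    for obj in page.get("Contents", []):
        if obj["Key"].endswith(known_extensions):
            keys.append(obj["Key"])

print(keys)  # e.g. ["data/for/me.csv", "data/for/you.csv"]
```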
@@ -15,6 +15,5 @@ pytest-order>=0.9.5 pytest-random-order>=1.0.4 pytest-timeout>=2.1.0 requirements-parser>=0.2.0 -s3fs>=0.5.1 snapshottest==0.6.0 # GX Cloud atomic renderer tests sqlalchemy>=1.4.0,<2.0.0 diff --git a/tests/datasource/batch_kwarg_generator/test_s3_subdir_reader_generator.py b/tests/datasource/batch_kwarg_generator/test_s3_subdir_reader_generator.py deleted file mode 100644 index 770acd84918c..000000000000 --- a/tests/datasource/batch_kwarg_generator/test_s3_subdir_reader_generator.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import time - -import pandas as pd -import pytest -import requests -from botocore.session import Session - -from great_expectations.datasource.batch_kwargs_generator import ( - S3SubdirReaderBatchKwargsGenerator, -) - -port = 5555 -url_host = os.getenv("GE_TEST_LOCALHOST_URL", "127.0.0.1") -endpoint_uri = f"http://{url_host}:{port}/" -os.environ["AWS_ACCESS_KEY_ID"] = "dummy_key" -os.environ["AWS_SECRET_ACCESS_KEY"] = "dummy_secret" - - -@pytest.fixture(scope="module") -def s3_base(): - # writable local S3 system - import shlex - import subprocess - - proc = subprocess.Popen(shlex.split(f"moto_server s3 -p {port}")) - - timeout = 5 - while timeout > 0: - try: - r = requests.get(endpoint_uri) - if r.ok: - break - except: # noqa: E722 - pass - timeout -= 0.1 - time.sleep(0.1) - yield - proc.terminate() - proc.wait() - - -@pytest.fixture(scope="module") -def mock_s3_bucket(s3_base): - bucket = "test_bucket" - session = Session() - client = session.create_client("s3", endpoint_url=endpoint_uri) - client.create_bucket(Bucket=bucket, ACL="public-read") - - df = pd.DataFrame({"c1": [1, 2, 3], "c2": ["a", "b", "c"]}) - keys = [ - "data/for/you.csv", - "data/for/me.csv", - ] - for key in keys: - # noinspection PyTypeChecker - client.put_object( - Bucket=bucket, Body=df.to_csv(index=None).encode("utf-8"), Key=key - ) - yield bucket - - -@pytest.fixture -def s3_subdir_generator(mock_s3_bucket, basic_sparkdf_datasource): - # We configure a generator that will fetch from (mocked) my_bucket - # and will use glob patterns to match returned assets into batches of the same asset - try: - generator = S3SubdirReaderBatchKwargsGenerator( - "my_generator", - datasource=basic_sparkdf_datasource, - boto3_options={"endpoint_url": endpoint_uri}, - base_directory="test_bucket/data/for", - reader_options={"sep": ","}, - ) - yield generator - except ImportError as e: - pytest.skip(str(e)) - - -@pytest.fixture -def s3_subdir_generator_with_partition(mock_s3_bucket, basic_sparkdf_datasource): - # We configure a generator that will fetch from (mocked) my_bucket - # and will use glob patterns to match returned assets into batches of the same asset - try: - generator = S3SubdirReaderBatchKwargsGenerator( - "my_generator", - datasource=basic_sparkdf_datasource, - boto3_options={"endpoint_url": endpoint_uri}, - base_directory="test_bucket/data/", - reader_options={"sep": ","}, - ) - yield generator - except ImportError as e: - pytest.skip(str(e)) - - -@pytest.mark.skip( - reason='This test is currently failing, because "moto_server s3 --host 127.0.0.1 --port 5555" is unresponsive.' 
-) -def test_s3_subdir_generator_basic_operation(s3_subdir_generator): - # S3 Generator sees *only* configured assets - assets = s3_subdir_generator.get_available_data_asset_names() - print(assets) - assert set(assets["names"]) == { - ("you", "file"), - ("me", "file"), - } - - -@pytest.mark.skip( - reason='This test is currently failing, because "moto_server s3 --host 127.0.0.1 --port 5555" is unresponsive.' -) -def test_s3_subdir_generator_reader_options_configuration(s3_subdir_generator): - batch_kwargs_list = [ - kwargs - for kwargs in s3_subdir_generator.get_iterator(data_asset_name="you", limit=10) - ] - print(batch_kwargs_list) - assert batch_kwargs_list[0]["reader_options"] == {"sep": ","} - - -@pytest.mark.skip( - reason='This test is currently failing, because "moto_server s3 --host 127.0.0.1 --port 5555" is unresponsive.' -) -def test_s3_subdir_generator_build_batch_kwargs_no_partition_id(s3_subdir_generator): - batch_kwargs = s3_subdir_generator.build_batch_kwargs("you") - assert batch_kwargs["s3"] in [ - "s3a://test_bucket/data/for/you.csv", - ] - - -@pytest.mark.skip( - reason='This test is currently failing, because "moto_server s3 --host 127.0.0.1 --port 5555" is unresponsive.' -) -def test_s3_subdir_generator_build_batch_kwargs_partition_id( - s3_subdir_generator_with_partition, basic_sparkdf_datasource -): - - batch_kwargs = s3_subdir_generator_with_partition.build_batch_kwargs("for", "you") - assert batch_kwargs["s3"] == "s3a://test_bucket/data/for/you.csv" From 2bd78f4c3ad57fa80904b24a6c7ee78d30cf8702 Mon Sep 17 00:00:00 2001 From: William Shin Date: Mon, 10 Apr 2023 15:53:56 -0700 Subject: [PATCH 70/96] [MAINTENANCE] Move Fluent Datasources Sorters into `TYPE_CHECKING` block (#7602) --- .../fluent/spark_azure_blob_storage_datasource.pyi | 8 ++++---- .../datasource/fluent/spark_filesystem_datasource.pyi | 6 +++--- .../fluent/spark_google_cloud_storage_datasource.pyi | 6 +++--- .../datasource/fluent/spark_s3_datasource.pyi | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi index de0167d5f6ca..71b79f49a99a 100644 --- a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.pyi @@ -13,9 +13,6 @@ from great_expectations.datasource.fluent.config_str import ( from great_expectations.datasource.fluent.data_asset.data_connector import ( S3DataConnector, ) -from great_expectations.datasource.fluent.interfaces import ( - SortersDefinition, -) from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) @@ -23,7 +20,10 @@ from great_expectations.datasource.fluent.spark_datasource import ( if TYPE_CHECKING: from azure.storage.blob import BlobServiceClient - from great_expectations.datasource.fluent.interfaces import BatchMetadata + from great_expectations.datasource.fluent.interfaces import ( + BatchMetadata, + SortersDefinition, + ) from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) diff --git a/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi b/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi index 6c97afb25ada..42c8a27dac63 100644 --- a/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_filesystem_datasource.pyi @@ -11,12 +11,12 
@@ from great_expectations.datasource.fluent import _SparkFilePathDatasource from great_expectations.datasource.fluent.data_asset.data_connector import ( FilesystemDataConnector, ) -from great_expectations.datasource.fluent.interfaces import ( - SortersDefinition, -) if TYPE_CHECKING: from great_expectations.datasource.fluent import BatchMetadata + from great_expectations.datasource.fluent.interfaces import ( + SortersDefinition, + ) from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi index 87b453d064a9..32c8c12ef2d9 100644 --- a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.pyi @@ -13,14 +13,14 @@ from great_expectations.datasource.fluent.config_str import ( from great_expectations.datasource.fluent.data_asset.data_connector import ( GoogleCloudStorageDataConnector, ) -from great_expectations.datasource.fluent.interfaces import ( - SortersDefinition, -) if TYPE_CHECKING: from google.cloud.storage.client import Client as GoogleCloudStorageClient from great_expectations.datasource.fluent import BatchMetadata + from great_expectations.datasource.fluent.interfaces import ( + SortersDefinition, + ) from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) diff --git a/great_expectations/datasource/fluent/spark_s3_datasource.pyi b/great_expectations/datasource/fluent/spark_s3_datasource.pyi index 53d22c44fd95..5e53bc800bf5 100644 --- a/great_expectations/datasource/fluent/spark_s3_datasource.pyi +++ b/great_expectations/datasource/fluent/spark_s3_datasource.pyi @@ -13,12 +13,12 @@ from great_expectations.datasource.fluent.config_str import ( from great_expectations.datasource.fluent.data_asset.data_connector import ( S3DataConnector, ) -from great_expectations.datasource.fluent.interfaces import ( - SortersDefinition, -) if TYPE_CHECKING: from great_expectations.datasource.fluent import BatchMetadata + from great_expectations.datasource.fluent.interfaces import ( + SortersDefinition, + ) from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) From d2013e23fc9fb32eef0b35d116211d71cf02a3f2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Apr 2023 15:58:19 +0000 Subject: [PATCH 71/96] [MAINTENANCE] Bump terser from 5.10.0 to 5.16.8 in /docs/docusaurus (#7486) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Anthony Burdi --- docs/docusaurus/yarn.lock | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/docs/docusaurus/yarn.lock b/docs/docusaurus/yarn.lock index 6c137340644e..7a6c149f7769 100644 --- a/docs/docusaurus/yarn.lock +++ b/docs/docusaurus/yarn.lock @@ -8497,11 +8497,6 @@ source-map@^0.6.0, source-map@^0.6.1, source-map@~0.6.0: resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.6.1.tgz#74722af32e9614e9c287a8d0bbde48b5e2f1a263" integrity sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g== -source-map@~0.7.2: - version "0.7.3" - resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.7.3.tgz#5302f8169031735226544092e64981f751750383" - integrity 
sha512-CkCj6giN3S+n9qrYiBTX5gystlENnRW5jZeNLHpe6aue+SrHcG5VYwujhW9s4dY31mEGsxBDrHR6oI69fTXsaQ== - space-separated-tokens@^1.0.0: version "1.1.5" resolved "https://registry.yarnpkg.com/space-separated-tokens/-/space-separated-tokens-1.1.5.tgz#85f32c3d10d9682007e917414ddc5c26d1aa6899" @@ -8827,19 +8822,10 @@ terser-webpack-plugin@^5.3.3: serialize-javascript "^6.0.0" terser "^5.14.1" -terser@^5.10.0, terser@^5.7.2: - version "5.10.0" - resolved "https://registry.yarnpkg.com/terser/-/terser-5.10.0.tgz#b86390809c0389105eb0a0b62397563096ddafcc" - integrity sha512-AMmF99DMfEDiRJfxfY5jj5wNH/bYO09cniSqhfoyxc8sFoYIgkJy86G04UoZU5VjlpnplVu0K6Tx6E9b5+DlHA== - dependencies: - commander "^2.20.0" - source-map "~0.7.2" - source-map-support "~0.5.20" - -terser@^5.14.1: - version "5.15.1" - resolved "https://registry.yarnpkg.com/terser/-/terser-5.15.1.tgz#8561af6e0fd6d839669c73b92bdd5777d870ed6c" - integrity sha512-K1faMUvpm/FBxjBXud0LWVAGxmvoPbZbfTCYbSgaaYQaIXI3/TdI7a7ZGA73Zrou6Q8Zmz3oeUTsp/dj+ag2Xw== +terser@^5.10.0, terser@^5.14.1, terser@^5.7.2: + version "5.16.8" + resolved "https://registry.yarnpkg.com/terser/-/terser-5.16.8.tgz#ccde583dabe71df3f4ed02b65eb6532e0fae15d5" + integrity sha512-QI5g1E/ef7d+PsDifb+a6nnVgC4F22Bg6T0xrBrz6iloVB4PUkkunp6V8nzoOOZJIzjWVdAGqCdlKlhLq/TbIA== dependencies: "@jridgewell/source-map" "^0.3.2" acorn "^8.5.0" From b174dd3373223a72d19b39e88c53b7eb22457796 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:11:23 +0000 Subject: [PATCH 72/96] [MAINTENANCE] Bump cookiecutter from 1.7.3 to 2.1.1 in /contrib/cli (#7510) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Anthony Burdi --- contrib/cli/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/cli/requirements.txt b/contrib/cli/requirements.txt index 2507c93945f5..69b792817f1a 100644 --- a/contrib/cli/requirements.txt +++ b/contrib/cli/requirements.txt @@ -1,6 +1,6 @@ black[jupyter]==22.3.0 # Linting / code style Click>=7.1.2 # CLI tooling -cookiecutter==1.7.3 # Project templating +cookiecutter==2.1.1 # Project templating mypy==1.1.1 # Type checker pydantic>=1.0,<2.0 # Needed for mypy plugin pytest>=5.3.5 # Test framework From 6cdf61e998f985a8c619b22bdb56fe80f585c231 Mon Sep 17 00:00:00 2001 From: Rob Gray <104205257+kwcanuck@users.noreply.github.com> Date: Tue, 11 Apr 2023 13:13:06 -0400 Subject: [PATCH 73/96] [DOCS] Updates to Contributing through GitHub (#7601) --- .../docs/contributing/contributing_github.md | 64 ++++--------------- 1 file changed, 13 insertions(+), 51 deletions(-) diff --git a/docs/docusaurus/docs/contributing/contributing_github.md b/docs/docusaurus/docs/contributing/contributing_github.md index 28769c0b1364..53b0a1fd7bac 100644 --- a/docs/docusaurus/docs/contributing/contributing_github.md +++ b/docs/docusaurus/docs/contributing/contributing_github.md @@ -1,65 +1,27 @@ --- -title: Contributing through GitHub +title: Contributing with GitHub --- -## Making changes directly through GitHub +To avoid forking the repository, Great Expectations recommends using the GitHub Markdown editor to edit documentation. -If you want to change documentation, but not code, we suggest using the GitHub markdown editor, which means you don’t have to fork the repo at all. Here’s how you do this: +1. 
Open a browser and go to the [GitHub Great Expectations docs repository](https://github.com/great-expectations/great_expectations/tree/develop/docs). -### Start editing +2. Go to the topic file you want to edit. The topic URL contains the path to each topic file. For example, the path for this topic is . The URL indicates the topic is located in the `contributing` folder, and it's named `contributing_github`. -#### 1A. Edit from https://docs.greatexpectations.io/docs/ +3. Click the file and then click **Edit this file**. -* Go to the [Great Expectations docs](https://docs.greatexpectations.io/docs/). +4. Add your edits. See the Great Expectations [Style Guide](./style_guides/docs_style.md) for formatting recommendations. -* On each page, you’ll see an `Edit` button in the lower left. Click this to go to the source file in the Great Expectations GitHub repo. +5. Optional. Click the **Preview** tab to preview your changes. -#### 1B. Edit from GitHub +6. When you’ve completed your edits, scroll down to the **Propose changes** section and add a meaningful commit message and an explanation of what you changed and why. -* If you’re already on GitHub, the docs are located in `great_expectations > docs`. You can directly navigate to the respective page you want to edit (but getting there from https://docs.greatexpectations.io/docs/ is a little easier). +7. Select **Create a new branch for this commit and start a pull request**. Accept the default name for the branch, or enter a new one. -* In the top right of the grey header bar of the actual file, click the pencil icon to get into edit mode on GitHub. +8. Click **Propose changes**. -#### 2. Make edits - -* Make your edits and use the Preview tab to preview changes. - -* Please pay close attention to the [Style Guide](./style_guides/docs_style.md). - -### Submit a pull request - -#### 3. Submit your edits as a PR - -* When you’re done, add a meaningful commit message at the bottom. Use a short title and a meaningful explanation of what you changed and why. - -* Click the `Propose File Change` button at the bottom of the page. - -* Click the `Create Pull Request` button. - -* Optionally: Add comment to explain your change, if it’s not already in the commit message. - -* Click the next `Create Pull Request` button to create the actual PR. - -#### 4. Sign the CLA - -* If this is your first contribution to Great Expectations, You will see a comment from the “CLA Bot” that asks you to complete the Contributor Licence Agreement form. - -* Please complete the form and comment on the PR to say that you’ve signed the form. - -#### 5. Verify continuous integration checks - -* Wait for the other Continuous Integration (CI) checks to go green and watch out for a comment from the automated linter that checks for syntax and formatting issues. - -* Fix any issues that are flagged. (For documentation changes, it’s unlikely that you’ll have any issues.) - -#### 6. Wait for a core team member to approve and merge your PR - -* Once all checks pass, a Great Expectations team member will approve your PR and merge it. - -* GitHub will notify you of comments or a successful merge according to your notification settings. - -* If there are any issues, please address them promptly. - -Congratulations! You’ve just contributed to Great Expectations! + If this is your first Great Expectations documentation contribution, you'll be prompted to complete the Contributor License Agreement (CLA). 
Complete the CLA and add `@cla-bot check` as a comment to the pull request (PR) to indicate that you’ve completed it. +9. Wait for the Continuous Integration (CI) checks to complete and then correct any syntax or formatting issues. + A Great Expectations team member reviews, approves, and merges your PR. Depending on your GitHub notification settings, you'll be notified when there are comments or when your changes are successfully merged. From afc8db5ca4a186570ca638cda928c36a4e873cf0 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Tue, 11 Apr 2023 14:08:38 -0400 Subject: [PATCH 74/96] [MAINTENANCE] Polish and ratchet requirements pins and upper bounds (#7604) --- tests/test_packaging.py | 172 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 167 insertions(+), 5 deletions(-) diff --git a/tests/test_packaging.py b/tests/test_packaging.py index 0a4958eca1f7..083f05f8d22b 100644 --- a/tests/test_packaging.py +++ b/tests/test_packaging.py @@ -1,7 +1,10 @@ +from __future__ import annotations + import os.path import pathlib from typing import List +import pytest import requirements as rp @@ -15,21 +18,57 @@ def collect_requirements_files() -> List[pathlib.Path]: return list(project_root.glob(pattern)) + list(reqs_dir.glob(pattern)) -def test_requirements_files(): - """requirements.txt should be a subset of requirements-dev.txt""" +def parse_requirements_files_to_strings(files: list[pathlib.Path]) -> dict: + """Parse requirements files to dict. + + dict of the form {"filename": {"package_name_with_specs_as_str"}} + + Args: + files: Paths to files that should be parsed + """ req_set_dict = {} - req_files = collect_requirements_files() - for req_file in req_files: + for req_file in files: abs_path = req_file.absolute().as_posix() key = abs_path.rsplit(os.path.sep, 1)[-1] with open(req_file) as f: req_set_dict[key] = { - f'{line.name}{"".join(line.specs[0])}' + f'{line.name}{",".join(["".join(spec) for spec in line.specs])}' for line in rp.parse(f) if line.specs } + return req_set_dict + + +def parse_requirements_files_to_specs(files: list[pathlib.Path]) -> dict: + """Parse requirements files to dict. + + dict of the form {"filename": {"package_name_with_specs_as_str"}} + + Args: + files: Paths to files that should be parsed + """ + + req_set_dict = {} + for req_file in files: + abs_path = req_file.absolute().as_posix() + key = abs_path.rsplit(os.path.sep, 1)[-1] + with open(req_file) as f: + req_set_dict[key] = { + line.name: line.specs for line in rp.parse(f) if line.specs + } + + return req_set_dict + + +@pytest.mark.integration +def test_requirements_files(): + """requirements.txt should be a subset of requirements-dev.txt""" + + req_files = collect_requirements_files() + req_set_dict = parse_requirements_files_to_strings(files=req_files) + assert req_set_dict["requirements.txt"] <= req_set_dict["requirements-dev.txt"] assert ( @@ -109,3 +148,126 @@ def test_requirements_files(): | req_set_dict["requirements-dev-trino.txt"] | req_set_dict["requirements-dev-vertica.txt"] ) <= {"numpy>=1.21.0", "scipy>=1.7.0"} + + +@pytest.mark.integration +def test_polish_and_ratchet_pins_and_upper_bounds(): + """What does this test do and why? + + We would like to reduce the number of pins and upper bounds on dependencies + so that we can stay up to date with our dependencies where possible. 
+ """ + req_files = collect_requirements_files() + req_set_dict = parse_requirements_files_to_specs(files=req_files) + + packages_with_pins_or_upper_bounds = set() + + for req_file, package_specs in req_set_dict.items(): + for package, specs in package_specs.items(): + for spec in specs: + if spec[0] in ("<", "<=", "=="): + packages_with_pins_or_upper_bounds.add( + ( + req_file, + package, + tuple(sorted(specs, key=lambda s: (s[0], s[1]))), + ) + ) + + sorted_packages_with_pins_or_upper_bounds = sorted( + list(packages_with_pins_or_upper_bounds), key=lambda p: (p[0], p[1]) + ) + + # Polish and ratchet this number down as low as possible + assert len(sorted_packages_with_pins_or_upper_bounds) == 76 + + assert sorted_packages_with_pins_or_upper_bounds == [ + ("requirements-dev-api-docs-test.txt", "docstring-parser", (("==", "0.15"),)), + ("requirements-dev-contrib.txt", "adr-tools-python", (("==", "1.0.3"),)), + ("requirements-dev-contrib.txt", "black", (("==", "22.3.0"),)), + ("requirements-dev-contrib.txt", "mypy", (("==", "1.1.1"),)), + ("requirements-dev-contrib.txt", "ruff", (("==", "0.0.255"),)), + ("requirements-dev-dremio.txt", "sqlalchemy-dremio", (("==", "1.2.1"),)), + ("requirements-dev-excel.txt", "xlrd", (("<", "2.0.0"), (">=", "1.1.0"))), + ("requirements-dev-lite.txt", "ipykernel", (("<=", "6.17.1"),)), + ("requirements-dev-lite.txt", "moto", (("<", "3.0.0"), (">=", "2.0.0"))), + ("requirements-dev-lite.txt", "snapshottest", (("==", "0.6.0"),)), + ("requirements-dev-lite.txt", "sqlalchemy", (("<", "2.0.0"), (">=", "1.4.0"))), + ("requirements-dev-mysql.txt", "PyMySQL", (("<", "0.10"), (">=", "0.9.3"))), + ("requirements-dev-pagerduty.txt", "pypd", (("==", "1.1.0"),)), + ( + "requirements-dev-sqlalchemy.txt", + "PyMySQL", + (("<", "0.10"), (">=", "0.9.3")), + ), + ("requirements-dev-sqlalchemy.txt", "ipykernel", (("<=", "6.17.1"),)), + ("requirements-dev-sqlalchemy.txt", "moto", (("<", "3.0.0"), (">=", "2.0.0"))), + ("requirements-dev-sqlalchemy.txt", "snapshottest", (("==", "0.6.0"),)), + ( + "requirements-dev-sqlalchemy.txt", + "sqlalchemy", + (("<", "2.0.0"), (">=", "1.4.0")), + ), + ("requirements-dev-sqlalchemy.txt", "sqlalchemy-dremio", (("==", "1.2.1"),)), + ( + "requirements-dev-sqlalchemy.txt", + "teradatasqlalchemy", + (("==", "17.0.0.1"),), + ), + ("requirements-dev-teradata.txt", "teradatasqlalchemy", (("==", "17.0.0.1"),)), + ("requirements-dev-test.txt", "adr-tools-python", (("==", "1.0.3"),)), + ("requirements-dev-test.txt", "black", (("==", "22.3.0"),)), + ("requirements-dev-test.txt", "docstring-parser", (("==", "0.15"),)), + ("requirements-dev-test.txt", "ipykernel", (("<=", "6.17.1"),)), + ("requirements-dev-test.txt", "moto", (("<", "3.0.0"), (">=", "2.0.0"))), + ("requirements-dev-test.txt", "mypy", (("==", "1.1.1"),)), + ("requirements-dev-test.txt", "ruff", (("==", "0.0.255"),)), + ("requirements-dev-test.txt", "snapshottest", (("==", "0.6.0"),)), + ("requirements-dev-test.txt", "sqlalchemy", (("<", "2.0.0"), (">=", "1.4.0"))), + ("requirements-dev.txt", "PyMySQL", (("<", "0.10"), (">=", "0.9.3"))), + ("requirements-dev.txt", "adr-tools-python", (("==", "1.0.3"),)), + ("requirements-dev.txt", "altair", (("<", "4.2.1"), (">=", "4.0.0"))), + ("requirements-dev.txt", "black", (("==", "22.3.0"),)), + ("requirements-dev.txt", "docstring-parser", (("==", "0.15"),)), + ("requirements-dev.txt", "ipykernel", (("<=", "6.17.1"),)), + ("requirements-dev.txt", "makefun", (("<", "2"), (">=", "1.7.0"))), + ("requirements-dev.txt", "marshmallow", (("<", "4.0.0"), (">=", 
"3.7.1"))), + ("requirements-dev.txt", "moto", (("<", "3.0.0"), (">=", "2.0.0"))), + ("requirements-dev.txt", "mypy", (("==", "1.1.1"),)), + ("requirements-dev.txt", "pandas", (("<", "2.0.0"), (">=", "1.3.0"))), + ("requirements-dev.txt", "pydantic", (("<", "2.0"), (">=", "1.9.2"))), + ("requirements-dev.txt", "pypd", (("==", "1.1.0"),)), + ("requirements-dev.txt", "ruamel.yaml", (("<", "0.17.18"), (">=", "0.16"))), + ("requirements-dev.txt", "ruff", (("==", "0.0.255"),)), + ("requirements-dev.txt", "snapshottest", (("==", "0.6.0"),)), + ("requirements-dev.txt", "sqlalchemy", (("<", "2.0.0"), (">=", "1.4.0"))), + ("requirements-dev.txt", "sqlalchemy-dremio", (("==", "1.2.1"),)), + ("requirements-dev.txt", "teradatasqlalchemy", (("==", "17.0.0.1"),)), + ("requirements-dev.txt", "urllib3", (("<", "1.27"), (">=", "1.25.4"))), + ("requirements-dev.txt", "xlrd", (("<", "2.0.0"), (">=", "1.1.0"))), + ("requirements-types.txt", "PyMySQL", (("<", "0.10"), (">=", "0.9.3"))), + ("requirements-types.txt", "adr-tools-python", (("==", "1.0.3"),)), + ("requirements-types.txt", "altair", (("<", "4.2.1"), (">=", "4.0.0"))), + ("requirements-types.txt", "black", (("==", "22.3.0"),)), + ("requirements-types.txt", "ipykernel", (("<=", "6.17.1"),)), + ("requirements-types.txt", "makefun", (("<", "2"), (">=", "1.7.0"))), + ("requirements-types.txt", "marshmallow", (("<", "4.0.0"), (">=", "3.7.1"))), + ("requirements-types.txt", "moto", (("<", "3.0.0"), (">=", "2.0.0"))), + ("requirements-types.txt", "mypy", (("==", "1.1.1"),)), + ("requirements-types.txt", "pandas", (("<", "2.0.0"), (">=", "1.3.0"))), + ("requirements-types.txt", "pydantic", (("<", "2.0"), (">=", "1.9.2"))), + ("requirements-types.txt", "ruamel.yaml", (("<", "0.17.18"), (">=", "0.16"))), + ("requirements-types.txt", "ruff", (("==", "0.0.255"),)), + ("requirements-types.txt", "snapshottest", (("==", "0.6.0"),)), + ("requirements-types.txt", "sqlalchemy", (("<", "2.0.0"), (">=", "1.4.0"))), + ("requirements-types.txt", "sqlalchemy-dremio", (("==", "1.2.1"),)), + ("requirements-types.txt", "teradatasqlalchemy", (("==", "17.0.0.1"),)), + ("requirements-types.txt", "urllib3", (("<", "1.27"), (">=", "1.25.4"))), + ("requirements.txt", "altair", (("<", "4.2.1"), (">=", "4.0.0"))), + ("requirements.txt", "makefun", (("<", "2"), (">=", "1.7.0"))), + ("requirements.txt", "marshmallow", (("<", "4.0.0"), (">=", "3.7.1"))), + ("requirements.txt", "pandas", (("<", "2.0.0"), (">=", "1.3.0"))), + ("requirements.txt", "pydantic", (("<", "2.0"), (">=", "1.9.2"))), + ("requirements.txt", "ruamel.yaml", (("<", "0.17.18"), (">=", "0.16"))), + ("requirements.txt", "urllib3", (("<", "1.27"), (">=", "1.25.4"))), + ] From 2f469ee31e7dd08e71ae128a03f4c357bbfb3867 Mon Sep 17 00:00:00 2001 From: Asaf Lachisch <79054184+asafla@users.noreply.github.com> Date: Tue, 11 Apr 2023 21:31:09 +0300 Subject: [PATCH 75/96] [CONTRIB] Expect Column Values to be Valid UUID - Added SqlAlchemyExecutionEngine support (#7592) Co-authored-by: Anthony Burdi --- .../expect_column_values_to_be_valid_uuid.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_uuid.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_uuid.py index 6c683ff53365..7995636d2629 100644 --- 
a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_uuid.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_uuid.py @@ -8,7 +8,10 @@ from uuid import UUID from great_expectations.core.expectation_configuration import ExpectationConfiguration -from great_expectations.execution_engine import PandasExecutionEngine +from great_expectations.execution_engine import ( + PandasExecutionEngine, + SqlAlchemyExecutionEngine, +) from great_expectations.expectations.expectation import ColumnMapExpectation from great_expectations.expectations.metrics import ( ColumnMapMetricProvider, @@ -37,9 +40,19 @@ def _pandas(cls, column, **kwargs): return column.apply(lambda x: is_valid_uuid(x)) # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine - # @column_condition_partial(engine=SqlAlchemyExecutionEngine) - # def _sqlalchemy(cls, column, _dialect, **kwargs): - # raise NotImplementedError + @column_condition_partial(engine=SqlAlchemyExecutionEngine) + def _sqlalchemy(cls, column, _dialect, **kwargs): + """ + Please note that there is a stricter version to verify GUID, as can be seen in the following link: + https://www.techtarget.com/searchwindowsserver/definition/GUID-global-unique-identifier#:~:text=RFC%204122%20specification.-,How%20does%20GUID%20work%3F,-GUIDs%20are%20constructed + However, since the UUID package doesn't seem to enforce it, the chosen regex was the less stricter. + For future purposes, the stricter pattern can be found here as well, commented out. + """ + # regex_pattern = '^(urn:uuid:)?\{?[A-Fa-f0-9]{8}-?[A-Fa-f0-9]{4}-?[1-5][A-Fa-f0-9]{3}-?[89ABab][A-Fa-f0-9]{3}-?[A-Fa-f0-9]{12}\}?$' + regex_pattern = ( + "^(urn:uuid:)?\\{?[0-9a-fA-F]{8}(-?[0-9a-fA-F]{4}){3}-?[0-9a-fA-F]{12}\\}?$" + ) + return column.regexp_match(regex_pattern) # This method defines the business logic for evaluating your metric when using a SparkDFExecutionEngine # @column_condition_partial(engine=SparkDFExecutionEngine) @@ -65,9 +78,9 @@ class ExpectColumnValuesToBeValidUUID(ColumnMapExpectation): # hyphens may or may not be present "e8a4926e5f7643079e8acdbd49a4e15b", # curly braces may or may not be present - "{00010203-0405-0607-0809-0a0b0c0d0e0f}", + "{00010203-0405-1607-8809-0a0b0c0d0e0f}", # leading identifier "urn:uuid:" is allowed - "urn:uuid:12345678-1234-5678-1234-567812345678", + "urn:uuid:12345678-1234-5678-9234-567812345678", ], "malformed_uuids": [ # has non-hexidecimal value @@ -147,6 +160,7 @@ def validate_configuration( "tags": ["typed-entities"], # Tags for this Expectation in the Gallery "contributors": [ # Github handles for all contributors to this Expectation. "@joshua-stauffer", # Don't forget to add your github handle here! 
+ "@asafla", ], } From bea9d5144e1e318e0c391b73174944b79e4c2ac5 Mon Sep 17 00:00:00 2001 From: David Talbot <17692467+dctalbot@users.noreply.github.com> Date: Tue, 11 Apr 2023 14:47:43 -0400 Subject: [PATCH 76/96] [MAINTENANCE] small documentation updates (#7606) Co-authored-by: Rob Gray <104205257+kwcanuck@users.noreply.github.com> --- docs/docusaurus/docs/contributing/contributing_github.md | 2 +- docs/docusaurus/docs/contributing/contributing_overview.md | 6 ++---- docs/docusaurus/docs/contributing/contributing_setup.md | 4 ++-- docs_rtd/contributing/make_changes_through_github.rst | 2 +- docs_rtd/contributing/setting_up_your_dev_environment.rst | 2 +- .../how_to_add_and_test_a_new_sqlalchemydataset_class.rst | 2 +- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/docusaurus/docs/contributing/contributing_github.md b/docs/docusaurus/docs/contributing/contributing_github.md index 53b0a1fd7bac..0325d2f9195e 100644 --- a/docs/docusaurus/docs/contributing/contributing_github.md +++ b/docs/docusaurus/docs/contributing/contributing_github.md @@ -24,4 +24,4 @@ To avoid forking the repository, Great Expectations recommends using the GitHub 9. Wait for the Continuous Integration (CI) checks to complete and then correct any syntax or formatting issues. - A Great Expectations team member reviews, approves, and merges your PR. Depending on your GitHub notification settings, you'll be notified when there are comments or when your changes are successfully merged. + A Great Expectations team member reviews, approves, and merges your PR. Depending on your GitHub notification settings, you'll be notified when there are comments or when your changes are successfully merged. \ No newline at end of file diff --git a/docs/docusaurus/docs/contributing/contributing_overview.md b/docs/docusaurus/docs/contributing/contributing_overview.md index 0295096e6f97..ab37280788a6 100644 --- a/docs/docusaurus/docs/contributing/contributing_overview.md +++ b/docs/docusaurus/docs/contributing/contributing_overview.md @@ -9,7 +9,7 @@ Our goal is to make your experience as great as possible. Please follow these st #### 1. Join the community on Slack -* Go to [greatexpectations.io/slack](https://greatexpectations.io/slack) and introduce yourself in the `#contributors-contributing` channel. +* Go to [greatexpectations.io/slack](https://greatexpectations.io/slack) and introduce yourself in the [#contributing](https://greatexpectationstalk.slack.com/archives/CV828B2UX) channel. #### 2. Contribute small changes directly through GitHub @@ -18,15 +18,13 @@ Our goal is to make your experience as great as possible. Please follow these st #### 3. Set up your development environment to contribute large changes * Follow these instructions to [set up your dev environment](./contributing_setup.md). -Alternatively, for small changes that don’t need to be tested locally (e.g. documentation changes), you can [make changes directly through GitHub](https://docs.greatexpectations.io/docs/contributing/contributing_github). - #### 4. Identify the type of contribution that you want to make * [Issues in GitHub](https://github.com/great-expectations/great_expectations/issues) are a great place to start. Check out the help wanted and good first issue labels. Comment to let everyone know you’re working on it. * If there’s no issue for what you want to work on, please create one. Add a comment to let everyone know that you’re working on it. 
We prefer small, incremental commits, because it makes the thought process behind changes easier to review. -* Our [Levels of maturity grid](./contributing_maturity.md) provides guidelines for how the maintainers of Great Expectations evaluate levels of maturity of a feature. +* Our [Levels of Maturity grid](./contributing_maturity.md) provides guidelines for how the maintainers of Great Expectations evaluate levels of maturity of a feature. #### 5. Start developing * Make sure to reference the style guides for [code](./style_guides/code_style.md) and diff --git a/docs/docusaurus/docs/contributing/contributing_setup.md b/docs/docusaurus/docs/contributing/contributing_setup.md index fc62aac13fd7..2d8317fdf1b1 100644 --- a/docs/docusaurus/docs/contributing/contributing_setup.md +++ b/docs/docusaurus/docs/contributing/contributing_setup.md @@ -49,7 +49,7 @@ In order to contribute to Great Expectations, you will need the following: ### Install Python dependencies ### (Easy version of steps 5-7 below for Mac/Linux users) -Create a virtual environment in your locally cloned repo, use the same version of `pip` that we use in our CI/CD pipelines (for Python 3.7 - 3.10), and install the fewest dependencies needed for a dev environment (to minimize potential setup headaches). +Create a virtual environment in your local repository using Python versions 3.7 to 3.10, activate the environment, and then install the necessary dependencies. ``` python3 -m venv gx_dev @@ -105,7 +105,7 @@ Later on, try setting up the full dev environment (as mentioned in step 6) when ### 5. Create a new virtual environment -* Make a new virtual environment (e.g. using virtualenv or conda), name it “great_expectations_dev” or similar. +* Make a new virtual environment and name it “great_expectations_dev” or similar. * Ex virtualenv: `python3 -m venv /great_expectations_dev` and then `/great_expectations_dev/bin/activate` diff --git a/docs_rtd/contributing/make_changes_through_github.rst b/docs_rtd/contributing/make_changes_through_github.rst index 1be26317facd..b870a4ce8cff 100644 --- a/docs_rtd/contributing/make_changes_through_github.rst +++ b/docs_rtd/contributing/make_changes_through_github.rst @@ -39,7 +39,7 @@ Submit a pull request **4. Sign the CLA** - * If this is your first contribution to Great Expectations, You will see a comment from the "CLA Bot" that asks you to complete the Contributor Licence Agreement form. + * If this is your first contribution to Great Expectations, you will see a comment from the "CLA Bot" that asks you to complete the Contributor Licence Agreement form. * Please complete the form and comment on the PR to say that you’ve signed the form. **5. Verify continuous integration checks** diff --git a/docs_rtd/contributing/setting_up_your_dev_environment.rst b/docs_rtd/contributing/setting_up_your_dev_environment.rst index bbc35b60a746..0289c4ff7f0c 100644 --- a/docs_rtd/contributing/setting_up_your_dev_environment.rst +++ b/docs_rtd/contributing/setting_up_your_dev_environment.rst @@ -54,7 +54,7 @@ Install python dependencies **5. Create a new virtual environment** - * Make a new virtual environment (e.g. using virtualenv or conda), name it "great_expectations_dev" or similar. + * Make a new virtual environment and name it "great_expectations_dev" or similar. 
* Ex virtualenv: ``python3 -m venv /great_expectations_dev`` and then ``source /great_expectations_dev/bin/activate`` * Ex conda: ``conda create --name great_expectations_dev python=3.7`` and then ``conda activate great_expectations_dev`` (we support multiple python versions, you may select something other than 3.7). * This is not required, but highly recommended. diff --git a/docs_rtd/guides/how_to_guides/miscellaneous/how_to_add_and_test_a_new_sqlalchemydataset_class.rst b/docs_rtd/guides/how_to_guides/miscellaneous/how_to_add_and_test_a_new_sqlalchemydataset_class.rst index c889a2f098e7..18160f3a95d1 100644 --- a/docs_rtd/guides/how_to_guides/miscellaneous/how_to_add_and_test_a_new_sqlalchemydataset_class.rst +++ b/docs_rtd/guides/how_to_guides/miscellaneous/how_to_add_and_test_a_new_sqlalchemydataset_class.rst @@ -9,7 +9,7 @@ This guide will help you extend the Great Expectations execution layer to work o This guide is a working checklist. We improve it each time we add support for a new SQL dialect, but it still has rough edges. And adding support for a new database will probably never be a cookie-cutter operation. - If you're interested in extending Great Expectations in this way, please reach out on `Slack `__ in the ``#contributors-contributing`` channel, and we'll work with you to get there. + If you're interested in extending Great Expectations in this way, submit a request on the `Slack `__ ``#contributing`` channel, and a member of our team will work with you to create a solution. Steps ----- From 0d0eac4aa94527ff2205506489ed90c01c8a7f88 Mon Sep 17 00:00:00 2001 From: Manas Bhardwaj Date: Wed, 12 Apr 2023 02:19:41 +0530 Subject: [PATCH 77/96] [FEATURE] Added AssumeRole Feature (#7547) Co-authored-by: Manas Bhardwaj - Personal --- .../data_context/store/tuple_store_backend.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/great_expectations/data_context/store/tuple_store_backend.py b/great_expectations/data_context/store/tuple_store_backend.py index f3715830459e..d592bb7d293c 100644 --- a/great_expectations/data_context/store/tuple_store_backend.py +++ b/great_expectations/data_context/store/tuple_store_backend.py @@ -699,6 +699,28 @@ def _has_key(self, key): all_keys = self.list_keys() return key in all_keys + def _assume_role_and_get_secret_credentials(self): + import boto3 + + role_session_name = "GXAssumeRoleSession" + client = boto3.client("sts", self._boto3_options.get("region_name")) + role_arn = self._boto3_options.pop("assume_role_arn") + assume_role_duration = self._boto3_options.pop("assume_role_duration") + response = client.assume_role( + RoleArn=role_arn, + RoleSessionName=role_session_name, + DurationSeconds=assume_role_duration, + ) + self._boto3_options["aws_access_key_id"] = response["Credentials"][ + "AccessKeyId" + ] + self._boto3_options["aws_secret_access_key"] = response["Credentials"][ + "SecretAccessKey" + ] + self._boto3_options["aws_session_token"] = response["Credentials"][ + "SessionToken" + ] + @property def boto3_options(self): from botocore.client import Config @@ -707,6 +729,8 @@ def boto3_options(self): if self._boto3_options.get("signature_version"): signature_version = self._boto3_options.pop("signature_version") result["config"] = Config(signature_version=signature_version) + if self._boto3_options.get("assume_role_arn"): + self._assume_role_and_get_secret_credentials() result.update(self._boto3_options) return result From da817659fa83b565f70304543625d2c905d87ded Mon Sep 17 00:00:00 2001 From: William Shin Date: Tue, 
11 Apr 2023 14:57:57 -0700 Subject: [PATCH 78/96] [BUGFIX] `dataset_name` made optional parameter for Expectations (#7603) --- great_expectations/checkpoint/checkpoint.py | 2 +- great_expectations/self_check/util.py | 42 ++++++++- .../test_expectations_v3_api.py | 29 +++++- tests/test_the_utils_in_self_check_utils.py | 91 +++++++++++++++++-- 4 files changed, 148 insertions(+), 16 deletions(-) diff --git a/great_expectations/checkpoint/checkpoint.py b/great_expectations/checkpoint/checkpoint.py index cc568c7f41c4..03b295907e6b 100644 --- a/great_expectations/checkpoint/checkpoint.py +++ b/great_expectations/checkpoint/checkpoint.py @@ -135,7 +135,7 @@ def __init__( version="0.13.33", message="Used in cloud deployments.", ) - def run( + def run( # noqa: C901 - complexity 19 self, template_name: Optional[str] = None, run_name_template: Optional[str] = None, diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index 402c7f39f7b6..2600bf92bd1c 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -93,6 +93,9 @@ expectationConfigurationSchema = ExpectationConfigurationSchema() expectationSuiteSchema = ExpectationSuiteSchema() +# mysql and mssql allow table names to be a maximum of 128 characters +# for postgres it is 63. +MAX_TABLE_NAME_LENGTH: int = 63 logger = logging.getLogger(__name__) @@ -2757,7 +2760,7 @@ def generate_test_table_name( def generate_dataset_name_from_expectation_name( dataset: dict, expectation_type: str, index: int, sub_index: int | None = None ) -> str: - """Method to generate datset_name for tests. Will either use the name defined in the test + """Method to generate dataset_name for tests. Will either use the name defined in the test configuration ("dataset_name"), or generate one using the Expectation name and index. In cases where the dataset is a list, then an additional index will be used. @@ -2779,6 +2782,43 @@ def generate_dataset_name_from_expectation_name( dataset_name = dataset.get( "dataset_name", f"{expectation_type}_dataset_{index}_{sub_index}" ) + + dataset_name = _check_if_valid_dataset_name(dataset_name) + return dataset_name + + +def _check_if_valid_dataset_name(dataset_name: str) -> str: + """Check that dataset_name (ie. table name) is valid before adding data to table. + + A valid dataset_name must: + + 1. Contain only alphanumeric characters and `_` + 2. Not be longer than 63 characters (which is the limit for postgres) + 3. Begin with letter + + Args: + dataset_name (str): dataset_name passed in by user or generated by generate_dataset_name_from_expectation_name() + + Returns: dataset_name + + """ + if not re.match(r"^[A-Za-z0-9_]+$", dataset_name): + raise ExecutionEngineError( + f"dataset_name: {dataset_name} is not valid, because it contains non-alphanumeric and _ characters." + f"Please check your configuration." + ) + + if len(dataset_name) >= MAX_TABLE_NAME_LENGTH: + # starting from the end, so that we always get the index and sub_index + new_dataset_name = dataset_name[-MAX_TABLE_NAME_LENGTH:] + logger.info( + f"dataset_name: '{dataset_name}' was truncated to '{new_dataset_name}' to keep within length limits." 
+ ) + dataset_name = new_dataset_name + + while not re.match(r"^[A-Za-z]+$", dataset_name[0]): + dataset_name = dataset_name[1:] + return dataset_name diff --git a/tests/test_definitions/test_expectations_v3_api.py b/tests/test_definitions/test_expectations_v3_api.py index a84d4dae97ef..99b0c9db6c66 100644 --- a/tests/test_definitions/test_expectations_v3_api.py +++ b/tests/test_definitions/test_expectations_v3_api.py @@ -16,6 +16,7 @@ BigQueryDialect, candidate_test_is_on_temporary_notimplemented_list_v3_api, evaluate_json_test_v3_api, + generate_dataset_name_from_expectation_name, generate_sqlite_db_path, get_test_validator_with_data, mssqlDialect, @@ -51,8 +52,8 @@ def pytest_generate_tests(metafunc): # noqa C901 - 35 pk_column: bool = False file = open(filename) test_configuration = json.load(file) - - for test_config in test_configuration["datasets"]: + expectation_type = filename.split(".json")[0].split("/")[-1] + for index, test_config in enumerate(test_configuration["datasets"], 1): datasets = [] # optional only_for and suppress_test flag at the datasets-level that can prevent data being # added to incompatible backends. Currently only used by expect_column_values_to_be_unique @@ -76,12 +77,24 @@ def pytest_generate_tests(metafunc): # noqa C901 - 35 skip_expectation = False if isinstance(test_config["data"], list): sqlite_db_path = generate_sqlite_db_path() + sub_index: int = ( + 1 # additional index needed when dataset is a list + ) for dataset in test_config["data"]: + dataset_name = ( + generate_dataset_name_from_expectation_name( + dataset=dataset, + expectation_type=expectation_type, + index=index, + sub_index=sub_index, + ) + ) + datasets.append( get_test_validator_with_data( execution_engine=backend, data=dataset["data"], - table_name=dataset["dataset_name"], + table_name=dataset_name, schemas=dataset.get("schemas"), sqlite_db_path=sqlite_db_path, context=cast( @@ -105,10 +118,16 @@ def pytest_generate_tests(metafunc): # noqa C901 - 35 if "schemas" in test_config else None ) + dataset = test_config["data"] + dataset_name = generate_dataset_name_from_expectation_name( + dataset=dataset, + expectation_type=expectation_type, + index=index, + ) validator_with_data = get_test_validator_with_data( execution_engine=backend, - data=test_config["data"], - table_name=test_config["dataset_name"], + data=dataset, + table_name=dataset_name, schemas=schemas, context=cast( DataContext, build_in_memory_runtime_context() diff --git a/tests/test_the_utils_in_self_check_utils.py b/tests/test_the_utils_in_self_check_utils.py index f2af4acf0c49..55a98e143c6b 100644 --- a/tests/test_the_utils_in_self_check_utils.py +++ b/tests/test_the_utils_in_self_check_utils.py @@ -1,14 +1,18 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise + import pytest +from great_expectations.exceptions import ExecutionEngineError from great_expectations.self_check.util import ( + _check_if_valid_dataset_name, generate_dataset_name_from_expectation_name, ) @pytest.mark.parametrize( - "dataset,expectation_name,index,sub_index,expected_output", + "dataset,expectation_name,index,sub_index,expected_output,expectation", [ pytest.param( {"dataset_name": "i_am_a_dataset"}, @@ -16,6 +20,7 @@ 1, None, "i_am_a_dataset", + does_not_raise(), id="defined_in_dataset_dict", ), pytest.param( @@ -24,6 +29,7 @@ 1, None, "expect_things_dataset_1", + does_not_raise(), id="expectation_name_and_index", ), pytest.param( @@ -32,8 +38,27 @@ 1, 2, "expect_things_dataset_1_2", + does_not_raise(), 
id="expectation_name_and_sub_index", ), + pytest.param( + {}, + "expect_many_many_many_many_many_many_many_many_many_many_things", + 1, + None, + "y_many_many_many_many_many_many_many_many_many_things_dataset_1", + does_not_raise(), + id="expection_name_truncated_and_underscore_removed", + ), + pytest.param( + {}, + "i*am*not*valid", + 1, + None, + "", + pytest.raises(ExecutionEngineError), + id="expection_name_truncated_and_underscore_removed", + ), ], ) def test_generate_table_name_with_expectation( @@ -42,6 +67,7 @@ def test_generate_table_name_with_expectation( expected_output: str, index: int, sub_index: int | None, + expectation, ): """Test for helper method that automatically generates table name for tests Args: @@ -51,12 +77,59 @@ def test_generate_table_name_with_expectation( index (int): index of current dataset sub_index (int): optional sub_index if there dataset is part of a list. """ - assert ( - generate_dataset_name_from_expectation_name( - dataset=dataset, - expectation_type=expectation_name, - index=index, - sub_index=sub_index, + with expectation: + assert ( + generate_dataset_name_from_expectation_name( + dataset=dataset, + expectation_type=expectation_name, + index=index, + sub_index=sub_index, + ) + == expected_output + ) + + +@pytest.mark.parametrize( + "dataset_name,expected_output,expectation", + [ + pytest.param( + "i_am_a_dataset", + "i_am_a_dataset", + does_not_raise(), + id="defined_in_dataset_dict", + ), + pytest.param( + "expect_many_many_many_many_many_many_many_many_many_many_many_many_many_many_things", + "y_many_many_many_many_many_many_many_many_many_many_many_things", + does_not_raise(), + id="expection_name_truncated_and_underscore_removed", + ), + pytest.param( + "i*am*not*valid", + "", + pytest.raises(ExecutionEngineError), + id="expection_name_truncated_and_underscore_removed", + ), + pytest.param( + "_________i_have_too_many_underscores", + "i_have_too_many_underscores", + does_not_raise(), + id="beginning_underscores_removed", + ), + ], +) +def test_check_if_valid_dataset_name( + dataset_name: str, expected_output: str, expectation +): + """Test for helper method that ensures table names are valid for tests + Args: + dataset_name(str): candidate dataset_name. + expected_output (str): what should the final table name be? 
+ """ + with expectation: + assert ( + _check_if_valid_dataset_name( + dataset_name=dataset_name, + ) + == expected_output ) - == expected_output - ) From 9507a83cc6a0a067c17285e55f73c0f7c6097288 Mon Sep 17 00:00:00 2001 From: William Shin Date: Tue, 11 Apr 2023 17:47:59 -0700 Subject: [PATCH 79/96] [MAINTENANCE] SqlAlchemy 2 Compatibility - `engine.execute()` (#7469) Co-authored-by: Anthony Burdi Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- great_expectations/core/util.py | 28 +- .../store/database_store_backend.py | 25 +- .../data_context/store/query_store.py | 15 +- .../sqlalchemy_execution_engine.py | 21 +- .../map_condition_auxilliary_methods.py | 9 +- .../expectations/metrics/util.py | 394 +++++++++--------- great_expectations/optional_imports.py | 19 + pyproject.toml | 4 - tests/conftest.py | 37 +- .../test_data_context_test_yaml_config.py | 34 +- tests/data_context/test_data_context_v013.py | 4 +- .../test_sqlalchemy_execution_engine.py | 20 +- tests/expectations/test_null_filters.py | 6 +- tests/test_utils.py | 46 +- 14 files changed, 367 insertions(+), 295 deletions(-) diff --git a/great_expectations/core/util.py b/great_expectations/core/util.py index 985862423812..cdf136a4233d 100644 --- a/great_expectations/core/util.py +++ b/great_expectations/core/util.py @@ -72,7 +72,12 @@ MultiPolygon = None LineString = None -from great_expectations.optional_imports import SQLALCHEMY_NOT_IMPORTED, sqlalchemy +from great_expectations.optional_imports import ( + SQLALCHEMY_NOT_IMPORTED, + sqlalchemy, + sqlalchemy_Connection, + sqlalchemy_TextClause, +) try: LegacyRow = sqlalchemy.engine.row.LegacyRow @@ -82,13 +87,6 @@ ): # We need to catch an AttributeError since sqlalchemy>=2 does not have LegacyRow LegacyRow = SQLALCHEMY_NOT_IMPORTED -# This is a separate try/except than the LegacyRow one since TextClause exists in sqlalchemy 2. This means LegacyRow -# may be not importable while TextClause is. -try: - TextClause = sqlalchemy.sql.elements.TextClause -except ImportError: - TextClause = SQLALCHEMY_NOT_IMPORTED - SCHEMAS = { "api_np": { "NegativeInfinity": -np.inf, @@ -414,7 +412,7 @@ def convert_to_json_serializable( # noqa: C901 - complexity 32 return dict(data) # sqlalchemy text for SqlAlchemy 2 compatibility - if TextClause and isinstance(data, TextClause): + if sqlalchemy_TextClause and isinstance(data, sqlalchemy_TextClause): return str(data) if isinstance(data, decimal.Decimal): @@ -427,6 +425,10 @@ def convert_to_json_serializable( # noqa: C901 - complexity 32 if StructType is not None and isinstance(data, StructType): return dict(data.jsonValue()) + if sqlalchemy_Connection and isinstance(data, sqlalchemy_Connection): + # Connection is a module, which is non-serializable. Return module name instead. + return "sqlalchemy.engine.base.Connection" + raise TypeError( f"{str(data)} is of type {type(data).__name__} which cannot be serialized." 
) @@ -533,8 +535,12 @@ def ensure_json_serializable(data): # noqa: C901 - complexity 21 if isinstance(data, RunIdentifier): return - if sqlalchemy is not None and isinstance(data, sqlalchemy.sql.elements.TextClause): - return str(data) + if sqlalchemy_TextClause and isinstance(data, sqlalchemy_TextClause): + # TextClause is handled manually by convert_to_json_serializable() + return + if sqlalchemy_Connection and isinstance(data, sqlalchemy_Connection): + # Connection module is handled manually by convert_to_json_serializable() + return else: raise InvalidExpectationConfigurationError( diff --git a/great_expectations/data_context/store/database_store_backend.py b/great_expectations/data_context/store/database_store_backend.py index 40f4309cdd40..c8321383ec23 100644 --- a/great_expectations/data_context/store/database_store_backend.py +++ b/great_expectations/data_context/store/database_store_backend.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import uuid from pathlib import Path @@ -5,7 +7,10 @@ import great_expectations.exceptions as gx_exceptions from great_expectations.data_context.store.store_backend import StoreBackend -from great_expectations.optional_imports import SQLALCHEMY_NOT_IMPORTED +from great_expectations.optional_imports import ( + SQLALCHEMY_NOT_IMPORTED, + sqlalchemy_Row, +) from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.util import ( filter_properties_dict, @@ -170,7 +175,7 @@ def store_backend_id(self) -> str: self._store_backend_id = f"{self.STORE_BACKEND_ID_PREFIX}{store_id}" return self._store_backend_id.replace(self.STORE_BACKEND_ID_PREFIX, "") - def _build_engine(self, credentials, **kwargs) -> "sa.engine.Engine": + def _build_engine(self, credentials, **kwargs) -> "sa.engine.Engine": # noqa: UP037 """ Using a set of given credentials, constructs an Execution Engine , connecting to a database using a URL or a private key path. @@ -198,7 +203,7 @@ def _build_engine(self, credentials, **kwargs) -> "sa.engine.Engine": @staticmethod def _get_sqlalchemy_key_pair_auth_url( drivername: str, credentials: dict - ) -> Tuple["URL", Dict]: + ) -> Tuple["URL", Dict]: # noqa: UP037 """ Utilizing a private key path and a passphrase in a given credentials dictionary, attempts to encode the provided values into a private key. If passphrase is incorrect, this will fail and an exception is raised. 
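The hunks below, like the rest of this patch, replace the legacy `Engine.execute()` call with connection-scoped execution so the same code runs under SQLAlchemy 1.4 and 2.0. A minimal sketch of that pattern, assuming an in-memory SQLite engine purely for illustration:

```python
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")

# Legacy 1.x style (removed by this patch): engine.execute(sa.text("SELECT 1"))
# 2.0-compatible style: acquire a Connection and execute on it; engine.begin()
# opens a transaction and commits it when the block exits.
with engine.begin() as connection:
    connection.execute(sa.text("CREATE TABLE t (a INTEGER)"))
    connection.execute(sa.text("INSERT INTO t (a) VALUES (1), (2)"))
    rows = connection.execute(sa.text("SELECT a FROM t ORDER BY a")).fetchall()

assert [row[0] for row in rows] == [1, 2]
```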
@@ -260,7 +265,9 @@ def _get(self, key): ) ) try: - return self.engine.execute(sel).fetchone()[0] + with self.engine.begin() as connection: + row = connection.execute(sel).fetchone()[0] + return row except (IndexError, SQLAlchemyError) as e: logger.debug(f"Error fetching value: {str(e)}") raise gx_exceptions.StoreError(f"Unable to fetch value for key: {str(key)}") @@ -327,7 +334,8 @@ def _has_key(self, key): ) ) try: - return self.engine.execute(sel).fetchone()[0] == 1 + with self.engine.begin() as connection: + return connection.execute(sel).fetchone()[0] == 1 except (IndexError, SQLAlchemyError) as e: logger.debug(f"Error checking for value: {str(e)}") return False @@ -347,7 +355,9 @@ def list_keys(self, prefix=()): ) ) ) - return [tuple(row) for row in self.engine.execute(sel).fetchall()] + with self.engine.begin() as connection: + row_list: list[sqlalchemy_Row] = connection.execute(sel).fetchall() + return [tuple(row) for row in row_list] def remove_key(self, key): delete_statement = self._table.delete().where( @@ -359,7 +369,8 @@ def remove_key(self, key): ) ) try: - return self.engine.execute(delete_statement) + with self.engine.begin() as connection: + return connection.execute(delete_statement) except SQLAlchemyError as e: raise gx_exceptions.StoreBackendError( f"Unable to delete key: got sqlalchemy error {str(e)}" diff --git a/great_expectations/data_context/store/query_store.py b/great_expectations/data_context/store/query_store.py index d9e25d177b3c..6c9a22a06503 100644 --- a/great_expectations/data_context/store/query_store.py +++ b/great_expectations/data_context/store/query_store.py @@ -123,13 +123,14 @@ def get_query_result(self, key, query_parameters=None): assert query, "Query must be specified to use SqlAlchemyQueryStore" query = Template(query).safe_substitute(query_parameters) - res = self.engine.execute(sa.text(query)).fetchall() - # NOTE: 20200617 - JPC: this approach is probably overly opinionated, but we can - # adjust based on specific user requests - res = [val for row in res for val in row] - if return_type == "scalar": - [res] = res - return res + with self.engine.begin() as connection: + res = connection.execute(sa.text(query)).fetchall() + # NOTE: 20200617 - JPC: this approach is probably overly opinionated, but we can + # adjust based on specific user requests + res = [val for row in res for val in row] + if return_type == "scalar": + [res] = res + return res @property def config(self) -> dict: diff --git a/great_expectations/execution_engine/sqlalchemy_execution_engine.py b/great_expectations/execution_engine/sqlalchemy_execution_engine.py index 60cd5be41e22..7494988b1ddf 100644 --- a/great_expectations/execution_engine/sqlalchemy_execution_engine.py +++ b/great_expectations/execution_engine/sqlalchemy_execution_engine.py @@ -44,7 +44,10 @@ from great_expectations.execution_engine.split_and_sample.sqlalchemy_data_splitter import ( SqlAlchemyDataSplitter, ) -from great_expectations.optional_imports import sqlalchemy_version_check +from great_expectations.optional_imports import ( + sqlalchemy_Engine, + sqlalchemy_version_check, +) from great_expectations.validator.computed_metric import MetricValue # noqa: TCH001 del get_versions # isort:skip @@ -98,7 +101,7 @@ sa = None try: - from sqlalchemy.engine import Dialect, Engine, Row # noqa: TID251 + from sqlalchemy.engine import Dialect, Row # noqa: TID251 from sqlalchemy.exc import OperationalError # noqa: TID251 from sqlalchemy.sql import Selectable # noqa: TID251 from sqlalchemy.sql.elements import ( # noqa: 
TID251 @@ -109,7 +112,6 @@ ) from sqlalchemy.sql.selectable import Select, TextualSelect # noqa: TID251 except ImportError: - Engine = None BooleanClauseList = None DefaultDialect = None Dialect = None @@ -448,7 +450,7 @@ def _on_connect(dbapi_con, connection_record): self._engine_backup = self.engine # sqlite/mssql temp tables only persist within a connection so override the engine # but only do this if self.engine is an Engine and isn't a Connection - if isinstance(self.engine, Engine): + if sqlalchemy_Engine and isinstance(self.engine, sqlalchemy_Engine): self.engine = self.engine.connect() # Send a connect event to provide dialect type @@ -1058,6 +1060,9 @@ def resolve_metric_bundle( ) logger.debug(f"Attempting query {str(sa_query_object)}") + + if sqlalchemy_Engine and isinstance(self.engine, sqlalchemy_Engine): + self.engine = self.engine.connect() res = self.engine.execute(sa_query_object).fetchall() logger.debug( @@ -1144,7 +1149,13 @@ def execute_split_query(self, split_query: Selectable) -> List[Row]: pattern = re.compile(r"(CAST\(EXTRACT\(.*?\))( AS STRING\))", re.IGNORECASE) split_query = re.sub(pattern, r"\1 AS VARCHAR)", split_query) - return self.engine.execute(split_query).fetchall() + if sqlalchemy_Engine and isinstance(self.engine, sqlalchemy_Engine): + connection = self.engine.connect() + else: + connection = self.engine + + query_result: List[Row] = connection.execute(split_query).fetchall() + return query_result def get_data_for_batch_identifiers( self, selectable: Selectable, splitter_method_name: str, splitter_kwargs: dict diff --git a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py index fe32406bcffe..601343c0b43b 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py @@ -37,6 +37,7 @@ verify_column_names_exist, ) from great_expectations.optional_imports import sqlalchemy as sa +from great_expectations.optional_imports import sqlalchemy_Engine from great_expectations.util import ( generate_temporary_table_name, get_sqlalchemy_selectable, @@ -361,8 +362,12 @@ def _sqlalchemy_map_condition_unexpected_count_value( .select_from(count_selectable) .alias("UnexpectedCountSubquery") ) - - unexpected_count: Union[float, int] = execution_engine.engine.execute( + if sqlalchemy_Engine and isinstance(execution_engine.engine, sqlalchemy_Engine): + connection = execution_engine.engine.connect() + else: + # execution_engine.engine is already a Connection. 
Use it directly + connection = execution_engine.engine + unexpected_count: Union[float, int] = connection.execute( sa.select( unexpected_count_query.c[ f"{SummarizationMetricNameSuffixes.UNEXPECTED_COUNT.value}" diff --git a/great_expectations/expectations/metrics/util.py b/great_expectations/expectations/metrics/util.py index 2d28460655b2..8c614afb7549 100644 --- a/great_expectations/expectations/metrics/util.py +++ b/great_expectations/expectations/metrics/util.py @@ -20,6 +20,7 @@ ) from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect from great_expectations.execution_engine.util import check_sql_engine_dialect +from great_expectations.optional_imports import sqlalchemy_Engine from great_expectations.util import get_sqlalchemy_inspector try: @@ -405,214 +406,227 @@ def column_reflection_fallback( selectable: Select, dialect: Dialect, sqlalchemy_engine: Engine ) -> List[Dict[str, str]]: """If we can't reflect the table, use a query to at least get column names.""" - col_info_dict_list: List[Dict[str, str]] - # noinspection PyUnresolvedReferences - if dialect.name.lower() == "mssql": - # Get column names and types from the database - # Reference: https://dataedo.com/kb/query/sql-server/list-table-columns-in-database - tables_table_clause: TableClause = sa.table( - "tables", - sa.column("object_id"), - sa.column("schema_id"), - sa.column("name"), - schema="sys", - ).alias("sys_tables_table_clause") - tables_table_query: Select = ( - sa.select( - tables_table_clause.c.object_id.label("object_id"), - sa.func.schema_name(tables_table_clause.c.schema_id).label( - "schema_name" - ), - tables_table_clause.c.name.label("table_name"), - ) - .select_from(tables_table_clause) - .alias("sys_tables_table_subquery") - ) - columns_table_clause: TableClause = sa.table( - "columns", - sa.column("object_id"), - sa.column("user_type_id"), - sa.column("column_id"), - sa.column("name"), - sa.column("max_length"), - sa.column("precision"), - schema="sys", - ).alias("sys_columns_table_clause") - columns_table_query: Select = ( - sa.select( - columns_table_clause.c.object_id.label("object_id"), - columns_table_clause.c.user_type_id.label("user_type_id"), - columns_table_clause.c.column_id.label("column_id"), - columns_table_clause.c.name.label("column_name"), - columns_table_clause.c.max_length.label("column_max_length"), - columns_table_clause.c.precision.label("column_precision"), + + if isinstance(sqlalchemy_engine.engine, sqlalchemy_Engine): + connection = sqlalchemy_engine.engine.connect() + else: + connection = sqlalchemy_engine.engine + + # with sqlalchemy_engine.begin() as connection: + with connection: + + col_info_dict_list: List[Dict[str, str]] + # noinspection PyUnresolvedReferences + if dialect.name.lower() == "mssql": + # Get column names and types from the database + # Reference: https://dataedo.com/kb/query/sql-server/list-table-columns-in-database + tables_table_clause: TableClause = sa.table( + "tables", + sa.column("object_id"), + sa.column("schema_id"), + sa.column("name"), + schema="sys", + ).alias("sys_tables_table_clause") + tables_table_query: Select = ( + sa.select( + tables_table_clause.c.object_id.label("object_id"), + sa.func.schema_name(tables_table_clause.c.schema_id).label( + "schema_name" + ), + tables_table_clause.c.name.label("table_name"), + ) + .select_from(tables_table_clause) + .alias("sys_tables_table_subquery") ) - .select_from(columns_table_clause) - .alias("sys_columns_table_subquery") - ) - types_table_clause: TableClause = sa.table( - 
"types", - sa.column("user_type_id"), - sa.column("name"), - schema="sys", - ).alias("sys_types_table_clause") - types_table_query: Select = ( - sa.select( - types_table_clause.c.user_type_id.label("user_type_id"), - types_table_clause.c.name.label("column_data_type"), + columns_table_clause: TableClause = sa.table( + "columns", + sa.column("object_id"), + sa.column("user_type_id"), + sa.column("column_id"), + sa.column("name"), + sa.column("max_length"), + sa.column("precision"), + schema="sys", + ).alias("sys_columns_table_clause") + columns_table_query: Select = ( + sa.select( + columns_table_clause.c.object_id.label("object_id"), + columns_table_clause.c.user_type_id.label("user_type_id"), + columns_table_clause.c.column_id.label("column_id"), + columns_table_clause.c.name.label("column_name"), + columns_table_clause.c.max_length.label("column_max_length"), + columns_table_clause.c.precision.label("column_precision"), + ) + .select_from(columns_table_clause) + .alias("sys_columns_table_subquery") ) - .select_from(types_table_clause) - .alias("sys_types_table_subquery") - ) - inner_join_conditions: BinaryExpression = sa.and_( - *(tables_table_query.c.object_id == columns_table_query.c.object_id,) - ) - outer_join_conditions: BinaryExpression = sa.and_( - *( - columns_table_query.columns.user_type_id - == types_table_query.columns.user_type_id, + types_table_clause: TableClause = sa.table( + "types", + sa.column("user_type_id"), + sa.column("name"), + schema="sys", + ).alias("sys_types_table_clause") + types_table_query: Select = ( + sa.select( + types_table_clause.c.user_type_id.label("user_type_id"), + types_table_clause.c.name.label("column_data_type"), + ) + .select_from(types_table_clause) + .alias("sys_types_table_subquery") ) - ) - col_info_query = ( - sa.select( - tables_table_query.c.schema_name, - tables_table_query.c.table_name, - columns_table_query.c.column_id, - columns_table_query.c.column_name, - types_table_query.c.column_data_type, - columns_table_query.c.column_max_length, - columns_table_query.c.column_precision, + inner_join_conditions: BinaryExpression = sa.and_( + *(tables_table_query.c.object_id == columns_table_query.c.object_id,) ) - .select_from( - tables_table_query.join( - right=columns_table_query, - onclause=inner_join_conditions, - isouter=False, - ).join( - right=types_table_query, - onclause=outer_join_conditions, - isouter=True, + outer_join_conditions: BinaryExpression = sa.and_( + *( + columns_table_query.columns.user_type_id + == types_table_query.columns.user_type_id, ) ) - .where(tables_table_query.c.table_name == selectable.name) - .order_by( - tables_table_query.c.schema_name.asc(), - tables_table_query.c.table_name.asc(), - columns_table_query.c.column_id.asc(), + col_info_query = ( + sa.select( + tables_table_query.c.schema_name, + tables_table_query.c.table_name, + columns_table_query.c.column_id, + columns_table_query.c.column_name, + types_table_query.c.column_data_type, + columns_table_query.c.column_max_length, + columns_table_query.c.column_precision, + ) + .select_from( + tables_table_query.join( + right=columns_table_query, + onclause=inner_join_conditions, + isouter=False, + ).join( + right=types_table_query, + onclause=outer_join_conditions, + isouter=True, + ) + ) + .where(tables_table_query.c.table_name == selectable.name) + .order_by( + tables_table_query.c.schema_name.asc(), + tables_table_query.c.table_name.asc(), + columns_table_query.c.column_id.asc(), + ) ) - ) - col_info_tuples_list: List[tuple] = sqlalchemy_engine.execute( - 
col_info_query - ).fetchall() - # type_module = _get_dialect_type_module(dialect=dialect) - col_info_dict_list = [ - { - "name": column_name, - # "type": getattr(type_module, column_data_type.upper())(), - "type": column_data_type.upper(), - } - for schema_name, table_name, column_id, column_name, column_data_type, column_max_length, column_precision in col_info_tuples_list - ] - elif dialect.name.lower() == "trino": - try: - table_name = selectable.name - except AttributeError: - table_name = selectable - if str(table_name).lower().startswith("select"): - rx = re.compile(r"^.* from ([\S]+)", re.I) - match = rx.match(str(table_name).replace("\n", "")) - if match: - table_name = match.group(1) - schema_name = sqlalchemy_engine.dialect.default_schema_name - - tables_table: sa.Table = sa.Table( - "tables", - sa.MetaData(), - schema="information_schema", - ) - tables_table_query = ( - sa.select( - sa.column("table_schema").label("schema_name"), - sa.column("table_name").label("table_name"), + col_info_tuples_list: List[tuple] = connection.execute( + col_info_query + ).fetchall() + # type_module = _get_dialect_type_module(dialect=dialect) + col_info_dict_list = [ + { + "name": column_name, + # "type": getattr(type_module, column_data_type.upper())(), + "type": column_data_type.upper(), + } + for schema_name, table_name, column_id, column_name, column_data_type, column_max_length, column_precision in col_info_tuples_list + ] + elif dialect.name.lower() == "trino": + try: + table_name = selectable.name + except AttributeError: + table_name = selectable + if str(table_name).lower().startswith("select"): + rx = re.compile(r"^.* from ([\S]+)", re.I) + match = rx.match(str(table_name).replace("\n", "")) + if match: + table_name = match.group(1) + schema_name = sqlalchemy_engine.dialect.default_schema_name + + tables_table: sa.Table = sa.Table( + "tables", + sa.MetaData(), + schema="information_schema", ) - .select_from(tables_table) - .alias("information_schema_tables_table") - ) - columns_table: sa.Table = sa.Table( - "columns", - sa.MetaData(), - schema="information_schema", - ) - columns_table_query = ( - sa.select( - sa.column("column_name").label("column_name"), - sa.column("table_name").label("table_name"), - sa.column("table_schema").label("schema_name"), - sa.column("data_type").label("column_data_type"), + tables_table_query = ( + sa.select( + sa.column("table_schema").label("schema_name"), + sa.column("table_name").label("table_name"), + ) + .select_from(tables_table) + .alias("information_schema_tables_table") ) - .select_from(columns_table) - .alias("information_schema_columns_table") - ) - conditions = sa.and_( - *( - tables_table_query.c.table_name == columns_table_query.c.table_name, - tables_table_query.c.schema_name == columns_table_query.c.schema_name, + columns_table: sa.Table = sa.Table( + "columns", + sa.MetaData(), + schema="information_schema", ) - ) - col_info_query = ( - sa.select( - tables_table_query.c.schema_name, - tables_table_query.c.table_name, - columns_table_query.c.column_name, - columns_table_query.c.column_data_type, + columns_table_query = ( + sa.select( + sa.column("column_name").label("column_name"), + sa.column("table_name").label("table_name"), + sa.column("table_schema").label("schema_name"), + sa.column("data_type").label("column_data_type"), + ) + .select_from(columns_table) + .alias("information_schema_columns_table") ) - .select_from( - tables_table_query.join( - right=columns_table_query, onclause=conditions, isouter=False + conditions = sa.and_( + *( + 
tables_table_query.c.table_name == columns_table_query.c.table_name, + tables_table_query.c.schema_name + == columns_table_query.c.schema_name, ) ) - .where( - sa.and_( - *( - tables_table_query.c.table_name == table_name, - tables_table_query.c.schema_name == schema_name, + col_info_query = ( + sa.select( + tables_table_query.c.schema_name, + tables_table_query.c.table_name, + columns_table_query.c.column_name, + columns_table_query.c.column_data_type, + ) + .select_from( + tables_table_query.join( + right=columns_table_query, onclause=conditions, isouter=False ) ) + .where( + sa.and_( + *( + tables_table_query.c.table_name == table_name, + tables_table_query.c.schema_name == schema_name, + ) + ) + ) + .order_by( + tables_table_query.c.schema_name.asc(), + tables_table_query.c.table_name.asc(), + columns_table_query.c.column_name.asc(), + ) + .alias("column_info") ) - .order_by( - tables_table_query.c.schema_name.asc(), - tables_table_query.c.table_name.asc(), - columns_table_query.c.column_name.asc(), - ) - .alias("column_info") - ) - col_info_tuples_list = sqlalchemy_engine.execute(col_info_query).fetchall() - # type_module = _get_dialect_type_module(dialect=dialect) - col_info_dict_list = [ - { - "name": column_name, - "type": column_data_type.upper(), - } - for schema_name, table_name, column_name, column_data_type in col_info_tuples_list - ] - else: - # if a custom query was passed - if isinstance(selectable, TextClause): - query: TextClause = selectable + col_info_tuples_list = connection.execute(col_info_query).fetchall() + # type_module = _get_dialect_type_module(dialect=dialect) + col_info_dict_list = [ + { + "name": column_name, + "type": column_data_type.upper(), + } + for schema_name, table_name, column_name, column_data_type in col_info_tuples_list + ] else: - # noinspection PyUnresolvedReferences - if dialect.name.lower() == GXSqlDialect.REDSHIFT: - # Redshift needs temp tables to be declared as text - query = ( - sa.select(sa.text("*")).select_from(sa.text(selectable)).limit(1) - ) + # if a custom query was passed + if isinstance(selectable, TextClause): + query: TextClause = selectable else: - query = sa.select(sa.text("*")).select_from(selectable).limit(1) - result_object = sqlalchemy_engine.execute(query) - # noinspection PyProtectedMember - col_names: List[str] = result_object._metadata.keys - col_info_dict_list = [{"name": col_name} for col_name in col_names] - return col_info_dict_list + # noinspection PyUnresolvedReferences + if dialect.name.lower() == GXSqlDialect.REDSHIFT: + # Redshift needs temp tables to be declared as text + query = ( + sa.select(sa.text("*")) + .select_from(sa.text(selectable)) + .limit(1) + ) + else: + query = sa.select(sa.text("*")).select_from(selectable).limit(1) + + result_object = connection.execute(query) + # noinspection PyProtectedMember + col_names: List[str] = result_object._metadata.keys + col_info_dict_list = [{"name": col_name} for col_name in col_names] + return col_info_dict_list @overload diff --git a/great_expectations/optional_imports.py b/great_expectations/optional_imports.py index e6755144ca93..0bca9f436acb 100644 --- a/great_expectations/optional_imports.py +++ b/great_expectations/optional_imports.py @@ -102,6 +102,25 @@ def is_version_less_than( except ImportError: sqlalchemy = SQLALCHEMY_NOT_IMPORTED +try: + sqlalchemy_Connection = sqlalchemy.engine.Connection +except (ImportError, AttributeError): + sqlalchemy_Connection = SQLALCHEMY_NOT_IMPORTED + +try: + sqlalchemy_Engine = sqlalchemy.engine.Engine +except 
(ImportError, AttributeError): + sqlalchemy_Engine = SQLALCHEMY_NOT_IMPORTED + +try: + sqlalchemy_Row = sqlalchemy.engine.Row +except (ImportError, AttributeError): + sqlalchemy_Row = SQLALCHEMY_NOT_IMPORTED + +try: + sqlalchemy_TextClause = sqlalchemy.sql.elements.TextClause +except (ImportError, AttributeError): + sqlalchemy_TextClause = SQLALCHEMY_NOT_IMPORTED SPARK_NOT_IMPORTED = NotImported( "pyspark is not installed, please 'pip install pyspark'" diff --git a/pyproject.toml b/pyproject.toml index 4b0824e98b8a..5fc10cfac6d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -460,10 +460,6 @@ filterwarnings = [ # Example Actual Warning: (tests/datasource/fluent/integration/conftest.py) # sqlalchemy.exc.SADeprecationWarning: Table.tometadata() is renamed to Table.to_metadata() (deprecated since: 1.4) 'ignore: Table.tometadata\(\) is renamed to Table.to_metadata\(\):DeprecationWarning', - - # SQLAlchemy 2.x support warnings. These warnings should be ignored until sqlalchemy 2.x is fully supported. - # To get SQLAlchemy 2.x supported, remove one of these ignores and then fix the resulting errors. - 'ignore: The Engine.execute\(\) method is considered legacy as of the 1.x series of SQLAlchemy and will be removed in 2.0. All statement execution in SQLAlchemy 2.0 is performed by the Connection.execute\(\) method of Connection, or in the ORM by the Session.execute\(\) method of Session.:DeprecationWarning', # --------------------------------------- TEMPORARY IGNORES -------------------------------------------------------- ] junit_family="xunit2" diff --git a/tests/conftest.py b/tests/conftest.py index 19cd3d69f70c..827ec83b44a8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1705,10 +1705,11 @@ def titanic_sqlite_db(sa): titanic_db_path = file_relative_path(__file__, "./test_sets/titanic.db") engine = create_engine(f"sqlite:///{titanic_db_path}") - assert engine.execute(sa.text("select count(*) from titanic")).fetchall()[ - 0 - ] == (1313,) - return engine + with engine.begin() as connection: + assert connection.execute( + sa.text("select count(*) from titanic") + ).fetchall()[0] == (1313,) + return engine except ImportError: raise ValueError("sqlite tests require sqlalchemy to be installed") @@ -1721,9 +1722,10 @@ def titanic_sqlite_db_connection_string(sa): titanic_db_path = file_relative_path(__file__, "./test_sets/titanic.db") engine = create_engine(f"sqlite:////{titanic_db_path}") - assert engine.execute(sa.text("select count(*) from titanic")).fetchall()[ - 0 - ] == (1313,) + with engine.begin() as connection: + assert connection.execute( + sa.text("select count(*) from titanic") + ).fetchall()[0] == (1313,) return f"sqlite:///{titanic_db_path}" except ImportError: raise ValueError("sqlite tests require sqlalchemy to be installed") @@ -1761,7 +1763,7 @@ def empty_sqlite_db(sa): from sqlalchemy import create_engine engine = create_engine("sqlite://") - with engine.connect() as connection: + with engine.begin() as connection: assert connection.execute(sa.text("select 1")).fetchall()[0] == (1,) return engine except ImportError: @@ -2282,18 +2284,17 @@ def sqlite_view_engine(test_backends): con=sqlite_engine, index=True, ) - with sqlite_engine.connect() as connection: - with connection.begin(): - connection.execute( - sa.text( - "CREATE TEMP VIEW test_temp_view AS SELECT * FROM test_table where a < 4;" - ) + with sqlite_engine.begin() as connection: + connection.execute( + sa.text( + "CREATE TEMP VIEW test_temp_view AS SELECT * FROM test_table where a < 4;" ) - 
connection.execute( - sa.text( - "CREATE VIEW test_view AS SELECT * FROM test_table where a > 4;" - ) + ) + connection.execute( + sa.text( + "CREATE VIEW test_view AS SELECT * FROM test_table where a > 4;" ) + ) return sqlite_engine except ImportError: sa = None diff --git a/tests/data_context/test_data_context_test_yaml_config.py b/tests/data_context/test_data_context_test_yaml_config.py index b4686747202d..05d295cc529c 100644 --- a/tests/data_context/test_data_context_test_yaml_config.py +++ b/tests/data_context/test_data_context_test_yaml_config.py @@ -43,31 +43,35 @@ def test_connectable_postgresql_db(sa, test_backends, test_df): database="test_ci", ) engine = sa.create_engine(url) - - schema_check_results = engine.execute( - sa.text( - "SELECT schema_name FROM information_schema.schemata WHERE schema_name = 'connection_test';" - ) - ).fetchall() + with engine.begin() as connection: + schema_check_results = connection.execute( + sa.text( + "SELECT schema_name FROM information_schema.schemata WHERE schema_name = 'connection_test';" + ) + ).fetchall() if len(schema_check_results) == 0: with engine.begin() as connection: connection.execute(sa.text("CREATE SCHEMA connection_test;")) - table_check_results = engine.execute( - sa.text( - """ + table_check_results = connection.execute( + sa.text( + """ SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_schema = 'connection_test' AND table_name = 'test_df' ); """ - ) - ).fetchall() - if table_check_results != [(True,)]: - add_dataframe_to_db( - df=test_df, name="test_df", con=engine, index=True, schema="connection_test" - ) + ) + ).fetchall() + if table_check_results != [(True,)]: + add_dataframe_to_db( + df=test_df, + name="test_df", + con=engine, + index=True, + schema="connection_test", + ) # Return a connection string to this newly-created db return engine diff --git a/tests/data_context/test_data_context_v013.py b/tests/data_context/test_data_context_v013.py index 43f7d106619b..e1388e04c666 100644 --- a/tests/data_context/test_data_context_v013.py +++ b/tests/data_context/test_data_context_v013.py @@ -592,7 +592,7 @@ def test_get_batch_with_query_in_runtime_parameters_using_runtime_data_connector assert sa_engine.execute(sa.text(selectable_count_sql_str)).scalar() == 123 assert batch.batch_markers.get("ge_load_time") is not None # since create_temp_table defaults to True, there should be 1 temp table - assert len(get_sqlite_temp_table_names(batch.data.execution_engine.engine)) == 1 + assert len(get_sqlite_temp_table_names(batch.data.execution_engine)) == 1 # if create_temp_table in batch_spec_passthrough is set to False, no new temp tables should be created batch = context.get_batch( @@ -610,7 +610,7 @@ def test_get_batch_with_query_in_runtime_parameters_using_runtime_data_connector batch_spec_passthrough={"create_temp_table": False}, ), ) - assert len(get_sqlite_temp_table_names(batch.data.execution_engine.engine)) == 1 + assert len(get_sqlite_temp_table_names(batch.data.execution_engine)) == 1 def test_get_validator_with_query_in_runtime_parameters_using_runtime_data_connector( diff --git a/tests/execution_engine/test_sqlalchemy_execution_engine.py b/tests/execution_engine/test_sqlalchemy_execution_engine.py index ba170cfc58c2..c1f666b1e9b6 100644 --- a/tests/execution_engine/test_sqlalchemy_execution_engine.py +++ b/tests/execution_engine/test_sqlalchemy_execution_engine.py @@ -941,21 +941,21 @@ def test_get_batch_data_and_markers_using_query(sqlite_view_engine, test_df): def 
test_sa_batch_unexpected_condition_temp_table(caplog, sa): - def validate_tmp_tables(): + def validate_tmp_tables(execution_engine): temp_tables = [ name - for name in get_sqlite_temp_table_names(engine.engine) + for name in get_sqlite_temp_table_names(execution_engine) if name.startswith("ge_temp_") ] tables = [ name - for name in get_sqlite_table_names(engine.engine) + for name in get_sqlite_table_names(execution_engine) if name.startswith("ge_temp_") ] assert len(temp_tables) == 0 assert len(tables) == 0 - engine = build_sa_engine( + execution_engine = build_sa_engine( pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa ) @@ -964,10 +964,10 @@ def validate_tmp_tables(): table_columns_metric: MetricConfiguration results: Dict[Tuple[str, str, str], MetricValue] - table_columns_metric, results = get_table_columns_metric(engine=engine) + table_columns_metric, results = get_table_columns_metric(engine=execution_engine) metrics.update(results) - validate_tmp_tables() + validate_tmp_tables(execution_engine=execution_engine) condition_metric = MetricConfiguration( metric_name=f"column_values.unique.{MetricPartialFunctionTypeSuffixes.CONDITION.value}", @@ -977,12 +977,12 @@ def validate_tmp_tables(): condition_metric.metric_dependencies = { "table.columns": table_columns_metric, } - results = engine.resolve_metrics( + results = execution_engine.resolve_metrics( metrics_to_resolve=(condition_metric,), metrics=metrics ) metrics.update(results) - validate_tmp_tables() + validate_tmp_tables(execution_engine=execution_engine) desired_metric = MetricConfiguration( metric_name=f"column_values.unique.{SummarizationMetricNameSuffixes.UNEXPECTED_COUNT.value}", @@ -993,8 +993,8 @@ def validate_tmp_tables(): "unexpected_condition": condition_metric, } # noinspection PyUnusedLocal - results = engine.resolve_metrics( + results = execution_engine.resolve_metrics( metrics_to_resolve=(desired_metric,), metrics=metrics ) - validate_tmp_tables() + validate_tmp_tables(execution_engine=execution_engine) diff --git a/tests/expectations/test_null_filters.py b/tests/expectations/test_null_filters.py index 79b073370e9d..734c1b81d449 100644 --- a/tests/expectations/test_null_filters.py +++ b/tests/expectations/test_null_filters.py @@ -66,5 +66,7 @@ def test_sa_null_filters(sa): # Demonstrate that spark's max aggregate function can tolerate null values df = pd.DataFrame({"a": [1, 2, 3, None, None, 4]}) add_dataframe_to_db(df=df, name="test", con=eng, index=False) - - assert eng.execute(sa.text(f"SELECT MAX(a) FROM test;")).fetchone()[0] == 4 + with eng.begin() as connection: + assert ( + connection.execute(sa.text(f"SELECT MAX(a) FROM test;")).fetchone()[0] == 4 + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 31e00872ee96..8323890918a8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -33,6 +33,7 @@ logger = logging.getLogger(__name__) +from great_expectations.optional_imports import sqlalchemy_Connection try: import sqlalchemy as sa @@ -153,33 +154,34 @@ def validate_uuid4(uuid_string: str) -> bool: return val.hex == uuid_string.replace("-", "") -def get_sqlite_temp_table_names(engine): - result = engine.execute( - sa.text( - """ -SELECT - name -FROM - sqlite_temp_master -""" - ) - ) +def get_sqlite_temp_table_names(execution_engine): + + statement = sa.text("SELECT name FROM sqlite_temp_master") + + if isinstance(execution_engine.engine, sqlalchemy_Connection): + connection = execution_engine.engine + result = connection.execute(statement) + else: + with 
execution_engine.engine.connect() as connection: + result = connection.execute(statement) + rows = result.fetchall() return {row[0] for row in rows} -def get_sqlite_table_names(engine): - result = engine.execute( - sa.text( - """ -SELECT - name -FROM - sqlite_master -""" - ) - ) +def get_sqlite_table_names(execution_engine): + + statement = sa.text("SELECT name FROM sqlite_master") + + if isinstance(execution_engine.engine, sqlalchemy_Connection): + connection = execution_engine.engine + result = connection.execute(statement) + else: + with execution_engine.engine.connect() as connection: + result = connection.execute(statement) + rows = result.fetchall() + return {row[0] for row in rows} From c3187e47ce10d3a78bdf1e2d54afb4fee90b203c Mon Sep 17 00:00:00 2001 From: kenwade4 <95714847+kenwade4@users.noreply.github.com> Date: Wed, 12 Apr 2023 08:43:18 -0500 Subject: [PATCH 80/96] [BUGFIX] Misc gallery bugfixes (#7611) --- .../expect_column_discrete_entropy_to_be_between.py | 2 +- ...pect_column_distribution_to_match_benfords_law.py | 2 +- .../expect_column_kurtosis_to_be_between.py | 2 +- .../expectations/expect_column_skew_to_be_between.py | 4 +--- ...xpect_column_values_to_be_normally_distributed.py | 2 +- ...ct_column_wasserstein_distance_to_be_less_than.py | 2 +- ...pect_multicolumn_datetime_difference_in_months.py | 5 ++--- .../expect_queried_column_list_to_be_unique.py | 1 - ...ect_queried_column_to_be_unique_with_condition.py | 1 - ...olumn_to_have_n_distinct_values_with_condition.py | 1 - ...eried_column_value_frequency_to_meet_threshold.py | 1 - ..._column_values_to_exist_in_second_table_column.py | 3 --- ...expect_queried_custom_query_to_return_num_rows.py | 1 - ..._queried_slowly_changing_table_to_have_no_gaps.py | 1 - .../expect_queried_table_row_count_to_be.py | 1 - .../expect_table_checksum_to_equal_other_table.py | 3 --- .../expectations/__init__.py | 7 +++++++ .../expect_column_values_to_be_valid_india_zip.py | 0 .../expectations/__init__.py | 9 +++++++++ ...ct_batch_row_count_to_match_prophet_date_model.py | 7 ++++--- .../expect_column_max_to_match_prophet_date_model.py | 8 ++++++++ ...column_pair_values_to_match_prophet_date_model.py | 7 ++++--- .../core/usage_statistics/package_dependencies.py | 2 ++ great_expectations/core/util.py | 3 +++ great_expectations/expectations/expectation.py | 12 +++++++++++- reqs/requirements-dev-all-contrib-expectations.txt | 2 ++ .../expect_column_max_to_be_between_custom.py | 1 - ...xpect_column_values_to_be_in_solfege_scale_set.py | 1 - .../expect_column_values_to_only_contain_vowels.py | 1 - ...eried_column_value_frequency_to_meet_threshold.py | 1 - .../expect_queried_table_row_count_to_be.py | 1 - .../expect_table_columns_to_be_unique.py | 2 -- .../expect_table_row_count_to_equal_other_table.json | 1 + 33 files changed, 59 insertions(+), 38 deletions(-) rename contrib/great_expectations_zipcode_expectations/{ => great_expectations_zipcode_expectations/expectations}/expect_column_values_to_be_valid_india_zip.py (100%) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py index 91b5b7185ca1..3281ec7b7096 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py @@ -19,7 
+19,7 @@ ColumnExpectation, render_evaluation_parameter_string, ) -from great_expectations.expectations.metrics.column_aggregate_metric import ( +from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, ) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py index 3d851a4ef6b6..fee4cacd8950 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py @@ -7,7 +7,7 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.expectation import ColumnExpectation -from great_expectations.expectations.metrics.column_aggregate_metric import ( +from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, ) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py index 780db599dc39..95f1559e1e8d 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py @@ -10,7 +10,7 @@ ) from great_expectations.expectations.expectation import ColumnExpectation from great_expectations.expectations.metrics import column_aggregate_partial -from great_expectations.expectations.metrics.column_aggregate_metric import ( +from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, ) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py index 861887ce63a4..32c86f335011 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py @@ -16,10 +16,8 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.expectation import ColumnExpectation -from great_expectations.expectations.metrics.column_aggregate_metric import ( - ColumnAggregateMetricProvider, -) from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( + ColumnAggregateMetricProvider, column_aggregate_partial, column_aggregate_value, ) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py index abd53a54d77a..67bb21963ec8 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py @@ -5,7 +5,7 @@ from great_expectations.core import ExpectationConfiguration 
from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.expectation import ColumnExpectation -from great_expectations.expectations.metrics.column_aggregate_metric import ( +from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, ) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py index c6b88a85ffb7..7db52ad6f2b2 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py @@ -5,7 +5,7 @@ from great_expectations.core import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.expectation import ColumnExpectation -from great_expectations.expectations.metrics.column_aggregate_metric import ( +from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, ) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py index 2351d81fa8cc..f46337385365 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_datetime_difference_in_months.py @@ -93,7 +93,7 @@ def date_diff_in_months(row): # This class defines the Expectation itself -class ExpectColumnDatetimeDifferenceInMonths(MulticolumnMapExpectation): +class ExpectMulticolumnDatetimeDifferenceInMonths(MulticolumnMapExpectation): """Expect the difference of 2 datetime columns is equal to another column in month. 
@@ -109,7 +109,6 @@ class ExpectColumnDatetimeDifferenceInMonths(MulticolumnMapExpectation): examples = [ { - "dataset_name": "test", "data": { "start_datetime": [ "2022-03-22 10:00:00", @@ -229,4 +228,4 @@ def validate_configuration( if __name__ == "__main__": - ExpectColumnDatetimeDifferenceInMonths().print_diagnostic_checklist() + ExpectMulticolumnDatetimeDifferenceInMonths().print_diagnostic_checklist() diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_list_to_be_unique.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_list_to_be_unique.py index c89b0488ad89..762c0918cf82 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_list_to_be_unique.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_list_to_be_unique.py @@ -87,7 +87,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "unique_num": [1, 2, 3, 4, 5, 6], "unique_str": ["a", "b", "c", "d", "e", "f"], diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_be_unique_with_condition.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_be_unique_with_condition.py index a8149463bd32..ffa8fcdc8c70 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_be_unique_with_condition.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_be_unique_with_condition.py @@ -89,7 +89,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "uuid": [1, 2, 2, 3, 4, 4], "is_open": [True, False, True, True, True, True], diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_have_n_distinct_values_with_condition.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_have_n_distinct_values_with_condition.py index 6b6590fecbc8..6238dc26392f 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_have_n_distinct_values_with_condition.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_to_have_n_distinct_values_with_condition.py @@ -108,7 +108,6 @@ def validate_template_dict(self, configuration): { "data": [ { - "dataset_name": "test", "data": { "uuid": [1, 2, 2, 3, 4, 4], "is_open": [True, False, True, True, True, True], diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_value_frequency_to_meet_threshold.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_value_frequency_to_meet_threshold.py index 7c44af6949e6..a62cfd29a533 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_value_frequency_to_meet_threshold.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_value_frequency_to_meet_threshold.py @@ -109,7 +109,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "col1": [1, 2, 2, 3, 4], "col2": ["a", "a", "b", "b", "a"], diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_values_to_exist_in_second_table_column.py 
b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_values_to_exist_in_second_table_column.py index d203cffe265b..4d95b07774f3 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_values_to_exist_in_second_table_column.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_column_values_to_exist_in_second_table_column.py @@ -76,19 +76,16 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "msid": ["aaa", "bbb"], }, }, { - "dataset_name": "test_2", "data": { "msid": ["aaa", "aaa"], }, }, { - "dataset_name": "test_3", "data": { "msid": [ "aaa", diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_custom_query_to_return_num_rows.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_custom_query_to_return_num_rows.py index 547e1fb1d856..730ea33a13f9 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_custom_query_to_return_num_rows.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_custom_query_to_return_num_rows.py @@ -63,7 +63,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": {"col1": [1, 2, 3, 4, 5, 5], "col2": [10, 3, 4, 4, 5, 5]}, } ], diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_slowly_changing_table_to_have_no_gaps.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_slowly_changing_table_to_have_no_gaps.py index d64695ca16a2..68cf506331bc 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_slowly_changing_table_to_have_no_gaps.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_slowly_changing_table_to_have_no_gaps.py @@ -104,7 +104,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "msid": [ "aaa", diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_table_row_count_to_be.py b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_table_row_count_to_be.py index 872d121b51b2..d82a92e821a4 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_queried_table_row_count_to_be.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_queried_table_row_count_to_be.py @@ -80,7 +80,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "col1": [1, 2, 2, 3, 4], "col2": ["a", "a", "b", "b", "a"], diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py b/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py index 29f932124d56..7d6d2965b582 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py @@ -303,7 +303,6 @@ class ExpectTableChecksumToEqualOtherTable(TableExpectation): { "data": [ { - "dataset_name": "table_data_1", "data": { "columnone": [3, 5, 7], "columntwo": [True, False, True], @@ -312,7 +311,6 @@ class ExpectTableChecksumToEqualOtherTable(TableExpectation): }, }, { - "dataset_name": "table_data_2", "data": { "columnone": [3, 5, 7], "columntwo": [True, False, 
True], @@ -321,7 +319,6 @@ class ExpectTableChecksumToEqualOtherTable(TableExpectation): }, }, { - "dataset_name": "table_data_3", "data": { "columnone": [3, 5, 7, 8], "columntwo": [True, False, True, False], diff --git a/contrib/great_expectations_zipcode_expectations/great_expectations_zipcode_expectations/expectations/__init__.py b/contrib/great_expectations_zipcode_expectations/great_expectations_zipcode_expectations/expectations/__init__.py index 000b98f8f2c2..2dc977dc176f 100644 --- a/contrib/great_expectations_zipcode_expectations/great_expectations_zipcode_expectations/expectations/__init__.py +++ b/contrib/great_expectations_zipcode_expectations/great_expectations_zipcode_expectations/expectations/__init__.py @@ -157,3 +157,10 @@ ) from .expect_column_values_to_be_valid_zip5 import ExpectColumnValuesToBeValidZip5 from .expect_column_values_to_be_valid_zip9 import ExpectColumnValuesToBeValidZip9 + +# Uncomment this when https://pypi.org/project/indiapins works +# - Currently fails on `import indiapins` +# - FileNotFoundError: [Errno 2] No such file or directory: '/Users/me/great_expectations/venv/lib/python3.8/site-packages/indiapins/pins.json.bz2' +# from .expect_column_values_to_be_valid_india_zip import ( +# ExpectColumnValuesToBeValidIndiaZip, +# ) diff --git a/contrib/great_expectations_zipcode_expectations/expect_column_values_to_be_valid_india_zip.py b/contrib/great_expectations_zipcode_expectations/great_expectations_zipcode_expectations/expectations/expect_column_values_to_be_valid_india_zip.py similarity index 100% rename from contrib/great_expectations_zipcode_expectations/expect_column_values_to_be_valid_india_zip.py rename to contrib/great_expectations_zipcode_expectations/great_expectations_zipcode_expectations/expectations/expect_column_values_to_be_valid_india_zip.py diff --git a/contrib/time_series_expectations/time_series_expectations/expectations/__init__.py b/contrib/time_series_expectations/time_series_expectations/expectations/__init__.py index 62b8037908a8..642ebf7c92e8 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/__init__.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/__init__.py @@ -1 +1,10 @@ # Make sure to include any Expectations your want exported below! +from .expect_batch_row_count_to_match_prophet_date_model import ( + ExpectBatchRowCountToMatchProphetDateModel, +) +from .expect_column_max_to_match_prophet_date_model import ( + ExpectColumnMaxToMatchProphetDateModel, +) +from .expect_column_pair_values_to_match_prophet_date_model import ( + ExpectColumnPairValuesToMatchProphetDateModel, +) diff --git a/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py b/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py index c8f31d6e4e1f..34942fb881e8 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py @@ -170,10 +170,11 @@ def _validate( } library_metadata = { - "tags": [], # Tags for this Expectation in the Gallery - "contributors": [ # Github handles for all contributors to this Expectation. - "@your_name_here", # Don't forget to add your github handle here! 
+ "tags": [], + "contributors": [ + "@abegong", ], + "requirements": ["prophet"], } diff --git a/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_max_to_match_prophet_date_model.py b/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_max_to_match_prophet_date_model.py index ca27e2277c1d..330c003f741e 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_max_to_match_prophet_date_model.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_max_to_match_prophet_date_model.py @@ -93,6 +93,14 @@ class ExpectColumnMaxToMatchProphetDateModel(ColumnAggregateTimeSeriesExpectatio metric_dependency = "column.max" + library_metadata = { + "tags": [], + "contributors": [ + "@abegong", + ], + "requirements": ["prophet"], + } + if __name__ == "__main__": ExpectColumnMaxToMatchProphetDateModel().print_diagnostic_checklist() diff --git a/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py b/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py index 7f4b8937c46b..6103a59a51ac 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py @@ -235,10 +235,11 @@ def validate_configuration( # raise InvalidExpectationConfigurationError(str(e)) library_metadata = { - "tags": [], # Tags for this Expectation in the Gallery - "contributors": [ # Github handles for all contributors to this Expectation. - "@your_name_here", # Don't forget to add your github handle here! 
+ "tags": [], + "contributors": [ + "@abegong", ], + "requirements": ["prophet"], } diff --git a/great_expectations/core/usage_statistics/package_dependencies.py b/great_expectations/core/usage_statistics/package_dependencies.py index d879fd464246..143c85aeb34a 100644 --- a/great_expectations/core/usage_statistics/package_dependencies.py +++ b/great_expectations/core/usage_statistics/package_dependencies.py @@ -173,6 +173,7 @@ class GXDependencies: "global-land-mask", "gtin", "holidays", + # "indiapins", # Currently a broken package "ipwhois", "isbnlib", "langid", @@ -180,6 +181,7 @@ class GXDependencies: "phonenumbers", "price_parser", "primefac", + "prophet", "pwnedpasswords", "py-moneyed", "pydnsbl", diff --git a/great_expectations/core/util.py b/great_expectations/core/util.py index cdf136a4233d..a2f37e58f4c8 100644 --- a/great_expectations/core/util.py +++ b/great_expectations/core/util.py @@ -310,6 +310,9 @@ def convert_to_json_serializable( # noqa: C901 - complexity 32 # No problem to encode json return data + if isinstance(data, range): + return list(data) + if isinstance(data, dict): new_dict = {} for key in data: diff --git a/great_expectations/expectations/expectation.py b/great_expectations/expectations/expectation.py index 64ec13fb5cda..702c5d5f828a 100644 --- a/great_expectations/expectations/expectation.py +++ b/great_expectations/expectations/expectation.py @@ -117,6 +117,7 @@ ) from great_expectations.self_check.util import ( evaluate_json_test_v3_api, + generate_dataset_name_from_expectation_name, generate_expectation_tests, ) from great_expectations.util import camel_to_snake, is_parseable_date @@ -1552,7 +1553,7 @@ def _get_examples( all_examples: List[dict] = self.examples or self._get_examples_from_json() included_examples = [] - for example in all_examples: + for i, example in enumerate(all_examples, 1): included_test_cases = [] # As of commit 7766bb5caa4e0 on 1/28/22, only_for does not need to be applied to individual tests @@ -1597,6 +1598,15 @@ def _get_examples( copied_example["test_backends"] = [ TestBackend(**tb) for tb in copied_example["test_backends"] ] + + if "dataset_name" not in copied_example: + dataset_name = generate_dataset_name_from_expectation_name( + dataset=copied_example, + expectation_type=self.expectation_type, + index=i, + ) + copied_example["dataset_name"] = dataset_name + included_examples.append(ExpectationTestDataCases(**copied_example)) return included_examples diff --git a/reqs/requirements-dev-all-contrib-expectations.txt b/reqs/requirements-dev-all-contrib-expectations.txt index 770faf90f36e..28ec1fc8853a 100644 --- a/reqs/requirements-dev-all-contrib-expectations.txt +++ b/reqs/requirements-dev-all-contrib-expectations.txt @@ -16,6 +16,7 @@ geopy global-land-mask gtin holidays +# indiapins # This package does not work (FileNotFoundError: '...lib/python3.8/site-packages/indiapins/pins.json.bz2') ipwhois isbnlib langid>=1.1.6 @@ -23,6 +24,7 @@ pgeocode phonenumbers price_parser primefac +prophet pwnedpasswords py-moneyed pydnsbl diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py index c2f6c455a562..8d4d9bf68db5 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py +++ 
b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py @@ -99,7 +99,6 @@ class ExpectColumnMaxToBeBetweenCustom(ColumnExpectation): # examples = [ { - "dataset_name": "expect_column_max_to_be_between_custom_1", "data": {"x": [1, 2, 3, 4, 5], "y": [0, -1, -2, 4, None]}, "tests": [ { diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_be_in_solfege_scale_set.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_be_in_solfege_scale_set.py index 1e0e42b3e8e0..c800e7fdd8e9 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_be_in_solfege_scale_set.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_be_in_solfege_scale_set.py @@ -42,7 +42,6 @@ class ExpectColumnValuesToBeInSolfegeScaleSet(SetBasedColumnMapExpectation): # examples = [ { - "dataset_name": "expect_column_values_to_be_in_solfege_scale_set_1", "data": { "lowercase_solfege_scale": [ "do", diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py index 42605d2120f8..df12b5472404 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_only_contain_vowels.py @@ -19,7 +19,6 @@ class ExpectColumnValuesToOnlyContainVowels(RegexBasedColumnMapExpectation): # examples = [ { - "dataset_name": "expect_column_values_to_only_contain_vowels_1", "data": { "only_vowels": ["a", "e", "I", "O", "U", "y", ""], "mixed": ["A", "b", "c", "D", "E", "F", "g"], diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_column_value_frequency_to_meet_threshold.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_column_value_frequency_to_meet_threshold.py index 2b39fc636acc..b1646fc07f21 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_column_value_frequency_to_meet_threshold.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_column_value_frequency_to_meet_threshold.py @@ -123,7 +123,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "col1": [1, 2, 2, 3, 4], "col2": ["a", "a", "b", "b", "a"], diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_table_row_count_to_be.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_table_row_count_to_be.py index 62d9e8330f96..2ae9e87ab15f 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_table_row_count_to_be.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_queried_table_row_count_to_be.py @@ -92,7 +92,6 @@ def _validate( { "data": [ { - "dataset_name": "test", "data": { "col1": [1, 2, 2, 3, 4], "col2": ["a", "a", "b", "b", "a"], diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py 
b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py index 3626470fc1ec..a7acbd9e84ca 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py @@ -97,7 +97,6 @@ class ExpectTableColumnsToBeUnique(TableExpectation): # examples = [ { - "dataset_name": "expect_table_columns_to_be_unique_1", "data": { "col1": [1, 2, 3, 4, 5], "col2": [2, 3, 4, 5, 6], @@ -114,7 +113,6 @@ class ExpectTableColumnsToBeUnique(TableExpectation): ], }, { - "dataset_name": "expect_table_columns_to_be_unique_2", "data": { "col1": [1, 2, 3, 4, 5], "col2": [1, 2, 3, 4, 5], diff --git a/tests/test_definitions/multi_table_expectations/expect_table_row_count_to_equal_other_table.json b/tests/test_definitions/multi_table_expectations/expect_table_row_count_to_equal_other_table.json index cfa20d84c936..f334f34ac0db 100644 --- a/tests/test_definitions/multi_table_expectations/expect_table_row_count_to_equal_other_table.json +++ b/tests/test_definitions/multi_table_expectations/expect_table_row_count_to_equal_other_table.json @@ -2,6 +2,7 @@ "expectation_type": "expect_table_row_count_to_equal_other_table", "datasets": [ { + "dataset_name": "expect_table_row_count_to_equal_other_table_data_0", "data": [ { "dataset_name": "expect_table_row_count_to_equal_other_table_data_1", From a4ff2a4120fc4c27cd3bf2ef68778f525cd26c48 Mon Sep 17 00:00:00 2001 From: James Campbell Date: Wed, 12 Apr 2023 10:34:21 -0400 Subject: [PATCH 81/96] [DOCS] Correct expectation documentation for expect_column_max_to_be_between (#7597) Co-authored-by: Anthony Burdi --- .../expectations/core/expect_column_max_to_be_between.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/great_expectations/expectations/core/expect_column_max_to_be_between.py b/great_expectations/expectations/core/expect_column_max_to_be_between.py index 161eee8a7a27..96d3f8a38d42 100644 --- a/great_expectations/expectations/core/expect_column_max_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_max_to_be_between.py @@ -59,13 +59,13 @@ class ExpectColumnMaxToBeBetween(ColumnExpectation): column (str): \ The column name min_value (comparable type or None): \ - The minimum number of unique values allowed. + The minimum value of the acceptable range for the column maximum. max_value (comparable type or None): \ - The maximum number of unique values allowed. + The maximum value of the acceptable range for the column maximum. 
strict_min (boolean): \ - If True, the minimal column minimum must be strictly larger than min_value, default=False + If True, the lower bound of the column maximum acceptable range must be strictly larger than min_value, default=False strict_max (boolean): \ - If True, the maximal column minimum must be strictly smaller than max_value, default=False + If True, the upper bound of the column maximum acceptable range must be strictly smaller than max_value, default=False Keyword Args: parse_strings_as_datetimes (Boolean or None): \ From fdc8353117dc383070cbd505c79269b5bac559f0 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 12 Apr 2023 09:23:16 -0600 Subject: [PATCH 82/96] [MAINTENANCE] Deprecate ColumnExpectation in favor of ColumnAggregateExpectation (#7609) --- ...t_column_discrete_entropy_to_be_between.py | 4 +-- ...column_distinct_values_to_be_continuous.py | 4 +-- ...lumn_distribution_to_match_benfords_law.py | 4 +-- .../expect_column_kurtosis_to_be_between.py | 4 +-- .../expect_column_skew_to_be_between.py | 4 +-- .../expectations/expect_column_sum_to_be.py | 6 ++-- .../expect_column_to_have_no_days_missing.py | 4 +-- ...umn_values_to_be_in_set_spark_optimized.py | 6 ++-- ...olumn_values_to_be_normally_distributed.py | 4 +-- ...values_to_be_string_integers_increasing.py | 4 +-- ...mn_wasserstein_distance_to_be_less_than.py | 4 +-- ...to_be_close_to_equivalent_week_day_mean.py | 4 +-- ...compared_to_avg_equivalent_days_of_week.py | 4 +-- ...t_lon_pairwise_distance_to_be_less_than.py | 6 ++-- ...erage_to_be_within_range_of_given_point.py | 8 ++--- ...n_minimum_bounding_radius_to_be_between.py | 6 ++-- ...t_column_values_geometry_not_to_overlap.py | 4 +-- ...xpect_column_values_geometry_to_overlap.py | 4 +-- contrib/time_series_expectations/README.md | 2 +- .../time_series_expectations/docs/README.md | 2 +- .../docs/working-notes.md | 2 +- ...olumn_aggregate_time_series_expectation.py | 4 +-- ...izing_framework_to_a_custom_expectation.md | 2 +- ...te_custom_column_aggregate_expectations.md | 8 ++--- .../column_aggregate_expectation_template.py | 8 ++--- ...ect_column_distinct_values_to_be_in_set.py | 4 +-- ...t_column_distinct_values_to_contain_set.py | 4 +-- ...ect_column_distinct_values_to_equal_set.py | 4 +-- ...ct_column_kl_divergence_to_be_less_than.py | 4 +-- .../core/expect_column_max_to_be_between.py | 4 +-- .../core/expect_column_mean_to_be_between.py | 4 +-- .../expect_column_median_to_be_between.py | 4 +-- .../core/expect_column_min_to_be_between.py | 4 +-- ...t_column_most_common_value_to_be_in_set.py | 4 +-- ...oportion_of_unique_values_to_be_between.py | 4 +-- ...ct_column_quantile_values_to_be_between.py | 4 +-- .../core/expect_column_stdev_to_be_between.py | 4 +-- .../core/expect_column_sum_to_be_between.py | 4 +-- ...column_unique_value_count_to_be_between.py | 4 +-- .../expectations/expectation.py | 33 +++++++++++++++++-- .../expect_column_max_to_be_between_custom.py | 4 +-- .../column_aggregate_expectation_template.py | 8 ++--- .../examples/table_expectation_template.py | 2 +- ...xample_RBP_Instantiation_and_running.ipynb | 4 +-- 44 files changed, 124 insertions(+), 95 deletions(-) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py index 3281ec7b7096..6d2c4122785f 100644 --- 
a/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_discrete_entropy_to_be_between.py @@ -16,7 +16,7 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( @@ -128,7 +128,7 @@ def _get_evaluation_dependencies( return dependencies -class ExpectColumnDiscreteEntropyToBeBetween(ColumnExpectation): +class ExpectColumnDiscreteEntropyToBeBetween(ColumnAggregateExpectation): """Expect the column discrete entropy to be between a minimum value and a maximum value. The Shannon entropy of a discrete probability distribution is given by - \\sum_{i=1}^{n} P(x_i) * \\log(P(x_i)) diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distinct_values_to_be_continuous.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distinct_values_to_be_continuous.py index 6e5d81c5a43a..06398bcd41a6 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distinct_values_to_be_continuous.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distinct_values_to_be_continuous.py @@ -4,7 +4,7 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, InvalidExpectationConfigurationError, ) from great_expectations.expectations.util import ( @@ -19,7 +19,7 @@ ) -class ExpectColumnDistinctValuesToBeContinuous(ColumnExpectation): +class ExpectColumnDistinctValuesToBeContinuous(ColumnAggregateExpectation): examples = [ { diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py index fee4cacd8950..4bcb362c29cc 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_distribution_to_match_benfords_law.py @@ -6,7 +6,7 @@ from great_expectations.execution_engine.sqlalchemy_execution_engine import ( SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -201,7 +201,7 @@ def _get_evaluation_dependencies( return dependencies -class ExpectColumnDistributionToMatchBenfordsLaw(ColumnExpectation): +class ExpectColumnDistributionToMatchBenfordsLaw(ColumnAggregateExpectation): """Expect column distribution to match Benford's Law. Tests whether data matches Benford's Law Fraud Detection Algorithm. Uses a Chi-Square Goodness of Fit test with an 80@ p-value. 
diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py index 95f1559e1e8d..c4b6b32813be 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py @@ -8,7 +8,7 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import column_aggregate_partial from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, @@ -100,7 +100,7 @@ def _spark(cls, column, **kwargs): # return dependencies -class ExpectColumnKurtosisToBeBetween(ColumnExpectation): +class ExpectColumnKurtosisToBeBetween(ColumnAggregateExpectation): """Expect column Kurtosis to be between. Test values are drawn from various distributions (uniform, normal, gamma, student-t).""" # These examples will be shown in the public gallery, and also executed as unit tests for your Expectation diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py index 32c86f335011..eb1067fb3c0c 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py @@ -15,7 +15,7 @@ from great_expectations.execution_engine.sqlalchemy_execution_engine import ( SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_partial, @@ -185,7 +185,7 @@ def _get_query_result(func, selectable, sqlalchemy_engine): # return dependencies -class ExpectColumnSkewToBeBetween(ColumnExpectation): +class ExpectColumnSkewToBeBetween(ColumnAggregateExpectation): """Expect column skew to be between. Currently tests against Gamma and Beta distributions.""" # These examples will be shown in the public gallery, and also executed as unit tests for your Expectation diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_sum_to_be.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_sum_to_be.py index 4effb731ba23..52f2fef90efb 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_sum_to_be.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_sum_to_be.py @@ -1,5 +1,5 @@ """ -This is a template for creating custom ColumnExpectations. +This is a template for creating custom ColumnAggregateExpectations. 
For detailed instructions on how to use it, please see: https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations """ @@ -8,11 +8,11 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation # This class defines the Expectation itself -class ExpectColumnSumToBe(ColumnExpectation): +class ExpectColumnSumToBe(ColumnAggregateExpectation): """Expect the sum of a column to be exactly a value.""" # These examples will be shown in the public gallery. diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_to_have_no_days_missing.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_to_have_no_days_missing.py index 9e3e52e3f808..677c930c26c8 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_to_have_no_days_missing.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_to_have_no_days_missing.py @@ -6,7 +6,7 @@ ExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ColumnAggregateMetricProvider from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.optional_imports import sqlalchemy as sa @@ -52,7 +52,7 @@ def _sqlalchemy( return all_unique_dates -class ExpectColumnToHaveNoDaysMissing(ColumnExpectation): +class ExpectColumnToHaveNoDaysMissing(ColumnAggregateExpectation): """Expect No missing days in date column.""" from datetime import datetime, timedelta diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py index dc8a1fd23335..a3bc3df43fac 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py @@ -4,14 +4,14 @@ from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.exceptions import InvalidExpectationConfigurationError from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ColumnAggregateMetricProvider from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.metric_provider import metric_value # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most ColumnAggregateExpectations, the main business logic for calculation will live in this class. 
class ColumnValuesInSetSparkOptimized(ColumnAggregateMetricProvider): metric_name = "column_values.in_set.spark_optimized" @@ -58,7 +58,7 @@ def _spark( # This class defines the Expectation itself -class ExpectColumnValuesToBeInSetSparkOptimized(ColumnExpectation): +class ExpectColumnValuesToBeInSetSparkOptimized(ColumnAggregateExpectation): """Expect each column value to be in a given set; optimized using **join** for spark backends. Args: diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py index 67bb21963ec8..721ab5926226 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_normally_distributed.py @@ -4,7 +4,7 @@ from great_expectations.core import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -115,7 +115,7 @@ def _pandas(cls, column, **kwargs): # return dependencies -class ExpectColumnValuesToBeNormallyDistributed(ColumnExpectation): +class ExpectColumnValuesToBeNormallyDistributed(ColumnAggregateExpectation): """Expect column values to be normally distributed. NaN values are omitted.""" # These examples will be shown in the public gallery, and also executed as unit tests for your Expectation diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py index ff703e1eb573..419e635f2e5c 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py @@ -18,7 +18,7 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnMapMetricProvider, column_function_partial, @@ -140,7 +140,7 @@ def _get_evaluation_dependencies( return dependencies -class ExpectColumnValuesToBeStringIntegersIncreasing(ColumnExpectation): +class ExpectColumnValuesToBeStringIntegersIncreasing(ColumnAggregateExpectation): """Expect a column to contain string-typed integers to be increasing. 
expect_column_values_to_be_string_integers_increasing is a \ diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py index 7db52ad6f2b2..31b6c675ceff 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_wasserstein_distance_to_be_less_than.py @@ -4,7 +4,7 @@ from great_expectations.core import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -129,7 +129,7 @@ def _pandas(cls, column, raw_values=None, partition=None, **kwargs): # return dependencies -class ExpectColumnWassersteinDistanceToBeLessThan(ColumnExpectation): +class ExpectColumnWassersteinDistanceToBeLessThan(ColumnAggregateExpectation): """Expect that the Wasserstein Distance of the specified column with respect to an optional partition object to be lower than the provided value. See Also: diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_day_count_to_be_close_to_equivalent_week_day_mean.py b/contrib/experimental/great_expectations_experimental/expectations/expect_day_count_to_be_close_to_equivalent_week_day_mean.py index 86e0c2367ea3..fb9694a15ed0 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_day_count_to_be_close_to_equivalent_week_day_mean.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_day_count_to_be_close_to_equivalent_week_day_mean.py @@ -7,7 +7,7 @@ ExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ColumnAggregateMetricProvider from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.optional_imports import sqlalchemy as sa @@ -82,7 +82,7 @@ def _sqlalchemy( return results -class ExpectDayCountToBeCloseToEquivalentWeekDayMean(ColumnExpectation): +class ExpectDayCountToBeCloseToEquivalentWeekDayMean(ColumnAggregateExpectation): """Expect No missing days in date column""" # Default values diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py b/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py index 5505c64eaa9e..4d3e6c8006b7 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py @@ -7,7 +7,7 @@ ExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from 
great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ColumnAggregateMetricProvider from great_expectations.expectations.metrics.import_manager import sa from great_expectations.expectations.metrics.metric_provider import metric_value @@ -86,7 +86,7 @@ def _sqlalchemy( return results -class ExpectYesterdayCountComparedToAvgEquivalentDaysOfWeek(ColumnExpectation): +class ExpectYesterdayCountComparedToAvgEquivalentDaysOfWeek(ColumnAggregateExpectation): """Expect No missing days in date column""" # Default values diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_lat_lon_pairwise_distance_to_be_less_than.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_lat_lon_pairwise_distance_to_be_less_than.py index 0208112db11b..f3103c2ec2ec 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_lat_lon_pairwise_distance_to_be_less_than.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_lat_lon_pairwise_distance_to_be_less_than.py @@ -7,7 +7,7 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -15,7 +15,7 @@ # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most ColumnAggregateExpectations, the main business logic for calculation will live in this class. class ColumnAverageLatLonPairwiseDistance(ColumnAggregateMetricProvider): metric_name = "column.average_lat_lon_pairwise_distance" @@ -46,7 +46,7 @@ def haversine_adapted(point_1, point_2): # This class defines the Expectation itself -class ExpectColumnAverageLatLonPairwiseDistanceToBeLessThan(ColumnExpectation): +class ExpectColumnAverageLatLonPairwiseDistanceToBeLessThan(ColumnAggregateExpectation): """Expect the average pairwise haversine distance between lat/lon points in a column is less than some value in km. This expectation will compute the pairwise haversine distance between each (latitude, longitude) pair and test that the average is less than some value in km. 
diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_to_be_within_range_of_given_point.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_to_be_within_range_of_given_point.py index 57a4f31ac163..35fd39905b22 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_to_be_within_range_of_given_point.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_average_to_be_within_range_of_given_point.py @@ -1,5 +1,5 @@ """ -This is a template for creating custom ColumnExpectations. +This is a template for creating custom ColumnAggregateExpectations. For detailed instructions on how to use it, please see: https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations """ @@ -12,7 +12,7 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.expectations.metrics import ( @@ -30,7 +30,7 @@ # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most ColumnAggregateExpectations, the main business logic for calculation will live in this class. class ColumnCoordinatesDistance(ColumnAggregateMetricProvider): # This is the id string that will be used to reference your Metric. @@ -71,7 +71,7 @@ def fcc_projection(loc1, loc2): # This class defines the Expectation itself -class ExpectColumnAverageToBeWithinRangeOfGivenPoint(ColumnExpectation): +class ExpectColumnAverageToBeWithinRangeOfGivenPoint(ColumnAggregateExpectation): """Expect the average of a column of degree-decimal, lat/lon coordinates to be in range of a given point.""" # These examples will be shown in the public gallery. diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_minimum_bounding_radius_to_be_between.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_minimum_bounding_radius_to_be_between.py index 57f2156b171b..9b0c3be19bd0 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_minimum_bounding_radius_to_be_between.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_minimum_bounding_radius_to_be_between.py @@ -5,7 +5,7 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -13,7 +13,7 @@ # This class defines a Metric to support your Expectation. 
-# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most ColumnAggregateExpectations, the main business logic for calculation will live in this class. class ColumnAggregateGeometryBoundingRadius(ColumnAggregateMetricProvider): # This is the id string that will be used to reference your Metric. @@ -57,7 +57,7 @@ def _pandas(cls, column, **kwargs): # This class defines the Expectation itself -class ExpectColumnMinimumBoundingRadiusToBeBetween(ColumnExpectation): +class ExpectColumnMinimumBoundingRadiusToBeBetween(ColumnAggregateExpectation): """ Expect that column values as geometry points to be contained within a bounding circle with a given radius (or diameter). diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_not_to_overlap.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_not_to_overlap.py index 456061455217..5c11aa716dc4 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_not_to_overlap.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_not_to_overlap.py @@ -6,7 +6,7 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -45,7 +45,7 @@ def _pandas(cls, column, **kwargs): # This class defines the Expectation itself -class ExpectColumnValuesGeometryNotToOverlap(ColumnExpectation): +class ExpectColumnValuesGeometryNotToOverlap(ColumnAggregateExpectation): """Expect geometries in this column Not to overlap with each other. If any two geometries do overlap, expectation will return False. 
For more information look here \ diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_to_overlap.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_to_overlap.py index 210d763c58dc..57eaa0a3bda4 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_to_overlap.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_geometry_to_overlap.py @@ -6,7 +6,7 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnAggregateMetricProvider, column_aggregate_value, @@ -45,7 +45,7 @@ def _pandas(cls, column, **kwargs): # This class defines the Expectation itself -class ExpectColumnValuesGeometryToOverlap(ColumnExpectation): +class ExpectColumnValuesGeometryToOverlap(ColumnAggregateExpectation): """Expect geometries in this column to overlap with each other. If any two geometries do overlap, expectation will return True. For more information look here \ diff --git a/contrib/time_series_expectations/README.md b/contrib/time_series_expectations/README.md index 33a5c6cc1bed..110b11b3bbbb 100644 --- a/contrib/time_series_expectations/README.md +++ b/contrib/time_series_expectations/README.md @@ -81,7 +81,7 @@ As all of those use cases are realized, we imagine the full class hierarchy for ... 
for other types of models - *ColumnExpectation* (ABC) + *ColumnAggregateExpectation* (ABC) ColumnAggregateTimeSeriesExpectation (ABC, :white_check_mark:) expect_column_max_to_match_prophet_date_model (:white_check_mark:) expect_column_{property}_to_match_{model}_model diff --git a/contrib/time_series_expectations/docs/README.md b/contrib/time_series_expectations/docs/README.md index 32a9f4e7e1d8..e4e900f52fa3 100644 --- a/contrib/time_series_expectations/docs/README.md +++ b/contrib/time_series_expectations/docs/README.md @@ -127,7 +127,7 @@ The full class hiereachy is: ExpectBatchAggregateStatisticToMatchArimaModel (ABC) expect_batch_volume_to_match_arima_model - *ColumnExpectation* (ABC) + *ColumnAggregateExpectation* (ABC) ColumnAggregateTimeSeriesExpectation (ABC) expect_column_{property}_to_match_{model}_model diff --git a/contrib/time_series_expectations/docs/working-notes.md b/contrib/time_series_expectations/docs/working-notes.md index b6aa9af891b5..d02e0ac8b784 100644 --- a/contrib/time_series_expectations/docs/working-notes.md +++ b/contrib/time_series_expectations/docs/working-notes.md @@ -127,7 +127,7 @@ The full class hiereachy is: ExpectBatchAggregateStatisticToMatchArimaModel (ABC) expect_batch_volume_to_match_arima_model - *ColumnExpectation* (ABC) + *ColumnAggregateExpectation* (ABC) ColumnAggregateTimeSeriesExpectation (ABC) expect_column_{property}_to_match_{model}_model diff --git a/contrib/time_series_expectations/time_series_expectations/expectations/column_aggregate_time_series_expectation.py b/contrib/time_series_expectations/time_series_expectations/expectations/column_aggregate_time_series_expectation.py index 579a44440b24..e2accf2882a3 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/column_aggregate_time_series_expectation.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/column_aggregate_time_series_expectation.py @@ -7,13 +7,13 @@ from great_expectations.execution_engine import ( ExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from time_series_expectations.expectations.prophet_model_deserializer import ( ProphetModelDeserializer, ) -class ColumnAggregateTimeSeriesExpectation(ColumnExpectation, ABC): +class ColumnAggregateTimeSeriesExpectation(ColumnAggregateExpectation, ABC): """This Expectation abstract base class checks to see if an aggregate statistic calculated from a column matches the predictions of a prophet model for a given date. To complete this Expectation, you must implement a metric_dependency. 
If you're referring to a metric that already exists, this can be as simple as: diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_add_support_for_the_auto_initializing_framework_to_a_custom_expectation.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_add_support_for_the_auto_initializing_framework_to_a_custom_expectation.md index 6e904b3b2d80..8eaf5f41bdef 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_add_support_for_the_auto_initializing_framework_to_a_custom_expectation.md +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_add_support_for_the_auto_initializing_framework_to_a_custom_expectation.md @@ -40,7 +40,7 @@ Key-value pairs defined in the `variables` portion of a Rule Based Profiler Conf The `DomainBuilder` configuration requries a `class_name` and `module_name`. In this example, we will be using the `ColumnDomainBuilder` which outputs the column of interest (for example: `trip_distance` in the NYC taxi data) which is then accessed by the `ExpectationConfigurationBuilder` using the variable `$domain.domain_kwargs.column`. - **`class_name`:** is the name of the DomainBuilder class that is to be used. Additional Domain Builders are: - - `ColumnDomainBuilder`: This `DomainBuilder` outputs column Domains, which are required by `ColumnExpectations` like (`expect_column_median_to_be_between`). + - `ColumnDomainBuilder`: This `DomainBuilder` outputs column Domains, which are required by `ColumnAggregateExpectations` like (`expect_column_median_to_be_between`). - `MultiColumnDomainBuilder`: This DomainBuilder outputs `multicolumn` Domains by taking in a column list in the `include_column_names` parameter. - `ColumnPairDomainBuilder`: This DomainBuilder outputs columnpair domains by taking in a column pair list in the include_column_names parameter. - `TableDomainBuilder`: This `DomainBuilder` outputs table `Domains`, which is required by `Expectations` that act on tables, like (`expect_table_row_count_to_equal`, or `expect_table_columns_to_match_set`). diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations.md index 96537d515b99..2314ea17a335 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations.md +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations.md @@ -4,10 +4,10 @@ title: How to create a Custom Column Aggregate Expectation import Prerequisites from '../creating_custom_expectations/components/prerequisites.jsx' import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -**`ColumnExpectations`** are one of the most common types of . +**`ColumnAggregateExpectations`** are one of the most common types of . They are evaluated for a single column, and produce an aggregate , such as a mean, standard deviation, number of unique values, column type, etc. If that Metric meets the conditions you set, the Expectation considers that data valid. -This guide will walk you through the process of creating your own custom `ColumnExpectation`. +This guide will walk you through the process of creating your own custom `ColumnAggregateExpectation`. 
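To make the renamed guide's framing concrete, here is a tiny, self-contained sketch (plain pandas, not Great Expectations internals) of what "produce an aggregate Metric and check it against your conditions" means; the column values and bounds are made up for illustration.

```python
import pandas as pd

# Hypothetical column data standing in for one column of a Batch.
column = pd.Series([1.2, 3.4, 2.2, 0.8, 5.1], name="trip_distance")

# A column aggregate Expectation reduces the column to a single aggregate Metric...
observed_mean = column.mean()

# ...and succeeds only if that Metric satisfies the user-supplied conditions,
# here mimicking expect_column_mean_to_be_between(min_value=0, max_value=10).
min_value, max_value = 0, 10
success = min_value <= observed_mean <= max_value

print({"observed_value": observed_mean, "success": success})
```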
@@ -19,7 +19,7 @@ This guide will walk you through the process of creating your own custom `Column ### 1. Choose a name for your Expectation -First, decide on a name for your own Expectation. By convention, `ColumnExpectations` always start with `expect_column_`. +First, decide on a name for your own Expectation. By convention, `ColumnAggregateExpectations` always start with `expect_column_`. For more on Expectation naming conventions, see the [Expectations section](../../../contributing/style_guides/code_style.md#expectations) of the Code Style Guide. Your Expectation will have two versions of the same name: a `CamelCaseName` and a `snake_case_name`. For example, this tutorial will use: @@ -31,7 +31,7 @@ Your Expectation will have two versions of the same name: a `CamelCaseName` and By convention, each Expectation is kept in its own python file, named with the snake_case version of the Expectation's name. -You can find the template file for a custom [ColumnExpectation here](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/column_aggregate_expectation_template.py). +You can find the template file for a custom [ColumnAggregateExpectation here](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/column_aggregate_expectation_template.py). Download the file, place it in the appropriate directory, and rename it to the appropriate name. ```bash diff --git a/examples/expectations/column_aggregate_expectation_template.py b/examples/expectations/column_aggregate_expectation_template.py index 0417ea09dce1..c9ff854ce96a 100644 --- a/examples/expectations/column_aggregate_expectation_template.py +++ b/examples/expectations/column_aggregate_expectation_template.py @@ -1,5 +1,5 @@ """ -This is a template for creating custom ColumnExpectations. +This is a template for creating custom ColumnAggregateExpectations. For detailed instructions on how to use it, please see: https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations """ @@ -14,7 +14,7 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnAggregateMetricProvider, column_aggregate_partial, @@ -23,7 +23,7 @@ # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most ColumnAggregateExpectations, the main business logic for calculation will live in this class. class ColumnAggregateMatchesSomeCriteria(ColumnAggregateMetricProvider): # This is the id string that will be used to reference your Metric. @@ -46,7 +46,7 @@ def _pandas(cls, column, **kwargs): # This class defines the Expectation itself -class ExpectColumnAggregateToMatchSomeCriteria(ColumnExpectation): +class ExpectColumnAggregateToMatchSomeCriteria(ColumnAggregateExpectation): """TODO: add a docstring here""" # These examples will be shown in the public gallery. 
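Since the template diff above only shows the renamed stubs, here is a rough sketch of what a filled-in version might look like, loosely following the max-based worked example touched elsewhere in this patch (tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py); the metric name and the bounds logic are illustrative, not prescribed by the template.

```python
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.expectation import ColumnAggregateExpectation
from great_expectations.expectations.metrics import (
    ColumnAggregateMetricProvider,
    column_aggregate_value,
)


# The Metric provider: the "main business logic" the template comment refers to.
class ColumnCustomMax(ColumnAggregateMetricProvider):
    metric_name = "column.custom_max"  # id string that wires the Metric to the Expectation

    @column_aggregate_value(engine=PandasExecutionEngine)
    def _pandas(cls, column, **kwargs):
        # `column` is the pandas Series for the column being validated.
        return column.max()


# The Expectation declares which Metric it depends on and which kwargs decide success;
# its _validate() (omitted in this sketch) compares the observed metric to those kwargs.
class ExpectColumnMaxToBeBetweenCustom(ColumnAggregateExpectation):
    """Expect the column max to be between min_value and max_value (illustrative sketch)."""

    metric_dependencies = ("column.custom_max",)
    success_keys = ("min_value", "max_value")
```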
diff --git a/great_expectations/expectations/core/expect_column_distinct_values_to_be_in_set.py b/great_expectations/expectations/core/expect_column_distinct_values_to_be_in_set.py index 6c90ae75676a..0522208836d0 100644 --- a/great_expectations/expectations/core/expect_column_distinct_values_to_be_in_set.py +++ b/great_expectations/expectations/core/expect_column_distinct_values_to_be_in_set.py @@ -10,7 +10,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, InvalidExpectationConfigurationError, render_evaluation_parameter_string, ) @@ -35,7 +35,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnDistinctValuesToBeInSet(ColumnExpectation): +class ExpectColumnDistinctValuesToBeInSet(ColumnAggregateExpectation): """Expect the set of distinct column values to be contained by a given set. expect_column_distinct_values_to_be_in_set is a \ diff --git a/great_expectations/expectations/core/expect_column_distinct_values_to_contain_set.py b/great_expectations/expectations/core/expect_column_distinct_values_to_contain_set.py index d45bc3dd2c99..ccecb1afea4b 100644 --- a/great_expectations/expectations/core/expect_column_distinct_values_to_contain_set.py +++ b/great_expectations/expectations/core/expect_column_distinct_values_to_contain_set.py @@ -8,7 +8,7 @@ ) from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, InvalidExpectationConfigurationError, render_evaluation_parameter_string, ) @@ -29,7 +29,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnDistinctValuesToContainSet(ColumnExpectation): +class ExpectColumnDistinctValuesToContainSet(ColumnAggregateExpectation): """Expect the set of distinct column values to contain a given set. expect_column_distinct_values_to_contain_set is a \ diff --git a/great_expectations/expectations/core/expect_column_distinct_values_to_equal_set.py b/great_expectations/expectations/core/expect_column_distinct_values_to_equal_set.py index 399a39cd1af1..23686cfe346d 100644 --- a/great_expectations/expectations/core/expect_column_distinct_values_to_equal_set.py +++ b/great_expectations/expectations/core/expect_column_distinct_values_to_equal_set.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, InvalidExpectationConfigurationError, render_evaluation_parameter_string, ) @@ -27,7 +27,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnDistinctValuesToEqualSet(ColumnExpectation): +class ExpectColumnDistinctValuesToEqualSet(ColumnAggregateExpectation): """Expect the set of distinct column values to equal a given set. 
expect_column_distinct_values_to_equal_set is a \ diff --git a/great_expectations/expectations/core/expect_column_kl_divergence_to_be_less_than.py b/great_expectations/expectations/core/expect_column_kl_divergence_to_be_less_than.py index 3b9a7fab31b5..c0f3662a4409 100644 --- a/great_expectations/expectations/core/expect_column_kl_divergence_to_be_less_than.py +++ b/great_expectations/expectations/core/expect_column_kl_divergence_to_be_less_than.py @@ -17,7 +17,7 @@ is_valid_partition_object, ) from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -58,7 +58,7 @@ logging.captureWarnings(True) -class ExpectColumnKlDivergenceToBeLessThan(ColumnExpectation): +class ExpectColumnKlDivergenceToBeLessThan(ColumnAggregateExpectation): """Expect the Kulback-Leibler (KL) divergence (relative entropy) of the specified column with respect to the partition object to be lower than the provided threshold. KL divergence compares two distributions. The higher the divergence value (relative entropy), the larger \ diff --git a/great_expectations/expectations/core/expect_column_max_to_be_between.py b/great_expectations/expectations/core/expect_column_max_to_be_between.py index 96d3f8a38d42..c56b0d52dcb8 100644 --- a/great_expectations/expectations/core/expect_column_max_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_max_to_be_between.py @@ -42,14 +42,14 @@ pass -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.render.renderer.renderer import renderer if TYPE_CHECKING: from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnMaxToBeBetween(ColumnExpectation): +class ExpectColumnMaxToBeBetween(ColumnAggregateExpectation): """Expect the column maximum to be between a minimum value and a maximum value. expect_column_max_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_column_mean_to_be_between.py b/great_expectations/expectations/core/expect_column_mean_to_be_between.py index 2d8e9625e5f1..6d409c5f28b7 100644 --- a/great_expectations/expectations/core/expect_column_mean_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_mean_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -42,7 +42,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnMeanToBeBetween(ColumnExpectation): +class ExpectColumnMeanToBeBetween(ColumnAggregateExpectation): """Expect the column mean to be between a minimum value and a maximum value (inclusive). 
expect_column_mean_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_column_median_to_be_between.py b/great_expectations/expectations/core/expect_column_median_to_be_between.py index 177bbf6f1feb..2636fd4b7d74 100644 --- a/great_expectations/expectations/core/expect_column_median_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_median_to_be_between.py @@ -6,7 +6,7 @@ ) from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -37,7 +37,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnMedianToBeBetween(ColumnExpectation): +class ExpectColumnMedianToBeBetween(ColumnAggregateExpectation): """Expect the column median to be between a minimum value and a maximum value. expect_column_median_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_column_min_to_be_between.py b/great_expectations/expectations/core/expect_column_min_to_be_between.py index 37a45e1e2070..b6afcc641abf 100644 --- a/great_expectations/expectations/core/expect_column_min_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_min_to_be_between.py @@ -6,7 +6,7 @@ ) from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -41,7 +41,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnMinToBeBetween(ColumnExpectation): +class ExpectColumnMinToBeBetween(ColumnAggregateExpectation): """Expect the column minimum to be between a minimum value and a maximum value. expect_column_min_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_column_most_common_value_to_be_in_set.py b/great_expectations/expectations/core/expect_column_most_common_value_to_be_in_set.py index 5abfdda90a89..fd43451509c6 100644 --- a/great_expectations/expectations/core/expect_column_most_common_value_to_be_in_set.py +++ b/great_expectations/expectations/core/expect_column_most_common_value_to_be_in_set.py @@ -6,7 +6,7 @@ ) from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, InvalidExpectationConfigurationError, render_evaluation_parameter_string, ) @@ -25,7 +25,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnMostCommonValueToBeInSet(ColumnExpectation): +class ExpectColumnMostCommonValueToBeInSet(ColumnAggregateExpectation): """Expect the most common value to be within the designated value set. 
expect_column_most_common_value_to_be_in_set is a \ diff --git a/great_expectations/expectations/core/expect_column_proportion_of_unique_values_to_be_between.py b/great_expectations/expectations/core/expect_column_proportion_of_unique_values_to_be_between.py index 6344b8f1c66f..9cdb670cc6dd 100644 --- a/great_expectations/expectations/core/expect_column_proportion_of_unique_values_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_proportion_of_unique_values_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -42,7 +42,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnProportionOfUniqueValuesToBeBetween(ColumnExpectation): +class ExpectColumnProportionOfUniqueValuesToBeBetween(ColumnAggregateExpectation): """Expect the proportion of unique values to be between a minimum value and a maximum value. For example, in a column containing [1, 2, 2, 3, 3, 3, 4, 4, 4, 4], there are 4 unique values and 10 total \ diff --git a/great_expectations/expectations/core/expect_column_quantile_values_to_be_between.py b/great_expectations/expectations/core/expect_column_quantile_values_to_be_between.py index 1cc7df89d894..cb7d48db2ae3 100644 --- a/great_expectations/expectations/core/expect_column_quantile_values_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_quantile_values_to_be_between.py @@ -11,7 +11,7 @@ from great_expectations.exceptions import InvalidExpectationConfigurationError from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -57,7 +57,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnQuantileValuesToBeBetween(ColumnExpectation): +class ExpectColumnQuantileValuesToBeBetween(ColumnAggregateExpectation): # noinspection PyUnresolvedReferences """Expect the specific provided column quantiles to be between a minimum value and a maximum value. diff --git a/great_expectations/expectations/core/expect_column_stdev_to_be_between.py b/great_expectations/expectations/core/expect_column_stdev_to_be_between.py index aa7995c0e8a5..57e6f541ae56 100644 --- a/great_expectations/expectations/core/expect_column_stdev_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_stdev_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -38,7 +38,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnStdevToBeBetween(ColumnExpectation): +class ExpectColumnStdevToBeBetween(ColumnAggregateExpectation): """Expect the column standard deviation to be between a minimum value and a maximum value. Uses sample standard deviation (normalized by N-1). 
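A brief aside on the "normalized by N-1" note in the ExpectColumnStdevToBeBetween docstring above: the following standalone snippet (made-up values, not part of the patch) shows the difference between the sample standard deviation this Expectation checks and the population variant.

```python
import numpy as np
import pandas as pd

values = pd.Series([2.0, 4.0, 4.0, 4.0, 6.0])

sample_std = values.std()        # pandas defaults to ddof=1 (divide by N-1)
population_std = np.std(values)  # numpy defaults to ddof=0 (divide by N)

print(sample_std)      # ~1.414 -- the quantity compared against min_value/max_value
print(population_std)  # ~1.265
```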
diff --git a/great_expectations/expectations/core/expect_column_sum_to_be_between.py b/great_expectations/expectations/core/expect_column_sum_to_be_between.py index c6ec0c070642..89ef30457f03 100644 --- a/great_expectations/expectations/core/expect_column_sum_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_sum_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -38,7 +38,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnSumToBeBetween(ColumnExpectation): +class ExpectColumnSumToBeBetween(ColumnAggregateExpectation): """Expect the column to sum to be between a minimum value and a maximum value. expect_column_sum_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py b/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py index 2fbee5045b98..ca1058e5e787 100644 --- a/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py +++ b/great_expectations/expectations/core/expect_column_unique_value_count_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -43,7 +43,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation): +class ExpectColumnUniqueValueCountToBeBetween(ColumnAggregateExpectation): """Expect the number of unique values to be between a minimum value and a maximum value. expect_column_unique_value_count_to_be_between is a \ diff --git a/great_expectations/expectations/expectation.py b/great_expectations/expectations/expectation.py index 702c5d5f828a..d72a7f7f7799 100644 --- a/great_expectations/expectations/expectation.py +++ b/great_expectations/expectations/expectation.py @@ -2542,8 +2542,8 @@ def validate_configuration( @public_api -class ColumnExpectation(TableExpectation, ABC): - """Base class for column-type Expectations. +class ColumnAggregateExpectation(TableExpectation, ABC): + """Base class for column aggregate Expectations. These types of Expectation produce an aggregate metric for a column, such as the mean, standard deviation, number of unique values, column type, etc. @@ -2583,6 +2583,35 @@ def validate_configuration( raise InvalidExpectationConfigurationError(str(e)) +@public_api +class ColumnExpectation(ColumnAggregateExpectation, ABC): + """Base class for column aggregate Expectations. + + These types of Expectation produce an aggregate metric for a column, such as the mean, standard deviation, + number of unique values, column type, etc. + + WARNING: This class will be deprecated in favor of ColumnAggregateExpectation, and removed in a future release. + If you're using this class, please update your code to use ColumnAggregateExpectation instead. 
+ There is no change in functionality between the two classes; just a name change for clarity. + + --Documentation-- + - https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations/ + + Args: + domain_keys (tuple): A tuple of the keys used to determine the domain of the + expectation. + success_keys (tuple): A tuple of the keys used to determine the success of + the expectation. + default_kwarg_values (optional[dict]): Optional. A dictionary that will be used to fill unspecified + kwargs from the Expectation Configuration. + + - A "column" key is required for column expectations. + + Raises: + InvalidExpectationConfigurationError: If no `column` is specified + """ + + @public_api class ColumnMapExpectation(TableExpectation, ABC): """Base class for ColumnMapExpectations. diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py index 8d4d9bf68db5..a586682ce5bb 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py @@ -13,7 +13,7 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.expectation import ( - ColumnExpectation, + ColumnAggregateExpectation, ExpectationValidationResult, render_evaluation_parameter_string, ) @@ -89,7 +89,7 @@ def _spark(cls, column, _table, _column_name, **kwargs): # # -class ExpectColumnMaxToBeBetweenCustom(ColumnExpectation): +class ExpectColumnMaxToBeBetweenCustom(ColumnAggregateExpectation): # # """Expect column max to be between a given range.""" diff --git a/tests/integration/docusaurus/expectations/examples/column_aggregate_expectation_template.py b/tests/integration/docusaurus/expectations/examples/column_aggregate_expectation_template.py index 02b08220518b..03455e3ba3b6 100644 --- a/tests/integration/docusaurus/expectations/examples/column_aggregate_expectation_template.py +++ b/tests/integration/docusaurus/expectations/examples/column_aggregate_expectation_template.py @@ -1,5 +1,5 @@ """ -This is a template for creating custom ColumnExpectations. +This is a template for creating custom ColumnAggregateExpectations. For detailed instructions on how to use it, please see: https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations """ @@ -14,7 +14,7 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import ColumnExpectation +from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ( ColumnAggregateMetricProvider, column_aggregate_partial, @@ -23,7 +23,7 @@ # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most ColumnAggregateExpectations, the main business logic for calculation will live in this class. 
# class ColumnAggregateMatchesSomeCriteria(ColumnAggregateMetricProvider): # @@ -52,7 +52,7 @@ def _pandas(cls, column, **kwargs): # This class defines the Expectation itself # -class ExpectColumnAggregateToMatchSomeCriteria(ColumnExpectation): +class ExpectColumnAggregateToMatchSomeCriteria(ColumnAggregateExpectation): # # """TODO: add a docstring here""" diff --git a/tests/integration/docusaurus/expectations/examples/table_expectation_template.py b/tests/integration/docusaurus/expectations/examples/table_expectation_template.py index 02810a4bb4da..8f79f38a7c81 100644 --- a/tests/integration/docusaurus/expectations/examples/table_expectation_template.py +++ b/tests/integration/docusaurus/expectations/examples/table_expectation_template.py @@ -25,7 +25,7 @@ # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. +# For most TableExpectations, the main business logic for calculation will live in this class. # class TableMeetsSomeCriteria(TableMetricProvider): # diff --git a/tests/test_fixtures/rule_based_profiler/example_notebooks/BasicExample_RBP_Instantiation_and_running.ipynb b/tests/test_fixtures/rule_based_profiler/example_notebooks/BasicExample_RBP_Instantiation_and_running.ipynb index 60d1e1c9adc8..cf8ca26b916f 100644 --- a/tests/test_fixtures/rule_based_profiler/example_notebooks/BasicExample_RBP_Instantiation_and_running.ipynb +++ b/tests/test_fixtures/rule_based_profiler/example_notebooks/BasicExample_RBP_Instantiation_and_running.ipynb @@ -761,7 +761,7 @@ "- `expect_column_min_to_be_between`\n", "- `expect_column_max_to_be_between`\n", "\n", - "The Expectations are both `ColumnExpectations`, so the `column` parameter will be accessed from the Domain kwargs using `$domain.domain_kwargs.column`. \n", + "The Expectations are both `ColumnAggregateExpectations`, so the `column` parameter will be accessed from the Domain kwargs using `$domain.domain_kwargs.column`. \n", "\n", "The Expectations also take in a `min_value` and `max_value` parameter, which our `NumericMetricRangeMultiBatchParameterBuilders` are estimating. For `expect_column_min_to_be_between`, these estimated values are accessible using\n", "\n", @@ -888,7 +888,7 @@ "metadata": {}, "source": [ "#### `ColumnDomainBuilder`\n", - "This `DomainBuilder` outputs column Domains, which are required by `ColumnExpectations` like (`expect_column_median_to_be_between`). There are a few ways that the `ColumnDomainBuilder` can be used. \n", + "This `DomainBuilder` outputs column Domains, which are required by `ColumnAggregateExpectations` like (`expect_column_median_to_be_between`). There are a few ways that the `ColumnDomainBuilder` can be used. \n", "\n", "1. In the simplest usecase, the `ColumnDomainBuilder` can output all columns in the dataset as a Domain, or include/exclude columns if you already know which ones you would like. Column suffixes (like `_amount`) can be used to select columns of interest, as we saw in our examples above.\n", "3. 
The `ColumnDomainBuilder` also allows you to choose columns based on their semantic types (such as numeric, or text).\n", From 9ecea4adf438c983a0eb656221e1c6b8e87ff8d1 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 12 Apr 2023 11:15:30 -0600 Subject: [PATCH 83/96] [MAINTENANCE] Deprecate TableExpectation in favor of BatchExpectation (#7610) Co-authored-by: Don Heppner --- ...rofile_numeric_columns_diff_expectation.py | 4 +- ...ect_table_checksum_to_equal_other_table.py | 6 +- .../expect_table_binary_label_model_bias.py | 4 +- ..._table_linear_feature_importances_to_be.py | 4 +- contrib/time_series_expectations/README.md | 2 +- .../time_series_expectations/docs/README.md | 2 +- .../docs/working-notes.md | 4 +- ...h_row_count_to_match_prophet_date_model.py | 6 +- .../contributing/style_guides/code_style.md | 2 +- ...ow_to_create_custom_batch_expectations.md} | 80 +++++++++---------- .../creating_custom_expectations/overview.md | 2 +- .../docs/guides/expectations/index.md | 2 +- docs/docusaurus/sidebars.js | 2 +- ...plate.py => batch_expectation_template.py} | 14 ++-- ...pped_ks_test_p_value_to_be_greater_than.py | 4 +- ...isquare_test_p_value_to_be_greater_than.py | 4 +- ..._pair_cramers_phi_value_to_be_less_than.py | 4 +- ...tion_ks_test_p_value_to_be_greater_than.py | 4 +- .../core/expect_column_to_exist.py | 4 +- ...expect_column_values_to_be_in_type_list.py | 4 +- .../expect_column_values_to_be_of_type.py | 4 +- ...expect_table_column_count_to_be_between.py | 4 +- .../expect_table_column_count_to_equal.py | 4 +- ...ect_table_columns_to_match_ordered_list.py | 4 +- .../core/expect_table_columns_to_match_set.py | 4 +- .../expect_table_row_count_to_be_between.py | 4 +- .../core/expect_table_row_count_to_equal.py | 4 +- ...ct_table_row_count_to_equal_other_table.py | 4 +- .../expectations/expectation.py | 48 ++++++++--- tests/data_context/test_data_context.py | 4 +- ...y => expect_batch_columns_to_be_unique.py} | 42 +++++----- ...plate.py => batch_expectation_template.py} | 34 ++++---- 32 files changed, 173 insertions(+), 145 deletions(-) rename docs/docusaurus/docs/guides/expectations/creating_custom_expectations/{how_to_create_custom_table_expectations.md => how_to_create_custom_batch_expectations.md} (84%) rename examples/expectations/{table_expectation_template.py => batch_expectation_template.py} (91%) rename tests/integration/docusaurus/expectations/creating_custom_expectations/{expect_table_columns_to_be_unique.py => expect_batch_columns_to_be_unique.py} (85%) rename tests/integration/docusaurus/expectations/examples/{table_expectation_template.py => batch_expectation_template.py} (85%) diff --git a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/expectations/profile_numeric_columns_diff_expectation.py b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/expectations/profile_numeric_columns_diff_expectation.py index 4a6aacb85bce..04997f04bdd3 100644 --- a/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/expectations/profile_numeric_columns_diff_expectation.py +++ b/contrib/capitalone_dataprofiler_expectations/capitalone_dataprofiler_expectations/expectations/profile_numeric_columns_diff_expectation.py @@ -2,13 +2,13 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine.execution_engine import ExecutionEngine -from great_expectations.expectations.expectation import TableExpectation +from 
great_expectations.expectations.expectation import BatchExpectation from great_expectations.expectations.registry import get_metric_kwargs from great_expectations.validator.metric_configuration import MetricConfiguration from great_expectations.validator.validator import ValidationDependencies -class ProfileNumericColumnsDiffExpectation(TableExpectation): +class ProfileNumericColumnsDiffExpectation(BatchExpectation): profile_metric = None @classmethod diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py b/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py index 7d6d2965b582..3cb999f18130 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_table_checksum_to_equal_other_table.py @@ -26,7 +26,7 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.expectations.metrics.metric_provider import metric_value @@ -264,8 +264,8 @@ def _get_evaluation_dependencies( # This class defines the Expectation itself # The main business logic for calculation lives here. -class ExpectTableChecksumToEqualOtherTable(TableExpectation): - """Expect the checksum table to equal the checksum of another table. +class ExpectTableChecksumToEqualOtherTable(BatchExpectation): + """Expect the checksum for one batch table to equal the checksum of another table. expect_table_checksum_to_equal_other_table is a \ [Table Expectation](https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations). diff --git a/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_binary_label_model_bias.py b/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_binary_label_model_bias.py index 3b18995569a7..6606d61dc6b7 100644 --- a/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_binary_label_model_bias.py +++ b/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_binary_label_model_bias.py @@ -9,7 +9,7 @@ from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.exceptions import InvalidExpectationConfigurationError from great_expectations.execution_engine import PandasExecutionEngine -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from great_expectations.expectations.metrics.metric_provider import ( MetricConfiguration, metric_value, @@ -94,7 +94,7 @@ def _get_evaluation_dependencies( # This class defines the Expectation itself # The main business logic for calculation lives here. -class ExpectTableBinaryLabelModelBias(TableExpectation): +class ExpectTableBinaryLabelModelBias(BatchExpectation): """Expect fairness in a model by calculating disparities among features, score (binary or continuous), and a label (binary) in a table using Aequitas. 
Using Aeqitas we evaluate predicted and true values to evaluate certain metrics \ diff --git a/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_linear_feature_importances_to_be.py b/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_linear_feature_importances_to_be.py index 64a1e8e71cd3..3c7b42dfe28c 100644 --- a/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_linear_feature_importances_to_be.py +++ b/contrib/great_expectations_ethical_ai_expectations/great_expectations_ethical_ai_expectations/expectations/expect_table_linear_feature_importances_to_be.py @@ -11,7 +11,7 @@ from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.exceptions import InvalidExpectationConfigurationError from great_expectations.execution_engine import PandasExecutionEngine -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from great_expectations.expectations.metrics.metric_provider import ( MetricConfiguration, metric_value, @@ -73,7 +73,7 @@ def _get_evaluation_dependencies( # This class defines the Expectation itself # The main business logic for calculation lives here. -class ExpectTableLinearFeatureImportancesToBe(TableExpectation): +class ExpectTableLinearFeatureImportancesToBe(BatchExpectation): """Expect Feature Importances of specified columns in table for Linear Regression to meet threshold.""" # These examples will be shown in the public gallery, and also executed as unit tests for your Expectation diff --git a/contrib/time_series_expectations/README.md b/contrib/time_series_expectations/README.md index 110b11b3bbbb..8d0776b4709f 100644 --- a/contrib/time_series_expectations/README.md +++ b/contrib/time_series_expectations/README.md @@ -64,7 +64,7 @@ See the script that creates examples (`assets/generate_test_time_series_data.py` As all of those use cases are realized, we imagine the full class hierarchy for time series Expectations to evolve into this: - *TableExpectation* (ABC) + *BatchExpectation* (ABC) *BatchAggregateStatisticExpectation* (ABC) BatchAggregateStatisticTimeSeriesExpectation (ABC) ExpectBatchAggregateStatisticToMatchProphetDateModel (ABC) diff --git a/contrib/time_series_expectations/docs/README.md b/contrib/time_series_expectations/docs/README.md index e4e900f52fa3..39539cf216ba 100644 --- a/contrib/time_series_expectations/docs/README.md +++ b/contrib/time_series_expectations/docs/README.md @@ -111,7 +111,7 @@ The most important ABCs are [BatchAggregateStatisticTimeSeriesExpectation](link) The full class hiereachy is: - *TableExpectation* (ABC) + *BatchExpectation* (ABC) BatchAggregateStatisticExpectation (ABC) ExpectBatchAggregateStatisticToBeBetween (ABC) expect_batch_update_time_to_be_between diff --git a/contrib/time_series_expectations/docs/working-notes.md b/contrib/time_series_expectations/docs/working-notes.md index d02e0ac8b784..9411a4e61305 100644 --- a/contrib/time_series_expectations/docs/working-notes.md +++ b/contrib/time_series_expectations/docs/working-notes.md @@ -109,9 +109,9 @@ You can learn more about the Expectations in the Expectation gallery, [here](lin The most important ABCs are [BatchAggregateStatisticTimeSeriesExpectation](link), [ColumnAggregateTimeSeriesExpectation](link), and 
[ColumnPairTimeSeriesExpectation](link). They allow time series models to be applied to data in a variety of shapes and formats. Please see the class docstrings for more detailed explanation. -The full class hiereachy is: +The full class hierarchy is: - *TableExpectation* (ABC) + *BatchExpectation* (ABC) BatchAggregateStatisticExpectation (ABC) ExpectBatchAggregateStatisticToBeBetween (ABC) expect_batch_update_time_to_be_between diff --git a/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py b/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py index 34942fb881e8..ba776f2b5907 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/expect_batch_row_count_to_match_prophet_date_model.py @@ -7,16 +7,16 @@ from great_expectations.execution_engine import ( ExecutionEngine, ) -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from time_series_expectations.expectations.prophet_model_deserializer import ( ProphetModelDeserializer, ) -class ExpectBatchRowCountToMatchProphetDateModel(TableExpectation): +class ExpectBatchRowCountToMatchProphetDateModel(BatchExpectation): """This Expectation checks to see if the number of rows in a Batch matches the predictions of a prophet model for a given date. - expect_batch_row_count_to_match_prophet_date_model is a [TableExpectation](https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations) + expect_batch_row_count_to_match_prophet_date_model is a [BatchExpectation](https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations) Args: date (str): diff --git a/docs/docusaurus/docs/contributing/style_guides/code_style.md b/docs/docusaurus/docs/contributing/style_guides/code_style.md index c3229f977bcf..a0a98e861f7b 100644 --- a/docs/docusaurus/docs/contributing/style_guides/code_style.md +++ b/docs/docusaurus/docs/contributing/style_guides/code_style.md @@ -84,7 +84,7 @@ Or [run `mypy`](https://mypy.readthedocs.io/en/stable/running_mypy.html) directl | Base class | prefix | |------------------------------|---------------------------------| | `Expectation` | `expect_...` | -| `TableExpectation` | `expect_table_...` | +| `BatchExpectation` | `expect_table_...` | | `ColumnMapExpectation` | `expect_column_values_...` | | `ColumnAggregateExpectation` | `expect_column_...` | | `ColumnPairMapExpectation` | `expect_column_pair_values...` | diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations.md similarity index 84% rename from docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations.md rename to docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations.md index 7c48a8fd7fb8..fd534e9dbe8f 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations.md +++ 
b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations.md @@ -1,13 +1,13 @@ --- -title: How to create a Custom Table Expectation +title: How to create a Custom Batch Expectation --- import Prerequisites from '../creating_custom_expectations/components/prerequisites.jsx' import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; -**`TableExpectations`** are one of the most common types of . -They are evaluated for an entire table, and answer a semantic question about the table itself. For example, `expect_table_column_count_to_equal` and `expect_table_row_count_to_equal` answer how many columns and rows are in your table. +**`BatchExpectations`** are one of the most common types of . +They are evaluated for an entire Batch, and answer a semantic question about the Batch itself. For example, `expect_table_column_count_to_equal` and `expect_table_row_count_to_equal` answer how many columns and rows are in your Batch. -This guide will walk you through the process of creating your own custom `TableExpectation`. +This guide will walk you through the process of creating your own custom `BatchExpectation`. @@ -19,23 +19,23 @@ This guide will walk you through the process of creating your own custom `TableE ### 1. Choose a name for your Expectation -First, decide on a name for your own Expectation. By convention, `TableExpectations` always start with `expect_table_`. +First, decide on a name for your own Expectation. By convention, `BatchExpectations` always start with `expect_table_`. For more on Expectation naming conventions, see the [Expectations section](../../../contributing/style_guides/code_style.md#expectations) of the Code Style Guide. Your Expectation will have two versions of the same name: a `CamelCaseName` and a `snake_case_name`. For example, this tutorial will use: -- `ExpectTableColumnsToBeUnique` -- `expect_table_columns_to_be_unique` +- `ExpectBatchColumnsToBeUnique` +- `expect_batch_columns_to_be_unique` ### 2. Copy and rename the template file By convention, each Expectation is kept in its own python file, named with the snake_case version of the Expectation's name. -You can find the template file for a custom [TableExpectation here](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/table_expectation_template.py). +You can find the template file for a custom [BatchExpectation here](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/batch_expectation_template.py). Download the file, place it in the appropriate directory, and rename it to the appropriate name. ```bash -cp table_expectation_template.py /SOME_DIRECTORY/expect_table_columns_to_be_unique.py +cp batch_expectation_template.py /SOME_DIRECTORY/expect_batch_columns_to_be_unique.py ```
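As a quick orientation for the renamed walkthrough, the edits described in the hunks below amount to a skeleton roughly like the following. This is a sketch of the intermediate state, not the finished file from the repository; the docstring wording is assumed, and the Metric provider is left to be written in the later steps of the guide.

```python
# expect_batch_columns_to_be_unique.py -- approximate shape after the rename steps below.
from great_expectations.expectations.expectation import BatchExpectation


class ExpectBatchColumnsToBeUnique(BatchExpectation):
    """Expect the columns in a Batch to be unique."""

    # Wired up in the later "metric" step of this guide; the identifier must match
    # the metric_name declared on the corresponding Metric provider class.
    metric_dependencies = ("table.columns.unique",)
    success_keys = ("strict",)

    # examples, _validate(), and library_metadata are added in the remaining steps.


if __name__ == "__main__":
    ExpectBatchColumnsToBeUnique().print_diagnostic_checklist()
```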
    @@ -64,7 +64,7 @@ cp table_expectation_template.py /SOME_DIRECTORY/expect_table_columns_to_be_uniq Once you've copied and renamed the template file, you can execute it as follows. ```bash -python expect_table_columns_to_be_unique.py +python expect_batch_columns_to_be_unique.py ``` The template file is set up so that this will run the Expectation's `print_diagnostic_checklist()` method. This will run a diagnostic script on your new Expectation, and return a checklist of steps to get it to full production readiness. @@ -89,36 +89,36 @@ By convention, your class is defined Let's start by updating your Expectation's name and docstring. Replace the Expectation class name -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py ExpectTableToMeetSomeCriteria class_def" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py ExpectBatchToMeetSomeCriteria class_def" ``` with your real Expectation class name, in upper camel case: -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py ExpectTableColumnsToBeUnique class_def" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py ExpectBatchColumnsToBeUnique class_def" ``` You can also go ahead and write a new one-line docstring, replacing -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py docstring" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py docstring" ``` with something like: -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py docstring" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py docstring" ``` You'll also need to change the class name at the bottom of the file, by replacing this line: -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py diagnostics" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py diagnostics" ``` with this one: -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py diagnostics" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py diagnostics" ``` Later, you can go back and write a more thorough docstring. At this point you can re-run your diagnostic checklist. 
You should see something like this: ``` -$ python expect_table_columns_to_be_unique.py +$ python expect_batch_columns_to_be_unique.py -Completeness checklist for ExpectTableColumnsToBeUnique: +Completeness checklist for ExpectBatchColumnsToBeUnique: ✔ Has a valid library_metadata object ✔ Has a docstring, including a one-line short description Has at least one positive and negative example case, and all test cases pass @@ -138,16 +138,16 @@ Next, we're going to search for `examples = []` in your file, and replace it wit Your examples will look something like this: -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py examples" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py examples" ``` Here's a quick overview of how to create test cases to populate `examples`. The overall structure is a list of dictionaries. Each dictionary has two keys: -* `data`: defines the input data of the example as a table/data frame. In these examples the table has three columns (`col1`, `col2` and `col3`). These columns have 5 rows. (Note: if you define multiple columns, make sure that they have the same number of rows.) +* `data`: defines the input data of the example as a Batch. In these examples the Batch has three columns (`col1`, `col2` and `col3`). These columns have 5 rows. (Note: if you define multiple columns, make sure that they have the same number of rows.) * `tests`: a list of test cases to validate against the data frame defined in the corresponding `data`. * `title` should be a descriptive name for the test case. Make sure to have no spaces. * `include_in_gallery`: This must be set to `True` if you want this test case to be visible in the Gallery as an example. - * `in` contains exactly the parameters that you want to pass in to the Expectation. `"in": {"strict": True}` in the example above is equivalent to `expect_table_columns_to_be_unique(strict=True)` + * `in` contains exactly the parameters that you want to pass in to the Expectation. `"in": {"strict": True}` in the example above is equivalent to `expect_batch_columns_to_be_unique(strict=True)` * `out` is based on the Validation Result returned when executing the Expectation. * `exact_match_out`: if you set `exact_match_out=False`, then you don’t need to include all the elements of the Validation Result object - only the ones that are important to test. @@ -160,9 +160,9 @@ If you run your Expectation file again, you won't see any new checkmarks, as the However, you should see that the tests you've written are now being caught and reported in your checklist: ``` -$ python expect_table_columns_to_be_unique.py +$ python expect_batch_columns_to_be_unique.py -Completeness checklist for ExpectTableColumnsToBeUnique: +Completeness checklist for ExpectBatchColumnsToBeUnique: ✔ Has a valid library_metadata object ✔ Has a docstring, including a one-line short description ... @@ -187,9 +187,9 @@ By the time your Expectation is complete, your Metric will have functions for al Metrics answer questions about your data posed by your Expectation,
    and allow your Expectation to judge whether your data meets ***your*** expectations. ::: -Your Metric function will have the `@metric_value` decorator, with the appropriate `engine`. Metric functions can be as complex as you like, but they're often very short. For example, here's the definition for a Metric function to find the unique columns of a table with the PandasExecutionEngine. +Your Metric function will have the `@metric_value` decorator, with the appropriate `engine`. Metric functions can be as complex as you like, but they're often very short. For example, here's the definition for a Metric function to find the unique columns of a Batch with the PandasExecutionEngine. -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py pandas" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py pandas" ``` :::note @@ -220,22 +220,22 @@ The remainder of the Metric Identifier simply describes what the Metric computes You'll need to substitute this metric into two places in the code. First, in the Metric class, replace -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py metric_name" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py metric_name" ``` with -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py metric_name" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py metric_name" ``` Second, in the Expectation class, replace -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py metric_dependencies" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py metric_dependencies" ``` with -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py metric_dependencies" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py metric_dependencies" ``` It's essential to make sure to use matching Metric Identifier strings across your Metric class and Expectation class. This is how the Expectation knows which Metric to use for its internal logic. @@ -244,12 +244,12 @@ Finally, rename the Metric class name itself, using the camel case version of th For example, replace: -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py TableMeetsSomeCriteria class_def" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py BatchMeetsSomeCriteria class_def" ``` with -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py TableColumnsUnique class_def" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py BatchColumnsUnique class_def" ``` ### 7. 
Validate @@ -258,7 +258,7 @@ In this step, we simply need to validate that the results of our Metrics meet ou The validate method is implemented as `_validate(...)`: -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py validate" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py validate" ``` This method takes a dictionary named `metrics`, which contains all Metrics requested by your Metric dependencies, @@ -267,14 +267,14 @@ and performs a simple validation against your success keys (i.e. important thres To do so, we'll be accessing our success keys, as well as the result of our previously-calculated Metrics. For example, here is the definition of a `_validate(...)` method to validate the results of our `table.columns.unique` Metric against our success keys: -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py validate" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py validate" ``` Running your diagnostic checklist at this point should return something like this: ``` -$ python expect_table_columns_to_be_unique.py +$ python expect_batch_columns_to_be_unique.py -Completeness checklist for ExpectTableColumnsToBeUnique: +Completeness checklist for ExpectBatchColumnsToBeUnique: ✔ Has a valid library_metadata object ✔ Has a docstring, including a one-line short description ✔ Has at least one positive and negative example case, and all test cases pass @@ -301,9 +301,9 @@ If desired, you can automate this to happen at commit time. See our [guidance on Once this is done, running your diagnostic checklist should now reflect your Custom Expectation as meeting our linting requirements: ``` -$ python expect_table_columns_to_be_unique.py +$ python expect_batch_columns_to_be_unique.py -Completeness checklist for ExpectTableColumnsToBeUnique: +Completeness checklist for ExpectBatchColumnsToBeUnique: ✔ Has a valid library_metadata object ✔ Has a docstring, including a one-line short description ✔ Has at least one positive and negative example case, and all test cases pass @@ -324,12 +324,12 @@ This guide will leave you with a Custom Expectation sufficient for [contribution If you plan to contribute your Expectation to the public open source project, you should update the `library_metadata` object before submitting your [Pull Request](https://github.com/great-expectations/great_expectations/pulls). For example: -```python name="tests/integration/docusaurus/expectations/examples/table_expectation_template.py library_metadata" +```python name="tests/integration/docusaurus/expectations/examples/batch_expectation_template.py library_metadata" ``` would become -```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py library_metadata" +```python name="tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py library_metadata" ``` This is particularly important because ***we*** want to make sure that ***you*** get credit for all your hard work! @@ -338,5 +338,5 @@ This is particularly important because ***we*** want to make sure that ***you*** For more information on our code standards and contribution, see our guide on [Levels of Maturity](../../../contributing/contributing_maturity.md#contributing-expectations) for Expectations. 
To view the full script used in this page, see it on GitHub: -- [expect_table_columns_to_be_unique.py](https://github.com/great-expectations/great_expectations/blob/develop/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py) +- [expect_batch_columns_to_be_unique.py](https://github.com/great-expectations/great_expectations/blob/develop/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py) ::: diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md index de83c972f3cc..34bec85972c5 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md @@ -56,7 +56,7 @@ The code to achieve the first four steps looks somewhat different depending on t |-----------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Column Map Expectation](./how_to_create_custom_column_map_expectations.md) | [column_map_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/column_map_expectation_template.py) | | [Column Aggregate Expectation](./how_to_create_custom_column_aggregate_expectations.md) | [column_aggregate_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/column_aggregate_expectation_template.py) | -| [Table Expectation](./how_to_create_custom_table_expectations.md) | [table_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/table_expectation_template.py) | +| [Batch Expectation](./how_to_create_custom_batch_expectations.md) | [table_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/batch_expectation_template.py) | | [Regex-Based Column Map Expectation](./how_to_create_custom_regex_based_column_map_expectations.md) | [regex-based map column_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/regex_based_column_map_expectation_template.py) | | [Set-Based Column Map Expectation](./how_to_create_custom_set_based_column_map_expectations.md) | [set-based map_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/set_based_column_map_expectation_template.py) | diff --git a/docs/docusaurus/docs/guides/expectations/index.md b/docs/docusaurus/docs/guides/expectations/index.md index eb3d55010cfa..c7cfbc6fd386 100644 --- a/docs/docusaurus/docs/guides/expectations/index.md +++ b/docs/docusaurus/docs/guides/expectations/index.md @@ -22,7 +22,7 @@ title: "Create Expectations: Index" - [Overview](../../guides/expectations/creating_custom_expectations/overview.md) - [How to create a Custom Column Aggregate Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations.md) - [How to create a Custom Column Map Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_column_map_expectations.md) -- [How to create a 
Custom Table Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations.md) +- [How to create a Custom Batch Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations.md) - [How to create a Custom Column Pair Map Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_column_pair_map_expectations.md) - [How to create a Custom Multicolumn Map Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_multicolumn_map_expectations.md) - [How to create a Custom Regex-Based Column Map Expectation](../../guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations.md) diff --git a/docs/docusaurus/sidebars.js b/docs/docusaurus/sidebars.js index 814c05cf5af9..fa0958bf9da3 100644 --- a/docs/docusaurus/sidebars.js +++ b/docs/docusaurus/sidebars.js @@ -342,7 +342,7 @@ module.exports = { 'guides/expectations/creating_custom_expectations/overview', 'guides/expectations/creating_custom_expectations/how_to_create_custom_column_aggregate_expectations', 'guides/expectations/creating_custom_expectations/how_to_create_custom_column_map_expectations', - 'guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations', + 'guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations', 'guides/expectations/creating_custom_expectations/how_to_create_custom_column_pair_map_expectations', 'guides/expectations/creating_custom_expectations/how_to_create_custom_multicolumn_map_expectations', 'guides/expectations/creating_custom_expectations/how_to_create_custom_regex_based_column_map_expectations', diff --git a/examples/expectations/table_expectation_template.py b/examples/expectations/batch_expectation_template.py similarity index 91% rename from examples/expectations/table_expectation_template.py rename to examples/expectations/batch_expectation_template.py index adb042ffe0e5..0d69f5d570cd 100644 --- a/examples/expectations/table_expectation_template.py +++ b/examples/expectations/batch_expectation_template.py @@ -1,7 +1,7 @@ """ -This is a template for creating custom TableExpectations. +This is a template for creating custom BatchExpectations. For detailed instructions on how to use it, please see: - https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations + https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations """ from typing import Dict, Optional @@ -14,7 +14,7 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from great_expectations.expectations.metrics.metric_provider import ( MetricConfiguration, metric_value, @@ -25,8 +25,8 @@ # This class defines a Metric to support your Expectation. -# For most ColumnExpectations, the main business logic for calculation will live in this class. -class TableMeetsSomeCriteria(TableMetricProvider): +# For most BatchExpectations, the main business logic for calculation will live in this class. +class BatchMeetsSomeCriteria(TableMetricProvider): # This is the id string that will be used to reference your Metric. 
metric_name = "METRIC NAME GOES HERE" @@ -82,7 +82,7 @@ def _get_evaluation_dependencies( # This class defines the Expectation itself # The main business logic for calculation lives here. -class ExpectTableToMeetSomeCriteria(TableExpectation): +class ExpectBatchToMeetSomeCriteria(BatchExpectation): """TODO: add a docstring here""" # These examples will be shown in the public gallery. @@ -146,4 +146,4 @@ def _validate( if __name__ == "__main__": - ExpectTableToMeetSomeCriteria().print_diagnostic_checklist() + ExpectBatchToMeetSomeCriteria().print_diagnostic_checklist() diff --git a/great_expectations/expectations/core/expect_column_bootstrapped_ks_test_p_value_to_be_greater_than.py b/great_expectations/expectations/core/expect_column_bootstrapped_ks_test_p_value_to_be_greater_than.py index 37897f939e56..15ac4d932f6b 100644 --- a/great_expectations/expectations/core/expect_column_bootstrapped_ks_test_p_value_to_be_greater_than.py +++ b/great_expectations/expectations/core/expect_column_bootstrapped_ks_test_p_value_to_be_greater_than.py @@ -5,14 +5,14 @@ ExpectationValidationResult, # noqa: TCH001 ) from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyDiagnosticRendererType, LegacyRendererType from great_expectations.render.renderer.renderer import renderer -class ExpectColumnBootstrappedKsTestPValueToBeGreaterThan(TableExpectation): +class ExpectColumnBootstrappedKsTestPValueToBeGreaterThan(BatchExpectation): # This expectation is a stub - it needs migration to the modular expectation API # This dictionary contains metadata for display in the public gallery diff --git a/great_expectations/expectations/core/expect_column_chisquare_test_p_value_to_be_greater_than.py b/great_expectations/expectations/core/expect_column_chisquare_test_p_value_to_be_greater_than.py index e1a95fff85e9..b5d69241d1a3 100644 --- a/great_expectations/expectations/core/expect_column_chisquare_test_p_value_to_be_greater_than.py +++ b/great_expectations/expectations/core/expect_column_chisquare_test_p_value_to_be_greater_than.py @@ -5,14 +5,14 @@ ExpectationValidationResult, # noqa: TCH001 ) from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyDiagnosticRendererType, LegacyRendererType from great_expectations.render.renderer.renderer import renderer -class ExpectColumnChiSquareTestPValueToBeGreaterThan(TableExpectation): +class ExpectColumnChiSquareTestPValueToBeGreaterThan(BatchExpectation): # This expectation is a stub - it needs migration to the modular expectation API # This dictionary contains metadata for display in the public gallery diff --git a/great_expectations/expectations/core/expect_column_pair_cramers_phi_value_to_be_less_than.py b/great_expectations/expectations/core/expect_column_pair_cramers_phi_value_to_be_less_than.py index cd0ab86f1080..a933c48ed260 100644 --- a/great_expectations/expectations/core/expect_column_pair_cramers_phi_value_to_be_less_than.py +++ b/great_expectations/expectations/core/expect_column_pair_cramers_phi_value_to_be_less_than.py @@ -5,7 +5,7 @@ ExpectationValidationResult, # noqa: TCH001 ) from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -25,7 +25,7 @@ from 
great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnPairCramersPhiValueToBeLessThan(TableExpectation): +class ExpectColumnPairCramersPhiValueToBeLessThan(BatchExpectation): # This dictionary contains metadata for display in the public gallery library_metadata = { diff --git a/great_expectations/expectations/core/expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than.py b/great_expectations/expectations/core/expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than.py index a54fe8e6946a..cae33dfa220b 100644 --- a/great_expectations/expectations/core/expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than.py +++ b/great_expectations/expectations/core/expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than.py @@ -5,7 +5,7 @@ ExpectationValidationResult, # noqa: TCH001 ) from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyDiagnosticRendererType, LegacyRendererType @@ -13,7 +13,7 @@ class ExpectColumnParameterizedDistributionKsTestPValueToBeGreaterThan( - TableExpectation + BatchExpectation ): # This expectation is a stub - it needs migration to the modular expectation API diff --git a/great_expectations/expectations/core/expect_column_to_exist.py b/great_expectations/expectations/core/expect_column_to_exist.py index 6c8da0ce4611..74b97753d1c4 100644 --- a/great_expectations/expectations/core/expect_column_to_exist.py +++ b/great_expectations/expectations/core/expect_column_to_exist.py @@ -7,8 +7,8 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( + BatchExpectation, InvalidExpectationConfigurationError, - TableExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -23,7 +23,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectColumnToExist(TableExpectation): +class ExpectColumnToExist(BatchExpectation): """Expect the specified column to exist. expect_column_to_exist is a \ diff --git a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py index 836aa2640af3..0b148adbe5cb 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py @@ -465,10 +465,10 @@ def get_validation_dependencies( runtime_configuration: Optional[dict] = None, **kwargs, ) -> ValidationDependencies: - # This calls TableExpectation.get_validation_dependencies to set baseline validation_dependencies for the aggregate version + # This calls BatchExpectation.get_validation_dependencies to set baseline validation_dependencies for the aggregate version # of the expectation. # We need to keep this as super(ColumnMapExpectation, self), which calls - # TableExpectation.get_validation_dependencies instead of ColumnMapExpectation.get_validation_dependencies. + # BatchExpectation.get_validation_dependencies instead of ColumnMapExpectation.get_validation_dependencies. 
# This is because the map version of this expectation is only supported for Pandas, so we want the aggregate # version for the other backends. validation_dependencies: ValidationDependencies = super( diff --git a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py index 5f360c1fea3f..aca1fe3e0a41 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py @@ -445,10 +445,10 @@ def get_validation_dependencies( runtime_configuration: Optional[dict] = None, **kwargs, ) -> ValidationDependencies: - # This calls TableExpectation.get_validation_dependencies to set baseline validation_dependencies for the aggregate version + # This calls BatchExpectation.get_validation_dependencies to set baseline validation_dependencies for the aggregate version # of the expectation. # We need to keep this as super(ColumnMapExpectation, self), which calls - # TableExpectation.get_validation_dependencies instead of ColumnMapExpectation.get_validation_dependencies. + # BatchExpectation.get_validation_dependencies instead of ColumnMapExpectation.get_validation_dependencies. # This is because the map version of this expectation is only supported for Pandas, so we want the aggregate # version for the other backends. validation_dependencies: ValidationDependencies = super( diff --git a/great_expectations/expectations/core/expect_table_column_count_to_be_between.py b/great_expectations/expectations/core/expect_table_column_count_to_be_between.py index 7164681b7819..940542fb3b92 100644 --- a/great_expectations/expectations/core/expect_table_column_count_to_be_between.py +++ b/great_expectations/expectations/core/expect_table_column_count_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -25,7 +25,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectTableColumnCountToBeBetween(TableExpectation): +class ExpectTableColumnCountToBeBetween(BatchExpectation): """Expect the number of columns to be between two values. 
expect_table_column_count_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_table_column_count_to_equal.py b/great_expectations/expectations/core/expect_table_column_count_to_equal.py index 616f7783dcd6..8f6d6c3919f1 100644 --- a/great_expectations/expectations/core/expect_table_column_count_to_equal.py +++ b/great_expectations/expectations/core/expect_table_column_count_to_equal.py @@ -7,8 +7,8 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( + BatchExpectation, InvalidExpectationConfigurationError, - TableExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -20,7 +20,7 @@ from great_expectations.render.util import substitute_none_for_missing -class ExpectTableColumnCountToEqual(TableExpectation): +class ExpectTableColumnCountToEqual(BatchExpectation): """Expect the number of columns to equal a value. expect_table_column_count_to_equal is a \ diff --git a/great_expectations/expectations/core/expect_table_columns_to_match_ordered_list.py b/great_expectations/expectations/core/expect_table_columns_to_match_ordered_list.py index ca8b3a6815ae..a4f0991aadca 100644 --- a/great_expectations/expectations/core/expect_table_columns_to_match_ordered_list.py +++ b/great_expectations/expectations/core/expect_table_columns_to_match_ordered_list.py @@ -8,8 +8,8 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( + BatchExpectation, InvalidExpectationConfigurationError, - TableExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -21,7 +21,7 @@ from great_expectations.render.util import substitute_none_for_missing -class ExpectTableColumnsToMatchOrderedList(TableExpectation): +class ExpectTableColumnsToMatchOrderedList(BatchExpectation): """Expect the columns to exactly match a specified list. expect_table_columns_to_match_ordered_list is a \ diff --git a/great_expectations/expectations/core/expect_table_columns_to_match_set.py b/great_expectations/expectations/core/expect_table_columns_to_match_set.py index ccf20febca8c..9a43abf9ecdf 100644 --- a/great_expectations/expectations/core/expect_table_columns_to_match_set.py +++ b/great_expectations/expectations/core/expect_table_columns_to_match_set.py @@ -8,7 +8,7 @@ from great_expectations.exceptions import InvalidExpectationConfigurationError from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -35,7 +35,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectTableColumnsToMatchSet(TableExpectation): +class ExpectTableColumnsToMatchSet(BatchExpectation): """Expect the columns to match an unordered set. 
expect_table_columns_to_match_set is a \ diff --git a/great_expectations/expectations/core/expect_table_row_count_to_be_between.py b/great_expectations/expectations/core/expect_table_row_count_to_be_between.py index 1d42a3b4ff4d..6900e55aa7ab 100644 --- a/great_expectations/expectations/core/expect_table_row_count_to_be_between.py +++ b/great_expectations/expectations/core/expect_table_row_count_to_be_between.py @@ -7,7 +7,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -37,7 +37,7 @@ from great_expectations.render.renderer_configuration import AddParamArgs -class ExpectTableRowCountToBeBetween(TableExpectation): +class ExpectTableRowCountToBeBetween(BatchExpectation): """Expect the number of rows to be between two values. expect_table_row_count_to_be_between is a \ diff --git a/great_expectations/expectations/core/expect_table_row_count_to_equal.py b/great_expectations/expectations/core/expect_table_row_count_to_equal.py index 9f149928ee72..c5e43dc3191a 100644 --- a/great_expectations/expectations/core/expect_table_row_count_to_equal.py +++ b/great_expectations/expectations/core/expect_table_row_count_to_equal.py @@ -7,8 +7,8 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( + BatchExpectation, InvalidExpectationConfigurationError, - TableExpectation, render_evaluation_parameter_string, ) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent @@ -20,7 +20,7 @@ from great_expectations.render.util import substitute_none_for_missing -class ExpectTableRowCountToEqual(TableExpectation): +class ExpectTableRowCountToEqual(BatchExpectation): """Expect the number of rows to equal a value. expect_table_row_count_to_equal is a \ diff --git a/great_expectations/expectations/core/expect_table_row_count_to_equal_other_table.py b/great_expectations/expectations/core/expect_table_row_count_to_equal_other_table.py index 998bbe770284..f4872e3e0e7b 100644 --- a/great_expectations/expectations/core/expect_table_row_count_to_equal_other_table.py +++ b/great_expectations/expectations/core/expect_table_row_count_to_equal_other_table.py @@ -8,7 +8,7 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 from great_expectations.expectations.expectation import ( - TableExpectation, + BatchExpectation, render_evaluation_parameter_string, ) from great_expectations.render import ( @@ -30,7 +30,7 @@ ) -class ExpectTableRowCountToEqualOtherTable(TableExpectation): +class ExpectTableRowCountToEqualOtherTable(BatchExpectation): """Expect the number of rows to equal the number in another table. expect_table_row_count_to_equal_other_table is a \ diff --git a/great_expectations/expectations/expectation.py b/great_expectations/expectations/expectation.py index d72a7f7f7799..d8ec6ddf5db3 100644 --- a/great_expectations/expectations/expectation.py +++ b/great_expectations/expectations/expectation.py @@ -304,7 +304,7 @@ class Expectation(metaclass=MetaExpectation): 2. 
`success_keys`: a tuple of the *keys* used to determine the success of the expectation. - In some cases, subclasses of Expectation (such as TableExpectation) can + In some cases, subclasses of Expectation (such as BatchExpectation) can inherit these properties from their parent class. They *may* optionally override `runtime_keys` and `default_kwarg_values`, and @@ -2241,18 +2241,18 @@ def _get_maturity_checklist( @public_api -class TableExpectation(Expectation, ABC): - """Base class for TableExpectations. +class BatchExpectation(Expectation, ABC): + """Base class for BatchExpectations. - TableExpectations answer a semantic question about the table itself. + BatchExpectations answer a semantic question about a Batch of data. For example, `expect_table_column_count_to_equal` and `expect_table_row_count_to_equal` answer how many columns and rows are in your table. - TableExpectations must implement a `_validate(...)` method containing logic + BatchExpectations must implement a `_validate(...)` method containing logic for determining whether the Expectation is successfully validated. - TableExpectations may optionally provide implementations of `validate_configuration`, + BatchExpectations may optionally provide implementations of `validate_configuration`, which should raise an error if the configuration will not be usable for the Expectation. Raises: @@ -2433,7 +2433,33 @@ def _validate_metric_value_between( # noqa: C901 - 21 @public_api -class QueryExpectation(TableExpectation, ABC): +class TableExpectation(BatchExpectation, ABC): + """Base class for TableExpectations. + + WARNING: TableExpectation will be deprecated in a future release. Please use BatchExpectation instead. + + TableExpectations answer a semantic question about the table itself. + + For example, `expect_table_column_count_to_equal` and `expect_table_row_count_to_equal` answer + how many columns and rows are in your table. + + TableExpectations must implement a `_validate(...)` method containing logic + for determining whether the Expectation is successfully validated. + + TableExpectations may optionally provide implementations of `validate_configuration`, + which should raise an error if the configuration will not be usable for the Expectation. + + Raises: + InvalidExpectationConfigurationError: The configuration does not contain the values required by the Expectation. + + Args: + domain_keys (tuple): A tuple of the keys used to determine the domain of the + expectation. + """ + + +@public_api +class QueryExpectation(BatchExpectation, ABC): """Base class for QueryExpectations. QueryExpectations facilitate the execution of SQL or Spark-SQL queries as the core logic for an Expectation. @@ -2542,7 +2568,7 @@ def validate_configuration( @public_api -class ColumnAggregateExpectation(TableExpectation, ABC): +class ColumnAggregateExpectation(BatchExpectation, ABC): """Base class for column aggregate Expectations. These types of Expectation produce an aggregate metric for a column, such as the mean, standard deviation, @@ -2613,7 +2639,7 @@ class ColumnExpectation(ColumnAggregateExpectation, ABC): @public_api -class ColumnMapExpectation(TableExpectation, ABC): +class ColumnMapExpectation(BatchExpectation, ABC): """Base class for ColumnMapExpectations. ColumnMapExpectations are evaluated for a column and ask a yes/no question about every row in the column. 
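To make the effect of this rename concrete (the Expectation class below is hypothetical and not part of this patch): new Expectations can inherit from `BatchExpectation` directly, while existing code that subclasses `TableExpectation` keeps working because `TableExpectation` is retained as a deprecated subclass of `BatchExpectation`.

```python
from great_expectations.expectations.expectation import BatchExpectation


class ExpectBatchRowCountToBePositive(BatchExpectation):
    """Hypothetical example: succeed when the Batch contains at least one row."""

    metric_dependencies = ("table.row_count",)
    success_keys = ()

    def _validate(
        self,
        configuration,
        metrics,
        runtime_configuration=None,
        execution_engine=None,
    ):
        row_count = metrics.get("table.row_count")
        return {"success": row_count > 0, "result": {"observed_value": row_count}}
```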
@@ -2898,7 +2924,7 @@ def _validate( @public_api -class ColumnPairMapExpectation(TableExpectation, ABC): +class ColumnPairMapExpectation(BatchExpectation, ABC): """Base class for ColumnPairMapExpectations. ColumnPairMapExpectations are evaluated for a pair of columns and ask a yes/no question about the row-wise @@ -3164,7 +3190,7 @@ def _validate( @public_api -class MulticolumnMapExpectation(TableExpectation, ABC): +class MulticolumnMapExpectation(BatchExpectation, ABC): """Base class for MulticolumnMapExpectations. MulticolumnMapExpectations are evaluated for a set of columns and ask a yes/no question about the diff --git a/tests/data_context/test_data_context.py b/tests/data_context/test_data_context.py index 1d711bb50bb3..cadf248bd1bf 100644 --- a/tests/data_context/test_data_context.py +++ b/tests/data_context/test_data_context.py @@ -41,7 +41,7 @@ SimpleSqlalchemyDatasource, ) from great_expectations.datasource.types.batch_kwargs import PathBatchKwargs -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from great_expectations.render import ( AtomicPrescriptiveRendererType, AtomicRendererType, @@ -2867,7 +2867,7 @@ def test_check_for_usage_stats_sync_short_circuits_due_to_disabled_usage_stats( assert res is False -class ExpectSkyToBeColor(TableExpectation): +class ExpectSkyToBeColor(BatchExpectation): metric_dependencies = ("table.color",) success_keys = ("color",) args_keys = ("color",) diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py similarity index 85% rename from tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py rename to tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py index a7acbd9e84ca..d712a4d27c52 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_table_columns_to_be_unique.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_batch_columns_to_be_unique.py @@ -1,5 +1,5 @@ """ -This is a template for creating custom TableExpectations. +This is a template for creating custom BatchExpectations. For detailed instructions on how to use it, please see: https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations """ @@ -10,7 +10,7 @@ from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.exceptions import InvalidExpectationConfigurationError from great_expectations.execution_engine import ExecutionEngine, PandasExecutionEngine -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.expectations.metrics.table_metric_provider import ( TableMetricProvider, @@ -19,17 +19,17 @@ # This class defines a Metric to support your Expectation. -# -class TableColumnsUnique(TableMetricProvider): +# +class BatchColumnsUnique(TableMetricProvider): # # This is the id string that will be used to reference your Metric. 
- # + # metric_name = "table.columns.unique" # # This method implements the core logic for the PandasExecutionEngine - # + # @metric_value(engine=PandasExecutionEngine) def _pandas( cls, @@ -85,18 +85,19 @@ def _get_evaluation_dependencies( } -# -class ExpectTableColumnsToBeUnique(TableExpectation): +# +class ExpectBatchColumnsToBeUnique(BatchExpectation): # - # - """Expect table to contain columns with unique contents.""" + # + """Expect batch to contain columns with unique contents.""" # # These examples will be shown in the public gallery. # They will also be executed as unit tests for your Expectation. - # + # examples = [ { + "dataset_name": "expect_batch_columns_to_be_unique_1", "data": { "col1": [1, 2, 3, 4, 5], "col2": [2, 3, 4, 5, 6], @@ -113,6 +114,7 @@ class ExpectTableColumnsToBeUnique(TableExpectation): ], }, { + "dataset_name": "expect_batch_columns_to_be_unique_2", "data": { "col1": [1, 2, 3, 4, 5], "col2": [1, 2, 3, 4, 5], @@ -138,7 +140,7 @@ class ExpectTableColumnsToBeUnique(TableExpectation): ] # # This is a tuple consisting of all Metrics necessary to evaluate the Expectation. - # + # metric_dependencies = ("table.columns.unique", "table.columns") # @@ -176,7 +178,7 @@ def validate_configuration( raise InvalidExpectationConfigurationError(str(e)) # This method performs a validation of your metrics against your success keys, returning a dict indicating the success or failure of the Expectation. - # + # def _validate( self, configuration: ExpectationConfiguration, @@ -185,15 +187,15 @@ def _validate( execution_engine: ExecutionEngine = None, ): unique_columns = metrics.get("table.columns.unique") - table_columns = metrics.get("table.columns") + batch_columns = metrics.get("table.columns") strict = configuration.kwargs.get("strict") - duplicate_columns = unique_columns.symmetric_difference(table_columns) + duplicate_columns = unique_columns.symmetric_difference(batch_columns) if strict is True: success = len(duplicate_columns) == 0 else: - success = len(duplicate_columns) < len(table_columns) + success = len(duplicate_columns) < len(batch_columns) return { "success": success, @@ -202,7 +204,7 @@ def _validate( # # This dictionary contains metadata for display in the public gallery - # + # library_metadata = { "tags": ["uniqueness"], "contributors": ["@joegargery"], @@ -211,13 +213,13 @@ def _validate( if __name__ == "__main__": - # - ExpectTableColumnsToBeUnique().print_diagnostic_checklist() + # + ExpectBatchColumnsToBeUnique().print_diagnostic_checklist() # # Note to users: code below this line is only for integration testing -- ignore! -diagnostics = ExpectTableColumnsToBeUnique().run_diagnostics() +diagnostics = ExpectBatchColumnsToBeUnique().run_diagnostics() for check in diagnostics["tests"]: assert check["test_passed"] is True diff --git a/tests/integration/docusaurus/expectations/examples/table_expectation_template.py b/tests/integration/docusaurus/expectations/examples/batch_expectation_template.py similarity index 85% rename from tests/integration/docusaurus/expectations/examples/table_expectation_template.py rename to tests/integration/docusaurus/expectations/examples/batch_expectation_template.py index 8f79f38a7c81..d90f607e2f14 100644 --- a/tests/integration/docusaurus/expectations/examples/table_expectation_template.py +++ b/tests/integration/docusaurus/expectations/examples/batch_expectation_template.py @@ -1,7 +1,7 @@ """ -This is a template for creating custom TableExpectations. +This is a template for creating custom BatchExpectations. 
For detailed instructions on how to use it, please see: - https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_table_expectations + https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/how_to_create_custom_batch_expectations """ from typing import Dict, Optional @@ -14,7 +14,7 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.expectation import TableExpectation +from great_expectations.expectations.expectation import BatchExpectation from great_expectations.expectations.metrics.metric_provider import ( MetricConfiguration, metric_value, @@ -25,18 +25,18 @@ # This class defines a Metric to support your Expectation. -# For most TableExpectations, the main business logic for calculation will live in this class. -# -class TableMeetsSomeCriteria(TableMetricProvider): +# For most BatchExpectations, the main business logic for calculation will live in this class. +# +class BatchMeetsSomeCriteria(TableMetricProvider): # # This is the id string that will be used to reference your Metric. - # + # metric_name = "METRIC NAME GOES HERE" # # This method implements the core logic for the PandasExecutionEngine - # + # @metric_value(engine=PandasExecutionEngine) def _pandas( cls, @@ -89,21 +89,21 @@ def _get_evaluation_dependencies( # This class defines the Expectation itself # The main business logic for calculation lives here. -# -class ExpectTableToMeetSomeCriteria(TableExpectation): +# +class ExpectBatchToMeetSomeCriteria(BatchExpectation): # - # + # """TODO: add a docstring here""" # # These examples will be shown in the public gallery. # They will also be executed as unit tests for your Expectation. - # + # examples = [] # # This is a tuple consisting of all Metrics necessary to evaluate the Expectation. - # + # metric_dependencies = ("METRIC NAME GOES HERE",) # @@ -142,7 +142,7 @@ def validate_configuration( # raise InvalidExpectationConfigurationError(str(e)) # This method performs a validation of your metrics against your success keys, returning a dict indicating the success or failure of the Expectation. - # + # def _validate( self, configuration: ExpectationConfiguration, @@ -154,7 +154,7 @@ def _validate( raise NotImplementedError # This object contains metadata for display in the public Gallery - # + # library_metadata = { "tags": [], # Tags for this Expectation in the Gallery "contributors": [ # Github handles for all contributors to this Expectation. 
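For orientation, once this template is filled in, a single entry in its (currently empty) `examples` list generally takes a shape like the sketch below; the dataset name, data, test title, and kwargs are invented placeholders modeled loosely on the `expect_batch_columns_to_be_unique` example shown earlier in this patch.

```python
# Hypothetical filled-in entry for the template's `examples` list.
examples = [
    {
        "dataset_name": "my_expectation_example_1",  # placeholder dataset name
        "data": {
            "col1": [1, 2, 3, 4, 5],
            "col2": [2, 3, 4, 5, 6],
        },
        "tests": [
            {
                "title": "basic_positive_test",
                "exact_match_out": False,
                "include_in_gallery": True,
                "in": {"strict": True},
                "out": {"success": True},
            }
        ],
    }
]
```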
@@ -167,6 +167,6 @@ def _validate( if __name__ == "__main__": - # - ExpectTableToMeetSomeCriteria().print_diagnostic_checklist() + # + ExpectBatchToMeetSomeCriteria().print_diagnostic_checklist() # From 3f66ac3b4c8fd2f3641f8cbd6b05c90c1d3c85b1 Mon Sep 17 00:00:00 2001 From: Anthony Burdi Date: Wed, 12 Apr 2023 14:14:34 -0400 Subject: [PATCH 84/96] [MAINTENANCE] Explicitly test relevant modules in Sqlalchemy compatibility pipeline (#7613) --- ci/azure-pipelines-sqlalchemy-compatibility.yml | 7 ++----- pyproject.toml | 3 ++- tests/data_context/store/test_database_store_backend.py | 2 ++ .../test_sqlalchemy_execution_engine_sampling.py | 2 ++ .../test_sqlalchemy_execution_engine_splitting.py | 2 ++ tests/execution_engine/test_sqlalchemy_batch_data.py | 2 ++ tests/execution_engine/test_sqlalchemy_execution_engine.py | 2 ++ tests/test_definitions/test_expectations_v3_api.py | 2 ++ 8 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ci/azure-pipelines-sqlalchemy-compatibility.yml b/ci/azure-pipelines-sqlalchemy-compatibility.yml index b05c8d8cae9c..0543250759e0 100644 --- a/ci/azure-pipelines-sqlalchemy-compatibility.yml +++ b/ci/azure-pipelines-sqlalchemy-compatibility.yml @@ -112,9 +112,6 @@ stages: # sqlalchemy minor versions that we support. # (versions as semver major.minor.patch) matrix: - # Uncomment if we need 1.3.x verification - # sqlalchemy_1_3_x: - # sqlalchemy_base_version: '1.3.0' sqlalchemy_1_4_x: sqlalchemy_base_version: '1.4.0' # Uncomment when we are compatible with 2.0.x. @@ -136,11 +133,11 @@ stages: pip install --constraint constraints-dev-temp.txt ".[dev]" pytest-azurepipelines displayName: 'Install dependencies using SQLAlchemy base version $(sqlalchemy_base_version)' - # TODO: Currently the below test only runs expectations tests for postgresql. We should figure out what the + # TODO: Currently the below test only runs tests with postgresql. We should figure out what the # TODO: best way to test for sqlalchemy version compatibility and implement that here. - script: | # Run pytest - pytest --postgresql tests/test_definitions/test_expectations_v3_api.py + pytest --postgresql -m sqlalchemy_version_compatibility pytest --postgresql -m unit displayName: 'pytest' diff --git a/pyproject.toml b/pyproject.toml index 5fc10cfac6d5..ef0f5083a23d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -471,7 +471,8 @@ markers = [ "integration: mark test as an integration test.", "slow: mark tests taking longer than 1 second.", "unit: mark a test as a unit test.", - "v2_api: mark test as specific to the v2 api (e.g. pre Data Connectors)", + "v2_api: mark test as specific to the v2 api (e.g. 
pre Data Connectors).", + "sqlalchemy_version_compatibility: mark test as required for sqlalchemy version compatibility.", ] testpaths = "tests" # use `pytest-mock` drop-in replacement for `unittest.mock` diff --git a/tests/data_context/store/test_database_store_backend.py b/tests/data_context/store/test_database_store_backend.py index e57cdbcbed28..f64ba215a2f4 100644 --- a/tests/data_context/store/test_database_store_backend.py +++ b/tests/data_context/store/test_database_store_backend.py @@ -8,6 +8,8 @@ from great_expectations.data_context.util import instantiate_class_from_config from great_expectations.exceptions import StoreBackendError +pytestmark = pytest.mark.sqlalchemy_version_compatibility + @pytest.mark.integration def test_database_store_backend_schema_spec(caplog, sa, test_backends): diff --git a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py index 3e5dd81331ef..6a34f889a594 100644 --- a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py +++ b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_sampling.py @@ -29,6 +29,8 @@ except ImportError: sqlalchemy = None +pytestmark = pytest.mark.sqlalchemy_version_compatibility + @pytest.mark.parametrize( "underscore_prefix", diff --git a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py index 27ce15860028..4f21691dfce1 100644 --- a/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py +++ b/tests/execution_engine/split_and_sample/test_sqlalchemy_execution_engine_splitting.py @@ -56,6 +56,8 @@ ) ] +pytestmark = pytest.mark.sqlalchemy_version_compatibility + @mock.patch( "great_expectations.execution_engine.split_and_sample.sqlalchemy_data_splitter.SqlAlchemyDataSplitter.split_on_date_parts" diff --git a/tests/execution_engine/test_sqlalchemy_batch_data.py b/tests/execution_engine/test_sqlalchemy_batch_data.py index 31ea607c9bd0..7d996861b9a1 100644 --- a/tests/execution_engine/test_sqlalchemy_batch_data.py +++ b/tests/execution_engine/test_sqlalchemy_batch_data.py @@ -22,6 +22,8 @@ from tests.sqlalchemy_test_doubles import MockSaEngine, Dialect +pytestmark = pytest.mark.sqlalchemy_version_compatibility + def test_instantiation_with_table_name(sqlite_view_engine): execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine( diff --git a/tests/execution_engine/test_sqlalchemy_execution_engine.py b/tests/execution_engine/test_sqlalchemy_execution_engine.py index c1f666b1e9b6..d5822ce46e7f 100644 --- a/tests/execution_engine/test_sqlalchemy_execution_engine.py +++ b/tests/execution_engine/test_sqlalchemy_execution_engine.py @@ -47,6 +47,8 @@ except ImportError: sqlalchemy = None +pytestmark = pytest.mark.sqlalchemy_version_compatibility + def test_instantiation_via_connection_string(sa, test_db_connection_string): my_execution_engine = SqlAlchemyExecutionEngine( diff --git a/tests/test_definitions/test_expectations_v3_api.py b/tests/test_definitions/test_expectations_v3_api.py index 99b0c9db6c66..d8d269617f00 100644 --- a/tests/test_definitions/test_expectations_v3_api.py +++ b/tests/test_definitions/test_expectations_v3_api.py @@ -29,6 +29,8 @@ from great_expectations.util import build_in_memory_runtime_context from tests.conftest import build_test_backends_list_v3_api +pytestmark = 
pytest.mark.sqlalchemy_version_compatibility + def pytest_generate_tests(metafunc): # noqa C901 - 35 # Load all the JSON files in the directory From 6bcb1a43c20e74a436a0ebd48cf0f6204866ec7e Mon Sep 17 00:00:00 2001 From: Nathan Farmer Date: Wed, 12 Apr 2023 17:10:09 -0400 Subject: [PATCH 85/96] [DOCS] Add scripts under test for "How to create and edit Expectations with instant feedback from a sample Batch of data" (#7615) --- .../datasource/fluent/file_path_data_asset.py | 13 ++++ .../datasource/fluent/pandas_datasource.py | 28 +++++--- .../fluent/schemas/PandasDatasource.json | 2 +- .../datasource/fluent/spark_datasource.py | 19 +++--- .../datasource/fluent/sql_datasource.py | 1 + ...ions_with_instant_feedback_block_config.py | 68 +++++++++++++++++++ ...pectations_with_instant_feedback_fluent.py | 47 +++++++++++++ tests/integration/test_script_runner.py | 10 +++ 8 files changed, 165 insertions(+), 23 deletions(-) create mode 100644 tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_block_config.py create mode 100644 tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py diff --git a/great_expectations/datasource/fluent/file_path_data_asset.py b/great_expectations/datasource/fluent/file_path_data_asset.py index 4950fe079962..20bde9196b0b 100644 --- a/great_expectations/datasource/fluent/file_path_data_asset.py +++ b/great_expectations/datasource/fluent/file_path_data_asset.py @@ -19,6 +19,7 @@ import pydantic import great_expectations.exceptions as gx_exceptions +from great_expectations.core._docs_decorators import public_api from great_expectations.datasource.fluent.constants import MATCH_ALL_PATTERN from great_expectations.datasource.fluent.data_asset.data_connector import ( FILE_PATH_BATCH_SPEC_KEY, @@ -130,9 +131,21 @@ def batch_request_options( """ return tuple(self._all_group_names) + (FILE_PATH_BATCH_SPEC_KEY,) + @public_api def build_batch_request( self, options: Optional[BatchRequestOptions] = None ) -> BatchRequest: + """A batch request that can be used to obtain batches for this DataAsset. + + Args: + options: A dict that can be used to limit the number of batches returned from the asset. + The dict structure depends on the asset type. The available keys for dict can be obtained by + calling batch_request_options. + + Returns: + A BatchRequest object that can be used to obtain a batch list from a Datasource by calling the + get_batch_list_from_batch_request method. 
+ """ if options: for option, value in options.items(): if ( diff --git a/great_expectations/datasource/fluent/pandas_datasource.py b/great_expectations/datasource/fluent/pandas_datasource.py index 750e7c9a2cb2..1ff66b7e658f 100644 --- a/great_expectations/datasource/fluent/pandas_datasource.py +++ b/great_expectations/datasource/fluent/pandas_datasource.py @@ -28,6 +28,7 @@ from typing_extensions import Literal import great_expectations.exceptions as gx_exceptions +from great_expectations.core._docs_decorators import public_api from great_expectations.core.batch_spec import PandasBatchSpec, RuntimeDataBatchSpec from great_expectations.datasource.fluent.constants import ( _DATA_CONNECTOR_NAME, @@ -63,7 +64,6 @@ from great_expectations.datasource.fluent.interfaces import ( BatchMetadata, - BatchRequestOptions, ) from great_expectations.execution_engine import PandasExecutionEngine from great_expectations.validator.validator import Validator @@ -168,17 +168,14 @@ def get_batch_list_from_batch_request( ) return batch_list - def build_batch_request( - self, options: Optional[BatchRequestOptions] = None - ) -> BatchRequest: - if options: - actual_keys = set(options.keys()) - raise gx_exceptions.InvalidBatchRequestError( - "Data Assets associated with PandasDatasource can only contain a single batch,\n" - "therefore BatchRequest options cannot be supplied. BatchRequest options with keys:\n" - f"{actual_keys}\nwere passed.\n" - ) + @public_api + def build_batch_request(self) -> BatchRequest: # type: ignore[override] + """A batch request that can be used to obtain batches for this DataAsset. + Returns: + A BatchRequest object that can be used to obtain a batch list from a Datasource by calling the + get_batch_list_from_batch_request method. + """ return BatchRequest( datasource_name=self.datasource.name, data_asset_name=self.name, @@ -495,7 +492,16 @@ def json( _DYNAMIC_ASSET_TYPES = list(_PANDAS_ASSET_MODELS.values()) +@public_api class PandasDatasource(_PandasDatasource): + """Adds a single-batch pandas datasource to the data context. + + Args: + name: The name of this datasource. + assets: An optional dictionary whose keys are Pandas DataAsset names and whose values + are Pandas DataAsset objects. 
+ """ + # class attributes asset_types: ClassVar[Sequence[Type[DataAsset]]] = _DYNAMIC_ASSET_TYPES + [ DataFrameAsset diff --git a/great_expectations/datasource/fluent/schemas/PandasDatasource.json b/great_expectations/datasource/fluent/schemas/PandasDatasource.json index cdd824087282..7e6b89e128a7 100644 --- a/great_expectations/datasource/fluent/schemas/PandasDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasDatasource.json @@ -1,6 +1,6 @@ { "title": "PandasDatasource", - "description": "Base model for most fluent datasource related pydantic models.\n\nAdds yaml dumping and parsing methods.\n\nExtra fields are not allowed.\n\nSerialization methods default to `exclude_unset = True` to prevent serializing\nconfigs full of mostly unset default values.\nAlso prevents passing along unset kwargs to BatchSpec.\nhttps://docs.pydantic.dev/usage/exporting_models/", + "description": "--Public API--Adds a single-batch pandas datasource to the data context.\n\nArgs:\n name: The name of this datasource.\n assets: An optional dictionary whose keys are Pandas DataAsset names and whose values\n are Pandas DataAsset objects.", "type": "object", "properties": { "type": { diff --git a/great_expectations/datasource/fluent/spark_datasource.py b/great_expectations/datasource/fluent/spark_datasource.py index c314d1cafc20..93e0072f6da7 100644 --- a/great_expectations/datasource/fluent/spark_datasource.py +++ b/great_expectations/datasource/fluent/spark_datasource.py @@ -18,6 +18,7 @@ from typing_extensions import Literal import great_expectations.exceptions as gx_exceptions +from great_expectations.core._docs_decorators import public_api from great_expectations.core.batch_spec import RuntimeDataBatchSpec from great_expectations.datasource.fluent.constants import ( _DATA_CONNECTOR_NAME, @@ -25,7 +26,6 @@ from great_expectations.datasource.fluent.interfaces import ( Batch, BatchRequest, - BatchRequestOptions, DataAsset, Datasource, ) @@ -109,17 +109,14 @@ def _get_reader_options_include(self) -> set[str] | None: """Spark DataFrameAsset does not implement "_get_reader_options_include()" method, because DataFrame is already available.""" ) - def build_batch_request( - self, options: Optional[BatchRequestOptions] = None - ) -> BatchRequest: - if options: - actual_keys = set(options.keys()) - raise gx_exceptions.InvalidBatchRequestError( - "Data Assets associated with SparkDatasource can only contain a single batch,\n" - "therefore BatchRequest options cannot be supplied. BatchRequest options with keys:\n" - f"{actual_keys}\nwere passed.\n" - ) + @public_api + def build_batch_request(self) -> BatchRequest: # type: ignore[override] + """A batch request that can be used to obtain batches for this DataAsset. + Returns: + A BatchRequest object that can be used to obtain a batch list from a Datasource by calling the + get_batch_list_from_batch_request method. 
+ """ return BatchRequest( datasource_name=self.datasource.name, data_asset_name=self.name, diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index 328d45847e61..2278e6e4364e 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -667,6 +667,7 @@ def get_batch_list_from_batch_request( self.sort_batches(batch_list) return batch_list + @public_api def build_batch_request( self, options: Optional[BatchRequestOptions] = None ) -> BatchRequest: diff --git a/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_block_config.py b/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_block_config.py new file mode 100644 index 000000000000..4201b46e6c6f --- /dev/null +++ b/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_block_config.py @@ -0,0 +1,68 @@ +# +import great_expectations as gx +from great_expectations.core.batch import BatchRequest + +# + +from great_expectations.core.yaml_handler import YAMLHandler + +yaml = YAMLHandler() + +# +context = gx.get_context() +# + +datasource_yaml = r""" +name: my_datasource +class_name: Datasource +module_name: great_expectations.datasource +execution_engine: + module_name: great_expectations.execution_engine + class_name: PandasExecutionEngine +data_connectors: + my_configured_asset_data_connector: + class_name: ConfiguredAssetFilesystemDataConnector + base_directory: ./data + assets: + my_data_asset: + pattern: yellow_tripdata_sample_2019-01.csv +""" +context.test_yaml_config(datasource_yaml) +context.add_datasource(**yaml.load(datasource_yaml)) + +# +batch_request = BatchRequest( + datasource_name="my_datasource", + data_connector_name="my_configured_asset_data_connector", + data_asset_name="my_data_asset", +) +# + +# +validator = context.get_validator( + batch_request=batch_request, + expectation_suite_name="my_expectation_suite", +) +validator.head() +# + +# this snippet is only for users who are not using a jupyter notebook +# +print(validator.head()) +# + +# +validator.expect_column_values_to_not_be_null(column="vendor_id") +# + +# this snippet is only for users who are not using a jupyter notebook +# +expectation_validation_result = validator.expect_column_values_to_not_be_null( + column="vendor_id" +) +print(expectation_validation_result) +# + +# +validator.save_expectation_suite(discard_failed_expectations=False) +# diff --git a/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py b/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py new file mode 100644 index 000000000000..75c6e2663abb --- /dev/null +++ b/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py @@ -0,0 +1,47 @@ +# +import great_expectations as gx + +# + +# +context = gx.get_context() +# + +context.sources.add_pandas(name="my_datasource",).add_csv_asset( + name="my_data_asset", + filepath_or_buffer="./data/yellow_tripdata_sample_2019-01.csv", +) + +# +data_asset = context.get_datasource("my_datasource").get_asset("my_data_asset") +batch_request = data_asset.build_batch_request() +# + +# +validator = context.get_validator( + batch_request=batch_request, + 
expectation_suite_name="my_expectation_suite", +) +validator.head() +# + +# this snippet is only for users who are not using a jupyter notebook +# +print(validator.head()) +# + +# +validator.expect_column_values_to_not_be_null(column="vendor_id") +# + +# this snippet is only for users who are not using a jupyter notebook +# +expectation_validation_result = validator.expect_column_values_to_not_be_null( + column="vendor_id" +) +print(expectation_validation_result) +# + +# +validator.save_expectation_suite(discard_failed_expectations=False) +# diff --git a/tests/integration/test_script_runner.py b/tests/integration/test_script_runner.py index bd76240eebb4..36a6dcee64cb 100644 --- a/tests/integration/test_script_runner.py +++ b/tests/integration/test_script_runner.py @@ -231,6 +231,11 @@ name="how_to_configure_result_format_parameter", user_flow_script="tests/integration/docusaurus/reference/core_concepts/result_format.py", ), + IntegrationTestFixture( + name="how_to_create_and_edit_expectations_with_instant_feedback_block_config", + user_flow_script="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_block_config.py", + data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files", + ), # Fluent Datasources IntegrationTestFixture( name="how_to_connect_to_one_or_more_files_using_pandas", @@ -238,6 +243,11 @@ data_context_dir="tests/integration/fixtures/no_datasources/great_expectations", data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files", ), + IntegrationTestFixture( + name="how_to_create_and_edit_expectations_with_instant_feedback_fluent", + user_flow_script="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py", + data_dir="tests/test_sets/taxi_yellow_tripdata_samples/first_3_files", + ), ] quickstart = [ From 4aec193a55b1044eda50037eceb29aa09f8b1333 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Wed, 12 Apr 2023 15:26:26 -0700 Subject: [PATCH 86/96] [MAINTENANCE] Fluent Datasources: Eliminate redundant Datasource name and DataAsset name from dictionary and JSON configuration (#7573) --- .../public_api_report.py | 2 +- .../data_context/abstract_data_context.py | 12 +- .../data_context/file_data_context.py | 4 +- .../data_context/serializable_data_context.py | 9 +- .../data_context/data_context_variables.py | 6 +- great_expectations/data_context/types/base.py | 1 + .../datasource/fluent/config.py | 205 ++++++---- .../datasource/fluent/constants.py | 3 + .../datasource/fluent/interfaces.py | 63 ++- .../pandas_azure_blob_storage_datasource.py | 2 +- .../datasource/fluent/pandas_datasource.py | 34 +- .../datasource/fluent/pandas_datasource.pyi | 8 +- .../fluent/pandas_file_path_datasource.py | 3 +- .../fluent/pandas_file_path_datasource.pyi | 4 +- .../fluent/pandas_filesystem_datasource.py | 2 +- .../pandas_google_cloud_storage_datasource.py | 2 +- .../datasource/fluent/pandas_s3_datasource.py | 2 +- .../datasource/fluent/schemas/Datasource.json | 6 +- .../PandasAzureBlobStorageDatasource.json | 6 +- .../fluent/schemas/PandasDBFSDatasource.json | 6 +- .../fluent/schemas/PandasDatasource.json | 6 +- .../schemas/PandasFilesystemDatasource.json | 6 +- .../PandasGoogleCloudStorageDatasource.json | 6 +- .../fluent/schemas/PandasS3Datasource.json | 6 +- .../fluent/schemas/PostgresDatasource.json | 6 +- .../fluent/schemas/SQLDatasource.json | 6 +- .../SparkAzureBlobStorageDatasource.json | 6 +- .../fluent/schemas/SparkDBFSDatasource.json | 6 +- 
.../fluent/schemas/SparkDatasource.json | 6 +- .../schemas/SparkFilesystemDatasource.json | 6 +- .../SparkGoogleCloudStorageDatasource.json | 6 +- .../fluent/schemas/SparkS3Datasource.json | 6 +- .../fluent/schemas/SqliteDatasource.json | 6 +- .../spark_azure_blob_storage_datasource.py | 2 +- .../datasource/fluent/spark_datasource.py | 3 +- .../fluent/spark_file_path_datasource.py | 4 +- .../fluent/spark_filesystem_datasource.py | 2 +- .../spark_google_cloud_storage_datasource.py | 2 +- .../datasource/fluent/spark_s3_datasource.py | 2 +- .../datasource/fluent/sql_datasource.py | 4 +- tests/datasource/fluent/conftest.py | 4 +- .../datasource/fluent/integration/conftest.py | 4 +- tests/datasource/fluent/test_config.py | 366 +++++++++--------- .../datasource/fluent/test_metadatasource.py | 79 ++-- ...st_pandas_azure_blob_storage_datasource.py | 4 +- .../fluent/test_pandas_datasource.py | 24 +- .../fluent/test_pandas_dbfs_datasource.py | 8 +- .../test_pandas_filesystem_datasource.py | 4 +- ..._pandas_google_cloud_storage_datasource.py | 4 +- .../fluent/test_pandas_s3_datasource.py | 4 +- .../fluent/test_postgres_datasource.py | 32 +- ...est_spark_azure_blob_storage_datasource.py | 4 +- .../fluent/test_spark_datasource.py | 5 +- .../fluent/test_spark_dbfs_datasource.py | 4 +- .../test_spark_filesystem_datasource.py | 4 +- ...t_spark_google_cloud_storage_datasource.py | 4 +- .../fluent/test_spark_s3_datasource.py | 4 +- .../datasource/fluent/test_viral_snippets.py | 13 +- 58 files changed, 619 insertions(+), 429 deletions(-) diff --git a/docs/sphinx_api_docs_source/public_api_report.py b/docs/sphinx_api_docs_source/public_api_report.py index 5365d5c91b68..9d67ab4c6596 100755 --- a/docs/sphinx_api_docs_source/public_api_report.py +++ b/docs/sphinx_api_docs_source/public_api_report.py @@ -1859,7 +1859,7 @@ def main(): # any methods or classes you are adding to documentation with the @public_api # decorator and any relevant "new" or "deprecated" public api decorators. # If the actual is lower than the threshold, please reduce the threshold. - PUBLIC_API_MISSING_THRESHOLD = 90 # TODO: reduce this number again once this works for the Fluent DS dynamic methods + PUBLIC_API_MISSING_THRESHOLD = 91 # TODO: reduce this number again once this works for the Fluent DS dynamic methods if len(printable_definitions) != PUBLIC_API_MISSING_THRESHOLD: error_msg_prefix = f"There are {len(printable_definitions)} items missing from the public API, we currently allow {PUBLIC_API_MISSING_THRESHOLD}." 
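For readers skimming this patch: its headline change is that `fluent_datasources` (and each datasource's `assets`) become lists whose entries carry their own "name", instead of name-keyed mappings. A schematic sketch of the two shapes, with invented datasource and asset names, purely for illustration:

# Schematic only -- names below are invented; real configs carry more fields.
OLD_STYLE = {  # before this patch: names are dictionary keys
    "fluent_datasources": {
        "my_pg_ds": {
            "type": "postgres",
            "connection_string": "postgresql://userName:@hostname/dbName",
            "assets": {
                "my_table_asset": {"type": "table", "table_name": "my_table"},
            },
        },
    },
}

NEW_STYLE = {  # after this patch: names live inside each list entry
    "fluent_datasources": [
        {
            "name": "my_pg_ds",
            "type": "postgres",
            "connection_string": "postgresql://userName:@hostname/dbName",
            "assets": [
                {"name": "my_table_asset", "type": "table", "table_name": "my_table"},
            ],
        },
    ],
}

Note that when the project config is written back to YAML, `_exclude_name_fields_from_fluent_datasources` (below) re-keys these lists by name and drops the redundant "name" fields, so the on-disk format stays dictionary-shaped while the in-memory model is list-shaped.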
if len(printable_definitions) > PUBLIC_API_MISSING_THRESHOLD: diff --git a/great_expectations/data_context/data_context/abstract_data_context.py b/great_expectations/data_context/data_context/abstract_data_context.py index 0b0b633301ef..3d01e75d7d19 100644 --- a/great_expectations/data_context/data_context/abstract_data_context.py +++ b/great_expectations/data_context/data_context/abstract_data_context.py @@ -5430,18 +5430,19 @@ def _load_fluent_config(self, config_provider: _ConfigurationProvider) -> GxConf logger.info( f"{self.__class__.__name__} has not implemented `_load_fluent_config()` returning empty `GxConfig`" ) - return GxConfig(fluent_datasources={}) + return GxConfig(fluent_datasources=[]) def _attach_fluent_config_datasources_and_build_data_connectors( self, config: GxConfig ): """Called at end of __init__""" - for ds_name, datasource in config.datasources.items(): + for datasource in config.datasources: + ds_name = datasource.name logger.info(f"Loaded '{ds_name}' from fluent config") # if Datasource required a data_connector we need to build the data_connector for each asset if datasource.data_connector_type: - for data_asset in datasource.assets.values(): + for data_asset in datasource.assets: connect_options = getattr(data_asset, "connect_options", {}) datasource._build_data_connector(data_asset, **connect_options) @@ -5454,8 +5455,9 @@ def _synchronize_fluent_datasources(self) -> Dict[str, FluentDatasource]: """ fluent_datasources = self.fluent_datasources if fluent_datasources: - self.fluent_config.fluent_datasources.update(fluent_datasources) - return self.fluent_config.fluent_datasources + self.fluent_config.update_datasources(datasources=fluent_datasources) + + return self.fluent_config.get_datasources_as_dict() @staticmethod def _resolve_id_and_ge_cloud_id( diff --git a/great_expectations/data_context/data_context/file_data_context.py b/great_expectations/data_context/data_context/file_data_context.py index d8ab55ede9c1..dcd6aced47c7 100644 --- a/great_expectations/data_context/data_context/file_data_context.py +++ b/great_expectations/data_context/data_context/file_data_context.py @@ -169,9 +169,9 @@ def _load_fluent_config(self, config_provider: _ConfigurationProvider) -> GxConf gx_config = GxConfig.parse_yaml(path_to_fluent_yaml, _allow_empty=True) # attach the config_provider for each loaded datasource - for datasource in gx_config.datasources.values(): + for datasource in gx_config.datasources: datasource._config_provider = config_provider return gx_config logger.info(f"no fluent config at {path_to_fluent_yaml.absolute()}") - return GxConfig(fluent_datasources={}) + return GxConfig(fluent_datasources=[]) diff --git a/great_expectations/data_context/data_context/serializable_data_context.py b/great_expectations/data_context/data_context/serializable_data_context.py index 681394f6401c..72e1dbd309bc 100644 --- a/great_expectations/data_context/data_context/serializable_data_context.py +++ b/great_expectations/data_context/data_context/serializable_data_context.py @@ -96,13 +96,20 @@ def _save_project_config(self) -> None: fluent_datasources = self._synchronize_fluent_datasources() if fluent_datasources: - self.fluent_config.datasources.update(fluent_datasources) + self.fluent_config.update_datasources( + datasources=fluent_datasources + ) logger.info( f"Saving {len(self.fluent_config.datasources)} Fluent Datasources to {config_filepath}" ) fluent_json_dict: dict[ str, JSONValues ] = self.fluent_config._json_dict() + fluent_json_dict = ( + 
self.fluent_config._exclude_name_fields_from_fluent_datasources( + config=fluent_json_dict + ) + ) self.config._commented_map.update(fluent_json_dict) self.config.to_yaml(outfile) diff --git a/great_expectations/data_context/data_context_variables.py b/great_expectations/data_context/data_context_variables.py index b5349da4347b..b1aa4c34632d 100644 --- a/great_expectations/data_context/data_context_variables.py +++ b/great_expectations/data_context/data_context_variables.py @@ -383,7 +383,7 @@ def _fluent_objects_stash( for fluent_datasource_name in config_fluent_datasources_stash.keys(): self.data_context.datasources.pop(fluent_datasource_name) # this would be `deep_copy'ed in `instantiate_class_from_config` too - self.data_context.fluent_config.fluent_datasources = {} + self.data_context.fluent_config.fluent_datasources = [] yield except Exception: raise @@ -393,8 +393,8 @@ def _fluent_objects_stash( f"Replacing {len(config_fluent_datasources_stash)} stashed `FluentDatasource`s" ) self.data_context.datasources.update(config_fluent_datasources_stash) - self.data_context.fluent_config.fluent_datasources = ( - config_fluent_datasources_stash + self.data_context.fluent_config.fluent_datasources = list( + config_fluent_datasources_stash.values() ) diff --git a/great_expectations/data_context/types/base.py b/great_expectations/data_context/types/base.py index 61d0c47faf9b..b0fd97d739d7 100644 --- a/great_expectations/data_context/types/base.py +++ b/great_expectations/data_context/types/base.py @@ -1629,6 +1629,7 @@ class DataContextConfigSchema(Schema): allow_none=True, ) fluent_datasources = fields.Dict( + keys=fields.Str(), required=False, allow_none=True, load_only=True, diff --git a/great_expectations/datasource/fluent/config.py b/great_expectations/datasource/fluent/config.py index 966718c70e1a..d72a5a6a4c74 100644 --- a/great_expectations/datasource/fluent/config.py +++ b/great_expectations/datasource/fluent/config.py @@ -19,10 +19,15 @@ overload, ) -from pydantic import Extra, Field, ValidationError, validator +from pydantic import Extra, Field, validator from ruamel.yaml import YAML from typing_extensions import Final +from great_expectations.datasource.fluent.constants import ( + _DATA_ASSET_NAME_KEY, + _DATASOURCE_NAME_KEY, + _FLUENT_DATASOURCES_KEY, +) from great_expectations.datasource.fluent.fluent_base_model import FluentBaseModel from great_expectations.datasource.fluent.interfaces import ( Datasource, # noqa: TCH001 @@ -55,7 +60,7 @@ _MISSING_FLUENT_DATASOURCES_ERRORS: Final[List[PydanticErrorDict]] = [ { - "loc": ("fluent_datasources",), + "loc": (_FLUENT_DATASOURCES_KEY,), "msg": "field required", "type": "value_error.missing", } @@ -65,85 +70,123 @@ class GxConfig(FluentBaseModel): """Represents the full fluent configuration file.""" - fluent_datasources: Dict[str, Datasource] = Field( + fluent_datasources: List[Datasource] = Field( ..., description=_FLUENT_STYLE_DESCRIPTION ) _EXCLUDE_FROM_DATASOURCE_SERIALIZATION: ClassVar[Set[str]] = { - "name", # The "name" field is set in validation upon deserialization from configuration key; hence, it should not be serialized. + _DATASOURCE_NAME_KEY, # The "name" field is set in validation upon deserialization from configuration key; hence, it should not be serialized. } _EXCLUDE_FROM_DATA_ASSET_SERIALIZATION: ClassVar[Set[str]] = { - "name", # The "name" field is set in validation upon deserialization from configuration key; hence, it should not be serialized. 
+ _DATA_ASSET_NAME_KEY, # The "name" field is set in validation upon deserialization from configuration key; hence, it should not be serialized. } + class Config: + extra = Extra.ignore # ignore any old style config keys + @property - def datasources(self) -> Dict[str, Datasource]: + def datasources(self) -> List[Datasource]: + """Returns available Fluent Datasources as list.""" return self.fluent_datasources - class Config: - extra = Extra.ignore # ignore any old style config keys + def get_datasources_as_dict(self) -> Dict[str, Datasource]: + """Returns available Datasource objects as dictionary, with corresponding name as key. + + Returns: + Dictionary of "Datasource" objects with "name" attribute serving as key. + """ + datasource: Datasource + datasources_as_dict: Dict[str, Datasource] = { + datasource.name: datasource for datasource in self.fluent_datasources + } + + return datasources_as_dict + + def get_datasource_names(self) -> Set[str]: + """Returns the set of available Datasource names. + + Returns: + Set of available Datasource names. + """ + datasource: Datasource + return {datasource.name for datasource in self.datasources} + + def get_datasource(self, datasource_name: str) -> Datasource: + """Returns the Datasource referred to by datasource_name + + Args: + datasource_name: name of Datasource sought. + + Returns: + Datasource -- if named "Datasource" objects exists; otherwise, exception is raised. + """ + try: + datasource: Datasource + return list( + filter( + lambda datasource: datasource.name == datasource_name, + self.datasources, + ) + )[0] + except IndexError as exc: + raise LookupError( + f"'{datasource_name}' not found. Available datasources are {self.get_datasource_names()}" + ) from exc + + def update_datasources(self, datasources: Dict[str, Datasource]) -> None: + """ + Updates internal list of datasources using supplied datasources dictionary. + + Args: + datasources: Dictionary of datasources to use to update internal datasources. 
+ """ + datasources_as_dict: Dict[str, Datasource] = self.get_datasources_as_dict() + datasources_as_dict.update(datasources) + self.fluent_datasources = list(datasources_as_dict.values()) # noinspection PyNestedDecorators - @validator("fluent_datasources", pre=True) + @validator(_FLUENT_DATASOURCES_KEY, pre=True) @classmethod - def _load_datasource_subtype(cls, v: Dict[str, dict]): + def _load_datasource_subtype(cls, v: List[dict]): logger.info(f"Loading 'datasources' ->\n{pf(v, depth=2)}") - loaded_datasources: Dict[str, Datasource] = {} + loaded_datasources: List[Datasource] = [] - for ds_key, config in v.items(): + for config in v: ds_type_name: str = config.get("type", "") + ds_name: str = config[_DATASOURCE_NAME_KEY] if not ds_type_name: # TODO: (kilo59 122222) ideally this would be raised by `Datasource` validation # https://github.com/pydantic/pydantic/issues/734 - raise ValueError(f"'{ds_key}' is missing a 'type' entry") + raise ValueError(f"'{ds_name}' is missing a 'type' entry") try: ds_type: Type[Datasource] = _SourceFactories.type_lookup[ds_type_name] - logger.debug(f"Instantiating '{ds_key}' as {ds_type}") + logger.debug(f"Instantiating '{ds_name}' as {ds_type}") except KeyError as type_lookup_err: raise ValueError( - f"'{ds_key}' has unsupported 'type' - {type_lookup_err}" + f"'{ds_name}' has unsupported 'type' - {type_lookup_err}" ) from type_lookup_err - if "name" in config: - ds_name: str = config["name"] - if ds_name != ds_key: - raise ValueError( - f'Datasource key "{ds_key}" is different from name "{ds_name}" in its configuration.' - ) - else: - config["name"] = ds_key - if "assets" not in config: - config["assets"] = {} - - for asset_key, asset_config in config["assets"].items(): - if "name" in asset_config: - asset_name: str = asset_config["name"] - if asset_name != asset_key: - raise ValueError( - f'DataAsset key "{asset_key}" is different from name "{asset_name}" in its configuration.' - ) - else: - asset_config["name"] = asset_key + config["assets"] = [] datasource = ds_type(**config) # the ephemeral asset should never be serialized - if DEFAULT_PANDAS_DATA_ASSET_NAME in datasource.assets: - datasource.assets.pop(DEFAULT_PANDAS_DATA_ASSET_NAME) + if DEFAULT_PANDAS_DATA_ASSET_NAME in datasource.get_assets_as_dict(): + datasource.delete_asset(asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME) # if the default pandas datasource has no assets, it should not be serialized if ( datasource.name != DEFAULT_PANDAS_DATASOURCE_NAME or len(datasource.assets) > 0 ): - loaded_datasources[datasource.name] = datasource + loaded_datasources.append(datasource) # TODO: move this to a different 'validator' method # attach the datasource to the nested assets, avoiding recursion errors - for asset in datasource.assets.values(): + for asset in datasource.assets: asset._datasource = datasource logger.info(f"Loaded 'datasources' ->\n{repr(loaded_datasources)}") @@ -159,32 +202,22 @@ def parse_yaml( ) -> GxConfig: """ Overriding base method to allow an empty/missing `fluent_datasources` field. + In addition, converts datasource and assets configuration sections from dictionary style to list style. Other validation errors will still result in an error. TODO (kilo59) 122822: remove this as soon as it's no longer needed. Such as when we use a new `config_version` instead of `fluent_datasources` key. 
""" - if _allow_empty: - try: - super().parse_yaml(f) - except ValidationError as validation_err: - errors_list: List[PydanticErrorDict] = validation_err.errors() - logger.info( - f"{cls.__name__}.parse_yaml() failed with errors - {errors_list}" - ) - if errors_list == _MISSING_FLUENT_DATASOURCES_ERRORS: - logger.info( - f"{cls.__name__}.parse_yaml() returning empty `fluent_datasources`" - ) - return cls(fluent_datasources={}) - else: - logger.warning( - "`_allow_empty` does not prevent unrelated validation errors" - ) - raise - - # noinspection PyTypeChecker - return super().parse_yaml(f) + loaded = yaml.load(f) + logger.debug(f"loaded from yaml ->\n{pf(loaded, depth=3)}\n") + loaded = _convert_fluent_datasources_loaded_from_yaml_to_internal_object_representation( + config=loaded, _allow_empty=_allow_empty + ) + if _FLUENT_DATASOURCES_KEY not in loaded: + return cls(fluent_datasources=[]) + + config = cls(**loaded) + return config @overload def yaml( @@ -265,30 +298,36 @@ def yaml( def _exclude_name_fields_from_fluent_datasources( self, config: Dict[str, Any] ) -> Dict[str, Any]: - if "fluent_datasources" in config: - fluent_datasources: dict = config["fluent_datasources"] + if _FLUENT_DATASOURCES_KEY in config: + fluent_datasources_config_as_dict = {} + + fluent_datasources: List[dict] = config[_FLUENT_DATASOURCES_KEY] datasource_name: str datasource_config: dict - for datasource_name, datasource_config in fluent_datasources.items(): + for datasource_config in fluent_datasources: + datasource_name = datasource_config[_DATASOURCE_NAME_KEY] datasource_config = _exclude_fields_from_serialization( source_dict=datasource_config, exclusions=self._EXCLUDE_FROM_DATASOURCE_SERIALIZATION, ) if "assets" in datasource_config: - data_assets: dict = datasource_config["assets"] - data_asset_name: str + data_assets: List[dict] = datasource_config["assets"] data_asset_config: dict - data_assets = { - data_asset_name: _exclude_fields_from_serialization( + data_assets_config_as_dict = { + data_asset_config[ + _DATA_ASSET_NAME_KEY + ]: _exclude_fields_from_serialization( source_dict=data_asset_config, exclusions=self._EXCLUDE_FROM_DATA_ASSET_SERIALIZATION, ) - for data_asset_name, data_asset_config in data_assets.items() + for data_asset_config in data_assets } - datasource_config["assets"] = data_assets + datasource_config["assets"] = data_assets_config_as_dict - fluent_datasources[datasource_name] = datasource_config + fluent_datasources_config_as_dict[datasource_name] = datasource_config + + config[_FLUENT_DATASOURCES_KEY] = fluent_datasources_config_as_dict return config @@ -304,3 +343,29 @@ def _exclude_fields_from_serialization( source_dict.items(), ) ) + + +def _convert_fluent_datasources_loaded_from_yaml_to_internal_object_representation( + config: Dict[str, Any], _allow_empty: bool = False +) -> Dict[str, Any]: + if _FLUENT_DATASOURCES_KEY in config: + fluent_datasources: dict = config[_FLUENT_DATASOURCES_KEY] + + datasource_name: str + datasource_config: dict + for datasource_name, datasource_config in fluent_datasources.items(): + datasource_config[_DATASOURCE_NAME_KEY] = datasource_name + if "assets" in datasource_config: + data_assets: dict = datasource_config["assets"] + data_asset_name: str + data_asset_config: dict + for data_asset_name, data_asset_config in data_assets.items(): + data_asset_config[_DATA_ASSET_NAME_KEY] = data_asset_name + + datasource_config["assets"] = list(data_assets.values()) + + fluent_datasources[datasource_name] = datasource_config + + config[_FLUENT_DATASOURCES_KEY] 
= list(fluent_datasources.values()) + + return config diff --git a/great_expectations/datasource/fluent/constants.py b/great_expectations/datasource/fluent/constants.py index d8e4b2fe3273..5814e26d1885 100644 --- a/great_expectations/datasource/fluent/constants.py +++ b/great_expectations/datasource/fluent/constants.py @@ -11,7 +11,10 @@ "type", } +_FLUENT_DATASOURCES_KEY: Final[str] = "fluent_datasources" +_DATASOURCE_NAME_KEY: Final[str] = "name" _ASSETS_KEY: Final[str] = "assets" +_DATA_ASSET_NAME_KEY: Final[str] = "name" _DATA_CONNECTOR_NAME: Final[str] = "fluent" diff --git a/great_expectations/datasource/fluent/interfaces.py b/great_expectations/datasource/fluent/interfaces.py index 070d71741f07..6acee7c11caf 100644 --- a/great_expectations/datasource/fluent/interfaces.py +++ b/great_expectations/datasource/fluent/interfaces.py @@ -15,6 +15,7 @@ Generic, List, MutableMapping, + MutableSequence, Optional, Sequence, Set, @@ -402,7 +403,7 @@ class Datasource( type: str name: str id: Optional[uuid.UUID] = Field(default=None, description="Datasource id") - assets: MutableMapping[str, _DataAssetT] = {} + assets: MutableSequence[_DataAssetT] = [] # private attrs _data_context: GXDataContext = pydantic.PrivateAttr() @@ -470,17 +471,59 @@ def get_batch_list_from_batch_request( data_asset = self.get_asset(batch_request.data_asset_name) return data_asset.get_batch_list_from_batch_request(batch_request) + def get_assets_as_dict(self) -> MutableMapping[str, _DataAssetT]: + """Returns available DataAsset objects as dictionary, with corresponding name as key. + + Returns: + Dictionary of "_DataAssetT" objects with "name" attribute serving as key. + """ + asset: _DataAssetT + assets_as_dict: MutableMapping[str, _DataAssetT] = { + asset.name: asset for asset in self.assets + } + + return assets_as_dict + + def get_asset_names(self) -> Set[str]: + """Returns the set of available DataAsset names + + Returns: + Set of available DataAsset names. + """ + asset: _DataAssetT + return {asset.name for asset in self.assets} + def get_asset(self, asset_name: str) -> _DataAssetT: - """Returns the DataAsset referred to by name""" + """Returns the DataAsset referred to by asset_name + + Args: + asset_name: name of DataAsset sought. + + Returns: + _DataAssetT -- if named "DataAsset" object exists; otherwise, exception is raised. + """ # This default implementation will be used if protocol is inherited try: - self.assets[asset_name]._datasource = self - return self.assets[asset_name] - except KeyError as exc: + asset: _DataAssetT + found_asset: _DataAssetT = list( + filter(lambda asset: asset.name == asset_name, self.assets) + )[0] + found_asset._datasource = self + return found_asset + except IndexError as exc: raise LookupError( - f"'{asset_name}' not found. Available assets are {list(self.assets.keys())}" + f'"{asset_name}" not found. Available assets are {", ".join(self.get_asset_names())})' ) from exc + def delete_asset(self, asset_name: str) -> None: + """Removes the DataAsset referred to by asset_name from internal list of available DataAsset objects. + + Args: + asset_name: name of DataAsset to be deleted. 
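The conversion helper above is what lets the on-disk YAML stay name-keyed while the in-memory model is a list. A simplified, self-contained re-implementation of the round trip on plain dicts, shown only to clarify the mechanics (the real helpers in `great_expectations/datasource/fluent/config.py` also handle missing sections, ids, and the default pandas datasource):

def fluent_dicts_to_lists(config: dict) -> dict:
    """Name-keyed mappings -> lists whose entries carry a "name" field (sketch)."""
    converted = []
    for ds_name, ds_cfg in config.get("fluent_datasources", {}).items():
        ds_cfg = {**ds_cfg, "name": ds_name}
        ds_cfg["assets"] = [
            {**a_cfg, "name": a_name}
            for a_name, a_cfg in ds_cfg.get("assets", {}).items()
        ]
        converted.append(ds_cfg)
    return {**config, "fluent_datasources": converted}


def fluent_lists_to_dicts(config: dict) -> dict:
    """The reverse direction, used when the config is written back out (sketch)."""
    converted = {}
    for ds_cfg in config.get("fluent_datasources", []):
        ds_cfg = dict(ds_cfg)
        assets = {}
        for a_cfg in ds_cfg.get("assets", []):
            a_cfg = dict(a_cfg)
            assets[a_cfg.pop("name")] = a_cfg
        ds_cfg["assets"] = assets
        converted[ds_cfg.pop("name")] = ds_cfg
    return {**config, "fluent_datasources": converted}


on_disk = {
    "fluent_datasources": {
        "my_ds": {"type": "sql", "connection_string": "sqlite://", "assets": {}}
    }
}
assert fluent_lists_to_dicts(fluent_dicts_to_lists(on_disk)) == on_disk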
+ """ + asset: _DataAssetT + self.assets = list(filter(lambda asset: asset.name != asset_name, self.assets)) + def _add_asset( self, asset: _DataAssetT, connect_options: dict | None = None ) -> _DataAssetT: @@ -499,7 +542,13 @@ def _add_asset( asset.test_connection() - self.assets[asset.name] = asset + asset_names: Set[str] = self.get_asset_names() + if asset.name in asset_names: + raise ValueError( + f'"{asset.name}" already exists (all existing assets are {", ".join(asset_names)})' + ) + + self.assets.append(asset) return asset diff --git a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py index f5453c2c307b..6510c3d0bd0b 100644 --- a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py @@ -117,7 +117,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/pandas_datasource.py b/great_expectations/datasource/fluent/pandas_datasource.py index 1ff66b7e658f..79b19a9fb8fc 100644 --- a/great_expectations/datasource/fluent/pandas_datasource.py +++ b/great_expectations/datasource/fluent/pandas_datasource.py @@ -10,11 +10,10 @@ Any, Callable, ClassVar, - Dict, Generic, List, Mapping, - MutableMapping, + MutableSequence, Optional, Sequence, Set, @@ -400,10 +399,7 @@ class _PandasDatasource(Datasource, Generic[_DataAssetT]): asset_types: ClassVar[Sequence[Type[DataAsset]]] = [] # instance attributes - assets: MutableMapping[ - str, - _DataAssetT, - ] = {} + assets: MutableSequence[_DataAssetT] = [] # Abstract Methods @property @@ -462,7 +458,7 @@ def json( ) if "assets" in self.__fields_set__: exclude_assets = {} - for asset_name, asset in self.assets.items(): + for asset in self.assets: # don't check fields that should always be set check_fields: set[str] = asset.__fields_set__.copy().difference( _FIELDS_ALWAYS_SET @@ -471,7 +467,7 @@ def json( if isinstance( getattr(asset, field), tuple(_EXCLUDE_TYPES_FROM_JSON) ): - exclude_assets[asset_name] = {field: True} + exclude_assets[asset.name] = {field: True} if exclude_assets: exclude_fields["assets"] = exclude_assets @@ -488,6 +484,26 @@ def json( **dumps_kwargs, ) + def _add_asset( + self, asset: _DataAssetT, connect_options: dict | None = None + ) -> _DataAssetT: + """Adds an asset to this "_PandasDatasource" object. + + The reserved asset name "DEFAULT_PANDAS_DATA_ASSET_NAME" undergoes replacement (rather than signaling error). + + Args: + asset: The DataAsset to be added to this datasource. + """ + asset_name: str = asset.name + + asset_names: Set[str] = self.get_asset_names() + + if asset_name == DEFAULT_PANDAS_DATA_ASSET_NAME: + if asset_name in asset_names: + self.delete_asset(asset_name=asset_name) + + return super()._add_asset(asset=asset, connect_options=connect_options) + _DYNAMIC_ASSET_TYPES = list(_PANDAS_ASSET_MODELS.values()) @@ -509,7 +525,7 @@ class PandasDatasource(_PandasDatasource): # instance attributes type: Literal["pandas"] = "pandas" - assets: Dict[str, _PandasDataAsset] = {} + assets: List[_PandasDataAsset] = [] def test_connection(self, test_assets: bool = True) -> None: ... 
diff --git a/great_expectations/datasource/fluent/pandas_datasource.pyi b/great_expectations/datasource/fluent/pandas_datasource.pyi index 023fbeadb1dd..490fe50909ac 100644 --- a/great_expectations/datasource/fluent/pandas_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_datasource.pyi @@ -8,11 +8,11 @@ from typing import ( Any, Callable, ClassVar, - Dict, Hashable, Iterable, + List, Mapping, - MutableMapping, + MutableSequence, Optional, Sequence, Set, @@ -117,7 +117,7 @@ _PandasDataAssetT = TypeVar("_PandasDataAssetT", bound=_PandasDataAsset) class _PandasDatasource(Datasource): asset_types: ClassVar[Sequence[Type[DataAsset]]] - assets: MutableMapping[str, _PandasDataAssetT] # type: ignore[valid-type] + assets: MutableSequence[_PandasDataAssetT] # type: ignore[valid-type] @property def execution_engine_type(self) -> Type[PandasExecutionEngine]: ... def test_connection(self, test_assets: bool = ...) -> None: ... @@ -141,7 +141,7 @@ _DYNAMIC_ASSET_TYPES: list[Type[_PandasDataAsset]] class PandasDatasource(_PandasDatasource): asset_types: ClassVar[Sequence[Type[DataAsset]]] type: Literal["pandas"] - assets: Dict[str, _PandasDataAsset] + assets: List[_PandasDataAsset] def test_connection(self, test_assets: bool = ...) -> None: ... def add_dataframe_asset( self, diff --git a/great_expectations/datasource/fluent/pandas_file_path_datasource.py b/great_expectations/datasource/fluent/pandas_file_path_datasource.py index f69901b82b54..9c6e4b83763c 100644 --- a/great_expectations/datasource/fluent/pandas_file_path_datasource.py +++ b/great_expectations/datasource/fluent/pandas_file_path_datasource.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, ClassVar, - Dict, List, Type, ) @@ -79,4 +78,4 @@ class _PandasFilePathDatasource(_PandasDatasource): ) # instance attributes - assets: Dict[str, _FilePathDataAsset] = {} + assets: List[_FilePathDataAsset] = [] diff --git a/great_expectations/datasource/fluent/pandas_file_path_datasource.pyi b/great_expectations/datasource/fluent/pandas_file_path_datasource.pyi index 59689d512751..9e0b953688be 100644 --- a/great_expectations/datasource/fluent/pandas_file_path_datasource.pyi +++ b/great_expectations/datasource/fluent/pandas_file_path_datasource.pyi @@ -1,5 +1,5 @@ from logging import Logger -from typing import ClassVar, Dict, List, Type +from typing import ClassVar, List, Type from great_expectations.datasource.fluent.file_path_data_asset import _FilePathDataAsset from great_expectations.datasource.fluent.interfaces import DataAsset as DataAsset @@ -23,4 +23,4 @@ class XMLAsset(_FilePathDataAsset): ... 
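Because `assets` is now a sequence rather than a mapping (as the type-stub change above reflects), call sites that indexed it by name need the equivalent updates; roughly, and assuming a fluent `Datasource` instance:

from great_expectations.datasource.fluent.interfaces import Datasource


def connection_check(datasource: Datasource) -> None:
    # old: for asset in datasource.assets.values(): ...
    # new: iterate the sequence directly
    for asset in datasource.assets:
        asset.test_connection()


def lookup(datasource: Datasource, asset_name: str):
    # old: datasource.assets[asset_name]
    # new: name-based accessor; raises LookupError if the name is absent
    return datasource.get_asset(asset_name)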
class _PandasFilePathDatasource(_PandasDatasource): asset_types: ClassVar[List[Type[DataAsset]]] - assets: Dict[str, _FilePathDataAsset] + assets: List[_FilePathDataAsset] diff --git a/great_expectations/datasource/fluent/pandas_filesystem_datasource.py b/great_expectations/datasource/fluent/pandas_filesystem_datasource.py index 6f3a968899c8..fdc2ef40ce0c 100644 --- a/great_expectations/datasource/fluent/pandas_filesystem_datasource.py +++ b/great_expectations/datasource/fluent/pandas_filesystem_datasource.py @@ -54,7 +54,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py index 7e4a05fb1ce5..dad0076d0efc 100644 --- a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py @@ -122,7 +122,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/pandas_s3_datasource.py b/great_expectations/datasource/fluent/pandas_s3_datasource.py index 83a7aef07aaf..f60eb58e8dbc 100644 --- a/great_expectations/datasource/fluent/pandas_s3_datasource.py +++ b/great_expectations/datasource/fluent/pandas_s3_datasource.py @@ -95,7 +95,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/schemas/Datasource.json b/great_expectations/datasource/fluent/schemas/Datasource.json index fd322a9c4a34..38aa14e82ecf 100644 --- a/great_expectations/datasource/fluent/schemas/Datasource.json +++ b/great_expectations/datasource/fluent/schemas/Datasource.json @@ -19,9 +19,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/DataAsset" } } diff --git a/great_expectations/datasource/fluent/schemas/PandasAzureBlobStorageDatasource.json b/great_expectations/datasource/fluent/schemas/PandasAzureBlobStorageDatasource.json index 573e206d7482..14ee344f16d2 100644 --- a/great_expectations/datasource/fluent/schemas/PandasAzureBlobStorageDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasAzureBlobStorageDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/PandasDBFSDatasource.json b/great_expectations/datasource/fluent/schemas/PandasDBFSDatasource.json index 52002adc1301..3bdc5f40f9a6 100644 --- a/great_expectations/datasource/fluent/schemas/PandasDBFSDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasDBFSDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": 
{ "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/PandasDatasource.json b/great_expectations/datasource/fluent/schemas/PandasDatasource.json index 7e6b89e128a7..e5210f04a3a0 100644 --- a/great_expectations/datasource/fluent/schemas/PandasDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_PandasDataAsset" } } diff --git a/great_expectations/datasource/fluent/schemas/PandasFilesystemDatasource.json b/great_expectations/datasource/fluent/schemas/PandasFilesystemDatasource.json index a509892950f1..53d3dd862258 100644 --- a/great_expectations/datasource/fluent/schemas/PandasFilesystemDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasFilesystemDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/PandasGoogleCloudStorageDatasource.json b/great_expectations/datasource/fluent/schemas/PandasGoogleCloudStorageDatasource.json index a5094fb697f4..c7d2c2c92eb9 100644 --- a/great_expectations/datasource/fluent/schemas/PandasGoogleCloudStorageDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasGoogleCloudStorageDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/PandasS3Datasource.json b/great_expectations/datasource/fluent/schemas/PandasS3Datasource.json index 8f1c4eaff33b..d554830a66dc 100644 --- a/great_expectations/datasource/fluent/schemas/PandasS3Datasource.json +++ b/great_expectations/datasource/fluent/schemas/PandasS3Datasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/PostgresDatasource.json b/great_expectations/datasource/fluent/schemas/PostgresDatasource.json index f23d9f77a98b..582202bf8d98 100644 --- a/great_expectations/datasource/fluent/schemas/PostgresDatasource.json +++ b/great_expectations/datasource/fluent/schemas/PostgresDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "anyOf": [ { "$ref": "#/definitions/TableAsset" diff --git a/great_expectations/datasource/fluent/schemas/SQLDatasource.json b/great_expectations/datasource/fluent/schemas/SQLDatasource.json index 78e8e5911350..5890d8df3c59 100644 --- a/great_expectations/datasource/fluent/schemas/SQLDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SQLDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "anyOf": [ { "$ref": "#/definitions/TableAsset" diff --git a/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json 
b/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json index 8ac66d9c372e..3231907f17d8 100644 --- a/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkAzureBlobStorageDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource.json b/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource.json index a4fec896d44c..aa15bfb979aa 100644 --- a/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkDBFSDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/SparkDatasource.json b/great_expectations/datasource/fluent/schemas/SparkDatasource.json index 619cc16fbd9b..fc9ccbf50982 100644 --- a/great_expectations/datasource/fluent/schemas/SparkDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/DataFrameAsset" } } diff --git a/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json b/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json index c1ac1808a24e..5c34b64c50cb 100644 --- a/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkFilesystemDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json b/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json index 2965d75b1c28..6a40149d0bf2 100644 --- a/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkGoogleCloudStorageDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json b/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json index b0cddc81ac3c..0a0aef7e8246 100644 --- a/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json +++ b/great_expectations/datasource/fluent/schemas/SparkS3Datasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "$ref": "#/definitions/_FilePathDataAsset" } }, diff --git a/great_expectations/datasource/fluent/schemas/SqliteDatasource.json b/great_expectations/datasource/fluent/schemas/SqliteDatasource.json index 047e3ba560c3..7ac4e6525262 
100644 --- a/great_expectations/datasource/fluent/schemas/SqliteDatasource.json +++ b/great_expectations/datasource/fluent/schemas/SqliteDatasource.json @@ -23,9 +23,9 @@ }, "assets": { "title": "Assets", - "default": {}, - "type": "object", - "additionalProperties": { + "default": [], + "type": "array", + "items": { "anyOf": [ { "$ref": "#/definitions/TableAsset" diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py index dc2b043d7244..f869a7361235 100644 --- a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py @@ -129,7 +129,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/spark_datasource.py b/great_expectations/datasource/fluent/spark_datasource.py index 93e0072f6da7..908c7a9f3be9 100644 --- a/great_expectations/datasource/fluent/spark_datasource.py +++ b/great_expectations/datasource/fluent/spark_datasource.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, ClassVar, - Dict, Generic, List, Optional, @@ -203,7 +202,7 @@ class SparkDatasource(_SparkDatasource): # instance attributes type: Literal["spark"] = "spark" - assets: Dict[str, DataFrameAsset] = {} # type: ignore[assignment] + assets: List[DataFrameAsset] = [] # type: ignore[assignment] def test_connection(self, test_assets: bool = True) -> None: ... diff --git a/great_expectations/datasource/fluent/spark_file_path_datasource.py b/great_expectations/datasource/fluent/spark_file_path_datasource.py index 2a3a12c2fd9f..278909584944 100644 --- a/great_expectations/datasource/fluent/spark_file_path_datasource.py +++ b/great_expectations/datasource/fluent/spark_file_path_datasource.py @@ -1,7 +1,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, ClassVar, Dict, List, Type +from typing import TYPE_CHECKING, ClassVar, List, Type import pydantic from pydantic import Field @@ -41,4 +41,4 @@ class _SparkFilePathDatasource(_SparkDatasource): asset_types: ClassVar[List[Type[DataAsset]]] = [CSVAsset] # instance attributes - assets: Dict[str, _FilePathDataAsset] = {} # type: ignore[assignment] + assets: List[_FilePathDataAsset] = [] # type: ignore[assignment] diff --git a/great_expectations/datasource/fluent/spark_filesystem_datasource.py b/great_expectations/datasource/fluent/spark_filesystem_datasource.py index 3dbb214af08c..7ffcc2f8bb6f 100644 --- a/great_expectations/datasource/fluent/spark_filesystem_datasource.py +++ b/great_expectations/datasource/fluent/spark_filesystem_datasource.py @@ -52,7 +52,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py index ad5275aa9d92..4e3cd24ec7a2 100644 --- a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py @@ -129,7 +129,7 @@ def test_connection(self, test_assets: bool = True) -> None: 
) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/spark_s3_datasource.py b/great_expectations/datasource/fluent/spark_s3_datasource.py index 86f48c7e59f6..dcd88f4d58f4 100644 --- a/great_expectations/datasource/fluent/spark_s3_datasource.py +++ b/great_expectations/datasource/fluent/spark_s3_datasource.py @@ -99,7 +99,7 @@ def test_connection(self, test_assets: bool = True) -> None: ) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset.test_connection() def _build_data_connector( diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index 2278e6e4364e..0b9a0ba300ee 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -878,7 +878,7 @@ class SQLDatasource(Datasource): ) # We need to explicitly add each asset type to the Union due to how # deserialization is implemented in our pydantic base model. - assets: Dict[str, Union[TableAsset, QueryAsset]] = {} + assets: List[Union[TableAsset, QueryAsset]] = [] # private attrs _cached_connection_string: Union[str, ConfigStr] = pydantic.PrivateAttr("") @@ -933,7 +933,7 @@ def test_connection(self, test_assets: bool = True) -> None: f"{str(e)}" ) from e if self.assets and test_assets: - for asset in self.assets.values(): + for asset in self.assets: asset._datasource = self asset.test_connection() diff --git a/tests/datasource/fluent/conftest.py b/tests/datasource/fluent/conftest.py index 24b4d3b808d1..f2b677efa817 100644 --- a/tests/datasource/fluent/conftest.py +++ b/tests/datasource/fluent/conftest.py @@ -29,8 +29,8 @@ ) from tests.sqlalchemy_test_doubles import Dialect, MockSaEngine -EXPERIMENTAL_DATASOURCE_TEST_DIR: Final = pathlib.Path(__file__).parent -PG_CONFIG_YAML_FILE: Final = EXPERIMENTAL_DATASOURCE_TEST_DIR / FileDataContext.GX_YML +FLUENT_DATASOURCE_TEST_DIR: Final = pathlib.Path(__file__).parent +PG_CONFIG_YAML_FILE: Final = FLUENT_DATASOURCE_TEST_DIR / FileDataContext.GX_YML logger = logging.getLogger(__name__) diff --git a/tests/datasource/fluent/integration/conftest.py b/tests/datasource/fluent/integration/conftest.py index 49430646497a..d14ed4400bd7 100644 --- a/tests/datasource/fluent/integration/conftest.py +++ b/tests/datasource/fluent/integration/conftest.py @@ -42,7 +42,7 @@ def default_pandas_data( pandas_ds.read_csv( filepath_or_buffer=csv_path / "yellow_tripdata_sample_2019-02.csv", ) - asset = pandas_ds.assets[DEFAULT_PANDAS_DATA_ASSET_NAME] + asset = pandas_ds.get_asset(asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME) batch_request = asset.build_batch_request() return context, pandas_ds, asset, batch_request @@ -63,7 +63,7 @@ def pandas_sql_data( sql=sqlalchemy.sql.text("SELECT * FROM my_table"), con=con, ) - asset = pandas_ds.assets[DEFAULT_PANDAS_DATA_ASSET_NAME] + asset = pandas_ds.get_asset(asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME) batch_request = asset.build_batch_request() return context, pandas_ds, asset, batch_request diff --git a/tests/datasource/fluent/test_config.py b/tests/datasource/fluent/test_config.py index 04f6a642cd3b..0ff1f6089b76 100644 --- a/tests/datasource/fluent/test_config.py +++ b/tests/datasource/fluent/test_config.py @@ -17,8 +17,16 @@ from great_expectations.core.yaml_handler import YAMLHandler from great_expectations.data_context import 
FileDataContext -from great_expectations.datasource.fluent.config import GxConfig -from great_expectations.datasource.fluent.constants import _ASSETS_KEY +from great_expectations.datasource.fluent.config import ( + GxConfig, + _convert_fluent_datasources_loaded_from_yaml_to_internal_object_representation, +) +from great_expectations.datasource.fluent.constants import ( + _ASSETS_KEY, + _DATA_ASSET_NAME_KEY, + _DATASOURCE_NAME_KEY, + _FLUENT_DATASOURCES_KEY, +) from great_expectations.datasource.fluent.interfaces import Datasource from great_expectations.datasource.fluent.sources import ( DEFAULT_PANDAS_DATA_ASSET_NAME, @@ -29,6 +37,7 @@ SplitterYearAndMonth, TableAsset, ) +from tests.datasource.fluent.conftest import FLUENT_DATASOURCE_TEST_DIR if TYPE_CHECKING: from pytest import FixtureRequest @@ -40,29 +49,28 @@ p = pytest.param -EXPERIMENTAL_DATASOURCE_TEST_DIR = pathlib.Path(__file__).parent -CSV_PATH = EXPERIMENTAL_DATASOURCE_TEST_DIR.joinpath( +CSV_PATH = FLUENT_DATASOURCE_TEST_DIR.joinpath( pathlib.Path("..", "..", "test_sets", "taxi_yellow_tripdata_samples") ) -PG_CONFIG_YAML_FILE = EXPERIMENTAL_DATASOURCE_TEST_DIR / FileDataContext.GX_YML +PG_CONFIG_YAML_FILE = FLUENT_DATASOURCE_TEST_DIR / FileDataContext.GX_YML PG_CONFIG_YAML_STR: Final[str] = PG_CONFIG_YAML_FILE.read_text() # TODO: create PG_CONFIG_YAML_FILE/STR from this dict COMPLEX_CONFIG_DICT: Final[dict] = { - "fluent_datasources": { - "my_pg_ds": { + _FLUENT_DATASOURCES_KEY: [ + { "connection_string": "postgresql://userName:@hostname/dbName", "kwargs": {"echo": True}, "name": "my_pg_ds", "type": "postgres", - "assets": { - "my_table_asset_wo_splitters": { + "assets": [ + { "name": "my_table_asset_wo_splitters", "table_name": "my_table", "type": "table", }, - "with_splitter": { + { "splitter": { "column_name": "my_column", "method_name": "split_on_year_and_month", @@ -71,7 +79,7 @@ "name": "with_splitter", "type": "table", }, - "with_sorters": { + { "order_by": [ {"key": "year"}, {"key": "month", "reverse": True}, @@ -80,20 +88,20 @@ "name": "with_sorters", "type": "table", }, - "with_dslish_sorters": { + { "order_by": ["year", "-month"], "table_name": "yet_another_table", "name": "with_dslish_sorters", "type": "table", }, - }, + ], }, - "my_pandas_filesystem_ds": { + { "type": "pandas_filesystem", "name": "my_pandas_filesystem_ds", "base_directory": __file__, - "assets": { - "my_csv_asset": { + "assets": [ + { "type": "csv", "name": "my_csv_asset", "batching_regex": r"yellow_tripdata_sample_(?P\d{4})-(?P\d{2}).csv", @@ -103,64 +111,37 @@ "pipeline_filename": "${pipeline_filename}", }, }, - "my_json_asset": { + { "type": "json", "name": "my_json_asset", "batching_regex": r"yellow_tripdata_sample_(?P\d{4})-(?P\d{2}).json", "connect_options": {"glob_directive": "**/*.json"}, "orient": "records", }, - }, + ], }, - } + ], } COMPLEX_CONFIG_JSON: Final[str] = json.dumps(COMPLEX_CONFIG_DICT) SIMPLE_DS_DICT: Final[dict] = { - "fluent_datasources": { - "my_ds": { - "type": "sql", - "name": "my_ds", - "connection_string": "sqlite://", - } - } -} - -SIMPLE_DS_DICT_WITH_DS_NAME_ERROR = { - "fluent_datasources": { - "my_ds": { - "type": "sql", - "name": "my_incorrect_ds", - "connection_string": "sqlite://", - } - } -} - -SIMPLE_DS_DICT_WITH_ASSET_NAME_ERROR = { - "fluent_datasources": { - "my_ds": { + _FLUENT_DATASOURCES_KEY: [ + { "type": "sql", "name": "my_ds", "connection_string": "sqlite://", - "assets": { - "my_csv_asset": { - "type": "csv", - "name": "my_incorrect_csv_asset", - "batching_regex": 
r"yellow_tripdata_sample_(?P\d{4})-(?P\d{2}).csv", - }, - }, - } - } + }, + ], } -COMBINED_FLUENT_AND_OLD_STYLE_CFG_DICT = { - "fluent_datasources": { - "my_ds": { +COMBINED_FLUENT_AND_OLD_STYLE_CFG_DICT: Final[dict] = { + _FLUENT_DATASOURCES_KEY: [ + { "type": "sql", "name": "my_ds", "connection_string": "sqlite://", - } - }, + }, + ], "name": "getting_started_datasource", "class_name": "Datasource", "execution_engine": { @@ -179,20 +160,20 @@ "class_name": "RuntimeDataConnector", "assets": { "my_runtime_asset_name": { - "batch_identifiers": ["runtime_batch_identifier_name"] - } + "batch_identifiers": ["runtime_batch_identifier_name"], + }, }, }, }, } DEFAULT_PANDAS_DATASOURCE_AND_DATA_ASSET_CONFIG_DICT: Final[dict] = { - "fluent_datasources": { - DEFAULT_PANDAS_DATASOURCE_NAME: { + _FLUENT_DATASOURCES_KEY: [ + { "type": "pandas", "name": DEFAULT_PANDAS_DATASOURCE_NAME, - "assets": { - DEFAULT_PANDAS_DATA_ASSET_NAME: { + "assets": [ + { "name": DEFAULT_PANDAS_DATA_ASSET_NAME, "type": "csv", "filepath_or_buffer": CSV_PATH @@ -200,7 +181,7 @@ "sep": "|", "names": ["col1", "col2"], }, - "my_csv_asset": { + { "name": "my_csv_asset", "type": "csv", "filepath_or_buffer": CSV_PATH @@ -208,9 +189,9 @@ "sep": "|", "names": ["col1", "col2"], }, - }, + ], }, - } + ], } @@ -264,10 +245,20 @@ def test_from_datasource(self, asset_dict: dict): ds_dict = { "name": "my_ds", "base_directory": pathlib.Path(__file__), - "assets": {asset_name: asset_dict_config}, + "assets": [ + asset_dict_config, + ], } datasource: Datasource = ds_class.parse_obj(ds_dict) - assert asset_dict_config == datasource.dict()["assets"][asset_name] + assert ( + asset_dict_config + == list( + filter( + lambda element: element["name"] == asset_name, + datasource.dict()["assets"], + ) + )[0] + ) def test_from_gx_config(self, asset_dict: dict): """ @@ -286,17 +277,35 @@ def test_from_gx_config(self, asset_dict: dict): ds_dict = { "type": "pandas_filesystem", + "name": "my_ds", "base_directory": pathlib.Path(__file__), - "assets": {"my_asset": asset_dict_config}, + "assets": [ + asset_dict_config, + ], } - gx_config = GxConfig.parse_obj({"fluent_datasources": {"my_ds": ds_dict}}) + gx_config = GxConfig.parse_obj( + { + _FLUENT_DATASOURCES_KEY: [ + ds_dict, + ] + } + ) gx_config_dict = gx_config.dict() print(f"gx_config_dict\n{pf(gx_config_dict)}") - assert ( - asset_dict - == gx_config_dict["fluent_datasources"]["my_ds"]["assets"]["my_asset"] - ) + my_datasoure_config_dict = list( + filter( + lambda element: element["name"] == "my_ds", + gx_config_dict[_FLUENT_DATASOURCES_KEY], + ) + )[0] + my_asset_config_dict = list( + filter( + lambda element: element["name"] == "my_asset", + my_datasoure_config_dict["assets"], + ) + )[0] + assert asset_dict == my_asset_config_dict def test_id_only_serialized_if_present(ds_dict_config: dict): @@ -306,7 +315,8 @@ def test_id_only_serialized_if_present(ds_dict_config: dict): no_ids: dict = {} # remove or add ids - for ds_name, ds in ds_dict_config["fluent_datasources"].items(): + for ds in ds_dict_config[_FLUENT_DATASOURCES_KEY]: + ds_name = ds[_DATASOURCE_NAME_KEY] with_ids[ds_name] = copy.deepcopy(ds) no_ids[ds_name] = copy.deepcopy(ds) @@ -317,16 +327,41 @@ def test_id_only_serialized_if_present(ds_dict_config: dict): no_ids[ds_name].pop("id", None) - for asset_name in ds["assets"].keys(): + with_ids[ds_name]["assets"] = { + asset_config[_DATA_ASSET_NAME_KEY]: asset_config + for asset_config in with_ids[ds_name]["assets"] + } + no_ids[ds_name]["assets"] = { + asset_config[_DATA_ASSET_NAME_KEY]: 
asset_config + for asset_config in no_ids[ds_name]["assets"] + } + + for asset_config in ds["assets"]: + asset_name = asset_config[_DATA_ASSET_NAME_KEY] asset_id = uuid.uuid4() all_ids.append(str(asset_id)) - with_ids[ds_name]["assets"][asset_name]["id"] = asset_id + with_ids[ds_name]["assets"][asset_name]["id"] = asset_id no_ids[ds_name]["assets"][asset_name].pop("id", None) - gx_config_no_ids = GxConfig.parse_obj({"fluent_datasources": no_ids}) - gx_config_with_ids = GxConfig.parse_obj({"fluent_datasources": with_ids}) + no_ids = ( + _convert_fluent_datasources_loaded_from_yaml_to_internal_object_representation( + config={ + _FLUENT_DATASOURCES_KEY: no_ids, + } + ) + ) + with_ids = ( + _convert_fluent_datasources_loaded_from_yaml_to_internal_object_representation( + config={ + _FLUENT_DATASOURCES_KEY: with_ids, + } + ) + ) + + gx_config_no_ids = GxConfig.parse_obj(no_ids) + gx_config_with_ids = GxConfig.parse_obj(with_ids) assert "id" not in str(gx_config_no_ids.dict()) assert "id" not in gx_config_no_ids.json() @@ -363,78 +398,24 @@ def test_load_config(inject_engine_lookup_double, load_method: Callable, input_) assert loaded assert loaded.datasources - for datasource in loaded.datasources.values(): + for datasource in loaded.datasources: assert isinstance(datasource, Datasource) -@pytest.mark.parametrize( - ["load_method", "input_"], - [ - p( - GxConfig.parse_obj, - SIMPLE_DS_DICT_WITH_DS_NAME_ERROR, - id="simple pg config dict with ds name error", - ), - p( - GxConfig.parse_raw, - json.dumps(SIMPLE_DS_DICT_WITH_DS_NAME_ERROR), - id="simple pg json with ds name error", - ), - ], -) -def test_load_incorrect_ds_config_raises_error( - inject_engine_lookup_double, load_method: Callable, input_ -): - with pytest.raises(pydantic.ValidationError) as exc_info: - _ = load_method(input_) - - assert ( - str(exc_info.value) - == '1 validation error for GxConfig\nfluent_datasources\n Datasource key "my_ds" is different from name "my_incorrect_ds" in its configuration. (type=value_error)' - ) - - -@pytest.mark.parametrize( - ["load_method", "input_"], - [ - p( - GxConfig.parse_obj, - SIMPLE_DS_DICT_WITH_ASSET_NAME_ERROR, - id="simple pg config dict with asset name error", - ), - p( - GxConfig.parse_raw, - json.dumps(SIMPLE_DS_DICT_WITH_ASSET_NAME_ERROR), - id="simple pg json with asset name error", - ), - ], -) -def test_load_incorrect_asset_config_raises_error( - inject_engine_lookup_double, load_method: Callable, input_ -): - with pytest.raises(pydantic.ValidationError) as exc_info: - _ = load_method(input_) - - assert ( - str(exc_info.value) - == '1 validation error for GxConfig\nfluent_datasources\n DataAsset key "my_csv_asset" is different from name "my_incorrect_csv_asset" in its configuration. 
(type=value_error)' - ) - - @pytest.mark.unit @pytest.mark.parametrize( ["config", "expected_error_loc", "expected_msg"], [ - p({}, ("fluent_datasources",), "field required", id="no datasources"), + p({}, (_FLUENT_DATASOURCES_KEY,), "field required", id="no datasources"), p( { - "fluent_datasources": { - "my_bad_ds_missing_type": { + _FLUENT_DATASOURCES_KEY: [ + { "name": "my_bad_ds_missing_type", - } - } + }, + ], }, - ("fluent_datasources",), + (_FLUENT_DATASOURCES_KEY,), "'my_bad_ds_missing_type' is missing a 'type' entry", id="missing 'type' field", ), @@ -465,8 +446,16 @@ def test_catch_bad_top_level_config( ["bad_asset_config", "expected_error_loc", "expected_msg"], [ p( - {"name": "missing `table_name`", "type": "table"}, - ("fluent_datasources", "assets", "missing `table_name`", "table_name"), + { + "name": "missing `table_name`", + "type": "table", + }, + ( + _FLUENT_DATASOURCES_KEY, + "assets", + 0, + "table_name", + ), "field required", id="missing `table_name`", ), @@ -481,9 +470,9 @@ def test_catch_bad_top_level_config( }, }, ( - "fluent_datasources", + _FLUENT_DATASOURCES_KEY, "assets", - "unknown splitter", + 0, "splitter", "method_name", ), @@ -498,18 +487,24 @@ def test_catch_bad_asset_configs( expected_error_loc: tuple, expected_msg: str, ): - config: dict = { - "my_test_ds": { + config: list = [ + { "type": "postgres", "name": "my_test_ds", "connection_string": "postgres://userName:@hostname/dbName", - "assets": {bad_asset_config["name"]: bad_asset_config}, - } - } + "assets": [ + bad_asset_config, + ], + }, + ] print(f" Config\n{pf(config)}\n") with pytest.raises(pydantic.ValidationError) as exc_info: - GxConfig.parse_obj({"fluent_datasources": config}) + GxConfig.parse_obj( + { + _FLUENT_DATASOURCES_KEY: config, + } + ) print(f"\n{exc_info.typename}:{exc_info.value}") @@ -623,7 +618,7 @@ def test_yaml_config_round_trip( pp(re_loaded) assert re_loaded - assert from_yaml_gx_config.dict() == re_loaded.dict() + assert sorted(from_yaml_gx_config.dict()) == sorted(re_loaded.dict()) assert dumped == re_loaded.yaml() @@ -643,7 +638,7 @@ def test_yaml_file_config_round_trip( pp(re_loaded) assert re_loaded - assert from_yaml_gx_config == re_loaded + assert sorted(from_yaml_gx_config.dict()) == sorted(re_loaded.dict()) def test_assets_key_presence( @@ -651,7 +646,7 @@ def test_assets_key_presence( ): ds_wo_assets = None ds_with_assets = None - for ds in from_yaml_gx_config.datasources.values(): + for ds in from_yaml_gx_config.datasources: if ds.assets: ds_with_assets = ds else: @@ -664,16 +659,16 @@ def test_assets_key_presence( f" dict from dumped yaml ->\n\n{pf(dumped_as_dict['fluent_datasources'], depth=2)}" ) - assert _ASSETS_KEY in dumped_as_dict["fluent_datasources"][ds_with_assets.name] - assert _ASSETS_KEY not in dumped_as_dict["fluent_datasources"][ds_wo_assets.name] + assert _ASSETS_KEY in dumped_as_dict[_FLUENT_DATASOURCES_KEY][ds_with_assets.name] + assert _ASSETS_KEY not in dumped_as_dict[_FLUENT_DATASOURCES_KEY][ds_wo_assets.name] def test_splitters_deserialization( inject_engine_lookup_double, from_all_config: GxConfig ): - table_asset: TableAsset = from_all_config.datasources["my_pg_ds"].assets[ - "with_splitter" - ] + table_asset: TableAsset = from_all_config.get_datasource( + datasource_name="my_pg_ds" + ).get_asset(asset_name="with_splitter") assert isinstance(table_asset.splitter, SplitterYearAndMonth) assert table_asset.splitter.method_name == "split_on_year_and_month" @@ -697,7 +692,7 @@ def test_custom_sorter_serialization( dumped: str = 
from_json_gx_config.json(indent=2) print(f" Dumped JSON ->\n\n{dumped}\n") - expected_sorter_strings: List[str] = COMPLEX_CONFIG_DICT["fluent_datasources"][ + expected_sorter_strings: List[str] = COMPLEX_CONFIG_DICT[_FLUENT_DATASOURCES_KEY][ "my_pg_ds" ]["assets"]["with_dslish_sorters"]["order_by"] @@ -719,17 +714,28 @@ def test_dict_default_pandas_config_round_trip(inject_engine_lookup_double): ) assert ( DEFAULT_PANDAS_DATA_ASSET_NAME - not in from_dict_default_pandas_config.fluent_datasources[ - DEFAULT_PANDAS_DATASOURCE_NAME - ].assets + not in from_dict_default_pandas_config.get_datasource( + datasource_name=DEFAULT_PANDAS_DATASOURCE_NAME + ).get_asset_names() ) dumped: dict = from_dict_default_pandas_config.dict() print(f" Dumped Dict ->\n\n{pf(dumped)}\n") - datasource_without_default_pandas_data_asset_config_dict["fluent_datasources"][ - DEFAULT_PANDAS_DATASOURCE_NAME - ]["assets"].pop(DEFAULT_PANDAS_DATA_ASSET_NAME) + default_pandas_datasoure_config_dict = list( + filter( + lambda element: element["name"] == DEFAULT_PANDAS_DATASOURCE_NAME, + datasource_without_default_pandas_data_asset_config_dict[ + _FLUENT_DATASOURCES_KEY + ], + ) + )[0] + default_pandas_datasoure_config_dict["assets"] = list( + filter( + lambda element: element["name"] != DEFAULT_PANDAS_DATA_ASSET_NAME, + default_pandas_datasoure_config_dict["assets"], + ) + ) assert datasource_without_default_pandas_data_asset_config_dict == dumped re_loaded: GxConfig = GxConfig.parse_obj(dumped) @@ -743,14 +749,25 @@ def test_dict_default_pandas_config_round_trip(inject_engine_lookup_double): only_default_pandas_datasource_and_data_asset_config_dict = copy.deepcopy( DEFAULT_PANDAS_DATASOURCE_AND_DATA_ASSET_CONFIG_DICT ) - only_default_pandas_datasource_and_data_asset_config_dict["fluent_datasources"][ - DEFAULT_PANDAS_DATASOURCE_NAME - ]["assets"].pop("my_csv_asset") + default_pandas_datasoure_config_dict = list( + filter( + lambda element: element["name"] == DEFAULT_PANDAS_DATASOURCE_NAME, + only_default_pandas_datasource_and_data_asset_config_dict[ + _FLUENT_DATASOURCES_KEY + ], + ) + )[0] + default_pandas_datasoure_config_dict["assets"] = list( + filter( + lambda element: element["name"] != "my_csv_asset", + default_pandas_datasoure_config_dict["assets"], + ) + ) from_dict_only_default_pandas_config = GxConfig.parse_obj( only_default_pandas_datasource_and_data_asset_config_dict ) - assert from_dict_only_default_pandas_config.fluent_datasources == {} + assert from_dict_only_default_pandas_config.fluent_datasources == [] @pytest.fixture @@ -791,7 +808,7 @@ def test_config_substitution_retains_original_value_on_save( ): original: dict = cast( dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) - )["fluent_datasources"]["my_sqlite_ds_w_subs"] + )[_FLUENT_DATASOURCES_KEY]["my_sqlite_ds_w_subs"] from great_expectations import get_context @@ -805,9 +822,7 @@ def test_config_substitution_retains_original_value_on_save( my_conn_str = f"sqlite:///{sqlite_database_path}" monkeypatch.setenv("MY_CONN_STR", my_conn_str) - ds_w_subs: SqliteDatasource = context.fluent_config.datasources[ # type: ignore[assignment] - "my_sqlite_ds_w_subs" - ] + ds_w_subs: SqliteDatasource = context.fluent_config.get_datasource(datasource_name="my_sqlite_ds_w_subs") # type: ignore[assignment] assert str(ds_w_subs.connection_string) == r"${MY_CONN_STR}" assert ( @@ -821,10 +836,7 @@ def test_config_substitution_retains_original_value_on_save( round_tripped = cast( dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) - 
)["fluent_datasources"]["my_sqlite_ds_w_subs"] - - # FIXME: serialized items should not have name - round_tripped.pop("name") + )[_FLUENT_DATASOURCES_KEY]["my_sqlite_ds_w_subs"] assert round_tripped == original @@ -842,7 +854,7 @@ def test_config_substitution_retains_original_value_on_save_w_run_time_mods( original: dict = cast( dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) - )["fluent_datasources"] + )[_FLUENT_DATASOURCES_KEY] assert original.get("my_sqlite_ds_w_subs") # will be modified assert original.get("my_pg_ds") # will be deleted assert not original.get("my_sqlite") # will be added @@ -875,7 +887,7 @@ def test_config_substitution_retains_original_value_on_save_w_run_time_mods( round_tripped_datasources = cast( dict, yaml.load(file_dc_config_file_with_substitutions.read_text()) - )["fluent_datasources"] + )[_FLUENT_DATASOURCES_KEY] assert round_tripped_datasources["my_new_one"] assert round_tripped_datasources["my_sqlite_ds_w_subs"]["assets"]["new_asset"] diff --git a/tests/datasource/fluent/test_metadatasource.py b/tests/datasource/fluent/test_metadatasource.py index 4f6593d63c1d..8c1d8d26ee57 100644 --- a/tests/datasource/fluent/test_metadatasource.py +++ b/tests/datasource/fluent/test_metadatasource.py @@ -13,6 +13,9 @@ from great_expectations.core.yaml_handler import YAMLHandler from great_expectations.data_context import AbstractDataContext, FileDataContext from great_expectations.datasource.fluent.config import GxConfig +from great_expectations.datasource.fluent.constants import ( + _FLUENT_DATASOURCES_KEY, +) from great_expectations.datasource.fluent.interfaces import ( BatchRequest, BatchRequestOptions, @@ -338,9 +341,15 @@ def test_minimal_ds_to_asset_flow(context_sources_cleanup): class RedAsset(DataAsset): type = "red" + def test_connection(self): + ... + class BlueAsset(DataAsset): type = "blue" + def test_connection(self): + ... + class PurpleDatasource(Datasource): asset_types = [RedAsset, BlueAsset] type: str = "purple" @@ -354,7 +363,7 @@ def test_connection(self): def add_red_asset(self, asset_name: str) -> RedAsset: asset = RedAsset(name=asset_name) - self.assets[asset_name] = asset + self._add_asset(asset=asset) return asset # 2. 
Get context @@ -388,11 +397,11 @@ def context_config_data( def assert_fluent_datasource_content( - config_file_path: str, fluent_datasource_config: dict + config_file_path: pathlib.Path, fluent_datasource_config: dict ): config = yaml.load(config_file_path.read_text()) - assert "fluent_datasources" in config - assert config["fluent_datasources"] == fluent_datasource_config + assert _FLUENT_DATASOURCES_KEY in config + assert config[_FLUENT_DATASOURCES_KEY] == fluent_datasource_config @pytest.fixture @@ -408,14 +417,13 @@ def context_with_fluent_datasource( ) assert 1 == len(context.datasources) assert_fluent_datasource_content( - config_file_path, - { + config_file_path=config_file_path, + fluent_datasource_config={ DEFAULT_CRUD_DATASOURCE_NAME: { "base_directory": str(data_dir), "data_context_root_directory": str(config_file_path.parent), - "name": DEFAULT_CRUD_DATASOURCE_NAME, "type": "pandas_filesystem", - } + }, }, ) return context, config_file_path, data_dir @@ -440,18 +448,16 @@ def test_add_datasource_with_datasource_object( context.sources.add_pandas_filesystem(datasource=new_datasource) assert len(context.datasources) == 2 assert_fluent_datasource_content( - config_file_path, - { - "new_datasource": { + config_file_path=config_file_path, + fluent_datasource_config={ + "pandas_datasource": { "base_directory": str(data_dir), "data_context_root_directory": str(config_file_path.parent), - "name": "new_datasource", "type": "pandas_filesystem", }, - "pandas_datasource": { + "new_datasource": { "base_directory": str(data_dir), "data_context_root_directory": str(config_file_path.parent), - "name": "pandas_datasource", "type": "pandas_filesystem", }, }, @@ -477,14 +483,13 @@ def test_update_datasource(context_with_fluent_datasource, use_positional_arg): data_context_root_directory=config_file_path.parent, ) assert_fluent_datasource_content( - config_file_path, - { + config_file_path=config_file_path, + fluent_datasource_config={ DEFAULT_CRUD_DATASOURCE_NAME: { "base_directory": str(data_dir_2), "data_context_root_directory": str(config_file_path.parent), - "name": DEFAULT_CRUD_DATASOURCE_NAME, "type": "pandas_filesystem", - } + }, }, ) @@ -497,14 +502,13 @@ def test_update_datasource_with_datasource_object( context, config_file_path, data_dir = context_with_fluent_datasource datasource = context.get_datasource(DEFAULT_CRUD_DATASOURCE_NAME) assert_fluent_datasource_content( - config_file_path, - { + config_file_path=config_file_path, + fluent_datasource_config={ DEFAULT_CRUD_DATASOURCE_NAME: { "base_directory": str(data_dir), "data_context_root_directory": str(config_file_path.parent), - "name": DEFAULT_CRUD_DATASOURCE_NAME, "type": "pandas_filesystem", - } + }, }, ) @@ -519,21 +523,19 @@ def test_update_datasource_with_datasource_object( context.sources.update_pandas_filesystem(datasource=datasource) assert_fluent_datasource_content( - config_file_path, - { + config_file_path=config_file_path, + fluent_datasource_config={ DEFAULT_CRUD_DATASOURCE_NAME: { "base_directory": str(data_dir), "data_context_root_directory": str(config_file_path.parent), - "name": DEFAULT_CRUD_DATASOURCE_NAME, "type": "pandas_filesystem", "assets": { "csv_asset": { "batching_regex": "(?P.*).csv", - "name": "csv_asset", "type": "csv", - } + }, }, - } + }, }, ) @@ -559,18 +561,16 @@ def test_add_or_update_datasource_using_add( data_context_root_directory=config_file_path.parent, ) assert_fluent_datasource_content( - config_file_path, - { - DEFAULT_CRUD_DATASOURCE_NAME: { - "base_directory": str(data_dir), + 
config_file_path=config_file_path, + fluent_datasource_config={ + f"{DEFAULT_CRUD_DATASOURCE_NAME}_2": { + "base_directory": str(data_dir_2), "data_context_root_directory": str(config_file_path.parent), - "name": DEFAULT_CRUD_DATASOURCE_NAME, "type": "pandas_filesystem", }, - f"{DEFAULT_CRUD_DATASOURCE_NAME}_2": { - "base_directory": str(data_dir_2), + DEFAULT_CRUD_DATASOURCE_NAME: { + "base_directory": str(data_dir), "data_context_root_directory": str(config_file_path.parent), - "name": f"{DEFAULT_CRUD_DATASOURCE_NAME}_2", "type": "pandas_filesystem", }, }, @@ -598,14 +598,13 @@ def test_add_or_update_datasource_using_update( data_context_root_directory=config_file_path.parent, ) assert_fluent_datasource_content( - config_file_path, - { + config_file_path=config_file_path, + fluent_datasource_config={ DEFAULT_CRUD_DATASOURCE_NAME: { "base_directory": str(data_dir_2), "data_context_root_directory": str(config_file_path.parent), - "name": DEFAULT_CRUD_DATASOURCE_NAME, "type": "pandas_filesystem", - } + }, }, ) diff --git a/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py b/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py index 4f25e8a05589..fad9070029c2 100644 --- a/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py @@ -474,7 +474,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = pandas_abs_datasource - pandas_abs_datasource.assets = {"csv_asset": csv_asset} + pandas_abs_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = AzureBlobStorageDataConnector( datasource_name=pandas_abs_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_pandas_datasource.py b/tests/datasource/fluent/test_pandas_datasource.py index 4288591ac631..34c00971c350 100644 --- a/tests/datasource/fluent/test_pandas_datasource.py +++ b/tests/datasource/fluent/test_pandas_datasource.py @@ -375,9 +375,9 @@ def test_positional_arguments( ) _ = read_method(*positional_args.values()) # read_* returns a validator, but we just want to inspect the asset - asset = empty_data_context.sources.pandas_default.assets[ - DEFAULT_PANDAS_DATA_ASSET_NAME - ] + asset = empty_data_context.sources.pandas_default.get_asset( + asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME + ) for positional_arg_name, positional_arg in positional_args.items(): assert getattr(asset, positional_arg_name) == positional_arg @@ -394,7 +394,9 @@ def test_default_pandas_datasource_get_and_set( filepath_or_buffer=valid_file_path, ) assert isinstance(validator, Validator) - csv_data_asset_1 = pandas_datasource.assets[DEFAULT_PANDAS_DATA_ASSET_NAME] + csv_data_asset_1 = pandas_datasource.get_asset( + asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME + ) assert isinstance(csv_data_asset_1, _PandasDataAsset) assert csv_data_asset_1.name == DEFAULT_PANDAS_DATA_ASSET_NAME assert len(pandas_datasource.assets) == 1 @@ -403,7 +405,7 @@ def test_default_pandas_datasource_get_and_set( pandas_datasource = empty_data_context.sources.pandas_default assert pandas_datasource.name == DEFAULT_PANDAS_DATASOURCE_NAME assert len(pandas_datasource.assets) == 1 - assert pandas_datasource.assets[DEFAULT_PANDAS_DATA_ASSET_NAME] + assert pandas_datasource.get_asset(asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME) # ensure we overwrite the ephemeral data asset if no name is passed _ = pandas_datasource.read_csv(filepath_or_buffer=valid_file_path) @@ -416,7 +418,9 @@ def 
test_default_pandas_datasource_get_and_set( asset_name=expected_csv_data_asset_name, filepath_or_buffer=valid_file_path, ) - csv_data_asset_2 = pandas_datasource.assets[expected_csv_data_asset_name] + csv_data_asset_2 = pandas_datasource.get_asset( + asset_name=expected_csv_data_asset_name + ) assert csv_data_asset_2.name == expected_csv_data_asset_name assert len(pandas_datasource.assets) == 2 @@ -452,9 +456,9 @@ def test_dataframe_asset(empty_data_context: AbstractDataContext, test_df_pandas ) assert isinstance(validator, Validator) assert isinstance( - empty_data_context.sources.pandas_default.assets[ - DEFAULT_PANDAS_DATA_ASSET_NAME - ], + empty_data_context.sources.pandas_default.get_asset( + asset_name=DEFAULT_PANDAS_DATA_ASSET_NAME + ), DataFrameAsset, ) @@ -469,7 +473,7 @@ def test_dataframe_asset(empty_data_context: AbstractDataContext, test_df_pandas assert all( [ asset.dataframe.equals(test_df_pandas) # type: ignore[attr-defined] - for asset in empty_data_context.sources.pandas_default.assets.values() + for asset in empty_data_context.sources.pandas_default.assets ] ) diff --git a/tests/datasource/fluent/test_pandas_dbfs_datasource.py b/tests/datasource/fluent/test_pandas_dbfs_datasource.py index ab975bfd18b4..acb2a314dda4 100644 --- a/tests/datasource/fluent/test_pandas_dbfs_datasource.py +++ b/tests/datasource/fluent/test_pandas_dbfs_datasource.py @@ -91,7 +91,6 @@ def csv_asset(pandas_dbfs_datasource: PandasDBFSDatasource) -> _FilePathDataAsse return asset -# TODO: ALEX def bad_batching_regex_config( csv_path: pathlib.Path, ) -> tuple[re.Pattern, TestConnectionError]: @@ -107,9 +106,6 @@ def bad_batching_regex_config( return batching_regex, test_connection_error -# TODO: ALEX - - @pytest.fixture def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: regex = re.compile( @@ -204,7 +200,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = pandas_dbfs_datasource - pandas_dbfs_datasource.assets = {"csv_asset": csv_asset} + pandas_dbfs_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = DBFSDataConnector( datasource_name=pandas_dbfs_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_pandas_filesystem_datasource.py b/tests/datasource/fluent/test_pandas_filesystem_datasource.py index 3ae3b603077d..120b15ddfc62 100644 --- a/tests/datasource/fluent/test_pandas_filesystem_datasource.py +++ b/tests/datasource/fluent/test_pandas_filesystem_datasource.py @@ -704,7 +704,9 @@ def datasource_test_connection_error_messages( batching_regex=batching_regex, ) csv_asset._datasource = pandas_filesystem_datasource - pandas_filesystem_datasource.assets = {"csv_asset": csv_asset} + pandas_filesystem_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = FilesystemDataConnector( datasource_name=pandas_filesystem_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py b/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py index 3f55326ebc4e..58c534dcf504 100644 --- a/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py +++ b/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py @@ -430,7 +430,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = pandas_gcs_datasource - pandas_gcs_datasource.assets = {"csv_asset": csv_asset} + pandas_gcs_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = 
GoogleCloudStorageDataConnector( datasource_name=pandas_gcs_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_pandas_s3_datasource.py b/tests/datasource/fluent/test_pandas_s3_datasource.py index de02a75a5e35..84fdf6611b06 100644 --- a/tests/datasource/fluent/test_pandas_s3_datasource.py +++ b/tests/datasource/fluent/test_pandas_s3_datasource.py @@ -389,7 +389,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = pandas_s3_datasource - pandas_s3_datasource.assets = {"csv_asset": csv_asset} + pandas_s3_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = S3DataConnector( datasource_name=pandas_s3_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_postgres_datasource.py b/tests/datasource/fluent/test_postgres_datasource.py index 902375df3beb..1a78ffdbdb10 100644 --- a/tests/datasource/fluent/test_postgres_datasource.py +++ b/tests/datasource/fluent/test_postgres_datasource.py @@ -102,7 +102,7 @@ def test_construct_postgres_datasource(create_source: CreateSourceFixture): ) as source: assert source.name == "my_datasource" assert source.execution_engine_type is SqlAlchemyExecutionEngine - assert source.assets == {} + assert source.assets == [] def assert_table_asset( @@ -147,7 +147,7 @@ def test_add_table_asset_with_splitter(mocker, create_source: CreateSourceFixtur asset = source.add_table_asset(name="my_asset", table_name="my_table") asset.add_splitter_year_and_month(column_name="my_col") assert len(source.assets) == 1 - assert asset == list(source.assets.values())[0] + assert asset == source.assets[0] assert_table_asset( asset=asset, name="my_asset", @@ -174,7 +174,7 @@ def test_add_table_asset_with_no_splitter(mocker, create_source: CreateSourceFix asset = source.add_table_asset(name="my_asset", table_name="my_table") assert len(source.assets) == 1 - assert asset == list(source.assets.values())[0] + assert asset == source.assets[0] assert_table_asset( asset=asset, name="my_asset", @@ -221,7 +221,7 @@ def create_and_add_table_asset_without_testing_connection( ) # TODO: asset custom init table_asset._datasource = source - source.assets[table_asset.name] = table_asset + source.assets.append(table_asset) return source, table_asset @@ -780,9 +780,25 @@ def test_datasource_dict_has_properties(create_source): asset.add_sorters(["year", "month"]) source_dict = source.dict() pprint(source_dict) - assert isinstance(source_dict["assets"]["my_asset"]["order_by"], list) + assert isinstance( + list( + filter( + lambda element: element["name"] == "my_asset", + source_dict["assets"], + ) + )[0]["order_by"], + list, + ) # type should be in dumped dict even if not explicitly set - assert "type" in source_dict["assets"]["my_asset"] + assert ( + "type" + in list( + filter( + lambda element: element["name"] == "my_asset", + source_dict["assets"], + ) + )[0] + ) @pytest.mark.unit @@ -889,7 +905,9 @@ def bad_configuration_datasource( return PostgresDatasource( name="postgres_datasource", connection_string=connection_string, - assets={"table_asset": table_asset}, + assets=[ + table_asset, + ], ) diff --git a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py index d4b140d4fba0..9d95c6f7a4f5 100644 --- a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py @@ -471,7 +471,9 @@ def test_test_connection_failures( 
batching_regex=regex, ) csv_asset._datasource = spark_abs_datasource - spark_abs_datasource.assets = {"csv_asset": csv_asset} + spark_abs_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = AzureBlobStorageDataConnector( datasource_name=spark_abs_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_spark_datasource.py b/tests/datasource/fluent/test_spark_datasource.py index 8d8f607fd0ce..048dcb34c352 100644 --- a/tests/datasource/fluent/test_spark_datasource.py +++ b/tests/datasource/fluent/test_spark_datasource.py @@ -67,10 +67,7 @@ def test_dataframe_asset( assert len(datasource.assets) == 2 assert all( - [ - asset.dataframe.toPandas().equals(pandas_df) - for asset in datasource.assets.values() - ] + [asset.dataframe.toPandas().equals(pandas_df) for asset in datasource.assets] ) diff --git a/tests/datasource/fluent/test_spark_dbfs_datasource.py b/tests/datasource/fluent/test_spark_dbfs_datasource.py index b738715fb5f0..5de238ed53e7 100644 --- a/tests/datasource/fluent/test_spark_dbfs_datasource.py +++ b/tests/datasource/fluent/test_spark_dbfs_datasource.py @@ -183,7 +183,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = spark_dbfs_datasource - spark_dbfs_datasource.assets = {"csv_asset": csv_asset} + spark_dbfs_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = DBFSDataConnector( datasource_name=spark_dbfs_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_spark_filesystem_datasource.py b/tests/datasource/fluent/test_spark_filesystem_datasource.py index bf4bedf13e30..7cdc4e9221a1 100644 --- a/tests/datasource/fluent/test_spark_filesystem_datasource.py +++ b/tests/datasource/fluent/test_spark_filesystem_datasource.py @@ -356,7 +356,9 @@ def datasource_test_connection_error_messages( batching_regex=batching_regex, ) csv_asset._datasource = spark_filesystem_datasource - spark_filesystem_datasource.assets = {"csv_asset": csv_asset} + spark_filesystem_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = FilesystemDataConnector( datasource_name=spark_filesystem_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py b/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py index fa61caa32184..b5514aba539e 100644 --- a/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py @@ -427,7 +427,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = spark_gcs_datasource - spark_gcs_datasource.assets = {"csv_asset": csv_asset} + spark_gcs_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = GoogleCloudStorageDataConnector( datasource_name=spark_gcs_datasource.name, data_asset_name=csv_asset.name, diff --git a/tests/datasource/fluent/test_spark_s3_datasource.py b/tests/datasource/fluent/test_spark_s3_datasource.py index ed1a9786af68..30b11d12d950 100644 --- a/tests/datasource/fluent/test_spark_s3_datasource.py +++ b/tests/datasource/fluent/test_spark_s3_datasource.py @@ -289,7 +289,9 @@ def test_test_connection_failures( batching_regex=regex, ) csv_asset._datasource = spark_s3_datasource - spark_s3_datasource.assets = {"csv_asset": csv_asset} + spark_s3_datasource.assets = [ + csv_asset, + ] csv_asset._data_connector = S3DataConnector( datasource_name=spark_s3_datasource.name, data_asset_name=csv_asset.name, diff --git 
a/tests/datasource/fluent/test_viral_snippets.py b/tests/datasource/fluent/test_viral_snippets.py index 28f1f123d283..f5eae91ce9d5 100644 --- a/tests/datasource/fluent/test_viral_snippets.py +++ b/tests/datasource/fluent/test_viral_snippets.py @@ -101,10 +101,13 @@ def test_serialize_fluent_config( assert fluent_file_context.fluent_config.datasources - for ds_name, datasource in fluent_file_context.fluent_config.datasources.items(): + for ( + ds_name, + datasource, + ) in fluent_file_context.fluent_config.get_datasources_as_dict().items(): assert ds_name in dumped_yaml - for asset_name in datasource.assets.keys(): + for asset_name in datasource.get_asset_names(): assert asset_name in dumped_yaml @@ -124,7 +127,7 @@ def test_data_connectors_are_built_on_config_load(fluent_file_context: FileDataC dc_datasources[datasource.type].append(datasource.name) - for asset in datasource.assets.values(): + for asset in datasource.assets: assert isinstance(asset._data_connector, datasource.data_connector_type) print() @@ -164,7 +167,7 @@ def test_save_datacontext_persists_fluent_config( config_file = file_dc_config_dir_init / FileDataContext.GX_YML initial_yaml = config_file.read_text() - for ds_name in fluent_only_config.datasources: + for ds_name in fluent_only_config.get_datasource_names(): assert ds_name not in initial_yaml context: FileDataContext = get_context( @@ -179,7 +182,7 @@ def test_save_datacontext_persists_fluent_config( print("\n".join(diff)) - for ds_name in fluent_only_config.datasources: + for ds_name in fluent_only_config.get_datasource_names(): assert ds_name in final_yaml From 1e4359344483fb1bfc71ab4dae9c2be99a48a943 Mon Sep 17 00:00:00 2001 From: kenwade4 <95714847+kenwade4@users.noreply.github.com> Date: Wed, 12 Apr 2023 23:43:24 -0500 Subject: [PATCH 87/96] [BUGFIX] Remove spark from bic Expectations since it never worked for them (#7619) --- ...expect_column_values_bic_belong_to_country.py | 14 ++++++-------- .../expect_column_values_to_be_valid_bic.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py index eb918d4c506d..92dd0485e511 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py @@ -7,14 +7,12 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ( PandasExecutionEngine, - SparkDFExecutionEngine, ) from great_expectations.expectations.expectation import ColumnMapExpectation from great_expectations.expectations.metrics import ( ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes def bic_belong_to_country(bic: str, country_code: str) -> bool: @@ -36,13 +34,13 @@ class ColumnValuesBicBelongToCountry(ColumnMapMetricProvider): def _pandas(cls, column, country_code, **kwargs): return column.apply(partial(bic_belong_to_country, country_code=country_code)) - 
@column_condition_partial(engine=SparkDFExecutionEngine) - def _spark(cls, column, country_code, **kwargs): - @F.udf(sparktypes.BooleanType()) - def bic_belong_to_country_udf(bic: str) -> bool: - return bic_belong_to_country(bic, country_code) + # @column_condition_partial(engine=SparkDFExecutionEngine) + # def _spark(cls, column, country_code, **kwargs): + # @F.udf(sparktypes.BooleanType()) + # def bic_belong_to_country_udf(bic: str) -> bool: + # return bic_belong_to_country(bic, country_code) - return bic_belong_to_country_udf(column) + # return bic_belong_to_country_udf(column) # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine # @column_condition_partial(engine=SqlAlchemyExecutionEngine) diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py index 228afcec08f0..d28a787bc876 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py @@ -6,14 +6,14 @@ from great_expectations.core.expectation_configuration import ExpectationConfiguration from great_expectations.execution_engine import ( PandasExecutionEngine, - SparkDFExecutionEngine, ) from great_expectations.expectations.expectation import ColumnMapExpectation from great_expectations.expectations.metrics import ( ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes + +# from great_expectations.expectations.metrics.import_manager import F, sparktypes def is_valid_bic(bic_code: str) -> bool: @@ -24,9 +24,9 @@ def is_valid_bic(bic_code: str) -> bool: return False -@F.udf(sparktypes.BooleanType()) -def is_valid_bic_udf(bic: str) -> bool: - return is_valid_bic(bic) +# @F.udf(sparktypes.BooleanType()) +# def is_valid_bic_udf(bic: str) -> bool: +# return is_valid_bic(bic) class ColumnValuesToBeValidBic(ColumnMapMetricProvider): @@ -36,9 +36,9 @@ class ColumnValuesToBeValidBic(ColumnMapMetricProvider): def _pandas(cls, column, **kwargs): return column.apply(is_valid_bic) - @column_condition_partial(engine=SparkDFExecutionEngine) - def _spark(cls, column, **kwargs): - return is_valid_bic_udf(column) + # @column_condition_partial(engine=SparkDFExecutionEngine) + # def _spark(cls, column, **kwargs): + # return is_valid_bic_udf(column) # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine # @column_condition_partial(engine=SqlAlchemyExecutionEngine) From 2d52e9fc7aed0ee8d2be9669e97d8503956d258d Mon Sep 17 00:00:00 2001 From: Rob Gray <104205257+kwcanuck@users.noreply.github.com> Date: Thu, 13 Apr 2023 09:35:43 -0400 Subject: [PATCH 88/96] [DOCS] Corrects Step Numbering in How to instantiate a specific Filesystem Data Context (#7612) --- .../how_to_initialize_a_filesystem_data_context_in_python.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md 
b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md index 4144c99e901f..a11692339905 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python.md @@ -42,7 +42,7 @@ For purposes of this example, we will assume that we have an empty folder to ini path_to_empty_folder = '/my_gx_project/' ``` -### 2. Run GX's `get_context(...)` method +### 3. Run GX's `get_context(...)` method We will provide our empty folder's path to the GX library's `get_context(...)` method as the `context_root_dir` parameter. Because we are providing a path to an empty folder `get_context(...)` will initialize a Filesystem Data Context at that location. @@ -59,7 +59,7 @@ If a Data Context already exists at the provided `path`, the `get_context(...)` ::: -### 3. Verify the content of the returned Data Context +### 4. Verify the content of the returned Data Context From 9d88d47ba083f4fe6cc443d66d311427cb176d67 Mon Sep 17 00:00:00 2001 From: Rob Gray <104205257+kwcanuck@users.noreply.github.com> Date: Thu, 13 Apr 2023 10:24:46 -0400 Subject: [PATCH 89/96] [DOCS] Corrects Heading Issue in How to host and share Data Docs on Azure Blob Storage (#7620) --- .../how_to_host_and_share_data_docs_on_azure_blob_storage.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/docusaurus/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md b/docs/docusaurus/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md index b0970af335b2..35b7f0070abf 100644 --- a/docs/docusaurus/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md +++ b/docs/docusaurus/docs/guides/setup/configuring_data_docs/how_to_host_and_share_data_docs_on_azure_blob_storage.md @@ -13,8 +13,7 @@ Data Docs will be served using an Azure Blob Storage static website with restric - Have permission to create and configured an [Azure Storage account](https://docs.microsoft.com/en-us/azure/storage) - - + ## Steps ### 1. Create an Azure Blob Storage static website From d9726f63a03dbd288d8c84781cda4ee958d19275 Mon Sep 17 00:00:00 2001 From: Nathan Farmer Date: Thu, 13 Apr 2023 12:41:59 -0400 Subject: [PATCH 90/96] [RELEASE] 0.16.7 (#7622) --- docs/docusaurus/docs/changelog.md | 39 ++++++++++++++++++++++ docs/docusaurus/docs/components/_data.jsx | 2 +- docs/docusaurus/docusaurus.config.js | 2 +- docs_rtd/changelog.rst | 40 +++++++++++++++++++++++ great_expectations/deployment_version | 2 +- 5 files changed, 82 insertions(+), 3 deletions(-) diff --git a/docs/docusaurus/docs/changelog.md b/docs/docusaurus/docs/changelog.md index 7b73b2a150f6..efc217619573 100644 --- a/docs/docusaurus/docs/changelog.md +++ b/docs/docusaurus/docs/changelog.md @@ -10,6 +10,45 @@ title: Changelog - Deprecation warnings are accompanied by a moniker (as a code comment) indicating when they were deprecated. For example: `# deprecated-v0.13` - Changes to methods and parameters due to deprecation are also noted in the relevant docstrings. 
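For readers following the renumbered steps in the Filesystem Data Context guide above, here is a minimal sketch of the flow that guide describes (illustrative only, not part of any patch in this series; the folder path is a placeholder):

```python
# Illustrative sketch: initializing a Filesystem Data Context at an empty folder.
import great_expectations as gx

path_to_empty_folder = "/my_gx_project/"

# Because the folder is empty, get_context() initializes a new Filesystem Data
# Context there; if a Data Context already exists at that path, it is retrieved.
context = gx.get_context(context_root_dir=path_to_empty_folder)
print(context.root_directory)
```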
+### 0.16.7 +* [FEATURE] Added AssumeRole Feature ([#7547](https://github.com/great-expectations/great_expectations/pull/7547)) (thanks @Reactor11) +* [BUGFIX] Fix Fluent Spark `DataConnectors` on config load ([#7560](https://github.com/great-expectations/great_expectations/pull/7560)) +* [BUGFIX] `dataset_name` made optional parameter for Expectations ([#7603](https://github.com/great-expectations/great_expectations/pull/7603)) +* [BUGFIX] Misc gallery bugfixes ([#7611](https://github.com/great-expectations/great_expectations/pull/7611)) +* [BUGFIX] Remove spark from bic Expectations since it never worked for them ([#7619](https://github.com/great-expectations/great_expectations/pull/7619)) +* [DOCS] Use current minor version number in drop down instead of "Current" ([#7581](https://github.com/great-expectations/great_expectations/pull/7581)) +* [DOCS] Adds deprecation policy to changelog page ([#7585](https://github.com/great-expectations/great_expectations/pull/7585)) +* [DOCS] Use the actual version after release ([#7583](https://github.com/great-expectations/great_expectations/pull/7583)) +* [DOCS] Update some docs_rtd requirements so the venv can be created successfully ([#7580](https://github.com/great-expectations/great_expectations/pull/7580)) +* [DOCS] Add Cloud quickstart ([#7441](https://github.com/great-expectations/great_expectations/pull/7441)) +* [DOCS] Updates how the GX Cloud Beta is referenced in the Quickstart guide. ([#7594](https://github.com/great-expectations/great_expectations/pull/7594)) +* [DOCS] Corrects typo in code block within in-memory Pandas guide ([#7600](https://github.com/great-expectations/great_expectations/pull/7600)) +* [DOCS] Updates to Contributing through GitHub ([#7601](https://github.com/great-expectations/great_expectations/pull/7601)) (thanks @kwcanuck) +* [DOCS] Correct expectation documentation for expect_column_max_to_be_between ([#7597](https://github.com/great-expectations/great_expectations/pull/7597)) +* [DOCS] Add scripts under test for "How to create and edit Expectations with instant feedback from a sample Batch of data" ([#7615](https://github.com/great-expectations/great_expectations/pull/7615)) +* [DOCS] Corrects Step Numbering in How to instantiate a specific Filesystem Data Context ([#7612](https://github.com/great-expectations/great_expectations/pull/7612)) (thanks @kwcanuck) +* [DOCS] Corrects Heading Issue in How to host and share Data Docs on Azure Blob Storage ([#7620](https://github.com/great-expectations/great_expectations/pull/7620)) (thanks @kwcanuck) +* [MAINTENANCE] Warning non integer slice on row for SQLAlchemy 2.0 Compatibility ([#7501](https://github.com/great-expectations/great_expectations/pull/7501)) +* [MAINTENANCE] Warning MetaData.bind argument deprecated for SQLAlchemy 2.0 Compatibility ([#7502](https://github.com/great-expectations/great_expectations/pull/7502)) +* [MAINTENANCE] Capitalize "If" in rendering of conditional Expectations ([#7588](https://github.com/great-expectations/great_expectations/pull/7588)) +* [MAINTENANCE] Remove pip pins in CI and in contributing_setup.md ([#7587](https://github.com/great-expectations/great_expectations/pull/7587)) +* [MAINTENANCE] Remove ignore of warning deprecated api features detected sqlalchemy 2 ([#7584](https://github.com/great-expectations/great_expectations/pull/7584)) +* [MAINTENANCE] Fix sqlalchemy 2.0 incompatible warnings ([#7589](https://github.com/great-expectations/great_expectations/pull/7589)) +* [MAINTENANCE] Increase minimum scipy version package 
to 1.6.0 to take advantage of available capabilities. ([#7591](https://github.com/great-expectations/great_expectations/pull/7591)) +* [MAINTENANCE] Remove s3fs dependency and upper bound for boto3 ([#7598](https://github.com/great-expectations/great_expectations/pull/7598)) +* [MAINTENANCE] Move Fluent Datasources Sorters into `TYPE_CHECKING` block ([#7602](https://github.com/great-expectations/great_expectations/pull/7602)) +* [MAINTENANCE] Bump terser from 5.10.0 to 5.16.8 in /docs/docusaurus ([#7486](https://github.com/great-expectations/great_expectations/pull/7486)) (thanks @dependabot[bot]) +* [MAINTENANCE] Bump cookiecutter from 1.7.3 to 2.1.1 in /contrib/cli ([#7510](https://github.com/great-expectations/great_expectations/pull/7510)) (thanks @dependabot[bot]) +* [MAINTENANCE] Polish and ratchet requirements pins and upper bounds ([#7604](https://github.com/great-expectations/great_expectations/pull/7604)) +* [MAINTENANCE] small documentation updates ([#7606](https://github.com/great-expectations/great_expectations/pull/7606)) +* [MAINTENANCE] SqlAlchemy 2 Compatibility - `engine.execute()` ([#7469](https://github.com/great-expectations/great_expectations/pull/7469)) +* [MAINTENANCE] Deprecate ColumnExpectation in favor of ColumnAggregateExpectation ([#7609](https://github.com/great-expectations/great_expectations/pull/7609)) +* [MAINTENANCE] Deprecate TableExpectation in favor of BatchExpectation ([#7610](https://github.com/great-expectations/great_expectations/pull/7610)) +* [MAINTENANCE] Explicitly test relevant modules in Sqlalchemy compatibility pipeline ([#7613](https://github.com/great-expectations/great_expectations/pull/7613)) +* [MAINTENANCE] Fluent Datasources: Eliminate redundant Datasource name and DataAsset name from dictionary and JSON configuration ([#7573](https://github.com/great-expectations/great_expectations/pull/7573)) +* [CONTRIB] add check to calculate difference between 2 dates in month ([#7576](https://github.com/great-expectations/great_expectations/pull/7576)) (thanks @tb102122) +* [CONTRIB] Expect Column Values to be Valid UUID - Added SqlAlchemyExecutionEngine support ([#7592](https://github.com/great-expectations/great_expectations/pull/7592)) (thanks @asafla) + ### 0.16.6 * [FEATURE] Fluent `DataAsset` `batch_metadata` config variables ([#7513](https://github.com/great-expectations/great_expectations/pull/7513)) * [FEATURE] Add batch metadata to spark add_*_asset methods ([#7534](https://github.com/great-expectations/great_expectations/pull/7534)) diff --git a/docs/docusaurus/docs/components/_data.jsx b/docs/docusaurus/docs/components/_data.jsx index ae23f999e80d..b7588ee0e06c 100644 --- a/docs/docusaurus/docs/components/_data.jsx +++ b/docs/docusaurus/docs/components/_data.jsx @@ -1,5 +1,5 @@ export default { - release_version: 'great_expectations, version 0.16.6', + release_version: 'great_expectations, version 0.16.7', min_python: '3.7', max_python: '3.10' } diff --git a/docs/docusaurus/docusaurus.config.js b/docs/docusaurus/docusaurus.config.js index 0cc1a946d625..c3663a4222ee 100644 --- a/docs/docusaurus/docusaurus.config.js +++ b/docs/docusaurus/docusaurus.config.js @@ -259,7 +259,7 @@ module.exports = { lastVersion: 'current', versions: { current: { - label: '0.16.6', + label: '0.16.7', path: '' } } diff --git a/docs_rtd/changelog.rst b/docs_rtd/changelog.rst index 2a97fd800217..4ade7ca6081c 100644 --- a/docs_rtd/changelog.rst +++ b/docs_rtd/changelog.rst @@ -4,6 +4,46 @@ Changelog ######### +0.16.7 +----------------- +* [FEATURE] Added 
AssumeRole Feature ([#7547](https://github.com/great-expectations/great_expectations/pull/7547)) (thanks @Reactor11) +* [BUGFIX] Fix Fluent Spark `DataConnectors` on config load ([#7560](https://github.com/great-expectations/great_expectations/pull/7560)) +* [BUGFIX] `dataset_name` made optional parameter for Expectations ([#7603](https://github.com/great-expectations/great_expectations/pull/7603)) +* [BUGFIX] Misc gallery bugfixes ([#7611](https://github.com/great-expectations/great_expectations/pull/7611)) +* [BUGFIX] Remove spark from bic Expectations since it never worked for them ([#7619](https://github.com/great-expectations/great_expectations/pull/7619)) +* [DOCS] Use current minor version number in drop down instead of "Current" ([#7581](https://github.com/great-expectations/great_expectations/pull/7581)) +* [DOCS] Adds deprecation policy to changelog page ([#7585](https://github.com/great-expectations/great_expectations/pull/7585)) +* [DOCS] Use the actual version after release ([#7583](https://github.com/great-expectations/great_expectations/pull/7583)) +* [DOCS] Update some docs_rtd requirements so the venv can be created successfully ([#7580](https://github.com/great-expectations/great_expectations/pull/7580)) +* [DOCS] Add Cloud quickstart ([#7441](https://github.com/great-expectations/great_expectations/pull/7441)) +* [DOCS] Updates how the GX Cloud Beta is referenced in the Quickstart guide. ([#7594](https://github.com/great-expectations/great_expectations/pull/7594)) +* [DOCS] Corrects typo in code block within in-memory Pandas guide ([#7600](https://github.com/great-expectations/great_expectations/pull/7600)) +* [DOCS] Updates to Contributing through GitHub ([#7601](https://github.com/great-expectations/great_expectations/pull/7601)) (thanks @kwcanuck) +* [DOCS] Correct expectation documentation for expect_column_max_to_be_between ([#7597](https://github.com/great-expectations/great_expectations/pull/7597)) +* [DOCS] Add scripts under test for "How to create and edit Expectations with instant feedback from a sample Batch of data" ([#7615](https://github.com/great-expectations/great_expectations/pull/7615)) +* [DOCS] Corrects Step Numbering in How to instantiate a specific Filesystem Data Context ([#7612](https://github.com/great-expectations/great_expectations/pull/7612)) (thanks @kwcanuck) +* [DOCS] Corrects Heading Issue in How to host and share Data Docs on Azure Blob Storage ([#7620](https://github.com/great-expectations/great_expectations/pull/7620)) (thanks @kwcanuck) +* [MAINTENANCE] Warning non integer slice on row for SQLAlchemy 2.0 Compatibility ([#7501](https://github.com/great-expectations/great_expectations/pull/7501)) +* [MAINTENANCE] Warning MetaData.bind argument deprecated for SQLAlchemy 2.0 Compatibility ([#7502](https://github.com/great-expectations/great_expectations/pull/7502)) +* [MAINTENANCE] Capitalize "If" in rendering of conditional Expectations ([#7588](https://github.com/great-expectations/great_expectations/pull/7588)) +* [MAINTENANCE] Remove pip pins in CI and in contributing_setup.md ([#7587](https://github.com/great-expectations/great_expectations/pull/7587)) +* [MAINTENANCE] Remove ignore of warning deprecated api features detected sqlalchemy 2 ([#7584](https://github.com/great-expectations/great_expectations/pull/7584)) +* [MAINTENANCE] Fix sqlalchemy 2.0 incompatible warnings ([#7589](https://github.com/great-expectations/great_expectations/pull/7589)) +* [MAINTENANCE] Increase minimum scipy version package to 1.6.0 to take advantage of 
available capabilities. ([#7591](https://github.com/great-expectations/great_expectations/pull/7591)) +* [MAINTENANCE] Remove s3fs dependency and upper bound for boto3 ([#7598](https://github.com/great-expectations/great_expectations/pull/7598)) +* [MAINTENANCE] Move Fluent Datasources Sorters into `TYPE_CHECKING` block ([#7602](https://github.com/great-expectations/great_expectations/pull/7602)) +* [MAINTENANCE] Bump terser from 5.10.0 to 5.16.8 in /docs/docusaurus ([#7486](https://github.com/great-expectations/great_expectations/pull/7486)) (thanks @dependabot[bot]) +* [MAINTENANCE] Bump cookiecutter from 1.7.3 to 2.1.1 in /contrib/cli ([#7510](https://github.com/great-expectations/great_expectations/pull/7510)) (thanks @dependabot[bot]) +* [MAINTENANCE] Polish and ratchet requirements pins and upper bounds ([#7604](https://github.com/great-expectations/great_expectations/pull/7604)) +* [MAINTENANCE] small documentation updates ([#7606](https://github.com/great-expectations/great_expectations/pull/7606)) +* [MAINTENANCE] SqlAlchemy 2 Compatibility - `engine.execute()` ([#7469](https://github.com/great-expectations/great_expectations/pull/7469)) +* [MAINTENANCE] Deprecate ColumnExpectation in favor of ColumnAggregateExpectation ([#7609](https://github.com/great-expectations/great_expectations/pull/7609)) +* [MAINTENANCE] Deprecate TableExpectation in favor of BatchExpectation ([#7610](https://github.com/great-expectations/great_expectations/pull/7610)) +* [MAINTENANCE] Explicitly test relevant modules in Sqlalchemy compatibility pipeline ([#7613](https://github.com/great-expectations/great_expectations/pull/7613)) +* [MAINTENANCE] Fluent Datasources: Eliminate redundant Datasource name and DataAsset name from dictionary and JSON configuration ([#7573](https://github.com/great-expectations/great_expectations/pull/7573)) +* [CONTRIB] add check to calculate difference between 2 dates in month ([#7576](https://github.com/great-expectations/great_expectations/pull/7576)) (thanks @tb102122) +* [CONTRIB] Expect Column Values to be Valid UUID - Added SqlAlchemyExecutionEngine support ([#7592](https://github.com/great-expectations/great_expectations/pull/7592)) (thanks @asafla) + 0.16.6 ----------------- * [FEATURE] Fluent `DataAsset` `batch_metadata` config variables ([#7513](https://github.com/great-expectations/great_expectations/pull/7513)) diff --git a/great_expectations/deployment_version b/great_expectations/deployment_version index c3f65805f7b7..427cda05dcde 100644 --- a/great_expectations/deployment_version +++ b/great_expectations/deployment_version @@ -1 +1 @@ -0.16.6 +0.16.7 From bdee8e08d64d16433f0c8cdbf95f05654291b685 Mon Sep 17 00:00:00 2001 From: David Talbot <17692467+dctalbot@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:05:29 -0400 Subject: [PATCH 91/96] [BUGFIX] fix marshmallow schema for SQLAlchemy `connect_args` passthrough (#7614) --- great_expectations/data_context/types/base.py | 2 +- tests/data_context/test_data_context_types.py | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tests/data_context/test_data_context_types.py diff --git a/great_expectations/data_context/types/base.py b/great_expectations/data_context/types/base.py index b0fd97d739d7..2205e53bc406 100644 --- a/great_expectations/data_context/types/base.py +++ b/great_expectations/data_context/types/base.py @@ -1023,7 +1023,7 @@ class Meta: keys=fields.Str(), values=fields.Str(), required=False, allow_none=True ) connect_args = fields.Dict( - keys=fields.Str(), 
values=fields.Dict(), required=False, allow_none=True + keys=fields.Str(), values=fields.Raw(), required=False, allow_none=True ) azure_options = fields.Dict( keys=fields.Str(), values=fields.Str(), required=False, allow_none=True diff --git a/tests/data_context/test_data_context_types.py b/tests/data_context/test_data_context_types.py new file mode 100644 index 000000000000..229418f8fdad --- /dev/null +++ b/tests/data_context/test_data_context_types.py @@ -0,0 +1,40 @@ +from unittest.mock import Mock + +import pytest + +from great_expectations.data_context.types.base import ( + ExecutionEngineConfigSchema, +) + + +@pytest.mark.unit +@pytest.mark.parametrize( + "connect_args", + [ + {"connection_factory": Mock()}, + {"ssl_verify_cert": False}, + {"timeout": 30}, + { + "ssl": { + "ssl_ca": "ca.pem", + "ssl_cert": "client-cert.pem", + "ssl_key": "client-key.pem", + } + }, + ], +) +def test_execution_engine_config_conect_args(connect_args): + """ + this is part of a test-driven fix for: https://github.com/great-expectations/great_expectations/issues/6226 + connect_args examples are here: https://docs.sqlalchemy.org/en/20/core/engines.html#use-the-connect-args-dictionary-parameter + """ + cfg = ExecutionEngineConfigSchema().load( + { + "class_name": "SqlAlchemyExecutionEngine", + "module_name": "great_expectations.execution_engine", + "connection_string": "sqlite://", + "connect_args": connect_args, + } + ) + + assert cfg.connect_args == connect_args From cd0f2bdadd165027ba90ce76fdba7ef39c95bf85 Mon Sep 17 00:00:00 2001 From: Chetan Kini Date: Thu, 13 Apr 2023 14:19:14 -0400 Subject: [PATCH 92/96] [MAINTENANCE] Update `teams.yml` (#7623) --- .github/teams.yml | 58 +++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/.github/teams.yml b/.github/teams.yml index 69711b9f8aa7..d9751f5e8fff 100644 --- a/.github/teams.yml +++ b/.github/teams.yml @@ -2,15 +2,9 @@ # To add an additional team, simply add a top-level key with a list of users. 
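As a side note on the `connect_args` schema fix above: SQLAlchemy forwards `connect_args` verbatim to the DBAPI driver, so its values are heterogeneous (booleans, integers, nested dicts, even callables), which is why `fields.Raw()` is the appropriate value field. A minimal sketch of that pass-through, assuming a local SQLite driver:

```python
# Illustrative sketch: connect_args values are passed straight to the DBAPI,
# so they are not restricted to nested dicts (here: a bool and an int).
from sqlalchemy import create_engine

engine = create_engine(
    "sqlite://",
    connect_args={"check_same_thread": False, "timeout": 30},
)
with engine.connect() as conn:
    conn.exec_driver_sql("SELECT 1")
```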
# NOTE - this should be kept in sync with the GX org's teams -platform: - - '@NathanFarmer' # Nathan Farmer - - '@alexsherstinsky' # Alex Sherstinsky - - '@cdkini' # Chetan Kini - - '@billdirks' # Bill Dirks - - '@Kilo59' # Gabriel Gore - dx: - '@Shinnnyshinshin' # Will Shin + - '@alexsherstinsky' # Alex Sherstinsky - '@anthonyburdi' # Anthony Burdi - '@kenwade4' # Ken Wade @@ -20,35 +14,35 @@ devrel: - '@kyleaton' # Kyle Eaton - '@rdodev' # Ruben Orduz - '@talagluck' # Tal Gluck + - '@tjholsman' # TJ Holsman -cloud: - - '@roblim' # Rob Lim - - '@rreinoldsc' # Robby Reinold - - '@joshua-stauffer' # Josh Stauffer - - '@dctalbot' # David Talbot - - '@wookasz' # Łukasz Lempart - - '@josectobar' # José Tobar - - '@elenajdanova' # Elena Jdanova +core: - '@DrewHoo' # Drew Hoover - - '@lockettks' # Kim Mathieu - - '@superengi' # Saahir Foux - -# Aggregates a few different teams -core-team: - # Mario - - '@NathanFarmer' # Nathan Farmer - - '@alexsherstinsky' # Alex Sherstinsky - - '@cdkini' # Chetan Kini - - '@billdirks' # Bill Dirks - '@Kilo59' # Gabriel Gore - # Luigi + - '@NathanFarmer' # Nathan Farmer - '@Shinnnyshinshin' # Will Shin - - '@anthonyburdi' # Anthony Burdi - - '@kenwade4' # Ken Wade - # Misc - - '@abegong' # Abe Gong - - '@jcampbell' # James Campbell - - '@donaldheppner' # Don Heppner - '@Super-Tanner' # Tanner Beam + - '@abegong' # Abe Gong + - '@alexsherstinsky' # Alex Sherstinsky - '@allensallinger' # Allen Sallinger + - '@anthonyburdi' # Anthony Burdi + - '@billdirks' # Bill Dirks + - '@cdkini' # Chetan Kini + - '@dctalbot' # David Talbot + - '@donaldheppner' # Don Heppner + - '@elenajdanova' # Elena Jdanova + - '@jcampbell' # James Campbell + - '@josectobar' # José Tobar + - '@joshua-stauffer' # Josh Stauffer + - '@jshaikGX' # Javed Shaik + - '@kenwade4' # Ken Wade + - '@lockettks' # Kim Mathieu + - '@roblim' # Rob Lim + - '@rreinoldsc' # Robby Reinold - '@sujensen' # Susan Jensen + - '@tyler-hoffman' # Tyler Hoffman + - '@wookasz' # Łukasz Lempart + +bot: + - '@dependabot' + - '@dependabot[bot]' From 3d49f750057444479b1e63e312dd2c267b4caa95 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Thu, 13 Apr 2023 20:13:11 -0600 Subject: [PATCH 93/96] [DOCS] Update overview.md (#7627) --- .../expectations/creating_custom_expectations/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md index 34bec85972c5..b55f09da98da 100644 --- a/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md +++ b/docs/docusaurus/docs/guides/expectations/creating_custom_expectations/overview.md @@ -56,7 +56,7 @@ The code to achieve the first four steps looks somewhat different depending on t |-----------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Column Map Expectation](./how_to_create_custom_column_map_expectations.md) | [column_map_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/column_map_expectation_template.py) | | [Column Aggregate Expectation](./how_to_create_custom_column_aggregate_expectations.md) | 
[column_aggregate_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/column_aggregate_expectation_template.py) | -| [Batch Expectation](./how_to_create_custom_batch_expectations.md) | [table_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/batch_expectation_template.py) | +| [Batch Expectation](./how_to_create_custom_batch_expectations.md) | [batch_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/batch_expectation_template.py) | | [Regex-Based Column Map Expectation](./how_to_create_custom_regex_based_column_map_expectations.md) | [regex-based map column_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/regex_based_column_map_expectation_template.py) | | [Set-Based Column Map Expectation](./how_to_create_custom_set_based_column_map_expectations.md) | [set-based map_expectation_template](https://github.com/great-expectations/great_expectations/blob/develop/examples/expectations/set_based_column_map_expectation_template.py) | From 8ae452a03ab8f77dd79e40015e23b621c60d8635 Mon Sep 17 00:00:00 2001 From: William Shin Date: Thu, 13 Apr 2023 20:13:53 -0700 Subject: [PATCH 94/96] [BUGFIX] MapCondition Memory Inefficiencies in Spark (#7626) --- .../map_condition_auxilliary_methods.py | 5 +- .../test_checkpoint_result_format.py | 57 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py index 601343c0b43b..1454be2bc006 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py @@ -772,7 +772,10 @@ def _spark_map_condition_index( ) if result_format["result_format"] != "COMPLETE": - filtered.limit(result_format["partial_unexpected_count"]) + filtered = filtered.limit(result_format["partial_unexpected_count"]) + + # Prune the dataframe down only the columns we care about + filtered = filtered.select(columns_to_keep) for row in filtered.collect(): dict_to_add: dict = {} diff --git a/tests/checkpoint/test_checkpoint_result_format.py b/tests/checkpoint/test_checkpoint_result_format.py index 88ba93a13ca6..489c028a8ad2 100644 --- a/tests/checkpoint/test_checkpoint_result_format.py +++ b/tests/checkpoint/test_checkpoint_result_format.py @@ -2085,6 +2085,63 @@ def test_spark_result_format_in_checkpoint_pk_defined_one_expectation_summary_ou assert evrs[0]["results"][0]["result"].get("unexpected_index_query") is None +@pytest.mark.integration +def test_spark_result_format_in_checkpoint_pk_defined_one_expectation_summary_output_limit_1( + in_memory_runtime_context: AbstractDataContext, + batch_request_for_spark_unexpected_rows_and_index: dict, + reference_checkpoint_config_for_unexpected_column_names: dict, + expectation_config_expect_column_values_to_be_in_set: ExpectationConfiguration, + expected_unexpected_indices_output: list[dict[str, str | int]], +): + """ + What does this test? + - unexpected_index_column defined in Checkpoint only. 
+ - SUMMARY output, which means we have `partial_unexpected_index_list` only + - 1 Expectations added to suite + - limit is 1 so we only get 1 output in the `partial_unexpected_index_list` + """ + dict_to_update_checkpoint: dict = { + "result_format": { + "result_format": "SUMMARY", + "partial_unexpected_count": 1, + "unexpected_index_column_names": ["pk_1"], + } + } + context: DataContext = _add_expectations_and_checkpoint( + data_context=in_memory_runtime_context, + checkpoint_config=reference_checkpoint_config_for_unexpected_column_names, + expectations_list=[expectation_config_expect_column_values_to_be_in_set], + dict_to_update_checkpoint=dict_to_update_checkpoint, + ) + + result: CheckpointResult = context.run_checkpoint( + checkpoint_name="my_checkpoint", + expectation_suite_name="metrics_exp", + batch_request=batch_request_for_spark_unexpected_rows_and_index, + ) + evrs: List[ExpectationSuiteValidationResult] = result.list_validation_results() + + index_column_names: List[str] = evrs[0]["results"][0]["result"][ + "unexpected_index_column_names" + ] + assert index_column_names == ["pk_1"] + + first_result_full_list: List[Dict[str, Any]] = evrs[0]["results"][0]["result"].get( + "unexpected_index_list" + ) + assert not first_result_full_list + first_result_partial_list: List[Dict[str, Any]] = evrs[0]["results"][0]["result"][ + "partial_unexpected_index_list" + ] + assert first_result_partial_list == [ + { + "animals": "giraffe", + "pk_1": 3, + } + ] + assert evrs[0]["results"][0]["result"].get("unexpected_index_query") is None + + @pytest.mark.integration def test_spark_result_format_in_checkpoint_pk_defined_one_expectation_basic_output( in_memory_runtime_context: AbstractDataContext, From 6815b066a74e64a4d64adc0b6b6d93fa8d525413 Mon Sep 17 00:00:00 2001 From: Alex Sherstinsky Date: Fri, 14 Apr 2023 02:33:35 -0700 Subject: [PATCH 95/96] [MAINTENANCE] Utilize `NotImported` for SQLAlchemy, Google Cloud Services, Azure Blob Storage, and Spark import usage (#7617) --- .../expect_column_kurtosis_to_be_between.py | 2 +- ...to_have_difference_of_custom_percentage.py | 2 +- .../expect_column_skew_to_be_between.py | 44 +-- ...umn_values_to_be_in_set_spark_optimized.py | 2 +- ...values_to_be_string_integers_increasing.py | 10 +- ...xpect_column_values_to_match_xml_schema.py | 2 +- ...ct_multicolumn_sum_values_to_be_between.py | 2 +- ...sum_values_to_be_equal_to_single_column.py | 2 +- ...compared_to_avg_equivalent_days_of_week.py | 2 +- ...lon_coordinates_in_range_of_given_point.py | 2 +- ...column_values_to_be_lat_lon_in_timezone.py | 2 +- ...ect_column_values_bic_belong_to_country.py | 4 + .../expect_column_values_to_be_ascii.py | 2 +- .../expect_column_values_to_be_valid_bic.py | 4 +- ...column_values_to_be_valid_iana_timezone.py | 2 +- .../expect_column_values_to_be_valid_iban.py | 2 +- ...xpect_column_values_to_be_xml_parseable.py | 2 +- ...pair_values_to_match_prophet_date_model.py | 2 +- ...to_add_spark_support_for_an_expectation.md | 2 +- great_expectations/cli/batch_request.py | 19 +- great_expectations/cli/datasource.py | 7 - .../sqlalchemy_compatibility_wrappers.py | 21 +- great_expectations/core/batch.py | 30 +- great_expectations/core/util.py | 59 ++- .../store/database_store_backend.py | 65 ++-- .../configured_asset_azure_data_connector.py | 11 +- .../configured_asset_gcs_data_connector.py | 32 +- .../inferred_asset_azure_data_connector.py | 11 +- .../inferred_asset_gcs_data_connector.py | 32 +- .../datasource/data_connector/util.py | 34 +- 
.../google_cloud_storage_data_connector.py | 9 +- .../datasource/fluent/interfaces.py | 8 - .../pandas_azure_blob_storage_datasource.py | 15 +- .../pandas_google_cloud_storage_datasource.py | 39 +- .../spark_azure_blob_storage_datasource.py | 15 +- .../datasource/fluent/spark_datasource.py | 18 +- .../spark_google_cloud_storage_datasource.py | 39 +- .../datasource/fluent/sql_datasource.py | 39 +- .../execution_engine/execution_engine.py | 10 - .../pandas_execution_engine.py | 67 ++-- .../sparkdf_execution_engine.py | 74 ++-- .../split_and_sample/sparkdf_data_sampler.py | 40 +- .../split_and_sample/sparkdf_data_splitter.py | 42 +-- .../sqlalchemy_data_sampler.py | 40 +- .../execution_engine/sqlalchemy_batch_data.py | 28 +- .../sqlalchemy_execution_engine.py | 116 +++--- ...expect_column_values_to_be_in_type_list.py | 9 +- .../expect_column_values_to_be_of_type.py | 33 +- .../column_aggregate_metric_provider.py | 3 +- .../column_bootstrapped_ks_test_p_value.py | 7 - .../column_distinct_values.py | 11 +- .../column_histogram.py | 7 +- .../column_aggregate_metrics/column_max.py | 3 +- .../column_aggregate_metrics/column_mean.py | 3 +- .../column_aggregate_metrics/column_min.py | 3 +- ...ameterized_distribution_ks_test_p_value.py | 12 +- .../column_quantile_values.py | 107 +++--- .../column_standard_deviation.py | 11 +- .../column_aggregate_metrics/column_sum.py | 3 +- .../column_value_counts.py | 8 +- .../column_values_length_max.py | 3 +- .../column_values_length_min.py | 3 +- .../column_value_lengths.py | 3 +- .../column_values_between.py | 3 +- .../column_values_decreasing.py | 13 +- .../column_values_in_set.py | 2 +- .../column_values_increasing.py | 13 +- .../column_values_json_parseable.py | 2 +- .../column_values_match_json_schema.py | 2 +- .../column_values_match_strftime_format.py | 2 +- .../column_values_unique.py | 18 +- .../column_values_z_score.py | 3 +- .../column_pair_values_greater.py | 3 +- .../column_pair_values_in_set.py | 3 +- .../expectations/metrics/import_manager.py | 80 ---- .../column_condition_partial.py | 11 +- .../column_function_partial.py | 3 +- ...column_map_condition_auxilliary_methods.py | 5 +- .../column_pair_condition_partial.py | 11 +- .../column_pair_function_partial.py | 3 +- ...n_pair_map_condition_auxilliary_methods.py | 3 +- .../map_condition_auxilliary_methods.py | 41 +- .../multicolumn_condition_partial.py | 11 +- .../multicolumn_function_partial.py | 11 +- ...column_map_condition_auxilliary_methods.py | 3 +- .../compound_columns_unique.py | 11 +- .../multicolumn_sum_equal.py | 2 +- ...lect_column_values_unique_within_record.py | 3 +- .../metrics/query_metrics/query_column.py | 12 +- .../query_metrics/query_column_pair.py | 12 +- .../query_metrics/query_multiple_columns.py | 12 +- .../metrics/query_metrics/query_table.py | 12 +- .../query_metrics/query_template_values.py | 12 +- .../table_metrics/table_column_types.py | 26 +- .../metrics/table_metrics/table_head.py | 2 +- .../metrics/table_metrics/table_row_count.py | 3 +- .../expectations/metrics/util.py | 188 +++++----- great_expectations/optional_imports.py | 352 +++++++++++++++++- .../profile/user_configurable_profiler.py | 2 +- .../attributed_resolved_metrics.py | 23 +- great_expectations/self_check/util.py | 101 +++-- great_expectations/types/__init__.py | 18 +- great_expectations/util.py | 30 +- .../validator/metrics_calculator.py | 11 +- great_expectations/validator/validator.py | 30 +- tests/conftest.py | 29 +- ...est_configured_asset_gcs_data_connector.py | 80 ++-- 
.../test_data_connector_util.py | 8 +- .../test_inferred_asset_gcs_data_connector.py | 84 ++--- .../test_azure_blob_storage_data_connector.py | 11 +- ...est_google_cloud_storage_data_connector.py | 34 +- ...st_pandas_azure_blob_storage_datasource.py | 45 +-- .../fluent/test_pandas_dbfs_datasource.py | 15 - ..._pandas_google_cloud_storage_datasource.py | 68 ++-- ...est_spark_azure_blob_storage_datasource.py | 45 +-- ...t_spark_google_cloud_storage_datasource.py | 68 ++-- tests/datasource/test_new_datasource.py | 13 +- ...datasource_with_aws_glue_data_connector.py | 22 +- ..._new_datasource_with_sql_data_connector.py | 30 +- tests/execution_engine/conftest.py | 16 +- .../test_sparkdf_execution_engine_sampling.py | 6 - ...test_sparkdf_execution_engine_splitting.py | 31 +- .../test_pandas_execution_engine.py | 22 +- .../test_sparkdf_execution_engine.py | 55 +-- tests/expectations/metrics/test_core.py | 9 +- .../test_expectation_arguments.py | 12 +- .../sql_database/yaml_example_complete.py | 2 +- .../expect_column_max_to_be_between_custom.py | 2 +- ...ir_values_to_have_a_difference_of_three.py | 2 +- .../expect_column_values_to_equal_three.py | 2 +- ...icolumn_values_to_be_multiples_of_three.py | 2 +- tests/integration/spark/test_spark_config.py | 15 +- tests/test_utils.py | 34 +- 133 files changed, 1560 insertions(+), 1551 deletions(-) delete mode 100644 great_expectations/expectations/metrics/import_manager.py diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py index c4b6b32813be..2ea25ce15ce3 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_kurtosis_to_be_between.py @@ -14,7 +14,7 @@ ColumnAggregateMetricProvider, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F +from great_expectations.optional_imports import F class ColumnKurtosis(ColumnAggregateMetricProvider): diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_pair_values_to_have_difference_of_custom_percentage.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_pair_values_to_have_difference_of_custom_percentage.py index 456ecce90af2..e1fe7866479e 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_pair_values_to_have_difference_of_custom_percentage.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_pair_values_to_have_difference_of_custom_percentage.py @@ -9,11 +9,11 @@ SparkDFExecutionEngine, ) from great_expectations.expectations.expectation import ColumnPairMapExpectation -from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.map_metric_provider import ( ColumnPairMapMetricProvider, column_pair_condition_partial, ) +from great_expectations.optional_imports import F class ColumnPairValuesDiffCustomPercentageOrLess(ColumnPairMapMetricProvider): diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py index eb1067fb3c0c..ae444cd299db 100644 --- 
a/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_skew_to_be_between.py @@ -21,35 +21,19 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import ( + F, + sa_sql_expression_Select, + sqlalchemy_engine_Row, + sqlalchemy_ProgrammingError, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) -try: - from sqlalchemy.exc import ProgrammingError - from sqlalchemy.sql import Select -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - ProgrammingError = None - Select = None - -try: - from sqlalchemy.engine.row import Row -except ImportError: - try: - from sqlalchemy.engine.row import RowProxy - - Row = RowProxy - except ImportError: - logger.debug( - "Unable to load SqlAlchemy Row class; please upgrade you sqlalchemy installation to the latest version." - ) - RowProxy = None - Row = None - class ColumnSkew(ColumnAggregateMetricProvider): """MetricProvider Class for Aggregate Mean MetricProvider""" @@ -128,19 +112,21 @@ def _sqlalchemy( def _get_query_result(func, selectable, sqlalchemy_engine): - simple_query: Select = sa.select(func).select_from(selectable) + simple_query: sa_sql_expression_Select = sa.select(func).select_from(selectable) try: - result: Row = sqlalchemy_engine.execute(simple_query).fetchone()[0] + result: sqlalchemy_engine_Row = sqlalchemy_engine.execute( + simple_query + ).fetchone()[0] return result - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( f'{type(pe).__name__}: "{str(pe)}". Traceback: "{exception_traceback}".' ) logger.error(exception_message) - raise pe() + raise pe # @classmethod # def _get_evaluation_dependencies( diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py index a3bc3df43fac..65ece63ea536 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_in_set_spark_optimized.py @@ -6,8 +6,8 @@ from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ColumnAggregateMetricProvider -from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import F # This class defines a Metric to support your Expectation. 
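A note on the `_get_query_result` change above: besides renaming the imported SQLAlchemy symbols, it swaps `raise pe()` for `raise pe`. The caught object is already an exception instance, so calling it would raise a confusing `TypeError` and discard the original database error. The following is a minimal, library-agnostic sketch of the corrected pattern; `QueryError` and `run_query` are hypothetical stand-ins, not Great Expectations or SQLAlchemy names.

```python
import logging
import traceback

logger = logging.getLogger(__name__)


class QueryError(Exception):
    """Stand-in for a driver-specific error such as sqlalchemy.exc.ProgrammingError."""


def run_query(query: str) -> list:
    # Hypothetical query runner that always fails, to exercise the error path.
    raise QueryError(f"syntax error near {query!r}")


def get_query_result(query: str) -> list:
    try:
        return run_query(query)
    except QueryError as pe:
        exception_message = "An SQL syntax Exception occurred. "
        exception_message += (
            f'{type(pe).__name__}: "{pe}". Traceback: "{traceback.format_exc()}".'
        )
        logger.error(exception_message)
        # Re-raise the caught *instance*; `raise pe()` would try to call the
        # exception object and lose the original error.
        raise pe


if __name__ == "__main__":
    try:
        get_query_result("SELEC 1")
    except QueryError as err:
        print(f"caught: {err}")
```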
diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py index 419e635f2e5c..6309d958f5a2 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_be_string_integers_increasing.py @@ -23,9 +23,13 @@ ColumnMapMetricProvider, column_function_partial, ) -from great_expectations.expectations.metrics.import_manager import F, Window, sparktypes from great_expectations.expectations.metrics.metric_provider import metric_partial from great_expectations.expectations.registry import get_metric_kwargs +from great_expectations.optional_imports import ( + F, + pyspark_sql_Window, + sparktypes, +) from great_expectations.validator.metric_configuration import MetricConfiguration from great_expectations.validator.validator import ValidationDependencies @@ -95,7 +99,9 @@ def _spark( "Column must be a string-type capable of being cast to int." ) - diff = column - F.lag(column).over(Window.orderBy(F.lit("constant"))) + diff = column - F.lag(column).over( + pyspark_sql_Window.orderBy(F.lit("constant")) + ) diff = F.when(diff.isNull(), 1).otherwise(diff) if metric_value_kwargs["strictly"] is True: diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_match_xml_schema.py b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_match_xml_schema.py index 2ffc10f9975f..1979f6f2fc9a 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_match_xml_schema.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_column_values_to_match_xml_schema.py @@ -14,11 +14,11 @@ ColumnMapExpectation, render_evaluation_parameter_string, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes from great_expectations.expectations.metrics.map_metric import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F, sparktypes from great_expectations.render import RenderedStringTemplateContent from great_expectations.render.renderer.renderer import renderer from great_expectations.render.util import ( diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_between.py b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_between.py index 5a79cbfece5e..f21f6c7e10ec 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_between.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_between.py @@ -13,11 +13,11 @@ SparkDFExecutionEngine, ) from great_expectations.expectations.expectation import MulticolumnMapExpectation -from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.map_metric_provider import ( MulticolumnMapMetricProvider, multicolumn_condition_partial, ) +from great_expectations.optional_imports import F # This class defines a Metric to support your Expectation. 
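In the string-integers-increasing refactor above, only the source of `Window` changes (it is now `pyspark_sql_Window` from `optional_imports`); the lag-over-window arithmetic is untouched. Below is a small standalone sketch of that arithmetic, assuming a local PySpark installation; the session, sample data, and column names are illustrative only, not the metric's actual inputs.

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("lag-demo").getOrCreate()

df = spark.createDataFrame([("1",), ("2",), ("4",), ("3",)], ["value"])

# Cast the string column to int, then compare each row with the previous one.
# Ordering by a constant literal mirrors the metric above; it relies on the
# incoming row order, and Spark warns that all rows move to a single partition.
col = F.col("value").cast("int")
diff = col - F.lag(col).over(Window.orderBy(F.lit("constant")))
diff = F.when(diff.isNull(), 1).otherwise(diff)

df.withColumn("strictly_increasing", diff >= 1).show()
spark.stop()
```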
diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_equal_to_single_column.py b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_equal_to_single_column.py index 6bb6b4ac84d9..b0426170543f 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_equal_to_single_column.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_multicolumn_sum_values_to_be_equal_to_single_column.py @@ -9,11 +9,11 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.expectation import MulticolumnMapExpectation -from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.map_metric_provider import ( MulticolumnMapMetricProvider, multicolumn_condition_partial, ) +from great_expectations.optional_imports import F # This class defines a Metric to support your Expectation. diff --git a/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py b/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py index 4d3e6c8006b7..555ed18a71d1 100644 --- a/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py +++ b/contrib/experimental/great_expectations_experimental/expectations/expect_yesterday_count_compared_to_avg_equivalent_days_of_week.py @@ -9,8 +9,8 @@ ) from great_expectations.expectations.expectation import ColumnAggregateExpectation from great_expectations.expectations.metrics import ColumnAggregateMetricProvider -from great_expectations.expectations.metrics.import_manager import sa from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import sqlalchemy as sa TODAY: datetime = datetime(year=2022, month=8, day=10) TODAY_STR: str = datetime.strftime(TODAY, "%Y-%m-%d") diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_coordinates_in_range_of_given_point.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_coordinates_in_range_of_given_point.py index 1d2b36365dc2..50a53eacf364 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_coordinates_in_range_of_given_point.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_coordinates_in_range_of_given_point.py @@ -22,7 +22,7 @@ ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes +from great_expectations.optional_imports import F, sparktypes from great_expectations.render import ( RenderedBulletListContent, RenderedGraphContent, diff --git a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_in_timezone.py b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_in_timezone.py 
index 946e40fe277a..4d7a3913032b 100644 --- a/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_in_timezone.py +++ b/contrib/great_expectations_geospatial_expectations/great_expectations_geospatial_expectations/expectations/expect_column_values_to_be_lat_lon_in_timezone.py @@ -12,7 +12,7 @@ ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes +from great_expectations.optional_imports import F, sparktypes # This class defines a Metric to support your Expectation. diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py index 92dd0485e511..8f61b1c5ecf1 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_bic_belong_to_country.py @@ -8,12 +8,16 @@ from great_expectations.execution_engine import ( PandasExecutionEngine, ) + +# SparkDFExecutionEngine, from great_expectations.expectations.expectation import ColumnMapExpectation from great_expectations.expectations.metrics import ( ColumnMapMetricProvider, column_condition_partial, ) +# from great_expectations.optional_imports import F, sparktypes + def bic_belong_to_country(bic: str, country_code: str) -> bool: try: diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_ascii.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_ascii.py index db568204e65c..df37ce1956c5 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_ascii.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_ascii.py @@ -9,7 +9,7 @@ ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes +from great_expectations.optional_imports import F, sparktypes # This class defines a Metric to support your Expectation diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py index d28a787bc876..6082d3054675 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_bic.py @@ -7,13 +7,15 @@ from great_expectations.execution_engine import ( PandasExecutionEngine, ) + +# SparkDFExecutionEngine, from 
great_expectations.expectations.expectation import ColumnMapExpectation from great_expectations.expectations.metrics import ( ColumnMapMetricProvider, column_condition_partial, ) -# from great_expectations.expectations.metrics.import_manager import F, sparktypes +# from great_expectations.optional_imports import F, sparktypes def is_valid_bic(bic_code: str) -> bool: diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iana_timezone.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iana_timezone.py index c96608cdeca9..6d294c777233 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iana_timezone.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iana_timezone.py @@ -12,7 +12,7 @@ ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes +from great_expectations.optional_imports import F, sparktypes def is_valid_timezone(timezone: str) -> bool: diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iban.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iban.py index 92e1e9b69d70..0cb7bd406e4c 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iban.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_valid_iban.py @@ -13,7 +13,7 @@ ColumnMapMetricProvider, column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes +from great_expectations.optional_imports import F, sparktypes def is_valid_iban(iban: str) -> bool: diff --git a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_xml_parseable.py b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_xml_parseable.py index 5d2e7a16d6c7..c12920ab9b7c 100644 --- a/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_xml_parseable.py +++ b/contrib/great_expectations_semantic_types_expectations/great_expectations_semantic_types_expectations/expectations/expect_column_values_to_be_xml_parseable.py @@ -14,11 +14,11 @@ ColumnMapExpectation, render_evaluation_parameter_string, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes from great_expectations.expectations.metrics.map_metric import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F, sparktypes from great_expectations.render import RenderedStringTemplateContent from great_expectations.render.renderer.renderer import renderer from great_expectations.render.util import ( diff --git 
a/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py b/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py index 6103a59a51ac..fbb9b9eebc90 100644 --- a/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py +++ b/contrib/time_series_expectations/time_series_expectations/expectations/expect_column_pair_values_to_match_prophet_date_model.py @@ -9,11 +9,11 @@ SparkDFExecutionEngine, ) from great_expectations.expectations.expectation import ColumnPairMapExpectation -from great_expectations.expectations.metrics.import_manager import F, sparktypes from great_expectations.expectations.metrics.map_metric_provider import ( ColumnPairMapMetricProvider, column_pair_condition_partial, ) +from great_expectations.optional_imports import F, sparktypes from time_series_expectations.expectations.prophet_model_deserializer import ( ProphetModelDeserializer, ) diff --git a/docs/docusaurus/docs/guides/expectations/features_custom_expectations/how_to_add_spark_support_for_an_expectation.md b/docs/docusaurus/docs/guides/expectations/features_custom_expectations/how_to_add_spark_support_for_an_expectation.md index 0b06d7a5196b..4ec8ae6869ab 100644 --- a/docs/docusaurus/docs/guides/expectations/features_custom_expectations/how_to_add_spark_support_for_an_expectation.md +++ b/docs/docusaurus/docs/guides/expectations/features_custom_expectations/how_to_add_spark_support_for_an_expectation.md @@ -74,7 +74,7 @@ For our Custom Column Aggregate Expectation `ExpectColumnMaxToBeBetweenCustom`, ``` If we need a builtin function from `pyspark.sql.functions`, usually aliased to `F`, the import logic in -`from great_expectations.expectations.metrics.import_manager import F` +`from great_expectations.optional_imports import F` allows us to access these functions even when PySpark is not installed.
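The doc change above is the user-facing summary of this patch: metrics import `F`, `sa`, and similar symbols from `great_expectations.optional_imports` instead of wrapping each import in a local try/except. A simplified sketch of that general pattern follows; the `NotImported` placeholder here is illustrative, and the real `optional_imports` module is more involved than this.

```python
# optional_f.py -- a simplified sketch of the optional-import pattern; the real
# great_expectations.optional_imports module differs in detail.


class NotImported:
    """Falsy placeholder that raises a helpful error only when actually used."""

    def __init__(self, message: str) -> None:
        self._message = message

    def __bool__(self) -> bool:
        return False

    def __getattr__(self, name: str):
        raise ModuleNotFoundError(self._message)


try:
    from pyspark.sql import functions as F
except ImportError:
    F = NotImported(
        "pyspark is not installed; install the optional Spark dependency to use F"
    )


# Callers import F unconditionally and guard usage with a truthiness check:
#
#     from optional_f import F
#
#     if F:
#         expr = F.length("my_column")
```

Centralizing the fallback keeps per-module import boilerplate out of every metric file and gives users one consistent "install the optional dependency" message.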
    diff --git a/great_expectations/cli/batch_request.py b/great_expectations/cli/batch_request.py index 29454cf2bf7e..9e4d4e1555c0 100644 --- a/great_expectations/cli/batch_request.py +++ b/great_expectations/cli/batch_request.py @@ -2,7 +2,7 @@ import logging import re -from typing import Any, Dict, List, Optional, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union import click from typing_extensions import Final @@ -31,15 +31,10 @@ logger = logging.getLogger(__name__) -try: - import sqlalchemy # noqa: TID251 - from sqlalchemy.engine.reflection import Inspector # noqa: TID251 -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - sqlalchemy = None - Inspector = None + +if TYPE_CHECKING: + from great_expectations.optional_imports import sqlalchemy_engine_Inspector + DEFAULT_DATA_CONNECTOR_NAMES: Final[List[str]] = [ "default_runtime_data_connector_name", @@ -441,7 +436,9 @@ def _get_data_asset_name_for_simple_sqlalchemy_datasource( def _get_default_schema(datasource: SimpleSqlalchemyDatasource) -> str: execution_engine: SqlAlchemyExecutionEngine = datasource.execution_engine - inspector: Inspector = get_sqlalchemy_inspector(execution_engine.engine) + inspector: sqlalchemy_engine_Inspector = get_sqlalchemy_inspector( + execution_engine.engine + ) return inspector.default_schema_name diff --git a/great_expectations/cli/datasource.py b/great_expectations/cli/datasource.py index ad3e90dcc65a..2d27cdceb19b 100644 --- a/great_expectations/cli/datasource.py +++ b/great_expectations/cli/datasource.py @@ -27,13 +27,6 @@ logger = logging.getLogger(__name__) -try: - import sqlalchemy # noqa: TID251 -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - sqlalchemy = None yaml = YAMLToString() yaml.indent(mapping=2, sequence=4, offset=2) diff --git a/great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py b/great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py index f904c80cad12..037a71105042 100644 --- a/great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py +++ b/great_expectations/compatibility/sqlalchemy_compatibility_wrappers.py @@ -6,22 +6,9 @@ import pandas as pd -logger = logging.getLogger(__name__) - -try: - import sqlalchemy as sa # noqa: TID251 - from sqlalchemy import Table # noqa: TID251 - from sqlalchemy.engine import reflection # noqa: TID251 - from sqlalchemy.sql import Select # noqa: TID251 +from great_expectations.optional_imports import sqlalchemy_engine_Engine -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - sa = None - reflection = None - Table = None - Select = None +logger = logging.getLogger(__name__) def read_sql_table_as_df( @@ -60,7 +47,7 @@ def read_sql_table_as_df( chunksize: If specified, returns an iterator where `chunksize` is the number of rows to include in each chunk. """ - if isinstance(con, sa.engine.Engine): + if isinstance(con, sqlalchemy_engine_Engine): con = con.connect() with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", category=DeprecationWarning) @@ -124,7 +111,7 @@ def add_dataframe_to_db( * 'multi': Pass multiple values in a single ``INSERT`` clause. * callable with signature ``(pd_table, conn, keys, data_iter)``. 
""" - if isinstance(con, sa.engine.Engine): + if isinstance(con, sqlalchemy_engine_Engine): con = con.connect() with warnings.catch_warnings(): # Note that RemovedIn20Warning is the warning class that we see from sqlalchemy diff --git a/great_expectations/core/batch.py b/great_expectations/core/batch.py index 90bb7e083b31..aed34bed57c8 100644 --- a/great_expectations/core/batch.py +++ b/great_expectations/core/batch.py @@ -6,12 +6,18 @@ import logging from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Set, Type, Union +import pandas as pd + import great_expectations.exceptions as gx_exceptions from great_expectations.alias_types import JSONValues # noqa: TCH001 from great_expectations.core._docs_decorators import deprecated_argument, public_api from great_expectations.core.id_dict import BatchKwargs, BatchSpec, IDDict from great_expectations.core.util import convert_to_json_serializable from great_expectations.exceptions import InvalidBatchIdError +from great_expectations.optional_imports import ( + SPARK_NOT_IMPORTED, + pyspark_sql_DataFrame, +) from great_expectations.types import DictDot, SerializableDictDot, safe_deep_copy from great_expectations.util import deep_filter_properties_iterable, load_class @@ -25,25 +31,6 @@ logger = logging.getLogger(__name__) -try: - import pandas as pd -except ImportError: - pd = None - - logger.debug( - "Unable to load pandas; install optional pandas dependency for support." - ) - -try: - import pyspark - from pyspark.sql import DataFrame as SparkDataFrame -except ImportError: - pyspark = None - SparkDataFrame = None - logger.debug( - "Unable to load pyspark; install optional spark dependency if you will be working with Spark dataframes" - ) - def _get_fluent_batch_request_class() -> Type[FluentBatchRequest]: """Using this function helps work around circular import dependncies.""" @@ -665,7 +652,10 @@ def head(self, *args, **kwargs): return pd.DataFrame({}) -BatchDataType = Union[BatchData, pd.DataFrame, SparkDataFrame] +if SPARK_NOT_IMPORTED: + BatchDataType = Union[BatchData, pd.DataFrame] +else: + BatchDataType = Union[BatchData, pd.DataFrame, pyspark_sql_DataFrame] # TODO: This module needs to be cleaned up. 
diff --git a/great_expectations/core/util.py b/great_expectations/core/util.py index a2f37e58f4c8..ee14ef0a657a 100644 --- a/great_expectations/core/util.py +++ b/great_expectations/core/util.py @@ -36,6 +36,15 @@ from great_expectations.core._docs_decorators import public_api from great_expectations.core.run_identifier import RunIdentifier from great_expectations.exceptions import InvalidExpectationConfigurationError +from great_expectations.optional_imports import ( + SQLALCHEMY_NOT_IMPORTED, + pyspark_sql_DataFrame, + pyspark_sql_SparkSession, + sparktypes, + sqlalchemy, + sqlalchemy_engine_Connection, + sqlalchemy_TextClause, +) from great_expectations.types import SerializableDictDot from great_expectations.types.base import SerializableDotDict @@ -48,22 +57,6 @@ logger = logging.getLogger(__name__) -try: - import pyspark - from pyspark.sql import SparkSession # noqa: F401 -except ImportError: - pyspark = None # type: ignore[assignment] - SparkSession = None # type: ignore[assignment,misc] - logger.debug( - "Unable to load pyspark; install optional spark dependency if you will be working with Spark dataframes" - ) - -try: - from pyspark.sql.types import StructType -except ImportError: - StructType = None # type: ignore[assignment,misc] - - try: from shapely.geometry import LineString, MultiPolygon, Point, Polygon except ImportError: @@ -72,12 +65,6 @@ MultiPolygon = None LineString = None -from great_expectations.optional_imports import ( - SQLALCHEMY_NOT_IMPORTED, - sqlalchemy, - sqlalchemy_Connection, - sqlalchemy_TextClause, -) try: LegacyRow = sqlalchemy.engine.row.LegacyRow @@ -403,7 +390,7 @@ def convert_to_json_serializable( # noqa: C901 - complexity 32 if isinstance(data, pd.DataFrame): return convert_to_json_serializable(data.to_dict(orient="records")) - if pyspark and isinstance(data, pyspark.sql.DataFrame): + if pyspark_sql_DataFrame and isinstance(data, pyspark_sql_DataFrame): # type: ignore[truthy-function] # using StackOverflow suggestion for converting pyspark df into dictionary # https://stackoverflow.com/questions/43679880/pyspark-dataframe-to-dictionary-columns-as-keys-and-list-of-column-values-ad-di return convert_to_json_serializable( @@ -425,10 +412,10 @@ def convert_to_json_serializable( # noqa: C901 - complexity 32 return data.to_json_dict() # PySpark schema serialization - if StructType is not None and isinstance(data, StructType): + if sparktypes and isinstance(data, sparktypes.StructType): return dict(data.jsonValue()) - if sqlalchemy_Connection and isinstance(data, sqlalchemy_Connection): + if sqlalchemy_engine_Connection and isinstance(data, sqlalchemy_engine_Connection): # Connection is a module, which is non-serializable. Return module name instead. 
return "sqlalchemy.engine.base.Connection" @@ -522,7 +509,7 @@ def ensure_json_serializable(data): # noqa: C901 - complexity 21 ] return - if pyspark and isinstance(data, pyspark.sql.DataFrame): + if pyspark_sql_DataFrame and isinstance(data, pyspark_sql_DataFrame): # type: ignore[truthy-function] # using StackOverflow suggestion for converting pyspark df into dictionary # https://stackoverflow.com/questions/43679880/pyspark-dataframe-to-dictionary-columns-as-keys-and-list-of-column-values-ad-di return ensure_json_serializable( @@ -541,14 +528,14 @@ def ensure_json_serializable(data): # noqa: C901 - complexity 21 if sqlalchemy_TextClause and isinstance(data, sqlalchemy_TextClause): # TextClause is handled manually by convert_to_json_serializable() return - if sqlalchemy_Connection and isinstance(data, sqlalchemy_Connection): + + if sqlalchemy_engine_Connection and isinstance(data, sqlalchemy_engine_Connection): # Connection module is handled manually by convert_to_json_serializable() return - else: - raise InvalidExpectationConfigurationError( - f"{str(data)} is of type {type(data).__name__} which cannot be serialized to json" - ) + raise InvalidExpectationConfigurationError( + f"{str(data)} is of type {type(data).__name__} which cannot be serialized to json" + ) def substitute_all_strftime_format_strings( @@ -805,7 +792,7 @@ def sniff_s3_compression(s3_url: S3Url) -> Union[str, None]: def get_or_create_spark_application( spark_config: Optional[Dict[str, str]] = None, force_reuse_spark_context: bool = True, -) -> SparkSession: +) -> pyspark_sql_SparkSession: """Obtains configured Spark session if it has already been initialized; otherwise creates Spark session, configures it, and returns it to caller. Due to the uniqueness of SparkContext per JVM, it is impossible to change SparkSession configuration dynamically. @@ -831,7 +818,7 @@ def get_or_create_spark_application( spark_config.update({"spark.app.name": name}) - spark_session: Optional[SparkSession] = get_or_create_spark_session( + spark_session: Optional[pyspark_sql_SparkSession] = get_or_create_spark_session( spark_config=spark_config ) if spark_session is None: @@ -867,7 +854,7 @@ def get_or_create_spark_application( # noinspection PyPep8Naming def get_or_create_spark_session( spark_config: Optional[Dict[str, str]] = None, -) -> SparkSession | None: +) -> pyspark_sql_SparkSession | None: """Obtains Spark session if it already exists; otherwise creates Spark session and returns it to caller. Due to the uniqueness of SparkContext per JVM, it is impossible to change SparkSession configuration dynamically. 
@@ -882,14 +869,14 @@ def get_or_create_spark_session( Returns: """ - spark_session: Optional[SparkSession] + spark_session: Optional[pyspark_sql_SparkSession] try: if spark_config is None: spark_config = {} else: spark_config = copy.deepcopy(spark_config) - builder = SparkSession.builder + builder = pyspark_sql_SparkSession.builder app_name: Optional[str] = spark_config.get("spark.app.name") if app_name: diff --git a/great_expectations/data_context/store/database_store_backend.py b/great_expectations/data_context/store/database_store_backend.py index c8321383ec23..dcf6c1b61bd3 100644 --- a/great_expectations/data_context/store/database_store_backend.py +++ b/great_expectations/data_context/store/database_store_backend.py @@ -8,38 +8,23 @@ import great_expectations.exceptions as gx_exceptions from great_expectations.data_context.store.store_backend import StoreBackend from great_expectations.optional_imports import ( - SQLALCHEMY_NOT_IMPORTED, - sqlalchemy_Row, + SQLAlchemyError, + sqlalchemy_engine_Row, + sqlalchemy_IntegrityError, + sqlalchemy_NoSuchTableError, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, ) -from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.util import ( filter_properties_dict, get_sqlalchemy_url, import_make_url, ) -try: - from sqlalchemy import Column, MetaData, String, Table, and_, column # noqa: TID251 - from sqlalchemy.engine.url import URL # noqa: TID251 - from sqlalchemy.exc import ( # noqa: TID251 - IntegrityError, - NoSuchTableError, - SQLAlchemyError, - ) - +if sa: make_url = import_make_url() -except ImportError: - Column = SQLALCHEMY_NOT_IMPORTED - MetaData = SQLALCHEMY_NOT_IMPORTED - String = SQLALCHEMY_NOT_IMPORTED - Table = SQLALCHEMY_NOT_IMPORTED - and_ = SQLALCHEMY_NOT_IMPORTED - column = SQLALCHEMY_NOT_IMPORTED - URL = SQLALCHEMY_NOT_IMPORTED - IntegrityError = SQLALCHEMY_NOT_IMPORTED - NoSuchTableError = SQLALCHEMY_NOT_IMPORTED - SQLAlchemyError = SQLALCHEMY_NOT_IMPORTED - create_engine = SQLALCHEMY_NOT_IMPORTED + logger = logging.getLogger(__name__) @@ -100,7 +85,7 @@ def __init__( # noqa: C901 - 16 "Credentials, url, connection_string, or an engine are required for a DatabaseStoreBackend." ) - meta = MetaData(schema=self._schema_name) + meta = sa.MetaData(schema=self._schema_name) self.key_columns = key_columns # Dynamically construct a SQLAlchemy table with the name and column names we'll use cols = [] @@ -109,10 +94,10 @@ def __init__( # noqa: C901 - 16 raise gx_exceptions.InvalidConfigError( "'value' cannot be used as a key_element name" ) - cols.append(Column(column_, String, primary_key=True)) - cols.append(Column("value", String)) + cols.append(sa.Column(column_, sa.String, primary_key=True)) + cols.append(sa.Column("value", sa.String)) try: - table = Table(table_name, meta, autoload_with=self.engine) + table = sa.Table(table_name, meta, autoload_with=self.engine) # We do a "light" check: if the columns' names match, we will proceed, otherwise, create the table if {str(col.name).lower() for col in table.columns} != ( set(key_columns) | {"value"} @@ -120,8 +105,8 @@ def __init__( # noqa: C901 - 16 raise gx_exceptions.StoreBackendError( f"Unable to use table {table_name}: it exists, but does not have the expected schema." 
) - except NoSuchTableError: - table = Table(table_name, meta, *cols) + except sqlalchemy_NoSuchTableError: + table = sa.Table(table_name, meta, *cols) try: if self._schema_name: with self.engine.begin() as connection: @@ -203,7 +188,7 @@ def _build_engine(self, credentials, **kwargs) -> "sa.engine.Engine": # noqa: U @staticmethod def _get_sqlalchemy_key_pair_auth_url( drivername: str, credentials: dict - ) -> Tuple["URL", Dict]: # noqa: UP037 + ) -> Tuple["URL", Dict]: # type: ignore[name-defined] # noqa F821 """ Utilizing a private key path and a passphrase in a given credentials dictionary, attempts to encode the provided values into a private key. If passphrase is incorrect, this will fail and an exception is raised. @@ -253,10 +238,10 @@ def _get_sqlalchemy_key_pair_auth_url( def _get(self, key): sel = ( - sa.select(column("value")) + sa.select(sa.column("value")) .select_from(self._table) .where( - and_( + sa.and_( *( getattr(self._table.columns, key_col) == val for key_col, val in zip(self.key_columns, key) @@ -291,7 +276,7 @@ def _set(self, key, value, allow_update=True, **kwargs) -> None: try: with self.engine.begin() as connection: connection.execute(ins) - except IntegrityError as e: + except sqlalchemy_IntegrityError as e: if self._get(key) == value: logger.info(f"Key {str(key)} already exists with the same value.") else: @@ -322,10 +307,10 @@ def _convert_engine_and_key_to_url(self, key): def _has_key(self, key): sel = ( - sa.select(sa.func.count(column("value"))) + sa.select(sa.func.count(sa.column("value"))) .select_from(self._table) .where( - and_( + sa.and_( *( getattr(self._table.columns, key_col) == val for key_col, val in zip(self.key_columns, key) @@ -341,12 +326,12 @@ def _has_key(self, key): return False def list_keys(self, prefix=()): - columns = [column(col) for col in self.key_columns] + columns = [sa.column(col) for col in self.key_columns] sel = ( sa.select(*columns) .select_from(self._table) .where( - and_( + sa.and_( True, *( getattr(self._table.columns, key_col) == val @@ -356,12 +341,12 @@ def list_keys(self, prefix=()): ) ) with self.engine.begin() as connection: - row_list: list[sqlalchemy_Row] = connection.execute(sel).fetchall() + row_list: list[sqlalchemy_engine_Row] = connection.execute(sel).fetchall() return [tuple(row) for row in row_list] def remove_key(self, key): delete_statement = self._table.delete().where( - and_( + sa.and_( *( getattr(self._table.columns, key_col) == val for key_col, val in zip(self.key_columns, key) diff --git a/great_expectations/datasource/data_connector/configured_asset_azure_data_connector.py b/great_expectations/datasource/data_connector/configured_asset_azure_data_connector.py index 7ae63350f724..d52d443e40e9 100644 --- a/great_expectations/datasource/data_connector/configured_asset_azure_data_connector.py +++ b/great_expectations/datasource/data_connector/configured_asset_azure_data_connector.py @@ -14,17 +14,10 @@ sanitize_prefix, ) from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 +from great_expectations.optional_imports import BlobServiceClient logger = logging.getLogger(__name__) -try: - from azure.storage.blob import BlobServiceClient -except ImportError: - BlobServiceClient = None - logger.debug( - "Unable to load BlobServiceClient connection object; install optional Azure Storage Blob dependency for support" - ) - @public_api class ConfiguredAssetAzureDataConnector(ConfiguredAssetFilePathDataConnector): @@ -102,7 +95,7 @@ def __init__( r"(?:https?://)?(.+?).blob.core.windows.net", 
account_url ).group(1) self._azure = BlobServiceClient(**azure_options) - except (TypeError, AttributeError): + except (TypeError, AttributeError, ModuleNotFoundError): raise ImportError( "Unable to load Azure BlobServiceClient (it is required for ConfiguredAssetAzureDataConnector). \ Please ensure that you have provided the appropriate keys to `azure_options` for authentication." diff --git a/great_expectations/datasource/data_connector/configured_asset_gcs_data_connector.py b/great_expectations/datasource/data_connector/configured_asset_gcs_data_connector.py index cd5eeed5539d..835b3828bfbc 100644 --- a/great_expectations/datasource/data_connector/configured_asset_gcs_data_connector.py +++ b/great_expectations/datasource/data_connector/configured_asset_gcs_data_connector.py @@ -10,19 +10,13 @@ ) from great_expectations.datasource.data_connector.util import list_gcs_keys from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 +from great_expectations.optional_imports import ( + google_cloud_storage, + google_service_account, +) logger = logging.getLogger(__name__) -try: - from google.cloud import storage - from google.oauth2 import service_account -except ImportError: - storage = None - service_account = None - logger.debug( - "Unable to load GCS connection object; install optional Google dependency for support" - ) - @public_api class ConfiguredAssetGCSDataConnector(ConfiguredAssetFilePathDataConnector): @@ -94,16 +88,22 @@ def __init__( credentials = None # If configured with gcloud CLI / env vars if "filename" in gcs_options: filename = gcs_options.pop("filename") - credentials = service_account.Credentials.from_service_account_file( - filename=filename + credentials = ( + google_service_account.Credentials.from_service_account_file( + filename=filename + ) ) elif "info" in gcs_options: info = gcs_options.pop("info") - credentials = service_account.Credentials.from_service_account_info( - info=info + credentials = ( + google_service_account.Credentials.from_service_account_info( + info=info + ) ) - self._gcs = storage.Client(credentials=credentials, **gcs_options) - except (TypeError, AttributeError): + self._gcs = google_cloud_storage.Client( + credentials=credentials, **gcs_options + ) + except (TypeError, AttributeError, ModuleNotFoundError): raise ImportError( "Unable to load GCS Client (it is required for ConfiguredAssetGCSDataConnector)." 
) diff --git a/great_expectations/datasource/data_connector/inferred_asset_azure_data_connector.py b/great_expectations/datasource/data_connector/inferred_asset_azure_data_connector.py index a4bd21057638..26a480332cfa 100644 --- a/great_expectations/datasource/data_connector/inferred_asset_azure_data_connector.py +++ b/great_expectations/datasource/data_connector/inferred_asset_azure_data_connector.py @@ -13,17 +13,10 @@ sanitize_prefix, ) from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 +from great_expectations.optional_imports import BlobServiceClient logger = logging.getLogger(__name__) -try: - from azure.storage.blob import BlobServiceClient -except ImportError: - BlobServiceClient = None - logger.debug( - "Unable to load BlobServiceClient connection object; install optional Azure Storage Blob dependency for support" - ) - @public_api class InferredAssetAzureDataConnector(InferredAssetFilePathDataConnector): @@ -104,7 +97,7 @@ def __init__( r"(?:https?://)?(.+?).blob.core.windows.net", account_url ).group(1) self._azure = BlobServiceClient(**azure_options) - except (TypeError, AttributeError): + except (TypeError, AttributeError, ModuleNotFoundError): raise ImportError( "Unable to load Azure BlobServiceClient (it is required for InferredAssetAzureDataConnector). \ Please ensure that you have provided the appropriate keys to `azure_options` for authentication." diff --git a/great_expectations/datasource/data_connector/inferred_asset_gcs_data_connector.py b/great_expectations/datasource/data_connector/inferred_asset_gcs_data_connector.py index bafa5ebcfd29..aeec02b87a90 100644 --- a/great_expectations/datasource/data_connector/inferred_asset_gcs_data_connector.py +++ b/great_expectations/datasource/data_connector/inferred_asset_gcs_data_connector.py @@ -9,19 +9,13 @@ ) from great_expectations.datasource.data_connector.util import list_gcs_keys from great_expectations.execution_engine import ExecutionEngine # noqa: TCH001 +from great_expectations.optional_imports import ( + google_cloud_storage, + google_service_account, +) logger = logging.getLogger(__name__) -try: - from google.cloud import storage - from google.oauth2 import service_account -except ImportError: - storage = None - service_account = None - logger.debug( - "Unable to load GCS connection object; install optional Google dependency for support" - ) - @public_api class InferredAssetGCSDataConnector(InferredAssetFilePathDataConnector): @@ -96,16 +90,22 @@ def __init__( credentials = None # If configured with gcloud CLI / env vars if "filename" in gcs_options: filename = gcs_options.pop("filename") - credentials = service_account.Credentials.from_service_account_file( - filename=filename + credentials = ( + google_service_account.Credentials.from_service_account_file( + filename=filename + ) ) elif "info" in gcs_options: info = gcs_options.pop("info") - credentials = service_account.Credentials.from_service_account_info( - info=info + credentials = ( + google_service_account.Credentials.from_service_account_info( + info=info + ) ) - self._gcs = storage.Client(credentials=credentials, **gcs_options) - except (TypeError, AttributeError): + self._gcs = google_cloud_storage.Client( + credentials=credentials, **gcs_options + ) + except (TypeError, AttributeError, ModuleNotFoundError): raise ImportError( "Unable to load GCS Client (it is required for InferredAssetGCSDataConnector)." 
) diff --git a/great_expectations/datasource/data_connector/util.py b/great_expectations/datasource/data_connector/util.py index 3d9d1b9d163a..b21a6ee068fd 100644 --- a/great_expectations/datasource/data_connector/util.py +++ b/great_expectations/datasource/data_connector/util.py @@ -17,6 +17,12 @@ from great_expectations.data_context.util import instantiate_class_from_config from great_expectations.datasource.data_connector.asset import Asset # noqa: TCH001 from great_expectations.datasource.data_connector.sorter import Sorter # noqa: TCH001 +from great_expectations.optional_imports import ( + BlobPrefix, + BlobServiceClient, + ContainerClient, + google_cloud_storage, # noqa: F401 +) if TYPE_CHECKING: from great_expectations.alias_types import PathStr @@ -24,34 +30,6 @@ logger = logging.getLogger(__name__) -try: - from azure.storage.blob import BlobPrefix, BlobServiceClient, ContainerClient -except ImportError: - BlobPrefix = None - BlobServiceClient = None - ContainerClient = None - logger.debug( - "Unable to load azure types; install optional Azure dependency for support." - ) - -try: - from google.cloud import storage -except ImportError: - storage = None - logger.debug( - "Unable to load GCS connection object; install optional Google dependency for support" - ) - -try: - import pyspark - import pyspark.sql as pyspark_sql -except ImportError: - pyspark = None # type: ignore[assignment] - pyspark_sql = None # type: ignore[assignment] - logger.debug( - "Unable to load pyspark and pyspark.sql; install optional Spark dependency for support." - ) - DEFAULT_DATA_ASSET_NAME: str = "DEFAULT_ASSET_NAME" diff --git a/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py b/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py index 03b3f9475ebc..80b08b173831 100644 --- a/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py +++ b/great_expectations/datasource/fluent/data_asset/data_connector/google_cloud_storage_data_connector.py @@ -15,9 +15,8 @@ ) if TYPE_CHECKING: - from google.cloud.storage.client import Client as GCSClient - from great_expectations.core.batch import BatchDefinition + from great_expectations.optional_imports import GoogleCloudStorageClient logger = logging.getLogger(__name__) @@ -61,7 +60,7 @@ def __init__( data_asset_name: str, batching_regex: re.Pattern, # TODO: ALEX - gcs_client: GCSClient, + gcs_client: GoogleCloudStorageClient, bucket_or_name: str, prefix: str = "", delimiter: str = "/", @@ -72,7 +71,7 @@ def __init__( # TODO: ALEX file_path_template_map_fn: Optional[Callable] = None, ) -> None: - self._gcs_client: GCSClient = gcs_client + self._gcs_client: GoogleCloudStorageClient = gcs_client self._bucket_or_name = bucket_or_name self._prefix = prefix @@ -96,7 +95,7 @@ def build_data_connector( datasource_name: str, data_asset_name: str, batching_regex: re.Pattern, - gcs_client: GCSClient, + gcs_client: GoogleCloudStorageClient, bucket_or_name: str, prefix: str = "", delimiter: str = "/", diff --git a/great_expectations/datasource/fluent/interfaces.py b/great_expectations/datasource/fluent/interfaces.py index 6acee7c11caf..5366eb208be5 100644 --- a/great_expectations/datasource/fluent/interfaces.py +++ b/great_expectations/datasource/fluent/interfaces.py @@ -55,14 +55,6 @@ ) from great_expectations.datasource.fluent.type_lookup import TypeLookup -try: - import pyspark - from pyspark.sql import Row as pyspark_sql_Row -except 
ImportError: - pyspark = None # type: ignore[assignment] - pyspark_sql_Row = None # type: ignore[assignment,misc] - logger.debug("No spark sql dataframe module available.") - class TestConnectionError(Exception): pass diff --git a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py index 6510c3d0bd0b..017837a30ad6 100644 --- a/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/pandas_azure_blob_storage_datasource.py @@ -16,6 +16,9 @@ from great_expectations.datasource.fluent.pandas_datasource import ( PandasDatasourceError, ) +from great_expectations.optional_imports import ( + BlobServiceClient, +) if TYPE_CHECKING: from great_expectations.datasource.fluent.file_path_data_asset import ( @@ -25,16 +28,6 @@ logger = logging.getLogger(__name__) -ABS_IMPORTED = False -try: - from azure.storage.blob import ( - BlobServiceClient, # noqa: disable=E0602 - ) - - ABS_IMPORTED = True -except ImportError: - pass - _MISSING: Final = object() @@ -71,7 +64,7 @@ def _get_azure_client(self) -> BlobServiceClient: ) # Validate that "azure" libararies were successfully imported and attempt to create "azure_client" handle. - if ABS_IMPORTED: + if BlobServiceClient: try: if conn_str is not None: self._account_name = re.search( # type: ignore[union-attr] diff --git a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py index dad0076d0efc..c8c68555e58c 100644 --- a/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/pandas_google_cloud_storage_datasource.py @@ -18,30 +18,23 @@ from great_expectations.datasource.fluent.pandas_datasource import ( PandasDatasourceError, ) +from great_expectations.optional_imports import ( + google_cloud_storage, + google_service_account, +) if TYPE_CHECKING: - from google.cloud.storage.client import Client as GoogleCloudStorageClient - from google.oauth2.service_account import ( - Credentials as GoogleServiceAccountCredentials, - ) - from great_expectations.datasource.fluent.file_path_data_asset import ( _FilePathDataAsset, ) + from great_expectations.optional_imports import ( + GoogleCloudStorageClient, + GoogleServiceAccountCredentials, + ) logger = logging.getLogger(__name__) -GCS_IMPORTED = False -try: - from google.cloud import storage # noqa: disable=E0602 - from google.oauth2 import service_account # noqa: disable=E0602 - - GCS_IMPORTED = True -except ImportError: - pass - - class PandasGoogleCloudStorageDatasourceError(PandasDatasourceError): pass @@ -67,27 +60,23 @@ def _get_gcs_client(self) -> GoogleCloudStorageClient: gcs_client: Union[GoogleCloudStorageClient, None] = self._gcs_client if not gcs_client: # Validate that "google" libararies were successfully imported and attempt to create "gcs_client" handle. 
- if GCS_IMPORTED: + if google_cloud_storage and google_service_account: try: credentials: Union[ GoogleServiceAccountCredentials, None ] = None # If configured with gcloud CLI / env vars if "filename" in self.gcs_options: filename: str = str(self.gcs_options.pop("filename")) - credentials = ( - service_account.Credentials.from_service_account_file( - filename=filename - ) + credentials = google_service_account.Credentials.from_service_account_file( + filename=filename ) elif "info" in self.gcs_options: info: Any = self.gcs_options.pop("info") - credentials = ( - service_account.Credentials.from_service_account_info( - info=info - ) + credentials = google_service_account.Credentials.from_service_account_info( + info=info ) - gcs_client = storage.Client( + gcs_client = google_cloud_storage.Client( credentials=credentials, **self.gcs_options ) except Exception as e: diff --git a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py index f869a7361235..645c03f8108e 100644 --- a/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_azure_blob_storage_datasource.py @@ -22,20 +22,13 @@ from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) +from great_expectations.optional_imports import ( + BlobServiceClient, +) logger = logging.getLogger(__name__) -ABS_IMPORTED = False -try: - from azure.storage.blob import ( - BlobServiceClient, # noqa: disable=E0602 - ) - - ABS_IMPORTED = True -except ImportError: - pass - _MISSING: Final = object() if TYPE_CHECKING: @@ -78,7 +71,7 @@ def _get_azure_client(self) -> BlobServiceClient: ) # Validate that "azure" libararies were successfully imported and attempt to create "azure_client" handle. 
- if ABS_IMPORTED: + if BlobServiceClient: try: if conn_str is not None: self._account_name = re.search( # type: ignore[union-attr] # re.search could return None diff --git a/great_expectations/datasource/fluent/spark_datasource.py b/great_expectations/datasource/fluent/spark_datasource.py index 908c7a9f3be9..96524cfa7530 100644 --- a/great_expectations/datasource/fluent/spark_datasource.py +++ b/great_expectations/datasource/fluent/spark_datasource.py @@ -28,7 +28,9 @@ DataAsset, Datasource, ) -from great_expectations.optional_imports import SPARK_NOT_IMPORTED, pyspark +from great_expectations.optional_imports import ( + pyspark_sql_DataFrame, +) if TYPE_CHECKING: from great_expectations.datasource.fluent.interfaces import BatchMetadata @@ -38,12 +40,6 @@ logger = logging.getLogger(__name__) -try: - DataFrame = pyspark.sql.DataFrame -except ImportError: - DataFrame = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] - - # this enables us to include dataframe in the json schema _SparkDataFrameT = TypeVar("_SparkDataFrameT") @@ -89,8 +85,10 @@ class Config: extra = pydantic.Extra.forbid @pydantic.validator("dataframe") - def _validate_dataframe(cls, dataframe: DataFrame) -> DataFrame: - if not isinstance(dataframe, DataFrame): + def _validate_dataframe( + cls, dataframe: pyspark_sql_DataFrame + ) -> pyspark_sql_DataFrame: + if not (pyspark_sql_DataFrame and isinstance(dataframe, pyspark_sql_DataFrame)): # type: ignore[truthy-function] raise ValueError("dataframe must be of type pyspark.sql.DataFrame") return dataframe @@ -210,7 +208,7 @@ def test_connection(self, test_assets: bool = True) -> None: def add_dataframe_asset( self, name: str, - dataframe: DataFrame, + dataframe: pyspark_sql_DataFrame, batch_metadata: Optional[BatchMetadata] = None, ) -> DataFrameAsset: """Adds a Dataframe DataAsset to this SparkDatasource object. diff --git a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py index 4e3cd24ec7a2..bb4eeade2f88 100644 --- a/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py +++ b/great_expectations/datasource/fluent/spark_google_cloud_storage_datasource.py @@ -23,31 +23,24 @@ from great_expectations.datasource.fluent.spark_datasource import ( SparkDatasourceError, ) +from great_expectations.optional_imports import ( + google_cloud_storage, + google_service_account, +) if TYPE_CHECKING: - from google.cloud.storage.client import Client as GoogleCloudStorageClient - from google.oauth2.service_account import ( - Credentials as GoogleServiceAccountCredentials, - ) - from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) + from great_expectations.optional_imports import ( + GoogleCloudStorageClient, + GoogleServiceAccountCredentials, + ) logger = logging.getLogger(__name__) -GCS_IMPORTED = False -try: - from google.cloud import storage # noqa: disable=E0602 - from google.oauth2 import service_account # noqa: disable=E0602 - - GCS_IMPORTED = True -except ImportError: - pass - - class SparkGoogleCloudStorageDatasourceError(SparkDatasourceError): pass @@ -74,27 +67,23 @@ def _get_gcs_client(self) -> GoogleCloudStorageClient: gcs_client: Union[GoogleCloudStorageClient, None] = self._gcs_client if not gcs_client: # Validate that "google" libararies were successfully imported and attempt to create "gcs_client" handle. 
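# --- Illustrative sketch (not part of the patch) ----------------------------------
# The changes above replace per-module try/except ImportError blocks and *_IMPORTED
# flags with names imported from great_expectations.optional_imports.  That module's
# implementation is not shown in this diff, so the sketch below is an assumption based
# on how the call sites behave: a missing dependency is bound to a falsy placeholder,
# which makes guards like `if BlobServiceClient:` and
# `pyspark_sql_DataFrame and isinstance(obj, pyspark_sql_DataFrame)` safe, and would
# explain why ModuleNotFoundError was added to the except clauses above.
class NotImported:
    """Falsy stand-in for an optional dependency that failed to import."""

    def __init__(self, message: str) -> None:
        self._message = message

    def __bool__(self) -> bool:
        return False

    def __getattr__(self, name: str):
        # Any attribute access on the placeholder surfaces the original install hint.
        raise ModuleNotFoundError(self._message)


try:
    from azure.storage.blob import BlobServiceClient
except ImportError:
    BlobServiceClient = NotImported(
        "azure-storage-blob is not installed; install the optional Azure dependency."
    )
# -----------------------------------------------------------------------------------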
- if GCS_IMPORTED: + if google_cloud_storage and google_service_account: try: credentials: Union[ GoogleServiceAccountCredentials, None ] = None # If configured with gcloud CLI / env vars if "filename" in self.gcs_options: filename: str = str(self.gcs_options.pop("filename")) - credentials = ( - service_account.Credentials.from_service_account_file( - filename=filename - ) + credentials = google_service_account.Credentials.from_service_account_file( + filename=filename ) elif "info" in self.gcs_options: info: Any = self.gcs_options.pop("info") - credentials = ( - service_account.Credentials.from_service_account_info( - info=info - ) + credentials = google_service_account.Credentials.from_service_account_info( + info=info ) - gcs_client = storage.Client( + gcs_client = google_cloud_storage.Client( credentials=credentials, **self.gcs_options ) except Exception as e: diff --git a/great_expectations/datasource/fluent/sql_datasource.py b/great_expectations/datasource/fluent/sql_datasource.py index 0b9a0ba300ee..74de537605df 100644 --- a/great_expectations/datasource/fluent/sql_datasource.py +++ b/great_expectations/datasource/fluent/sql_datasource.py @@ -43,11 +43,16 @@ from great_expectations.execution_engine.split_and_sample.sqlalchemy_data_splitter import ( SqlAlchemyDataSplitter, ) -from great_expectations.optional_imports import sqlalchemy +from great_expectations.optional_imports import ( + sa_sql_expression_Selectable, + sqlalchemy_engine_Engine, + sqlalchemy_engine_Inspector, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) if TYPE_CHECKING: - # min version of typing_extension missing `Self`, so it can't be imported at runtime - import sqlalchemy as sa # noqa: TID251 from typing_extensions import Self from great_expectations.datasource.fluent.interfaces import BatchMetadata @@ -729,7 +734,7 @@ def _create_batch_spec_kwargs(self) -> dict[str, Any]: """ raise NotImplementedError - def as_selectable(self) -> sqlalchemy.sql.Selectable: + def as_selectable(self) -> sa_sql_expression_Selectable: """Returns a Selectable that can be used to query this data Returns: @@ -750,12 +755,12 @@ def query_must_start_with_select(cls, v: str): raise ValueError("query must start with 'SELECT' followed by a whitespace.") return v - def as_selectable(self) -> sqlalchemy.sql.Selectable: + def as_selectable(self) -> sa_sql_expression_Selectable: """Returns the Selectable that is used to retrieve the data. This can be used in a subselect FROM clause for queries against this data. """ - return sqlalchemy.select(sqlalchemy.text(self.query.lstrip()[6:])).subquery() + return sa.select(sa.text(self.query.lstrip()[6:])).subquery() def _create_batch_spec_kwargs(self) -> dict[str, Any]: return { @@ -796,8 +801,8 @@ def test_connection(self) -> None: TestConnectionError: If the connection test fails. """ datasource: SQLDatasource = self.datasource - engine: sqlalchemy.engine.Engine = datasource.get_engine() - inspector: sqlalchemy.engine.Inspector = sqlalchemy.inspect(engine) + engine: sqlalchemy_engine_Engine = datasource.get_engine() + inspector: sqlalchemy_engine_Inspector = sa.inspect(engine) if self.schema_name and self.schema_name not in inspector.get_schema_names(): raise TestConnectionError( @@ -805,7 +810,7 @@ def test_connection(self) -> None: f'"{self.schema_name}" does not exist.' 
) - table_exists = sqlalchemy.inspect(engine).has_table( + table_exists = sa.inspect(engine).has_table( table_name=self.table_name, schema=self.schema_name, ) @@ -818,8 +823,8 @@ def test_connection(self) -> None: def test_splitter_connection(self) -> None: if self.splitter: datasource: SQLDatasource = self.datasource - engine: sqlalchemy.engine.Engine = datasource.get_engine() - inspector: sqlalchemy.engine.Inspector = sqlalchemy.inspect(engine) + engine: sqlalchemy_engine_Engine = datasource.get_engine() + inspector: sqlalchemy_engine_Inspector = sa.inspect(engine) columns: list[dict[str, Any]] = inspector.get_columns( table_name=self.table_name, schema=self.schema_name @@ -831,12 +836,12 @@ def test_splitter_connection(self) -> None: f'The column "{splitter_column_name}" was not found in table "{self.qualified_name}"' ) - def as_selectable(self) -> sqlalchemy.sql.Selectable: + def as_selectable(self) -> sa_sql_expression_Selectable: """Returns the table as a sqlalchemy Selectable. This can be used in a from clause for a query against this data. """ - return sqlalchemy.text(self.table_name) + return sa.text(self.table_name) def _create_batch_spec_kwargs(self) -> dict[str, Any]: return { @@ -882,7 +887,7 @@ class SQLDatasource(Datasource): # private attrs _cached_connection_string: Union[str, ConfigStr] = pydantic.PrivateAttr("") - _engine: Union[sa.engine.Engine, None] = pydantic.PrivateAttr(None) + _engine: Union[sqlalchemy_engine_Engine, None] = pydantic.PrivateAttr(None) # These are instance var because ClassVars can't contain Type variables. See # https://peps.python.org/pep-0526/#class-and-instance-variable-annotations @@ -894,7 +899,7 @@ def execution_engine_type(self) -> Type[SqlAlchemyExecutionEngine]: """Returns the default execution engine type.""" return SqlAlchemyExecutionEngine - def get_engine(self) -> sqlalchemy.engine.Engine: + def get_engine(self) -> sqlalchemy_engine_Engine: if self.connection_string != self._cached_connection_string or not self._engine: try: model_dict = self.dict( @@ -903,7 +908,7 @@ def get_engine(self) -> sqlalchemy.engine.Engine: ) connection_string = model_dict.pop("connection_string") kwargs = model_dict.pop("kwargs", {}) - self._engine = sqlalchemy.create_engine(connection_string, **kwargs) + self._engine = sa.create_engine(connection_string, **kwargs) except Exception as e: # connection_string has passed pydantic validation, but still fails to create a sqlalchemy engine # one possible case is a missing plugin (e.g. psycopg2) @@ -925,7 +930,7 @@ def test_connection(self, test_assets: bool = True) -> None: TestConnectionError: If the connection test fails. """ try: - engine: sqlalchemy.engine.Engine = self.get_engine() + engine: sqlalchemy_engine_Engine = self.get_engine() engine.connect() except Exception as e: raise TestConnectionError( diff --git a/great_expectations/execution_engine/execution_engine.py b/great_expectations/execution_engine/execution_engine.py index db926c5cf273..d54ad89685ef 100644 --- a/great_expectations/execution_engine/execution_engine.py +++ b/great_expectations/execution_engine/execution_engine.py @@ -51,16 +51,6 @@ logger = logging.getLogger(__name__) -try: - import pandas as pd -except ImportError: - pd = None - - logger.debug( - "Unable to load pandas; install optional pandas dependency for support." 
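# --- Illustrative sketch (not part of the patch) ----------------------------------
# Self-contained example of the inspector-based checks that test_connection() above
# performs via the new sqlalchemy_engine_Inspector alias.  The sqlite URL and the
# table name are placeholders, not values taken from this changeset.
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///:memory:")
with engine.begin() as connection:
    connection.execute(sa.text("CREATE TABLE demo (id INTEGER)"))

inspector = sa.inspect(engine)
print(inspector.get_schema_names())            # e.g. ['main'] on sqlite
print(inspector.has_table(table_name="demo"))  # True when the table exists
# -----------------------------------------------------------------------------------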
- ) - - class NoOpDict: def __getitem__(self, item): return None diff --git a/great_expectations/execution_engine/pandas_execution_engine.py b/great_expectations/execution_engine/pandas_execution_engine.py index f00b5398cbc3..948cc996ac1c 100644 --- a/great_expectations/execution_engine/pandas_execution_engine.py +++ b/great_expectations/execution_engine/pandas_execution_engine.py @@ -41,9 +41,17 @@ from great_expectations.execution_engine.split_and_sample.pandas_data_splitter import ( PandasDataSplitter, ) +from great_expectations.optional_imports import ( + BlobServiceClient, + DefaultCredentialsError, + GoogleAPIError, + google_cloud_storage, + google_service_account, +) logger = logging.getLogger(__name__) + try: import boto3 from botocore.exceptions import ClientError, ParamValidationError @@ -55,28 +63,6 @@ "Unable to load AWS connection object; install optional boto3 dependency for support" ) -try: - from azure.storage.blob import BlobServiceClient -except ImportError: - BlobServiceClient = None - logger.debug( - "Unable to load Azure connection object; install optional azure dependency for support" - ) - -try: - from google.api_core.exceptions import GoogleAPIError - from google.auth.exceptions import DefaultCredentialsError - from google.cloud import storage - from google.oauth2 import service_account -except ImportError: - storage = None - service_account = None - GoogleAPIError = None # type: ignore[assignment,misc] # assigning None to a type - DefaultCredentialsError = None - logger.debug( - "Unable to load GCS connection object; install optional google dependency for support" - ) - HASH_THRESHOLD = 1e9 @@ -153,14 +139,19 @@ def __init__(self, *args, **kwargs) -> None: self._data_sampler = PandasDataSampler() def _instantiate_azure_client(self) -> None: - azure_options = self.config.get("azure_options", {}) - try: - if "conn_str" in azure_options: - self._azure = BlobServiceClient.from_connection_string(**azure_options) - else: - self._azure = BlobServiceClient(**azure_options) - except (TypeError, AttributeError): - self._azure = None + self._azure = None + if BlobServiceClient: + azure_options = self.config.get("azure_options", {}) + try: + if "conn_str" in azure_options: + self._azure = BlobServiceClient.from_connection_string( + **azure_options + ) + else: + self._azure = BlobServiceClient(**azure_options) + except (TypeError, AttributeError): + # If exception occurs, then "self._azure = None" remains in effect. + pass def _instantiate_s3_client(self) -> None: # Try initializing cloud provider client. If unsuccessful, we'll catch it when/if a BatchSpec is passed in. 
@@ -185,15 +176,21 @@ def _instantiate_gcs_client(self) -> None: credentials = None # If configured with gcloud CLI / env vars if "filename" in gcs_options: filename = gcs_options.pop("filename") - credentials = service_account.Credentials.from_service_account_file( - filename=filename + credentials = ( + google_service_account.Credentials.from_service_account_file( + filename=filename + ) ) elif "info" in gcs_options: info = gcs_options.pop("info") - credentials = service_account.Credentials.from_service_account_info( - info=info + credentials = ( + google_service_account.Credentials.from_service_account_info( + info=info + ) ) - self._gcs = storage.Client(credentials=credentials, **gcs_options) + self._gcs = google_cloud_storage.Client( + credentials=credentials, **gcs_options + ) except (TypeError, AttributeError, DefaultCredentialsError): self._gcs = None diff --git a/great_expectations/execution_engine/sparkdf_execution_engine.py b/great_expectations/execution_engine/sparkdf_execution_engine.py index 2bdc95ca1d11..20d40765e4df 100644 --- a/great_expectations/execution_engine/sparkdf_execution_engine.py +++ b/great_expectations/execution_engine/sparkdf_execution_engine.py @@ -5,7 +5,6 @@ import logging from functools import reduce from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -62,6 +61,15 @@ RowConditionParserType, parse_condition_to_spark, ) +from great_expectations.optional_imports import ( + F, + pyspark_DataFrameReader, + pyspark_sql_DataFrame, + pyspark_sql_Row, + pyspark_sql_SparkSession, + pyspark_sql_utils_AnalysisException, + sparktypes, +) from great_expectations.validator.computed_metric import MetricValue # noqa: TCH001 from great_expectations.validator.metric_configuration import ( MetricConfiguration, # noqa: TCH001 @@ -69,32 +77,6 @@ logger = logging.getLogger(__name__) -try: - import pyspark - import pyspark.sql.functions as F - - # noinspection SpellCheckingInspection - import pyspark.sql.types as sparktypes - from pyspark import SparkContext - from pyspark.sql import DataFrame, Row, SparkSession - from pyspark.sql.readwriter import DataFrameReader -except ImportError: - pyspark = None # type: ignore[assignment] - SparkContext = None # type: ignore[assignment,misc] - SparkSession = None # type: ignore[assignment,misc] - Row = None # type: ignore[assignment,misc] - DataFrame = None # type: ignore[assignment,misc] - DataFrameReader = None # type: ignore[assignment,misc] - F = None # type: ignore[assignment] - # noinspection SpellCheckingInspection - sparktypes = None # type: ignore[assignment] - - logger.debug( - "Unable to load pyspark; install optional spark dependency for support." 
- ) - -if TYPE_CHECKING: - from pyspark.sql import DataFrame # noqa: TCH004 # noinspection SpellCheckingInspection def apply_dateutil_parse(column): @@ -220,7 +202,7 @@ def __init__( if spark_config is None: spark_config = {} - spark: SparkSession = get_or_create_spark_application( + spark: pyspark_sql_SparkSession = get_or_create_spark_application( spark_config=spark_config, force_reuse_spark_context=force_reuse_spark_context, ) @@ -248,7 +230,7 @@ def __init__( self._data_sampler = SparkDataSampler() @property - def dataframe(self) -> DataFrame: + def dataframe(self) -> pyspark_sql_DataFrame: """If a batch has been loaded, returns a Spark Dataframe containing the data within the loaded batch""" if self.batch_manager.active_batch_data is None: raise ValueError( @@ -258,9 +240,9 @@ def dataframe(self) -> DataFrame: return cast(SparkDFBatchData, self.batch_manager.active_batch_data).dataframe def load_batch_data( # type: ignore[override] - self, batch_id: str, batch_data: Union[SparkDFBatchData, DataFrame] + self, batch_id: str, batch_data: Union[SparkDFBatchData, pyspark_sql_DataFrame] ) -> None: - if isinstance(batch_data, DataFrame): + if pyspark_sql_DataFrame and isinstance(batch_data, pyspark_sql_DataFrame): # type: ignore[truthy-function] batch_data = SparkDFBatchData(self, batch_data) elif not isinstance(batch_data, SparkDFBatchData): raise GreatExpectationsError( @@ -293,8 +275,8 @@ def get_batch_data_and_markers( reader_method: str reader_options: dict path: str - schema: Optional[Union[pyspark.sql.types.StructType, dict, str]] - reader: DataFrameReader + schema: Optional[Union[sparktypes.StructType, dict, str]] + reader: pyspark_DataFrameReader reader_fn: Callable if isinstance(batch_spec, RuntimeDataBatchSpec): # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated @@ -377,7 +359,7 @@ def get_batch_data_and_markers( """ ) # pyspark will raise an AnalysisException error if path is incorrect - except pyspark.sql.utils.AnalysisException: + except pyspark_sql_utils_AnalysisException: raise ExecutionEngineError( f"""Unable to read in batch from the following path: {path}. Please check your configuration.""" ) @@ -493,7 +475,7 @@ def _get_reader_fn(self, reader, reader_method=None, path=None) -> Callable: def get_domain_records( # noqa: C901 - 18 self, domain_kwargs: dict, - ) -> DataFrame: + ) -> "pyspark_sql_DataFrame": # noqa F821 """Uses the given Domain kwargs (which include row_condition, condition_parser, and ignore_row_if directives) to obtain and/or query a batch. Args: @@ -502,6 +484,11 @@ def get_domain_records( # noqa: C901 - 18 Returns: A DataFrame (the data on which to compute returned in the format of a Spark DataFrame) """ + """ + # TODO: Docusaurus run fails, unless "pyspark_sql_DataFrame" type hint above is enclosed in quotes. + This may be caused by it becoming great_expectations.optional_imports.NotImported when pyspark is not installed. + + """ table = domain_kwargs.get("table", None) if table: raise ValueError( @@ -639,7 +626,7 @@ def get_compute_domain( domain_kwargs: dict, domain_type: Union[str, MetricDomainTypes], accessor_keys: Optional[Iterable[str]] = None, - ) -> Tuple[DataFrame, dict, dict]: + ) -> Tuple["pyspark_sql_DataFrame", dict, dict]: # noqa F821 """Uses a DataFrame and Domain kwargs (which include a row condition and a condition parser) to obtain and/or query a Batch of data. Returns in the format of a Spark DataFrame along with Domain arguments required for computing. 
If the Domain \ @@ -661,13 +648,20 @@ def get_compute_domain( - a dictionary of accessor_domain_kwargs, describing any accessors needed to identify the Domain within the compute domain """ + """ + # TODO: Docusaurus run fails, unless "pyspark_sql_DataFrame" type hint above is enclosed in quotes. + This may be caused by it becoming great_expectations.optional_imports.NotImported when pyspark is not installed. + + """ table: str = domain_kwargs.get("table", None) if table: raise ValueError( "SparkDFExecutionEngine does not currently support multiple named tables." ) - data: DataFrame = self.get_domain_records(domain_kwargs=domain_kwargs) + data: pyspark_sql_DataFrame = self.get_domain_records( + domain_kwargs=domain_kwargs + ) split_domain_kwargs: SplitDomainKwargs = self._split_domain_kwargs( domain_kwargs, domain_type, accessor_keys @@ -731,7 +725,7 @@ def resolve_metric_bundle( """ resolved_metrics: Dict[Tuple[str, str, str], MetricValue] = {} - res: List[Row] + res: List[pyspark_sql_Row] aggregates: Dict[Tuple[str, str, str], dict] = {} @@ -764,7 +758,9 @@ def resolve_metric_bundle( for aggregate in aggregates.values(): domain_kwargs: dict = aggregate["domain_kwargs"] - df: DataFrame = self.get_domain_records(domain_kwargs=domain_kwargs) + df: pyspark_sql_DataFrame = self.get_domain_records( + domain_kwargs=domain_kwargs + ) assert len(aggregate["column_aggregates"]) == len(aggregate["metric_ids"]) diff --git a/great_expectations/execution_engine/split_and_sample/sparkdf_data_sampler.py b/great_expectations/execution_engine/split_and_sample/sparkdf_data_sampler.py index 78ade32115ea..5c3045076c99 100644 --- a/great_expectations/execution_engine/split_and_sample/sparkdf_data_sampler.py +++ b/great_expectations/execution_engine/split_and_sample/sparkdf_data_sampler.py @@ -6,33 +6,17 @@ from great_expectations.execution_engine.split_and_sample.data_sampler import ( DataSampler, ) +from great_expectations.optional_imports import F, pyspark_sql_DataFrame, sparktypes logger = logging.getLogger(__name__) -try: - import pyspark - import pyspark.sql.functions as F - - # noinspection SpellCheckingInspection - import pyspark.sql.types as sparktypes - from pyspark.sql import DataFrame - -except ImportError: - pyspark = None # type: ignore[assignment] - DataFrame = None # type: ignore[assignment,misc] - F = None # type: ignore[assignment] - # noinspection SpellCheckingInspection - sparktypes = None # type: ignore[assignment] - - logger.debug( - "Unable to load pyspark; install optional spark dependency for support." - ) - class SparkDataSampler(DataSampler): """Methods for sampling a Spark dataframe.""" - def sample_using_limit(self, df: DataFrame, batch_spec: BatchSpec) -> DataFrame: + def sample_using_limit( + self, df: pyspark_sql_DataFrame, batch_spec: BatchSpec + ) -> pyspark_sql_DataFrame: """Sample the first n rows of data. Args: @@ -52,7 +36,9 @@ def sample_using_limit(self, df: DataFrame, batch_spec: BatchSpec) -> DataFrame: n: int = batch_spec["sampling_kwargs"]["n"] return df.limit(n) - def sample_using_random(self, df: DataFrame, batch_spec: BatchSpec) -> DataFrame: + def sample_using_random( + self, df: pyspark_sql_DataFrame, batch_spec: BatchSpec + ) -> pyspark_sql_DataFrame: """Take a random sample of rows, retaining proportion p. 
Args: @@ -79,7 +65,9 @@ def sample_using_random(self, df: DataFrame, batch_spec: BatchSpec) -> DataFrame ) return res - def sample_using_mod(self, df: DataFrame, batch_spec: BatchSpec) -> DataFrame: + def sample_using_mod( + self, df: pyspark_sql_DataFrame, batch_spec: BatchSpec + ) -> pyspark_sql_DataFrame: """Take the mod of named column, and only keep rows that match the given value. Args: @@ -112,9 +100,9 @@ def sample_using_mod(self, df: DataFrame, batch_spec: BatchSpec) -> DataFrame: def sample_using_a_list( self, - df: DataFrame, + df: pyspark_sql_DataFrame, batch_spec: BatchSpec, - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Match the values in the named column against value_list, and only keep the matches. Args: @@ -141,9 +129,9 @@ def sample_using_a_list( def sample_using_hash( self, - df: DataFrame, + df: pyspark_sql_DataFrame, batch_spec: BatchSpec, - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Hash the values in the named column, and only keep rows that match the given hash_value. Args: diff --git a/great_expectations/execution_engine/split_and_sample/sparkdf_data_splitter.py b/great_expectations/execution_engine/split_and_sample/sparkdf_data_splitter.py index 683ff8a6b894..1416f05d7bb0 100644 --- a/great_expectations/execution_engine/split_and_sample/sparkdf_data_splitter.py +++ b/great_expectations/execution_engine/split_and_sample/sparkdf_data_splitter.py @@ -9,26 +9,10 @@ DataSplitter, DatePart, ) +from great_expectations.optional_imports import F, pyspark_sql_DataFrame, sparktypes logger = logging.getLogger(__name__) -try: - import pyspark - import pyspark.sql.functions as F - - # noinspection SpellCheckingInspection - import pyspark.sql.types as sparktypes - from pyspark.sql import DataFrame -except ImportError: - pyspark = None # type: ignore[assignment] - F = None # type: ignore[assignment] - DataFrame = None # type: ignore[assignment,misc] - # noinspection SpellCheckingInspection - sparktypes = None # type: ignore[assignment] - logger.debug( - "Unable to load pyspark; install optional spark dependency if you will be working with Spark dataframes" - ) - class SparkDataSplitter(DataSplitter): """Methods for splitting data accessible via SparkDFExecutionEngine. @@ -39,10 +23,10 @@ class SparkDataSplitter(DataSplitter): def split_on_year( self, - df: DataFrame, + df: pyspark_sql_DataFrame, column_name: str, batch_identifiers: dict, - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Split on year values in column_name. Args: @@ -65,10 +49,10 @@ def split_on_year( def split_on_year_and_month( self, - df: DataFrame, + df: pyspark_sql_DataFrame, column_name: str, batch_identifiers: dict, - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Split on year and month values in column_name. Args: @@ -91,10 +75,10 @@ def split_on_year_and_month( def split_on_year_and_month_and_day( self, - df: DataFrame, + df: pyspark_sql_DataFrame, column_name: str, batch_identifiers: dict, - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Split on year and month and day values in column_name. Args: @@ -117,11 +101,11 @@ def split_on_year_and_month_and_day( def split_on_date_parts( self, - df: DataFrame, + df: pyspark_sql_DataFrame, column_name: str, batch_identifiers: dict, date_parts: Union[List[DatePart], List[str]], - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Split on date_part values in column_name. 
Values are NOT truncated, for example this will return data for a @@ -189,8 +173,8 @@ def _convert_date_part_to_spark_equivalent(date_part: DatePart | str) -> str: @staticmethod def split_on_whole_table( - df: DataFrame, - ) -> DataFrame: + df: pyspark_sql_DataFrame, + ) -> pyspark_sql_DataFrame: """No op. Return the same data that is passed in. Args: @@ -204,7 +188,7 @@ def split_on_whole_table( @staticmethod def split_on_column_value( df, column_name: str, batch_identifiers: dict - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Return a dataframe where rows are filtered based on the specified column value. Args: @@ -223,7 +207,7 @@ def split_on_converted_datetime( column_name: str, batch_identifiers: dict, date_format_string: str = "yyyy-MM-dd", - ) -> DataFrame: + ) -> pyspark_sql_DataFrame: """Return a dataframe where rows are filtered based on whether their converted datetime (using date_format_string) matches the datetime string value provided in batch_identifiers for the specified column. diff --git a/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_sampler.py b/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_sampler.py index 15428f261aa9..6c4fb6c33d91 100644 --- a/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_sampler.py +++ b/great_expectations/execution_engine/split_and_sample/sqlalchemy_data_sampler.py @@ -8,18 +8,14 @@ DataSampler, ) from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect -from great_expectations.optional_imports import sqlalchemy as sa - -try: - from sa.engine import Dialect - from sa.sql import Selectable - from sa.sql.elements import BinaryExpression, BooleanClauseList -except ImportError: - Selectable = None - BinaryExpression = None - BooleanClauseList = None - Dialect = None - +from great_expectations.optional_imports import ( + sa_sql_expression_BinaryExpression, + sa_sql_expression_BooleanClauseList, + sa_sql_expression_Selectable, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) if TYPE_CHECKING: from great_expectations.execution_engine import SqlAlchemyExecutionEngine @@ -32,8 +28,10 @@ def sample_using_limit( self, execution_engine: SqlAlchemyExecutionEngine, batch_spec: BatchSpec, - where_clause: Optional[Selectable] = None, - ) -> Union[str, BinaryExpression, BooleanClauseList]: + where_clause: Optional[sa_sql_expression_Selectable] = None, + ) -> Union[ + str, sa_sql_expression_BinaryExpression, sa_sql_expression_BooleanClauseList + ]: """Sample using a limit with configuration provided via the batch_spec. Note: where_clause needs to be included at this stage since SqlAlchemy's semantics @@ -66,7 +64,7 @@ def sample_using_limit( if dialect_name == GXSqlDialect.ORACLE: # TODO: AJB 20220429 WARNING THIS oracle dialect METHOD IS NOT COVERED BY TESTS # limit doesn't compile properly for oracle so we will append rownum to query string later - raw_query: Selectable = ( + raw_query: sa_sql_expression_Selectable = ( sa.select("*") .select_from( sa.table(table_name, schema=batch_spec.get("schema_name", None)) @@ -84,7 +82,7 @@ def sample_using_limit( elif dialect_name == GXSqlDialect.MSSQL: # Note that this code path exists because the limit parameter is not getting rendered # successfully in the resulting mssql query. 
- selectable_query: Selectable = ( + selectable_query: sa_sql_expression_Selectable = ( sa.select("*") .select_from( sa.table(table_name, schema=batch_spec.get("schema_name", None)) @@ -137,8 +135,8 @@ def _validate_mssql_limit_param(n: Union[str, int]) -> None: def sample_using_random( execution_engine: SqlAlchemyExecutionEngine, batch_spec: BatchSpec, - where_clause: Optional[Selectable] = None, - ) -> Selectable: + where_clause: Optional[sa_sql_expression_Selectable] = None, + ) -> sa_sql_expression_Selectable: """Sample using random data with configuration provided via the batch_spec. Note: where_clause needs to be included at this stage since we use the where clause @@ -180,7 +178,7 @@ def sample_using_random( def sample_using_mod( self, batch_spec: BatchSpec, - ) -> Selectable: + ) -> sa_sql_expression_Selectable: """Take the mod of named column, and only keep rows that match the given value. Args: @@ -207,7 +205,7 @@ def sample_using_mod( def sample_using_a_list( self, batch_spec: BatchSpec, - ) -> Selectable: + ) -> sa_sql_expression_Selectable: """Match the values in the named column against value_list, and only keep the matches. Args: @@ -233,7 +231,7 @@ def sample_using_a_list( def sample_using_md5( self, batch_spec: BatchSpec, - ) -> Selectable: + ) -> sa_sql_expression_Selectable: """Hash the values in the named column using md5, and only keep rows that match the given hash_value. Args: diff --git a/great_expectations/execution_engine/sqlalchemy_batch_data.py b/great_expectations/execution_engine/sqlalchemy_batch_data.py index 7f3a1fefda43..273625367f16 100644 --- a/great_expectations/execution_engine/sqlalchemy_batch_data.py +++ b/great_expectations/execution_engine/sqlalchemy_batch_data.py @@ -3,21 +3,17 @@ from great_expectations.core.batch import BatchData from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect +from great_expectations.optional_imports import ( + quoted_name, + sqlalchemy_DatabaseError, + sqlalchemy_engine_DefaultDialect, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.util import generate_temporary_table_name -try: - import sqlalchemy as sa # noqa: TID251 - from sqlalchemy.engine import Engine # noqa: TID251 - from sqlalchemy.engine.default import DefaultDialect # noqa: TID251 - from sqlalchemy.exc import DatabaseError # noqa: TID251 - from sqlalchemy.sql.elements import quoted_name # noqa: TID251 -except ImportError: - sa = None - quoted_name = None - DefaultDialect = None - DatabaseError = None - Engine = None - logger = logging.getLogger(__name__) @@ -181,7 +177,7 @@ def dialect(self) -> GXSqlDialect: return self._dialect @property - def sql_engine_dialect(self) -> DefaultDialect: + def sql_engine_dialect(self) -> sqlalchemy_engine_DefaultDialect: """Returns the Batches' current engine dialect""" return self._engine.dialect @@ -293,13 +289,13 @@ def _create_temporary_table( # noqa: C901 - 18 with connection.begin(): try: connection.execute(sa.text(stmt_1)) - except DatabaseError: + except sqlalchemy_DatabaseError: connection.execute(sa.text(stmt_2)) else: # Since currently self._engine can also be a connection we need to # check first that it is an engine before creating a connection from it. # Otherwise, we use the connection. 
- if isinstance(self._engine, Engine): + if isinstance(self._engine, sqlalchemy_engine_Engine): with self._engine.connect() as connection: with connection.begin(): connection.execute(sa.text(stmt)) diff --git a/great_expectations/execution_engine/sqlalchemy_execution_engine.py b/great_expectations/execution_engine/sqlalchemy_execution_engine.py index 7494988b1ddf..cf6858ba58f9 100644 --- a/great_expectations/execution_engine/sqlalchemy_execution_engine.py +++ b/great_expectations/execution_engine/sqlalchemy_execution_engine.py @@ -45,9 +45,20 @@ SqlAlchemyDataSplitter, ) from great_expectations.optional_imports import ( - sqlalchemy_Engine, + quoted_name, + sa_sql_expression_Select, + sa_sql_expression_Selectable, + sa_sql_expression_TextualSelect, + sqlalchemy_engine_Dialect, + sqlalchemy_engine_Engine, + sqlalchemy_engine_Row, + sqlalchemy_OperationalError, + sqlalchemy_TextClause, sqlalchemy_version_check, ) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.validator.computed_metric import MetricValue # noqa: TCH001 del get_versions # isort:skip @@ -91,39 +102,10 @@ logger = logging.getLogger(__name__) -try: - import sqlalchemy as sa # noqa: TID251 +if sa: sqlalchemy_version_check(sa.__version__) - make_url = import_make_url() -except ImportError: - sa = None - -try: - from sqlalchemy.engine import Dialect, Row # noqa: TID251 - from sqlalchemy.exc import OperationalError # noqa: TID251 - from sqlalchemy.sql import Selectable # noqa: TID251 - from sqlalchemy.sql.elements import ( # noqa: TID251 - BooleanClauseList, - Label, - TextClause, - quoted_name, - ) - from sqlalchemy.sql.selectable import Select, TextualSelect # noqa: TID251 -except ImportError: - BooleanClauseList = None - DefaultDialect = None - Dialect = None - Label = None - OperationalError = None - reflection = None - Row = None - Select = None - Selectable = None - TextClause = None - TextualSelect = None - quoted_name = None try: @@ -450,7 +432,9 @@ def _on_connect(dbapi_con, connection_record): self._engine_backup = self.engine # sqlite/mssql temp tables only persist within a connection so override the engine # but only do this if self.engine is an Engine and isn't a Connection - if sqlalchemy_Engine and isinstance(self.engine, sqlalchemy_Engine): + if sqlalchemy_engine_Engine and isinstance( + self.engine, sqlalchemy_engine_Engine + ): self.engine = self.engine.connect() # Send a connect event to provide dialect type @@ -499,7 +483,7 @@ def url(self) -> Optional[str]: return self._url @property - def dialect(self) -> Dialect: + def dialect(self) -> sqlalchemy_engine_Dialect: return self.engine.dialect @property @@ -597,7 +581,7 @@ def _get_sqlalchemy_key_pair_auth_url( def get_domain_records( # noqa: C901 - 24 self, domain_kwargs: dict, - ) -> Selectable: + ) -> sa_sql_expression_Selectable: """Uses the given Domain kwargs (which include row_condition, condition_parser, and ignore_row_if directives) to obtain and/or query a Batch of data. Args: @@ -629,7 +613,7 @@ def get_domain_records( # noqa: C901 - 24 f"Unable to find batch with batch_id {batch_id}" ) - selectable: Selectable + selectable: sa_sql_expression_Selectable if "table" in domain_kwargs and domain_kwargs["table"] is not None: # TODO: Add logic to handle record_set_name once implemented # (i.e. multiple record sets (tables) in one batch @@ -654,7 +638,7 @@ def get_domain_records( # noqa: C901 - 24 as a subquery wrapped in "(subquery) alias". 
TextClause must first be converted to TextualSelect using sa.columns() before it can be converted to type Subquery """ - if TextClause and isinstance(selectable, TextClause): + if sqlalchemy_TextClause and isinstance(selectable, sqlalchemy_TextClause): selectable = selectable.columns().subquery() # Filtering by row condition. @@ -815,7 +799,7 @@ def get_compute_domain( domain_kwargs: dict, domain_type: Union[str, MetricDomainTypes], accessor_keys: Optional[Iterable[str]] = None, - ) -> Tuple[Selectable, dict, dict]: + ) -> Tuple[sa_sql_expression_Selectable, dict, dict]: """Uses a given batch dictionary and Domain kwargs to obtain a SqlAlchemy column object. Args: @@ -834,7 +818,9 @@ def get_compute_domain( domain_kwargs, domain_type, accessor_keys ) - selectable: Selectable = self.get_domain_records(domain_kwargs=domain_kwargs) + selectable: sa_sql_expression_Selectable = self.get_domain_records( + domain_kwargs=domain_kwargs + ) return selectable, split_domain_kwargs.compute, split_domain_kwargs.accessor @@ -986,7 +972,7 @@ def resolve_metric_bundle( """ resolved_metrics: Dict[Tuple[str, str, str], MetricValue] = {} - res: List[Row] + res: List[sqlalchemy_engine_Row] # We need a different query for each Domain (where clause). queries: Dict[Tuple[str, str, str], dict] = {} @@ -1032,7 +1018,7 @@ def resolve_metric_bundle( for query in queries.values(): domain_kwargs: dict = query["domain_kwargs"] - selectable: Selectable = self.get_domain_records( + selectable: sa_sql_expression_Selectable = self.get_domain_records( domain_kwargs=domain_kwargs ) @@ -1044,12 +1030,18 @@ def resolve_metric_bundle( as a subquery wrapped in "(subquery) alias". TextClause must first be converted to TextualSelect using sa.columns() before it can be converted to type Subquery """ - if TextClause and isinstance(selectable, TextClause): + if sqlalchemy_TextClause and isinstance( + selectable, sqlalchemy_TextClause + ): sa_query_object = sa.select(*query["select"]).select_from( selectable.columns().subquery() ) - elif (Select and isinstance(selectable, Select)) or ( - TextualSelect and isinstance(selectable, TextualSelect) + elif ( + sa_sql_expression_Select + and isinstance(selectable, sa_sql_expression_Select) + ) or ( + sa_sql_expression_TextualSelect + and isinstance(selectable, sa_sql_expression_TextualSelect) ): sa_query_object = sa.select(*query["select"]).select_from( selectable.subquery() @@ -1061,15 +1053,18 @@ def resolve_metric_bundle( logger.debug(f"Attempting query {str(sa_query_object)}") - if sqlalchemy_Engine and isinstance(self.engine, sqlalchemy_Engine): + if sqlalchemy_engine_Engine and isinstance( + self.engine, sqlalchemy_engine_Engine + ): self.engine = self.engine.connect() + res = self.engine.execute(sa_query_object).fetchall() logger.debug( f"""SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id \ {IDDict(domain_kwargs).to_id()}""" ) - except OperationalError as oe: + except sqlalchemy_OperationalError as oe: exception_message: str = "An SQL execution Exception occurred. " exception_traceback: str = traceback.format_exc() exception_message += f'{type(oe).__name__}: "{str(oe)}". Traceback: "{exception_traceback}".' 
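# --- Illustrative sketch (not part of the patch) ----------------------------------
# Minimal example of the TextClause handling described in the comments above: a raw
# SQL snippet must be given column metadata via .columns() before .subquery() can wrap
# it for use in select_from().  Table and column names here are placeholders.
import sqlalchemy as sa

raw = sa.text("SELECT id, amount FROM payments")
textual_select = raw.columns(sa.column("id"), sa.column("amount"))
subquery = textual_select.subquery()

stmt = sa.select(sa.func.count()).select_from(subquery)
# -----------------------------------------------------------------------------------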
@@ -1129,7 +1124,9 @@ def _get_splitter_method(self, splitter_method_name: str) -> Callable: """ return self._data_splitter.get_splitter_method(splitter_method_name) - def execute_split_query(self, split_query: Selectable) -> List[Row]: + def execute_split_query( + self, split_query: sa_sql_expression_Selectable + ) -> List[sqlalchemy_engine_Row]: """Use the execution engine to run the split query and fetch all of the results. Args: @@ -1149,16 +1146,23 @@ def execute_split_query(self, split_query: Selectable) -> List[Row]: pattern = re.compile(r"(CAST\(EXTRACT\(.*?\))( AS STRING\))", re.IGNORECASE) split_query = re.sub(pattern, r"\1 AS VARCHAR)", split_query) - if sqlalchemy_Engine and isinstance(self.engine, sqlalchemy_Engine): + if sqlalchemy_engine_Engine and isinstance( + self.engine, sqlalchemy_engine_Engine + ): connection = self.engine.connect() else: connection = self.engine - query_result: List[Row] = connection.execute(split_query).fetchall() + query_result: List[sqlalchemy_engine_Row] = connection.execute( + split_query + ).fetchall() return query_result def get_data_for_batch_identifiers( - self, selectable: Selectable, splitter_method_name: str, splitter_kwargs: dict + self, + selectable: sa_sql_expression_Selectable, + splitter_method_name: str, + splitter_kwargs: dict, ) -> List[dict]: """Build data used to construct batch identifiers for the input table using the provided splitter config. @@ -1181,7 +1185,7 @@ def get_data_for_batch_identifiers( def _build_selectable_from_batch_spec( self, batch_spec: BatchSpec - ) -> Union[Selectable, str]: + ) -> Union[sa_sql_expression_Selectable, str]: if ( batch_spec.get("query") is not None and batch_spec.get("sampling_method") is not None @@ -1206,7 +1210,7 @@ def _build_selectable_from_batch_spec( else: split_clause = sa.true() - selectable: Selectable = self._subselectable(batch_spec) + selectable: sa_sql_expression_Selectable = self._subselectable(batch_spec) sampling_method: Optional[str] = batch_spec.get("sampling_method") if sampling_method is not None: if sampling_method in [ @@ -1236,10 +1240,10 @@ def _build_selectable_from_batch_spec( return sa.select("*").select_from(selectable).where(split_clause) - def _subselectable(self, batch_spec: BatchSpec) -> Selectable: + def _subselectable(self, batch_spec: BatchSpec) -> sa_sql_expression_Selectable: table_name = batch_spec.get("table_name") query = batch_spec.get("query") - selectable: Selectable + selectable: sa_sql_expression_Selectable if table_name: selectable = sa.table( table_name, schema=batch_spec.get("schema_name", None) @@ -1317,9 +1321,9 @@ def get_batch_data_and_markers( source_schema_name=source_schema_name, ) elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec): - selectable: Union[Selectable, str] = self._build_selectable_from_batch_spec( - batch_spec=batch_spec - ) + selectable: Union[ + sa_sql_expression_Selectable, str + ] = self._build_selectable_from_batch_spec(batch_spec=batch_spec) batch_data = SqlAlchemyBatchData( execution_engine=self, selectable=selectable, diff --git a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py index 0b148adbe5cb..74f217ede289 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py @@ -27,6 +27,7 @@ render_evaluation_parameter_string, ) from great_expectations.expectations.registry import 
get_metric_kwargs +from great_expectations.optional_imports import sparktypes from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent from great_expectations.render.renderer.renderer import renderer from great_expectations.render.renderer_configuration import ( @@ -52,14 +53,6 @@ logger = logging.getLogger(__name__) -try: - import pyspark.sql.types as sparktypes -except ImportError as e: - logger.debug(str(e)) - logger.debug( - "Unable to load spark context; install optional spark dependency for support." - ) - class ExpectColumnValuesToBeInTypeList(ColumnMapExpectation): """ diff --git a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py index aca1fe3e0a41..a557435231f6 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py @@ -24,6 +24,13 @@ render_evaluation_parameter_string, ) from great_expectations.expectations.registry import get_metric_kwargs +from great_expectations.optional_imports import ( + sparktypes, + sqlalchemy_dialects_registry, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.render import LegacyRendererType, RenderedStringTemplateContent from great_expectations.render.renderer.renderer import renderer from great_expectations.render.renderer_configuration import ( @@ -46,24 +53,6 @@ logger = logging.getLogger(__name__) -try: - import pyspark.sql.types as sparktypes -except ImportError as e: - logger.debug(str(e)) - logger.debug( - "Unable to load spark context; install optional spark dependency for support." - ) - -try: - import sqlalchemy as sa # noqa: TID251 - from sqlalchemy.dialects import registry # noqa: TID251 - -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - sa = None - registry = None try: import sqlalchemy_redshift.dialect @@ -75,7 +64,9 @@ try: import sqlalchemy_bigquery as sqla_bigquery - registry.register("bigquery", _BIGQUERY_MODULE_NAME, "BigQueryDialect") + sqlalchemy_dialects_registry.register( + "bigquery", _BIGQUERY_MODULE_NAME, "BigQueryDialect" + ) bigquery_types_tuple = None try: from sqlalchemy_bigquery import GEOGRAPHY # noqa: F401 @@ -97,7 +88,9 @@ # Sometimes "pybigquery.sqlalchemy_bigquery" fails to self-register in Azure (our CI/CD pipeline) in certain cases, so we do it explicitly. 
# (see https://stackoverflow.com/questions/53284762/nosuchmoduleerror-cant-load-plugin-sqlalchemy-dialectssnowflake) - registry.register("bigquery", _BIGQUERY_MODULE_NAME, "dialect") + sqlalchemy_dialects_registry.register( + "bigquery", _BIGQUERY_MODULE_NAME, "dialect" + ) try: getattr(sqla_bigquery, "INTEGER") bigquery_types_tuple = None diff --git a/great_expectations/expectations/metrics/column_aggregate_metric_provider.py b/great_expectations/expectations/metrics/column_aggregate_metric_provider.py index e1427664b7d9..287cd62a979e 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metric_provider.py +++ b/great_expectations/expectations/metrics/column_aggregate_metric_provider.py @@ -14,7 +14,6 @@ SqlAlchemyExecutionEngine, ) from great_expectations.expectations.metrics import DeprecatedMetaMetricProvider -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, metric_value, @@ -25,6 +24,8 @@ from great_expectations.expectations.metrics.util import ( get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import quoted_name +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.validator.metric_configuration import MetricConfiguration logger = logging.getLogger(__name__) diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_bootstrapped_ks_test_p_value.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_bootstrapped_ks_test_p_value.py index 5612bf944491..aebaf8c141ea 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_bootstrapped_ks_test_p_value.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_bootstrapped_ks_test_p_value.py @@ -11,13 +11,6 @@ logger = logging.getLogger(__name__) -try: - from pyspark.sql.functions import stddev_samp # noqa: F401 -except ImportError as e: - logger.debug(str(e)) - logger.debug( - "Unable to load spark context; install optional spark dependency for support." 
- ) import numpy as np from scipy import stats diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_distinct_values.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_distinct_values.py index 9dbb9551dbf8..603a5575e1fe 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_distinct_values.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_distinct_values.py @@ -15,18 +15,19 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import ( F, pyspark_sql_Column, pyspark_sql_DataFrame, pyspark_sql_Row, - sa_func_count, sa_sql_expression_ColumnClause, sa_sql_expression_Selectable, sqlalchemy_engine_Engine, ) -from great_expectations.expectations.metrics.metric_provider import metric_value -from great_expectations.optional_imports import sqlalchemy as sa +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.validator.metric_configuration import MetricConfiguration @@ -116,7 +117,7 @@ def _sqlalchemy( cls, column: sa_sql_expression_ColumnClause, **kwargs, - ) -> sa_func_count: + ) -> sa_sql_expression_Selectable: """ Past implementations of column.distinct_values.count depended on column.value_counts and column.distinct_values. This was causing performance issues due to the complex query used in column.value_counts and subsequent diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_histogram.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_histogram.py index 1b5befdcf8d6..51dc0a6456c2 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_histogram.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_histogram.py @@ -18,8 +18,9 @@ from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, ) -from great_expectations.expectations.metrics.import_manager import Bucketizer, F, sa from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import F, pyspark_ml_Bucketizer +from great_expectations.optional_imports import sqlalchemy as sa logger = logging.getLogger(__name__) @@ -255,7 +256,9 @@ def _spark( bins.append(float("inf")) temp_column = df.select(column).where(F.col(column).isNotNull()) - bucketizer = Bucketizer(splits=bins, inputCol=column, outputCol="buckets") + bucketizer = pyspark_ml_Bucketizer( + splits=bins, inputCol=column, outputCol="buckets" + ) bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column) # This is painful to do, but: bucketizer cannot handle values outside of a range diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_max.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_max.py index 653e9c951e19..10c927887434 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_max.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_max.py @@ -13,7 +13,8 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports 
import sqlalchemy as sa from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_mean.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_mean.py index b24e226df8f1..422bafed6d03 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_mean.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_mean.py @@ -8,7 +8,8 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa class ColumnMean(ColumnAggregateMetricProvider): diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_min.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_min.py index e236459519ff..c4e1497ea888 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_min.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_min.py @@ -13,7 +13,8 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_parameterized_distribution_ks_test_p_value.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_parameterized_distribution_ks_test_p_value.py index 3999bb9ba724..256e2e030c58 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_parameterized_distribution_ks_test_p_value.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_parameterized_distribution_ks_test_p_value.py @@ -1,5 +1,7 @@ import logging +from scipy import stats + from great_expectations.execution_engine import PandasExecutionEngine from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, @@ -12,16 +14,6 @@ logger = logging.getLogger(__name__) -try: - from pyspark.sql.functions import stddev_samp # noqa: F401 -except ImportError as e: - logger.debug(str(e)) - logger.debug( - "Unable to load spark context; install optional spark dependency for support." 
- ) - -from scipy import stats - class ColumnParameterizedDistributionKSTestPValue(ColumnAggregateMetricProvider): """MetricProvider Class for Aggregate Standard Deviation metric""" diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py index 03dd8406e2da..6eee1398206a 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_quantile_values.py @@ -21,7 +21,18 @@ ) from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.expectations.metrics.util import attempt_allowing_relative_error -from great_expectations.optional_imports import sqlalchemy as sa +from great_expectations.optional_imports import ( + SQLALCHEMY_NOT_IMPORTED, + sa_sql_expression_CTE, + sa_sql_expression_Label, + sa_sql_expression_Select, + sa_sql_expression_WithinGroup, + sqlalchemy_ProgrammingError, + sqlalchemy_TextClause, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) @@ -30,33 +41,23 @@ except ImportError: TrinoUserError = None -try: - from sqlalchemy.exc import ProgrammingError # noqa: TID251 - from sqlalchemy.sql import Select # noqa: TID251 - from sqlalchemy.sql.elements import Label, TextClause, WithinGroup # noqa: TID251 - from sqlalchemy.sql.selectable import CTE # noqa: TID251 -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - ProgrammingError = None - Select = None - Label = None - TextClause = None - WithinGroup = None - CTE = None try: from sqlalchemy.engine.row import Row # noqa: TID251 + + from great_expectations.optional_imports import sqlalchemy # noqa: TID251 except ImportError: try: from sqlalchemy.engine.row import RowProxy # noqa: TID251 + from great_expectations.optional_imports import sqlalchemy # noqa: TID251 + Row = RowProxy except ImportError: logger.debug( "Unable to load SqlAlchemy Row class; please upgrade you sqlalchemy installation to the latest version." ) + sqlalchemy = SQLALCHEMY_NOT_IMPORTED RowProxy = None Row = None @@ -213,16 +214,18 @@ def _get_column_quantiles_mssql( column, quantiles: Iterable, selectable, sqlalchemy_engine ) -> list: # mssql requires over(), so we add an empty over() clause - selects: List[WithinGroup] = [ + selects: List[sa_sql_expression_WithinGroup] = [ sa.func.percentile_disc(quantile).within_group(column.asc()).over() for quantile in quantiles ] - quantiles_query: Select = sa.select(*selects).select_from(selectable) + quantiles_query: sa_sql_expression_Select = sa.select(*selects).select_from( + selectable + ) try: quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone() return list(quantiles_results) - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." 
exception_traceback: str = traceback.format_exc() exception_message += ( @@ -236,15 +239,17 @@ def _get_column_quantiles_bigquery( column, quantiles: Iterable, selectable, sqlalchemy_engine ) -> list: # BigQuery does not support "WITHIN", so we need a special case for it - selects: List[WithinGroup] = [ + selects: List[sa_sql_expression_WithinGroup] = [ sa.func.percentile_disc(column, quantile).over() for quantile in quantiles ] - quantiles_query: Select = sa.select(*selects).select_from(selectable) + quantiles_query: sa_sql_expression_Select = sa.select(*selects).select_from( + selectable + ) try: quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone() return list(quantiles_results) - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( @@ -259,7 +264,7 @@ def _get_column_quantiles_mysql( ) -> list: # MySQL does not support "percentile_disc", so we implement it as a compound query. # Please see https://stackoverflow.com/questions/19770026/calculate-percentile-value-using-mysql for reference. - percent_rank_query: CTE = ( + percent_rank_query: sa_sql_expression_CTE = ( sa.select( column, sa.cast( @@ -272,19 +277,19 @@ def _get_column_quantiles_mysql( .cte("t") ) - selects: List[WithinGroup] = [] + selects: List[sa_sql_expression_WithinGroup] = [] for idx, quantile in enumerate(quantiles): # pymysql cannot handle conversion of numpy float64 to float; convert just in case if np.issubdtype(type(quantile), np.float_): quantile = float(quantile) - quantile_column: Label = ( + quantile_column: sa_sql_expression_Label = ( sa.func.first_value(column) .over( order_by=sa.case( ( - percent_rank_query.c.p + percent_rank_query.columns.p <= sa.cast(quantile, sa.dialects.mysql.DECIMAL(18, 15)), - percent_rank_query.c.p, + percent_rank_query.columns.p, ), else_=None, ).desc() @@ -292,14 +297,14 @@ def _get_column_quantiles_mysql( .label(f"q_{idx}") ) selects.append(quantile_column) - quantiles_query: Select = ( - sa.select(*selects).distinct().order_by(percent_rank_query.c.p.desc()) + quantiles_query: sa_sql_expression_Select = ( + sa.select(*selects).distinct().order_by(percent_rank_query.columns.p.desc()) ) try: quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone() return list(quantiles_results) - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( @@ -314,13 +319,15 @@ def _get_column_quantiles_trino( ) -> list: # Trino does not have the percentile_disc func, but instead has approx_percentile sql_approx: str = f"approx_percentile({column}, ARRAY{list(quantiles)})" - selects_approx: List[TextClause] = [sa.text(sql_approx)] - quantiles_query: Select = sa.select(*selects_approx).select_from(selectable) + selects_approx: List[sqlalchemy_TextClause] = [sa.text(sql_approx)] + quantiles_query: sa_sql_expression_Select = sa.select(*selects_approx).select_from( + selectable + ) try: quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone() return list(quantiles_results)[0] - except (ProgrammingError, TrinoUserError) as pe: + except (sqlalchemy_ProgrammingError, TrinoUserError) as pe: exception_message: str = "An SQL syntax Exception occurred." 
exception_traceback: str = traceback.format_exc() exception_message += ( @@ -340,7 +347,7 @@ def _get_column_quantiles_sqlite( the analytical processing is not a very strongly represented capability of the SQLite database management system. """ offsets: List[int] = [quantile * table_row_count - 1 for quantile in quantiles] - quantile_queries: List[Select] = [ + quantile_queries: List[sa_sql_expression_Select] = [ sa.select(column) .order_by(column.asc()) .offset(offset) @@ -350,7 +357,7 @@ def _get_column_quantiles_sqlite( ] quantile_result: Row - quantile_query: Select + quantile_query: sa_sql_expression_Select try: quantiles_results: List[Row] = [ sqlalchemy_engine.execute(quantile_query).fetchone() @@ -361,7 +368,7 @@ def _get_column_quantiles_sqlite( [list(quantile_result) for quantile_result in quantiles_results] ) ) - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( @@ -378,8 +385,10 @@ def _get_column_quantiles_athena( sqlalchemy_engine, ) -> list: approx_percentiles = f"approx_percentile({column}, ARRAY{list(quantiles)})" - selects_approx: List[TextClause] = [sa.text(approx_percentiles)] - quantiles_query_approx: Select = sa.select(*selects_approx).select_from(selectable) + selects_approx: List[sqlalchemy_TextClause] = [sa.text(approx_percentiles)] + quantiles_query_approx: sa_sql_expression_Select = sa.select( + *selects_approx + ).select_from(selectable) try: quantiles_results: Row = sqlalchemy_engine.execute( quantiles_query_approx @@ -387,9 +396,7 @@ def _get_column_quantiles_athena( # the ast literal eval is needed because the method is returning a json string and not a dict results = ast.literal_eval(quantiles_results[0]) return results - - return results - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += ( @@ -411,16 +418,18 @@ def _get_column_quantiles_generic_sqlalchemy( selectable, sqlalchemy_engine, ) -> list: - selects: List[WithinGroup] = [ + selects: List[sa_sql_expression_WithinGroup] = [ sa.func.percentile_disc(quantile).within_group(column.asc()) for quantile in quantiles ] - quantiles_query: Select = sa.select(*selects).select_from(selectable) + quantiles_query: sa_sql_expression_Select = sa.select(*selects).select_from( + selectable + ) try: quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone() return list(quantiles_results) - except ProgrammingError: + except sqlalchemy_ProgrammingError: # ProgrammingError: (psycopg2.errors.SyntaxError) Aggregate function "percentile_disc" is not supported; # use approximate percentile_disc or percentile_cont instead. 
if attempt_allowing_relative_error(dialect): @@ -428,17 +437,17 @@ def _get_column_quantiles_generic_sqlalchemy( sql_approx: str = get_approximate_percentile_disc_sql( selects=selects, sql_engine_dialect=dialect ) - selects_approx: List[TextClause] = [sa.text(sql_approx)] - quantiles_query_approx: Select = sa.select(*selects_approx).select_from( - selectable - ) + selects_approx: List[sqlalchemy_TextClause] = [sa.text(sql_approx)] + quantiles_query_approx: sa_sql_expression_Select = sa.select( + *selects_approx + ).select_from(selectable) if allow_relative_error or sqlalchemy_engine.driver == "psycopg2": try: quantiles_results: Row = sqlalchemy_engine.execute( quantiles_query_approx ).fetchone() return list(quantiles_results) - except ProgrammingError as pe: + except sqlalchemy_ProgrammingError as pe: exception_message: str = "An SQL syntax Exception occurred." exception_traceback: str = traceback.format_exc() exception_message += f'{type(pe).__name__}: "{str(pe)}". Traceback: "{exception_traceback}".' diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_standard_deviation.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_standard_deviation.py index 3a7e5e8f4f80..1b643ba90e92 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_standard_deviation.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_standard_deviation.py @@ -17,19 +17,12 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.validator.metric_configuration import MetricConfiguration logger = logging.getLogger(__name__) -try: - from pyspark.sql.functions import stddev_samp # noqa: F401 -except ImportError as e: - logger.debug(str(e)) - logger.debug( - "Unable to load spark context; install optional spark dependency for support." 
- ) - class ColumnStandardDeviation(ColumnAggregateMetricProvider): """MetricProvider Class for Aggregate Standard Deviation metric""" diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_sum.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_sum.py index 3756e5e970e0..eded5a5a458c 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_sum.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_sum.py @@ -8,7 +8,8 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa class ColumnSum(ColumnAggregateMetricProvider): diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_value_counts.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_value_counts.py index 4863221c9b6f..24cb8a019d3a 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_value_counts.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_value_counts.py @@ -11,16 +11,18 @@ from great_expectations.expectations.metrics.column_aggregate_metric_provider import ( ColumnAggregateMetricProvider, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import ( F, pyspark_sql_DataFrame, pyspark_sql_Row, - sa, sa_sql_expression_Select, sa_sql_expression_Selectable, sqlalchemy_engine_Row, ) -from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) class ColumnValueCounts(ColumnAggregateMetricProvider): diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_max.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_max.py index 98a3ebfa0ba0..bac5702fca53 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_max.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_max.py @@ -10,7 +10,8 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa class ColumnValuesLengthMax(ColumnAggregateMetricProvider): diff --git a/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_min.py b/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_min.py index 79e9f5b401c6..20005804fb33 100644 --- a/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_min.py +++ b/great_expectations/expectations/metrics/column_aggregate_metrics/column_values_length_min.py @@ -10,7 +10,8 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa class ColumnValuesLengthMin(ColumnAggregateMetricProvider): diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_value_lengths.py 
b/great_expectations/expectations/metrics/column_map_metrics/column_value_lengths.py index 707c5f9f2c1d..d356c8c5ed02 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_value_lengths.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_value_lengths.py @@ -10,12 +10,13 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, column_function_partial, ) +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.util import pandas_series_between_inclusive from great_expectations.validator.metric_configuration import MetricConfiguration diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_between.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_between.py index 38388e0f0c22..47e034fab6a9 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_between.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_between.py @@ -9,11 +9,12 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_decreasing.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_decreasing.py index 0e8dd64e91e4..a1e01c82946a 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_decreasing.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_decreasing.py @@ -10,12 +10,16 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, Window, sparktypes from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) from great_expectations.expectations.metrics.metric_provider import metric_partial +from great_expectations.optional_imports import ( + F, + pyspark_sql_Window, + sparktypes, +) from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes @@ -124,10 +128,13 @@ def _spark( column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType) ): diff = F.datediff( - column, F.lag(column).over(Window.orderBy(F.lit("constant"))) + column, + F.lag(column).over(pyspark_sql_Window.orderBy(F.lit("constant"))), ) else: - diff = column - F.lag(column).over(Window.orderBy(F.lit("constant"))) + diff = column - F.lag(column).over( + pyspark_sql_Window.orderBy(F.lit("constant")) + ) diff = F.when(diff.isNull(), -1).otherwise(diff) # NOTE: because in spark we are implementing the window function directly, diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py index c382241284d4..31c428c3586a 100644 --- 
a/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_in_set.py @@ -7,11 +7,11 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes try: diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_increasing.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_increasing.py index 7c01ef74ffb5..59bc1f38316c 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_increasing.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_increasing.py @@ -10,12 +10,16 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, Window, sparktypes from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) from great_expectations.expectations.metrics.metric_provider import metric_partial +from great_expectations.optional_imports import ( + F, + pyspark_sql_Window, + sparktypes, +) from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes @@ -124,10 +128,13 @@ def _spark( column_metadata["type"], (sparktypes.TimestampType, sparktypes.DateType) ): diff = F.datediff( - column, F.lag(column).over(Window.orderBy(F.lit("constant"))) + column, + F.lag(column).over(pyspark_sql_Window.orderBy(F.lit("constant"))), ) else: - diff = column - F.lag(column).over(Window.orderBy(F.lit("constant"))) + diff = column - F.lag(column).over( + pyspark_sql_Window.orderBy(F.lit("constant")) + ) diff = F.when(diff.isNull(), 1).otherwise(diff) # NOTE: because in spark we are implementing the window function directly, diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_json_parseable.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_json_parseable.py index 9af7c8e9175a..100acfdfcd4b 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_json_parseable.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_json_parseable.py @@ -4,11 +4,11 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F, sparktypes class ColumnValuesJsonParseable(ColumnMapMetricProvider): diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_match_json_schema.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_match_json_schema.py index 38e250f35d23..8bd4521066ed 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_match_json_schema.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_match_json_schema.py @@ -7,11 +7,11 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes from 
great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F, sparktypes class ColumnValuesMatchJsonSchema(ColumnMapMetricProvider): diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_match_strftime_format.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_match_strftime_format.py index 14df328afbc8..731ffcd4c777 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_match_strftime_format.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_match_strftime_format.py @@ -4,11 +4,11 @@ PandasExecutionEngine, SparkDFExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sparktypes from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import F, sparktypes class ColumnValuesMatchStrftimeFormat(ColumnMapMetricProvider): diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_unique.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_unique.py index 1a254da8e788..3e1bba768abd 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_unique.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_unique.py @@ -4,16 +4,16 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( - F, - Window, - sa, - sqlalchemy_engine_Engine, -) from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, ) +from great_expectations.optional_imports import ( + F, + pyspark_sql_Window, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.util import generate_temporary_table_name @@ -63,7 +63,9 @@ def _sqlalchemy_window(cls, column, _table, **kwargs): source_table=_table, column_name=column.name, ) - if isinstance(sql_engine, sqlalchemy_engine_Engine): + if sqlalchemy_engine_Engine and isinstance( + sql_engine, sqlalchemy_engine_Engine + ): with sql_engine.connect() as connection: with connection.begin(): connection.execute(sa.text(temp_table_stmt)) @@ -91,4 +93,4 @@ def _sqlalchemy_window(cls, column, _table, **kwargs): partial_fn_type=MetricPartialFunctionTypes.WINDOW_CONDITION_FN, ) def _spark(cls, column, **kwargs): - return F.count(F.lit(1)).over(Window.partitionBy(column)) <= 1 + return F.count(F.lit(1)).over(pyspark_sql_Window.partitionBy(column)) <= 1 diff --git a/great_expectations/expectations/metrics/column_map_metrics/column_values_z_score.py b/great_expectations/expectations/metrics/column_map_metrics/column_values_z_score.py index e50c6d49df11..63d85abd4aa1 100644 --- a/great_expectations/expectations/metrics/column_map_metrics/column_values_z_score.py +++ b/great_expectations/expectations/metrics/column_map_metrics/column_values_z_score.py @@ -12,12 +12,13 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.map_metric_provider import ( ColumnMapMetricProvider, column_condition_partial, column_function_partial, ) +from great_expectations.optional_imports import F +from great_expectations.optional_imports 
import sqlalchemy as sa from great_expectations.validator.metric_configuration import MetricConfiguration diff --git a/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_greater.py b/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_greater.py index 276451fedbdc..c53a730bd582 100644 --- a/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_greater.py +++ b/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_greater.py @@ -5,11 +5,12 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.map_metric_provider import ( ColumnPairMapMetricProvider, column_pair_condition_partial, ) +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.warnings import warn_deprecated_parse_strings_as_datetimes diff --git a/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_in_set.py b/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_in_set.py index e7ca03beef9e..67c37590a6af 100644 --- a/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_in_set.py +++ b/great_expectations/expectations/metrics/column_pair_map_metrics/column_pair_values_in_set.py @@ -8,11 +8,12 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.map_metric_provider import ( ColumnPairMapMetricProvider, column_pair_condition_partial, ) +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa class ColumnPairValuesInSet(ColumnPairMapMetricProvider): diff --git a/great_expectations/expectations/metrics/import_manager.py b/great_expectations/expectations/metrics/import_manager.py deleted file mode 100644 index 6ba45648c36e..000000000000 --- a/great_expectations/expectations/metrics/import_manager.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -This file manages common global-level imports for which we want to centralize error handling -""" -import logging - -logger = logging.getLogger(__name__) -sa_import_warning_required = False -spark_import_warning_required = False - -try: - import sqlalchemy as sa # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy module available.") - sa = None - -try: - from sqlalchemy.engine import Engine as sqlalchemy_engine_Engine # noqa: TID251 - from sqlalchemy.engine import Row as sqlalchemy_engine_Row # noqa: TID251 - from sqlalchemy.engine import reflection # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy.engine module available.") - reflection = None - sqlalchemy_engine_Engine = None - sqlalchemy_engine_Row = None - -try: - import sqlalchemy.func.count as sa_func_count # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy.func module available.") - sa_func_count = None - -try: - import sqlalchemy.sql.expression.ColumnClause as sa_sql_expression_ColumnClause # noqa: TID251 - import sqlalchemy.sql.expression.Select as sa_sql_expression_Select # noqa: TID251 - import sqlalchemy.sql.expression.Selectable as sa_sql_expression_Selectable # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy.sql.expression module available.") - sa_sql_expression_ColumnClause = None - 
sa_sql_expression_Select = None - sa_sql_expression_Selectable = None - -try: - from sqlalchemy.sql.elements import quoted_name # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy.sql.elements module available.") - quoted_name = None - -try: - import pyspark.sql.functions as F - import pyspark.sql.types as sparktypes -except ImportError: - logger.debug("No spark functions module available.") - sparktypes = None # type: ignore[assignment] - F = None # type: ignore[assignment] - -try: - from pyspark.ml.feature import Bucketizer -except ImportError: - logger.debug("No spark Bucketizer available.") - Bucketizer = None # type: ignore[assignment,misc] - -try: - from pyspark.sql import Window -except ImportError: - logger.debug("No spark Window function available.") - Window = None # type: ignore[assignment,misc] - -try: - from pyspark.sql import Column as pyspark_sql_Column - from pyspark.sql import DataFrame as pyspark_sql_DataFrame - from pyspark.sql import Row as pyspark_sql_Row - from pyspark.sql import SparkSession as pyspark_sql_SparkSession - from pyspark.sql import SQLContext -except ImportError: - logger.debug("No spark SQLContext available.") - SQLContext = None # type: ignore[assignment,misc] - pyspark_sql_Column = None # type: ignore[assignment,misc] - pyspark_sql_DataFrame = None # type: ignore[assignment,misc] - pyspark_sql_Row = None # type: ignore[assignment,misc] - pyspark_sql_SparkSession = None # type: ignore[assignment,misc] diff --git a/great_expectations/expectations/metrics/map_metric_provider/column_condition_partial.py b/great_expectations/expectations/metrics/map_metric_provider/column_condition_partial.py index b4a67bb9a5ad..a17c526e9973 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/column_condition_partial.py +++ b/great_expectations/expectations/metrics/map_metric_provider/column_condition_partial.py @@ -22,14 +22,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, ) from great_expectations.expectations.metrics.util import ( - Engine, get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import ( + quoted_name, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) @@ -166,7 +171,7 @@ def inner_func( batch_columns_list=metrics["table.columns"], ) - sqlalchemy_engine: Engine = execution_engine.engine + sqlalchemy_engine: sqlalchemy_engine_Engine = execution_engine.engine dialect = execution_engine.dialect_module if dialect is None: diff --git a/great_expectations/expectations/metrics/map_metric_provider/column_function_partial.py b/great_expectations/expectations/metrics/map_metric_provider/column_function_partial.py index 4d15d418fc75..6996eff8361d 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/column_function_partial.py +++ b/great_expectations/expectations/metrics/map_metric_provider/column_function_partial.py @@ -21,13 +21,14 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, ) from great_expectations.expectations.metrics.util import ( get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import quoted_name +from 
great_expectations.optional_imports import sqlalchemy as sa logger = logging.getLogger(__name__) diff --git a/great_expectations/expectations/metrics/map_metric_provider/column_map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/column_map_condition_auxilliary_methods.py index 7bedd8986928..22fe4d0d6bf9 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/column_map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/column_map_condition_auxilliary_methods.py @@ -18,16 +18,15 @@ SqlAlchemyExecutionEngine, ) - # from great_expectations.expectations.metrics.import_manager import quoted_name - from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect -from great_expectations.expectations.metrics.import_manager import F, quoted_name, sa from great_expectations.expectations.metrics.map_metric_provider.is_sqlalchemy_metric_selectable import ( _is_sqlalchemy_metric_selectable, ) from great_expectations.expectations.metrics.util import ( get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import F, quoted_name +from great_expectations.optional_imports import sqlalchemy as sa if TYPE_CHECKING: import pyspark diff --git a/great_expectations/expectations/metrics/map_metric_provider/column_pair_condition_partial.py b/great_expectations/expectations/metrics/map_metric_provider/column_pair_condition_partial.py index cd88bd0db6b2..96f11b70202e 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/column_pair_condition_partial.py +++ b/great_expectations/expectations/metrics/map_metric_provider/column_pair_condition_partial.py @@ -22,14 +22,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, ) from great_expectations.expectations.metrics.util import ( - Engine, get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import ( + quoted_name, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) @@ -177,7 +182,7 @@ def inner_func( batch_columns_list=metrics["table.columns"], ) - sqlalchemy_engine: Engine = execution_engine.engine + sqlalchemy_engine: sqlalchemy_engine_Engine = execution_engine.engine dialect = execution_engine.dialect_module expected_condition = metric_fn( diff --git a/great_expectations/expectations/metrics/map_metric_provider/column_pair_function_partial.py b/great_expectations/expectations/metrics/map_metric_provider/column_pair_function_partial.py index 38b0606b9581..8a95b5c78175 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/column_pair_function_partial.py +++ b/great_expectations/expectations/metrics/map_metric_provider/column_pair_function_partial.py @@ -23,13 +23,14 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, ) from great_expectations.expectations.metrics.util import ( get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import quoted_name +from great_expectations.optional_imports import sqlalchemy as sa logger = logging.getLogger(__name__) diff --git 
a/great_expectations/expectations/metrics/map_metric_provider/column_pair_map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/column_pair_map_condition_auxilliary_methods.py index eb5c6f99873d..1387f7eae518 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/column_pair_map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/column_pair_map_condition_auxilliary_methods.py @@ -9,7 +9,6 @@ Union, ) -from great_expectations.expectations.metrics.import_manager import F, quoted_name from great_expectations.expectations.metrics.map_metric_provider.is_sqlalchemy_metric_selectable import ( _is_sqlalchemy_metric_selectable, ) @@ -17,6 +16,7 @@ get_dbms_compatible_column_names, verify_column_names_exist, ) +from great_expectations.optional_imports import F, quoted_name from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.util import ( get_sqlalchemy_selectable, @@ -29,7 +29,6 @@ SqlAlchemyExecutionEngine, ) - # from great_expectations.expectations.metrics.import_manager import quoted_name logger = logging.getLogger(__name__) diff --git a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py index 1454be2bc006..3bf6fe92cbb1 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/map_condition_auxilliary_methods.py @@ -19,40 +19,41 @@ ) from great_expectations.core.util import convert_to_json_serializable from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect -from great_expectations.execution_engine.sqlalchemy_execution_engine import ( - OperationalError, -) -from great_expectations.expectations.metrics.import_manager import F, quoted_name from great_expectations.expectations.metrics.map_metric_provider.is_sqlalchemy_metric_selectable import ( _is_sqlalchemy_metric_selectable, ) from great_expectations.expectations.metrics.util import ( - Insert, - Label, - Select, compute_unexpected_pandas_indices, get_dbms_compatible_column_names, get_sqlalchemy_source_table_and_schema, sql_statement_with_post_compile_to_string, verify_column_names_exist, ) -from great_expectations.optional_imports import sqlalchemy as sa -from great_expectations.optional_imports import sqlalchemy_Engine +from great_expectations.optional_imports import ( + F, + pyspark, + quoted_name, + sa_sql_expression_Label, + sa_sql_expression_Select, + sa_sql_Insert, + sqlalchemy_engine_Engine, + sqlalchemy_OperationalError, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.util import ( generate_temporary_table_name, get_sqlalchemy_selectable, ) if TYPE_CHECKING: - import pyspark - from great_expectations.execution_engine import ( PandasExecutionEngine, SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) - # from great_expectations.expectations.metrics.import_manager import quoted_name logger = logging.getLogger(__name__) @@ -315,7 +316,7 @@ def _sqlalchemy_map_condition_unexpected_count_value( selectable = execution_engine.get_domain_records(domain_kwargs=domain_kwargs) # The integral values are cast to SQL Numeric in order to avoid a bug in AWS Redshift (converted to integer later). 
- count_case_statement: List[Label] = sa.case( + count_case_statement: List[sa_sql_expression_Label] = sa.case( ( unexpected_condition, sa.sql.expression.cast(1, sa.Numeric), @@ -323,7 +324,7 @@ def _sqlalchemy_map_condition_unexpected_count_value( else_=sa.sql.expression.cast(0, sa.Numeric), ).label("condition") - count_selectable: Select = sa.select(count_case_statement) + count_selectable: sa_sql_expression_Select = sa.select(count_case_statement) if not _is_sqlalchemy_metric_selectable(map_metric_provider=cls): selectable = get_sqlalchemy_selectable(selectable) count_selectable = count_selectable.select_from(selectable) @@ -346,7 +347,7 @@ def _sqlalchemy_map_condition_unexpected_count_value( ) temp_table_obj.create(execution_engine.engine, checkfirst=True) - inner_case_query: Insert = temp_table_obj.insert().from_select( + inner_case_query: sa_sql_Insert = temp_table_obj.insert().from_select( [count_case_statement], count_selectable, ) @@ -355,14 +356,16 @@ def _sqlalchemy_map_condition_unexpected_count_value( count_selectable = temp_table_obj count_selectable = get_sqlalchemy_selectable(count_selectable) - unexpected_count_query: Select = ( + unexpected_count_query: sa_sql_expression_Select = ( sa.select( sa.func.sum(sa.column("condition")).label("unexpected_count"), ) .select_from(count_selectable) .alias("UnexpectedCountSubquery") ) - if sqlalchemy_Engine and isinstance(execution_engine.engine, sqlalchemy_Engine): + if sqlalchemy_engine_Engine and isinstance( + execution_engine.engine, sqlalchemy_engine_Engine + ): connection = execution_engine.engine.connect() else: # execution_engine.engine is already a Connection. Use it directly @@ -381,7 +384,7 @@ def _sqlalchemy_map_condition_unexpected_count_value( except TypeError: unexpected_count = 0 - except OperationalError as oe: + except sqlalchemy_OperationalError as oe: exception_message: str = f"An SQL execution Exception occurred: {str(oe)}." raise gx_exceptions.InvalidMetricAccessorDomainKwargsKeyError( message=exception_message @@ -424,7 +427,7 @@ def _sqlalchemy_map_condition_rows( query = query.limit(result_format["partial_unexpected_count"]) try: return execution_engine.engine.execute(query).fetchall() - except OperationalError as oe: + except sqlalchemy_OperationalError as oe: exception_message: str = f"An SQL execution Exception occurred: {str(oe)}." 
raise gx_exceptions.InvalidMetricAccessorDomainKwargsKeyError( message=exception_message diff --git a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_condition_partial.py b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_condition_partial.py index 0c429fa15686..af0a3d451995 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_condition_partial.py +++ b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_condition_partial.py @@ -22,14 +22,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, ) from great_expectations.expectations.metrics.util import ( - Engine, get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import ( + quoted_name, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) @@ -164,7 +169,7 @@ def inner_func( batch_columns_list=metrics["table.columns"], ) - sqlalchemy_engine: Engine = execution_engine.engine + sqlalchemy_engine: sqlalchemy_engine_Engine = execution_engine.engine column_selector = [ sa.column(column_name) for column_name in column_list diff --git a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_function_partial.py b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_function_partial.py index 358c5c2f19a9..dc859a586835 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_function_partial.py +++ b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_function_partial.py @@ -22,14 +22,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import quoted_name, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, ) from great_expectations.expectations.metrics.util import ( - Engine, get_dbms_compatible_column_names, ) +from great_expectations.optional_imports import ( + quoted_name, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) @@ -155,7 +160,7 @@ def inner_func( batch_columns_list=table_columns, ) - sqlalchemy_engine: Engine = execution_engine.engine + sqlalchemy_engine: sqlalchemy_engine_Engine = execution_engine.engine column_selector = [ sa.column(column_name) for column_name in column_list diff --git a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py index 1e55283e243c..09ecd97f87ec 100644 --- a/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py +++ b/great_expectations/expectations/metrics/map_metric_provider/multicolumn_map_condition_auxilliary_methods.py @@ -9,7 +9,6 @@ Union, ) -from great_expectations.expectations.metrics.import_manager import F, quoted_name from great_expectations.expectations.metrics.map_metric_provider.is_sqlalchemy_metric_selectable import ( _is_sqlalchemy_metric_selectable, ) @@ -17,6 +16,7 @@ get_dbms_compatible_column_names, verify_column_names_exist, ) +from great_expectations.optional_imports import F, quoted_name from 
great_expectations.optional_imports import sqlalchemy as sa from great_expectations.util import ( get_sqlalchemy_selectable, @@ -29,7 +29,6 @@ SqlAlchemyExecutionEngine, ) - # from great_expectations.expectations.metrics.import_manager import quoted_name logger = logging.getLogger(__name__) diff --git a/great_expectations/expectations/metrics/multicolumn_map_metrics/compound_columns_unique.py b/great_expectations/expectations/metrics/multicolumn_map_metrics/compound_columns_unique.py index ea9e30dec0d7..f642b30123d7 100644 --- a/great_expectations/expectations/metrics/multicolumn_map_metrics/compound_columns_unique.py +++ b/great_expectations/expectations/metrics/multicolumn_map_metrics/compound_columns_unique.py @@ -10,7 +10,6 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, Window, sa from great_expectations.expectations.metrics.map_metric_provider import ( MulticolumnMapMetricProvider, ) @@ -20,6 +19,11 @@ from great_expectations.expectations.metrics.map_metric_provider.multicolumn_function_partial import ( multicolumn_function_partial, ) +from great_expectations.optional_imports import ( + F, + pyspark_sql_Window, +) +from great_expectations.optional_imports import sqlalchemy as sa from great_expectations.validator.validation_graph import MetricConfiguration @@ -157,7 +161,10 @@ def _sqlalchemy_condition(cls, column_list, **kwargs): def _spark(cls, column_list, **kwargs): column_names = column_list.columns row_wise_cond = ( - F.count(F.lit(1)).over(Window.partitionBy(F.struct(*column_names))) <= 1 + F.count(F.lit(1)).over( + pyspark_sql_Window.partitionBy(F.struct(*column_names)) + ) + <= 1 ) return row_wise_cond diff --git a/great_expectations/expectations/metrics/multicolumn_map_metrics/multicolumn_sum_equal.py b/great_expectations/expectations/metrics/multicolumn_map_metrics/multicolumn_sum_equal.py index e05de5ac2f68..9f7304bd077a 100644 --- a/great_expectations/expectations/metrics/multicolumn_map_metrics/multicolumn_sum_equal.py +++ b/great_expectations/expectations/metrics/multicolumn_map_metrics/multicolumn_sum_equal.py @@ -3,13 +3,13 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F from great_expectations.expectations.metrics.map_metric_provider import ( MulticolumnMapMetricProvider, ) from great_expectations.expectations.metrics.map_metric_provider.multicolumn_condition_partial import ( multicolumn_condition_partial, ) +from great_expectations.optional_imports import F class MulticolumnSumEqual(MulticolumnMapMetricProvider): diff --git a/great_expectations/expectations/metrics/multicolumn_map_metrics/select_column_values_unique_within_record.py b/great_expectations/expectations/metrics/multicolumn_map_metrics/select_column_values_unique_within_record.py index 03a0aaacf6f0..29c8cbd6229a 100644 --- a/great_expectations/expectations/metrics/multicolumn_map_metrics/select_column_values_unique_within_record.py +++ b/great_expectations/expectations/metrics/multicolumn_map_metrics/select_column_values_unique_within_record.py @@ -6,13 +6,14 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.map_metric_provider import ( MulticolumnMapMetricProvider, ) from great_expectations.expectations.metrics.map_metric_provider.multicolumn_condition_partial import ( multicolumn_condition_partial, ) +from 
great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa logger = logging.getLogger(__name__) diff --git a/great_expectations/expectations/metrics/query_metrics/query_column.py b/great_expectations/expectations/metrics/query_metrics/query_column.py index a04490efed70..15388125a165 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_column.py +++ b/great_expectations/expectations/metrics/query_metrics/query_column.py @@ -5,17 +5,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.expectations.metrics.query_metric_provider import ( + QueryMetricProvider, +) +from great_expectations.optional_imports import ( pyspark_sql_DataFrame, pyspark_sql_Row, pyspark_sql_SparkSession, - sa, sqlalchemy_engine_Engine, sqlalchemy_engine_Row, ) -from great_expectations.expectations.metrics.metric_provider import metric_value -from great_expectations.expectations.metrics.query_metric_provider import ( - QueryMetricProvider, +from great_expectations.optional_imports import ( + sqlalchemy as sa, ) from great_expectations.util import get_sqlalchemy_subquery_type diff --git a/great_expectations/expectations/metrics/query_metrics/query_column_pair.py b/great_expectations/expectations/metrics/query_metrics/query_column_pair.py index 3bf5ff04e588..206b205599c6 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_column_pair.py +++ b/great_expectations/expectations/metrics/query_metrics/query_column_pair.py @@ -5,17 +5,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.expectations.metrics.query_metric_provider import ( + QueryMetricProvider, +) +from great_expectations.optional_imports import ( pyspark_sql_DataFrame, pyspark_sql_Row, pyspark_sql_SparkSession, - sa, sqlalchemy_engine_Engine, sqlalchemy_engine_Row, ) -from great_expectations.expectations.metrics.metric_provider import metric_value -from great_expectations.expectations.metrics.query_metric_provider import ( - QueryMetricProvider, +from great_expectations.optional_imports import ( + sqlalchemy as sa, ) from great_expectations.util import get_sqlalchemy_subquery_type diff --git a/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py b/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py index f65d513f3fe5..224a9d9a756b 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py +++ b/great_expectations/expectations/metrics/query_metrics/query_multiple_columns.py @@ -5,17 +5,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.expectations.metrics.query_metric_provider import ( + QueryMetricProvider, +) +from great_expectations.optional_imports import ( pyspark_sql_DataFrame, pyspark_sql_Row, pyspark_sql_SparkSession, - sa, sqlalchemy_engine_Engine, sqlalchemy_engine_Row, ) -from great_expectations.expectations.metrics.metric_provider import metric_value -from great_expectations.expectations.metrics.query_metric_provider import ( - 
QueryMetricProvider, +from great_expectations.optional_imports import ( + sqlalchemy as sa, ) from great_expectations.util import get_sqlalchemy_subquery_type diff --git a/great_expectations/expectations/metrics/query_metrics/query_table.py b/great_expectations/expectations/metrics/query_metrics/query_table.py index 427667d92f93..5430d5fb1719 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_table.py +++ b/great_expectations/expectations/metrics/query_metrics/query_table.py @@ -5,17 +5,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.expectations.metrics.query_metric_provider import ( + QueryMetricProvider, +) +from great_expectations.optional_imports import ( pyspark_sql_DataFrame, pyspark_sql_Row, pyspark_sql_SparkSession, - sa, sqlalchemy_engine_Engine, sqlalchemy_engine_Row, ) -from great_expectations.expectations.metrics.metric_provider import metric_value -from great_expectations.expectations.metrics.query_metric_provider import ( - QueryMetricProvider, +from great_expectations.optional_imports import ( + sqlalchemy as sa, ) from great_expectations.util import get_sqlalchemy_subquery_type diff --git a/great_expectations/expectations/metrics/query_metrics/query_template_values.py b/great_expectations/expectations/metrics/query_metrics/query_template_values.py index 17f4a2c78327..0dca9daef2dd 100644 --- a/great_expectations/expectations/metrics/query_metrics/query_template_values.py +++ b/great_expectations/expectations/metrics/query_metrics/query_template_values.py @@ -5,17 +5,19 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( +from great_expectations.expectations.metrics.metric_provider import metric_value +from great_expectations.expectations.metrics.query_metric_provider import ( + QueryMetricProvider, +) +from great_expectations.optional_imports import ( pyspark_sql_DataFrame, pyspark_sql_Row, pyspark_sql_SparkSession, - sa, sqlalchemy_engine_Engine, sqlalchemy_engine_Row, ) -from great_expectations.expectations.metrics.metric_provider import metric_value -from great_expectations.expectations.metrics.query_metric_provider import ( - QueryMetricProvider, +from great_expectations.optional_imports import ( + sqlalchemy as sa, ) from great_expectations.util import get_sqlalchemy_subquery_type diff --git a/great_expectations/expectations/metrics/table_metrics/table_column_types.py b/great_expectations/expectations/metrics/table_metrics/table_column_types.py index 47f834accdd1..9613ce9af36a 100644 --- a/great_expectations/expectations/metrics/table_metrics/table_column_types.py +++ b/great_expectations/expectations/metrics/table_metrics/table_column_types.py @@ -10,17 +10,15 @@ from great_expectations.execution_engine.sqlalchemy_batch_data import ( SqlAlchemyBatchData, ) -from great_expectations.expectations.metrics.import_manager import sparktypes from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.expectations.metrics.table_metric_provider import ( TableMetricProvider, ) from great_expectations.expectations.metrics.util import get_sqlalchemy_column_metadata - -try: - from sqlalchemy.sql.elements import TextClause # noqa: TID251 -except ImportError: - TextClause = None +from great_expectations.optional_imports import ( + sparktypes, + 
sqlalchemy_TextClause, +) class ColumnTypes(TableMetricProvider): @@ -94,8 +92,10 @@ def _spark( def _get_sqlalchemy_column_metadata(engine, batch_data: SqlAlchemyBatchData): # if a custom query was passed - if isinstance(batch_data.selectable, TextClause): - table_selectable: TextClause = batch_data.selectable + if sqlalchemy_TextClause and isinstance( + batch_data.selectable, sqlalchemy_TextClause + ): + table_selectable: sqlalchemy_TextClause = batch_data.selectable schema_name = None else: table_selectable: str = ( # type: ignore[no-redef] @@ -115,17 +115,21 @@ def _get_spark_column_metadata(field, parent_name="", include_nested=True): if parent_name != "": parent_name = f"{parent_name}." - if isinstance(field, sparktypes.StructType): + if sparktypes and isinstance(field, sparktypes.StructType): for child in field.fields: cols += _get_spark_column_metadata(child, parent_name=parent_name) - elif isinstance(field, sparktypes.StructField): + elif sparktypes and isinstance(field, sparktypes.StructField): if "." in field.name: name = f"{parent_name}`{field.name}`" else: name = parent_name + field.name field_metadata = {"name": name, "type": field.dataType} cols.append(field_metadata) - if include_nested and isinstance(field.dataType, sparktypes.StructType): + if ( + include_nested + and sparktypes + and isinstance(field.dataType, sparktypes.StructType) + ): for child in field.dataType.fields: cols += _get_spark_column_metadata( child, diff --git a/great_expectations/expectations/metrics/table_metrics/table_head.py b/great_expectations/expectations/metrics/table_metrics/table_head.py index f55e4227f6b4..29137d64556e 100644 --- a/great_expectations/expectations/metrics/table_metrics/table_head.py +++ b/great_expectations/expectations/metrics/table_metrics/table_head.py @@ -27,7 +27,7 @@ from great_expectations.validator.validator import Validator if TYPE_CHECKING: - from great_expectations.expectations.metrics.import_manager import pyspark_sql_Row + from great_expectations.optional_imports import pyspark_sql_Row class TableHead(TableMetricProvider): diff --git a/great_expectations/expectations/metrics/table_metrics/table_row_count.py b/great_expectations/expectations/metrics/table_metrics/table_row_count.py index b92f0337516e..a1a73066ba25 100644 --- a/great_expectations/expectations/metrics/table_metrics/table_row_count.py +++ b/great_expectations/expectations/metrics/table_metrics/table_row_count.py @@ -7,7 +7,6 @@ SparkDFExecutionEngine, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import F, sa from great_expectations.expectations.metrics.metric_provider import ( metric_partial, metric_value, @@ -15,6 +14,8 @@ from great_expectations.expectations.metrics.table_metric_provider import ( TableMetricProvider, ) +from great_expectations.optional_imports import F +from great_expectations.optional_imports import sqlalchemy as sa class TableRowCount(TableMetricProvider): diff --git a/great_expectations/expectations/metrics/util.py b/great_expectations/expectations/metrics/util.py index 8c614afb7549..900cb41d2089 100644 --- a/great_expectations/expectations/metrics/util.py +++ b/great_expectations/expectations/metrics/util.py @@ -20,7 +20,21 @@ ) from great_expectations.execution_engine.sqlalchemy_dialect import GXSqlDialect from great_expectations.execution_engine.util import check_sql_engine_dialect -from great_expectations.optional_imports import sqlalchemy_Engine +from great_expectations.optional_imports import ( + sa_sql_expression_BinaryExpression, 
+ sa_sql_expression_Select, + sa_sql_expression_TableClause, + sqlalchemy_custom_op, + sqlalchemy_dialects_registry, + sqlalchemy_engine_Dialect, + sqlalchemy_engine_Engine, + sqlalchemy_literal, + sqlalchemy_reflection, + sqlalchemy_TextClause, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.util import get_sqlalchemy_inspector try: @@ -34,40 +48,6 @@ except ImportError: snowflake = None -try: - import sqlalchemy as sa # noqa: TID251 - from sqlalchemy.dialects import registry # noqa: TID251 - from sqlalchemy.engine import Connection, Engine, reflection # noqa: TID251 - from sqlalchemy.engine.interfaces import Dialect # noqa: TID251 - from sqlalchemy.exc import OperationalError # noqa: TID251 - from sqlalchemy.sql import Insert, Select, TableClause # noqa: TID251 - from sqlalchemy.sql.elements import ( # noqa: TID251 - BinaryExpression, - ColumnElement, - Label, - TextClause, - literal, - quoted_name, - ) - from sqlalchemy.sql.operators import custom_op # noqa: TID251 -except ImportError: - sa = None - registry = None - Engine = None - Connection = None - reflection = None - Dialect = None - Insert = None - Select = None - BinaryExpression = None - ColumnElement = None - Label = None - TableClause = None - TextClause = None - literal = None - quoted_name = None - custom_op = None - OperationalError = None try: import sqlalchemy_redshift @@ -79,7 +59,9 @@ try: import sqlalchemy_dremio.pyodbc - registry.register("dremio", "sqlalchemy_dremio.pyodbc", "dialect") + sqlalchemy_dialects_registry.register( + "dremio", "sqlalchemy_dremio.pyodbc", "dialect" + ) except ImportError: sqlalchemy_dremio = None @@ -92,7 +74,9 @@ try: import sqlalchemy_bigquery as sqla_bigquery - registry.register("bigquery", _BIGQUERY_MODULE_NAME, "BigQueryDialect") + sqlalchemy_dialects_registry.register( + "bigquery", _BIGQUERY_MODULE_NAME, "BigQueryDialect" + ) bigquery_types_tuple = None except ImportError: try: @@ -107,7 +91,9 @@ _BIGQUERY_MODULE_NAME = "pybigquery.sqlalchemy_bigquery" # Sometimes "pybigquery.sqlalchemy_bigquery" fails to self-register in Azure (our CI/CD pipeline) in certain cases, so we do it explicitly. 
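As context for the `sqlalchemy_dialects_registry.register(...)` calls above: SQLAlchemy's dialect registry maps a URL scheme to the module and attribute that implement the dialect, which is how third-party dialects that do not self-register (Dremio, and BigQuery in some environments) become resolvable by `create_engine`. A minimal sketch of the same mechanism, with an illustrative DSN that is not taken from this patch:

import sqlalchemy as sa
from sqlalchemy.dialects import registry

# After this call, URLs starting with "dremio://" resolve to the
# `dialect` attribute of the sqlalchemy_dremio.pyodbc module.
registry.register("dremio", "sqlalchemy_dremio.pyodbc", "dialect")

# Hypothetical connection string, purely for illustration; it only works
# if the sqlalchemy_dremio package is actually installed.
# engine = sa.create_engine("dremio://user:password@localhost:31010/dremio")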
# (see https://stackoverflow.com/questions/53284762/nosuchmoduleerror-cant-load-plugin-sqlalchemy-dialectssnowflake) - registry.register("bigquery", _BIGQUERY_MODULE_NAME, "dialect") + sqlalchemy_dialects_registry.register( + "bigquery", _BIGQUERY_MODULE_NAME, "dialect" + ) try: getattr(sqla_bigquery, "INTEGER") bigquery_types_tuple = None @@ -144,9 +130,13 @@ def get_dialect_regex_expression( # noqa: C901 - 36 # postgres if issubclass(dialect.dialect, sa.dialects.postgresql.dialect): if positive: - return BinaryExpression(column, literal(regex), custom_op("~")) + return sa_sql_expression_BinaryExpression( + column, sqlalchemy_literal(regex), sqlalchemy_custom_op("~") + ) else: - return BinaryExpression(column, literal(regex), custom_op("!~")) + return sa_sql_expression_BinaryExpression( + column, sqlalchemy_literal(regex), sqlalchemy_custom_op("!~") + ) except AttributeError: pass @@ -157,9 +147,13 @@ def get_dialect_regex_expression( # noqa: C901 - 36 dialect.dialect, sqlalchemy_redshift.dialect.RedshiftDialect ): if positive: - return BinaryExpression(column, literal(regex), custom_op("~")) + return sa_sql_expression_BinaryExpression( + column, sqlalchemy_literal(regex), sqlalchemy_custom_op("~") + ) else: - return BinaryExpression(column, literal(regex), custom_op("!~")) + return sa_sql_expression_BinaryExpression( + column, sqlalchemy_literal(regex), sqlalchemy_custom_op("!~") + ) except ( AttributeError, TypeError, @@ -170,9 +164,15 @@ def get_dialect_regex_expression( # noqa: C901 - 36 # MySQL if issubclass(dialect.dialect, sa.dialects.mysql.dialect): if positive: - return BinaryExpression(column, literal(regex), custom_op("REGEXP")) + return sa_sql_expression_BinaryExpression( + column, sqlalchemy_literal(regex), sqlalchemy_custom_op("REGEXP") + ) else: - return BinaryExpression(column, literal(regex), custom_op("NOT REGEXP")) + return sa_sql_expression_BinaryExpression( + column, + sqlalchemy_literal(regex), + sqlalchemy_custom_op("NOT REGEXP"), + ) except AttributeError: pass @@ -201,9 +201,11 @@ def get_dialect_regex_expression( # noqa: C901 - 36 # Bigquery if hasattr(dialect, "BigQueryDialect"): if positive: - return sa.func.REGEXP_CONTAINS(column, literal(regex)) + return sa.func.REGEXP_CONTAINS(column, sqlalchemy_literal(regex)) else: - return sa.not_(sa.func.REGEXP_CONTAINS(column, literal(regex))) + return sa.not_( + sa.func.REGEXP_CONTAINS(column, sqlalchemy_literal(regex)) + ) except ( AttributeError, TypeError, @@ -221,9 +223,9 @@ def get_dialect_regex_expression( # noqa: C901 - 36 dialect, trino.sqlalchemy.dialect.TrinoDialect ): if positive: - return sa.func.regexp_like(column, literal(regex)) + return sa.func.regexp_like(column, sqlalchemy_literal(regex)) else: - return sa.not_(sa.func.regexp_like(column, literal(regex))) + return sa.not_(sa.func.regexp_like(column, sqlalchemy_literal(regex))) except ( AttributeError, TypeError, @@ -234,9 +236,11 @@ def get_dialect_regex_expression( # noqa: C901 - 36 # Dremio if hasattr(dialect, "DremioDialect"): if positive: - return sa.func.REGEXP_MATCHES(column, literal(regex)) + return sa.func.REGEXP_MATCHES(column, sqlalchemy_literal(regex)) else: - return sa.not_(sa.func.REGEXP_MATCHES(column, literal(regex))) + return sa.not_( + sa.func.REGEXP_MATCHES(column, sqlalchemy_literal(regex)) + ) except ( AttributeError, TypeError, @@ -247,9 +251,19 @@ def get_dialect_regex_expression( # noqa: C901 - 36 # Teradata if issubclass(dialect.dialect, teradatasqlalchemy.dialect.TeradataDialect): if positive: - return 
sa.func.REGEXP_SIMILAR(column, literal(regex), literal("i")) == 1 + return ( + sa.func.REGEXP_SIMILAR( + column, sqlalchemy_literal(regex), sqlalchemy_literal("i") + ) + == 1 + ) else: - return sa.func.REGEXP_SIMILAR(column, literal(regex), literal("i")) == 0 + return ( + sa.func.REGEXP_SIMILAR( + column, sqlalchemy_literal(regex), sqlalchemy_literal("i") + ) + == 0 + ) except (AttributeError, TypeError): pass @@ -260,9 +274,9 @@ def get_dialect_regex_expression( # noqa: C901 - 36 sa.__version__ ) >= version.parse("1.4"): if positive: - return column.regexp_match(literal(regex)) + return column.regexp_match(sqlalchemy_literal(regex)) else: - return sa.not_(column.regexp_match(literal(regex))) + return sa.not_(column.regexp_match(sqlalchemy_literal(regex))) else: logger.debug( "regex_match is only enabled for sqlite when SQLAlchemy version is >= 1.4", @@ -339,8 +353,8 @@ def attempt_allowing_relative_error(dialect): def is_column_present_in_table( - engine: Engine, - table_selectable: Select, + engine: sqlalchemy_engine_Engine, + table_selectable: sa_sql_expression_Select, column_name: str, schema_name: Optional[str] = None, ) -> bool: @@ -356,15 +370,19 @@ def is_column_present_in_table( def get_sqlalchemy_column_metadata( - engine: Engine, table_selectable: Select, schema_name: Optional[str] = None + engine: sqlalchemy_engine_Engine, + table_selectable: sa_sql_expression_Select, + schema_name: Optional[str] = None, ) -> Optional[List[Dict[str, Any]]]: try: columns: List[Dict[str, Any]] - inspector: reflection.Inspector = get_sqlalchemy_inspector(engine) + inspector: sqlalchemy_reflection.Inspector = get_sqlalchemy_inspector(engine) try: # if a custom query was passed - if isinstance(table_selectable, TextClause): + if sqlalchemy_TextClause and isinstance( + table_selectable, sqlalchemy_TextClause + ): if hasattr(table_selectable, "selected_columns"): columns = table_selectable.selected_columns.columns else: @@ -403,11 +421,13 @@ def get_sqlalchemy_column_metadata( def column_reflection_fallback( - selectable: Select, dialect: Dialect, sqlalchemy_engine: Engine + selectable: sa_sql_expression_Select, + dialect: sqlalchemy_engine_Dialect, + sqlalchemy_engine: sqlalchemy_engine_Engine, ) -> List[Dict[str, str]]: """If we can't reflect the table, use a query to at least get column names.""" - if isinstance(sqlalchemy_engine.engine, sqlalchemy_Engine): + if isinstance(sqlalchemy_engine.engine, sqlalchemy_engine_Engine): connection = sqlalchemy_engine.engine.connect() else: connection = sqlalchemy_engine.engine @@ -420,25 +440,25 @@ def column_reflection_fallback( if dialect.name.lower() == "mssql": # Get column names and types from the database # Reference: https://dataedo.com/kb/query/sql-server/list-table-columns-in-database - tables_table_clause: TableClause = sa.table( + tables_table_clause: sa_sql_expression_TableClause = sa.table( "tables", sa.column("object_id"), sa.column("schema_id"), sa.column("name"), schema="sys", ).alias("sys_tables_table_clause") - tables_table_query: Select = ( + tables_table_query: sa_sql_expression_Select = ( sa.select( - tables_table_clause.c.object_id.label("object_id"), - sa.func.schema_name(tables_table_clause.c.schema_id).label( + tables_table_clause.columns.object_id.label("object_id"), + sa.func.schema_name(tables_table_clause.columns.schema_id).label( "schema_name" ), - tables_table_clause.c.name.label("table_name"), + tables_table_clause.columns.name.label("table_name"), ) .select_from(tables_table_clause) .alias("sys_tables_table_subquery") ) - 
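The dialect branches above hand-build regex comparisons because core SQLAlchemy has no portable regex operator. A hedged sketch of the underlying technique, `custom_op` combined with `BinaryExpression` and a bound literal, using a made-up table; it mirrors, but is not copied from, the PostgreSQL branch:

import sqlalchemy as sa
from sqlalchemy.sql.expression import BinaryExpression
from sqlalchemy.sql.operators import custom_op

users = sa.table("users", sa.column("name"))

# custom_op("~") emits the raw "~" operator, i.e. PostgreSQL's regex match.
regex_match = BinaryExpression(users.c.name, sa.literal("^foo.*"), custom_op("~"))

stmt = sa.select(users.c.name).where(regex_match)
print(stmt)  # SELECT users.name FROM users WHERE users.name ~ :param_1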
columns_table_clause: TableClause = sa.table( + columns_table_clause: sa_sql_expression_TableClause = sa.table( "columns", sa.column("object_id"), sa.column("user_type_id"), @@ -448,36 +468,36 @@ def column_reflection_fallback( sa.column("precision"), schema="sys", ).alias("sys_columns_table_clause") - columns_table_query: Select = ( + columns_table_query: sa_sql_expression_Select = ( sa.select( - columns_table_clause.c.object_id.label("object_id"), - columns_table_clause.c.user_type_id.label("user_type_id"), - columns_table_clause.c.column_id.label("column_id"), - columns_table_clause.c.name.label("column_name"), - columns_table_clause.c.max_length.label("column_max_length"), - columns_table_clause.c.precision.label("column_precision"), + columns_table_clause.columns.object_id.label("object_id"), + columns_table_clause.columns.user_type_id.label("user_type_id"), + columns_table_clause.columns.column_id.label("column_id"), + columns_table_clause.columns.name.label("column_name"), + columns_table_clause.columns.max_length.label("column_max_length"), + columns_table_clause.columns.precision.label("column_precision"), ) .select_from(columns_table_clause) .alias("sys_columns_table_subquery") ) - types_table_clause: TableClause = sa.table( + types_table_clause: sa_sql_expression_TableClause = sa.table( "types", sa.column("user_type_id"), sa.column("name"), schema="sys", ).alias("sys_types_table_clause") - types_table_query: Select = ( + types_table_query: sa_sql_expression_Select = ( sa.select( - types_table_clause.c.user_type_id.label("user_type_id"), - types_table_clause.c.name.label("column_data_type"), + types_table_clause.columns.user_type_id.label("user_type_id"), + types_table_clause.columns.name.label("column_data_type"), ) .select_from(types_table_clause) .alias("sys_types_table_subquery") ) - inner_join_conditions: BinaryExpression = sa.and_( + inner_join_conditions: sa_sql_expression_BinaryExpression = sa.and_( *(tables_table_query.c.object_id == columns_table_query.c.object_id,) ) - outer_join_conditions: BinaryExpression = sa.and_( + outer_join_conditions: sa_sql_expression_BinaryExpression = sa.and_( *( columns_table_query.columns.user_type_id == types_table_query.columns.user_type_id, @@ -608,8 +628,8 @@ def column_reflection_fallback( ] else: # if a custom query was passed - if isinstance(selectable, TextClause): - query: TextClause = selectable + if isinstance(selectable, sqlalchemy_TextClause): + query: sqlalchemy_TextClause = selectable else: # noinspection PyUnresolvedReferences if dialect.name.lower() == GXSqlDialect.REDSHIFT: @@ -838,9 +858,9 @@ def get_dialect_like_pattern_expression( # noqa: C901 - 28 if dialect_supported: try: if positive: - return column.like(literal(like_pattern)) + return column.like(sqlalchemy_literal(like_pattern)) else: - return sa.not_(column.like(literal(like_pattern))) + return sa.not_(column.like(sqlalchemy_literal(like_pattern))) except AttributeError: pass diff --git a/great_expectations/optional_imports.py b/great_expectations/optional_imports.py index 0bca9f436acb..31a86ec20e49 100644 --- a/great_expectations/optional_imports.py +++ b/great_expectations/optional_imports.py @@ -95,37 +95,369 @@ def is_version_less_than( SQLALCHEMY_NOT_IMPORTED = NotImported( "sqlalchemy is not installed, please 'pip install sqlalchemy'" ) + try: import sqlalchemy sqlalchemy_version_check(sqlalchemy.__version__) -except ImportError: +except (ImportError, AttributeError): sqlalchemy = SQLALCHEMY_NOT_IMPORTED try: - sqlalchemy_Connection = 
sqlalchemy.engine.Connection + from sqlalchemy.dialects import registry as sqlalchemy_dialects_registry +except (ImportError, AttributeError): + sqlalchemy_dialects_registry = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.engine import ( + Dialect as sqlalchemy_engine_Dialect, + ) +except (ImportError, AttributeError): + sqlalchemy_engine_Dialect = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.engine import ( + Inspector as sqlalchemy_engine_Inspector, + ) +except (ImportError, AttributeError): + sqlalchemy_engine_Inspector = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.engine import ( + reflection as sqlalchemy_reflection, + ) +except (ImportError, AttributeError): + sqlalchemy_reflection = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.engine import ( + Connection as sqlalchemy_engine_Connection, + ) +except (ImportError, AttributeError): + sqlalchemy_engine_Connection = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.engine import ( + Engine as sqlalchemy_engine_Engine, + ) +except (ImportError, AttributeError): + sqlalchemy_engine_Engine = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.engine import ( + Row as sqlalchemy_engine_Row, + ) except (ImportError, AttributeError): - sqlalchemy_Connection = SQLALCHEMY_NOT_IMPORTED + sqlalchemy_engine_Row = SQLALCHEMY_NOT_IMPORTED try: - sqlalchemy_Engine = sqlalchemy.engine.Engine + from sqlalchemy.engine.default import ( + DefaultDialect as sqlalchemy_engine_DefaultDialect, + ) except (ImportError, AttributeError): - sqlalchemy_Engine = SQLALCHEMY_NOT_IMPORTED + sqlalchemy_engine_DefaultDialect = SQLALCHEMY_NOT_IMPORTED try: - sqlalchemy_Row = sqlalchemy.engine.Row + from sqlalchemy.exc import ( + DatabaseError as sqlalchemy_DatabaseError, + ) except (ImportError, AttributeError): - sqlalchemy_Row = SQLALCHEMY_NOT_IMPORTED + sqlalchemy_DatabaseError = SQLALCHEMY_NOT_IMPORTED try: - sqlalchemy_TextClause = sqlalchemy.sql.elements.TextClause + from sqlalchemy.exc import ( + IntegrityError as sqlalchemy_IntegrityError, + ) +except (ImportError, AttributeError): + sqlalchemy_IntegrityError = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.exc import ( + NoSuchTableError as sqlalchemy_NoSuchTableError, + ) +except (ImportError, AttributeError): + sqlalchemy_NoSuchTableError = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.exc import ( + OperationalError as sqlalchemy_OperationalError, + ) +except (ImportError, AttributeError): + sqlalchemy_OperationalError = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.exc import ( + ProgrammingError as sqlalchemy_ProgrammingError, + ) +except (ImportError, AttributeError): + sqlalchemy_ProgrammingError = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.exc import ( + SQLAlchemyError, + ) +except (ImportError, AttributeError): + SQLAlchemyError = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql import Insert as sa_sql_Insert +except (ImportError, AttributeError): + sa_sql_Insert = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.elements import ( + literal as sqlalchemy_literal, + ) +except (ImportError, AttributeError): + sqlalchemy_literal = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.elements import ( + TextClause as sqlalchemy_TextClause, + ) except (ImportError, AttributeError): sqlalchemy_TextClause = SQLALCHEMY_NOT_IMPORTED +try: + from sqlalchemy.sql.elements import ( + quoted_name, + ) +except (ImportError, AttributeError): + quoted_name = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + CTE as sa_sql_expression_CTE, # noqa 
N812 + ) +except (ImportError, AttributeError): + sa_sql_expression_CTE = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + BinaryExpression as sa_sql_expression_BinaryExpression, + ) +except (ImportError, AttributeError): + sa_sql_expression_BinaryExpression = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + BooleanClauseList as sa_sql_expression_BooleanClauseList, + ) +except (ImportError, AttributeError): + sa_sql_expression_BooleanClauseList = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + ColumnClause as sa_sql_expression_ColumnClause, + ) +except (ImportError, AttributeError): + sa_sql_expression_ColumnClause = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + Label as sa_sql_expression_Label, + ) +except (ImportError, AttributeError): + sa_sql_expression_Label = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + Select as sa_sql_expression_Select, + ) +except (ImportError, AttributeError): + sa_sql_expression_Select = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + Selectable as sa_sql_expression_Selectable, + ) +except (ImportError, AttributeError): + sa_sql_expression_Selectable = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + TableClause as sa_sql_expression_TableClause, + ) +except (ImportError, AttributeError): + sa_sql_expression_TableClause = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + TextualSelect as sa_sql_expression_TextualSelect, + ) +except (ImportError, AttributeError): + sa_sql_expression_TextualSelect = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.expression import ( + WithinGroup as sa_sql_expression_WithinGroup, + ) +except (ImportError, AttributeError): + sa_sql_expression_WithinGroup = SQLALCHEMY_NOT_IMPORTED + +try: + from sqlalchemy.sql.operators import custom_op as sqlalchemy_custom_op +except (ImportError, AttributeError): + sqlalchemy_custom_op = SQLALCHEMY_NOT_IMPORTED + + SPARK_NOT_IMPORTED = NotImported( "pyspark is not installed, please 'pip install pyspark'" ) + try: - import pyspark -except ImportError: + import pyspark as pyspark +except (ImportError, AttributeError): pyspark = SPARK_NOT_IMPORTED # type: ignore[assignment] + +try: + import pyspark.sql.functions as F # noqa N801 +except (ImportError, AttributeError): + F = SPARK_NOT_IMPORTED # type: ignore[assignment] + +try: + import pyspark.sql.types as sparktypes +except (ImportError, AttributeError): + sparktypes = SPARK_NOT_IMPORTED # type: ignore[assignment] + +try: + from pyspark import SparkContext as pyspark_SparkContext +except (ImportError, AttributeError): + pyspark_SparkContext = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.ml.feature import Bucketizer as pyspark_ml_Bucketizer +except (ImportError, AttributeError): + pyspark_ml_Bucketizer = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql import ( + Column as pyspark_sql_Column, + ) +except (ImportError, AttributeError): + pyspark_sql_Column = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql import ( + DataFrame as pyspark_sql_DataFrame, + ) +except (ImportError, AttributeError): + pyspark_sql_DataFrame = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql import ( + Row as pyspark_sql_Row, + ) +except (ImportError, AttributeError): + pyspark_sql_Row = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] 
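The centralized pattern above, where every optional name falls back to a NotImported-style sentinel, only works downstream because the sentinel is falsy; that is what makes guards like `sparktypes and isinstance(field, sparktypes.StructType)` and skip conditions like `not google_cloud_storage` (both used elsewhere in this patch) safe when a dependency is missing. A rough, self-contained sketch of that idea, assuming a falsy placeholder class; the names below are illustrative, not the project's exact implementation:

class _MissingDependency:
    """Falsy stand-in for a module that failed to import."""

    def __init__(self, message: str) -> None:
        self._message = message

    def __bool__(self) -> bool:
        return False  # lets `if module and ...` short-circuit safely

    def __getattr__(self, name: str):
        raise ModuleNotFoundError(self._message)


try:
    import pyspark.sql.types as sparktypes
except ImportError:
    sparktypes = _MissingDependency("pyspark is not installed")


def is_struct(field) -> bool:
    # Never touches sparktypes attributes unless the import succeeded.
    return bool(sparktypes and isinstance(field, sparktypes.StructType))

The same truthiness is why the test skips later in this patch change from `storage is None` to `not google_cloud_storage`: the fallback value is no longer None, but it still evaluates false.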
+ +try: + from pyspark.sql import ( + SparkSession as pyspark_sql_SparkSession, + ) +except (ImportError, AttributeError): + pyspark_sql_SparkSession = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql import ( + SQLContext as pyspark_SQLContext, + ) +except (ImportError, AttributeError): + pyspark_SQLContext = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql import ( + Window as pyspark_sql_Window, + ) +except (ImportError, AttributeError): + pyspark_sql_Window = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql.readwriter import DataFrameReader as pyspark_DataFrameReader +except (ImportError, AttributeError): + pyspark_DataFrameReader = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from pyspark.sql.utils import ( + AnalysisException as pyspark_sql_utils_AnalysisException, + ) +except (ImportError, AttributeError): + pyspark_sql_utils_AnalysisException = SPARK_NOT_IMPORTED # type: ignore[assignment,misc] + + +GOOGLE_CLOUD_STORAGE_NOT_IMPORTED = NotImported( + "google cloud storage components are not installed, please 'pip install google-cloud-storage google-cloud-secret-manager'" +) + +try: + from google.api_core.exceptions import GoogleAPIError # noqa N801 +except (ImportError, AttributeError): + GoogleAPIError = GOOGLE_CLOUD_STORAGE_NOT_IMPORTED # type: ignore[assignment,misc] + +try: + from google.auth.exceptions import DefaultCredentialsError # noqa N801 +except (ImportError, AttributeError): + DefaultCredentialsError = GOOGLE_CLOUD_STORAGE_NOT_IMPORTED + +try: + from google.cloud import storage as google_cloud_storage +except (ImportError, AttributeError): + google_cloud_storage = GOOGLE_CLOUD_STORAGE_NOT_IMPORTED + +try: + from google.cloud.storage import Client as GoogleCloudStorageClient +except (ImportError, AttributeError): + GoogleCloudStorageClient = GOOGLE_CLOUD_STORAGE_NOT_IMPORTED + +try: + from google.oauth2 import service_account as google_service_account +except (ImportError, AttributeError): + google_service_account = GOOGLE_CLOUD_STORAGE_NOT_IMPORTED + +try: + from google.oauth2.service_account import ( + Credentials as GoogleServiceAccountCredentials, + ) +except (ImportError, AttributeError): + GoogleServiceAccountCredentials = GOOGLE_CLOUD_STORAGE_NOT_IMPORTED + + +AZURE_BLOB_STORAGE_NOT_IMPORTED = NotImported( + "azure blob storage components are not installed, please 'pip install azure-storage-blob azure-identity azure-keyvault-secrets'" +) + +try: + from azure import storage as azure_storage +except (ImportError, AttributeError): + azure_storage = AZURE_BLOB_STORAGE_NOT_IMPORTED + +try: + from azure.storage.blob import ( + BlobPrefix, + ) +except (ImportError, AttributeError): + BlobPrefix = AZURE_BLOB_STORAGE_NOT_IMPORTED + +try: + from azure.storage.blob import ( + BlobServiceClient, + ) +except (ImportError, AttributeError): + BlobServiceClient = AZURE_BLOB_STORAGE_NOT_IMPORTED + +try: + from azure.storage.blob import ( + ContainerClient, + ) +except (ImportError, AttributeError): + ContainerClient = AZURE_BLOB_STORAGE_NOT_IMPORTED + + +PYARROW_NOT_IMPORTED = NotImported( + "pyarrow is not installed, please 'pip install pyarrow'" +) + +try: + import pyarrow as pyarrow +except (ImportError, AttributeError): + pyarrow = PYARROW_NOT_IMPORTED diff --git a/great_expectations/profile/user_configurable_profiler.py b/great_expectations/profile/user_configurable_profiler.py index 8e703b9b4ec4..fcafe53c702b 100644 --- 
a/great_expectations/profile/user_configurable_profiler.py +++ b/great_expectations/profile/user_configurable_profiler.py @@ -123,7 +123,7 @@ def __init__( if isinstance(self.profile_dataset, Batch): context = self.profile_dataset.data_context self.profile_dataset = Validator( - execution_engine=self.profile_dataset.data.execution_engine, # type: ignore[arg-type] + execution_engine=self.profile_dataset.data.execution_engine, batches=[self.profile_dataset], ) self.all_table_columns = self.profile_dataset.get_metric( diff --git a/great_expectations/rule_based_profiler/attributed_resolved_metrics.py b/great_expectations/rule_based_profiler/attributed_resolved_metrics.py index e76831cc3efe..b3cea02908a7 100644 --- a/great_expectations/rule_based_profiler/attributed_resolved_metrics.py +++ b/great_expectations/rule_based_profiler/attributed_resolved_metrics.py @@ -6,6 +6,7 @@ import pandas as pd from great_expectations.core.util import convert_to_json_serializable +from great_expectations.optional_imports import pyspark_sql_Row, sqlalchemy_engine_Row from great_expectations.rule_based_profiler.metric_computation_result import ( MetricValues, # noqa: TCH001 ) @@ -16,24 +17,6 @@ logger = logging.getLogger(__name__) -try: - import sqlalchemy as sa # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy module available.") - sa = None - -try: - from sqlalchemy.engine import Row as sqlalchemy_engine_Row # noqa: TID251 -except ImportError: - logger.debug("No SqlAlchemy.engine module available.") - sqlalchemy_engine_Row = None - -try: - from pyspark.sql import Row as pyspark_sql_Row -except ImportError: - logger.debug("No spark SQLContext available.") - pyspark_sql_Row = None # type: ignore[assignment,misc] - def _condition_metric_values(metric_values: MetricValues) -> MetricValues: def _detect_illegal_array_type_or_shape(values: MetricValues) -> bool: @@ -44,8 +27,8 @@ def _detect_illegal_array_type_or_shape(values: MetricValues) -> bool: properties=( pd.DataFrame, pd.Series, - sqlalchemy_engine_Row, - pyspark_sql_Row, + sqlalchemy_engine_Row if sqlalchemy_engine_Row else None, + pyspark_sql_Row if pyspark_sql_Row else None, # type: ignore[truthy-function] set, ) ), diff --git a/great_expectations/self_check/util.py b/great_expectations/self_check/util.py index 2600bf92bd1c..216065988874 100644 --- a/great_expectations/self_check/util.py +++ b/great_expectations/self_check/util.py @@ -67,6 +67,17 @@ from great_expectations.execution_engine.sqlalchemy_batch_data import ( SqlAlchemyBatchData, ) +from great_expectations.optional_imports import ( + SQLALCHEMY_NOT_IMPORTED, + SQLAlchemyError, + pyspark_sql_DataFrame, + pyspark_sql_SparkSession, + sparktypes, + sqlalchemy_engine_Engine, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) from great_expectations.profile import ColumnsExistProfiler from great_expectations.self_check.sqlalchemy_connection_manager import ( LockingConnectionCheck, @@ -100,32 +111,7 @@ logger = logging.getLogger(__name__) try: - import sqlalchemy as sqlalchemy # noqa: TID251 - from sqlalchemy import create_engine # noqa: TID251 - from sqlalchemy.engine import Engine # noqa: TID251 - from sqlalchemy.exc import SQLAlchemyError # noqa: TID251 -except ImportError: - sqlalchemy = None - create_engine = None - Engine = None - SQLAlchemyError = None - logger.debug("Unable to load SqlAlchemy or one of its subclasses.") - -try: - from pyspark.sql import DataFrame as SparkDataFrame - from pyspark.sql import SparkSession - from pyspark.sql.types import 
StructType -except ImportError: - SparkDataFrame = type(None) # type: ignore[assignment,misc] - SparkSession = None # type: ignore[assignment,misc] - StructType = None # type: ignore[assignment,misc] - -try: - from pyspark.sql import DataFrame as spark_DataFrame -except ImportError: - spark_DataFrame = type(None) # type: ignore[assignment,misc] - -try: + from great_expectations.optional_imports import sqlalchemy # isort:skip import sqlalchemy.dialects.sqlite as sqlitetypes # noqa: TID251 # noinspection PyPep8Naming @@ -143,8 +129,9 @@ "TIMESTAMP": sqlitetypes.TIMESTAMP, } except (ImportError, KeyError): - sqlitetypes = None - sqliteDialect = None + sqlalchemy = SQLALCHEMY_NOT_IMPORTED + sqlitetypes = SQLALCHEMY_NOT_IMPORTED + sqliteDialect = SQLALCHEMY_NOT_IMPORTED SQLITE_TYPES = {} _BIGQUERY_MODULE_NAME = "sqlalchemy_bigquery" @@ -793,7 +780,7 @@ def _get_test_validator_with_data_sqlalchemy( context: AbstractDataContext | None, pk_column: bool, ) -> Validator | None: - if not create_engine: + if not sa: return None if table_name is None: @@ -1026,7 +1013,7 @@ def build_sa_validator_with_data( # noqa: C901 - 39 db_hostname = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost") if sa_engine_name == "sqlite": connection_string = get_sqlite_connection_url(sqlite_db_path) - engine = create_engine(connection_string) + engine = sa.create_engine(connection_string) elif sa_engine_name == "postgresql": connection_string = f"postgresql://postgres@{db_hostname}/test_ci" engine = connection_manager.get_connection(connection_string) @@ -1035,25 +1022,25 @@ def build_sa_validator_with_data( # noqa: C901 - 39 engine = connection_manager.get_connection(connection_string) elif sa_engine_name == "mssql": connection_string = f"mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@{db_hostname}:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true" - engine = create_engine( + engine = sa.create_engine( connection_string, # echo=True, ) elif sa_engine_name == "bigquery": connection_string = _get_bigquery_connection_string() - engine = create_engine(connection_string) + engine = sa.create_engine(connection_string) elif sa_engine_name == "trino": connection_string = _get_trino_connection_string() - engine = create_engine(connection_string) + engine = sa.create_engine(connection_string) elif sa_engine_name == "redshift": connection_string = _get_redshift_connection_string() - engine = create_engine(connection_string) + engine = sa.create_engine(connection_string) elif sa_engine_name == "athena": connection_string = _get_athena_connection_string() - engine = create_engine(connection_string) + engine = sa.create_engine(connection_string) elif sa_engine_name == "snowflake": connection_string = _get_snowflake_connection_string() - engine = create_engine(connection_string) + engine = sa.create_engine(connection_string) else: connection_string = None engine = None @@ -1225,8 +1212,8 @@ def locale_wrapper(*args, **kwargs) -> None: def build_spark_validator_with_data( - df: Union[pd.DataFrame, SparkDataFrame], - spark: SparkSession, + df: Union[pd.DataFrame, pyspark_sql_DataFrame], + spark: pyspark_sql_SparkSession, batch_definition: Optional[BatchDefinition] = None, context: Optional[AbstractDataContext] = None, ) -> Validator: @@ -1281,7 +1268,9 @@ def build_sa_engine( table_name: str = "test" # noinspection PyUnresolvedReferences - sqlalchemy_engine: Engine = sa.create_engine("sqlite://", echo=False) + sqlalchemy_engine: sqlalchemy_engine_Engine = sa.create_engine( + "sqlite://", echo=False + ) 
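Since the test helpers above now reach `create_engine` through the shared `sa` import, here is a minimal, self-contained sketch (not taken from the project) of building and exercising such an engine in a way that works on both SQLAlchemy 1.4 and 2.0:

import sqlalchemy as sa

engine = sa.create_engine("sqlite://", echo=False)  # in-memory SQLite database

# Connection.execute(text(...)) is valid on 1.4 and 2.0 alike, unlike the
# legacy Engine.execute() call path that 2.0 removes.
with engine.connect() as connection:
    assert connection.execute(sa.text("SELECT 1")).scalar() == 1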
add_dataframe_to_db( df=df, name=table_name, @@ -1312,9 +1301,9 @@ def build_sa_engine( # Builds a Spark Execution Engine def build_spark_engine( - spark: SparkSession, - df: Union[pd.DataFrame, SparkDataFrame], - schema: Optional[StructType] = None, + spark: pyspark_sql_SparkSession, + df: Union[pd.DataFrame, pyspark_sql_DataFrame], + schema: Optional[sparktypes.StructType] = None, batch_id: Optional[str] = None, batch_definition: Optional[BatchDefinition] = None, ) -> SparkDFExecutionEngine: @@ -1663,7 +1652,7 @@ def build_test_backends_list( # noqa: C901 - 48 if include_mysql: try: - engine = create_engine(f"mysql+pymysql://root@{db_hostname}/test_ci") + engine = sa.create_engine(f"mysql+pymysql://root@{db_hostname}/test_ci") conn = engine.connect() conn.close() except (ImportError, SQLAlchemyError): @@ -1683,7 +1672,7 @@ def build_test_backends_list( # noqa: C901 - 48 if include_mssql: # noinspection PyUnresolvedReferences try: - engine = create_engine( + engine = sa.create_engine( f"mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@{db_hostname}:1433/test_ci?" "driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true", # echo=True, @@ -2822,8 +2811,8 @@ def _check_if_valid_dataset_name(dataset_name: str) -> str: return dataset_name -def _create_bigquery_engine() -> Engine: - return create_engine(_get_bigquery_connection_string()) +def _create_bigquery_engine() -> sqlalchemy_engine_Engine: + return sa.create_engine(_get_bigquery_connection_string()) def _get_bigquery_connection_string() -> str: @@ -2847,8 +2836,8 @@ def _bigquery_dataset() -> str: def _create_trino_engine( hostname: str = "localhost", schema_name: str = "schema" -) -> Engine: - engine = create_engine( +) -> sqlalchemy_engine_Engine: + engine = sa.create_engine( _get_trino_connection_string(hostname=hostname, schema_name=schema_name) ) from sqlalchemy import text # noqa: TID251 @@ -2900,8 +2889,8 @@ def _get_trino_connection_string( return f"trino://test@{hostname}:8088/memory/{schema_name}" -def _create_redshift_engine() -> Engine: - return create_engine(_get_redshift_connection_string()) +def _create_redshift_engine() -> sqlalchemy_engine_Engine: + return sa.create_engine(_get_redshift_connection_string()) def _get_redshift_connection_string() -> str: @@ -2945,8 +2934,12 @@ def _get_redshift_connection_string() -> str: return url -def _create_athena_engine(db_name_env_var: str = "ATHENA_DB_NAME") -> Engine: - return create_engine(_get_athena_connection_string(db_name_env_var=db_name_env_var)) +def _create_athena_engine( + db_name_env_var: str = "ATHENA_DB_NAME", +) -> sqlalchemy_engine_Engine: + return sa.create_engine( + _get_athena_connection_string(db_name_env_var=db_name_env_var) + ) def _get_athena_connection_string(db_name_env_var: str = "ATHENA_DB_NAME") -> str: @@ -2972,8 +2965,8 @@ def _get_athena_connection_string(db_name_env_var: str = "ATHENA_DB_NAME") -> st return url -def _create_snowflake_engine() -> Engine: - return create_engine(_get_snowflake_connection_string()) +def _create_snowflake_engine() -> sqlalchemy_engine_Engine: + return sa.create_engine(_get_snowflake_connection_string()) def _get_snowflake_connection_string() -> str: diff --git a/great_expectations/types/__init__.py b/great_expectations/types/__init__.py index 5ca649633855..bf3cff477783 100644 --- a/great_expectations/types/__init__.py +++ b/great_expectations/types/__init__.py @@ -5,8 +5,14 @@ import pandas as pd +from great_expectations.optional_imports import ( + SPARK_NOT_IMPORTED, + pyspark, + pyspark_sql_DataFrame, +) + from 
..alias_types import JSONValues # noqa: TCH001 -from ..core._docs_decorators import public_api +from ..core._docs_decorators import public_api # noqa: F401 from .base import SerializableDotDict from .colors import ColorPalettes, PrimaryColors, SecondaryColors, TintsAndShades from .configurations import ClassConfig @@ -14,14 +20,6 @@ logger = logging.getLogger(__name__) -try: - import pyspark -except ImportError: - pyspark = None # type: ignore[assignment] - logger.debug( - "Unable to load pyspark; install optional spark dependency if you will be working with Spark dataframes" - ) - class DictDot: """A convenience class for migrating away from untyped dictionaries to stronger typed objects. @@ -258,7 +256,7 @@ def safe_deep_copy(data, memo=None): This method makes a copy of a dictionary, applying deep copy to attribute values, except for non-pickleable objects. """ if isinstance(data, (pd.Series, pd.DataFrame)) or ( - pyspark and isinstance(data, pyspark.sql.DataFrame) + pyspark and isinstance(data, pyspark_sql_DataFrame) ): return data diff --git a/great_expectations/util.py b/great_expectations/util.py index 66671f67cbee..ceccde9c1648 100644 --- a/great_expectations/util.py +++ b/great_expectations/util.py @@ -61,6 +61,13 @@ PluginClassNotFoundError, PluginModuleNotFoundError, ) +from great_expectations.optional_imports import ( + sa_sql_expression_Select, + sqlalchemy_reflection, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) try: import black @@ -76,21 +83,6 @@ logger = logging.getLogger(__name__) -try: - import sqlalchemy as sa # noqa: TID251 - from sqlalchemy import Table # noqa: TID251 - from sqlalchemy.engine import reflection # noqa: TID251 - from sqlalchemy.sql import Select # noqa: TID251 - -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - sa = None - reflection = None - Table = None - Select = None - if TYPE_CHECKING: # needed until numpy min version 1.20 @@ -2023,7 +2015,7 @@ def generate_temporary_table_name( def get_sqlalchemy_inspector(engine): if version.parse(sa.__version__) < version.parse("1.4"): # Inspector.from_engine deprecated since 1.4, sa.inspect() should be used instead - insp = reflection.Inspector.from_engine(engine) + insp = sqlalchemy_reflection.Inspector.from_engine(engine) else: insp = sa.inspect(engine) return insp @@ -2038,7 +2030,9 @@ def get_sqlalchemy_url(drivername, **credentials): return url -def get_sqlalchemy_selectable(selectable: Union[Table, Select]) -> Union[Table, Select]: +def get_sqlalchemy_selectable( + selectable: Union[sa.Table, sa_sql_expression_Select] +) -> Union[sa.Table, sa_sql_expression_Select]: """ Beginning from SQLAlchemy 1.4, a select() can no longer be embedded inside of another select() directly, without explicitly turning the inner select() into a subquery first. 
This helper method ensures that this @@ -2049,7 +2043,7 @@ def get_sqlalchemy_selectable(selectable: Union[Table, Select]) -> Union[Table, https://docs.sqlalchemy.org/en/14/changelog/migration_14.html#change-4617 """ - if isinstance(selectable, Select): + if sa_sql_expression_Select and isinstance(selectable, sa_sql_expression_Select): if version.parse(sa.__version__) >= version.parse("1.4"): selectable = selectable.subquery() else: diff --git a/great_expectations/validator/metrics_calculator.py b/great_expectations/validator/metrics_calculator.py index f08aeb1b735c..da09d2268202 100644 --- a/great_expectations/validator/metrics_calculator.py +++ b/great_expectations/validator/metrics_calculator.py @@ -3,6 +3,8 @@ import logging from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union +import pandas as pd + from great_expectations.validator.computed_metric import MetricValue # noqa: TCH001 from great_expectations.validator.exception_info import ExceptionInfo # noqa: TCH001 from great_expectations.validator.metric_configuration import MetricConfiguration @@ -14,15 +16,6 @@ logger = logging.getLogger(__name__) logging.captureWarnings(True) -try: - import pandas as pd -except ImportError: - pd = None - - logger.debug( - "Unable to load pandas; install optional pandas dependency for support." - ) - class MetricsCalculator: def __init__( diff --git a/great_expectations/validator/validator.py b/great_expectations/validator/validator.py index fadd033e605d..1ad977c1039e 100644 --- a/great_expectations/validator/validator.py +++ b/great_expectations/validator/validator.py @@ -24,6 +24,7 @@ Union, ) +import pandas as pd from marshmallow import ValidationError from great_expectations import __version__ as ge_version @@ -74,14 +75,6 @@ logger = logging.getLogger(__name__) logging.captureWarnings(True) -try: - import pandas as pd -except ImportError: - pd = None - - logger.debug( - "Unable to load pandas; install optional pandas dependency for support." 
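For the `get_sqlalchemy_selectable` change above: the docstring's point is that from SQLAlchemy 1.4 onward a select() must be wrapped in `.subquery()` before it can be nested inside another select(). A short illustration with made-up table and column names:

import sqlalchemy as sa

metadata = sa.MetaData()
events = sa.Table(
    "events",
    metadata,
    sa.Column("id", sa.Integer),
    sa.Column("score", sa.Integer),
)

inner = sa.select(events.c.id).where(events.c.score > 10)

# On SQLAlchemy >= 1.4 the inner select must become an explicit subquery
# before it can be used as a FROM element of another select.
outer = sa.select(sa.func.count()).select_from(inner.subquery())
print(outer)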
- ) if TYPE_CHECKING: from great_expectations.core.batch import ( @@ -1977,14 +1970,12 @@ def __init__( self.expectation_engine = PandasDataset if self.expectation_engine is None: - try: - import pyspark + from great_expectations.optional_imports import ( + pyspark_sql_DataFrame, + ) - if isinstance(batch.data, pyspark.sql.DataFrame): - self.expectation_engine = SparkDFDataset - except ImportError: - # noinspection PyUnusedLocal - pyspark = None + if pyspark_sql_DataFrame and isinstance(batch.data, pyspark_sql_DataFrame): + self.expectation_engine = SparkDFDataset if self.expectation_engine is None: raise ValueError( @@ -2016,9 +2007,14 @@ def get_dataset(self): ) elif issubclass(self.expectation_engine, SparkDFDataset): - import pyspark + from great_expectations.optional_imports import ( + pyspark_sql_DataFrame, + ) - if not isinstance(self.batch.data, pyspark.sql.DataFrame): + if not ( + pyspark_sql_DataFrame + and isinstance(self.batch.data, pyspark_sql_DataFrame) + ): raise ValueError( "SparkDFDataset expectation_engine requires a spark DataFrame for its batch" ) diff --git a/tests/conftest.py b/tests/conftest.py index 827ec83b44a8..ff0bfbac54c8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -108,8 +108,10 @@ ) if TYPE_CHECKING: - import pyspark.sql - from pyspark.sql import SparkSession + from great_expectations.optional_imports import ( + pyspark_sql_DataFrame, + pyspark_sql_SparkSession, + ) yaml = YAMLHandler() ### @@ -130,7 +132,7 @@ def spark_warehouse_session(tmp_path_factory): pyspark = pytest.importorskip("pyspark") # noqa: F841 spark_warehouse_path: str = str(tmp_path_factory.mktemp("spark-warehouse")) - spark: SparkSession = get_or_create_spark_application( + spark: pyspark_sql_SparkSession = get_or_create_spark_application( spark_config={ "spark.sql.catalogImplementation": "in-memory", "spark.executor.memory": "450m", @@ -381,7 +383,7 @@ def sa(test_backends): pytest.skip("No recognized sqlalchemy backend selected.") else: try: - import sqlalchemy as sa + from great_expectations.optional_imports import sqlalchemy as sa return sa except ImportError: @@ -390,14 +392,15 @@ def sa(test_backends): @pytest.mark.order(index=2) @pytest.fixture -def spark_session(test_backends) -> SparkSession: +def spark_session(test_backends) -> pyspark_sql_SparkSession: if "SparkDFDataset" not in test_backends: pytest.skip("No spark backend selected.") - try: - import pyspark # noqa: F401 - from pyspark.sql import SparkSession # noqa: F401 + from great_expectations.optional_imports import ( + pyspark_sql_SparkSession, + ) + if pyspark_sql_SparkSession: return get_or_create_spark_application( spark_config={ "spark.sql.catalogImplementation": "hive", @@ -405,8 +408,8 @@ def spark_session(test_backends) -> SparkSession: # "spark.driver.allowMultipleContexts": "true", # This directive does not appear to have any effect. 
} ) - except ImportError: - raise ValueError("spark tests are requested, but pyspark is not installed") + + raise ValueError("spark tests are requested, but pyspark is not installed") @pytest.fixture @@ -7458,7 +7461,7 @@ def pandas_multicolumn_sum_dataframe_for_unexpected_rows_and_index() -> pd.DataF @pytest.fixture def spark_column_pairs_dataframe_for_unexpected_rows_and_index( spark_session, -) -> pyspark.sql.dataframe.DataFrame: +) -> pyspark_sql_DataFrame: df: pd.DataFrame = pd.DataFrame( { "pk_1": [0, 1, 2, 3, 4, 5], @@ -7488,7 +7491,7 @@ def spark_column_pairs_dataframe_for_unexpected_rows_and_index( @pytest.fixture def spark_multicolumn_sum_dataframe_for_unexpected_rows_and_index( spark_session, -) -> pyspark.sql.dataframe.DataFrame: +) -> pyspark_sql_DataFrame: df: pd.DataFrame = pd.DataFrame( { "pk_1": [0, 1, 2, 3, 4, 5], @@ -7505,7 +7508,7 @@ def spark_multicolumn_sum_dataframe_for_unexpected_rows_and_index( @pytest.fixture def spark_dataframe_for_unexpected_rows_with_index( spark_session, -) -> pyspark.sql.dataframe.DataFrame: +) -> pyspark_sql_DataFrame: df: pd.DataFrame = pd.DataFrame( { "pk_1": [0, 1, 2, 3, 4, 5], diff --git a/tests/datasource/data_connector/test_configured_asset_gcs_data_connector.py b/tests/datasource/data_connector/test_configured_asset_gcs_data_connector.py index 89896834653f..cff5688d960b 100644 --- a/tests/datasource/data_connector/test_configured_asset_gcs_data_connector.py +++ b/tests/datasource/data_connector/test_configured_asset_gcs_data_connector.py @@ -13,10 +13,8 @@ from great_expectations.core.yaml_handler import YAMLHandler from great_expectations.data_context.util import instantiate_class_from_config from great_expectations.datasource.data_connector import ConfiguredAssetGCSDataConnector -from great_expectations.datasource.data_connector.configured_asset_gcs_data_connector import ( - storage, -) from great_expectations.execution_engine import PandasExecutionEngine +from great_expectations.optional_imports import google_cloud_storage yaml = YAMLHandler() @@ -230,7 +228,7 @@ def expected_batch_definitions_sorted(): @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -238,7 +236,7 @@ def expected_batch_definitions_sorted(): return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_without_args( mock_gcs_conn, mock_list_keys, expected_config_dict @@ -263,7 +261,7 @@ def test_instantiation_without_args( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -271,10 +269,10 @@ def test_instantiation_without_args( return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.service_account.Credentials.from_service_account_file" + 
"great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_service_account.Credentials.from_service_account_file" ) def test_instantiation_with_filename_arg( mock_auth_method, mock_gcs_conn, mock_list_keys, expected_config_dict @@ -303,7 +301,7 @@ def test_instantiation_with_filename_arg( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -311,10 +309,10 @@ def test_instantiation_with_filename_arg( return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.service_account.Credentials.from_service_account_info" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_service_account.Credentials.from_service_account_info" ) def test_instantiation_with_info_arg( mock_auth_method, mock_gcs_conn, mock_list_keys, expected_config_dict @@ -343,7 +341,7 @@ def test_instantiation_with_info_arg( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -354,7 +352,7 @@ def test_instantiation_with_info_arg( return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_with_test_yaml_config( mock_gcs_conn, @@ -390,7 +388,7 @@ def test_instantiation_with_test_yaml_config( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -401,7 +399,7 @@ def test_instantiation_with_test_yaml_config( return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_with_test_yaml_config_emits_proper_payload( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -449,7 +447,7 @@ def test_instantiation_with_test_yaml_config_emits_proper_payload( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -460,7 +458,7 @@ def test_instantiation_with_test_yaml_config_emits_proper_payload( return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_from_a_config_with_nonmatching_regex_creates_unmatched_references( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -507,7 +505,7 @@ def 
test_instantiation_from_a_config_with_nonmatching_regex_creates_unmatched_re @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -518,7 +516,7 @@ def test_instantiation_from_a_config_with_nonmatching_regex_creates_unmatched_re return_value=["alpha-1.csv", "alpha-2.csv", "alpha-3.csv"], ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasource_name_raises_error( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -548,7 +546,7 @@ def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasourc @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -558,7 +556,7 @@ def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasourc "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @pytest.mark.slow # 1.65s def test_get_definition_list_from_batch_request_with_empty_args_raises_error( @@ -612,7 +610,7 @@ def test_get_definition_list_from_batch_request_with_empty_args_raises_error( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -622,7 +620,7 @@ def test_get_definition_list_from_batch_request_with_empty_args_raises_error( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_get_definition_list_from_batch_request_with_unnamed_data_asset_name_raises_error( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -665,7 +663,7 @@ def test_get_definition_list_from_batch_request_with_unnamed_data_asset_name_rai @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -675,7 +673,7 @@ def test_get_definition_list_from_batch_request_with_unnamed_data_asset_name_rai "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_return_all_batch_definitions_unsorted_without_named_data_asset_name( mock_gcs_conn, @@ -743,7 +741,7 @@ def test_return_all_batch_definitions_unsorted_without_named_data_asset_name( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in 
configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -753,7 +751,7 @@ def test_return_all_batch_definitions_unsorted_without_named_data_asset_name( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_return_all_batch_definitions_unsorted_with_named_data_asset_name( mock_gcs_conn, @@ -821,7 +819,7 @@ def test_return_all_batch_definitions_unsorted_with_named_data_asset_name( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -831,7 +829,7 @@ def test_return_all_batch_definitions_unsorted_with_named_data_asset_name( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_return_all_batch_definitions_basic_sorted( mock_gcs_conn, @@ -910,7 +908,7 @@ def test_return_all_batch_definitions_basic_sorted( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( @@ -920,7 +918,7 @@ def test_return_all_batch_definitions_basic_sorted( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_return_all_batch_definitions_returns_specified_partition( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -1021,11 +1019,11 @@ def test_return_all_batch_definitions_returns_specified_partition( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" @@ -1110,11 +1108,11 @@ def test_return_all_batch_definitions_sorted_without_data_connector_query( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" @@ -1183,11 +1181,11 @@ def test_return_all_batch_definitions_raises_error_due_to_sorter_that_does_not_m @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud 
in configured_asset_gcs_data_connector.py", ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys" @@ -1252,11 +1250,11 @@ def test_return_all_batch_definitions_too_many_sorters( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys", @@ -1387,11 +1385,11 @@ def test_example_with_explicit_data_asset_names( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in configured_asset_gcs_data_connector.py", ) @mock.patch( - "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( "great_expectations.datasource.data_connector.configured_asset_gcs_data_connector.list_gcs_keys", diff --git a/tests/datasource/data_connector/test_data_connector_util.py b/tests/datasource/data_connector/test_data_connector_util.py index 09078df19b9c..31e374e9c6fe 100644 --- a/tests/datasource/data_connector/test_data_connector_util.py +++ b/tests/datasource/data_connector/test_data_connector_util.py @@ -15,8 +15,8 @@ list_gcs_keys, map_batch_definition_to_data_reference_string_using_regex, map_data_reference_string_to_batch_definition_list_using_regex, - storage, ) +from great_expectations.optional_imports import google_cloud_storage def test_batch_definition_matches_batch_request(): @@ -513,10 +513,12 @@ def test_build_sorters_from_config_bad_config(): @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in datasource.data_connector.util", ) -@mock.patch("great_expectations.datasource.data_connector.util.storage.Client") +@mock.patch( + "great_expectations.datasource.data_connector.util.google_cloud_storage.Client" +) def test_list_gcs_keys_overwrites_delimiter(mock_gcs_conn): # Set defaults for ConfiguredAssetGCSDataConnector query_options = {"delimiter": None} diff --git a/tests/datasource/data_connector/test_inferred_asset_gcs_data_connector.py b/tests/datasource/data_connector/test_inferred_asset_gcs_data_connector.py index c33c8dca5447..d571d191dc51 100644 --- a/tests/datasource/data_connector/test_inferred_asset_gcs_data_connector.py +++ b/tests/datasource/data_connector/test_inferred_asset_gcs_data_connector.py @@ -8,10 +8,8 @@ from great_expectations.core.yaml_handler import YAMLHandler from great_expectations.data_context.util import instantiate_class_from_config from great_expectations.datasource.data_connector import InferredAssetGCSDataConnector -from great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector import ( - storage, -) from great_expectations.execution_engine import PandasExecutionEngine +from great_expectations.optional_imports import 
google_cloud_storage yaml = YAMLHandler() @@ -40,7 +38,7 @@ def expected_config_dict(): @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -53,7 +51,7 @@ def expected_config_dict(): ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_without_args( mock_gcs_conn, mock_list_keys, expected_config_dict @@ -77,7 +75,7 @@ def test_instantiation_without_args( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -90,10 +88,10 @@ def test_instantiation_without_args( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.service_account.Credentials.from_service_account_file" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_service_account.Credentials.from_service_account_file" ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_with_filename_arg( mock_gcs_conn, mock_auth_method, mock_list_keys, expected_config_dict @@ -120,7 +118,7 @@ def test_instantiation_with_filename_arg( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -133,10 +131,10 @@ def test_instantiation_with_filename_arg( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.service_account.Credentials.from_service_account_info" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_service_account.Credentials.from_service_account_info" ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_with_info_arg( mock_gcs_conn, mock_auth_method, mock_list_keys, expected_config_dict @@ -163,7 +161,7 @@ def test_instantiation_with_info_arg( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -179,7 +177,7 @@ def test_instantiation_with_info_arg( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasource_name_raises_error( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -208,7 +206,7 @@ def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasourc @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -227,7 +225,7 @@ def 
test_get_batch_definition_list_from_batch_request_with_nonexistent_datasourc ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_get_batch_definition_list_from_batch_request_with_unknown_data_connector_raises_error( mock_gcs_conn, mock_list_keys, mock_emit @@ -258,7 +256,7 @@ def test_get_batch_definition_list_from_batch_request_with_unknown_data_connecto @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -275,7 +273,7 @@ def test_get_batch_definition_list_from_batch_request_with_unknown_data_connecto ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_simple_regex_example_with_implicit_data_asset_names_self_check( mock_gcs_conn, mock_list_keys, mock_emit @@ -319,7 +317,7 @@ def test_simple_regex_example_with_implicit_data_asset_names_self_check( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -338,7 +336,7 @@ def test_simple_regex_example_with_implicit_data_asset_names_self_check( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_complex_regex_example_with_implicit_data_asset_names( mock_gcs_conn, mock_list_keys, mock_emit @@ -409,7 +407,7 @@ def test_complex_regex_example_with_implicit_data_asset_names( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -425,7 +423,7 @@ def test_complex_regex_example_with_implicit_data_asset_names( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_self_check(mock_gcs_conn, mock_list_keys, mock_emit): my_data_connector: InferredAssetGCSDataConnector = InferredAssetGCSDataConnector( @@ -463,7 +461,7 @@ def test_self_check(mock_gcs_conn, mock_list_keys, mock_emit): @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -482,7 +480,7 @@ def test_self_check(mock_gcs_conn, mock_list_keys, mock_emit): ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_test_yaml_config( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -538,7 +536,7 @@ def test_test_yaml_config( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -557,7 +555,7 @@ def test_test_yaml_config( ], ) @mock.patch( - 
"great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_instantiation_with_test_yaml_config_emits_proper_payload( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -605,7 +603,7 @@ def test_instantiation_with_test_yaml_config_emits_proper_payload( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -626,7 +624,7 @@ def test_instantiation_with_test_yaml_config_emits_proper_payload( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_yaml_config_excluding_non_regex_matching_files( mock_gcs_client, mock_list_keys, mock_emit, empty_data_context_stats_enabled @@ -684,7 +682,7 @@ def test_yaml_config_excluding_non_regex_matching_files( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -708,7 +706,7 @@ def test_yaml_config_excluding_non_regex_matching_files( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_nested_directory_data_asset_name_in_folder( mock_gcs_client, mock_list_keys, mock_emit, empty_data_context @@ -760,7 +758,7 @@ def test_nested_directory_data_asset_name_in_folder( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -779,7 +777,7 @@ def test_nested_directory_data_asset_name_in_folder( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_redundant_information_in_naming_convention_random_hash( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context @@ -828,7 +826,7 @@ def test_redundant_information_in_naming_convention_random_hash( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -847,7 +845,7 @@ def test_redundant_information_in_naming_convention_random_hash( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_redundant_information_in_naming_convention_timestamp( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context @@ -895,7 +893,7 @@ def test_redundant_information_in_naming_convention_timestamp( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -914,7 +912,7 @@ def test_redundant_information_in_naming_convention_timestamp( ], ) @mock.patch( - 
"great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_redundant_information_in_naming_convention_bucket( mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context @@ -963,7 +961,7 @@ def test_redundant_information_in_naming_convention_bucket( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -982,7 +980,7 @@ def test_redundant_information_in_naming_convention_bucket( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_redundant_information_in_naming_convention_bucket_sorted( mock_gcs_conn, mock_list_keys, mock_emit @@ -1091,7 +1089,7 @@ def test_redundant_information_in_naming_convention_bucket_sorted( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -1110,7 +1108,7 @@ def test_redundant_information_in_naming_convention_bucket_sorted( ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_redundant_information_in_naming_convention_bucket_sorter_does_not_match_group( mock_gcs_conn, mock_list_keys, mock_emit @@ -1153,7 +1151,7 @@ def test_redundant_information_in_naming_convention_bucket_sorter_does_not_match @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( @@ -1172,7 +1170,7 @@ def test_redundant_information_in_naming_convention_bucket_sorter_does_not_match ], ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) def test_redundant_information_in_naming_convention_bucket_too_many_sorters( mock_gcs_conn, mock_list_keys, mock_emit @@ -1218,11 +1216,11 @@ def test_redundant_information_in_naming_convention_bucket_too_many_sorters( @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in inferred_asset_gcs_data_connector.py", ) @mock.patch( - "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.storage.Client" + "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.google_cloud_storage.Client" ) @mock.patch( "great_expectations.datasource.data_connector.inferred_asset_gcs_data_connector.list_gcs_keys", diff --git a/tests/datasource/fluent/data_asset/data_connector/test_azure_blob_storage_data_connector.py b/tests/datasource/fluent/data_asset/data_connector/test_azure_blob_storage_data_connector.py index e1fc0d53003d..c2905fe6a817 100644 --- a/tests/datasource/fluent/data_asset/data_connector/test_azure_blob_storage_data_connector.py +++ b/tests/datasource/fluent/data_asset/data_connector/test_azure_blob_storage_data_connector.py @@ -12,6 +12,7 @@ AzureBlobStorageDataConnector, ) from 
great_expectations.datasource.fluent.interfaces import BatchRequest +from great_expectations.optional_imports import BlobServiceClient, ContainerClient if TYPE_CHECKING: from great_expectations.datasource.fluent.data_asset.data_connector import ( @@ -22,16 +23,6 @@ logger = logging.getLogger(__name__) -try: - from azure.storage.blob import BlobServiceClient, ContainerClient -except ImportError: - BlobServiceClient = None - ContainerClient = None - logger.debug( - "Unable to load BlobServiceClient connection object; install optional Azure Storage Blob dependency for support" - ) - - class MockContainerClient: pass diff --git a/tests/datasource/fluent/data_asset/data_connector/test_google_cloud_storage_data_connector.py b/tests/datasource/fluent/data_asset/data_connector/test_google_cloud_storage_data_connector.py index b60e452db836..8ca353fe3504 100644 --- a/tests/datasource/fluent/data_asset/data_connector/test_google_cloud_storage_data_connector.py +++ b/tests/datasource/fluent/data_asset/data_connector/test_google_cloud_storage_data_connector.py @@ -12,6 +12,7 @@ GoogleCloudStorageDataConnector, ) from great_expectations.datasource.fluent.interfaces import BatchRequest +from great_expectations.optional_imports import GoogleCloudStorageClient if TYPE_CHECKING: from great_expectations.datasource.fluent.data_asset.data_connector import ( @@ -22,15 +23,6 @@ logger = logging.getLogger(__name__) -try: - from google.cloud.storage.client import Client as GCSClient -except ImportError: - GCSClient = None - logger.debug( - "Unable to load GoogleCloudStorage connection object; install optional Google dependency for support" - ) - - class MockGCSClient: # noinspection PyMethodMayBeStatic,PyUnusedLocal def list_blobs( @@ -55,7 +47,9 @@ def test_basic_instantiation(mock_list_keys): "alpha-3.csv", ] - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) my_data_connector: DataConnector = GoogleCloudStorageDataConnector( datasource_name="my_file_path_datasource", data_asset_name="my_google_cloud_storage_data_asset", @@ -102,7 +96,9 @@ def test_instantiation_batching_regex_does_not_match_paths(mock_list_keys): "alpha-3.csv", ] - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) my_data_connector: DataConnector = GoogleCloudStorageDataConnector( datasource_name="my_file_path_datasource", data_asset_name="my_google_cloud_storage_data_asset", @@ -146,7 +142,9 @@ def test_return_all_batch_definitions_unsorted(mock_list_keys): "will_20200810_1001.csv", ] - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) my_data_connector: DataConnector = GoogleCloudStorageDataConnector( datasource_name="my_file_path_datasource", data_asset_name="my_google_cloud_storage_data_asset", @@ -470,7 +468,9 @@ def test_return_only_unique_batch_definitions(mock_list_keys): "A/file_3.csv", ] - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) my_data_connector: DataConnector @@ -556,7 +556,9 @@ def test_alpha(mock_list_keys): "test_dir_alpha/D.csv", ] - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) my_data_connector: DataConnector = 
GoogleCloudStorageDataConnector( datasource_name="my_file_path_datasource", data_asset_name="my_google_cloud_storage_data_asset", @@ -613,7 +615,9 @@ def test_alpha(mock_list_keys): def test_foxtrot(mock_list_keys): mock_list_keys.return_value = [] - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) my_data_connector: DataConnector diff --git a/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py b/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py index fad9070029c2..7ebe7d217dac 100644 --- a/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_pandas_azure_blob_storage_datasource.py @@ -25,6 +25,11 @@ from great_expectations.datasource.fluent.pandas_file_path_datasource import ( CSVAsset, ) +from great_expectations.optional_imports import ( + BlobServiceClient, + ContainerClient, + azure_storage, +) logger = logging.getLogger(__file__) @@ -37,16 +42,6 @@ ] -try: - from azure.storage.blob import BlobServiceClient, ContainerClient -except ImportError: - BlobServiceClient = None - ContainerClient = None - logger.debug( - "Unable to load BlobServiceClient connection object; install optional Azure Storage Blob dependency for support" - ) - - class MockContainerClient: # noinspection PyMethodMayBeStatic,PyUnusedLocal def walk_blobs( @@ -79,7 +74,7 @@ def _build_pandas_abs_datasource( @pytest.fixture @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def pandas_abs_datasource() -> PandasAzureBlobStorageDatasource: @@ -125,7 +120,7 @@ def csv_asset( @pytest.fixture @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: @@ -141,7 +136,7 @@ def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_pandas_abs_datasource_with_account_url_and_credential(): @@ -159,7 +154,7 @@ def test_construct_pandas_abs_datasource_with_account_url_and_credential(): @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_pandas_abs_datasource_with_conn_str_and_credential(): @@ -177,7 +172,7 @@ def test_construct_pandas_abs_datasource_with_conn_str_and_credential(): @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_pandas_abs_datasource_with_valid_account_url_assigns_account_name(): @@ -195,7 +190,7 @@ def test_construct_pandas_abs_datasource_with_valid_account_url_assigns_account_ @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_pandas_abs_datasource_with_valid_conn_str_assigns_account_name(): @@ -213,7 +208,7 @@ def test_construct_pandas_abs_datasource_with_valid_conn_str_assigns_account_nam @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, 
+ not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_pandas_abs_datasource_with_multiple_auth_methods_raises_error(): @@ -232,7 +227,7 @@ def test_construct_pandas_abs_datasource_with_multiple_auth_methods_raises_error @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -260,7 +255,7 @@ def test_add_csv_asset_to_datasource( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -284,7 +279,7 @@ def test_construct_csv_asset_directly( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -314,7 +309,7 @@ def test_csv_asset_with_batching_regex_unnamed_parameters( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -344,7 +339,7 @@ def test_csv_asset_with_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -374,7 +369,7 @@ def test_csv_asset_with_some_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -405,7 +400,7 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( reason="Accessing objects on azure.storage.blob using Pandas is not working, due to local credentials issues (this test is conducted using Jupyter notebook manually)." ) @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_get_batch_list_from_fully_specified_batch_request( @@ -461,7 +456,7 @@ def instantiate_azure_client_spy(self) -> None: @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_test_connection_failures( diff --git a/tests/datasource/fluent/test_pandas_dbfs_datasource.py b/tests/datasource/fluent/test_pandas_dbfs_datasource.py index acb2a314dda4..30dc41585b0e 100644 --- a/tests/datasource/fluent/test_pandas_dbfs_datasource.py +++ b/tests/datasource/fluent/test_pandas_dbfs_datasource.py @@ -91,21 +91,6 @@ def csv_asset(pandas_dbfs_datasource: PandasDBFSDatasource) -> _FilePathDataAsse return asset -def bad_batching_regex_config( - csv_path: pathlib.Path, -) -> tuple[re.Pattern, TestConnectionError]: - batching_regex = re.compile( - r"green_tripdata_sample_(?P\d{4})-(?P\d{2})\.csv" - ) - test_connection_error = TestConnectionError( - "No file at base_directory path " - f'"{csv_path.resolve()}" matched regular expressions pattern ' - f'"{batching_regex.pattern}" and/or glob_directive "**/*" for ' - 'DataAsset "csv_asset".' 
- ) - return batching_regex, test_connection_error - - @pytest.fixture def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: regex = re.compile( diff --git a/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py b/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py index 58c534dcf504..2c876c1a6f32 100644 --- a/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py +++ b/tests/datasource/fluent/test_pandas_google_cloud_storage_datasource.py @@ -25,21 +25,14 @@ from great_expectations.datasource.fluent.pandas_file_path_datasource import ( CSVAsset, ) +from great_expectations.optional_imports import ( + GoogleCloudStorageClient, + google_cloud_storage, +) logger = logging.getLogger(__file__) -try: - from google.cloud import storage - from google.cloud.storage import Client as GCSClient -except ImportError: - storage = None - GCSClient = None - logger.debug( - "Unable to load GoogleCloudStorage connection object; install optional Google dependency for support" - ) - - # apply markers to entire test module pytestmark = [ pytest.mark.skipif( @@ -64,7 +57,9 @@ def list_blobs( def _build_pandas_gcs_datasource( gcs_options: Dict[str, Any] | None = None ) -> PandasGoogleCloudStorageDatasource: - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) pandas_gcs_datasource = PandasGoogleCloudStorageDatasource( # type: ignore[call-arg] name="pandas_gcs_datasource", bucket_or_name="test_bucket", @@ -76,7 +71,8 @@ def _build_pandas_gcs_datasource( @pytest.fixture @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def pandas_gcs_datasource() -> PandasGoogleCloudStorageDatasource: pandas_gcs_datasource: PandasGoogleCloudStorageDatasource = ( @@ -120,7 +116,8 @@ def csv_asset( @pytest.fixture @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: regex = re.compile( @@ -135,7 +132,8 @@ def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def test_construct_pandas_gcs_datasource_without_gcs_options(): google_cred_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") @@ -147,14 +145,15 @@ def test_construct_pandas_gcs_datasource_without_gcs_options(): bucket_or_name="test_bucket", gcs_options={}, ) - gcs_client: GCSClient = pandas_gcs_datasource._get_gcs_client() + gcs_client: GoogleCloudStorageClient = pandas_gcs_datasource._get_gcs_client() assert gcs_client is not None assert pandas_gcs_datasource.name == "pandas_gcs_datasource" @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -171,14 +170,15 @@ def test_construct_pandas_gcs_datasource_with_filename_in_gcs_options( "filename": "my_filename.csv", }, ) - gcs_client: GCSClient = 
pandas_gcs_datasource._get_gcs_client() + gcs_client: GoogleCloudStorageClient = pandas_gcs_datasource._get_gcs_client() assert gcs_client is not None assert pandas_gcs_datasource.name == "pandas_gcs_datasource" @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -195,14 +195,15 @@ def test_construct_pandas_gcs_datasource_with_info_in_gcs_options( "info": "{my_csv: my_content,}", }, ) - gcs_client: GCSClient = pandas_gcs_datasource._get_gcs_client() + gcs_client: GoogleCloudStorageClient = pandas_gcs_datasource._get_gcs_client() assert gcs_client is not None assert pandas_gcs_datasource.name == "pandas_gcs_datasource" @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -228,7 +229,8 @@ def test_add_csv_asset_to_datasource( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -251,7 +253,8 @@ def test_construct_csv_asset_directly( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -279,7 +282,8 @@ def test_csv_asset_with_batching_regex_unnamed_parameters( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -307,7 +311,8 @@ def test_csv_asset_with_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -335,7 +340,8 @@ def test_csv_asset_with_some_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -364,13 +370,16 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( reason="Accessing objects on google.cloud.storage using Pandas is not working, due to local credentials issues (this test is conducted using Jupyter notebook manually)." 
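
Editorial aside, not part of the patch: the pandas GCS datasource hunks above all converge on one guard-and-mock shape. The sketch below restates that shape in a single self-contained test, assuming only what the hunks themselves show: that great_expectations.optional_imports exposes google_cloud_storage (falsy when the dependency is missing) and GoogleCloudStorageClient, and that list_gcs_keys is patchable at the dotted path used above. The MockGCSClient stub and the test body are illustrative, not taken from the repository.

from typing import cast
from unittest import mock

import pytest

from great_expectations.optional_imports import (
    GoogleCloudStorageClient,
    google_cloud_storage,
)


class MockGCSClient:
    """Stand-in object; it only needs to satisfy the type cast below."""


@pytest.mark.skipif(
    not google_cloud_storage,
    reason='Could not import "storage" from google.cloud',
)
@mock.patch(
    "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys"
)
def test_gcs_guard_and_mock_shape(mock_list_keys):
    # No real GCS connection is made: key listing is patched and the client is a cast stub.
    mock_list_keys.return_value = ["alpha-1.csv", "alpha-2.csv"]
    gcs_client: GoogleCloudStorageClient = cast(
        GoogleCloudStorageClient, MockGCSClient()
    )
    assert gcs_client is not None
    assert mock_list_keys.return_value == ["alpha-1.csv", "alpha-2.csv"]
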
) @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def test_get_batch_list_from_fully_specified_batch_request( monkeypatch: pytest.MonkeyPatch, pandas_gcs_datasource: PandasGoogleCloudStorageDatasource, ): - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) def instantiate_gcs_client_spy(self) -> None: self._gcs = gcs_client @@ -418,7 +427,8 @@ def instantiate_gcs_client_spy(self) -> None: @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def test_test_connection_failures( pandas_gcs_datasource: PandasGoogleCloudStorageDatasource, diff --git a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py index 9d95c6f7a4f5..f714162f6230 100644 --- a/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_azure_blob_storage_datasource.py @@ -24,20 +24,15 @@ from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) +from great_expectations.optional_imports import ( + BlobServiceClient, + ContainerClient, + azure_storage, +) logger = logging.getLogger(__file__) -try: - from azure.storage.blob import BlobServiceClient, ContainerClient -except ImportError: - BlobServiceClient = None - ContainerClient = None - logger.debug( - "Unable to load BlobServiceClient connection object; install optional Azure Storage Blob dependency for support" - ) - - class MockContainerClient: # noinspection PyMethodMayBeStatic,PyUnusedLocal def walk_blobs( @@ -70,7 +65,7 @@ def _build_spark_abs_datasource( @pytest.fixture @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def spark_abs_datasource() -> SparkAzureBlobStorageDatasource: @@ -116,7 +111,7 @@ def csv_asset( @pytest.fixture @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: @@ -132,7 +127,7 @@ def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_spark_abs_datasource_with_account_url_and_credential(): @@ -150,7 +145,7 @@ def test_construct_spark_abs_datasource_with_account_url_and_credential(): @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_spark_abs_datasource_with_conn_str_and_credential(): @@ -168,7 +163,7 @@ def test_construct_spark_abs_datasource_with_conn_str_and_credential(): @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_spark_abs_datasource_with_valid_account_url_assigns_account_name(): @@ -186,7 +181,7 @@ def 
test_construct_spark_abs_datasource_with_valid_account_url_assigns_account_n @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_spark_abs_datasource_with_valid_conn_str_assigns_account_name(): @@ -204,7 +199,7 @@ def test_construct_spark_abs_datasource_with_valid_conn_str_assigns_account_name @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_construct_spark_abs_datasource_with_multiple_auth_methods_raises_error(): @@ -223,7 +218,7 @@ def test_construct_spark_abs_datasource_with_multiple_auth_methods_raises_error( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -254,7 +249,7 @@ def test_add_csv_asset_to_datasource( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -278,7 +273,7 @@ def test_construct_csv_asset_directly( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -308,7 +303,7 @@ def test_csv_asset_with_batching_regex_unnamed_parameters( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -338,7 +333,7 @@ def test_csv_asset_with_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -368,7 +363,7 @@ def test_csv_asset_with_some_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) @mock.patch( @@ -399,7 +394,7 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( reason="Accessing objects on azure.storage.blob using Spark is not working, due to local credentials issues (this test is conducted using Jupyter notebook manually)." 
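
Editorial aside, not part of the patch: the Azure-flavoured hunks apply the same idea. Checking the azure_storage module object, rather than BlobServiceClient is None, keeps one truthiness guard per backend instead of one per imported name. A minimal sketch under the assumption (implied by the removed try/except block above) that azure_storage is falsy when azure-storage-blob is not installed; the test itself is illustrative.

import pytest

from great_expectations.optional_imports import (
    BlobServiceClient,
    ContainerClient,
    azure_storage,
)


@pytest.mark.skipif(
    not azure_storage,
    reason='Could not import "azure.storage.blob" from Microsoft Azure cloud',
)
def test_azure_optional_import_guard():
    # Only runs when the optional dependency resolved; both names are then real classes.
    assert BlobServiceClient is not None
    assert ContainerClient is not None
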
) @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_get_batch_list_from_fully_specified_batch_request( @@ -458,7 +453,7 @@ def instantiate_azure_client_spy(self) -> None: @pytest.mark.integration @pytest.mark.skipif( - BlobServiceClient is None, + not azure_storage, reason='Could not import "azure.storage.blob" from Microsoft Azure cloud', ) def test_test_connection_failures( diff --git a/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py b/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py index b5514aba539e..4c5768598155 100644 --- a/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py +++ b/tests/datasource/fluent/test_spark_google_cloud_storage_datasource.py @@ -24,21 +24,14 @@ from great_expectations.datasource.fluent.spark_file_path_datasource import ( CSVAsset, ) +from great_expectations.optional_imports import ( + GoogleCloudStorageClient, + google_cloud_storage, +) logger = logging.getLogger(__file__) -try: - from google.cloud import storage - from google.cloud.storage import Client as GCSClient -except ImportError: - storage = None - GCSClient = None - logger.debug( - "Unable to load GoogleCloudStorage connection object; install optional Google dependency for support" - ) - - class MockGCSClient: # noinspection PyMethodMayBeStatic,PyUnusedLocal def list_blobs( @@ -55,7 +48,9 @@ def list_blobs( def _build_spark_gcs_datasource( gcs_options: Dict[str, Any] | None = None ) -> SparkGoogleCloudStorageDatasource: - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) spark_gcs_datasource = SparkGoogleCloudStorageDatasource( name="spark_gcs_datasource", bucket_or_name="test_bucket", @@ -67,7 +62,8 @@ def _build_spark_gcs_datasource( @pytest.fixture @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def spark_gcs_datasource() -> SparkGoogleCloudStorageDatasource: spark_gcs_datasource: SparkGoogleCloudStorageDatasource = ( @@ -111,7 +107,8 @@ def csv_asset( @pytest.fixture @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: regex = re.compile( @@ -126,7 +123,8 @@ def bad_regex_config(csv_asset: CSVAsset) -> tuple[re.Pattern, str]: @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def test_construct_spark_gcs_datasource_without_gcs_options(): google_cred_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") @@ -138,14 +136,15 @@ def test_construct_spark_gcs_datasource_without_gcs_options(): bucket_or_name="test_bucket", gcs_options={}, ) - gcs_client: GCSClient = spark_gcs_datasource._get_gcs_client() + gcs_client: GoogleCloudStorageClient = spark_gcs_datasource._get_gcs_client() assert gcs_client is not None assert spark_gcs_datasource.name == "spark_gcs_datasource" @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from 
google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -162,14 +161,15 @@ def test_construct_spark_gcs_datasource_with_filename_in_gcs_options( "filename": "my_filename.csv", }, ) - gcs_client: GCSClient = spark_gcs_datasource._get_gcs_client() + gcs_client: GoogleCloudStorageClient = spark_gcs_datasource._get_gcs_client() assert gcs_client is not None assert spark_gcs_datasource.name == "spark_gcs_datasource" @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -186,14 +186,15 @@ def test_construct_spark_gcs_datasource_with_info_in_gcs_options( "info": "{my_csv: my_content,}", }, ) - gcs_client: GCSClient = spark_gcs_datasource._get_gcs_client() + gcs_client: GoogleCloudStorageClient = spark_gcs_datasource._get_gcs_client() assert gcs_client is not None assert spark_gcs_datasource.name == "spark_gcs_datasource" @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -222,7 +223,8 @@ def test_add_csv_asset_to_datasource( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -245,7 +247,8 @@ def test_construct_csv_asset_directly( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -273,7 +276,8 @@ def test_csv_asset_with_batching_regex_unnamed_parameters( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -301,7 +305,8 @@ def test_csv_asset_with_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( "great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -329,7 +334,8 @@ def test_csv_asset_with_some_batching_regex_named_parameters( @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) @mock.patch( 
"great_expectations.datasource.fluent.data_asset.data_connector.google_cloud_storage_data_connector.list_gcs_keys" @@ -358,13 +364,16 @@ def test_csv_asset_with_non_string_batching_regex_named_parameters( reason="Accessing objects on google.cloud.storage using Spark is not working, due to local credentials issues (this test is conducted using Jupyter notebook manually)." ) @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def test_get_batch_list_from_fully_specified_batch_request( monkeypatch: pytest.MonkeyPatch, spark_gcs_datasource: SparkGoogleCloudStorageDatasource, ): - gcs_client: GCSClient = cast(GCSClient, MockGCSClient()) + gcs_client: GoogleCloudStorageClient = cast( + GoogleCloudStorageClient, MockGCSClient() + ) def instantiate_gcs_client_spy(self) -> None: self._gcs = gcs_client @@ -415,7 +424,8 @@ def instantiate_gcs_client_spy(self) -> None: @pytest.mark.integration @pytest.mark.skipif( - storage is None, reason='Could not import "storage" from google.cloud' + not google_cloud_storage, + reason='Could not import "storage" from google.cloud', ) def test_test_connection_failures( spark_gcs_datasource: SparkGoogleCloudStorageDatasource, diff --git a/tests/datasource/test_new_datasource.py b/tests/datasource/test_new_datasource.py index 357d92b93851..d47f9634cac6 100644 --- a/tests/datasource/test_new_datasource.py +++ b/tests/datasource/test_new_datasource.py @@ -6,15 +6,6 @@ import pandas as pd import pytest -from great_expectations.util import is_candidate_subset_of_target - -try: - pyspark = pytest.importorskip("pyspark") - from pyspark.sql.types import Row -except ImportError: - pyspark = None - Row = None - import great_expectations.exceptions as gx_exceptions from great_expectations.core.batch import ( Batch, @@ -32,6 +23,8 @@ ConfiguredAssetFilesystemDataConnector, ) from great_expectations.datasource.new_datasource import Datasource +from great_expectations.optional_imports import pyspark_sql_Row +from great_expectations.util import is_candidate_subset_of_target from tests.test_utils import create_files_in_directory yaml = YAMLHandler() @@ -949,7 +942,7 @@ def test_spark_with_batch_spec_passthrough(tmp_path_factory, spark_session): BatchRequest(**batch_request) ) # check that the batch_spec_passthrough has worked - assert batch[0].data.dataframe.head() == Row(x="1", y="2") + assert batch[0].data.dataframe.head() == pyspark_sql_Row(x="1", y="2") @pytest.mark.integration diff --git a/tests/datasource/test_new_datasource_with_aws_glue_data_connector.py b/tests/datasource/test_new_datasource_with_aws_glue_data_connector.py index 102d0b809793..ce1cd5edde78 100644 --- a/tests/datasource/test_new_datasource_with_aws_glue_data_connector.py +++ b/tests/datasource/test_new_datasource_with_aws_glue_data_connector.py @@ -2,15 +2,11 @@ import pytest -try: - pyspark = pytest.importorskip("pyspark") -except ImportError: - pyspark = None - from great_expectations import DataContext from great_expectations.core.yaml_handler import YAMLHandler from great_expectations.data_context.util import instantiate_class_from_config from great_expectations.datasource import BaseDatasource, LegacyDatasource +from great_expectations.optional_imports import pyarrow, pyspark yaml = YAMLHandler() @@ -53,6 +49,14 @@ def data_source_config_with_aws_glue_catalog_data_connectors(): @pytest.mark.integration +@pytest.mark.skipif( + not pyspark, + reason='Could not import "pyspark"', 
+) +@pytest.mark.skipif( + not pyarrow, + reason='Could not import "pyarrow"', +) def test_instantiation_from_config( glue_titanic_catalog, data_source_config_with_aws_glue_catalog_data_connectors ): @@ -131,6 +135,14 @@ def test_instantiation_from_config( @pytest.mark.integration +@pytest.mark.skipif( + not pyspark, + reason='Could not import "pyspark"', +) +@pytest.mark.skipif( + not pyarrow, + reason='Could not import "pyarrow"', +) def test_instantiation_from_datasource( glue_titanic_catalog, empty_data_context, diff --git a/tests/datasource/test_new_datasource_with_sql_data_connector.py b/tests/datasource/test_new_datasource_with_sql_data_connector.py index fcec05c5b5d4..b9c413f48807 100644 --- a/tests/datasource/test_new_datasource_with_sql_data_connector.py +++ b/tests/datasource/test_new_datasource_with_sql_data_connector.py @@ -3,27 +3,9 @@ import random from typing import Optional, Union +import pandas as pd import pytest -from great_expectations.datasource import ( - BaseDatasource, - LegacyDatasource, - SimpleSqlalchemyDatasource, -) -from great_expectations.exceptions.exceptions import ExecutionEngineError - -logger = logging.getLogger(__name__) - - -try: - import pandas as pd -except ImportError: - pd = None - - logger.debug( - "Unable to load pandas; install optional pandas dependency for support." - ) - import great_expectations.exceptions as gx_exceptions from great_expectations import DataContext from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest @@ -33,6 +15,12 @@ file_relative_path, instantiate_class_from_config, ) +from great_expectations.datasource import ( + BaseDatasource, + LegacyDatasource, + SimpleSqlalchemyDatasource, +) +from great_expectations.exceptions.exceptions import ExecutionEngineError from great_expectations.validator.validator import Validator try: @@ -48,9 +36,13 @@ except ImportError: sqla_bigquery = None + yaml = YAMLHandler() +logger = logging.getLogger(__name__) + + @pytest.fixture def data_context_with_sql_data_connectors_including_schema_for_testing_get_batch( sa, diff --git a/tests/execution_engine/conftest.py b/tests/execution_engine/conftest.py index 522d781d1b67..8b3cfbc5d99e 100644 --- a/tests/execution_engine/conftest.py +++ b/tests/execution_engine/conftest.py @@ -10,18 +10,7 @@ from moto import mock_s3 from great_expectations.core.batch_spec import AzureBatchSpec, GCSBatchSpec - -try: - import pyspark - - # noinspection PyPep8Naming - import pyspark.sql.functions as F - from pyspark.sql.types import IntegerType, StringType -except ImportError: - pyspark = None - F = None - IntegerType = None - StringType = None +from great_expectations.optional_imports import sparktypes, F @pytest.fixture(scope="function") @@ -209,7 +198,8 @@ def generate_ascending_list_of_datetimes( ) ) spark_df = spark_df.withColumn( - "timestamp", F.col("timestamp").cast(IntegerType()).cast(StringType()) + "timestamp", + F.col("timestamp").cast(sparktypes.IntegerType()).cast(sparktypes.StringType()), ) return spark_df diff --git a/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_sampling.py b/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_sampling.py index c359445124ab..9310c54044ad 100644 --- a/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_sampling.py +++ b/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_sampling.py @@ -5,12 +5,6 @@ import great_expectations.exceptions as gx_exceptions from great_expectations.core.batch_spec import RuntimeDataBatchSpec 
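
Editorial aside, not part of the patch: the Spark hunks above replace direct pyspark.sql imports with the F and sparktypes namespaces re-exported by optional_imports. A minimal sketch of that usage, assuming pyspark is installed and a local SparkSession is acceptable; the column name and data are made up.

from pyspark.sql import SparkSession

from great_expectations.optional_imports import F, sparktypes

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("1",), ("2",)], schema=["raw_value"])

# Mirror the conftest change: cast through IntegerType, then back to StringType.
df = df.withColumn(
    "raw_value",
    F.col("raw_value").cast(sparktypes.IntegerType()).cast(sparktypes.StringType()),
)
df.show()
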
-try: - pyspark = pytest.importorskip("pyspark") - -except ImportError: - pyspark = None - @pytest.mark.parametrize( "underscore_prefix", diff --git a/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_splitting.py b/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_splitting.py index e7deac79ac06..28264d499945 100644 --- a/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_splitting.py +++ b/tests/execution_engine/split_and_sample/test_sparkdf_execution_engine_splitting.py @@ -25,16 +25,7 @@ SINGLE_DATE_PART_BATCH_IDENTIFIERS, SINGLE_DATE_PART_DATE_PARTS, ) - -try: - pyspark = pytest.importorskip("pyspark") - import pyspark.sql.functions as F - from pyspark.sql import DataFrame - -except ImportError: - pyspark = None - F = None - DataFrame = None +from great_expectations.optional_imports import F, pyspark_sql_DataFrame, pyarrow # Here we add SparkDataSplitter specific test cases to the generic test cases: SINGLE_DATE_PART_DATE_PARTS += [ @@ -65,7 +56,7 @@ def simple_multi_year_spark_df(spark_session): ("2020-12-04 12:00:00.000",), ] - spark_df: DataFrame = spark_session.createDataFrame( + spark_df: pyspark_sql_DataFrame = spark_session.createDataFrame( data=spark_df_data, schema=["input_timestamp"] ) spark_df = spark_df.withColumn("timestamp", F.to_timestamp("input_timestamp")) @@ -86,10 +77,10 @@ def test_get_batch_with_split_on_year( num_values_in_df, spark_session, basic_spark_df_execution_engine, - simple_multi_year_spark_df: DataFrame, + simple_multi_year_spark_df: pyspark_sql_DataFrame, ): - split_df: DataFrame = basic_spark_df_execution_engine.get_batch_data( + split_df: pyspark_sql_DataFrame = basic_spark_df_execution_engine.get_batch_data( RuntimeDataBatchSpec( batch_data=simple_multi_year_spark_df, splitter_method="split_on_year", @@ -121,10 +112,10 @@ def test_get_batch_with_split_on_date_parts_day( num_values_in_df, spark_session, basic_spark_df_execution_engine, - simple_multi_year_spark_df: DataFrame, + simple_multi_year_spark_df: pyspark_sql_DataFrame, ): - split_df: DataFrame = basic_spark_df_execution_engine.get_batch_data( + split_df: pyspark_sql_DataFrame = basic_spark_df_execution_engine.get_batch_data( RuntimeDataBatchSpec( batch_data=simple_multi_year_spark_df, splitter_method="split_on_date_parts", @@ -161,7 +152,7 @@ def test_split_on_date_parts_single_date_parts( """ data_splitter: SparkDataSplitter = SparkDataSplitter() column_name: str = "timestamp" - result: DataFrame = data_splitter.split_on_date_parts( + result: pyspark_sql_DataFrame = data_splitter.split_on_date_parts( df=simple_multi_year_spark_df, column_name=column_name, batch_identifiers={column_name: batch_identifiers_for_column}, @@ -190,7 +181,7 @@ def test_split_on_date_parts_multiple_date_parts( """ data_splitter: SparkDataSplitter = SparkDataSplitter() column_name: str = "timestamp" - result: DataFrame = data_splitter.split_on_date_parts( + result: pyspark_sql_DataFrame = data_splitter.split_on_date_parts( df=simple_multi_year_spark_df, column_name=column_name, batch_identifiers={column_name: batch_identifiers_for_column}, @@ -217,7 +208,7 @@ def test_named_date_part_methods( mock_split_on_date_parts: mock.MagicMock, splitter_method_name: str, called_with_date_parts: List[DatePart], - simple_multi_year_spark_df: DataFrame, + simple_multi_year_spark_df: pyspark_sql_DataFrame, ): """Test that a partially pre-filled version of split_on_date_parts() was called with the appropriate params. For example, split_on_year. 
@@ -309,6 +300,10 @@ def test_get_batch_empty_splitter_tsv( assert len(test_sparkdf.columns) == 2 +@pytest.mark.skipif( + not pyarrow, + reason='Could not import "pyarrow"', +) def test_get_batch_empty_splitter_parquet( test_folder_connection_path_parquet, basic_spark_df_execution_engine ): diff --git a/tests/execution_engine/test_pandas_execution_engine.py b/tests/execution_engine/test_pandas_execution_engine.py index c9bc093121e9..36acaad5bdcf 100644 --- a/tests/execution_engine/test_pandas_execution_engine.py +++ b/tests/execution_engine/test_pandas_execution_engine.py @@ -8,19 +8,11 @@ # noinspection PyBroadException from great_expectations.core.metric_domain_types import MetricDomainTypes from great_expectations.validator.computed_metric import MetricValue - -try: - # noinspection PyUnresolvedReferences - from azure.storage.blob import BlobServiceClient -except: - azure = None - - +from great_expectations.optional_imports import google_cloud_storage import great_expectations.exceptions as gx_exceptions from great_expectations.core.batch_spec import RuntimeDataBatchSpec, S3BatchSpec from great_expectations.execution_engine.pandas_execution_engine import ( PandasExecutionEngine, - storage, ) from great_expectations.util import is_library_loadable from great_expectations.validator.metric_configuration import MetricConfiguration @@ -572,14 +564,14 @@ def test_get_batch_with_no_azure_configured(azure_batch_spec): @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in pandas_execution_engine.py", ) @mock.patch( - "great_expectations.execution_engine.pandas_execution_engine.service_account", + "great_expectations.execution_engine.pandas_execution_engine.google_service_account", ) @mock.patch( - "great_expectations.execution_engine.pandas_execution_engine.storage.Client", + "great_expectations.execution_engine.pandas_execution_engine.google_cloud_storage.Client", ) def test_constructor_with_gcs_options(mock_gcs_conn, mock_auth_method): # default instantiation @@ -596,11 +588,11 @@ def test_constructor_with_gcs_options(mock_gcs_conn, mock_auth_method): @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in pandas_execution_engine.py", ) @mock.patch( - "great_expectations.execution_engine.pandas_execution_engine.storage.Client", + "great_expectations.execution_engine.pandas_execution_engine.google_cloud_storage.Client", ) def test_get_batch_data_with_gcs_batch_spec( mock_gcs_conn, @@ -634,7 +626,7 @@ def test_get_batch_data_with_gcs_batch_spec_no_credentials(gcs_batch_spec, monke @pytest.mark.skipif( - storage is None, + not google_cloud_storage, reason="Could not import 'storage' from google.cloud in pandas_execution_engine.py", ) def test_get_batch_with_gcs_misconfigured(gcs_batch_spec): diff --git a/tests/execution_engine/test_sparkdf_execution_engine.py b/tests/execution_engine/test_sparkdf_execution_engine.py index dbf5252da8d8..d6e5743d9fff 100644 --- a/tests/execution_engine/test_sparkdf_execution_engine.py +++ b/tests/execution_engine/test_sparkdf_execution_engine.py @@ -20,30 +20,7 @@ from great_expectations.validator.metric_configuration import MetricConfiguration from tests.expectations.test_util import get_table_columns_metric from tests.test_utils import create_files_in_directory - -try: - pyspark = pytest.importorskip("pyspark") - # noinspection PyPep8Naming - import pyspark.sql.functions as F - from pyspark.sql.types import ( - DoubleType, - 
IntegerType, - LongType, - Row, - StringType, - StructField, - StructType, - ) -except ImportError: - pyspark = None - F = None - IntegerType = None - LongType = None - StringType = None - Row = None - DoubleType = None - StructType = None - StructField = None +from great_expectations.optional_imports import F, sparktypes, pyspark_sql_Row def test_reader_fn(spark_session, basic_spark_df_execution_engine): @@ -76,7 +53,7 @@ def test_reader_fn_parameters( test_sparkdf_with_no_header_param = basic_spark_df_execution_engine.get_batch_data( PathBatchSpec(path=test_df_small_csv_path, data_asset_name="DATA_ASSET") ).dataframe - assert test_sparkdf_with_no_header_param.head() == Row(_c0="x", _c1="y") + assert test_sparkdf_with_no_header_param.head() == pyspark_sql_Row(_c0="x", _c1="y") test_sparkdf_with_header_param = basic_spark_df_execution_engine.get_batch_data( PathBatchSpec( @@ -85,18 +62,18 @@ def test_reader_fn_parameters( reader_options={"header": True}, ) ).dataframe - assert test_sparkdf_with_header_param.head() == Row(x="1", y="2") + assert test_sparkdf_with_header_param.head() == pyspark_sql_Row(x="1", y="2") test_sparkdf_with_no_header_param = basic_spark_df_execution_engine.get_batch_data( PathBatchSpec(path=test_df_small_csv_path, data_asset_name="DATA_ASSET") ).dataframe - assert test_sparkdf_with_no_header_param.head() == Row(_c0="x", _c1="y") + assert test_sparkdf_with_no_header_param.head() == pyspark_sql_Row(_c0="x", _c1="y") # defining schema - schema: pyspark.sql.types.StructType = StructType( + schema: sparktypes.StructType = sparktypes.StructType( [ - StructField("x", IntegerType(), True), - StructField("y", IntegerType(), True), + sparktypes.StructField("x", sparktypes.IntegerType(), True), + sparktypes.StructField("y", sparktypes.IntegerType(), True), ] ) schema_dict: dict = schema @@ -110,7 +87,7 @@ def test_reader_fn_parameters( ) ).dataframe ) - assert test_sparkdf_with_header_param_and_schema.head() == Row(x=1, y=2) + assert test_sparkdf_with_header_param_and_schema.head() == pyspark_sql_Row(x=1, y=2) assert test_sparkdf_with_header_param_and_schema.schema == schema_dict @@ -290,7 +267,9 @@ def test_get_domain_records_with_column_pair_domain( } ) for column_name in data.columns: - data = data.withColumn(column_name, data[column_name].cast(LongType())) + data = data.withColumn( + column_name, data[column_name].cast(sparktypes.LongType()) + ) expected_column_pair_pd_df = pd.DataFrame( {"a": [2, 3, 4], "b": [3, 4, 5], "c": [2, 3, 4]} @@ -361,7 +340,9 @@ def test_get_domain_records_with_multicolumn_domain( } ) for column_name in data.columns: - data = data.withColumn(column_name, data[column_name].cast(LongType())) + data = data.withColumn( + column_name, data[column_name].cast(sparktypes.LongType()) + ) expected_multicolumn_pd_df = pd.DataFrame( {"a": [2, 3, 4, 5], "b": [3, 4, 5, 7], "c": [2, 3, 4, 6]}, index=[0, 1, 2, 4] @@ -396,7 +377,9 @@ def test_get_domain_records_with_multicolumn_domain( } ) for column_name in data.columns: - data = data.withColumn(column_name, data[column_name].cast(LongType())) + data = data.withColumn( + column_name, data[column_name].cast(sparktypes.LongType()) + ) expected_multicolumn_pd_df = pd.DataFrame( {"a": [1, 2, 3, 4], "b": [2, 3, 4, 5], "c": [1, 2, 3, 4]}, index=[0, 1, 2, 3] @@ -1194,9 +1177,9 @@ def test_dataframe_property_given_loaded_batch(spark_session): @pytest.mark.integration def test_schema_properly_added(spark_session): - schema: pyspark.sql.types.StructType = StructType( + schema: sparktypes.StructType = 
sparktypes.StructType( [ - StructField("a", IntegerType(), True), + sparktypes.StructField("a", sparktypes.IntegerType(), True), ] ) engine: SparkDFExecutionEngine = build_spark_engine( diff --git a/tests/expectations/metrics/test_core.py b/tests/expectations/metrics/test_core.py index 9fc69b48e5d7..868785a62628 100644 --- a/tests/expectations/metrics/test_core.py +++ b/tests/expectations/metrics/test_core.py @@ -22,10 +22,7 @@ SqlAlchemyBatchData, SqlAlchemyExecutionEngine, ) -from great_expectations.expectations.metrics.import_manager import ( - pyspark_sql_Column, - quoted_name, -) +from great_expectations.optional_imports import pyspark_sql_Column, quoted_name from great_expectations.expectations.metrics.util import ( get_dbms_compatible_column_names, ) @@ -1233,7 +1230,7 @@ def test_column_partition_metric_spark(spark_session): Expected partition boundaries are pre-computed algorithmically and asserted to be "close" to actual metric values. """ - from great_expectations.expectations.metrics.import_manager import sparktypes + from great_expectations.optional_imports import sparktypes week_idx: int engine: SparkDFExecutionEngine = build_spark_engine( @@ -4332,7 +4329,7 @@ def test_value_counts_metric_sa(sa): @pytest.mark.integration def test_value_counts_metric_spark(spark_session): - from great_expectations.expectations.metrics.import_manager import sparktypes + from great_expectations.optional_imports import sparktypes engine: SparkDFExecutionEngine = build_spark_engine( spark=spark_session, diff --git a/tests/expectations/test_expectation_arguments.py b/tests/expectations/test_expectation_arguments.py index fb95381fe26d..476042f89d8b 100644 --- a/tests/expectations/test_expectation_arguments.py +++ b/tests/expectations/test_expectation_arguments.py @@ -18,18 +18,10 @@ UsageStatisticsHandler, ) from great_expectations.validator.validator import Validator +from great_expectations.optional_imports import pyspark_sql_DataFrame logger = logging.getLogger(__name__) -try: - from pyspark.sql import DataFrame -except ImportError: - DataFrame = None - - logger.debug( - "Unable to load pyspark; install optional spark dependency for support." 
- ) - @pytest.fixture def test_pandas_df(): @@ -41,7 +33,7 @@ def test_pandas_df(): @pytest.fixture def test_spark_df(test_pandas_df, spark_session): - df: DataFrame = spark_session.createDataFrame(data=test_pandas_df) + df: pyspark_sql_DataFrame = spark_session.createDataFrame(data=test_pandas_df) return df diff --git a/tests/integration/docusaurus/connecting_to_your_data/how_to_introspect_and_partition_your_data/sql_database/yaml_example_complete.py b/tests/integration/docusaurus/connecting_to_your_data/how_to_introspect_and_partition_your_data/sql_database/yaml_example_complete.py index 082f554f609a..7f20b09f5766 100644 --- a/tests/integration/docusaurus/connecting_to_your_data/how_to_introspect_and_partition_your_data/sql_database/yaml_example_complete.py +++ b/tests/integration/docusaurus/connecting_to_your_data/how_to_introspect_and_partition_your_data/sql_database/yaml_example_complete.py @@ -6,7 +6,7 @@ yaml = YAMLHandler() # -from great_expectations.expectations.metrics.import_manager import sa +from great_expectations.optional_imports import sqlalchemy as sa # context = gx.get_context() diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py index a586682ce5bb..497e68e9897e 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_max_to_be_between_custom.py @@ -22,7 +22,7 @@ column_aggregate_partial, column_aggregate_value, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F, sqlalchemy as sa from great_expectations.expectations.metrics.metric_provider import metric_value from great_expectations.render import RenderedStringTemplateContent from great_expectations.render.renderer.renderer import renderer diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_pair_values_to_have_a_difference_of_three.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_pair_values_to_have_a_difference_of_three.py index 3c707ccdd527..5b742fbc8f18 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_pair_values_to_have_a_difference_of_three.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_pair_values_to_have_a_difference_of_three.py @@ -14,7 +14,7 @@ ColumnPairMapExpectation, ExpectationValidationResult, ) -from great_expectations.expectations.metrics.import_manager import F, sa +from great_expectations.optional_imports import F, sqlalchemy as sa from great_expectations.expectations.metrics.map_metric_provider import ( ColumnPairMapMetricProvider, column_pair_condition_partial, diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_equal_three.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_equal_three.py index fc3a34873961..787595b38faf 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_equal_three.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_column_values_to_equal_three.py @@ -18,7 +18,7 @@ ColumnMapMetricProvider, 
column_condition_partial, ) -from great_expectations.expectations.metrics.import_manager import F +from great_expectations.optional_imports import F from great_expectations.expectations.metrics.metric_provider import metric_partial from great_expectations.render import CollapseContent, RenderedStringTemplateContent from great_expectations.render.renderer.renderer import renderer diff --git a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_multicolumn_values_to_be_multiples_of_three.py b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_multicolumn_values_to_be_multiples_of_three.py index 26eb2ef8b983..7e65c7904719 100644 --- a/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_multicolumn_values_to_be_multiples_of_three.py +++ b/tests/integration/docusaurus/expectations/creating_custom_expectations/expect_multicolumn_values_to_be_multiples_of_three.py @@ -14,7 +14,7 @@ ExpectationValidationResult, MulticolumnMapExpectation, ) -from great_expectations.expectations.metrics.import_manager import F, sa, sparktypes +from great_expectations.optional_imports import F, sqlalchemy as sa, sparktypes from great_expectations.expectations.metrics.map_metric_provider import ( MulticolumnMapMetricProvider, multicolumn_condition_partial, diff --git a/tests/integration/spark/test_spark_config.py b/tests/integration/spark/test_spark_config.py index 05a97e2f182e..88bfb04b6085 100644 --- a/tests/integration/spark/test_spark_config.py +++ b/tests/integration/spark/test_spark_config.py @@ -1,23 +1,16 @@ import logging -import os from typing import Dict, List -import pytest from packaging.version import Version, parse as parse_version +from great_expectations.optional_imports import pyspark_sql_SparkSession, pyspark + logger = logging.getLogger(__name__) try: - import pyspark - from pyspark import SparkContext - from pyspark.sql import SparkSession - from great_expectations.datasource import SparkDFDatasource from great_expectations.execution_engine import SparkDFExecutionEngine except ImportError: - pyspark = None - SparkContext = None - SparkSession = None SparkDFDatasource = None SparkDFExecutionEngine = None # TODO: review logging more detail here @@ -41,7 +34,7 @@ def test_spark_config_datasource(spark_session_v012): # "spark.driver.allowMultipleContexts": "true", # This directive does not appear to have any effect. } source: SparkDFDatasource = SparkDFDatasource(spark_config=spark_config) - spark_session: SparkSession = source.spark + spark_session: pyspark_sql_SparkSession = source.spark # noinspection PyProtectedMember sc_stopped: bool = spark_session.sparkContext._jsc.sc().isStopped() assert not sc_stopped @@ -63,7 +56,7 @@ def test_spark_config_execution_engine(spark_session): # "spark.driver.allowMultipleContexts": "true", # This directive does not appear to have any effect. 
} execution_engine = SparkDFExecutionEngine(spark_config=new_spark_config) - new_spark_session: SparkSession = execution_engine.spark + new_spark_session: pyspark_sql_SparkSession = execution_engine.spark # noinspection PyProtectedMember sc_stopped: bool = new_spark_session.sparkContext._jsc.sc().isStopped() diff --git a/tests/test_utils.py b/tests/test_utils.py index 8323890918a8..1cd714e80cad 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -30,25 +30,13 @@ ) from great_expectations.data_context.util import instantiate_class_from_config from great_expectations.execution_engine import SqlAlchemyExecutionEngine - -logger = logging.getLogger(__name__) - -from great_expectations.optional_imports import sqlalchemy_Connection - -try: - import sqlalchemy as sa - from sqlalchemy.exc import SQLAlchemyError - -except ImportError: - logger.debug( - "Unable to load SqlAlchemy context; install optional sqlalchemy dependency for support" - ) - sa = None - reflection = None - Table = None - Select = None - SQLAlchemyError = None - +from great_expectations.optional_imports import ( + SQLAlchemyError, + sqlalchemy_engine_Connection, +) +from great_expectations.optional_imports import ( + sqlalchemy as sa, +) logger = logging.getLogger(__name__) yaml_handler = YAMLHandler() @@ -158,7 +146,9 @@ def get_sqlite_temp_table_names(execution_engine): statement = sa.text("SELECT name FROM sqlite_temp_master") - if isinstance(execution_engine.engine, sqlalchemy_Connection): + if sqlalchemy_engine_Connection and isinstance( + execution_engine.engine, sqlalchemy_engine_Connection + ): connection = execution_engine.engine result = connection.execute(statement) else: @@ -173,7 +163,9 @@ def get_sqlite_table_names(execution_engine): statement = sa.text("SELECT name FROM sqlite_master") - if isinstance(execution_engine.engine, sqlalchemy_Connection): + if sqlalchemy_engine_Connection and isinstance( + execution_engine.engine, sqlalchemy_engine_Connection + ): connection = execution_engine.engine result = connection.execute(statement) else: From d7398f75d6bfe7717051125b4ece02bbab6b13b3 Mon Sep 17 00:00:00 2001 From: Rachel-Reverie <94694058+Rachel-Reverie@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:39:46 -0500 Subject: [PATCH 96/96] [DOCS] Updates the "Interactive Mode" guide for creating Expectations (#7624) Co-authored-by: Rob Gray <104205257+kwcanuck@users.noreply.github.com> --- ...context_initialize_quick_or_filesystem.mdx | 7 + ...es_of_data_from_a_configured_datasource.md | 2 +- .../_preface.mdx | 36 ++++ ...nt_feedback_from_a_sample_batch_of_data.md | 178 +++++++++++++++--- ...pectations_with_instant_feedback_fluent.py | 5 +- 5 files changed, 192 insertions(+), 36 deletions(-) create mode 100644 docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_quick_or_filesystem.mdx diff --git a/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_quick_or_filesystem.mdx b/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_quick_or_filesystem.mdx new file mode 100644 index 000000000000..f57775d8c545 --- /dev/null +++ b/docs/docusaurus/docs/components/setup/link_lists/_data_context_initialize_quick_or_filesystem.mdx @@ -0,0 +1,7 @@ +**Quickstart Data Context** +- [How to quickly instantiate a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context) + +**Filesystem Data Contexts** +- [How to initialize a new Data Context with the 
CLI](/docs/guides/setup/configuring_data_contexts/how_to_configure_a_new_data_context_with_the_cli) +- [How to initialize a filesystem Data Context in Python](/docs/guides/setup/configuring_data_contexts/initializing_data_contexts/how_to_initialize_a_filesystem_data_context_in_python) +- [How to instantiate a specific Filesystem Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_instantiate_a_specific_filesystem_data_context) \ No newline at end of file diff --git a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md index 8693562abc2d..fcbb02c22b25 100644 --- a/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md +++ b/docs/docusaurus/docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource.md @@ -1,5 +1,5 @@ --- -title: How to get one or more Batches of data from a configured Datasource +title: How to get one or more Batches from a Datasource configured with the block-config method --- import Prerequisites from '../connecting_to_your_data/components/prerequisites.jsx' import TechnicalTag from '@site/docs/term_tags/_tag.mdx'; diff --git a/docs/docusaurus/docs/guides/expectations/components_how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data/_preface.mdx b/docs/docusaurus/docs/guides/expectations/components_how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data/_preface.mdx index 5d0cad1c66d3..49858ba6feb4 100644 --- a/docs/docusaurus/docs/guides/expectations/components_how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data/_preface.mdx +++ b/docs/docusaurus/docs/guides/expectations/components_how_to_create_and_edit_expectations_with_instant_feedback_from_a_sample_batch_of_data/_preface.mdx @@ -10,3 +10,39 @@ This guide will take you through the process of creating +To Validate data we must first define a set of Expectations for that data to be Validated against. In this guide, you'll learn how to create Expectations and interactively edit them with feedback from Validating each against a Batch of data. Validating your Expectations as you define them allows you to quickly determine if the Expectations are suitable for our data, and identify where changes might be necessary. + +:::info Does this process edit my data? +No. The interactive method used to create and edit Expectations does not edit or alter the Batch data. +::: + +## Prerequisites + + + +- Great Expectations installed in a Python environment +- A Filesystem Data Context for your Expectations +- Created a Datasource from which to request a Batch of data for introspection +- A passion for data quality + + + +
    + + +### If you haven't set up Great Expectations + + + + + +
    + +
    + + +### If you haven't initialized your Data Context + + + +See one of the following guides: + + + +
    + +
    + + +### If you haven't created a Datasource + + + +See one of the following guides: + + + +
    ## Steps -### 1. Use the CLI to begin the interactive process of creating Expectations - +### 1. Import the Great Expectations module and instantiate a Data Context + +For this guide we will be working with Python code in a Jupyter Notebook. Jupyter is included with GX and lets us easily edit code and immediately see the results of our changes. + +Run the following code to import Great Expectations and instantiate a Data Context: + +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py imports and data context" +``` + +:::info Data Contexts and persisting data + +If you're using an Ephemeral Data Context, your configurations will not persist beyond the current Python session. However, if you're using a Filesystem or Cloud Data Context, they do persist. The `get_context()` method returns the first Cloud or Filesystem Data Context it can find. If a Cloud or Filesystem Data Context has not been configured or cannot be found, it provides an Ephemeral Data Context. For more information about the `get_context()` method, see [How to quickly instantiate a Data Context](/docs/guides/setup/configuring_data_contexts/instantiating_data_contexts/how_to_quickly_instantiate_a_data_context). + +::: + +### 2. Use an existing Data Asset to create a Batch Request + +Run the following code to retrieve a previously configured Data Asset from the Data Context you initialized and create a Batch Request to identify the Batch of data that you'll use to validate your Expectations: + +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py get_data_asset_and_build_batch_request" +``` + +:::info Limit the Batches returned by a Batch Request + +You can provide a dictionary as the `options` parameter of `build_batch_request()` to limit the Batches returned by a Batch Request. If you leave the `options` parameter empty, your Batch Request will include all the Batches configured in the corresponding Data Asset. For more information about Batch Requests, see [How to request data from a Data Asset](/docs/guides/connecting_to_your_data/fluent/batch_requests/how_to_request_data_from_a_data_asset). + +::: + +:::caution Batch Requests and Datasources built with the advanced block-config method + +If you are working with a Datasource that was created using the advanced block-config method, you will need to build your Batch Request differently than was demonstrated earlier. For more information, please see our guide on [How to get one or more batches from a Datasource configured with the block-config method](/docs/guides/connecting_to_your_data/how_to_get_one_or_more_batches_of_data_from_a_configured_datasource). + +::: + +### 3. Create a Validator + +We will use a Validator to interactively create our Expectations. To do this, a Validator needs two parameters: the Batch Request that identifies the data the Expectations are Validated against, and a name for the Expectation Suite that will hold the Expectations we create. + +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py get_validator_and_inspect_data" +``` + +:::info Working outside a Jupyter Notebook + +If you're using a Jupyter Notebook, the results of the code you run are displayed automatically in a new cell.
If you're using a different interpreter, you might need to explicitly print these results to view them: + +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py inspect_data_no_jupyter" +``` + +::: + +### 4. Use the Validator to create and run an Expectation + +The Validator provides access to all the available Expectations as methods. When an `expect_*()` method is run from the Validator, the Validator adds the specified Expectation to an Expectation Suite (or edits an existing Expectation in the Expectation Suite, if applicable) in its configuration, and then the specified Expectation is run against the data that was provided when the Validator was initialized with a Batch Request. + +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py interactive_validation" +``` + +Since we are working in a Jupyter Notebook, the results of the Validation are printed after we run an `expect_*()` method. We can examine those results to determine if the Expectation needs to be edited. + +:::info Working outside a Jupyter Notebook +If you are not working in a Jupyter Notebook you may need to explicitly print your results: + +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py interactive_validation_no_jupyter" +``` + +::: + +### 5. (Optional) Repeat step 4 to edit Expectations or create additional Expectations + +If you choose to edit an Expectation after you've viewed the Validation Results that were returned when it was created, you can do so by running the `validator.expect_*()` method with different parameters than you supplied previously. You can also have the Validator run an entirely different `expect_*()` method and create additional Expectations. All the Expectations that you create are stored in a list in the Validator's in-memory configuration. + +:::tip What if I want to use the same Expectation more than once? + +GX takes into account certain parameters when determining if an Expectation is being added to the list or if an existing Expectation should be edited. For example, if you are created an Expectation with a method such as `expect_column_*()` you could later edit it by providing the same `column` parameter when running the `expect_column_*()` method a second time, and different values for any other parameters. However, if you ran the same `expect_column_*()` method and provided a different `column` parameter, you will create an additional instance of the Expectation for the new `column` value, rather than overwrite the Expectation you defined with the first `column` value. -### 2. Specify a Datasource (if multiple are available) - +::: -### 3. Specify the name of your new Expectation Suite - +### 6. (Optional) Save your Expectations for future use -### 4. Continue the workflow within a Jupyter Notebook - +The Expectations you create with the interactive method are saved in an Expectation Suite on the Validator object. Validators do not persist outside the current Python session and for this reason these Expectations will not be kept unless you save them to your Data Context. This can be ideal if you are using a Validator for quick data validation and exploration, but in most cases you'll want to reuse your newly created Expectation Suite in future Python sessions. - +To keep your Expectations for future use, you save them to your Data Context. 
A Filesystem or Cloud Data Context persists outside the current Python session, so saving the Expectation Suite in your Data Context's Expectations Store ensures you can access it in the future: -## Optional alternative Interactive Mode workflows +```python name="tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py save_expectation_suite" +``` -### 1. (Optional) Edit an existing Expectation Suite in Interactive Mode - +:::caution Ephemeral Data Contexts and persistence -### 2. (Optional) Profile your data to generate Expectations, then edit them in Interactive Mode. - +Ephemeral Data Contexts don't persist beyond the current Python session. If you're working with an Ephemeral Data Context, you'll need to convert it to a Filesystem Data Context using the Data Context's `convert_to_file_context()` method. Otherwise, your saved configurations won't be available in future Python sessions as the Data Context itself is no longer available. -## Additional tips and tricks +::: -### 1. Save a Batch Request to reuse when editing an Expectation Suite in Interactive Mode - +## Next steps -### 2. Use the built-in help to review the CLI's `suite new` optional flags - +Now that you have created and saved an Expectation Suite, you can [Validate your data](/docs/guides/validation/validate_data_overview). \ No newline at end of file diff --git a/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py b/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py index 75c6e2663abb..13c4f7a0cab8 100644 --- a/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py +++ b/tests/integration/docusaurus/validation/validator/how_to_create_and_edit_expectations_with_instant_feedback_fluent.py @@ -1,9 +1,6 @@ -# +# import great_expectations as gx -# - -# context = gx.get_context() #
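For reference, the `python name="..."` fences in the guide above pull their bodies from the test module shown in this last hunk; the snippet contents themselves are not reproduced in this patch excerpt. A minimal sketch of the interactive workflow the guide describes might look like the following — the Datasource, Data Asset, column, and suite names are placeholders, and the calls reflect the fluent API of this release as best understood, not a guaranteed interface:

```python
import great_expectations as gx

# Instantiate a Data Context; get_context() returns a Cloud or Filesystem
# Data Context if one is configured, otherwise an Ephemeral Data Context.
context = gx.get_context()

# Retrieve a previously configured Data Asset and build a Batch Request.
# "my_datasource" and "my_asset" are placeholder names for this sketch.
data_asset = context.get_datasource("my_datasource").get_asset("my_asset")
batch_request = data_asset.build_batch_request()

# Create a Validator from the Batch Request and a new Expectation Suite name.
validator = context.get_validator(
    batch_request=batch_request,
    create_expectation_suite_with_name="my_expectation_suite",
)

# Creating an Expectation through the Validator immediately validates it
# against the Batch; outside Jupyter, print the result explicitly.
result = validator.expect_column_values_to_not_be_null(column="passenger_count")
print(result)

# Persist the accumulated Expectations to the Data Context's Expectations Store.
validator.save_expectation_suite(discard_failed_expectations=False)
```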