In [1]:
import great_expectations as gx
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest
from ruamel import yaml
import pyspark

import os

import pyarrow.fs as fs
from deltalake import DeltaTable

## Get Dataframe

In [2]:
os.environ 
## Should see S3_ENDPOINT, S3_ACCESS_KEY, and S3_SECRET_KEY environment varibles.
# These environment variables are set in the docker-compose.yml, and the service account used by PySpark
#> to read from and write to Minio are created by the minio-init container defined in docker-compose.yml

environ{'PATH': '/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
        'HOSTNAME': '2dfc3a6dd965',
        'S3_ENDPOINT': 'http://minio:9000',
        'S3_BUCKET': 'test',
        'S3_ACCESS_KEY': 'jupyteraccesskey',
        'S3_SECRET_KEY': 'jupytersupersecretkey',
        'LANG': 'C.UTF-8',
        'GPG_KEY': 'A035C8C19219BA821ECEA86B64E628F8D684696D',
        'PYTHON_VERSION': '3.11.0',
        'PYTHON_PIP_VERSION': '22.3',
        'PYTHON_SETUPTOOLS_VERSION': '65.5.0',
        'PYTHON_GET_PIP_URL': 'https://github.com/pypa/get-pip/raw/66030fa03382b4914d4c4d0896961a0bdeeeb274/public/get-pip.py',
        'PYTHON_GET_PIP_SHA256': '1e501cf004eac1b7eb1f97266d28f995ae835d30250bec7f8850562703067dc6',
        'HOME': '/root',
        'PYDEVD_USE_FRAME_EVAL': 'NO',
        'JPY_SESSION_NAME': '/notebooks/great_expectations_delta_pandas_example.ipynb.ipynb',
        'JPY_PARENT_PID': '1',
        'TERM': 'xterm-color',
        'CLICOLOR': '1',
        'FORCE_C

In [3]:
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY")
S3_BUCKET = os.environ.get("S3_BUCKET")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")

In [4]:
table_uri = "s3://test/appl_stock_delta_table"
s3_options = {
    "access_key_id": S3_ACCESS_KEY, "secret_access_key":S3_SECRET_KEY, "aws_endpoint_url":S3_ENDPOINT, "AWS_ALLOW_HTTP":"true", "region":"us-east-1"
}
dt = DeltaTable(table_uri, storage_options=s3_options)

In [5]:
df = dt.to_pandas()

## Prepare Great Expectations Context and Connector

In [6]:
context = gx.get_context()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/22 01:41:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
datasource_name = "delta_lake"

config = f"""
name: {datasource_name}
class_name: Datasource
module_name: great_expectations.datasource
execution_engine:
  module_name: great_expectations.execution_engine
  class_name: PandasExecutionEngine
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        batch_identifiers:
            - default_identifier_name
"""

In [8]:
context.test_yaml_config(config)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	default_runtime_data_connector_name:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x7f7b19fab710>

In [9]:
context.add_datasource(**yaml.load(config))

<great_expectations.datasource.new_datasource.Datasource at 0x7f7b19f8c2d0>

In [10]:
batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="APPL_TABLE",  # this is the name of the table you want to retrieve
    batch_identifiers={"default_identifier_name":"my_batch"},
    runtime_parameters={"batch_data":df}
)


In [11]:
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)

In [12]:
validator.expect_column_to_exist("Close")

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "meta": {},
  "result": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj_Close,Month,Year
0,2010-06-01,259.690002,265.939999,258.959999,260.830002,219118200,33.793018,6,2010
1,2010-06-02,264.539993,264.799999,260.32999,263.949993,172137000,34.197243,6,2010
2,2010-06-03,265.180008,265.550003,260.409992,263.119987,162526700,34.089707,6,2010
3,2010-06-04,258.209995,261.900013,254.629993,255.960011,189576100,33.162064,6,2010
4,2010-06-07,258.289997,259.14999,250.550007,250.940002,221735500,32.511674,6,2010


In [14]:
validator.expect_column_distinct_values_to_be_in_set("Month", list(range(1,13)))

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "meta": {},
  "result": {
    "observed_value": [
      1,
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      9,
      10,
      11,
      12
    ],
    "details": {
      "value_counts": [
        {
          "value": 1,
          "count": 121
        },
        {
          "value": 2,
          "count": 135
        },
        {
          "value": 3,
          "count": 153
        },
        {
          "value": 4,
          "count": 146
        },
        {
          "value": 5,
          "count": 147
        },
        {
          "value": 6,
          "count": 150
        },
        {
          "value": 7,
          "count": 148
        },
        {
          "value": 8,
          "count": 155
        },
        {
          "value": 9,
          "count": 144
        },
        {
          "value": 10,
          "count": 152
        },
        {
          "value": 11,
          "count": 143
        },
        {
          "value": 12,
        