In [1]:
import great_expectations as gx
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest
from ruamel import yaml
import pyspark

import os

import pyarrow.fs as fs
from deltalake import DeltaTable
import pandas as pd

## Get Dataframe

In [2]:
os.environ 
## Should see AWS_ENDPOINT_URL, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY environment varibles.
# These environment variables are set in the docker-compose.yml, and the service account used by PySpark
#> to read from and write to Minio are created by the minio-init container defined in docker-compose.yml

environ{'PATH': '/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin',
        'HOSTNAME': 'a610c2cb8b68',
        'AWS_ACCESS_KEY_ID': 'jupyteraccesskey',
        'AWS_SECRET_ACCESS_KEY': 'jupytersupersecretkey',
        'AWS_ENDPOINT_URL': 'http://minio:9000',
        'S3_BUCKET': 'test',
        'LANG': 'C.UTF-8',
        'GPG_KEY': 'A035C8C19219BA821ECEA86B64E628F8D684696D',
        'PYTHON_VERSION': '3.11.6',
        'PYTHON_PIP_VERSION': '23.2.1',
        'PYTHON_SETUPTOOLS_VERSION': '65.5.1',
        'PYTHON_GET_PIP_URL': 'https://github.com/pypa/get-pip/raw/9af82b715db434abb94a0a6f3569f43e72157346/public/get-pip.py',
        'PYTHON_GET_PIP_SHA256': '45a2bb8bf2bb5eff16fdd00faef6f29731831c7c59bd9fc2bf1f3bed511ff1fe',
        'HOME': '/root',
        'JPY_SESSION_NAME': '/notebooks/great_expectations_pandas_example.ipynb',
        'JPY_PARENT_PID': '1',
        'PYDEVD_USE_FRAME_EVAL': 'NO',
        'TERM': 'xterm-color',
        'CLICOLOR': '1',
        '

In [3]:
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
S3_BUCKET = os.environ.get("S3_BUCKET")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
S3_ENDPOINT = os.environ.get("AWS_ENDPOINT_URL")

In [4]:
table_uri = "s3://test/appl_stock_delta_table"
s3_options = {
    "access_key_id": S3_ACCESS_KEY, "secret_access_key":S3_SECRET_KEY, "aws_endpoint_url":S3_ENDPOINT, "AWS_STORAGE_ALLOW_HTTP":"true", "region":"us-east-1"
}
dt = DeltaTable(table_uri, storage_options=s3_options)

In [5]:
## Pandas will use the python libs fsspec and s3fs to fetch data in S3 (MINIO)
### Uses environment variables AWS_* for authentication (see env vars above)
df = pd.read_csv("s3://test/appl_stock.csv")

In [4]:
## Create a Month Column that we'll use with Great Expectations below
df["Month"] = df["Date"].str[5:7].astype(int)

## Prepare Great Expectations Context and Connector

In [5]:
context = gx.get_context()

In [6]:
datasource_name = "delta_lake"

config = f"""
name: {datasource_name}
class_name: Datasource
module_name: great_expectations.datasource
execution_engine:
  module_name: great_expectations.execution_engine
  class_name: PandasExecutionEngine
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        batch_identifiers:
            - default_identifier_name
"""

In [7]:
context.test_yaml_config(config)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	default_runtime_data_connector_name:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x7f8659c04bd0>

In [8]:
context.add_datasource(**yaml.load(config))

<great_expectations.datasource.new_datasource.Datasource at 0x7f8659c08250>

In [9]:
batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="APPL_TABLE",  # this is the name of the table you want to retrieve
    batch_identifiers={"default_identifier_name":"my_batch"},
    runtime_parameters={"batch_data":df}
)


In [10]:
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)

In [11]:
validator.expect_column_to_exist("Close")

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Date,Open,High,Low,Close,Volume,AdjClose,Month
0,2010-01-04,213.429998,214.499996,212.380001,214.009998,123432400,27.727039,1
1,2010-01-05,214.599998,215.589994,213.249994,214.379993,150476200,27.774976,1
2,2010-01-06,214.379993,215.23,210.750004,210.969995,138040000,27.333178,1
3,2010-01-07,211.75,212.000006,209.050005,210.58,119282800,27.28265,1
4,2010-01-08,210.299994,212.000006,209.060005,211.980005,111902700,27.464034,1


In [13]:
validator.expect_column_distinct_values_to_be_in_set("Month", list(range(1,13)))

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      1,
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      9,
      10,
      11,
      12
    ],
    "details": {
      "value_counts": [
        {
          "value": 1,
          "count": 140
        },
        {
          "value": 2,
          "count": 135
        },
        {
          "value": 3,
          "count": 153
        },
        {
          "value": 4,
          "count": 146
        },
        {
          "value": 5,
          "count": 147
        },
        {
          "value": 6,
          "count": 150
        },
        {
          "value": 7,
          "count": 148
        },
        {
          "value": 8,
          "count": 155
        },
        {
          "value": 9,
          "count": 144
        },
        {
          "value": 10,
          "count": 152
        },
        {
          "value": 11,
          "count": 143
        },
        {
          "value": 12,
          "count": 149