In [None]:
import great_expectations as gx
from great_expectations.core.batch import BatchRequest, RuntimeBatchRequest
from ruamel import yaml
import pyspark

import os

import pyarrow.fs as fs
from deltalake import DeltaTable
import pandas as pd

## Get Dataframe

In [None]:
## Confirm that the AWS_ENDPOINT_URL, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY environment variables are set.
# Only the presence of each variable is reported: displaying all of os.environ would
# write the secret key (and every other variable) into the saved notebook output.
# These environment variables are set in the docker-compose.yml, and the service account used by PySpark
# to read from and write to Minio is created by the minio-init container defined in docker-compose.yml
required_env_vars = ("AWS_ENDPOINT_URL", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
env_var_status = {name: name in os.environ for name in required_env_vars}
env_var_status

In [None]:
# S3 (Minio) connection settings, read from the environment.
# Any variable that is not set comes back as None.
S3_ENDPOINT = os.getenv("AWS_ENDPOINT_URL")
S3_BUCKET = os.getenv("S3_BUCKET")

# Credentials
S3_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
S3_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [None]:
## Load the stock CSV from S3 (MINIO) into a DataFrame.
### pandas relies on the fsspec and s3fs libraries for s3:// URLs, and authentication
### comes from the AWS_* environment variables (see env vars above).
csv_uri = "s3://test/appl_stock.csv"
df = pd.read_csv(csv_uri)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
## Create a Month column (integer 1-12) that we'll use with Great Expectations below.
# Parse the dates rather than slicing characters 5:7 of the string: positional slicing
# silently yields wrong values for any non-ISO date layout and fails outright if the
# "Date" column is already a datetime dtype.
df["Month"] = pd.to_datetime(df["Date"]).dt.month

## Prepare Great Expectations Context and Connector

In [None]:
# Obtain the Great Expectations context object; the datasource, expectation suite,
# and validator in the following cells are all created through it.
context = gx.get_context()

In [None]:
# Name under which the datasource is registered; it is interpolated into the YAML
# below and reused when building the batch request.
# NOTE(review): the name says "delta_lake" but the config uses PandasExecutionEngine
# and the data comes from a CSV - confirm the name is intentional.
datasource_name = "delta_lake"

# YAML definition of a Datasource backed by the pandas execution engine, with a single
# RuntimeDataConnector so that an in-memory DataFrame can be passed in as batch data.
# The batch identifier key declared here ("default_identifier_name") must match the
# key supplied in the batch request's batch_identifiers.
config = f"""
name: {datasource_name}
class_name: Datasource
module_name: great_expectations.datasource
execution_engine:
  module_name: great_expectations.execution_engine
  class_name: PandasExecutionEngine
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        batch_identifiers:
            - default_identifier_name
"""

In [None]:
# Check the YAML datasource config with Great Expectations before registering it.
context.test_yaml_config(config)

In [None]:
# Parse the YAML config into keyword arguments and register the datasource.
# The module-level `yaml.load()` without a Loader is deprecated in ruamel.yaml and has
# been removed in recent releases; the YAML class with typ="safe" is the supported way
# to parse plain configuration data and returns ordinary dicts/lists.
datasource_config = yaml.YAML(typ="safe").load(config)
context.add_datasource(**datasource_config)

In [None]:
# Describe the batch to validate: the in-memory DataFrame `df`, routed through the
# runtime data connector of the datasource configured above.
runtime_parameters = {"batch_data": df}
batch_identifiers = {"default_identifier_name": "my_batch"}

batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="APPL_TABLE",  # name assigned to the data asset for this batch
    runtime_parameters=runtime_parameters,
    batch_identifiers=batch_identifiers,
)


In [None]:
# Show which expectation suites the context already knows about.
# (Replaces a leftover tab-completion fragment, `context.expe`, which raised
# AttributeError when the notebook was run top to bottom.)
context.list_expectation_suite_names()

In [None]:
# Name of the expectation suite that the validator records expectations into.
expectation_suite_name = "test_suite"

# Create the suite only when it does not exist yet, so that re-running this cell
# (or the whole notebook against a persistent context) does not fail because the
# suite was already added.
if expectation_suite_name not in context.list_expectation_suite_names():
    context.add_expectation_suite(expectation_suite_name=expectation_suite_name)

# Bind the batch (the DataFrame described by `batch_request`) to the suite.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

In [None]:
# The batch must contain a "Close" column.
validator.expect_column_to_exist(column="Close")

In [None]:
# Preview the first rows of the batch attached to the validator.
validator.head()

In [None]:
# Every distinct value in the "Month" column must be a calendar month number (1-12).
valid_months = list(range(1, 13))
validator.expect_column_distinct_values_to_be_in_set(
    column="Month",
    value_set=valid_months,
)