In [1]:
import great_expectations as gx
from great_expectations.checkpoint import SimpleCheckpoint

# Set up: obtain the Great Expectations context object. Every later cell
# (data source registration, suite creation, validator, data docs) goes through `context`.
context = gx.get_context()

### Standard (YAML-based) way of setting up an S3 Datasource
```
datasource_yaml = f"""
name: my_s3_datasource
class_name: Datasource
execution_engine:
    class_name: PandasExecutionEngine
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        batch_identifiers:
            - default_identifier_name
    default_inferred_data_connector_name:
        class_name: InferredAssetS3DataConnector
        bucket: demo-gp-taxi-data
        default_regex:
            pattern: (.*)\.csv
            group_names:
                - data_asset_name
"""
context.add_datasource(**yaml.load(datasource_yaml))
```

In [2]:
# Notebook configuration: values a reader is expected to tune before running.
datasource_name = "MyNewS3Datasource"  # name the data source is registered under
bucket_name = "<BUCKET_NAME>"  # placeholder: substitute the S3 bucket holding the data
boto3_options = dict()  # passed through as `boto3_options`; empty here

In [3]:

# Register the pandas-on-S3 data source with the context, or update it if a
# data source with this name already exists, so the cell can be re-run safely.
data_source = context.sources.add_or_update_pandas_s3(
    name=datasource_name,
    bucket=bucket_name,
    boto3_options=boto3_options,
)

In [10]:
# Declare a CSV asset on the S3 data source.
# `batching_regex` is a regular expression, so the dot before the extension is
# escaped (raw string) to match a literal "." rather than any character.
data_asset = data_source.add_csv_asset(
    name="MyTaxiDataAsset",
    batching_regex=r"yellow_tripdata_sample_2019-01\.csv",
)


### That's it!

Now we can use this data source for profiling, validation, and documentation.

In [12]:
# Create Expectations
exp_suite_name = "test_suite"

# Create the suite, or replace it if it already exists, so the cell is re-runnable.
# `add_or_update_expectation_suite` supersedes the deprecated
# `create_expectation_suite(..., overwrite_existing=True)`.
context.add_or_update_expectation_suite(expectation_suite_name=exp_suite_name)

# Build a validator over the asset's batch request, bound to the suite above.
# The name `v` is kept because the checkpoint cell below refers to it.
v = context.get_validator(
    batch_request=data_asset.build_batch_request(),
    expectation_suite_name=exp_suite_name,
)

# Last expression of the cell: the expectation's validation result is displayed.
v.expect_column_values_to_not_be_null("pickup_datetime")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {}
}

In [13]:
# Validate data: wrap the validator in a checkpoint and run it.
checkpoint = SimpleCheckpoint(
    name="NY-Taxi-Data",  # plain string; the original f-string had no placeholders
    data_context=context,
    validator=v,
)
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [14]:
# View results: take the first validation result identifier reported by the
# checkpoint run and open the data docs for that resource.
result_identifiers = checkpoint_result.list_validation_result_identifiers()
validation_result_identifier = result_identifiers[0]
context.open_data_docs(resource_identifier=validation_result_identifier)