# Create Data Context

In [1]:
# Create a data context
import pandas as pd
import os
from great_expectations.data_context import FileDataContext

# Create the file-backed data context, using the current working directory as the project root.
context = FileDataContext.create(
    project_root_dir="./",
)

# Create a Datasource

In [2]:
# Register a pandas Datasource on the context. Datasource names must be unique
# within a context. The context is file-backed, so the datasource persists between
# kernel sessions; add_or_update_pandas keeps this cell re-runnable
# (Restart & Run All), whereas add_pandas fails once the name already exists.
datasource_name = 'csv-transaction-data'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the CSV file as a data asset of that datasource.
asset_name = 'credit-card-transaction'
path_to_data = "data/processed/fact_transaction.csv"
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator / checkpoint will use to fetch the data.
batch_request = asset.build_batch_request()

In [3]:
# Read the CSV directly with pandas and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=path_to_data)
df.head(n=5)

Unnamed: 0,trx_id,trx_timestamp,date_id,cc_number,merchant_id,amt,is_fraud,category,latitude,longitude
0,0b242abb623afc578575680df30655b9,2019-01-01 00:00:18,20190101,2703186189652095,1,4.97,False,misc_net,36.0788,-81.1781
1,1f76529f8574734946361c461b024d99,2019-01-01 00:00:44,20190101,630423337322,2,107.23,False,grocery_pos,48.8878,-118.2105
2,a1a22d70485983eac12b5b88dad1cf95,2019-01-01 00:00:51,20190101,38859492057661,3,220.11,False,entertainment,42.1808,-112.262
3,6b849c168bdad6f867558c3793159a81,2019-01-01 00:01:16,20190101,3534093764340240,4,45.0,False,gas_transport,46.2306,-112.1138
4,a41d7549acf90789359a9aa5346dcb46,2019-01-01 00:03:06,20190101,375534208663984,5,41.96,False,misc_pos,38.4207,-79.4629


# Create an Expectation Suite

In [4]:
# Add (or update) the expectation suite that will collect the rules defined below
expectation_suite_name = "expectation-transaction-dataset"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Pair the batch request with the suite to obtain a validator
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the rows the validator sees
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,trx_id,trx_timestamp,date_id,cc_number,merchant_id,amt,is_fraud,category,latitude,longitude
0,0b242abb623afc578575680df30655b9,2019-01-01 00:00:18,20190101,2703186189652095,1,4.97,False,misc_net,36.0788,-81.1781
1,1f76529f8574734946361c461b024d99,2019-01-01 00:00:44,20190101,630423337322,2,107.23,False,grocery_pos,48.8878,-118.2105
2,a1a22d70485983eac12b5b88dad1cf95,2019-01-01 00:00:51,20190101,38859492057661,3,220.11,False,entertainment,42.1808,-112.262
3,6b849c168bdad6f867558c3793159a81,2019-01-01 00:01:16,20190101,3534093764340240,4,45.0,False,gas_transport,46.2306,-112.1138
4,a41d7549acf90789359a9aa5346dcb46,2019-01-01 00:03:06,20190101,375534208663984,5,41.96,False,misc_pos,38.4207,-79.4629


# Expectation

In [5]:
# Expectation 1: column `trx_id` must not contain missing values

validator.expect_column_values_to_not_be_null(column="trx_id")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1296675,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Expectation 2: column `cc_number` must not contain missing values

validator.expect_column_values_to_not_be_null(column="cc_number")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1296675,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 3: every value in column `trx_id` must be unique

validator.expect_column_values_to_be_unique(column="trx_id")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1296675,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 4: column `amt` must be of type float

validator.expect_column_values_to_be_of_type(column="amt", type_="float")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Save the expectations recorded on the validator into its expectation suite.
# discard_failed_expectations=False asks the validator not to drop expectations
# that failed on the current batch.

validator.save_expectation_suite(discard_failed_expectations=False)

# Create Checkpoint

In [10]:
# Add (or update) a checkpoint named 'checkpoint_1' built from the validator above

checkpoint_1 = context.add_or_update_checkpoint(validator=validator, name="checkpoint_1")

In [11]:
# Run checkpoint_1 and keep the returned result object for later inspection

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/18 [00:00<?, ?it/s]

# Build Data Docs

In [None]:
# Build the Data Docs site for the data context created at the top of the notebook

context.build_data_docs()

# Data Validation using Another File

In [12]:
# Load the data context stored under ./gx/

import great_expectations as gx

context_jan = gx.get_context(
    context_root_dir="./gx/",
)

In [13]:
# Register a pandas Datasource on the reloaded context. Datasource names must be
# unique within a context. The context is file-backed, so the datasource persists
# between kernel sessions; add_or_update_pandas keeps this cell re-runnable
# (Restart & Run All), whereas add_pandas fails once the name already exists.
datasource_name = 'csv-transaction-feb'
datasource = context_jan.sources.add_or_update_pandas(datasource_name)

# Register the CSV file as a data asset of that datasource.
# NOTE(review): this path is the same file used in the first section, although the
# datasource/asset names say "february" — confirm whether a different file was intended.
asset_name = 'transaction-february'
path_to_data = 'data/processed/fact_transaction.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request consumed by checkpoint_2.
batch_request_feb = asset.build_batch_request()

In [14]:
# Add (or update) a checkpoint that validates the new batch request against the
# expectation suite saved earlier, then run it

checkpoint_2 = context_jan.add_or_update_checkpoint(
    expectation_suite_name="expectation-transaction-dataset",
    batch_request=batch_request_feb,
    name="checkpoint_2",
)

checkpoint_result = checkpoint_2.run()

Calculating Metrics:   0%|          | 0/18 [00:00<?, ?it/s]

In [29]:
# Build the Data Docs site from the context that owns checkpoint_2.
# Using context_jan (loaded in this section) instead of `context` keeps this
# section runnable without relying on state left behind by the first section.

context_jan.build_data_docs()

{'local_site': 'file://d:\\CODA_PHASE 2\\demo-airflow_with_spark\\gx\\uncommitted/data_docs/local_site/index.html'}