# Import all the libraries that you need

**WARNING** : USE THE CUSTOM EXPECTATIONS WITH THIS NOTEBOOK IN `plugins` folder

In [1]:
import os
import great_expectations as gx
from great_expectations.checkpoint import Checkpoint

# Custom Expectations
from expectations.expect_column_values_less_than_or_equal_three import ExpectColumnValuesLessThanOrEqualThree
import pandas as pd
from openpyxl import Workbook
from datetime import datetime
import psycopg2

# Set up GX

In [2]:
context = gx.get_context()

In [3]:
print(context)

{
  "anonymous_usage_statistics": {
    "explicit_url": false,
    "enabled": true,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "explicit_id": true,
    "data_context_id": "027b5a80-5165-4e05-a45f-e35dda73cb3c"
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_version": 3.0,
  "data_docs_sites": {},
  "datasources": {
    "pandas_datasource": {
      "module_name": "great_expectations.datasource",
      "data_connectors": {
        "runtime_data_connector": {
          "module_name": "great_expectations.datasource.data_connector",
          "batch_identifiers": [
            "id_key_0",
            "id_key_1"
          ],
          "class_name": "RuntimeDataConnector",
          "name": "runtime_data_connector"
        }
      },
      "execution_engine": {
        "class_name": "PandasExecutionEngine",
        "module_name": "great_expectations.execution_engine"
      },
      "class_name": "Datasource",
    

In [4]:
# Check List of Datasources
context.list_datasources()

[{'module_name': 'great_expectations.datasource',
  'class_name': 'Datasource',
  'data_connectors': {'runtime_data_connector': {'module_name': 'great_expectations.datasource.data_connector',
    'batch_identifiers': ['id_key_0', 'id_key_1'],
    'class_name': 'RuntimeDataConnector'}},
  'execution_engine': {'module_name': 'great_expectations.execution_engine',
   'class_name': 'PandasExecutionEngine'},
  'name': 'pandas_datasource'},
 {'module_name': 'great_expectations.datasource',
  'class_name': 'Datasource',
  'data_connectors': {'runtime_data_connector': {'module_name': 'great_expectations.datasource.data_connector',
    'batch_identifiers': ['id_key_0', 'id_key_1'],
    'class_name': 'RuntimeDataConnector'}},
  'execution_engine': {'module_name': 'great_expectations.execution_engine',
   'class_name': 'SparkDFExecutionEngine'},
  'name': 'spark_datasource'},
 {'type': 'pandas',
  'name': 'postgres_v3_customers_inner_join_orders_datasource',
  'assets': [{'name': 'postgres_v3_cus

In [24]:
# WARNING : THIS CODE FOR DELETE DATASOURCE, PLEASE USING WISELY
# context.delete_datasource("postgres_v3c_credit_card_clients_datasource")

## Connect to your data

**DESCRIPTION**:
1. **ID**: ID of each client
2. **LIMIT_BAL**: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
3. **SEX**: Gender (1=male, 2=female)
4. **EDUCATION**: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
5. **MARRIAGE**: Marital status (1=married, 2=single, 3=others)
6. **AGE**: Age in years
7. **PAY_0**: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
8. **PAY_2**: Repayment status in August, 2005 (scale same as above)

In [5]:
# Run the following Python code to create a Pandas Data Source:
pd_datasource = context.sources.add_pandas(name="postgres_v3c_credit_card_clients_datasource")

# Connect to database
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="postgres",
    user="postgres",
    password="KantorAHP123!")

# Run SQL
sql_query = pd.read_sql("SELECT id, limit_bal, sex, education, marriage, age, pay_0, pay_2 FROM uci_credit_cards;", con=conn)

# Convert to dataframe
df = pd.DataFrame(data=sql_query, columns=['id', 'limit_bal', 'sex', 'education', 'marriage', 'age', 'pay_0', 'pay_2'])

# Set Data Asset name
data_asset_name = "postgres_v3c_credit_card_clients_dataframe"

# Set Data Asset
data_asset = pd_datasource.add_dataframe_asset(name=data_asset_name)

# Set Batch Request
batch_request = data_asset.build_batch_request(dataframe=df)

# Create Expectations

You'll use a **Validator** to interact with your batch of data and generate an **Expectation Suite**.

Every time you evaluate an Expectation with `validator.expect_*`, it is immediately Validated against your data. This instant feedback helps you identify unexpected data and removes the guesswork from data exploration. The Expectation configuration is stored in the Validator. When you are finished running the Expectations on the dataset, you can use `validator.save_expectation_suite()` to save all of your Expectation configurations into an Expectation Suite for later use in a checkpoint.

In [6]:
# Run the following command to create the suite and get a Validator
expectation_suite_name = "postgres_v3c_credit_card_clients_suite"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

print(validator.head())

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

   id  limit_bal  sex  education  marriage  age  pay_0  pay_2
0   1      20000    2          2         1   24      2      2
1   2     120000    2          2         2   26     -1      2
2   3      90000    2          2         2   34      0      0
3   4      50000    2          2         1   37      0      0
4   5      50000    1          2         1   57     -1      0


In [7]:
# Run the following command to use the Validator to add a few Expectations:
# I want to see pay_0 (Repayment status in September, 2005) to be -2 up to 3

validator_pay_0 = validator.expect_column_values_less_than_or_equal_three(
    column="pay_0",
    result_format={
        "result_format" : "COMPLETE",
        "unexpected_index_column_names" : ['id', 'limit_bal', 'sex', 'education', 'marriage', 'age'], 
        "return_unexpected_index_query" : True,
    }, 
    catch_exception=True,
)

print(validator_pay_0)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 30000,
    "unexpected_count": 29537,
    "unexpected_percent": 98.45666666666666,
    "partial_unexpected_list": [
      2,
      -1,
      0,
      0,
      -1,
      0,
      0,
      0,
      0,
      -2,
      0,
      -1,
      -1,
      1,
      0,
      1,
      0,
      0,
      1,
      1
    ],
    "unexpected_index_column_names": [
      "id",
      "limit_bal",
      "sex",
      "education",
      "marriage",
      "age"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 98.45666666666666,
    "unexpected_percent_nonmissing": 98.45666666666666,
    "partial_unexpected_counts": [
      {
        "value": 0,
        "count": 14737
      },
      {
        "value": -1,
        "count": 5686
      },
      {
        "value": 1,
        "count": 3688
      },
      {
        "value": -2,
        "count": 2759
      },
      {
        "value": 2,
        "count": 2667
      }
    ],

In [8]:
unexpected_index_query_variable_pay_0 = validator_pay_0.get_metric(
    "expect_column_values_less_than_or_equal_three.result.unexpected_index_query",
    column="pay_0"
)

print(unexpected_index_query_variable_pay_0)

print(type(unexpected_index_query_variable_pay_0))

df.filter(items=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220

In [9]:
unexpected_index_query_variable_pay_0_result = pd.eval(unexpected_index_query_variable_pay_0, target=df)
unexpected_index_query_variable_pay_0_result

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2
0,1,20000,2,2,1,24,2,2
1,2,120000,2,2,2,26,-1,2
2,3,90000,2,2,2,34,0,0
3,4,50000,2,2,1,37,0,0
4,5,50000,1,2,1,57,-1,0
...,...,...,...,...,...,...,...,...
29994,29995,80000,1,2,2,34,2,2
29995,29996,220000,1,3,1,39,0,0
29996,29997,150000,1,3,2,43,-1,-1
29998,29999,80000,1,3,1,41,1,-1


In [11]:
# Directory to save excel file
path = os.getcwd()+"/file_result"

# Date now
date_now = datetime.now().strftime("%Y-%m-%d")

# Unexpected Index Query from Unexpected Index Query (Great Expectations)
unexpected_index_query = unexpected_index_query_variable_pay_0_result

# Get data and convert to dataframe
df = unexpected_index_query

# Save dataframe to excel file
df.to_excel(f'{path}/unexpected_index_query_variable_pay_0_result_{date_now}.xlsx', sheet_name='Sheet1', index=False)

print(f'unexpected_index_query_variable_pay_0_result_{date_now}.xlsx created')

unexpected_index_query_variable_pay_0_result_2023-12-19.xlsx created


In [12]:
# Run the following command to get your Expectation Suite
print(validator.get_expectation_suite(discard_failed_expectations=False))

{
  "expectation_suite_name": "postgres_v3c_credit_card_clients_suite",
  "expectations": [
    {
      "kwargs": {
        "column": "pay_0",
        "catch_exception": true
      },
      "meta": {},
      "expectation_type": "expect_column_values_less_than_or_equal_three"
    }
  ],
  "meta": {
    "great_expectations_version": "0.18.4"
  },
  "data_asset_type": null,
  "ge_cloud_id": null
}


In [13]:
# Run the following command to save your Expectation Suite (all the unique Expectation Configurations from each run of validator.expect_*) to your Expectation Store:
validator.save_expectation_suite("postgres_v3c_credit_card_clients_great_expectations.json", discard_failed_expectations=False)

# Validate your data

You'll create and store a *Checkpoint* for your batch, which you can use to validate and run post-validation actions.

In [14]:
# Run the following command to create the Checkpoint configuration that uses your Data Context:
my_checkpoint_name = "postgres_v3c_credit_card_clients_checkpoint"

checkpoint = Checkpoint(
    name=my_checkpoint_name,
    run_name_template="%Y%m%d-%H%M%S-postgres_v3c_credit_card_clients_checkpoint",
    data_context=context,
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
    action_list=[
        {
            "name": "store_validation_result",
            "action": {"class_name": "StoreValidationResultAction"},
        },
        {
            "name": "store_evaluation_params",
            "action": {"class_name": "StoreEvaluationParametersAction"},
        },
        {
            "name": "update_data_docs", 
            "action": {"class_name": "UpdateDataDocsAction"}
        },
    ],
)

The `store_validation_result` action saves your validation results from the Checkpoint run and allows the results to be persisted for future use. The `store_evaluation_params` store evaluation parameters from a validation result. The `update_data_docs` action builds Data Docs files for the validations run in the Checkpoint.

In [15]:
# Run the following command to save the Checkpoint:
context.add_or_update_checkpoint(checkpoint=checkpoint)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction"
      }
    }
  ],
  "batch_request": {
    "datasource_name": "postgres_v3c_credit_card_clients_datasource",
    "data_asset_name": "postgres_v3c_credit_card_clients_dataframe",
    "options": {}
  },
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "expectation_suite_name": "postgres_v3c_credit_card_clients_suite",
  "module_name": "great_expectations.checkpoint",
  "name": "postgres_v3c_credit_card_clients_checkpoint",
  "profilers": [],
  "run_name_template": "%Y%m%d-%H%M%S-postgres_v3c_credit_card_clients_checkpoint",
  "runtime_configuration": {},
  "valida

In [16]:
# Run the following command to run the Checkpoint and pass in your Batch Request (your data) and your Expectation Suite (your tests):
checkpoint_result = checkpoint.run()

Calculating Metrics: 0it [00:00, ?it/s]

In [17]:
# View the full Checkpoint configuration
print(checkpoint.get_config().to_yaml_str())

name: postgres_v3c_credit_card_clients_checkpoint
config_version: 1.0
template_name:
module_name: great_expectations.checkpoint
class_name: Checkpoint
run_name_template: '%Y%m%d-%H%M%S-postgres_v3c_credit_card_clients_checkpoint'
expectation_suite_name: postgres_v3c_credit_card_clients_suite
batch_request:
  datasource_name: postgres_v3c_credit_card_clients_datasource
  data_asset_name: postgres_v3c_credit_card_clients_dataframe
  options: {}
  batch_slice:
action_list:
  - name: store_validation_result
    action:
      class_name: StoreValidationResultAction
  - name: store_evaluation_params
    action:
      class_name: StoreEvaluationParametersAction
  - name: update_data_docs
    action:
      class_name: UpdateDataDocsAction
evaluation_parameters: {}
runtime_configuration: {}
validations: []
profilers: []
ge_cloud_id:
expectation_suite_ge_cloud_id:



# Build and view Data Docs

Your Checkpoint contained an `UpdateDataDocsAction`, so your Data Docs have already been built from the validation you ran and your Data Docs store contains a new rendered validation result.

In [23]:
# Run the following command to open your Data Docs and review the results of your Checkpoint run:
context.open_data_docs()