In [51]:
import great_expectations as gx
import pandas as pd
from datetime import datetime
from great_expectations.checkpoint import Checkpoint
from great_expectations.data_context.types.base import DataContextConfig
from datetime import datetime
import logging
import shutil

In [5]:
def create_validator(path, context):
    """Load the CSV at ``path`` and register the housing expectations on it.

    Args:
        path: CSV location, handed to ``context.sources.pandas_default.read_csv``.
        context: Object exposing ``sources.pandas_default.read_csv``
            (the notebook passes the result of ``gx.get_context()``).

    Returns:
        The object returned by ``read_csv``, after every expectation below has
        been declared on it and its expectation suite has been saved.
    """
    validator = context.sources.pandas_default.read_csv(path)

    # The room count is the only column with a range check on top of not-null.
    validator.expect_column_values_to_not_be_null('TotRmsAbvGrd')
    validator.expect_column_values_to_be_between('TotRmsAbvGrd', min_value=1, max_value=30)

    # Every remaining column only has to be populated.
    not_null_columns = (
        'WoodDeckSF',
        'YrSold',
        '1stFlrSF',
        'Foundation_BrkTil',
        'Foundation_CBlock',
        'Foundation_PConc',
        'Foundation_Slab',
        'Foundation_Stone',
        'Foundation_Wood',
        'KitchenQual_Ex',
        'KitchenQual_Fa',
        'KitchenQual_Gd',
        'KitchenQual_TA',
    )
    for column_name in not_null_columns:
        validator.expect_column_values_to_not_be_null(column_name)

    # Failed expectations are kept in the saved suite on purpose.
    validator.save_expectation_suite(discard_failed_expectations=False)
    return validator

def validate(path, df):
    """Run the ``housing_checkpoint`` checkpoint against the CSV at ``path``.

    Args:
        path: CSV location; forwarded to ``create_validator``.
        df: Not read by this function. The parameter is kept so existing
            ``validate(path, df)`` calls keep working.

    Returns:
        Whatever ``checkpoint.run`` returns for this run.
    """
    context = gx.get_context()
    validator = create_validator(path, context)

    checkpoint = context.add_or_update_checkpoint(name='housing_checkpoint', validator=validator)
    # "Id" is requested as the index column because filter_expectations_result
    # reads item['Id'] from the unexpected-index entries of each failed result.
    result_format = {
        "result_format": "COMPLETE",
        "unexpected_index_column_names": ["Id"],
    }
    return checkpoint.run(result_format=result_format)

In [90]:
def _archive_validation_report(site_url, output_dir):
    """Move the rendered validation page behind ``site_url`` into ``output_dir``."""
    # The URL carries '#' percent-encoded as '%23'; the file on disk has a literal '#'.
    site_path = site_url.replace("%23", "#")
    # Strip the scheme only when it is really there, rather than dropping
    # the first seven characters unconditionally.
    if site_path.startswith('file://'):
        site_path = site_path[len('file://'):]
    now_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
    print(site_path)
    destination = f'{output_dir}/{now_str}_validation_result.html'
    shutil.move(site_path, destination)
    return destination


def filter_expectations_result(result_json, output_dir='/mnt/c/dev/epita/dsp/dsp23-project/validations'):
    """Archive the data-docs page of a checkpoint run and summarise its failures.

    Args:
        result_json: Checkpoint result as a plain dict (the notebook passes
            ``checkpoint_result.to_json_dict()``). Only the first entry of
            ``run_results`` is inspected. ``statistics['success_percent']`` is
            rounded to two decimals in place when present.
        output_dir: Existing directory that receives the timestamped HTML
            report. Defaults to the location the notebook originally hard-coded.

    Returns:
        ``(failed_rows, desc)`` where ``failed_rows`` is the de-duplicated list
        of ``Id`` values reported by failed expectations, in first-seen order,
        and ``desc`` is a one-line message listing those rows, the failing
        columns and the failing expectation types. ``_FILENAME_`` is left in
        the message as a literal placeholder.

    Raises:
        KeyError: if the run has no ``update_data_docs`` action result or an
            index entry lacks the ``Id`` key.
    """
    run_results = result_json['run_results']
    val_result_id = list(run_results)[0]
    first_run = run_results[val_result_id]
    val_results = first_run['validation_result']

    _archive_validation_report(
        first_run['actions_results']['update_data_docs']['local_site'],
        output_dir,
    )

    statistics = val_results['statistics']
    if statistics.get('success_percent') is not None:
        statistics['success_percent'] = round(statistics['success_percent'], 2)

    # Dicts serve as insertion-ordered sets so the output is deterministic.
    failed_rows = {}
    failed_cols = {}
    failed_conf = {}

    for res in val_results['results']:
        # Same filter as the original `== False`: only explicit failures count.
        if res['success'] is not False:
            continue

        config = res['expectation_config']
        # Table-level expectations carry no 'column' kwarg.
        column = config['kwargs'].get('column')
        expectation = config['expectation_type']

        # Prefer the full index list; the "partial" list is only a truncated
        # sample and would silently miss failing rows. An expectation that
        # errored out may have no index list at all.
        details = res.get('result') or {}
        index_entries = (
            details.get('unexpected_index_list')
            or details.get('partial_unexpected_index_list')
            or []
        )
        row_ids = [entry['Id'] for entry in index_entries]

        failed_rows.update(dict.fromkeys(row_ids))
        failed_conf[expectation] = None
        if column is not None:
            failed_cols[column] = None

        print(
            f'column -> {column}\n'
            f'    expectation_config -> {expectation}\n'
            f'    rows with errors -> {row_ids}\n'
        )

    # Joining reprs gives the same text as str(list) without its brackets, and
    # does not strip brackets that belong to a column name.
    rows_txt = ', '.join(map(repr, failed_rows))
    cols_txt = ', '.join(map(repr, failed_cols))
    conf_txt = ', '.join(map(repr, failed_conf))

    desc = f'Validation Error on Rows {rows_txt} of file _FILENAME_ !'
    desc += f'Columns {cols_txt} failed on the following expectations : {conf_txt}'
    return list(failed_rows), desc


In [70]:
# Sample batch used by the cells below.
path = '../data/folder_A/file_1.csv'

# Read the CSV, then keep every column except the first one.
df = pd.read_csv(path)
df = df.loc[:, df.columns[1:]]


In [71]:
# validate() re-reads the file from `path`; `df` is passed along but not used by it.
result = validate(path, df)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/78 [00:00<?, ?it/s]

In [72]:
# filter_expectations_result returns (failed_rows, desc) in that order;
# unpacking it the other way round leaves the message string in `failed_rows`.
failed_rows, log = filter_expectations_result(result.to_json_dict())
print(failed_rows)
print(log)

/tmp/tmpmc0kkr0y/validations/default/__none__/20240110T175620.098142Z/default_pandas_datasource-#ephemeral_pandas_asset.html
column -> TotRmsAbvGrd
    expectation_config -> expect_column_values_to_be_between
    rows with errors -> [1001, 693]

column -> KitchenQual_Fa
    expectation_config -> expect_column_values_to_not_be_null
    rows with errors -> [477, 578]

column -> KitchenQual_TA
    expectation_config -> expect_column_values_to_not_be_null
    rows with errors -> [952]

[578, 1001, 693, 952, 477]
Validation Error on Rows 578, 1001, 693, 952, 477 of file _FILENAME_ !Columns 'TotRmsAbvGrd', 'KitchenQual_Fa', 'KitchenQual_TA' failed on the following expectations : 'expect_column_values_to_be_between', 'expect_column_values_to_not_be_null'


In [None]:
# Keep only the rows whose 'Id' does not appear in `failed_rows`.
filtered_df = df.loc[~df['Id'].isin(failed_rows)]

rows = [*filtered_df['Id'].values]
rows

In [None]:
path = '../data/folder_A/file_error_example.csv'
context = gx.get_context()
df = pd.read_csv(path)
result = validate(path, df)
# The helper defined in this notebook is filter_expectations_result, and it
# returns (failed_rows, desc) in that order.
failed_rows, log = filter_expectations_result(result.to_json_dict())
print(log)
print(failed_rows)