# Data Validations


In [1]:
from datetime import datetime, timezone
import pandas as pd
import great_expectations as ge
import great_expectations.jupyter_ux
from great_expectations.exceptions import DataContextError
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.data_context.types.base import DatasourceConfig
from great_expectations.data_context.types.base import FilesystemStoreBackendDefaults
from great_expectations.data_context import BaseDataContext
from pathlib import Path
from os.path import abspath

2021-10-22T17:34:33-0300 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.


  pd.set_option("display.max_colwidth", -1)


In [2]:
# Project root: the parent of the notebook's current working directory,
# rendered as a POSIX-style string.
notebook_dir = Path(abspath(''))
project_path = notebook_dir.parent.absolute().as_posix()
project_path

'/home/naranja/Develop/diplo/nerdearla'

## 1. Recreamos el DataContext

In [3]:
datasource = "my_pandas_datasource"

# Pandas datasource whose data asset type is MyCustomPandasDataset from the
# custom_expectation module; no batch kwargs generators are configured.
pandas_datasource_config = DatasourceConfig(
    class_name="PandasDatasource",
    data_asset_type={
        "module_name": "custom_expectation",
        "class_name": "MyCustomPandasDataset",
    },
    batch_kwargs_generators={},
)

# Validation operator with two actions, listed in this order:
# store_validation_result, then update_data_docs.
store_result_action = {
    "name": "store_validation_result",
    "action": {"class_name": "StoreValidationResultAction"},
}
update_docs_action = {
    "name": "update_data_docs",
    "action": {"class_name": "UpdateDataDocsAction"},
}
action_list_operator_config = {
    "class_name": "ActionListValidationOperator",
    "action_list": [store_result_action, update_docs_action],
}

# Plugins are looked up under <project_path>/plugins, and the filesystem
# store defaults are rooted at project_path.
project_config = DataContextConfig(
    config_version=2,
    plugins_directory=f"{project_path}/plugins",
    datasources={datasource: pandas_datasource_config},
    validation_operators={"action_list_operator": action_list_operator_config},
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=project_path
    ),
)

context = BaseDataContext(project_config=project_config)

## 2. Elegimos nuestra Expectation Suite

In [4]:
# Show the names of the expectation suites this context knows about.
context.list_expectation_suite_names()

['nerdearla']

In [5]:
# Name of the suite to validate against; passed to context.get_batch below.
expectation_suite_name = "nerdearla"

## 3. Cargamos la data a la cual le queremos aplicar validaciones

In [6]:
# Load the raw CSV. date, fecha_nacimiento and dni are read as str so pandas
# does not infer another dtype for them.
csv_path = f"{project_path}/data/nx_nerdearla.csv"
df = pd.read_csv(
    csv_path,
    dtype={"date": str, "fecha_nacimiento": str, "dni": str},
)

# Reuse the `datasource` name defined with the DataContext instead of
# repeating the literal, so the two cannot drift apart.
batch_kwargs = {
    "datasource": datasource,
    "dataset": df,
    # NOTE(review): plural key holding a single suite name; kept unchanged
    # because batch_kwargs may feed the batch identifier — confirm it is needed.
    "expectation_suite_names": expectation_suite_name,
}

# Wrap the DataFrame in a batch bound to the chosen expectation suite,
# then preview the first rows.
batch = context.get_batch(batch_kwargs, expectation_suite_name)
batch.head(5)

Unnamed: 0,dni,date,sexo,estado_civil,fecha_nacimiento,asset_level,education_level
0,37511093,20210625,F,Casado,1955-01-09 00:00:00,"{""name"":""Sin especificar"",id:0}","{""name"":""Terciario"",id:3}"
1,94977718,20210625,M,Casado,1951-04-23 00:00:00,"{""name"":""Sin especificar"",""id"":""0""}","{""name"":""Primario"",""id"":""1""}"
2,8627709,20210625,F,Soltero,1973-04-05 00:00:00,"{""name"":""Sin especificar"",""id"":""0""}","{""name"":""Primario"",""id"":""1""}"
3,37007709,20210625,M,Soltero,1988-09-23 00:00:00,"{""name"":""Sin especificar"",""id"":""0""}","{""name"":""Primario"",""id"":""1""}"
4,28704754,20210625,F,Soltero,1983-03-17 00:00:00,"{""name"":""Sin especificar"",""id"":""0""}","{""name"":""Terciario"",""id"":""3""}"


## 4. Validation Operators

Los `Validation Operators` son la forma de validar múltiples expectations y definir las acciones que se deben tomar después de la validación.

In [7]:
# Identify this run by a fixed name plus the current UTC timestamp.
run_id = dict(
    run_name="nx_nerdearla",
    run_time=datetime.now(timezone.utc),
)

# Run the "action_list_operator" validation operator on the single batch.
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_id=run_id,
)

2021-10-22T17:34:33-0300 - INFO - 	11 expectation(s) included in expectation_suite.


In [8]:
# Summary of the run: asset/validation counts and per-validation success percentages.
results.get_statistics()

{'data_asset_count': 1,
 'validation_result_count': 1,
 'successful_validation_count': 0,
 'unsuccessful_validation_count': 1,
 'successful_validation_percent': 0.0,
 'validation_statistics': {ValidationResultIdentifier::nerdearla/nx_nerdearla/20211022T203433.419915Z/03c4cd754fa1ed9316e6983867383c0b: {'evaluated_expectations': 11,
   'successful_expectations': 5,
   'unsuccessful_expectations': 6,
   'success_percent': 45.45454545454545}}}

## 5. Vemos el Validation Results en Data Docs

Ahora la librería crea el Data Docs. Este va a incluir un **reporte de calidad de datos** formado a partir del `ValidationResults`.

In [9]:
# Calls open_data_docs() on the context; presumably opens the rendered
# Data Docs site locally — TODO confirm.
context.open_data_docs()