In [1]:
import great_expectations as gx

In [2]:
import pandas as pd

In [3]:
# Read the heart-attack CSV; the path is relative to the notebook's working directory.
heart_attack_csv_path = "../data/heart_attack_dataset.csv"
heart_attack_df = pd.read_csv(heart_attack_csv_path)

In [17]:
# Preview the first rows of the loaded DataFrame
heart_attack_df.head()

Unnamed: 0,Age,Gender,Cholesterol,BloodPressure,HeartRate,BMI,Smoker,Diabetes,Hypertension,FamilyHistory,...,ExerciseInducedAngina,Slope,NumberOfMajorVessels,Thalassemia,PreviousHeartAttack,StrokeHistory,Residence,EmploymentStatus,MaritalStatus,Outcome
0,31,Male,194,162,71,22.9,0,1,0,0,...,Yes,Downsloping,1,Normal,0,0,Suburban,Retired,Single,No Heart Attack
1,69,Male,208,148,93,33.9,1,1,0,0,...,Yes,Upsloping,2,Normal,0,0,Suburban,Unemployed,Married,No Heart Attack
2,34,Female,132,161,94,34.0,0,0,1,1,...,Yes,Upsloping,0,Normal,1,0,Rural,Retired,Single,Heart Attack
3,53,Male,268,134,91,35.0,0,1,1,0,...,Yes,Flat,0,Reversible defect,1,0,Suburban,Retired,Widowed,No Heart Attack
4,57,Female,203,140,75,30.1,0,1,0,0,...,Yes,Flat,0,Fixed defect,1,0,Rural,Retired,Married,Heart Attack


In [4]:
# Summarize the DataFrame: column names, non-null counts, and dtypes
heart_attack_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372974 entries, 0 to 372973
Data columns (total 32 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Age                    372974 non-null  int64  
 1   Gender                 372974 non-null  object 
 2   Cholesterol            372974 non-null  int64  
 3   BloodPressure          372974 non-null  int64  
 4   HeartRate              372974 non-null  int64  
 5   BMI                    372974 non-null  float64
 6   Smoker                 372974 non-null  int64  
 7   Diabetes               372974 non-null  int64  
 8   Hypertension           372974 non-null  int64  
 9   FamilyHistory          372974 non-null  int64  
 10  PhysicalActivity       372974 non-null  int64  
 11  AlcoholConsumption     372974 non-null  int64  
 12  Diet                   372974 non-null  object 
 13  StressLevel            372974 non-null  int64  
 14  Ethnicity              372974 non-nu

In [9]:
# Get an ephemeral (in-memory) Data Context.
# mode="ephemeral" is requested explicitly so this cell does not depend on the
# environment: without a mode, get_context() may return a different context
# type (for example when a project configuration or Cloud credentials are
# discoverable), which would make the assertion below fail.
context = gx.get_context(mode="ephemeral")

# Sanity check on the class name of the returned context.
assert type(context).__name__ == "EphemeralDataContext"

In [10]:
# Register a pandas data source on the context, then attach a dataframe
# asset to that data source.
DATA_SOURCE_NAME = "heart_attack_data"
DATA_ASSET_NAME = "heart_attack_asset"

data_source = context.data_sources.add_pandas(name=DATA_SOURCE_NAME)
data_asset = data_source.add_dataframe_asset(name=DATA_ASSET_NAME)

In [11]:
# Create a whole-dataframe batch definition on the data asset.
batch_definition_name = "heart_attack_batch"
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)

# The returned definition should carry the name it was registered under.
assert batch_definition_name == batch_definition.name


In [13]:
# Retrieve a batch: the DataFrame to validate is supplied at this point
# through the "dataframe" batch parameter.
batch_parameters = dict(dataframe=heart_attack_df)
batch = batch_definition.get_batch(batch_parameters=batch_parameters)


In [14]:
# Display the retrieved batch object
batch

<great_expectations.datasource.fluent.interfaces.Batch at 0x7f0d24388190>

Creating an Expectation Suite and defining Expectations

In [18]:
# Build the Expectation Suite.
expectation_suite_name = "heart_attack_suite"
suite = gx.ExpectationSuite(name=expectation_suite_name)

# Expectation 1: HeartRate must never be null.
heart_rate_not_null = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="HeartRate",
)

# Expectation 2: Age must lie between 20 and 45.
# NOTE(review): the validation output in this notebook reports this expectation
# as failing (the previewed rows include ages above 45) - confirm the 20-45
# range is intentional.
age_within_range = gx.expectations.ExpectColumnValuesToBeBetween(
    column="Age",
    min_value=20,
    max_value=45,
)

suite.add_expectation(heart_rate_not_null)
suite.add_expectation(age_within_range)

# Register the suite with the Data Context; as the last expression of the
# cell, the returned value is displayed as the cell output.
context.suites.add(suite)

{
  "name": "heart_attack_suite",
  "id": "1fd4d9dc-08cd-4165-809b-0a8593e5fbbf",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "HeartRate"
      },
      "meta": {},
      "id": "99632662-3ba1-41d4-935f-0f8ebf586e92"
    },
    {
      "type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "Age",
        "min_value": 20.0,
        "max_value": 45.0
      },
      "meta": {},
      "id": "7f727bfc-3418-44f9-8482-4e9574ec5593"
    }
  ],
  "meta": {
    "great_expectations_version": "1.3.6"
  },
  "notes": null
}

Validating the data against the Expectation Suite

In [19]:
# Validate the batch against the expectation suite
validation_results = batch.validate(suite)
# Print the validation results so each expectation's outcome can be inspected
print(validation_results)

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "batch_id": "heart_attack_data-heart_attack_asset",
          "column": "HeartRate"
        },
        "meta": {},
        "id": "99632662-3ba1-41d4-935f-0f8ebf586e92"
      },
      "result": {
        "element_count": 372974,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "heart_attack_data-heart_attack_asset",
          "column": "Age",
    