In [4]:
'''
=================================================
Milestone 3

Nama  : Reza Muhammad Rhafi
Batch : FTDS-023-HCK

This notebook belongs to the project that automates transforming and loading data from PostgreSQL into ElasticSearch; this part validates the cleaned CSV data with Great Expectations.
=================================================
'''



In [5]:
# Set up a file-backed Great Expectations data context rooted at the current directory
from great_expectations.data_context import FileDataContext

PROJECT_ROOT_DIR = './'
context = FileDataContext.create(project_root_dir=PROJECT_ROOT_DIR)

In [6]:
# Name of the Datasource; it must be unique among the Datasources of this context.
datasource_name = 'csv-data-satisfaction'
# add_or_update_pandas (instead of add_pandas) keeps this cell re-runnable: when a
# Datasource with this name is already registered in the context it is replaced
# rather than raising a duplicate-name error.
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV file as a data asset of that Datasource.
asset_name = 'airline-satisfaction'
path_to_data = 'P2M3_reza_muhammad_rhafi_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator will use to load the asset.
batch_request = asset.build_batch_request()

In [7]:
# Create (or refresh) the expectation suite that will hold the checks below
expectation_suite_name = 'expectation-airline-dataset'
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Bind the batch request to the suite through a validator
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Preview the first rows of the batch to confirm the validator loaded the data
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,row_id,satisfaction,customer_type,age,type_of_travel,class,flight_distance,seat_comfort,departure_arrival_time_convenient,food_and_drink,...,online_support,ease_of_online_booking,onboard_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes
0,0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,...,4,2,2,0,2,4,2,5,0,0.0


In [8]:
# 1. Every value in `row_id` must be unique
validator.expect_column_values_to_be_unique(column='row_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

In [9]:
# 2. `age` must lie between min_value=7 and max_value=100
validator.expect_column_values_to_be_between(
    column='age',
    min_value=7,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

In [10]:
# 3. `customer_type` may only contain the two listed categories
validator.expect_column_values_to_be_in_set(
    column='customer_type',
    value_set=['Loyal Customer', 'disloyal Customer'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

In [11]:
# 4. Values in `class` must be of one of the listed types (here: only 'str')
validator.expect_column_values_to_be_in_type_list(column='class', type_list=['str'])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

In [12]:
# 5. `satisfaction` must not contain null values
validator.expect_column_values_to_not_be_null(column='satisfaction')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true
}

In [13]:
# 6. `departure_delay_in_minutes` is checked against an upper bound only
#    (max_value=1592; no min_value is passed)
validator.expect_column_values_to_be_between(
    column='departure_delay_in_minutes',
    max_value=1592,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

In [14]:
# 7. `flight_distance` is checked against a lower bound only (min_value=0; no max_value is passed)
validator.expect_column_values_to_be_between(column='flight_distance', min_value=0)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 129487,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true
}

In [15]:
# Check the results of the expectations: run validation on the validator and
# display the returned result object as the cell output
results = validator.validate()
results

Calculating Metrics:   0%|          | 0/36 [00:00<?, ?it/s]

{
  "meta": {
    "great_expectations_version": "0.18.19",
    "expectation_suite_name": "expectation-airline-dataset",
    "run_id": {
      "run_time": "2025-01-13T16:20:35.125724+07:00",
      "run_name": null
    },
    "batch_spec": {
      "reader_method": "read_csv",
      "reader_options": {
        "filepath_or_buffer": "P2M3_reza_muhammad_rhafi_data_clean.csv"
      }
    },
    "batch_markers": {
      "ge_load_time": "20250113T092032.935118Z",
      "pandas_data_fingerprint": "bce2c8718646d2c710f15530c335d3ce"
    },
    "active_batch_definition": {
      "datasource_name": "csv-data-satisfaction",
      "data_connector_name": "fluent",
      "data_asset_name": "airline-satisfaction",
      "batch_identifiers": {}
    },
    "validation_time": "20250113T092035.125724Z",
    "checkpoint_name": null
  },
  "success": true,
  "evaluation_parameters": {},
  "statistics": {
    "evaluated_expectations": 7,
    "successful_expectations": 7,
    "unsuccessful_expectations": 0,
   