# I. Import

In [40]:
# Import libraries

import pandas as pd
import great_expectations as ge
from great_expectations.data_context import FileDataContext

# II. Create Data Context

In [41]:
# Create a file-backed data context rooted at the current directory

context = FileDataContext.create(project_root_dir='./')

# III. Connect to Datasource

In [42]:
# Name of the Datasource. It must be unique among the Datasources of this context.
datasource_name = 'csv-data-ray'
# Use add_or_update_* rather than add_*: the file-backed context persists the
# Datasource, so a plain add_pandas() call fails with a duplicate-name error
# when this cell is re-run (e.g. after Restart Kernel -> Run All).
datasource = context.sources.add_or_update_pandas(datasource_name)

# Name of the data asset and location of the cleaned CSV file
asset_name = 'ray-data-clean'
path_to_data = './data/P2M3_Dionisius_Ray_data_clean.csv'  # Make sure this file exists in the project directory
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used to create the validator
batch_request = asset.build_batch_request()

# IV. Expectations Suite

In [43]:
# Create (or update) the expectation suite that collects the checks below
expectation_suite_name = 'expectation-trip-dataset'
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Tie the batch request to the suite through a validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows to confirm the batch was loaded
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method,unique_transaction_key
0,Walmart,1128299,2021-06-17,Southeast,Florida,Orlando,Women's Apparel,103.0,218,2245,1257,Online,1128299_2021-06-17_Women's Apparel_Online_Orla...
1,West Gear,1128299,2021-07-16,South,Louisiana,New Orleans,Women's Apparel,103.0,163,1679,806,Online,1128299_2021-07-16_Women's Apparel_Online_New ...
2,Sports Direct,1197831,2021-08-25,South,Alabama,Birmingham,Men's Street Footwear,10.0,700,7000,3150,Outlet,1197831_2021-08-25_Men's Street Footwear_Outle...
3,Sports Direct,1197831,2021-08-27,South,Alabama,Birmingham,Women's Street Footwear,15.0,575,8625,3881,Outlet,1197831_2021-08-27_Women's Street Footwear_Out...
4,Sports Direct,1197831,2021-08-21,South,Alabama,Birmingham,Women's Street Footwear,15.0,475,7125,3206,Outlet,1197831_2021-08-21_Women's Street Footwear_Out...


## A. Expectations

### A.1. Unique

In [44]:
# Expectation 1: every value in the column 'unique_transaction_key' must be unique
# ('retailer_id' alone repeats across rows, as the preview above shows)

validator.expect_column_values_to_be_unique('unique_transaction_key')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9639,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.2. Between Min Value and Max Value

In [45]:
# Expectation 2: every value in the column 'price_per_unit' must be between 5 and 200

validator.expect_column_values_to_be_between(column='price_per_unit', min_value=5, max_value=200)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9639,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.3. Be in a Set

In [46]:
# Expectation 3: every value in the column 'state' must be one of the US state names listed below

us_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 
    'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 
    'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 
    'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
]

validator.expect_column_values_to_be_in_set('state', us_states)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9639,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.4. Be in a Type List

In [47]:
# Expectation 4: the column `operating_profit` must be stored as an integer or a float.
# 'int64' is listed explicitly because, per the original author's note, the column
# was converted with pd.to_numeric() before this notebook (not visible here).

validator.expect_column_values_to_be_in_type_list('operating_profit', ['int', 'int64', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.5. Match Date Format (strftime)

In [48]:
# Expectation 5: every value in the column `invoice_date` must follow the YYYY-MM-DD format ('%Y-%m-%d')

validator.expect_column_values_to_match_strftime_format('invoice_date', '%Y-%m-%d')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9639,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.6. Column 'total_sales' Greater Than or Equal To 'operating_profit'

In [49]:
# Expectation 6: column `total_sales` must be greater than or equal to column `operating_profit`

validator.expect_column_pair_values_A_to_be_greater_than_B(
    column_A='total_sales',
    column_B='operating_profit',
    or_equal=True,  # allow equality, not only strictly greater
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9639,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.7. Column 'retailer' Character Length Between 5 and 100

In [50]:
# Expectation 7: every value in the column `retailer` must be between 5 and 100 characters long

validator.expect_column_value_lengths_to_be_between('retailer', min_value=5, max_value=100)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 9639,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### A.8. Save Expectations into Expectation Suite

In [51]:
# Save the validator's expectations into the expectation suite.
# discard_failed_expectations=False keeps expectations even if they did not pass.

validator.save_expectation_suite(discard_failed_expectations=False)

## B. Checkpoints

In [52]:
# Register (or update) a checkpoint named 'checkpoint_1' that is bound to the validator

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [53]:
# Run the checkpoint and keep its result object for later inspection

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/45 [00:00<?, ?it/s]

## C. Create Data Docs

In [54]:
# Build the Data Docs site; the call returns a mapping of site name to the index.html URL

context.build_data_docs()

{'local_site': 'file://c:\\Users\\user\\Documents\\GitHub\\p2-ftds025-hck-m3-ray-dion\\gx\\uncommitted/data_docs/local_site/index.html'}