In [1]:
import great_expectations as ge
import pandas as pd

In [2]:
# Load the cleaned dataset.
# The path is relative to the project root (the folder that contains this
# notebook), so the notebook is not tied to one user's home directory.
# Adjust it if the kernel is started from a different working directory.
file_path = "dags/P2M3_RizkySP_data_clean.csv"
df = pd.read_csv(file_path)

# Wrap the DataFrame so the Great Expectations `expect_*` methods can be called on it.
gdf = ge.from_pandas(df)


In [3]:
# 1. Expectation: values must be unique.
# Build a new composite key column (used when no existing column is unique)
# from store_id, product_id and date.
df['unique_id'] = df['store_id'].astype(str) + '_' + df['product_id'].astype(str) + '_' + df['date'].astype(str)
# `gdf` was created from `df` before this column existed. Add the column to
# `gdf` explicitly instead of relying on the two objects sharing storage,
# which is not guaranteed across pandas versions.
gdf['unique_id'] = df['unique_id']
gdf.expect_column_values_to_be_unique("unique_id")

{
  "success": true,
  "result": {
    "element_count": 73100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [4]:
# 2. Expectation: every value in `price` must fall between min_value and max_value.
gdf.expect_column_values_to_be_between(
    column="price",
    min_value=1,
    max_value=10000,
)

{
  "success": true,
  "result": {
    "element_count": 73100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# 3. Expectation: every value in `region` must belong to the allowed set.
gdf.expect_column_values_to_be_in_set(
    column="region",
    value_set=["North", "South", "East", "West"],
)

{
  "success": true,
  "result": {
    "element_count": 73100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# 4. Expectation: values must be of a given type.
# Checks `units_sold` against the single type name "int64".
gdf.expect_column_values_to_be_of_type(
    column="units_sold",
    type_="int64",
)

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# 5. Expectation: column values must match a regex pattern (e.g. a fixed SKU format).
# `product_id` must be the letter "P" followed by exactly four digits.
# The pattern is a raw string so that `\d` reaches the regex engine as written
# and does not trigger Python's invalid-escape-sequence warning.
gdf.expect_column_values_to_match_regex("product_id", r"^P\d{4}$")

{
  "success": true,
  "result": {
    "element_count": 73100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# 6. Expectation: `category` must not contain null values.
gdf.expect_column_values_to_not_be_null(column="category")

{
  "success": true,
  "result": {
    "element_count": 73100,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# 7. Expectation: the length of each `category` value must lie within a range
# (e.g. validating the length of category names).
gdf.expect_column_value_lengths_to_be_between(
    column="category",
    min_value=3,
    max_value=50,
)

{
  "success": true,
  "result": {
    "element_count": 73100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Report the location of this validation notebook.
# Nothing is written to disk here: the cell only formats and prints a message.
output_file = "/Users/rizkystiawanp/Documents/Hacktiv8/H8-P2/MILESTONE_P2/p2-ftds024-hck-m3-rizkystiawanp/P2M3_RizkySP_GX.ipynb"
print(
    f"Notebook validasi Great Expectations telah dibuat: {output_file}"
)

Notebook validasi Great Expectations telah dibuat: /Users/rizkystiawanp/Documents/Hacktiv8/H8-P2/MILESTONE_P2/p2-ftds024-hck-m3-rizkystiawanp/P2M3_RizkySP_GX.ipynb
