In [9]:
import pandas as pd
import great_expectations as gx

In [51]:
# Load Data
brands_df = pd.read_json("data/brands.json", lines=True)
brands_df["_id"] = brands_df['_id'].apply(lambda x: pd.json_normalize(x)['$oid'])
receipts_df = pd.read_json("data/receipts.json", lines=True)
receipts_df["_id"] = receipts_df['_id'].apply(lambda x: pd.json_normalize(x)['$oid'])
users_df = pd.read_json("data/users.json", lines=True)
users_df["_id"] = users_df['_id'].apply(lambda x: pd.json_normalize(x)['$oid'])
users_df["lastlogin"] = users_df['lastlogin'].apply(lambda x: pd.json_normalize(x)['$date'])

In [52]:
# Create data contexts
context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas")

brands_data_asset = data_source.add_dataframe_asset(name="brands_asset")
brands_batch_definition = brands_data_asset.add_batch_definition_whole_dataframe("receipts_batch_def")
brands_batch = brands_batch_definition.get_batch(batch_parameters={"dataframe": brands_df})

receipts_data_asset = data_source.add_dataframe_asset(name="receipts_asset")
receipts_batch_definition = receipts_data_asset.add_batch_definition_whole_dataframe("receipts_batch_def")
receipts_batch = receipts_batch_definition.get_batch(batch_parameters={"dataframe": receipts_df})

users_data_asset = data_source.add_dataframe_asset(name="users_asset")
users_batch_definition = users_data_asset.add_batch_definition_whole_dataframe("users_batch_def")
users_batch = users_batch_definition.get_batch(batch_parameters={"dataframe": users_df})

In [53]:
# Create expectations/DQ Checks
brand_table_expectation = gx.expectations.ExpectTableColumnsToMatchOrderedList(
    column_list=['_id', 'barcode', 'category', 'categoryCode', 'cpg', 'name', 'topBrand', 'brandCode']
)

brand_column_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="barcode", type_="NUMBER")

brand_uniqueness_expectation = gx.expectations.ExpectColumnValuesToBeUnique(column="_id")

brand_not_null_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="barcode")

In [54]:
table_validation_result = brands_batch.validate(brand_table_expectation)
print(table_validation_result["success"])
column_validation_result = brands_batch.validate(brand_column_expectation)
print(column_validation_result["success"])
uniqueness_validation_result = brands_batch.validate(brand_uniqueness_expectation)
print(uniqueness_validation_result["success"])
not_null_validation_result = brands_batch.validate(brand_not_null_expectation)
print(not_null_validation_result["success"])

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 332.70it/s]


True


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 166.53it/s]


False


Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 243.89it/s]


True


Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 98.43it/s]  


True


In [40]:
# Create receipt expectations/DQ Checks
receipt_table_expectation = gx.expectations.ExpectTableColumnsToMatchOrderedList(
    column_list=['_id', 'bonusPointsEarned', 'bonusPointsEarnedReason', 'createDate', 'dateScanned', 'finishedDate', 'modifyDate', 'pointsAwardedDate', 'pointsEarned', 'purchaseDate', 'purchasedItemCount', 'rewardsReceiptItemList', 'rewardsReceiptStatus', 'totalSpent', 'userId']
)

receipt_column_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="bonusPointsEarned", type_="NUMBER")

receipt_uniqueness_expectation = gx.expectations.ExpectColumnValuesToBeUnique(column="_id")

receipt_not_null_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="userId")

In [69]:
receipt_table_validation_result = receipts_batch.validate(receipt_table_expectation)
print(receipt_table_validation_result["success"])
receipt_column_validation_result = receipts_batch.validate(receipt_column_expectation)
print(receipt_column_validation_result["success"])
receipt_uniqueness_validation_result = receipts_batch.validate(receipt_uniqueness_expectation)
print(receipt_uniqueness_validation_result["success"])
receipt_not_null_validation_result = receipts_batch.validate(receipt_not_null_expectation)
print(receipt_not_null_validation_result["success"])

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 333.49it/s]


True


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 333.46it/s] 


False


Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 237.28it/s]


True


Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 277.98it/s] 

True





In [72]:
# Create user expectations/DQ Checks
user_table_expectation = gx.expectations.ExpectTableColumnsToMatchOrderedList(
    column_list=['_id', 'active', 'createdDate', 'lastLogin', 'role', 'signUpSource', 'state']
)

user_column_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="role", type_="STRING")

user_uniqueness_expectation = gx.expectations.ExpectColumnValuesToBeUnique(column="_id")

user_not_null_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="active")

user_unique_proportion_expectation = gx.expectations.ExpectColumnProportionOfUniqueValuesToBeBetween(
    column="state", min_value=0.5, max_value=1.0
)

In [73]:
user_table_validation_result = receipts_batch.validate(user_table_expectation)
print(user_table_validation_result["success"])
user_column_validation_result = receipts_batch.validate(user_column_expectation)
print(user_column_validation_result["success"])
user_uniqueness_validation_result = receipts_batch.validate(user_uniqueness_expectation)
print(user_uniqueness_validation_result["success"])
user_not_null_validation_result = receipts_batch.validate(user_not_null_expectation)
print(user_not_null_validation_result["success"])
user_unique_proportion_validation_result = receipts_batch.validate(user_unique_proportion_expectation)
print(user_unique_proportion_validation_result["success"])

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 164.00it/s]


False


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 200.15it/s] 


False


Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 232.58it/s]


True


Calculating Metrics:  38%|███▊      | 3/8 [00:00<00:00, 79.02it/s]  


False


Calculating Metrics:  43%|████▎     | 3/7 [00:00<00:00, 47.62it/s]  

False



