In [None]:
import sys
import awswrangler as wr

# Glue catalog database and table that the quality checks run against.
DATABASE_NAME = "de_proj_database2"
TABLE_NAME = "jobdetails_jobs_with_details_parquet_tbl"

# Data quality checks: one NULL-count query per critical column.
# Every entry carries a human-readable "name" and the "sql" to execute; the
# query yields a single `res_col` value counting the rows where that column
# IS NULL.
DQ_CHECKS = [
    {
        "name": f"Check for NULL values in {column}",
        "sql": f"""
            SELECT SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS res_col
            FROM "{DATABASE_NAME}"."{TABLE_NAME}"
        """
    }
    for column in ("job_id", "title", "company_name")
]

# Run data quality checks
# Each check's query is expected to return one row whose `res_col` holds the
# number of offending rows. The first check with a positive count aborts the
# script via sys.exit with a descriptive message (non-zero exit status).
for check in DQ_CHECKS:
    print(f"Running data quality check: {check['name']}")
    # Execute the query
    df = wr.athena.read_sql_query(sql=check["sql"], database=DATABASE_NAME)

    # SUM() over zero rows yields SQL NULL, which pandas surfaces as a missing
    # value (NA/NaN). Comparing that to 0 is either ambiguous in a boolean
    # context or silently False, so normalise missing to 0 first. The value is
    # read by position (.iloc) rather than by index label, and an empty result
    # frame is treated as "no NULLs found".
    counts = df["res_col"].fillna(0)
    null_count = int(counts.iloc[0]) if not counts.empty else 0

    # Check the result
    if null_count > 0:
        print(f"Data quality check failed: {check['name']}")
        sys.exit(f"Check failed: {check['name']}. Found {null_count} NULL values.")

    print(f"Data quality check passed: {check['name']}")

print("All data quality checks passed.")