In [1]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing for Expectation:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.








In [2]:
import pandas as pd
import re
from dateutil import parser

# Sample DataFrame (replace with your actual data)
data = {
    'date_column': ['2023/10/26', '26-Nov-22', '12.05.2024', 'Oct 26, 2023', '20221126', '01/05/2023', '05/01/2024'],
    'age': [30, -5, 45, 0, 22, 100, -1],
    'email': ['test@example.com', 'invalid_email', 'another@domain.net', 'TEST@EXAMPLE.COM', 'Valid.Email@sub.domain.co.uk'],
    'phone': ['123-456-7890', '123.456.7890', '1234567890', '(123) 456-7890', '123 456 7890', '+1-555-1212', '44 20 7946 0991'],
    'text_column': ['  mixed Case  ', 'ALL CAPS', 'lowercase', 'Proper Case', 'another STRING']
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n" + "="*50 + "\n")

# 13. Date Format Standardization
def standardize_date(date_str):
    formats_to_try = ['%Y/%m/%d', '%d-%b-%y', '%d.%m.%Y', '%b %d, %Y', '%Y%m%d']
    for fmt in formats_to_try:
        try:
            return pd.to_datetime(date_str, format=fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    try:
        return parser.parse(date_str).strftime('%Y-%m-%d') # Try dateutil for more flexibility
    except (ValueError, TypeError):
        return None

df['date_standardized'] = df['date_column'].apply(standardize_date)
print("13. Date Format Standardization:")
print(df[['date_column', 'date_standardized']])
print("\n" + "="*50 + "\n")

# 14. Numeric Constraints Enforcement (age > 0)
df_cleaned_age = df[df['age'] > 0].copy() # Create a copy to avoid SettingWithCopyWarning
df['age_valid'] = df['age'] > 0
print("14. Numeric Constraints Enforcement (age > 0):")
print("Cleaned DataFrame (age > 0):\n", df_cleaned_age[['age']])
print("\nOriginal DataFrame with 'age_valid' flag:\n", df[['age', 'age_valid']])
print("\n" + "="*50 + "\n")

# 15. String Format Checks (basic email format)
def is_valid_email(email):
    pattern = r"[^@]+@[^@]+\.[^@]+"
    return bool(re.match(pattern, str(email))) # Ensure email is treated as string

df['email_valid'] = df['email'].apply(is_valid_email)
print("15. String Format Checks (basic email format):")
print(df[['email', 'email_valid']])
print("\n" + "="*50 + "\n")

# 16. Standardizing Date Formats (handling inconsistencies - using the same function as 13)
print("16. Standardizing Date Formats (handling inconsistencies):")
print(df[['date_column', 'date_standardized']]) # Already done in step 13
print("\n" + "="*50 + "\n")

# 17. Pattern Matching for Consistency (phone numbers to (123) 456-7890)
def standardize_phone(phone_number):
    digits_only = re.sub(r'\D', '', str(phone_number)) # Ensure phone_number is a string
    if len(digits_only) == 10:
        return f"({digits_only[:3]}) {digits_only[3:6]}-{digits_only[6:]}"
    elif digits_only.startswith('1') and len(digits_only) == 11: # Handle +1 or 1 prefix
        return f"({digits_only[1:4]}) {digits_only[4:7]}-{digits_only[7:]}"
    elif digits_only.startswith('44') and len(digits_only) == 11: # Basic UK mobile
        return f"0{digits_only[2:]}" # Example UK format
    elif digits_only.startswith('44') and len(digits_only) == 10: # Basic UK landline
        return f"0{digits_only[2:]}"
    else:
        return None

df['phone_standardized'] = df['phone'].apply(standardize_phone)
print("17. Pattern Matching for Consistency (phone numbers to (123) 456-7890):")
print(df[['phone', 'phone_standardized']])
print("\n" + "="*50 + "\n")

# 18. Handling Mixed Case Text (all uppercase)
df['text_uppercase'] = df['text_column'].str.upper().str.strip()
print("18. Handling Mixed Case Text (all uppercase):")
print(df[['text_column', 'text_uppercase']])

ValueError: All arrays must be of the same length

In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.










In [None]:
pip install pandas great_expectations

In [None]:
import great_expectations as ge
import os

# 1. Create a Data Context.  This is the main configuration object for Great Expectations.
context = ge.data_context.DataContext.create(project_dir_path=os.getcwd())

# 2.  Create a datasource (e.g., a connection to a CSV file or database).
datasource_name = "my_pandas_datasource"
pandas_datasource = context.add_datasource(
    datasource_name,
    class_name="Datasource",
    module_name="great_expectations.datasource",
    execution_engine={
        "class_name": "PandasExecutionEngine",
        "module_name": "great_expectations.execution_engine",
    },
    # Define the data you'll be working with.  This is a configuration, not the actual data.
    data_connectors={
        "default_inferred_data_connector_name": {  # You can name this connector
            "class_name": "InferredAssetsFileDataConnector",
            "base_directory": "./data",  # The directory where your data files are located (e.g., CSVs)
            "default_regex": r"(.*)\.csv",  # A regex to find CSV files.  Adapt to your file naming.
            "group_names": ["data_asset_name"],  # How to group the matched files.
        },
    },
)

# 3. Create a Data Asset (e.g., a specific CSV file)
asset_name = "my_data_asset"  #  Choose a name for your data asset
batch_request = {
    "datasource_name": datasource_name,
    "data_connector_name": "default_inferred_data_connector_name",
    "data_asset_name": "my_data.csv",  # The name of your CSV file (e.g., "my_data.csv")
    "batch_spec_passthrough": {
        "reader_method": "read_csv",  #  The pandas reader method
        "reader_kwargs": {"delimiter": ","}, # Add any reader args
    },
}

# 4. Create an expectation suite.  This is where you'll define your data quality rules.
expectation_suite_name = "my_expectation_suite"
suite = context.create_expectation_suite(
    expectation_suite_name=expectation_suite_name, overwrite_existing=True
)

print(f"Great Expectations Context Initialized and Expectation Suite '{expectation_suite_name}' created.")
