In [None]:
# Activity 4: Data Quality Automation Tools

# Task A: Using Great Expectations

# 19. Setting Up Expectations:
# - Install Great Expectations and set up a basic expectation suite.
# - Validate a dataset and list unmet expectations.






# 20. Testing Expectations:
# - Create expectations such as “column values must fall within a certain range.”






# 21. Generating Data Docs:
# - Automatically generate data quality documentation.








In [None]:
# Task B: Using DQ Labs

# 22. Tool Setup and Configuration:
# - Download and configure DQ Labs on your local environment.
# - Create a new data quality project.








# 23. Data Analysis Automation:
# - Apply DQ Labs for automating data profiling and quality checks.







# 24. Quality Rule Creation:
# - Create quality rules for detecting and handling duplicates or enforcing standards.










In [3]:
# Task A: Simulating Great Expectations with Python

import pandas as pd

def check_expectation_not_null(df, column):
    """Expect that ``column`` contains no null values.

    Args:
        df: DataFrame being validated.
        column: Name of the column to inspect.

    Returns:
        dict with ``success`` (True when no null was found), ``details``
        (message carrying the null count), ``expectation`` and ``column``.
    """
    missing_rows = df.loc[df[column].isna()]
    return {
        "success": len(missing_rows) == 0,
        "details": f"Number of null values: {len(missing_rows)}",
        "expectation": "column_values_not_be_null",
        "column": column,
    }

def check_expectation_in_set(df, column, value_set):
    """Expect that every value in ``column`` belongs to ``value_set``.

    Args:
        df: DataFrame being validated.
        column: Name of the column to inspect.
        value_set: Collection of permitted values.

    Returns:
        dict with ``success`` (True when nothing fell outside the set),
        ``details`` (the distinct offending values), ``expectation``,
        ``column`` and the ``value_set`` that was applied.
    """
    is_member = df[column].isin(value_set)
    # Every row not reported as a member by isin() counts as invalid.
    offenders = df.loc[~is_member, column]
    return {
        "success": offenders.empty,
        "details": f"Invalid values: {offenders.unique().tolist()}",
        "expectation": "column_values_in_set",
        "column": column,
        "value_set": value_set,
    }

def check_expectation_between(df, column, min_value, max_value):
    """Expect that every value in ``column`` lies in [min_value, max_value].

    Args:
        df: DataFrame being validated.
        column: Name of the column to inspect.
        min_value: Inclusive lower bound.
        max_value: Inclusive upper bound.

    Returns:
        dict with ``success`` (True when no value is below ``min_value`` or
        above ``max_value``), ``details`` (the distinct out-of-range values),
        ``expectation``, ``column``, ``min_value`` and ``max_value``.
    """
    values = df[column]
    too_low = values.lt(min_value)
    too_high = values.gt(max_value)
    offenders = values[too_low | too_high]
    return {
        "success": offenders.empty,
        "details": f"Out of range values: {offenders.unique().tolist()}",
        "expectation": "column_values_between",
        "column": column,
        "min_value": min_value,
        "max_value": max_value,
    }

def _evaluate_expectation(df, expectation_type, kwargs):
    """Run the single check named by ``expectation_type`` and return its result dict."""
    if expectation_type == "column_values_not_be_null":
        return check_expectation_not_null(df, **kwargs)
    if expectation_type == "column_values_in_set":
        return check_expectation_in_set(df, **kwargs)
    if expectation_type == "column_values_between":
        return check_expectation_between(df, **kwargs)
    # Unrecognised types are reported as failures rather than raising.
    return {"success": False, "details": f"Unknown expectation type: {expectation_type}", "expectation": expectation_type}


def validate_data(df, expectations):
    """Run every expectation in ``expectations`` against ``df``.

    Args:
        df: DataFrame being validated.
        expectations: Iterable of dicts, each with a ``'type'`` key naming the
            check and a ``'kwargs'`` key holding the keyword arguments that
            are forwarded to that check.

    Returns:
        list of result dicts, one per expectation, in input order. An
        unrecognised ``'type'`` yields a failed result instead of an error.
    """
    return [
        _evaluate_expectation(df, spec['type'], spec['kwargs'])
        for spec in expectations
    ]

def list_unmet_expectations(validation_results):
    """Print a summary of every validation result whose ``success`` is falsy.

    Args:
        validation_results: Iterable of result dicts carrying at least
            ``success``, ``expectation`` and ``details``; ``column`` is
            optional and shown as ``N/A`` when absent.

    Returns:
        None. The summary is written to stdout.
    """
    failures = [entry for entry in validation_results if not entry['success']]
    if not failures:
        print("\n--- All Expectations Met ---")
        return

    print("\n--- Unmet Expectations ---")
    for entry in failures:
        column = entry.get('column', 'N/A')
        print(f"Expectation: {entry['expectation']} - Column: {column} - Details: {entry['details']}")

def generate_data_docs_simple(validation_results, output_file="data_quality_report.txt"):
    """Write a plain-text data quality report and announce where it was saved.

    Args:
        validation_results: Iterable of result dicts carrying at least
            ``success``, ``expectation`` and ``details``; ``column`` is
            optional and written as ``N/A`` when absent.
        output_file: Path of the report to create; an existing file is
            overwritten.

    Returns:
        None. The report is written to ``output_file`` and a confirmation
        is printed to stdout.
    """
    separator = "-" * 30 + "\n"
    # Explicit UTF-8: ``details`` embeds raw data values, which may contain
    # non-ASCII characters that a platform-default encoding cannot represent.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("--- Data Quality Report ---\n\n")
        for result in validation_results:
            f.write(f"Expectation: {result['expectation']}\n")
            f.write(f"Column: {result.get('column', 'N/A')}\n")
            f.write(f"Success: {result['success']}\n")
            f.write(f"Details: {result['details']}\n")
            f.write(separator)
    print("\n--- Simple Data Docs Generated ---")
    print(f"Report saved to: {output_file}")

# Task B: Simulating DQ Labs with Python

def analyze_data_profile(df):
    """Build and print a basic per-column profile of ``df``.

    For every column the profile records the dtype, non-null and null
    counts, number of distinct values, min/max (numeric columns only), and
    the most frequent value together with its frequency.

    Args:
        df: DataFrame to profile.

    Returns:
        dict mapping each column name to a dict with keys ``data_type``,
        ``non_null_count``, ``null_count``, ``unique_count``, ``min``,
        ``max``, ``top_value`` and ``top_value_count``. ``min``/``max`` are
        None for non-numeric columns; ``top_value`` is None and
        ``top_value_count`` is 0 when the column has no non-null values.
    """
    profile = {}
    for col in df.columns:
        series = df[col]
        is_numeric = pd.api.types.is_numeric_dtype(series)
        modes = series.mode()
        frequencies = series.value_counts()
        profile[col] = {
            "data_type": series.dtype,
            "non_null_count": series.count(),
            "null_count": series.isnull().sum(),
            "unique_count": series.nunique(),
            "min": series.min() if is_numeric else None,
            "max": series.max() if is_numeric else None,
            # mode()/value_counts() skip nulls, so an all-null column yields
            # empty results even though the column itself is not empty;
            # guard on the results to avoid an IndexError from .iloc[0].
            "top_value": modes.iloc[0] if not modes.empty else None,
            "top_value_count": frequencies.iloc[0] if not frequencies.empty else 0
        }
    print("\n--- Data Profile ---")
    for col, stats in profile.items():
        print(f"Column: {col}")
        for key, value in stats.items():
            print(f"  {key}: {value}")
        print("-" * 20)
    return profile

def check_duplicates(df, columns=None):
    """Check ``df`` for duplicate rows, optionally comparing only ``columns``.

    Args:
        df: DataFrame to inspect.
        columns: Column labels to compare. A falsy value (``None`` or an
            empty list) means rows are compared across all columns.

    Returns:
        dict with ``success`` (True when no duplicates exist), ``details``
        (message with the number of rows involved in duplication), ``rule``,
        ``columns`` (as passed in) and ``duplicate_count``.
        ``duplicate_count`` is the number of rows that belong to a duplicated
        group, i.e. the same figure reported in ``details``.
    """
    # subset=None makes pandas compare every column, so one code path
    # serves both the "selected columns" and "all columns" cases.
    subset = columns if columns else None
    # keep=False marks every member of a duplicated group, not just the repeats.
    duplicates = df[df.duplicated(subset=subset, keep=False)]
    duplicate_count = len(duplicates)
    if columns:
        details = f"Number of duplicate rows (considering {columns}): {duplicate_count}"
    else:
        details = f"Number of duplicate rows (considering all columns): {duplicate_count}"
    return {
        "success": duplicate_count == 0,
        "details": details,
        "rule": "check_duplicates",
        "columns": columns,
        "duplicate_count": duplicate_count,
    }

def enforce_value_set(df, column, allowed_values):
    """Flag rows whose ``column`` value is not one of ``allowed_values``.

    Args:
        df: DataFrame to inspect.
        column: Name of the column the rule applies to.
        allowed_values: Collection of permitted values.

    Returns:
        dict with ``success`` (True when there are no violations),
        ``details`` (the distinct violating values), ``rule``, ``column``,
        ``allowed_values`` and ``violation_count`` (number of violating rows).
    """
    is_allowed = df[column].isin(allowed_values)
    offenders = df.loc[~is_allowed, column]
    return {
        "success": offenders.empty,
        "details": f"Violating values: {offenders.unique().tolist()}",
        "rule": "enforce_value_set",
        "column": column,
        "allowed_values": allowed_values,
        "violation_count": len(offenders),
    }

# Example usage: a small frame seeded with deliberate quality problems
# (a null in col1, an unexpected category in col2, an out-of-range col3 value).
data = {
    'col1': [1, 2, 3, 4, 5, None],
    'col2': ['A', 'B', 'C', 'A', 'E', 'A'],
    'col3': [10, 20, 30, 60, 50, 15],
    'email': ['test@example.com', 'invalid', 'another@domain.net', 'test@example.com', None, 'third@example.com'],
}
df = pd.DataFrame(data)

# Task A: run the simulated Great Expectations suite, list failures, write the report.
expectations_suite = [
    {"type": "column_values_not_be_null", "kwargs": {"column": "col1"}},
    {"type": "column_values_in_set", "kwargs": {"column": "col2", "value_set": ["A", "B", "C", "D"]}},
    {"type": "column_values_between", "kwargs": {"column": "col3", "min_value": 5, "max_value": 55}},
]

validation_results = validate_data(df, expectations_suite)
list_unmet_expectations(validation_results)
generate_data_docs_simple(validation_results)

# Task B: simulated DQ Labs profiling followed by two quality rules.
analyze_data_profile(df)

duplicate_check_result = check_duplicates(df, columns=['col1', 'col2'])
print("\n--- Duplicate Check ---")
print(
    f"Rule: {duplicate_check_result['rule']}",
    f"Columns: {duplicate_check_result.get('columns', 'All')}",
    f"Success: {duplicate_check_result['success']}",
    f"Details: {duplicate_check_result['details']}",
    sep="\n",
)

value_set_enforcement_result = enforce_value_set(df, 'col2', ['A', 'B', 'C', 'D'])
print("\n--- Value Set Enforcement ---")
print(
    f"Rule: {value_set_enforcement_result['rule']}",
    f"Column: {value_set_enforcement_result['column']}",
    f"Success: {value_set_enforcement_result['success']}",
    f"Details: {value_set_enforcement_result['details']}",
    sep="\n",
)


--- Unmet Expectations ---
Expectation: column_values_not_be_null - Column: col1 - Details: Number of null values: 1
Expectation: column_values_in_set - Column: col2 - Details: Invalid values: ['E']
Expectation: column_values_between - Column: col3 - Details: Out of range values: [60]

--- Simple Data Docs Generated ---
Report saved to: data_quality_report.txt

--- Data Profile ---
Column: col1
  data_type: float64
  non_null_count: 5
  null_count: 1
  unique_count: 5
  min: 1.0
  max: 5.0
  top_value: 1.0
  top_value_count: 1
--------------------
Column: col2
  data_type: object
  non_null_count: 6
  null_count: 0
  unique_count: 4
  min: None
  max: None
  top_value: A
  top_value_count: 3
--------------------
Column: col3
  data_type: int64
  non_null_count: 6
  null_count: 0
  unique_count: 6
  min: 10
  max: 60
  top_value: 10
  top_value_count: 1
--------------------
Column: email
  data_type: object
  non_null_count: 5
  null_count: 1
  unique_count: 4
  min: None
  max: None
 