### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [1]:
# write your code from here
import pandas as pd
import json

# Text tokens (matched case-insensitively) recognised when coercing a column to boolean.
_TRUE_TOKENS = frozenset({"true", "t", "yes", "y", "1"})
_FALSE_TOKENS = frozenset({"false", "f", "no", "n", "0"})


def _to_boolean(value):
    """Convert one cell to True/False, or None when it is missing or unrecognised."""
    if pd.isna(value):
        return None
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_TOKENS:
            return True
        if token in _FALSE_TOKENS:
            return False
        return None
    return bool(value)


def _report_and_drop(frame, violating, message, errors):
    """Record the rows flagged by the boolean mask and return the frame without them."""
    if not violating.any():
        return frame
    rows = frame.index[violating.to_numpy()].tolist()
    errors.append(f"{message} in rows: {rows}")
    # Drop exactly the reported rows; .copy() detaches the result so that later
    # column assignments do not raise SettingWithCopyWarning.
    return frame.loc[~violating].copy()


def _apply_data_type(frame, column, expected_type, errors):
    """Report a dtype mismatch for one column and coerce it to the expected type."""
    series = frame[column]
    actual_type = series.dtype
    if expected_type == 'numeric' and not pd.api.types.is_numeric_dtype(actual_type):
        errors.append(f"Validation Error: Column '{column}' expected to be numeric, but is '{actual_type}'.")
        # Values that cannot be parsed become NaN.
        frame[column] = pd.to_numeric(series, errors='coerce')
    elif expected_type == 'string' and not pd.api.types.is_string_dtype(actual_type):
        errors.append(f"Validation Error: Column '{column}' expected to be string, but is '{actual_type}'.")
        # Keep missing values missing instead of turning them into the text "nan",
        # so a 'not_null' constraint can still detect them.
        frame[column] = series.astype(str).mask(series.isna())
    elif expected_type == 'boolean' and not pd.api.types.is_bool_dtype(actual_type):
        errors.append(f"Validation Error: Column '{column}' expected to be boolean, but is '{actual_type}'.")
        # astype(bool) would map every non-empty string (including "False") to True.
        frame[column] = series.map(_to_boolean)
    return frame


def _apply_bound(frame, column, kind, bound, errors):
    """Drop rows whose value is below ('min') or above ('max') the given bound."""
    try:
        limit = float(bound)
    except (TypeError, ValueError):
        errors.append(f"Error: Invalid '{kind}' value in metadata for column '{column}'.")
        return frame
    # Compare on a numeric view so a non-numeric column cannot abort the whole run;
    # unparsable and missing values become NaN and are never flagged by a bound.
    numeric = pd.to_numeric(frame[column], errors='coerce')
    if kind == 'min':
        violating = numeric < limit
        description = f"below minimum ({limit})"
    else:
        violating = numeric > limit
        description = f"above maximum ({limit})"
    return _report_and_drop(
        frame, violating,
        f"Validation Error: Column '{column}' has values {description}", errors)


def _validate_against_metadata(df, metadata):
    """Apply every column constraint in the metadata; return (valid rows, error messages)."""
    valid_data = df.copy()
    validation_errors = []

    for column, constraints in metadata.get('columns', {}).items():
        if column not in valid_data.columns:
            validation_errors.append(f"Error: Column '{column}' defined in metadata not found in data.")
            continue

        for constraint_type, constraint_value in constraints.items():
            if constraint_type == 'data_type':
                valid_data = _apply_data_type(valid_data, column, constraint_value, validation_errors)
            elif constraint_type in ('min', 'max'):
                valid_data = _apply_bound(valid_data, column, constraint_type, constraint_value, validation_errors)
            elif constraint_type == 'allowed_values':
                if not isinstance(constraint_value, (list, tuple, set)):
                    validation_errors.append(
                        f"Error: Invalid 'allowed_values' value in metadata for column '{column}'.")
                    continue
                valid_data = _report_and_drop(
                    valid_data, ~valid_data[column].isin(constraint_value),
                    f"Validation Error: Column '{column}' has values not in allowed list ({constraint_value})",
                    validation_errors)
            elif constraint_type == 'not_null':
                if constraint_value is True:
                    valid_data = _report_and_drop(
                        valid_data, valid_data[column].isnull(),
                        f"Validation Error: Column '{column}' has null values",
                        validation_errors)

    return valid_data, validation_errors


def manage_data_quality_with_metadata(metadata_file, data_file):
    """
    Loads metadata and data, uses the metadata to validate data quality,
    and shows the valid data.

    The metadata is a JSON object whose 'columns' entry maps a column name to
    its constraints. Supported constraints:
        data_type      'numeric', 'string' or 'boolean'; a mismatch is reported
                       and the column is coerced (boolean coercion understands
                       true/false, yes/no, t/f, y/n and 1/0 as text).
        min / max      numeric bounds; rows outside the bound are removed.
        allowed_values list of accepted values; other rows are removed.
        not_null       when true, rows with a missing value are removed.
    Missing values are only removed by 'not_null' (or by 'allowed_values' when
    they are not listed); 'min'/'max' leave them in place.

    All results are printed; file and parsing errors are caught and printed
    rather than raised.

    Args:
        metadata_file (str): Path to the JSON file containing metadata.
        data_file (str): Path to the CSV file containing the data.

    Returns:
        None
    """
    try:
        # Step 1: Load metadata
        with open(metadata_file, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        print(f"Metadata loaded successfully from: {metadata_file}\n")
        print("Metadata:")
        print(json.dumps(metadata, indent=4))

        # Step 2: Load data
        df = pd.read_csv(data_file)
        print(f"\nData loaded successfully from: {data_file}\n")
        print("Sample of loaded data:")
        print(df.head())

        # Step 3: Use metadata to validate data quality
        valid_data, validation_errors = _validate_against_metadata(df, metadata)

        if validation_errors:
            print("\nData Quality Validation Errors:")
            for error in validation_errors:
                print(f"- {error}")
            print("\nShowing valid data after filtering based on metadata constraints.")
        else:
            print("\nData Quality Validation Passed. All data conforms to the metadata constraints.")

        # Step 4: Show valid data
        print("\nValid Data:")
        print(valid_data.head())
        print(f"\nShape of valid data: {valid_data.shape}")

    except FileNotFoundError:
        print("Error: One or both files not found.")
    except json.JSONDecodeError:
        print("Error: Invalid JSON format in metadata file.")
    except Exception as e:
        # Top-level boundary for an interactive script: report instead of raising.
        print(f"An error occurred: {e}")

# Demonstration: generate a metadata file and a CSV file, then validate the CSV.
metadata_file = 'data_quality_metadata.json'
data_file = 'data_to_validate.csv'

# Validation rules, keyed by column name.
sample_metadata = dict(
    description="Metadata for validating customer data",
    columns=dict(
        customer_id=dict(data_type="numeric", not_null=True),
        name=dict(data_type="string", not_null=True),
        age=dict(data_type="numeric", min=18, max=99),
        city=dict(data_type="string", allowed_values=["Bengaluru", "Mumbai", "Delhi"]),
        is_active=dict(data_type="boolean"),
    ),
)

# Persist the rules as indented JSON.
with open(metadata_file, 'w') as f:
    f.write(json.dumps(sample_metadata, indent=4))
print(f"Sample metadata created at: {metadata_file}")

# Six customer records; several deliberately break the rules above
# (age outside 18-99 or missing, city outside the allowed list, mixed boolean spellings).
sample_data = dict(
    customer_id=[1, 2, 3, 4, 5, 6],
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    age=[30, 15, 45, 22, 100, None],
    city=['Bengaluru', 'London', 'Delhi', 'Mumbai', 'Bengaluru', 'Pune'],
    is_active=[True, False, 'True', 'False', 1, 0],
)

# Write the records to CSV without the index column.
df_sample = pd.DataFrame(sample_data)
df_sample.to_csv(data_file, index=False)
print(f"Sample data created at: {data_file}")

# Validate the generated CSV against the generated metadata.
manage_data_quality_with_metadata(metadata_file, data_file)

Sample metadata created at: data_quality_metadata.json
Sample data created at: data_to_validate.csv
Metadata loaded successfully from: data_quality_metadata.json

Metadata:
{
    "description": "Metadata for validating customer data",
    "columns": {
        "customer_id": {
            "data_type": "numeric",
            "not_null": true
        },
        "name": {
            "data_type": "string",
            "not_null": true
        },
        "age": {
            "data_type": "numeric",
            "min": 18,
            "max": 99
        },
        "city": {
            "data_type": "string",
            "allowed_values": [
                "Bengaluru",
                "Mumbai",
                "Delhi"
            ]
        },
        "is_active": {
            "data_type": "boolean"
        }
    }
}

Data loaded successfully from: data_to_validate.csv

Sample of loaded data:
   customer_id     name    age       city is_active
0            1    Alice   30.0  Bengaluru      True