### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [None]:
# write your code from here

In [1]:
import pandas as pd
import io
import json

# Task: Metadata Management for Data Quality
# Description: This script demonstrates how to store and use metadata to manage
#              data quality in a pipeline using Python and Pandas.

# Step 1: Load metadata
# The schema describes, per column, the declared type, whether nulls are allowed,
# and optional numeric bounds / allowed values. It is defined inline as a dictionary
# here to simulate loading it from a file.
# NOTE: the "quality_checks" entries are human-readable descriptions only; the
# validation below derives its rules from "schema" and never evaluates these strings.
metadata = {
    "schema": {
        "product_id": {"type": "int", "nullable": False},
        "product_name": {"type": "str", "nullable": False},
        "price": {"type": "float", "nullable": False, "min_value": 0.01, "max_value": 1000.00},
        "stock_quantity": {"type": "int", "nullable": False, "min_value": 0},
        "category": {"type": "str", "nullable": True, "allowed_values": ["Electronics", "Books", "Clothing", "Home Goods"]}
    },
    "quality_checks": {
        "price_range": "price >= min_value and price <= max_value",
        "stock_positive": "stock_quantity >= min_value",
        "category_valid": "category in allowed_values or category is None"
    }
}

print("Step 1: Metadata Loaded.")
print(json.dumps(metadata, indent=2))

# Step 2: Load data (simulated)
# A small CSV fixture that deliberately contains rows violating the metadata:
# out-of-range prices, an unknown category, and missing stock / price values.
csv_data = """
product_id,product_name,price,stock_quantity,category
101,Laptop,1200.50,50,Electronics
102,Novel,15.99,200,Books
103,T-Shirt,25.00,150,Clothing
104,Coffee Maker,99.99,30,Home Goods
105,Invalid Price,-5.00,10,Electronics
106,Zero Stock,50.00,0,Books
107,Invalid Category,100.00,20,Gadgets
108,Missing Stock,200.00,,Electronics
109,Missing Price,,5,Clothing
110,Valid Product,75.00,100,Electronics
"""

# Type names the schema validator understands.
_KNOWN_TYPES = ("int", "float", "str")


def _matches_declared_type(value, type_name):
    """Return True when a non-null cell value is acceptable for ``type_name``.

    pandas stores an integer column that contains missing values as float64,
    so a whole-number float (e.g. ``50.0``) is accepted for ``"int"``.
    Likewise a ``"float"`` column whose values all happen to be whole numbers
    is parsed as integers, so ints are accepted for ``"float"``.
    """
    if type_name == "int":
        return isinstance(value, int) or (
            isinstance(value, float) and value.is_integer()
        )
    if type_name == "float":
        return isinstance(value, (int, float))
    return isinstance(value, str)


def _validate_against_schema(frame, schema):
    """Check ``frame`` against the per-column rules in ``schema``.

    Args:
        frame: DataFrame to validate; it is not modified.
        schema: Mapping of column name to a rule dict with a ``"type"`` key
            (``"int"``, ``"float"`` or ``"str"``) and optional ``"nullable"``,
            ``"min_value"``, ``"max_value"`` and ``"allowed_values"`` keys.

    Returns:
        A ``(bad_rows, issues)`` tuple: the set of index labels of rows that
        failed at least one rule, and a list of human-readable findings.
    """
    bad_rows = set()
    issues = []

    def flag(mask, message):
        # Record the index labels selected by the boolean mask plus the finding.
        bad_rows.update(mask[mask].index.tolist())
        issues.append(message)

    for col_name, rules in schema.items():
        if col_name not in frame.columns:
            print(f"Error: Column '{col_name}' defined in metadata but not found in data.")
            # Also keep it in the findings so the summary cannot claim a clean pass.
            issues.append(f"Column '{col_name}' defined in metadata but not found in data.")
            continue

        column = frame[col_name]
        present = column.notna()
        type_name = rules["type"]

        # Type check (nulls are handled by the nullability rule, not here).
        if type_name in _KNOWN_TYPES:
            mismatch = column.apply(
                lambda v: bool(pd.notna(v)) and not _matches_declared_type(v, type_name)
            ).astype(bool)
            if mismatch.any():
                flag(mismatch, f"Type mismatch in '{col_name}': {column[mismatch].tolist()}")
        else:
            # Only the type check is skipped; the remaining rules still apply.
            print(f"Warning: Unknown type '{type_name}' for column '{col_name}'. Skipping type validation.")

        # Nullability (a missing "nullable" key is treated as nullable).
        if not rules.get("nullable", True) and not present.all():
            flag(~present, f"Null values in non-nullable column '{col_name}'")

        # Numeric bounds; either bound may be given on its own.
        if type_name in ("int", "float"):
            low = rules.get("min_value")
            high = rules.get("max_value")
            if low is not None or high is not None:
                # Unparseable values become NaN and compare False, so they are
                # left to the type check instead of being reported twice.
                numeric = pd.to_numeric(column, errors="coerce")
                if low is not None and high is not None:
                    outside = (numeric < low) | (numeric > high)
                    label = "Out of range values in"
                elif low is not None:
                    outside = numeric < low
                    label = "Below min_value in"
                else:
                    outside = numeric > high
                    label = "Above max_value in"
                if outside.any():
                    flag(outside, f"{label} '{col_name}': {column[outside].tolist()}")

        # Allowed values for categorical string columns (nulls are judged by
        # the nullability rule above).
        if type_name == "str" and "allowed_values" in rules:
            unexpected = present & ~column.isin(rules["allowed_values"])
            if unexpected.any():
                flag(unexpected, f"Invalid category in '{col_name}': {column[unexpected].tolist()}")

    return bad_rows, issues


# Notebook-level boundary: any failure is reported instead of aborting the cell.
try:
    df = pd.read_csv(io.StringIO(csv_data))
    print("\nStep 2: Data Loaded.")
    print("\nOriginal DataFrame:")
    print(df)
    print(f"\nOriginal DataFrame shape: {df.shape}")

    # Step 3: Use metadata to validate data quality
    print("\nStep 3: Validating Data Quality using Metadata...")
    invalid_rows_indices, validation_results = _validate_against_schema(df, metadata["schema"])

    # Sort the labels so the invalid rows are listed in a deterministic order.
    ordered_invalid = sorted(invalid_rows_indices)
    invalid_data_df = df.loc[ordered_invalid].copy()
    valid_data_df = df.drop(index=ordered_invalid).copy()

    print("\nData Quality Validation Summary:")
    if not validation_results:
        print("All data passed quality checks based on metadata.")
    else:
        print("The following data quality issues were found:")
        for res in validation_results:
            print(f"- {res}")

    # Step 4: Show valid data
    print("\nStep 4: Valid Data (rows that passed all quality checks):")
    if not valid_data_df.empty:
        print(valid_data_df)
        print(f"\nValid DataFrame shape: {valid_data_df.shape}")
    else:
        print("No valid data found after applying quality checks.")

    print("\nInvalid Data (rows that failed at least one quality check):")
    if not invalid_data_df.empty:
        print(invalid_data_df)
        print(f"\nInvalid DataFrame shape: {invalid_data_df.shape}")
    else:
        print("No invalid data found.")

except Exception as e:
    print(f"An error occurred during metadata-driven data quality validation: {e}")



Step 1: Metadata Loaded.
{
  "schema": {
    "product_id": {
      "type": "int",
      "nullable": false
    },
    "product_name": {
      "type": "str",
      "nullable": false
    },
    "price": {
      "type": "float",
      "nullable": false,
      "min_value": 0.01,
      "max_value": 1000.0
    },
    "stock_quantity": {
      "type": "int",
      "nullable": false,
      "min_value": 0
    },
    "category": {
      "type": "str",
      "nullable": true,
      "allowed_values": [
        "Electronics",
        "Books",
        "Clothing",
        "Home Goods"
      ]
    }
  },
  "quality_checks": {
    "price_range": "price >= min_value and price <= max_value",
    "stock_positive": "stock_quantity >= min_value",
    "category_valid": "category in allowed_values or category is None"
  }
}

Step 2: Data Loaded.

Original DataFrame:
   product_id      product_name    price  stock_quantity     category
0         101            Laptop  1200.50            50.0  Electronics
1     