In [None]:
import os
import re

import pandas as pd

from adc_toolkit.data.catalog import ValidatedDataCatalog

In [None]:
from adc_toolkit.data.validators.gx import (
    BatchManager,
    ConfigurationBasedExpectationAddition,
)
from adc_toolkit.data.validators.gx.data_context.repo import RepoDataContext

In [None]:
# from adc_toolkit.data.validators.pandera import PanderaValidator

In [None]:
# Absolute path of the catalog configuration directory. The relative part is resolved
# against the kernel's current working directory, so the notebook presumably has to be
# started from the repository root -- TODO confirm.
path = os.path.abspath("./examples/configs/catalog")

In [None]:
# Build the validated data catalog from the configuration found in `path`.
# The commented-out argument shows how a PanderaValidator could be passed as
# `validator_class`; which validator is used when it is omitted is not visible here.
catalog = ValidatedDataCatalog.in_directory(
    path=path,
    # validator_class=PanderaValidator,
)

In [None]:
# Load the "house_prices.extract_dynamic" dataset. The two keyword arguments are handed
# to catalog.load -- presumably as query parameters defined in the catalog
# configuration (TODO confirm).
df_extract = catalog.load("house_prices.extract_dynamic", min_sales_price=100000, min_lot_area=10000)

In [None]:
# Objects needed to add expectations for the extract dataset by hand:
# - a data context created from the same `path` the catalog was built from,
# - a batch manager tying the dataset name to the dataframe loaded above,
# - the helper whose add_expectations() method is called in the following cells.
data_context = RepoDataContext(path).create()
house_prices_extract_batch_manager = BatchManager("house_prices.extract_dynamic", df_extract, data_context)
expectation_adder = ConfigurationBasedExpectationAddition()

In [None]:
# Two expectations on the SalePrice column: values must not be null, and values must
# lie between 0 and 1. The upper bound of 1 looks deliberately too low (the extract was
# loaded with min_sales_price=100000), so the reload in the following cell is expected
# to fail validation.
expectations = [
    {"expect_column_values_to_not_be_null": {"column": "SalePrice"}},
    {"expect_column_values_to_be_between": {"column": "SalePrice", "min_value": 0, "max_value": 1}},
]
expectation_adder.add_expectations(batch_manager=house_prices_extract_batch_manager, expectations=expectations)

In [None]:
# Reload the same extract now that a "SalePrice between 0 and 1" expectation has been
# added. The load is expected to fail validation. The error is caught and displayed
# instead of being left to propagate, so that "Restart Kernel -> Run All" continues
# past this demonstration cell. On failure `df_extract` keeps its previous value.
try:
    df_extract = catalog.load("house_prices.extract_dynamic", min_sales_price=100000, min_lot_area=10000)
except Exception as validation_error:  # concrete exception type depends on the configured validator
    print(f"Load failed as expected: {type(validation_error).__name__}: {validation_error}")
else:
    print("WARNING: load succeeded, although the expectations added above were expected to reject this data.")

In [None]:
# Replacement range expectation for SalePrice: lower bound 0, no upper bound
# (max_value is None), so realistic sale prices can pass validation again.
new_expectations = [
    {"expect_column_values_to_be_between": {"column": "SalePrice", "min_value": 0, "max_value": None}},
]
expectation_adder.add_expectations(batch_manager=house_prices_extract_batch_manager, expectations=new_expectations)

In [None]:
# Here we extract data and immediately save it again as raw data. We could skip the raw step for
# database data and continue with the intermediate step directly.

# This line automatically creates the schema expectation suite if it does not exist yet.
df_extract = catalog.load("house_prices.extract_dynamic", min_sales_price=100000, min_lot_area=10000)
# This line also automatically creates the schema expectation suite if it does not exist yet.
catalog.save("house_prices.raw", df_extract)

In [None]:
# To go from raw to intermediate, we need to ensure intuitive and conventionally named columns as
# well as the correct data types (schema).

df_raw = catalog.load("house_prices.raw")


def add_space_before_capital_letters(text: str) -> str:
    """Insert a space at every lowercase-to-uppercase boundary in ``text``.

    A boundary is a lowercase ASCII letter immediately followed by an uppercase ASCII
    letter. Runs of capitals are therefore left together: ``"SalePrice"`` becomes
    ``"Sale Price"`` while ``"MSZoning"`` is returned unchanged.

    Args:
        text: The string to split, e.g. a CamelCase column name.

    Returns:
        ``text`` with a single space inserted at each boundary.
    """
    # Group 1 is the lowercase letter, group 2 the capital that follows it.
    boundary = re.compile(r"([a-z])([A-Z])")
    return boundary.sub(r"\1 \2", text)


def string_to_snake_case(text: str) -> str:
    """Lowercase ``text`` and replace every space character with an underscore.

    Args:
        text: Space-separated words, e.g. ``"Sale Price"``.

    Returns:
        The lowercased string with each ``" "`` turned into ``"_"``.
    """
    lowered = text.lower()
    # Splitting on the explicit single-space separator keeps empty pieces, so
    # consecutive spaces become consecutive underscores.
    return "_".join(lowered.split(" "))


def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame whose column labels are converted to snake_case.

    Each label gets a space inserted at every lowercase-to-uppercase boundary, is
    lowercased, and has its spaces replaced by underscores (``"SalePrice"`` becomes
    ``"sale_price"``). The frame passed in is left untouched, so re-running the
    calling cell on the same input gives the same result.

    Args:
        df: Frame with string column labels.

    Returns:
        A new DataFrame with the same data and the renamed columns.
    """

    def to_snake_case(label: str) -> str:
        return string_to_snake_case(add_space_before_capital_letters(label))

    # DataFrame.rename returns a new object instead of reassigning df.columns in place.
    return df.rename(columns=to_snake_case)


def change_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the known columns of ``df`` to their target dtypes.

    The columns ``id``, ``mszoning``, ``neighborhood``, ``central_air`` and
    ``added_at`` are converted; every other column keeps its dtype.

    Args:
        df: Frame that contains the five columns listed above.

    Returns:
        The result of ``DataFrame.astype`` with the column-to-dtype mapping applied.
    """
    target_dtypes = {
        "id": "Int64",  # nullable integer
        "mszoning": "category",
        "neighborhood": "string",
        "central_air": "boolean",  # nullable boolean
        "added_at": "datetime64[ns, UTC]",
    }
    converted = df.astype(dtype=target_dtypes)
    return converted


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw frame into its intermediate form.

    The columns are first renamed to snake_case and the dtype schema is applied
    afterwards, because the schema refers to the snake_case column names.

    Args:
        df: Raw frame as loaded from the catalog.

    Returns:
        The renamed and dtype-converted frame.
    """
    renamed = rename_columns(df)
    return change_schema(renamed)


# Apply the preprocessing defined above to the raw data and store the result under the
# "house_prices.intermediate" catalog entry.
df_intermediate = preprocess(df_raw)
catalog.save("house_prices.intermediate", df_intermediate)

In [None]:
# To go from intermediate to processed, we can make any type of complex calculation we need. Make
# sure to properly define the dtypes of any new columns in the processed dataframe.

df_intermediate = catalog.load("house_prices.intermediate")


def process(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with a derived ``price_per_lot_square_meter`` column.

    The new column is ``sale_price / lot_area`` stored as nullable ``Float64``. The
    frame passed in is not modified, so the intermediate data stays as loaded.

    NOTE(review): the column name says "square meter", but no unit conversion is
    done here -- confirm the unit of ``lot_area``.

    Args:
        df: Frame with numeric ``sale_price`` and ``lot_area`` columns.

    Returns:
        A new DataFrame with all original columns plus the derived column.
    """
    price_per_lot_area = (df["sale_price"] / df["lot_area"]).astype("Float64")
    # assign() builds a new frame rather than adding the column to the caller's object.
    return df.assign(price_per_lot_square_meter=price_per_lot_area)


# Compute the derived columns and store the result under the "house_prices.processed"
# catalog entry.
df_processed = process(df_intermediate)
catalog.save("house_prices.processed", df_processed)