In [1]:
import pandas as pd
import pandera.pandas as pa
import json

**Documentation**: https://pandera.readthedocs.io/en/stable/

In [3]:
def safe_json(obj):
    """Fallback serializer intended for ``json.dump(..., default=safe_json)``.

    Conversion rules, tried in order:
      * ``bytes`` / ``bytearray`` -> text decoded as UTF-8, with undecodable
        bytes replaced rather than raising.
      * ``Exception`` instances   -> their message, i.e. ``str(obj)``.
      * objects exposing ``__dict__`` -> that attribute dictionary.
      * anything else             -> ``str(obj)``.
    """
    if isinstance(obj, (bytes, bytearray)):
        return obj.decode("utf-8", errors="replace")

    # Exceptions are handled before the __dict__ lookup so that they are
    # reduced to their message instead of their attribute dictionary.
    if isinstance(obj, Exception):
        return str(obj)

    try:
        return obj.__dict__
    except AttributeError:
        # No attribute dictionary available: use the plain string form.
        return str(obj)

**Method 1: DataFrameModel**

In [12]:
class Schema(pa.DataFrameModel):
    """Validation model for a table of book ratings.

    Columns:
        user_name: string; every value must be at least 3 characters long
            (enforced by ``check_username``).
        book_name: string, no extra constraints.
        rating: float with ``0.0 <= rating < 5.0``.

    ``Config.strict = True`` rejects DataFrames that contain columns which
    are not declared on this model.
    """

    user_name: str = pa.Field()
    book_name: str = pa.Field()
    rating: float = pa.Field(ge=0.0, lt=5.0)

    @pa.check("user_name")
    def check_username(cls, series: pd.Series) -> pd.Series:
        """Return a boolean mask that is True where the name has 3+ characters."""
        name_lengths = series.str.len()
        return name_lengths >= 3

    class Config:
        # Disallow columns that are not declared above.
        strict = True

In [14]:
# Deliberately invalid sample for Schema: "Cd" is shorter than 3 characters,
# 10.0 is outside the allowed rating range, and "author_name" is a column
# the model does not declare.
df = pd.DataFrame(
    {
        "user_name": ["Alice", "Bob", "Cd"],
        "book_name": ["Alice in Wonderland", "It", "Harry Potter"],
        "rating": [3.5, 4.0, 10.0],
        "author_name": ["Lewis Carroll", "Stephen King", "J.K. Rowling"],
    }
)

try:
    # lazy=True reports all failures together in one SchemaErrors exception.
    Schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc)
    # This cell relies on str(exc) being a JSON document.
    exc_json = json.loads(str(exc))
    with open("panderas_errors_1.json", "w") as f:
        f.write(json.dumps(exc_json, indent=4, default=safe_json))

{
    "SCHEMA": {
        "COLUMN_NOT_IN_SCHEMA": [
            {
                "schema": "Schema",
                "column": "Schema",
                "check": "column_in_schema",
                "error": "column 'author_name' not in DataFrameSchema {'user_name': <Schema Column(name=user_name, type=DataType(str))>, 'book_name': <Schema Column(name=book_name, type=DataType(str))>, 'rating': <Schema Column(name=rating, type=DataType(float64))>}"
            }
        ]
    },
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "Schema",
                "column": "user_name",
                "check": "check_username",
                "error": "Column 'user_name' failed element-wise validator number 0: <Check check_username> failure cases: Cd"
            },
            {
                "schema": "Schema",
                "column": "rating",
                "check": "less_than(5.0)",
                "error": "Column 'rating' failed element-wise validator 

In [20]:
class Schema2(pa.DataFrameModel):
    """Book-rating validation model that also declares ``author_name``.

    Columns:
        user_name: string; values must be unique and at least 3 characters
            long (the length rule is enforced by ``check_username``).
        book_name: string, no extra constraints.
        rating: float with ``0.0 <= rating < 5.0``.
        author_name: string, no extra constraints.

    ``Config.strict = True`` rejects DataFrames that contain columns which
    are not declared on this model.
    """

    user_name: str = pa.Field(unique=True)
    book_name: str = pa.Field()
    rating: float = pa.Field(ge=0.0, lt=5.0)
    author_name: str = pa.Field()

    @pa.check("user_name")
    def check_username(cls, series: pd.Series) -> pd.Series:
        """Return a boolean mask that is True where the name has 3+ characters."""
        name_lengths = series.str.len()
        return name_lengths >= 3

    class Config:
        # Disallow columns that are not declared above.
        strict = True

In [24]:
# Deliberately invalid sample for Schema2: "Bob" appears twice in the unique
# user_name column, and book_name contains a missing value.
df2 = pd.DataFrame(
    {
        "user_name": ["Alice", "Bob", "Bob"],
        "book_name": ["Alice in Wonderland", None, "Harry Potter"],
        "rating": [3.5, 4.0, 4.5],
        "author_name": ["Lewis Carroll", "Stephen King", "J.K. Rowling"],
    }
)

try:
    # lazy=True reports all failures together in one SchemaErrors exception.
    Schema2.validate(df2, lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc)
    # This cell relies on str(exc) being a JSON document.
    exc_json = json.loads(str(exc))
    with open("panderas_errors_2.json", "w") as f:
        f.write(json.dumps(exc_json, indent=4, default=safe_json))

{
    "DATA": {
        "SERIES_CONTAINS_DUPLICATES": [
            {
                "schema": "Schema2",
                "column": "user_name",
                "check": "field_uniqueness",
                "error": "series 'user_name' contains duplicate values:1    Bob2    BobName: user_name, dtype: object"
            }
        ]
    },
    "SCHEMA": {
        "SERIES_CONTAINS_NULLS": [
            {
                "schema": "Schema2",
                "column": "book_name",
                "check": "not_nullable",
                "error": "non-nullable series 'book_name' contains null values:1    NoneName: book_name, dtype: object"
            }
        ]
    }
}


**Method 2: DataFrameSchema**

In [37]:
# Object-based schema: a single integer "id" column whose values must be
# below 10; strict=True rejects columns that are not declared.
schema = pa.DataFrameSchema(
    {"id": pa.Column(int, pa.Check.lt(10))},
    name="MySchema",
    strict=True,
)

# Deliberately invalid sample: 2.5 turns the "id" column into floats, 30 is
# not below 10, and "extra_column" is not part of the schema.
df = pd.DataFrame(
    {
        "id": [1, 2.5, 30],
        "extra_column": [1, 2, 3],
    }
)

try:
    # lazy=True reports all failures together in one SchemaErrors exception.
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc)
    # This cell relies on str(exc) being a JSON document.
    exc_json = json.loads(str(exc))
    with open("panderas_errors_5.json", "w") as f:
        f.write(json.dumps(exc_json, indent=4, default=safe_json))

{
    "SCHEMA": {
        "COLUMN_NOT_IN_SCHEMA": [
            {
                "schema": "MySchema",
                "column": "MySchema",
                "check": "column_in_schema",
                "error": "column 'extra_column' not in DataFrameSchema {'id': <Schema Column(name=id, type=DataType(int64))>}"
            }
        ],
        "WRONG_DATATYPE": [
            {
                "schema": "MySchema",
                "column": "id",
                "check": "dtype('int64')",
                "error": "expected series 'id' to have type int64, got float64"
            }
        ]
    },
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "MySchema",
                "column": "id",
                "check": "less_than(10)",
                "error": "Column 'id' failed element-wise validator number 0: less_than(10) failure cases: 30.0"
            }
        ]
    }
}


In [None]:
# Reference (Great Expectations, connecting to DataFrame data):
# https://docs.greatexpectations.io/docs/core/connect_to_data/dataframes/