In [22]:
import pandas as pd
import pandera as pa
import numpy as np
import json

Data

In [10]:
data = {
    "id": [123, 2435, 234],
    "name": ["john", "doe", None]
}
pandas_df = pd.DataFrame(data)

Data Validation with Pandera

In [11]:
# Validate data against a schema
pandera_schema = pa.DataFrameSchema({
    "id": pa.Column(pa.Int),
    "name": pa.Column(pa.String)
})

try:
    data_val = pandera_schema.validate(pandas_df)
    print(data_val)
except pa.errors.SchemaError as e:
    print(e.failure_cases)
    print("\n\ninvalid dataframe")
    print(e.data)
    
# Dataframe schema in Pandera
data = pd.DataFrame({
    "series": [1, 2, 4, 7, None]})

not_null_schema = pa.DataFrameSchema({
    "series": pa.Column(pa.Float,
                        pa.Check(lambda x: x > 0),
                        nullable=False)})
try:
    not_null = not_null_schema.validate(data)
    print(not_null)
except pa.errors.SchemaError as e:
    print(e.failure_cases)
    print("\n\ninvalid dataframe")
    print(e.data)
    
null_allowed_schema = pa.DataFrameSchema({
    "series": pa.Column(pa.Float,
                        pa.Check(lambda x: x > 0),
                        nullable=True)})
try:
    null_allowed = null_allowed_schema.validate(data)
    print(null_allowed)
except pa.errors.SchemaError as e:
    print(e.failure_cases)
    print("\n\ninvalid dataframe")
    print(e.data)


   index failure_case
0      2         None


invalid dataframe
     id  name
0   123  john
1  2435   doe
2   234  None
   index  failure_case
0      4           NaN


invalid dataframe
   series
0     1.0
1     2.0
2     4.0
3     7.0
4     NaN
   series
0     1.0
1     2.0
2     4.0
3     7.0
4     NaN


Data Coercion (type transformation)

In [12]:
#  convert int to string
df = pd.DataFrame({
    "values": [1, 4, 5, 7, 9]})
schema = pa.DataFrameSchema({
    "values": pa.Column(str,
                        coerce=True)})

data_validated = schema.validate(df)

print(data_validated)
print(data_validated.iloc[0].map(type))



  values
0      1
1      4
2      5
3      7
4      9
values    <class 'str'>
Name: 0, dtype: object


Checks (specify properties about objects)

In [17]:
# validate a df based on a Check
odd = pa.Check.isin([1, 3, 5, 7, 9])
schema = pa.DataFrameSchema({
    "series": pa.Column(int, odd)})
data = pd.DataFrame({"series": range(10)})

try:
    schema.validate(data)
except pa.errors.SchemaError as e:
    print("\n\ndataframe of schema errors")
    print(e.failure_cases) # dataframe of schema errors
    print("\n\ninvalid dataframe")
    print(e.data) # invalid dataframe
    
# element-wise check of a series
odd = pa.Check(
    lambda x: x % 2 != 0,
    element_wise=True)
data = pd.DataFrame({"series": [1, 3, 5, 7, 9]})
schema = pa.DataFrameSchema({"series": pa.Column(int, odd)})

try:
    schema.validate(data)
    print(data)
except pa.errors.SchemaError as e:
    print("\n\ndataframe of schema errors")
    print(e.failure_cases) # dataframe of schema errors
    print("\n\ninvalid dataframe")
    print(e.data) # invalid dataframe



dataframe of schema errors
   index  failure_case
0      0             0
1      2             2
2      4             4
3      6             6
4      8             8


invalid dataframe
   series
0       0
1       1
2       2
3       3
4       4
5       5
6       6
7       7
8       8
9       9
   series
0       1
1       3
2       5
3       7
4       9


Statistical Validation

In [21]:
# data
os_list = ['iOS', 'Android']
df = pd.DataFrame({
    'OS': np.random.choice(os_list, size=40),
    'Rating': np.random.uniform(1.0, 10.0, size=40).round(2)})
print(df)

# define the hypothese testing schema
schema = pa.DataFrameSchema({
    "Rating": pa.Column(
        pa.Float,
        [
            pa.Hypothesis.two_sample_ttest(
                sample1="iOS",
                sample2="Android",
                groupby="OS",
                relationship="greater_than",
                alpha=0.05,
                equal_var=True
            )
        ], coerce=True
    ),
    "OS": pa.Column(str)
})

# run the validation
try:
    schema.validate(df)
    print(df)
except pa.errors.SchemaError as e:
    print(e)

         OS  Rating
0   Android    6.14
1       iOS    3.65
2       iOS    6.68
3       iOS    9.23
4   Android    2.70
5       iOS    8.65
6   Android    1.06
7   Android    2.98
8   Android    9.65
9       iOS    6.82
10      iOS    2.63
11  Android    5.49
12  Android    7.64
13  Android    3.86
14  Android    1.45
15      iOS    2.74
16      iOS    3.88
17      iOS    9.22
18  Android    4.34
19      iOS    6.66
20  Android    5.80
21  Android    3.28
22      iOS    4.24
23  Android    3.41
24      iOS    5.47
25      iOS    1.95
26      iOS    5.60
27  Android    1.87
28      iOS    7.90
29  Android    9.52
30  Android    3.39
31      iOS    2.15
32  Android    7.38
33  Android    2.85
34      iOS    3.96
35      iOS    7.69
36  Android    9.01
37      iOS    1.83
38      iOS    6.87
39      iOS    4.40
Column 'Rating' failed series or dataframe validator 0: <Check two_sample_ttest: failed two sample ttest between 'iOS' and 'Android'>


Lazy Validation (See all errors raised)

In [24]:
# define constrains
check_ge_3 = pa.Check(lambda x: x > 3)
check_le_15 = pa.Check(lambda x: x <= 15)
check_gpa_0 = pa.Check(lambda x: x > 0.0)
check_gpa_4 = pa.Check(lambda x: x <= 4.0)

# define schema
schema = pa.DataFrameSchema(
    columns={
        "StudentName": pa.Column(str, pa.Check.equal_to("John")),
        "CreditsTaken": pa.Column(int, [check_ge_3, check_le_15]),
        "GPA": pa.Column(float, [check_gpa_0, check_gpa_4]),
        "Date": pa.Column(pa.DateTime),
            },
    strict=True
)

# define data
df = pd.DataFrame(
    {
        "StudentName": ["JohnDoe", "JaneDoe", "DoeJohn"],
        "CreditsTaken": [9, 12, 18],
        "GPA": [3.7, 4.01, 3.5],
        "Date": None,
    }
)

# perform lazy evaluation
try:
    schema.validate(df, lazy=True)
    print(df)
except pa.errors.SchemaError as e:
    print(json.dumps(e.message, indent=2))
    print("dataframe of schema errors")
    print(e.failure_cases)
    print("invalid dataframe")
    print(e.data)


SchemaErrors: {
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": "StudentName",
                "check": "equal_to(John)",
                "error": "Column 'StudentName' failed element-wise validator number 0: equal_to(John) failure cases: JohnDoe, JaneDoe, DoeJohn"
            },
            {
                "schema": null,
                "column": "CreditsTaken",
                "check": "<lambda>",
                "error": "Column 'CreditsTaken' failed element-wise validator number 1: <Check <lambda>> failure cases: 18"
            },
            {
                "schema": null,
                "column": "GPA",
                "check": "<lambda>",
                "error": "Column 'GPA' failed element-wise validator number 1: <Check <lambda>> failure cases: 4.01"
            }
        ]
    },
    "SCHEMA": {
        "SERIES_CONTAINS_NULLS": [
            {
                "schema": null,
                "column": "Date",
                "check": "not_nullable",
                "error": "non-nullable series 'Date' contains null values:0    None1    None2    NoneName: Date, dtype: object"
            }
        ],
        "WRONG_DATATYPE": [
            {
                "schema": null,
                "column": "Date",
                "check": "dtype('datetime64[ns]')",
                "error": "expected series 'Date' to have type datetime64[ns], got object"
            }
        ]
    }
}

Pandera Decorators

In [26]:
# simple decorator example
def decoratortest(func):
    def wrapper():
        func()
        print("decorator executed")
    return wrapper

@decoratortest
def hello_world():
    print("hello world")

hello_world()

hello world
decorator executed


In [29]:
# simple example withoud pandera decorators
data = pd.DataFrame({
    "a": [1, 4, 7, 9, 5],
    "b": [12, 13, 15, 16, 19]})

def addition(dataframe):
    dataframe["c"] = dataframe["a"] + dataframe["b"]
    return dataframe

final_df = addition(data)
print(final_df)

# example with pandera decorators validation (succes case)
check_a = pa.Check(lambda x: x <= 10)
check_b = pa.Check(lambda x: x <= 20)

validate_schema = pa.DataFrameSchema({
    "a": pa.Column(int, check_a),
    "b": pa.Column(int, check_b),})

# use the decorator
@pa.check_input(validate_schema)
def addition(dataframe):
    dataframe["c"] = dataframe["a"] + dataframe["b"]
    return dataframe

final_df = addition(data)
print(final_df)


# example with pandera decorators validation (fail case)
check_a = pa.Check(lambda x: x <= 10)
check_b = pa.Check(lambda x: x <= 18)

validate_schema = pa.DataFrameSchema({
    "a": pa.Column(int, check_a),
    "b": pa.Column(int, check_b),})

# use the decorator
@pa.check_input(validate_schema)
def addition(dataframe):
    dataframe["c"] = dataframe["a"] + dataframe["b"]
    return dataframe

final_df = addition(data)
print(final_df)

   a   b   c
0  1  12  13
1  4  13  17
2  7  15  22
3  9  16  25
4  5  19  24
   a   b   c
0  1  12  13
1  4  13  17
2  7  15  22
3  9  16  25
4  5  19  24


SchemaError: error in check_input decorator of function 'addition': Column 'b' failed element-wise validator number 0: <Check <lambda>> failure cases: 19