In [1]:
import pandas as pd
import pandera as pa

In [None]:
# Validation schema for the heart dataset.
#
# Column rules:
#   * `patient_id` must be a strictly positive integer and may not be null.
#   * `target` must be one of the two label strings and may not be null.
#   * Every other column is nullable and is restricted to the inclusive range
#     given below.
#   * Out-of-range `serum_cholesterol` and `slope` values only raise a warning
#     (raise_warning=True) instead of failing validation.
#
# Frame rules: no fully duplicated rows and no rows in which every value is
# missing.
#
# NOTE(review): several columns are typed `int` with nullable=True. pandas
# stores integer columns that contain NaN as float64, so a column with real
# missing values will presumably fail the `int` dtype check -- confirm whether
# a nullable dtype (e.g. "Int64") or coerce=True is intended.
schema = pa.DataFrameSchema(
    {
        'patient_id': pa.Column(int, pa.Check.greater_than(0)),
        'age': pa.Column(int, pa.Check.between(0, 90), nullable=True),
        'gender': pa.Column(int, pa.Check.between(0, 1), nullable=True),
        'chest_pain': pa.Column(int, pa.Check.between(0, 3), nullable=True),
        'resting_bp': pa.Column(int, pa.Check.between(94, 200), nullable=True),
        'serum_cholesterol': pa.Column(
            int,
            checks=[
                # Vectorized inclusive range check (126 <= value <= 564).
                pa.Check(lambda s: s.between(126, 564),
                         # Attributed to pandera documentation:
                         # https://pandera.readthedocs.io/en/stable/checks.html#raise-warning-instead-of-error-on-check-failure
                         raise_warning=True,
                         error="There are outliers in the data values"),
            ],
            nullable=True),
        'fasting_blood_sugar': pa.Column(int, pa.Check.between(0, 1), nullable=True),
        'resting_electro': pa.Column(int, pa.Check.between(0, 2), nullable=True),
        'max_heart_rate': pa.Column(int, pa.Check.between(71, 202), nullable=True),
        'exercise_angia': pa.Column(int, pa.Check.between(0, 1), nullable=True),
        'old_peak': pa.Column(float, pa.Check.between(0.0, 6.2), nullable=True),
        'slope': pa.Column(
            int,
            checks=[
                # Vectorized inclusive range check (1 <= value <= 3).
                pa.Check(lambda s: s.between(1, 3),
                         raise_warning=True,
                         error="Certain slope values are out of range"),
            ],
            nullable=True),
        'num_major_vessels': pa.Column(int, pa.Check.between(0, 3), nullable=True),
        'target': pa.Column(str, pa.Check.isin(['Heart Disease', 'No Heart Disease'])),
    },
    checks=[
        # `.any()` reduces to a single scalar, so it is negated with `not`.
        # Bitwise `~` on a plain Python bool yields -2 / -1 (both truthy), which
        # would make the check unable to fail.
        pa.Check(lambda df: not df.duplicated().any(), error="Duplicate rows found."),
        pa.Check(lambda df: not df.isna().all(axis=1).any(), error="Empty rows found."),
    ]
)

# schema.validate(heart, lazy=True)  # To be added to the final notebook to validate the `heart` DataFrame.