<a href="https://colab.research.google.com/github/slogen/snippets/blob/main/SimpleTypedData_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
pip install pandera # https://github.com/unionai-oss/pandera


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [68]:

import pandas as pd
import pandera as pa


In [74]:
# Stand-in for a CSV file: an in-memory text buffer holding the same content
# a real file would have (leading blank line, header row, two data rows).

import io

CSV = io.StringIO(
    "\n"
    "Power,Scaled,State\n"
    '20,0.5,"Ready"\n'
    '100,0.0,"Stopped"\n'
)

from pandera.typing import Series
# Newer pandera releases call this base class DataFrameModel and keep SchemaModel
# only as a deprecated alias; use the new name when it exists and fall back to
# the old one so the notebook runs against either kind of install.
_ModelBase = getattr(pa, "DataFrameModel", None) or pa.SchemaModel


class MyD(_ModelBase):
    """Typed schema for the demo table, one annotated field per CSV column.

    Calling ``MyD(frame)`` validates ``frame`` against the checks below and
    hands back the frame for further use.
    """

    # Integer column; values must be <= 100.
    Power: Series[int] = pa.Field(le=100)
    # Float column; values must satisfy 0 <= value < 1.
    Scaled: Series[float] = pa.Field(lt=1, ge=0)
    # String column; the value "Unknown" is rejected.
    State: Series[str] = pa.Field(notin=["Unknown"])


# Parse the pretend file, run it through the MyD schema, then show that the
# validated columns can be combined like any other pandas Series.
df = pd.read_csv(CSV)
tdf = MyD(df)
tdf["Scaled"] * tdf["Power"]

0    10.0
1     0.0
dtype: float64

In [75]:
# Usage from Pre/Post (and possibly MainProcessor)

def read_untyped(source) -> pd.DataFrame:
  """Load raw, not-yet-validated tabular data from ``source``.

  Only one kind of source is recognised so far: a string holding inline CSV
  text, detected by a newline occurring within its first 1000 characters.

  Args:
    source: The data source; currently this must be a multi-line CSV string.

  Returns:
    The DataFrame produced by ``pd.read_csv`` on the inline text.

  Raises:
    ValueError: If ``source`` is not a recognised kind of source. ValueError
      is a subclass of Exception, so existing ``except Exception`` handlers
      keep working.
  """
  # Looks like "inline" CSV? Only the first 1000 characters are scanned.
  if isinstance(source, str) and "\n" in source[:1000]:
    import io
    return pd.read_csv(io.StringIO(source))
  # more options here
  raise ValueError(f'Unsupported source: "{source}"')

# Documented interface to read data
def read_typed(schema, source):
  """Read ``source`` and pass the resulting frame through ``schema``.

  Args:
    schema: A callable applied to the untyped frame; its result is returned.
    source: Anything ``read_untyped`` accepts.

  Returns:
    Whatever ``schema`` returns for the frame loaded from ``source``.
  """
  untyped = read_untyped(source)
  return schema(untyped)




In [76]:
import unittest

class ReadingValidation(unittest.TestCase):
  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._valid_in = """
Power,Scaled,State
20,0.5,"Ready"
100,0.0,"Stopped"
    """
  def test_valid_inputs(self):
    read_typed(MyD, self._valid_in)
  def test_detect_type_error(self):
    with (self.assertRaises(pa.errors.SchemaError)):
      read_typed(MyD, self._valid_in.replace("20,", "20.0,"))
  def test_detect_missing_column(self):
    with (self.assertRaises(pa.errors.SchemaError)):
      read_typed(MyD, self._valid_in.replace("State", "xState"))
  def test_value_range_error(self):
    with (self.assertRaises(pa.errors.SchemaError)):
      read_typed(MyD, self._valid_in.replace("0.5", "1.0"))


# Run the suite inside the notebook: the placeholder argv stops unittest from
# reading the kernel's own command-line arguments, and exit=False keeps it
# from calling sys.exit() once the run finishes.
if __name__ == "__main__":
  unittest.main(argv=[""], exit=False, verbosity=2)


test_detect_missing_column (__main__.ReadingValidation) ... ok
test_detect_type_error (__main__.ReadingValidation) ... ok
test_valid_inputs (__main__.ReadingValidation) ... ok
test_value_range_error (__main__.ReadingValidation) ... ok

----------------------------------------------------------------------
Ran 4 tests in 0.031s

OK
