<a href="https://colab.research.google.com/github/slogen/snippets/blob/main/Pydantic_StaticTypedPandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook shell magic: install/upgrade the third-party packages for this notebook.
# strictly-typed-pandas is imported below; typeguard is not imported directly in
# the visible code -- presumably needed by strictly-typed-pandas, TODO confirm.
!pip install --upgrade typeguard strictly-typed-pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [210]:
import io

from IPython.utils.openpy import TextIOWrapper
import pandas as pd

from strictly_typed_pandas import DataSet

from typing import Any, List, Optional, Type, TypeVar, Generic, Union, IO, BinaryIO

from pydantic import BaseModel, Field, BaseModel, ValidationError, validator, create_model
from pydantic.fields import ModelField

class Schema(BaseModel):
    """Row schema with an integer ``id`` and a string ``name``.

    Used below as the type parameter of ``DataSet[...]`` / ``Tbl[...]``.
    """
    id: int
    name: str

# Type parameter used by the generic ``Tbl`` class (``Tbl(DataSet[T])``).
T = TypeVar('T')

class DirectoryAccess:
  """Reader that resolves paths against the real file system."""

  def read(self, path: str, mode) -> IO:
    """Open ``path`` with the given ``mode`` and return the open handle.

    The handle is returned still open; closing it is up to the caller.
    """
    handle = open(path, mode)
    return handle

class FakeAccess():
  """In-memory stand-in for file access, backed by nested dictionaries.

  ``content`` maps path segments to either a nested mapping (a "directory")
  or a string (the "file" text).
  """

  def __init__(self, content = None):
    """Store the fake file tree.

    Args:
      content: mapping of path segment -> nested mapping or file text.
        Defaults to a new empty dict for each instance.
    """
    # Use a None sentinel: a literal ``{}`` default would be created once and
    # shared by every instance constructed without an argument.
    self.__content__ = {} if content is None else content

  def read(self, path: str, mode) -> IO:
    """Return the text stored at ``path`` wrapped in an ``io.StringIO``.

    ``path`` is split on "/" and each segment is looked up in turn, starting
    from the root mapping.  ``mode`` is accepted for signature compatibility
    with other readers but is not used.

    Raises:
      KeyError: if a segment is missing from the mapping it is looked up in.
    """
    here = self.__content__
    for segment in path.split("/"):
      here = here[segment]
    print(f'read({path})={here}')
    return io.StringIO(here)


class Tbl(DataSet[T]):
  """A ``DataSet[T]`` that can be used as a pydantic field type.

  Validation accepts an existing ``DataSet``, a ``pandas.DataFrame`` or a
  string.  A string ending in ``.csv``, ``.json`` or ``.xlsx`` is treated as
  a path and loaded through ``config.reader`` (falling back to the builtin
  ``open``); any other string is parsed as a JSON document.

  NOTE(review): relies on the ``__get_validators__`` / ``ModelField`` hooks,
  which look like the pydantic v1 custom-type protocol -- confirm the version.
  """

  @classmethod
  def __get_validators__(cls):
    """Yield the validator callables for fields of this type."""
    yield cls.validate

  @classmethod
  def __modify_schema__(cls, field_schema):
    """Reject schema generation, which this type does not support.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError("Schema not implemented yet")

  @classmethod
  def validate(cls, v: Any, field: ModelField, config, **kwargs):
    """Coerce ``v`` into the field's table type.

    Args:
      v: a ``DataSet`` (returned unchanged), a string (see
        ``validate_string``) or a ``pandas.DataFrame`` (see
        ``validate_dataframe``).
      field: the field being validated; its ``outer_type_`` builds the result.
      config: the model configuration; an optional ``reader`` attribute is
        used to open paths.
      **kwargs: forwarded unchanged to the helper validators.

    Raises:
      TypeError: if ``v`` is none of the supported kinds.
    """
    # The DataSet check must stay first so an existing DataSet is passed
    # through untouched rather than handled by the DataFrame branch.
    if isinstance(v, DataSet):
      # TODO: Check correct type
      return v
    if isinstance(v, str):
      return cls.validate_string(v, field = field, config = config, **kwargs)
    if isinstance(v, pd.DataFrame):
      return cls.validate_dataframe(v, field = field, config = config, **kwargs)
    raise TypeError(f"Unsupported value: {v}")

  @classmethod
  def validate_dataframe(cls, v: pd.DataFrame, field: ModelField, config, **kwargs):
    """Wrap ``v`` by calling the field's declared type (``field.outer_type_``)."""
    return field.outer_type_(v)

  @classmethod
  def read_dataframe(cls, v: str, field: ModelField, config, **kwargs):
    """Load a ``pandas.DataFrame`` from the string ``v``.

    If ``v`` ends in ``.csv``, ``.json`` or ``.xlsx`` it is opened via
    ``config.reader(path, mode)`` when the config defines ``reader``, else
    via the builtin ``open``, and parsed with the matching pandas reader.
    Any other string is parsed as JSON text.

    The stream obtained from the reader is closed once parsing finishes, so
    a reader is expected to hand out a fresh stream on every call.
    """
    def default_read(path: str, mode):
      return open(path, mode)

    def read(path: str, mode: str = "rb"):
      r = getattr(config, 'reader', default_read)
      s = r(path, mode)
      print(f'read({path})={s}')
      return s

    parsers = (
      (".csv", pd.read_csv),
      (".json", pd.read_json),
      (".xlsx", pd.read_excel),
    )
    for suffix, parse in parsers:
      if v.endswith(suffix):
        stream = read(v)
        try:
          return parse(stream)
        finally:
          # Close the handle we requested; pandas is handed an already-open
          # stream here, so without this the handle would be leaked.
          close = getattr(stream, "close", None)
          if callable(close):
            close()
    # Not a recognised path: treat the string itself as a JSON document.
    return pd.read_json(io.StringIO(v))

  @classmethod
  def validate_string(cls, v: str, field: ModelField, config, **kwargs):
    """Load ``v`` with ``read_dataframe`` and wrap it with ``validate_dataframe``."""
    # TODO: Use args from configuration
    frame = cls.read_dataframe(v, field, config, **kwargs)
    return cls.validate_dataframe(frame, field = field, config = config, **kwargs)

# Build a small typed sample table, then export it in several forms.
data = DataSet[Schema]({"id": [1, 2, 3], "name": ["John", "Jane", "Jack"]})
# NOTE: `json` and `csv` hold the exported strings; the names coincide with the
# stdlib modules of the same name, which the visible code does not import.
# Both are referenced by `Main.Config` below.
json = data.to_json(orient='records')
csv = data.to_csv(index = False)
df = data.to_dataframe()

class Main(BaseModel):
  """Demo model with a string field and a ``Tbl[Schema]`` table field."""
  stuff: str
  data: Tbl[Schema]
  class Config:
    # `reader` is looked up by Tbl.read_dataframe (via getattr on the config)
    # to open paths; here it is a FakeAccess serving the exported strings
    # under the keys 'x.csv' and 'x.json'.
    reader = FakeAccess({
        'x.csv': csv,
        'x.json': json
    }).read


# Passing a string for `data` goes through Tbl.validate -> validate_string,
# which loads "x.csv" through Main.Config.reader.
m = Main(stuff='X', data = "x.csv")
d = m.data
# Bare expression: displayed as the notebook cell's output.
d

read(x.csv)=id,name
1,John
2,Jane
3,Jack

read(x.csv)=<_io.StringIO object at 0x7fe7762f7f70>


Unnamed: 0,id,name
0,1,John
1,2,Jane
2,3,Jack
