In [None]:
#| default_exp record_validation

# Record Validation
> Using Pydantic Models to validate the rows (or 'records') of a DataFrame.

In [None]:
#|exporti 

import pandas as pd
from pandas import DataFrame as PandasDataFrame
from typing import *
from archetypon.base_model import BaseModel,GenericModel,DataFrame
from pydantic import parse_obj_as,ValidationError,validator
from pydantic.utils import update_not_none

In [None]:
#|exporti 

def parse_dataframe_rows_as(
    model:Type[BaseModel],
    df:PandasDataFrame
)->PandasDataFrame:
    """Validate every row of a dataframe against a Pydantic model.

    Each row is turned into a dictionary (NaN entries dropped), parsed with
    Pydantic's `parse_obj_as`, and the parsed models are converted back into
    a dataframe via their `.dict()` representation.

    Args:
        model: Pydantic model class that every row has to conform to.
        df: Dataframe whose rows are validated. It is not modified.

    Returns:
        A new dataframe holding the validated (and possibly coerced) values.
        A dataframe without rows is returned as a plain copy.

    Raises:
        ValidationError: If at least one row fails validation. The raw errors
            of all failing rows are collected into this single exception.
    """

    # Nothing to validate. This guard also keeps the return type stable:
    # `Series.apply(pd.Series)` only expands into a DataFrame when the Series
    # has at least one element, so the pipeline below would hand back a Series.
    if len(df.index) == 0:
        return df.copy()

    # convert dataframe to a series of dictionaries
    # drop NaN values because pydantic doesn't consider them to be 'None', and that interferes with validation
    series_of_dicts = df.apply(
        lambda row: row.dropna().to_dict(),
        axis=1
    )

    # convert the series of dicts to a series of parsed models,
    # collecting the errors of every failing row instead of stopping at the first
    errors = []
    def parse_row(row):
        try:
            return parse_obj_as(model,row)
        except ValidationError as e:
            errors.extend(e.raw_errors)
            # placeholder so the apply can continue; it never reaches the
            # caller because a ValidationError is raised below
            return BaseModel()

    series_of_models = series_of_dicts.apply(parse_row)
    if len(errors)>0:
        raise ValidationError(errors=errors,model=model)

    # from model back to dictionaries and then back to Series
    validated = series_of_models.apply(
        lambda x: x.dict(),
    ).apply(pd.Series)

    return validated

In [None]:
import datetime as dt

In [None]:
# Example row model: one field per dataframe column used in the demos below.
class Model(BaseModel):
    # expected to validate as an integer
    number: int
    # expected to validate as a string
    string: str 
    # expected to validate as a calendar date
    date: dt.date
    
# Example input: every column is forced to object dtype and deliberately mixes
# Python types (str / int / float, str / int, str / date / datetime).
dataframe = pd.DataFrame({
    'number':['1',2,3e0],
    'string':['a','b',3],
    'date':['1994-06-11',dt.date.today(),dt.datetime.today()],
},dtype='object')
display(dataframe)
# last expression of the cell: shows the column dtypes
dataframe.dtypes

Unnamed: 0,number,string,date
0,1.0,a,1994-06-11
1,2.0,b,2023-01-23
2,3.0,3,2023-01-23 15:54:23.155977


number    object
string    object
date      object
dtype: object

In [None]:
# Validate the example frame row by row against `Model`, then show the
# resulting frame and its column dtypes.
validated = parse_dataframe_rows_as(Model,dataframe)
display(validated)
validated.dtypes

Unnamed: 0,number,string,date
0,1,a,1994-06-11
1,2,b,2023-01-23
2,3,3,2023-01-23


number     int64
string    object
date      object
dtype: object

> To Do: Find a way to include the index of the failing rows in the ValidationError

In [None]:
# Same 'string' and 'date' columns as `dataframe`, but the 'number' column
# contains non-numeric strings in two of its three rows.
bad_dataframe = pd.DataFrame({
    'number':['a',1,'b'],
    'string':dataframe['string'],
    'date':dataframe['date'],
},dtype='object')
# Catch the ValidationError and print it so the cell shows the error text
# instead of failing.
try:
    df = parse_dataframe_rows_as(Model,bad_dataframe)
    display(df)
except ValidationError as e:
    print(e)

2 validation errors for Model
__root__ -> number
  value is not a valid integer (type=type_error.integer)
__root__ -> number
  value is not a valid integer (type=type_error.integer)


In [None]:
#|exporti 

class TypedRecordFrame(DataFrame):
    """DataFrame field type whose rows are validated against `row_model`.

    On top of the validators of the parent `DataFrame` type, two more
    validators run in this order: `validate_rows`, then
    `validate_column_names`.
    """
    # Pydantic model every row has to satisfy; None turns row validation off.
    row_model: Optional[Type[BaseModel]] = None
    # When true, columns are renamed from field names to the fields' aliases.
    alias_as_column_names: bool = False

    @classmethod
    def __get_validators__(cls):
        """Yield the parent's validators, then the row and column-name validators."""
        yield from super().__get_validators__()
        yield cls.validate_rows
        yield cls.validate_column_names

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        """Add the row model's schema to `field_schema` under the key `row_model`.

        Nothing is added when no row model is set, because `update_not_none`
        skips None values.
        """
        row_model = cls.row_model.schema() if cls.row_model else None
        update_not_none(
            field_schema,
            row_model=row_model
        )

    @classmethod
    def validate_rows(cls, df):
        """Validate each row of `df` against `row_model`.

        Returns the validated frame, or `df` itself when no row model is set.
        Raises whatever `parse_dataframe_rows_as` raises for invalid rows.
        """
        if cls.row_model:
            return parse_dataframe_rows_as(cls.row_model,df)
        return df

    @classmethod
    def validate_column_names(cls,df):
        """Rename columns from field names to field aliases, if enabled.

        Returns `df` itself when `alias_as_column_names` is off or no row
        model is set (there are no aliases to look up in that case).
        Otherwise returns a renamed copy; the incoming frame is never
        modified, so a frame passed in by the caller keeps its column names.
        """
        if not (cls.alias_as_column_names and cls.row_model):
            return df

        # create a dictionary mapping field names to aliases
        field_name_to_alias = {
            field_name: field.alias
            for field_name, field in cls.row_model.__fields__.items()
        }
        return df.rename(columns=field_name_to_alias)
            

In [None]:
#|exporti 

class RecordModelFrameMeta(type):
    """Metaclass that makes a class subscriptable with a row model."""

    def __getitem__(self, constraint):
        """Build a `TypedRecordFrame` subclass whose `row_model` is `constraint`."""
        bases = (TypedRecordFrame,)
        namespace = dict(row_model=constraint)
        return type('RecordFrame', bases, namespace)


In [None]:
#|exporti
class RecordFrame(DataFrame, metaclass=RecordModelFrameMeta):
    """DataFrame type that can be subscripted with a row model.

    `RecordFrame[SomeModel]` is handled by `RecordModelFrameMeta.__getitem__`,
    which returns a `TypedRecordFrame` subclass with `row_model` set to
    `SomeModel`.
    """
    pass

In [None]:
# Example model with a dataframe field whose rows are validated against `Model`.
class ModelWithConstrainedFrame(BaseModel):
    # RecordFrame[Model] yields a TypedRecordFrame subclass with row_model=Model
    df: RecordFrame[Model]

In [None]:
# Instantiating the model runs the frame validators on `dataframe`.
ModelWithConstrainedFrame(df=dataframe)

0,1
df,number011223string0a1b23date07712928000001167443200000021674432000000

0,1
number,011223
string,0a1b23
date,07712928000001167443200000021674432000000

0,1
0,1
1,2
2,3

0,1
0,a
1,b
2,3

0,1
0,771292800000
1,1674432000000
2,1674432000000


In [None]:
# The schema of the `df` field carries the row model's schema under 'row_model'.
ModelWithConstrainedFrame.schema()

{'title': 'ModelWithConstrainedFrame',
 'description': "Custom implementation of Pydantic's Base Model.\n\nIncludes `_repr_json_` and `_repr_html_` methods for nice displays in Jupyter Lab and Jupyter Notebook, respectively.",
 'type': 'object',
 'properties': {'df': {'title': 'Df',
   'row_model': {'title': 'Model',
    'description': "Custom implementation of Pydantic's Base Model.\n\nIncludes `_repr_json_` and `_repr_html_` methods for nice displays in Jupyter Lab and Jupyter Notebook, respectively.",
    'type': 'object',
    'properties': {'number': {'title': 'Number', 'type': 'integer'},
     'string': {'title': 'String', 'type': 'string'},
     'date': {'title': 'Date', 'type': 'string', 'format': 'date'}},
    'required': ['number', 'string', 'date']}}},
 'required': ['df']}