In [None]:
# default_exp core

# pydantic-pandas

> API details.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#exporti

from pandas.core.frame import DataFrame as PandasDataFrame
from pydantic import (
    validator,
    root_validator
)
from pydantic import BaseModel as PydanticBaseModel
from pydantic.main import ModelMetaclass
from pydantic_pandas.default_standard_lib import *
from pydantic_pandas.utils import delegates
from IPython.display import JSON

## DataFrame

> Subclass the pandas `DataFrame` so that it plays nicely with pydantic.

In [None]:
class DataFrame(PandasDataFrame):
    """A pandas DataFrame that pydantic can use as a field type.

    pydantic finds the validator through `__get_validators__` and lets the
    class describe itself in generated schemas through `__modify_schema__`.
    """

    @classmethod
    def validate(cls, v):
        """Coerce `v` by passing it to this class's constructor."""
        frame = cls(v)
        return frame

    @classmethod
    def __modify_schema__(cls, field_schema):
        """Set the schema 'type' entry to 'DataFrame' (mutates in place)."""
        field_schema['type'] = 'DataFrame'

    @classmethod
    def __get_validators__(cls):
        """Yield the validators pydantic should run: only `validate`."""
        yield from (cls.validate,)

## Base Model

> -  `_repr_json_` for Jupyter display
> - json encoders for DataFrame, Array

In [None]:
class BaseModel(PydanticBaseModel):
    # Project base model: adds a Jupyter JSON repr and JSON encoders for the
    # pandas / numpy containers used as field values.
    # (A `#` comment is used instead of a docstring because pydantic copies
    # model docstrings into the generated schema description.)

    def _repr_json_(self):
        """Return the model as JSON-compatible data for Jupyter's JSON display.

        Returns:
            The parsed result of `self.json()`, or None when serialisation
            fails, in which case the front end falls back to another repr.
        """
        try:
            return json.loads(self.json())
        except Exception:
            # Best effort only: a display hook must not break the cell, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return None

    class Config:
        # Encoders used by `.json()` for values pydantic cannot serialise itself.
        json_encoders = {
            DataFrame: lambda df: json.loads(df.to_json()),
            np.ndarray: lambda arr: arr.tolist(),
            pd.Series: lambda ser: json.loads(ser.to_json(date_format='iso'))
        }
        

#### Regular Model

In [None]:
class Model(BaseModel):
    # Minimal example: a model that only uses built-in field types.
    integer: int
    string: str

In [None]:
Model(integer=1,string='a')

Model(integer=1, string='a')

#### Model with a DataFrame

In [None]:
# Same pattern with a frame-valued field: the annotation is the pydantic-aware
# DataFrame subclass, so the plain pandas frame passed in below goes through
# DataFrame.validate.
class Model(BaseModel):
    df: DataFrame
Model(df = pd.DataFrame({'a':[1,2,3]}))

Model(df=   a
0  1
1  2
2  3)

In [None]:
Model.schema()

{'title': 'Model',
 'type': 'object',
 'properties': {'df': {'title': 'Df', 'type': 'DataFrame'}},
 'required': ['df']}

## TypedArray

In [None]:
# Canonical NumPy dtype names, offered as tab-completions for `ConstrainedArray[...]`.
# `np.sctypeDict` is the supported name of the alias table; the `np.typeDict`
# alias was deprecated and later removed from NumPy. Only the string keys are
# used, and the names are sorted so the list is the same on every run.
_key_completions_ = sorted({np.dtype(k).name for k in np.sctypeDict if isinstance(k, str)})
_key_completions_

  _key_completions_ = list({np.dtype(k).name for k in np.typeDict.keys() if type(k)==str})


['float16',
 'uint16',
 'bytes',
 'complex128',
 'uint32',
 'object',
 'str',
 'bool',
 'datetime64',
 'complex64',
 'uint8',
 'float64',
 'int64',
 'int32',
 'timedelta64',
 'float32',
 'void',
 'uint64',
 'float128',
 'int16',
 'int8',
 'complex256']

In [None]:
#export 
from typing import Any
from pandas.api.types import pandas_dtype
from pydantic.utils import update_not_none

class TypedArray(pd.Series):
    """Series-based pydantic field type whose values are cast to `dtype`.

    Subclasses override the class-level `dtype`; the default is NumPy's
    object type.
    """
    dtype: Any = np.object_

    @classmethod
    def __get_validators__(cls):
        """Yield the single validator pydantic should run: `validate_array`."""
        yield from (cls.validate_array,)

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        """Add the array label and the inner dtype to `field_schema` in place."""
        schema_extras = {'type': 'Numpy Array', 'inner_type': cls.dtype}
        update_not_none(field_schema, **schema_extras)

    @classmethod
    def validate_array(cls, array):
        """Build a plain `pd.Series` from `array` using the class's dtype.

        `dt.date`, `dt.datetime` and the string 'datetime64' are all mapped to
        the 'datetime64[ns]' dtype before the Series is created.
        """
        target = cls.dtype
        datetime_markers = (dt.date, dt.datetime, 'datetime64')
        if any(target == marker for marker in datetime_markers):
            target = pandas_dtype('datetime64[ns]')
        resolved = pandas_dtype(target)
        return pd.Series(array, dtype=resolved)


#exporti

class ArrayMeta(type):
    """Metaclass that turns `Cls[dtype]` into a dtype-specific TypedArray subclass."""

    def __getitem__(cls, dtype):
        """Create a `TypedArray` subclass named 'ConstrainedArray' bound to `dtype`."""
        namespace = {'dtype': dtype}
        return type('ConstrainedArray', (TypedArray,), namespace)

    def _ipython_key_completions_(cls):
        """Offer the known dtype names as IPython completions for `Cls[...]`."""
        return _key_completions_
    

#exporti 

class ConstrainedArray(DataFrame, metaclass=ArrayMeta):
    # Subscription entry point: `ConstrainedArray[dtype]` is handled by
    # ArrayMeta.__getitem__; the class body itself adds nothing.
    # NOTE(review): the base class is DataFrame although the subscripted types
    # derive from TypedArray (a Series) — confirm this is intended.
    pass

In [None]:
class Model(BaseModel):
    # Each subscript builds a TypedArray subclass whose validator casts the
    # incoming values to the given dtype.
    string: ConstrainedArray[str]
    date: ConstrainedArray[dt.date]
    number: ConstrainedArray[int]

In [None]:
# Plain Python lists are accepted; the columns do not have to be the same
# length here (`date` has a single entry).
model = Model(
    string=['a','b','c'],
    number=[1,2,3],
    date=['1994-06-01']
)
model

Model(string=0    a
1    b
2    c
dtype: object, date=0   1994-06-01
dtype: datetime64[ns], number=0    1
1    2
2    3
dtype: int64)

In [None]:
pd.DataFrame(model.dict())

Unnamed: 0,string,date,number
0,a,1994-06-01,1
1,b,NaT,2
2,c,NaT,3


## Base Frame to add validation to the whole model

In [None]:
from pydantic import root_validator

class BaseFrame(BaseModel):
    # Base model for column-oriented frames: every field is one column, and
    # `index` optionally supplies the row index.
    # (A `#` comment is used instead of a docstring because pydantic copies
    # model docstrings into the generated schema description.)
    index: Any = None
    
#     class Config:
#         allow_population_by_field_name = True
    @root_validator
    def validate_nan(cls,values):
        """Check required columns for NaN and fill optional ones with defaults.

        Args:
            values: the field values that passed per-field validation.

        Returns:
            The frame, after `reset_index()`, as a dict of columns.

        Raises:
            ValueError: when a required column contains NaN values
                (pydantic reports it as a validation error).
        """
        # Work on a copy so the dict handed in by pydantic is not mutated, and
        # tolerate a missing 'index' entry.
        columns = dict(values)
        index = columns.pop('index', None)
        df = pd.DataFrame(index=index, data=columns)

        for column,field in cls.__fields__.items():
            if column not in df.columns:
                # 'index' itself, or a field whose own validation already
                # failed and is therefore absent; its error is reported by
                # pydantic, so do not mask it with a KeyError here.
                continue
            if field.required:
                # An explicit raise (not `assert`) so the check survives `python -O`.
                if df[column].isna().any():
                    raise ValueError(f"required column {column} has nan values")
            elif field.default is not None:
                # `is not None` so falsy defaults such as 0, '' or False are applied too.
                df[column] = df[column].fillna(value = field.default)
        return df.reset_index().to_dict()
    @property
    def df(self):
        """Rebuild a pandas DataFrame from the model's columns and index."""
        return pd.DataFrame(
            index=self.index,
            data = self.dict(exclude={'index'})
        )

In [None]:
from pydantic import Field

class Model(BaseFrame):
    # A non-None default makes the column optional; BaseFrame's root validator
    # uses it to fill missing values.
    string: ConstrainedArray[str] = Field('default')
    # NOTE(review): the description says "required", but a default of None is
    # supplied — confirm which one is intended.
    date: ConstrainedArray[dt.date] = Field(None,description='a required date field')
    number: ConstrainedArray[int] = None

In [None]:
# The `date` column mixes an ISO date string with a `date` object.
m = Model(
    index = [0,1],
    string = ['a','b'],
    date = ['1994-06-01',dt.date.today()],
    number = [1,2]
)

#### What about Aliases?

In [None]:
class Model(BaseFrame):
    # Aliases let the input use column labels that are not valid Python
    # identifiers while the model keeps attribute-friendly field names.
    string:ConstrainedArray[str] = Field(alias='String Column')
    numbers:ConstrainedArray[int] = Field(alias='Number Column')

In [None]:
# Input is keyed by alias; `.df` exposes the data under the field names.
model = Model.parse_obj({
    "String Column":['a','b','c'],
    "Number Column":[1,2,3]
}
)
model.df

Unnamed: 0,string,numbers
0,a,1
1,b,2
2,c,3


In [None]:
# Round trip: dump with aliases so the frame carries the original column labels.
df = pd.DataFrame(model.dict(by_alias=True))
df

Unnamed: 0,index,String Column,Number Column
0,0,a,1
1,1,b,2
2,2,c,3


In [None]:
Model.parse_obj(df)

Model(index={0: 0, 1: 1, 2: 2}, string={0: 'a', 1: 'b', 2: 'c'}, numbers={0: 1, 1: 2, 2: 3})

### Validators

In [None]:
3%3

0

In [None]:
from pydantic import validator

class Model(BaseFrame):
    # FizzBuzz-style example: `fizz` is derived from `numbers` by a validator.
    # `buzz` has no validator, so it keeps its default.
    numbers: ConstrainedArray[int]
    fizz: bool = None
    buzz: bool = None
    
    @validator('fizz',always=True)
    def _is_fizz(cls,v,values):
        """Flag every number that is divisible by 3.

        `always=True` makes the validator run even though `fizz` is not
        supplied; `values` holds the already-validated `numbers` column.
        """
        return values['numbers'] % 3 == 0

In [None]:
Model(numbers=range(1,100)).df

Unnamed: 0,numbers,fizz,buzz
0,1,False,
1,2,False,
2,3,True,
3,4,False,
4,5,False,
...,...,...,...
94,95,False,
95,96,True,
96,97,False,
97,98,False,


### Delegate pandas functions to create custom fields

In [None]:
from pydantic_pandas.utils import delegates

In [None]:
class PandasDateTime:
    """pydantic field type that parses its input with `pd.to_datetime`.

    `kwargs` holds the keyword arguments forwarded to `pd.to_datetime`;
    subclasses override it to customise parsing.
    """
    kwargs: dict = {}

    @classmethod
    def _to_datetime(cls, array):
        """Convert `array` with `pd.to_datetime`, applying the class's kwargs."""
        options = cls.kwargs
        return pd.to_datetime(array, **options)

    @classmethod
    def __get_validators__(cls):
        """Yield the single validator pydantic should run."""
        yield from (cls._to_datetime,)

class PandasNumeric:
    """pydantic field type that parses its input with `pd.to_numeric`.

    `kwargs` holds the keyword arguments forwarded to `pd.to_numeric`;
    subclasses override it to customise parsing.
    """
    kwargs: dict = {}

    @classmethod
    def _to_numeric(cls, array):
        """Convert `array` with `pd.to_numeric`, applying the class's kwargs."""
        options = cls.kwargs
        return pd.to_numeric(array, **options)

    @classmethod
    def __get_validators__(cls):
        """Yield the single validator pydantic should run."""
        yield from (cls._to_numeric,)


In [None]:

@delegates(pd.to_datetime)
def pandas_datetime(default_value=...,**kwargs):
    """Build a `PandasDateTime` subclass that parses with the given `kwargs`.

    The keyword arguments are stored on the new type and forwarded to
    `pd.to_datetime` by its validator. `default_value` is accepted but not
    used by this function.
    """
    namespace = {'kwargs': kwargs}
    return type('PandasDateTimeType', (PandasDateTime,), namespace)

@delegates(pd.to_numeric)
def pandas_numeric(default_value=...,**kwargs):
    """Build a `PandasNumeric` subclass that parses with the given `kwargs`.

    The keyword arguments are stored on the new type and forwarded to
    `pd.to_numeric` by its validator. `default_value` is accepted but not
    used by this function.
    """
    namespace = {'kwargs': kwargs}
    return type('PandasNumericType', (PandasNumeric,), namespace)

In [None]:
# `pd.Categorical` requires its `values` argument; calling it with no
# arguments raises TypeError, so a small sample is passed instead.
pd.Categorical(['a', 'b', 'a'])

In [None]:
# Monthly period containing today; 'M' is the canonical (upper-case) alias
# that pandas itself reports in the repr.
pd.Period(dt.date.today(),freq='M')

Period('2021-09', 'M')

In [None]:
class Model(BaseFrame):
    # The factory calls return field types whose validators forward the given
    # keyword arguments to pd.to_datetime / pd.to_numeric.
    date: pandas_datetime(errors='coerce') = None
    numbers: pandas_numeric()

In [None]:
# 'error' is not a parseable date and `date` was declared with errors='coerce'.
# `other_numbers` is not a declared field of Model.
Model(
    date=['1994-06-01','error'],
    numbers = [1/9,np.e],
    other_numbers = [1/9,np.e],
)

Model(index={0: 0, 1: 1}, date={0: Timestamp('1994-06-01 00:00:00'), 1: NaT}, numbers={0: 0.1111111111111111, 1: 2.718281828459045})

## Constrained Frame

In [None]:
#exporti 

from typing import Dict, Any, Type
from pydantic import ValidationError
def update_not_none(mapping: Dict[Any, Any], **update: Any) -> None:
    """Copy every keyword whose value is not None into `mapping`, in place.

    Existing keys are overwritten; keywords given as None are ignored.
    """
    for key, value in update.items():
        if value is None:
            continue
        mapping[key] = value

In [None]:
#export 

class TypedFrame(DataFrame):
    """DataFrame field type that can check column names and validate rows.

    `columns` lists the column names that must be present and `row_model` is
    the pydantic model every row is parsed with. Both default to None, which
    disables the corresponding check.
    """
    columns: Optional[list] = None
    row_model: Optional[Type[BaseModel]] = None

    @classmethod
    def __get_validators__(cls):
        """Yield the column check first, then the row validation."""
        yield from (cls.validate_columns, cls.validate_rows)

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        """Add the required columns and the row model's schema, when set."""
        row_schema = cls.row_model.schema() if cls.row_model else None
        update_not_none(field_schema, columns=cls.columns, row_model=row_schema)

    @classmethod
    def validate_columns(cls, df):
        """Return `df` unchanged, raising ValueError on the first missing column."""
        for name in cls.columns or ():
            if name in df.columns:
                continue
            raise ValueError(f"{name} not found in columns index: {df.columns}")
        return df

    @classmethod
    def validate_rows(cls, df):
        """Parse every row with `row_model` and rebuild the frame from the results.

        Without a row model `df` is returned as is. A row that fails
        validation is dropped when the row model's `Config.on_error` equals
        'skip'; otherwise the ValidationError propagates.
        """
        row_model = cls.row_model
        if not row_model:
            return df

        rows = []
        for record in df.to_dict('records'):
            try:
                rows.append(row_model.parse_obj(record).dict())
            except ValidationError:
                # Looked up lazily so Config is only touched when a row fails.
                if getattr(row_model.Config, 'on_error', None) == 'skip':
                    continue
                raise
        return pd.DataFrame.from_records(rows)

In [None]:
#exporti

class FrameMeta(type):
    """Metaclass that turns `Cls[constraint]` into a constrained TypedFrame subclass."""

    def __getitem__(self, constraint):
        """Build a `TypedFrame` subclass from a subscript constraint.

        Args:
            constraint: column names (a tuple or list, or a single string for
                one column), or a row model, recognised by its
                `__get_validators__` attribute.

        Returns:
            A new `TypedFrame` subclass named 'ConstrainedFrame' with either
            `columns` or `row_model` set.

        Raises:
            NotImplementedError: for any other kind of constraint.
        """
        if isinstance(constraint, str):
            # `Cls['a']` arrives as a bare string rather than a 1-tuple.
            constraint = (constraint,)
        if isinstance(constraint, (tuple, list)):
            return type('ConstrainedFrame', (TypedFrame,), {'columns': tuple(constraint)})
        if hasattr(constraint,'__get_validators__'):
            return type('ConstrainedFrame', (TypedFrame,), {'row_model': constraint})
        raise NotImplementedError(
            f"The constraint you provided is not compatible with the 'TypedFrame' Object. {constraint} "
        )
    

#exporti 

class ConstrainedFrame(DataFrame, metaclass=FrameMeta):
    # Subscription entry point: `ConstrainedFrame[...]` is handled by
    # FrameMeta.__getitem__; the class body itself adds nothing.
    pass

The `conframe` function will allow us to create a constrained frame type with multiple parameters at the expense of some operational overhead and some extra typing. This pattern is lifted from Pydantic functions like "constr" and "confloat".

In [None]:
#exporti

def conframe(
    *,
    columns: list = None,
    row_model: Type[BaseModel] = None
) -> Type[DataFrame]:
    """Create a frame type constrained by column names and/or a row model.

    Args:
        columns: column names that must be present, or None for no check.
        row_model: pydantic model used to parse every row, or None.

    Returns:
        A new `TypedFrame` subclass carrying both constraints.
    """
    # use kwargs then define conf in a dict to aid with IDE type hinting
    namespace = dict(columns=columns,row_model=row_model)
    # The base must be TypedFrame: it is the class that yields the column and
    # row validators (and it is what FrameMeta.__getitem__ builds on). With
    # ConstrainedFrame as the base the constraints were stored but never run.
    return type('ConstrainedFrameValue', (TypedFrame,), namespace)

### Example uses for `ConstrainedFrame`

#### DataFrames as Model Attributes

In [None]:
# Sample input: the 'value' column mixes integers with the string '3'.
df = pd.DataFrame(
    {"metric":['a','b','c'],
     "value":[1,2,'3']
    }
)

class Record(BaseModel):
    # Row model: one instance per DataFrame row.
    metric: str
    value: float
    # Optional and absent from the sample frame above.
    not_there: Optional[str]

class Model(BaseModel):
    # a dataframe that is validated using a row model
    validate_with_model: ConstrainedFrame[Record]
    # a dataframe that is only checked for the presence of the named columns
    validate_with_columns: ConstrainedFrame['metric','value']
    # NOTE(review): here `conframe(...)` is the attribute's default value and
    # the annotation is the plain DataFrame — confirm it was not meant to be
    # the annotation itself.
    validate_with_conframe: DataFrame = conframe(
        columns=('metric','value'),
        row_model=Record
    )

# The same raw frame is passed to every field; each field applies its own
# constraint.
model = Model(
    validate_with_columns=df,
    validate_with_model=df,
    validate_with_conframe=df
)
print(model.validate_with_model.dtypes['value']) # the `Record` Model parse input values into floats
print(model.validate_with_columns.dtypes['value']) # validating just the column names took the raw values.

float64
object


In [None]:
model

Model(validate_with_model=  metric  value not_there
0      a    1.0      None
1      b    2.0      None
2      c    3.0      None, validate_with_columns=  metric value
0      a     1
1      b     2
2      c     3)

#### Usage with the `validate_arguments` Decorator

In [None]:
from pydantic import validate_arguments,Field

class PlayerBattingStats(BaseModel):
    # Row model for the batting table: aliases are the table's column
    # headers, field names are the column names after validation.
    player: str = Field(alias='PLAYER')
    at_bats: int = Field(alias='AB')
    games: int = Field(alias='G')
    home_runs: int = Field(alias='HR')
    
    class Config:
        # Read by TypedFrame.validate_rows: rows that fail validation are
        # dropped instead of raising.
        on_error = 'skip'

In [None]:
@validate_arguments
def compute_hr_per_game(df:ConstrainedFrame[PlayerBattingStats],precision=3):
    """Add a home-runs-per-game column to a batting-stats frame.

    `validate_arguments` runs the `ConstrainedFrame[PlayerBattingStats]`
    validators on `df` before the body executes.

    Args:
        df: frame with `home_runs` and `games` columns.
        precision: number of decimals the ratio is rounded to.

    Returns:
        The same frame object with the extra column added.
    """
    # NOTE(review): the column is named 'batting_average' although it holds
    # home_runs / games — confirm the intended name.
    df['batting_average'] = (df['home_runs']/df['games']).round(precision)
    return df

This is our raw DataFrame.

In [None]:
# Fetches the table live from ESPN (network access required); the second row
# of the HTML table is used as the header and the first column as the index.
df = pd.read_html("https://www.espn.com/mlb/history/leaders",header=1,index_col=0)[0]
df.iloc[9:15]

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
,Babe Ruth,22,2503,8399,2174,2873,506,136,714,1983,2056,1330,123,117,.342
,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
11.0,Bill Terry,14,1721,6428,1120,2193,373,112,154,1078,537,449,56,6,.341
,Pete Browning,13,1183,4820,954,1646,295,85,46,0,466,167,258,0,.341
,Willie Keeler,19,2123,8591,1719,2932,241,145,33,0,524,36,495,0,.341
14.0,Lou Gehrig,17,2164,8001,1888,2721,534,163,493,1995,1508,790,102,101,.340


Some headers are mixed into the table and yield bad data. Those rows are removed during validation because they don't conform to the row model.

In [None]:
compute_hr_per_game(df).iloc[9:15]

Unnamed: 0,player,at_bats,games,home_runs,batting_average
9,Babe Ruth,8399,2503,714,0.285
10,Bill Terry,6428,1721,154,0.089
11,Pete Browning,4820,1183,46,0.039
12,Willie Keeler,8591,2123,33,0.016
13,Lou Gehrig,8001,2164,493,0.228
14,George Sisler,8267,2055,102,0.05


In [None]:
from pydantic import HttpUrl,validator

class BattingLeaderRecord(BaseModel):
    # Row model for the batting-average leaders table.
    rank: int = Field(alias='RK')
    Name: str
    AVG: float
        
    @validator('Name')
    def _remove_team_from_name(cls,v):
        """Strip the team code(s) appended to the player name.

        The table glues the team abbreviation onto the name, e.g.
        'Trea TurnerWSH'. Codes are two or three capital letters and a player
        can carry several separated by '/', e.g. 'Adam FrazierPIT/SD', so a
        fixed three-character cut leaves part of the suffix behind. A name
        without such a suffix is returned unchanged.
        """
        import re  # local import: keeps this cell self-contained

        return re.sub(r'[A-Z]{2,3}(?:/[A-Z]{2,3})*$', '', v)

In [None]:
class BattingAverageLeaders(BaseModel):
    # Snapshot of the ESPN batting-average leaders page: the raw table, the
    # row-validated table, the source URL and the date it was taken.
    url: HttpUrl = "https://www.espn.com/mlb/stats/player/_/view/batting/table/batting/sort/avg/dir/desc"
    raw_data: DataFrame
    normalized_data: ConstrainedFrame[BattingLeaderRecord]
    timestamp:dt.date=Field(
        default_factory=lambda: dt.date.today()
    )
    
    def __init__(
        self,
        url = "https://www.espn.com/mlb/stats/player/_/view/batting/table/batting/sort/avg/dir/desc",
    ):
        """Download the tables found at `url` and populate the model.

        The first two HTML tables on the page are concatenated column-wise.
        Requires network access.
        """
        raw_data = pd.concat(pd.read_html(url)[0:2],axis=1)
        super().__init__(
            # Pass the URL through so the field records the page that was
            # actually fetched; previously it always held the default.
            url=url,
            raw_data=raw_data,
            normalized_data=raw_data
        )
        
        

In [None]:
BattingAverageLeaders

__main__.BattingAverageLeaders

In [None]:
leaders = BattingAverageLeaders()
leaders

BattingAverageLeaders(url='https://www.espn.com/mlb/stats/player/_/view/batting/table/batting/sort/avg/dir/desc', raw_data=    RK                      Name POS   GP   AB    R    H    AVG  2B  3B  HR  \
0    1    Frank SchwindelOAK/CHC  1B   60  224   40   76  0.339  18   1  14   
1    2            Trea TurnerWSH  SS  143  576  100  187  0.325  33   3  25   
2    3              Juan SotoWSH  LF  149  494  110  157  0.318  20   2  29   
3    4           Yuli GurrielHOU  1B  142  521   80  165  0.317  31   0  14   
4    5  Vladimir Guerrero Jr.TOR  DH  157  588  120  184  0.313  27   1  46   
5    6       Michael BrantleyHOU  LF  118  458   68  143  0.312  29   2   8   
6    7         Starling MarteMIA  CF  116  454   86  141  0.311  27   2  12   
7    8       Nick CastellanosCIN  RF  135  523   94  162  0.310  37   1  33   
8    9           Bryce HarperPHI  RF  137  474   99  146  0.308  40   1  34   
9   10        Adam FrazierPIT/SD  2B  150  561   82  172  0.307  34   5   5   
10  11  

In [None]:
# `timestamp` is declared as `dt.date` and filled by `dt.date.today()`, so it
# is a date object, not a string.
assert isinstance(leaders.timestamp, dt.date)
display(leaders.raw_data.head())
display(leaders.normalized_data.head())

In [None]:
#export 

class TypedArray(pd.Series):
    """pandas Series that pydantic can use as a field type."""

    @classmethod
    def __get_validators__(cls):
        """Yield the single validator pydantic should run: `validate_type`."""
        yield cls.validate_type
        
    @classmethod
    def __modify_schema__(cls,field_schema:Dict)->None:
        """Set the schema 'type' entry in place; nothing is returned."""
        field_schema.update(
            type="<Pandas Series>"
        )
        
    @classmethod
    def validate_type(cls,val):
        """Coerce `val` by passing it to this class's constructor.

        The debugging prints were removed so validation no longer writes to
        stdout.
        """
        return cls(val)
    
class ArrayMeta(type):
    """Metaclass that turns `Cls[t]` into a TypedArray subclass tagged with `t`."""

    def __getitem__(cls, t):
        """Create a `TypedArray` subclass named 'Array' with `inner_type` set to `t`."""
        namespace = {'inner_type': t}
        return type('Array', (TypedArray,), namespace)
    
class Array(pd.Series,metaclass=ArrayMeta):
    # Subscription entry point: `Array[t]` is handled by
    # ArrayMeta.__getitem__; the class body itself adds nothing.
    pass

def conarray(*args,**kwargs) -> Type[TypedArray]:
    """Create a `TypedArray` subclass configured by the given arguments.

    Args:
        *args: optionally, the inner type as the first positional argument.
            It is stored as `inner_type`, the attribute `ArrayMeta` uses,
            unless `inner_type` is also given by keyword. Previously
            positional arguments were silently dropped.
        **kwargs: class attributes for the new type.

    Returns:
        A new `TypedArray` subclass named 'ConstrainedArray'.
    """
    namespace = dict(kwargs)
    if args:
        namespace.setdefault('inner_type', args[0])
    return type('ConstrainedArray',(TypedArray,), namespace)

In [None]:
#export 

class PandasDataFrame(DataFrame):
    """
    Pandas DataFrame Validation

    Accepts only non-empty pandas DataFrames and returns them unchanged.
    """

    @classmethod
    def __get_validators__(cls):
        # one or more validators may be yielded which will be called in the
        # order to validate the input, each validator will receive as an input
        # the value returned from the previous validator
        yield cls.validate

    @classmethod
    def __modify_schema__(cls, field_schema):
        # __modify_schema__ should mutate the dict it receives in place,
        # the returned value will be ignored
        field_schema.update(
            type='Pandas DataFrame'
        )

    @classmethod
    def validate(cls, v):
        """Return `v` unchanged if it is a non-empty pandas DataFrame.

        Raises:
            TypeError: `v` is not a `pd.DataFrame`.
            ValueError: `v` is an empty DataFrame.
        """
        if not isinstance(v, pd.DataFrame):
            raise TypeError(f'Dataframe required. Got {type(v)} instead')
        if v.empty:
            raise ValueError("Dataframe can't be empty")
        return v

    # The former `__init__` only printed a debug message before delegating to
    # the parent constructor, so it was removed; construction is inherited.

In [None]:
#exporti 

# Forward-reference placeholder for the name; it is rebound to the real class
# when `DataFrameModelMetaclass` is defined later in the notebook.
DataFrameModelMetaclass = ForwardRef('DataFrameModelMetaclass')

def extract_ddf_from_model_fields(model:ModelMetaclass) -> 'PandasDataFrame':
    """Summarise a model's fields as a two-row frame.

    Each column is a field name; the 'required' row holds the field's
    `required` flag and the 'type' row its `type_`. Used as the default frame
    for the `_repr_html_()` method of DataFrameModelMetaclass.
    """
    summary = {
        field_name: [field.required, field.type_]
        for field_name, field in model.__fields__.items()
    }
    return pd.DataFrame(summary, index=['required','type'])

In [None]:
#export 

class DataFrameModelMetaclass(ModelMetaclass):
    """Model metaclass that turns every annotated field into an array field.

    Each annotation in the class body is replaced with `conarray(annotation)`
    before pydantic builds the model, and a field-summary frame is attached
    to the model as `_default_df_` for HTML display.
    """

    def __new__(cls,name,bases,dct):
        """Create the model class with its annotations rewritten as array types."""
        annotations = dct.get('__annotations__')
        if annotations:
            dct['__annotations__'] = {
                field_name: conarray(annotation)
                for field_name, annotation in annotations.items()
            }
        model = ModelMetaclass.__new__(cls,name,bases,dct)
        model._default_df_ = extract_ddf_from_model_fields(model)
        return model

    def _repr_html_(cls):
        """Render the field-summary frame when the class itself is displayed."""
        return cls._default_df_.to_html()

class BaseFrame(PydanticBaseModel,PandasDataFrame,metaclass=DataFrameModelMetaclass):
    """Pydantic model whose validated fields also populate a pandas DataFrame."""
    def __init__(self,*args,**kwargs):
        """Validate the keyword arguments, then build the frame from them.

        Positional arguments are accepted but not used.
        """
        super(BaseFrame,self).__init__(**kwargs)
        # Start the lookup after PandasDataFrame in the MRO and initialise the
        # frame from the validated field values.
        super(PandasDataFrame,self).__init__(self.dict())
    
    @root_validator()
    def _base_frame_root_validator(cls,values):
        """Whole-model validation hook; currently passes `values` through."""
        return values
    

In [None]:
class MovieModel(BaseFrame):
    # DataFrameModelMetaclass rewrites each annotation with conarray(...), so
    # these fields are validated as TypedArray (Series) columns, not scalars.
    title: str
    year: int

In [None]:
MovieModel

In [None]:
#MovieModel(title='cocktail',year=1988)

In [None]:
# One list per column; the lists are aligned by position.
movies = MovieModel(
        title=['Cocktail','Cheers','That Thing You Do!'],
        year=[1988,1982,1996]
    )


In [None]:
movies

In [None]:
# Deliberate demonstration of a limitation: the exception is caught and its
# message printed so the notebook keeps running.
try:
    MovieModel.from_records(movies.to_dict('records'))
except Exception as e:
    print("I wish this worked")
    print(e)

## How about subclassing?

In [None]:
class DetailedMovieModel(MovieModel):
    # Adds two columns on top of the ones declared by MovieModel.
    review: str
    stars: int

In [None]:
# Reuse the columns of `movies`: `include` restricts the dump to MovieModel's
# declared fields, which are then passed on as keyword arguments.
DetailedMovieModel(
    review=['Perfect','Amazing','The Best'],
    stars=[10,10,10],
    **movies.dict(
        include=set(movies.__fields__.keys()),
    )
)

In [None]:
!nbdev_build_lib