In [None]:
# default_exp core

# pydantic-pandas

> API details.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#exporti

from pandas.core.frame import DataFrame as PandasDataFrame
from pydantic import (
    validator,
    root_validator
)
from pydantic import BaseModel as PydanticBaseModel
from pydantic.main import ModelMetaclass
from pydantic_pandas.default_standard_lib import *
from pydantic_pandas.utils import delegates
from IPython.display import JSON
from typing import Any
from pandas.api.types import pandas_dtype
from pydantic.utils import update_not_none
from pydantic import (
    root_validator,
    ValidationError
)

## DataFrame

> Subclass the pandas `DataFrame` so that it plays nicely with pydantic.

In [None]:
#export 

class DataFrame(PandasDataFrame):
    """pandas ``DataFrame`` subclass that can be used as a pydantic field type.

    It provides the two hooks defined below: a validator generator
    (``__get_validators__``) and a schema modifier (``__modify_schema__``).
    """

    @classmethod
    def validate(cls, v):
        """Build an instance of this class from ``v`` and return it."""
        frame = cls(v)
        return frame

    @classmethod
    def __get_validators__(cls):
        """Yield the single validator used for this type: ``cls.validate``."""
        yield cls.validate

    @classmethod
    def __modify_schema__(cls, field_schema):
        """Set ``'type': 'DataFrame'`` on ``field_schema`` in place."""
        field_schema['type'] = 'DataFrame'

## Base Model

> - `_repr_json_` for rich display in Jupyter
> - JSON encoders for `DataFrame`, `np.ndarray` and `pd.Series`

In [None]:
#export 

class BaseModel(PydanticBaseModel):
    # Project base model: adds a JSON repr hook for notebooks and JSON
    # encoders for the pandas / numpy containers used as field values.
    # (Kept as a comment rather than a docstring so that generated schemas
    # are not given a description.)

    def _repr_json_(self):
        """Return the model as JSON-compatible Python data for rich display.

        The value is ``json.loads(self.json())``. This is a best-effort
        display hook: if serialisation fails, ``None`` is returned instead
        of raising.
        """
        try:
            return json.loads(self.json())
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except``:
            # KeyboardInterrupt and SystemExit must still propagate.
            return None

    class Config:
        json_encoders = {
            # Round-trip through pandas' own JSON writer to get plain Python data.
            DataFrame: lambda df: json.loads(df.to_json()),
            np.ndarray: lambda arr: arr.tolist(),
            # Series are written with ISO-formatted dates.
            pd.Series: lambda ser: json.loads(ser.to_json(date_format='iso'))
        }
        

#### Regular Model

In [None]:
# Minimal model with two scalar fields, built on the project's BaseModel.
class Model(BaseModel):
    integer: int
    string: str

In [None]:
# Construct the model; the cell displays the resulting instance.
Model(integer=1,string='a')

Model(integer=1, string='a')

#### Model with a DataFrame

In [None]:
# Model with a single field typed as the pydantic-aware DataFrame subclass.
class Model(BaseModel):
    df: DataFrame
# The plain pandas DataFrame passed here is handled by DataFrame.validate,
# which returns cls(v).
Model(df = pd.DataFrame({'a':[1,2,3]}))

Model(df=   a
0  1
1  2
2  3)

In [None]:
# Inspect the generated schema; the `df` field is reported with 'type': 'DataFrame'.
Model.schema()

{'title': 'Model',
 'type': 'object',
 'properties': {'df': {'title': 'Df', 'type': 'DataFrame'}},
 'required': ['df']}

## TypedArray

In [None]:
#exporti
_key_completions_ = list({np.dtype(k).name for k in np.typeDict.keys() if type(k)==str})

  _key_completions_ = list({np.dtype(k).name for k in np.typeDict.keys() if type(k)==str})


In [None]:
#export 

class TypedArray(pd.Series):
    """``pd.Series`` subclass usable as a pydantic field type with a target dtype.

    ``dtype`` is a class attribute (``np.object_`` here); subclasses set it
    to the dtype that ``validate_array`` should coerce input to.
    """
    dtype: Any = np.object_

    @classmethod
    def validate_array(cls, array):
        """Return ``array`` as a plain ``pd.Series`` with this class's dtype.

        ``dt.date``, ``dt.datetime`` and the string ``'datetime64'`` are all
        mapped to ``datetime64[ns]`` before the series is built.
        """
        requested = cls.dtype
        datetime_aliases = (dt.date, dt.datetime, 'datetime64')

        if any(requested == alias for alias in datetime_aliases):
            requested = pandas_dtype('datetime64[ns]')

        resolved = pandas_dtype(requested)
        return pd.Series(array, dtype=resolved)

    @classmethod
    def __get_validators__(cls):
        """Yield ``cls.validate_array`` as the only validator."""
        yield cls.validate_array

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        """Pass ``type`` and ``inner_type`` entries for ``field_schema`` to ``update_not_none``."""
        schema_additions = {'type': 'Numpy Array', 'inner_type': cls.dtype}
        update_not_none(field_schema, **schema_additions)


#exporti

class ArrayMeta(type):
    """Metaclass that makes a class subscriptable with a dtype, e.g. ``Cls[int]``."""

    def __getitem__(self, dtype):
        """Return a new ``TypedArray`` subclass named ``'ConstrainedArray'`` whose ``dtype`` is ``dtype``."""
        bases = (TypedArray,)
        namespace = {'dtype': dtype}
        return type('ConstrainedArray', bases, namespace)

    def _ipython_key_completions_(self):
        """Return the module-level ``_key_completions_`` list."""
        return _key_completions_
    

#exporti 

class ConstrainedArray(DataFrame, metaclass=ArrayMeta):
    """Subscriptable handle: ``ConstrainedArray[dtype]`` is resolved by ``ArrayMeta.__getitem__``."""

In [None]:
# Each field type is produced by ConstrainedArray[...] for the given dtype.
class Model(BaseModel):
    string: ConstrainedArray[str]
    date: ConstrainedArray[dt.date]
    number: ConstrainedArray[int]

In [None]:
# Plain lists are passed in; note that `date` has one element while the
# other two columns have three.
model = Model(
    string=['a','b','c'],
    number=[1,2,3],
    date=['1994-06-01']
)
model

Model(string=0    a
1    b
2    c
dtype: object, date=0   1994-06-01
dtype: datetime64[ns], number=0    1
1    2
2    3
dtype: int64)

In [None]:
# Build a DataFrame from the model's dict; in the output the shorter
# `date` column is padded with NaT.
pd.DataFrame(model.dict())

Unnamed: 0,string,date,number
0,a,1994-06-01,1
1,b,NaT,2
2,c,NaT,3


## Base Frame to add validation to the whole model

In [None]:
#export

class BaseFrame(BaseModel):
    # Column-oriented model: every field other than `index` is treated as a
    # column of one DataFrame, and `validate_nan` checks the columns together.
    # (Kept as a comment rather than a docstring so that generated schemas
    # are not given a description.)
    index: Any = None

#     class Config:
#         allow_population_by_field_name = True

    # skip_on_failure=True: a field that already failed validation is absent
    # from `values`, so `df[column]` below would raise a KeyError instead of
    # letting pydantic report the original validation error.
    @root_validator(skip_on_failure=True)
    def validate_nan(cls, values):
        """Check required columns for NaN and fill NaN in columns that have a default.

        Args:
            values: the validated field values; ``'index'`` is popped and
                used as the frame index, the rest become the columns.

        Returns:
            ``df.reset_index().to_dict()`` for the assembled frame.

        Raises:
            AssertionError: if a required column contains NaN values.
        """
        df = pd.DataFrame(
            index=values.pop('index'),
            data=values,
        )

        for column, field in cls.__fields__.items():
            if column not in df.columns:
                # e.g. `index`, which was popped above and is not a data column.
                continue
            if field.required:
                assert df[column].isna().sum() == 0, f"required column {column} has nan values"
            elif field.default is not None:
                # `is not None` rather than truthiness, so falsy defaults
                # such as 0, False or '' are still used to fill gaps.
                df[column] = df[column].fillna(value=field.default)
        return df.reset_index().to_dict()

    @property
    def df(self):
        """Return the model's columns as a ``pd.DataFrame`` built with ``index=self.index``."""
        # NOTE(review): `validate_nan` stores `index` in `DataFrame.to_dict()`
        # form; presumably pandas then uses the dict keys (row positions)
        # rather than the original labels here - verify with a custom index.
        return pd.DataFrame(
            index=self.index,
            data=self.dict(exclude={'index'})
        )

In [None]:
from pydantic import Field

# `string` has the default 'default', `date` is required and `number` is optional.
class Model(BaseFrame):
    string: ConstrainedArray[str] = Field('default')
    # `...` (Ellipsis) marks the field as required, matching its description;
    # the previous default of `None` made the "required" field optional.
    date: ConstrainedArray[dt.date] = Field(..., description='a required date field')
    number: ConstrainedArray[int] = None

In [None]:
# All columns and an explicit index are supplied; `date` mixes an ISO
# string with a `dt.date` object.
m = Model(
    index = [0,1],
    string = ['a','b'],
    date = ['1994-06-01',dt.date.today()],
    number = [1,2]
)

#### What about Aliases?

In [None]:
# Each field declares an alias that matches the external column name.
class Model(BaseFrame):
    string:ConstrainedArray[str] = Field(alias='String Column')
    numbers:ConstrainedArray[int] = Field(alias='Number Column')

In [None]:
# Populate the model from a dict keyed by the aliases; `.df` shows the
# columns under their field names.
model = Model.parse_obj({
    "String Column":['a','b','c'],
    "Number Column":[1,2,3]
}
)
model.df

Unnamed: 0,string,numbers
0,a,1
1,b,2
2,c,3


In [None]:
# Export with by_alias=True so the columns carry the alias names again.
df = pd.DataFrame(model.dict(by_alias=True))
df

Unnamed: 0,index,String Column,Number Column
0,0,a,1
1,1,b,2
2,2,c,3


In [None]:
# Round trip: parse the alias-named DataFrame back into the model.
Model.parse_obj(df)

Model(index={0: 0, 1: 1, 2: 2}, string={0: 'a', 1: 'b', 2: 'c'}, numbers={0: 1, 1: 2, 2: 3})

### Validators

In [None]:
from pydantic import validator

# FizzBuzz as a frame: `fizz`, `buzz` and `fizz_buzz` are computed from
# `numbers` by validators rather than supplied by the caller.
class Model(BaseFrame):
    numbers: ConstrainedArray[int]
    fizz: bool = None
    buzz: bool = None
    fizz_buzz: bool = None
    
    # always=True: the validator runs even when no value is passed for the field.
    @validator('fizz',always=True)
    def _is_fizz(cls,v,values):
        """Return, per element of ``values['numbers']``, whether it is divisible by 3."""
        return values['numbers'].apply(lambda x: x%3==0)
    
    @validator('buzz',always=True)
    def _is_buzz(cls,v,values):
        """Return, per element of ``values['numbers']``, whether it is divisible by 5."""
        return values['numbers'].apply(lambda x: x%5==0)
    
    @validator('fizz_buzz',always=True)
    def _is_fizz_buzz(cls,v,values):
        """Return, per row, whether both ``fizz`` and ``buzz`` are set."""
        # Stack the two flag columns as rows and sum down each column:
        # a total of 2 means both flags are true.
        return pd.DataFrame([values['fizz'],values['buzz']]).sum(axis=0)==2
    

In [None]:
# Only `numbers` is supplied; the other three columns come from the validators.
df = Model(numbers=range(1,100)).df
df.head()

Unnamed: 0,numbers,fizz,buzz,fizz_buzz
0,1,False,False,False
1,2,False,False,False
2,3,True,False,False
3,4,False,False,False
4,5,False,True,False


In [None]:
# Row at position 14 holds the number 15, so all three flags are True.
df.iloc[14]

numbers        15
fizz         True
buzz         True
fizz_buzz    True
Name: 14, dtype: object

In [None]:
# A validator may also return a scalar: here `sum` is the total of `numbers`.
class Model(BaseFrame):
    numbers: ConstrainedArray[int]
    sum: ConstrainedArray[int] = None
    
    @validator('sum',always=True)
    def sum_numbers(cls,v,values):
        """Return the sum of ``values['numbers']``."""
        return values['numbers'].sum()

In [None]:
# In the output the `sum` column totals 18 (the scalar 6 repeated on each of the three rows).
Model(numbers=[1,2,3]).df.sum()

numbers     6
sum        18
dtype: int64

### Delegate pandas functions to create custom fields

In [None]:
from pydantic_pandas.utils import delegates

In [None]:
class PandasDateTime:
    """Base for field types that parse their input with ``pd.to_datetime``.

    ``kwargs`` holds the keyword arguments forwarded to ``pd.to_datetime``.
    """
    kwargs:dict = {}

    @classmethod
    def _to_datetime(cls, array):
        """Convert ``array`` with ``pd.to_datetime`` using this class's ``kwargs``."""
        options = cls.kwargs
        return pd.to_datetime(array, **options)

    @classmethod
    def __get_validators__(cls):
        """Yield ``cls._to_datetime`` as the only validator."""
        yield cls._to_datetime

class PandasNumeric:
    """Base for field types that parse their input with ``pd.to_numeric``.

    ``kwargs`` holds the keyword arguments forwarded to ``pd.to_numeric``.
    """
    kwargs:dict = {}

    @classmethod
    def _to_numeric(cls, array):
        """Convert ``array`` with ``pd.to_numeric`` using this class's ``kwargs``."""
        options = cls.kwargs
        return pd.to_numeric(array, **options)

    @classmethod
    def __get_validators__(cls):
        """Yield ``cls._to_numeric`` as the only validator."""
        yield cls._to_numeric


In [None]:

@delegates(pd.to_datetime)
def pandas_datetime(default_value=...,**kwargs):
    """Create a ``PandasDateTime`` subclass whose ``kwargs`` attribute is ``kwargs``.

    ``default_value`` is accepted but not used by this function.
    """
    namespace = {'kwargs': kwargs}
    bases = (PandasDateTime,)
    return type('PandasDateTimeType', bases, namespace)

@delegates(pd.to_numeric)
def pandas_numeric(default_value=...,**kwargs):
    """Create a ``PandasNumeric`` subclass whose ``kwargs`` attribute is ``kwargs``.

    ``default_value`` is accepted but not used by this function.
    """
    namespace = {'kwargs': kwargs}
    bases = (PandasNumeric,)
    return type('PandasNumericType', bases, namespace)

In [None]:
# Field types are created on the fly by the factory functions;
# errors='coerce' is forwarded to pd.to_datetime.
class Model(BaseFrame):
    date: pandas_datetime(errors='coerce') = None
    numbers: pandas_numeric()

In [None]:
# 'error' is not a date; with errors='coerce' it shows up as NaT in the output.
# `other_numbers` is not a declared field and does not appear in the result.
Model(
    date=['1994-06-01','error'],
    numbers = [1/9,np.e],
    other_numbers = [1/9,np.e],
)

Model(index={0: 0, 1: 1}, date={0: Timestamp('1994-06-01 00:00:00'), 1: NaT}, numbers={0: 0.1111111111111111, 1: 2.718281828459045})

### Example uses for `RecordFrame`

#### DataFrames as Model Attributes

In [None]:
from pydantic_pandas.row_model import *

# Raw input: note that the last `value` is the string '3', not a number.
df = pd.DataFrame(
    {"metric":['a','b','c'],
     "value":[1,2,'3']
    }
)

# Row model used to validate the frame; `not_there` has no matching
# column in `df` above.
class Record(BaseModel):
    metric: str
    value: float
    not_there: Optional[str]

# Two ways of attaching the row model to a DataFrame-valued field:
# subscripting RecordFrame, or passing row_model to recordframe(...).
class Model(BaseModel):
    # a dataframe that is validated using a row model
    validate_with_model: RecordFrame[Record]
    #validate_with_columns: RecordFrame['metric','value']
    validate_with_conframe: DataFrame = recordframe(
        row_model=Record
    )

# NOTE(review): `validate_with_columns` is not a declared field on Model
# (its annotation is commented out); presumably it is ignored - confirm or remove.
model = Model(
    validate_with_columns=df,
    validate_with_model=df,
    validate_with_conframe=df
)
print(model.validate_with_model.dtypes['value']) # the `Record` model parses the input values into floats

float64


In [None]:
model

Model(validate_with_model=  metric  value not_there
0      a    1.0      None
1      b    2.0      None
2      c    3.0      None)

#### Usage with the `validate_arguments` Decorator

In [None]:
from pydantic import validate_arguments,Field

# Row model for the batting table; each field reads the upper-case source
# column named by its alias.
@record_model
class PlayerBattingStats(BaseModel):
    player: str = Field(alias='PLAYER')
    at_bats: int = Field(alias='AB')
    games: int = Field(alias='G')
    home_runs: int = Field(alias='HR')
    
    class Config:
        # NOTE(review): `on_errors` is handled by `record_model`, not visible
        # here; presumably 'skip' drops rows that fail validation - confirm.
        on_errors = 'skip'

In [None]:
@validate_arguments
def compute_hr_per_game(df:PlayerBattingStats,precision=3):
    """Add a rounded ``home_runs / games`` column to ``df`` and return ``df``.

    The ratio is rounded to ``precision`` decimals and stored under the
    column name ``'batting_average'``.
    """
    # NOTE(review): the column is labelled 'batting_average' although the
    # value is home runs per game - confirm whether a rename is intended.
    hr_per_game = df['home_runs'] / df['games']
    df['batting_average'] = hr_per_game.round(precision)
    return df

This is our raw DataFrame.

In [None]:
# Fetches the page over the network and keeps the first table found;
# rows 9-14 are shown because they include a repeated header row.
df = pd.read_html("https://www.espn.com/mlb/history/leaders",header=1,index_col=0)[0]
df.iloc[9:15]

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
,Babe Ruth,22,2503,8399,2174,2873,506,136,714,1983,2056,1330,123,117,.342
,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
11.0,Bill Terry,14,1721,6428,1120,2193,373,112,154,1078,537,449,56,6,.341
,Pete Browning,13,1183,4820,954,1646,295,85,46,0,466,167,258,0,.341
,Willie Keeler,19,2123,8591,1719,2932,241,145,33,0,524,36,495,0,.341
14.0,Lou Gehrig,17,2164,8001,1888,2721,534,163,493,1995,1508,790,102,101,.340


Some headers are mixed into the table and yield bad data. Those rows are removed during validation because they don't conform to the row model.

In [None]:
# Same slice after validation: the repeated header row is gone and the
# columns carry the model's field names.
compute_hr_per_game(df).iloc[9:15]

Unnamed: 0,player,at_bats,games,home_runs,batting_average
9,Babe Ruth,8399,2503,714,0.285
10,Bill Terry,6428,1721,154,0.089
11,Pete Browning,4820,1183,46,0.039
12,Willie Keeler,8591,2123,33,0.016
13,Lou Gehrig,8001,2164,493,0.228
14,George Sisler,8267,2055,102,0.05


In [None]:
from pydantic import HttpUrl,validator

# Row model for the batting-leaders table; `rank` is read from the `RK` column.
@record_model
class BattingLeaderRecord(BaseModel):
    rank: int = Field(alias='RK')
    Name: str
    AVG: float

    @validator('Name')
    def _remove_team_from_name(cls,v):
        """Strip the team abbreviation glued to the end of the name column.

        The abbreviation is taken to be a trailing run of two or three
        capital letters (e.g. ``'Kris BryantSF'`` -> ``'Kris Bryant'``).
        Always cutting three characters truncated names followed by a
        two-letter code. A value without such a suffix is returned unchanged.
        """
        import re  # function-scope import: `re` is not among this file's visible imports

        return re.sub(r'[A-Z]{2,3}$', '', v)

In [None]:
# Snapshot of the batting-average leaders: the raw scraped table, the same
# table normalised through BattingLeaderRecord, and the date it was taken.
class BattingAverageLeaders(BaseModel):
    url: HttpUrl = "https://www.espn.com/mlb/stats/player/_/view/batting/table/batting/sort/avg/dir/desc"
    raw_data: DataFrame
    normalized_data: BattingLeaderRecord
    timestamp:dt.date=Field(
        default_factory=lambda: dt.date.today()
    )

    def __init__(
        self,
        url = "https://www.espn.com/mlb/stats/player/_/view/batting/table/batting/sort/avg/dir/desc",
    ):
        """Download the tables at ``url`` and populate the model from them.

        The first two tables returned by ``pd.read_html`` are concatenated
        side by side and used for both ``raw_data`` and ``normalized_data``.
        """
        raw_data = pd.concat(pd.read_html(url)[0:2],axis=1)
        super().__init__(
            # Forward the URL that was actually fetched; it used to be
            # dropped, so the `url` field always showed the class default.
            url=url,
            raw_data=raw_data,
            normalized_data=raw_data
        )
        
        

In [None]:
# Instantiating the model downloads the data; show the raw table next to
# the normalised one.
leaders = BattingAverageLeaders()
display(leaders.raw_data.head())
display(leaders.normalized_data.head())

Unnamed: 0,RK,Name,POS,GP,AB,R,H,AVG,2B,3B,HR,RBI,TB,BB,K,SB,OBP,SLG,OPS,WAR
0,1,Giancarlo StantonNYY,DH,1,4,1,3,0.75,0,0,1,1,6,0,1,0,0.75,1.5,2.25,0.0
1,2,Tommy EdmanSTL,2B,1,5,1,3,0.6,0,0,0,0,3,0,1,2,0.6,0.6,1.2,0.0
2,3,Kris BryantSF,RF,5,17,1,8,0.471,0,0,1,2,11,1,3,0,0.5,0.647,1.147,0.0
3,4,Luis RobertCHW,CF,4,15,4,7,0.467,0,0,0,1,7,2,2,0,0.556,0.467,1.022,0.0
4,5,Enrique HernandezBOS,CF,11,49,9,20,0.408,4,1,5,9,41,1,6,0,0.423,0.837,1.26,0.0


Unnamed: 0,rank,Name,AVG
0,1,Giancarlo Stanton,0.75
1,2,Tommy Edman,0.6
2,3,Kris Bryan,0.471
3,4,Luis Robert,0.467
4,5,Enrique Hernandez,0.408


In [None]:
#hide
# Run nbdev's library build; the output lists the converted notebooks.
!nbdev_build_lib

Converted 00_core.ipynb.
Converted 01_row_model.ipynb.
Converted 98_utils.ipynb.
Converted 99_default_standard_lib.ipynb.
Converted index.ipynb.
