In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from typing import Annotated, Iterable, TypedDict
from polaroids import DataFrame, Field
from polaroids.dataframe import _Metadata
import polars as pl
from polaroids import _utils

from typing import Annotated, TypedDict
from polaroids import DataFrame, Field
import polars as pl

class Schema(TypedDict):
    """Column declarations for the quick-start example frame."""

    # `a`: integer column. The Field metadata requests ascending sort order,
    # coercion, uniqueness, and one custom check (`d.ge(0)`, i.e. values >= 0).
    a: Annotated[int, Field(
        sorted="ascending",
        coerce=True,
        unique=True,
        checks=[lambda d: d.ge(0)],
    )]
    # `b`: integer column whose hint also admits None.
    b: int | None

# Wrap a plain polars frame in the schema-aware DataFrame[Schema], then
# validate it against the annotations declared on Schema.
df = DataFrame[Schema](
    pl.DataFrame({"a": [0.0, 1.0], "b": [None, 0]})
).validate()
df

a,b
i64,i64
0,
1,0.0


In [None]:
# Fetch the first row with values keyed by column name.
row = df.row(0, named=True)
row["a"]  # "a" is a declared column, so this lookup succeeds

# "not_exists" is not a column, so this lookup raises KeyError. The error is the
# point of the demo: catch and display it so "Restart & Run All" keeps going.
try:
    row["not_exists"]
except KeyError as exc:
    print(f"KeyError: {exc}")




KeyError: 'not_exists'

In [None]:


from typing import Annotated, Self, TypedDict
from polaroids import DataFrame, Field
import polars as pl

class BasicSchema(TypedDict):
    """Column declarations for the two-column example frame (a, b)."""

    # `a`: Int64 column. The Field metadata requests ascending sort order,
    # coercion, uniqueness, and one custom check (`d.ge(0)`, i.e. values >= 0).
    a: Annotated[pl.Int64, Field(
        sorted="ascending",
        coerce=True,
        unique=True,
        checks=[lambda d: d.ge(0)],
    )]
    # `b`: integer column whose hint also admits None.
    b: int | None

# Raw input: "a" holds floats here, while BasicSchema declares it as Int64.
df = pl.DataFrame({"a": [0.0, 1.0], "b": [None, 0]})

# Attach the schema and validate; the expected rendering is shown below.
df_validated = df.pipe(DataFrame[BasicSchema]).validate()
df_validated
# shape: (2, 2)
# ┌─────┬──────┐
# │ a   ┆ b    │
# │ --- ┆ ---  │
# │ i64 ┆ i64  │
# ╞═════╪══════╡
# │ 0   ┆ null │
# │ 1   ┆ 0    │
# └─────┴──────┘


class BasicSchemaDataFrame(DataFrame[BasicSchema]):
    """DataFrame[BasicSchema] extended with a cross-column check between a and b."""

    def check_a_greater_then_b(self) -> None:
        """Assert that `a >= b` holds across the frame.

        The method name (including the "then" spelling) is kept unchanged so
        anything that already refers to it keeps working.

        Raises:
            AssertionError: if the aggregated `a >= b` comparison is not true.
        """
        # NOTE(review): polars' Expr.all() ignores nulls by default, so rows with a
        # null `b` presumably do not fail this check — confirm.
        assert self.select((pl.col("a") >= pl.col("b")).all()).item(), (
            "a should be greater than or equal to b"
        )

# Example usage
BasicSchemaDataFrame(df).validate() # Passes validation

# This will raise an AssertionError
(
    pl.DataFrame({"a": [5, 6], "b": [None, 10]})
    .pipe(BasicSchemaDataFrame)
    .validate() # This will raise 💣 !
)


AssertionError: a should be greater the b

In [31]:
# Plain-text rendering of the validated frame.
# NOTE(review): df_validated is assigned in an earlier cell, so this cell depends
# on that cell having been run first.
print(df_validated)

shape: (2, 2)
┌─────┬──────┐
│ a   ┆ b    │
│ --- ┆ ---  │
│ i64 ┆ str  │
╞═════╪══════╡
│ 0   ┆ null │
│ 1   ┆ cat  │
└─────┴──────┘


In [20]:
from typing import TypedDict
from polaroids import DataFrame, Field
import polars as pl

# NOTE(review): this redefines BasicSchema (an earlier cell declares it with
# `b: int | None`); whichever definition ran last is the one in effect.
# NOTE(review): `Annotated` is not imported in this cell; it is only in scope
# because an earlier cell imports it.
class BasicSchema(TypedDict):
    """Column declarations for the example frame: Int64 `a`, string `b`."""

    # `a`: Int64 column. The Field metadata requests ascending sort order,
    # coercion, uniqueness, and one custom check (`d.ge(0)`, i.e. values >= 0).
    a: Annotated[pl.Int64, Field(
        sorted="ascending",
        coerce=True,
        unique=True,
        checks=[lambda d: d.ge(0)],
    )]
    # `b`: string column whose hint also admits None.
    b: str | None

# Wrap the raw frame in DataFrame[BasicSchema], validate it, and print the
# plain-text table.
print(
    DataFrame[BasicSchema](
        pl.DataFrame({"a": [2.0, 5.0], "b": ["a", None]})
    ).validate()
)

shape: (2, 2)
┌─────┬──────┐
│ a   ┆ b    │
│ --- ┆ ---  │
│ i64 ┆ str  │
╞═════╪══════╡
│ 2   ┆ a    │
│ 5   ┆ null │
└─────┴──────┘


In [15]:


from collections.abc import Callable

from polars import String



# Validate a frame with nulls in "a" and "b" (plus an extra "c" column) against
# BasicSchema.
# NOTE(review): BasicSchema is defined in several cells; the result depends on
# which definition ran last. The recorded output for this cell is a
# ValidationError about nulls in ['a', 'b'].
df = (
    pl.DataFrame({"a": [0, None, 2, 3], "b": [None, 0, 0, 0], "c": [-1.0, 2, 4, 10]})
    .pipe(DataFrame[BasicSchema])
    .validate()
)
df

ValidationError: The following columns contains nulls: ['a', 'b'].

In [5]:
from ctypes import util


# Display the polars Schema that the helper derives from the _Metadata TypedDict.
# NOTE(review): "polats" looks like a typo for "polars" in the helper's name. The
# helper lives in polaroids._utils, so any rename has to happen there — confirm.
_utils.typeddict_to_polats_schema(_Metadata)

Schema([('primary_key', Boolean),
        ('unique', Boolean),
        ('sorted', Enum(categories=['descending', 'ascending'])),
        ('coerce', Boolean),
        ('default', Object),
        ('checks', Object),
        ('column', String),
        ('nullable', Boolean)])

In [46]:
import types
from typing import Union, get_args, get_origin, get_type_hints

# NOTE(review): another redefinition of BasicSchema; whichever definition ran
# last is the one in effect for later cells.
class BasicSchema(TypedDict):
    """Three-column schema used to experiment with detecting nullable columns."""

    # `a`: Int64 column. The Field metadata requests ascending sort order,
    # coercion, uniqueness, and one custom check (`d.ge(0)`, i.e. values >= 0).
    a: Annotated[pl.Int64, Field(
        sorted="ascending",
        coerce=True,
        unique=True,
        checks=[lambda d: d.ge(0)],
    )]
    # `b`: Int64 column, with primary_key explicitly set to False.
    b: Annotated[pl.Int64, Field(primary_key=False)]
    # `c`: the only column whose hint admits None; coercion requested.
    c: Annotated[int | None, Field(coerce=True)]

# get_type_hints() strips the Annotated[...] wrapper by default
# (include_extras=False), leaving only the underlying types.
get_type_hints(BasicSchema)
# {'a': Int64, 'b': Int64, 'c': int | None}

# Goal: retrieve the nullable columns; here nullable_cols == ["c"].
# A column counts as nullable only when its hint is a union (`X | None` or
# Optional[X]) with NoneType among the members. Checking the union origin first
# avoids false positives from other generics that merely carry NoneType as an
# argument (e.g. typing.Callable[[], None]).
nullable_cols = [
    name
    for name, hint in get_type_hints(BasicSchema).items()
    if get_origin(hint) in (Union, types.UnionType) and type(None) in get_args(hint)
]
nullable_cols

['c']

In [45]:
# Scratch check: the class of None, as compared against in the nullable-column test.
type(None)

NoneType

In [44]:
# Scratch check: another spelling that yields the same NoneType class.
None.__class__

NoneType

In [74]:
# Group column names by their "sorted" value, with "sorted" stored under the
# dtype held in `e`.
# NOTE(review): `e` is not defined in any visible cell. The recorded output shows
# an enum dtype, so it is presumably a pl.Enum — define it before this cell.
(
    pl.DataFrame(
        {"sorted": ["ascending"], "column": ["a"]},
        schema_overrides={"sorted": e},
    )
    .group_by("sorted")
    .agg(pl.col("column"))
)

sorted,column
enum,list[str]
"""ascending""","[""a""]"


In [36]:
# Build the frame and flag column "A" as sorted in one expression, then
# inspect the resulting flags on the series.
df = pl.DataFrame({"A": [1, 2, 3]}).set_sorted("A")
df["A"].flags
# (
#     pl.DataFrame({'A' : [1,2,3]})
#     .with_columns(pl.col('A').set_sorted())
#     # .write_parquet('tmp.parquet')
# )

# pl.read_parquet('tmp.parquet')['A'].flags 
# Today: {'SORTED_ASC': False, 'SORTED_DESC': False}
# Wish: {'SORTED_ASC': True, 'SORTED_DESC': False}

{'SORTED_ASC': True, 'SORTED_DESC': False}

In [39]:
# List the names of the columns that contain at least one duplicated value.
(
    pl.DataFrame({"a": [0, 1, 2, 3], "b": [0, 0, 0, 0]})
    # One boolean per column: does it hold any repeated value?
    .select(pl.all().is_duplicated().any())
    # One row per original column; the header column is named "column".
    .transpose(include_header=True, column_names=["is_duplicated"])
    .filter(pl.col("is_duplicated"))
    .get_column("column")
    .to_list()
)

['b']

In [None]:
# Scratch cell: alternative ways to find non-unique columns, all commented out.
# As written, the cell only builds and displays the frame.
# NOTE(review): the recorded output (list columns) looks like it came from the
# first commented-out variant below (the one ending in .implode()) — confirm.
(
    pl.DataFrame({"a": [0,1,2,3], "b": [0,0,0,0]})
    
    # .select(pl.all().filter(pl.element().is_duplicated()).implode())
    # .select(pl.all().is_unique().all())
    # .transpose(include_header = True, column_names = ["is_unique"])
    # .filter(~pl.col("is_unique")).get_column("column").to_list()
)

a,b
list[i64],list[i64]
"[0, 1, … 3]","[0, 0, … 0]"
