Skip to content

Commit

Permalink
Many Python type hint fixes (#966)
Browse files Browse the repository at this point in the history
Changes:

    Fixed many type hints that were previously ignored with # type: ignore
    Some of these ignores were masking bugs/unwanted behaviour. Fixes included in this PR:
        DataFrame/LazyFrame.join now properly handles Expr arguments for left_on/right_on.
        Series.__setitem__ now uses .cast(UInt64) instead of the non-existent .cast_u64()
        GBSelection.apply now raises a TypeError when selection=None (result of call from select_all)
    Some lines require # type: ignore due to limitations in mypy. For these lines, the error code has been included, e.g. # type: ignore[override]. This means that unintended type errors are still caught.
    Removed return type hint from __init__ functions, as this is not required in the latest mypy versions (since it is always None).
    Unused private function _as_float_ndarray deleted from utils.py

To do in a future PR:

    Fix the last three nasty `# type: ignore` statements
    Properly type hint all the file IO functions; these appear to be incomplete at times
  • Loading branch information
stinodego authored and ritchie46 committed Jul 14, 2021
1 parent c0e5617 commit 17f1ce0
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 120 deletions.
7 changes: 4 additions & 3 deletions py-polars/polars/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(
elements: tp.List[str],
tag: str,
attributes: Optional[Dict[str, str]] = None,
) -> None:
):
self.tag = tag
self.elements = elements
self.attributes = attributes
Expand Down Expand Up @@ -130,10 +130,11 @@ def write_body(self) -> None:
def write(self, inner: str) -> None:
self.elements.append(inner)

def render(self) -> None:
def render(self) -> tp.List[str]:
with Tag(self.elements, "table", {"border": "1", "class": "dataframe"}):
self.write_header()
self.write_body()
return self.elements


class NotebookFormatter(HTMLFormatter):
Expand Down Expand Up @@ -166,7 +167,7 @@ def write_style(self) -> None:
template = dedent("\n".join((template_first, template_mid, template_last)))
self.write(template)

def render(self) -> tp.List[str]: # type: ignore
def render(self) -> tp.List[str]:
"""
Return the lines needed to render a HTML table.
"""
Expand Down
9 changes: 0 additions & 9 deletions py-polars/polars/ffi.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,3 @@ def _ptr_to_numpy(ptr: int, len: int, ptr_type: Any) -> np.ndarray:
"""
ptr_ctype = ctypes.cast(ptr, ctypes.POINTER(ptr_type))
return np.ctypeslib.as_array(ptr_ctype, (len,))


def _as_float_ndarray(ptr: int, size: int) -> np.ndarray:
"""
https://github.com/maciejkula/python-rustlearn
Turn a float* to a numpy array.
"""
return np.core.multiarray.int_asbuffer(ptr, size * np.float32.itemsize) # type: ignore
73 changes: 41 additions & 32 deletions py-polars/polars/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(
self,
data: Union[Dict[str, Sequence], tp.List[Series], np.ndarray],
nullable: bool = True,
) -> None:
):
"""
A DataFrame is a two dimensional data structure that represents data as a table with rows and columns.
"""
Expand Down Expand Up @@ -248,7 +248,7 @@ def read_csv(
for k, v in dtype.items():
dtype_list.append((k, pytype_to_polars_type(v)))

null_values = _process_null_values(null_values) # type: ignore
processed_null_values = _process_null_values(null_values)

self._df = PyDataFrame.read_csv(
file,
Expand All @@ -268,7 +268,7 @@ def read_csv(
dtype_list,
low_memory,
comment_char,
null_values,
processed_null_values,
)
return self

Expand Down Expand Up @@ -723,8 +723,8 @@ def __getitem__(self, item: Any) -> Any:
if isinstance(item, Sequence):
if isinstance(item[0], str):
return wrap_df(self._df.select(item))
if hasattr(item[0], "_pyexpr"):
return self.select(item) # type: ignore
elif isinstance(item[0], pl.Expr):
return self.select(item)

# select rows by mask or index
# df[[1, 2, 3]]
Expand Down Expand Up @@ -1366,8 +1366,8 @@ def downsample(self, by: Union[str, tp.List[str]], rule: str, n: int) -> "GroupB
def join(
self,
df: "DataFrame",
left_on: Optional[Union[str, tp.List[str], "Expr", tp.List["Expr"]]] = None,
right_on: Optional[Union[str, tp.List[str], "Expr", tp.List["Expr"]]] = None,
left_on: Optional[Union[str, "Expr", tp.List[str], tp.List["Expr"]]] = None,
right_on: Optional[Union[str, "Expr", tp.List[str], tp.List["Expr"]]] = None,
on: Optional[Union[str, tp.List[str]]] = None,
how: str = "inner",
) -> Union["DataFrame", "LazyFrame"]:
Expand Down Expand Up @@ -1446,23 +1446,32 @@ def join(
if how == "cross":
return wrap_df(self._df.join(df._df, [], [], how))

if isinstance(left_on, str):
left_on = [left_on]
if isinstance(right_on, str):
right_on = [right_on]
left_on_: Union[tp.List[str], tp.List[pl.Expr], None]
if isinstance(left_on, (str, pl.Expr)):
left_on_ = [left_on] # type: ignore[assignment]
else:
left_on_ = left_on

right_on_: Union[tp.List[str], tp.List[pl.Expr], None]
if isinstance(right_on, (str, pl.Expr)):
right_on_ = [right_on] # type: ignore[assignment]
else:
right_on_ = right_on

if isinstance(on, str):
left_on = [on]
right_on = [on]
left_on_ = [on]
right_on_ = [on]
elif isinstance(on, list):
left_on = on
right_on = on
if left_on is None or right_on is None:
left_on_ = on
right_on_ = on

if left_on_ is None or right_on_ is None:
raise ValueError("You should pass the column to join on as an argument.")
if isinstance(left_on[0], pl.Expr) or isinstance(right_on[0], pl.Expr): # type: ignore
return self.lazy().join(df.lazy(), left_on, right_on, how=how)

return wrap_df(self._df.join(df._df, left_on, right_on, how))
if isinstance(left_on_[0], pl.Expr) or isinstance(right_on_[0], pl.Expr):
return self.lazy().join(df.lazy(), left_on, right_on, how=how)
else:
return wrap_df(self._df.join(df._df, left_on_, right_on_, how))

def apply(
self,
Expand Down Expand Up @@ -1742,7 +1751,7 @@ def lazy(self) -> "LazyFrame":
return wrap_ldf(self._df.lazy())

def select(
self, exprs: Union[str, "Expr", tp.List[str], tp.List["Expr"]]
self, exprs: Union[str, "Expr", Sequence[str], Sequence["Expr"]]
) -> "DataFrame":
"""
Select columns from this DataFrame.
Expand Down Expand Up @@ -2033,7 +2042,7 @@ def __init__(
downsample: bool = False,
rule: Optional[str] = None,
downsample_n: int = 0,
) -> None:
):
self._df = df
self.by = by
self.downsample = downsample
Expand Down Expand Up @@ -2167,8 +2176,8 @@ def agg(
elif isinstance(column_to_agg, list):

if isinstance(column_to_agg[0], tuple):
column_to_agg = [ # type: ignore
(column, [agg] if isinstance(agg, str) else agg) # type: ignore
column_to_agg = [ # type: ignore[misc]
(column, [agg] if isinstance(agg, str) else agg) # type: ignore[misc]
for (column, agg) in column_to_agg
]

Expand All @@ -2177,7 +2186,7 @@ def agg(
wrap_df(self._df)
.lazy()
.groupby(self.by)
.agg(column_to_agg) # type: ignore
.agg(column_to_agg) # type: ignore[arg-type]
.collect(no_optimization=True, string_cache=False)
)

Expand Down Expand Up @@ -2316,7 +2325,7 @@ def __init__(
by: Union[str, tp.List[str]],
pivot_column: str,
values_column: str,
) -> None:
):
self._df = df
self.by = by
self.pivot_column = pivot_column
Expand Down Expand Up @@ -2392,7 +2401,7 @@ def __init__(
downsample: bool = False,
rule: Optional[str] = None,
downsample_n: int = 0,
) -> None:
):
self._df = df
self.by = by
self.selection = selection
Expand Down Expand Up @@ -2491,18 +2500,18 @@ def agg_list(self) -> DataFrame:

def apply(
self,
func: Union[Callable[[Any], Any], Callable[[Any], Any]],
func: Callable[[Any], Any],
return_dtype: Optional[Type[DataType]] = None,
) -> DataFrame:
"""
Apply a function over the groups.
"""
df = self.agg_list()
if isinstance(self.selection, str):
selection = [self.selection]
else:
selection = self.selection
for name in selection: # type: ignore
if self.selection is None:
raise TypeError(
"apply not available for Groupby.select_all(). Use select() instead."
)
for name in self.selection:
s = df.drop_in_place(name + "_agg_list").apply(func, return_dtype)
s.rename(name, in_place=True)
df[name] = s
Expand Down
30 changes: 15 additions & 15 deletions py-polars/polars/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def read_csv(
[f"column_{int(column[1:]) + 1}" for column in tbl.column_names]
)

return from_arrow(tbl, rechunk) # type: ignore
return from_arrow(tbl, rechunk) # type: ignore[return-value]

with _prepare_file_arg(file, **storage_options) as data:
df = DataFrame.read_csv(
Expand Down Expand Up @@ -414,11 +414,11 @@ def read_ipc(
"""
storage_options = storage_options or {}
with _prepare_file_arg(file, **storage_options) as data:
return DataFrame.read_ipc(data, use_pyarrow) # type: ignore
return DataFrame.read_ipc(data, use_pyarrow)


def read_parquet(
source: Union[str, BinaryIO, Path, tp.List[str]],
source: Union[str, BinaryIO],
stop_after_n_rows: Optional[int] = None,
memory_map: bool = True,
columns: Optional[tp.List[str]] = None,
Expand Down Expand Up @@ -453,8 +453,10 @@ def read_parquet(
storage_options = storage_options or {}
with _prepare_file_arg(source, **storage_options) as source_prep:
if stop_after_n_rows is not None:
return DataFrame.read_parquet(source_prep, stop_after_n_rows=stop_after_n_rows) # type: ignore
return from_arrow( # type: ignore
return DataFrame.read_parquet(
source_prep, stop_after_n_rows=stop_after_n_rows
)
return from_arrow( # type: ignore[return-value]
pa.parquet.read_table(
source_prep, memory_map=memory_map, columns=columns, **kwargs
)
Expand Down Expand Up @@ -533,7 +535,7 @@ def _from_pandas_helper(a: pd.Series) -> pa.Array: # noqa: F821
def from_pandas(
df: Union[pd.DataFrame, pd.Series, pd.DatetimeIndex],
rechunk: bool = True, # noqa: F821
) -> DataFrame:
) -> Union[Series, DataFrame]:
"""
Convert from a pandas DataFrame to a polars DataFrame.
Expand All @@ -549,7 +551,7 @@ def from_pandas(
A Polars DataFrame
"""
if isinstance(df, pd.Series) or isinstance(df, pd.DatetimeIndex):
return from_arrow(_from_pandas_helper(df)) # type: ignore
return from_arrow(_from_pandas_helper(df))

# Note: we first tried to infer the schema via pyarrow and then modify the schema if needed.
# However arrow 3.0 determines the type of a string like this:
Expand All @@ -562,10 +564,10 @@ def from_pandas(
data[name] = _from_pandas_helper(s)

table = pa.table(data)
return from_arrow(table, rechunk) # type: ignore
return from_arrow(table, rechunk)


def concat(dfs: tp.List[DataFrame], rechunk: bool = True) -> DataFrame:
def concat(dfs: Sequence[DataFrame], rechunk: bool = True) -> DataFrame:
"""
Aggregate all the Dataframes in a List of DataFrames to a single DataFrame.
Expand All @@ -580,7 +582,7 @@ def concat(dfs: tp.List[DataFrame], rechunk: bool = True) -> DataFrame:
df = dfs[0].clone()
for i in range(1, len(dfs)):
try:
df = df.vstack(dfs[i], in_place=False) # type: ignore
df = df.vstack(dfs[i], in_place=False) # type: ignore[assignment]
# could have a double borrow (one mutable one ref)
except RuntimeError:
df.vstack(dfs[i].clone(), in_place=True)
Expand Down Expand Up @@ -613,9 +615,7 @@ def repeat(val: Union[int, float, str], n: int, name: Optional[str] = None) -> S
return Series.from_arrow(name, pa.repeat(val, n))


def read_json(
source: Union[str, StringIO, Path],
) -> DataFrame:
def read_json(source: Union[str, BytesIO]) -> DataFrame:
"""
Read into a DataFrame from JSON format.
Expand All @@ -624,7 +624,7 @@ def read_json(
source
Path to a file or a file like object.
"""
return DataFrame.read_json(source) # type: ignore
return DataFrame.read_json(source)


def from_rows(
Expand Down Expand Up @@ -682,7 +682,7 @@ def read_sql(sql: str, engine: Any) -> DataFrame:
# conversion from pandas to arrow is very cheap compared to db driver
import pandas as pd

return from_pandas(pd.read_sql(sql, engine))
return from_pandas(pd.read_sql(sql, engine)) # type: ignore[return-value]
except ImportError:
from sqlalchemy import text

Expand Down

0 comments on commit 17f1ce0

Please sign in to comment.