Skip to content

Commit

Permalink
feat[python]: Support reading all sheets xlsx file (#4634)
Browse files Browse the repository at this point in the history
  • Loading branch information
zundertj committed Sep 9, 2022
1 parent 4cb683e commit bc24180
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 6 deletions.
1 change: 1 addition & 0 deletions py-polars/build.requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pyarrow
pandas
pytz
types-pytz
xlsx2csv

# Tooling
maturin==0.13.2
Expand Down
69 changes: 63 additions & 6 deletions py-polars/polars/io.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
"""Functions for reading and writing data."""
from __future__ import annotations

import sys
from io import BytesIO, IOBase, StringIO
from pathlib import Path
from typing import TYPE_CHECKING, BinaryIO, Callable, Mapping, TextIO
from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Mapping, TextIO, overload
from warnings import warn

if sys.version_info >= (3, 8):
from typing import Literal
else:
from typing_extensions import Literal

from polars.utils import deprecated_alias, format_path, handle_projection_columns

try:
Expand Down Expand Up @@ -1102,13 +1108,46 @@ def read_sql(
)


# Typing overloads for ``read_excel``. The static return type depends on
# whether the call selects a single sheet or asks for the whole workbook.


# No sheet selected (``sheet_id=None`` and no ``sheet_name``): every sheet is
# returned, keyed by sheet name. ``sheet_name`` and the option dicts carry
# defaults so that ``read_excel(file, sheet_id=None)`` matches this overload.
@overload
def read_excel(
    file: str | BytesIO | Path | BinaryIO | bytes,
    sheet_id: Literal[None],
    sheet_name: Literal[None] = None,
    xlsx2csv_options: dict[str, object] | None = None,
    read_csv_options: dict[str, object] | None = None,
) -> dict[str, DataFrame]:
    ...


# Sheet selected by name only (``sheet_id=None``): a single DataFrame.
@overload
def read_excel(
    file: str | BytesIO | Path | BinaryIO | bytes,
    sheet_id: Literal[None],
    sheet_name: str,
    xlsx2csv_options: dict[str, object] | None = None,
    read_csv_options: dict[str, object] | None = None,
) -> DataFrame:
    ...


# Sheet selected by integer id, optionally together with a name: a single
# DataFrame. The defaults mirror the implementation (``sheet_id=1``,
# ``sheet_name=None``) so that ``read_excel(file)`` and
# ``read_excel(file, sheet_name="...")`` type-check.
# NOTE(review): at runtime a non-positive ``sheet_id`` with no ``sheet_name``
# takes the "all sheets" branch and returns a dict; that case is not
# expressible here and is still typed as ``DataFrame``.
@overload
def read_excel(
    file: str | BytesIO | Path | BinaryIO | bytes,
    sheet_id: int = 1,
    sheet_name: str | None = None,
    xlsx2csv_options: dict[str, object] | None = None,
    read_csv_options: dict[str, object] | None = None,
) -> DataFrame:
    ...


def read_excel(
file: str | BytesIO | Path | BinaryIO | bytes,
sheet_id: int | None = 1,
sheet_name: str | None = None,
xlsx2csv_options: dict[str, object] | None = None,
read_csv_options: dict[str, object] | None = None,
) -> DataFrame:
) -> DataFrame | dict[str, DataFrame]:
"""
Read Excel (XLSX) sheet into a DataFrame.
Expand Down Expand Up @@ -1198,12 +1237,30 @@ def read_excel(
if not read_csv_options:
read_csv_options = {}

# Convert sheets from XLSX document to CSV.
parser = xlsx2csv.Xlsx2csv(file, **xlsx2csv_options)

if (sheet_name is not None) or ((sheet_id is not None) and (sheet_id > 0)):
return _read_excel_sheet(parser, sheet_id, sheet_name, read_csv_options)
else:
return {
sheet["name"]: _read_excel_sheet(
parser, sheet["index"], None, read_csv_options
)
for sheet in parser.workbook.sheets
}


def _read_excel_sheet(
parser: Any,
sheet_id: int | None,
sheet_name: str | None,
read_csv_options: dict[str, object] | None,
) -> DataFrame:
csv_buffer = StringIO()

# Convert sheet from XSLX document to CSV.
xlsx2csv.Xlsx2csv(file, **xlsx2csv_options).convert(
outfile=csv_buffer, sheetid=sheet_id, sheetname=sheet_name
)
# Parse XLSX sheet to CSV.
parser.convert(outfile=csv_buffer, sheetid=sheet_id, sheetname=sheet_name)

# Rewind buffer to start.
csv_buffer.seek(0)
Expand Down
Binary file added py-polars/tests/files/example.xlsx
Binary file not shown.
23 changes: 23 additions & 0 deletions py-polars/tests/io/test_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from pathlib import Path

import polars as pl


def test_read_excel() -> None:
    """Reading one sheet by name yields that sheet's contents as a DataFrame."""
    xlsx_path = Path(__file__).parent.parent / "files" / "example.xlsx"
    expected = pl.DataFrame({"hello": ["Row 1", "Row 2"]})

    # sheet_id is explicitly None so that the sheet is selected by name only.
    result = pl.read_excel(xlsx_path, sheet_name="Sheet1", sheet_id=None)

    pl.testing.assert_frame_equal(result, expected)


def test_read_excel_all_sheets() -> None:
    """With ``sheet_id=None`` and no sheet name, sheets come back keyed by name."""
    xlsx_path = Path(__file__).parent.parent / "files" / "example.xlsx"
    frames = pl.read_excel(xlsx_path, sheet_id=None)  # type: ignore[call-overload]

    # Expected contents per sheet name; checked in insertion order.
    expected_by_sheet = {
        "Sheet1": pl.DataFrame({"hello": ["Row 1", "Row 2"]}),
        "Sheet2": pl.DataFrame({"world": ["Row 3", "Row 4"]}),
    }

    for sheet, expected in expected_by_sheet.items():
        pl.testing.assert_frame_equal(frames[sheet], expected)

0 comments on commit bc24180

Please sign in to comment.