Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions splitgraph/ingestion/csv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ class CSVDataSource(ForeignDataWrapperDataSource):
"description": "Sample size, in bytes, for encoding/dialect/header detection",
},
"encoding": {"type": "string", "description": "Encoding of the CSV file"},
"ignore_decode_errors": {
"type": "boolean",
"description": "Ignore errors when decoding the file",
},
"header": {
"type": "boolean",
"description": "First line of the CSV file is its header",
Expand Down Expand Up @@ -191,6 +195,7 @@ def get_server_options(self):
"header",
"separator",
"quotechar",
"ignore_decode_errors",
"dialect",
]:
if k in self.params:
Expand Down
15 changes: 11 additions & 4 deletions splitgraph/ingestion/csv/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ class CSVOptions(NamedTuple):
autodetect_sample_size: int = 65536
delimiter: str = ","
quotechar: str = '"'
dialect: Optional[Union[str, Type[csv.Dialect]]] = "excel"
dialect: Optional[Union[str, Type[csv.Dialect]]] = None
header: bool = True
encoding: str = "utf-8"
ignore_decode_errors: bool = False

@classmethod
def from_fdw_options(cls, fdw_options):
Expand All @@ -33,6 +34,7 @@ def from_fdw_options(cls, fdw_options):
quotechar=fdw_options.get("quotechar", '"'),
dialect=fdw_options.get("dialect"),
encoding=fdw_options.get("encoding", "utf-8"),
ignore_decode_errors=get_bool(fdw_options, "ignore_decode_errors", default=False),
)

def to_csv_kwargs(self):
Expand All @@ -55,14 +57,19 @@ def autodetect_csv(stream: io.RawIOBase, csv_options: CSVOptions) -> CSVOptions:

if csv_options.autodetect_encoding:
encoding = chardet.detect(data)["encoding"]
if encoding == "ascii":
if encoding == "ascii" or encoding is None:
# ASCII is a subset of UTF-8. For safety, if chardet detected
# the encoding as ASCII, use UTF-8 (a valid ASCII file is a valid UTF-8 file,
# but not vice versa)

# If we can't detect the encoding, fall back to utf-8 too (hopefully the user
# passed ignore_decode_errors=True)
encoding = "utf-8"
csv_options = csv_options._replace(encoding=encoding)

sample = data.decode(csv_options.encoding)
sample = data.decode(
csv_options.encoding, errors="ignore" if csv_options.ignore_decode_errors else "strict"
)
# Emulate universal newlines mode (convert \r, \r\n, \n into \n)
sample = "\n".join(sample.splitlines())

Expand Down Expand Up @@ -92,7 +99,7 @@ def make_csv_reader(
stream.reset()
# https://docs.python.org/3/library/csv.html#id3
# Open with newline="" for universal newlines
io_stream = io.TextIOWrapper(io.BufferedReader(stream), encoding=csv_options.encoding, newline="") # type: ignore
io_stream = io.TextIOWrapper(io.BufferedReader(stream), encoding=csv_options.encoding, newline="", errors="ignore" if csv_options.ignore_decode_errors else "strict") # type: ignore

reader = csv.reader(io_stream, **csv_options.to_csv_kwargs())
return csv_options, reader
26 changes: 26 additions & 0 deletions test/splitgraph/ingestion/test_csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import os
from io import BytesIO

import pytest

from splitgraph.core.types import TableColumn
from splitgraph.engine import ResultShape
Expand Down Expand Up @@ -241,3 +244,26 @@ def test_csv_mac_newlines():
ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None
),
]


def test_csv_ignore_decoding_errors():
    """Check that ``ignore_decode_errors`` decides whether bad bytes abort CSV parsing.

    With the flag off, building a reader over a payload containing an invalid
    UTF-8 sequence must raise ``UnicodeDecodeError``; with the flag on, the
    same payload must parse into a header row and one data row.
    """
    # A CSV with malformed Unicode. This can't be reproduced with a small
    # example, but in some situations chardet can return None, in which case
    # we fall back to UTF-8. Here UTF-8 is forced explicitly instead.
    payload = b"name;number\nTA\xef\xbf\xbd\xef\xbf\xbd\xef\xc3\x87\xc3\x83O\xc2\xba;17"

    # Strict decoding: the undecodable byte must surface as an error.
    strict_options = CSVOptions(
        ignore_decode_errors=False, encoding="utf-8", autodetect_encoding=False
    )
    with pytest.raises(UnicodeDecodeError):
        make_csv_reader(BytesIO(payload), strict_options)

    # Lenient decoding: the same payload must now be readable.
    lenient_options = CSVOptions(
        ignore_decode_errors=True, encoding="utf-8", autodetect_encoding=False
    )
    resolved_options, reader = make_csv_reader(BytesIO(payload), lenient_options)
    assert resolved_options.encoding == "utf-8"
    assert resolved_options.header is True

    rows = list(reader)
    assert len(rows) == 2
    header_row, value_row = rows
    assert header_row == ["name", "number"]
    assert value_row == ["TA��ÇÃOº", "17"]