splitgraph · mildbyte · Apr 7, 2021 · Apr 7, 2021 · Apr 7, 2021
diff --git a/splitgraph/ingestion/csv/__init__.py b/splitgraph/ingestion/csv/__init__.py
@@ -99,6 +99,10 @@ class CSVDataSource(ForeignDataWrapperDataSource):
                 "description": "Sample size, in bytes, for encoding/dialect/header detection",
             },
             "encoding": {"type": "string", "description": "Encoding of the CSV file"},
+            "ignore_decode_errors": {
+                "type": "boolean",
+                "description": "Ignore errors when decoding the file",
+            },
             "header": {
                 "type": "boolean",
                 "description": "First line of the CSV file is its header",
@@ -191,6 +195,7 @@ def get_server_options(self):
             "header",
             "separator",
             "quotechar",
+            "ignore_decode_errors",
             "dialect",
         ]:
             if k in self.params:

diff --git a/splitgraph/ingestion/csv/common.py b/splitgraph/ingestion/csv/common.py
@@ -17,9 +17,10 @@ class CSVOptions(NamedTuple):
     autodetect_sample_size: int = 65536
     delimiter: str = ","
     quotechar: str = '"'
-    dialect: Optional[Union[str, Type[csv.Dialect]]] = "excel"
+    dialect: Optional[Union[str, Type[csv.Dialect]]] = None
     header: bool = True
     encoding: str = "utf-8"
+    ignore_decode_errors: bool = False
 
     @classmethod
     def from_fdw_options(cls, fdw_options):
@@ -33,6 +34,7 @@ def from_fdw_options(cls, fdw_options):
             quotechar=fdw_options.get("quotechar", '"'),
             dialect=fdw_options.get("dialect"),
             encoding=fdw_options.get("encoding", "utf-8"),
+            ignore_decode_errors=get_bool(fdw_options, "ignore_decode_errors", default=False),
         )
 
     def to_csv_kwargs(self):
@@ -55,14 +57,19 @@ def autodetect_csv(stream: io.RawIOBase, csv_options: CSVOptions) -> CSVOptions:
 
     if csv_options.autodetect_encoding:
         encoding = chardet.detect(data)["encoding"]
-        if encoding == "ascii":
+        if encoding == "ascii" or encoding is None:
             # ASCII is a subset of UTF-8. For safety, if chardet detected
             # the encoding as ASCII, use UTF-8 (a valid ASCII file is a valid UTF-8 file,
             # but not vice versa)
+
+            # If we can't detect the encoding, fall back to utf-8 too (hopefully the user
+            # passed ignore_decode_errors=True)
             encoding = "utf-8"
         csv_options = csv_options._replace(encoding=encoding)
 
-    sample = data.decode(csv_options.encoding)
+    sample = data.decode(
+        csv_options.encoding, errors="ignore" if csv_options.ignore_decode_errors else "strict"
+    )
     # Emulate universal newlines mode (convert \r, \r\n, \n into \n)
     sample = "\n".join(sample.splitlines())
 
@@ -92,7 +99,7 @@ def make_csv_reader(
     stream.reset()
     # https://docs.python.org/3/library/csv.html#id3
     # Open with newline="" for universal newlines
-    io_stream = io.TextIOWrapper(io.BufferedReader(stream), encoding=csv_options.encoding, newline="")  # type: ignore
+    io_stream = io.TextIOWrapper(io.BufferedReader(stream), encoding=csv_options.encoding, newline="", errors="ignore" if csv_options.ignore_decode_errors else "strict")  # type: ignore
 
     reader = csv.reader(io_stream, **csv_options.to_csv_kwargs())
     return csv_options, reader
diff --git a/test/splitgraph/ingestion/test_csv.py b/test/splitgraph/ingestion/test_csv.py
@@ -1,4 +1,7 @@
 import os
+from io import BytesIO
+
+import pytest
 
 from splitgraph.core.types import TableColumn
 from splitgraph.engine import ResultShape
@@ -241,3 +244,26 @@ def test_csv_mac_newlines():
                 ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None
             ),
         ]
+
+
+def test_csv_ignore_decoding_errors():
+    # Test doomed CSVs with malformed Unicode characters. Can't repro this with a small example,
+    # but in some situations chardet can return None, so we fall back to UTF-8. For the purposes
+    # of this test, we force UTF-8 instead.
+
+    malformed = b"name;number\nTA\xef\xbf\xbd\xef\xbf\xbd\xef\xc3\x87\xc3\x83O\xc2\xba;17"
+
+    options = CSVOptions(ignore_decode_errors=False, encoding="utf-8", autodetect_encoding=False)
+
+    with pytest.raises(UnicodeDecodeError):
+        make_csv_reader(BytesIO(malformed), options)
+
+    options = CSVOptions(ignore_decode_errors=True, encoding="utf-8", autodetect_encoding=False)
+    options, reader = make_csv_reader(BytesIO(malformed), options)
+    assert options.encoding == "utf-8"
+    assert options.header is True
+
+    data = list(reader)
+    assert len(data) == 2
+    assert data[0] == ["name", "number"]
+    assert data[1] == ["TA��ÇÃOº", "17"]