simonw · mikecoop83 · Apr 7, 2021
diff --git a/csv_diff/cli.py b/csv_diff/cli.py
@@ -42,7 +42,12 @@
     is_flag=True,
     help="Show unchanged fields for rows with at least one change",
 )
-def cli(previous, current, key, format, json, singular, plural, show_unchanged):
+@click.option(
+    "--encoding",
+    default=None,
+    help="Specify text encoding of the csv files",
+)
+def cli(previous, current, key, format, json, singular, plural, show_unchanged, encoding):
     "Diff two CSV or JSON files"
     dialect = {
         "csv": "excel",
@@ -51,10 +56,10 @@ def cli(previous, current, key, format, json, singular, plural, show_unchanged):
 
     def load(filename):
         if format == "json":
-            return load_json(open(filename), key=key)
+            return load_json(open(filename, encoding=encoding), key=key)
         else:
             return load_csv(
-                open(filename, newline=""), key=key, dialect=dialect.get(format)
+                open(filename, newline="", encoding=encoding), key=key, dialect=dialect.get(format)
             )
 
     diff = compare(load(previous), load(current), show_unchanged)

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -234,3 +234,43 @@ def test_semicolon_delimited(tmpdir):
         "columns_added": [],
         "columns_removed": [],
     } == json.loads(result.output.strip())
+
+
+def test_human_cli_non_utf8_encoding(tmpdir):
+    # This test confirms the ability to parse csv files that are not encoded using utf-8.
+    # The names in the files contain characters that would cause UnicodeDecodeErrors if they
+    # are encoded using cp1252 and then parsed using utf-8.
+    encoding = "cp1252"
+    one = tmpdir / "one.csv"
+    two = tmpdir / "two.csv"
+    one.write_binary(
+        dedent(
+            """
+    id;name
+    1;José
+    """
+        ).strip().encode(encoding)
+    )
+    two.write_binary(
+        dedent(
+            """
+    id;name
+    1;Ángela
+    """
+        ).strip().encode(encoding)
+    )
+    result = CliRunner().invoke(
+        cli.cli, [str(one), str(two), "--key", "id", "--encoding", encoding], catch_exceptions=False
+    )
+    assert 0 == result.exit_code
+    assert (
+        dedent(
+            """
+    1 row changed
+
+      id: 1
+        name: "José" => "Ángela"
+    """
+        ).strip()
+        == result.output.strip()
+    )