reading and writing in utf-8 in notebook tasks

ploomber · Mar 11, 2022 · 8224624 · 8224624
1 parent 55015c5
commit 8224624
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 7 deletions.
diff --git a/src/ploomber/sources/notebooksource.py b/src/ploomber/sources/notebooksource.py
@@ -55,6 +55,27 @@
 from ploomber.io import pretty_print
 
 
+# TODO: we should unit test that this function is called, as opposed to vanilla
+# .read_text
+def _read_primitive(path):
+    """
+    We read using the UTF-8 instead of the default encoding since notebooks are
+    always stored in UTF-8.
+
+    We can see this in nbformat, which always reads as UTF-8:
+    https://github.com/jupyter/nbformat/blob/df63593b64a15ee1c37b522973c39e8674f93c5b/nbformat/__init__.py#L125
+
+    Scripts are a different story since they may have other encodings, however,
+    modern editors have UTF-8 as default (example: VSCode
+    https://docs.microsoft.com/en-us/powershell/scripting/dev-cross-plat/vscode/understanding-file-encoding?view=powershell-7.2#configuring-vs-code)
+    so it's safer to use UTF-8 than the default encoding.
+
+    jupytext already does this:
+    https://github.com/mwouts/jupytext/issues/896
+    """
+    return Path(path).read_text(encoding='utf-8')
+
+
 def _get_last_cell(nb):
     """
     Get last cell, ignores cells with empty source (unless the notebook only
@@ -185,7 +206,7 @@ def __init__(self,
                     'File does not exist.' +
                     _suggest_ploomber_scaffold_missing_file())
 
-            self._primitive = primitive.read_text(encoding='utf-8')
+            self._primitive = _read_primitive(primitive)
         else:
             raise TypeError('Notebooks must be initialized from strings, '
                             'Placeholder or pathlib.Path, got {}'.format(
@@ -265,7 +286,7 @@ def __init__(self,
     @property
     def primitive(self):
         if self._hot_reload:
-            self._primitive = self._path.read_text()
+            self._primitive = _read_primitive(self._path)
 
         return self._primitive
 

diff --git a/src/ploomber/tasks/notebook.py b/src/ploomber/tasks/notebook.py
@@ -39,6 +39,31 @@
 from ploomber.io import FileLoaderMixin, pretty_print
 
 
+# TODO: ensure that all places where we call this function are unit tested
+def _write_text_utf_8(path, text):
+    """Write text using UTF-8 encoding
+
+    Notes
+    -----
+    In some systems, UTF-8 is not the default encoding (this happens on some
+    Windows configurations, which have CP-1252 as default). This can break
+    storing some files if the str to write contains characters that do not
+    exist in the default encoding
+    (https://github.com/ploomber/ploomber/issues/334)
+
+    Under normal circumstances, it's better to use the default encoding since
+    it provides better system compatibility, however, notebooks are always
+    stored as UTF-8 so it's ok to store them as such.
+
+    nbconvert always stores as UTF-8:
+    https://github.com/jupyter/nbconvert/blob/5d57c16b655602f4705f9147f671f2cbaadaebca/nbconvert/writers/files.py#L126
+
+    nbformat always reads as UTF-8:
+    https://github.com/jupyter/nbformat/blob/df63593b64a15ee1c37b522973c39e8674f93c5b/nbformat/__init__.py#L125
+    """
+    Path(path).write_text(text, encoding='utf-8')
+
+
 def _suggest_passing_product_dictionary():
     if Path('pipeline.yaml').is_file():
         return """
@@ -192,7 +217,7 @@ def _from_ipynb(path_to_nb, exporter, nbconvert_export_kwargs):
         content, _ = nbconvert.export(exporter, nb, **nbconvert_export_kwargs)
 
         if isinstance(content, str):
-            path.write_text(content, encoding='utf-8')
+            _write_text_utf_8(path, content)
         elif isinstance(content, bytes):
             path.write_bytes(content)
         else:
@@ -430,7 +455,7 @@ def develop(self, app='notebook', args=None):
         name_new = name.replace(suffix, '-tmp.ipynb')
         tmp = self.source.loc.with_name(name_new)
         content = nbformat.writes(nb, version=nbformat.NO_CONVERT)
-        tmp.write_text(content, encoding='utf-8')
+        _write_text_utf_8(tmp, content)
 
         # open notebook with injected debugging cell
         try:
@@ -500,7 +525,7 @@ def debug(self, kind='ipdb'):
         fd, tmp_path = tempfile.mkstemp(suffix='.py')
         os.close(fd)
         code = jupytext.writes(nb, version=nbformat.NO_CONVERT, fmt='py')
-        Path(tmp_path).write_text(code, encoding='utf-8')
+        _write_text_utf_8(tmp_path, code)
 
         if kind == 'pm':
             # post-mortem debugging
@@ -548,7 +573,7 @@ def run(self):
         os.close(fd)
 
         tmp = Path(tmp)
-        tmp.write_text(self.source.nb_str_rendered, encoding='utf-8')
+        _write_text_utf_8(tmp, self.source.nb_str_rendered)
 
         if self.local_execution:
             self.papermill_params['cwd'] = str(self.source.loc.parent)

diff --git a/tests/tasks/test_notebook.py b/tests/tasks/test_notebook.py
@@ -1,5 +1,5 @@
 import sys
-from unittest.mock import Mock
+from unittest.mock import Mock, ANY
 from pathlib import Path
 
 import pytest
@@ -78,6 +78,23 @@ def test_notebook_conversion(monkeypatch, output, tmp_directory):
     conv.convert()
 
 
+def test_notebook_conversion_stores_as_unicode(tmp_directory, monkeypatch):
+    nb = nbformat.v4.new_notebook()
+    cell = nbformat.v4.new_code_cell('1 + 1')
+    nb.cells.append(cell)
+
+    with open('nb.ipynb', 'w', encoding='utf-8') as f:
+        nbformat.write(nb, f)
+
+    conv = notebook.NotebookConverter('nb.ipynb', exporter_name='html')
+
+    mock = Mock()
+    monkeypatch.setattr(notebook.Path, 'write_text', mock)
+    conv.convert()
+
+    mock.assert_called_once_with(ANY, encoding='utf-8')
+
+
 @pytest.mark.parametrize(
     'name, out_dir',
     [