From 8224624626c16069819e071f756a626ca52cc64f Mon Sep 17 00:00:00 2001 From: Eduardo Blancas Reyes Date: Fri, 11 Mar 2022 08:10:05 -0600 Subject: [PATCH] reading and writing in utf-8 in notebook tasks --- src/ploomber/sources/notebooksource.py | 25 +++++++++++++++++-- src/ploomber/tasks/notebook.py | 33 ++++++++++++++++++++++---- tests/tasks/test_notebook.py | 19 ++++++++++++++- 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/src/ploomber/sources/notebooksource.py b/src/ploomber/sources/notebooksource.py index 7ad1fb3e5..1439fbb7d 100644 --- a/src/ploomber/sources/notebooksource.py +++ b/src/ploomber/sources/notebooksource.py @@ -55,6 +55,27 @@ from ploomber.io import pretty_print +# TODO: we should unit test that this function is called, as opposed to vanilla +# .read_text +def _read_primitive(path): + """ + We read using the UTF-8 instead of the default encoding since notebooks are + always stored in UTF-8. + + We can see this in nbformat, which always reads as UTF-8: + https://github.com/jupyter/nbformat/blob/df63593b64a15ee1c37b522973c39e8674f93c5b/nbformat/__init__.py#L125 + + Scripts are a different story since they may have other encodings, however, + modern editors have UTF-8 as default (example: VSCode + https://docs.microsoft.com/en-us/powershell/scripting/dev-cross-plat/vscode/understanding-file-encoding?view=powershell-7.2#configuring-vs-code) + so it's safer to use UTF-8 than the default encoding. + + jupytext already does this: + https://github.com/mwouts/jupytext/issues/896 + """ + return Path(path).read_text(encoding='utf-8') + + def _get_last_cell(nb): """ Get last cell, ignores cells with empty source (unless the notebook only @@ -185,7 +206,7 @@ def __init__(self, 'File does not exist.' + _suggest_ploomber_scaffold_missing_file()) - self._primitive = primitive.read_text(encoding='utf-8') + self._primitive = _read_primitive(primitive) else: raise TypeError('Notebooks must be initialized from strings, ' 'Placeholder or pathlib.Path, got {}'.format( @@ -265,7 +286,7 @@ def __init__(self, @property def primitive(self): if self._hot_reload: - self._primitive = self._path.read_text() + self._primitive = _read_primitive(self._path) return self._primitive diff --git a/src/ploomber/tasks/notebook.py b/src/ploomber/tasks/notebook.py index 7435d49f8..cee85c4a9 100644 --- a/src/ploomber/tasks/notebook.py +++ b/src/ploomber/tasks/notebook.py @@ -39,6 +39,31 @@ from ploomber.io import FileLoaderMixin, pretty_print +# TODO: ensure that all places where we call this function are unit tested +def _write_text_utf_8(path, text): + """Write text using UTF-8 encoding + + Notes + ----- + In some systems, UTF-8 is not the default encoding (this happens on some + Windows configurations, which have CP-1252 as default). This can break + storing some files if the str to write contains characters that do not + exist in the default encoding + (https://github.com/ploomber/ploomber/issues/334) + + Under normal circumstances, it's better to use the default encoding since + it provides better system compatibility, however, notebooks are always + stored as UTF-8 so it's ok to store them as such. + + nbconvert always stores as UTF-8: + https://github.com/jupyter/nbconvert/blob/5d57c16b655602f4705f9147f671f2cbaadaebca/nbconvert/writers/files.py#L126 + + nbformat always reads as UTF-8: + https://github.com/jupyter/nbformat/blob/df63593b64a15ee1c37b522973c39e8674f93c5b/nbformat/__init__.py#L125 + """ + Path(path).write_text(text, encoding='utf-8') + + def _suggest_passing_product_dictionary(): if Path('pipeline.yaml').is_file(): return """ @@ -192,7 +217,7 @@ def _from_ipynb(path_to_nb, exporter, nbconvert_export_kwargs): content, _ = nbconvert.export(exporter, nb, **nbconvert_export_kwargs) if isinstance(content, str): - path.write_text(content, encoding='utf-8') + _write_text_utf_8(path, content) elif isinstance(content, bytes): path.write_bytes(content) else: @@ -430,7 +455,7 @@ def develop(self, app='notebook', args=None): name_new = name.replace(suffix, '-tmp.ipynb') tmp = self.source.loc.with_name(name_new) content = nbformat.writes(nb, version=nbformat.NO_CONVERT) - tmp.write_text(content, encoding='utf-8') + _write_text_utf_8(tmp, content) # open notebook with injected debugging cell try: @@ -500,7 +525,7 @@ def debug(self, kind='ipdb'): fd, tmp_path = tempfile.mkstemp(suffix='.py') os.close(fd) code = jupytext.writes(nb, version=nbformat.NO_CONVERT, fmt='py') - Path(tmp_path).write_text(code, encoding='utf-8') + _write_text_utf_8(tmp_path, code) if kind == 'pm': # post-mortem debugging @@ -548,7 +573,7 @@ def run(self): os.close(fd) tmp = Path(tmp) - tmp.write_text(self.source.nb_str_rendered, encoding='utf-8') + _write_text_utf_8(tmp, self.source.nb_str_rendered) if self.local_execution: self.papermill_params['cwd'] = str(self.source.loc.parent) diff --git a/tests/tasks/test_notebook.py b/tests/tasks/test_notebook.py index 44d525404..a20a6f84d 100644 --- a/tests/tasks/test_notebook.py +++ b/tests/tasks/test_notebook.py @@ -1,5 +1,5 @@ import sys -from unittest.mock import Mock +from unittest.mock import Mock, ANY from pathlib import Path import pytest @@ -78,6 +78,23 @@ def test_notebook_conversion(monkeypatch, output, tmp_directory): conv.convert() +def test_notebook_conversion_stores_as_unicode(tmp_directory, monkeypatch): + nb = nbformat.v4.new_notebook() + cell = nbformat.v4.new_code_cell('1 + 1') + nb.cells.append(cell) + + with open('nb.ipynb', 'w', encoding='utf-8') as f: + nbformat.write(nb, f) + + conv = notebook.NotebookConverter('nb.ipynb', exporter_name='html') + + mock = Mock() + monkeypatch.setattr(notebook.Path, 'write_text', mock) + conv.convert() + + mock.assert_called_once_with(ANY, encoding='utf-8') + + @pytest.mark.parametrize( 'name, out_dir', [