Skip to content

Commit

Permalink
reading and writing in utf-8 in notebook tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
edublancas committed Mar 11, 2022
1 parent 55015c5 commit 8224624
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 7 deletions.
25 changes: 23 additions & 2 deletions src/ploomber/sources/notebooksource.py
Expand Up @@ -55,6 +55,27 @@
from ploomber.io import pretty_print


# TODO: we should unit test that this function is called, as opposed to vanilla
# .read_text
def _read_primitive(path):
"""
We read using the UTF-8 instead of the default encoding since notebooks are
always stored in UTF-8.
We can see this in nbformat, which always reads as UTF-8:
https://github.com/jupyter/nbformat/blob/df63593b64a15ee1c37b522973c39e8674f93c5b/nbformat/__init__.py#L125
Scripts are a different story since they may have other encodings, however,
modern editors have UTF-8 as default (example: VSCode
https://docs.microsoft.com/en-us/powershell/scripting/dev-cross-plat/vscode/understanding-file-encoding?view=powershell-7.2#configuring-vs-code)
so it's safer to use UTF-8 than the default encoding.
jupytext already does this:
https://github.com/mwouts/jupytext/issues/896
"""
return Path(path).read_text(encoding='utf-8')


def _get_last_cell(nb):
"""
Get last cell, ignores cells with empty source (unless the notebook only
Expand Down Expand Up @@ -185,7 +206,7 @@ def __init__(self,
'File does not exist.' +
_suggest_ploomber_scaffold_missing_file())

self._primitive = primitive.read_text(encoding='utf-8')
self._primitive = _read_primitive(primitive)
else:
raise TypeError('Notebooks must be initialized from strings, '
'Placeholder or pathlib.Path, got {}'.format(
Expand Down Expand Up @@ -265,7 +286,7 @@ def __init__(self,
@property
def primitive(self):
if self._hot_reload:
self._primitive = self._path.read_text()
self._primitive = _read_primitive(self._path)

return self._primitive

Expand Down
33 changes: 29 additions & 4 deletions src/ploomber/tasks/notebook.py
Expand Up @@ -39,6 +39,31 @@
from ploomber.io import FileLoaderMixin, pretty_print


# TODO: ensure that all places where we call this function are unit tested
def _write_text_utf_8(path, text):
"""Write text using UTF-8 encoding
Notes
-----
In some systems, UTF-8 is not the default encoding (this happens on some
Windows configurations, which have CP-1252 as default). This can break
storing some files if the str to write contains characters that do not
exist in the default encoding
(https://github.com/ploomber/ploomber/issues/334)
Under normal circumstances, it's better to use the default encoding since
it provides better system compatibility, however, notebooks are always
stored as UTF-8 so it's ok to store them as such.
nbconvert always stores as UTF-8:
https://github.com/jupyter/nbconvert/blob/5d57c16b655602f4705f9147f671f2cbaadaebca/nbconvert/writers/files.py#L126
nbformat always reads as UTF-8:
https://github.com/jupyter/nbformat/blob/df63593b64a15ee1c37b522973c39e8674f93c5b/nbformat/__init__.py#L125
"""
Path(path).write_text(text, encoding='utf-8')


def _suggest_passing_product_dictionary():
if Path('pipeline.yaml').is_file():
return """
Expand Down Expand Up @@ -192,7 +217,7 @@ def _from_ipynb(path_to_nb, exporter, nbconvert_export_kwargs):
content, _ = nbconvert.export(exporter, nb, **nbconvert_export_kwargs)

if isinstance(content, str):
path.write_text(content, encoding='utf-8')
_write_text_utf_8(path, content)
elif isinstance(content, bytes):
path.write_bytes(content)
else:
Expand Down Expand Up @@ -430,7 +455,7 @@ def develop(self, app='notebook', args=None):
name_new = name.replace(suffix, '-tmp.ipynb')
tmp = self.source.loc.with_name(name_new)
content = nbformat.writes(nb, version=nbformat.NO_CONVERT)
tmp.write_text(content, encoding='utf-8')
_write_text_utf_8(tmp, content)

# open notebook with injected debugging cell
try:
Expand Down Expand Up @@ -500,7 +525,7 @@ def debug(self, kind='ipdb'):
fd, tmp_path = tempfile.mkstemp(suffix='.py')
os.close(fd)
code = jupytext.writes(nb, version=nbformat.NO_CONVERT, fmt='py')
Path(tmp_path).write_text(code, encoding='utf-8')
_write_text_utf_8(tmp_path, code)

if kind == 'pm':
# post-mortem debugging
Expand Down Expand Up @@ -548,7 +573,7 @@ def run(self):
os.close(fd)

tmp = Path(tmp)
tmp.write_text(self.source.nb_str_rendered, encoding='utf-8')
_write_text_utf_8(tmp, self.source.nb_str_rendered)

if self.local_execution:
self.papermill_params['cwd'] = str(self.source.loc.parent)
Expand Down
19 changes: 18 additions & 1 deletion tests/tasks/test_notebook.py
@@ -1,5 +1,5 @@
import sys
from unittest.mock import Mock
from unittest.mock import Mock, ANY
from pathlib import Path

import pytest
Expand Down Expand Up @@ -78,6 +78,23 @@ def test_notebook_conversion(monkeypatch, output, tmp_directory):
conv.convert()


def test_notebook_conversion_stores_as_unicode(tmp_directory, monkeypatch):
nb = nbformat.v4.new_notebook()
cell = nbformat.v4.new_code_cell('1 + 1')
nb.cells.append(cell)

with open('nb.ipynb', 'w', encoding='utf-8') as f:
nbformat.write(nb, f)

conv = notebook.NotebookConverter('nb.ipynb', exporter_name='html')

mock = Mock()
monkeypatch.setattr(notebook.Path, 'write_text', mock)
conv.convert()

mock.assert_called_once_with(ANY, encoding='utf-8')


@pytest.mark.parametrize(
'name, out_dir',
[
Expand Down

0 comments on commit 8224624

Please sign in to comment.