From 4ee5129dffb41f8630a0a6b5507c467833ca942c Mon Sep 17 00:00:00 2001 From: Nokome Date: Thu, 9 Sep 2021 17:56:41 +1200 Subject: [PATCH] feat(IPYNB): Add decoding and encoding of Jupyter Notebooks --- .vscode/settings.json | 17 ++--- fixtures/articles/simple.ipynb | 32 +++++++++ rust/Cargo.toml | 4 ++ rust/src/methods/decode/ipynb.rs | 118 +++++++++++++++++++++++++++++++ rust/src/methods/decode/mod.rs | 9 +++ rust/src/methods/encode/ipynb.rs | 7 ++ rust/src/methods/encode/mod.rs | 6 ++ rust/tests/ende.rs | 14 ++-- 8 files changed, 195 insertions(+), 12 deletions(-) create mode 100644 fixtures/articles/simple.ipynb create mode 100644 rust/src/methods/decode/ipynb.rs create mode 100644 rust/src/methods/encode/ipynb.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index 88e987fc1d..27e2ba6d65 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -36,27 +36,28 @@ "workbench.editor.highlightModifiedTabs": true, "typescript.tsdk": "node_modules/typescript/lib", "cSpell.words": [ + "canonicalize", + "concat", "CRAN", "Dockta", "Emph", "Encoda", + "inlines", + "ipynb", + "itemprop", + "itemscope", + "itemtype", "Jesta", "Jupita", "Logga", "Nixta", "Nontextual", "Pandoc", + "patchable", "Pyla", "Rscript", - "Stencila", - "canonicalize", - "concat", - "patchable", - "inlines", - "itemprop", - "itemscope", - "itemtype", "schemars", + "Stencila", "structopt", "undelegatable" ] diff --git a/fixtures/articles/simple.ipynb b/fixtures/articles/simple.ipynb new file mode 100644 index 0000000000..5dfd407808 --- /dev/null +++ b/fixtures/articles/simple.ipynb @@ -0,0 +1,32 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "This is a relatively simple test article written as a Jupyter Notebook." 
+ ], + "metadata": {} + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 5c847d2605..ac896fb41c 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -46,6 +46,7 @@ import-github = [] decode-date = ["dtparse"] decode-docx = ["decode-pandoc"] decode-html = ["decode-md", "kuchiki", "markup5ever"] +decode-ipynb = ["decode-md", "decode-html"] decode-json = [] decode-latex = ["decode-pandoc"] decode-md = ["pulldown-cmark", "nom", "coerce", "decode-html", "encode-txt"] @@ -58,6 +59,7 @@ decode-yaml = [] encode-docx = ["encode-pandoc"] encode-html = ["encode-txt", "html-escape"] +encode-ipynb = ["encode-md"] encode-json = [] encode-latex = ["encode-pandoc"] encode-md = [] @@ -109,6 +111,7 @@ default = [ "decode-date", "decode-docx", "decode-html", + "decode-ipynb", "decode-json", "decode-latex", "decode-md", @@ -120,6 +123,7 @@ default = [ "decode-yaml", "encode-docx", "encode-html", + "encode-ipynb", "encode-json", "encode-latex", "encode-md", diff --git a/rust/src/methods/decode/ipynb.rs b/rust/src/methods/decode/ipynb.rs new file mode 100644 index 0000000000..7c8b2de04f --- /dev/null +++ b/rust/src/methods/decode/ipynb.rs @@ -0,0 +1,118 @@ +use eyre::Result; +use stencila_schema::{Article, BlockContent, Node}; + +use super::md; + +/// Decode a Jupyter Notebook to a `Node`. +/// +/// Aims to support the [Jupyter Notebook v4.5 schema](https://github.com/jupyter/nbformat/blob/master/nbformat/v4/nbformat.v4.5.schema.json) +/// but should work for any v4 notebook. 
+/// +/// Aims to be permissive by ignoring but warning about data that does not meet v4 schema, rather than +/// erroring on it. +pub fn decode(ipynb: &str) -> Result<Node> { + let notebook = serde_json::from_str::<serde_json::Value>(ipynb)?; + + if let Some(version) = notebook.get("nbformat").and_then(|value| value.as_u64()) { + if version != 4 { + tracing::warn!( + "Jupyter Notebook has unsupported format version: {}", + version + ); + } + } else { + tracing::warn!( + "Jupyter Notebook does not have a valid `nbformat` property; assuming version 4" + ); + } + + let content = if let Some(cells) = notebook.get("cells").and_then(|value| value.as_array()) { + let mut content = Vec::with_capacity(cells.len()); + for cell in cells { + let cell_type = cell + .get("cell_type") + .and_then(|value| value.as_str()) + .unwrap_or_default(); + let mut blocks = match cell_type { + "code" => translate_code_cell(cell), + "markdown" => translate_markdown_cell(cell), + "raw" => translate_raw_cell(cell), + _ => { + tracing::warn!("Jupyter Notebook cell has unknown type: {}", cell_type); + Vec::new() + } + }; + content.append(&mut blocks); + } + content + } else { + tracing::warn!("Jupyter Notebook does not have a valid `cells` property"); + Vec::new() + }; + + let article = Article { + content: if content.is_empty() { + None + } else { + Some(content) + }, + ..Default::default() + }; + Ok(Node::Article(article)) +} + +/// Translate a Jupyter "code" cell +fn translate_code_cell(_cell: &serde_json::Value) -> Vec<BlockContent> { + todo!() +} + +/// Translate a Jupyter "markdown" cell +fn translate_markdown_cell(cell: &serde_json::Value) -> Vec<BlockContent> { + let markdown = if let Some(source) = cell.get("source") { + translate_multiline_string(source) + } else { + tracing::warn!("Markdown cell does not have a `source` property"); + return Vec::new(); + }; + md::decode_fragment(&markdown) +} + +/// Translate a Jupyter "raw" cell +fn translate_raw_cell(_cell: &serde_json::Value) -> Vec<BlockContent> { + tracing::warn!("Decoding of raw cells is not 
yet supported"); + Vec::new() +} + +/// Translates a Jupyter `multiline_string` (either a plain string, or an array of strings) +/// to a Rust `String`. +fn translate_multiline_string(multiline_string: &serde_json::Value) -> String { + if let Some(str) = multiline_string.as_str() { + str.to_string() + } else if let Some(array) = multiline_string.as_array() { + array + .iter() + .filter_map(|value| value.as_str().map(String::from)) + .collect::<Vec<String>>() + .concat() + } else { + tracing::warn!("Unexpected value type for multiline string"); + "".to_string() + } +} + +#[cfg(test)] +mod test { + use serde_json::json; + + use super::*; + + #[test] + fn multiline_string() { + let mls1 = json!(["Line1\n", "Line2"]); + let mls2 = json!("Line1\nLine2"); + let str1 = "Line1\nLine2"; + + assert_eq!(translate_multiline_string(&mls1), str1); + assert_eq!(translate_multiline_string(&mls2), str1); + } +} diff --git a/rust/src/methods/decode/mod.rs b/rust/src/methods/decode/mod.rs index 2a53d9c1a3..76761cc662 100644 --- a/rust/src/methods/decode/mod.rs +++ b/rust/src/methods/decode/mod.rs @@ -5,6 +5,7 @@ use stencila_schema::Node; #[cfg(feature = "decode-date")] pub mod date; + #[cfg(feature = "decode-docx")] pub mod docx; @@ -14,6 +15,9 @@ pub mod json; #[cfg(feature = "decode-html")] pub mod html; +#[cfg(feature = "decode-ipynb")] +pub mod ipynb; + #[cfg(feature = "decode-md")] pub mod md; @@ -38,6 +42,8 @@ pub mod toml; #[cfg(feature = "decode-yaml")] pub mod yaml; +// Modules for types of content, rather than specific formats + pub mod code; pub mod media; @@ -67,6 +73,9 @@ pub async fn decode(input: &str, format: &str) -> Result<Node> { #[cfg(feature = "decode-html")] "html" => html::decode(input, false)?, + #[cfg(feature = "decode-ipynb")] + "ipynb" => ipynb::decode(input)?, + #[cfg(feature = "decode-json")] "json" => json::decode(input)?, diff --git a/rust/src/methods/encode/ipynb.rs b/rust/src/methods/encode/ipynb.rs new file mode 100644 index 0000000000..89bebed2fa --- /dev/null +++ 
b/rust/src/methods/encode/ipynb.rs @@ -0,0 +1,7 @@ +use eyre::Result; +use stencila_schema::Node; + +/// Encode a `Node` to a Jupyter Notebook. +pub fn encode(_node: &Node) -> Result<String> { + todo!() +} diff --git a/rust/src/methods/encode/mod.rs b/rust/src/methods/encode/mod.rs index 82995d056d..99c0d36979 100644 --- a/rust/src/methods/encode/mod.rs +++ b/rust/src/methods/encode/mod.rs @@ -10,6 +10,9 @@ pub mod docx; #[allow(clippy::deprecated_cfg_attr)] pub mod html; +#[cfg(feature = "encode-ipynb")] +pub mod ipynb; + #[cfg(feature = "encode-json")] pub mod json; @@ -102,6 +105,9 @@ pub async fn encode( #[cfg(feature = "encode-html")] "html" => html::encode(node, options)?, + #[cfg(feature = "encode-ipynb")] + "ipynb" => ipynb::encode(node)?, + #[cfg(feature = "encode-json")] "json" => json::encode(node, options)?, diff --git a/rust/tests/ende.rs b/rust/tests/ende.rs index d8058ceaf1..755f0866c3 100644 --- a/rust/tests/ende.rs +++ b/rust/tests/ende.rs @@ -89,6 +89,15 @@ proptest! { assert_json_eq!(input, output); } + #[cfg(all(feature="encode-ipynb", feature="decode-ipynb"))] + #[test] + #[ignore="Because encoding not yet implemented"] + fn ipynb(input in article(Freedom::Min)) { + let content = encode::ipynb::encode(&input).unwrap(); + let output = decode::ipynb::decode(&content).unwrap(); + assert_json_eq!(input, output); + } + #[cfg(all(feature="encode-md", feature="decode-md"))] #[test] fn md(input in article(Freedom::Min)) { @@ -102,10 +111,7 @@ proptest! { fn rmd(input in article(Freedom::Min)) { let content = encode::rmd::encode(&input).unwrap(); let output = decode::rmd::decode(&content).unwrap(); - assert_eq!( - serde_json::to_value(&input).unwrap(), - serde_json::to_value(&output).unwrap() - ) + assert_json_eq!(input, output); } #[cfg(all(feature="encode-pandoc", feature="decode-pandoc"))]