feat(IPYNB): Add decoding and encoding of Jupyter Notebooks

stencila · Sep 9, 2021 · 4ee5129 · 4ee5129
1 parent fd3f829
commit 4ee5129
Show file tree

Hide file tree

Showing 8 changed files with 195 additions and 12 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -36,27 +36,28 @@
   "workbench.editor.highlightModifiedTabs": true,
   "typescript.tsdk": "node_modules/typescript/lib",
   "cSpell.words": [
+    "canonicalize",
+    "concat",
     "CRAN",
     "Dockta",
     "Emph",
     "Encoda",
+    "inlines",
+    "ipynb",
+    "itemprop",
+    "itemscope",
+    "itemtype",
     "Jesta",
     "Jupita",
     "Logga",
     "Nixta",
     "Nontextual",
     "Pandoc",
+    "patchable",
     "Pyla",
     "Rscript",
-    "Stencila",
-    "canonicalize",
-    "concat",
-    "patchable",
-    "inlines",
-    "itemprop",
-    "itemscope",
-    "itemtype",
     "schemars",
+    "Stencila",
     "structopt",
     "undelegatable"
   ]

diff --git a/fixtures/articles/simple.ipynb b/fixtures/articles/simple.ipynb
@@ -0,0 +1,32 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "This is a relatively simple test article written as a Jupyter Notebook."
+   ],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -46,6 +46,7 @@ import-github = []
 decode-date = ["dtparse"]
 decode-docx = ["decode-pandoc"]
 decode-html = ["decode-md", "kuchiki", "markup5ever"]
+decode-ipynb = ["decode-md", "decode-html"]
 decode-json = []
 decode-latex = ["decode-pandoc"]
 decode-md = ["pulldown-cmark", "nom", "coerce", "decode-html", "encode-txt"]
@@ -58,6 +59,7 @@ decode-yaml = []
 
 encode-docx = ["encode-pandoc"]
 encode-html = ["encode-txt", "html-escape"]
+encode-ipynb = ["encode-md"]
 encode-json = []
 encode-latex = ["encode-pandoc"]
 encode-md = []
@@ -109,6 +111,7 @@ default = [
   "decode-date",
   "decode-docx",
   "decode-html",
+  "decode-ipynb",
   "decode-json",
   "decode-latex",
   "decode-md",
@@ -120,6 +123,7 @@ default = [
   "decode-yaml",
   "encode-docx",
   "encode-html",
+  "encode-ipynb",
   "encode-json",
   "encode-latex",
   "encode-md",

diff --git a/rust/src/methods/decode/ipynb.rs b/rust/src/methods/decode/ipynb.rs
@@ -0,0 +1,118 @@
+use eyre::Result;
+use stencila_schema::{Article, BlockContent, Node};
+
+use super::md;
+
+/// Decode a Jupyter Notebook into a `Node` (always a `Node::Article`).
+///
+/// Targets the [Jupyter Notebook v4.5 schema](https://github.com/jupyter/nbformat/blob/master/nbformat/v4/nbformat.v4.5.schema.json)
+/// but is intended to work with any v4 notebook.
+///
+/// Decoding is permissive: a missing or non-4 `nbformat`, a missing `cells` array, or a cell
+/// with an unknown `cell_type` only produces a warning. An error is returned only when `ipynb`
+/// cannot be parsed as JSON.
+pub fn decode(ipynb: &str) -> Result<Node> {
+    let notebook: serde_json::Value = serde_json::from_str(ipynb)?;
+
+    // Check the major format version, but carry on regardless of what is found
+    match notebook.get("nbformat").and_then(|value| value.as_u64()) {
+        Some(4) => {}
+        Some(version) => {
+            tracing::warn!(
+                "Jupyter Notebook has unsupported format version: {}",
+                version
+            );
+        }
+        None => {
+            tracing::warn!(
+                "Jupyter Notebook does not have a valid `nbformat` property; assuming version 4"
+            );
+        }
+    }
+
+    // Translate each cell, in order, into zero or more blocks and flatten them into one list
+    let blocks: Vec<BlockContent> = match notebook.get("cells").and_then(|value| value.as_array())
+    {
+        Some(cells) => cells
+            .iter()
+            .flat_map(|cell| {
+                let kind = cell
+                    .get("cell_type")
+                    .and_then(|value| value.as_str())
+                    .unwrap_or("");
+                match kind {
+                    "code" => translate_code_cell(cell),
+                    "markdown" => translate_markdown_cell(cell),
+                    "raw" => translate_raw_cell(cell),
+                    other => {
+                        tracing::warn!("Jupyter Notebook cell has unknown type: {}", other);
+                        Vec::new()
+                    }
+                }
+            })
+            .collect(),
+        None => {
+            tracing::warn!("Jupyter Notebook does not have a valid `cells` property");
+            Vec::new()
+        }
+    };
+
+    // An article with no blocks has `content: None` rather than an empty vector
+    let content = if blocks.is_empty() {
+        None
+    } else {
+        Some(blocks)
+    };
+
+    Ok(Node::Article(Article {
+        content,
+        ..Default::default()
+    }))
+}
+
+/// Translate a Jupyter "code" cell
+///
+/// Decoding of code cells is not implemented yet. Rather than panicking (which would make
+/// every notebook containing a code cell undecodable), this logs a warning and contributes
+/// no blocks, in keeping with the permissive "ignore but warn" approach of `decode`.
+fn translate_code_cell(_cell: &serde_json::Value) -> Vec<BlockContent> {
+    tracing::warn!("Decoding of code cells is not yet supported");
+    Vec::new()
+}
+
+/// Translate a Jupyter "markdown" cell
+///
+/// The cell's `source` is flattened to a single string and decoded as a Markdown fragment.
+/// A cell without a `source` property is warned about and yields no blocks.
+fn translate_markdown_cell(cell: &serde_json::Value) -> Vec<BlockContent> {
+    match cell.get("source") {
+        Some(source) => md::decode_fragment(&translate_multiline_string(source)),
+        None => {
+            tracing::warn!("Markdown cell does not have a `source` property");
+            Vec::new()
+        }
+    }
+}
+
+/// Translate a Jupyter "raw" cell
+///
+/// Not supported yet: the cell is ignored, a warning is logged, and no blocks are produced.
+fn translate_raw_cell(_cell: &serde_json::Value) -> Vec<BlockContent> {
+    tracing::warn!("Decoding of raw cells is not yet supported");
+    vec![]
+}
+
+/// Translates a Jupyter `multiline_string` (either a plain string, or an array of strings)
+/// to a Rust `String`.
+///
+/// Array items are concatenated as-is (no separator is inserted); items that are not strings
+/// are skipped. Any other JSON value is warned about and yields an empty string.
+fn translate_multiline_string(multiline_string: &serde_json::Value) -> String {
+    match multiline_string {
+        serde_json::Value::String(text) => text.clone(),
+        serde_json::Value::Array(items) => items
+            .iter()
+            .filter_map(|item| item.as_str())
+            .collect::<String>(),
+        _ => {
+            tracing::warn!("Unexpected value type for multiline string");
+            String::new()
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use serde_json::json;
+
+    use super::*;
+
+    /// Both forms of a multiline string (array of lines, single string) give the same text.
+    #[test]
+    fn multiline_string() {
+        let expected = "Line1\nLine2";
+        let inputs = [json!(["Line1\n", "Line2"]), json!("Line1\nLine2")];
+
+        for input in &inputs {
+            assert_eq!(translate_multiline_string(input), expected);
+        }
+    }
+}
diff --git a/rust/src/methods/decode/mod.rs b/rust/src/methods/decode/mod.rs
@@ -5,6 +5,7 @@ use stencila_schema::Node;
 
 #[cfg(feature = "decode-date")]
 pub mod date;
+
 #[cfg(feature = "decode-docx")]
 pub mod docx;
 
@@ -14,6 +15,9 @@ pub mod json;
 #[cfg(feature = "decode-html")]
 pub mod html;
 
+#[cfg(feature = "decode-ipynb")]
+pub mod ipynb;
+
 #[cfg(feature = "decode-md")]
 pub mod md;
 
@@ -38,6 +42,8 @@ pub mod toml;
 #[cfg(feature = "decode-yaml")]
 pub mod yaml;
 
+// Modules for types of content, rather than specific formats
+
 pub mod code;
 pub mod media;
 
@@ -67,6 +73,9 @@ pub async fn decode(input: &str, format: &str) -> Result<Node> {
         #[cfg(feature = "decode-html")]
         "html" => html::decode(input, false)?,
 
+        #[cfg(feature = "decode-ipynb")]
+        "ipynb" => ipynb::decode(input)?,
+
         #[cfg(feature = "decode-json")]
         "json" => json::decode(input)?,
 

diff --git a/rust/src/methods/encode/ipynb.rs b/rust/src/methods/encode/ipynb.rs
@@ -0,0 +1,7 @@
+use eyre::Result;
+use stencila_schema::Node;
+
+/// Encode a `Node` to a Jupyter Notebook.
+///
+/// # Errors
+///
+/// Encoding is not implemented yet, so this currently always returns an error. An error
+/// (rather than a panic) lets callers that dispatch on the `"ipynb"` format report the
+/// problem instead of aborting.
+pub fn encode(_node: &Node) -> Result<String> {
+    eyre::bail!("Encoding to Jupyter Notebook is not yet implemented")
+}
diff --git a/rust/src/methods/encode/mod.rs b/rust/src/methods/encode/mod.rs
@@ -10,6 +10,9 @@ pub mod docx;
 #[allow(clippy::deprecated_cfg_attr)]
 pub mod html;
 
+#[cfg(feature = "encode-ipynb")]
+pub mod ipynb;
+
 #[cfg(feature = "encode-json")]
 pub mod json;
 
@@ -102,6 +105,9 @@ pub async fn encode(
         #[cfg(feature = "encode-html")]
         "html" => html::encode(node, options)?,
 
+        #[cfg(feature = "encode-ipynb")]
+        "ipynb" => ipynb::encode(node)?,
+
         #[cfg(feature = "encode-json")]
         "json" => json::encode(node, options)?,
 

diff --git a/rust/tests/ende.rs b/rust/tests/ende.rs
@@ -89,6 +89,15 @@ proptest! {
         assert_json_eq!(input, output);
     }
 
+    #[cfg(all(feature="encode-ipynb", feature="decode-ipynb"))]
+    #[test]
+    #[ignore="Because encoding not yet implemented"]
+    fn ipynb(input in article(Freedom::Min)) {
+        let content = encode::ipynb::encode(&input).unwrap();
+        let output = decode::ipynb::decode(&content).unwrap();
+        assert_json_eq!(input, output);
+    }
+
     #[cfg(all(feature="encode-md", feature="decode-md"))]
     #[test]
     fn md(input in article(Freedom::Min)) {
@@ -102,10 +111,7 @@ proptest! {
     fn rmd(input in article(Freedom::Min)) {
         let content = encode::rmd::encode(&input).unwrap();
         let output = decode::rmd::decode(&content).unwrap();
-        assert_eq!(
-            serde_json::to_value(&input).unwrap(),
-            serde_json::to_value(&output).unwrap()
-        )
+        assert_json_eq!(input, output);
     }
 
     #[cfg(all(feature="encode-pandoc", feature="decode-pandoc"))]