Skip to content

Commit

Permalink
feat(IPYNB): Add decoding and encoding of Jupyter Notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
nokome committed Sep 9, 2021
1 parent fd3f829 commit 4ee5129
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 12 deletions.
17 changes: 9 additions & 8 deletions .vscode/settings.json
Expand Up @@ -36,27 +36,28 @@
"workbench.editor.highlightModifiedTabs": true,
"typescript.tsdk": "node_modules/typescript/lib",
"cSpell.words": [
"canonicalize",
"concat",
"CRAN",
"Dockta",
"Emph",
"Encoda",
"inlines",
"ipynb",
"itemprop",
"itemscope",
"itemtype",
"Jesta",
"Jupita",
"Logga",
"Nixta",
"Nontextual",
"Pandoc",
"patchable",
"Pyla",
"Rscript",
"Stencila",
"canonicalize",
"concat",
"patchable",
"inlines",
"itemprop",
"itemscope",
"itemtype",
"schemars",
"Stencila",
"structopt",
"undelegatable"
]
Expand Down
32 changes: 32 additions & 0 deletions fixtures/articles/simple.ipynb
@@ -0,0 +1,32 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"This is a relatively simple test article written as a Jupyter Notebook."
],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 4 additions & 0 deletions rust/Cargo.toml
Expand Up @@ -46,6 +46,7 @@ import-github = []
decode-date = ["dtparse"]
decode-docx = ["decode-pandoc"]
decode-html = ["decode-md", "kuchiki", "markup5ever"]
decode-ipynb = ["decode-md", "decode-html"]
decode-json = []
decode-latex = ["decode-pandoc"]
decode-md = ["pulldown-cmark", "nom", "coerce", "decode-html", "encode-txt"]
Expand All @@ -58,6 +59,7 @@ decode-yaml = []

encode-docx = ["encode-pandoc"]
encode-html = ["encode-txt", "html-escape"]
encode-ipynb = ["encode-md"]
encode-json = []
encode-latex = ["encode-pandoc"]
encode-md = []
Expand Down Expand Up @@ -109,6 +111,7 @@ default = [
"decode-date",
"decode-docx",
"decode-html",
"decode-ipynb",
"decode-json",
"decode-latex",
"decode-md",
Expand All @@ -120,6 +123,7 @@ default = [
"decode-yaml",
"encode-docx",
"encode-html",
"encode-ipynb",
"encode-json",
"encode-latex",
"encode-md",
Expand Down
118 changes: 118 additions & 0 deletions rust/src/methods/decode/ipynb.rs
@@ -0,0 +1,118 @@
use eyre::Result;
use stencila_schema::{Article, BlockContent, Node};

use super::md;

/// Decode a Jupyter Notebook to a `Node`.
///
/// Aims to support the [Jupyter Notebook v4.5 schema](https://github.com/jupyter/nbformat/blob/master/nbformat/v4/nbformat.v4.5.schema.json)
/// but should work for any v4 notebook.
///
/// Aims to be permissive by ignoring but warning about data that does not meet v4 schema, rather than
/// erroring on it.
pub fn decode(ipynb: &str) -> Result<Node> {
let notebook = serde_json::from_str::<serde_json::Value>(ipynb)?;

if let Some(version) = notebook.get("nbformat").and_then(|value| value.as_u64()) {
if version != 4 {
tracing::warn!(
"Jupyter Notebook has unsupported format version: {}",
version
);
}
} else {
tracing::warn!(
"Jupyter Notebook does not have a valid `nbformat` property; assuming version 4"
);
}

let content = if let Some(cells) = notebook.get("cells").and_then(|value| value.as_array()) {
let mut content = Vec::with_capacity(cells.len());
for cell in cells {
let cell_type = cell
.get("cell_type")
.and_then(|value| value.as_str())
.unwrap_or_default();
let mut blocks = match cell_type {
"code" => translate_code_cell(cell),
"markdown" => translate_markdown_cell(cell),
"raw" => translate_raw_cell(cell),
_ => {
tracing::warn!("Jupyter Notebook cell has unknown type: {}", cell_type);
Vec::new()
}
};
content.append(&mut blocks);
}
content
} else {
tracing::warn!("Jupyter Notebook does not have a valid `cells` property");
Vec::new()
};

let article = Article {
content: if content.is_empty() {
None
} else {
Some(content)
},
..Default::default()
};
Ok(Node::Article(article))
}

/// Translate a Jupyter "code" cell
fn translate_code_cell(_cell: &serde_json::Value) -> Vec<BlockContent> {
todo!()
}

/// Translate a Jupyter "markdown" cell into block content.
///
/// The cell's `source` property is flattened to a single string and decoded as a
/// Markdown fragment. A cell without a `source` property yields no blocks and a
/// warning is logged.
fn translate_markdown_cell(cell: &serde_json::Value) -> Vec<BlockContent> {
    match cell.get("source") {
        Some(source) => md::decode_fragment(&translate_multiline_string(source)),
        None => {
            tracing::warn!("Markdown cell does not have a `source` property");
            Vec::new()
        }
    }
}

/// Translate a Jupyter "raw" cell.
///
/// Raw cells are not decoded yet: a warning is logged and the cell contributes
/// no blocks.
fn translate_raw_cell(_cell: &serde_json::Value) -> Vec<BlockContent> {
    tracing::warn!("Decoding of raw cells is not yet supported");
    vec![]
}

/// Translates a Jupyter `multiline_string` (either a plain string, or an array of strings)
/// to a Rust `String`.
///
/// - A plain string is returned as is.
/// - The items of an array are concatenated without any separator (no newlines are
///   inserted between them). Items that are not strings are skipped and a warning
///   is logged for each.
/// - Any other type of value results in a warning and an empty string.
fn translate_multiline_string(multiline_string: &serde_json::Value) -> String {
    match multiline_string {
        serde_json::Value::String(string) => string.clone(),
        serde_json::Value::Array(items) => {
            // Append directly into one buffer rather than cloning every item into an
            // intermediate vector before concatenating.
            let mut joined = String::new();
            for item in items {
                match item.as_str() {
                    Some(line) => joined.push_str(line),
                    None => {
                        tracing::warn!("Unexpected non-string item in multiline string; ignoring it")
                    }
                }
            }
            joined
        }
        _ => {
            tracing::warn!("Unexpected value type for multiline string");
            String::new()
        }
    }
}

#[cfg(test)]
mod test {
    use serde_json::json;

    use super::*;

    /// Both forms of a `multiline_string` (an array of strings and a plain string)
    /// should translate to the same Rust string.
    #[test]
    fn multiline_string() {
        let expected = "Line1\nLine2";
        let inputs = [json!(["Line1\n", "Line2"]), json!("Line1\nLine2")];

        for input in inputs.iter() {
            assert_eq!(translate_multiline_string(input), expected);
        }
    }
}
9 changes: 9 additions & 0 deletions rust/src/methods/decode/mod.rs
Expand Up @@ -5,6 +5,7 @@ use stencila_schema::Node;

#[cfg(feature = "decode-date")]
pub mod date;

#[cfg(feature = "decode-docx")]
pub mod docx;

Expand All @@ -14,6 +15,9 @@ pub mod json;
#[cfg(feature = "decode-html")]
pub mod html;

#[cfg(feature = "decode-ipynb")]
pub mod ipynb;

#[cfg(feature = "decode-md")]
pub mod md;

Expand All @@ -38,6 +42,8 @@ pub mod toml;
#[cfg(feature = "decode-yaml")]
pub mod yaml;

// Modules for types of content, rather than specific formats

pub mod code;
pub mod media;

Expand Down Expand Up @@ -67,6 +73,9 @@ pub async fn decode(input: &str, format: &str) -> Result<Node> {
#[cfg(feature = "decode-html")]
"html" => html::decode(input, false)?,

#[cfg(feature = "decode-ipynb")]
"ipynb" => ipynb::decode(input)?,

#[cfg(feature = "decode-json")]
"json" => json::decode(input)?,

Expand Down
7 changes: 7 additions & 0 deletions rust/src/methods/encode/ipynb.rs
@@ -0,0 +1,7 @@
use eyre::Result;
use stencila_schema::Node;

/// Encode a `Node` to a Jupyter Notebook.
pub fn encode(_node: &Node) -> Result<String> {
todo!()
}
6 changes: 6 additions & 0 deletions rust/src/methods/encode/mod.rs
Expand Up @@ -10,6 +10,9 @@ pub mod docx;
#[allow(clippy::deprecated_cfg_attr)]
pub mod html;

#[cfg(feature = "encode-ipynb")]
pub mod ipynb;

#[cfg(feature = "encode-json")]
pub mod json;

Expand Down Expand Up @@ -102,6 +105,9 @@ pub async fn encode(
#[cfg(feature = "encode-html")]
"html" => html::encode(node, options)?,

#[cfg(feature = "encode-ipynb")]
"ipynb" => ipynb::encode(node)?,

#[cfg(feature = "encode-json")]
"json" => json::encode(node, options)?,

Expand Down
14 changes: 10 additions & 4 deletions rust/tests/ende.rs
Expand Up @@ -89,6 +89,15 @@ proptest! {
assert_json_eq!(input, output);
}

// Round trip an arbitrary article through the Jupyter Notebook encoder and decoder,
// and check that the decoded node is equal (as JSON) to the original.
#[cfg(all(feature="encode-ipynb", feature="decode-ipynb"))]
#[test]
#[ignore="Because encoding not yet implemented"]
fn ipynb(input in article(Freedom::Min)) {
    let notebook = encode::ipynb::encode(&input).unwrap();
    let roundtripped = decode::ipynb::decode(&notebook).unwrap();
    assert_json_eq!(input, roundtripped);
}

#[cfg(all(feature="encode-md", feature="decode-md"))]
#[test]
fn md(input in article(Freedom::Min)) {
Expand All @@ -102,10 +111,7 @@ proptest! {
fn rmd(input in article(Freedom::Min)) {
let content = encode::rmd::encode(&input).unwrap();
let output = decode::rmd::decode(&content).unwrap();
assert_eq!(
serde_json::to_value(&input).unwrap(),
serde_json::to_value(&output).unwrap()
)
assert_json_eq!(input, output);
}

#[cfg(all(feature="encode-pandoc", feature="decode-pandoc"))]
Expand Down

0 comments on commit 4ee5129

Please sign in to comment.