fix(R markdown): Handling of specific format for code chunks and expressions
stencila · Aug 31, 2021 · 9975b42 · 9975b42
1 parent eae08fc
commit 9975b42
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 4 deletions.
diff --git a/fixtures/fragments/rmd/code-chunk.rmd b/fixtures/fragments/rmd/code-chunk.rmd
@@ -0,0 +1,3 @@
+```{r}
+# No label, no options
+```
diff --git a/fixtures/fragments/rmd/code-expression.rmd b/fixtures/fragments/rmd/code-expression.rmd
@@ -0,0 +1,3 @@
+A R Markdown code expression (inline code chunk): `r 1+1`.
+
+Plain old code fragments: `1+1`, `r2d2`, `r`.
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -50,7 +50,7 @@ decode-docx = ["decode-pandoc"]
 decode-html = ["kuchiki", "markup5ever"]
 decode-json = []
 decode-latex = ["decode-pandoc"]
-decode-md = ["pulldown-cmark", "nom", "decode-html"]
+decode-md = ["pulldown-cmark", "nom", "coerce", "decode-html", "encode-txt"]
 decode-pandoc = ["binaries", "pandoc_types"]
 decode-person = ["human_name"]
 decode-rmd = ["decode-md"]

diff --git a/rust/src/methods/decode/rmd.rs b/rust/src/methods/decode/rmd.rs
@@ -1,9 +1,92 @@
 use super::md;
 use eyre::Result;
-use stencila_schema::Node;
+use stencila_schema::{
+    BlockContent, CodeBlock, CodeChunk, CodeExpression, CodeFragment, Delete, Emphasis,
+    InlineContent, Node, NontextualAnnotation, Paragraph, Strong, Subscript, Superscript,
+};
 
 /// Decode a R Markdown document to a `Node`
 pub fn decode(input: &str) -> Result<Node> {
-    // TODO: Any necessary translations before parsing as Markdown
-    md::decode(input)
+    let mut node = md::decode(input)?;
+    if let Node::Article(article) = &mut node {
+        if let Some(content) = &mut article.content {
+            transform_blocks(content)
+        }
+    }
+    Ok(node)
+}
+
+/// Transform Markdown blocks into their R Markdown equivalents, in place.
+///
+/// - A `CodeBlock` whose language is an R chunk header (see `is_r_chunk_header`)
+///   is replaced by a `CodeChunk` with `programming_language` set to `"r"` and the
+///   same `text`. Any label or options in the header are not carried over.
+/// - The inline content of each `Paragraph` is passed to `transform_inlines`.
+/// - Every other kind of block is left untouched (nested blocks are not visited).
+fn transform_blocks(blocks: &mut Vec<BlockContent>) {
+    for block in blocks {
+        match block {
+            BlockContent::CodeBlock(CodeBlock {
+                programming_language,
+                text,
+                ..
+            }) => {
+                // Borrow the language rather than cloning it; a missing language is never a chunk
+                let is_r_chunk = programming_language
+                    .as_deref()
+                    .map_or(false, |language| is_r_chunk_header(language));
+                if is_r_chunk {
+                    *block = BlockContent::CodeChunk(CodeChunk {
+                        programming_language: "r".to_string(),
+                        text: text.to_string(),
+                        ..Default::default()
+                    })
+                }
+            }
+            BlockContent::Paragraph(Paragraph { content, .. }) => transform_inlines(content),
+            _ => (),
+        }
+    }
+}
+
+/// Is a code block's language string an R Markdown chunk header for the `r` engine?
+///
+/// Accepts `{r}`, `{r label}`, `{r, option=value}` and similar: the text after `{r`
+/// must be empty or start with a comma or whitespace, and the string must end with `}`.
+/// This rejects other engines whose name merely starts with "r", such as `{ruby}`
+/// or `{rust}`, which a plain `starts_with("{r")` test would wrongly treat as R.
+fn is_r_chunk_header(language: &str) -> bool {
+    language
+        .strip_prefix("{r")
+        .and_then(|rest| rest.strip_suffix('}'))
+        .map_or(false, |options| {
+            options.is_empty() || options.starts_with(|c: char| c == ',' || c.is_whitespace())
+        })
+}
+
+/// Transform Markdown inlines into their R Markdown equivalents, in place.
+///
+/// A `CodeFragment` whose text starts with `r ` (the letter followed by a space)
+/// becomes a `CodeExpression` with `programming_language` set to `"r"` and the
+/// remainder of the text as its code. Fragments without that prefix are kept as is.
+/// Inlines that wrap other inlines (delete, emphasis, subscript, superscript,
+/// strong, non-textual annotation) are descended into; everything else is skipped.
+fn transform_inlines(inlines: &mut Vec<InlineContent>) {
+    for node in inlines.iter_mut() {
+        match node {
+            // `r <code>` marks an inline R expression rather than a plain code fragment
+            InlineContent::CodeFragment(CodeFragment { text: code, .. }) => {
+                let expression = match code.strip_prefix("r ") {
+                    Some(expression) => expression.to_string(),
+                    None => continue,
+                };
+                *node = InlineContent::CodeExpression(CodeExpression {
+                    programming_language: "r".to_string(),
+                    text: expression,
+                    ..Default::default()
+                });
+            }
+            // Marks that contain further inlines: recurse into their content
+            InlineContent::Delete(Delete { content: nested, .. })
+            | InlineContent::Emphasis(Emphasis { content: nested, .. })
+            | InlineContent::Subscript(Subscript { content: nested, .. })
+            | InlineContent::Superscript(Superscript { content: nested, .. })
+            | InlineContent::Strong(Strong { content: nested, .. })
+            | InlineContent::NontextualAnnotation(NontextualAnnotation {
+                content: nested, ..
+            }) => transform_inlines(nested),
+            _ => {}
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::utils::tests::snapshot_content;
+    use insta::assert_json_snapshot;
+
+    /// Snapshot the decoded JSON of the R Markdown article fixtures.
+    ///
+    /// Ignored by default; run explicitly with `cargo test -- --ignored`.
+    #[ignore]
+    #[test]
+    fn rmd_articles() {
+        snapshot_content("articles/*.Rmd", |_path, content| {
+            assert_json_snapshot!(decode(&content).unwrap());
+        });
+    }
+
+    /// Snapshot the decoded JSON of the R Markdown fragment fixtures.
+    ///
+    /// The fragment fixtures are named with a lowercase `.rmd` extension, so the
+    /// pattern uses the same case. NOTE(review): presumably `snapshot_content`
+    /// globs relative to the fixtures folder; with a case-sensitive matcher a
+    /// `*.Rmd` pattern would select no files and the test would check nothing.
+    #[test]
+    fn rmd_fragments() {
+        snapshot_content("fragments/rmd/*.rmd", |_path, content| {
+            assert_json_snapshot!(decode(&content).unwrap());
+        });
+    }
 }
diff --git a/rust/src/methods/decode/snapshots/rmd_fragments@code-chunk.rmd.snap b/rust/src/methods/decode/snapshots/rmd_fragments@code-chunk.rmd.snap
@@ -0,0 +1,16 @@
+---
+source: rust/src/methods/decode/rmd.rs
+expression: decode(&content).unwrap()
+input_file: fixtures/fragments/rmd/code-chunk.rmd
+
+---
+{
+  "type": "Article",
+  "content": [
+    {
+      "type": "CodeChunk",
+      "programmingLanguage": "r",
+      "text": "# No label, no options"
+    }
+  ]
+}
diff --git a/rust/src/methods/decode/snapshots/rmd_fragments@code-expression.rmd.snap b/rust/src/methods/decode/snapshots/rmd_fragments@code-expression.rmd.snap
@@ -0,0 +1,44 @@
+---
+source: rust/src/methods/decode/rmd.rs
+expression: decode(&content).unwrap()
+input_file: fixtures/fragments/rmd/code-expression.rmd
+
+---
+{
+  "type": "Article",
+  "content": [
+    {
+      "type": "Paragraph",
+      "content": [
+        "A R Markdown code expression (inline code chunk): ",
+        {
+          "type": "CodeExpression",
+          "programmingLanguage": "r",
+          "text": "1+1"
+        },
+        "."
+      ]
+    },
+    {
+      "type": "Paragraph",
+      "content": [
+        "Plain old code fragments: ",
+        {
+          "type": "CodeFragment",
+          "text": "1+1"
+        },
+        ", ",
+        {
+          "type": "CodeFragment",
+          "text": "r2d2"
+        },
+        ", ",
+        {
+          "type": "CodeFragment",
+          "text": "r"
+        },
+        "."
+      ]
+    }
+  ]
+}