From c15843687a9ccb943ca67e10368f4a235af937f9 Mon Sep 17 00:00:00 2001 From: Nokome Date: Sun, 29 Aug 2021 13:41:03 +1200 Subject: [PATCH] fix(Markdown decoding): Handle code chunks and expressions --- fixtures/articles/code.md | 21 ++++ rust/src/methods/decode/md.rs | 104 +++++++++++---- .../snapshots/md_articles@code.md.snap.new | 118 ++++++++++++++++++ 3 files changed, 218 insertions(+), 25 deletions(-) create mode 100644 fixtures/articles/code.md create mode 100644 rust/src/methods/decode/snapshots/md_articles@code.md.snap.new diff --git a/fixtures/articles/code.md b/fixtures/articles/code.md new file mode 100644 index 0000000000..75140822c0 --- /dev/null +++ b/fixtures/articles/code.md @@ -0,0 +1,21 @@ +This article fixture is focussed on the Markdown representation of executable code nodes such as `CodeChunk`, `CodeExpression`, and `Parameter` nodes. + +## Inline code + +Code expressions have a language and the `exec` keyword in curly braces, like this `1+1`{r exec} and this `2+2`{python exec}. + +Non-executable code fragments, lack the `exec` keyword but can have a language e.g. `3+3`{r}. + +## Block code + +Code chunk use the `exec` keywork to differentiate them from code blocks, + +```r exec +"Hello from R" +``` + +Non executable code blocks do not have the `exec` keyword, + +```python +# Not executed +``` diff --git a/rust/src/methods/decode/md.rs b/rust/src/methods/decode/md.rs index 477e9e8f3d..1e3481483a 100644 --- a/rust/src/methods/decode/md.rs +++ b/rust/src/methods/decode/md.rs @@ -11,20 +11,20 @@ use nom::{ branch::alt, bytes::complete::{tag, take, take_until, take_while1}, character::complete::{char, digit1, multispace0, multispace1}, - combinator::{map_res, not, peek}, + combinator::{map_res, not, opt, peek}, multi::{fold_many0, separated_list1}, - sequence::{delimited, preceded, tuple}, + sequence::{delimited, pair, preceded, tuple}, IResult, }; use once_cell::sync::Lazy; use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag}; use regex::Regex; use stencila_schema::{ - Article, AudioObjectSimple, BlockContent, Cite, CiteGroup, CodeBlock, CodeFragment, - CreativeWorkContent, Delete, Emphasis, Heading, ImageObjectSimple, InlineContent, Link, List, - ListItem, ListItemContent, MathFragment, Node, Paragraph, QuoteBlock, Strong, Subscript, - Superscript, TableCell, TableCellContent, TableRow, TableRowRowType, TableSimple, - ThematicBreak, VideoObjectSimple, + Article, AudioObjectSimple, BlockContent, Cite, CiteGroup, CodeBlock, CodeChunk, + CodeExpression, CodeFragment, CreativeWorkContent, Delete, Emphasis, Heading, + ImageObjectSimple, InlineContent, Link, List, ListItem, ListItemContent, MathFragment, Node, + Paragraph, QuoteBlock, Strong, Subscript, Superscript, TableCell, TableCellContent, TableRow, + TableRowRowType, TableSimple, ThematicBreak, VideoObjectSimple, }; /// Decode a Markdown document to a `Node` @@ -253,22 +253,39 @@ pub fn decode_fragment(md: &str) -> Vec { ..Default::default() })), Tag::CodeBlock(kind) => { - let text = inlines.pop_text().trim_end_matches('\n').to_string(); - blocks.push_node(BlockContent::CodeBlock(CodeBlock { - text, - programming_language: match kind { - CodeBlockKind::Fenced(lang) => { - let lang = lang.to_string(); - if !lang.is_empty() { - Some(Box::new(lang)) + let (lang, exec) = match kind { + CodeBlockKind::Fenced(lang) => { + let lang = lang.to_string(); + if !lang.is_empty() { + let (lang, exec) = if let Some(lang) = lang.strip_suffix("exec") { + (lang.to_string(), true) } else { - None - } + (lang.to_string(), false) + }; + (Some(lang), exec) + } else { + (None, false) } - _ => None, - }, - ..Default::default() - })) + } + _ => (None, false), + }; + + let text = inlines.pop_text().trim_end_matches('\n').to_string(); + + let node = match (lang.as_ref(), exec) { + (Some(lang), true) => BlockContent::CodeChunk(CodeChunk { + text, + programming_language: lang.to_string(), + ..Default::default() + }), + _ => BlockContent::CodeBlock(CodeBlock { + text, + programming_language: lang.map(Box::new), + ..Default::default() + }), + }; + + blocks.push_node(node) } // Inline nodes with inline content @@ -362,10 +379,9 @@ pub fn decode_fragment(md: &str) -> Vec { } }, Event::Code(value) => { - inlines.push_node(InlineContent::CodeFragment(CodeFragment { - text: value.to_string(), - ..Default::default() - })); + // Because we allow for attributes on code, we push back the + // code in back ticks for it to be parsed again later. + inlines.push_text(&["`", &value.to_string(), "`"].concat()) } Event::Rule => blocks.push_node(BlockContent::ThematicBreak(ThematicBreak { ..Default::default() @@ -623,6 +639,7 @@ impl Inlines { fn inline_content(input: &str) -> IResult<&str, Vec> { fold_many0( alt(( + code_attrs, cite_group, cite, math, @@ -646,6 +663,43 @@ fn inline_content(input: &str) -> IResult<&str, Vec> { )(input) } +/// Parse inline code with attributes in curly braces +/// e.g. `code`{attr1 attr2} into a `CodeFragment` or `CodeExpression` node +pub fn code_attrs(input: &str) -> IResult<&str, InlineContent> { + map_res( + pair( + delimited(char('`'), take_until("`"), char('`')), + opt(delimited(char('{'), take_until("}"), char('}'))), + ), + |res: (&str, Option<&str>)| -> Result { + let text = res.0.to_string(); + let (lang, exec) = match res.1 { + Some(attrs) => { + let attrs = attrs.split_whitespace().collect::>(); + ( + attrs.get(0).map(|item| item.to_string()), + attrs.contains(&"exec"), + ) + } + None => (None, false), + }; + let node = match (lang.as_ref(), exec) { + (Some(lang), true) => InlineContent::CodeExpression(CodeExpression { + text, + programming_language: lang.to_string(), + ..Default::default() + }), + _ => InlineContent::CodeFragment(CodeFragment { + text, + programming_language: lang.map(Box::new), + ..Default::default() + }), + }; + Ok(node) + }, + )(input) +} + /// Parse a string into a narrative `Cite` node /// /// This attempts to follow Pandoc's citation handling as closely as possible diff --git a/rust/src/methods/decode/snapshots/md_articles@code.md.snap.new b/rust/src/methods/decode/snapshots/md_articles@code.md.snap.new new file mode 100644 index 0000000000..03d3b222fb --- /dev/null +++ b/rust/src/methods/decode/snapshots/md_articles@code.md.snap.new @@ -0,0 +1,118 @@ +--- +source: rust/src/methods/decode/md.rs +expression: "decode(&content).expect(\"Unable to decode Markdown\")" +input_file: fixtures/articles/code.md + +--- +{ + "type": "Article", + "content": [ + { + "type": "Paragraph", + "content": [ + "This article fixture is focussed on the Markdown representation of executable code nodes such as ", + { + "type": "CodeFragment", + "text": "CodeChunk" + }, + ", ", + { + "type": "CodeFragment", + "text": "CodeExpression" + }, + ", and ", + { + "type": "CodeFragment", + "text": "Parameter" + }, + " nodes." + ] + }, + { + "type": "Heading", + "content": [ + "Inline code" + ], + "depth": 2 + }, + { + "type": "Paragraph", + "content": [ + "Code expressions have a language and the ", + { + "type": "CodeFragment", + "text": "exec" + }, + " keyword in curly braces, like this ", + { + "type": "CodeExpression", + "programmingLanguage": "r", + "text": "1+1" + }, + " and this ", + { + "type": "CodeExpression", + "programmingLanguage": "python", + "text": "2+2" + }, + "." + ] + }, + { + "type": "Paragraph", + "content": [ + "Non-executable code fragments, lack the ", + { + "type": "CodeFragment", + "text": "exec" + }, + " keyword but can have a language e.g. ", + { + "type": "CodeFragment", + "text": "3+3", + "programmingLanguage": "r" + }, + "." + ] + }, + { + "type": "Heading", + "content": [ + "Block code" + ], + "depth": 2 + }, + { + "type": "Paragraph", + "content": [ + "Code chunk use the ", + { + "type": "CodeFragment", + "text": "exec" + }, + " keywork to differentiate them from code blocks," + ] + }, + { + "type": "CodeChunk", + "programmingLanguage": "r ", + "text": "\"Hello from R\"" + }, + { + "type": "Paragraph", + "content": [ + "Non executable code blocks do not have the ", + { + "type": "CodeFragment", + "text": "exec" + }, + " keyword," + ] + }, + { + "type": "CodeBlock", + "text": "# Not executed", + "programmingLanguage": "python" + } + ] +}