Skip to content

Commit

Permalink
fix(Markdown decoding): Handle code chunks and expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
nokome committed Aug 31, 2021
1 parent 6e65620 commit c158436
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 25 deletions.
21 changes: 21 additions & 0 deletions fixtures/articles/code.md
@@ -0,0 +1,21 @@
This article fixture is focussed on the Markdown representation of executable code nodes such as `CodeChunk`, `CodeExpression`, and `Parameter` nodes.

## Inline code

Code expressions have a language and the `exec` keyword in curly braces, like this `1+1`{r exec} and this `2+2`{python exec}.

Non-executable code fragments lack the `exec` keyword but can have a language, e.g. `3+3`{r}.

## Block code

Code chunks use the `exec` keyword to differentiate them from code blocks,

```r exec
"Hello from R"
```

Non executable code blocks do not have the `exec` keyword,

```python
# Not executed
```
104 changes: 79 additions & 25 deletions rust/src/methods/decode/md.rs
Expand Up @@ -11,20 +11,20 @@ use nom::{
branch::alt,
bytes::complete::{tag, take, take_until, take_while1},
character::complete::{char, digit1, multispace0, multispace1},
combinator::{map_res, not, peek},
combinator::{map_res, not, opt, peek},
multi::{fold_many0, separated_list1},
sequence::{delimited, preceded, tuple},
sequence::{delimited, pair, preceded, tuple},
IResult,
};
use once_cell::sync::Lazy;
use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
use regex::Regex;
use stencila_schema::{
Article, AudioObjectSimple, BlockContent, Cite, CiteGroup, CodeBlock, CodeFragment,
CreativeWorkContent, Delete, Emphasis, Heading, ImageObjectSimple, InlineContent, Link, List,
ListItem, ListItemContent, MathFragment, Node, Paragraph, QuoteBlock, Strong, Subscript,
Superscript, TableCell, TableCellContent, TableRow, TableRowRowType, TableSimple,
ThematicBreak, VideoObjectSimple,
Article, AudioObjectSimple, BlockContent, Cite, CiteGroup, CodeBlock, CodeChunk,
CodeExpression, CodeFragment, CreativeWorkContent, Delete, Emphasis, Heading,
ImageObjectSimple, InlineContent, Link, List, ListItem, ListItemContent, MathFragment, Node,
Paragraph, QuoteBlock, Strong, Subscript, Superscript, TableCell, TableCellContent, TableRow,
TableRowRowType, TableSimple, ThematicBreak, VideoObjectSimple,
};

/// Decode a Markdown document to a `Node`
Expand Down Expand Up @@ -253,22 +253,39 @@ pub fn decode_fragment(md: &str) -> Vec<BlockContent> {
..Default::default()
})),
Tag::CodeBlock(kind) => {
let text = inlines.pop_text().trim_end_matches('\n').to_string();
blocks.push_node(BlockContent::CodeBlock(CodeBlock {
text,
programming_language: match kind {
CodeBlockKind::Fenced(lang) => {
let lang = lang.to_string();
if !lang.is_empty() {
Some(Box::new(lang))
let (lang, exec) = match kind {
CodeBlockKind::Fenced(lang) => {
let lang = lang.to_string();
if !lang.is_empty() {
let (lang, exec) = if let Some(lang) = lang.strip_suffix("exec") {
(lang.to_string(), true)
} else {
None
}
(lang.to_string(), false)
};
(Some(lang), exec)
} else {
(None, false)
}
_ => None,
},
..Default::default()
}))
}
_ => (None, false),
};

let text = inlines.pop_text().trim_end_matches('\n').to_string();

let node = match (lang.as_ref(), exec) {
(Some(lang), true) => BlockContent::CodeChunk(CodeChunk {
text,
programming_language: lang.to_string(),
..Default::default()
}),
_ => BlockContent::CodeBlock(CodeBlock {
text,
programming_language: lang.map(Box::new),
..Default::default()
}),
};

blocks.push_node(node)
}

// Inline nodes with inline content
Expand Down Expand Up @@ -362,10 +379,9 @@ pub fn decode_fragment(md: &str) -> Vec<BlockContent> {
}
},
Event::Code(value) => {
inlines.push_node(InlineContent::CodeFragment(CodeFragment {
text: value.to_string(),
..Default::default()
}));
// Because we allow for attributes on code, we push back the
// code in back ticks for it to be parsed again later.
inlines.push_text(&["`", &value.to_string(), "`"].concat())
}
Event::Rule => blocks.push_node(BlockContent::ThematicBreak(ThematicBreak {
..Default::default()
Expand Down Expand Up @@ -623,6 +639,7 @@ impl Inlines {
fn inline_content(input: &str) -> IResult<&str, Vec<InlineContent>> {
fold_many0(
alt((
code_attrs,
cite_group,
cite,
math,
Expand All @@ -646,6 +663,43 @@ fn inline_content(input: &str) -> IResult<&str, Vec<InlineContent>> {
)(input)
}

/// Parse inline code, optionally followed by attributes in curly braces,
/// e.g. `code`{attr1 attr2}, into a `CodeFragment` or `CodeExpression` node
///
/// The attributes are split on whitespace and interpreted as follows:
///
/// - the `exec` keyword, in any position, marks the code as executable and
///   produces a `CodeExpression`; otherwise a `CodeFragment` is produced
/// - the first attribute that is not `exec` is used as the programming language
///
/// The `exec` keyword itself is never used as the language, so `{exec}` on its
/// own gives a `CodeExpression` with an empty programming language rather than
/// one whose language is "exec".
pub fn code_attrs(input: &str) -> IResult<&str, InlineContent> {
    map_res(
        pair(
            delimited(char('`'), take_until("`"), char('`')),
            opt(delimited(char('{'), take_until("}"), char('}'))),
        ),
        |(code, attrs): (&str, Option<&str>)| -> Result<InlineContent> {
            let text = code.to_string();

            // Absent braces are equivalent to an empty attribute list
            let attrs = attrs.unwrap_or_default();
            let exec = attrs.split_whitespace().any(|attr| attr == "exec");
            // Skip the `exec` keyword when looking for the language so that
            // `{exec}` and `{exec r}` do not yield a language named "exec"
            let lang = attrs
                .split_whitespace()
                .find(|attr| *attr != "exec")
                .map(String::from);

            let node = if exec {
                InlineContent::CodeExpression(CodeExpression {
                    text,
                    // The language is a required string on code expressions,
                    // so fall back to an empty string when none was given
                    programming_language: lang.unwrap_or_default(),
                    ..Default::default()
                })
            } else {
                InlineContent::CodeFragment(CodeFragment {
                    text,
                    programming_language: lang.map(Box::new),
                    ..Default::default()
                })
            };
            Ok(node)
        },
    )(input)
}

/// Parse a string into a narrative `Cite` node
///
/// This attempts to follow Pandoc's citation handling as closely as possible
Expand Down
118 changes: 118 additions & 0 deletions rust/src/methods/decode/snapshots/md_articles@code.md.snap.new
@@ -0,0 +1,118 @@
---
source: rust/src/methods/decode/md.rs
expression: "decode(&content).expect(\"Unable to decode Markdown\")"
input_file: fixtures/articles/code.md

---
{
"type": "Article",
"content": [
{
"type": "Paragraph",
"content": [
"This article fixture is focussed on the Markdown representation of executable code nodes such as ",
{
"type": "CodeFragment",
"text": "CodeChunk"
},
", ",
{
"type": "CodeFragment",
"text": "CodeExpression"
},
", and ",
{
"type": "CodeFragment",
"text": "Parameter"
},
" nodes."
]
},
{
"type": "Heading",
"content": [
"Inline code"
],
"depth": 2
},
{
"type": "Paragraph",
"content": [
"Code expressions have a language and the ",
{
"type": "CodeFragment",
"text": "exec"
},
" keyword in curly braces, like this ",
{
"type": "CodeExpression",
"programmingLanguage": "r",
"text": "1+1"
},
" and this ",
{
"type": "CodeExpression",
"programmingLanguage": "python",
"text": "2+2"
},
"."
]
},
{
"type": "Paragraph",
"content": [
"Non-executable code fragments, lack the ",
{
"type": "CodeFragment",
"text": "exec"
},
" keyword but can have a language e.g. ",
{
"type": "CodeFragment",
"text": "3+3",
"programmingLanguage": "r"
},
"."
]
},
{
"type": "Heading",
"content": [
"Block code"
],
"depth": 2
},
{
"type": "Paragraph",
"content": [
"Code chunk use the ",
{
"type": "CodeFragment",
"text": "exec"
},
" keywork to differentiate them from code blocks,"
]
},
{
"type": "CodeChunk",
"programmingLanguage": "r ",
"text": "\"Hello from R\""
},
{
"type": "Paragraph",
"content": [
"Non executable code blocks do not have the ",
{
"type": "CodeFragment",
"text": "exec"
},
" keyword,"
]
},
{
"type": "CodeBlock",
"text": "# Not executed",
"programmingLanguage": "python"
}
]
}

0 comments on commit c158436

Please sign in to comment.