/
utils.py
173 lines (150 loc) · 7.79 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import re
CALLOUT_BLOCK_REGEX = re.compile(r'^(\s*)((?:> ?)+) *\[!([^\]]*)\]([\-\+]?)(.*)?')
# (1): leading whitespace (all tabs and 4x spaces get reused)
# (2): indents (all leading '>' symbols)
# (3): callout type ([!'capture'] or [!'capture | attribute'] excl. brackets and leading !)
# (4): foldable token (+ or - or <blank>)
# (5): title
CALLOUT_CONTENT_SYNTAX_REGEX = re.compile(r'^(\s*)((?:> ?)+)')
# (1): leading whitespace (all tabs and 4x spaces get reused)
# (2): indents (all leading '>' symbols)
class CalloutParser:
"""Class to parse callout blocks from markdown and convert them to mkdocs supported admonitions."""
# From https://help.obsidian.md/How+to/Use+callouts#Types
aliases = {
'abstract': ['summary', 'tldr'],
'tip': ['hint', 'important'],
'success': ['check', 'done'],
'question': ['help', 'faq'],
'warning': ['caution', 'attention'],
'failure': ['fail', 'missing'],
'danger': ['error'],
'quote': ['cite']
}
alias_tuples = [(alias, c_type) for c_type, aliases in aliases.items() for alias in aliases]
def __init__(self, convert_aliases: bool = True, breakless_lists: bool = True):
# Stack to keep track of the current indentation level
self.indent_levels: list[int] = list()
# Whether to convert aliases or not
self.convert_aliases: bool = convert_aliases
# Breakless list allow for lists to be created without a blank line between them
# (Obsidian's default behavior, but not within the scope of the CommonMark spec)
self.breakless_lists: bool = breakless_lists
# Flags to keep track of the previous line's content for breakless list handling
self.text_in_prev_line: bool = False
self.list_in_prev_line: bool = False
# Check that the callout isn't inside a codefence
self.in_codefence: bool = False
def _parse_block_syntax(self, block) -> str:
"""
Converts the callout syntax from obsidian into the mkdocs syntax
Takes an argument block, which is a regex match.
"""
# Group 1: Leading whitespace (we need to reuse tabs and 4x spaces)
whitespace = block.group(1).replace(' ', '\t')
whitespace = re.sub(r'[^\t]', '', whitespace)
# Group 2: Leading > symbols (indentation, for nested callouts)
indent = block.group(2).count('>')
indent = '\t' * (indent - 1)
# Group 3: Callout block type (note, warning, info, etc.) + inline block syntax
c_type = block.group(3).lower()
c_type = re.sub(r' *\| *(inline|left) *$', ' inline', c_type)
c_type = re.sub(r' *\| *(inline end|right) *$', ' inline end', c_type)
c_type = re.sub(r' *\|.*', '', c_type)
# Convert aliases, if enabled
if self.convert_aliases:
c_type = self._convert_aliases(c_type)
# Group 4: Foldable callouts
syntax = {'-': '???', '+': '???+'}
syntax = syntax.get(block.group(4), '!!!')
# Group 5: Title, add leading whitespace and quotation marks, if it exists
title = block.group(5).strip()
title = f' "{title}"' if title else ''
# Construct the new callout syntax ({indent}!!! note "Title")
return f'{whitespace}{indent}{syntax} {c_type}{title}'
@staticmethod
def _convert_aliases(c_type: str) -> str:
"""Converts aliases to their respective callout type, if its enabled"""
for alias, identifier in CalloutParser.alias_tuples:
c_type = re.sub(rf'^{alias}\b', identifier, c_type)
return c_type
def _breakless_list_handler(self, line: str) -> str:
"""
Handles a breakless list by adding a newline if the previous line was text
This is a workaround for Obsidian's default behavior, which allows for lists to be created
without a blank line between them.
"""
is_list = re.search(r'^\s*(?:[-+*]|\d+\.)\s', line)
if is_list and self.text_in_prev_line:
# If the previous line was a list, keep the line as is
if self.list_in_prev_line:
return line
# If the previous line was text, add a newline before the list
indent = re.search(r'^\t*', line).group()
line = f'{indent}\n{line}'
else:
# Set text_in_prev_line according to the current line
self.text_in_prev_line = line.strip() != ''
self.list_in_prev_line = is_list
return line
def _convert_block(self, line: str) -> str:
"""Calls parse_block_syntax if regex matches, which returns a converted callout block"""
match = re.search(CALLOUT_BLOCK_REGEX, line)
if match:
# Store the current indent level and add it to the list if it doesn't exist
indent_level = match.group(2).count('>')
# Guard clause to prevent non-callout blocks from being converted (e.g. code blocks)
if 1 not in self.indent_levels and indent_level != 1:
return line
if indent_level not in self.indent_levels:
self.indent_levels.append(indent_level)
return self._parse_block_syntax(match)
def _convert_content(self, line: str) -> str:
"""
Converts the callout content by replacing leading '>' symbols with '\t'.
Will return the original line if active_callout is false or if line is missing leading '>' symbols.
"""
match = re.search(CALLOUT_CONTENT_SYNTAX_REGEX, line)
if match and self.indent_levels:
# Group 1: Leading whitespace (we need to reuse tabs and 4x spaces)
whitespace = match.group(1).replace(' ', '\t')
whitespace = re.sub(r'[^\t]', '', whitespace)
# Remove any higher levels whilst the current line
# has a lower indent level than the last line.
while match.group(2).count('>') < self.indent_levels[-1]:
self.indent_levels = self.indent_levels[:-1]
indent = '\t' * (self.indent_levels[-1] + whitespace.count('\t'))
line = re.sub(rf'^\s*(?:> ?){{{self.indent_levels[-1]}}} ?', indent, line)
# Handle breakless lists before returning the line, if enabled
if self.breakless_lists:
line = self._breakless_list_handler(line)
else:
# Reset the relevant variables
self.indent_levels = list()
# These are unused if breakless_lists is disabled
self.list_in_prev_line = False
self.text_in_prev_line = False
return line
def convert_line(self, line: str) -> str:
"""
Converts the syntax for callouts to admonitions for a single line of markdown
returns _convert_block if line matches that of a callout block syntax,
if line is not a block syntax, it will return _convert_content.
"""
# Toggle in_codefence if line contains a codefence
# (If a line contains '```' before any meaningful content, it's a codefence)
if re.match(r'^\s*(?:>\s*)*```', line):
self.in_codefence = not self.in_codefence
if self.in_codefence and self.indent_levels:
return self._convert_content(line)
elif self.in_codefence:
return line
return self._convert_block(line) or self._convert_content(line)
def parse(self, markdown: str) -> str:
"""Takes a markdown file input returns a version with converted callout syntax"""
self.indent_levels = list() # Reset (redundant in conjunction with mkdocs)
# If markdown file does not contain a callout, skip it
if not re.search(r'> *\[!', markdown):
return markdown
# Convert markdown line by line, then return it
return '\n'.join(self.convert_line(line) for line in markdown.split('\n'))