Skip to content

Commit aaa613f

Browse files
Omikhleiaalerque
authored and committed
fix(packages): Correct handling of ampersands and tildes in bibtex
Accept `\&` for compatibility with legacy BibTeX, but do not mandate it to be escaped, for compatibility with other engines. Support unescaped `~` as a non-breaking space for compatibility with TeX, as this is often found in existing bibliography files. Support `\~` to render a tilde. XML-escape the input so it can safely be wrapped in a `<sile>` construct. Closes #2050 Closes #1860 (replaced by this implementation)
1 parent b775147 commit aaa613f

File tree

1 file changed

+40
-5
lines changed

1 file changed

+40
-5
lines changed

packages/bibtex/init.lua

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,37 @@ local epnf = require("epnf")
77

88
local Bibliography
99

10+
local nbsp = luautf8.char(0x00A0)
11+
--- Normalize a raw BibTeX field value so it can be wrapped in <sile> tags.
-- The transformations are applied in this order:
--   1. A backslash-escaped tilde (\~) becomes a literal tilde; every other
--      tilde becomes a non-breaking space (the module-level `nbsp`).
--   2. Every remaining backslash is dropped, which unescapes \", \& etc.
--   3. The XML special characters &, < and > are replaced by entities.
-- @tparam string str Raw field value.
-- @treturn string The sanitized, XML-escaped value.
local function sanitize (str)
   local s = str
      -- TeX special characters:
      -- Backslash-escaped tilde is a tilde,
      -- but standalone tilde is a non-breaking space.
      -- Only an optional backslash is captured ahead of the tilde: capturing
      -- an arbitrary character would swallow the first tilde of "~~" as the
      -- "previous character" and leave it in the output as a literal tilde.
      :gsub("(\\?)~", function (escape)
         if escape == "\\" then
            return "~"
         end
         return nbsp
      end)
      -- Other backslash-escaped characters are skipped
      -- TODO FIXME:
      -- This is ok for \", \& etc. which we want to unescape,
      -- BUT what should we do with other TeX-like commands?
      :gsub("\\", "")
      -- We will wrap the content in <sile> tags so we need to XML-escape
      -- the input.
      :gsub("&", "&amp;")
      :gsub("<", "&lt;")
      :gsub(">", "&gt;")
   -- The assignment keeps only the first gsub result (the string), dropping
   -- the substitution count.
   return s
end
40+
1041
-- luacheck: push ignore
1142
-- stylua: ignore start
1243
---@diagnostic disable: undefined-global, unused-local, lowercase-global
@@ -18,11 +49,14 @@ local bibtexparser = epnf.define(function (_ENV)
1849
local quoted = C( P'"' * C(((1 - S'"\r\n\f\\') + (P'\\' * 1)) ^ 0) * '"' ) / function (...) local t={...}; return t[2] end
1950
local _ = WS^0
2051
local sep = S",;" * _
21-
local myID = C(identifier + P(1)) / function (t) return strings[t] or t end
22-
local myTag = C(identifier + P(1)) / function (t) return t:lower() end
23-
local pieces = balanced + quoted + myID
24-
local value = Ct(pieces * (WS * P("#") * WS * pieces)^0) / function (t) return table.concat(t) end
25-
local pair = Cg(myTag * _ * "=" * _ * C(value)) * _ * sep^-1 / function (...) local t= {...}; return t[1], t[#t] end
52+
local myID = C(identifier)
53+
local myStrID = myID / function (t) return strings[t] or t end
54+
local myTag = C(identifier) / function (t) return t:lower() end
55+
local pieces = balanced + quoted + myStrID
56+
local value = Ct(pieces * (WS * P("#") * WS * pieces)^0)
57+
/ function (t) return table.concat(t) end / sanitize
58+
local pair = myTag * _ * "=" * _ * value * _ * sep^-1
59+
/ function (...) local t= {...}; return t[1], t[#t] end
2660
local list = Cf(Ct("") * pair^0, rawset)
2761
local skippedType = Cmt(R("az", "AZ")^1, function(_, _, tag)
2862
-- ignore both @comment and @preamble
@@ -300,6 +334,7 @@ If no such abbreviation is found, the value is considered to be a string literal
300334
301335
String values are assumed to be in the UTF-8 encoding, and shall not contain (La)TeX commands.
302336
Special character sequences from TeX (such as \code{`} assumed to be an opening quote) are not supported.
337+
There are exceptions to this rule. Notably, the \code{~} character can be used to represent a non-breaking space (when not backslash-escaped), and the \code{\\&} sequence is accepted (though this implementation does not mandate escaping ampersands).
303338
304339
Values can also be composed by concatenating strings, using the \code{#} character.
305340

0 commit comments

Comments
 (0)