Skip to content

Commit

Permalink
Allow MathML Core tags in post content
Browse files Browse the repository at this point in the history
See mastodon#19806 for more info.

Test Plan:
----------
```
$ RAILS_ENV=test bundle exec rspec spec/lib/sanitize_config_spec.rb -f d
Randomized with seed 26282

Sanitize::Config
  ::MASTODON_OUTGOING
    keeps a with href and rel tag, not adding to rel or target if url is local
    behaves like common HTML sanitization
      removes a with unsupported scheme in href
      removes a with unparsable href
      keeps math
      keeps ul
      removes a without href and only keeps text content
      removes a without href
      keeps a with href
      keeps a with translate="no"
      removes "translate" attribute with invalid value
      keeps h1
      does not re-interpret HTML when removing unsupported links
      keeps title in abbr
      keeps start and reversed attributes of ol
      keeps a with supported scheme and no host
      correctly sanitizes linethickness

Finished in 0.61166 seconds (files took 4.76 seconds to load)
16 examples, 0 failures

Randomized with seed 26282
```
observed 100% code coverage of lib/sanitize_ext/sanitize_config.rb.

See mastodon#19806, glitch-soc#1432
  • Loading branch information
4e554c4c authored and ionathanch committed Sep 17, 2023
1 parent 0bbf7cf commit 7623acf
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 2 deletions.
85 changes: 83 additions & 2 deletions lib/sanitize_ext/sanitize_config.rb
Expand Up @@ -21,6 +21,85 @@ module Config
gemini
).freeze

# Presentation-only ("style") attributes are deliberately absent from the
# allowlists below. In particular, no color attributes and no attributes that
# take length/percentage values are listed.

# Attributes accepted on every allowlisted MathML element.
COMMON_MATH_ATTRS = %w(
  dir
  displaystyle
  mathvariant
  scriptlevel
).freeze

# Maps each allowlisted MathML tag name to the attributes kept on it. Every
# list is extended with COMMON_MATH_ATTRS, and both the hash and its value
# arrays are frozen so the shared constant cannot be mutated by accident.
MATH_TAG_ATTRS = {
  'annotation' => %w(encoding),
  'annotation-xml' => %w(encoding),
  # we remove all attributes from maction
  'maction' => %w(),
  'math' => %w(display alttext),
  'merror' => %w(),
  # linethickness gets extra filtering in MATH_TRANSFORMER
  'mfrac' => %w(linethickness),
  'mi' => %w(),
  'mmultiscripts' => %w(),
  'mn' => %w(),
  'mo' => %w(
    form
    fence
    separator
    stretchy
    symmetric
    largeop
    movablelimits
  ),
  'mover' => %w(accent),
  # like mspace, its length-valued attributes are not allowlisted
  'mpadded' => %w(),
  'mphantom' => %w(),
  'mprescripts' => %w(),
  'mroot' => %w(),
  'mrow' => %w(),
  'ms' => %w(),
  # mspace is only described by its `width`, `depth` and `height` attributes.
  # If these are removed, perhaps we should remove the element in general?
  'mspace' => %w(),
  'msqrt' => %w(),
  'mstyle' => %w(),
  'msub' => %w(),
  'msubsup' => %w(),
  'msup' => %w(),
  'mtable' => %w(),
  'mtd' => %w(colspan rowspan),
  'mtext' => %w(),
  'mtr' => %w(),
  'munder' => %w(accentunder),
  # The element is named <munderover>; the previous 'moverunder' key matched
  # no real tag, so <munderover> was being stripped from content.
  'munderover' => %w(accent accentunder),
  'semantics' => %w(),
}.transform_values { |attr_list| (attr_list + COMMON_MATH_ATTRS).freeze }.freeze

# We need some special logic for some math tags.
#
# In particular, <mfrac> has a (usually stylistic) attribute
# `linethickness`, which denotes the thickness of the horizontal bar.
# However, `linethickness="0"` erases the horizontal bar completely. The
# result looks more like a two-element table, and could denote a two-element
# vector, or (in the MathML Core spec) the binomial coefficient!
# For example:
# <mo>(</mo><mfrac linethickness="0"><mi>x</mi><mi>y</mi></mfrac><mo>)</mo>
# denotes xCy, while
# <mo>(</mo><mfrac><mi>x</mi><mi>y</mi></mfrac><mo>)</mo>
# denotes (x/y). These two constructions are very different and the
# distinction needs to be maintained.
MATH_TRANSFORMER = lambda do |env|
  element = env[:node]

  # Nothing to do for nodes already allowlisted or for non-element nodes.
  next if env[:is_allowlisted]
  next unless element.element?
  # Only <mfrac> needs special handling.
  next if env[:node_name] != 'mfrac'

  # Drop `linethickness` unless its value is exactly "0". The node itself is
  # not allowlisted here, so the remaining attributes are still subject to
  # the regular attribute allowlist (CleanElement transformer).
  element.attribute_nodes.each do |attribute|
    next unless attribute.name == 'linethickness'

    attribute.unlink unless attribute.value == '0'
  end
end

CLASS_WHITELIST_TRANSFORMER = lambda do |env|
node = env[:node]
class_list = node['class']&.split(/[\t\n\f\r ]/)
Expand Down Expand Up @@ -75,7 +154,7 @@ module Config
end

MASTODON_STRICT ||= freeze_config(
elements: %w(p br span a abbr del pre blockquote code b strong u sub sup i em h1 h2 h3 h4 h5 ul ol li),
elements: %w(p br span a abbr del pre blockquote code b strong u sub sup i em h1 h2 h3 h4 h5 ul ol li) + MATH_TAG_ATTRS.keys,

attributes: {
'a' => %w(href rel class title translate),
Expand All @@ -84,7 +163,7 @@ module Config
'blockquote' => %w(cite),
'ol' => %w(start reversed),
'li' => %w(value),
},
}.merge(MATH_TAG_ATTRS),

add_attributes: {
'a' => {
Expand All @@ -103,6 +182,7 @@ module Config
IMG_TAG_TRANSFORMER,
TRANSLATE_TRANSFORMER,
UNSUPPORTED_HREF_TRANSFORMER,
MATH_TRANSFORMER,
]
)

Expand Down Expand Up @@ -169,6 +249,7 @@ module Config
UNSUPPORTED_HREF_TRANSFORMER,
LINK_REL_TRANSFORMER,
LINK_TARGET_TRANSFORMER,
MATH_TRANSFORMER,
]
)
end
Expand Down
10 changes: 10 additions & 0 deletions spec/lib/sanitize_config_spec.rb
Expand Up @@ -55,6 +55,16 @@
it 'keeps title in abbr' do
  # The title attribute on <abbr> is allowlisted, so the markup round-trips unchanged.
  html = '<abbr title="HyperText Markup Language">HTML</abbr>'
  expect(Sanitize.fragment(html, subject)).to eq html
end

it 'keeps math' do
  # Use a local variable: a constant assigned inside an example block leaks
  # out of the example and is re-assigned (with an "already initialized
  # constant" warning) every time these examples run.
  mathml = '<math display="block"><mrow><mrow><munder><mo movablelimits="false">∑</mo><mrow><mi>a</mi><mo>∈</mo><mi>𝔄</mi></mrow></munder></mrow><mn>2</mn><mo>⁢</mo><mi>a</mi><mo>+</mo><mn>1</mn></mrow></math>'
  expect(Sanitize.fragment(mathml, subject)).to eq mathml
end

it 'correctly sanitizes linethickness' do
  # linethickness="0" changes the meaning of the fraction and must survive.
  barless = '<math><mfrac linethickness="0"><mn>1</mn><mn>2</mn></mfrac></math>'
  expect(Sanitize.fragment(barless, subject)).to eq barless

  # Any other value is purely stylistic and is stripped.
  styled = '<math><mfrac linethickness="1"><mn>1</mn><mn>2</mn></mfrac></math>'
  expect(Sanitize.fragment(styled, subject)).to eq '<math><mfrac><mn>1</mn><mn>2</mn></mfrac></math>'
end
end

describe '::MASTODON_OUTGOING' do
Expand Down

0 comments on commit 7623acf

Please sign in to comment.