Skip to content

Commit

Permalink
Use original raw text and mimetype when indexing rich text.
Browse files (browse the repository at this point in the history)
This avoids a double transform (raw source to output mimetype to plain text).
Includes a reindex of the SearchableText index for Collections, Documents and News Items.

Backported from master
Issue plone/Products.CMFPlone#2066
  • Loading branch information
mauritsvanrees committed Aug 9, 2017
1 parent 312af20 commit daaf6b0
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 4 deletions.
6 changes: 5 additions & 1 deletion CHANGES.rst
Expand Up @@ -14,7 +14,11 @@ New features:

Bug fixes:

- *add item here*
- Use original raw text and mimetype when indexing rich text.
This avoids a double transform (raw source to output mimetype to plain text).
Includes a reindex of the SearchableText index for Collections, Documents and News Items.
`Issue 2066 <https://github.com/plone/Products.CMFPlone/issues/2066>`_.
[maurits]


1.1.3 (2017-07-20)
Expand Down
7 changes: 5 additions & 2 deletions plone/app/contenttypes/indexers.py
Expand Up @@ -42,10 +42,13 @@ def SearchableText(obj):
textvalue = richtext.text
if IRichTextValue.providedBy(textvalue):
transforms = getToolByName(obj, 'portal_transforms')
# Before you think about switching raw/output
# or mimeType/outputMimeType, first read
# https://github.com/plone/Products.CMFPlone/issues/2066
text = transforms.convertTo(
'text/plain',
safe_unicode(textvalue.output).encode('utf8'),
mimetype=textvalue.outputMimeType,
safe_unicode(textvalue.raw).encode('utf-8'),
mimetype=textvalue.mimeType,
).getData().strip()

subject = u' '.join(
Expand Down
2 changes: 1 addition & 1 deletion plone/app/contenttypes/profiles/default/metadata.xml
@@ -1,5 +1,5 @@
<metadata>
<version>1105</version>
<version>1106</version>
<dependencies>
<dependency>profile-plone.app.dexterity:default</dependency>
<dependency>profile-plone.app.event:default</dependency>
Expand Down
24 changes: 24 additions & 0 deletions plone/app/contenttypes/tests/test_indexes.py
Expand Up @@ -224,6 +224,30 @@ def test_html_stripped_searchable_text_index(self):
self.assertEqual(index_data['SearchableText'].count('p'), 0)
self.assertEqual(index_data['SearchableText'].count('b'), 0)

def test_raw_text_searchable_text_index(self):
    """Check that the raw rich text source is indexed, not its output.

    Converting the raw text to the output mimetype first, and only then
    to plain text, would be a pointless double transform.
    A consequence is that javascript from the raw source may end up in
    the searchable text, though it is usually hard to store such markup.
    """
    # The continuation line of this literal deliberately starts at
    # column 0: any indentation would become part of the stored markup.
    script_markup = u"""<script type="text/javascript">alert('Lorem ipsum')
</script>"""
    self.document.text = RichTextValue(
        script_markup,
        mimeType='text/html',
        outputMimeType='text/x-html-safe'
    )
    self.document.reindexObject()

    # The text inside the script element has to be findable.
    matches = self.catalog.searchResults(
        {'SearchableText': u'Lorem ipsum'})
    self.assertEqual(len(matches), 1)

    # The tag name and the type attribute value must not be counted
    # in the data stored for the SearchableText index.
    record_id = matches[0].getRID()
    indexed = self.catalog.getIndexDataForRID(record_id)['SearchableText']
    self.assertEqual(indexed.count('script'), 0)
    self.assertEqual(indexed.count('text'), 0)

def test_file_fulltext_in_searchable_text_index_string(self):
from plone.namedfile.file import NamedBlobFile
data = ("Lorem ipsum. Köln <!-- ...oder München, das ist hier die "
Expand Down
17 changes: 17 additions & 0 deletions plone/app/contenttypes/upgrades.py
Expand Up @@ -215,3 +215,20 @@ def searchabletext_collections(context):
for brain in search(portal_type='Collection'):
obj = brain.getObject()
obj.reindexObject(idxs=['SearchableText'])


def searchabletext_richtext(context):
    """Refresh the SearchableText index of the rich text content types.

    The SearchableText indexer has switched back and forth between the
    raw text and the output, and between the original mimetype and the
    output mimetype.  It is now at its third combination (original raw
    source with original mimetype), so the index has to be rebuilt.
    See https://github.com/plone/Products.CMFPlone/issues/2066
    """
    rich_text_types = ['Collection', 'Document', 'News Item']
    portal_catalog = getToolByName(context, 'portal_catalog')
    # Only the SearchableText index is refreshed for each matching item.
    for item in portal_catalog.unrestrictedSearchResults(
            portal_type=rich_text_types):
        item.getObject().reindexObject(idxs=['SearchableText'])
8 changes: 8 additions & 0 deletions plone/app/contenttypes/upgrades.zcml
Expand Up @@ -80,4 +80,12 @@
handler=".upgrades.searchabletext_collections"
/>

<genericsetup:upgradeStep
source="1105"
destination="1106"
title="Reindex SearchableText for all rich text types"
profile="plone.app.contenttypes:default"
handler=".upgrades.searchabletext_richtext"
/>

</configure>

0 comments on commit daaf6b0

Please sign in to comment.