Skip to content

Commit

Permalink
Merge pull request #85 from kylewm/brevity2
Browse files Browse the repository at this point in the history
Delegate tweet truncation to brevity
  • Loading branch information
Kyle Mahan committed Apr 23, 2016
2 parents 2fd8ef0 + 32bf4eb commit 2f00e42
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 91 deletions.
52 changes: 33 additions & 19 deletions granary/test/test_twitter.py
Expand Up @@ -1283,31 +1283,31 @@ def test_tweet_truncate(self):
u'manually may already be getting them')
expected = (
u'Hey #indieweb, the coming storm of webmention Spam may not '
u'be far away. Those of us that have input fields to… '
u'(https://ben.thatmustbe.me/note/2015/1/31/1/)')
result = self.twitter._truncate(orig, 'https://ben.thatmustbe.me/note/2015/1/31/1/', False)
u'be far away. Those of us that have input fields to send… '
u'https://ben.thatmustbe.me/note/2015/1/31/1/')
result = self.twitter._truncate(orig, 'https://ben.thatmustbe.me/note/2015/1/31/1/', 'note', False)
self.assertEquals(expected, result)

orig = expected = (
u'Despite names,\n'
u'ind.ie&indie.vc are NOT #indieweb @indiewebcamp\n'
u'indiewebcamp.com/2014-review#Indie_Term_Re-use\n'
u'@iainspad @sashtown @thomatronic (ttk.me t4_81)')
result = self.twitter._truncate(orig, None, False)
result = self.twitter._truncate(orig, None, 'note', False)
self.assertEquals(expected, result)

orig = expected = (
u'@davewiner I stubbed a page on the wiki for '
u'https://indiewebcamp.com/River4. Edits/improvmnts from users are '
u'welcome! @kevinmarks @julien51 @aaronpk')
result = self.twitter._truncate(orig, None, False)
result = self.twitter._truncate(orig, None, 'note', False)
self.assertEquals(expected, result)

orig = expected = (
u'This is a long tweet with (foo.com/parenthesized-urls) and urls '
u'that wikipedia.org/Contain_(Parentheses), a url with a query '
u'string;foo.withknown.com/example?query=parameters')
result = self.twitter._truncate(orig, None, False)
result = self.twitter._truncate(orig, None, 'note', False)
self.assertEquals(expected, result)

orig = (
Expand All @@ -1318,7 +1318,7 @@ def test_tweet_truncate(self):
u'This is a long tweet with (foo.com/parenthesized-urls) and urls '
u'that wikipedia.org/Contain_(Parentheses), that is one charc too '
u'long:…')
result = self.twitter._truncate(orig, None, False)
result = self.twitter._truncate(orig, None, 'note', False)
self.assertEquals(expected, result)

# test case-insensitive link matching
Expand All @@ -1329,26 +1329,26 @@ def test_tweet_truncate(self):
expected = (
u'The Telegram Bot API is the best bot API ever. Everyone should learn '
u'from it, especially Matrix.org… '
u'(https://unrelenting.technology/notes/2015-09-05-00-35-13)')
u'https://unrelenting.technology/notes/2015-09-05-00-35-13')
result = self.twitter._truncate(
orig, 'https://unrelenting.technology/notes/2015-09-05-00-35-13', False)
orig, 'https://unrelenting.technology/notes/2015-09-05-00-35-13', 'note', False)
self.assertEquals(expected, result)

twitter.MAX_TWEET_LENGTH = 20
twitter.TCO_LENGTH = 5

orig = u'url http://foo.co/bar ellipsize http://foo.co/baz'
expected = u'url http://foo.co/bar ellipsize…'
result = self.twitter._truncate(orig, None, False)
result = self.twitter._truncate(orig, None, 'note', False)
self.assertEquals(expected, result)

orig = u'too long\nextra whitespace\tbut should include url'
expected = u'too long… (http://obj.ca)'
result = self.twitter._truncate(orig, 'http://obj.ca', False)
expected = u'too long… http://obj.ca'
result = self.twitter._truncate(orig, 'http://obj.ca', 'note', False)
self.assertEquals(expected, result)

orig = expected = u'trailing slash http://www.foo.co/'
result = self.twitter._truncate(orig, None, False)
result = self.twitter._truncate(orig, None, 'note', False)
self.assertEquals(expected, result)

def test_no_ellipsize_real_tweet(self):
Expand Down Expand Up @@ -1388,12 +1388,12 @@ def test_ellipsize_real_tweet(self):
'manually may already be getting them')

content = (u'Hey #indieweb, the coming storm of webmention Spam may not '
u'be far away. Those of us that have input fields to… '
u'(https://ben.thatmustbe.me/note/2015/1/31/1/)')
u'be far away. Those of us that have input fields to send… '
u'https://ben.thatmustbe.me/note/2015/1/31/1/')

preview = (u'Hey #indieweb, the coming storm of webmention Spam may not '
u'be far away. Those of us that have input fields to… '
u'(<a href="https://ben.thatmustbe.me/note/2015/1/31/1/">ben.thatmustbe.me/note/2015/1/31...</a>)')
u'be far away. Those of us that have input fields to send… '
u'<a href="https://ben.thatmustbe.me/note/2015/1/31/1/">ben.thatmustbe.me/note/2015/1/31...</a>')

self.expect_urlopen(twitter.API_POST_TWEET, TWEET,
params={'status': content.encode('utf-8')})
Expand All @@ -1408,6 +1408,19 @@ def test_ellipsize_real_tweet(self):
actual_preview = self.twitter.preview_create(obj, include_link=True).content
self.assertEquals(preview, actual_preview)

def test_tweet_article_has_different_format(self):
    """Check the preview text produced for an object of type 'article'.

    With include_link=True the expected preview is the article's
    displayName, then a colon and a space, then the linkified URL:
    "The Article Title: <a href=...>example.com/article</a>".
    """
    article = {
        'objectType': 'article',
        'displayName': 'The Article Title',
        'url': 'http://example.com/article',
    }
    expected = ('The Article Title: '
                '<a href="http://example.com/article">example.com/article</a>')

    result = self.twitter.preview_create(article, include_link=True)
    self.assertEquals(expected, result.content)

def test_create_tweet_note_prefers_summary_then_content_then_name(self):
obj = copy.deepcopy(OBJECT)

Expand Down Expand Up @@ -1455,7 +1468,7 @@ def test_create_tweet_include_link(self):
twitter.TCO_LENGTH = 5

self.expect_urlopen(twitter.API_POST_TWEET, TWEET,
params={'status': 'too long… (http://obj.ca)'})
params={'status': 'too long… http://obj.ca'})
self.mox.ReplayAll()

obj = copy.deepcopy(OBJECT)
Expand All @@ -1466,7 +1479,8 @@ def test_create_tweet_include_link(self):
})
self.twitter.create(obj, include_link=True)
result = self.twitter.preview_create(obj, include_link=True)
self.assertIn(u'too long… (<a href="http://obj.ca">obj.ca</a>)',result.content)
self.assertIn(u'too long… <a href="http://obj.ca">obj.ca</a>',
result.content)

def test_create_recognize_note(self):
"""Use post-type-discovery to recognize a note with non-trivial html content.
Expand Down
85 changes: 13 additions & 72 deletions granary/twitter.py
Expand Up @@ -29,6 +29,7 @@

from bs4 import BeautifulSoup
import requests
import brevity

import source
from oauth_dropins import twitter_auth
Expand Down Expand Up @@ -621,7 +622,7 @@ def _create(self, obj, preview=None, include_link=False, ignore_formatting=False
# truncate and ellipsize content if it's over the character
# count. URLs will be t.co-wrapped, so include that when counting.
include_url = obj.get('url') if include_link else None
content = self._truncate(content, include_url, has_media)
content = self._truncate(content, include_url, type, has_media)

# linkify defaults to Twitter's link shortening behavior
preview_content = util.linkify(content, pretty=True, skip_bare_cc_tlds=True)
Expand Down Expand Up @@ -728,87 +729,27 @@ def _create(self, obj, preview=None, include_link=False, ignore_formatting=False

return source.creation_result(resp)

def _truncate(self, content, include_url, type, has_media):
    """Shorten tweet content to fit within MAX_TWEET_LENGTH.

    The word-boundary truncation, ellipsizing and link accounting are all
    delegated to brevity.shorten(); this method only picks the brevity
    format and forwards the module's length constants.

    Args:
      content: string, the tweet text to shorten
      include_url: string or None, passed to brevity as both the permalink
        and the permashortlink
      type: string, the object type; 'article' selects
        brevity.FORMAT_ARTICLE, any other value selects brevity.FORMAT_NOTE
      has_media: boolean, whether the tweet carries a picture or video

    Returns: string, the possibly shortened and ellipsized tweet text
    """
    # `type` shadows the builtin, but it is part of the signature callers
    # use positionally, so it is kept as-is. The local is named `fmt` so the
    # builtin `format` at least stays usable here.
    fmt = brevity.FORMAT_ARTICLE if type == 'article' else brevity.FORMAT_NOTE

    if has_media:
        # twitter includes a pic.twitter.com link (and space) for pictures or
        # video - one link total, regardless of number of pictures. The media
        # format suffix presumably makes brevity reserve room for it
        # (NOTE(review): confirm against the brevity docs).
        fmt += '+' + brevity.FORMAT_MEDIA

    return brevity.shorten(
        content, permalink=include_url, permashortlink=include_url,
        target_length=MAX_TWEET_LENGTH, link_length=TCO_LENGTH, format=fmt)

def upload_images(self, urls):
"""Uploads one or more images from web URLs.
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -6,3 +6,4 @@ mf2py>=0.2.7
mf2util>=0.3.3
oauth-dropins>=1.3
requests<2.6.0
brevity>=0.2.8
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -48,6 +48,7 @@ def __init__(self, *args, **kwargs):
'mf2util>=0.3.3',
'oauth-dropins>=1.3',
'requests<2.6.0',
'brevity>=0.2.8',
],
test_loader='setup:TestLoader',
test_suite='granary.test',
Expand Down

0 comments on commit 2f00e42

Please sign in to comment.