Skip to content

Commit

Permalink
Use ast instead of eval for string extraction
Browse files Browse the repository at this point in the history
This is safer (as we don't actually execute anything),
and allows us to parse f-strings too.

Closes #769 (supersedes it)
Refs #715 (doesn't add an error yet, but doesn't crash on f-strings)
  • Loading branch information
akx committed Nov 1, 2022
1 parent a946ae6 commit 2e5708f
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 9 deletions.
35 changes: 26 additions & 9 deletions babel/messages/extract.py
Expand Up @@ -15,7 +15,7 @@
:copyright: (c) 2013-2022 by the Babel Team.
:license: BSD, see LICENSE for more details.
"""

import ast
import os
from os.path import relpath
import sys
Expand Down Expand Up @@ -487,14 +487,9 @@ def extract_python(fileobj, keywords, comment_tags, options):
if nested:
funcname = value
elif tok == STRING:
# Unwrap quotes in a safe manner, maintaining the string's
# encoding
# https://sourceforge.net/tracker/?func=detail&atid=355470&
# aid=617979&group_id=5470
code = compile('# coding=%s\n%s' % (str(encoding), value),
'<string>', 'eval', future_flags)
value = eval(code, {'__builtins__': {}}, {})
buf.append(value)
val = _parse_python_string(value, encoding, future_flags)
if val is not None:
buf.append(val)
elif tok == OP and value == ',':
if buf:
messages.append(''.join(buf))
Expand All @@ -516,6 +511,28 @@ def extract_python(fileobj, keywords, comment_tags, options):
funcname = value


def _parse_python_string(value, encoding, future_flags):
# Unwrap quotes in a safe manner, maintaining the string's encoding
# https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
code = compile(
f'# coding={str(encoding)}\n{value}',
'<string>',
'eval',
ast.PyCF_ONLY_AST | future_flags,
)
if isinstance(code, ast.Expression):
body = code.body
if isinstance(body, ast.Str):
return body.s
if isinstance(body, ast.JoinedStr): # f-string
if all(isinstance(node, ast.Str) for node in body.values):
return ''.join(node.s for node in body.values)
if all(isinstance(node, ast.Constant) for node in body.values):
return ''.join(str(node.value) for node in body.values)
# TODO: we could raise an error or warning when not all nodes are constants
return None


def extract_javascript(fileobj, keywords, comment_tags, options):
"""Extract messages from JavaScript source code.
Expand Down
27 changes: 27 additions & 0 deletions tests/messages/test_extract.py
Expand Up @@ -528,3 +528,30 @@ def test_future(self):
messages = list(extract.extract('python', buf,
extract.DEFAULT_KEYWORDS, [], {}))
assert messages[0][1] == u'\xa0'

def test_f_strings(self):
    """f-strings built only from constant parts are extracted; the one
    with a replacement field is not."""
    source = BytesIO(br"""
t1 = _('foobar')
t2 = _(f'spameggs' f'feast') # should be extracted; constant parts only
t2 = _(f'spameggs' 'kerroshampurilainen') # should be extracted (mixing f with no f)
t3 = _(f'''whoa! a ''' # should be extracted (continues on following lines)
f'flying shark'
'... hello'
)
t4 = _(f'spameggs {t1}') # should not be extracted
""")
    extracted = extract.extract('python', source, extract.DEFAULT_KEYWORDS, [], {})
    # Index 1 of each extracted entry is the message text.
    texts = [entry[1] for entry in extracted]
    assert texts == [
        u'foobar',
        u'spameggsfeast',
        u'spameggskerroshampurilainen',
        u'whoa! a flying shark... hello',
    ]

def test_f_strings_non_utf8(self):
    """Adjacent f-strings in a latin-1 encoded source are decoded and
    joined into a single message."""
    source = BytesIO(b"""
# -- coding: latin-1 --
t2 = _(f'\xe5\xe4\xf6' f'\xc5\xc4\xd6')
""")
    extracted = extract.extract('python', source, extract.DEFAULT_KEYWORDS, [], {})
    # Index 1 of each extracted entry is the message text.
    texts = [entry[1] for entry in extracted]
    assert texts == [u'åäöÅÄÖ']

0 comments on commit 2e5708f

Please sign in to comment.