Skip to content

Commit

Permalink
Trac #30106: sage.libs.ecl: Fix unicode handling
Browse files Browse the repository at this point in the history
As a follow-up to #29278, #29280: If we use Unicode variable names in
`SR`, declaring a domain gives an error:
{{{
sage: SR.var('π', domain='real')
RuntimeError: ECL says: THROW: The catch MACSYMA-QUIT is undefined.
SystemError: <built-in method var of sage.symbolic.ring.SymbolicRing
object at 0x334506908> returned a result with an error set
}}}

This comes from our ECL interface:
{{{
sage: from sage.libs.ecl import *
sage: u_symbol = EclObject('🔥')
sage: u_symbol
<repr(<sage.libs.ecl.EclObject at 0x337e7b3c8>) failed:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x94 in position 2:
invalid start byte>
sage: u_symbol.python()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x94 in position 2:
invalid start byte
}}}

Also note:
{{{
sage: b_symbol = EclObject(bytes([166]))
sage: b_symbol
<repr(<sage.libs.ecl.EclObject at 0x337e7b058>) failed:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa6 in position 0:
invalid start byte>
}}}

URL: https://trac.sagemath.org/30106
Reported by: mkoeppe
Ticket author(s): Matthias Koeppe
Reviewer(s): Markus Wageringel
  • Loading branch information
Release Manager committed Jul 23, 2020
2 parents 147309d + 59dd62b commit 8bac45e
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 64 deletions.
20 changes: 5 additions & 15 deletions src/sage/cpython/string.pxd
Expand Up @@ -10,8 +10,6 @@

from __future__ import absolute_import

from cpython.version cimport PY_MAJOR_VERSION


cdef extern from "string_impl.h":
str _cstr_to_str(const char* c, encoding, errors)
Expand All @@ -32,9 +30,8 @@ cpdef inline str bytes_to_str(b, encoding=None, errors=None):
r"""
Convert ``bytes`` to ``str``.
On Python 2 this is a no-op since ``bytes is str``. On Python 3
this decodes the given ``bytes`` to a Python 3 unicode ``str`` using
the specified encoding.
This decodes the given ``bytes`` to a Python 3 unicode ``str`` using
the specified encoding. Input whose type is not exactly ``bytes``
(including ``str``) raises a ``TypeError``.
EXAMPLES::
Expand All @@ -52,22 +49,15 @@ cpdef inline str bytes_to_str(b, encoding=None, errors=None):
if type(b) is not bytes:
raise TypeError(f"expected bytes, {type(b).__name__} found")

if PY_MAJOR_VERSION <= 2:
return <str>b
else:
return _cstr_to_str(<bytes>b, encoding, errors)
return _cstr_to_str(<bytes>b, encoding, errors)


cpdef inline bytes str_to_bytes(s, encoding=None, errors=None):
r"""
Convert ``str`` or ``unicode`` to ``bytes``.
On Python 3 this encodes the given ``str`` to a Python 3 ``bytes``
using the specified encoding.
On Python 2 this is a no-op on ``str`` input since ``str is bytes``.
However, this function also accepts Python 2 ``unicode`` objects and
treats them the same as Python 3 unicode ``str`` objects.
It encodes the given ``str`` to a Python 3 ``bytes``
using the specified encoding. It is a no-op on ``bytes`` input.
EXAMPLES::
Expand Down
31 changes: 0 additions & 31 deletions src/sage/cpython/string_impl.h
Expand Up @@ -15,9 +15,6 @@

static inline PyObject* _cstr_to_str(const char* c, PyObject* encoding, PyObject* errors)
{
#if PY_MAJOR_VERSION <= 2
return PyBytes_FromString(c);
#else
const char* err = NULL; // Default: strict
const char* enc = NULL; // Default: utf-8

Expand All @@ -32,48 +29,21 @@ static inline PyObject* _cstr_to_str(const char* c, PyObject* encoding, PyObject
}

return PyUnicode_Decode(c, strlen(c), enc, err);
#endif
}


static inline PyObject* _str_to_bytes(PyObject* s, PyObject* encoding, PyObject* errors)
{
#if PY_MAJOR_VERSION <= 2
/* On Python 2, we accept bytes == str as input */
if (PyBytes_CheckExact(s)) {
Py_INCREF(s);
return s;
}
#endif

if (!PyUnicode_Check(s)) {
PyErr_Format(PyExc_TypeError,
#if PY_MAJOR_VERSION >= 3
"expected str, %s found",
#else
"expected str or unicode, %s found",
#endif
Py_TYPE(s)->tp_name);
return NULL;
}

const char* err = NULL; // Default: strict
const char* enc = NULL; // Default: utf-8

#if PY_MAJOR_VERSION <= 2
if (errors != Py_None) {
err = PyString_AsString(errors);
if (!err) return NULL;
}

if (encoding != Py_None) {
enc = PyString_AsString(encoding);
if (!enc) return NULL;
}
else {
enc = "utf-8";
}
#else
if (errors != Py_None) {
err = PyUnicode_AsUTF8(errors);
if (!err) return NULL;
Expand All @@ -83,7 +53,6 @@ static inline PyObject* _str_to_bytes(PyObject* s, PyObject* encoding, PyObject*
enc = PyUnicode_AsUTF8(encoding);
if (!enc) return NULL;
}
#endif

return PyUnicode_AsEncodedString(s, enc, err);
}
1 change: 1 addition & 0 deletions src/sage/libs/ecl.pxd
Expand Up @@ -135,6 +135,7 @@ cdef extern from "ecl/ecl.h":
cl_object cl_write_to_string(cl_narg narg, cl_object o)
cl_object ecl_cstring_to_base_string_or_nil(char *s)
cl_object si_coerce_to_base_string(cl_object x)
cl_object si_base_string_p(cl_object x)

# S-expr evaluation and function calls

Expand Down
88 changes: 70 additions & 18 deletions src/sage/libs/ecl.pyx
Expand Up @@ -42,6 +42,9 @@ cdef bint bint_integerp(cl_object obj):
cdef bint bint_rationalp(cl_object obj):
    # cl_rationalp answers with a Lisp object; anything other than Cnil
    # is reported to the caller as a true C boolean.
    return cl_rationalp(obj) != Cnil

cdef bint bint_base_string_p(cl_object obj):
    # si_base_string_p answers with a Lisp object; anything other than
    # Cnil is reported to the caller as a true C boolean.
    return si_base_string_p(obj) != Cnil

cdef extern from "eclsig.h":
int ecl_sig_on() except 0
void ecl_sig_off()
Expand Down Expand Up @@ -101,6 +104,8 @@ cdef void remove_node(cl_object node):
cdef cl_object list_of_objects

cdef cl_object read_from_string_clobj #our own error catching reader
cdef cl_object make_unicode_string_clobj
cdef cl_object unicode_string_codepoints_clobj

cdef bint ecl_has_booted = 0

Expand Down Expand Up @@ -230,6 +235,8 @@ def init_ecl():
"""
global list_of_objects
global read_from_string_clobj
global make_unicode_string_clobj
global unicode_string_codepoints_clobj
global conditions_to_handle_clobj
global ecl_has_booted
cdef char *argv[1]
Expand Down Expand Up @@ -284,8 +291,27 @@ def init_ecl():
conditions_to_handle_clobj=ecl_list1(ecl_make_symbol(b"SERIOUS-CONDITION", b"COMMON-LISP"))
insert_node_after(list_of_objects,conditions_to_handle_clobj)

cl_eval(string_to_object(b"""
(defun sage-make-unicode-string (codepoints)
(map 'string #'code-char codepoints))
"""))
make_unicode_string_clobj = cl_eval(string_to_object(b"#'sage-make-unicode-string"))

cl_eval(string_to_object(b"""
(defun sage-unicode-string-codepoints (s)
(map 'list #'char-code s))
"""))
unicode_string_codepoints_clobj = cl_eval(string_to_object(b"#'sage-unicode-string-codepoints"))

ecl_has_booted = 1

cdef ecl_string_to_python(cl_object s):
    """
    Convert the Lisp string ``s`` to a Python ``str``.

    A base string is converted directly from its character buffer with
    ``char_to_str``.  Any other string is first turned into a Lisp list
    of character codes by the Lisp function stored in
    ``unicode_string_codepoints_clobj``; that list is converted with
    ``ecl_to_python`` and the characters are joined into one ``str``.
    """
    cdef cl_object codepoints

    if bint_base_string_p(s):
        return char_to_str(ecl_base_string_pointer_safe(s))

    # Not a base string: go through the list of character codes.
    codepoints = cl_funcall(2, unicode_string_codepoints_clobj, s)
    return ''.join([chr(code) for code in ecl_to_python(codepoints)])

cdef cl_object ecl_safe_eval(cl_object form) except NULL:
"""
TESTS:
Expand All @@ -310,9 +336,8 @@ cdef cl_object ecl_safe_eval(cl_object form) except NULL:
ecl_sig_off()

if error != NULL:
error = si_coerce_to_base_string(error)
raise RuntimeError("ECL says: {}".format(
char_to_str(ecl_base_string_pointer_safe(error))))
ecl_string_to_python(error)))
else:
return ret

Expand All @@ -324,9 +349,8 @@ cdef cl_object ecl_safe_funcall(cl_object func, cl_object arg) except NULL:
ecl_sig_off()

if error != NULL:
error = si_coerce_to_base_string(error)
raise RuntimeError("ECL says: {}".format(
char_to_str(ecl_base_string_pointer_safe(error))))
ecl_string_to_python(error)))
else:
return ret

Expand All @@ -338,9 +362,8 @@ cdef cl_object ecl_safe_apply(cl_object func, cl_object args) except NULL:
ecl_sig_off()

if error != NULL:
error = si_coerce_to_base_string(error)
raise RuntimeError("ECL says: {}".format(
char_to_str(ecl_base_string_pointer_safe(error))))
ecl_string_to_python(error)))
else:
return ret

Expand Down Expand Up @@ -393,8 +416,10 @@ def print_objects():
cdef cl_object c, s
c = list_of_objects
while True:
s = si_coerce_to_base_string(cl_write_to_string(1,cl_car(c)))
print(char_to_str(ecl_base_string_pointer_safe(s)))

s = cl_write_to_string(1, cl_car(c))
print(ecl_string_to_python(s))

c = cl_cadr(c)
if c == Cnil:
break
Expand All @@ -407,7 +432,7 @@ cdef cl_object python_to_ecl(pyobj) except NULL:
# strings ->parsed by lisp reader

cdef bytes s
cdef cl_object L, ptr
cdef cl_object L, ptr, o

if isinstance(pyobj,bool):
if pyobj:
Expand All @@ -426,8 +451,14 @@ cdef cl_object python_to_ecl(pyobj) except NULL:
elif isinstance(pyobj,float):
return ecl_make_doublefloat(pyobj)
elif isinstance(pyobj,unicode):
s=str_to_bytes(pyobj)
return ecl_safe_read_string(s)
try:
s = str_to_bytes(pyobj, 'ascii')
except UnicodeEncodeError:
o = cl_funcall(2, make_unicode_string_clobj,
python_to_ecl([ord(c) for c in pyobj]))
else:
o = ecl_cstring_to_base_string_or_nil(s)
return ecl_safe_funcall(read_from_string_clobj, o)
elif isinstance(pyobj,bytes):
s=<bytes>pyobj
return ecl_safe_read_string(s)
Expand Down Expand Up @@ -504,8 +535,8 @@ cdef ecl_to_python(cl_object o):
return tuple(L)
return L
else:
s = si_coerce_to_base_string(cl_write_to_string(1,o))
return char_to_str(ecl_base_string_pointer_safe(s))
s = cl_write_to_string(1, o)
return ecl_string_to_python(s)

#Maxima's BFLOAT multiprecision float type can be read with:
#def bfloat_to_python(e):
Expand Down Expand Up @@ -605,6 +636,19 @@ cdef class EclObject:
True
sage: EclObject(-i).python() == -i
True
We check that symbols with Unicode names are converted correctly::
sage: EclObject('λ')
<ECL: Λ>
sage: EclObject('|λ|')
<ECL: |λ|>
We check that Unicode strings are converted correctly::
sage: EclObject('"Mαξιμα"')
<ECL: "Mαξιμα">
"""
cdef cl_object obj #the wrapped object
cdef cl_object node #linked list pointer: car(node) == obj
Expand Down Expand Up @@ -721,8 +765,8 @@ cdef class EclObject:
"""
cdef cl_object s
s = si_coerce_to_base_string(cl_write_to_string(1,self.obj))
return char_to_str(ecl_base_string_pointer_safe(s))
s = cl_write_to_string(1, self.obj)
return ecl_string_to_python(s)

def __hash__(self):
r"""
Expand Down Expand Up @@ -1288,7 +1332,7 @@ cdef EclObject ecl_wrap(cl_object o):

#convenience routine to more easily evaluate strings
cpdef EclObject ecl_eval(str s):
"""
r"""
Read and evaluate string in Lisp and return the result
EXAMPLES::
Expand All @@ -1299,10 +1343,18 @@ cpdef EclObject ecl_eval(str s):
sage: ecl_eval("(mapcar 'fibo '(1 2 3 4 5 6 7))")
<ECL: (1 1 2 3 5 8 13)>
TESTS:
We check that Unicode is handled correctly::
sage: ecl_eval('''(defun double-struck-number (n) (map 'string #'(lambda (c) (code-char (+ (char-code #\𝟘) (- (char-code c) (char-code #\\0))))) (format nil "~A" n)))''')
<ECL: DOUBLE-STRUCK-NUMBER>
sage: _(4711)
<ECL: "𝟜𝟟𝟙𝟙">
"""
cdef cl_object o
o=ecl_safe_read_string(str_to_bytes(s))
o=ecl_safe_eval(o)
o=ecl_safe_eval(python_to_ecl(s))
return ecl_wrap(o)

init_ecl()

0 comments on commit 8bac45e

Please sign in to comment.