Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 98 additions & 20 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,10 @@ static _Py_Identifier *static_strings = NULL;
shared as well. */
static PyObject *unicode_latin1[256] = {NULL};

/* Single character Unicode strings in BMP are cached. */
#define BMP_CACHE_SIZE 256
static PyObject *unicode_bmp_cache[BMP_CACHE_SIZE*2] = {NULL};

/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0,
Expand Down Expand Up @@ -262,7 +266,9 @@ const unsigned char _Py_ascii_whitespace[] = {

/* forward */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* unicode_char(Py_UCS4 ch);
static PyObject* get_latin1_char(unsigned char ch);
static PyObject* get_bmp_char(Py_UCS2 ch);
static int unicode_modifiable(PyObject *unicode);


Expand Down Expand Up @@ -501,11 +507,15 @@ unicode_result_wchar(PyObject *unicode)

if (len == 1) {
wchar_t ch = _PyUnicode_WSTR(unicode)[0];
if ((Py_UCS4)ch < 256) {
PyObject *latin1_char = get_latin1_char((unsigned char)ch);
#if SIZEOF_WCHAR_T == 2
Py_DECREF(unicode);
return unicode_char((Py_UCS2)ch);
#else
if ((Py_UCS4)ch < 0x10000) {
Py_DECREF(unicode);
return latin1_char;
return unicode_char((Py_UCS4)ch);
}
#endif
}

if (_PyUnicode_Ready(unicode) < 0) {
Expand Down Expand Up @@ -537,10 +547,9 @@ unicode_result_ready(PyObject *unicode)
}

if (length == 1) {
void *data = PyUnicode_DATA(unicode);
int kind = PyUnicode_KIND(unicode);
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
if (ch < 256) {
if (kind == PyUnicode_1BYTE_KIND) {
Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
PyObject *latin1_char = unicode_latin1[ch];
if (latin1_char != NULL) {
if (unicode != latin1_char) {
Expand All @@ -556,6 +565,35 @@ unicode_result_ready(PyObject *unicode)
return unicode;
}
}
if (kind == PyUnicode_2BYTE_KIND) {
Py_UCS2 ch = PyUnicode_2BYTE_DATA(unicode)[0];
PyObject **cell = &unicode_bmp_cache[(ch % BMP_CACHE_SIZE) * 2];
PyObject *bmp_char = cell[0];
if (bmp_char) {
if (PyUnicode_2BYTE_DATA(bmp_char)[0] == ch) {
found:
if (unicode != bmp_char) {
Py_INCREF(bmp_char);
Py_DECREF(unicode);
}
return bmp_char;
}
bmp_char = cell[1];
if (bmp_char && PyUnicode_2BYTE_DATA(bmp_char)[0] == ch) {
/* move found character to front */
cell[1] = cell[0];
cell[0] = bmp_char;
goto found;
}
}
assert(_PyUnicode_CheckConsistency(unicode, 1));
/* insert new character at front */
Py_XDECREF(cell[1]);
cell[1] = cell[0];
cell[0] = unicode;
Py_INCREF(unicode);
return unicode;
}
}

assert(_PyUnicode_CheckConsistency(unicode, 1));
Expand Down Expand Up @@ -1976,27 +2014,58 @@ get_latin1_char(unsigned char ch)
return unicode;
}

static PyObject *
get_bmp_char(Py_UCS2 ch)
{
PyObject **cell = &unicode_bmp_cache[(ch % BMP_CACHE_SIZE) * 2];
PyObject *unicode = cell[0];
if (unicode) {
if (PyUnicode_2BYTE_DATA(unicode)[0] == ch) {
goto done;
}
unicode = cell[1];
if (unicode && PyUnicode_2BYTE_DATA(unicode)[0] == ch) {
/* move found character at front */
cell[1] = cell[0];
cell[0] = unicode;
goto done;
}
}
unicode = PyUnicode_New(1, ch);
if (!unicode) {
return NULL;
}
PyUnicode_2BYTE_DATA(unicode)[0] = ch;
assert(_PyUnicode_CheckConsistency(unicode, 1));
/* insert new character at front */
Py_XDECREF(cell[1]);
cell[1] = cell[0];
cell[0] = unicode;

done:
Py_INCREF(unicode);
return unicode;
}

static PyObject*
unicode_char(Py_UCS4 ch)
{
PyObject *unicode;

assert(ch <= MAX_UNICODE);

if (ch < 256)
if (ch < 256) {
return get_latin1_char(ch);

}
if (ch < 0x10000) {
return get_bmp_char(ch);
}
unicode = PyUnicode_New(1, ch);
if (unicode == NULL)
if (unicode == NULL) {
return NULL;

assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
} else {
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
PyUnicode_4BYTE_DATA(unicode)[0] = ch;
}
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
PyUnicode_4BYTE_DATA(unicode)[0] = ch;
assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode;
}
Expand Down Expand Up @@ -2038,10 +2107,17 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();

/* Single character Unicode objects in the Latin-1 range are
shared when using this constructor */
if (size == 1 && (Py_UCS4)*u < 256)
return get_latin1_char((unsigned char)*u);
/* Single character Unicode objects can be shared when using this
constructor. */
if (size == 1) {
#if SIZEOF_WCHAR_T == 2
return unicode_char((Py_UCS2)*u);
#else
if ((Py_UCS4)*u <= MAX_UNICODE) {
return unicode_char((Py_UCS4)*u);
}
#endif
}

/* If not empty and not single character, copy the Unicode data
into the new object */
Expand Down Expand Up @@ -15264,6 +15340,8 @@ _PyUnicode_Fini(void)

for (i = 0; i < 256; i++)
Py_CLEAR(unicode_latin1[i]);
for (i = 0; i < BMP_CACHE_SIZE*2; i++)
Py_CLEAR(unicode_bmp_cache[i]);
_PyUnicode_ClearStaticStrings();
(void)PyUnicode_ClearFreeList();
}
Expand Down