Skip to content

Commit

Permalink
Additional unicode primitive functions.
Browse files Browse the repository at this point in the history
Introduce unicode_version(), icu_unicode_version(), and
unicode_assigned().

The latter requires introducing a new lookup table for the Unicode
General Category, which is generated along with the other Unicode
lookup tables.

Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com
Reviewed-by: Peter Eisentraut
  • Loading branch information
jeff-davis committed Nov 2, 2023
1 parent 7021d3b commit a02b37f
Show file tree
Hide file tree
Showing 18 changed files with 4,924 additions and 22 deletions.
109 changes: 90 additions & 19 deletions doc/src/sgml/func.sgml
Expand Up @@ -2859,6 +2859,22 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>

<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>unicode_assigned</primary>
</indexterm>
<function>unicode_assigned</function> ( <type>text</type> )
<returnvalue>boolean</returnvalue>
</para>
<para>
Returns <literal>true</literal> if all characters in the string are
assigned Unicode codepoints; <literal>false</literal> otherwise. This
function can only be used when the server encoding is
<literal>UTF8</literal>.
</para></entry>
</row>

<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
Expand Down Expand Up @@ -23427,25 +23443,6 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
This is equivalent to <function>current_user</function>.
</para></entry>
</row>

<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>version</primary>
</indexterm>
<function>version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string describing the <productname>PostgreSQL</productname>
server's version. You can also get this information from
<xref linkend="guc-server-version"/>, or for a machine-readable
version use <xref linkend="guc-server-version-num"/>. Software
developers should use <varname>server_version_num</varname> (available
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
parsing the text version.
</para></entry>
</row>
</tbody>
</tgroup>
</table>
Expand Down Expand Up @@ -26332,6 +26329,80 @@ SELECT collation for ('foo' COLLATE "de_DE");

</sect2>

<sect2 id="functions-info-version">
<title>Version Information Functions</title>

<para>
The functions shown in <xref linkend="functions-version"/>
print version information.
</para>

<table id="functions-version">
<title>Version Information Functions</title>
<tgroup cols="1">
<thead>
<row>
<entry role="func_table_entry"><para role="func_signature">
Function
</para>
<para>
Description
</para></entry>
</row>
</thead>

<tbody>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>version</primary>
</indexterm>
<function>version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string describing the <productname>PostgreSQL</productname>
server's version. You can also get this information from
<xref linkend="guc-server-version"/>, or for a machine-readable
version use <xref linkend="guc-server-version-num"/>. Software
developers should use <varname>server_version_num</varname> (available
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
parsing the text version.
</para></entry>
</row>

<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>unicode_version</primary>
</indexterm>
<function>unicode_version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string representing the version of Unicode used by
<productname>PostgreSQL</productname>.
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>icu_unicode_version</primary>
</indexterm>
<function>icu_unicode_version</function> ()
<returnvalue>text</returnvalue>
</para>
<para>
Returns a string representing the version of Unicode used by ICU, if
the server was built with ICU support; otherwise returns
<literal>NULL</literal>.
</para></entry>
</row>
</tbody>
</tgroup>
</table>

</sect2>

</sect1>

<sect1 id="functions-admin">
Expand Down
61 changes: 61 additions & 0 deletions src/backend/utils/adt/varlena.c
Expand Up @@ -23,7 +23,9 @@
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
#include "common/unicode_category.h"
#include "common/unicode_norm.h"
#include "common/unicode_version.h"
#include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
Expand Down Expand Up @@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
return form;
}

/*
* Returns version of Unicode used by Postgres in "major.minor" format (the
* same format as the Unicode version reported by ICU). The third component
* ("update version") never involves additions to the character repertiore and
* is unimportant for most purposes.
*
* See: https://unicode.org/versions/
*/
Datum
unicode_version(PG_FUNCTION_ARGS)
{
PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
}

/*
 * icu_unicode_version
 *
 * SQL-callable function returning the Unicode version reported by ICU
 * (U_UNICODE_VERSION) as text. When the server is built without ICU
 * support (USE_ICU undefined), the result is SQL NULL instead.
 */
Datum
icu_unicode_version(PG_FUNCTION_ARGS)
{
#ifndef USE_ICU
	/* No ICU in this build, so there is no version to report. */
	PG_RETURN_NULL();
#else
	PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
#endif
}

/*
 * unicode_assigned
 *
 * SQL-callable function returning true if no character of the input text
 * has general category PG_U_UNASSIGNED according to unicode_category(),
 * and false as soon as one such character is found. An empty string
 * yields true.
 *
 * Raises an error unless the database encoding is UTF-8.
 */
Datum
unicode_assigned(PG_FUNCTION_ARGS)
{
	text	   *input = PG_GETARG_TEXT_PP(0);
	unsigned char *cursor;
	int			nchars;

	if (GetDatabaseEncoding() != PG_UTF8)
		ereport(ERROR,
				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));

	/*
	 * Count the characters up front, then decode one code point per
	 * iteration, advancing by the byte length of each UTF-8 sequence.
	 */
	nchars = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
	cursor = (unsigned char *) VARDATA_ANY(input);

	while (nchars-- > 0)
	{
		pg_wchar	codepoint = utf8_to_unicode(cursor);

		if (unicode_category(codepoint) == PG_U_UNASSIGNED)
			PG_RETURN_BOOL(false);

		cursor += pg_utf_mblen(cursor);
	}

	PG_RETURN_BOOL(true);
}

Datum
unicode_normalize_func(PG_FUNCTION_ARGS)
{
Expand Down
1 change: 1 addition & 0 deletions src/common/Makefile
Expand Up @@ -78,6 +78,7 @@ OBJS_COMMON = \
scram-common.o \
string.o \
stringinfo.o \
unicode_category.o \
unicode_norm.o \
username.o \
wait_error.o \
Expand Down
1 change: 1 addition & 0 deletions src/common/meson.build
Expand Up @@ -30,6 +30,7 @@ common_sources = files(
'scram-common.c',
'string.c',
'stringinfo.c',
'unicode_category.c',
'unicode_norm.c',
'username.c',
'wait_error.c',
Expand Down
19 changes: 17 additions & 2 deletions src/common/unicode/Makefile
Expand Up @@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)

LDFLAGS_INTERNAL += $(ICU_LIBS)
CPPFLAGS += $(ICU_CFLAGS)

# By default, do nothing.
all:

update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
mv $^ $(top_srcdir)/src/include/common/
$(MAKE) category-check
$(MAKE) normalization-check

# These files are part of the Unicode Character Database. Download
Expand All @@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)

unicode_version.h: generate-unicode_version.pl
$(PERL) $< --version $(UNICODE_VERSION)

unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
$(PERL) $<

# Generation of conversion tables used for string normalization with
# UTF-8 strings.
unicode_norm_hashfunc.h: unicode_norm_table.h
Expand All @@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
$(PERL) $^ >$@

# Test suite
category-check: category_test
./category_test

normalization-check: norm_test
./norm_test

category_test: category_test.o ../unicode_category.o | submake-common

norm_test: norm_test.o ../unicode_norm.o | submake-common

norm_test.o: norm_test_table.h
Expand All @@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt


clean:
rm -f $(OBJS) norm_test norm_test.o
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o

distclean: clean
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
Expand Down
108 changes: 108 additions & 0 deletions src/common/unicode/category_test.c
@@ -0,0 +1,108 @@
/*-------------------------------------------------------------------------
* category_test.c
* Program to test Unicode general category functions.
*
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/common/unicode/category_test.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef USE_ICU
#include <unicode/uchar.h>
#endif
#include "common/unicode_category.h"
#include "common/unicode_version.h"

/*
* Parse version into integer for easy comparison.
*/
#ifdef USE_ICU
/*
 * Convert a Unicode version string beginning with "major.minor" into the
 * integer major * 100 + minor, so that versions can be compared numerically.
 *
 * The string is validated unconditionally rather than with Assert(): in a
 * build without assertions the sscanf() result would otherwise go unchecked
 * (and its variable unused), and a malformed string would yield a value
 * computed from uninitialized integers. On a malformed version this prints
 * a message to stderr and exits with status 1.
 */
static int
parse_unicode_version(const char *version)
{
	int			major;
	int			minor;

	/* minor must fit in two decimal digits for the encoding to be unique */
	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 || minor >= 100)
	{
		fprintf(stderr, "could not parse Unicode version \"%s\"\n", version);
		exit(1);
	}

	return major * 100 + minor;
}
#endif

/*
 * Walk every code point from 0 through 0x10ffff and check that
 * unicode_category() agrees with ICU's u_charType(). The first real
 * disagreement is reported and the program exits with status 1.
 *
 * When built without ICU there is nothing to compare against, so the test
 * is skipped and the program exits successfully.
 */
int
main(int argc, char **argv)
{
#ifdef USE_ICU
	int			pg_version = parse_unicode_version(PG_UNICODE_VERSION);
	int			icu_version = parse_unicode_version(U_UNICODE_VERSION);
	int			skipped_in_pg = 0;
	int			skipped_in_icu = 0;
	UChar32		cp;

	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);

	for (cp = 0; cp <= 0x10ffff; cp++)
	{
		uint8_t		pg_cat = unicode_category(cp);
		uint8_t		icu_cat = u_charType(cp);

		if (pg_cat == icu_cat)
			continue;

		/*
		 * With mismatched Unicode versions, a code point assigned in the
		 * newer version may still be unassigned in the older one. Such
		 * differences are tolerated and merely counted, at the price of the
		 * test no longer being exhaustive for those code points.
		 */
		if (pg_cat == PG_U_UNASSIGNED && pg_version < icu_version)
		{
			skipped_in_pg++;
			continue;
		}
		if (icu_cat == PG_U_UNASSIGNED && icu_version < pg_version)
		{
			skipped_in_icu++;
			continue;
		}

		/* A genuine disagreement: show both categories and give up. */
		printf("FAILURE for codepoint %06x\n", cp);
		printf("Postgres category: %02d %s %s\n", pg_cat,
			   unicode_category_abbrev(pg_cat),
			   unicode_category_string(pg_cat));
		printf("ICU category: %02d %s %s\n", icu_cat,
			   unicode_category_abbrev(icu_cat),
			   unicode_category_string(icu_cat));
		printf("\n");
		exit(1);
	}

	if (skipped_in_pg > 0)
		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
			   skipped_in_pg);
	if (skipped_in_icu > 0)
		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
			   skipped_in_icu);

	printf("category_test: All tests successful!\n");
	exit(0);
#else
	printf("ICU support required for test; skipping.\n");
	exit(0);
#endif
}

0 comments on commit a02b37f

Please sign in to comment.