Additional unicode primitive functions.

Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
postgres · Nov 2, 2023 · a02b37f · a02b37f
1 parent 7021d3b
commit a02b37f
Show file tree

Hide file tree

Showing 18 changed files with 4,924 additions and 22 deletions.
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
@@ -2859,6 +2859,22 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_assigned</primary>
+        </indexterm>
+        <function>unicode_assigned</function> ( <type>text</type> )
+        <returnvalue>boolean</returnvalue>
+       </para>
+       <para>
+        Returns <literal>true</literal> if all characters in the string are
+        assigned Unicode codepoints; <literal>false</literal> otherwise. This
+        function can only be used when the server encoding is
+        <literal>UTF8</literal>.
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -23427,25 +23443,6 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
         This is equivalent to <function>current_user</function>.
        </para></entry>
       </row>
-
-      <row>
-       <entry role="func_table_entry"><para role="func_signature">
-        <indexterm>
-         <primary>version</primary>
-        </indexterm>
-        <function>version</function> ()
-        <returnvalue>text</returnvalue>
-       </para>
-       <para>
-        Returns a string describing the <productname>PostgreSQL</productname>
-        server's version.  You can also get this information from
-        <xref linkend="guc-server-version"/>, or for a machine-readable
-        version use <xref linkend="guc-server-version-num"/>.  Software
-        developers should use <varname>server_version_num</varname> (available
-        since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
-        parsing the text version.
-       </para></entry>
-      </row>
      </tbody>
     </tgroup>
    </table>
@@ -26332,6 +26329,80 @@ SELECT collation for ('foo' COLLATE "de_DE");
 
   </sect2>
 
+  <sect2 id="functions-info-version">
+   <title>Version Information Functions</title>
+
+   <para>
+    The functions shown in <xref linkend="functions-version"/>
+    print version information.
+   </para>
+
+   <table id="functions-version">
+    <title>Version Information Functions</title>
+    <tgroup cols="1">
+     <thead>
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        Function
+       </para>
+       <para>
+        Description
+       </para></entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>version</primary>
+        </indexterm>
+        <function>version</function> ()
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Returns a string describing the <productname>PostgreSQL</productname>
+        server's version.  You can also get this information from
+        <xref linkend="guc-server-version"/>, or for a machine-readable
+        version use <xref linkend="guc-server-version-num"/>.  Software
+        developers should use <varname>server_version_num</varname> (available
+        since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
+        parsing the text version.
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_version</primary>
+        </indexterm>
+        <function>unicode_version</function> ()
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Returns a string representing the version of Unicode used by
+        <productname>PostgreSQL</productname>.
+       </para></entry>
+      </row>
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>icu_unicode_version</primary>
+        </indexterm>
+        <function>icu_unicode_version</function> ()
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Returns a string representing the version of Unicode used by ICU, if
+        the server was built with ICU support; otherwise returns
+        <literal>NULL</literal>.</para></entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+  </sect2>
+
   </sect1>
 
   <sect1 id="functions-admin">

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
@@ -23,7 +23,9 @@
 #include "catalog/pg_type.h"
 #include "common/hashfn.h"
 #include "common/int.h"
+#include "common/unicode_category.h"
 #include "common/unicode_norm.h"
+#include "common/unicode_version.h"
 #include "funcapi.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
 	return form;
 }
 
+/*
+ * Returns version of Unicode used by Postgres in "major.minor" format (the
+ * same format as the Unicode version reported by ICU). The third component
+ * ("update version") never involves additions to the character repertoire and
+ * is unimportant for most purposes.
+ *
+ * The result is the PG_UNICODE_VERSION string converted to a text value.
+ *
+ * See: https://unicode.org/versions/
+ */
+Datum
+unicode_version(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
+}
+
+/*
+ * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
+ *
+ * When compiled with USE_ICU, the result is ICU's U_UNICODE_VERSION string
+ * converted to a text value. Without USE_ICU the function returns SQL NULL.
+ */
+Datum
+icu_unicode_version(PG_FUNCTION_ARGS)
+{
+#ifdef USE_ICU
+	PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
+#else
+	PG_RETURN_NULL();
+#endif
+}
+
+/*
+ * unicode_assigned
+ *
+ * Returns true when no character of the input text has the general category
+ * PG_U_UNASSIGNED, and false as soon as one such character is found. An
+ * empty string therefore yields true.
+ *
+ * Raises an error unless the database encoding is UTF-8.
+ */
+Datum
+unicode_assigned(PG_FUNCTION_ARGS)
+{
+	text	   *input = PG_GETARG_TEXT_PP(0);
+	unsigned char *cursor;
+	int			nchars;
+
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
+
+	/* number of characters (not bytes) that must be examined */
+	nchars = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+	cursor = (unsigned char *) VARDATA_ANY(input);
+
+	/* decode one code point per iteration, then step over its bytes */
+	while (nchars-- > 0)
+	{
+		pg_wchar	codepoint = utf8_to_unicode(cursor);
+
+		if (unicode_category(codepoint) == PG_U_UNASSIGNED)
+			PG_RETURN_BOOL(false);
+
+		cursor += pg_utf_mblen(cursor);
+	}
+
+	PG_RETURN_BOOL(true);
+}
+
 Datum
 unicode_normalize_func(PG_FUNCTION_ARGS)
 {

diff --git a/src/common/Makefile b/src/common/Makefile
@@ -78,6 +78,7 @@ OBJS_COMMON = \
 	scram-common.o \
 	string.o \
 	stringinfo.o \
+	unicode_category.o \
 	unicode_norm.o \
 	username.o \
 	wait_error.o \

diff --git a/src/common/meson.build b/src/common/meson.build
@@ -30,6 +30,7 @@ common_sources = files(
   'scram-common.c',
   'string.c',
   'stringinfo.c',
+  'unicode_category.c',
   'unicode_norm.c',
   'username.c',
   'wait_error.c',

diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
@@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
 override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
 LIBS += $(PTHREAD_LIBS)
 
+LDFLAGS_INTERNAL += $(ICU_LIBS)
+CPPFLAGS += $(ICU_CFLAGS)
+
 # By default, do nothing.
 all:
 
-update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
 	mv $^ $(top_srcdir)/src/include/common/
+	$(MAKE) category-check
 	$(MAKE) normalization-check
 
 # These files are part of the Unicode Character Database. Download
@@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
 UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
+unicode_version.h: generate-unicode_version.pl
+	$(PERL) $< --version $(UNICODE_VERSION)
+
+unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
+	$(PERL) $<
+
 # Generation of conversion tables used for string normalization with
 # UTF-8 strings.
 unicode_norm_hashfunc.h: unicode_norm_table.h
@@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
 	$(PERL) $^ >$@
 
 # Test suite
+category-check: category_test
+	./category_test
+
 normalization-check: norm_test
 	./norm_test
 
+category_test: category_test.o ../unicode_category.o | submake-common
+
 norm_test: norm_test.o ../unicode_norm.o | submake-common
 
 norm_test.o: norm_test_table.h
@@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
 
 
 clean:
-	rm -f $(OBJS) norm_test norm_test.o
+	rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
 
 distclean: clean
 	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h

diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
@@ -0,0 +1,108 @@
+/*-------------------------------------------------------------------------
+ * category_test.c
+ *		Program to test Unicode general category functions.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/common/unicode/category_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef USE_ICU
+#include <unicode/uchar.h>
+#endif
+#include "common/unicode_category.h"
+#include "common/unicode_version.h"
+
+/*
+ * Parse version into integer for easy comparison.
+ *
+ * "version" is expected to have the form "major.minor"; the result is
+ * major * 100 + minor, so that for example "15.1" becomes 1501. The format
+ * and the minor < 100 limit are verified by assertions only.
+ */
+#ifdef USE_ICU
+static int
+parse_unicode_version(const char *version)
+{
+	/*
+	 * n is inspected only by Assert(); mark it so that builds without
+	 * assertions do not warn about a variable that is set but never used.
+	 */
+	int			n PG_USED_FOR_ASSERTS_ONLY;
+
+	/*
+	 * Start from zero so that a failed parse in a build without assertions
+	 * does not read indeterminate values.
+	 */
+	int			major = 0;
+	int			minor = 0;
+
+	n = sscanf(version, "%d.%d", &major, &minor);
+
+	Assert(n == 2);
+	Assert(minor < 100);
+
+	return major * 100 + minor;
+}
+#endif
+
+/*
+ * Exhaustively test that the Unicode category for each codepoint matches that
+ * returned by ICU.
+ *
+ * Every code point from 0 through 0x10ffff is looked up with both
+ * unicode_category() and ICU's u_charType(). A disagreement is tolerated (and
+ * counted) only when the side with the older Unicode version reports the code
+ * point as unassigned; any other disagreement prints a failure report and
+ * exits with status 1. The program exits with status 0 on success, and also
+ * when built without ICU, in which case nothing is tested.
+ */
+int
+main(int argc, char **argv)
+{
+#ifdef USE_ICU
+	int			pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
+	int			icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
+	int			pg_skipped_codepoints = 0;
+	int			icu_skipped_codepoints = 0;
+
+	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
+	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);
+
+	for (UChar32 code = 0; code <= 0x10ffff; code++)
+	{
+		uint8_t		pg_category = unicode_category(code);
+		uint8_t		icu_category = u_charType(code);
+
+		if (pg_category != icu_category)
+		{
+			/*
+			 * A version mismatch means that some assigned codepoints in the
+			 * newer version may be unassigned in the older version. That's
+			 * OK, though the test will not cover those codepoints marked
+			 * unassigned in the older version (that is, it will no longer be
+			 * an exhaustive test).
+			 */
+			if (pg_category == PG_U_UNASSIGNED &&
+				pg_unicode_version < icu_unicode_version)
+				pg_skipped_codepoints++;
+			else if (icu_category == PG_U_UNASSIGNED &&
+					 icu_unicode_version < pg_unicode_version)
+				icu_skipped_codepoints++;
+			else
+			{
+				printf("FAILURE for codepoint %06x\n", code);
+				printf("Postgres category:	%02d %s %s\n", pg_category,
+					   unicode_category_abbrev(pg_category),
+					   unicode_category_string(pg_category));
+				printf("ICU category:		%02d %s %s\n", icu_category,
+					   unicode_category_abbrev(icu_category),
+					   unicode_category_string(icu_category));
+				printf("\n");
+				exit(1);
+			}
+		}
+	}
+
+	if (pg_skipped_codepoints > 0)
+		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
+			   pg_skipped_codepoints);
+	if (icu_skipped_codepoints > 0)
+		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
+			   icu_skipped_codepoints);
+
+	printf("category_test: All tests successful!\n");
+	exit(0);
+#else
+	printf("ICU support required for test; skipping.\n");
+	exit(0);
+#endif
+}