Skip to content

Commit

Permalink
Add CJK word boundary iterator to query parser and term generator
Browse files Browse the repository at this point in the history
The previous CJK token iterator has been refactored into CJK text
iterators to either break CJK text into ngrams or use word boundary
analysis. To enable the latter, the new flag FLAG_CJK_WORDS has been
added to both QueryParser and TermGenerator.

CJK ngrams remain the default, so existing behaviour is backwards compatible.

The word boundary analysis uses the ICU (International Components for
Unicode) library (http://site.icu-project.org/).
  • Loading branch information
rsto committed Jun 27, 2016
1 parent af2585e commit 16dd9b2
Show file tree
Hide file tree
Showing 12 changed files with 437 additions and 52 deletions.
4 changes: 2 additions & 2 deletions xapian-core/Makefile.am
Expand Up @@ -9,7 +9,7 @@ if MAINTAINER_MODE
export ACLOCAL AUTOCONF AUTOHEADER AUTOM4TE AUTOMAKE
endif

AM_CPPFLAGS = -I$(top_srcdir)/common -I$(top_srcdir)/include
AM_CPPFLAGS = -I$(top_srcdir)/common -I$(top_srcdir)/include $(ICU_CFLAGS)
if VPATH_BUILD
# Needed for the generated files include/xapian/version.h
# include/xapian/error.h and include/xapian/errordispatch.h
Expand Down Expand Up @@ -94,7 +94,7 @@ lib_LTLIBRARIES = libxapian.la
libxapian_la_SOURCES = $(lib_src)
libxapian_la_LIBADD = $(XAPIAN_LIBS)
libxapian_la_LDFLAGS = \
$(XAPIAN_LDFLAGS) $(NO_UNDEFINED) -version-info $(LIBRARY_VERSION_INFO)
$(XAPIAN_LDFLAGS) $(NO_UNDEFINED) -version-info $(LIBRARY_VERSION_INFO) $(ICU_LIBS)

lib_src =

Expand Down
6 changes: 6 additions & 0 deletions xapian-core/configure.ac
Expand Up @@ -254,6 +254,12 @@ AC_SUBST([abi_affecting_cxxflags])

XAPIAN_LDFLAGS=
XAPIAN_LIBS=

dnl Check for ICU. We require this for the CJKWordIterator
PKG_CHECK_MODULES([ICU], [icu-uc])
AC_SUBST([ICU_CFLAGS])
AC_SUBST([ICU_LIBS])

AC_SUBST([XAPIAN_LDFLAGS])
AC_SUBST([XAPIAN_LIBS])

Expand Down
10 changes: 10 additions & 0 deletions xapian-core/include/xapian/queryparser.h
Expand Up @@ -790,6 +790,16 @@ class XAPIAN_VISIBILITY_DEFAULT QueryParser {
*/
FLAG_CJK_NGRAM = 2048,

/** Enable generation of words from CJK text.
*
* With this enabled, spans of CJK characters are split into CJK
* words using text boundary heuristics. Non-CJK characters are
* split into words as normal.
*
* The corresponding option needs to have been used at index time.
*/
FLAG_CJK_WORDS = 4096,

/** The default flags.
*
* Used if you don't explicitly pass any to @a parse_query().
Expand Down
12 changes: 11 additions & 1 deletion xapian-core/include/xapian/termgenerator.h
Expand Up @@ -106,7 +106,17 @@ class XAPIAN_VISIBILITY_DEFAULT TermGenerator {
* enabled in 1.2.8 and later by setting environment variable
* XAPIAN_CJK_NGRAM.
*/
FLAG_CJK_NGRAM = 2048 // Value matches QueryParser flag.
FLAG_CJK_NGRAM = 2048, // Value matches QueryParser flag.

/** Enable generation of words from CJK text.
*
* With this enabled, spans of CJK characters are split into CJK
* words using text boundary heuristics. Non-CJK characters are
* split into words as normal.
*
* The corresponding option needs to be passed to QueryParser.
*/
FLAG_CJK_WORDS = 4096 // Value matches QueryParser flag
};

/// Stemming strategies, for use with set_stemming_strategy().
Expand Down
155 changes: 155 additions & 0 deletions xapian-core/m4/pkg.m4
@@ -0,0 +1,155 @@
# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
#
# Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that program.

# PKG_PROG_PKG_CONFIG([MIN-VERSION])
# ----------------------------------
# Locate the pkg-config program and store its path in PKG_CONFIG.
#
# PKG_CONFIG is declared as a precious variable, so a value supplied by
# the user in the environment is used as-is; otherwise pkg-config is
# searched for with AC_PATH_TOOL.  When a program was found, it is asked
# whether it is at least MIN-VERSION (0.9.0 if MIN-VERSION is omitted);
# if it is not, PKG_CONFIG is reset to the empty string so that later
# checks treat pkg-config as unavailable.
AC_DEFUN([PKG_PROG_PKG_CONFIG],
[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
fi
if test -n "$PKG_CONFIG"; then
_pkg_min_version=m4_default([$1], [0.9.0])
AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
PKG_CONFIG=""
fi
fi[]dnl
])# PKG_PROG_PKG_CONFIG

# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
#
# Check to see whether a particular set of modules exists.  Similar
# to PKG_CHECK_MODULES(), but does not set variables or print errors.
#
# The test only succeeds when PKG_CONFIG is non-empty and
# "pkg-config --exists" accepts MODULES.  ACTION-IF-FOUND defaults to a
# no-op; the else branch is only emitted when ACTION-IF-NOT-FOUND is given.
#
# As with PKG_CHECK_MODULES, this macro relies on AC_REQUIRE to pull in
# PKG_PROG_PKG_CONFIG.  If the first use of this macro or of
# PKG_CHECK_MODULES might not be executed (for example because it sits
# inside a shell conditional), call PKG_PROG_PKG_CONFIG explicitly
# beforehand.
# --------------------------------------------------------------
AC_DEFUN([PKG_CHECK_EXISTS],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
if test -n "$PKG_CONFIG" && \
AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
m4_ifval([$2], [$2], [:])
m4_ifvaln([$3], [else
$3])dnl
fi])


# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
# ---------------------------------------------
# Internal helper: compute pkg_cv_VARIABLE for MODULES.
#
# - If the shell variable VARIABLE is already non-empty, its value is
#   used unchanged and pkg-config is not consulted.
# - Otherwise, if PKG_CONFIG is set, MODULES are checked for existence
#   and "pkg-config --COMMAND MODULES" provides the value; when the
#   modules are missing, pkg_failed is set to "yes".
# - Otherwise pkg_failed is set to "untried".
m4_define([_PKG_CONFIG],
[if test -n "$$1"; then
pkg_cv_[]$1="$$1"
elif test -n "$PKG_CONFIG"; then
PKG_CHECK_EXISTS([$3],
[pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
[pkg_failed=yes])
else
pkg_failed=untried
fi[]dnl
])# _PKG_CONFIG

# _PKG_SHORT_ERRORS_SUPPORTED
# -----------------------------
# Internal helper: set _pkg_short_errors_supported to "yes" when the
# located pkg-config reports being at least version 0.20, and to "no"
# otherwise.  PKG_CHECK_MODULES uses the result to decide whether to
# pass --short-errors when collecting error text.
AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
_pkg_short_errors_supported=yes
else
_pkg_short_errors_supported=no
fi[]dnl
])# _PKG_SHORT_ERRORS_SUPPORTED


# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
# [ACTION-IF-NOT-FOUND])
#
# Look up MODULES with pkg-config and, on success, set
# VARIABLE-PREFIX_CFLAGS and VARIABLE-PREFIX_LIBS, report "yes" and run
# ACTION-IF-FOUND.  Both variables are declared precious, and a value
# already present in either of them is used instead of asking pkg-config.
#
# On failure the behaviour depends on ACTION-IF-NOT-FOUND:
# - modules not found: the pkg-config error text is stored in
#   VARIABLE-PREFIX_PKG_ERRORS and written to config.log; then either
#   configure aborts with an explanatory message (no action given) or
#   "no" is reported and ACTION-IF-NOT-FOUND is run.
# - pkg-config unavailable or too old: configure aborts (no action
#   given) or ACTION-IF-NOT-FOUND is run.
#
# Note that if there is a possibility the first call to
# PKG_CHECK_MODULES might not happen, you should be sure to include an
# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
#
#
# --------------------------------------------------------------
AC_DEFUN([PKG_CHECK_MODULES],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
pkg_failed=no
AC_MSG_CHECKING([for $1])
_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
_PKG_CONFIG([$1][_LIBS], [libs], [$2])
m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
and $1[]_LIBS to avoid the need to call pkg-config.
See the pkg-config man page for more details.])
if test $pkg_failed = yes; then
_PKG_SHORT_ERRORS_SUPPORTED
if test $_pkg_short_errors_supported = yes; then
$1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors "$2" 2>&1`
else
$1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors "$2" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
ifelse([$4], , [AC_MSG_ERROR(dnl
[Package requirements ($2) were not met:
$$1_PKG_ERRORS
Consider adjusting the PKG_CONFIG_PATH environment variable if you
installed software in a non-standard prefix.
_PKG_TEXT
])],
[AC_MSG_RESULT([no])
$4])
elif test $pkg_failed = untried; then
ifelse([$4], , [AC_MSG_FAILURE(dnl
[The pkg-config script could not be found or is too old. Make sure it
is in your PATH or set the PKG_CONFIG environment variable to the full
path to pkg-config.
_PKG_TEXT
To get pkg-config, see <http://pkg-config.freedesktop.org/>.])],
[$4])
else
$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
AC_MSG_RESULT([yes])
ifelse([$3], , :, [$3])
fi[]dnl
])# PKG_CHECK_MODULES
74 changes: 72 additions & 2 deletions xapian-core/queryparser/cjk-tokenizer.cc
Expand Up @@ -31,6 +31,7 @@

#include "omassert.h"
#include "xapian/unicode.h"
#include "xapian/error.h"

#include <cstdlib>
#include <string>
Expand Down Expand Up @@ -97,8 +98,16 @@ CJK::get_cjk(Xapian::Utf8Iterator &it)
return str;
}

/** Report whether this iterator and @a other are at the same position.
 *
 *  Only the wrapped Utf8Iterator members are compared.
 */
bool
CJKTokenIterator::equal_to(const CJKTokenIterator & other) const
{
    // Callers are only expected to compare against an end iterator, so
    // matching the underlying UTF-8 positions is all that is needed.
    const bool same_position = (it == other.it);
    return same_position;
}

const string &
CJKTokenIterator::operator*() const
CJKNgramIterator::operator*() const
{
if (current_token.empty()) {
Assert(it != Xapian::Utf8Iterator());
Expand All @@ -111,7 +120,7 @@ CJKTokenIterator::operator*() const
}

CJKTokenIterator &
CJKTokenIterator::operator++()
CJKNgramIterator::operator++()
{
if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) {
Xapian::Unicode::append_utf8(current_token, *p);
Expand All @@ -124,3 +133,64 @@ CJKTokenIterator::operator++()
}
return *this;
}

bool
CJKWordIterator::equal_to(const CJKTokenIterator & other) const
{
if (CJKWordIterator const* o = dynamic_cast<CJKWordIterator const*>(&other)) {
return p == o->p && q == o->q;
} else {
return false;
}
}

/** Construct a word iterator over the UTF-8 text @a s.
 *
 *  The text is transcoded into the icu::UnicodeString member ustr and an
 *  ICU word BreakIterator is positioned on its first segment, so that
 *  [q, p) delimits the current token within ustr.
 *
 *  If the text contains no segment at all (e.g. it is empty) the iterator
 *  is left in the same exhausted state that operator++ produces at the end
 *  of the text (p == q == UBRK_DONE), rather than appearing to hold one
 *  empty token.
 *
 *  @param s  UTF-8 encoded text to split into words.
 *
 *  @exception Xapian::InternalError  ICU could not create the word break
 *                                    iterator.
 */
CJKWordIterator::CJKWordIterator(const std::string & s) : CJKTokenIterator(s)
{
    // Utf8Iterator yields one code point per step; ICU re-encodes each as
    // UTF-16 when appending.  This consumes the base class iterator.
    for (; it != Xapian::Utf8Iterator(); ++it) {
        ustr.append(static_cast<UChar32>(*it));
    }

    UErrorCode err = U_ZERO_ERROR;
    // NOTE(review): the literal 0 is converted to an icu::Locale; according
    // to the ICU documentation a null language selects the process default
    // locale rather than an "unknown" one - confirm this is what is wanted.
    brk = icu::BreakIterator::createWordInstance(0 /*unknown locale*/, err);
    if (U_FAILURE(err))
        throw Xapian::InternalError(string("ICU error: ") + string(u_errorName(err)));
    brk->setText(ustr);
    q = brk->first();
    p = brk->next();
    if (p == UBRK_DONE) {
        // No segment available: mirror the end state set by operator++ so
        // that the iterator does not yield a spurious empty token.
        q = UBRK_DONE;
    }
}

/** Return the current word as UTF-8.
 *
 *  The token is built lazily: if the cached current_token is empty, the
 *  UTF-16 range [q, p) of ustr is converted code point by code point and
 *  len is set to the number of code points appended.
 */
const string &
CJKWordIterator::operator*() const
{
    // Already materialised for this position.
    if (!current_token.empty())
        return current_token;

    Assert(p != q);
    len = 0;
    icu::UnicodeString segment = ustr.tempSubString(q, p-q);
    int32_t pos = 0;
    while (pos < segment.length()) {
        Xapian::Unicode::append_utf8(current_token, segment.char32At(pos));
        ++len;
        // Step past the code point just read: getChar32Limit() moves an
        // offset that falls between a surrogate pair to after the pair.
        pos = segment.getChar32Limit(pos + 1);
    }
    return current_token;
}


/** Advance to the next word boundary segment.
 *
 *  The previous segment end becomes the new start (q) and the break
 *  iterator supplies the new end (p).  Once the break iterator is
 *  exhausted both are set to UBRK_DONE; otherwise the cached token and
 *  len are rebuilt for the new segment.
 */
CJKTokenIterator &
CJKWordIterator::operator++()
{
    q = p;
    p = brk->next();

    if (p == UBRK_DONE) {
        // Ran off the end of the text: collapse to the exhausted state.
        q = UBRK_DONE;
        return *this;
    }

    current_token.clear();
    if (p != q) {
        // Dereferencing repopulates current_token and len.
        current_token = **this;
    }
    return *this;
}

0 comments on commit 16dd9b2

Please sign in to comment.