Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| /*------------------------------------------------------------------------- | |
| * | |
| * spell.c | |
| * Normalizing word with ISpell | |
| * | |
| * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group | |
| * | |
| * | |
| * IDENTIFICATION | |
| * src/backend/tsearch/spell.c | |
| * | |
| *------------------------------------------------------------------------- | |
| */ | |
| #include "postgres.h" | |
| #include "catalog/pg_collation.h" | |
| #include "tsearch/dicts/spell.h" | |
| #include "tsearch/ts_locale.h" | |
| #include "utils/memutils.h" | |
| /* | |
| * Initialization requires a lot of memory that's not needed | |
| * after the initialization is done. During initialization, | |
| * CurrentMemoryContext is the long-lived memory context associated | |
| * with the dictionary cache entry. We keep the short-lived stuff | |
| * in the Conf->buildCxt context. | |
| */ | |
| /* tmpalloc: allocate (sz) bytes in Conf->buildCxt; expects a variable named Conf in scope */ | |
| #define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz)) | |
| /* tmpalloc0: like tmpalloc, but goes through MemoryContextAllocZero */ | |
| #define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz)) | |
| /* | |
| * Prepare for constructing an ISpell dictionary. | |
| * | |
| * The IspellDict struct is assumed to be zeroed when allocated. | |
| */ | |
| void | |
| NIStartBuild(IspellDict *Conf) | |
| { | |
| /* | |
| * The temp context is a child of CurTransactionContext, so that it will | |
| * go away automatically on error. | |
| */ | |
| Conf->buildCxt = AllocSetContextCreate(CurTransactionContext, | |
| "Ispell dictionary init context", | |
| ALLOCSET_DEFAULT_MINSIZE, | |
| ALLOCSET_DEFAULT_INITSIZE, | |
| ALLOCSET_DEFAULT_MAXSIZE); | |
| } | |
| /* | |
| * Clean up when dictionary construction is complete. | |
| */ | |
| void | |
| NIFinishBuild(IspellDict *Conf) | |
| { | |
| /* Release no-longer-needed temp memory */ | |
| MemoryContextDelete(Conf->buildCxt); | |
| /* Just for cleanliness, zero the now-dangling pointers */ | |
| Conf->buildCxt = NULL; | |
| Conf->Spell = NULL; | |
| Conf->firstfree = NULL; | |
| } | |
| /* | |
| * "Compact" palloc: allocate without extra palloc overhead. | |
| * | |
| * Since we have no need to free the ispell data items individually, there's | |
| * not much value in the per-chunk overhead normally consumed by palloc. | |
| * Getting rid of it is helpful since ispell can allocate a lot of small nodes. | |
| * | |
| * We currently pre-zero all data allocated this way, even though some of it | |
| * doesn't need that. The cpalloc and cpalloc0 macros are just documentation | |
| * to indicate which allocations actually require zeroing. | |
| */ | |
| #define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */ | |
| #define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */ | |
| static void * | |
| compact_palloc0(IspellDict *Conf, size_t size) | |
| { | |
| void *result; | |
| /* Should only be called during init */ | |
| Assert(Conf->buildCxt != NULL); | |
| /* No point in this for large chunks */ | |
| if (size > COMPACT_MAX_REQ) | |
| return palloc0(size); | |
| /* Keep everything maxaligned */ | |
| size = MAXALIGN(size); | |
| /* Need more space? */ | |
| if (size > Conf->avail) | |
| { | |
| Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK); | |
| Conf->avail = COMPACT_ALLOC_CHUNK; | |
| } | |
| result = (void *) Conf->firstfree; | |
| Conf->firstfree += size; | |
| Conf->avail -= size; | |
| return result; | |
| } | |
| #define cpalloc(size) compact_palloc0(Conf, size) | |
| #define cpalloc0(size) compact_palloc0(Conf, size) | |
| static char * | |
| cpstrdup(IspellDict *Conf, const char *str) | |
| { | |
| char *res = cpalloc(strlen(str) + 1); | |
| strcpy(res, str); | |
| return res; | |
| } | |
| /* | |
| * Apply lowerstr(), producing a temporary result (in the buildCxt). | |
| */ | |
| static char * | |
| lowerstr_ctx(IspellDict *Conf, const char *src) | |
| { | |
| MemoryContext saveCtx; | |
| char *dst; | |
| saveCtx = MemoryContextSwitchTo(Conf->buildCxt); | |
| dst = lowerstr(src); | |
| MemoryContextSwitchTo(saveCtx); | |
| return dst; | |
| } | |
| /* NOTE(review): MAX_NORM and MAXNORMLEN are not used in the visible part of the file; presumably limits for the normalization code further down -- confirm */ | |
| #define MAX_NORM 1024 | |
| #define MAXNORMLEN 256 | |
| /* STRNCMP: strncmp() over strlen(p) bytes, i.e. yields 0 when s begins with p. Note p is evaluated twice. */ | |
| #define STRNCMP(s,p) strncmp( (s), (p), strlen(p) ) | |
| /* GETWCHAR: byte N of W (length L), counted from the front for FF_PREFIX and from the back otherwise */ | |
| #define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] ) | |
| /* GETCHAR: GETWCHAR applied to the repl string (and replen) of the AFFIX pointed to by A */ | |
| #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T ) | |
| /* Shared empty string, used in place of allocating copies of "" */ | |
| static char *VoidString = ""; | |
| static int | |
| cmpspell(const void *s1, const void *s2) | |
| { | |
| return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word)); | |
| } | |
| static int | |
| cmpspellaffix(const void *s1, const void *s2) | |
| { | |
| return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN)); | |
| } | |
/*
 * Find the first character of str for which t_iseq(..., c) holds, stepping
 * through the string by pg_mblen().  Returns a pointer to that character,
 * or NULL if the end of the string is reached first.
 */
static char *
findchar(char *str, int c)
{
	char	   *cur;

	for (cur = str; *cur != '\0'; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c))
			return cur;
	}

	return NULL;
}
/*
 * Compare two strings back-to-front (used for suffix tree operations).
 *
 * Bytes are compared starting at the last byte of each string and moving
 * toward the front.  If one string runs out first, the shorter one sorts
 * lower.  Returns -1, 0 or 1.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* All shared trailing bytes matched; decide by what is left over */
	if (i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
/*
 * Like strbcmp(), but looks at no more than "count" trailing bytes.
 *
 * Returns 0 as soon as "count" bytes have compared equal.  If a string is
 * exhausted before that, the shorter string sorts lower.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			left = count;

	for (; i1 >= 0 && i2 >= 0 && left > 0; i1--, i2--, left--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* Either the byte budget was used up, or both strings ended together */
	if (left == 0 || i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
| static int | |
| cmpaffix(const void *s1, const void *s2) | |
| { | |
| const AFFIX *a1 = (const AFFIX *) s1; | |
| const AFFIX *a2 = (const AFFIX *) s2; | |
| if (a1->type < a2->type) | |
| return -1; | |
| if (a1->type > a2->type) | |
| return 1; | |
| if (a1->type == FF_PREFIX) | |
| return strcmp(a1->repl, a2->repl); | |
| else | |
| return strbcmp((const unsigned char *) a1->repl, | |
| (const unsigned char *) a2->repl); | |
| } | |
| static void | |
| NIAddSpell(IspellDict *Conf, const char *word, const char *flag) | |
| { | |
| if (Conf->nspell >= Conf->mspell) | |
| { | |
| if (Conf->mspell) | |
| { | |
| Conf->mspell *= 2; | |
| Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *)); | |
| } | |
| else | |
| { | |
| Conf->mspell = 1024 * 20; | |
| Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *)); | |
| } | |
| } | |
| Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); | |
| strcpy(Conf->Spell[Conf->nspell]->word, word); | |
| strlcpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN); | |
| Conf->nspell++; | |
| } | |
| /* | |
| * import dictionary | |
| * | |
| * Note caller must already have applied get_tsearch_config_filename | |
| */ | |
| void | |
| NIImportDictionary(IspellDict *Conf, const char *filename) | |
| { | |
| tsearch_readline_state trst; | |
| char *line; | |
| if (!tsearch_readline_begin(&trst, filename)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("could not open dictionary file \"%s\": %m", | |
| filename))); | |
| while ((line = tsearch_readline(&trst)) != NULL) | |
| { | |
| char *s, | |
| *pstr; | |
| const char *flag; | |
| /* Extract flag from the line */ | |
| flag = NULL; | |
| if ((s = findchar(line, '/'))) | |
| { | |
| *s++ = '\0'; | |
| flag = s; | |
| while (*s) | |
| { | |
| /* we allow only single encoded flags for faster works */ | |
| if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s)) | |
| s++; | |
| else | |
| { | |
| *s = '\0'; | |
| break; | |
| } | |
| } | |
| } | |
| else | |
| flag = ""; | |
| /* Remove trailing spaces */ | |
| s = line; | |
| while (*s) | |
| { | |
| if (t_isspace(s)) | |
| { | |
| *s = '\0'; | |
| break; | |
| } | |
| s += pg_mblen(s); | |
| } | |
| pstr = lowerstr_ctx(Conf, line); | |
| NIAddSpell(Conf, pstr, flag); | |
| pfree(pstr); | |
| pfree(line); | |
| } | |
| tsearch_readline_end(&trst); | |
| } | |
| static int | |
| FindWord(IspellDict *Conf, const char *word, int affixflag, int flag) | |
| { | |
| SPNode *node = Conf->Dictionary; | |
| SPNodeData *StopLow, | |
| *StopHigh, | |
| *StopMiddle; | |
| const uint8 *ptr = (const uint8 *) word; | |
| flag &= FF_DICTFLAGMASK; | |
| while (node && *ptr) | |
| { | |
| StopLow = node->data; | |
| StopHigh = node->data + node->length; | |
| while (StopLow < StopHigh) | |
| { | |
| StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); | |
| if (StopMiddle->val == *ptr) | |
| { | |
| if (*(ptr + 1) == '\0' && StopMiddle->isword) | |
| { | |
| if (flag == 0) | |
| { | |
| if (StopMiddle->compoundflag & FF_COMPOUNDONLY) | |
| return 0; | |
| } | |
| else if ((flag & StopMiddle->compoundflag) == 0) | |
| return 0; | |
| if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL)) | |
| return 1; | |
| } | |
| node = StopMiddle->node; | |
| ptr++; | |
| break; | |
| } | |
| else if (StopMiddle->val < *ptr) | |
| StopLow = StopMiddle + 1; | |
| else | |
| StopHigh = StopMiddle; | |
| } | |
| if (StopLow >= StopHigh) | |
| break; | |
| } | |
| return 0; | |
| } | |
| static void | |
| NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type) | |
| { | |
| AFFIX *Affix; | |
| if (Conf->naffixes >= Conf->maffixes) | |
| { | |
| if (Conf->maffixes) | |
| { | |
| Conf->maffixes *= 2; | |
| Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX)); | |
| } | |
| else | |
| { | |
| Conf->maffixes = 16; | |
| Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX)); | |
| } | |
| } | |
| Affix = Conf->Affix + Conf->naffixes; | |
| if (strcmp(mask, ".") == 0) | |
| { | |
| Affix->issimple = 1; | |
| Affix->isregis = 0; | |
| } | |
| else if (RS_isRegis(mask)) | |
| { | |
| Affix->issimple = 0; | |
| Affix->isregis = 1; | |
| RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false, | |
| *mask ? mask : VoidString); | |
| } | |
| else | |
| { | |
| int masklen; | |
| int wmasklen; | |
| int err; | |
| pg_wchar *wmask; | |
| char *tmask; | |
| Affix->issimple = 0; | |
| Affix->isregis = 0; | |
| tmask = (char *) tmpalloc(strlen(mask) + 3); | |
| if (type == FF_SUFFIX) | |
| sprintf(tmask, "%s$", mask); | |
| else | |
| sprintf(tmask, "^%s", mask); | |
| masklen = strlen(tmask); | |
| wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar)); | |
| wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen); | |
| err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen, | |
| REG_ADVANCED | REG_NOSUB, | |
| DEFAULT_COLLATION_OID); | |
| if (err) | |
| { | |
| char errstr[100]; | |
| pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr)); | |
| ereport(ERROR, | |
| (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), | |
| errmsg("invalid regular expression: %s", errstr))); | |
| } | |
| } | |
| Affix->flagflags = flagflags; | |
| if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG)) | |
| { | |
| if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0) | |
| Affix->flagflags |= FF_COMPOUNDFLAG; | |
| } | |
| Affix->flag = flag; | |
| Affix->type = type; | |
| Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString; | |
| if ((Affix->replen = strlen(repl)) > 0) | |
| Affix->repl = cpstrdup(Conf, repl); | |
| else | |
| Affix->repl = VoidString; | |
| Conf->naffixes++; | |
| } | |
| /* Scanner states for parse_affentry(), listed in the order they are normally visited */ | |
| #define PAE_WAIT_MASK 0 | |
| #define PAE_INMASK 1 | |
| #define PAE_WAIT_FIND 2 | |
| #define PAE_INFIND 3 | |
| #define PAE_WAIT_REPL 4 | |
| #define PAE_INREPL 5 | |
| /* | |
| * Split one old-format affix rule line of the shape "mask > [-find,] repl" | |
| * into its three parts. | |
| * | |
| * str is scanned one (possibly multibyte) character at a time; the pieces are | |
| * copied into the caller-supplied buffers mask, find and repl, each of which is | |
| * always NUL-terminated on return. No bounds checking is done on these buffers, | |
| * so the caller must make them large enough for the line. | |
| * | |
| * Returns true if a mask and at least one of find/repl were collected, false | |
| * otherwise (including lines whose first non-space character is '#'). | |
| * Raises ERROR "syntax error" on an unexpected non-space character. | |
| */ | |
| static bool | |
| parse_affentry(char *str, char *mask, char *find, char *repl) | |
| { | |
| int state = PAE_WAIT_MASK; | |
| char *pmask = mask, | |
| *pfind = find, | |
| *prepl = repl; | |
| *mask = *find = *repl = '\0'; | |
| while (*str) | |
| { | |
| /* Skip leading whitespace; '#' before any mask character means a comment line */ | |
| if (state == PAE_WAIT_MASK) | |
| { | |
| if (t_iseq(str, '#')) | |
| return false; | |
| else if (!t_isspace(str)) | |
| { | |
| COPYCHAR(pmask, str); | |
| pmask += pg_mblen(str); | |
| state = PAE_INMASK; | |
| } | |
| } | |
| /* Collect mask characters (whitespace is dropped) until '>' */ | |
| else if (state == PAE_INMASK) | |
| { | |
| if (t_iseq(str, '>')) | |
| { | |
| *pmask = '\0'; | |
| state = PAE_WAIT_FIND; | |
| } | |
| else if (!t_isspace(str)) | |
| { | |
| COPYCHAR(pmask, str); | |
| pmask += pg_mblen(str); | |
| } | |
| } | |
| /* After '>': '-' introduces the find part; a letter or apostrophe starts repl directly */ | |
| else if (state == PAE_WAIT_FIND) | |
| { | |
| if (t_iseq(str, '-')) | |
| { | |
| state = PAE_INFIND; | |
| } | |
| else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) | |
| { | |
| COPYCHAR(prepl, str); | |
| prepl += pg_mblen(str); | |
| state = PAE_INREPL; | |
| } | |
| else if (!t_isspace(str)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("syntax error"))); | |
| } | |
| /* Collect alphabetic find characters until ',' */ | |
| else if (state == PAE_INFIND) | |
| { | |
| if (t_iseq(str, ',')) | |
| { | |
| *pfind = '\0'; | |
| state = PAE_WAIT_REPL; | |
| } | |
| else if (t_isalpha(str)) | |
| { | |
| COPYCHAR(pfind, str); | |
| pfind += pg_mblen(str); | |
| } | |
| else if (!t_isspace(str)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("syntax error"))); | |
| } | |
| /* After ',': '-' means an empty replacement and ends the scan */ | |
| else if (state == PAE_WAIT_REPL) | |
| { | |
| if (t_iseq(str, '-')) | |
| { | |
| break; /* void repl */ | |
| } | |
| else if (t_isalpha(str)) | |
| { | |
| COPYCHAR(prepl, str); | |
| prepl += pg_mblen(str); | |
| state = PAE_INREPL; | |
| } | |
| else if (!t_isspace(str)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("syntax error"))); | |
| } | |
| /* Collect alphabetic repl characters; '#' starts a trailing comment and ends the scan */ | |
| else if (state == PAE_INREPL) | |
| { | |
| if (t_iseq(str, '#')) | |
| { | |
| *prepl = '\0'; | |
| break; | |
| } | |
| else if (t_isalpha(str)) | |
| { | |
| COPYCHAR(prepl, str); | |
| prepl += pg_mblen(str); | |
| } | |
| else if (!t_isspace(str)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("syntax error"))); | |
| } | |
| else | |
| elog(ERROR, "unrecognized state in parse_affentry: %d", state); | |
| str += pg_mblen(str); | |
| } | |
| /* Terminate all three outputs, whichever state the scan stopped in */ | |
| *pmask = *pfind = *prepl = '\0'; | |
| return (*mask && (*find || *repl)) ? true : false; | |
| } | |
| static void | |
| addFlagValue(IspellDict *Conf, char *s, uint32 val) | |
| { | |
| while (*s && t_isspace(s)) | |
| s += pg_mblen(s); | |
| if (!*s) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("syntax error"))); | |
| if (pg_mblen(s) != 1) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("multibyte flag character is not allowed"))); | |
| Conf->flagval[*(unsigned char *) s] = (unsigned char) val; | |
| Conf->usecompound = true; | |
| } | |
| /* | |
| * Import an affix file that follows MySpell or Hunspell format. | |
| * | |
| * The file is read twice. The first pass only looks for the compound-related | |
| * keywords and the FLAG keyword, filling Conf->flagval via addFlagValue(). | |
| * The second pass parses the PFX/SFX lines and registers each rule with | |
| * NIAddAffix(). | |
| * | |
| * Raises ERROR if the file cannot be opened, or if a FLAG line asks for | |
| * anything other than "default". | |
| */ | |
| static void | |
| NIImportOOAffixes(IspellDict *Conf, const char *filename) | |
| { | |
| char type[BUFSIZ], | |
| *ptype = NULL; | |
| char sflag[BUFSIZ]; | |
| char mask[BUFSIZ], | |
| *pmask; | |
| char find[BUFSIZ], | |
| *pfind; | |
| char repl[BUFSIZ], | |
| *prepl; | |
| bool isSuffix = false; | |
| int flag = 0; | |
| char flagflags = 0; | |
| tsearch_readline_state trst; | |
| int scanread = 0; | |
| char scanbuf[BUFSIZ]; | |
| char *recoded; | |
| /* read file to find any flag */ | |
| memset(Conf->flagval, 0, sizeof(Conf->flagval)); | |
| Conf->usecompound = false; | |
| if (!tsearch_readline_begin(&trst, filename)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("could not open affix file \"%s\": %m", | |
| filename))); | |
| /* First pass: collect flag definitions */ | |
| while ((recoded = tsearch_readline(&trst)) != NULL) | |
| { | |
| /* Ignore empty lines, lines starting with whitespace, and comments */ | |
| if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) | |
| { | |
| pfree(recoded); | |
| continue; | |
| } | |
| /* STRNCMP is a prefix test, so the first matching keyword in this chain wins */ | |
| if (STRNCMP(recoded, "COMPOUNDFLAG") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"), | |
| FF_COMPOUNDFLAG); | |
| else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"), | |
| FF_COMPOUNDBEGIN); | |
| else if (STRNCMP(recoded, "COMPOUNDLAST") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"), | |
| FF_COMPOUNDLAST); | |
| /* COMPOUNDLAST and COMPOUNDEND are synonyms */ | |
| else if (STRNCMP(recoded, "COMPOUNDEND") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDEND"), | |
| FF_COMPOUNDLAST); | |
| else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"), | |
| FF_COMPOUNDMIDDLE); | |
| else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0) | |
| addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"), | |
| FF_COMPOUNDONLY); | |
| else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"), | |
| FF_COMPOUNDPERMITFLAG); | |
| else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0) | |
| addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"), | |
| FF_COMPOUNDFORBIDFLAG); | |
| else if (STRNCMP(recoded, "FLAG") == 0) | |
| { | |
| char *s = recoded + strlen("FLAG"); | |
| while (*s && t_isspace(s)) | |
| s += pg_mblen(s); | |
| /* Only an absent value or one starting with "default" is accepted */ | |
| if (*s && STRNCMP(s, "default") != 0) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("Ispell dictionary supports only default flag value"))); | |
| } | |
| pfree(recoded); | |
| } | |
| tsearch_readline_end(&trst); | |
| /* Build the sscanf format: "%6s" for the type, then four fields each limited to BUFSIZ / 5 characters */ | |
| sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5); | |
| if (!tsearch_readline_begin(&trst, filename)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("could not open affix file \"%s\": %m", | |
| filename))); | |
| /* Second pass: parse the PFX/SFX header and rule lines */ | |
| while ((recoded = tsearch_readline(&trst)) != NULL) | |
| { | |
| if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) | |
| goto nextline; | |
| /* Fields are read positionally: type, flag, find, repl, mask */ | |
| scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask); | |
| /* Drop the lowercased type kept from the previous line before making a new one */ | |
| if (ptype) | |
| pfree(ptype); | |
| ptype = lowerstr_ctx(Conf, type); | |
| /* Only lines with at least four fields whose type starts with "sfx" or "pfx" are of interest */ | |
| if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) | |
| goto nextline; | |
| /* Four fields: a header line; remember its flag, direction and cross-product setting for the rule lines that follow */ | |
| if (scanread == 4) | |
| { | |
| if (strlen(sflag) != 1) | |
| goto nextline; | |
| flag = *sflag; | |
| isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; | |
| /* On a header line the third field ("find") holds the cross-product indicator */ | |
| if (t_iseq(find, 'y') || t_iseq(find, 'Y')) | |
| flagflags = FF_CROSSPRODUCT; | |
| else | |
| flagflags = 0; | |
| } | |
| /* Five fields: a rule line belonging to the most recent header */ | |
| else | |
| { | |
| char *ptr; | |
| int aflg = 0; | |
| /* Skip rules whose flag does not match the current header, or that precede any header */ | |
| if (strlen(sflag) != 1 || flag != *sflag || flag == 0) | |
| goto nextline; | |
| prepl = lowerstr_ctx(Conf, repl); | |
| /* affix flag */ | |
| if ((ptr = strchr(prepl, '/')) != NULL) | |
| { | |
| *ptr = '\0'; | |
| /* Read the flag characters from the original-case repl at the same byte offset; */ | |
| /* NOTE(review): this assumes lowercasing did not change byte offsets -- confirm */ | |
| ptr = repl + (ptr - prepl) + 1; | |
| while (*ptr) | |
| { | |
| aflg |= Conf->flagval[*(unsigned char *) ptr]; | |
| ptr++; | |
| } | |
| } | |
| pfind = lowerstr_ctx(Conf, find); | |
| pmask = lowerstr_ctx(Conf, mask); | |
| /* A field starting with '0' stands for the empty string */ | |
| if (t_iseq(find, '0')) | |
| *pfind = '\0'; | |
| if (t_iseq(repl, '0')) | |
| *prepl = '\0'; | |
| NIAddAffix(Conf, flag, flagflags | aflg, pmask, pfind, prepl, | |
| isSuffix ? FF_SUFFIX : FF_PREFIX); | |
| pfree(prepl); | |
| pfree(pfind); | |
| pfree(pmask); | |
| } | |
| nextline: | |
| pfree(recoded); | |
| } | |
| tsearch_readline_end(&trst); | |
| if (ptype) | |
| pfree(ptype); | |
| } | |
| /* | |
| * import affixes | |
| * | |
| * Note caller must already have applied get_tsearch_config_filename | |
| * | |
| * This function is responsible for parsing ispell ("old format") affix files. | |
| * If we realize that the file contains new-format commands, we pass off the | |
| * work to NIImportOOAffixes(), which will re-read the whole file. | |
| */ | |
| void | |
| NIImportAffixes(IspellDict *Conf, const char *filename) | |
| { | |
| char *pstr = NULL; | |
| char mask[BUFSIZ]; | |
| char find[BUFSIZ]; | |
| char repl[BUFSIZ]; | |
| char *s; | |
| bool suffixes = false; | |
| bool prefixes = false; | |
| int flag = 0; | |
| char flagflags = 0; | |
| tsearch_readline_state trst; | |
| bool oldformat = false; | |
| char *recoded = NULL; | |
| if (!tsearch_readline_begin(&trst, filename)) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("could not open affix file \"%s\": %m", | |
| filename))); | |
| memset(Conf->flagval, 0, sizeof(Conf->flagval)); | |
| Conf->usecompound = false; | |
| while ((recoded = tsearch_readline(&trst)) != NULL) | |
| { | |
| pstr = lowerstr(recoded); | |
| /* Skip comments and empty lines */ | |
| if (*pstr == '#' || *pstr == '\n') | |
| goto nextline; | |
| if (STRNCMP(pstr, "compoundwords") == 0) | |
| { | |
| s = findchar(pstr, 'l'); | |
| if (s) | |
| { | |
| s = recoded + (s - pstr); /* we need non-lowercased | |
| * string */ | |
| while (*s && !t_isspace(s)) | |
| s += pg_mblen(s); | |
| while (*s && t_isspace(s)) | |
| s += pg_mblen(s); | |
| if (*s && pg_mblen(s) == 1) | |
| { | |
| Conf->flagval[*(unsigned char *) s] = FF_COMPOUNDFLAG; | |
| Conf->usecompound = true; | |
| } | |
| oldformat = true; | |
| goto nextline; | |
| } | |
| } | |
| if (STRNCMP(pstr, "suffixes") == 0) | |
| { | |
| suffixes = true; | |
| prefixes = false; | |
| oldformat = true; | |
| goto nextline; | |
| } | |
| if (STRNCMP(pstr, "prefixes") == 0) | |
| { | |
| suffixes = false; | |
| prefixes = true; | |
| oldformat = true; | |
| goto nextline; | |
| } | |
| if (STRNCMP(pstr, "flag") == 0) | |
| { | |
| s = recoded + 4; /* we need non-lowercased string */ | |
| flagflags = 0; | |
| while (*s && t_isspace(s)) | |
| s += pg_mblen(s); | |
| if (*s == '*') | |
| { | |
| flagflags |= FF_CROSSPRODUCT; | |
| s++; | |
| } | |
| else if (*s == '~') | |
| { | |
| flagflags |= FF_COMPOUNDONLY; | |
| s++; | |
| } | |
| if (*s == '\\') | |
| s++; | |
| /* | |
| * An old-format flag is a single ASCII character; we expect it to | |
| * be followed by EOL, whitespace, or ':'. Otherwise this is a | |
| * new-format flag command. | |
| */ | |
| if (*s && pg_mblen(s) == 1) | |
| { | |
| flag = *(unsigned char *) s; | |
| s++; | |
| if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' || | |
| t_isspace(s)) | |
| { | |
| oldformat = true; | |
| goto nextline; | |
| } | |
| } | |
| goto isnewformat; | |
| } | |
| if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 || | |
| STRNCMP(recoded, "COMPOUNDMIN") == 0 || | |
| STRNCMP(recoded, "PFX") == 0 || | |
| STRNCMP(recoded, "SFX") == 0) | |
| goto isnewformat; | |
| if ((!suffixes) && (!prefixes)) | |
| goto nextline; | |
| if (!parse_affentry(pstr, mask, find, repl)) | |
| goto nextline; | |
| NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); | |
| nextline: | |
| pfree(recoded); | |
| pfree(pstr); | |
| } | |
| tsearch_readline_end(&trst); | |
| return; | |
| isnewformat: | |
| if (oldformat) | |
| ereport(ERROR, | |
| (errcode(ERRCODE_CONFIG_FILE_ERROR), | |
| errmsg("affix file contains both old-style and new-style commands"))); | |
| tsearch_readline_end(&trst); | |
| NIImportOOAffixes(Conf, filename); | |
| } | |
| static int | |
| MergeAffix(IspellDict *Conf, int a1, int a2) | |
| { | |
| char **ptr; | |
| while (Conf->nAffixData + 1 >= Conf->lenAffixData) | |
| { | |
| Conf->lenAffixData *= 2; | |
| Conf->AffixData = (char **) repalloc(Conf->AffixData, | |
| sizeof(char *) * Conf->lenAffixData); | |
| } | |
| ptr = Conf->AffixData + Conf->nAffixData; | |
| *ptr = cpalloc(strlen(Conf->AffixData[a1]) + | |
| strlen(Conf->AffixData[a2]) + | |
| 1 /* space */ + 1 /* \0 */ ); | |
| sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]); | |
| ptr++; | |
| *ptr = NULL; | |
| Conf->nAffixData++; | |
| return Conf->nAffixData - 1; | |
| } | |
| static uint32 | |
| makeCompoundFlags(IspellDict *Conf, int affix) | |
| { | |
| uint32 flag = 0; | |
| char *str = Conf->AffixData[affix]; | |
| while (str && *str) | |
| { | |
| flag |= Conf->flagval[*(unsigned char *) str]; | |
| str++; | |
| } | |
| return (flag & FF_DICTFLAGMASK); | |
| } | |
| /* | |
| * Build the dictionary tree node for byte position "level", covering the | |
| * entries Conf->Spell[low .. high-1], and recurse for the following positions. | |
| * | |
| * Distinct bytes are detected by comparing each entry with the previous one, | |
| * so entries sharing a byte at this position must be adjacent (the visible | |
| * caller sorts Conf->Spell with cmpspell first). | |
| * | |
| * Returns NULL when no entry in the range is longer than "level". | |
| */ | |
| static SPNode * | |
| mkSPNode(IspellDict *Conf, int low, int high, int level) | |
| { | |
| int i; | |
| int nchar = 0; | |
| char lastchar = '\0'; | |
| SPNode *rs; | |
| SPNodeData *data; | |
| int lownew = low; | |
| /* First pass: count the distinct bytes found at this position */ | |
| for (i = low; i < high; i++) | |
| if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level]) | |
| { | |
| nchar++; | |
| lastchar = Conf->Spell[i]->word[level]; | |
| } | |
| if (!nchar) | |
| return NULL; | |
| rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData)); | |
| rs->length = nchar; | |
| data = rs->data; | |
| lastchar = '\0'; | |
| /* Second pass: fill one SPNodeData per distinct byte */ | |
| for (i = low; i < high; i++) | |
| if (Conf->Spell[i]->p.d.len > level) | |
| { | |
| if (lastchar != Conf->Spell[i]->word[level]) | |
| { | |
| /* A new byte starts: build the subtree for the run just finished, then advance */ | |
| if (lastchar) | |
| { | |
| data->node = mkSPNode(Conf, lownew, i, level + 1); | |
| lownew = i; | |
| data++; | |
| } | |
| lastchar = Conf->Spell[i]->word[level]; | |
| } | |
| data->val = ((uint8 *) (Conf->Spell[i]->word))[level]; | |
| /* This word ends exactly at this position */ | |
| if (Conf->Spell[i]->p.d.len == level + 1) | |
| { | |
| bool clearCompoundOnly = false; | |
| /* The same word was already seen with a different affix set: merge the two */ | |
| if (data->isword && data->affix != Conf->Spell[i]->p.d.affix) | |
| { | |
| /* | |
| * MergeAffix may be called several times for one word. If one | |
| * of the merged entries is allowed outside compound words and | |
| * the other is not, clear the FF_COMPOUNDONLY flag. | |
| */ | |
| clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag | |
| & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix)) | |
| ? false : true; | |
| data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix); | |
| } | |
| else | |
| data->affix = Conf->Spell[i]->p.d.affix; | |
| data->isword = 1; | |
| data->compoundflag = makeCompoundFlags(Conf, data->affix); | |
| /* FF_COMPOUNDONLY implies FF_COMPOUNDFLAG */ | |
| if ((data->compoundflag & FF_COMPOUNDONLY) && | |
| (data->compoundflag & FF_COMPOUNDFLAG) == 0) | |
| data->compoundflag |= FF_COMPOUNDFLAG; | |
| if (clearCompoundOnly) | |
| data->compoundflag &= ~FF_COMPOUNDONLY; | |
| } | |
| } | |
| /* Build the subtree for the final run */ | |
| data->node = mkSPNode(Conf, lownew, high, level + 1); | |
| return rs; | |
| } | |
| /* | |
| * Builds the Conf->Dictionary tree and AffixData from the imported dictionary | |
| * and affixes. | |
| */ | |
| void | |
| NISortDictionary(IspellDict *Conf) | |
| { | |
| int i; | |
| int naffix = 0; | |
| int curaffix; | |
| /* compress affixes */ | |
| /* Count the number of different flags used in the dictionary */ | |
| qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix); | |
| naffix = 0; | |
| for (i = 0; i < Conf->nspell; i++) | |
| { | |
| if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN)) | |
| naffix++; | |
| } | |
| /* | |
| * Fill in Conf->AffixData with the affixes that were used in the | |
| * dictionary. Replace textual flag-field of Conf->Spell entries with | |
| * indexes into Conf->AffixData array. | |
| */ | |
| Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); | |
| curaffix = -1; | |
| for (i = 0; i < Conf->nspell; i++) | |
| { | |
| if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN)) | |
| { | |
| curaffix++; | |
| Assert(curaffix < naffix); | |
| Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag); | |
| } | |
| Conf->Spell[i]->p.d.affix = curaffix; | |
| Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); | |
| } | |
| Conf->lenAffixData = Conf->nAffixData = naffix; | |
| qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); | |
| Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); | |
| } | |
| /* | |
| * Build the affix tree node for position "level", covering the entries | |
| * Conf->Affix[low .. high-1], and recurse for the following positions. | |
| * | |
| * "type" is handed to GETCHAR, which reads an affix's repl string from the | |
| * front for FF_PREFIX and from the back otherwise. | |
| * | |
| * Returns NULL when no affix in the range has a repl longer than "level". | |
| */ | |
| static AffixNode * | |
| mkANode(IspellDict *Conf, int low, int high, int level, int type) | |
| { | |
| int i; | |
| int nchar = 0; | |
| uint8 lastchar = '\0'; | |
| AffixNode *rs; | |
| AffixNodeData *data; | |
| int lownew = low; | |
| int naff; | |
| AFFIX **aff; | |
| /* First pass: count the distinct bytes found at this position */ | |
| for (i = low; i < high; i++) | |
| if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type)) | |
| { | |
| nchar++; | |
| lastchar = GETCHAR(Conf->Affix + i, level, type); | |
| } | |
| if (!nchar) | |
| return NULL; | |
| /* Scratch list of the affixes that end at the byte currently being processed */ | |
| aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1)); | |
| naff = 0; | |
| rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData)); | |
| rs->length = nchar; | |
| data = rs->data; | |
| lastchar = '\0'; | |
| /* Second pass: fill one AffixNodeData per distinct byte */ | |
| for (i = low; i < high; i++) | |
| if (Conf->Affix[i].replen > level) | |
| { | |
| if (lastchar != GETCHAR(Conf->Affix + i, level, type)) | |
| { | |
| /* A new byte starts: finish the previous entry (subtree plus collected affixes), then advance */ | |
| if (lastchar) | |
| { | |
| data->node = mkANode(Conf, lownew, i, level + 1, type); | |
| if (naff) | |
| { | |
| data->naff = naff; | |
| data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff); | |
| memcpy(data->aff, aff, sizeof(AFFIX *) * naff); | |
| naff = 0; | |
| } | |
| data++; | |
| lownew = i; | |
| } | |
| lastchar = GETCHAR(Conf->Affix + i, level, type); | |
| } | |
| data->val = GETCHAR(Conf->Affix + i, level, type); | |
| if (Conf->Affix[i].replen == level + 1) | |
| { /* affix stopped */ | |
| aff[naff++] = Conf->Affix + i; | |
| } | |
| } | |
| /* Finish the final entry the same way */ | |
| data->node = mkANode(Conf, lownew, high, level + 1, type); | |
| if (naff) | |
| { | |
| data->naff = naff; | |
| data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff); | |
| memcpy(data->aff, aff, sizeof(AFFIX *) * naff); | |
| naff = 0; | |
| } | |
| pfree(aff); | |
| return rs; | |
| } | |
| static void | |
| mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix) | |
| { | |
| int i, | |
| cnt = 0; | |
| int start = (issuffix) ? startsuffix : 0; | |
| int end = (issuffix) ? Conf->naffixes : startsuffix; | |
| AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData)); | |
| Affix->length = 1; | |
| Affix->isvoid = 1; | |
| if (issuffix) | |
| { | |
| Affix->data->node = Conf->Suffix; | |
| Conf->Suffix = Affix; | |
| } | |
| else | |
| { | |
| Affix->data->node = Conf->Prefix; | |
| Conf->Prefix = Affix; | |
| } | |
| for (i = start; i < end; i++) | |
| if (Conf->Affix[i].replen == 0) | |
| cnt++; | |
| if (cnt == 0) | |
| return; | |
| Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt); | |
| Affix->data->naff = (uint32) cnt; | |
| cnt = 0; | |
| for (i = start; i < end; i++) | |
| if (Conf->Affix[i].replen == 0) | |
| { | |
| Affix->data->aff[cnt] = Conf->Affix + i; | |
| cnt++; | |
| } | |
| } | |
| static bool | |
| isAffixInUse(IspellDict *Conf, char flag) | |
| { | |
| int i; | |
| for (i = 0; i < Conf->nAffixData; i++) | |
| if (strchr(Conf->AffixData[i], flag) != NULL) | |
| return true; | |
| return false; | |
| } | |
| void | |
| NISortAffixes(IspellDict *Conf) | |
| { | |
| AFFIX *Affix; | |
| size_t i; | |
| CMPDAffix *ptr; | |
| int firstsuffix = Conf->naffixes; | |
| if (Conf->naffixes == 0) | |
| return; | |
| if (Conf->naffixes > 1) | |
| qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); | |
| Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes); | |
| ptr->affix = NULL; | |
| for (i = 0; i < Conf->naffixes; i++) | |
| { | |
| Affix = &(((AFFIX *) Conf->Affix)[i]); | |
| if (Affix->type == FF_SUFFIX && i < firstsuffix) | |
| firstsuffix = i; | |
| if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && | |
| isAffixInUse(Conf, (char) Affix->flag)) | |
| { | |
| if (ptr == Conf->CompoundAffix || | |
| ptr->issuffix != (ptr - 1)->issuffix || | |
| strbncmp((const unsigned char *) (ptr - 1)->affix, | |
| (const unsigned char *) Affix->repl, | |
| (ptr - 1)->len)) | |
| { | |
| /* leave only unique and minimals suffixes */ | |
| ptr->affix = Affix->repl; | |
| ptr->len = Affix->replen; | |
| ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false; | |
| ptr++; | |
| } | |
| } | |
| } | |
| ptr->affix = NULL; | |
| Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1)); | |
| Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); | |
| Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); | |
| mkVoidAffix(Conf, true, firstsuffix); | |
| mkVoidAffix(Conf, false, firstsuffix); | |
| } | |
| static AffixNodeData * | |
| FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) | |
| { | |
| AffixNodeData *StopLow, | |
| *StopHigh, | |
| *StopMiddle; | |
| uint8 symbol; | |
| if (node->isvoid) | |
| { /* search void affixes */ | |
| if (node->data->naff) | |
| return node->data; | |
| node = node->data->node; | |
| } | |
| while (node && *level < wrdlen) | |
| { | |
| StopLow = node->data; | |
| StopHigh = node->data + node->length; | |
| while (StopLow < StopHigh) | |
| { | |
| StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); | |
| symbol = GETWCHAR(word, wrdlen, *level, type); | |
| if (StopMiddle->val == symbol) | |
| { | |
| (*level)++; | |
| if (StopMiddle->naff) | |
| return StopMiddle; | |
| node = StopMiddle->node; | |
| break; | |
| } | |
| else if (StopMiddle->val < symbol) | |
| StopLow = StopMiddle + 1; | |
| else | |
| StopHigh = StopMiddle; | |
| } | |
| if (StopLow >= StopHigh) | |
| break; | |
| } | |
| return NULL; | |
| } | |
| static char * | |
| CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen) | |
| { | |
| /* | |
| * Check compound allow flags | |
| */ | |
| if (flagflags == 0) | |
| { | |
| if (Affix->flagflags & FF_COMPOUNDONLY) | |
| return NULL; | |
| } | |
| else if (flagflags & FF_COMPOUNDBEGIN) | |
| { | |
| if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG) | |
| return NULL; | |
| if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0) | |
| if (Affix->type == FF_SUFFIX) | |
| return NULL; | |
| } | |
| else if (flagflags & FF_COMPOUNDMIDDLE) | |
| { | |
| if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 || | |
| (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)) | |
| return NULL; | |
| } | |
| else if (flagflags & FF_COMPOUNDLAST) | |
| { | |
| if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG) | |
| return NULL; | |
| if ((Affix->flagflags & FF_COMPOUNDLAST) == 0) | |
| if (Affix->type == FF_PREFIX) | |
| return NULL; | |
| } | |
| /* | |
| * make replace pattern of affix | |
| */ | |
| if (Affix->type == FF_SUFFIX) | |
| { | |
| strcpy(newword, word); | |
| strcpy(newword + len - Affix->replen, Affix->find); | |
| if (baselen) /* store length of non-changed part of word */ | |
| *baselen = len - Affix->replen; | |
| } | |
| else | |
| { | |
| /* | |
| * if prefix is an all non-changed part's length then all word | |
| * contains only prefix and suffix, so out | |
| */ | |
| if (baselen && *baselen + strlen(Affix->find) <= Affix->replen) | |
| return NULL; | |
| strcpy(newword, Affix->find); | |
| strcat(newword, word + Affix->replen); | |
| } | |
| /* | |
| * check resulting word | |
| */ | |
| if (Affix->issimple) | |
| return newword; | |
| else if (Affix->isregis) | |
| { | |
| if (RS_execute(&(Affix->reg.regis), newword)) | |
| return newword; | |
| } | |
| else | |
| { | |
| int err; | |
| pg_wchar *data; | |
| size_t data_len; | |
| int newword_len; | |
| /* Convert data string to wide characters */ | |
| newword_len = strlen(newword); | |
| data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar)); | |
| data_len = pg_mb2wchar_with_len(newword, data, newword_len); | |
| if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0))) | |
| { | |
| pfree(data); | |
| return newword; | |
| } | |
| pfree(data); | |
| } | |
| return NULL; | |
| } | |
| static int | |
| addToResult(char **forms, char **cur, char *word) | |
| { | |
| if (cur - forms >= MAX_NORM - 1) | |
| return 0; | |
| if (forms == cur || strcmp(word, *(cur - 1)) != 0) | |
| { | |
| *cur = pstrdup(word); | |
| *(cur + 1) = NULL; | |
| return 1; | |
| } | |
| return 0; | |
| } | |
| static char ** | |
| NormalizeSubWord(IspellDict *Conf, char *word, int flag) | |
| { | |
| AffixNodeData *suffix = NULL, | |
| *prefix = NULL; | |
| int slevel = 0, | |
| plevel = 0; | |
| int wrdlen = strlen(word), | |
| swrdlen; | |
| char **forms; | |
| char **cur; | |
| char newword[2 * MAXNORMLEN] = ""; | |
| char pnewword[2 * MAXNORMLEN] = ""; | |
| AffixNode *snode = Conf->Suffix, | |
| *pnode; | |
| int i, | |
| j; | |
| if (wrdlen > MAXNORMLEN) | |
| return NULL; | |
| cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); | |
| *cur = NULL; | |
| /* Check that the word itself is normal form */ | |
| if (FindWord(Conf, word, 0, flag)) | |
| { | |
| *cur = pstrdup(word); | |
| cur++; | |
| *cur = NULL; | |
| } | |
| /* Find all other NORMAL forms of the 'word' (check only prefix) */ | |
| pnode = Conf->Prefix; | |
| plevel = 0; | |
| while (pnode) | |
| { | |
| prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); | |
| if (!prefix) | |
| break; | |
| for (j = 0; j < prefix->naff; j++) | |
| { | |
| if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL)) | |
| { | |
| /* prefix success */ | |
| if (FindWord(Conf, newword, prefix->aff[j]->flag, flag)) | |
| cur += addToResult(forms, cur, newword); | |
| } | |
| } | |
| pnode = prefix->node; | |
| } | |
| /* | |
| * Find all other NORMAL forms of the 'word' (check suffix and then | |
| * prefix) | |
| */ | |
| while (snode) | |
| { | |
| int baselen = 0; | |
| /* find possible suffix */ | |
| suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); | |
| if (!suffix) | |
| break; | |
| /* foreach suffix check affix */ | |
| for (i = 0; i < suffix->naff; i++) | |
| { | |
| if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen)) | |
| { | |
| /* suffix success */ | |
| if (FindWord(Conf, newword, suffix->aff[i]->flag, flag)) | |
| cur += addToResult(forms, cur, newword); | |
| /* now we will look changed word with prefixes */ | |
| pnode = Conf->Prefix; | |
| plevel = 0; | |
| swrdlen = strlen(newword); | |
| while (pnode) | |
| { | |
| prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); | |
| if (!prefix) | |
| break; | |
| for (j = 0; j < prefix->naff; j++) | |
| { | |
| if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen)) | |
| { | |
| /* prefix success */ | |
| int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ? | |
| 0 : prefix->aff[j]->flag; | |
| if (FindWord(Conf, pnewword, ff, flag)) | |
| cur += addToResult(forms, cur, pnewword); | |
| } | |
| } | |
| pnode = prefix->node; | |
| } | |
| } | |
| } | |
| snode = suffix->node; | |
| } | |
| if (cur == forms) | |
| { | |
| pfree(forms); | |
| return (NULL); | |
| } | |
| return (forms); | |
| } | |
/*
 * One way of splitting a word into stems; variants are chained through 'next'.
 */
typedef struct SplitVar SplitVar;

struct SplitVar
{
	int			nstem;			/* number of entries used in stem[] */
	int			lenstem;		/* number of entries allocated for stem[] */
	char	  **stem;			/* the stems, in word order */
	SplitVar   *next;			/* next variant, or NULL */
};
| static int | |
| CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace) | |
| { | |
| bool issuffix; | |
| /* in case CompoundAffix is null: */ | |
| if (*ptr == NULL) | |
| return -1; | |
| if (CheckInPlace) | |
| { | |
| while ((*ptr)->affix) | |
| { | |
| if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0) | |
| { | |
| len = (*ptr)->len; | |
| issuffix = (*ptr)->issuffix; | |
| (*ptr)++; | |
| return (issuffix) ? len : 0; | |
| } | |
| (*ptr)++; | |
| } | |
| } | |
| else | |
| { | |
| char *affbegin; | |
| while ((*ptr)->affix) | |
| { | |
| if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL) | |
| { | |
| len = (*ptr)->len + (affbegin - word); | |
| issuffix = (*ptr)->issuffix; | |
| (*ptr)++; | |
| return (issuffix) ? len : 0; | |
| } | |
| (*ptr)++; | |
| } | |
| } | |
| return -1; | |
| } | |
| static SplitVar * | |
| CopyVar(SplitVar *s, int makedup) | |
| { | |
| SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar)); | |
| v->next = NULL; | |
| if (s) | |
| { | |
| int i; | |
| v->lenstem = s->lenstem; | |
| v->stem = (char **) palloc(sizeof(char *) * v->lenstem); | |
| v->nstem = s->nstem; | |
| for (i = 0; i < s->nstem; i++) | |
| v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i]; | |
| } | |
| else | |
| { | |
| v->lenstem = 16; | |
| v->stem = (char **) palloc(sizeof(char *) * v->lenstem); | |
| v->nstem = 0; | |
| } | |
| return v; | |
| } | |
| static void | |
| AddStem(SplitVar *v, char *word) | |
| { | |
| if (v->nstem >= v->lenstem) | |
| { | |
| v->lenstem *= 2; | |
| v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem); | |
| } | |
| v->stem[v->nstem] = word; | |
| v->nstem++; | |
| } | |
| static SplitVar * | |
| SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos) | |
| { | |
| SplitVar *var = NULL; | |
| SPNodeData *StopLow, | |
| *StopHigh, | |
| *StopMiddle = NULL; | |
| SPNode *node = (snode) ? snode : Conf->Dictionary; | |
| int level = (snode) ? minpos : startpos; /* recursive | |
| * minpos==level */ | |
| int lenaff; | |
| CMPDAffix *caff; | |
| char *notprobed; | |
| int compoundflag = 0; | |
| notprobed = (char *) palloc(wordlen); | |
| memset(notprobed, 1, wordlen); | |
| var = CopyVar(orig, 1); | |
| while (level < wordlen) | |
| { | |
| /* find word with epenthetic or/and compound affix */ | |
| caff = Conf->CompoundAffix; | |
| while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0) | |
| { | |
| /* | |
| * there is one of compound affixes, so check word for existings | |
| */ | |
| char buf[MAXNORMLEN]; | |
| char **subres; | |
| lenaff = level - startpos + lenaff; | |
| if (!notprobed[startpos + lenaff - 1]) | |
| continue; | |
| if (level + lenaff - 1 <= minpos) | |
| continue; | |
| if (lenaff >= MAXNORMLEN) | |
| continue; /* skip too big value */ | |
| if (lenaff > 0) | |
| memcpy(buf, word + startpos, lenaff); | |
| buf[lenaff] = '\0'; | |
| if (level == 0) | |
| compoundflag = FF_COMPOUNDBEGIN; | |
| else if (level == wordlen - 1) | |
| compoundflag = FF_COMPOUNDLAST; | |
| else | |
| compoundflag = FF_COMPOUNDMIDDLE; | |
| subres = NormalizeSubWord(Conf, buf, compoundflag); | |
| if (subres) | |
| { | |
| /* Yes, it was a word from dictionary */ | |
| SplitVar *new = CopyVar(var, 0); | |
| SplitVar *ptr = var; | |
| char **sptr = subres; | |
| notprobed[startpos + lenaff - 1] = 0; | |
| while (*sptr) | |
| { | |
| AddStem(new, *sptr); | |
| sptr++; | |
| } | |
| pfree(subres); | |
| while (ptr->next) | |
| ptr = ptr->next; | |
| ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff); | |
| pfree(new->stem); | |
| pfree(new); | |
| } | |
| } | |
| if (!node) | |
| break; | |
| StopLow = node->data; | |
| StopHigh = node->data + node->length; | |
| while (StopLow < StopHigh) | |
| { | |
| StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); | |
| if (StopMiddle->val == ((uint8 *) (word))[level]) | |
| break; | |
| else if (StopMiddle->val < ((uint8 *) (word))[level]) | |
| StopLow = StopMiddle + 1; | |
| else | |
| StopHigh = StopMiddle; | |
| } | |
| if (StopLow < StopHigh) | |
| { | |
| if (level == FF_COMPOUNDBEGIN) | |
| compoundflag = FF_COMPOUNDBEGIN; | |
| else if (level == wordlen - 1) | |
| compoundflag = FF_COMPOUNDLAST; | |
| else | |
| compoundflag = FF_COMPOUNDMIDDLE; | |
| /* find infinitive */ | |
| if (StopMiddle->isword && | |
| (StopMiddle->compoundflag & compoundflag) && | |
| notprobed[level]) | |
| { | |
| /* ok, we found full compoundallowed word */ | |
| if (level > minpos) | |
| { | |
| /* and its length more than minimal */ | |
| if (wordlen == level + 1) | |
| { | |
| /* well, it was last word */ | |
| AddStem(var, pnstrdup(word + startpos, wordlen - startpos)); | |
| pfree(notprobed); | |
| return var; | |
| } | |
| else | |
| { | |
| /* then we will search more big word at the same point */ | |
| SplitVar *ptr = var; | |
| while (ptr->next) | |
| ptr = ptr->next; | |
| ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); | |
| /* we can find next word */ | |
| level++; | |
| AddStem(var, pnstrdup(word + startpos, level - startpos)); | |
| node = Conf->Dictionary; | |
| startpos = level; | |
| continue; | |
| } | |
| } | |
| } | |
| node = StopMiddle->node; | |
| } | |
| else | |
| node = NULL; | |
| level++; | |
| } | |
| AddStem(var, pnstrdup(word + startpos, wordlen - startpos)); | |
| pfree(notprobed); | |
| return var; | |
| } | |
| static void | |
| addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant) | |
| { | |
| if (*lres == NULL) | |
| *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme)); | |
| if (*lcur - *lres < MAX_NORM - 1) | |
| { | |
| (*lcur)->lexeme = word; | |
| (*lcur)->flags = flags; | |
| (*lcur)->nvariant = NVariant; | |
| (*lcur)++; | |
| (*lcur)->lexeme = NULL; | |
| } | |
| } | |
| TSLexeme * | |
| NINormalizeWord(IspellDict *Conf, char *word) | |
| { | |
| char **res; | |
| TSLexeme *lcur = NULL, | |
| *lres = NULL; | |
| uint16 NVariant = 1; | |
| res = NormalizeSubWord(Conf, word, 0); | |
| if (res) | |
| { | |
| char **ptr = res; | |
| while (*ptr && (lcur - lres) < MAX_NORM) | |
| { | |
| addNorm(&lres, &lcur, *ptr, 0, NVariant++); | |
| ptr++; | |
| } | |
| pfree(res); | |
| } | |
| if (Conf->usecompound) | |
| { | |
| int wordlen = strlen(word); | |
| SplitVar *ptr, | |
| *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1); | |
| int i; | |
| while (var) | |
| { | |
| if (var->nstem > 1) | |
| { | |
| char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST); | |
| if (subres) | |
| { | |
| char **subptr = subres; | |
| while (*subptr) | |
| { | |
| for (i = 0; i < var->nstem - 1; i++) | |
| { | |
| addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant); | |
| } | |
| addNorm(&lres, &lcur, *subptr, 0, NVariant); | |
| subptr++; | |
| NVariant++; | |
| } | |
| pfree(subres); | |
| var->stem[0] = NULL; | |
| pfree(var->stem[var->nstem - 1]); | |
| } | |
| } | |
| for (i = 0; i < var->nstem && var->stem[i]; i++) | |
| pfree(var->stem[i]); | |
| ptr = var->next; | |
| pfree(var->stem); | |
| pfree(var); | |
| var = ptr; | |
| } | |
| } | |
| return lres; | |
| } |