Skip to content

Commit

Permalink
[Minor] Add diacritics flag for language detector
Browse files Browse the repository at this point in the history
  • Loading branch information
vstakhov committed Feb 4, 2020
1 parent 5ebcabc commit 9cf530b
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 13 deletions.
2 changes: 1 addition & 1 deletion contrib/languages-data/cs.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion contrib/languages-data/fr.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion contrib/languages-data/pl.json

Large diffs are not rendered by default.

44 changes: 35 additions & 9 deletions src/libmime/lang_detection.c
Expand Up @@ -70,13 +70,6 @@ static const gchar *tier1_langs[] = {
"pt", "ru", "pl", "tk", "th", "ar"
};

/*
 * Bit flags kept in the `flags` field of struct rspamd_language_elt.
 * The values are individual bits so they can be OR-ed together;
 * bits 1 and 2 are not assigned here.
 */
enum rspamd_language_elt_flags {
	RS_LANGUAGE_DEFAULT = 0x00, /* no flags set */
	RS_LANGUAGE_LATIN = 0x01, /* bit 0 */
	RS_LANGUAGE_TIER1 = 0x08, /* bit 3; presumably languages from tier1_langs - TODO confirm */
	RS_LANGUAGE_TIER0 = 0x10, /* bit 4 */
};

enum rspamd_language_category {
RSPAMD_LANGUAGE_LATIN = 0,
RSPAMD_LANGUAGE_CYRILLIC,
Expand All @@ -87,7 +80,7 @@ enum rspamd_language_category {

struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
enum rspamd_language_elt_flags flags;
gint flags; /* enum rspamd_language_elt_flags */
enum rspamd_language_category category;
guint trigramms_words;
guint stop_words;
Expand Down Expand Up @@ -353,7 +346,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
{
struct ucl_parser *parser;
ucl_object_t *top;
const ucl_object_t *freqs, *n_words, *cur, *type;
const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
ucl_object_iter_t it = NULL;
UErrorCode uc_err = U_ZERO_ERROR;
struct rspamd_language_elt *nelt;
Expand Down Expand Up @@ -440,6 +433,29 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
}
}

/*
 * Optional per-language flags, read from the "flags" key of the language
 * file. Only an array value is processed; each element is expected to be
 * a string naming one flag.
 */
flags = ucl_object_lookup (top, "flags");

/* Validate the looked-up `flags` object itself, not the unrelated `type` */
if (flags != NULL && ucl_object_type (flags) == UCL_ARRAY) {
	/* Distinct names: do not shadow the function-level `it` and `cur` */
	ucl_object_iter_t flags_it = NULL;
	const ucl_object_t *flag_obj;

	while ((flag_obj = ucl_object_iterate (flags, &flags_it, true)) != NULL) {
		/* ucl_object_tostring yields NULL for a non-string element */
		const gchar *fl = ucl_object_tostring (flag_obj);

		if (fl != NULL) {
			if (strcmp (fl, "diacritics") == 0) {
				nelt->flags |= RS_LANGUAGE_DIACRITICS;
			}
			else {
				msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
			}
		}
		else {
			/* Never hand a NULL string to strcmp: just report and go on */
			msg_debug_config ("unknown flags type of language %s", nelt->name);
		}
	}
}

if (stop_words) {
const ucl_object_t *specific_stop_words;

Expand Down Expand Up @@ -1902,4 +1918,14 @@ rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
}

return FALSE;
}

gint
rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt)
{
if (elt) {
return elt->flags;
}

return 0;
}
15 changes: 14 additions & 1 deletion src/libmime/lang_detection.h
Expand Up @@ -50,6 +50,14 @@ enum rspamd_unicode_scripts {
RSPAMD_UNICODE_HANGUL = (1 << 16),
};

/*
 * Bit flags describing a language element; the value returned by
 * rspamd_language_detector_elt_flags() is a combination of these.
 * Bits 1 and 2 are not assigned here.
 */
enum rspamd_language_elt_flags {
	RS_LANGUAGE_DEFAULT = 0x00, /* no flags set */
	RS_LANGUAGE_LATIN = 0x01, /* bit 0 */
	RS_LANGUAGE_TIER1 = 0x08, /* bit 3 */
	RS_LANGUAGE_TIER0 = 0x10, /* bit 4 */
	RS_LANGUAGE_DIACRITICS = 0x20, /* bit 5; set for the "diacritics" entry of a language file's flags */
};

struct rspamd_lang_detector_res {
gdouble prob;
const gchar *lang;
Expand Down Expand Up @@ -88,7 +96,12 @@ gboolean rspamd_language_detector_detect (struct rspamd_task *task,
gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
const gchar *word, gsize wlen);


/**
* Return language flags for a specific language elt
* @param elt language element to query; may be NULL
* @return the element's flags (a combination of enum rspamd_language_elt_flags
* values), or 0 if elt is NULL
*/
gint rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt);
#ifdef __cplusplus
}
#endif
Expand Down

0 comments on commit 9cf530b

Please sign in to comment.