-
-
Notifications
You must be signed in to change notification settings - Fork 373
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Project] Add preliminary language detector based on fasttext library
- Loading branch information
Showing
5 changed files
with
243 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
/*- | ||
* Copyright 2023 Vsevolod Stakhov | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "lang_detection_fasttext.h" | ||
|
||
#ifdef WITH_FASTTEXT | ||
#include "fasttext/fasttext.h" | ||
#include "libserver/cfg_file.h" | ||
#include "libserver/logger.h" | ||
#include "fmt/core.h" | ||
#include <exception> | ||
#include <string> | ||
#include <vector> | ||
#include <sstream> | ||
#include <streambuf> | ||
#endif | ||
|
||
#ifdef WITH_FASTTEXT | ||
namespace rspamd::langdet { | ||
class fasttext_langdet { | ||
private: | ||
fasttext::FastText ft; | ||
bool loaded; | ||
|
||
struct one_shot_buf : public std::streambuf { | ||
explicit one_shot_buf(const char *in, std::size_t sz) { | ||
auto deconst_in = const_cast<char *>(in); | ||
setg(deconst_in, deconst_in, deconst_in + sz); | ||
} | ||
}; | ||
public: | ||
explicit fasttext_langdet(struct rspamd_config *cfg) { | ||
const auto *ucl_obj = cfg->rcl_obj; | ||
const auto *opts_section = ucl_object_find_key(ucl_obj, "options"); | ||
|
||
if (opts_section) { | ||
const auto *model = ucl_object_find_key(opts_section, "fasttext_langdet_model"); | ||
|
||
if (model) { | ||
try { | ||
ft.loadModel(ucl_object_tostring(model)); | ||
loaded = true; | ||
} | ||
catch (std::exception &e) { | ||
auto err_message = fmt::format("cannot load fasttext model: {}", e.what()); | ||
msg_err_config("%s", err_message.c_str()); | ||
loaded = false; | ||
} | ||
} | ||
} | ||
} | ||
|
||
/* Disallow multiple initialisation */ | ||
fasttext_langdet() = delete; | ||
fasttext_langdet(const fasttext_langdet &) = delete; | ||
fasttext_langdet(fasttext_langdet &&) = delete; | ||
|
||
~fasttext_langdet() = default; | ||
|
||
|
||
auto detect_language(const char *in, size_t len, int k) -> std::vector<std::pair<fasttext::real, std::string>> * | ||
{ | ||
if (!loaded) { | ||
return nullptr; | ||
} | ||
|
||
/* Hack to deal with streams without copies */ | ||
one_shot_buf buf{in, len}; | ||
auto stream = std::istream{&buf}; | ||
auto predictions = new std::vector<std::pair<fasttext::real, std::string>>; | ||
predictions->reserve(k); | ||
auto res = ft.predictLine(stream, *predictions, k, 0.0f); | ||
|
||
if (res) { | ||
return predictions; | ||
} | ||
else { | ||
delete predictions; | ||
} | ||
|
||
return nullptr; | ||
} | ||
}; | ||
} | ||
#endif | ||
|
||
/* C API part */ | ||
G_BEGIN_DECLS | ||
|
||
/* Recover the C++ detector object from the opaque `void *` handle used by the C API */
#define FASTTEXT_MODEL_TO_C_API(p) static_cast<rspamd::langdet::fasttext_langdet *>(p)
/* Recover the predictions vector from the opaque `void *` result handle used by the C API */
#define FASTTEXT_RESULT_TO_C_API(res) static_cast<std::vector<std::pair<fasttext::real, std::string>> *>(res)
|
||
/*
 * Create a fasttext language detector for the given configuration.
 * Returns the detector as an opaque pointer, or NULL when the code is built
 * without WITH_FASTTEXT.
 */
void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg)
{
#ifdef WITH_FASTTEXT
	auto *detector = new rspamd::langdet::fasttext_langdet(cfg);

	return static_cast<void *>(detector);
#else
	return nullptr;
#endif
}
|
||
/*
 * Run language detection on a text buffer.
 *
 * ud is the opaque detector pointer, in/len describe the input text and k is
 * the number of predictions requested. Returns an opaque result handle, or
 * NULL when built without WITH_FASTTEXT, when ud is NULL, or when the
 * detector produces no result.
 */
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
	const char *in, size_t len, int k)
{
#ifndef WITH_FASTTEXT
	return nullptr;
#else
	auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);

	if (real_model == nullptr) {
		/* Tolerate a missing detector, like the result accessors tolerate NULL */
		return nullptr;
	}

	auto *res = real_model->detect_language(in, len, k);

	return static_cast<rspamd_fasttext_predict_result_t>(res);
#endif
}
|
||
/*
 * Destroy a detector given its opaque pointer.
 * Does nothing when built without WITH_FASTTEXT.
 */
void rspamd_lang_detection_fasttext_destroy(void *ud)
{
#ifdef WITH_FASTTEXT
	auto *detector = FASTTEXT_MODEL_TO_C_API(ud);

	delete detector;
#endif
}
|
||
const char * | ||
rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res) | ||
{ | ||
#ifdef WITH_FASTTEXT | ||
auto *real_res = FASTTEXT_RESULT_TO_C_API(res); | ||
|
||
if (real_res && !real_res->empty()) { | ||
return real_res->front().second.c_str(); | ||
} | ||
#endif | ||
return nullptr; | ||
} | ||
|
||
float | ||
rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res) | ||
{ | ||
#ifdef WITH_FASTTEXT | ||
auto *real_res = FASTTEXT_RESULT_TO_C_API(res); | ||
|
||
if (real_res && !real_res->empty()) { | ||
return real_res->front().first; | ||
} | ||
#endif | ||
return 0.0f; | ||
} | ||
|
||
/*
 * Release a prediction result.
 * Does nothing when built without WITH_FASTTEXT.
 */
void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res)
{
#ifdef WITH_FASTTEXT
	/* Deleting a null pointer is a no-op, so no explicit check is required */
	delete FASTTEXT_RESULT_TO_C_API(res);
#endif
}
|
||
G_END_DECLS |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/*- | ||
* Copyright 2023 Vsevolod Stakhov | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#ifndef RSPAMD_LANG_DETECTION_FASTTEXT_H | ||
#define RSPAMD_LANG_DETECTION_FASTTEXT_H | ||
|
||
#include "config.h" | ||
|
||
G_BEGIN_DECLS | ||
struct rspamd_config; | ||
/** | ||
* Initialize fasttext language detector | ||
* @param cfg configuration; the model path is looked up under the | ||
*            `fasttext_langdet_model` key of its `options` section | ||
* @return opaque pointer to the detector (pass it to the detect and destroy | ||
*         functions), or NULL when built without fasttext support | ||
*/ | ||
void* rspamd_lang_detection_fasttext_init(struct rspamd_config *cfg); | ||
|
||
|
||
typedef void * rspamd_fasttext_predict_result_t; | ||
/** | ||
* Detect language using fasttext | ||
* @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init | ||
* @param in input text | ||
* @param len length of input text | ||
* @param k number of results to return | ||
* @return opaque prediction result, or NULL when built without fasttext | ||
*         support, when no model is loaded, or when prediction fails; | ||
*         release it with rspamd_fasttext_predict_result_destroy | ||
*/ | ||
rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud, | ||
const char *in, size_t len, int k); | ||
|
||
/** | ||
* Get language from fasttext result | ||
* @param res result returned by rspamd_lang_detection_fasttext_detect (may be NULL) | ||
* @return label of the first prediction, or NULL if the result is NULL or | ||
*         empty; the string is stored inside `res`, so it should not be used | ||
*         after the result is destroyed | ||
*/ | ||
const char *rspamd_lang_detection_fasttext_get_lang(rspamd_fasttext_predict_result_t res); | ||
|
||
/** | ||
* Get probability from fasttext result | ||
* @param res result returned by rspamd_lang_detection_fasttext_detect (may be NULL) | ||
* @return value associated with the first prediction, or 0.0 if the result | ||
*         is NULL or empty | ||
*/ | ||
float rspamd_lang_detection_fasttext_get_prob(rspamd_fasttext_predict_result_t res); | ||
|
||
/** | ||
* Destroy fasttext result | ||
* @param res result returned by rspamd_lang_detection_fasttext_detect; | ||
*            NULL is accepted and ignored | ||
*/ | ||
void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res); | ||
|
||
/** | ||
* Destroy fasttext language detector | ||
* @param ud opaque pointer returned by rspamd_lang_detection_fasttext_init; | ||
*           NULL is accepted and ignored | ||
*/ | ||
void rspamd_lang_detection_fasttext_destroy(void *ud); | ||
|
||
|
||
G_END_DECLS | ||
#endif /* RSPAMD_LANG_DETECTION_FASTTEXT_H */ |