Skip to content

Commit

Permalink
Added benchmark based on accuracy test.
Browse files Browse the repository at this point in the history
Replaced lazy loading with eager loading. With these changes the benchmark runs 25% faster compared to the baseline.
  • Loading branch information
serega committed Apr 17, 2022
1 parent b0e39a8 commit accae7a
Show file tree
Hide file tree
Showing 4 changed files with 325 additions and 158 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ crate-type = ["cdylib", "rlib"]
name = "accuracy_reports"
required-features = ["accuracy-reports"]

[[bin]]
name = "bench"

[dependencies]
fraction = "0.10"
include_dir = "0.7"
Expand Down
212 changes: 212 additions & 0 deletions src/bin/bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
use std::collections::HashMap;
use std::time::Instant;
use include_dir::Dir;
use lingua::{Language, LanguageDetectorBuilder};
use strum::IntoEnumIterator;
use lingua_afrikaans_language_model::AFRIKAANS_TESTDATA_DIRECTORY;
use lingua_albanian_language_model::ALBANIAN_TESTDATA_DIRECTORY;
use lingua_arabic_language_model::ARABIC_TESTDATA_DIRECTORY;
use lingua_armenian_language_model::ARMENIAN_TESTDATA_DIRECTORY;
use lingua_azerbaijani_language_model::AZERBAIJANI_TESTDATA_DIRECTORY;
use lingua_basque_language_model::BASQUE_TESTDATA_DIRECTORY;
use lingua_belarusian_language_model::BELARUSIAN_TESTDATA_DIRECTORY;
use lingua_bengali_language_model::BENGALI_TESTDATA_DIRECTORY;
use lingua_bokmal_language_model::BOKMAL_TESTDATA_DIRECTORY;
use lingua_bosnian_language_model::BOSNIAN_TESTDATA_DIRECTORY;
use lingua_bulgarian_language_model::BULGARIAN_TESTDATA_DIRECTORY;
use lingua_catalan_language_model::CATALAN_TESTDATA_DIRECTORY;
use lingua_chinese_language_model::CHINESE_TESTDATA_DIRECTORY;
use lingua_croatian_language_model::CROATIAN_TESTDATA_DIRECTORY;
use lingua_czech_language_model::CZECH_TESTDATA_DIRECTORY;
use lingua_danish_language_model::DANISH_TESTDATA_DIRECTORY;
use lingua_dutch_language_model::DUTCH_TESTDATA_DIRECTORY;
use lingua_english_language_model::ENGLISH_TESTDATA_DIRECTORY;
use lingua_esperanto_language_model::ESPERANTO_TESTDATA_DIRECTORY;
use lingua_estonian_language_model::ESTONIAN_TESTDATA_DIRECTORY;
use lingua_finnish_language_model::FINNISH_TESTDATA_DIRECTORY;
use lingua_french_language_model::FRENCH_TESTDATA_DIRECTORY;
use lingua_ganda_language_model::GANDA_TESTDATA_DIRECTORY;
use lingua_georgian_language_model::GEORGIAN_TESTDATA_DIRECTORY;
use lingua_german_language_model::GERMAN_TESTDATA_DIRECTORY;
use lingua_greek_language_model::GREEK_TESTDATA_DIRECTORY;
use lingua_gujarati_language_model::GUJARATI_TESTDATA_DIRECTORY;
use lingua_hebrew_language_model::HEBREW_TESTDATA_DIRECTORY;
use lingua_hindi_language_model::HINDI_TESTDATA_DIRECTORY;
use lingua_hungarian_language_model::HUNGARIAN_TESTDATA_DIRECTORY;
use lingua_icelandic_language_model::ICELANDIC_TESTDATA_DIRECTORY;
use lingua_indonesian_language_model::INDONESIAN_TESTDATA_DIRECTORY;
use lingua_irish_language_model::IRISH_TESTDATA_DIRECTORY;
use lingua_italian_language_model::ITALIAN_TESTDATA_DIRECTORY;
use lingua_japanese_language_model::JAPANESE_TESTDATA_DIRECTORY;
use lingua_kazakh_language_model::KAZAKH_TESTDATA_DIRECTORY;
use lingua_korean_language_model::KOREAN_TESTDATA_DIRECTORY;
use lingua_latin_language_model::LATIN_TESTDATA_DIRECTORY;
use lingua_latvian_language_model::LATVIAN_TESTDATA_DIRECTORY;
use lingua_lithuanian_language_model::LITHUANIAN_TESTDATA_DIRECTORY;
use lingua_macedonian_language_model::MACEDONIAN_TESTDATA_DIRECTORY;
use lingua_malay_language_model::MALAY_TESTDATA_DIRECTORY;
use lingua_maori_language_model::MAORI_TESTDATA_DIRECTORY;
use lingua_marathi_language_model::MARATHI_TESTDATA_DIRECTORY;
use lingua_mongolian_language_model::MONGOLIAN_TESTDATA_DIRECTORY;
use lingua_nynorsk_language_model::NYNORSK_TESTDATA_DIRECTORY;
use lingua_persian_language_model::PERSIAN_TESTDATA_DIRECTORY;
use lingua_polish_language_model::POLISH_TESTDATA_DIRECTORY;
use lingua_portuguese_language_model::PORTUGUESE_TESTDATA_DIRECTORY;
use lingua_punjabi_language_model::PUNJABI_TESTDATA_DIRECTORY;
use lingua_romanian_language_model::ROMANIAN_TESTDATA_DIRECTORY;
use lingua_russian_language_model::RUSSIAN_TESTDATA_DIRECTORY;
use lingua_serbian_language_model::SERBIAN_TESTDATA_DIRECTORY;
use lingua_shona_language_model::SHONA_TESTDATA_DIRECTORY;
use lingua_slovak_language_model::SLOVAK_TESTDATA_DIRECTORY;
use lingua_slovene_language_model::SLOVENE_TESTDATA_DIRECTORY;
use lingua_somali_language_model::SOMALI_TESTDATA_DIRECTORY;
use lingua_sotho_language_model::SOTHO_TESTDATA_DIRECTORY;
use lingua_spanish_language_model::SPANISH_TESTDATA_DIRECTORY;
use lingua_swahili_language_model::SWAHILI_TESTDATA_DIRECTORY;
use lingua_swedish_language_model::SWEDISH_TESTDATA_DIRECTORY;
use lingua_tagalog_language_model::TAGALOG_TESTDATA_DIRECTORY;
use lingua_tamil_language_model::TAMIL_TESTDATA_DIRECTORY;
use lingua_telugu_language_model::TELUGU_TESTDATA_DIRECTORY;
use lingua_thai_language_model::THAI_TESTDATA_DIRECTORY;
use lingua_tsonga_language_model::TSONGA_TESTDATA_DIRECTORY;
use lingua_tswana_language_model::TSWANA_TESTDATA_DIRECTORY;
use lingua_turkish_language_model::TURKISH_TESTDATA_DIRECTORY;
use lingua_ukrainian_language_model::UKRAINIAN_TESTDATA_DIRECTORY;
use lingua_urdu_language_model::URDU_TESTDATA_DIRECTORY;
use lingua_vietnamese_language_model::VIETNAMESE_TESTDATA_DIRECTORY;
use lingua_welsh_language_model::WELSH_TESTDATA_DIRECTORY;
use lingua_xhosa_language_model::XHOSA_TESTDATA_DIRECTORY;
use lingua_yoruba_language_model::YORUBA_TESTDATA_DIRECTORY;
use lingua_zulu_language_model::ZULU_TESTDATA_DIRECTORY;


/// Reads one embedded test-data file for `language` and returns its non-blank lines.
///
/// `file_name` is looked up in the directory returned by
/// `get_test_data_directory(language)`. Lines that are empty after trimming
/// are skipped; every remaining line is returned as an owned `String` without
/// its line terminator (`\n` or `\r\n`), but otherwise untrimmed.
///
/// # Panics
///
/// Panics if the file does not exist in the embedded directory or if its
/// contents are not valid UTF-8. The panic message names the file and the
/// language so a broken test-data crate can be identified immediately.
fn get_file_content(file_name: &str, language: &Language) -> Vec<String> {
    let directory = get_test_data_directory(language);
    let contents = directory
        .get_file(file_name)
        .unwrap_or_else(|| {
            panic!(
                "test data file {:?} is missing for language {:?}",
                file_name, language
            )
        })
        .contents_utf8()
        .unwrap_or_else(|| {
            panic!(
                "test data file {:?} for language {:?} is not valid UTF-8",
                file_name, language
            )
        });

    // `lines()` splits on `\n` and also strips a trailing `\r`, so test data
    // checked out with CRLF line endings yields the same examples.
    contents
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(String::from)
        .collect()
}

fn get_test_data_directory(language: &Language) -> Dir<'static> {
match language {
&Language::Afrikaans => AFRIKAANS_TESTDATA_DIRECTORY,
&Language::Albanian => ALBANIAN_TESTDATA_DIRECTORY,
&Language::Arabic => ARABIC_TESTDATA_DIRECTORY,
&Language::Armenian => ARMENIAN_TESTDATA_DIRECTORY,
&Language::Azerbaijani => AZERBAIJANI_TESTDATA_DIRECTORY,
&Language::Basque => BASQUE_TESTDATA_DIRECTORY,
&Language::Belarusian => BELARUSIAN_TESTDATA_DIRECTORY,
&Language::Bengali => BENGALI_TESTDATA_DIRECTORY,
&Language::Bokmal => BOKMAL_TESTDATA_DIRECTORY,
&Language::Bosnian => BOSNIAN_TESTDATA_DIRECTORY,
&Language::Bulgarian => BULGARIAN_TESTDATA_DIRECTORY,
&Language::Catalan => CATALAN_TESTDATA_DIRECTORY,
&Language::Chinese => CHINESE_TESTDATA_DIRECTORY,
&Language::Croatian => CROATIAN_TESTDATA_DIRECTORY,
&Language::Czech => CZECH_TESTDATA_DIRECTORY,
&Language::Danish => DANISH_TESTDATA_DIRECTORY,
&Language::Dutch => DUTCH_TESTDATA_DIRECTORY,
&Language::English => ENGLISH_TESTDATA_DIRECTORY,
&Language::Esperanto => ESPERANTO_TESTDATA_DIRECTORY,
&Language::Estonian => ESTONIAN_TESTDATA_DIRECTORY,
&Language::Finnish => FINNISH_TESTDATA_DIRECTORY,
&Language::French => FRENCH_TESTDATA_DIRECTORY,
&Language::Ganda => GANDA_TESTDATA_DIRECTORY,
&Language::Georgian => GEORGIAN_TESTDATA_DIRECTORY,
&Language::German => GERMAN_TESTDATA_DIRECTORY,
&Language::Greek => GREEK_TESTDATA_DIRECTORY,
&Language::Gujarati => GUJARATI_TESTDATA_DIRECTORY,
&Language::Hebrew => HEBREW_TESTDATA_DIRECTORY,
&Language::Hindi => HINDI_TESTDATA_DIRECTORY,
&Language::Hungarian => HUNGARIAN_TESTDATA_DIRECTORY,
&Language::Icelandic => ICELANDIC_TESTDATA_DIRECTORY,
&Language::Indonesian => INDONESIAN_TESTDATA_DIRECTORY,
&Language::Irish => IRISH_TESTDATA_DIRECTORY,
&Language::Italian => ITALIAN_TESTDATA_DIRECTORY,
&Language::Japanese => JAPANESE_TESTDATA_DIRECTORY,
&Language::Kazakh => KAZAKH_TESTDATA_DIRECTORY,
&Language::Korean => KOREAN_TESTDATA_DIRECTORY,
&Language::Latin => LATIN_TESTDATA_DIRECTORY,
&Language::Latvian => LATVIAN_TESTDATA_DIRECTORY,
&Language::Lithuanian => LITHUANIAN_TESTDATA_DIRECTORY,
&Language::Macedonian => MACEDONIAN_TESTDATA_DIRECTORY,
&Language::Malay => MALAY_TESTDATA_DIRECTORY,
&Language::Maori => MAORI_TESTDATA_DIRECTORY,
&Language::Marathi => MARATHI_TESTDATA_DIRECTORY,
&Language::Mongolian => MONGOLIAN_TESTDATA_DIRECTORY,
&Language::Nynorsk => NYNORSK_TESTDATA_DIRECTORY,
&Language::Persian => PERSIAN_TESTDATA_DIRECTORY,
&Language::Polish => POLISH_TESTDATA_DIRECTORY,
&Language::Portuguese => PORTUGUESE_TESTDATA_DIRECTORY,
&Language::Punjabi => PUNJABI_TESTDATA_DIRECTORY,
&Language::Romanian => ROMANIAN_TESTDATA_DIRECTORY,
&Language::Russian => RUSSIAN_TESTDATA_DIRECTORY,
&Language::Serbian => SERBIAN_TESTDATA_DIRECTORY,
&Language::Shona => SHONA_TESTDATA_DIRECTORY,
&Language::Slovak => SLOVAK_TESTDATA_DIRECTORY,
&Language::Slovene => SLOVENE_TESTDATA_DIRECTORY,
&Language::Somali => SOMALI_TESTDATA_DIRECTORY,
&Language::Sotho => SOTHO_TESTDATA_DIRECTORY,
&Language::Spanish => SPANISH_TESTDATA_DIRECTORY,
&Language::Swahili => SWAHILI_TESTDATA_DIRECTORY,
&Language::Swedish => SWEDISH_TESTDATA_DIRECTORY,
&Language::Tagalog => TAGALOG_TESTDATA_DIRECTORY,
&Language::Tamil => TAMIL_TESTDATA_DIRECTORY,
&Language::Telugu => TELUGU_TESTDATA_DIRECTORY,
&Language::Thai => THAI_TESTDATA_DIRECTORY,
&Language::Tsonga => TSONGA_TESTDATA_DIRECTORY,
&Language::Tswana => TSWANA_TESTDATA_DIRECTORY,
&Language::Turkish => TURKISH_TESTDATA_DIRECTORY,
&Language::Ukrainian => UKRAINIAN_TESTDATA_DIRECTORY,
&Language::Urdu => URDU_TESTDATA_DIRECTORY,
&Language::Vietnamese => VIETNAMESE_TESTDATA_DIRECTORY,
&Language::Welsh => WELSH_TESTDATA_DIRECTORY,
&Language::Xhosa => XHOSA_TESTDATA_DIRECTORY,
&Language::Yoruba => YORUBA_TESTDATA_DIRECTORY,
&Language::Zulu => ZULU_TESTDATA_DIRECTORY,
}
}



/// Benchmark entry point: times how long the detector needs to classify every
/// embedded test example of every language.
///
/// Steps:
/// 1. Build a detector for all languages (outside the timed section).
/// 2. Collect the non-blank lines of `single-words.txt`, `word-pairs.txt` and
///    `sentences.txt` for each language into one list.
/// 3. Run `detect_language_of` on every example and count the detected
///    language; only this loop is timed.
///
/// Prints the elapsed time in milliseconds, followed by the per-language
/// detection counts (languages that were never detected are listed with 0).
fn main() {
    // NOTE(review): the `src/builder.rs` hunk of this same commit appears to
    // delete `with_preloaded_language_models` (models become eagerly loaded).
    // Confirm this call still compiles against the updated builder; if not,
    // it can simply be dropped.
    let lingua_detector = LanguageDetectorBuilder::from_all_languages()
        .with_preloaded_language_models()
        .build();

    // Loaded in this order for every language, matching the original benchmark.
    const TEST_FILES: [&str; 3] = ["single-words.txt", "word-pairs.txt", "sentences.txt"];

    let mut all_examples: Vec<String> = Vec::new();
    let mut counts: HashMap<Language, usize> = HashMap::new();
    for language in Language::iter() {
        for file_name in TEST_FILES.iter() {
            // Move the freshly built strings instead of cloning them.
            all_examples.extend(get_file_content(file_name, &language));
        }

        // Pre-seed so that languages which are never detected show up as 0.
        counts.insert(language, 0);
    }

    // Only the detection loop is measured; loading the examples is not.
    let now = Instant::now();
    for example in all_examples {
        if let Some(language) = lingua_detector.detect_language_of(example) {
            *counts.entry(language).or_insert(0) += 1;
        }
    }

    println!("{}", now.elapsed().as_millis());
    println!("{:?}", counts);
}
15 changes: 0 additions & 15 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ pub(crate) const MINIMUM_RELATIVE_DISTANCE_MESSAGE: &str =
pub struct LanguageDetectorBuilder {
// Languages handed to `LanguageDetector::from` by `build`.
languages: HashSet<Language>,
// Forwarded unchanged to the detector by `build`; initialised to 0.0.
minimum_relative_distance: f64,
// Set to `true` by `with_preloaded_language_models`; initialised to `false`
// and forwarded to the detector by `build`.
is_every_language_model_preloaded: bool,
}

impl LanguageDetectorBuilder {
Expand Down Expand Up @@ -151,33 +150,19 @@ impl LanguageDetectorBuilder {
self
}

/// Configures `LanguageDetectorBuilder` to preload all language models when creating
/// the instance of [LanguageDetector].
///
/// By default, *Lingua* uses lazy-loading to load only those language models
/// on demand which are considered relevant by the rule-based filter engine.
/// For web services, for instance, it is rather beneficial to preload all language
/// models into memory to avoid unexpected latency while waiting for the
/// service response. Calling this method switches the builder from the default
/// lazy-loading mode to preloading.
///
/// Returns `&mut Self` so that further builder calls can be chained.
pub fn with_preloaded_language_models(&mut self) -> &mut Self {
self.is_every_language_model_preloaded = true;
self
}

/// Builds a [LanguageDetector] from the settings collected so far.
///
/// The language set is cloned, so the builder itself stays usable afterwards.
pub fn build(&mut self) -> LanguageDetector {
    let selected_languages = self.languages.clone();
    let distance = self.minimum_relative_distance;
    let preload_all_models = self.is_every_language_model_preloaded;

    LanguageDetector::from(selected_languages, distance, preload_all_models)
}

/// Creates a builder for `languages` with the default settings: a minimum
/// relative distance of 0.0 and no preloading of language models.
fn from(languages: HashSet<Language>) -> Self {
    let minimum_relative_distance = 0.0;
    let is_every_language_model_preloaded = false;

    Self {
        languages,
        minimum_relative_distance,
        is_every_language_model_preloaded,
    }
}
}
Expand Down
Loading

0 comments on commit accae7a

Please sign in to comment.