From 50eb05e54673e9aac50f83147c04d98b58318835 Mon Sep 17 00:00:00 2001
From: pchampio
Date: Tue, 12 Mar 2024 19:02:41 +0100
Subject: [PATCH] WIP french dataset LM done fix qa fix 2 readme

---
 .../ASR/stm_prepare.py                        |   1 +
 recipes/ESTER+EPAC+ETAPE+REPERE/LM/README.md  |  68 +++
 .../LM/hparams/train_ngram.yaml               |  28 +
 .../ESTER+EPAC+ETAPE+REPERE/LM/stm_prepare.py |   1 +
 .../ESTER+EPAC+ETAPE+REPERE/LM/train_ngram.py | 152 +++++
 .../ESTER+EPAC+ETAPE+REPERE/stm_prepare.py    | 543 ++++++++++++++++++
 6 files changed, 793 insertions(+)
 create mode 120000 recipes/ESTER+EPAC+ETAPE+REPERE/ASR/stm_prepare.py
 create mode 100644 recipes/ESTER+EPAC+ETAPE+REPERE/LM/README.md
 create mode 100644 recipes/ESTER+EPAC+ETAPE+REPERE/LM/hparams/train_ngram.yaml
 create mode 120000 recipes/ESTER+EPAC+ETAPE+REPERE/LM/stm_prepare.py
 create mode 100644 recipes/ESTER+EPAC+ETAPE+REPERE/LM/train_ngram.py
 create mode 100644 recipes/ESTER+EPAC+ETAPE+REPERE/stm_prepare.py

diff --git a/recipes/ESTER+EPAC+ETAPE+REPERE/ASR/stm_prepare.py b/recipes/ESTER+EPAC+ETAPE+REPERE/ASR/stm_prepare.py
new file mode 120000
index 00000000000..b001e66466c
--- /dev/null
+++ b/recipes/ESTER+EPAC+ETAPE+REPERE/ASR/stm_prepare.py
@@ -0,0 +1 @@
+../stm_prepare.py
\ No newline at end of file
diff --git a/recipes/ESTER+EPAC+ETAPE+REPERE/LM/README.md b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/README.md
new file mode 100644
index 00000000000..6fcb328d717
--- /dev/null
+++ b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/README.md
@@ -0,0 +1,68 @@
+# Language Model with ESTER+EPAC+ETAPE+REPERE
+This folder contains recipes for training language models on the above French datasets.
+It currently supports n-gram LMs.
+Depending on the ASR token type ("phone" or "char"), an apostrophe (') is
+either kept inside the word or followed by a space (splitting the word in two).
+Example:
+"C'est" -> modeled as "sɛ" (token type "phone") -> mapped to "C'EST"
+"C'est" -> modeled as 2 words "C'" and "EST" (token type "char") -> transcribed to "C' EST"
+
+## Installing Extra Dependencies
+
+To train an n-gram LM, this recipe uses the popular KenLM library. Let's start by installing the Ubuntu library prerequisites. For a complete guide on how to install the required dependencies, please refer to [this](https://kheafield.com/code/kenlm/dependencies/) link:
+```
+sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
+```
+
+Next, download and unpack the KenLM repo:
+```
+wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
+```
+
+KenLM is written in C++, so we'll use cmake to build the binaries:
+```
+mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
+```
+
+Now, make sure that the KenLM executables are added to your PATH. To do so:
+- Open the ~/.bashrc file in a text editor.
+- Scroll to the end of the file and add the following line: ```export PATH=$PATH:/your/path/to/kenlm/build/bin```
+- Save it and type: `source ~/.bashrc`
+
+# How to run:
+```shell
+python train_ngram.py hparams/train_ngram.yaml --data_folder=your/data/folder
+# your/data/folder should point to the directory containing one or all of the following directories: EPAC ESTER1 ESTER2 ETAPE REPERE
+## ls your/data/folder -> EPAC ESTER1 ESTER2 ETAPE REPERE
+## ls your/data/folder/ETAPE -> dev test tools train
+```
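+
+After you have run the KenLM command printed by the script, you can (optionally) sanity-check the resulting ARPA file with the KenLM Python bindings (`pip install kenlm`). The snippet below is only an illustrative sketch; the model path follows the naming pattern of the files listed in the table below.
+```python
+import kenlm  # pip install kenlm
+
+# Illustrative path: <output_folder>/<arpa_order>-for-<for_token_type>-gram.arpa
+model = kenlm.Model("results/n_gram_lm/4-for-char-gram.arpa")
+
+# Total log10 probability and perplexity of a sentence ("char" token type,
+# hence the space after the apostrophe).
+print(model.score("c' est une bonne idée", bos=True, eos=True))
+print(model.perplexity("c' est une bonne idée"))
+```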
+| Release | hyperparams file | Test PP | Model link | GPUs |
+| :--- | :---: | :---: | :---: | :---: |
+| 24-03-12 | 3-for-char-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
+| 24-03-12 | 4-for-char-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
+| 24-03-12 | 3-for-phone-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
+| 24-03-12 | 4-for-phone-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
+
+
+# **About SpeechBrain**
+- Website: https://speechbrain.github.io/
+- Code: https://github.com/speechbrain/speechbrain/
+- HuggingFace: https://huggingface.co/speechbrain/
+
+
+# **Citing SpeechBrain**
+Please cite SpeechBrain if you use it for your research or business.
+
+```bibtex
+@misc{speechbrain,
+  title={{SpeechBrain}: A General-Purpose Speech Toolkit},
+  author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
+  year={2021},
+  eprint={2106.04624},
+  archivePrefix={arXiv},
+  primaryClass={eess.AS},
+  note={arXiv:2106.04624}
+}
+```
+
diff --git a/recipes/ESTER+EPAC+ETAPE+REPERE/LM/hparams/train_ngram.yaml b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/hparams/train_ngram.yaml
new file mode 100644
index 00000000000..951c8567f42
--- /dev/null
+++ b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/hparams/train_ngram.yaml
@@ -0,0 +1,28 @@
+#########
+# Recipe for training a KenLM n-gram LM on STM formatted data.
+#
+# Author:
+# - Pierre Champion 2024
+################################
+output_folder: !ref results/n_gram_lm/
+# for_token_type: phone
+for_token_type: char # new_word_on_apostrophe
+# Data files
+data_folder: !PLACEHOLDER # e.g., /path/to/corpus (**/*.stm)
+stm_directory: !ref <data_folder>/**/[^\.ne_e2\.|\.ne\.|\.spk\.|part\.]*.stm
+wav_directory: !ref <data_folder>/**/*.wav
+train_splits: {"train_ESTER2":["/ESTER2/train_trans_rapide/*", "/ESTER2/train/*"], "train_ESTER1":["/ESTER1/train/*"], "train_EPAC":["/EPAC/train/*"], "train_ETAPE":["/ETAPE/train/*"], "train_REPERE":["/REPERE/train/*"]}
+dev_splits: []
+test_splits: []
+merge_train_csv: "train_ESTER2, train_ESTER1, train_EPAC, train_ETAPE, train_REPERE"
+train_csv: !ref <output_folder>/train.csv
+lang_dir: !ref <output_folder>/lang
+vocab_file: !ref <output_folder>/vocab.txt
+add_word_boundary: True
+sil_prob: 0.
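+# skip_prep skips the STM data preparation entirely; caching controls whether
+# the k2 lang-dir preparation reuses previously generated files.
+# arpa_order and prune_level are only used to build the suggested KenLM
+# command (lmplz -o <order> --prune <levels>); see
+# https://kheafield.com/code/kenlm/estimation/ for the pruning semantics.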
+caching: False
+skip_prep: False
+arpa_order: 4
+prune_level: [0, 1, 2]
+output_arpa: !ref <output_folder>/<arpa_order>-for-<for_token_type>-gram.arpa
diff --git a/recipes/ESTER+EPAC+ETAPE+REPERE/LM/stm_prepare.py b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/stm_prepare.py
new file mode 120000
index 00000000000..b001e66466c
--- /dev/null
+++ b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/stm_prepare.py
@@ -0,0 +1 @@
+../stm_prepare.py
\ No newline at end of file
diff --git a/recipes/ESTER+EPAC+ETAPE+REPERE/LM/train_ngram.py b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/train_ngram.py
new file mode 100644
index 00000000000..63052fa3465
--- /dev/null
+++ b/recipes/ESTER+EPAC+ETAPE+REPERE/LM/train_ngram.py
@@ -0,0 +1,152 @@
+"""
+Recipe to train a KenLM n-gram language model.
+
+To run this recipe, do the following:
+> python train_ngram.py hparams/train_ngram.yaml --data_folder=/path/to/corpus (**/*.stm)
+
+Authors
+ * Adel Moumen 2024
+ * Pierre Champion 2023
+"""
+
+import os
+import sys
+import logging
+import speechbrain as sb
+from speechbrain.utils.distributed import run_on_main
+from hyperpyyaml import load_hyperpyyaml
+import speechbrain.k2_integration as sbk2
+from speechbrain.utils.data_utils import get_list_from_csv
+
+logger = logging.getLogger(__name__)
+
+
+def dataprep_lm_training(
+    lm_dir,
+    output_arpa,
+    csv_files,
+    external_lm_corpus,
+    vocab_file,
+    arpa_order=3,
+    prune_level=[0, 1, 2],
+):
+    """Prepare the LM text corpus for LM training with KenLM (https://github.com/kpu/kenlm).
+    Writes the corpus file, displays the KenLM command the user has to run
+    manually to build the ARPA LM, then exits.
+    Instructions on how to compile KenLM (the lmplz binary) are available at the
+    above link.
+
+    Arguments
+    ---------
+    lm_dir : str
+        Path where the txt corpus is stored.
+    output_arpa : str
+        File in which to write the arpa LM.
+    csv_files : List[str]
+        CSV files used to build the LM txt corpus.
+    external_lm_corpus : List[str]
+        (Big) text dataset corpus.
+    vocab_file : str
+        N-grams that contain vocabulary items not in this file will be pruned.
+    arpa_order : int
+        Order of the arpa LM.
+    prune_level : List[int]
+        The numbers must be non-decreasing and the last number will be extended to any higher order.
+        For example, --prune 0 disables pruning (the default) while --prune 0 0 1 prunes singletons for orders three and higher.
+        Please refer to https://kheafield.com/code/kenlm/estimation/ for more details.
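+
+    Example
+    -------
+    A minimal sketch of the intended usage; the paths are illustrative and
+    mirror the defaults of hparams/train_ngram.yaml:
+
+    >>> dataprep_lm_training(  # doctest: +SKIP
+    ...     lm_dir="results/n_gram_lm",
+    ...     output_arpa="results/n_gram_lm/4-for-char-gram.arpa",
+    ...     csv_files=["results/n_gram_lm/train.csv"],
+    ...     external_lm_corpus=[],
+    ...     vocab_file="results/n_gram_lm/lang/words.txt",
+    ...     arpa_order=4,
+    ...     prune_level=[0, 1, 2],
+    ... )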
+    """
+    column_text_key = "wrd"  # defined in stm_prepare.py
+    lm_corpus = os.path.join(lm_dir, "lm_corpus.txt")
+    line_seen = set()
+    with open(lm_corpus, "w") as corpus:
+        for file in csv_files:
+            for line in get_list_from_csv(file, column_text_key):
+                corpus.write(line + "\n")
+                line_seen.add(line + "\n")
+        for file in external_lm_corpus:
+            with open(file) as f:
+                for line in f:
+                    if line not in line_seen:
+                        corpus.write(line)
+    prune_level = " ".join(map(str, prune_level))
+    cmd = f"lmplz -o {arpa_order} --prune {prune_level} --limit_vocab_file {vocab_file} < {lm_corpus} | sed '1,20s/<unk>/<UNK>/1' > {output_arpa}"
+    logger.critical(
+        f"RUN the following kenlm command to build a {arpa_order}-gram arpa LM (https://github.com/kpu/kenlm):"
+    )
+    logger.critical(f"$ {cmd}")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    # Load hyperparameters file with command-line overrides
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, overrides)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    # Dataset prep
+    import stm_prepare
+
+    # multi-gpu (ddp) save data preparation
+    run_on_main(
+        stm_prepare.prepare_stm,
+        kwargs={
+            "stm_directory": hparams["stm_directory"],
+            "wav_directory": hparams["wav_directory"],
+            "tr_splits": hparams["train_splits"],
+            "dev_splits": hparams["dev_splits"],
+            "te_splits": hparams["test_splits"],
+            "save_folder": hparams["output_folder"],
+            "merge_train_csv": hparams["merge_train_csv"].split(","),
+            "train_csv": hparams["train_csv"],
+            "skip_prep": hparams["skip_prep"],
+            "new_word_on_apostrophe": hparams["for_token_type"] in ["char"],
+        },
+    )
+
+    # Create the lexicon.txt for k2
+    run_on_main(
+        sbk2.lexicon.prepare_char_lexicon,
+        kwargs={
+            "lang_dir": hparams["lang_dir"],
+            "vocab_files": [hparams["vocab_file"]],
+            "extra_csv_files": [hparams["output_folder"] + "/train.csv"]
+            if not hparams["skip_prep"]
+            else [],
+            "add_word_boundary": hparams["add_word_boundary"],
+        },
+    )
+
+    caching = (
+        {"cache": False}
+        if "caching" in hparams and hparams["caching"] is False
+        else {}
+    )
+
+    # Create the lang directory for k2
+    run_on_main(
+        sbk2.prepare_lang.prepare_lang,
+        kwargs={
+            "lang_dir": hparams["lang_dir"],
+            "sil_prob": hparams["sil_prob"],
+            **caching,
+        },
+    )
+
+    dataprep_lm_training(
+        lm_dir=hparams["output_folder"],
+        output_arpa=hparams["output_arpa"],
+        csv_files=[hparams["train_csv"]],
+        external_lm_corpus=[],
+        vocab_file=os.path.join(hparams["lang_dir"], "words.txt"),
+        arpa_order=hparams["arpa_order"],
+        prune_level=hparams["prune_level"],
+    )
diff --git a/recipes/ESTER+EPAC+ETAPE+REPERE/stm_prepare.py b/recipes/ESTER+EPAC+ETAPE+REPERE/stm_prepare.py
new file mode 100644
index 00000000000..64011eadc91
--- /dev/null
+++ b/recipes/ESTER+EPAC+ETAPE+REPERE/stm_prepare.py
@@ -0,0 +1,543 @@
+"""
+Data preparation.
+Assumes that the transcriptions (.stm) were generated by a Transcriber export.
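+
+The data folder passed to prepare_stm is expected to contain one or more of
+the corpus sub-directories EPAC, ESTER1, ESTER2, ETAPE and REPERE (each with
+its own train/dev/test material); the matching .wav files can live anywhere
+below the same folder. See the LM and ASR READMEs for the exact layout.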
+
+Download (paid) ESTER1: https://catalogue.elra.info/en-us/repository/browse/ELRA-S0241/
+Download (paid) ESTER2: https://catalogue.elra.info/en-us/repository/browse/ELRA-S0338/
+Download (paid) ETAPE: https://catalogue.elra.info/en-us/repository/browse/ELRA-E0046/
+Download (paid) EPAC: https://catalogue.elra.info/en-us/repository/browse/ELRA-S0305/
+Download (paid) REPERE: https://catalogue.elra.info/en-us/repository/browse/ELRA-E0044/
+
+Author
+------
+Pierre Champion
+"""
+
+import logging
+import re
+import glob
+import os
+import csv
+import sys
+
+from num2words import num2words
+import ftfy
+from tqdm import tqdm
+
+from speechbrain.dataio.dataio import (
+    load_pkl,
+    save_pkl,
+)
+from speechbrain.utils.data_utils import get_list_from_csv
+
+import soundfile
+import string
+
+delset = string.punctuation
+delset = delset.replace("'", "")
+delset = delset.replace("%", "")
+delset = delset.replace(",", "")
+
+logger = logging.getLogger(__name__)
+OPT_FILE = "opt_stm_prepare.pkl"
+SAMPLERATE = 16000
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+def prepare_stm(  # noqa
+    stm_directory,
+    wav_directory,
+    tr_splits,
+    dev_splits,
+    te_splits,
+    save_folder,
+    merge_train_csv,
+    train_csv,
+    skip_prep=False,
+    ignore_wav=False,
+    new_word_on_apostrophe=True,
+):
+    """
+    This function prepares the csv files for STM-like datasets.
+
+    Arguments
+    ---------
+    stm_directory : str
+        Path to the folder where the original .stm files are stored (glob compatible)
+    wav_directory : str
+        Path to the folder where the original .wav files are stored (glob compatible)
+    tr_splits : Union[list, Dict]
+        List of train splits (regex from path) to prepare, e.g. ["/train*", "other_data"],
+        or Dict mapping split names to lists of regexes, e.g. {"train_ETAPE":["/ETAPE/train/*"], "train_ESTER2":["/ESTER2/train/*"], "train_ESTER1":["/ESTER1/train/*"]}
+    dev_splits : Union[list, Dict]
+        List of dev splits (regex from path) to prepare, e.g. ["*/dev/*"],
+        or Dict mapping split names to lists of regexes, e.g. {"dev_ESTER2":["/ESTER2/dev/*"], "dev_ESTER1":["/ESTER1/dev/*"], "dev_ETAPE":["/ETAPE/dev/*"], "dev_REPERE2014":["/REPERE/dev2014/*"]}
+    te_splits : Union[list, Dict]
+        List of test splits (regex from path) to prepare, e.g. ["*/test/*"] -> creates one merged test dataset,
+        or Dict mapping split names to lists of regexes, e.g. {"test_ETAPE":["/ETAPE/test/*"], "test_ESTER2":["/ESTER2/test/*"], "test_ESTER1":["/ESTER1/test/*"]}
+    save_folder : str
+        The directory where to store the csv files.
+    merge_train_csv : List[str]
+        List of tr_splits to concatenate in train_csv
+    train_csv : str
+        Where to store the final train csv built from the splits defined in merge_train_csv
+    skip_prep : bool
+        If True, data preparation is skipped.
+    ignore_wav : bool
+        If True, only the text is prepared (no wav lookup); dummy timings and
+        /dev/null paths are written instead.
+    new_word_on_apostrophe : bool
+        If True, a space is inserted after apostrophes, splitting words in two
+        (used for the "char" token type).
+
+    Example
+    -------
+    >>> prepare_stm(
+    ... "/corpus/**/[^\.ne_e2\.|\.ne\.|\.spk\.|part\.]*.stm", # noqa
+    ... "/corpus/**/*.wav",
+    ... [r"/train/", r"/train_trans_rapide/"],
+    ... [r"/dev/"],
+    ... [r"/test/"],
+    ... "./data_prep_out",
+    ... ["train"],
+    ... "./data_prep_out/train.csv",
+    ...)
+
+    >>> prepare_stm(
+    ... "/corpus/ESTER[1-2]/**/[^\.ne_e2\.|\.ne\.|\.spk\.]*.stm", # noqa
+    ... "/corpus/ESTER[1-2]/**/*.wav",
+    ... [r"/train/", r"/train_trans_rapide/"],
+    ... [r"/dev/"],
+    ... [r"/test/"],
+    ... "./data_prep_out",
+    ... ["train"],
+    ... "./data_prep_out/train.csv",
+    ...)
+
+    >>> prepare_stm(
+    ... "/corpus/ESTER2/**/[^\.ne_e2\.|\.ne\.|\.spk\.]*.stm", # noqa
+    ... "/corpus/ESTER2/**/*.wav",
+    ... [r"/train/", r"/train_trans_rapide/"],
+    ... [r"/dev/"],
+    ... [r"/test/"],
+    ... "./data_prep_out",
+    ... ["train"],
+    ... "./data_prep_out/train.csv",
+    ...)
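+
+    Splits can also be passed as dicts mapping a split name to a list of path
+    regexes, producing one csv per named split (the values below are
+    illustrative and follow hparams/train_ngram.yaml):
+
+    >>> prepare_stm(
+    ... "/corpus/**/[^\.ne_e2\.|\.ne\.|\.spk\.|part\.]*.stm", # noqa
+    ... "/corpus/**/*.wav",
+    ... {"train_ESTER2": ["/ESTER2/train/*"], "train_ETAPE": ["/ETAPE/train/*"]},
+    ... {"dev_ETAPE": ["/ETAPE/dev/*"]},
+    ... {"test_ETAPE": ["/ETAPE/test/*"]},
+    ... "./data_prep_out",
+    ... ["train_ESTER2", "train_ETAPE"],
+    ... "./data_prep_out/train.csv",
+    ...)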
+
+    """
+
+    if skip_prep:
+        return
+
+    os.makedirs(save_folder, exist_ok=True)
+
+    conf = locals().copy()
+    save_opt = os.path.join(save_folder, OPT_FILE)
+    # Check if this phase is already done (if so, skip it)
+    if skip(save_folder, conf, save_opt):
+        logger.info("Skipping preparation, completed in previous run.")
+        return
+    else:
+        logger.info("Data preparation...")
+
+    stm_paths, stm_exclude_match = custom_glob_filter(stm_directory)
+    pbar = tqdm(stm_paths, bar_format="{desc} {percentage:3.0f}%")
+    i = 0
+
+    if not ignore_wav:
+        wav_paths, wav_exclude_match = custom_glob_filter(wav_directory)
+        wav_paths_map = {
+            normalize_wav_key(os.path.basename(wav_p)): wav_p
+            for wav_p in wav_paths
+        }
+
+    split_info = []
+    for split, default_name in zip(
+        [tr_splits, dev_splits, te_splits], ["train", "dev", "test"]
+    ):
+        if isinstance(split, list):
+            split_info.append((split, default_name, []))
+        if isinstance(split, dict):
+            for te_splits_name, te_splits_paths in split.items():
+                split_info.append((te_splits_paths, te_splits_name, []))
+
+    train_vocab = set()
+    train_transcript_words = []
+
+    for filename in pbar:
+        if (
+            stm_exclude_match is not None
+        ):  # Exclude all paths with the specified string
+            if re.search(stm_exclude_match, filename):
+                logger.debug(
+                    f"Skipping {filename}, as it is in the exclude match"
+                )
+                continue
+
+        split = "n"
+        info = None
+        for sp, id, _info in split_info:
+            for tr in sp:
+                if re.search(tr, filename):
+                    split = id
+                    info = _info
+
+        if split == "n":
+            logger.debug(f"Skipping {filename}, not associated to any split")
+            continue
+
+        d = (filename).ljust(80, " ")
+        i += 1
+        pbar.set_description(
+            f"Len: {str(i).rjust(4)} : Processing '{split}' : {d}"
+        )
+        with open(filename, "r") as file:
+            data = file.readlines()
+        for line in transform_lines(filterout_lines(data)):
+            parts = line.split()
+            wav_key = normalize_wav_key(parts[0])
+            if not ignore_wav and wav_key not in wav_paths_map:
+                logger.critical(
+                    f"Did not find wav '{wav_key}' for stm: '{filename}'"
+                )
+                break
+            else:
+                text = text_transform(
+                    f"{' '.join(parts[6:])}",
+                    new_word_on_apostrophe=new_word_on_apostrophe,
+                )
+
+                # No transcription; might be only a laughter ("rire") or jingle annotation
+                if text == "":
+                    continue
+
+                if not ignore_wav:
+                    # Skip segments that extend beyond the wav file (the wav may be incomplete)
+                    audio_info = soundfile.info(wav_paths_map[wav_key])
+                    startTime = float(parts[3])
+                    endTime = float(parts[4])
+                    wav_path = wav_paths_map[wav_key]
+                    if (
+                        startTime > audio_info.duration
+                        or int(endTime) > audio_info.duration
+                    ):
+                        logger.critical(
+                            f"Skipping: segment startTime or endTime ({startTime},{endTime}) extends beyond the wav file duration ({audio_info.duration})"
+                        )
+                        continue
+                else:
+                    # Text only
+                    startTime = 0.0
+                    endTime = 1.0
+                    wav_path = "/dev/null"
+
+                if split == "train":
+                    train_transcript_words.append(text)
+                    for word in set(text.split(" ")):
+                        train_vocab.add(word)
+
+                info.append(
+                    {
+                        "ID": f"{parts[0]}-{parts[3].replace('.', '')[:-1].zfill(7)}-{parts[4].replace('.', '')[:-1].zfill(7)}",
+                        "wrd": text,
+                        "spk": parts[2],
+                        "gender": f"{parts[5].split(',', 3)[2].replace('>', '')}",
+                        "startTime": startTime,
+                        "endTime": endTime,
+                        "duration": endTime - startTime,
+                        "file": wav_path,
+                    }
+                )
+
+    merge_train_csv = [m.strip() for m in merge_train_csv]
+    with open(train_csv, "w") as merge_csvfile:
+        merge_writer = None
+
+        for path, split, info in split_info:
+
+            # Sort the data based on column ID
+            sorted_formatted_data = sorted(info, key=lambda x: x["ID"])
+
+            if len(sorted_formatted_data) == 0:
+                logger.critical(
+                    f"No files found for {split} {path}; check directory paths."
+                )
+                continue
+
+            csv_file = os.path.join(save_folder, split + ".csv")
+            with open(csv_file, "w") as csvfile:
+                writer = csv.DictWriter(
+                    csvfile, fieldnames=sorted_formatted_data[0].keys()
+                )
+                writer.writeheader()
+                writer.writerows(sorted_formatted_data)
+
+            if split in merge_train_csv:
+                if not merge_writer:
+                    merge_writer = csv.DictWriter(
+                        merge_csvfile,
+                        fieldnames=sorted_formatted_data[0].keys(),
+                    )
+                    merge_writer.writeheader()
+                merge_writer.writerows(sorted_formatted_data)
+
+    sorted_vocabulary = sorted(train_vocab)
+    vocab_file = os.path.join(save_folder, "vocab.txt")
+    with open(vocab_file, "w") as file:
+        for word in sorted_vocabulary:
+            if word == " ":
+                continue
+            if word == "":
+                continue
+            # Write each word to the file, followed by a newline character
+            file.write(word + "\n")
+
+    transcript_words = os.path.join(save_folder, "transcript_words.txt")
+    with open(transcript_words, "w") as file:
+        for line in train_transcript_words:
+            file.write(line + "\n")
+
+    # saving options
+    save_pkl(conf, save_opt)
+
+
+def normalize_wav_key(key):
+    key = key.replace("suite", "")
+    key = key.replace("_bis", "")
+    key = key.replace("automatique", "")
+    key = key.replace("wav", "")
+    key = key.replace("-", "_")
+    key = key.replace(".", "")
+    key = key.lower()
+    return key
+
+
+def custom_glob_filter(directory):
+    # Support for excluding an exact match
+    # https://stackoverflow.com/questions/20638040/glob-exclude-pattern
+    try:  # Try to parse the exact-match directive
+        exclude_match = (
+            re.findall(r"\[\^.*\]", directory)[0]
+            .replace("[^", "")
+            .replace("]", "")
+        )
+    except IndexError:
+        exclude_match = None
+    else:  # Remove custom directive
+        directory = re.sub(r"\[\^.*\]", "", directory)
+    paths = glob.glob(directory, recursive=True)
+    return paths, exclude_match
+
+
+def transform_lines(line):
+    # Perform string replacements using regular expressions
+    line = [re.sub(r"", "", line) for line in line]
+    line = [re.sub(r"", "", line) for line in line]
+    line = [re.sub(r"\([0-9]+\)", "", line) for line in line]
+    line = [re.sub(r"", "", line) for line in line]
+    line = [re.sub(r"\([^ ]*\)$", "", line) for line in line]
+    return line
+
+
+def text_transform(text, new_word_on_apostrophe=True):
+
+    text = ftfy.fix_text(text)
+
+    # Names
+    text = re.sub(r"Franç§ois", "François", text)
+    text = re.sub(r"Schrà ¶der", "Schröder", text)
+
+    text = re.sub(r"«", "", text)
+    text = re.sub(r"»", "", text)
+
+    text = re.sub(r"°", "degré", text)
+
+    text = re.sub(r"²", "", text)
+
+    # remove html tags
+    text = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", "", text)
+
+    # Replace curly braces with square brackets
+    text = text.replace("{", "[").replace("}", "]")
+
+    text = re.sub(r"\.\.\.|\*|\[.*?\]", "", text.lower())
+    delset_specific = delset
+    remove_clear = "()=-"
+    for char in remove_clear:
+        delset_specific = delset_specific.replace(char, "")
+    text = text.translate(str.maketrans("", "", delset_specific))
+
+    # Undecidable variant heard like "on (n') en":
+    text = re.sub(r"\(.+?\)", "", text)
+    text = re.sub(r"\(\)", "", text)
+    text = re.sub(r"(O.K.)", "ok", text)
+    text = re.sub(r"(O.K)", "ok", text)
+
+    text = re.sub(r"%", "pour cent", text)
+
+    text = re.sub(r"=", "", text)
+    text = re.sub(r"\(", "", text)
+    text = re.sub(r"\)", "", text)
+
+    # t 'avais -> t'avais
+    text = re.sub(r"[ ]\'", "'", text)
+    # t' avais -> t'avais
+    text = re.sub(r"\'[ ]", "'", text)
+    if new_word_on_apostrophe:
+        text = re.sub(r"\'", "' ", text)
+
+    # apostrophe at the beginning of the utterance (en début de phrase)
+    text = re.sub(r"^'", "", text)
re.sub(r"^'", "", text) + + # -) hesitation + text = re.sub(r"-\)", "", text) + + # convert to french time 15h -> 15 heure + match = re.match(r"(\d+)h", text) + text = f"{match.group(1)} heure" if match else text + + num_list = re.findall(" \d+,\d+ | \d+,\d+$", text) # noqa + if len(num_list) > 0: + for num in num_list: + num_in_word = num2words(float(num.replace(",", ".")), lang="fr") + text = text.replace(num, " " + str(num_in_word) + " ", 1) + + num_list = re.findall("\d+,\d+", text) # noqa + if len(num_list) > 0: + for num in num_list: + num_in_word = num2words(float(num.replace(",", ".")), lang="fr") + text = text.replace(num, " " + str(num_in_word) + " ", 1) + + num_list = re.findall(" \d+ | \d+$", text) # noqa + if len(num_list) > 0: + for num in num_list: + num_in_word = num2words(int(num), lang="fr") + text = text.replace(num, " " + str(num_in_word) + " ", 1) + + num_list = re.findall("\d+", text) # noqa + if len(num_list) > 0: + for num in num_list: + num_in_word = num2words(int(num), lang="fr") + text = text.replace(num, " " + str(num_in_word) + " ", 1) + + # arc-en-ciel + text = re.sub(r"-", " ", text) + + # virgule (after num2words!) + text = re.sub(r",", "", text) + + # euh + # text = re.sub(r"euh", "", text) + + # ã used as à in most case + text = re.sub(r"ã", "à", text) + + # replace n succesive spaces with one space. + text = re.sub(r"\s{2,}", " ", text) + text = re.sub("^ ", "", text) + text = re.sub(" $", "", text) + + # The byte 0x9c encodes a "curly quote" in the Windows-1252 character encoding. + text = re.sub(r"c½ur", "coeur", text) + text = re.sub(r"cœur", "coeur", text) + # The byte 0x92 encodes a "curly quote" in the Windows-1252 character encoding. + text = re.sub(r"’", "'", text) + text = re.sub(r"' '", "' ", text) + text = re.sub(r"'' ", "' ", text) + + return text + + +def filterout_lines(lines): + # Filter out lines containing specific patterns + return [ + line + for line in lines + if not any( + pattern in line + for pattern in [ + "ignore_time_segment_in_scoring", + ";;", + "inter_segment_gap", + "excluded_region", + ] + ) + ] + + +def dataprep_lm_training( + lm_dir, output_arpa, csv_files, external_lm_corpus, vocab_file +): + """Prepare lm txt corpus file for lm training with kenlm (https://github.com/kpu/kenlm) + Does nothing if output_arpa exists. + Else display to the user how to use kenlm in command line, then exit + (return code 1), the user has to run the command manually. + Instruction on how to compile kenlm (lmplz binary) is available in the + above link. + + Arguments + --------- + lm_dir : str + Path to where to store txt corpus + output_arpa : str + File to write arpa lm + csv_files : List[str] + CSV files to use to increase lm txt corpus + external_lm_corpus : List[str] + (Big) text dataset corpus + vocab_file : str + N-grams that contain vocabulary items not in this file be pruned. 
+    """
+    if not os.path.exists(output_arpa):
+        column_text_key = "wrd"
+        lm_corpus = os.path.join(lm_dir, "lm_corpus.txt")
+        line_seen = set()
+        with open(lm_corpus, "w") as corpus:
+            for file in csv_files:
+                for line in get_list_from_csv(file, column_text_key):
+                    corpus.write(line + "\n")
+                    line_seen.add(line + "\n")
+            for file in external_lm_corpus:
+                with open(file) as f:
+                    for line in f:
+                        if line not in line_seen:
+                            corpus.write(line)
+        logger.critical(
+            "RUN the following kenlm command to build a 3-gram arpa LM (https://github.com/kpu/kenlm):"
+        )
+        logger.critical(
+            f"$ lmplz -o 3 --prune 0 1 2 --limit_vocab_file {vocab_file} < {lm_corpus} | sed '1,20s/<unk>/<UNK>/1' > {output_arpa}"
+        )
+        sys.exit(1)
+
+
+def skip(save_folder, conf, save_opt):
+    """
+    Detect when data prep can be skipped.
+
+    Arguments
+    ---------
+    save_folder : str
+        The location of the save directory
+    conf : dict
+        The configuration options to ensure they haven't changed.
+
+    Returns
+    -------
+    bool
+        if True, the preparation phase can be skipped.
+        if False, it must be done.
+    """
+
+    # Checking csv files
+    skip = True
+
+    if len(glob.glob(os.path.join(save_folder, "*.csv"), recursive=False)) == 0:
+        logger.info(f"Did not find any csv in '{save_folder}'")
+        skip = False
+    else:
+        logger.info(f"Found csv in '{save_folder}'")
+
+    # Checking saved options
+    if skip is True:
+        if os.path.isfile(save_opt):
+            opts_old = load_pkl(save_opt)
+            if opts_old == conf:
+                skip = True
+            else:
+                skip = False
+        else:
+            skip = False
+
+    return skip