WIP french dataset
LM done

fix qa

fix 2

readme
pchampio committed Mar 12, 2024
1 parent c23d1e9 commit 50eb05e
Showing 6 changed files with 793 additions and 0 deletions.
1 change: 1 addition & 0 deletions recipes/ESTER+EPAC+ETAPE+REPERE/ASR/stm_prepare.py
68 changes: 68 additions & 0 deletions recipes/ESTER+EPAC+ETAPE+REPERE/LM/README.md
@@ -0,0 +1,68 @@
# Language Model with ESTER+EPAC+ETAPE+REPERE
This folder contains recipes for training language models on the above datasets.
It supports n-gram LMs.
Depending on the ASR token type ("phone" or "char"), an apostrophe (') is either kept inside the word or followed by a space, which starts a new word.
Example:
- phone: "C'est" -> modeled as "sɛ" -> mapped to "C'EST"
- char: "C'est" -> modeled as 2 words "C'" and "EST" -> transcribed to "C' EST"

## Installing Extra Dependencies

If you want to train an n-gram LM, this recipe relies on the popular KenLM library. Start by installing the Ubuntu library prerequisites. For a complete guide on how to install the required dependencies, please refer to [this](https://kheafield.com/code/kenlm/dependencies/) page:
```shell
sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
```

Next, download and unpack the KenLM repo:
```shell
wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
```

KenLM is written in C++, so we'll use CMake to build the binaries:
```shell
mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
```

Now make sure the KenLM executables are on your PATH. To do so:
- Open the `~/.bashrc` file in a text editor.
- Scroll to the end of the file and add the following line: `export PATH=$PATH:/your/path/to/kenlm/build/bin`
- Save the file and run: `source ~/.bashrc`
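
If the PATH is set up correctly, your shell should now be able to locate the KenLM binaries; a quick sanity check (assuming the install path used above):
```shell
which lmplz   # expected to print /your/path/to/kenlm/build/bin/lmplz
```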

# How to run:
```shell
python train_ngram.py hparams/train_ngram.yaml --data_folder=your/data/folder
# your/data/folder should point to the directory containing one or all of the following directories: EPAC ESTER1 ESTER2 ETAPE REPERE
## ls your/data/folder -> EPAC ESTER1 ESTER2 ETAPE REPERE
## ls your/data/folder/ETAPE -> dev test tools train
```
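
The hyperparameters in `hparams/train_ngram.yaml` (e.g. `arpa_order`, `for_token_type`) can be overridden on the command line in the usual SpeechBrain way. As an untested sketch, the other ARPA variants listed in the table below could be produced with:
```shell
python train_ngram.py hparams/train_ngram.yaml --data_folder=your/data/folder --arpa_order=3 --for_token_type=char   # -> 3-for-char-gram.arpa
python train_ngram.py hparams/train_ngram.yaml --data_folder=your/data/folder --arpa_order=4 --for_token_type=phone  # -> 4-for-phone-gram.arpa
```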

| Release | hyperparams file | Test PP | Model link | GPUs |
| :--- | :---: | :---: | :---: | :---: |
| 24-03-12 | 3-for-char-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
| 24-03-12 | 4-for-char-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
| 24-03-12 | 3-for-phone-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |
| 24-03-12 | 4-for-phone-gram.arpa - train_ngram.yaml | --.-- | [link](https://www.dropbox.com/TODO) | --.-- |


# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/


# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.

```bibtex
@misc{speechbrain,
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
year={2021},
eprint={2106.04624},
archivePrefix={arXiv},
primaryClass={eess.AS},
note={arXiv:2106.04624}
}
```

28 changes: 28 additions & 0 deletions recipes/ESTER+EPAC+ETAPE+REPERE/LM/hparams/train_ngram.yaml
@@ -0,0 +1,28 @@
################################
# Recipe for training a KenLM n-gram LM on STM-formatted data.
#
# Author:
#  - Pierre Champion 2024
################################
# Seed needs to be set at top of yaml, before objects with parameters are made
output_folder: !ref results/n_gram_lm/
# for_token_type: phone
for_token_type: char # "char" splits words at apostrophes (new_word_on_apostrophe)
# Data files
data_folder: !PLACEHOLDER # e.g., /path/to/corpus (**/*.stm)
stm_directory: !ref <data_folder>/**/[^\.ne_e2\.|\.ne\.|\.spk\.|part\.]*.stm
wav_directory: !ref <data_folder>/**/*.wav
train_splits: {"train_ESTER2":["/ESTER2/train_trans_rapide/*", "/ESTER2/train/*"], "train_ESTER1":["/ESTER1/train/*"], "train_EPAC":["/EPAC/train/*"], "train_ETAPE":["/ETAPE/train/*"], "train_REPERE":["/REPERE/train/*"]}
dev_splits: []
test_splits: []
merge_train_csv: "train_ESTER2, train_ESTER1, train_EPAC, train_ETAPE, train_REPERE"
train_csv: !ref <output_folder>/train.csv
lang_dir: !ref <output_folder>/lang
vocab_file: !ref <output_folder>/vocab.txt
add_word_boundary: True
sil_prob: 0.
caching: False
skip_prep: False
arpa_order: 4
prune_level: [0, 1, 2]
output_arpa: !ref <output_folder>/<arpa_order>-for-<for_token_type>-gram.arpa
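# Notes (see dataprep_lm_training in train_ngram.py below):
#  - arpa_order and prune_level are forwarded to KenLM's lmplz ("-o 4 --prune 0 1 2").
#  - With the defaults above, output_arpa resolves to
#    results/n_gram_lm/4-for-char-gram.arpa, matching the names in the README table.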
1 change: 1 addition & 0 deletions recipes/ESTER+EPAC+ETAPE+REPERE/LM/stm_prepare.py
152 changes: 152 additions & 0 deletions recipes/ESTER+EPAC+ETAPE+REPERE/LM/train_ngram.py
@@ -0,0 +1,152 @@
"""
Recipe to train kenlm ngram model.
To run this recipe, do the following:
> python train.py hparams/train.yaml --data_folder=/path/to/corpus (**/*.stm)
Authors
* Adel Moumen 2024
* Pierre Champion 2023
"""

import os
import sys
import logging
import speechbrain as sb
from speechbrain.utils.distributed import run_on_main
from hyperpyyaml import load_hyperpyyaml
import speechbrain.k2_integration as sbk2
from speechbrain.utils.data_utils import get_list_from_csv

logger = logging.getLogger(__name__)


def dataprep_lm_training(
    lm_dir,
    output_arpa,
    csv_files,
    external_lm_corpus,
    vocab_file,
    arpa_order=3,
    prune_level=[0, 1, 2],
):
    """Prepare the LM txt corpus file for LM training with kenlm (https://github.com/kpu/kenlm).

    Does nothing if output_arpa already exists.
    Otherwise, displays the kenlm command to run on the command line, then
    exits; the user has to run the command manually.
    Instructions on how to compile kenlm (the lmplz binary) are available at
    the link above.

    Arguments
    ---------
    lm_dir : str
        Path to where to store the txt corpus.
    output_arpa : str
        File to write the arpa LM to.
    csv_files : List[str]
        CSV files used to grow the LM txt corpus.
    external_lm_corpus : List[str]
        (Big) text dataset corpus.
    vocab_file : str
        N-grams that contain vocabulary items not in this file will be pruned.
    arpa_order : int
        Order of the arpa LM.
    prune_level : List[int]
        The numbers must be non-decreasing and the last number will be extended
        to any higher order. For example, --prune 0 disables pruning (the
        default) while --prune 0 0 1 prunes singletons for orders three and
        higher. Please refer to https://kheafield.com/code/kenlm/estimation/
        for more details.
    """
    # Skip corpus generation if the arpa LM has already been built.
    if os.path.exists(output_arpa):
        logger.info(f"{output_arpa} already exists, skipping data preparation.")
        return

    column_text_key = "wrd"  # text column of the CSVs produced by stm_prepare.py
    lm_corpus = os.path.join(lm_dir, "lm_corpus.txt")
    line_seen = set()
    with open(lm_corpus, "w") as corpus:
        for file in csv_files:
            for line in get_list_from_csv(file, column_text_key):
                corpus.write(line + "\n")
                line_seen.add(line + "\n")
        for file in external_lm_corpus:
            with open(file) as f:
                for line in f:
                    if line not in line_seen:
                        corpus.write(line)
    prune_level = " ".join(map(str, prune_level))
    cmd = f"lmplz -o {arpa_order} --prune {prune_level} --limit_vocab_file {vocab_file} < {lm_corpus} | sed '1,20s/<unk>/<UNK>/1' > {output_arpa}"
    logger.critical(
        f"RUN the following kenlm command to build a {arpa_order}-gram arpa LM (https://github.com/kpu/kenlm):"
    )
    logger.critical(f"$ {cmd}")
    sys.exit(0)


if __name__ == "__main__":
    # Load hyperparameters file with command-line overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create experiment directory
    sb.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep
    import stm_prepare

    # multi-gpu (ddp) safe data preparation
    run_on_main(
        stm_prepare.prepare_stm,
        kwargs={
            "stm_directory": hparams["stm_directory"],
            "wav_directory": hparams["wav_directory"],
            "tr_splits": hparams["train_splits"],
            "dev_splits": hparams["dev_splits"],
            "te_splits": hparams["test_splits"],
            "save_folder": hparams["output_folder"],
            "merge_train_csv": hparams["merge_train_csv"].split(","),
            "train_csv": hparams["train_csv"],
            "skip_prep": hparams["skip_prep"],
            "new_word_on_apostrophe": hparams["for_token_type"] in ["char"],
        },
    )

    # Create the lexicon.txt for k2
    run_on_main(
        sbk2.lexicon.prepare_char_lexicon,
        kwargs={
            "lang_dir": hparams["lang_dir"],
            "vocab_files": [hparams["vocab_file"]],
            "extra_csv_files": [hparams["output_folder"] + "/train.csv"]
            if not hparams["skip_prep"]
            else [],
            "add_word_boundary": hparams["add_word_boundary"],
        },
    )

    caching = (
        {"cache": False}
        if "caching" in hparams and hparams["caching"] is False
        else {}
    )

    # Create the lang directory for k2
    run_on_main(
        sbk2.prepare_lang.prepare_lang,
        kwargs={
            "lang_dir": hparams["lang_dir"],
            "sil_prob": hparams["sil_prob"],
            **caching,
        },
    )

    # Build the LM corpus and display the kenlm command to run manually
    dataprep_lm_training(
        lm_dir=hparams["output_folder"],
        output_arpa=hparams["output_arpa"],
        csv_files=[hparams["train_csv"]],
        external_lm_corpus=[],
        vocab_file=os.path.join(hparams["lang_dir"], "words.txt"),
        arpa_order=hparams["arpa_order"],
        prune_level=hparams["prune_level"],
    )
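    # With the default hparams (arpa_order=4, for_token_type=char), the printed
    # lmplz command looks roughly like:
    #   lmplz -o 4 --prune 0 1 2 --limit_vocab_file results/n_gram_lm/lang/words.txt \
    #     < results/n_gram_lm/lm_corpus.txt | sed '1,20s/<unk>/<UNK>/1' \
    #     > results/n_gram_lm/4-for-char-gram.arpa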
