Fix blank index ctc (#2266)

* update blank_index
* whisper
* revert change
* mistake
speechbrain · Nov 24, 2023 · c5f83d0
1 parent 2520723
commit c5f83d0
Show file tree

Hide file tree

Showing 14 changed files with 40 additions and 34 deletions.
diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml
@@ -80,6 +80,7 @@ eos_index: 2
 
 # Decoding parameters
 test_beam_search:
+    blank_index: !ref <blank_index>
     beam_size: 100
     beam_prune_logp: -12.0
     token_prune_min_logp: -1.2

diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml
@@ -78,6 +78,7 @@ eos_index: 2
 
 # Decoding parameters
 test_beam_search:
+    blank_index: !ref <blank_index>
     beam_size: 100
     beam_prune_logp: -12.0
     token_prune_min_logp: -1.2

diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml
@@ -78,6 +78,7 @@ eos_index: 2
 
 # Decoding parameters
 test_beam_search:
+    blank_index: !ref <blank_index>
     beam_size: 100
     beam_prune_logp: -12.0
     token_prune_min_logp: -1.2

diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml
@@ -79,6 +79,7 @@ eos_index: 2
 
 # Decoding parameters
 test_beam_search:
+    blank_index: !ref <blank_index>
     beam_size: 100
     beam_prune_logp: -12.0
     token_prune_min_logp: -1.2

diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml
@@ -79,6 +79,7 @@ eos_index: 2
 
 # Decoding parameters
 test_beam_search:
+    blank_index: !ref <blank_index>
     beam_size: 100
     beam_prune_logp: -12.0
     token_prune_min_logp: -1.2

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml
@@ -71,12 +71,13 @@ freeze_wav2vec: True
 # Outputs
 ctc_neurons: 29
 output_neurons: 29  # Characters size, index(blank/eos/bos) = 0
+blank_index: 0
 
 # Decoding parameters
 test_beam_search:
    beam_size: 200
    topk: 1
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -10.0
    token_prune_min_logp: -5

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml
@@ -72,12 +72,13 @@ freeze_wav2vec: True
 # Outputs
 ctc_neurons: 29
 output_neurons: 29  # Characters size, index(blank/eos/bos) = 0
+blank_index: 0
 
 # Decoding parameters
 test_beam_search:
    beam_size: 200
    topk: 1
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -10.0
    token_prune_min_logp: -5

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml
@@ -70,12 +70,13 @@ freeze_wav2vec: True
 # Outputs
 ctc_neurons: 58 # Twice bigger than the  number of characters for upsampling
 output_neurons: 29  # Characters size, index(blank/eos/bos) = 0
+blank_index: 0
 
 # Decoding parameters
 test_beam_search:
    beam_size: 200
    topk: 1
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -10.0
    token_prune_min_logp: -5

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml
@@ -64,12 +64,13 @@ freeze_wav2vec: True
 
 # Outputs
 output_neurons: 29  # BPE size, index(blank/eos/bos) = 0
+blank_index: 0
 
 # Decoding parameters
 test_beam_search:
    beam_size: 143
    topk: 1
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -1.2

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml
@@ -85,12 +85,14 @@ tokenizer: !new:sentencepiece.SentencePieceProcessor
 
 # Decoding parameters
 lm_weight: 0.5
+blank_index: 0
 # topk is the number of hypotheses that will be rescored in the rescorer
 # lowering this value might decrease the wer, but will increase speed.
+
 test_beam_search:
    beam_size: 20
    topk: 20
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -12.0

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml
@@ -87,12 +87,13 @@ tokenizer: !new:sentencepiece.SentencePieceProcessor
 
 # Decoding parameters
 lm_weight: 0.5
+blank_index: 0
 # topk is the number of hypotheses that will be rescored in the rescorer
 # lowering this value might decrease the wer, but will increase speed.
 test_beam_search:
    beam_size: 20
    topk: 20
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -12.0
    token_prune_min_logp: -12.0

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml
@@ -69,23 +69,23 @@ whisper_output_dim: 512
 
 # Outputs
 output_neurons: 29  # BPE size, index(blank/eos/bos) = 0
-
-# Decoding parameters
-test_searcher: !name:speechbrain.decoders.CTCBeamSearcher
 blank_index: 0
-space_token: ' ' # make sure this is the same as the one used in the tokenizer
-beam_size: 143
-beam_prune_logp: -12.0
-token_prune_min_logp: -1.2
-prune_history: True
-topk: 1
-alpha: 0.8
-beta: 1.2
-# can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
-# It can either be a .bin or .arpa ; note: .arpa is much slower at loading
-# If you don't want to use an LM, comment it out or set it to null
-kenlm_model_path: null
 
+# Decoding parameters
+test_beam_search:
+   beam_size: 143
+   topk: 1
+   blank_index: !ref <blank_index>
+   space_token: ' ' # make sure this is the same as the one used in the tokenizer
+   beam_prune_logp: -12.0
+   token_prune_min_logp: -1.2
+   prune_history: True
+   alpha: 0.8
+   beta: 1.2
+   # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
+   # It can either be a .bin or .arpa ; note: .arpa is much slower at loading
+   # If you don't want to use an LM, comment it out or set it to null
+   kenlm_model_path: null
 #
 # Functions and classes
 #

diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml
@@ -66,12 +66,13 @@ freeze_wav2vec: False
 
 # Outputs
 output_neurons: 29  # BPE size, index(blank/eos/bos) = 0
+blank_index: 0
 
 # Decoding parameters
 test_beam_search:
    beam_size: 200
    topk: 1
-   blank_index: 0
+   blank_index: !ref <blank_index>
    space_token: ' ' # make sure this is the same as the one used in the tokenizer
    beam_prune_logp: -10.0
    token_prune_min_logp: -5.0

diff --git a/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py b/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py
@@ -340,18 +340,11 @@ def text_pipeline(wrd):
     vocab_list = [
         tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size())
     ]
-    test_searcher = hparams["test_searcher"](
-        blank_index=hparams["blank_index"],
-        vocab_list=vocab_list,
-        space_token=hparams["space_token"],
-        alpha=hparams["alpha"],
-        beta=hparams["beta"],
-        beam_size=hparams["beam_size"],
-        beam_prune_logp=hparams["beam_prune_logp"],
-        token_prune_min_logp=hparams["token_prune_min_logp"],
-        prune_history=hparams["prune_history"],
-        topk=hparams["topk"],
-        kenlm_model_path=hparams.get("kenlm_model_path"),
+
+    from speechbrain.decoders.ctc import CTCBeamSearcher
+
+    test_searcher = CTCBeamSearcher(
+        **hparams["test_beam_search"], vocab_list=vocab_list,
     )
 
     # Training