CommonVoice whisper results update #2254

Merged
25 commits merged on Nov 25, 2023
Changes from all commits
Commits (25)
511dba7
add fr preproccesing to Common_voice_prepare.py
poonehmousavi Nov 21, 2023
b145b8a
add CV , CTC, new languages
poonehmousavi Nov 21, 2023
08906d4
fix precommit and test
poonehmousavi Nov 21, 2023
d18d154
add transducer recipie
poonehmousavi Nov 21, 2023
a838ae9
add transformer recipies
poonehmousavi Nov 21, 2023
fd2a8f8
update augmentation of CTC recipies
poonehmousavi Nov 21, 2023
a5d106d
update seq-to-seq recipies
poonehmousavi Nov 21, 2023
57ea7fc
fix whisper HF interface bug. (return str insted of list)
poonehmousavi Nov 21, 2023
a288d51
fix conflict + merge unstable
mravanelli Nov 22, 2023
2e2cbab
fix recipe tests
mravanelli Nov 22, 2023
856eebd
add fr preproccesing to Common_voice_prepare.py
poonehmousavi Nov 21, 2023
4d2d16d
add CV , CTC, new languages
poonehmousavi Nov 21, 2023
cfbd4a8
fix precommit and test
poonehmousavi Nov 21, 2023
1eda0e8
add transducer recipie
poonehmousavi Nov 21, 2023
8724a11
add transformer recipies
poonehmousavi Nov 21, 2023
efcc9ea
update augmentation of CTC recipies
poonehmousavi Nov 21, 2023
6c58f4f
update seq-to-seq recipies
poonehmousavi Nov 21, 2023
4a3af78
fix whisper HF interface bug. (return str insted of list)
poonehmousavi Nov 21, 2023
c2b6598
fix recipe tests
mravanelli Nov 22, 2023
cc95209
Merge branch 'cv_unstable_fmerge' of https://github.com/poonehmousavi…
poonehmousavi Nov 24, 2023
1ba3c40
Merge remote-tracking branch 'upstream/unstable-v0.6' into cv_unstabl…
poonehmousavi Nov 24, 2023
cb5a483
Merge remote-tracking branch 'upstream/unstable-v0.6' into cv_unstabl…
poonehmousavi Nov 24, 2023
7b11de5
modify beamsearch for CTC: ar.es.pt and zh-CN
poonehmousavi Nov 24, 2023
a6316b7
fix interface conflict
poonehmousavi Nov 24, 2023
b112860
fix transducer interface bug
poonehmousavi Nov 24, 2023
24 changes: 15 additions & 9 deletions recipes/CommonVoice/ASR/CTC/README.md
@@ -1,5 +1,5 @@
# CommonVoice ASR with CTC based Seq2Seq models.
This folder contains scripts necessary to run an ASR experiment with the CommonVoice dataset: [CommonVoice Homepage](https://commonvoice.mozilla.org/)
This folder contains scripts necessary to run an ASR experiment with the CommonVoice 14.0 dataset ([CommonVoice Homepage](https://commonvoice.mozilla.org/)) and PyTorch 2.0.

# How to run
python train.py hparams/{hparam_file}.yaml
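For example, a concrete invocation for the English recipe could look like the line below (the data path is a placeholder; `--data_folder` relies on the usual SpeechBrain CLI override for YAML keys):
python train.py hparams/train_en_with_wav2vec.yaml --data_folder=/path/to/cv-corpus/en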
@@ -14,18 +14,24 @@ Here is a list of the different languages that we tested within the CommonVoice
- French
- Italian
- Kinyarwanda
- Arabic
- Spanish
- Portuguese
- Chinese (China)

# Results
| Language | CommonVoice Release | hyperparams file | LM | Val. CER | Val. WER | Test CER | Test WER | HuggingFace link | Model link | GPUs |
| ------------- |:-------------:|:---------------------------:| -----:| -----:| -----:| -----:| -----:| :-----------:| :-----------:| :-----------:|
| English | 2020-12-11 | train_en_with_wav2vec.yaml | No | 5.01 | 12.57 | 7.32 | 15.58 | Not Avail. | [model](https://www.dropbox.com/sh/o3q43r4wdovbmnd/AADXcVomQr549NdAgCpI7OQHa?dl=0) | 2xV100 32GB |
| German | 2022-08-16 | train_de_with_wav2vec.yaml | No | 1.90 | 8.02 | 2.40 | 9.54 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-de) | [model](https://www.dropbox.com/sh/vdz7apt16nbq94g/AADI5o23Ll_NmjiPlg9bzPjta?dl=0) | 1xRTXA6000 48GB |
| French | 2020-12-11 | train_fr_with_wav2vec.yaml | No | 2.60 | 8.59 | 3.19 | 9.96 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-fr) | [model](https://www.dropbox.com/sh/wytlbeddrt8oe4n/AAAY59qMsDlWy5F017bmBeVua?dl=0) | 2xV100 32GB |
| Italian | 2020-12-11 | train_it_with_wav2vec.yaml | No | 2.77 | 9.83 | 3.16 | 10.85 | Not Avail. | [model](https://www.dropbox.com/sh/0v2o2hmrv1j33p6/AAA3xUiqKbSKsX88fWfptPmFa?dl=0) | 2xV100 32GB |
| Kinyarwanda | 2020-12-11 | train_rw_with_wav2vec.yaml | No | 6.20 | 20.07 | 8.25 | 23.12 | Not Avail. | [model](https://www.dropbox.com/sh/ccgirbq9r8uzubi/AAAynCvEV8EjEpMavFRPp87Ta?dl=0) | 2xV100 32GB |

*For German, it takes around 5.5 hrs an epoch.* <br>
The output folders with checkpoints and logs can be found [here](https://www.dropbox.com/sh/852eq7pbt6d65ai/AACv4wAzk1pWbDo4fjVKLICYa?dl=0).
| English | 2023-08-15 | train_en_with_wav2vec.yaml | No | 5.65 | 13.67 | 7.92 | 16.86 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-en) | [model](https://www.dropbox.com/sh/ch10cnbhf1faz3w/AACdHFG65LC6582H0Tet_glTa?dl=0) | 1xV100 32GB |
| German | 2023-08-15 | train_de_with_wav2vec.yaml | No | 1.74 | 7.40 | 2.24 | 8.93 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-de) | [model](https://www.dropbox.com/sh/dn7plq4wfsujsi1/AABS1kqB_uqLJVkg-bFkyPpVa?dl=0) | 1xV100 32GB |
| French | 2023-08-15 | train_fr_with_wav2vec.yaml | No | 2.59 | 8.47 | 3.44 | 10.24 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-fr) | [model](https://www.dropbox.com/sh/0i7esfa8jp3rxpp/AAArdi8IuCRmob2WAS7lg6M4a?dl=0) | 1xV100 32GB |
| Italian | 2023-08-15 | train_it_with_wav2vec.yaml | No | 2.10 | 7.77 | 2.38 | 8.38 |[model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-it) | [model](https://www.dropbox.com/sh/hthxqzh5boq15rn/AACftSab_FM6EFWWPgHpKw82a?dl=0) | 1xV100 32GB |
| Kinyarwanda | 2023-08-15 | train_rw_with_wav2vec.yaml | No | 5.47 | 19.58 | 7.59 | 23.71 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-rw) | [model](https://www.dropbox.com/sh/4iax0l4yfry37gn/AABuQ31JY-Sbyi1VlOJfV7haa?dl=0) | 1xV100 32GB |
| Arabic | 2023-08-15 | train_ar_with_wav2vec.yaml | No | 6.45 | 20.80 | 10.01 | 29.92 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-ar) | [model](https://www.dropbox.com/sh/7tnuqqbr4vy96cc/AAA_5_R0RmqFIiyR0o1nVS4Ia?dl=0) | 1xV100 32GB |
| Spanish | 2023-08-15 | train_es_with_wav2vec.yaml | No | 3.36 | 12.61 | 3.80 | 13.38 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-es) | [model](https://www.dropbox.com/sh/ejvzgl3d3g8g9su/AACYtbSWbDHvBr06lAb7A4mVa?dl=0) | 1xV100 32GB |
| Portuguese | 2023-08-15 | train_pt_with_wav2vec.yaml | No | 6.26 | 21.05 | 6.85 | 22.51 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-pt) | [model](https://www.dropbox.com/sh/80wucrvijdvao2a/AAD6-SZ2_ZZXmlAjOTw6fVloa?dl=0) | 1xV100 32GB |
| Chinese (China) | 2023-08-15 | train_zh-CN_with_wav2vec.yaml | No | 25.03 | - | 23.17 | - | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-14-zh-CN) | [model](https://www.dropbox.com/sh/2bikr81vgufoglf/AABMpD0rLIaZBxjtwBHgrNpga?dl=0) | 1xV100 32GB |


## How to simply use pretrained models to transcribe my audio file?

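A minimal sketch of this step, assuming the SpeechBrain `EncoderASR` pretrained interface and the `speechbrain/asr-wav2vec2-commonvoice-14-en` checkpoint from the table above (each model's HuggingFace card documents the exact interface it expects; the audio path is a placeholder):

```python
from speechbrain.inference.ASR import EncoderASR  # speechbrain.pretrained.EncoderASR on older releases

# Fetch the pretrained CTC model and build the inference pipeline.
asr_model = EncoderASR.from_hparams(
    source="speechbrain/asr-wav2vec2-commonvoice-14-en",
    savedir="pretrained_models/asr-wav2vec2-commonvoice-14-en",
)

# "example.wav" stands in for your own 16 kHz mono recording.
print(asr_model.transcribe_file("example.wav"))
```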
@@ -1,18 +1,18 @@
# ################################
# Model: wav2vec2 + DNN + CTC/Attention
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: Titouan Parcollet 2021
# Authors: Pooneh Mousavi 2023
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/wav2vec2_ctcatt_rw/<seed>
output_folder: !ref results/wav2vec2_ctc_ar/<seed>
test_wer_file: !ref <output_folder>/wer_test.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# URL for the biggest HuggingFace multilingual w2v2 from XLSR.
# URL for the biggest Fairseq multilingual model
wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint

@@ -21,8 +21,8 @@ data_folder: !PLACEHOLDER # e.g., /localscratch/cv-corpus-5.1-2020-06-22/fr
train_tsv_file: !ref <data_folder>/train.tsv # Standard CommonVoice .tsv files
dev_tsv_file: !ref <data_folder>/dev.tsv # Standard CommonVoice .tsv files
test_tsv_file: !ref <data_folder>/test.tsv # Standard CommonVoice .tsv files
accented_letters: False
language: rw # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for english
accented_letters: True
language: ar # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
train_csv: !ref <save_folder>/train.csv
valid_csv: !ref <save_folder>/dev.csv
test_csv: !ref <save_folder>/test.csv
@@ -34,19 +34,16 @@ avoid_if_longer_than: 10.0

# Training parameters
number_of_epochs: 30
number_of_ctc_epochs: 20
lr: 1.0
lr_wav2vec: 0.0001
ctc_weight: 0.3
sorting: ascending
precision: fp32 # bf16, fp16 or fp32
sample_rate: 16000
ckpt_interval_minutes: 30 # save checkpoint every N min


# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 6 per GPU to fit 32GB of VRAM
# Must be 8 per GPU to fit 32GB of VRAM
batch_size: 12
test_batch_size: 4

@@ -62,15 +59,12 @@ token_type: unigram # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Model parameters
activation: !name:torch.nn.LeakyReLU
wav2vec_output_dim: 1024
dnn_layers: 2
dnn_neurons: 1024
emb_size: 128
dec_neurons: 1024
dec_hidden_size: !ref <dec_neurons>
dec_attn_dim: !ref <dec_neurons>
freeze_wav2vec: False
freeze_feature_extractor: False
dropout: 0.15
warmup_steps: 500

# Outputs
output_neurons: 1000 # BPE size, index(blank/eos/bos) = 0
@@ -80,13 +74,28 @@ output_neurons: 1000 # BPE size, index(blank/eos/bos) = 0
blank_index: 0
bos_index: 1
eos_index: 2
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 140
temperature: 1.50

# Decoding parameters
# Be sure that the bos and eos index match with the BPEs ones
test_beam_search:
blank_index: !ref <blank_index>
beam_size: 100
beam_prune_logp: -12.0
token_prune_min_logp: -1.2
prune_history: True
topk: 1
alpha: 1.0
beta: 0.5
# can be downloaded from here https://www.openslr.org/11/ or trained with kenLM
# It can either be a .bin or .arpa ; note: .arpa is much slower at loading
# If you don't want to use an LM, comment it out or set it to null
# kenlm_model_path: none
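As a side note, a hedged sketch of how a `test_beam_search` block like the one above is typically consumed, assuming the `speechbrain.decoders.ctc.CTCBeamSearcher` decoder accepts these keys (the vocabulary and tensors below are illustrative placeholders, not part of the recipe):

```python
import torch
from speechbrain.decoders.ctc import CTCBeamSearcher

# Placeholder vocabulary; the recipe builds this from its SentencePiece tokenizer.
vocab_list = [f"tok{i}" for i in range(1000)]

searcher = CTCBeamSearcher(
    blank_index=0,
    vocab_list=vocab_list,
    beam_size=100,
    beam_prune_logp=-12.0,
    token_prune_min_logp=-1.2,
    prune_history=True,
    topk=1,
    alpha=1.0,  # LM weight, only used when an n-gram LM is loaded
    beta=0.5,   # word-insertion bonus
    # kenlm_model_path="/path/to/lm.arpa",  # optional; .bin loads faster than .arpa
)

# Dummy CTC log-probabilities of shape (batch, time, vocab) and relative lengths.
log_probs = torch.randn(2, 50, 1000).log_softmax(dim=-1)
wav_lens = torch.tensor([1.0, 0.8])
hyps = searcher(log_probs, wav_lens)  # top hypotheses per utterance
```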


#
# Functions and classes
@@ -141,16 +150,33 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
!ref <drop_freq>,
!ref <drop_chunk>]

enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
enc: !new:speechbrain.nnet.containers.Sequential
input_shape: [null, null, !ref <wav2vec_output_dim>]
activation: !ref <activation>
dnn_blocks: !ref <dnn_layers>
dnn_neurons: !ref <dnn_neurons>
linear1: !name:speechbrain.nnet.linear.Linear
n_neurons: !ref <dnn_neurons>
bias: True
bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
activation: !new:torch.nn.LeakyReLU
drop: !new:torch.nn.Dropout
p: !ref <dropout>
linear2: !name:speechbrain.nnet.linear.Linear
n_neurons: !ref <dnn_neurons>
bias: True
bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
activation2: !new:torch.nn.LeakyReLU
drop2: !new:torch.nn.Dropout
p: !ref <dropout>
linear3: !name:speechbrain.nnet.linear.Linear
n_neurons: !ref <dnn_neurons>
bias: True
bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
activation3: !new:torch.nn.LeakyReLU

wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
source: !ref <wav2vec2_hub>
output_norm: True
freeze: !ref <freeze_wav2vec>
freeze_feature_extractor: !ref <freeze_feature_extractor>
save_path: !ref <wav2vec2_folder>

#####
Expand All @@ -166,51 +192,23 @@ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Ve
# save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt
#####

emb: !new:speechbrain.nnet.embedding.Embedding
num_embeddings: !ref <output_neurons>
embedding_dim: !ref <emb_size>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
enc_dim: !ref <dec_neurons>
input_size: !ref <emb_size>
rnn_type: gru
attn_type: location
hidden_size: !ref <dec_hidden_size>
attn_dim: !ref <dec_attn_dim>
num_layers: 1
scaling: 1.0
channels: 10
kernel_size: 100
re_init: True
dropout: 0.15

ctc_lin: !new:speechbrain.nnet.linear.Linear
input_size: !ref <dnn_neurons>
n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
input_size: !ref <dec_neurons>
n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
blank_index: !ref <blank_index>

seq_cost: !name:speechbrain.nnet.losses.nll_loss
label_smoothing: 0.1

modules:
wav2vec2: !ref <wav2vec2>
enc: !ref <enc>
emb: !ref <emb>
dec: !ref <dec>
ctc_lin: !ref <ctc_lin>
seq_lin: !ref <seq_lin>

model: !new:torch.nn.ModuleList
- [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
- [!ref <enc>, !ref <ctc_lin>]

model_opt_class: !name:torch.optim.Adadelta
lr: !ref <lr>
@@ -232,20 +230,6 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
annealing_factor: 0.9
patient: 0

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
embedding: !ref <emb>
decoder: !ref <dec>
linear: !ref <seq_lin>
bos_index: !ref <bos_index>
eos_index: !ref <eos_index>
min_decode_ratio: !ref <min_decode_ratio>
max_decode_ratio: !ref <max_decode_ratio>
beam_size: !ref <beam_size>
eos_threshold: !ref <eos_threshold>
using_max_attn_shift: !ref <using_max_attn_shift>
max_attn_shift: !ref <max_attn_shift>
temperature: !ref <temperature>

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
checkpoints_dir: !ref <save_folder>
recoverables: