2022-11-10 nightly release (74f9a89)

pytorch · Nov 10, 2022 · 4b10b6a · 4b10b6a
1 parent 6692395
commit 4b10b6a
Show file tree

Hide file tree

Showing 17 changed files with 1,544 additions and 33 deletions.
diff --git a/docs/source/prototype.models.rst b/docs/source/prototype.models.rst
@@ -22,3 +22,13 @@ ConvEmformer
   .. automethod:: forward
 
   .. automethod:: infer
+
+conformer_wav2vec2_model
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: conformer_wav2vec2_model
+
+conformer_wav2vec2_base
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: conformer_wav2vec2_base
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
@@ -439,3 +439,28 @@ @article{coucke2018snips
   journal={arXiv preprint arXiv:1805.10190},
   year={2018}
 }
+@INPROCEEDINGS{9746490,
+  author={Srivastava, Sangeeta and Wang, Yun and Tjandra, Andros and Kumar, Anurag and Liu, Chunxi and Singh, Kritika and Saraf, Yatharth},
+  booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
+  title={Conformer-Based Self-Supervised Learning For Non-Speech Audio Tasks}, 
+  year={2022},
+  volume={},
+  number={},
+  pages={8862-8866},
+  doi={10.1109/ICASSP43922.2022.9746490}}
+@article{chen2022wavlm,
+  title={Wavlm: Large-scale self-supervised pre-training for full stack speech processing},
+  author={Chen, Sanyuan and Wang, Chengyi and Chen, Zhengyang and Wu, Yu and Liu, Shujie and Chen, Zhuo and Li, Jinyu and Kanda, Naoyuki and Yoshioka, Takuya and Xiao, Xiong and others},
+  journal={IEEE Journal of Selected Topics in Signal Processing},
+  volume={16},
+  number={6},
+  pages={1505--1518},
+  year={2022},
+  publisher={IEEE}
+}
+@inproceedings{GigaSpeech2021,
+  title={GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio},
+  booktitle={Proc. Interspeech 2021},
+  year=2021,
+  author={Guoguo Chen and Shuzhou Chai and Guanbo Wang and Jiayu Du and Wei-Qiang Zhang and Chao Weng and Dan Su and Daniel Povey and Jan Trmal and Junbo Zhang and Mingjie Jin and Sanjeev Khudanpur and Shinji Watanabe and Shuaijiang Zhao and Wei Zou and Xiangang Li and Xuchen Yao and Yongqing Wang and Yujun Wang and Zhao You and Zhiyong Yan}
+}
diff --git a/test/torchaudio_unittest/assets/wav2vec2/huggingface/wavlm-base.json b/test/torchaudio_unittest/assets/wav2vec2/huggingface/wavlm-base.json
@@ -0,0 +1,98 @@
+{
+    "activation_dropout": 0.0,
+    "adapter_kernel_size": 3,
+    "adapter_stride": 2,
+    "add_adapter": false,
+    "apply_spec_augment": true,
+    "architectures": [
+      "WavLMModel"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "classifier_proj_size": 256,
+    "codevector_dim": 256,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": false,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": false,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_norm": "group",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "freeze_feat_extract_train": true,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.05,
+    "mask_channel_length": 10,
+    "mask_channel_min_space": 1,
+    "mask_channel_other": 0.0,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_min_space": 1,
+    "mask_time_other": 0.0,
+    "mask_time_prob": 0.05,
+    "mask_time_selection": "static",
+    "max_bucket_distance": 800,
+    "model_type": "wavlm",
+    "no_mask_channel_overlap": false,
+    "no_mask_time_overlap": false,
+    "num_adapter_layers": 3,
+    "num_attention_heads": 12,
+    "num_buckets": 320,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_ctc_classes": 80,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 12,
+    "num_negatives": 100,
+    "output_hidden_size": 768,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 256,
+    "tokenizer_class": "Wav2Vec2CTCTokenizer",
+    "torch_dtype": "float32",
+    "transformers_version": "4.15.0.dev0",
+    "use_weighted_layer_sum": false,
+    "vocab_size": 32
+  }
diff --git a/test/torchaudio_unittest/assets/wav2vec2/huggingface/wavlm-large.json b/test/torchaudio_unittest/assets/wav2vec2/huggingface/wavlm-large.json
@@ -0,0 +1,98 @@
+{
+    "activation_dropout": 0.0,
+    "adapter_kernel_size": 3,
+    "adapter_stride": 2,
+    "add_adapter": false,
+    "apply_spec_augment": true,
+    "architectures": [
+      "WavLMModel"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "classifier_proj_size": 256,
+    "codevector_dim": 768,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": false,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": true,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_dropout": 0.0,
+    "feat_extract_norm": "layer",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "mask_channel_length": 10,
+    "mask_channel_min_space": 1,
+    "mask_channel_other": 0.0,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_min_space": 1,
+    "mask_time_other": 0.0,
+    "mask_time_prob": 0.075,
+    "mask_time_selection": "static",
+    "max_bucket_distance": 800,
+    "model_type": "wavlm",
+    "num_adapter_layers": 3,
+    "num_attention_heads": 16,
+    "num_buckets": 320,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_ctc_classes": 80,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 24,
+    "num_negatives": 100,
+    "output_hidden_size": 1024,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 768,
+    "replace_prob": 0.5,
+    "tokenizer_class": "Wav2Vec2CTCTokenizer",
+    "torch_dtype": "float32",
+    "transformers_version": "4.15.0.dev0",
+    "use_weighted_layer_sum": false,
+    "vocab_size": 32
+  }
diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py
@@ -16,6 +16,7 @@
     TempDirMixin,
     TestBaseMixin,
     TorchaudioTestCase,
+    zip_equal,
 )
 from .data_utils import get_asset_path, get_sinusoid, get_spectrogram, get_whitenoise
 from .func_utils import torch_script
@@ -57,4 +58,5 @@
     "get_image",
     "rgb_to_gray",
     "rgb_to_yuv_ccir",
+    "zip_equal",
 ]
diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py
@@ -6,6 +6,7 @@
 import tempfile
 import time
 import unittest
+from itertools import zip_longest
 
 import torch
 import torchaudio
@@ -245,3 +246,16 @@ def skipIfNoModule(module, display_name=None):
     ),
     key="ON_PYTHON_310",
 )
+
+
+def zip_equal(*iterables):
+    """With the regular Python `zip` function, if one iterable is longer than the other,
+    the remainder portions are ignored.This is resolved in Python 3.10 where we can use
+    `strict=True` in the `zip` function
+    From https://github.com/pytorch/text/blob/c047efeba813ac943cb8046a49e858a8b529d577/test/torchtext_unittest/common/case_utils.py#L45-L54  # noqa: E501
+    """
+    sentinel = object()
+    for combo in zip_longest(*iterables, fillvalue=sentinel):
+        if sentinel in combo:
+            raise ValueError("Iterables have different lengths")
+        yield combo