From 450a181d8b963b4e896be4aac701815aa554a6bb Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Mon, 21 Aug 2023 21:05:00 +0530 Subject: [PATCH] Add Pop2Piano (#21785) * init commit * config updated also some modeling * Processor and Model config combined * extraction pipeline(upto before spectogram & mel_conditioner) added but not properly tested * model loading successful! * feature extractor done! * FE can now be called from HF * postprocessing added in fe file * same as prev commit * Pop2PianoConfig doc done * cfg docs slightly changed * fe docs done * batched * batched working! * temp * v1 * checking * trying to go with generate * with generate and model tests passed * before rebasing * . * tests done docs done remaining others & nits * nits * LogMelSpectogram shifted to FeatureExtractor * is_tf rmeoved from pop2piano/init * import solved * tokenization tests added * minor fixed regarding modeling_pop2piano * tokenizer changed to only return midi_object and other changes * Updated paper abstract(Camera-ready version) (#2) * more comments and nits * ruff changes * code quality fix * sg comments * t5 change added and rebased * comments except batching * batching done * comments * small doc fix * example removed from modeling * ckpt * forward it compatible with fe and generation done * comments * comments * code-quality fix(maybe) * ckpts changed * doc file changed from mdx to md * test fixes * tokenizer test fix * changes * nits done main changes remaining * code modified * Pop2PianoProcessor added with tests * other comments * added Pop2PianoProcessor to dummy_objects * added require_onnx to modeling file * changes * update .md file * remove extra line in index.md * back to the main index * added pop2piano to index * Added tokenizer.__call__ with valid args and batch_decode and aligned the processor part too * changes * added return types to 2 tokenizer methods * the PR build test might work now * added backends * PR build fix * vocab added * comments * refactored vocab into 1 file * added conversion script * comments * essentia version changed in .md * comments * more tokenizer tests added * minor fix * tests extended for outputs acc check * small fix --------- Co-authored-by: Jongho Choi --- README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 2 + docs/source/en/model_doc/pop2piano.md | 190 +++ src/transformers/__init__.py | 61 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/processing_auto.py | 1 + src/transformers/models/pop2piano/__init__.py | 122 ++ .../pop2piano/configuration_pop2piano.py | 129 ++ .../convert_pop2piano_weights_to_hf.py | 190 +++ .../pop2piano/feature_extraction_pop2piano.py | 463 ++++++ .../models/pop2piano/modeling_pop2piano.py | 1377 +++++++++++++++++ .../models/pop2piano/processing_pop2piano.py | 138 ++ .../pop2piano/tokenization_pop2piano.py | 714 +++++++++ src/transformers/pytorch_utils.py | 1 + src/transformers/testing_utils.py | 16 + src/transformers/utils/__init__.py | 2 + ...pretty_midi_and_scipy_and_torch_objects.py | 23 + src/transformers/utils/dummy_music_objects.py | 16 + src/transformers/utils/dummy_pt_objects.py | 17 + src/transformers/utils/import_utils.py | 48 + tests/models/pop2piano/__init__.py | 0 .../test_feature_extraction_pop2piano.py | 291 
++++ .../pop2piano/test_modeling_pop2piano.py | 778 ++++++++++ .../pop2piano/test_processor_pop2piano.py | 266 ++++ .../pop2piano/test_tokenization_pop2piano.py | 418 +++++ utils/check_config_attributes.py | 2 + utils/check_repo.py | 1 + utils/documentation_tests.txt | 2 + 38 files changed, 5284 insertions(+) create mode 100644 docs/source/en/model_doc/pop2piano.md create mode 100644 src/transformers/models/pop2piano/__init__.py create mode 100644 src/transformers/models/pop2piano/configuration_pop2piano.py create mode 100644 src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py create mode 100644 src/transformers/models/pop2piano/feature_extraction_pop2piano.py create mode 100644 src/transformers/models/pop2piano/modeling_pop2piano.py create mode 100644 src/transformers/models/pop2piano/processing_pop2piano.py create mode 100644 src/transformers/models/pop2piano/tokenization_pop2piano.py create mode 100644 src/transformers/utils/dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects.py create mode 100644 src/transformers/utils/dummy_music_objects.py create mode 100644 tests/models/pop2piano/__init__.py create mode 100644 tests/models/pop2piano/test_feature_extraction_pop2piano.py create mode 100644 tests/models/pop2piano/test_modeling_pop2piano.py create mode 100644 tests/models/pop2piano/test_processor_pop2piano.py create mode 100644 tests/models/pop2piano/test_tokenization_pop2piano.py diff --git a/README.md b/README.md index 0f7e23c4620f9d..41fb758abe1500 100644 --- a/README.md +++ b/README.md @@ -434,6 +434,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) 
released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. diff --git a/README_es.md b/README_es.md index b9e3211fba0949..6a0701b09d3432 100644 --- a/README_es.md +++ b/README_es.md @@ -411,6 +411,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. diff --git a/README_hd.md b/README_hd.md index cec0c4503e7c2a..8651678669a7e7 100644 --- a/README_hd.md +++ b/README_hd.md @@ -383,6 +383,7 @@ conda install -c huggingface transformers 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google से) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 
द्वाराअनुसंधान पत्र [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) के साथ जारी किया गया 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP से) साथ वाला पेपर [प्रोग्राम अंडरस्टैंडिंग एंड जेनरेशन के लिए यूनिफाइड प्री-ट्रेनिंग](https://arxiv .org/abs/2103.06333) वसी उद्दीन अहमद, सैकत चक्रवर्ती, बैशाखी रे, काई-वेई चांग द्वारा। 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [ProphetNet: प्रेडिक्टिंग फ्यूचर एन-ग्राम फॉर सीक्वेंस-टू-सीक्वेंस प्री-ट्रेनिंग ](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा पोस्ट किया गया। 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. से) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. द्वाराअनुसंधान पत्र [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) के साथ जारी किया गया 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA से) साथ वाला पेपर [डीप लर्निंग इंफ़ेक्शन के लिए इंटीजर क्वांटिज़ेशन: प्रिंसिपल्स एंड एम्पिरिकल इवैल्यूएशन](https:// arxiv.org/abs/2004.09602) हाओ वू, पैट्रिक जुड, जिआओजी झांग, मिखाइल इसेव और पॉलियस माइकेविसियस द्वारा। diff --git a/README_ja.md b/README_ja.md index 387051509b7e5f..8e497e94175a0a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -445,6 +445,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google から) Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. から公開された研究論文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP から) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang から公開された研究論文: [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs から) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng から公開された研究論文: [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. 
**[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. から) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. から公開された研究論文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) diff --git a/README_ko.md b/README_ko.md index 486494b384329c..3f33e4b199d367 100644 --- a/README_ko.md +++ b/README_ko.md @@ -360,6 +360,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (Google 에서 제공)은 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.의 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347)논문과 함께 발표했습니다. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (UCLA NLP 에서) Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 의 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 논문과 함께 발표했습니다. 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (Sea AI Labs 에서) Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 의 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 논문과 함께 발표했습니다. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. 에서 제공)은 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.의 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf)논문과 함께 발표했습니다. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA 에서) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 의 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 논문과 함께 발표했습니다. 
diff --git a/README_zh-hans.md b/README_zh-hans.md index 766a68b7f9fdfb..135f9b35a9631f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -384,6 +384,7 @@ conda install -c huggingface transformers 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (来自 Google) 伴随论文 [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) 由 Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova 发布。 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。 1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。 +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (来自 Nanjing University, The University of Hong Kong etc.) 伴随论文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 由 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 09f70cb89e95c7..781c412ca2a1db 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -396,6 +396,7 @@ conda install -c huggingface transformers 1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. 
**[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](https://huggingface.co/docs/transformers/main/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 70795f10f123ae..745065717e4cdd 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -584,6 +584,8 @@ title: MMS - local: model_doc/musicgen title: MusicGen + - local: model_doc/pop2piano + title: Pop2Piano - local: model_doc/sew title: SEW - local: model_doc/sew-d diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 85eea6338085d2..e1c346971f386e 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -200,6 +200,7 @@ The documentation is organized into five sections: 1. **[Pix2Struct](model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. 1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. 1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng. +1. **[Pop2Piano](model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. 1. 
**[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. @@ -415,6 +416,7 @@ Flax), PyTorch, and/or TensorFlow. | Pix2Struct | ✅ | ❌ | ❌ | | PLBart | ✅ | ❌ | ❌ | | PoolFormer | ✅ | ❌ | ❌ | +| Pop2Piano | ✅ | ❌ | ❌ | | ProphetNet | ✅ | ❌ | ❌ | | PVT | ✅ | ❌ | ❌ | | QDQBert | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md new file mode 100644 index 00000000000000..d582b163874e60 --- /dev/null +++ b/docs/source/en/model_doc/pop2piano.md @@ -0,0 +1,190 @@ + + +# Pop2Piano + +## Overview + +The Pop2Piano model was proposed in [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee. + +Piano covers of pop music are widely enjoyed, but generating them from music is not a trivial task. It requires great +expertise with playing piano as well as knowing different characteristics and melodies of a song. With Pop2Piano you +can directly generate a cover from a song's audio waveform. It is the first model to directly generate a piano cover +from pop audio without melody and chord extraction modules. + +Pop2Piano is an encoder-decoder Transformer model based on [T5](https://arxiv.org/pdf/1910.10683.pdf). The input audio +is transformed to its waveform and passed to the encoder, which transforms it to a latent representation. The decoder +uses these latent representations to generate token ids in an autoregressive way. Each token id corresponds to one of four +different token types: time, velocity, note and 'special'. The token ids are then decoded to their equivalent MIDI file. + + +The abstract from the paper is the following: + +*Piano covers of pop music are enjoyed by many people. However, the +task of automatically generating piano covers of pop music is still +understudied. This is partly due to the lack of synchronized +{Pop, Piano Cover} data pairs, which made it challenging to apply +the latest data-intensive deep learning-based methods. To leverage +the power of the data-driven approach, we make a large amount of +paired and synchronized {Pop, Piano Cover} data using an automated +pipeline. In this paper, we present Pop2Piano, a Transformer network +that generates piano covers given waveforms of pop music. To the best +of our knowledge, this is the first model to generate a piano cover +directly from pop audio without using melody and chord extraction +modules. We show that Pop2Piano, trained with our dataset, is capable +of producing plausible piano covers.* + + +Tips: + +1. 
To use Pop2Piano, you will need to install the 🤗 Transformers library, as well as the following third-party modules: +``` +pip install pretty-midi==0.2.9 essentia==2.1b6.dev1034 librosa scipy +``` +Please note that you may need to restart your runtime after installation. +2. Pop2Piano is an encoder-decoder model like T5. +3. Pop2Piano can be used to generate MIDI audio files for a given audio sequence. +4. Choosing different composers in `Pop2PianoForConditionalGeneration.generate()` can lead to a variety of different results. +5. Setting the sampling rate to 44.1 kHz when loading the audio file can give good performance. +6. Though Pop2Piano was mainly trained on Korean Pop music, it also does pretty well on other Western Pop or Hip Hop songs. + +This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). +The original code can be found [here](https://github.com/sweetcocoa/pop2piano). + +## Examples + +- Example using HuggingFace Dataset: + +```python +>>> from datasets import load_dataset +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") +>>> ds = load_dataset("sweetcocoa/pop2piano_ci", split="test") + +>>> inputs = processor( +... audio=ds["audio"][0]["array"], sampling_rate=ds["audio"][0]["sampling_rate"], return_tensors="pt" +... ) +>>> model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"][0] +>>> tokenizer_output.write("./Outputs/midi_output.mid") +``` + +- Example using your own audio file: + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> audio, sr = librosa.load("", sr=44100) # feel free to change the sr to a suitable value. +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt") +>>> model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"][0] +>>> tokenizer_output.write("./Outputs/midi_output.mid") +``` + +- Example of processing multiple audio files in batch: + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor + +>>> # feel free to change the sr to a suitable value. +>>> audio1, sr1 = librosa.load("", sr=44100) +>>> audio2, sr2 = librosa.load("", sr=44100) +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = processor(audio=[audio1, audio2], sampling_rate=[sr1, sr2], return_attention_mask=True, return_tensors="pt") +>>> # Since we are now generating in batch (2 audios), we must pass the attention_mask +>>> model_output = model.generate( +... input_features=inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... composer="composer1", +... ) +>>> tokenizer_output = processor.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... 
)["pretty_midi_objects"] + +>>> # Since we now have 2 generated MIDI files +>>> tokenizer_output[0].write("./Outputs/midi_output1.mid") +>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") +``` + + +- Example of processing multiple audio files in batch (Using `Pop2PianoFeatureExtractor` and `Pop2PianoTokenizer`): + +```python +>>> import librosa +>>> from transformers import Pop2PianoForConditionalGeneration, Pop2PianoFeatureExtractor, Pop2PianoTokenizer + +>>> # feel free to change the sr to a suitable value. +>>> audio1, sr1 = librosa.load("", sr=44100) +>>> audio2, sr2 = librosa.load("", sr=44100) +>>> model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +>>> feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano") +>>> tokenizer = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano") + +>>> inputs = feature_extractor( +... audio=[audio1, audio2], +... sampling_rate=[sr1, sr2], +... return_attention_mask=True, +... return_tensors="pt", +... ) +>>> # Since we now generating in batch(2 audios) we must pass the attention_mask +>>> model_output = model.generate( +... input_features=inputs["input_features"], +... attention_mask=inputs["attention_mask"], +... composer="composer1", +... ) +>>> tokenizer_output = tokenizer.batch_decode( +... token_ids=model_output, feature_extractor_output=inputs +... )["pretty_midi_objects"] + +>>> # Since we now have 2 generated MIDI files +>>> tokenizer_output[0].write("./Outputs/midi_output1.mid") +>>> tokenizer_output[1].write("./Outputs/midi_output2.mid") +``` + + +## Pop2PianoConfig + +[[autodoc]] Pop2PianoConfig + +## Pop2PianoFeatureExtractor + +[[autodoc]] Pop2PianoFeatureExtractor + - __call__ + +## Pop2PianoForConditionalGeneration + +[[autodoc]] Pop2PianoForConditionalGeneration + - forward + - generate + +## Pop2PianoTokenizer + +[[autodoc]] Pop2PianoTokenizer + - __call__ + +## Pop2PianoProcessor + +[[autodoc]] Pop2PianoProcessor + - __call__ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c9ff16c47d8105..9b95aadffccc6f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -28,8 +28,12 @@ OptionalDependencyNotAvailable, _LazyModule, is_bitsandbytes_available, + is_essentia_available, is_flax_available, is_keras_nlp_available, + is_librosa_available, + is_pretty_midi_available, + is_scipy_available, is_sentencepiece_available, is_speech_available, is_tensorflow_text_available, @@ -475,6 +479,10 @@ ], "models.plbart": ["PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "PLBartConfig"], "models.poolformer": ["POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PoolFormerConfig"], + "models.pop2piano": [ + "POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "Pop2PianoConfig", + ], "models.prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig", "ProphetNetTokenizer"], "models.pvt": ["PVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "PvtConfig"], "models.qdqbert": ["QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "QDQBertConfig"], @@ -2430,6 +2438,13 @@ "PoolFormerPreTrainedModel", ] ) + _import_structure["models.pop2piano"].extend( + [ + "POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST", + "Pop2PianoForConditionalGeneration", + "Pop2PianoPreTrainedModel", + ] + ) _import_structure["models.prophetnet"].extend( [ "PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3783,6 +3798,29 @@ _import_structure["trainer_tf"] = ["TFTrainer"] +try: + if not ( + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and 
is_pretty_midi_available() + ): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects + + _import_structure["utils.dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects"] = [ + name + for name in dir(dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects) + if not name.startswith("_") + ] +else: + _import_structure["models.pop2piano"].append("Pop2PianoFeatureExtractor") + _import_structure["models.pop2piano"].append("Pop2PianoTokenizer") + _import_structure["models.pop2piano"].append("Pop2PianoProcessor") + + # FLAX-backed objects try: if not is_flax_available(): @@ -4478,6 +4516,10 @@ ) from .models.plbart import PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP, PLBartConfig from .models.poolformer import POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, PoolFormerConfig + from .models.pop2piano import ( + POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP, + Pop2PianoConfig, + ) from .models.prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig, ProphetNetTokenizer from .models.pvt import PVT_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtConfig from .models.qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig @@ -6122,6 +6164,11 @@ PoolFormerModel, PoolFormerPreTrainedModel, ) + from .models.pop2piano import ( + POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST, + Pop2PianoForConditionalGeneration, + Pop2PianoPreTrainedModel, + ) from .models.prophetnet import ( PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, ProphetNetDecoder, @@ -7212,6 +7259,20 @@ # Trainer from .trainer_tf import TFTrainer + try: + if not ( + is_librosa_available() + and is_essentia_available() + and is_scipy_available() + and is_torch_available() + and is_pretty_midi_available() + ): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects import * + else: + from .models.pop2piano import Pop2PianoFeatureExtractor, Pop2PianoProcessor, Pop2PianoTokenizer + try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index c0aa474f3a4e9a..8f60447e7319f9 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -157,6 +157,7 @@ pix2struct, plbart, poolformer, + pop2piano, prophetnet, pvt, qdqbert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0bc71e294d2240..a345235951d48c 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -162,6 +162,7 @@ ("pix2struct", "Pix2StructConfig"), ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), + ("pop2piano", "Pop2PianoConfig"), ("prophetnet", "ProphetNetConfig"), ("pvt", "PvtConfig"), ("qdqbert", "QDQBertConfig"), @@ -361,6 +362,7 @@ ("pix2struct", "PIX2STRUCT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("plbart", "PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("poolformer", "POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("pop2piano", "POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("prophetnet", "PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pvt", "PVT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("qdqbert", "QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -578,6 +580,7 @@ ("pix2struct", "Pix2Struct"), ("plbart", "PLBart"), ("poolformer", "PoolFormer"), + ("pop2piano", "Pop2Piano"), 
("prophetnet", "ProphetNet"), ("pvt", "PVT"), ("qdqbert", "QDQBert"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9b3ab2b1705b5d..90ece37c657191 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -73,6 +73,7 @@ ("owlvit", "OwlViTFeatureExtractor"), ("perceiver", "PerceiverFeatureExtractor"), ("poolformer", "PoolFormerFeatureExtractor"), + ("pop2piano", "Pop2PianoFeatureExtractor"), ("regnet", "ConvNextFeatureExtractor"), ("resnet", "ConvNextFeatureExtractor"), ("segformer", "SegformerFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index f42f5b12dcc5b0..8be38bb3f8d577 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -346,6 +346,7 @@ ("openai-gpt", "OpenAIGPTLMHeadModel"), ("pegasus_x", "PegasusXForConditionalGeneration"), ("plbart", "PLBartForConditionalGeneration"), + ("pop2piano", "Pop2PianoForConditionalGeneration"), ("qdqbert", "QDQBertForMaskedLM"), ("reformer", "ReformerModelWithLMHead"), ("rembert", "RemBertForMaskedLM"), @@ -670,6 +671,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( [ + ("pop2piano", "Pop2PianoForConditionalGeneration"), ("speech-encoder-decoder", "SpeechEncoderDecoderModel"), ("speech_to_text", "Speech2TextForConditionalGeneration"), ("speecht5", "SpeechT5ForSpeechToText"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index dcd5d9ddf6bc1c..58a8904b5134af 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -67,6 +67,7 @@ ("oneformer", "OneFormerProcessor"), ("owlvit", "OwlViTProcessor"), ("pix2struct", "Pix2StructProcessor"), + ("pop2piano", "Pop2PianoProcessor"), ("sam", "SamProcessor"), ("sew", "Wav2Vec2Processor"), ("sew-d", "Wav2Vec2Processor"), diff --git a/src/transformers/models/pop2piano/__init__.py b/src/transformers/models/pop2piano/__init__.py new file mode 100644 index 00000000000000..08b1e732b7df89 --- /dev/null +++ b/src/transformers/models/pop2piano/__init__.py @@ -0,0 +1,122 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_essentia_available, + is_librosa_available, + is_pretty_midi_available, + is_scipy_available, + is_torch_available, +) + + +_import_structure = { + "configuration_pop2piano": ["POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP", "Pop2PianoConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_pop2piano"] = [ + "POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST", + "Pop2PianoForConditionalGeneration", + "Pop2PianoPreTrainedModel", + ] + +try: + if not (is_librosa_available() and is_essentia_available() and is_scipy_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_pop2piano"] = ["Pop2PianoFeatureExtractor"] + +try: + if not (is_pretty_midi_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_pop2piano"] = ["Pop2PianoTokenizer"] + +try: + if not ( + is_pretty_midi_available() + and is_torch_available() + and is_librosa_available() + and is_essentia_available() + and is_scipy_available() + ): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["processing_pop2piano"] = ["Pop2PianoProcessor"] + + +if TYPE_CHECKING: + from .configuration_pop2piano import POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP, Pop2PianoConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_pop2piano import ( + POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST, + Pop2PianoForConditionalGeneration, + Pop2PianoPreTrainedModel, + ) + + try: + if not (is_librosa_available() and is_essentia_available() and is_scipy_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_pop2piano import Pop2PianoFeatureExtractor + + try: + if not (is_pretty_midi_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_pop2piano import Pop2PianoTokenizer + + try: + if not ( + is_pretty_midi_available() + and is_torch_available() + and is_librosa_available() + and is_essentia_available() + and is_scipy_available() + ): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .processing_pop2piano import Pop2PianoProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/pop2piano/configuration_pop2piano.py b/src/transformers/models/pop2piano/configuration_pop2piano.py new file mode 100644 index 00000000000000..15bf1ac438dd43 --- /dev/null +++ b/src/transformers/models/pop2piano/configuration_pop2piano.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Pop2Piano model configuration""" + + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "sweetcocoa/pop2piano": "https://huggingface.co/sweetcocoa/pop2piano/blob/main/config.json" +} + + +class Pop2PianoConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Pop2PianoForConditionalGeneration`]. It is used + to instantiate a Pop2PianoForConditionalGeneration model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + Pop2Piano [sweetcocoa/pop2piano](https://huggingface.co/sweetcocoa/pop2piano) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 2400): + Vocabulary size of the `Pop2PianoForConditionalGeneration` model. Defines the number of different tokens + that can be represented by the `inputs_ids` passed when calling [`Pop2PianoForConditionalGeneration`]. + composer_vocab_size (`int`, *optional*, defaults to 21): + Denotes the number of composers. + d_model (`int`, *optional*, defaults to 512): + Size of the encoder layers and the pooler layer. + d_kv (`int`, *optional*, defaults to 64): + Size of the key, query, value projections per attention head. The `inner_dim` of the projection layer will + be defined as `num_heads * d_kv`. + d_ff (`int`, *optional*, defaults to 2048): + Size of the intermediate feed forward layer in each `Pop2PianoBlock`. + num_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_decoder_layers (`int`, *optional*): + Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set. + num_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + relative_attention_max_distance (`int`, *optional*, defaults to 128): + The maximum distance of the longer sequences for the bucket separation. + dropout_rate (`float`, *optional*, defaults to 0.1): + The ratio for all dropout layers. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization + testing). + feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`): + Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ dense_act_fn (`string`, *optional*, defaults to `"relu"`): + Type of Activation Function to be used in `Pop2PianoDenseActDense` and in `Pop2PianoDenseGatedActDense`. + """ + + model_type = "pop2piano" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=2400, + composer_vocab_size=21, + d_model=512, + d_kv=64, + d_ff=2048, + num_layers=6, + num_decoder_layers=None, + num_heads=8, + relative_attention_num_buckets=32, + relative_attention_max_distance=128, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, + initializer_factor=1.0, + feed_forward_proj="gated-gelu", # noqa + is_encoder_decoder=True, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + dense_act_fn="relu", + **kwargs, + ): + self.vocab_size = vocab_size + self.composer_vocab_size = composer_vocab_size + self.d_model = d_model + self.d_kv = d_kv + self.d_ff = d_ff + self.num_layers = num_layers + self.num_decoder_layers = num_decoder_layers if num_decoder_layers is not None else self.num_layers + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_factor = initializer_factor + self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache + self.dense_act_fn = dense_act_fn + self.is_gated_act = self.feed_forward_proj.split("-")[0] == "gated" + self.hidden_size = self.d_model + self.num_attention_heads = num_heads + self.num_hidden_layers = num_layers + + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) diff --git a/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py b/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py new file mode 100644 index 00000000000000..a73c57886da96e --- /dev/null +++ b/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py @@ -0,0 +1,190 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" File for loading the Pop2Piano model weights from the official repository and to show how tokenizer vocab was + constructed""" + +import json + +import torch + +from transformers import Pop2PianoConfig, Pop2PianoForConditionalGeneration + + +########################## MODEL WEIGHTS ########################## + +# This weights were downloaded from the official pop2piano repository +# https://huggingface.co/sweetcocoa/pop2piano/blob/main/model-1999-val_0.67311615.ckpt +official_weights = torch.load("./model-1999-val_0.67311615.ckpt") +state_dict = {} + + +# load the config and init the model +cfg = Pop2PianoConfig.from_pretrained("sweetcocoa/pop2piano") +model = Pop2PianoForConditionalGeneration(cfg) + + +# load relative attention bias +state_dict["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = official_weights["state_dict"][ + "transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" +] +state_dict["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = official_weights["state_dict"][ + "transformer.decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" +] + +# load embed tokens and final layer norm for both encoder and decoder +state_dict["encoder.embed_tokens.weight"] = official_weights["state_dict"]["transformer.encoder.embed_tokens.weight"] +state_dict["decoder.embed_tokens.weight"] = official_weights["state_dict"]["transformer.decoder.embed_tokens.weight"] + +state_dict["encoder.final_layer_norm.weight"] = official_weights["state_dict"][ + "transformer.encoder.final_layer_norm.weight" +] +state_dict["decoder.final_layer_norm.weight"] = official_weights["state_dict"][ + "transformer.decoder.final_layer_norm.weight" +] + +# load lm_head, mel_conditioner.emb and shared +state_dict["lm_head.weight"] = official_weights["state_dict"]["transformer.lm_head.weight"] +state_dict["mel_conditioner.embedding.weight"] = official_weights["state_dict"]["mel_conditioner.embedding.weight"] +state_dict["shared.weight"] = official_weights["state_dict"]["transformer.shared.weight"] + +# load each encoder blocks +for i in range(cfg.num_layers): + # layer 0 + state_dict[f"encoder.block.{i}.layer.0.SelfAttention.q.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.0.SelfAttention.q.weight" + ] + state_dict[f"encoder.block.{i}.layer.0.SelfAttention.k.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.0.SelfAttention.k.weight" + ] + state_dict[f"encoder.block.{i}.layer.0.SelfAttention.v.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.0.SelfAttention.v.weight" + ] + state_dict[f"encoder.block.{i}.layer.0.SelfAttention.o.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.0.SelfAttention.o.weight" + ] + state_dict[f"encoder.block.{i}.layer.0.layer_norm.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.0.layer_norm.weight" + ] + + # layer 1 + state_dict[f"encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight" + ] + state_dict[f"encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight" + ] + state_dict[f"encoder.block.{i}.layer.1.DenseReluDense.wo.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.1.DenseReluDense.wo.weight" + ] + 
state_dict[f"encoder.block.{i}.layer.1.layer_norm.weight"] = official_weights["state_dict"][ + f"transformer.encoder.block.{i}.layer.1.layer_norm.weight" + ] + +# load each decoder blocks +for i in range(6): + # layer 0 + state_dict[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.0.SelfAttention.q.weight" + ] + state_dict[f"decoder.block.{i}.layer.0.SelfAttention.k.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.0.SelfAttention.k.weight" + ] + state_dict[f"decoder.block.{i}.layer.0.SelfAttention.v.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.0.SelfAttention.v.weight" + ] + state_dict[f"decoder.block.{i}.layer.0.SelfAttention.o.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.0.SelfAttention.o.weight" + ] + state_dict[f"decoder.block.{i}.layer.0.layer_norm.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.0.layer_norm.weight" + ] + + # layer 1 + state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.q.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.1.EncDecAttention.q.weight" + ] + state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.k.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.1.EncDecAttention.k.weight" + ] + state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.v.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.1.EncDecAttention.v.weight" + ] + state_dict[f"decoder.block.{i}.layer.1.EncDecAttention.o.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.1.EncDecAttention.o.weight" + ] + state_dict[f"decoder.block.{i}.layer.1.layer_norm.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.1.layer_norm.weight" + ] + + # layer 2 + state_dict[f"decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight" + ] + state_dict[f"decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight" + ] + state_dict[f"decoder.block.{i}.layer.2.DenseReluDense.wo.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.2.DenseReluDense.wo.weight" + ] + state_dict[f"decoder.block.{i}.layer.2.layer_norm.weight"] = official_weights["state_dict"][ + f"transformer.decoder.block.{i}.layer.2.layer_norm.weight" + ] + +model.load_state_dict(state_dict, strict=True) + +# save the weights +torch.save(state_dict, "./pytorch_model.bin") + +########################## TOKENIZER ########################## + +# the tokenize and detokenize methods are taken from the official implementation + + +# link : https://github.com/sweetcocoa/pop2piano/blob/fac11e8dcfc73487513f4588e8d0c22a22f2fdc5/midi_tokenizer.py#L34 +def tokenize(idx, token_type, n_special=4, n_note=128, n_velocity=2): + if token_type == "TOKEN_TIME": + return n_special + n_note + n_velocity + idx + elif token_type == "TOKEN_VELOCITY": + return n_special + n_note + idx + elif token_type == "TOKEN_NOTE": + return n_special + idx + elif token_type == "TOKEN_SPECIAL": + return idx + else: + return -1 + + +# link : https://github.com/sweetcocoa/pop2piano/blob/fac11e8dcfc73487513f4588e8d0c22a22f2fdc5/midi_tokenizer.py#L48 +def detokenize(idx, n_special=4, n_note=128, 
n_velocity=2, time_idx_offset=0): + if idx >= n_special + n_note + n_velocity: + return "TOKEN_TIME", (idx - (n_special + n_note + n_velocity)) + time_idx_offset + elif idx >= n_special + n_note: + return "TOKEN_VELOCITY", idx - (n_special + n_note) + elif idx >= n_special: + return "TOKEN_NOTE", idx - n_special + else: + return "TOKEN_SPECIAL", idx + + +# crate the decoder and then the encoder of the tokenizer +decoder = {} +for i in range(cfg.vocab_size): + decoder.update({i: f"{detokenize(i)[1]}_{detokenize(i)[0]}"}) + +encoder = {v: k for k, v in decoder.items()} + +# save the vocab +with open("./vocab.json", "w") as file: + file.write(json.dumps(encoder)) diff --git a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py new file mode 100644 index 00000000000000..d0e50b1c17301e --- /dev/null +++ b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py @@ -0,0 +1,463 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Feature extractor class for Pop2Piano""" + +import copy +import warnings +from typing import List, Optional, Union + +import numpy +import numpy as np + +from ...audio_utils import mel_filter_bank, spectrogram +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import ( + TensorType, + is_essentia_available, + is_librosa_available, + is_scipy_available, + logging, + requires_backends, +) + + +if is_essentia_available(): + import essentia + import essentia.standard + +if is_librosa_available(): + import librosa + +if is_scipy_available(): + import scipy + + +logger = logging.get_logger(__name__) + + +class Pop2PianoFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a Pop2Piano feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed + to `RhythmExtractor2013` algorithm which extracts the beat_times, beat positions and estimates their confidence as + well as tempo in bpm, then beat_times is interpolated and to get beatsteps. Later we calculate + extrapolated_beatsteps from it to be used in tokenizer. On the other hand audio is resampled to self.sampling_rate + and preprocessed and then log mel spectogram is computed from that to be used in our transformer model. + + Args: + sampling_rate (`int`, *optional*, defaults to 22050): + Target Sampling rate of audio signal. It's the sampling rate that we forward to the model. + padding_value (`int`, *optional*, defaults to 0): + Padding value used to pad the audio. Should correspond to silences. 
+ window_size (`int`, *optional*, defaults to 4096): + Length of the window in samples to which the Fourier transform is applied. + hop_length (`int`, *optional*, defaults to 1024): + Step size between each window of the waveform, in samples. + min_frequency (`float`, *optional*, defaults to 10.0): + Lowest frequency that will be used in the log-mel spectrogram. + feature_size (`int`, *optional*, defaults to 512): + The feature dimension of the extracted features. + num_bars (`int`, *optional*, defaults to 2): + Determines interval between each sequence. + """ + model_input_names = ["input_features", "beatsteps", "extrapolated_beatstep"] + + def __init__( + self, + sampling_rate: int = 22050, + padding_value: int = 0, + window_size: int = 4096, + hop_length: int = 1024, + min_frequency: float = 10.0, + feature_size: int = 512, + num_bars: int = 2, + **kwargs, + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + **kwargs, + ) + self.sampling_rate = sampling_rate + self.padding_value = padding_value + self.window_size = window_size + self.hop_length = hop_length + self.min_frequency = min_frequency + self.feature_size = feature_size + self.num_bars = num_bars + self.mel_filters = mel_filter_bank( + num_frequency_bins=(self.window_size // 2) + 1, + num_mel_filters=self.feature_size, + min_frequency=self.min_frequency, + max_frequency=float(self.sampling_rate // 2), + sampling_rate=self.sampling_rate, + norm=None, + mel_scale="htk", + ) + + def mel_spectrogram(self, sequence: np.ndarray): + """ + Generates MelSpectrogram. + + Args: + sequence (`numpy.ndarray`): + The sequence of which the mel-spectrogram will be computed. + """ + mel_specs = [] + for seq in sequence: + window = np.hanning(self.window_size + 1)[:-1] + mel_specs.append( + spectrogram( + waveform=seq, + window=window, + frame_length=self.window_size, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters, + ) + ) + mel_specs = np.array(mel_specs) + + return mel_specs + + def extract_rhythm(self, audio: np.ndarray): + """ + This algorithm(`RhythmExtractor2013`) extracts the beat positions and estimates their confidence as well as + tempo in bpm for an audio signal. For more information please visit + https://essentia.upf.edu/reference/std_RhythmExtractor2013.html . + + Args: + audio(`numpy.ndarray`): + raw audio waveform which is passed to the Rhythm Extractor. + """ + requires_backends(self, ["essentia"]) + essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature") + bpm, beat_times, confidence, estimates, essentia_beat_intervals = essentia_tracker(audio) + + return bpm, beat_times, confidence, estimates, essentia_beat_intervals + + def interpolate_beat_times( + self, beat_times: numpy.ndarray, steps_per_beat: numpy.ndarray, n_extend: numpy.ndarray + ): + """ + This method takes beat_times and then interpolates that using `scipy.interpolate.interp1d` and the output is + then used to convert raw audio to log-mel-spectrogram. + + Args: + beat_times (`numpy.ndarray`): + beat_times is passed into `scipy.interpolate.interp1d` for processing. + steps_per_beat (`int`): + used as an parameter to control the interpolation. + n_extend (`int`): + used as an parameter to control the interpolation. 
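+ Together, `steps_per_beat` and `n_extend` set how many interpolated values are returned (`beat_times.size * steps_per_beat + n_extend`); points past the last beat are extrapolated.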
+ """ + + requires_backends(self, ["scipy"]) + beat_times_function = scipy.interpolate.interp1d( + np.arange(beat_times.size), + beat_times, + bounds_error=False, + fill_value="extrapolate", + ) + + ext_beats = beat_times_function( + np.linspace(0, beat_times.size + n_extend - 1, beat_times.size * steps_per_beat + n_extend) + ) + + return ext_beats + + def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray): + """ + Preprocessing for log-mel-spectrogram + + Args: + audio (`numpy.ndarray` of shape `(audio_length, )` ): + Raw audio waveform to be processed. + beatstep (`numpy.ndarray`): + Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by + the value at beatstep[0]. + """ + + if audio is not None and len(audio.shape) != 1: + raise ValueError( + f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}." + ) + if beatstep[0] > 0.0: + beatstep = beatstep - beatstep[0] + + num_steps = self.num_bars * 4 + num_target_steps = len(beatstep) + extrapolated_beatstep = self.interpolate_beat_times( + beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1 + ) + + sample_indices = [] + max_feature_length = 0 + for i in range(0, num_target_steps, num_steps): + start_idx = i + end_idx = min(i + num_steps, num_target_steps) + start_sample = int(extrapolated_beatstep[start_idx] * self.sampling_rate) + end_sample = int(extrapolated_beatstep[end_idx] * self.sampling_rate) + sample_indices.append((start_sample, end_sample)) + max_feature_length = max(max_feature_length, end_sample - start_sample) + padded_batch = [] + for start_sample, end_sample in sample_indices: + feature = audio[start_sample:end_sample] + padded_feature = np.pad( + feature, + ((0, max_feature_length - feature.shape[0]),), + "constant", + constant_values=0, + ) + padded_batch.append(padded_feature) + + padded_batch = np.asarray(padded_batch) + return padded_batch, extrapolated_beatstep + + def _pad(self, features: np.ndarray, add_zero_line=True): + features_shapes = [each_feature.shape for each_feature in features] + attention_masks, padded_features = [], [] + for i, each_feature in enumerate(features): + # To pad "input_features". + if len(each_feature.shape) == 3: + features_pad_value = max([*zip(*features_shapes)][1]) - features_shapes[i][1] + attention_mask = np.ones(features_shapes[i][:2], dtype=np.int64) + feature_padding = ((0, 0), (0, features_pad_value), (0, 0)) + attention_mask_padding = (feature_padding[0], feature_padding[1]) + + # To pad "beatsteps" and "extrapolated_beatstep". 
+ else: + each_feature = each_feature.reshape(1, -1) + features_pad_value = max([*zip(*features_shapes)][0]) - features_shapes[i][0] + attention_mask = np.ones(features_shapes[i], dtype=np.int64).reshape(1, -1) + feature_padding = attention_mask_padding = ((0, 0), (0, features_pad_value)) + + each_padded_feature = np.pad(each_feature, feature_padding, "constant", constant_values=self.padding_value) + attention_mask = np.pad( + attention_mask, attention_mask_padding, "constant", constant_values=self.padding_value + ) + + if add_zero_line: + # if it is batched then we seperate each examples using zero array + zero_array_len = max([*zip(*features_shapes)][1]) + + # we concatenate the zero array line here + each_padded_feature = np.concatenate( + [each_padded_feature, np.zeros([1, zero_array_len, self.feature_size])], axis=0 + ) + attention_mask = np.concatenate( + [attention_mask, np.zeros([1, zero_array_len], dtype=attention_mask.dtype)], axis=0 + ) + + padded_features.append(each_padded_feature) + attention_masks.append(attention_mask) + + padded_features = np.concatenate(padded_features, axis=0).astype(np.float32) + attention_masks = np.concatenate(attention_masks, axis=0).astype(np.int64) + + return padded_features, attention_masks + + def pad( + self, + inputs: BatchFeature, + is_batched: bool, + return_attention_mask: bool, + return_tensors: Optional[Union[str, TensorType]] = None, + ): + """ + Pads the inputs to same length and returns attention_mask. + + Args: + inputs (`BatchFeature`): + Processed audio features. + is_batched (`bool`): + Whether inputs are batched or not. + return_attention_mask (`bool`): + Whether to return attention mask or not. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + If nothing is specified, it will return list of `np.ndarray` arrays. + Return: + `BatchFeature` with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added + to it: + - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` -- + Example : + 1, 1, 1, 0, 0 (audio 1, also here it is padded to max length of 5 thats why there are 2 zeros at + the end indicating they are padded) + + 0, 0, 0, 0, 0 (zero pad to seperate audio 1 and 2) + + 1, 1, 1, 1, 1 (audio 2) + + 0, 0, 0, 0, 0 (zero pad to seperate audio 2 and 3) + + 1, 1, 1, 1, 1 (audio 3) + - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)` + - **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size, + max_extrapolated_beatstep_seq_length)` + """ + + processed_features_dict = {} + for feature_name, feature_value in inputs.items(): + if feature_name == "input_features": + padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=True) + processed_features_dict[feature_name] = padded_feature_values + if return_attention_mask: + processed_features_dict["attention_mask"] = attention_mask + else: + padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=False) + processed_features_dict[feature_name] = padded_feature_values + if return_attention_mask: + processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask + + # If we are processing only one example, we should remove the zero array line since we don't need it to + # seperate examples from each other. 
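+ # (the "zero array line" is the extra all-zeros row appended by `_pad(..., add_zero_line=True)`, so the last row is simply dropped)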
+ if not is_batched and not return_attention_mask: + processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...] + + outputs = BatchFeature(processed_features_dict, tensor_type=return_tensors) + + return outputs + + def __call__( + self, + audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + sampling_rate: Union[int, List[int]], + steps_per_beat: int = 2, + resample: Optional[bool] = True, + return_attention_mask: Optional[bool] = False, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model. + + Args: + audio (`np.ndarray`, `List`): + The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a + list of numpy arrays or a list of list of float values. + sampling_rate (`int`): + The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + steps_per_beat (`int`, *optional*, defaults to 2): + This is used in interpolating `beat_times`. + resample (`bool`, *optional*, defaults to `True`): + Determines whether to resample the audio to `sampling_rate` or not before processing. Must be True + during inference. + return_attention_mask (`bool` *optional*, defaults to `False`): + Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as + output or not. Automatically set to True for batched inputs. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + If nothing is specified, it will return list of `np.ndarray` arrays. + """ + + requires_backends(self, ["librosa"]) + is_batched = bool(isinstance(audio, (list, tuple)) and isinstance(audio[0], (np.ndarray, tuple, list))) + if is_batched: + # This enables the user to process files of different sampling_rate at same time + if not isinstance(sampling_rate, list): + raise ValueError( + "Please give sampling_rate of each audio separately when you are passing multiple raw_audios at the same time. " + f"Received {sampling_rate}, expected [audio_1_sr, ..., audio_n_sr]." + ) + return_attention_mask = True if return_attention_mask is None else return_attention_mask + else: + audio = [audio] + sampling_rate = [sampling_rate] + return_attention_mask = False if return_attention_mask is None else return_attention_mask + + batch_input_features, batch_beatsteps, batch_ext_beatstep = [], [], [] + for single_raw_audio, single_sampling_rate in zip(audio, sampling_rate): + bpm, beat_times, confidence, estimates, essentia_beat_intervals = self.extract_rhythm( + audio=single_raw_audio + ) + beatsteps = self.interpolate_beat_times(beat_times=beat_times, steps_per_beat=steps_per_beat, n_extend=1) + + if self.sampling_rate != single_sampling_rate and self.sampling_rate is not None: + if resample: + # Change sampling_rate to self.sampling_rate + single_raw_audio = librosa.core.resample( + single_raw_audio, + orig_sr=single_sampling_rate, + target_sr=self.sampling_rate, + res_type="kaiser_best", + ) + else: + warnings.warn( + f"The sampling_rate of the provided audio is different from the target sampling_rate" + f"of the Feature Extractor, {self.sampling_rate} vs {single_sampling_rate}. 
" + f"In these cases it is recommended to use `resample=True` in the `__call__` method to" + f"get the optimal behaviour." + ) + + single_sampling_rate = self.sampling_rate + start_sample = int(beatsteps[0] * single_sampling_rate) + end_sample = int(beatsteps[-1] * single_sampling_rate) + + input_features, extrapolated_beatstep = self.preprocess_mel( + single_raw_audio[start_sample:end_sample], beatsteps - beatsteps[0] + ) + + mel_specs = self.mel_spectrogram(input_features.astype(np.float32)) + + # apply np.log to get log mel-spectrograms + log_mel_specs = np.log(np.clip(mel_specs, a_min=1e-6, a_max=None)) + + input_features = np.transpose(log_mel_specs, (0, -1, -2)) + + batch_input_features.append(input_features) + batch_beatsteps.append(beatsteps) + batch_ext_beatstep.append(extrapolated_beatstep) + + output = BatchFeature( + { + "input_features": batch_input_features, + "beatsteps": batch_beatsteps, + "extrapolated_beatstep": batch_ext_beatstep, + } + ) + + output = self.pad( + output, + is_batched=is_batched, + return_attention_mask=return_attention_mask, + return_tensors=return_tensors, + ) + + return output + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + if "mel_filters" in output: + del output["mel_filters"] + return output diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py new file mode 100644 index 00000000000000..5a67b8044b0999 --- /dev/null +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -0,0 +1,1377 @@ +# coding=utf-8 +# Copyright 2023 The Pop2Piano Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Pop2Piano model.""" + + +import copy +import math +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint + +from transformers.generation import GenerationConfig + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torch_fx_proxy, + logging, + replace_return_docstrings, +) +from .configuration_pop2piano import Pop2PianoConfig + + +logger = logging.get_logger(__name__) + +_load_pop2piano_layer_norm = True + +try: + from apex.normalization import FusedRMSNorm + + _load_pop2piano_layer_norm = False + + logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNorm") +except ImportError: + # using the normal Pop2PianoLayerNorm + pass +except Exception: + logger.warning("Discovered apex but it failed to load, falling back to Pop2PianoLayerNorm") + pass + + +_CONFIG_FOR_DOC = "Pop2PianoConfig" +_CHECKPOINT_FOR_DOC = "sweetcocoa/pop2piano" + +POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "sweetcocoa/pop2piano", + # See all Pop2Piano models at https://huggingface.co/models?filter=pop2piano +] + + +POP2PIANO_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Pop2Piano is a model with relative position embeddings + so you should be able to pad the inputs on both the right and the left. Indices can be obtained using + [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail. + [What are input IDs?](../glossary#input-ids) To know more on how to prepare `input_ids` for pretraining + take a look a [Pop2Pianp Training](./Pop2Piano#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using + [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + [What are decoder input IDs?](../glossary#decoder-input-ids) Pop2Piano uses the `pad_token_id` as the + starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last + `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the encoder. 
Mask values selected in `[0, + 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0, + 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in + `[0, 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at + the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Does the same task as `inputs_embeds`. If `inputs_embeds` is not present but `input_features` is present + then `input_features` will be considered as `inputs_embeds`. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If + `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value of + `inputs_embeds`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->Pop2Piano +class Pop2PianoLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # Pop2Piano uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +if not _load_pop2piano_layer_norm: + Pop2PianoLayerNorm = FusedRMSNorm # noqa + +ALL_LAYERNORM_LAYERS.append(Pop2PianoLayerNorm) + + +# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->Pop2Piano,t5->pop2piano +class Pop2PianoDenseActDense(nn.Module): + def __init__(self, config: Pop2PianoConfig): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + hidden_states = self.wo(hidden_states) + return hidden_states + + +# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->Pop2Piano +class Pop2PianoDenseGatedActDense(nn.Module): + def __init__(self, config: Pop2PianoConfig): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ACT2FN[config.dense_act_fn] + + def forward(self, hidden_states): + hidden_gelu = self.act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + + # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32. 
+ # See https://github.com/huggingface/transformers/issues/20287 + # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None`` + if ( + isinstance(self.wo.weight, torch.Tensor) + and hidden_states.dtype != self.wo.weight.dtype + and self.wo.weight.dtype != torch.int8 + ): + hidden_states = hidden_states.to(self.wo.weight.dtype) + + hidden_states = self.wo(hidden_states) + return hidden_states + + +# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->Pop2Piano +class Pop2PianoLayerFF(nn.Module): + def __init__(self, config: Pop2PianoConfig): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = Pop2PianoDenseGatedActDense(config) + else: + self.DenseReluDense = Pop2PianoDenseActDense(config) + + self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +# Copied from transformers.models.t5.modeling_t5.T5Attention with T5->Pop2Piano,t5->pop2piano +class Pop2PianoAttention(nn.Module): + def __init__(self, config: Pop2PianoConfig, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + self.gradient_checkpointing = False + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. 
If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length, device=None): + """Compute binned relative position bias""" + if device is None: + device = self.relative_attention_bias.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + if len(past_key_value) != 2: + raise ValueError( + f"past_key_value should have 2 past states: keys and values. 
Got { len(past_key_value)} past states" + ) + real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + elif past_key_value.shape[2] != key_value_states.shape[1]: + # checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None + ) + value_states = project( + hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + ) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + 
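+ # `unshape` above merged the heads back to (batch_size, seq_length, inner_dim); `o` projects that to d_model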
attn_output = self.o(attn_output) + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->Pop2Piano,t5->pop2piano +class Pop2PianoLayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = Pop2PianoAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->Pop2Piano,t5->pop2piano +class Pop2PianoLayerCrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.EncDecAttention = Pop2PianoAttention(config, has_relative_attention_bias=False) + self.layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.t5.modeling_t5.T5Block with T5->Pop2Piano,t5->pop2piano +class Pop2PianoBlock(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append(Pop2PianoLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(Pop2PianoLayerCrossAttention(config)) + + self.layer.append(Pop2PianoLayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key_value is not None: + if not 
self.is_decoder: + logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention 
position bias), (cross-attention weights) + + +class Pop2PianoPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Pop2PianoConfig + base_model_prefix = "transformer" + is_parallelizable = False + supports_gradient_checkpointing = True + _no_split_modules = ["Pop2PianoBlock"] + _keep_in_fp32_modules = ["wo"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, Pop2PianoLayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance(module, Pop2PianoConcatEmbeddingToMel): + module.embedding.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, Pop2PianoForConditionalGeneration): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: + module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, Pop2PianoDenseActDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, Pop2PianoDenseGatedActDense): + module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, Pop2PianoAttention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Pop2PianoAttention, Pop2PianoStack)): + module.gradient_checkpointing = value + + def 
_shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + if decoder_start_token_id is None: + raise ValueError( + "self.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class Pop2PianoStack(Pop2PianoPreTrainedModel): + # Copied from transformers.models.t5.modeling_t5.T5Stack.__init__ with T5->Pop2Piano,t5->pop2piano + def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + + self.block = nn.ModuleList( + [Pop2PianoBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = Pop2PianoLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + # Initialize weights and apply final processing + self.post_init() + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + # Copied from transformers.models.t5.modeling_t5.T5Stack.get_input_embeddings + def get_input_embeddings(self): + return self.embed_tokens + + # Copied from transformers.models.t5.modeling_t5.T5Stack.set_input_embeddings + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + if self.embed_tokens is None: + raise ValueError("You have to 
initialize the model with valid token embeddings") + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length + + if use_cache is True: + if not self.is_decoder: + raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder") + + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = torch.ones( + batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + ) + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return tuple(module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = checkpoint( + create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + layer_head_mask, + cross_attn_layer_head_mask, + None, # past_key_value is always None with gradient checkpointing + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + (present_key_value_state,) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + 
attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Pop2PianoConcatEmbeddingToMel(nn.Module): + """Embedding Matrix for `composer` tokens.""" + + def __init__(self, config): + super().__init__() + self.embedding = nn.Embedding(num_embeddings=config.composer_vocab_size, embedding_dim=config.d_model) + + def forward(self, feature, index_value, embedding_offset): + index_shifted = index_value - embedding_offset + composer_embedding = self.embedding(index_shifted).unsqueeze(1) + inputs_embeds = torch.cat([composer_embedding, feature], dim=1) + return inputs_embeds + + +Pop2Piano_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Pop2PianoConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings("""Pop2Piano Model with a `language modeling` head on top.""", Pop2Piano_START_DOCSTRING) +class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + + def __init__(self, config: Pop2PianoConfig): + super().__init__(config) + self.config = config + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + self.mel_conditioner = Pop2PianoConcatEmbeddingToMel(config) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + + self.encoder = Pop2PianoStack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = Pop2PianoStack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_mel_conditioner_outputs( + self, + input_features: torch.FloatTensor, + composer: str, + generation_config: GenerationConfig, + attention_mask: torch.FloatTensor = None, + ): + """ + This method is used to concatenate mel conditioner tokens at the front of the input_features in order to + control the type of MIDI token generated by the model. 
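+        The `composer` string is mapped to an id through `generation_config.composer_to_feature_token`, embedded by
+        `Pop2PianoConcatEmbeddingToMel` and prepended to `input_features`, so the returned features are one step longer
+        than the input and the attention mask (when provided) is extended accordingly.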
+ + Args: + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + input features extracted from the feature extractor. + composer (`str`): + composer token which determines the type of MIDI tokens to be generated. + generation_config (`~generation.GenerationConfig`): + The generation is used to get the composer-feature_token pair. + attention_mask (``, *optional*): + For batched generation `input_features` are padded to have the same shape across all examples. + `attention_mask` helps to determine which areas were padded and which were not. + - 1 for tokens that are **not padded**, + - 0 for tokens that are **padded**. + """ + composer_to_feature_token = generation_config.composer_to_feature_token + if composer not in composer_to_feature_token.keys(): + raise ValueError( + f"Please choose a composer from {list(composer_to_feature_token.keys())}. Composer received - {composer}" + ) + composer_value = composer_to_feature_token[composer] + composer_value = torch.tensor(composer_value, device=self.device) + composer_value = composer_value.repeat(input_features.shape[0]) + + embedding_offset = min(composer_to_feature_token.values()) + + input_features = self.mel_conditioner( + feature=input_features, + index_value=composer_value, + embedding_offset=embedding_offset, + ) + if attention_mask is not None: + input_features[~attention_mask[:, 0].bool()] = 0.0 + + # since self.mel_conditioner adds a new array at the front of inputs_embeds we need to do the same for attention_mask to keep the shapes same + attention_mask = torch.concatenate([attention_mask[:, 0].view(-1, 1), attention_mask], axis=1) + return input_features, attention_mask + + return input_features, None + + @add_start_docstrings_to_model_forward(POP2PIANO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., + config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for + labels in `[0, ..., config.vocab_size]` + Returns: + """ + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None and input_features is not None: + raise ValueError("Both `inputs_embeds` and `input_features` received! 
Please provide only one of them") + elif input_features is not None and inputs_embeds is None: + inputs_embeds = input_features + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_features, + attention_mask=None, + composer="composer1", + generation_config=None, + **kwargs, + ): + """ + Generates token ids for midi outputs. + + + + Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation + strategies and code examples, check out the [following guide](./generation_strategies). + + + + Parameters: + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`. 
+            attention_mask (*optional*):
+                For batched generation `input_features` are padded to have the same shape across all examples.
+                `attention_mask` helps to determine which areas were padded and which were not.
+                - 1 for tokens that are **not padded**,
+                - 0 for tokens that are **padded**.
+            composer (`str`, *optional*, defaults to `"composer1"`):
+                This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
+                `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
+                `generation_config`. For an example please see
+                https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which has the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            kwargs:
+                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.
+            Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+            [`~utils.ModelOutput`] types are:
+                - [`~generation.GreedySearchEncoderDecoderOutput`],
+                - [`~generation.SampleEncoderDecoderOutput`],
+                - [`~generation.BeamSearchEncoderDecoderOutput`],
+                - [`~generation.BeamSampleEncoderDecoderOutput`]
+        """
+
+        if generation_config is None:
+            generation_config = self.generation_config
+        generation_config.update(**kwargs)
+
+        # check for composer_to_feature_token
+        if not hasattr(generation_config, "composer_to_feature_token"):
+            raise ValueError(
+                "`composer_to_feature_token` was not found! Please refer to "
+                "https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json "
+                "and pass a dict like that."
+            )
+
+        if len(generation_config.composer_to_feature_token) != self.config.composer_vocab_size:
+            raise ValueError(
+                "config.composer_vocab_size must be the same as the number of keys in "
+                f"generation_config.composer_to_feature_token! "
+                f"Found {self.config.composer_vocab_size} vs {len(generation_config.composer_to_feature_token)}."
+            )
+
+        # to control the variation of generated MIDI tokens we concatenate mel-conditioner tokens (which depend on the
+        # composer token) at the front of input_features.
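+        # get_mel_conditioner_outputs prepends one composer embedding per example, so the conditioned input_features
+        # are one step longer than the raw features and the attention_mask (when provided) is extended to match.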
+ input_features, attention_mask = self.get_mel_conditioner_outputs( + input_features=input_features, + attention_mask=attention_mask, + composer=composer, + generation_config=generation_config, + ) + + return super().generate( + inputs=None, + inputs_embeds=input_features, + attention_mask=attention_mask, + generation_config=generation_config, + **kwargs, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past_key_values, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past_key_values is None: + logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") + return past_key_values + + reordered_decoder_past = () + for layer_past_states in past_key_values: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)), + ) + + if reordered_layer_past_states[0].shape != layer_past_states[0].shape: + raise ValueError( + f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched" + ) + if len(reordered_layer_past_states) != len(layer_past_states): + raise ValueError( + f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched" + ) + + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past diff --git a/src/transformers/models/pop2piano/processing_pop2piano.py b/src/transformers/models/pop2piano/processing_pop2piano.py new file mode 100644 index 00000000000000..5ea579111ddbcd --- /dev/null +++ b/src/transformers/models/pop2piano/processing_pop2piano.py @@ -0,0 +1,138 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Processor class for Pop2Piano.""" + +import os +from typing import List, Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...processing_utils import ProcessorMixin +from ...tokenization_utils import BatchEncoding, PaddingStrategy, TruncationStrategy +from ...utils import TensorType + + +class Pop2PianoProcessor(ProcessorMixin): + r""" + Constructs an Pop2Piano processor which wraps a Pop2Piano Feature Extractor and Pop2Piano Tokenizer into a single + processor. + + [`Pop2PianoProcessor`] offers all the functionalities of [`Pop2PianoFeatureExtractor`] and [`Pop2PianoTokenizer`]. + See the docstring of [`~Pop2PianoProcessor.__call__`] and [`~Pop2PianoProcessor.decode`] for more information. + + Args: + feature_extractor (`Pop2PianoFeatureExtractor`): + An instance of [`Pop2PianoFeatureExtractor`]. The feature extractor is a required input. + tokenizer (`Pop2PianoTokenizer`): + An instance of ['Pop2PianoTokenizer`]. The tokenizer is a required input. + """ + attributes = ["feature_extractor", "tokenizer"] + feature_extractor_class = "Pop2PianoFeatureExtractor" + tokenizer_class = "Pop2PianoTokenizer" + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + + def __call__( + self, + audio: Union[np.ndarray, List[float], List[np.ndarray]] = None, + sampling_rate: Union[int, List[int]] = None, + steps_per_beat: int = 2, + resample: Optional[bool] = True, + notes: Union[List, TensorType] = None, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + verbose: bool = True, + **kwargs, + ) -> Union[BatchFeature, BatchEncoding]: + """ + This method uses [`Pop2PianoFeatureExtractor.__call__`] method to prepare log-mel-spectrograms for the model, + and [`Pop2PianoTokenizer.__call__`] to prepare token_ids from notes. + + Please refer to the docstring of the above two methods for more information. + """ + + # Since Feature Extractor needs both audio and sampling_rate and tokenizer needs both token_ids and + # feature_extractor_output, we must check for both. + if (audio is None and sampling_rate is None) and (notes is None): + raise ValueError( + "You have to specify at least audios and sampling_rate in order to use feature extractor or " + "notes to use the tokenizer part." + ) + + if audio is not None and sampling_rate is not None: + inputs = self.feature_extractor( + audio=audio, + sampling_rate=sampling_rate, + steps_per_beat=steps_per_beat, + resample=resample, + **kwargs, + ) + if notes is not None: + encoded_token_ids = self.tokenizer( + notes=notes, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + if notes is None: + return inputs + + elif audio is None or sampling_rate is None: + return encoded_token_ids + + else: + inputs["token_ids"] = encoded_token_ids["token_ids"] + return inputs + + def batch_decode( + self, + token_ids, + feature_extractor_output: BatchFeature, + return_midi: bool = True, + ) -> BatchEncoding: + """ + This method uses [`Pop2PianoTokenizer.batch_decode`] method to convert model generated token_ids to midi_notes. + + Please refer to the docstring of the above two methods for more information. 
+ """ + + return self.tokenizer.batch_decode( + token_ids=token_ids, feature_extractor_output=feature_extractor_output, return_midi=return_midi + ) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + feature_extractor_input_names = self.feature_extractor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names)) + + def save_pretrained(self, save_directory, **kwargs): + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + os.makedirs(save_directory, exist_ok=True) + return super().save_pretrained(save_directory, **kwargs) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + return cls(*args) diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py new file mode 100644 index 00000000000000..abb6c3584f46ce --- /dev/null +++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py @@ -0,0 +1,714 @@ +# coding=utf-8 +# Copyright 2023 The Pop2Piano Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for Pop2Piano.""" + +import json +import os +from typing import List, Optional, Tuple, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...tokenization_utils import AddedToken, BatchEncoding, PaddingStrategy, PreTrainedTokenizer, TruncationStrategy +from ...utils import TensorType, is_pretty_midi_available, logging, requires_backends, to_numpy + + +if is_pretty_midi_available(): + import pretty_midi + +logger = logging.get_logger(__name__) + +## TODO : changing checkpoints from `susnato/pop2piano_dev` to `sweetcocoa/pop2piano` after the PR is approved + +VOCAB_FILES_NAMES = { + "vocab": "vocab.json", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab": { + "susnato/pop2piano_dev": "https://huggingface.co/susnato/pop2piano_dev/blob/main/vocab.json", + }, +} + + +def token_time_to_note(number, cutoff_time_idx, current_idx): + current_idx += number + if cutoff_time_idx is not None: + current_idx = min(current_idx, cutoff_time_idx) + + return current_idx + + +def token_note_to_note(number, current_velocity, default_velocity, note_onsets_ready, current_idx, notes): + if note_onsets_ready[number] is not None: + # offset with onset + onset_idx = note_onsets_ready[number] + if onset_idx < current_idx: + # Time shift after previous note_on + offset_idx = current_idx + notes.append([onset_idx, offset_idx, number, default_velocity]) + onsets_ready = None if current_velocity == 0 else current_idx + note_onsets_ready[number] = onsets_ready + else: + note_onsets_ready[number] = current_idx + return notes + + +class Pop2PianoTokenizer(PreTrainedTokenizer): + """ + Constructs a Pop2Piano tokenizer. This tokenizer does not require training. 
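+    It converts MIDI note events into sequences of "TOKEN_TIME", "TOKEN_VELOCITY", "TOKEN_NOTE" and "TOKEN_SPECIAL"
+    ids using a fixed vocabulary file, and converts such sequences back into `pretty_midi` notes.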
+ + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab (`str`): + Path to the vocab file which contains the vocabulary. + default_velocity (`int`, *optional*, defaults to 77): + Determines the default velocity to be used while creating midi Notes. + num_bars (`int`, *optional*, defaults to 2): + Determines cutoff_time_idx in for each token. + """ + + model_input_names = ["token_ids", "attention_mask"] + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + + def __init__( + self, + vocab, + default_velocity=77, + num_bars=2, + unk_token="-1", + eos_token="1", + pad_token="0", + bos_token="2", + **kwargs, + ): + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + + super().__init__( + unk_token=unk_token, + eos_token=eos_token, + pad_token=pad_token, + bos_token=bos_token, + **kwargs, + ) + + self.default_velocity = default_velocity + self.num_bars = num_bars + + # Load the vocab + with open(vocab, "rb") as file: + self.encoder = json.load(file) + + # create mappings for encoder + self.decoder = {v: k for k, v in self.encoder.items()} + + @property + def vocab_size(self): + """Returns the vocabulary size of the tokenizer.""" + return len(self.encoder) + + def get_vocab(self): + """Returns the vocabulary of the tokenizer.""" + return dict(self.encoder, **self.added_tokens_encoder) + + def _convert_id_to_token(self, token_id: int) -> list: + """ + Decodes the token ids generated by the transformer into notes. + + Args: + token_id (`int`): + This denotes the ids generated by the transformers to be converted to Midi tokens. + + Returns: + `List`: A list consists of token_type (`str`) and value (`int`). + """ + + token_type_value = self.decoder.get(token_id, f"{self.unk_token}_TOKEN_TIME") + token_type_value = token_type_value.split("_") + token_type, value = "_".join(token_type_value[1:]), int(token_type_value[0]) + + return [token_type, value] + + def _convert_token_to_id(self, token, token_type="TOKEN_TIME") -> int: + """ + Encodes the Midi tokens to transformer generated token ids. + + Args: + token (`int`): + This denotes the token value. + token_type (`str`): + This denotes the type of the token. There are four types of midi tokens such as "TOKEN_TIME", + "TOKEN_VELOCITY", "TOKEN_NOTE" and "TOKEN_SPECIAL". + + Returns: + `int`: returns the id of the token. + """ + return self.encoder.get(f"{token}_{token_type}", int(self.unk_token)) + + def relative_batch_tokens_ids_to_notes( + self, + tokens: np.ndarray, + beat_offset_idx: int, + bars_per_batch: int, + cutoff_time_idx: int, + ): + """ + Converts relative tokens to notes which are then used to generate pretty midi object. + + Args: + tokens (`numpy.ndarray`): + Tokens to be converted to notes. + beat_offset_idx (`int`): + Denotes beat offset index for each note in generated Midi. + bars_per_batch (`int`): + A parameter to control the Midi output generation. + cutoff_time_idx (`int`): + Denotes the cutoff time index for each note in generated Midi. 
+ """ + + notes = None + + for index in range(len(tokens)): + _tokens = tokens[index] + _start_idx = beat_offset_idx + index * bars_per_batch * 4 + _cutoff_time_idx = cutoff_time_idx + _start_idx + _notes = self.relative_tokens_ids_to_notes( + _tokens, + start_idx=_start_idx, + cutoff_time_idx=_cutoff_time_idx, + ) + + if len(_notes) == 0: + pass + elif notes is None: + notes = _notes + else: + notes = np.concatenate((notes, _notes), axis=0) + + if notes is None: + return [] + return notes + + def relative_batch_tokens_ids_to_midi( + self, + tokens: np.ndarray, + beatstep: np.ndarray, + beat_offset_idx: int = 0, + bars_per_batch: int = 2, + cutoff_time_idx: int = 12, + ): + """ + Converts tokens to Midi. This method calls `relative_batch_tokens_ids_to_notes` method to convert batch tokens + to notes then uses `notes_to_midi` method to convert them to Midi. + + Args: + tokens (`numpy.ndarray`): + Denotes tokens which alongside beatstep will be converted to Midi. + beatstep (`np.ndarray`): + We get beatstep from feature extractor which is also used to get Midi. + beat_offset_idx (`int`, *optional*, defaults to 0): + Denotes beat offset index for each note in generated Midi. + bars_per_batch (`int`, *optional*, defaults to 2): + A parameter to control the Midi output generation. + cutoff_time_idx (`int`, *optional*, defaults to 12): + Denotes the cutoff time index for each note in generated Midi. + """ + beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx + notes = self.relative_batch_tokens_ids_to_notes( + tokens=tokens, + beat_offset_idx=beat_offset_idx, + bars_per_batch=bars_per_batch, + cutoff_time_idx=cutoff_time_idx, + ) + midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx]) + return midi + + # Taken from the original code + # Please see https://github.com/sweetcocoa/pop2piano/blob/fac11e8dcfc73487513f4588e8d0c22a22f2fdc5/midi_tokenizer.py#L257 + def relative_tokens_ids_to_notes(self, tokens: np.ndarray, start_idx: float, cutoff_time_idx: float = None): + """ + Converts relative tokens to notes which will then be used to create Pretty Midi objects. + + Args: + tokens (`numpy.ndarray`): + Relative Tokens which will be converted to notes. + start_idx (`float`): + A parameter which denotes the starting index. + cutoff_time_idx (`float`, *optional*): + A parameter used while converting tokens to notes. 
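+
+        Returns:
+            `numpy.ndarray`: Decoded notes as rows of `[onset_idx, offset_idx, pitch, velocity]`, sorted by onset and
+            offset (an empty list is returned if no notes could be decoded).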
+ """ + words = [self._convert_id_to_token(token) for token in tokens] + + current_idx = start_idx + current_velocity = 0 + note_onsets_ready = [None for i in range(sum([k.endswith("NOTE") for k in self.encoder.keys()]) + 1)] + notes = [] + for token_type, number in words: + if token_type == "TOKEN_SPECIAL": + if number == 1: + break + elif token_type == "TOKEN_TIME": + current_idx = token_time_to_note( + number=number, cutoff_time_idx=cutoff_time_idx, current_idx=current_idx + ) + elif token_type == "TOKEN_VELOCITY": + current_velocity = number + + elif token_type == "TOKEN_NOTE": + notes = token_note_to_note( + number=number, + current_velocity=current_velocity, + default_velocity=self.default_velocity, + note_onsets_ready=note_onsets_ready, + current_idx=current_idx, + notes=notes, + ) + else: + raise ValueError("Token type not understood!") + + for pitch, note_onset in enumerate(note_onsets_ready): + # force offset if no offset for each pitch + if note_onset is not None: + if cutoff_time_idx is None: + cutoff = note_onset + 1 + else: + cutoff = max(cutoff_time_idx, note_onset + 1) + + offset_idx = max(current_idx, cutoff) + notes.append([note_onset, offset_idx, pitch, self.default_velocity]) + + if len(notes) == 0: + return [] + else: + notes = np.array(notes) + note_order = notes[:, 0] * 128 + notes[:, 1] + notes = notes[note_order.argsort()] + return notes + + def notes_to_midi(self, notes: np.ndarray, beatstep: np.ndarray, offset_sec: int = 0.0): + """ + Converts notes to Midi. + + Args: + notes (`numpy.ndarray`): + This is used to create Pretty Midi objects. + beatstep (`numpy.ndarray`): + This is the extrapolated beatstep that we get from feature extractor. + offset_sec (`int`, *optional*, defaults to 0.0): + This represents the offset seconds which is used while creating each Pretty Midi Note. + """ + + requires_backends(self, ["pretty_midi"]) + + new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0) + new_inst = pretty_midi.Instrument(program=0) + new_notes = [] + + for onset_idx, offset_idx, pitch, velocity in notes: + new_note = pretty_midi.Note( + velocity=velocity, + pitch=pitch, + start=beatstep[onset_idx] - offset_sec, + end=beatstep[offset_idx] - offset_sec, + ) + new_notes.append(new_note) + new_inst.notes = new_notes + new_pm.instruments.append(new_inst) + new_pm.remove_invalid_notes() + return new_pm + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Saves the tokenizer's vocabulary dictionary to the provided save_directory. + + Args: + save_directory (`str`): + A path to the directory where to saved. It will be created if it doesn't exist. + filename_prefix (`Optional[str]`, *optional*): + A prefix to add to the names of the files saved by the tokenizer. + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + + # Save the encoder. + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"] + ) + with open(out_vocab_file, "w") as file: + file.write(json.dumps(self.encoder)) + + return (out_vocab_file,) + + def encode_plus( + self, + notes: Union[np.ndarray, List[pretty_midi.Note]], + truncation_strategy: Optional[TruncationStrategy] = None, + max_length: Optional[int] = None, + **kwargs, + ) -> BatchEncoding: + r""" + This is the `encode_plus` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer + generated token ids. 
It only works on a single batch, to process multiple batches please use + `batch_encode_plus` or `__call__` method. + + Args: + notes (`numpy.ndarray` of shape `[sequence_length, 4]` or `list` of `pretty_midi.Note` objects): + This represents the midi notes. If `notes` is a `numpy.ndarray`: + - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`. + If `notes` is a `list` containing `pretty_midi.Note` objects: + - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`. + truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*): + Indicates the truncation strategy that is going to be used during truncation. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + + Returns: + `BatchEncoding` containing the tokens ids. + """ + + requires_backends(self, ["pretty_midi"]) + + # check if notes is a pretty_midi object or not, if yes then extract the attributes and put them into a numpy + # array. + if isinstance(notes[0], pretty_midi.Note): + notes = np.array( + [[each_note.start, each_note.end, each_note.pitch, each_note.velocity] for each_note in notes] + ).reshape(-1, 4) + + # to round up all the values to the closest int values. + notes = np.round(notes).astype(np.int32) + max_time_idx = notes[:, :2].max() + + times = [[] for i in range((max_time_idx + 1))] + for onset, offset, pitch, velocity in notes: + times[onset].append([pitch, velocity]) + times[offset].append([pitch, 0]) + + tokens = [] + current_velocity = 0 + for i, time in enumerate(times): + if len(time) == 0: + continue + tokens.append(self._convert_token_to_id(i, "TOKEN_TIME")) + for pitch, velocity in time: + velocity = int(velocity > 0) + if current_velocity != velocity: + current_velocity = velocity + tokens.append(self._convert_token_to_id(velocity, "TOKEN_VELOCITY")) + tokens.append(self._convert_token_to_id(pitch, "TOKEN_NOTE")) + + total_len = len(tokens) + + # truncation + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + tokens, _, _ = self.truncate_sequences( + ids=tokens, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + **kwargs, + ) + + return BatchEncoding({"token_ids": tokens}) + + def batch_encode_plus( + self, + notes: Union[np.ndarray, List[pretty_midi.Note]], + truncation_strategy: Optional[TruncationStrategy] = None, + max_length: Optional[int] = None, + **kwargs, + ) -> BatchEncoding: + r""" + This is the `batch_encode_plus` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer + generated token ids. It works on multiple batches by calling `encode_plus` multiple times in a loop. + + Args: + notes (`numpy.ndarray` of shape `[batch_size, sequence_length, 4]` or `list` of `pretty_midi.Note` objects): + This represents the midi notes. If `notes` is a `numpy.ndarray`: + - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`. + If `notes` is a `list` containing `pretty_midi.Note` objects: + - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`. + truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`], *optional*): + Indicates the truncation strategy that is going to be used during truncation. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). 
+ + Returns: + `BatchEncoding` containing the tokens ids. + """ + + encoded_batch_token_ids = [] + for i in range(len(notes)): + encoded_batch_token_ids.append( + self.encode_plus( + notes[i], + truncation_strategy=truncation_strategy, + max_length=max_length, + **kwargs, + )["token_ids"] + ) + + return BatchEncoding({"token_ids": encoded_batch_token_ids}) + + def __call__( + self, + notes: Union[ + np.ndarray, + List[pretty_midi.Note], + List[List[pretty_midi.Note]], + ], + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + r""" + This is the `__call__` method for `Pop2PianoTokenizer`. It converts the midi notes to the transformer generated + token ids. + + Args: + notes (`numpy.ndarray` of shape `[batch_size, max_sequence_length, 4]` or `list` of `pretty_midi.Note` objects): + This represents the midi notes. + + If `notes` is a `numpy.ndarray`: + - Each sequence must have 4 values, they are `onset idx`, `offset idx`, `pitch` and `velocity`. + If `notes` is a `list` containing `pretty_midi.Note` objects: + - Each sequence must have 4 attributes, they are `start`, `end`, `pitch` and `velocity`. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. 
If left unset or set to
+                `None`, this will use the predefined model maximum length if a maximum length is required by one of the
+                truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
+                truncation/padding to a maximum length will be deactivated.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+
+        Returns:
+            `BatchEncoding` containing the token_ids.
+        """
+
+        # check if it is batched or not
+        # it is batched if it is a list containing lists of `pretty_midi.Note` objects, where the outer list contains
+        # all the batches and the inner list contains all Notes for a single batch. Otherwise, if a np.ndarray is
+        # passed it will be considered batched if it has a shape of `[batch_size, sequence_length, 4]`, i.e. ndim=3.
+        is_batched = notes.ndim == 3 if isinstance(notes, np.ndarray) else isinstance(notes[0], list)
+
+        # get the truncation and padding strategy
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        if is_batched:
+            # If the user has not explicitly mentioned `return_attention_mask` as False, we change it to True
+            return_attention_mask = True if return_attention_mask is None else return_attention_mask
+            token_ids = self.batch_encode_plus(
+                notes=notes,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                **kwargs,
+            )
+        else:
+            token_ids = self.encode_plus(
+                notes=notes,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                **kwargs,
+            )
+
+        # since we already have truncated sequences we are just left to do padding
+        token_ids = self.pad(
+            token_ids,
+            padding=padding_strategy,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return token_ids
+
+    def batch_decode(
+        self,
+        token_ids,
+        feature_extractor_output: BatchFeature,
+        return_midi: bool = True,
+    ):
+        r"""
+        This is the `batch_decode` method for `Pop2PianoTokenizer`. It converts the token_ids generated by the
+        transformer to midi_notes and returns them.
+
+        Args:
+            token_ids (`Union[np.ndarray, torch.Tensor, tf.Tensor]`):
+                Output token_ids of the `Pop2PianoForConditionalGeneration` model.
+            feature_extractor_output (`BatchFeature`):
+                Denotes the output of `Pop2PianoFeatureExtractor.__call__`. It must contain `"beatsteps"` and
+                `"extrapolated_beatstep"`.
Also `"attention_mask_beatsteps"` and + `"attention_mask_extrapolated_beatstep"` + should be present if they were returned by the feature extractor. + return_midi (`bool`, *optional*, defaults to `True`): + Whether to return midi object or not. + Returns: + If `return_midi` is True: + - `BatchEncoding` containing both `notes` and `pretty_midi.pretty_midi.PrettyMIDI` objects. + If `return_midi` is False: + - `BatchEncoding` containing `notes`. + """ + + # check if they have attention_masks(attention_mask, attention_mask_beatsteps, attention_mask_extrapolated_beatstep) or not + attention_masks_present = bool( + hasattr(feature_extractor_output, "attention_mask") + and hasattr(feature_extractor_output, "attention_mask_beatsteps") + and hasattr(feature_extractor_output, "attention_mask_extrapolated_beatstep") + ) + + # if we are processing batched inputs then we must need attention_masks + if not attention_masks_present and feature_extractor_output["beatsteps"].shape[0] > 1: + raise ValueError( + "attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep must be present " + "for batched inputs! But one of them were not present." + ) + + # check for length mismatch between inputs_embeds, beatsteps and extrapolated_beatstep + if attention_masks_present: + # since we know about the number of examples in token_ids from attention_mask + if ( + sum(feature_extractor_output["attention_mask"][:, 0] == 0) + != feature_extractor_output["beatsteps"].shape[0] + or feature_extractor_output["beatsteps"].shape[0] + != feature_extractor_output["extrapolated_beatstep"].shape[0] + ): + raise ValueError( + "Length mistamtch between token_ids, beatsteps and extrapolated_beatstep! Found " + f"token_ids length - {token_ids.shape[0]}, beatsteps shape - {feature_extractor_output['beatsteps'].shape[0]} " + f"and extrapolated_beatsteps shape - {feature_extractor_output['extrapolated_beatstep'].shape[0]}" + ) + if feature_extractor_output["attention_mask"].shape[0] != token_ids.shape[0]: + raise ValueError( + f"Found attention_mask of length - {feature_extractor_output['attention_mask'].shape[0]} but token_ids of length - {token_ids.shape[0]}" + ) + else: + # if there is no attention mask present then it's surely a single example + if ( + feature_extractor_output["beatsteps"].shape[0] != 1 + or feature_extractor_output["extrapolated_beatstep"].shape[0] != 1 + ): + raise ValueError( + "Length mistamtch of beatsteps and extrapolated_beatstep! Since attention_mask is not present the number of examples must be 1, " + f"But found beatsteps length - {feature_extractor_output['beatsteps'].shape[0]}, extrapolated_beatsteps length - {feature_extractor_output['extrapolated_beatstep'].shape[0]}." 
+ ) + + if attention_masks_present: + # check for zeros(since token_ids are seperated by zero arrays) + batch_idx = np.where(feature_extractor_output["attention_mask"][:, 0] == 0)[0] + else: + batch_idx = [token_ids.shape[0]] + + notes_list = [] + pretty_midi_objects_list = [] + start_idx = 0 + for index, end_idx in enumerate(batch_idx): + each_tokens_ids = token_ids[start_idx:end_idx] + # check where the whole example ended by searching for eos_token_id and getting the upper bound + each_tokens_ids = each_tokens_ids[:, : np.max(np.where(each_tokens_ids == int(self.eos_token))[1]) + 1] + beatsteps = feature_extractor_output["beatsteps"][index] + extrapolated_beatstep = feature_extractor_output["extrapolated_beatstep"][index] + + # if attention mask is present then mask out real array/tensor + if attention_masks_present: + attention_mask_beatsteps = feature_extractor_output["attention_mask_beatsteps"][index] + attention_mask_extrapolated_beatstep = feature_extractor_output[ + "attention_mask_extrapolated_beatstep" + ][index] + beatsteps = beatsteps[: np.max(np.where(attention_mask_beatsteps == 1)[0]) + 1] + extrapolated_beatstep = extrapolated_beatstep[ + : np.max(np.where(attention_mask_extrapolated_beatstep == 1)[0]) + 1 + ] + + each_tokens_ids = to_numpy(each_tokens_ids) + beatsteps = to_numpy(beatsteps) + extrapolated_beatstep = to_numpy(extrapolated_beatstep) + + pretty_midi_object = self.relative_batch_tokens_ids_to_midi( + tokens=each_tokens_ids, + beatstep=extrapolated_beatstep, + bars_per_batch=self.num_bars, + cutoff_time_idx=(self.num_bars + 1) * 4, + ) + + for note in pretty_midi_object.instruments[0].notes: + note.start += beatsteps[0] + note.end += beatsteps[0] + notes_list.append(note) + + pretty_midi_objects_list.append(pretty_midi_object) + start_idx += end_idx + 1 # 1 represents the zero array + + if return_midi: + return BatchEncoding({"notes": notes_list, "pretty_midi_objects": pretty_midi_objects_list}) + + return BatchEncoding({"notes": notes_list}) diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index 4723c43035e67c..fb509fc3eab092 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -32,6 +32,7 @@ is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12") is_torch_greater_or_equal_than_1_11 = parsed_torch_version_base >= version.parse("1.11") is_torch_less_than_1_11 = parsed_torch_version_base < version.parse("1.11") +is_torch_1_8_0 = parsed_torch_version_base == version.parse("1.8.0") def softmax_backward_data(parent, grad_output, output, dim, self): diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 9a59327af61f57..0f70ac97775632 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -57,6 +57,7 @@ is_cython_available, is_decord_available, is_detectron2_available, + is_essentia_available, is_faiss_available, is_flax_available, is_ftfy_available, @@ -71,6 +72,7 @@ is_pandas_available, is_peft_available, is_phonemizer_available, + is_pretty_midi_available, is_pyctcdecode_available, is_pytesseract_available, is_pytest_available, @@ -825,6 +827,20 @@ def require_librosa(test_case): return unittest.skipUnless(is_librosa_available(), "test requires librosa")(test_case) +def require_essentia(test_case): + """ + Decorator marking a test that requires essentia + """ + return unittest.skipUnless(is_essentia_available(), "test requires essentia")(test_case) + + +def require_pretty_midi(test_case): + """ 
+ Decorator marking a test that requires pretty_midi + """ + return unittest.skipUnless(is_pretty_midi_available(), "test requires pretty_midi")(test_case) + + def cmd_exists(cmd): return shutil.which(cmd) is not None diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 422464ce5bd026..050ccae9c03d5f 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -112,6 +112,7 @@ is_datasets_available, is_decord_available, is_detectron2_available, + is_essentia_available, is_faiss_available, is_flax_available, is_ftfy_available, @@ -130,6 +131,7 @@ is_pandas_available, is_peft_available, is_phonemizer_available, + is_pretty_midi_available, is_protobuf_available, is_psutil_available, is_py3nvml_available, diff --git a/src/transformers/utils/dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects.py b/src/transformers/utils/dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects.py new file mode 100644 index 00000000000000..e6d75a6ec22e90 --- /dev/null +++ b/src/transformers/utils/dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects.py @@ -0,0 +1,23 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class Pop2PianoFeatureExtractor(metaclass=DummyObject): + _backends = ["essentia", "librosa", "pretty_midi", "scipy", "torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["essentia", "librosa", "pretty_midi", "scipy", "torch"]) + + +class Pop2PianoTokenizer(metaclass=DummyObject): + _backends = ["essentia", "librosa", "pretty_midi", "scipy", "torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["essentia", "librosa", "pretty_midi", "scipy", "torch"]) + + +class Pop2PianoProcessor(metaclass=DummyObject): + _backends = ["essentia", "librosa", "pretty_midi", "scipy", "torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["essentia", "librosa", "pretty_midi", "scipy", "torch"]) diff --git a/src/transformers/utils/dummy_music_objects.py b/src/transformers/utils/dummy_music_objects.py new file mode 100644 index 00000000000000..89052be47c1d32 --- /dev/null +++ b/src/transformers/utils/dummy_music_objects.py @@ -0,0 +1,16 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..utils import DummyObject, requires_backends
+
+
+class Pop2PianoFeatureExtractor(metaclass=DummyObject):
+    _backends = ["music"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["music"])
+
+
+class Pop2PianoTokenizer(metaclass=DummyObject):
+    _backends = ["music"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["music"])
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index af4ea05837a55b..5724e689f2fce2 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -5935,6 +5935,23 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class Pop2PianoForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Pop2PianoPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 8172ba23401fcb..0045d3345b21be 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -185,6 +185,22 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
         logger.info("Disabling Tensorflow because USE_TORCH is set")
 
 
+_essentia_available = importlib.util.find_spec("essentia") is not None
+try:
+    _essentia_version = importlib.metadata.version("essentia")
+    logger.debug(f"Successfully imported essentia version {_essentia_version}")
+except importlib.metadata.PackageNotFoundError:
+    _essentia_version = False
+
+
+_pretty_midi_available = importlib.util.find_spec("pretty_midi") is not None
+try:
+    _pretty_midi_version = importlib.metadata.version("pretty_midi")
+    logger.debug(f"Successfully imported pretty_midi version {_pretty_midi_version}")
+except importlib.metadata.PackageNotFoundError:
+    _pretty_midi_available = False
+
+
 ccl_version = "N/A"
 _is_ccl_available = (
     importlib.util.find_spec("torch_ccl") is not None
@@ -242,6 +258,14 @@ def is_librosa_available():
     return _librosa_available
 
 
+def is_essentia_available():
+    return _essentia_available
+
+
+def is_pretty_midi_available():
+    return _pretty_midi_available
+
+
 def is_torch_cuda_available():
     if is_torch_available():
         import torch
@@ -986,6 +1010,27 @@ def is_jieba_available():
 Please note that you may need to restart your runtime after installation.
 """
 
+# docstyle-ignore
+ESSENTIA_IMPORT_ERROR = """
+{0} requires the essentia library. But that was not found in your environment. You can install it with pip:
+`pip install essentia==2.1b6.dev1034`
+Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+LIBROSA_IMPORT_ERROR = """
+{0} requires the librosa library. But that was not found in your environment. You can install it with pip:
+`pip install librosa`
+Please note that you may need to restart your runtime after installation.
+"""
+
+# docstyle-ignore
+PRETTY_MIDI_IMPORT_ERROR = """
+{0} requires the pretty_midi library. But that was not found in your environment. You can install it with pip:
+`pip install pretty_midi`
+Please note that you may need to restart your runtime after installation.
+""" + DECORD_IMPORT_ERROR = """ {0} requires the decord library but it was not found in your environment. You can install it with pip: `pip install decord`. Please note that you may need to restart your runtime after installation. @@ -1011,11 +1056,14 @@ def is_jieba_available(): ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)), + ("essentia", (is_essentia_available, ESSENTIA_IMPORT_ERROR)), ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)), ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)), + ("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)), + ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)), ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)), ("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)), diff --git a/tests/models/pop2piano/__init__.py b/tests/models/pop2piano/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/pop2piano/test_feature_extraction_pop2piano.py b/tests/models/pop2piano/test_feature_extraction_pop2piano.py new file mode 100644 index 00000000000000..fcc424fbf6e0dc --- /dev/null +++ b/tests/models/pop2piano/test_feature_extraction_pop2piano.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import numpy as np +from datasets import load_dataset + +from transformers.testing_utils import ( + check_json_file_has_correct_format, + require_essentia, + require_librosa, + require_scipy, + require_tf, + require_torch, +) +from transformers.utils.import_utils import ( + is_essentia_available, + is_librosa_available, + is_scipy_available, + is_torch_available, +) + +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +requirements_available = ( + is_torch_available() and is_essentia_available() and is_scipy_available() and is_librosa_available() +) + +if requirements_available: + import torch + + from transformers import Pop2PianoFeatureExtractor + + +class Pop2PianoFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + n_bars=2, + sample_rate=22050, + use_mel=True, + padding_value=0, + vocab_size_special=4, + vocab_size_note=128, + vocab_size_velocity=2, + vocab_size_time=100, + ): + self.parent = parent + self.n_bars = n_bars + self.sample_rate = sample_rate + self.use_mel = use_mel + self.padding_value = padding_value + self.vocab_size_special = vocab_size_special + self.vocab_size_note = vocab_size_note + self.vocab_size_velocity = vocab_size_velocity + self.vocab_size_time = vocab_size_time + + def prepare_feat_extract_dict(self): + return { + "n_bars": self.n_bars, + "sample_rate": self.sample_rate, + "use_mel": self.use_mel, + "padding_value": self.padding_value, + "vocab_size_special": self.vocab_size_special, + "vocab_size_note": self.vocab_size_note, + "vocab_size_velocity": self.vocab_size_velocity, + "vocab_size_time": self.vocab_size_time, + } + + +@require_torch +@require_essentia +@require_librosa +@require_scipy +class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = Pop2PianoFeatureExtractor if requirements_available else None + + def setUp(self): + self.feat_extract_tester = Pop2PianoFeatureExtractionTester(self) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + mel_1 = feat_extract_first.use_mel + mel_2 = feat_extract_second.use_mel + self.assertTrue(np.allclose(mel_1, mel_2)) + self.assertEqual(dict_first, dict_second) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + mel_1 = feat_extract_first.use_mel + mel_2 = feat_extract_second.use_mel + self.assertTrue(np.allclose(mel_1, mel_2)) + self.assertEqual(dict_first, dict_second) + + def test_call(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_input = np.zeros([1000000], dtype=np.float32) + + input_features = 
feature_extractor(speech_input, sampling_rate=16_000, return_tensors="np")
+        self.assertTrue(input_features.input_features.ndim == 3)
+        self.assertEqual(input_features.input_features.shape[-1], 512)
+
+        self.assertTrue(input_features.beatsteps.ndim == 2)
+        self.assertTrue(input_features.extrapolated_beatstep.ndim == 2)
+
+    def test_integration(self):
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        speech_samples = ds.sort("id").select([0])["audio"]
+        input_speech = [x["array"] for x in speech_samples][0]
+        sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
+        feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
+        input_features = feature_extractor(
+            input_speech, sampling_rate=sampling_rate, return_tensors="pt"
+        ).input_features
+
+        EXPECTED_INPUT_FEATURES = torch.tensor(
+            [[-7.1493, -6.8701, -4.3214], [-5.9473, -5.7548, -3.8438], [-6.1324, -5.9018, -4.3778]]
+        )
+        self.assertTrue(torch.allclose(input_features[0, :3, :3], EXPECTED_INPUT_FEATURES, atol=1e-4))
+
+    def test_attention_mask(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
+        speech_input2 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
+        input_features = feature_extractor(
+            [speech_input1, speech_input2],
+            sampling_rate=[44_100, 16_000],
+            return_tensors="np",
+            return_attention_mask=True,
+        )
+
+        self.assertTrue(hasattr(input_features, "attention_mask"))
+
+        # check shapes
+        self.assertTrue(input_features["attention_mask"].ndim == 2)
+        self.assertEqual(input_features["attention_mask_beatsteps"].shape[0], 2)
+        self.assertEqual(input_features["attention_mask_extrapolated_beatstep"].shape[0], 2)
+
+        # check that there are no values other than 0 and 1
+        self.assertTrue(np.max(input_features["attention_mask"]) == 1)
+        self.assertTrue(np.max(input_features["attention_mask_beatsteps"]) == 1)
+        self.assertTrue(np.max(input_features["attention_mask_extrapolated_beatstep"]) == 1)
+
+        self.assertTrue(np.min(input_features["attention_mask"]) == 0)
+        self.assertTrue(np.min(input_features["attention_mask_beatsteps"]) == 0)
+        self.assertTrue(np.min(input_features["attention_mask_extrapolated_beatstep"]) == 0)
+
+    def test_batch_feature(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
+        speech_input2 = np.ones([2_000_000], dtype=np.float32)
+        speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
+
+        input_features = feature_extractor(
+            [speech_input1, speech_input2, speech_input3],
+            sampling_rate=[44_100, 16_000, 48_000],
+            return_attention_mask=True,
+        )
+
+        self.assertEqual(len(input_features["input_features"].shape), 3)
+        # check shape
+        self.assertEqual(input_features["beatsteps"].shape[0], 3)
+        self.assertEqual(input_features["extrapolated_beatstep"].shape[0], 3)
+
+    def test_batch_feature_np(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
+        speech_input2 = np.ones([2_000_000], dtype=np.float32)
+        speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
+
+        input_features = feature_extractor(
+            [speech_input1, speech_input2, speech_input3],
+            sampling_rate=[44_100, 16_000, 48_000],
+            return_tensors="np",
+            
return_attention_mask=True,
+        )
+
+        # check np array or not
+        self.assertEqual(type(input_features["input_features"]), np.ndarray)
+
+        # check shape
+        self.assertEqual(len(input_features["input_features"].shape), 3)
+
+    def test_batch_feature_pt(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
+        speech_input2 = np.ones([2_000_000], dtype=np.float32)
+        speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
+
+        input_features = feature_extractor(
+            [speech_input1, speech_input2, speech_input3],
+            sampling_rate=[44_100, 16_000, 48_000],
+            return_tensors="pt",
+            return_attention_mask=True,
+        )
+
+        # check pt tensor or not
+        self.assertEqual(type(input_features["input_features"]), torch.Tensor)
+
+        # check shape
+        self.assertEqual(len(input_features["input_features"].shape), 3)
+
+    @require_tf
+    def test_batch_feature_tf(self):
+        import tensorflow as tf
+
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
+        speech_input2 = np.ones([2_000_000], dtype=np.float32)
+        speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
+
+        input_features = feature_extractor(
+            [speech_input1, speech_input2, speech_input3],
+            sampling_rate=[44_100, 16_000, 48_000],
+            return_tensors="tf",
+            return_attention_mask=True,
+        )
+
+        # check tf tensor or not
+        self.assertTrue(tf.is_tensor(input_features["input_features"]))
+
+        # check shape
+        self.assertEqual(len(input_features["input_features"].shape), 3)
+
+    @unittest.skip(
+        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
+    )
+    def test_padding_accepts_tensors_pt(self):
+        pass
+
+    @unittest.skip(
+        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
+    )
+    def test_padding_accepts_tensors_tf(self):
+        pass
+
+    @unittest.skip(
+        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
+    )
+    def test_padding_from_list(self):
+        pass
+
+    @unittest.skip(
+        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
+    )
+    def test_padding_from_array(self):
+        pass
+
+    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
+    def test_attention_mask_with_truncation(self):
+        pass
+
+    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
+    def test_truncation_from_array(self):
+        pass
+
+    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
+    def test_truncation_from_list(self):
+        pass
diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py
new file mode 100644
index 00000000000000..837b1b4f1db7a5
--- /dev/null
+++ b/tests/models/pop2piano/test_modeling_pop2piano.py
@@ -0,0 +1,778 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Pop2Piano model. """ + +import copy +import tempfile +import unittest + +import numpy as np +from datasets import load_dataset + +from transformers import Pop2PianoConfig +from transformers.feature_extraction_utils import BatchFeature +from transformers.testing_utils import ( + require_essentia, + require_librosa, + require_onnx, + require_scipy, + require_torch, + slow, + torch_device, +) +from transformers.utils import is_essentia_available, is_librosa_available, is_scipy_available, is_torch_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import Pop2PianoForConditionalGeneration + from transformers.models.pop2piano.modeling_pop2piano import POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.pytorch_utils import is_torch_1_8_0 + +else: + is_torch_1_8_0 = False + + +@require_torch +class Pop2PianoModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + encoder_seq_length=7, + decoder_seq_length=9, + # For common tests + is_training=False, + use_attention_mask=True, + use_labels=True, + hidden_size=64, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, + eos_token_id=1, + pad_token_id=0, + decoder_start_token_id=0, + scope=None, + decoder_layers=None, + ): + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.d_ff = d_ff + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate + self.initializer_factor = initializer_factor + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.scope = None + self.decoder_layers = decoder_layers + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + decoder_attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = ( + ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) if self.use_labels else None + ) + + return self.get_config(), input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels + + def 
get_pipeline_config(self): + return Pop2PianoConfig( + vocab_size=166, # Pop2Piano forces 100 extra tokens + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_decoder_layers=self.decoder_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor, + eos_token_id=self.eos_token_id, + bos_token_id=self.pad_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + ) + + def get_config(self): + return Pop2PianoConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_decoder_layers=self.decoder_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor, + eos_token_id=self.eos_token_id, + bos_token_id=self.pad_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + ) + + def check_prepare_lm_labels_via_shift_left( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + + # make sure that lm_labels are correctly padded from the right + lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id) + + # add causal pad token mask + triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() + lm_labels.masked_fill_(triangular_mask, self.pad_token_id) + decoder_input_ids = model._shift_right(lm_labels) + + for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): + # first item + self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) + if i < decoder_input_ids_slice.shape[-1]: + if i < decoder_input_ids.shape[-1] - 1: + # items before diagonal + self.parent.assertListEqual( + decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() + ) + # pad items after diagonal + if i < decoder_input_ids.shape[-1] - 2: + self.parent.assertListEqual( + decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() + ) + else: + # all items after square + self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) + + def create_and_check_model( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + decoder_past = result.past_key_values + encoder_output = result.encoder_last_hidden_state + + self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size)) + # There should be `num_layers` key value embeddings stored in decoder_past + self.parent.assertEqual(len(decoder_past), config.num_layers) + # There should be a self attn key, a self attn value, a cross attn key and a cross 
attn value stored in each decoder_past tuple + self.parent.assertEqual(len(decoder_past[0]), 4) + + def create_and_check_with_lm_head( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config).to(torch_device).eval() + outputs = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + ) + self.parent.assertEqual(len(outputs), 4) + self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size)) + self.parent.assertEqual(outputs["loss"].size(), ()) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config).get_decoder().to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config).get_decoder() + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, 
past_key_values=past_key_values, attention_mask=attn_mask)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config).get_decoder().to(torch_device).eval() + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_generate_with_past_key_values( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config).to(torch_device).eval() + torch.manual_seed(0) + output_without_past_cache = model.generate( + input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False + ) + torch.manual_seed(0) + output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) + self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache)) + + def create_and_check_model_fp16_forward( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = Pop2PianoForConditionalGeneration(config=config).to(torch_device).half().eval() + output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[ + "encoder_last_hidden_state" + ] + self.parent.assertFalse(torch.isnan(output).any().item()) + + def create_and_check_encoder_decoder_shared_weights( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + for model_class in [Pop2PianoForConditionalGeneration]: + torch.manual_seed(0) + model = model_class(config=config).to(torch_device).eval() + # load state dict copies weights but does not tie them + model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) + + torch.manual_seed(0) + tied_config = 
copy.deepcopy(config) + tied_config.tie_encoder_decoder = True + tied_model = model_class(config=tied_config).to(torch_device).eval() + + model_result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that models has less parameters + self.parent.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + # check that outputs are equal + self.parent.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + tied_model.save_pretrained(tmpdirname) + tied_model = model_class.from_pretrained(tmpdirname) + tied_model.to(torch_device) + tied_model.eval() + + # check that models has less parameters + self.parent.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that outputs are equal + self.parent.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], + tied_model_result[0][0, :, random_slice_idx], + atol=1e-4, + ) + ) + + def check_resize_embeddings_pop2piano_v1_1( + self, + config, + ): + prev_vocab_size = config.vocab_size + + config.tie_word_embeddings = False + model = Pop2PianoForConditionalGeneration(config=config).to(torch_device).eval() + model.resize_token_embeddings(prev_vocab_size - 10) + + self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10) + self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10) + self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "use_cache": False, + } + return config, inputs_dict + + +@require_torch +class Pop2PianoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (Pop2PianoForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = () + all_parallelizable_model_classes = () + fx_compatible = False + test_pruning = False + test_resize_embeddings = True + test_model_parallel = False + is_encoder_decoder = True + + def setUp(self): + self.model_tester = Pop2PianoModelTester(self) + self.config_tester = ConfigTester(self, config_class=Pop2PianoConfig, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_shift_right(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_v1_1(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + # check that gated gelu feed forward and different word embeddings work + config = config_and_inputs[0] + config.tie_word_embeddings = False + config.feed_forward_proj = "gated-gelu" + self.model_tester.create_and_check_model(config, *config_and_inputs[1:]) + + def test_config_and_model_silu_gated(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + config = config_and_inputs[0] + config.feed_forward_proj = "gated-silu" + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_with_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_with_lm_head(*config_and_inputs) + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_past_with_attn_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_decoder_model_past_with_3d_attn_mask(self): + ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = self.model_tester.prepare_config_and_inputs() + + attention_mask = ids_tensor( + [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length], + vocab_size=2, + ) + decoder_attention_mask = ids_tensor( + [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length], + vocab_size=2, + ) + + self.model_tester.create_and_check_decoder_model_attention_mask_past( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_shared_weights(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) + + @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + def test_model_fp16_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) + + def test_v1_1_resize_embeddings(self): + config = self.model_tester.prepare_config_and_inputs()[0] + self.model_tester.check_resize_embeddings_pop2piano_v1_1(config) + + @slow + def test_model_from_pretrained(self): + for model_name in POP2PIANO_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = Pop2PianoForConditionalGeneration.from_pretrained(model_name) + self.assertIsNotNone(model) + + @require_onnx + @unittest.skipIf( + is_torch_1_8_0, + reason="Test has a segmentation fault on torch 1.8.0", + ) + def test_export_to_onnx(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + model = Pop2PianoForConditionalGeneration(config_and_inputs[0]).to(torch_device) + with tempfile.TemporaryDirectory() as tmpdirname: + 
torch.onnx.export( + model, + (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), + f"{tmpdirname}/Pop2Piano_test.onnx", + export_params=True, + opset_version=9, + input_names=["input_ids", "decoder_input_ids"], + ) + + def test_pass_with_input_features(self): + input_features = BatchFeature( + { + "input_features": torch.rand((75, 100, 512)).type(torch.float32), + "beatsteps": torch.randint(size=(1, 955), low=0, high=100).type(torch.float32), + "extrapolated_beatstep": torch.randint(size=(1, 900), low=0, high=100).type(torch.float32), + } + ) + model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") + model_opts = model.generate(input_features=input_features["input_features"], return_dict_in_generate=True) + + self.assertEqual(model_opts.sequences.ndim, 2) + + def test_pass_with_batched_input_features(self): + input_features = BatchFeature( + { + "input_features": torch.rand((220, 70, 512)).type(torch.float32), + "beatsteps": torch.randint(size=(5, 955), low=0, high=100).type(torch.float32), + "extrapolated_beatstep": torch.randint(size=(5, 900), low=0, high=100).type(torch.float32), + "attention_mask": torch.concatenate( + [ + torch.ones([120, 70], dtype=torch.int32), + torch.zeros([1, 70], dtype=torch.int32), + torch.ones([50, 70], dtype=torch.int32), + torch.zeros([1, 70], dtype=torch.int32), + torch.ones([47, 70], dtype=torch.int32), + torch.zeros([1, 70], dtype=torch.int32), + ], + axis=0, + ), + "attention_mask_beatsteps": torch.ones((5, 955)).type(torch.int32), + "attention_mask_extrapolated_beatstep": torch.ones((5, 900)).type(torch.int32), + } + ) + model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") + model_opts = model.generate( + input_features=input_features["input_features"], + attention_mask=input_features["attention_mask"], + return_dict_in_generate=True, + ) + + self.assertEqual(model_opts.sequences.ndim, 2) + + +@require_torch +class Pop2PianoModelIntegrationTests(unittest.TestCase): + @slow + def test_mel_conditioner_integration(self): + composer = "composer1" + model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") + input_embeds = torch.ones([10, 100, 512]) + + composer_value = model.generation_config.composer_to_feature_token[composer] + composer_value = torch.tensor(composer_value) + composer_value = composer_value.repeat(input_embeds.size(0)) + outputs = model.mel_conditioner( + input_embeds, composer_value, min(model.generation_config.composer_to_feature_token.values()) + ) + + # check shape + self.assertEqual(outputs.size(), torch.Size([10, 101, 512])) + + # check values + EXPECTED_OUTPUTS = torch.tensor( + [[1.0475305318832397, 0.29052114486694336, -0.47778210043907166], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] + ) + + self.assertTrue(torch.allclose(outputs[0, :3, :3], EXPECTED_OUTPUTS, atol=1e-4)) + + @slow + @require_essentia + @require_librosa + @require_scipy + def test_full_model_integration(self): + if is_librosa_available() and is_scipy_available() and is_essentia_available() and is_torch_available(): + from transformers import Pop2PianoProcessor + + speech_input1 = np.zeros([1_000_000], dtype=np.float32) + sampling_rate = 44_100 + + processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") + input_features = processor.feature_extractor( + speech_input1, sampling_rate=sampling_rate, return_tensors="pt" + ) + + model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") + outputs = model.generate( + 
input_features=input_features["input_features"], return_dict_in_generate=True
+            ).sequences
+
+            # check for shapes
+            self.assertEqual(outputs.size(0), 70)
+
+            # check for values
+            self.assertEqual(outputs[0, :2].detach().cpu().numpy().tolist(), [0, 1])
+
+    # This test uses a real music sample from the K-Pop genre.
+    @slow
+    @require_essentia
+    @require_librosa
+    @require_scipy
+    def test_real_music(self):
+        if is_librosa_available() and is_scipy_available() and is_essentia_available() and is_torch_available():
+            from transformers import Pop2PianoFeatureExtractor, Pop2PianoTokenizer
+
+            model = Pop2PianoForConditionalGeneration.from_pretrained("susnato/pop2piano_dev")
+            model.eval()
+            feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("susnato/pop2piano_dev")
+            tokenizer = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev")
+            ds = load_dataset("sweetcocoa/pop2piano_ci", split="test")
+
+            output_fe = feature_extractor(
+                ds["audio"][0]["array"], sampling_rate=ds["audio"][0]["sampling_rate"], return_tensors="pt"
+            )
+            output_model = model.generate(input_features=output_fe["input_features"], composer="composer1")
+            output_tokenizer = tokenizer.batch_decode(token_ids=output_model, feature_extractor_output=output_fe)
+            pretty_midi_object = output_tokenizer["pretty_midi_objects"][0]
+
+            # Check that the number of notes is the same
+            self.assertEqual(len(pretty_midi_object.instruments[0].notes), 59)
+            predicted_timings = []
+            for i in pretty_midi_object.instruments[0].notes:
+                predicted_timings.append(i.start)
+
+            # Check note start timings (first 6)
+            EXPECTED_START_TIMINGS = [
+                0.4876190423965454,
+                0.7314285635948181,
+                0.9752380847930908,
+                1.4396371841430664,
+                1.6718367338180542,
+                1.904036283493042,
+            ]
+
+            self.assertTrue(np.allclose(EXPECTED_START_TIMINGS, predicted_timings[:6]))
+
+            # Check note end timings (last 6)
+            EXPECTED_END_TIMINGS = [
+                12.341403007507324,
+                12.567797183990479,
+                12.567797183990479,
+                12.567797183990479,
+                12.794191360473633,
+                12.794191360473633,
+            ]
+
+            self.assertTrue(np.allclose(EXPECTED_END_TIMINGS, predicted_timings[-6:]))
diff --git a/tests/models/pop2piano/test_processor_pop2piano.py b/tests/models/pop2piano/test_processor_pop2piano.py
new file mode 100644
index 00000000000000..28af7fbb55b1b3
--- /dev/null
+++ b/tests/models/pop2piano/test_processor_pop2piano.py
@@ -0,0 +1,266 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import shutil +import tempfile +import unittest + +import numpy as np +import pytest +from datasets import load_dataset + +from transformers.testing_utils import ( + require_essentia, + require_librosa, + require_pretty_midi, + require_scipy, + require_torch, +) +from transformers.tokenization_utils import BatchEncoding +from transformers.utils.import_utils import ( + is_essentia_available, + is_librosa_available, + is_pretty_midi_available, + is_scipy_available, + is_torch_available, +) + + +requirements_available = ( + is_torch_available() + and is_essentia_available() + and is_scipy_available() + and is_librosa_available() + and is_pretty_midi_available() +) + +if requirements_available: + import pretty_midi + + from transformers import ( + Pop2PianoFeatureExtractor, + Pop2PianoForConditionalGeneration, + Pop2PianoProcessor, + Pop2PianoTokenizer, + ) + +## TODO : changing checkpoints from `susnato/pop2piano_dev` to `sweetcocoa/pop2piano` after the PR is approved + + +@require_scipy +@require_torch +@require_librosa +@require_essentia +@require_pretty_midi +class Pop2PianoProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("susnato/pop2piano_dev") + tokenizer = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev") + processor = Pop2PianoProcessor(feature_extractor, tokenizer) + + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return Pop2PianoTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_feature_extractor(self, **kwargs): + return Pop2PianoFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_additional_features(self): + processor = Pop2PianoProcessor( + tokenizer=self.get_tokenizer(), + feature_extractor=self.get_feature_extractor(), + ) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer( + unk_token="-1", + eos_token="1", + pad_token="0", + bos_token="2", + ) + feature_extractor_add_kwargs = self.get_feature_extractor() + + processor = Pop2PianoProcessor.from_pretrained( + self.tmpdirname, + unk_token="-1", + eos_token="1", + pad_token="0", + bos_token="2", + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, Pop2PianoTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, Pop2PianoFeatureExtractor) + + def get_inputs(self): + """get inputs for both feature extractor and tokenizer""" + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + speech_samples = ds.sort("id").select([0])["audio"] + input_speech = [x["array"] for x in speech_samples][0] + sampling_rate = [x["sampling_rate"] for x in speech_samples][0] + + feature_extractor_outputs = self.get_feature_extractor()( + audio=input_speech, sampling_rate=sampling_rate, return_tensors="pt" + ) + model = Pop2PianoForConditionalGeneration.from_pretrained("susnato/pop2piano_dev") + token_ids = model.generate(input_features=feature_extractor_outputs["input_features"], composer="composer1") + dummy_notes = [ + [ + pretty_midi.Note(start=0.441179, end=2.159456, pitch=70, velocity=77), + pretty_midi.Note(start=0.673379, end=0.905578, pitch=73, velocity=77), + pretty_midi.Note(start=0.905578, 
end=2.159456, pitch=73, velocity=77), + pretty_midi.Note(start=1.114558, end=2.159456, pitch=78, velocity=77), + pretty_midi.Note(start=1.323537, end=1.532517, pitch=80, velocity=77), + ], + [ + pretty_midi.Note(start=0.441179, end=2.159456, pitch=70, velocity=77), + ], + ] + + return input_speech, sampling_rate, token_ids, dummy_notes + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Pop2PianoProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + input_speech, sampling_rate, _, _ = self.get_inputs() + + feature_extractor_outputs = feature_extractor( + audio=input_speech, sampling_rate=sampling_rate, return_tensors="np" + ) + processor_outputs = processor(audio=input_speech, sampling_rate=sampling_rate, return_tensors="np") + + for key in feature_extractor_outputs.keys(): + self.assertTrue(np.allclose(feature_extractor_outputs[key], processor_outputs[key], atol=1e-4)) + + def test_processor_batch_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Pop2PianoProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + audio, sampling_rate, token_ids, _ = self.get_inputs() + feature_extractor_output = feature_extractor(audio=audio, sampling_rate=sampling_rate, return_tensors="pt") + + encoded_processor = processor.batch_decode( + token_ids=token_ids, + feature_extractor_output=feature_extractor_output, + return_midi=True, + ) + + encoded_tokenizer = tokenizer.batch_decode( + token_ids=token_ids, + feature_extractor_output=feature_extractor_output, + return_midi=True, + ) + # check start timings + encoded_processor_start_timings = [token.start for token in encoded_processor["notes"]] + encoded_tokenizer_start_timings = [token.start for token in encoded_tokenizer["notes"]] + self.assertListEqual(encoded_processor_start_timings, encoded_tokenizer_start_timings) + + # check end timings + encoded_processor_end_timings = [token.end for token in encoded_processor["notes"]] + encoded_tokenizer_end_timings = [token.end for token in encoded_tokenizer["notes"]] + self.assertListEqual(encoded_processor_end_timings, encoded_tokenizer_end_timings) + + # check pitch + encoded_processor_pitch = [token.pitch for token in encoded_processor["notes"]] + encoded_tokenizer_pitch = [token.pitch for token in encoded_tokenizer["notes"]] + self.assertListEqual(encoded_processor_pitch, encoded_tokenizer_pitch) + + # check velocity + encoded_processor_velocity = [token.velocity for token in encoded_processor["notes"]] + encoded_tokenizer_velocity = [token.velocity for token in encoded_tokenizer["notes"]] + self.assertListEqual(encoded_processor_velocity, encoded_tokenizer_velocity) + + def test_tokenizer_call(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Pop2PianoProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + _, _, _, notes = self.get_inputs() + + encoded_processor = processor( + notes=notes, + ) + + self.assertTrue(isinstance(encoded_processor, BatchEncoding)) + + def test_processor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Pop2PianoProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + audio, sampling_rate, _, notes = self.get_inputs() + + inputs = processor( + audio=audio, + sampling_rate=sampling_rate, + notes=notes, + ) + + 
self.assertListEqual( + list(inputs.keys()), + ["input_features", "beatsteps", "extrapolated_beatstep", "token_ids"], + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_model_input_names(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Pop2PianoProcessor( + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + + audio, sampling_rate, _, notes = self.get_inputs() + feature_extractor(audio, sampling_rate, return_tensors="pt") + + inputs = processor( + audio=audio, + sampling_rate=sampling_rate, + notes=notes, + ) + self.assertListEqual( + list(inputs.keys()), + ["input_features", "beatsteps", "extrapolated_beatstep", "token_ids"], + ) diff --git a/tests/models/pop2piano/test_tokenization_pop2piano.py b/tests/models/pop2piano/test_tokenization_pop2piano.py new file mode 100644 index 00000000000000..b41557f83e883f --- /dev/null +++ b/tests/models/pop2piano/test_tokenization_pop2piano.py @@ -0,0 +1,418 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Please note that Pop2PianoTokenizer is too far from our usual tokenizers and thus cannot use the TokenizerTesterMixin class. 
+""" + +import os +import pickle +import shutil +import tempfile +import unittest + +from transformers.feature_extraction_utils import BatchFeature +from transformers.testing_utils import ( + is_pretty_midi_available, + is_torch_available, + require_pretty_midi, + require_torch, +) +from transformers.tokenization_utils import BatchEncoding + + +if is_torch_available(): + import torch + + +requirements_available = is_torch_available() and is_pretty_midi_available() +if requirements_available: + import pretty_midi + + from transformers import Pop2PianoTokenizer + + +## TODO : changing checkpoints from `susnato/pop2piano_dev` to `sweetcocoa/pop2piano` after the PR is approved + + +@require_torch +@require_pretty_midi +class Pop2PianoTokenizerTest(unittest.TestCase): + def setUp(self): + super().setUp() + self.tokenizer = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev") + + def get_input_notes(self): + notes = [ + [ + pretty_midi.Note(start=0.441179, end=2.159456, pitch=70, velocity=77), + pretty_midi.Note(start=0.673379, end=0.905578, pitch=73, velocity=77), + pretty_midi.Note(start=0.905578, end=2.159456, pitch=73, velocity=77), + pretty_midi.Note(start=1.114558, end=2.159456, pitch=78, velocity=77), + pretty_midi.Note(start=1.323537, end=1.532517, pitch=80, velocity=77), + ], + [ + pretty_midi.Note(start=0.441179, end=2.159456, pitch=70, velocity=77), + ], + ] + + return notes + + def test_call(self): + notes = self.get_input_notes() + + output = self.tokenizer( + notes, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=10, + return_attention_mask=True, + ) + + # check the output type + self.assertTrue(isinstance(output, BatchEncoding)) + + # check the values + expected_output_token_ids = torch.tensor( + [[134, 133, 74, 135, 77, 132, 77, 133, 77, 82], [134, 133, 74, 136, 132, 74, 134, 134, 134, 134]] + ) + expected_output_attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]) + + self.assertTrue(torch.allclose(output["token_ids"], expected_output_token_ids, atol=1e-4)) + self.assertTrue(torch.allclose(output["attention_mask"], expected_output_attention_mask, atol=1e-4)) + + def test_batch_decode(self): + # test batch decode with model, feature-extractor outputs(beatsteps, extrapolated_beatstep) + + # Please note that this test does not test the accuracy of the outputs, instead it is designed to make sure that + # the tokenizer's batch_decode can deal with attention_mask in feature-extractor outputs. For the accuracy check + # please see the `test_batch_decode_outputs` test. 
+ + model_output = torch.concatenate( + [ + torch.randint(size=[120, 96], low=0, high=70, dtype=torch.long), + torch.zeros(size=[1, 96], dtype=torch.long), + torch.randint(size=[50, 96], low=0, high=40, dtype=torch.long), + torch.zeros(size=[1, 96], dtype=torch.long), + ], + axis=0, + ) + input_features = BatchFeature( + { + "beatsteps": torch.ones([2, 955]), + "extrapolated_beatstep": torch.ones([2, 1000]), + "attention_mask": torch.concatenate( + [ + torch.ones([120, 96], dtype=torch.long), + torch.zeros([1, 96], dtype=torch.long), + torch.ones([50, 96], dtype=torch.long), + torch.zeros([1, 96], dtype=torch.long), + ], + axis=0, + ), + "attention_mask_beatsteps": torch.ones([2, 955]), + "attention_mask_extrapolated_beatstep": torch.ones([2, 1000]), + } + ) + + output = self.tokenizer.batch_decode(token_ids=model_output, feature_extractor_output=input_features)[ + "pretty_midi_objects" + ] + + # check length + self.assertTrue(len(output) == 2) + + # check object type + self.assertTrue(isinstance(output[0], pretty_midi.pretty_midi.PrettyMIDI)) + self.assertTrue(isinstance(output[1], pretty_midi.pretty_midi.PrettyMIDI)) + + def test_batch_decode_outputs(self): + # test batch decode with model, feature-extractor outputs(beatsteps, extrapolated_beatstep) + + # Please note that this test tests the accuracy of the outputs of the tokenizer's `batch_decode` method. + + model_output = torch.tensor( + [ + [134, 133, 74, 135, 77, 82, 84, 136, 132, 74, 77, 82, 84], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ] + ) + input_features = BatchEncoding( + { + "beatsteps": torch.tensor([[0.0697, 0.1103, 0.1509, 0.1916]]), + "extrapolated_beatstep": torch.tensor([[0.0000, 0.0406, 0.0813, 0.1219]]), + } + ) + + output = self.tokenizer.batch_decode(token_ids=model_output, feature_extractor_output=input_features) + + # check outputs + self.assertEqual(len(output["notes"]), 4) + + predicted_start_timings, predicted_end_timings = [], [] + for i in output["notes"]: + predicted_start_timings.append(i.start) + predicted_end_timings.append(i.end) + + # Checking note start timings + expected_start_timings = torch.tensor( + [ + 0.069700, + 0.110300, + 0.110300, + 0.110300, + ] + ) + predicted_start_timings = torch.tensor(predicted_start_timings) + + self.assertTrue(torch.allclose(expected_start_timings, predicted_start_timings, atol=1e-4)) + + # Checking note end timings + expected_end_timings = torch.tensor( + [ + 0.191600, + 0.191600, + 0.191600, + 0.191600, + ] + ) + predicted_end_timings = torch.tensor(predicted_end_timings) + + self.assertTrue(torch.allclose(expected_end_timings, predicted_end_timings, atol=1e-4)) + + def test_get_vocab(self): + vocab_dict = self.tokenizer.get_vocab() + self.assertIsInstance(vocab_dict, dict) + self.assertGreaterEqual(len(self.tokenizer), len(vocab_dict)) + + vocab = [self.tokenizer.convert_ids_to_tokens(i) for i in range(len(self.tokenizer))] + self.assertEqual(len(vocab), len(self.tokenizer)) + + self.tokenizer.add_tokens(["asdfasdfasdfasdf"]) + vocab = [self.tokenizer.convert_ids_to_tokens(i) for i in range(len(self.tokenizer))] + self.assertEqual(len(vocab), len(self.tokenizer)) + + def test_save_and_load_tokenizer(self): + tmpdirname = tempfile.mkdtemp() + + sample_notes = self.get_input_notes() + + self.tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = self.tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + self.tokenizer.add_special_tokens({"additional_special_tokens": 
additional_special_tokens}) + before_token_ids = self.tokenizer(sample_notes)["token_ids"] + before_vocab = self.tokenizer.get_vocab() + self.tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = self.tokenizer.__class__.from_pretrained(tmpdirname) + after_token_ids = after_tokenizer(sample_notes)["token_ids"] + after_vocab = after_tokenizer.get_vocab() + self.assertDictEqual(before_vocab, after_vocab) + self.assertListEqual(before_token_ids, after_token_ids) + self.assertIn("bim", after_vocab) + self.assertIn("bambam", after_vocab) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + + shutil.rmtree(tmpdirname) + + def test_pickle_tokenizer(self): + tmpdirname = tempfile.mkdtemp() + + notes = self.get_input_notes() + subwords = self.tokenizer(notes)["token_ids"] + + filename = os.path.join(tmpdirname, "tokenizer.bin") + with open(filename, "wb") as handle: + pickle.dump(self.tokenizer, handle) + + with open(filename, "rb") as handle: + tokenizer_new = pickle.load(handle) + + subwords_loaded = tokenizer_new(notes)["token_ids"] + + self.assertListEqual(subwords, subwords_loaded) + + def test_padding_side_in_kwargs(self): + tokenizer_p = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev", padding_side="left") + self.assertEqual(tokenizer_p.padding_side, "left") + + tokenizer_p = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev", padding_side="right") + self.assertEqual(tokenizer_p.padding_side, "right") + + self.assertRaises( + ValueError, + Pop2PianoTokenizer.from_pretrained, + "susnato/pop2piano_dev", + padding_side="unauthorized", + ) + + def test_truncation_side_in_kwargs(self): + tokenizer_p = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev", truncation_side="left") + self.assertEqual(tokenizer_p.truncation_side, "left") + + tokenizer_p = Pop2PianoTokenizer.from_pretrained("susnato/pop2piano_dev", truncation_side="right") + self.assertEqual(tokenizer_p.truncation_side, "right") + + self.assertRaises( + ValueError, + Pop2PianoTokenizer.from_pretrained, + "susnato/pop2piano_dev", + truncation_side="unauthorized", + ) + + def test_right_and_left_padding(self): + tokenizer = self.tokenizer + notes = self.get_input_notes() + notes = notes[0] + max_length = 20 + + padding_idx = tokenizer.pad_token_id + + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + padded_notes = tokenizer(notes, padding="max_length", max_length=max_length)["token_ids"] + padded_notes_length = len(padded_notes) + notes_without_padding = tokenizer(notes, padding="do_not_pad")["token_ids"] + padding_size = max_length - len(notes_without_padding) + + self.assertEqual(padded_notes_length, max_length) + self.assertEqual(notes_without_padding + [padding_idx] * padding_size, padded_notes) + + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + padded_notes = tokenizer(notes, padding="max_length", max_length=max_length)["token_ids"] + padded_notes_length = len(padded_notes) + notes_without_padding = tokenizer(notes, padding="do_not_pad")["token_ids"] + padding_size = max_length - len(notes_without_padding) + + self.assertEqual(padded_notes_length, max_length) + self.assertEqual([padding_idx] * padding_size + notes_without_padding, padded_notes) + + # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' + 
notes_without_padding = tokenizer(notes)["token_ids"] + + tokenizer.padding_side = "right" + padded_notes_right = tokenizer(notes, padding=False)["token_ids"] + self.assertEqual(len(padded_notes_right), len(notes_without_padding)) + self.assertEqual(padded_notes_right, notes_without_padding) + + tokenizer.padding_side = "left" + padded_notes_left = tokenizer(notes, padding="longest")["token_ids"] + self.assertEqual(len(padded_notes_left), len(notes_without_padding)) + self.assertEqual(padded_notes_left, notes_without_padding) + + tokenizer.padding_side = "right" + padded_notes_right = tokenizer(notes, padding="longest")["token_ids"] + self.assertEqual(len(padded_notes_right), len(notes_without_padding)) + self.assertEqual(padded_notes_right, notes_without_padding) + + tokenizer.padding_side = "left" + padded_notes_left = tokenizer(notes, padding=False)["token_ids"] + self.assertEqual(len(padded_notes_left), len(notes_without_padding)) + self.assertEqual(padded_notes_left, notes_without_padding) + + def test_right_and_left_truncation(self): + tokenizer = self.tokenizer + notes = self.get_input_notes() + notes = notes[0] + truncation_size = 3 + + # RIGHT TRUNCATION - Check that it correctly truncates when a maximum length is specified along with the truncation flag set to True + tokenizer.truncation_side = "right" + full_encoded_notes = tokenizer(notes)["token_ids"] + full_encoded_notes_length = len(full_encoded_notes) + truncated_notes = tokenizer(notes, max_length=full_encoded_notes_length - truncation_size, truncation=True)[ + "token_ids" + ] + self.assertEqual(full_encoded_notes_length, len(truncated_notes) + truncation_size) + self.assertEqual(full_encoded_notes[:-truncation_size], truncated_notes) + + # LEFT TRUNCATION - Check that it correctly truncates when a maximum length is specified along with the truncation flag set to True + tokenizer.truncation_side = "left" + full_encoded_notes = tokenizer(notes)["token_ids"] + full_encoded_notes_length = len(full_encoded_notes) + truncated_notes = tokenizer(notes, max_length=full_encoded_notes_length - truncation_size, truncation=True)[ + "token_ids" + ] + self.assertEqual(full_encoded_notes_length, len(truncated_notes) + truncation_size) + self.assertEqual(full_encoded_notes[truncation_size:], truncated_notes) + + # RIGHT & LEFT TRUNCATION - Check that nothing is done for 'longest' and 'no_truncation' + tokenizer.truncation_side = "right" + truncated_notes_right = tokenizer(notes, truncation=True)["token_ids"] + self.assertEqual(full_encoded_notes_length, len(truncated_notes_right)) + self.assertEqual(full_encoded_notes, truncated_notes_right) + + tokenizer.truncation_side = "left" + truncated_notes_left = tokenizer(notes, truncation="longest_first")["token_ids"] + self.assertEqual(len(truncated_notes_left), full_encoded_notes_length) + self.assertEqual(truncated_notes_left, full_encoded_notes) + + tokenizer.truncation_side = "right" + truncated_notes_right = tokenizer(notes, truncation="longest_first")["token_ids"] + self.assertEqual(len(truncated_notes_right), full_encoded_notes_length) + self.assertEqual(truncated_notes_right, full_encoded_notes) + + tokenizer.truncation_side = "left" + truncated_notes_left = tokenizer(notes, truncation=True)["token_ids"] + self.assertEqual(len(truncated_notes_left), full_encoded_notes_length) + self.assertEqual(truncated_notes_left, full_encoded_notes) + + def test_padding_to_multiple_of(self): + notes = self.get_input_notes() + + if self.tokenizer.pad_token is None: + self.skipTest("No padding token.") 
+ else: + normal_tokens = self.tokenizer(notes[0], padding=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = self.tokenizer(notes[0], pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # Should also work with truncation + normal_tokens = self.tokenizer(notes[0], padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # truncation to something which is not a multiple of pad_to_multiple_of raises an error + self.assertRaises( + ValueError, + self.tokenizer.__call__, + notes[0], + padding=True, + truncation=True, + max_length=12, + pad_to_multiple_of=8, + ) + + def test_padding_with_attention_mask(self): + if self.tokenizer.pad_token is None: + self.skipTest("No padding token.") + if "attention_mask" not in self.tokenizer.model_input_names: + self.skipTest("This model does not use attention mask.") + + features = [ + {"token_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]}, + {"token_ids": [1, 2, 3], "attention_mask": [1, 1, 0]}, + ] + padded_features = self.tokenizer.pad(features) + if self.tokenizer.padding_side == "right": + self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [1, 1, 0, 0, 0, 0]]) + else: + self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]]) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index e0d4a01695fad2..f542dae5f45c65 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -58,6 +58,8 @@ # used internally in the configuration class file "LongT5Config": ["feed_forward_proj"], # used internally in the configuration class file + "Pop2PianoConfig": ["feed_forward_proj"], + # used internally in the configuration class file "SwitchTransformersConfig": ["feed_forward_proj"], # having default values other than `1e-5` - we can't fix them without breaking "BioGptConfig": ["layer_norm_eps"], diff --git a/utils/check_repo.py b/utils/check_repo.py index 8be8469465d56f..98f2436ae3af45 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -66,6 +66,7 @@ "T5Stack", "MT5Stack", "UMT5Stack", + "Pop2PianoStack", "SwitchTransformersStack", "TFDPRSpanPredictor", "MaskFormerSwinModel", diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index 0649f6e028e193..b917bd9cdb3114 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -346,6 +346,8 @@ src/transformers/models/poolformer/configuration_poolformer.py src/transformers/models/poolformer/feature_extraction_poolformer.py src/transformers/models/poolformer/image_processing_poolformer.py src/transformers/models/poolformer/modeling_poolformer.py +src/transformers/models/pop2piano/configuration_pop2piano.py +src/transformers/models/pop2piano/modeling_pop2piano.py src/transformers/models/prophetnet/tokenization_prophetnet.py src/transformers/models/rag/tokenization_rag.py src/transformers/models/realm/configuration_realm.py
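A minimal end-to-end sketch of the API these tests exercise (feature extraction, `generate`, MIDI decoding), mirroring the flow in `test_real_music`. It assumes the final `sweetcocoa/pop2piano` checkpoint and the optional backends added in this PR (`essentia`, `librosa`, `pretty_midi`, `scipy`) are installed; `"pop_song.wav"` is a placeholder path, not part of the patch:

import librosa

from transformers import Pop2PianoFeatureExtractor, Pop2PianoForConditionalGeneration, Pop2PianoTokenizer

# Load any pop recording; 44.1 kHz matches the rate used in the tests above.
audio, sampling_rate = librosa.load("pop_song.wav", sr=44100)  # placeholder audio file

model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
tokenizer = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano")

# The feature extractor returns input_features plus beatstep information that the
# tokenizer needs later to map generated tokens back to absolute note timings.
inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt")

# "composer1" is one of the composer tokens defined in the model's generation config.
model_output = model.generate(input_features=inputs["input_features"], composer="composer1")

# batch_decode consumes the generated token ids together with the feature extractor
# output and returns pretty_midi objects that can be written out as .mid files.
decoded = tokenizer.batch_decode(token_ids=model_output, feature_extractor_output=inputs)
decoded["pretty_midi_objects"][0].write("output.mid")

The `Pop2PianoProcessor` added in this PR wraps the same two components, so the flow can also be written with a single `processor(...)` call followed by `processor.batch_decode(...)`, as the processor tests above demonstrate.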