In [1]:
from datasets import load_dataset
from datasets import Audio
import copy
import json
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# この二つは実験によっては変えるべき
self.tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)
self.feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

In [1]:
from torch.utils.data import Dataset
from torchaudio.functional import resample
from typing import List
import torch

import os
import pickle
class LJSpeech(Dataset):
    def __init__(self, dataset_pkl_path: str, sample_rate: int = 16000):
        # Paremeter
        self.feature_max_length = None
        self.label_max_length = None
        self.sample_rate = sample_rate

        if not os.path.isfile(dataset_pkl_path):
            dataset = load_dataset("../../datasets/loading_scripts/lj_speech.py", data_dir="../../datasets/LJSpeech-1.1/")
            dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
            dataset = dataset.remove_columns(["id"])

            self.extract_vocab(train_all_texts=dataset["train"]["text"], vocab_path="./vocab.json")
            with open("vocab.json", "r") as vocab_file:
                vocab = json.load(vocab_file)
            
            def prepare_dataset(batch):
                audio = batch["audio"]
                batch["input_values"] = resample(audio, orig_freq=audio["sampling_rate"], new_freq=self.sample_rate)
                batch["input_length"] = len(batch["input_values"])

                batch["labels"] = self.tokenizer(batch["text"]).input_ids
                return batch

            dataset = dataset.map(
                prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=4
            )
            with open(dataset_pkl_path, "wb") as f:
                pickle.dump(dataset, f)
        
        with open(dataset_pkl_path, "rb") as f:
            self.dataset = pickle.load(f)
        
        self.tokenizer = Wav2Vec2CTCTokenizer(
            "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
        )
    
    def __len__(self):
        return len(self.dataset["train"])
    
    def __getitem__(self, idx):
        return self.dataset["train"][idx]
    
    def collate_fn(self, batch):
        input_features = [
            {"input_values": feature["input_values"]} for feature in batch
        ]
        labels = [
            {"input_ids": feature["labels"]} for feature in batch
        ]
        input_feature_lenghts = torch.tensor([
            feature["input_length"] for feature in batch
        ])
        
        labels = self.tokenizer.pad(
            labels,
            padding=True,
            max_length=self.label_max_length,
            pad_to_multiple_of=1,
            return_tensors="pt",
        )
        input_features = self.feature_extractor.pad(
            input_features,
            padding=True,
            max_length=self.feature_max_length,
            pad_to_multiple_of=1,
            return_tensors="pt",
        )
        
        labels = labels["input_ids"].masked_fill(
            labels.attention_mask.ne(1), -100
        )

        input_features = input_features["input_values"]
        
        return input_features, labels, input_feature_lenghts

    def extract_vocab(
        self,
        train_all_texts: List = None, 
        test_all_texts : List = None,
        vocab_path: str = "./vocab.json",
        ) -> None:
        if train_all_texts is None:
            train_all_texts = []
        if test_all_texts is None:
            test_all_texts = []

        all_text = " ".join(train_all_texts + test_all_texts)
        vocab_list = list(set(all_text))

        vocab = {v: k for k, v in enumerate(vocab_list)}
        # use | as delimeter in stead of " "
        vocab["|"] = vocab[" "]
        # dekete unused char
        del vocab[" "]
        # add unk and pad token
        vocab["[UNK]"] = len(vocab)
        vocab["[PAD]"] = len(vocab)

        with open(vocab_path, "w") as vocab_file:
            json.dump(vocab, vocab_file)
    

            

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
BATCH_SIZE = 2
from torch.utils.data import DataLoader
train_dataset = LJSpeech(dataset_pkl_path="./ljspeech_dataset.pkl")
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    # 不完全なバッチの無視
    drop_last=True,
    # 高速化?
    pin_memory=True,
    collate_fn=train_dataset.collate_fn
)

In [13]:
tmp = train_dataloader.__iter__()
features, labels, feature_lengths = tmp.__next__()

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/shibutani/miniconda3/envs/py39/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/shibutani/miniconda3/envs/py39/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/tmp/ipykernel_102027/4021298803.py", line 68, in collate_fn
    input_features = self.feature_extractor.pad(
  File "/home/shibutani/miniconda3/envs/py39/lib/python3.9/site-packages/torch/utils/data/dataset.py", line 83, in __getattr__
    raise AttributeError
AttributeError


In [7]:
from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H
model = WAV2VEC2_ASR_BASE_960H.get_model()

In [14]:
WAV2VEC2_ASR_BASE_960H._labels

('|',
 'E',
 'T',
 'A',
 'O',
 'N',
 'I',
 'H',
 'S',
 'R',
 'D',
 'L',
 'U',
 'M',
 'W',
 'C',
 'F',
 'G',
 'Y',
 'P',
 'B',
 'V',
 'K',
 "'",
 'X',
 'J',
 'Q',
 'Z')