## Note
- size: 特徴量の次元数
- length: 時系列方向の長さ

In [1]:
from torch.utils.data import Dataset

import torch
from torch import nn, optim
import pandas as pd
import torchaudio
import librosa
import numpy as np
import math

from torchmetrics.functional import char_error_rate, word_error_rate

import glob
import os
import re
import copy


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Keyword-argument bundles for creating tensors on the GPU; the two differ
# only in dtype.
# NOTE(review): neither dict is referenced in the visible cells — presumably
# meant to be splatted into tensor factory calls; confirm before relying on them.
tkwargs_int = dict(dtype=torch.int32, device="cuda")
tkwargs_float = dict(dtype=torch.float32, device="cuda")

In [3]:
class YesNoDatasetWav(Dataset):
    """Dataset of yes/no recordings whose transcript is encoded in the file name.

    Every ``*.wav`` file directly inside ``wav_dir_path`` must have a stem made
    only of ``1`` (transcribed as "yes"), ``0`` (transcribed as "no") and ``_``
    (word separator), e.g. ``1_0_1.wav`` -> "yes no yes".  The transcript is
    stored as a list of indices into ``self.labels``.

    Attributes:
        labels: Output vocabulary; ``"_"`` is the last entry and is used by the
            rest of the notebook as the CTC blank.
        label_to_idx: Mapping from label string to its index in ``labels``.
        dataset: DataFrame with columns ``path`` and ``text_idx``.
        model_sample_rate: Sample rate the audio is resampled to on load.

    Args:
        wav_dir_path: Directory holding the wav files.  A trailing path
            separator is optional.
        model_sample_rate: Target sample rate for ``__getitem__``.

    Raises:
        ValueError: If a file stem contains a character other than 0, 1 or _.
    """

    def __init__(self, wav_dir_path, model_sample_rate):
        super().__init__()

        self.labels = ["y", "e", "s", "n", "o", "<space>", "_"]
        self.label_to_idx = {label: i for i, label in enumerate(self.labels)}

        # File-name character -> sequence of labels it stands for.
        char_to_labels = {"1": "yes", "0": "no", "_": ["<space>"]}

        rows = []
        # os.path.join works with or without a trailing separator (plain string
        # concatenation silently matched nothing without one).  sorted() makes
        # the row order independent of the file system's listing order.
        pattern = os.path.join(wav_dir_path, "*.wav")
        for wav_file_path in sorted(glob.glob(pattern)):
            file_name = os.path.splitext(os.path.basename(wav_file_path))[0]
            text_idx = []
            for c in file_name:
                if c not in char_to_labels:
                    raise ValueError(
                        f"Invalid Dir Path: unexpected character {c!r} "
                        f"in file name {wav_file_path!r}"
                    )
                text_idx.extend(self.label_to_idx[ic] for ic in char_to_labels[c])
            rows.append([wav_file_path, text_idx])

        self.dataset = pd.DataFrame(rows, columns=["path", "text_idx"])
        self.model_sample_rate = model_sample_rate

    def __len__(self):
        """Return the number of wav files found."""
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        """Load one recording.

        Returns:
            A ``(wav_data, text_idx)`` pair: the waveform resampled to
            ``model_sample_rate`` with a leading size-1 dimension removed, and
            the transcript indices as a tensor.
        """
        wav_file_path = self.dataset.iloc[idx, 0]
        text_idx = self.dataset.iloc[idx, 1]
        wav_data, sample_rate = torchaudio.load(wav_file_path)
        if sample_rate != self.model_sample_rate:
            wav_data = torchaudio.functional.resample(wav_data, sample_rate, self.model_sample_rate)
        # squeeze(0) only drops the first dimension when it has size 1.
        wav_data = wav_data.squeeze(0)
        return wav_data, torch.tensor(text_idx)

        

In [4]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

def collate_fn(batch):
    """Collate ``(wav, text_idx)`` pairs into right-padded batch tensors.

    Args:
        batch: Sequence of ``(wav, text_idx)`` pairs of 1-D tensors, one pair
            per utterance.

    Returns:
        padded_wavs: ``[B, max_wav_len]`` waveforms, zero-padded on the right.
        padded_text_idxs: ``[B, max_text_len]`` label indices, padded with -1.
        original_wav_lens: ``[B]`` waveform lengths before padding.
        original_text_idx_lens: ``[B]`` label-sequence lengths before padding.
    """
    wavs, text_idxs = zip(*batch)
    original_wav_lens = torch.tensor([len(wav) for wav in wavs])
    original_text_idx_lens = torch.tensor([len(text_idx) for text_idx in text_idxs])

    # Pad both streams with pad_sequence so everything stays a torch tensor;
    # no detour through numpy is needed for the waveforms.
    padded_wavs = pad_sequence(wavs, batch_first=True, padding_value=0.0)
    # -1 is not a valid label index, so padding can be recognised when decoding.
    padded_text_idxs = pad_sequence(text_idxs, batch_first=True, padding_value=-1)

    return padded_wavs, padded_text_idxs, original_wav_lens, original_text_idx_lens

In [5]:
from torchaudio import pipelines
# torchaudio's WAV2VEC2_BASE bundle; later cells also read the encoder from it.
bundle = pipelines.WAV2VEC2_BASE

# The dataset resamples every recording to the bundle's sample rate on load.
wav_dir_path = "../datasets/waves_yesno/"
model_sample_rate = bundle.sample_rate
dataset = YesNoDatasetWav(
    wav_dir_path=wav_dir_path,
    model_sample_rate=model_sample_rate,
)

In [6]:
from torch.utils.data import random_split, DataLoader
# Split into training (70%) and test (30%) subsets.
# test_size is derived from train_size so the two always sum to len(dataset).
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
# Seed the split so the same files land in the test set on every
# Restart & Run All; otherwise test metrics are not comparable between runs.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(0),
)
BATCH_SIZE = 2
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    # Skip the final incomplete batch so every training batch has BATCH_SIZE items.
    drop_last=True,
    # Page-locked host memory; may speed up host-to-GPU copies.
    pin_memory=True,
    collate_fn=collate_fn
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    # Evaluation must see every test utterance, in a fixed order:
    # no shuffling and no dropped trailing batch.
    shuffle=False,
    num_workers=4,
    drop_last=False,
    pin_memory=True,
    collate_fn=collate_fn
)

In [7]:
import sys
sys.path.append("..")
from modules.preprocessing.subsampling import Conv2DSubSampling
from modules.transformers.encoder import TransformerEncoder

In [8]:
class Model(nn.Module):
    """wav2vec 2.0 encoder with a linear + log-softmax head over the labels.

    The encoder and its embedding width are taken from the module-level
    ``bundle``.

    Args:
        nlabel: Number of output labels (width of the final linear layer).
    """

    def __init__(self, nlabel):
        super().__init__()
        self.nlabel = nlabel
        self.in_size = bundle._params["encoder_embed_dim"]
        self.wav2vec_encoder = bundle.get_model()
        self.fc = nn.Linear(
            in_features=self.in_size,
            out_features=self.nlabel,
            bias=True,
        )
        self.log_softmax = nn.functional.log_softmax

    def forward(self, x, x_lengths):
        """Compute per-frame label log-probabilities.

        Args:
            x: Padded waveforms, ``[B, T]``.
            x_lengths: Waveform lengths before padding, ``[B]``.

        Returns:
            log_probs: ``[B, T', nlabel]`` log-softmax scores over labels.
            y_lengths: ``[B]`` lengths of the non-padded part of each output
                sequence, as reported by the encoder.
        """
        # One entry per encoder layer (per the original notes: [L, B, T', in_size]);
        # only the last layer feeds the classification head.
        features_per_layer, y_lengths = self.wav2vec_encoder.extract_features(x, x_lengths)
        last_layer = features_per_layer[-1]

        logits = self.fc(last_layer)  # [B, T', nlabel]
        return self.log_softmax(logits, dim=2), y_lengths
        

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"This learning will be running on {device}.")

# One model output per entry in the dataset's label vocabulary.
num_epochs = 1
num_labels = len(dataset.labels)

This learning will be running on cuda.


これ以降、各モデルごとに実験用のコードを記述していきます。

In [10]:
def ctc_simple_decode(hypotheses_idxs, labels, padding_idx):
    """Greedy CTC decoding: collapse consecutive repeats, then drop blanks.

    Args:
        hypotheses_idxs: Integer tensor ``[batch, time]`` of label indices.
        labels: List of label strings; must contain the blank ``"_"``.  The
            label ``"<space>"`` is rendered as ``" "``.
        padding_idx: Index value marking padding positions; these are skipped
            and do not separate repeated labels.

    Returns:
        List of decoded strings, one per row of ``hypotheses_idxs``.
    """
    blank_idx = labels.index("_")
    hypotheses = []
    for hypothesis_idxs in hypotheses_idxs.cpu().numpy():
        hypothesis = []
        prev_idx = None
        for idx in hypothesis_idxs:
            if idx == padding_idx:
                continue
            if idx != blank_idx and idx != prev_idx:
                label = labels[idx]
                hypothesis.append(" " if label == "<space>" else label)
            # Remember blanks too: a blank between two identical labels means
            # the label really occurs twice ("y _ y" -> "yy", not "y").
            prev_idx = idx
        hypotheses.append("".join(hypothesis))
    return hypotheses

In [11]:
from torch.optim.lr_scheduler import _LRScheduler
class TransformerLR(_LRScheduler):
    """TransformerLR class for adjustment of learning rate.

    The scheduling is based on the method proposed in 'Attention is All You Need':
    the scale grows linearly up to ``warmup_epochs`` scheduler steps and then
    decays with the inverse square root of the step count.  The scale is
    normalised so that it equals 1 (i.e. the base learning rate) at
    ``step == warmup_epochs``.

    Args:
        optimizer: Wrapped optimizer.
        warmup_epochs: Number of scheduler steps in the linear warm-up phase.
        last_epoch: Index of the last step; -1 starts from scratch.
        verbose: Forwarded to the base scheduler only when truthy.
    """

    def __init__(self, optimizer, warmup_epochs=1000, last_epoch=-1, verbose=False):
        """Initialize class."""
        # These must exist before the base __init__ runs: with the default
        # last_epoch=-1, get_lr() returns base_lr * normalize * warmup_epochs**-1.5
        # as the initial learning rate.
        self.warmup_epochs = warmup_epochs
        self.normalize = self.warmup_epochs**0.5
        if verbose:
            super().__init__(optimizer, last_epoch, verbose=verbose)
        else:
            # Do not pass the default through: `verbose` is deprecated in
            # recent PyTorch releases and passing any value emits a warning.
            super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return adjusted learning rate."""
        step = self.last_epoch + 1
        scale = self.normalize * min(step**-0.5, step * self.warmup_epochs**-1.5)
        return [base_lr * scale for base_lr in self.base_lrs]

In [12]:
from torch.utils.tensorboard import SummaryWriter
import time

model = Model(num_labels).to(device)

# reduction="sum" returns the summed loss over the utterances of a batch, so
# the running totals below are divided by the utterance count to get a
# per-utterance average for both training and test.
ctc_loss = nn.CTCLoss(reduction="sum", blank=dataset.label_to_idx["_"])
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
# NOTE(review): the scheduler is stepped once per epoch below; with
# warmup_epochs=1000 and a small num_epochs the learning rate stays near the
# bottom of the warm-up ramp.  Confirm whether per-iteration stepping was intended.
scheduler = TransformerLR(optimizer, warmup_epochs=1000)

writer = SummaryWriter()

for i in range(num_epochs):
    t0 = time.time()

    # ---- training ----
    model.train()
    epoch_loss = 0
    train_utterances = 0
    for padded_wavs, padded_text_idxs, original_wav_lens, original_text_idx_lens in train_dataloader:
        padded_wavs = padded_wavs.to(device)
        original_wav_lens = original_wav_lens.to(device)
        padded_text_idxs = padded_text_idxs.to(device)
        original_text_idx_lens = original_text_idx_lens.to(device)

        optimizer.zero_grad()

        log_probs, y_lengths = model(x=padded_wavs, x_lengths=original_wav_lens)

        # CTCLoss expects time-major input: [T', B, nlabel].
        loss = ctc_loss(log_probs.transpose(1, 0), padded_text_idxs, y_lengths, original_text_idx_lens)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        train_utterances += padded_wavs.size(0)
    scheduler.step()
    # Keep the training average in its own variable: the evaluation loop has
    # its own counters, and the summary line below must not mix them up.
    train_loss = epoch_loss / train_utterances if train_utterances else float("nan")
    writer.add_scalar("Loss/Training", train_loss, i)

    # ---- evaluation ----
    model.eval()
    epoch_test_loss = 0
    total_cer = 0
    test_utterances = 0
    test_batches = 0
    with torch.no_grad():
        for padded_wavs, padded_text_idxs, original_wav_lens, original_text_idx_lens in test_dataloader:
            padded_wavs = padded_wavs.to(device)
            original_wav_lens = original_wav_lens.to(device)
            padded_text_idxs = padded_text_idxs.to(device)
            original_text_idx_lens = original_text_idx_lens.to(device)

            log_probs, y_lengths = model(x=padded_wavs, x_lengths=original_wav_lens)
            loss = ctc_loss(log_probs.transpose(1, 0), padded_text_idxs, y_lengths, original_text_idx_lens)
            epoch_test_loss += loss.item()
            test_utterances += padded_wavs.size(0)
            test_batches += 1

            # for CER calculation
            hypotheses_idxs = log_probs.argmax(dim=2)
            # Frames past each utterance's valid length are padding; mark them
            # with the padding index (-1) so they are not decoded as characters.
            # Assumes y_lengths is a [B] tensor, as Model.forward documents.
            frame_pos = torch.arange(hypotheses_idxs.size(1), device=hypotheses_idxs.device)
            hypotheses_idxs = hypotheses_idxs.masked_fill(frame_pos[None, :] >= y_lengths[:, None], -1)
            hypotheses = ctc_simple_decode(hypotheses_idxs, dataset.labels, -1)
            teachers = ctc_simple_decode(padded_text_idxs, dataset.labels, -1)
            total_cer += char_error_rate(hypotheses, teachers).item()

    test_loss = epoch_test_loss / test_utterances if test_utterances else float("nan")
    # CER is averaged over batches (each batch contributes its own error rate).
    test_cer = total_cer / test_batches if test_batches else float("nan")
    writer.add_scalar("Loss/Test", test_loss, i)
    writer.add_scalar("CER/Test", test_cer, i)
    t1 = time.time()
    print(f"{i} epoch: {train_loss} loss, {test_loss} test loss, CER: {test_cer}, {t1 - t0} sec")

# Make sure the logged scalars reach disk even if the kernel is stopped afterwards.
writer.flush()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Final evaluation on the test set: print each hypothesis next to its
# reference transcript and report the batch-averaged character error rate.
# Set eval mode explicitly so this cell does not depend on the mode the
# previous cell happened to leave the model in.
model.eval()
total_cer = 0
cnt = 0
with torch.no_grad():
    for padded_wavs, padded_text_idxs, original_wav_lens, original_text_idx_lens in test_dataloader:
        padded_wavs = padded_wavs.to(device)
        original_wav_lens = original_wav_lens.to(device)
        padded_text_idxs = padded_text_idxs.to(device)
        original_text_idx_lens = original_text_idx_lens.to(device)

        log_probs, y_lengths = model(x=padded_wavs, x_lengths=original_wav_lens)

        hypotheses_idxs = log_probs.argmax(dim=2)
        # Frames past each utterance's valid length are padding; mark them with
        # the padding index (-1) so they are not decoded as characters.
        # Assumes y_lengths is a [B] tensor, as Model.forward documents.
        frame_pos = torch.arange(hypotheses_idxs.size(1), device=hypotheses_idxs.device)
        hypotheses_idxs = hypotheses_idxs.masked_fill(frame_pos[None, :] >= y_lengths[:, None], -1)
        hypotheses = ctc_simple_decode(hypotheses_idxs, dataset.labels, -1)
        teachers = ctc_simple_decode(padded_text_idxs, dataset.labels, -1)
        for hypothesis, teacher in zip(hypotheses, teachers):
            print(f"hyp: {hypothesis}")
            print(f"tea: {teacher}")
        total_cer += char_error_rate(hypotheses, teachers).item()
        cnt += 1
# An empty loader yields NaN rather than a division-by-zero traceback.
print(f"CER: {total_cer / cnt if cnt else float('nan')}")

hyp: yes yes no yes yes no no yes
tea: yes yes no yes yes no no yes
hyp: no yes no no no yes no no
tea: no yes no no no yes no no
hyp: no yes yes yes yes no yes no
tea: no yes yes yes yes no yes no
hyp: yes yes yes no yes no yes yes
tea: yes yes yes no yes no yes yes
hyp: no no yes no no yes yes yes
tea: no no yes no no yes yes yes
hyp: no yes no yes yes yes no no
tea: no yes no yes yes yes no no
hyp: no no yes yes no no no yes
tea: no no yes yes no no no yes
hyp: no no no yes no no no yes
tea: no no no yes no no no yes
hyp: yes yes no no yes yes yes no
tea: yes yes no no yes yes yes no
hyp: yes no no no no no no no
tea: yes no no no no no no no
hyp: yes no yes yes yes yes no yes 
tea: yes no yes yes yes yes no yes
hyp: no no yes yes yes yes no no
tea: no no yes yes yes yes no no
hyp: no yes no no yes no yes yes
tea: no yes no no yes no yes yes
hyp: no no yes yes no yes yes yes
tea: no no yes yes no yes yes yes
hyp: yes yes yes no yes no yes no
tea: yes yes yes no yes no yes no
hyp: no