In [1]:
import yaml
# Load the three project configs (preprocessing, model, training).
# Each file is opened in a `with` block so the handle is closed right after parsing
# instead of being left to the garbage collector.
# NOTE(review): yaml.FullLoader can construct more than plain data types; only load
# config files you trust (yaml.safe_load would be the stricter choice).
with open("config/preprocess.yaml", "r") as f:
    config_p = yaml.load(f, Loader=yaml.FullLoader)
with open("config/model.yaml", "r") as f:
    config_m = yaml.load(f, Loader=yaml.FullLoader)
with open("config/train.yaml", "r") as f:
    config_t = yaml.load(f, Loader=yaml.FullLoader)

# Bundled in this order for get_model(), which receives the tuple as a whole.
configs = (config_p, config_m, config_t)

# One test entry per line; lines keep their trailing newline (stripped by the consumer).
# Blank lines are dropped so that a random pick never yields an entry that cannot be
# split into its "|"-separated fields.
# NOTE(review): the file is read with the platform default encoding although it holds
# non-ASCII text — confirm whether an explicit encoding should be passed.
with open("preprocessed_data/RWCP-SSD/test.txt", "r") as f:
    test_files = [line for line in f if line.strip()]

In [2]:
import sys

import torch

# Make ./scripts importable before pulling in the project modules.
# NOTE(review): presumably needed because modules under scripts/ import each other
# by their top-level names — confirm.
sys.path.append("./scripts")
from scripts.utils.model import get_model, get_vocoder

# Training step of the checkpoint to restore; change here to load another checkpoint.
RESTORE_STEP = 200000

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Acoustic model restored at RESTORE_STEP, and the vocoder used to turn its
# mel-spectrogram output into a waveform.
model = get_model(RESTORE_STEP, configs, DEVICE)
vocoder = get_vocoder(config_m, DEVICE)



restore_step 200000




Removing weight norm...


In [3]:
from scripts.dataset import Dataset

# Build the project Dataset from the preprocessing and training configs.
# Later cells use it only for its lookup tables (audiotype_map, symbol_to_id) and its
# `width` attribute.
dataset_ = Dataset(config_p, config_t)
# Show the mapping from sound label name to integer id.
print(f"sound label name : sound label id\n{dataset_.audiotype_map}")

sound label name : sound label id
{'bells5': 0, 'clock1': 1, 'coffmill': 2, 'cup1': 3, 'drum': 4, 'maracas': 5, 'shaver': 6, 'tear': 7, 'trashbox': 8, 'whistle3': 9}


In [4]:
from pathlib import Path
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2

def pil2cv(pil_im, color=False):
    """Convert a PIL image into an OpenCV-style uint8 array.

    Args:
        pil_im: Image (or any array-like) convertible with ``np.array``.
        color: When True, keep colour and only reorder channels
            (RGB -> BGR, RGBA -> BGRA). When False, reduce 3- and 4-channel
            input to a single grayscale channel.

    Returns:
        A ``np.uint8`` array. Two-dimensional (monochrome) input, and input whose
        channel count is neither 3 nor 4, is returned without colour conversion.
    """
    pixels = np.array(pil_im, dtype=np.uint8)

    # Monochrome: nothing to convert.
    if pixels.ndim == 2:
        return pixels

    channels = pixels.shape[2]
    if channels == 3:  # colour
        code = cv2.COLOR_RGB2BGR if color else cv2.COLOR_RGB2GRAY
    elif channels == 4:  # colour with alpha (transparency)
        code = cv2.COLOR_RGBA2BGRA if color else cv2.COLOR_RGBA2GRAY
    else:
        # Any other channel count is passed through untouched.
        return pixels
    return cv2.cvtColor(pixels, code)


def img_pad(im, max_width):
    """Centre ``im`` horizontally on a black canvas that is ``max_width`` wide.

    The missing width is split between the left and right sides; when it is odd,
    the extra pixel goes to the left. The height is left unchanged.

    Args:
        im: PIL image to pad. The fill colour is the 3-tuple ``(0, 0, 0)``.
            NOTE(review): this presumably requires a 3-band mode such as "RGB" — confirm.
        max_width: Target width in pixels.
            NOTE(review): an image wider than ``max_width`` is not handled specially.

    Returns:
        A new image in the same mode as ``im`` with the original pasted into it.
    """
    slack = max_width - im.width
    # True division followed by int() truncation; the parity term gives the odd
    # pixel to the left margin.
    half = slack / 2
    left = int(half + slack % 2)
    right = int(half)

    src_width, src_height = im.size
    padded = Image.new(im.mode, (src_width + right + left, src_height), (0, 0, 0))
    padded.paste(im, (left, 0))
    return padded

def generate_visualono(text, max_width):
    """Render ``text`` as a single-row grayscale image, one glyph tile per character.

    Font path, font size and colours are read from the global ``config_p``. Each
    character is drawn on its own ``font_size`` x ``font_size`` tile, padded to
    ``max_width`` with ``img_pad`` and pasted onto a white canvas.

    Args:
        text: String to render; iterated character by character.
        max_width: Width each glyph tile is padded to; the canvas is
            ``len(text) * max_width`` pixels wide.

    Returns:
        The canvas converted to PIL mode "L".
    """
    preprocessing = config_p["preprocessing"]
    fs = preprocessing["text"]["font_size"]
    image_cfg = preprocessing["image"]
    bgcolor = tuple(image_cfg["background_color"])
    txtcolor = tuple(image_cfg["text_color"])
    font = ImageFont.truetype(str(Path(config_p["path"]["font_path"])), fs)

    canvas = Image.new("RGB", (len(text) * max_width, fs), (255, 255, 255))
    for index, char in enumerate(text):
        glyph = Image.new("RGB", (fs, fs), bgcolor)
        ImageDraw.Draw(glyph).text((0, 0), char, fill=txtcolor, font=font)
        # NOTE(review): tiles are placed every `fs` pixels although each padded tile
        # and the canvas are sized by `max_width`; the two only line up when
        # max_width == fs — confirm this matches the preprocessing used in training.
        canvas.paste(img_pad(glyph, max_width), (index * fs, 0))
    return canvas.convert("L")

def get_input(class_id, onomatopoeia):
    """Assemble a one-sample inference batch for a sound class and an onomatopoeia.

    Args:
        class_id: Integer id of the sound label.
        onomatopoeia: Text to synthesise; every character must be a key of
            ``dataset_.symbol_to_id`` (a missing symbol raises ``KeyError``).

    Returns:
        A 12-element tuple: names, class-id array, symbol-id array of shape
        ``(1, len(onomatopoeia))``, text-length array, maximum text length, five
        ``None`` placeholders, a one-element list holding the rendered text image,
        and ``[None]``.
        NOTE(review): the ``None`` slots presumably stand for fields that are only
        needed during training — confirm against the model's forward signature.
    """
    names = [onomatopoeia]
    class_ids = np.array([class_id])
    symbol_ids = np.array([[dataset_.symbol_to_id[symbol] for symbol in onomatopoeia]])
    text_lens = np.array([len(onomatopoeia)])
    image = generate_visualono(onomatopoeia, max_width=dataset_.width)
    return (
        names,
        class_ids,
        symbol_ids,
        text_lens,
        max(text_lens),
        None, None, None, None, None,
        [image],
        [None],
    )

In [5]:
from scripts.utils.tools import to_device
from scripts.utils.model import vocoder_infer
import random
import IPython.display as ipd

# Pick one random entry from the test list. Each entry is a "|"-separated record;
# only the sound label (2nd field) and the onomatopoeia (5th field) are used here.
test_sample = random.choice(test_files)
_, sound_label, _, _, onomatopoeia = test_sample.split("|")
sound_label_id = dataset_.audiotype_map[sound_label]
# The last field still carries the line's trailing newline.
onomatopoeia = onomatopoeia.strip("\n")

batch = get_input(
    sound_label_id,
    onomatopoeia
)
batch = to_device(batch, DEVICE)

# Inference only: disable autograd so no graph is built for the forward pass.
# batch[0] (the names) is skipped; the remaining fields are passed positionally.
with torch.no_grad():
    output = model(*(batch[1:]), config_t["use_image"])

# output[1] is fed to the vocoder after swapping its last two axes.
# NOTE(review): presumably the predicted mel-spectrogram — confirm against the model.
wav = vocoder_infer(
    mels = output[1].detach().transpose(1,2),
    vocoder = vocoder,
    model_config=config_m,
    preprocess_config=config_p,
    Normalize=False
)

print(f"sound label: {sound_label}\nOnomatopoeia: {batch[0][0]}")
ipd.display(ipd.Audio(wav, rate=config_p["preprocessing"]["audio"]["sampling_rate"]))

sound label: whistle3
Onomatopoeia: ピーィ
