# config.py: hyperparameter settings

In [1]:
# data loader
N_CLASS = 256  # number of waveform quantization classes (presumably SpeechDataset's n_class -- confirm in trainer)
FRAME_LENGTH = 0.025  # analysis window; presumably seconds, as SpeechDataset multiplies frame_length by the sample rate
FRAME_STRIDE = 0.010  # hop between windows; presumably seconds, same reasoning as FRAME_LENGTH
N_MELS = 40  # number of mel bands; read directly by SpeechDataset.create_dataset
BATCHES = 100  # NOTE(review): not used in the visible code -- confirm where it is consumed
SLICE_LENGTH = 8193  # presumably samples per saved slice (SpeechDataset's slice_length) -- confirm in trainer
TEST_SIZE = 0.1  # presumably the fraction of slices held out (SpeechDataset's test_size) -- confirm in trainer
FILE_PREFIX = 'LJ001'  # presumably the `prefix` filter passed to create_dataset -- confirm in trainer
MAX_FILES = 40  # presumably the `max_files` cap passed to create_dataset -- confirm in trainer

# model
COND_CHANNELS = N_MELS  # conditioning channel count is tied to the number of mel bands
HIDDEN_CHANNELS = 128
N_REPEAT = 2
N_LAYER = 9

# training
BATCH_SIZE = 4
MAX_EPOCHS = 300
MAX_NORM = 4  # presumably a gradient-clipping norm -- TODO confirm in trainer
LEARNING_RATE = 0.0001

# Logging / validation / inference cadence and limits; the units (steps vs.
# epochs) are not visible here -- TODO confirm in trainer.
PRINT_FREQ = 1
VALID_FREQ = 100
INFER_FREQ = 100
MAX_VALID = 1

# generate
MAX_GENERATE = 1
MAX_GENERATE_LENGTH = 4096
SAMPLE_RATE = 22050  # Hz; NOTE(review): matches librosa.load's default resampling rate -- confirm this is intentional

# utils.py: helpers for checking memory usage and for quantizing (sampling) the audio signal

In [None]:
import numpy as np


def debug_memory():
    """Print the type and size of every live tensor known to the garbage collector.

    Walks ``gc.get_objects()`` and reports each object that is a tensor, or
    that exposes a ``data`` attribute holding a tensor. This is a best-effort
    debugging aid: objects that raise while being inspected are skipped.
    """
    import gc

    import torch

    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                print(type(obj), obj.size())
        except Exception:
            # Arbitrary objects may raise from attribute access; ignore them.
            # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
            # SystemExit working while the scan is running.
            pass


def quantize_signal(data, n_class):
    """Compand a waveform with a mu-law style curve and quantize it to class indices.

    The signal is compressed with ``sign(x) * log(1 + n_class * |x|) /
    log(n_class + 1)`` (note: ``n_class`` itself is used as the mu value) and
    then binned against ``n_class`` evenly spaced edges on [-1, 1].

    Args:
        data: array-like of samples, expected to lie in [-1, 1].
        n_class: number of quantization classes.

    Returns:
        Integer array (same shape as ``data``) of indices in
        ``[0, n_class - 1]``.
    """
    mu_x = np.sign(data) * np.log(1 + n_class * np.abs(data)) / np.log(n_class + 1)
    bins = np.linspace(-1, 1, n_class)
    quantized_data = np.digitize(mu_x, bins) - 1
    # Samples below -1 fall left of the first bin edge and would come out as
    # -1, which negative indexing later turns into the *last* class. Clamp so
    # out-of-range input saturates at the nearest valid class instead.
    return np.clip(quantized_data, 0, n_class - 1)


def dequantize_signal(data, n_class):
    """Map class indices back to waveform amplitudes by expanding the companded value.

    Args:
        data: array-like of class indices.
        n_class: number of quantization classes used when quantizing.

    Returns:
        Array of expanded sample values.
    """
    # Rescale indices so that 0 -> -1 and n_class -> +1.
    companded = (data / n_class) * 2. - 1
    # Undo the logarithmic compression on the magnitude, then restore the sign.
    magnitude = (np.exp(np.abs(companded) * np.log(n_class + 1)) - 1) / n_class
    return np.sign(companded) * magnitude

# datasets.py

In [None]:
from torch.utils.data import Dataset
from config import *
import numpy as np
import librosa
import random
import torch
import os
from utils import *


class SpeechDataset(Dataset):
    """Fixed-length slices of quantized audio with mel-spectrogram conditioning.

    ``create_dataset`` preprocesses LJSpeech wav files into numbered
    ``x_<i>.npy`` / ``cond_<i>.npy`` pairs under ``dataset_path``.
    ``init_dataset`` then splits those pairs into a train part (lowest
    indices) and a test part (highest indices), and ``__getitem__`` serves
    one pair as ``(one-hot input, next-sample target, conditioning)``.
    """

    def __init__(self,
                 n_class,
                 slice_length,
                 frame_length,
                 frame_stride,
                 test_size,
                 device,
                 dataset_path='./dataset'):
        """Store the preprocessing and loading parameters.

        Args:
            n_class: number of quantization classes for the waveform.
            slice_length: number of samples per saved slice.
            frame_length: spectrogram window length; multiplied by the sample
                rate to obtain ``n_fft``.
            frame_stride: spectrogram hop; multiplied by the sample rate to
                obtain ``hop_length``.
            test_size: fraction of the saved slices reserved for testing.
            device: device the returned tensors are created on.
            dataset_path: directory holding the ``.npy`` slices.
        """
        self.n_class = n_class
        self.slice_length = slice_length
        self.frame_length = frame_length
        self.frame_stride = frame_stride
        self.test_size = test_size
        self.device = device
        self.dataset_path = dataset_path

    def _slice_path(self, kind, idx):
        """Return the file path of slice ``idx`` for ``kind`` ('x' or 'cond')."""
        return os.path.join(self.dataset_path, '{}_{}.npy'.format(kind, idx))

    def _flush_slices(self, x, cond, count):
        """Save every complete slice in the buffer and return what is left over.

        Args:
            x: 1-D buffer of quantized samples.
            cond: 2-D conditioning buffer whose columns align with ``x``.
            count: index to use for the next saved slice.

        Returns:
            ``(rest_x, rest_cond, count)`` where the rest arrays hold the
            trailing samples that did not fill a whole slice (``None`` for
            both when nothing is left) and ``count`` is the next free index.
        """
        n_full = x.shape[0] // self.slice_length
        for k in range(n_full):
            start = k * self.slice_length
            stop = start + self.slice_length
            np.save(self._slice_path('x', count), x[start:stop])
            np.save(self._slice_path('cond', count), cond[:, start:stop])
            count += 1

        consumed = n_full * self.slice_length
        if consumed == x.shape[0]:
            return None, None, count
        # Keep only the samples that were NOT written. The previous expression
        # `x[-x.shape[0] % slice_length:]` parsed as `(-len) % slice_length`
        # and therefore carried over (and later re-saved) already-saved audio.
        return x[consumed:], cond[:, consumed:], count

    # Main entry point for preprocessing.
    def create_dataset(self, max_files, prefix=None, lj_path='./LJSpeech-1.1'):
        """Preprocess LJSpeech wav files into fixed-length ``.npy`` slices.

        File names are read from ``metadata.csv``, optionally filtered by
        ``prefix``, shuffled with a fixed seed and truncated to ``max_files``.
        Each file is converted to quantized samples plus a mel spectrogram
        stretched to sample resolution. Audio is concatenated across files,
        so a slice may span a file boundary; the incomplete tail after the
        last file is discarded.

        Args:
            max_files: maximum number of wav files to process.
            prefix: if given, keep only names whose part before the first
                ``-`` equals this value.
            lj_path: root directory of the LJSpeech corpus (contains
                ``metadata.csv`` and ``wavs/``).
        """
        wav_names = []

        # Collect the file names from the metadata file.
        with open(os.path.join(lj_path, 'metadata.csv'), encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the metadata
                name = line.split('|')[0]
                if prefix and name.split('-')[0] != prefix:
                    continue
                wav_names.append(name)

        # Fixed seed so the same subset of files is selected on every run.
        random.seed(42)
        random.shuffle(wav_names)
        wav_names = wav_names[:max_files]

        # np.save does not create missing directories.
        os.makedirs(self.dataset_path, exist_ok=True)

        count = 0
        x, cond = None, None
        for wav_name in wav_names:
            wav_path = os.path.join(lj_path, 'wavs', wav_name + '.wav')

            # Log-mel spectrogram used as the conditioning features.
            y, sr = librosa.core.load(wav_path)
            input_nfft = int(round(sr * self.frame_length))
            input_stride = int(round(sr * self.frame_stride))
            s = librosa.feature.melspectrogram(y=y, n_mels=N_MELS, n_fft=input_nfft, hop_length=input_stride)
            s = librosa.core.power_to_db(s, ref=np.max)

            # With ref=np.max the dB values are <= 0, and librosa's default
            # top_db=80 floors them at -80, so this maps to roughly [-1, 0]
            # (not [0, 1]).
            s /= 80.0

            # Quantize the waveform to n_class levels and stretch the
            # spectrogram frames to one column per audio sample.
            new_x = quantize_signal(y, self.n_class)
            new_cond = self.time_resolution(s, y.shape[0])
            print(wav_name, 'processed.')

            # Append to the leftover from the previous file, if there is one.
            if x is None:
                x, cond = new_x, new_cond
            else:
                x = np.concatenate((x, new_x))
                cond = np.column_stack((cond, new_cond))

            # Write all complete slices; the remainder is carried into the
            # next file's buffer.
            x, cond, count = self._flush_slices(x, cond, count)

    def init_dataset(self, test_mode):
        """Select the train or test split and compute both split lengths.

        Args:
            test_mode: when true, the dataset serves the test split.
        """
        self.test_mode = test_mode
        # Count only the sample files so unrelated files in the directory
        # cannot skew the number of (x, cond) pairs.
        tot = sum(1 for name in os.listdir(self.dataset_path)
                  if name.startswith('x_') and name.endswith('.npy'))
        self.test_length = int(tot*self.test_size)
        self.train_length = tot-self.test_length

    def __len__(self):
        """Return the number of slices in the active split."""
        return self.test_length if self.test_mode else self.train_length

    def __getitem__(self, idx):
        """Load slice ``idx`` of the active split.

        Returns:
            A tuple ``(inputs, targets, cond)``:
            one-hot samples without the last step, shape
            ``(n_class, L - 1)``; class indices shifted by one step, shape
            ``(L - 1,)``; and the conditioning without its last column.
        """
        if self.test_mode:
            # Test slices are stored after the training slices.
            idx += self.train_length
        x = np.load(self._slice_path('x', idx))
        cond = np.load(self._slice_path('cond', idx))

        # one hot encoding
        embedded_x = np.zeros((self.n_class, x.shape[0]))
        embedded_x[x, np.arange(x.shape[0])] = 1

        return torch.tensor(embedded_x[:, :-1], dtype=torch.float, device=self.device),\
               torch.tensor(x[1:], dtype=torch.long, device=self.device),\
               torch.tensor(cond[:, :-1], dtype=torch.float, device=self.device)

    def time_resolution(self, cond, target_length):
        """Stretch conditioning frames to ``target_length`` columns.

        Every column of ``cond`` is repeated
        ``target_length // cond.shape[1]`` times; the columns that remain at
        the end are left as zeros.

        Args:
            cond: 2-D array of shape ``(channels, frames)``.
            target_length: number of columns in the result.

        Returns:
            Array of shape ``(channels, target_length)``.
        """
        z = np.zeros((cond.shape[0], target_length))
        repeated_cond = np.repeat(cond, target_length//cond.shape[1], axis=1)
        z[:, :repeated_cond.shape[1]] = repeated_cond
        return z

# main.py: runs the code defined in the files above

In [None]:
from trainer import Trainer

# Entry script: build a Trainer (defined in trainer.py, not shown here) and run it.
tr = Trainer()
# Dataset creation is disabled here; presumably it has to run once so the
# preprocessed slices exist on disk before training -- confirm against Trainer.
#tr.create_dataset()
tr.train()
tr.generate()