In [1]:
"""
Copyright 2019-present NAVER Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

#-*- coding: utf-8 -*-

def load_label(label_path, encoding='utf-8'):
    """Load a character vocabulary from a tab-separated label file.

    Every data line must contain exactly three tab-separated fields:
    ``index``, ``char`` and ``freq`` (``freq`` is read but not used).
    Lines whose first character is ``#`` are treated as comments, and
    lines that are empty after stripping are skipped.  A ``char`` field
    that is empty after stripping is stored as a single space.

    Args:
        label_path: Path of the label file.
        encoding: Text encoding used to read the file.  Defaults to UTF-8
            so the result does not depend on the platform locale.

    Returns:
        A ``(char2index, index2char)`` tuple: a dict mapping each character
        to its integer id and the inverse dict.

    Raises:
        ValueError: If a data line does not have exactly three fields or
            its index field is not an integer.
    """
    char2index = dict()  # [ch] = id
    index2char = dict()  # [id] = ch
    with open(label_path, 'r', encoding=encoding) as f:
        for line in f:
            if line.startswith('#'):
                continue

            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at EOF) carry no entry.
                continue

            index, char, _freq = record.split('\t')
            char = char.strip()
            if len(char) == 0:
                char = ' '

            index = int(index)
            char2index[char] = index
            index2char[index] = char

    return char2index, index2char


In [2]:
"""
The wavio code in this cell defines the functions:
readwav(file)
    Read a WAV file and return the tuple `(rate, sampwidth, data)`, where
    `data` is a numpy array.
writewav24(filename, rate, data)
    Write a numpy array to a 24 bit WAV file.
-----
Author: Warren Weckesser
License: BSD 2-Clause:
Copyright (c) 2015, Warren Weckesser
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""

import wave
import numpy as np


def _wav2array(nchannels, sampwidth, data):
    """data must be the string containing the bytes from the wav file."""
    num_samples, remainder = divmod(len(data), sampwidth * nchannels)
    if remainder > 0:
        raise ValueError('The length of data is not a multiple of '
                         'sampwidth * num_channels.')
    if sampwidth > 4:
        raise ValueError("sampwidth must not be greater than 4.")

    if sampwidth == 3:
        a = np.empty((num_samples, nchannels, 4), dtype=np.uint8)
        raw_bytes = np.fromstring(data, dtype=np.uint8)
        a[:, :, :sampwidth] = raw_bytes.reshape(-1, nchannels, sampwidth)
        a[:, :, sampwidth:] = (a[:, :, sampwidth - 1:sampwidth] >> 7) * 255
        result = a.view('<i4').reshape(a.shape[:-1])
    else:
        # 8 bit samples are stored as unsigned ints; others as signed ints.
        dt_char = 'u' if sampwidth == 1 else 'i'
        a = np.fromstring(data, dtype='<%s%d' % (dt_char, sampwidth))
        result = a.reshape(-1, nchannels)
    return result


def readwav(file):
    """Read an uncompressed wav file.

    Args:
        file: Anything accepted by ``wave.open`` (a path or a file object).

    Returns:
        A ``(rate, sampwidth, array)`` tuple: the frame rate, the sample
        width in bytes, and the numpy array produced by ``_wav2array``
        from all frames in the file.

    This function does not read compressed wav files.
    """
    # The context manager closes the wave reader even if a read fails.
    with wave.open(file) as wav:
        rate = wav.getframerate()
        nchannels = wav.getnchannels()
        sampwidth = wav.getsampwidth()
        data = wav.readframes(wav.getnframes())
    array = _wav2array(nchannels, sampwidth, data)
    return rate, sampwidth, array


def writewav24(filename, rate, data):
    """Create a 24 bit wav file.

    data must be "array-like", either 1- or 2-dimensional.  If it is 2-d,
    the rows are the frames (i.e. samples) and the columns are the channels.
    The data is assumed to be signed, and the values are assumed to be
    within the range of a 24 bit integer.  Floating point values are
    converted to integers.  The data is not rescaled or normalized before
    writing it to the file.  The input array is never modified.

    Args:
        filename: Destination passed to ``wave.open(..., 'wb')``.
        rate: Frame rate written to the wav header.
        data: Array-like of samples, shape ``(frames,)`` or
            ``(frames, channels)``.

    Example: Create a 3 second 440 Hz sine wave.
    >>> rate = 22050  # samples per second
    >>> T = 3         # sample duration (seconds)
    >>> f = 440.0     # sound frequency (Hz)
    >>> t = np.linspace(0, T, T*rate, endpoint=False)
    >>> x = (2**23 - 1) * np.sin(2 * np.pi * f * t)
    >>> writewav24("sine24.wav", rate, x)
    """
    a32 = np.asarray(data, dtype=np.int32)
    if a32.ndim == 1:
        # Convert to a 2D array with a single column.  reshape returns a new
        # view, so an int32 input array keeps its own shape (assigning to
        # ``a32.shape`` would reshape the caller's array in place).
        a32 = a32.reshape(-1, 1)
    # By shifting first 0 bits, then 8, then 16, the resulting output
    # is 24 bit little-endian.
    a8 = (a32.reshape(a32.shape + (1,)) >> np.array([0, 8, 16])) & 255
    wavdata = a8.astype(np.uint8).tobytes()

    # The context manager closes the writer even if writing fails.
    with wave.open(filename, 'wb') as w:
        w.setnchannels(a32.shape[1])
        w.setsampwidth(3)
        w.setframerate(rate)
        w.writeframes(wavdata)

In [6]:
"""
Copyright 2019-present NAVER Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

#-*- coding: utf-8 -*-

import os
import sys
import math
import time
import torch
import random
import threading
import logging
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np

# Logger used throughout this cell.  basicConfig attaches a stdout handler to
# the root logger using FORMAT; this logger's own level is then set to INFO,
# so the logger.debug(...) calls in this file are filtered out.
logger = logging.getLogger('root')
FORMAT = "[%(asctime)s %(filename)s:%(lineno)s - %(funcName)s()] %(message)s"
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format=FORMAT)
logger.setLevel(logging.INFO)

# Value used by _collate_fn to fill the unused tail of each target row.
PAD = 0
# FFT size passed to torch.stft.
N_FFT = 512
# Sample-rate constant from which the STFT hop and window lengths are derived.
SAMPLE_RATE = 16000

# Maps a key to its target string; filled by load_targets() and read by
# get_script().
target_dict = dict()

def load_targets(path, encoding='utf-8'):
    """Load ``key,target`` lines from a file into the global ``target_dict``.

    Each non-blank line must contain exactly one comma separating the key
    from the target string.  Existing entries with the same key are
    overwritten.  Lines that are empty after stripping are skipped.

    Args:
        path: Path of the comma-separated target file.
        encoding: Text encoding used to read the file.  Defaults to UTF-8
            so the result does not depend on the platform locale.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            comma-separated fields.
    """
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at EOF) carry no entry.
                continue
            key, target = record.split(',')
            target_dict[key] = target

# TODO 1:
# Switch the feature extraction to a Mel-spectrogram.

def get_spectrogram_feature(filepath):
    """Compute a magnitude spectrogram for one wav file.

    The samples are read with ``readwav`` (defined earlier in this
    notebook), flattened to 1-D, and passed through ``torch.stft`` with an
    ``N_FFT``-point FFT, a 10 ms hop and a 30 ms Hamming window.  Hop and
    window lengths are derived from the ``SAMPLE_RATE`` constant; the rate
    stored in the file is not used.

    Args:
        filepath: Path of the wav file.

    Returns:
        A float tensor with one row per STFT frame and one column per
        one-sided frequency bin.
    """
    # ``readwav`` lives in this notebook; there is no imported ``wavio``
    # module, so calling ``wavio.readwav`` raised NameError.
    _rate, _width, sig = readwav(filepath)
    sig = sig.ravel()

    window_length = int(0.030*SAMPLE_RATE)
    # NOTE(review): this indexes the legacy real-valued stft output
    # (last dim = [real, imag]).  Recent PyTorch releases expect an explicit
    # ``return_complex`` argument -- confirm against the installed version.
    stft = torch.stft(torch.FloatTensor(sig),
                        N_FFT,
                        hop_length=int(0.01*SAMPLE_RATE),
                        win_length=window_length,
                        window=torch.hamming_window(window_length),
                        center=False,
                        normalized=False,
                        onesided=True)

    # Magnitude = sqrt(real^2 + imag^2); already a float tensor, so no
    # numpy round trip is needed before transposing to (frames, bins).
    magnitude = (stft[:, :, 0].pow(2) + stft[:, :, 1].pow(2)).pow(0.5)
    feat = magnitude.transpose(0, 1)

    return feat


def _normalize(S):
    min_level_db = -100
    return np.clip((S - min_level_db) / -min_level_db, 0 ,1)


def _zero_padding(ndarray, max_m, max_n):
    zeros = np.zeros( ((max_m), max_n - len(ndarray[0])) )
    ndarray = np.append(ndarray, zeros, axis = 1)
    return ndarray
'''
'''
# Test with a Mel-spectrogram
def get_melspectrogram_feature(filepath):
    """Compute a normalized log-Mel spectrogram for one audio file.

    The audio is loaded with ``librosa.load``, turned into a 128-band Mel
    power spectrogram, converted to dB relative to its maximum, and rescaled
    with ``_normalize``.

    NOTE(review): ``librosa.load`` is called without ``sr``, so it uses
    librosa's default rate rather than ``SAMPLE_RATE`` -- confirm this is
    intended.

    Args:
        filepath: Path of the audio file.

    Returns:
        A float tensor with one row per frame and one column per Mel band.
    """
    y, sr = librosa.load(filepath)
    wav_S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    # ``ref`` is keyword-only in newer librosa; passing it by name works on
    # old and new releases alike.
    log_wav_S = librosa.power_to_db(wav_S, ref=np.max)
    norm_S = _normalize(log_wav_S)
    # Transpose from (mel bands, frames) to (frames, mel bands).
    feat = torch.FloatTensor(np.array(norm_S)).transpose(0, 1)

    return feat


def get_script(filepath, bos_id, eos_id):
    """Build the target id sequence for one label file path.

    The lookup key is the file name up to its first ``.``; both ``/`` and
    ``\\`` are accepted as directory separators, so Windows-style paths
    resolve to the same key as POSIX-style ones.  The target string stored
    in ``target_dict`` under that key is split on spaces and every
    non-empty token is converted to ``int``.

    Args:
        filepath: Path whose file name identifies the entry in
            ``target_dict``.
        bos_id: Id placed at the start of the result.
        eos_id: Id placed at the end of the result.

    Returns:
        ``[bos_id, <token ids...>, eos_id]`` as a list.

    Raises:
        KeyError: If the derived key is not in ``target_dict``.
        ValueError: If a token is not an integer literal.
    """
    filename = filepath.replace('\\', '/').split('/')[-1]
    key = filename.split('.')[0]
    script = target_dict[key]

    result = [bos_id]
    # Consecutive spaces yield empty tokens, which are skipped.
    result.extend(int(token) for token in script.split(' ') if len(token) > 0)
    result.append(eos_id)
    return result

class BaseDataset(Dataset):
    """Dataset pairing wav file paths with their label file paths.

    Args:
        wav_paths: Sequence of wav file paths.
        script_paths: Sequence of label paths, index-aligned with
            ``wav_paths``.
        bos_id: Id prepended to every target sequence.
        eos_id: Id appended to every target sequence.
    """

    def __init__(self, wav_paths, script_paths, bos_id=1307, eos_id=1308):
        self.wav_paths = wav_paths
        self.script_paths = script_paths
        self.bos_id, self.eos_id = bos_id, eos_id

    def __len__(self):
        """Return the number of wav paths."""
        return len(self.wav_paths)

    def __getitem__(self, idx):
        """Support ``dataset[idx]`` indexing by delegating to ``getitem``.

        Without this, indexing fell through to the ``Dataset`` base class,
        which does not implement it.
        """
        return self.getitem(idx)

    def count(self):
        """Return the number of wav paths (same value as ``len(self)``)."""
        return len(self.wav_paths)

    def getitem(self, idx):
        """Return ``(feature tensor, target id list)`` for sample ``idx``."""
        feat = get_spectrogram_feature(self.wav_paths[idx])
        # Logs the feature's frame count and per-frame width.
        logger.info('{:4d} {:4d}'.format(len(feat), len(feat[0])))
        script = get_script(self.script_paths[idx], self.bos_id, self.eos_id)
        return feat, script

def _collate_fn(batch):
    def seq_length_(p):
        return len(p[0])

    def target_length_(p):
        return len(p[1])

    seq_lengths = [len(s[0]) for s in batch]
    target_lengths = [len(s[1]) for s in batch]

    max_seq_sample = max(batch, key=seq_length_)[0]
    max_target_sample = max(batch, key=target_length_)[1]

    max_seq_size = max_seq_sample.size(0)
    max_target_size = len(max_target_sample)

    feat_size = max_seq_sample.size(1)
    batch_size = len(batch)

    seqs = torch.zeros(batch_size, max_seq_size, feat_size)

    targets = torch.zeros(batch_size, max_target_size).to(torch.long)
    targets.fill_(PAD)

    for x in range(batch_size):
        sample = batch[x]
        tensor = sample[0]
        target = sample[1]
        seq_length = tensor.size(0)
        seqs[x].narrow(0, 0, seq_length).copy_(tensor)
        targets[x].narrow(0, 0, len(target)).copy_(torch.LongTensor(target))

    return seqs, targets, seq_lengths, target_lengths

class BaseDataLoader(threading.Thread):
    """Producer thread that batches one dataset and pushes batches to a queue.

    The dataset is walked in index order, ``batch_size`` items at a time.
    Each group is shuffled, collated and enqueued.  Once the dataset is
    exhausted, one empty batch (see ``create_empty_batch``) is enqueued and
    the thread's ``run`` returns.
    """

    def __init__(self, dataset, queue, batch_size, thread_id):
        super().__init__()
        self.dataset = dataset
        self.queue = queue
        self.batch_size = batch_size
        self.thread_id = thread_id
        self.collate_fn = _collate_fn
        self.dataset_count = dataset.count()
        self.index = 0  # next dataset index to read

    def count(self):
        """Return the number of non-empty batches this loader will produce."""
        return math.ceil(self.dataset_count / self.batch_size)

    def create_empty_batch(self):
        """Return a batch with zero-sized tensors and empty length lists."""
        return (
            torch.zeros(0, 0, 0),
            torch.zeros(0, 0).to(torch.long),
            list(),
            list(),
        )

    def run(self):
        """Produce batches until the dataset runs out, then the empty batch."""
        logger.debug('loader %d start' % (self.thread_id))
        exhausted = False
        while not exhausted:
            items = []
            while len(items) < self.batch_size and self.index < self.dataset_count:
                items.append(self.dataset.getitem(self.index))
                self.index += 1

            if items:
                # Shuffle only within the batch; batch order follows the dataset.
                random.shuffle(items)
                batch = self.collate_fn(items)
            else:
                batch = self.create_empty_batch()
                exhausted = True

            self.queue.put(batch)
        logger.debug('loader %d stop' % (self.thread_id))

class MultiLoader():
    """Owns ``worker_size`` BaseDataLoader threads that share one queue.

    Worker ``i`` is built from ``dataset_list[i]`` and receives ``i`` as its
    thread id.
    """

    def __init__(self, dataset_list, queue, batch_size, worker_size):
        self.dataset_list = dataset_list
        self.queue = queue
        self.batch_size = batch_size
        self.worker_size = worker_size
        self.loader = [
            BaseDataLoader(self.dataset_list[worker_id], self.queue, self.batch_size, worker_id)
            for worker_id in range(self.worker_size)
        ]

    def _workers(self):
        """Yield the first ``worker_size`` loader threads in creation order."""
        for worker_id in range(self.worker_size):
            yield self.loader[worker_id]

    def start(self):
        """Start every worker thread."""
        for worker in self._workers():
            worker.start()

    def join(self):
        """Wait for every worker thread to finish."""
        for worker in self._workers():
            worker.join()



In [7]:
get_spectrogram_feature('marvin (1).wav').shape

torch.Size([78, 257])

In [8]:
get_melspectrogram_feature('marvin (1).wav').shape

torch.Size([35, 128])

In [9]:
get_spectrogram_feature('cat (1).wav').shape

torch.Size([97, 257])

In [10]:
get_melspectrogram_feature('cat (1).wav').shape

torch.Size([44, 128])

In [13]:
# NOTE(review): SOS_token and EOS_token are both 0, the same value as PAD
# defined earlier in this notebook -- confirm the three ids are meant to
# coincide.
SOS_token = 0
EOS_token = 0
# Raw string: in a normal literal the ``\U`` of ``C:\Users`` starts a unicode
# escape and the cell fails with a SyntaxError.
wav_paths = r'C:\Users\SooHwanKim\Desktop\수환\학교\예비캡스톤\소스코드\테스트\dataset'
# Number of training shards built by split_dataset.
workers = 4
# Number of records per batch, used by split_dataset to size the shards.
batch_size = 8

def split_dataset(wav_paths, script_paths, valid_ratio=0.05):
    """Split the records into per-worker training datasets and one validation set.

    Records are grouped into batches of the module-level ``batch_size``.
    The last ``ceil(batch_num * valid_ratio)`` batches form the validation
    set; the remaining batches are divided contiguously among the
    module-level ``workers`` training datasets.  Workers at the end may
    receive an empty dataset when there are fewer batches than workers.

    Args:
        wav_paths: Sliceable sequence of wav file paths.
        script_paths: Sliceable sequence of label paths, index-aligned with
            ``wav_paths``.
        valid_ratio: Fraction of batches reserved for validation.

    Returns:
        ``(train_batch_num, train_dataset_list, valid_dataset)``.
    """
    records_num = len(wav_paths)
    batch_num = math.ceil(records_num / batch_size)

    valid_batch_num = math.ceil(batch_num * valid_ratio)
    train_batch_num = batch_num - valid_batch_num

    batch_num_per_train_loader = math.ceil(train_batch_num / workers)

    train_begin = 0
    train_end_raw_id = 0
    train_dataset_list = list()

    # Iterate over the module-level ``workers``; ``config`` is not defined in
    # this notebook, so ``config.workers`` raised NameError.
    for _ in range(workers):

        train_end = min(train_begin + batch_num_per_train_loader, train_batch_num)

        # Convert batch indices into record indices.
        train_begin_raw_id = train_begin * batch_size
        train_end_raw_id = train_end * batch_size

        train_dataset_list.append(BaseDataset(
                                        wav_paths[train_begin_raw_id:train_end_raw_id],
                                        script_paths[train_begin_raw_id:train_end_raw_id],
                                        SOS_token, EOS_token))
        train_begin = train_end

    # Everything after the last training record belongs to validation.
    valid_dataset = BaseDataset(wav_paths[train_end_raw_id:], script_paths[train_end_raw_id:], SOS_token, EOS_token)

    return train_batch_num, train_dataset_list, valid_dataset

In [16]:
import torch
import librosa
import numpy as np
# Value used by _collate_fn to fill the unused tail of each target row.
PAD = 0
# FFT size passed to torch.stft.
N_FFT = 512
# Sample-rate constant from which the STFT hop and window lengths are derived.
SAMPLE_RATE = 16000
def get_spectrogram_feature(filepath):
    """Compute a magnitude spectrogram for one wav file and print its shape.

    The samples come from ``readwav``, are flattened to 1-D and passed
    through ``torch.stft`` with an ``N_FFT``-point FFT, a 10 ms hop and a
    30 ms Hamming window, both derived from ``SAMPLE_RATE``.

    Returns:
        A float tensor with one row per STFT frame and one column per
        one-sided frequency bin.
    """
    _, _, samples = readwav(filepath)
    waveform = torch.FloatTensor(samples.ravel())

    hop = int(0.01*SAMPLE_RATE)
    frame_length = int(0.030*SAMPLE_RATE)
    spectrum = torch.stft(waveform,
                          N_FFT,
                          hop_length=hop,
                          win_length=frame_length,
                          window=torch.hamming_window(frame_length),
                          center=False,
                          normalized=False,
                          onesided=True)

    # The last dimension holds the real and imaginary parts.
    real_part = spectrum[:, :, 0]
    imag_part = spectrum[:, :, 1]
    magnitude = (real_part.pow(2) + imag_part.pow(2)).pow(0.5)

    # Transpose to (frames, bins).
    feat = torch.FloatTensor(magnitude.numpy()).transpose(0, 1)
    print(feat.shape)
    return feat

def _normalize(S):
    min_level_db = -100
    return np.clip((S - min_level_db) / -min_level_db, 0 ,1)

# Test with a Mel-spectrogram
def get_melspectrogram_feature(filepath):
    """Compute a normalized log-Mel spectrogram and print its shape.

    The audio is loaded with ``librosa.load``, turned into a 257-band Mel
    power spectrogram, converted to dB relative to its maximum, and rescaled
    with ``_normalize``.

    NOTE(review): ``librosa.load`` is called without ``sr``, so it uses
    librosa's default rate rather than ``SAMPLE_RATE`` -- confirm this is
    intended.

    Args:
        filepath: Path of the audio file.

    Returns:
        A float tensor with one row per frame and one column per Mel band.
    """
    y, sr = librosa.load(filepath)
    wav_S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=257)
    # ``ref`` is keyword-only in newer librosa; passing it by name works on
    # old and new releases alike.
    log_wav_S = librosa.power_to_db(wav_S, ref=np.max)
    norm_S = _normalize(log_wav_S)
    # Transpose from (mel bands, frames) to (frames, mel bands).
    feat = torch.FloatTensor(np.array(norm_S)).transpose(0, 1)
    print(feat.shape)
    return feat

# Smoke test: run both extractors on the same sample file.  Each call prints
# the feature shape from inside the function, then the returned tensor is
# printed here.
# NOTE(review): the absolute path is machine-specific.
print(get_spectrogram_feature('C:/Users/SooHwanKim/Desktop/수환/학교/예비캡스톤/소스코드/테스트/dataset/bed/bed (1).wav'))
print(get_melspectrogram_feature('C:/Users/SooHwanKim/Desktop/수환/학교/예비캡스톤/소스코드/테스트/dataset/bed/bed (1).wav'))



torch.Size([97, 257])
tensor([[5.0011e+02, 8.4333e+02, 5.4157e+02,  ..., 2.7685e+00, 3.5773e+00,
         3.5385e-02],
        [2.1543e+02, 5.3764e+02, 9.8892e+02,  ..., 2.5724e+00, 1.6558e+00,
         1.5334e+00],
        [4.8812e+02, 4.5325e+02, 1.8081e+03,  ..., 2.8591e+00, 4.9504e+00,
         7.7795e-01],
        ...,
        [1.0760e+02, 1.6529e+03, 1.8535e+03,  ..., 2.5241e+00, 2.2961e+00,
         2.8644e+00],
        [1.1281e+03, 1.4562e+03, 1.5999e+03,  ..., 1.6011e+00, 1.2807e+00,
         2.3992e+00],
        [1.4431e+02, 5.4400e+02, 8.0558e+02,  ..., 4.5966e+00, 2.4539e+00,
         9.9558e-01]])
torch.Size([44, 257])
tensor([[0.6179, 0.6400, 0.6618,  ..., 0.2000, 0.2000, 0.2000],
        [0.5692, 0.5803, 0.6016,  ..., 0.2000, 0.2000, 0.2000],
        [0.5250, 0.5500, 0.5527,  ..., 0.2000, 0.2000, 0.2000],
        ...,
        [0.4954, 0.5519, 0.5805,  ..., 0.2000, 0.2000, 0.2000],
        [0.5219, 0.5745, 0.5715,  ..., 0.2000, 0.2000, 0.2000],
        [0.5938, 0.5954, 0.

In [1]:
import queue

In [2]:
# NOTE(review): this rebinds the name `queue` from the imported module to a
# Queue instance with maxsize 4*2, so `queue.Queue` is no longer reachable
# through this name until the module is imported again.
queue = queue.Queue(4*2)

In [3]:
print(queue)

<queue.Queue object at 0x000001B5625BD208>


In [5]:
queue.put(1)

In [7]:
queue.get()

1

In [11]:
from torch.utils.data import Dataset, DataLoader

class BaseDataset(Dataset):
    """Dataset pairing wav file paths with their label file paths.

    Args:
        wav_paths: Sequence of wav file paths.
        script_paths: Sequence of label paths, index-aligned with
            ``wav_paths``.
        bos_id: Id prepended to every target sequence.
        eos_id: Id appended to every target sequence.
    """

    def __init__(self, wav_paths, script_paths, bos_id=1307, eos_id=1308):
        self.wav_paths = wav_paths
        self.script_paths = script_paths
        self.bos_id, self.eos_id = bos_id, eos_id

    def __len__(self):
        """Return the number of wav paths."""
        return len(self.wav_paths)

    def __getitem__(self, idx):
        """Support ``dataset[idx]`` indexing by delegating to ``getitem``.

        Without this, indexing fell through to the ``Dataset`` base class,
        which does not implement it.
        """
        return self.getitem(idx)

    def count(self):
        """Return the number of wav paths (same value as ``len(self)``)."""
        return len(self.wav_paths)

    def getitem(self, idx):
        """Return ``(feature tensor, target id list)`` for sample ``idx``."""
        feat = get_spectrogram_feature(self.wav_paths[idx])
        # Logs the feature's frame count and per-frame width.
        logger.info('{:4d} {:4d}'.format(len(feat), len(feat[0])))
        script = get_script(self.script_paths[idx], self.bos_id, self.eos_id)
        # Progress marker logged after the script has been built.
        logger.info('script도 잘 됨')
        return feat, script

In [18]:
import queue
q = queue.Queue(8)

In [19]:
print(q)

<queue.Queue object at 0x000001B562618518>


In [None]:
q.put(1)

In [21]:
q.get()

[1, 2, 3, 4, 5, 6, 7, 8]

In [1]:
import torch
a = torch.Tensor([1,2,3,4,5])

In [2]:
a

tensor([1., 2., 3., 4., 5.])

In [3]:
a.numpy()

array([1., 2., 3., 4., 5.], dtype=float32)