In [1]:
import numpy as np

from os import listdir
from os.path import isdir, isfile, join

from kspon_jamo import text_to_tokens, tokens_to_text, n_symbols, normalize_ksponspeech, SOS, EOS

### Data file 리스트 만들기

In [2]:
# Build the list of (wav, txt) path pairs found in the dataset directory.
data_dir = 'dataset'
# Match on the actual '.wav' suffix rather than a substring, so names such as
# 'x.wav.bak' are not mistaken for recordings, and strip exactly that suffix
# when deriving the transcript name.
data_files = [{'wav': join(data_dir, file),
               'txt': join(data_dir, file[:-len('.wav')] + '.txt')}
              for file in listdir(data_dir) if file.endswith('.wav')]
print('# of data files :', len(data_files))
# listdir() returns names in arbitrary order; sort so the preview is stable.
data_files.sort(key=lambda x: x['txt'])
for data_file in data_files[:10]:
    print(data_file)

# of data files : 1000
{'wav': 'dataset/KsponSpeech_000001.wav', 'txt': 'dataset/KsponSpeech_000001.txt'}
{'wav': 'dataset/KsponSpeech_000002.wav', 'txt': 'dataset/KsponSpeech_000002.txt'}
{'wav': 'dataset/KsponSpeech_000003.wav', 'txt': 'dataset/KsponSpeech_000003.txt'}
{'wav': 'dataset/KsponSpeech_000004.wav', 'txt': 'dataset/KsponSpeech_000004.txt'}
{'wav': 'dataset/KsponSpeech_000005.wav', 'txt': 'dataset/KsponSpeech_000005.txt'}
{'wav': 'dataset/KsponSpeech_000006.wav', 'txt': 'dataset/KsponSpeech_000006.txt'}
{'wav': 'dataset/KsponSpeech_000007.wav', 'txt': 'dataset/KsponSpeech_000007.txt'}
{'wav': 'dataset/KsponSpeech_000008.wav', 'txt': 'dataset/KsponSpeech_000008.txt'}
{'wav': 'dataset/KsponSpeech_000009.wav', 'txt': 'dataset/KsponSpeech_000009.txt'}
{'wav': 'dataset/KsponSpeech_000010.wav', 'txt': 'dataset/KsponSpeech_000010.txt'}


### txt 파일 보기

In [3]:
# Read the third transcript in the sorted list (decoded as CP949) and show it.
file = data_files[2]['txt']
with open(file, encoding='cp949') as f:  # text read mode is the default
    l = f.read()
print(l)

b/ n/ 그래서 지호랑 계단 n/ 올라와서 b/ 막 위에 운동하는 기구 있대요. b/ 그서 그걸로 운동 할려구요. b/ n/



### 텍스트 normalization하기

In [4]:
# Normalize the raw transcript read in the previous cell. In the sample output
# shown for this notebook, the 'b/' and 'n/' markers no longer appear afterwards.
# NOTE: `l` is overwritten, so re-running this cell feeds already-normalized
# text back into normalize_ksponspeech.
l = normalize_ksponspeech(l)
print(l)

그래서 지호랑 계단 올라와서 막 위에 운동하는 기구 있대요. 그서 그걸로 운동 할려구요.


### token으로 바꾸기

In [5]:
# Encode the normalized text as an array of integer token ids (see the printed
# array). Presumably jamo-level ids, judging by the kspon_jamo module name --
# TODO confirm against that module.
tokens = text_to_tokens(l)
print(tokens)

[257 295 256 260 297 256 263 289 256  32 265 296 256 270 291 256 260 287
 264 256  32 257 300 256 259 287 258 256  32 264 291 260 256 260 287 256
 264 301 256 263 289 256  32 261 287 257 256  32 264 306 256 264 299 256
  32 264 293 258 256 259 291 264 256 270 287 256 258 295 258 256  32 257
 296 256 257 293 256  32 264 296 274 256 259 297 256 264 292 256  46  32
 257 295 256 263 289 256  32 257 295 256 257 289 260 256 260 291 256  32
 264 293 258 256 259 291 264 256  32 270 287 260 256 260 290 256 257 293
 256 264 292 256  46]


### token을 텍스트로 바꾸기

In [6]:
# Decode the token ids back into text; for this sample the printed result
# matches the normalized sentence shown earlier.
text_recon = tokens_to_text(tokens)
print(text_recon)

그래서 지호랑 계단 올라와서 막 위에 운동하는 기구 있대요. 그서 그걸로 운동 할려구요.


### SOS, EOS 추가하기 (optional)

In [7]:
# Prepend the SOS id and append the EOS id to the token sequence.
# NOTE: `tokens` is reassigned in place, so re-running this cell wraps the
# sequence in another SOS/EOS pair; re-run the tokenizing cell first.
tokens = np.concatenate([[SOS], tokens, [EOS]])
print(tokens)

[308 257 295 256 260 297 256 263 289 256  32 265 296 256 270 291 256 260
 287 264 256  32 257 300 256 259 287 258 256  32 264 291 260 256 260 287
 256 264 301 256 263 289 256  32 261 287 257 256  32 264 306 256 264 299
 256  32 264 293 258 256 259 291 264 256 270 287 256 258 295 258 256  32
 257 296 256 257 293 256  32 264 296 274 256 259 297 256 264 292 256  46
  32 257 295 256 263 289 256  32 257 295 256 257 289 260 256 260 291 256
  32 264 293 258 256 259 291 264 256  32 270 287 260 256 260 290 256 257
 293 256 264 292 256  46 309]


### Pytorch Dataset Class 만들기

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import librosa
import librosa.display
import matplotlib.pyplot as plt

In [9]:
class KSponSpeechDataset(torch.utils.data.Dataset):
    """Dataset of (log-mel spectrogram, token id sequence) pairs read from one folder.

    Every file in ``data_dir`` whose name ends with ``.wav`` is paired with the
    ``.txt`` file of the same stem. Samples whose token sequence or spectrogram
    exceeds the configured limits are not served; the next sample that fits is
    returned in their place.

    Args:
        data_dir: Directory holding the ``.wav`` / ``.txt`` pairs.
        max_text_len: Longest token sequence (SOS and EOS included) that is served.
        max_audio_len: Largest number of spectrogram frames that is served.
    """

    def __init__(self, data_dir, max_text_len=180, max_audio_len=450):
        self.max_text_len = max_text_len
        self.max_audio_len = max_audio_len
        # Test the real suffix (a substring test would also accept e.g.
        # 'x.wav.bak') and sort, because listdir() order is arbitrary and the
        # index -> sample mapping should be reproducible.
        wav_names = sorted(name for name in listdir(data_dir) if name.endswith('.wav'))
        self.data_files = [{'wav': join(data_dir, name),
                            'txt': join(data_dir, name[:-len('.wav')] + '.txt')}
                           for name in wav_names]

    def _get_audio(self, file):
        """Load ``file`` and return its log10 mel spectrogram with shape (frames, 80)."""
        # (samples,) mono waveform loaded at 16 kHz
        wav, _ = librosa.load(file, sr=16000, mono=True)
        # (80, frames) magnitude mel spectrogram. The audio is passed as `y=`
        # because recent librosa releases accept it by keyword only.
        S = librosa.feature.melspectrogram(y=wav, sr=16000, n_fft=1024, n_mels=80,
                                           hop_length=256, power=1.0)
        # The small offset keeps log10 finite for silent bins (floor of -5).
        S = np.log10(S + 1e-5)
        # (frames, 80)
        return S.T

    def _get_text(self, file):
        """Read a CP949 transcript and return its token ids wrapped in SOS/EOS."""
        with open(file, 'r', encoding='cp949') as f:
            l = f.read()
        l = normalize_ksponspeech(l)
        array = text_to_tokens(l)
        # Insert SOS and EOS
        array = np.concatenate([[SOS], array, [EOS]])
        return array

    def __getitem__(self, index):
        """Return ``(audio, text)`` tensors for ``index`` or the next sample that fits.

        Raises:
            IndexError: If ``index`` is out of range (same as list indexing).
            RuntimeError: If no sample in the dataset satisfies the length limits.
        """
        total = len(self)
        attempts = 0
        while True:
            entry = self.data_files[index]

            # Check the text first: it is far cheaper than decoding the audio.
            text = self._get_text(entry['txt'])
            if len(text) <= self.max_text_len:
                audio = self._get_audio(entry['wav'])
                if len(audio) <= self.max_audio_len:
                    return torch.FloatTensor(audio), torch.LongTensor(text)

            # Bound the search to one full pass so an unusable dataset fails
            # loudly instead of spinning forever.
            attempts += 1
            if attempts >= total:
                raise RuntimeError(
                    'no sample satisfies max_text_len=%d and max_audio_len=%d'
                    % (self.max_text_len, self.max_audio_len))
            index = (index + 1) % total

    def __len__(self):
        """Number of wav/txt pairs found in the data directory."""
        return len(self.data_files)

In [10]:
class KSponSpeechDataCollate():
    """Collate ``(audio, text)`` samples into right-padded batch tensors.

    Args:
        audio_pad_value: Fill value for audio frames past a sample's end. The
            default of -5 equals log10(1e-5), the floor of the log-mel features
            computed by the dataset's ``_get_audio``.
        text_pad_value: Fill value for token positions past a sample's end.
    """

    def __init__(self, audio_pad_value=-5.0, text_pad_value=0):
        self.audio_pad_value = audio_pad_value
        self.text_pad_value = text_pad_value

    def __call__(self, batch):
        """Pad every sample to the longest one in ``batch``.

        Args:
            batch: Sequence of ``(audio, text)`` pairs, where ``audio`` is a
                ``(frames, features)`` float tensor and ``text`` a 1-D long tensor.

        Returns:
            dict with ``audio`` (batch, max_frames, features) float32,
            ``audio_lengths`` (batch,) long, ``text`` (batch, max_tokens) long
            and ``text_lengths`` (batch,) long.

        Raises:
            ValueError: If ``batch`` is empty.
        """
        if len(batch) == 0:
            raise ValueError('cannot collate an empty batch')

        audio_lengths = [len(audio) for audio, _ in batch]
        text_lengths = [len(text) for _, text in batch]

        # Take the feature width from the data instead of hard-coding 80, so
        # the collate function keeps working if the mel resolution changes.
        n_features = batch[0][0].shape[-1]

        audio_padded = torch.full((len(batch), max(audio_lengths), n_features),
                                  self.audio_pad_value, dtype=torch.float32)
        text_padded = torch.full((len(batch), max(text_lengths)),
                                 self.text_pad_value, dtype=torch.long)

        for i, (audio, text) in enumerate(batch):
            audio_padded[i, :len(audio)] = audio
            text_padded[i, :len(text)] = text

        outputs = {'audio': audio_padded,
                   'audio_lengths': torch.tensor(audio_lengths, dtype=torch.long),
                   'text': text_padded,
                   'text_lengths': torch.tensor(text_lengths, dtype=torch.long)
                  }

        return outputs

In [11]:
# Wrap the dataset folder in a shuffling, multi-worker loader that pads each
# batch with the custom collate function.
dataset = KSponSpeechDataset(data_dir='dataset')
train_loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=8,
    collate_fn=KSponSpeechDataCollate(),
)
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7f5fa3fed820>


In [None]:
# Take just the first batch from the loader and convert each of its four
# fields to a NumPy array for inspection in the cells below.
for batch in train_loader:
    audio, audio_lengths, text, text_lengths = (
        batch[key].data.cpu().numpy()
        for key in ('audio', 'audio_lengths', 'text', 'text_lengths')
    )
    break

In [None]:
# Print the shape of the padded audio batch, then draw the first sample.
print(audio.shape)
plt.figure(figsize=[18, 3])
# The collate function builds audio as (batch, frames, 80), so the transpose
# hands specshow an (80, frames) array. Padded frames (filled with -5) are
# drawn as well.
librosa.display.specshow(audio[0].T)
plt.colorbar()
plt.show()

In [None]:
# Per-sample frame counts before padding: array shape first, then the values.
print(audio_lengths.shape, audio_lengths, sep='\n')

In [None]:
# Padded token batch: array shape first, then the first sample's token ids.
print(text.shape, text[0], sep='\n')

In [None]:
# Per-sample token counts before padding: array shape first, then the values.
print(text_lengths.shape, text_lengths, sep='\n')