In [9]:
from tqdm import trange
import librosa
import numpy as np
import pandas as pd
import torch
DATASET_PATH = "E:/한국어 음성데이터/KaiSpeech/"
TRAIN_LIST_PATH = "./data/data_list/train_list.csv"

In [2]:
def load_data_list(data_list_path, dataset_path):
    data_list = pd.read_csv(data_list_path, "r", delimiter = ",", encoding="cp949")
    audio_paths = list(dataset_path + data_list["audio"])
    label_paths = list(dataset_path + data_list["label"])

    return audio_paths, label_paths

In [3]:
audio_paths, label_paths = load_data_list(data_list_path=TRAIN_LIST_PATH, dataset_path=DATASET_PATH)
audio_paths[:5]

['E:/한국어 음성데이터/KaiSpeech/KaiSpeech_103762.pcm',
 'E:/한국어 음성데이터/KaiSpeech/KaiSpeech_126833.pcm',
 'E:/한국어 음성데이터/KaiSpeech/KaiSpeech_367790.pcm',
 'E:/한국어 음성데이터/KaiSpeech/KaiSpeech_005465.pcm',
 'E:/한국어 음성데이터/KaiSpeech/KaiSpeech_133374.pcm']

In [4]:
def get_librosa_mfcc(filepath = None, n_mfcc = 33, del_silence = False, input_reverse = True, format='pcm'):
    if format == 'pcm':
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except: # exception handling
            logger.info("np.memmap error in %s" % filepath)
            return torch.zeros(1)
        sig = np.array([float(x) for x in pcm])
    elif format == 'wav':
        sig, _ = librosa.core.load(filepath, sr=16000)
    else: logger.info("%s is not Supported" % format)

    if del_silence:
        non_silence_indices = librosa.effects.split(sig, top_db=30)
        sig = np.concatenate([sig[start:end] for start, end in non_silence_indices])
    feat = librosa.feature.mfcc(y=sig,sr=16000, hop_length=160, n_mfcc=n_mfcc, n_fft=400, window='hamming')
    if input_reverse:
        feat = feat[:,::-1]

    return torch.FloatTensor( np.ascontiguousarray( np.swapaxes(feat, 0, 1) ) )

In [5]:
def spec_augment(feat, T=40, F=15, time_mask_num=2, freq_mask_num=2):
    feat_size = feat.size(1)
    seq_len = feat.size(0)

    # time mask
    for _ in range(time_mask_num):
        t = np.random.uniform(low=0.0, high=T)
        t = int(t)
        t0 = random.randint(0, seq_len - t)
        feat[t0 : t0 + t, :] = 0

    # freq mask
    for _ in range(freq_mask_num):
        f = np.random.uniform(low=0.0, high=F)
        f = int(f)
        f0 = random.randint(0, feat_size - f)
        feat[:, f0 : f0 + f] = 0

    return feat

In [6]:
def get_mfcc_pickle(audio_paths):
    mfcc_list = []
    for idx in trange(len(audio_paths)):
        mfcc_list.append(get_librosa_mfcc(filepath=audio_paths[idx], 
                                          n_mfcc=33,
                                          del_silence=False,
                                          input_reverse=True,
                                          format="pcm"))
    with open("./data/pickle/mfccs.txt", "wb") as f:
        pickle.dump(mfcc_list, f)

In [7]:
def get_augment_pickle(audio_paths):
    augment_list = []
    for idx in trange(len(audio_paths)):
        feat = get_librosa_mfcc(filepath=audio_paths[idx], 
                                          n_mfcc=33,
                                          del_silence=False,
                                          input_reverse=True,
                                          format="pcm")
        augment_list.append(spec_augment(feat, T=40, F=15, time_mask_num=2, freq_mask_num=2))
    with open("./data/pickle/augments.txt", "wb") as f:
        pickle.dump(augment_list, f)

In [10]:
get_mfcc_pickle(audio_paths)
get_augment_pickle(audio_paths)

 10%|██████▉                                                               | 62848/633700 [2:53:09<24:57:48,  6.35it/s]

NameError: name 'logger' is not defined