In [7]:
import os
import malaya_speech
from malaya_speech import Pipeline
import gc
import librosa
import numpy as np
import torch
from tqdm import tqdm
import joblib

def norm_mel(y, sr):
    mel = librosa.feature.melspectrogram(y, sr = sr, n_mels = 80,hop_length=512)
    mel = librosa.power_to_db(mel, ref=np.max)
    return mel.T

quantized_model = malaya_speech.vad.deep_model(model = 'vggvox-v2', quantized = True)
p = Pipeline()

pipeline_left = (
    p.map(malaya_speech.generator.frames, frame_duration_ms = 30, sample_rate = 16000)
)

pipeline_right = (
    pipeline_left.batching(5)
    .foreach_map(quantized_model.predict)
    .flatten()
)

pipeline_left.foreach_zip(pipeline_right).map(malaya_speech.combine.without_silent,
                                             threshold_to_stop = 0.05)


def remove_silent(y,sr,time_length=80000):
    y =  p(y)['without_silent']
    if len(y) > time_length:
        y = y[0:0+time_length]
    else:
        y = np.pad(y, (0,time_length-len(y)))
    return y

def wav2featuresflow(y, sr):
    y_without_silent = remove_silent(y,sr)
    melspectrogram = np.rot90(norm_mel(y, sr))
    return melspectrogram

def preprocessing_X(wav_dir):
    files = os.listdir(wav_dir)
    files.sort() #正確排序很重要!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    X = torch.FloatTensor([])
    for f in tqdm(files):
        try:
            samples, sample_rate = malaya_speech.load(wav_dir+f)
            melspectrogram = wav2featuresflow(samples,sr=sample_rate)
            melspectrogram = melspectrogram.reshape(-1,80,157).copy()
            melspectrogram = torch.from_numpy(melspectrogram)
            X = torch.cat([X,torch.unsqueeze(melspectrogram,0)],dim=0)
            gc.collect()
        except:
            print(f)
    print('X shape:',X.shape)
    return X



In [8]:
os.listdir()

['Electricssound',
 'dogbarking',
 'expansion_data.ipynb',
 'GlassBreakingsound',
 'download-from-youtube.sh',
 'dogcrying',
 'Vaccumsound',
 'doghowling',
 'dishessound',
 'Catsound',
 '.ipynb_checkpoints']

In [9]:
dirs = os.listdir()
dirs.sort

<function list.sort(*, key=None, reverse=False)>

In [10]:
dirs

['Electricssound',
 'dogbarking',
 'expansion_data.ipynb',
 'GlassBreakingsound',
 'download-from-youtube.sh',
 'dogcrying',
 'Vaccumsound',
 'doghowling',
 'dishessound',
 'Catsound',
 '.ipynb_checkpoints']

In [11]:
dirs = ['Electricssound',
        'dogbarking',
        'GlassBreakingsound',
        'dogcrying',
        'Vaccumsound',
        'doghowling',
        'dishessound',
        'Catsound']

In [None]:
data = {}
for class_name in tqdm(dirs):
    wav_dir = class_name+'/'
    print(class_name)
    data[class_name] = preprocessing_X(wav_dir)

  0%|          | 0/8 [00:00<?, ?it/s]
  n_fft, y.shape[-1]


Electricssound


  n_fft, y.shape[-1]

  5%|▌         | 1/20 [00:09<02:58,  9.42s/it][A
 10%|█         | 2/20 [00:17<02:32,  8.45s/it][A
 15%|█▌        | 3/20 [00:25<02:20,  8.26s/it][A
 20%|██        | 4/20 [00:32<02:08,  8.02s/it][A
 25%|██▌       | 5/20 [00:39<01:55,  7.67s/it][A
 30%|███       | 6/20 [00:47<01:46,  7.60s/it][A
 35%|███▌      | 7/20 [00:54<01:38,  7.55s/it][A
 40%|████      | 8/20 [01:01<01:28,  7.39s/it][A
 45%|████▌     | 9/20 [01:09<01:21,  7.41s/it][A
 50%|█████     | 10/20 [01:16<01:11,  7.19s/it][A
 55%|█████▌    | 11/20 [01:22<01:03,  7.04s/it][A
 60%|██████    | 12/20 [01:29<00:56,  7.05s/it][A
 65%|██████▌   | 13/20 [01:36<00:48,  6.98s/it][A
 70%|███████   | 14/20 [01:43<00:42,  7.02s/it][A
 75%|███████▌  | 15/20 [01:51<00:35,  7.19s/it][A
 80%|████████  | 16/20 [01:58<00:28,  7.17s/it][A
 85%|████████▌ | 17/20 [02:05<00:21,  7.24s/it][A
 90%|█████████ | 18/20 [02:13<00:14,  7.24s/it][A
 95%|█████████▌| 19/20 [02:20<00:07,  7.31s/it][A
100%|██████████| 2

X shape: torch.Size([20, 1, 80, 157])
dogbarking



  0%|          | 1/260 [00:07<30:20,  7.03s/it][A
  1%|          | 2/260 [00:14<30:37,  7.12s/it][A
  1%|          | 3/260 [00:21<29:54,  6.98s/it][A
  2%|▏         | 4/260 [00:28<30:06,  7.06s/it][A
  2%|▏         | 5/260 [00:36<31:09,  7.33s/it][A
  2%|▏         | 6/260 [00:42<30:30,  7.21s/it][A
  3%|▎         | 7/260 [00:50<30:40,  7.28s/it][A
  3%|▎         | 8/260 [00:57<30:37,  7.29s/it][A
  3%|▎         | 9/260 [01:05<30:52,  7.38s/it][A
  4%|▍         | 10/260 [01:12<31:08,  7.47s/it][A
  4%|▍         | 11/260 [01:20<30:57,  7.46s/it][A
  5%|▍         | 12/260 [01:28<31:12,  7.55s/it][A
  5%|▌         | 13/260 [01:36<31:32,  7.66s/it][A
  5%|▌         | 14/260 [01:43<30:53,  7.54s/it][A
  6%|▌         | 15/260 [01:47<26:14,  6.43s/it][A

110.wav



  6%|▌         | 16/260 [01:47<18:31,  4.55s/it][A

111.wav


In [None]:
joblib.dump(data,'expansion_data.pkl')