In [155]:
import pickle
from os import listdir, makedirs
from os.path import join, splitext

import librosa
import numpy as np
import pandas as pd
import soundfile
from keras import backend as K
from keras.layers import Conv2D, Dense, BatchNormalization, ReLU, Input, LSTM, Concatenate, Conv2DTranspose, Reshape, Bidirectional
from keras.models import Sequential, Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

## Model

In [112]:
def _conv_bn_relu(tensor, filters, kernel_size, dilation_rate=(1, 1)):
    """Apply a 'same'-padded Conv2D followed by BatchNormalization and ReLU."""
    tensor = Conv2D(filters=filters, kernel_size=kernel_size,
                    dilation_rate=dilation_rate, padding='same')(tensor)
    tensor = BatchNormalization()(tensor)
    return ReLU()(tensor)


def SID_component(sid_input):
    """Build the SID sub-network on top of `sid_input` and return its output tensor.

    The network is a stack of twelve Conv2D/BatchNormalization/ReLU stages
    (all 'same'-padded, so the two spatial dimensions are preserved), a
    bidirectional LSTM, and two dense layers.

    Args:
        sid_input: Keras tensor. The hard-coded ``Reshape((256*291, 8))``
            requires the two spatial dimensions to be 256 x 291.

    Returns:
        Keras tensor with 291 units per sample. A sigmoid is applied so the
        values lie in [0, 1], which is what a binary cross-entropy loss
        (without ``from_logits``) and the clip/round based precision/recall
        metrics in this notebook expect.
    """
    # (filters, kernel_size, dilation_rate) for each convolutional stage, in order.
    conv_specs = [
        (48, (1, 7), (1, 1)),
        (48, (7, 1), (1, 1)),
        (48, (5, 5), (1, 1)),
        (48, (5, 5), (2, 1)),
        (48, (5, 5), (4, 1)),
        (48, (5, 5), (8, 1)),
        (48, (5, 5), (16, 1)),
        (48, (5, 5), (32, 1)),
        (48, (5, 5), (1, 1)),
        (48, (5, 5), (2, 2)),
        (48, (5, 5), (4, 4)),
        (8, (1, 1), (1, 1)),   # 1x1 convolution reducing to 8 channels
    ]

    features = sid_input
    for filters, kernel_size, dilation_rate in conv_specs:
        features = _conv_bn_relu(features, filters, kernel_size, dilation_rate)

    # Flatten the 256 x 291 grid into one sequence of 8-dimensional vectors.
    lstm_input = Reshape((256*291,8))(features)

    lstm = Bidirectional(LSTM(units=100))(lstm_input)

    fc = Dense(units=100)(lstm)
    fc = ReLU()(fc)

    # Sigmoid keeps the outputs in [0, 1]; the original layer was linear,
    # which does not match a probability-based binary cross-entropy loss.
    sid_output = Dense(units=291, activation='sigmoid')(fc)

    return sid_output

In [None]:
def recall_m(y_true, y_pred):
    """Batch-wise recall: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    `K.epsilon()` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch-wise precision: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    `K.epsilon()` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch-wise F1 score: harmonic mean of `precision_m` and `recall_m`.

    `K.epsilon()` is added to the denominator to avoid division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p*r)/(p+r+K.epsilon())
    return 2*ratio

## Data

In [None]:
def silence_mask_extraction(audio):
    """Return a per-frame boolean mask derived from the STFT of `audio`.

    The STFT is normalised with `librosa.util.normalize` (library defaults),
    its magnitude is averaged over the first axis, and each frame is compared
    with a fixed threshold of 0.12.

    Returns:
        Boolean array with one entry per STFT frame; True where the mean
        normalised magnitude exceeds 0.12.
        NOTE(review): despite the function name, True appears to mark the
        louder (non-silent) frames - confirm against the intended label
        convention.
    """
    spectrum = librosa.stft(audio, n_fft=510, hop_length=110, win_length=28, window="hann")
    magnitude = np.abs(librosa.util.normalize(spectrum))
    frame_level = magnitude.mean(axis=0)
    return frame_level > 0.12

In [None]:
def make_df(dirs):
    """List the contents of each directory as one DataFrame column.

    Args:
        dirs: iterable of directory paths. Each path becomes a column name.

    Returns:
        pandas.DataFrame whose column ``d`` holds the sorted entry names of
        directory ``d``.

    Listings are sorted because `os.listdir` returns entries in arbitrary
    order; rows are later used to pair files across columns, which only works
    when every column uses the same deterministic order (and, presumably, the
    directories contain identically named files - verify for new datasets).
    pandas requires all columns to have the same length, so the directories
    must contain the same number of entries.
    """
    df = pd.DataFrame({directory: sorted(listdir(directory)) for directory in dirs})
    return df

In [167]:
def data_gen(noisy_path, clean_path, dataframe, batch_size, size):
    """Endlessly yield ``(X, Y)`` batches of noisy spectrograms and clean masks.

    Args:
        noisy_path: directory containing the noisy audio files.
        clean_path: directory containing the clean audio files.
        dataframe: listing whose column 0 holds clean file names and column 1
            the matching noisy file names (row-aligned).
        batch_size: number of samples per yielded batch.
        size: shape of one model input, ``(freq_bins, frames, 1)``.

    Yields:
        Tuple ``(X, Y)`` where ``X`` has shape ``(batch_size,) + size`` and
        holds STFT magnitudes of the noisy clips, and ``Y`` has shape
        ``(batch_size, size[1])`` and holds the per-frame mask computed from
        the clean clips by `silence_mask_extraction`.

    Raises:
        ValueError: if `dataframe` is empty, or if a full pass over it finds
            no clip of the expected length (which would otherwise loop forever).
    """
    sr = 16000
    min_samples = 2 * sr               # clips shorter than 2 s are skipped
    sample_shape = tuple(size)
    spec_shape = sample_shape[:2]      # (freq_bins, frames) expected from the STFT
    n_frames = sample_shape[1]

    n_samples = dataframe.shape[0]
    if n_samples == 0:
        raise ValueError("dataframe lists no audio files")

    X = np.empty((batch_size,) + sample_shape)
    Y = np.empty((batch_size, n_frames))

    batch_ind = 0
    epoch_ind = 0
    skipped_in_a_row = 0

    while True:
        clean_file = join(clean_path, dataframe.iloc[epoch_ind, 0])
        noisy_file = join(noisy_path, dataframe.iloc[epoch_ind, 1])
        # Advance (and wrap) before any `continue`, so a skipped clip can
        # never be revisited indefinitely.
        epoch_ind = (epoch_ind + 1) % n_samples

        clean_audio, _ = librosa.load(clean_file, sr=sr)

        noisy_stft = None
        clean_mask = None
        if len(clean_audio) >= min_samples:
            noisy_audio, _ = librosa.load(noisy_file, sr=sr)
            clean_mask = silence_mask_extraction(clean_audio)
            noisy_stft = librosa.stft(noisy_audio, n_fft=510, hop_length=110, win_length=28, window="hann")

        unusable = (
            noisy_stft is None
            or noisy_stft.shape != spec_shape
            or clean_mask.shape != (n_frames,)
        )
        if unusable:
            skipped_in_a_row += 1
            if skipped_in_a_row >= n_samples:
                raise ValueError("no audio clip in dataframe matches the expected size {}".format(sample_shape))
            continue
        skipped_in_a_row = 0

        # The STFT is complex; the model takes a single real channel, so feed
        # the magnitude and add the trailing channel axis.
        X[batch_ind] = np.abs(noisy_stft).reshape(sample_shape)
        Y[batch_ind] = clean_mask
        batch_ind += 1

        if batch_ind == batch_size:
            # Yield copies: the buffers are overwritten on the next iterations,
            # possibly while the consumer still holds the previous batch.
            yield X.copy(), Y.copy()
            batch_ind = 0
            

In [139]:
def audio_splitting(input_filename, input_path, output_path, duration=2):
    """Split one audio file into consecutive fixed-length segments on disk.

    The file is loaded at 16 kHz and cut into blocks of `duration` seconds;
    the final block may be shorter. Segments are written to `output_path` as
    ``<stem>_<counter><ext>`` with the counter starting at 1.

    Args:
        input_filename: name of the file inside `input_path`.
        input_path: directory containing the source file.
        output_path: directory receiving the segments (created if missing).
        duration: segment length in seconds.

    Raises:
        ValueError: if `duration` does not correspond to at least one sample.
    """
    audio, sr = librosa.load(join(input_path, input_filename), sr=16000)

    # int() keeps slicing valid for fractional durations.
    segment_len = int(duration * sr)
    if segment_len <= 0:
        raise ValueError("duration must cover at least one sample")

    # splitext handles names with several dots (or none), unlike split('.').
    stem, ext = splitext(input_filename)
    makedirs(output_path, exist_ok=True)

    segment_starts = range(0, len(audio), segment_len)
    for counter, start in enumerate(segment_starts, start=1):
        # Slicing clamps at the end of the array, so the last block is
        # simply shorter when the audio length is not a multiple of it.
        block = audio[start : start + segment_len]
        output_filename = join(output_path, '{}_{}{}'.format(stem, counter, ext))
        soundfile.write(output_filename, block, sr)

In [128]:
# Root directory of the dataset; every directory below is relative to it.
data_path = "data/"

# Original recordings. In each pair the clean directory comes first and the
# noisy directory second.
base_train_dirs = [
    "clean_trainset_28spk_wav",
    "noisy_trainset_28spk_wav",
]
base_test_dirs = [
    "clean_testset_wav",
    "noisy_testset_wav",
]

# Destination directories for the fixed-length segments, in the same order
# as the base directories above.
splitted_train_dirs = [
    "splitted_clean_trainset_28spk_wav",
    "splitted_noisy_trainset_28spk_wav",
]
splitted_test_dirs = [
    "splitted_clean_testset_wav",
    "splitted_noisy_testset_wav",
]

In [137]:
def make_path(path, dirs):
    """Return a list with `path` joined onto every entry of `dirs`, in order."""
    return [join(path, d) for d in dirs]

In [141]:
# File listings of the original (unsplit) recordings: one column per
# directory, named after the directory path joined onto data_path.
train_df, test_df = (
    make_df(make_path(data_path, dirs))
    for dirs in (base_train_dirs, base_test_dirs)
)

In [None]:
# Cut every original recording (train and test, clean and noisy) into
# fixed-length segments stored in the matching "splitted_*" directory.
for base_dirs, splitted_dirs, files_df in (
    (base_train_dirs, splitted_train_dirs, train_df),
    (base_test_dirs, splitted_test_dirs, test_df),
):
    for base_dir, splitted_dir in zip(base_dirs, splitted_dirs):
        source_dir = join(data_path, base_dir)
        target_dir = join(data_path, splitted_dir)
        # make_df names each column after the full directory path, so the
        # column must be looked up by the joined path; the bare directory
        # name raises KeyError.
        for file in files_df[source_dir]:
            audio_splitting(file, source_dir, target_dir)

In [243]:
# Directories holding the split training segments (clean first, noisy second).
clean_path, noisy_path = make_path(data_path, splitted_train_dirs)

# File listings of the split segments, one column per directory.
spl_train_df = make_df([clean_path, noisy_path])
# NOTE(review): this listing is also built from splitted_train_dirs, so it
# duplicates spl_train_df - looks like a copy-paste of the line above; confirm
# whether splitted_test_dirs (with matching test paths downstream) was intended.
spl_test_df = make_df(make_path(data_path, splitted_train_dirs))

## Training

In [None]:
# Hyper-parameters. `size` is the shape of one model input:
# (frequency bins, STFT frames, channels).
size = (256, 291, 1)
batch_size = 15
epochs = 100
lr = 0.001

train_generator = data_gen(noisy_path, clean_path, spl_train_df, batch_size, size)

# Validate on the held-out test segments. The listing and paths are built
# here from splitted_test_dirs so that validation does not silently run on
# the training files.
valid_clean_path, valid_noisy_path = make_path(data_path, splitted_test_dirs)
spl_valid_df = make_df([valid_clean_path, valid_noisy_path])
valid_generator = data_gen(valid_noisy_path, valid_clean_path, spl_valid_df, batch_size, size)

# The generators never terminate, so Keras needs explicit step counts.
steps_per_train_epoch = spl_train_df.shape[0]//batch_size
steps_per_valid_epoch = spl_valid_df.shape[0]//batch_size

inputs = Input(shape=size)
outputs = SID_component(inputs)
model = Model(inputs, outputs)

# 'binary_crossentropy' is the Keras identifier for this loss
# ('binary_cross_entropy' is not recognised); `learning_rate` replaces the
# deprecated `lr` argument.
model.compile(optimizer=Adam(learning_rate=lr),
              loss='binary_crossentropy',
              metrics=['acc',f1_m,precision_m, recall_m])

# `batch_size` is not passed: the generators already yield whole batches and
# Keras rejects the argument for generator inputs.
history = model.fit(train_generator,
                    epochs=epochs,
                    verbose=1,
                    validation_data=valid_generator,
                    steps_per_epoch=steps_per_train_epoch,
                    validation_steps=steps_per_valid_epoch)


model_name = "sid_1"

# Persist the trained model and its per-epoch training history.
model.save(join(data_path, 'model_{}.hdf5'.format(model_name)))
with open(join(data_path, 'stats_{}.pickle'.format(model_name)), 'wb') as f:
    pickle.dump(history.history, f)

In [241]:
# Sanity check on one full-length recording: its duration in seconds and the
# shape of its STFT with the parameters used throughout this notebook.
y, s = librosa.load('data/noisy_trainset_28spk_wav/p226_021.wav', sr=16000)
# Keyword arguments: recent librosa releases no longer accept these positionally.
print(librosa.get_duration(y=y, sr=s))
librosa.stft(y, n_fft=510, hop_length=110, win_length=28, window="hann").shape

9.4


(256, 1368)

In [242]:
# Walk through `y` in 2-second blocks (same scheme as audio_splitting) and
# print the STFT shape of each block; only the last block may be shorter.
buffer = 2 * s

samples_total = len(y)
samples_wrote = 0
counter = 1

while samples_wrote < samples_total:
    # Shrink the block when fewer than `buffer` samples remain.
    buffer = min(buffer, samples_total - samples_wrote)

    block_end = samples_wrote + buffer
    block = y[samples_wrote:block_end]
    block_stft = librosa.stft(block, n_fft=510, hop_length=110, win_length=28, window="hann")
    print(block_stft.shape)

    counter += 1
    samples_wrote += buffer

(256, 291)
(256, 291)
(256, 291)
(256, 291)
(256, 204)
