# Abdi Audio Autoencoder (Input -> Input) + Denoising AE

### Abdi Audio Autoencoder (Input -> Input) + Denoising AE

- Features: MFCC + MEL + CHROMA 
- Dataset: 3.150 .wav files
- Total lenght: 52.55 min
- Epochs: 100
- Acc: 98,17%
- Loss: 0.0017
 
by Abdi (Oct, 2019)    

## Preparing the Data

In [44]:
import warnings
warnings.filterwarnings("ignore")

import gc
gc.collect()

import matplotlib.pyplot as plt
from keras.layers import Input, Dense
from keras import regularizers
from keras.models import Model
import numpy as np
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import scipy.io.wavfile as wavfile
import glob
import os
import soundfile # to read audio file
import librosa
import librosa.display

In [45]:
### Load data (waveform)
def load_data(folder):
    X = pd.DataFrame()
    audio_files = []
    for file in glob.glob(folder):
        basename = os.path.basename(file)
        audio_files.append(basename) # store audio files name
        sample_rate, data = wavfile.read(file) # read audio
        tmp = pd.DataFrame(data)
        X = pd.concat([X,tmp], ignore_index=True, axis=1)
    return X, audio_files

In [46]:
### Load data (feature extraction)
def load_feature(folder):
    X = []
    audio_meta = pd.DataFrame(columns=['audio_files', 'duration'])
    #audio_files = []
    for file in glob.glob(folder):
        basename = os.path.basename(file)
        #audio_files.append(basename)
        print("File:", basename)      
        features, duration = extract_feature(file, mfcc = True, mel = True, chroma = True)
        print("Duration:", duration)
        audio_meta = audio_meta.append(pd.Series([basename, duration], index = audio_meta.columns), ignore_index = True) ### fix here
        X.append(features)
    return np.array(X), audio_meta

In [47]:
def extract_feature(file_name, **kwargs):  
    mfcc = kwargs.get("mfcc")
    mel = kwargs.get("mel")
    chroma = kwargs.get("chroma")
    
    with soundfile.SoundFile(file_name) as sound_file:     
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        frames = sound_file.frames
        duration = float("{0:.2f}".format(frames/sample_rate)) # returns the audio duration
        result = np.array([])
        
        ### MFCC and Spectral features              
        if chroma:
            stft = np.abs(librosa.stft(X))  
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
    return result, duration

In [76]:
### Pre-Processing
from sklearn.model_selection import train_test_split
import time # 3150 files -> ~3.3min
start = time.clock()

############################################################

audio_feature = 1 # 1 for feature extraction, 0 for waveform

if audio_feature == 1:
    # Spectral and MFCC Features
    print("Loading files..")
    X, audio_files = load_feature("data/Audio_wav/*.wav")
    # This estimator scales and translates each feature individually 
    # such that it is in the given range on the training set, e.g. between zero and one.
    X = MinMaxScaler().fit_transform(X) 
    print(type(audio_files))
    print(audio_files.shape)
    print(audio_files)
    x_train, x_test, audio_train, audio_test = train_test_split(X, audio_files, test_size = 0.2)
    
elif audio_feature == 0:
    # Waveform Features
    print("Loading waveform files..")
    X, audio_files = load_data("data/Audio_wav/*.wav")
    X = X.dropna()
    
    G = []
    print("Smoothing the train wave..")
    for i in range(5000,10000,25):
        G.append(np.mean(X[i:i+500]))
    X = g    
    
    X = MinMaxScaler().fit_transform(X)
    X = np.transpose(X)
    X = X[:,:200] ### put a limit
    x_train, x_test, audio_train, audio_test = train_test_split(X, audio_files, test_size = 0.2)
    
############################################################

end = time.clock()
print("Loading time: ", end - start) 

print("\nTraining shape:", x_train.shape)
print("Training audio:", len(audio_train))
print("Test shape:", x_test.shape)
print("Test audio:", len(audio_test))

Loading files..
File: luis_0.wav
Duration: 3.0
File: luis_1.wav
Duration: 3.0
File: luis_2.wav
Duration: 3.0
File: voice_1.wav
Duration: 4.46
File: voice_10.wav
Duration: 3.16
File: voice_100.wav
Duration: 5.2
File: voice_1000.wav
Duration: 2.23
File: voice_1001.wav
Duration: 2.32
File: voice_1002.wav
Duration: 2.41
File: voice_1003.wav
Duration: 2.32
File: voice_1004.wav
Duration: 2.41
File: voice_1005.wav
Duration: 2.41
File: voice_1006.wav
Duration: 2.51
File: voice_1007.wav
Duration: 2.51
File: voice_1008.wav
Duration: 2.51
File: voice_1009.wav
Duration: 2.6
File: voice_101.wav
Duration: 5.2
File: voice_1010.wav
Duration: 2.41
File: voice_1011.wav
Duration: 2.88
File: voice_1012.wav
Duration: 5.11
File: voice_1013.wav
Duration: 2.14
File: voice_1014.wav
Duration: 2.14
File: voice_1015.wav
Duration: 3.07
File: voice_1016.wav
Duration: 4.74
File: voice_1017.wav
Duration: 4.64
File: voice_1018.wav
Duration: 3.81
File: voice_1019.wav
Duration: 3.16
File: voice_102.wav
Duration: 3.9
Fil

Duration: 2.23
File: voice_1203.wav
Duration: 2.6
File: voice_1204.wav
Duration: 2.14
File: voice_1205.wav
Duration: 2.41
File: voice_1206.wav
Duration: 2.23
File: voice_1207.wav
Duration: 2.88
File: voice_1208.wav
Duration: 2.32
File: voice_1209.wav
Duration: 2.41
File: voice_121.wav
Duration: 4.92
File: voice_1210.wav
Duration: 2.23
File: voice_1211.wav
Duration: 2.69
File: voice_1212.wav
Duration: 2.23
File: voice_1213.wav
Duration: 2.23
File: voice_1214.wav
Duration: 2.69
File: voice_1215.wav
Duration: 2.23
File: voice_1216.wav
Duration: 2.41
File: voice_1217.wav
Duration: 2.23
File: voice_1218.wav
Duration: 3.16
File: voice_1219.wav
Duration: 2.69
File: voice_122.wav
Duration: 1.76
File: voice_1220.wav
Duration: 2.6
File: voice_1221.wav
Duration: 2.23
File: voice_1222.wav
Duration: 2.69
File: voice_1223.wav
Duration: 2.23
File: voice_1224.wav
Duration: 2.32
File: voice_1225.wav
Duration: 2.23
File: voice_1226.wav
Duration: 2.32
File: voice_1227.wav
Duration: 2.23
File: voice_1228.

Duration: 1.76
File: voice_1412.wav
Duration: 1.76
File: voice_1413.wav
Duration: 1.76
File: voice_1414.wav
Duration: 1.67
File: voice_1415.wav
Duration: 1.67
File: voice_1416.wav
Duration: 1.67
File: voice_1417.wav
Duration: 1.76
File: voice_1418.wav
Duration: 2.14
File: voice_1419.wav
Duration: 4.18
File: voice_142.wav
Duration: 3.44
File: voice_1420.wav
Duration: 1.67
File: voice_1421.wav
Duration: 1.67
File: voice_1422.wav
Duration: 7.06
File: voice_1423.wav
Duration: 1.86
File: voice_1424.wav
Duration: 1.76
File: voice_1425.wav
Duration: 1.58
File: voice_1426.wav
Duration: 2.23
File: voice_1427.wav
Duration: 2.23
File: voice_1428.wav
Duration: 1.49
File: voice_1429.wav
Duration: 2.04
File: voice_143.wav
Duration: 3.16
File: voice_1430.wav
Duration: 1.49
File: voice_1431.wav
Duration: 2.14
File: voice_1432.wav
Duration: 2.23
File: voice_1433.wav
Duration: 1.76
File: voice_1434.wav
Duration: 2.6
File: voice_1435.wav
Duration: 1.58
File: voice_1436.wav
Duration: 1.95
File: voice_1437

Duration: 3.53
File: voice_162.wav
Duration: 3.34
File: voice_1620.wav
Duration: 5.94
File: voice_1621.wav
Duration: 10.03
File: voice_1622.wav
Duration: 6.13
File: voice_1623.wav
Duration: 8.27
File: voice_1624.wav
Duration: 14.12
File: voice_1625.wav
Duration: 4.83
File: voice_1626.wav
Duration: 7.34
File: voice_1627.wav
Duration: 2.14
File: voice_1628.wav
Duration: 2.14
File: voice_1629.wav
Duration: 3.81
File: voice_163.wav
Duration: 2.23
File: voice_1630.wav
Duration: 4.46
File: voice_1631.wav
Duration: 9.2
File: voice_1632.wav
Duration: 3.72
File: voice_1633.wav
Duration: 3.81
File: voice_1634.wav
Duration: 3.72
File: voice_1635.wav
Duration: 7.99
File: voice_1636.wav
Duration: 2.79
File: voice_1637.wav
Duration: 5.48
File: voice_1638.wav
Duration: 5.85
File: voice_1639.wav
Duration: 5.48
File: voice_164.wav
Duration: 7.71
File: voice_1640.wav
Duration: 2.04
File: voice_1641.wav
Duration: 2.6
File: voice_1642.wav
Duration: 2.51
File: voice_1643.wav
Duration: 4.74
File: voice_1644

Duration: 1.39
File: voice_1826.wav
Duration: 1.3
File: voice_1827.wav
Duration: 3.25
File: voice_1828.wav
Duration: 2.6
File: voice_1829.wav
Duration: 4.37
File: voice_183.wav
Duration: 1.95
File: voice_1830.wav
Duration: 1.86
File: voice_1831.wav
Duration: 2.14
File: voice_1832.wav
Duration: 2.23
File: voice_1833.wav
Duration: 3.16
File: voice_1834.wav
Duration: 2.23
File: voice_1835.wav
Duration: 2.41
File: voice_1836.wav
Duration: 3.16
File: voice_1837.wav
Duration: 2.88
File: voice_1838.wav
Duration: 2.6
File: voice_1839.wav
Duration: 3.07
File: voice_184.wav
Duration: 2.04
File: voice_1840.wav
Duration: 8.36
File: voice_1841.wav
Duration: 13.19
File: voice_1842.wav
Duration: 7.62
File: voice_1843.wav
Duration: 1.3
File: voice_1844.wav
Duration: 1.3
File: voice_1845.wav
Duration: 1.86
File: voice_1846.wav
Duration: 1.39
File: voice_1847.wav
Duration: 1.49
File: voice_1848.wav
Duration: 1.39
File: voice_1849.wav
Duration: 1.49
File: voice_185.wav
Duration: 1.86
File: voice_1850.wav

Duration: 2.04
File: voice_2035.wav
Duration: 3.9
File: voice_2036.wav
Duration: 2.88
File: voice_2037.wav
Duration: 2.79
File: voice_2038.wav
Duration: 4.55
File: voice_2039.wav
Duration: 4.18
File: voice_204.wav
Duration: 2.51
File: voice_2040.wav
Duration: 1.76
File: voice_2041.wav
Duration: 1.86
File: voice_2042.wav
Duration: 2.04
File: voice_2043.wav
Duration: 4.37
File: voice_2044.wav
Duration: 3.99
File: voice_2045.wav
Duration: 3.72
File: voice_2046.wav
Duration: 3.72
File: voice_2047.wav
Duration: 2.97
File: voice_2048.wav
Duration: 3.34
File: voice_2049.wav
Duration: 2.51
File: voice_205.wav
Duration: 2.14
File: voice_2050.wav
Duration: 2.32
File: voice_2051.wav
Duration: 1.67
File: voice_2052.wav
Duration: 1.67
File: voice_2053.wav
Duration: 1.67
File: voice_2054.wav
Duration: 1.76
File: voice_2055.wav
Duration: 1.67
File: voice_2056.wav
Duration: 1.76
File: voice_2057.wav
Duration: 1.21
File: voice_2058.wav
Duration: 1.21
File: voice_2059.wav
Duration: 1.21
File: voice_206.

Duration: 1.76
File: voice_2245.wav
Duration: 1.67
File: voice_2246.wav
Duration: 1.95
File: voice_2247.wav
Duration: 2.51
File: voice_2248.wav
Duration: 3.99
File: voice_2249.wav
Duration: 4.09
File: voice_225.wav
Duration: 1.95
File: voice_2250.wav
Duration: 3.99
File: voice_2251.wav
Duration: 4.09
File: voice_2252.wav
Duration: 2.14
File: voice_2253.wav
Duration: 2.32
File: voice_2254.wav
Duration: 9.57
File: voice_2255.wav
Duration: 2.97
File: voice_2256.wav
Duration: 3.34
File: voice_2257.wav
Duration: 2.32
File: voice_2258.wav
Duration: 2.79
File: voice_2259.wav
Duration: 5.39
File: voice_226.wav
Duration: 1.95
File: voice_2260.wav
Duration: 2.51
File: voice_2261.wav
Duration: 2.04
File: voice_2262.wav
Duration: 2.14
File: voice_2263.wav
Duration: 1.86
File: voice_2264.wav
Duration: 2.51
File: voice_2265.wav
Duration: 2.14
File: voice_2266.wav
Duration: 4.27
File: voice_2267.wav
Duration: 4.27
File: voice_2268.wav
Duration: 3.99
File: voice_2269.wav
Duration: 3.99
File: voice_227

Duration: 1.95
File: voice_2466.wav
Duration: 3.53
File: voice_2467.wav
Duration: 3.16
File: voice_2468.wav
Duration: 3.16
File: voice_2469.wav
Duration: 3.72
File: voice_247.wav
Duration: 3.9
File: voice_2470.wav
Duration: 2.97
File: voice_2471.wav
Duration: 3.16
File: voice_2472.wav
Duration: 1.86
File: voice_2473.wav
Duration: 3.72
File: voice_2474.wav
Duration: 3.25
File: voice_2475.wav
Duration: 3.53
File: voice_2476.wav
Duration: 3.44
File: voice_2477.wav
Duration: 3.62
File: voice_2478.wav
Duration: 3.16
File: voice_2479.wav
Duration: 3.07
File: voice_248.wav
Duration: 3.72
File: voice_2480.wav
Duration: 3.44
File: voice_2481.wav
Duration: 3.25
File: voice_2482.wav
Duration: 3.34
File: voice_2483.wav
Duration: 3.44
File: voice_2484.wav
Duration: 1.49
File: voice_2485.wav
Duration: 1.86
File: voice_2486.wav
Duration: 2.04
File: voice_2487.wav
Duration: 1.49
File: voice_2488.wav
Duration: 9.2
File: voice_2489.wav
Duration: 7.06
File: voice_249.wav
Duration: 2.6
File: voice_2490.wa

Duration: 1.49
File: voice_2675.wav
Duration: 3.07
File: voice_2677.wav
Duration: 1.67
File: voice_2678.wav
Duration: 2.32
File: voice_2679.wav
Duration: 1.86
File: voice_268.wav
Duration: 1.86
File: voice_2680.wav
Duration: 2.32
File: voice_2681.wav
Duration: 1.86
File: voice_2682.wav
Duration: 4.27
File: voice_2683.wav
Duration: 1.21
File: voice_2684.wav
Duration: 1.76
File: voice_2685.wav
Duration: 1.86
File: voice_2686.wav
Duration: 1.86
File: voice_2687.wav
Duration: 3.25
File: voice_2688.wav
Duration: 1.21
File: voice_2689.wav
Duration: 3.99
File: voice_269.wav
Duration: 1.86
File: voice_2690.wav
Duration: 1.58
File: voice_2691.wav
Duration: 2.51
File: voice_2692.wav
Duration: 1.58
File: voice_2693.wav
Duration: 1.58
File: voice_2694.wav
Duration: 1.39
File: voice_2695.wav
Duration: 1.49
File: voice_2696.wav
Duration: 1.86
File: voice_2697.wav
Duration: 1.86
File: voice_2698.wav
Duration: 9.66
File: voice_2699.wav
Duration: 2.69
File: voice_27.wav
Duration: 2.51
File: voice_270.w

Duration: 1.86
File: voice_2882.wav
Duration: 1.86
File: voice_2883.wav
Duration: 1.86
File: voice_2884.wav
Duration: 2.23
File: voice_2885.wav
Duration: 1.95
File: voice_2886.wav
Duration: 2.23
File: voice_2887.wav
Duration: 1.86
File: voice_2888.wav
Duration: 2.97
File: voice_2889.wav
Duration: 1.76
File: voice_289.wav
Duration: 2.23
File: voice_2890.wav
Duration: 2.32
File: voice_2891.wav
Duration: 1.86
File: voice_2892.wav
Duration: 1.86
File: voice_2893.wav
Duration: 1.86
File: voice_2894.wav
Duration: 1.95
File: voice_2895.wav
Duration: 2.23
File: voice_2896.wav
Duration: 2.32
File: voice_2897.wav
Duration: 2.23
File: voice_2898.wav
Duration: 2.23
File: voice_2899.wav
Duration: 1.67
File: voice_29.wav
Duration: 1.76
File: voice_290.wav
Duration: 1.95
File: voice_2900.wav
Duration: 1.58
File: voice_2901.wav
Duration: 1.76
File: voice_2902.wav
Duration: 5.76
File: voice_2903.wav
Duration: 6.22
File: voice_2904.wav
Duration: 1.95
File: voice_2905.wav
Duration: 1.76
File: voice_2906.

Duration: 2.88
File: voice_3091.wav
Duration: 2.88
File: voice_3092.wav
Duration: 3.07
File: voice_3093.wav
Duration: 3.07
File: voice_3094.wav
Duration: 3.81
File: voice_3095.wav
Duration: 2.97
File: voice_3096.wav
Duration: 2.97
File: voice_3097.wav
Duration: 5.76
File: voice_3098.wav
Duration: 2.14
File: voice_3099.wav
Duration: 3.72
File: voice_31.wav
Duration: 2.6
File: voice_310.wav
Duration: 2.97
File: voice_3100.wav
Duration: 2.69
File: voice_3101.wav
Duration: 2.32
File: voice_3102.wav
Duration: 1.95
File: voice_3103.wav
Duration: 2.23
File: voice_3104.wav
Duration: 2.23
File: voice_3105.wav
Duration: 2.32
File: voice_3106.wav
Duration: 2.51
File: voice_3107.wav
Duration: 3.25
File: voice_3108.wav
Duration: 2.04
File: voice_3109.wav
Duration: 1.86
File: voice_311.wav
Duration: 3.16
File: voice_3110.wav
Duration: 2.14
File: voice_3111.wav
Duration: 2.23
File: voice_3112.wav
Duration: 2.14
File: voice_3113.wav
Duration: 2.23
File: voice_3114.wav
Duration: 2.04
File: voice_3115.w

Duration: 2.79
File: voice_444.wav
Duration: 2.69
File: voice_445.wav
Duration: 2.51
File: voice_446.wav
Duration: 3.62
File: voice_447.wav
Duration: 3.9
File: voice_448.wav
Duration: 4.83
File: voice_449.wav
Duration: 7.71
File: voice_45.wav
Duration: 4.55
File: voice_450.wav
Duration: 4.64
File: voice_451.wav
Duration: 2.69
File: voice_452.wav
Duration: 2.79
File: voice_453.wav
Duration: 2.51
File: voice_454.wav
Duration: 2.32
File: voice_455.wav
Duration: 2.6
File: voice_456.wav
Duration: 2.79
File: voice_457.wav
Duration: 2.79
File: voice_458.wav
Duration: 2.79
File: voice_459.wav
Duration: 3.07
File: voice_46.wav
Duration: 2.32
File: voice_460.wav
Duration: 3.44
File: voice_461.wav
Duration: 9.57
File: voice_462.wav
Duration: 4.37
File: voice_463.wav
Duration: 3.44
File: voice_464.wav
Duration: 3.72
File: voice_465.wav
Duration: 6.32
File: voice_466.wav
Duration: 3.9
File: voice_467.wav
Duration: 2.32
File: voice_468.wav
Duration: 2.32
File: voice_469.wav
Duration: 2.41
File: voic

Duration: 1.39
File: voice_869.wav
Duration: 1.67
File: voice_87.wav
Duration: 2.69
File: voice_870.wav
Duration: 1.3
File: voice_871.wav
Duration: 1.39
File: voice_872.wav
Duration: 1.3
File: voice_874.wav
Duration: 1.39
File: voice_875.wav
Duration: 2.69
File: voice_876.wav
Duration: 3.07
File: voice_877.wav
Duration: 2.69
File: voice_878.wav
Duration: 1.76
File: voice_879.wav
Duration: 1.49
File: voice_88.wav
Duration: 2.79
File: voice_880.wav
Duration: 1.49
File: voice_881.wav
Duration: 6.04
File: voice_882.wav
Duration: 5.48
File: voice_883.wav
Duration: 3.81
File: voice_884.wav
Duration: 3.44
File: voice_885.wav
Duration: 2.79
File: voice_886.wav
Duration: 3.99
File: voice_887.wav
Duration: 1.76
File: voice_888.wav
Duration: 4.64
File: voice_889.wav
Duration: 1.3
File: voice_89.wav
Duration: 5.85
File: voice_890.wav
Duration: 1.21
File: voice_891.wav
Duration: 1.21
File: voice_892.wav
Duration: 1.49
File: voice_893.wav
Duration: 1.76
File: voice_894.wav
Duration: 2.51
File: voice

In [42]:
audio_files['duration'].describe()

count    3153.000000
mean        2.886765
std         1.688352
min         1.210000
25%         1.860000
50%         2.410000
75%         3.340000
max        22.010000
Name: duration, dtype: float64

In [49]:
### Adding noise factor
noise_factor = 0.1
x_train_noisy = x_train + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=x_train.shape) 
x_test_noisy = x_test + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=x_test.shape) 

x_train_noisy = np.clip(x_train_noisy, 0., 1.)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)

print("\nNoisy training shape:", x_train_noisy.shape)
print("Noisy test shape:", x_test_noisy.shape)


Noisy training shape: (2522, 180)
Noisy test shape: (631, 180)


## Training Model

In [77]:
### Define training model
input_dim = len(x_train[0])
encoding_dim = 150

input_audio = Input(shape=(input_dim,), name = 'input')
encoded = Dense(encoding_dim, activation='relu', name = 'encoder_1',  W_regularizer=regularizers.l2(0.0002))(input_audio)
decoded = Dense(input_dim, activation='linear', name = 'output')(encoded)

autoencoder = Model(input = input_audio, output = decoded, name = 'AutoEncoder')
autoencoder.compile(optimizer='Adagrad', loss='mean_squared_error')

autoencoder.summary()

weights = "audio_best_weights.hdf5"
checkpoint = ModelCheckpoint(weights, monitor='mean_squared_error', verbose = 1, save_best_only = False)

Model: "AutoEncoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 180)               0         
_________________________________________________________________
encoder_1 (Dense)            (None, 150)               27150     
_________________________________________________________________
output (Dense)               (None, 180)               27180     
Total params: 54,330
Trainable params: 54,330
Non-trainable params: 0
_________________________________________________________________


In [78]:
### Fit the model
history = autoencoder.fit(x_train, x_train, nb_epoch = 100, 
                          batch_size = 32, callbacks=[checkpoint], validation_split = 0.2, verbose = 1)

Train on 2017 samples, validate on 505 samples
Epoch 1/100

Epoch 00001: saving model to audio_best_weights.hdf5
Epoch 2/100

Epoch 00002: saving model to audio_best_weights.hdf5
Epoch 3/100

Epoch 00003: saving model to audio_best_weights.hdf5
Epoch 4/100

Epoch 00004: saving model to audio_best_weights.hdf5
Epoch 5/100

Epoch 00005: saving model to audio_best_weights.hdf5
Epoch 6/100

Epoch 00006: saving model to audio_best_weights.hdf5
Epoch 7/100

Epoch 00007: saving model to audio_best_weights.hdf5
Epoch 8/100

Epoch 00008: saving model to audio_best_weights.hdf5
Epoch 9/100

Epoch 00009: saving model to audio_best_weights.hdf5
Epoch 10/100

Epoch 00010: saving model to audio_best_weights.hdf5
Epoch 11/100

Epoch 00011: saving model to audio_best_weights.hdf5
Epoch 12/100

Epoch 00012: saving model to audio_best_weights.hdf5
Epoch 13/100

Epoch 00013: saving model to audio_best_weights.hdf5
Epoch 14/100

Epoch 00014: saving model to audio_best_weights.hdf5
Epoch 15/100

Epoch 0001

## Inference Model

In [79]:
### Define inference model
encoded_input = Input(shape=(encoding_dim,), name = 'input_emb')
encoder = Model(input = input_audio, output = encoded, name = 'Encoder')
decoder_layer = autoencoder.layers[-1]
decoder = Model(input = encoded_input, output = decoder_layer(encoded_input), name = 'Decoder')

encoder.summary()
decoder.summary()

### Save Files
encoder.save( 'audio_enc_model_2.h5' ) 
decoder.save( 'audio_dec_model_2.h5' ) 
autoencoder.save( 'audio_model_2.h5' ) 

### Load files
#autoencoder = tf.keras.models.load_model('audio_model_2.h5')
#enc_model = tf.keras.models.load_model('audio_enc_model_2.h5') 
#dec_model = tf.keras.models.load_model('audio_dec_model_2.h5')
#autoencoder.load_weights(weights)
#autoencoder.compile(loss='mean_squared_error', optimizer='adam')

### Test prediction
encoded_audio = encoder.predict(x_train) 
decoded_audio = decoder.predict(encoded_audio) 
print("Train Accuracy = ", 1-np.mean(abs(x_train - decoded_audio)),'\n')

encoded_audio = encoder.predict(x_test) 
decoded_audio = decoder.predict(encoded_audio) 
print("Test Accuracy = ", 1-np.mean(abs(x_test - decoded_audio)),'\n')

Model: "Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 180)               0         
_________________________________________________________________
encoder_1 (Dense)            (None, 150)               27150     
Total params: 27,150
Trainable params: 27,150
Non-trainable params: 0
_________________________________________________________________
Model: "Decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_emb (InputLayer)       (None, 150)               0         
_________________________________________________________________
output (Dense)               (None, 180)               27180     
Total params: 27,180
Trainable params: 27,180
Non-trainable params: 0
_________________________________________________________________
Train Accuracy =  0.97975440853285

In [91]:
audio_test['audio_files']

0      voice_2157
1       voice_430
2      voice_2189
3       voice_817
4      voice_2646
          ...    
626    voice_3060
627     voice_698
628    voice_3094
629    voice_2477
630    voice_2306
Name: audio_files, Length: 631, dtype: object

In [None]:
### Results visualization
import re
print("Test Accuracy = ", 1-np.mean(abs(x_test - decoded_audio)),'\n')

#n = 5
n = len(x_test)
RESHAPE = (10,18) ### MFCC + MEL + CHROMA (180)
folder = 'AutoEncoder_Figures'
input_type = '_AE'
audio_test = audio_test.assign(audio_files = audio_test['audio_files'].str.replace(r'.wav', '')) ### remove .wav 
audio_test = audio_test.reset_index(drop=True)

for i in range(n):
    print("--------------------------------------------------------------")
    print("Audio:", audio_test['audio_files'][i])
    print("Accuracy = ", 1-np.mean(abs(x_test[i] - decoded_audio[i])),'\n')
    
    fig, axs = plt.subplots(3, figsize = (10,10))
    plt.gray()
    
    # original audio
    axs[0].set_title("ORIGINAL AUDIO")
    im = axs[0].imshow(x_test[i].reshape(RESHAPE))
    plt.colorbar(im, ax=axs[0])
    
    # noisy audio
    #axs[1].set_title("NOISY AUDIO")
    #im = axs[1].imshow(x_test_noisy[i].reshape(RESHAPE))
    #plt.colorbar(im, ax=axs[1])
    
    # encoded audio
    axs[1].set_title("ENCODED AUDIO")
    im = axs[1].imshow(encoded_audio[i].reshape(int(encoding_dim/2),2).T)
    plt.colorbar(im, ax=axs[1])
    
    # reconstructed audio
    axs[2].set_title("RECONSTRUCTED AUDIO")
    im = axs[2].imshow(decoded_audio[i].reshape(RESHAPE))
    plt.colorbar(im, ax=axs[2])

    #fig.savefig(str(folder) + '/reconstruction_' + str(audio_test['audio_files'][i]) + str(input_type) +'.png')

    # original and reconstructed in waveform
    plt.figure(figsize=(10, 2))
    plt.plot(x_test[i])
    plt.plot(decoded_audio[i],color='r', linewidth=2)
    plt.title('ORIGINAL (blue) and RECONSTRUCTED AUDIO (red)')
    #plt.savefig(str(folder) + '/waveform_' + str(audio_test['audio_files'][i]) + str(input_type) +'.png')
    plt.show()  

In [None]:
def performance_plot():  
    plt.figure(figsize=(16, 8))
    plt.plot(ae.history['loss'])
    plt.plot(ae.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['loss', 'val_loss'], loc='upper left')
    plt.show()
    
performance_plot()   

## Record a new Audio

In [None]:
### Record own audio
import sounddevice as sd
import soundfile as sf

def audioRecord():
    samplerate = 44100  
    duration =  3 # seconds
    filename = 'data/Audio_test/luis_1.wav'
    print("Speak please ...")
    mydata = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=1, blocking=True)
    print("Timeout!")
    sd.wait()
    plt.plot(mydata)
    plt.show()
    sf.write(filename, mydata, samplerate)
    print("The audio was saved successfully.")
    
    return filename

filename = audioRecord()

In [None]:
### Copying audio files
import shutil, sys  
Root = 'data\Audio_test\\'
TargetFolder = 'data\Audio_test\\'
index = 0

for root, dirs, files in os.walk((os.path.normpath(Root)), topdown=False):
        for name in files:
            if name.endswith('.wav'):
                index +=1
                SourceFolder = os.path.join(root, name)
                shutil.copy(SourceFolder, TargetFolder + '\\voice_' + str(index) + '.wav') 
                
print("Saved files:", index)       