In [1]:
import tensorflow as tf
import os.path
import numpy as np
import pickle
from scipy.ndimage.interpolation import zoom
from ipy_progressbar import ProgressBar
from random import shuffle

import matplotlib
from matplotlib import mlab, pyplot, rcParams
%matplotlib inline
rcParams['figure.figsize'] = (25, 10)
import librosa
from jupyter_progressbar import ProgressBar

from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Lambda, Dropout
from keras.optimizers import Adam
from kapre.time_frequency import Spectrogram
from sklearn.cross_validation import train_test_split
from keras import backend as K

Using TensorFlow backend.


In [2]:
# List the compute devices TensorFlow can see; the result is shown as the
# cell output.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/cpu:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12386053361222617431, name: "/gpu:0"
 device_type: "GPU"
 memory_limit: 1253507072
 locality {
   bus_id: 1
 }
 incarnation: 10687157828282272591
 physical_device_desc: "device: 0, name: GeForce GTX 960M, pci bus id: 0000:01:00.0"]

In [11]:
# Audio configuration and dataset location.
sampling_rate = 22050  # sample rate (Hz) every loaded file must have
# NOTE: despite its name, `seconds` is a length in *samples*: the number of
# samples in one 5-second clip.
seconds = sampling_rate * 5
# NOTE(review): absolute, machine-specific path -- adjust before running
# on another machine.
folder = '/home/herbert/RuG/youtube_classification/data2/'
# Every sub-directory of `folder` is one class. os.listdir returns entries in
# arbitrary order, and the position in this list is later used as the integer
# label, so sort to keep the label assignment stable between runs.
classes = sorted(
    f for f in os.listdir(folder)
    if os.path.isdir(os.path.join(folder, f))
)

In [13]:
# Collect a (label, path) pair for every .wav file, where the label is the
# index of the file's class directory in `classes`, then hold out 1% of the
# files for testing.
#
# The split is persisted to pickle files by later cells. Without a fixed seed
# a re-run after a kernel restart would draw a different split and could put
# files from an existing test pickle into newly written training shards.
split_seed = 0
data = [
    (label, os.path.join(folder, class_, filename))
    for label, class_ in enumerate(classes)
    # Sorted so that the input order (and therefore the seeded split) does
    # not depend on the arbitrary order os.listdir returns.
    for filename in sorted(os.listdir(os.path.join(folder, class_)))
    if filename.endswith('.wav')
]
train_data, test_data = train_test_split(
    data, test_size=0.01, random_state=split_seed)

In [15]:
# Build the test set: cut every test file into non-overlapping clips of
# `seconds` samples, stack them, and pickle the arrays to 'test_data.p3'.
X_test = []
y_test = []
for label, filename in ProgressBar(test_data):
    try:
        audio, sr = librosa.load(filename)
    except Exception as exc:
        # Skip files that cannot be decoded. Falling through here would
        # either raise NameError (first file) or silently reuse the previous
        # file's audio under this file's label.
        print('Skipping {}: {}'.format(filename, exc))
        continue
    # Number of whole clips that fit in this file; the remainder is dropped.
    leave = audio.shape[0] // seconds
    if sr != sampling_rate or leave <= 0:
        continue
    X_test.append(audio[:leave * seconds].reshape(leave, seconds))
    # One label per clip.
    y_test.extend([label] * leave)

X_test = np.concatenate(X_test, axis=0)
# Insert a singleton axis: (n_clips, seconds) -> (n_clips, 1, seconds).
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
y_test = np.array(y_test)
with open('test_data.p3', 'wb') as f:
    pickle.dump((X_test, y_test), f)

In [16]:
# Build training shards: shard j holds every n-th training file starting at
# index j, cut into non-overlapping clips of `seconds` samples, and is
# pickled to 'train_data_<j>.p3'.
n = 20
for j in range(n):
    X_train = []
    y_train = []
    for label, filename in ProgressBar(train_data[j::n]):
        try:
            audio, sr = librosa.load(filename)
        except Exception as exc:
            # Skip files that cannot be decoded. Falling through here would
            # either raise NameError (first file) or silently reuse the
            # previous file's audio under this file's label.
            print('Skipping {}: {}'.format(filename, exc))
            continue
        # Number of whole clips that fit in this file; the rest is dropped.
        leave = audio.shape[0] // seconds
        if sr != sampling_rate or leave <= 0:
            continue
        X_train.append(audio[:leave * seconds].reshape(leave, seconds))
        # One label per clip.
        y_train.extend([label] * leave)

    X_train = np.concatenate(X_train, axis=0)
    # Insert a singleton axis: (n_clips, seconds) -> (n_clips, 1, seconds).
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    y_train = np.array(y_train)

    with open('train_data_{}.p3'.format(j), 'wb') as f:
        pickle.dump((X_train, y_train), f)
    # Only the first shard (j == 0) is generated; remove this `break` to
    # write all n shards.
    break

In [18]:
# Normalize the spectrogram output with a fixed affine map,
# slope * x + intercept, applied right after the spectrogram layer.
slope = K.variable(value=1 / 40)
intercept = K.variable(value=1)

# Spectrogram front-end -> two conv stages with dropout -> a single sigmoid
# unit. BatchNormalization is not used after either convolution.
model = Sequential([
    Spectrogram(
        n_dft=512,
        n_hop=256,
        input_shape=(1, seconds),
        return_decibel_spectrogram=True,
        power_spectrogram=2.0,
        trainable_kernel=False,
        name='static_stft',
    ),
    Lambda(lambda spec: slope * spec + intercept),
    Conv2D(32, (7, 7), name='conv1', activation='relu'),
    MaxPool2D((25, 17)),
    Dropout(0.5),
    Conv2D(32, (10, 10), name='conv2', activation='relu'),
    Dropout(0.5),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.summary(line_length=80, positions=[.33, .65, .8, 1.])

________________________________________________________________________________
Layer (type)              Output Shape              Param #     
static_stft (Spectrogram) (None, 257, 431, 1)       263168      
________________________________________________________________________________
lambda_2 (Lambda)         (None, 257, 431, 1)       0           
________________________________________________________________________________
conv1 (Conv2D)            (None, 251, 425, 32)      1600        
________________________________________________________________________________
max_pooling2d_2 (MaxPooli (None, 10, 25, 32)        0           
________________________________________________________________________________
dropout_1 (Dropout)       (None, 10, 25, 32)        0           
________________________________________________________________________________
conv2 (Conv2D)            (None, 1, 16, 32)         102432      
___________________________________________________________

In [11]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Only use this on files written by this notebook: unpickling data from an
    untrusted source can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the arrays written by the dataset-building cells.
X_train, y_train = _load_pickle('train_data_0.p3')
X_test, y_test = _load_pickle('test_data.p3')

In [13]:
# Shuffle the training clips; the same permutation is applied to the labels
# so that every clip keeps its own label.
perm = np.random.permutation(len(X_train))
X_train, y_train = X_train[perm, :, :], y_train[perm]

In [19]:
# Compile for binary classification: Adam optimizer, binary cross-entropy
# loss, accuracy reported as the metric.
optimizer = Adam(
    lr=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0,
)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [21]:
# Lower the learning rate of the compiled optimizer to 0.003.
# Update the existing backend variable in place rather than rebinding the
# attribute to a Python float: a plain assignment is ignored once the training
# function has been built, and leaves `optimizer.lr` as a float, which breaks
# code that reads it with K.get_value (e.g. saving the model, LR callbacks).
K.set_value(model.optimizer.lr, 0.003)

In [22]:
# Train for 10 epochs on the loaded training arrays, validating on the test
# arrays after every epoch; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=10,
    validation_data=(X_test, y_test),
)

Train on 7170 samples, validate on 1334 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Call fit again on the same model and data for another 10 epochs; the
# returned History object is kept separately in `history2`.
history2 = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=10,
    validation_data=(X_test, y_test),
)

Train on 7170 samples, validate on 1334 samples
Epoch 1/10