In [1]:
import glob
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
import pandas as pd

%matplotlib inline
plt.style.use('ggplot')

train_path = '../input/audio_train/'
test_path = '../input/audio_test/'

In [2]:
# label
train_label = pd.read_csv("../input/train.csv")
test_label = pd.read_csv("../input/sample_submission.csv")


train_label.head()


Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [3]:
files = train_label['fname'].get_values()
files_labels = train_label['label'].get_values()

bands = 20
frames = 41
window_size = 512 * (frames - 1)


In [4]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += (window_size / 2)

        
def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [5]:

mfccs = []
labels = []

#for fn in range(0,len(files)):
for fn in range(0,100):
    print(str(fn), end=' ')
    sound_clip,s = librosa.load(train_path+files[fn])
    for (start,end) in windows(sound_clip,window_size):
        start = int(start)
        end = int(end)
        if(len(sound_clip[start:end]) == window_size):
            signal = sound_clip[start:end]
            mfcc = librosa.feature.mfcc(y=signal, sr=s, n_mfcc = bands).T.flatten()[:, np.newaxis].T
            mfccs.append(mfcc)
            labels.append(files_labels[fn]) 
            
features = np.asarray(mfccs).reshape(len(mfccs),frames,bands)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 

In [6]:
tr_features = np.array(features)
tr_labels = np.array(labels)

LABELS = np.unique(tr_labels)
label_idx = {label: i for i, label in enumerate(LABELS)}
vfunc = np.vectorize(lambda x: label_idx[x])
tr_labels=vfunc(tr_labels)

tr_labels = one_hot_encode(tr_labels)


In [7]:
files = test_label['fname'].get_values()

mfccs = []

#for fn in range(0,len(files)):
for fn in range(0,10):
    print(str(fn), end=' ')
    sound_clip,s = librosa.load(test_path+files[fn])
    for (start,end) in windows(sound_clip,window_size):
        start = int(start)
        end = int(end)
        if(len(sound_clip[start:end]) == window_size):
            signal = sound_clip[start:end]
            mfcc = librosa.feature.mfcc(y=signal, sr=s, n_mfcc = bands).T.flatten()[:, np.newaxis].T
            mfccs.append(mfcc)
            
features = np.asarray(mfccs).reshape(len(mfccs),frames,bands)

ts_features = np.array(features)


0 1 2 3 4 5 6 7 8 9 

In [42]:
tf.reset_default_graph()

learning_rate = 0.01
training_iters = 1000
batch_size = 50
display_step = 200

# Network Parameters
n_input = 20 
n_steps = 41
n_hidden = 300
n_classes = np.unique(labels).shape[0]

x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, n_classes])

weight = tf.Variable(tf.random_normal([n_hidden, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

In [43]:
def RNN(x, weight, bias):
    cell = rnn_cell.LSTMCell(n_hidden,state_is_tuple = True)
    cell = rnn_cell.MultiRNNCell([cell] * 2)
    output, state = tf.nn.dynamic_rnn(cell=cell, inputs=x, dtype = tf.float32)
    output = tf.transpose(output, [1, 0, 2])
    last = tf.gather(output, int(output.get_shape()[0]) - 1)
    return tf.nn.softmax(tf.matmul(last, weight) + bias)

In [44]:
prediction = RNN(x, weight, bias)

# Define loss and optimizer
loss_f = -tf.reduce_sum(y * tf.log(prediction))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss_f)

# Evaluate model
correct_pred = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

ValueError: Dimensions must be equal, but are 600 and 320 for 'rnn/while/rnn/multi_rnn_cell/cell_0/lstm_cell/MatMul_1' (op: 'MatMul') with input shapes: [?,600], [320,1200].

In [33]:
with tf.Session() as session:
    session.run(init)
    
    for itr in range(training_iters):    
        offset = (itr * batch_size) % (tr_labels.shape[0] - batch_size)
        batch_x = tr_features[offset:(offset + batch_size), :, :]
        batch_y = tr_labels[offset:(offset + batch_size), :]
        _, c = session.run([optimizer, loss_f],feed_dict={x: batch_x, y : batch_y})
            
        if epoch % display_step == 0:
            # Calculate batch accuracy
            acc = session.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            # Calculate batch loss
            loss = session.run(loss_f, feed_dict={x: batch_x, y: batch_y})
            print "Iter " + str(epoch) + ", Minibatch Loss= " + /
                  "{:.6f}".format(loss) + ", Training Accuracy= " + /
                  "{:.5f}".format(acc)
    
    print('Test accuracy: ',round(session.run(accuracy, feed_dict={x: ts_features, y: ts_labels}) , 3))

NameError: name 'optimizer' is not defined