In [4]:
import glob
import os
import random
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
%matplotlib inline
plt.style.use('ggplot')

In [5]:
def windows(data, window_size):
    """Yield ``(start, end)`` index pairs for half-overlapping windows.

    Consecutive windows advance by half of ``window_size`` (50% overlap).
    Windows near the end may extend past ``len(data)``; it is up to the
    caller to discard slices that come out shorter than ``window_size``.

    Args:
        data: Any sized sequence; only ``len(data)`` is used.
        window_size: Positive window length, in elements of ``data``.

    Yields:
        ``(start, end)`` tuples with ``end == start + window_size``. For an
        integer ``window_size`` both values are integers, so they can be
        used directly as slice bounds.

    Raises:
        ValueError: If ``window_size`` is not positive (the generator would
            otherwise never terminate).
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive, got %r" % (window_size,))

    total = len(data)
    step_index = 0
    # The k-th window starts at floor(k * window_size / 2). Doing this with
    # integer arithmetic avoids the float indices that true division
    # produces in Python 3, while matching the truncated float values
    # exactly (including odd window sizes).
    while step_index * window_size < 2 * total:
        start = (step_index * window_size) // 2
        yield start, start + window_size
        step_index += 1

# 33 inputs each, 6 labels each
        
def extract_features(file_path, file_label, file_ext="*.wav", bands=20, frames=41):
    """Cut one audio file into overlapping windows and compute MFCCs per window.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        file_label: Class name of the clip; one of ``'other'``, ``'person'``,
            ``'car'`` or ``'drone'``.
        file_ext: Unused; kept so existing callers keep working.
        bands: Number of MFCC coefficients per frame (``n_mfcc``).
        frames: Number of MFCC frames expected per window.

    Returns:
        A tuple ``(features, labels)`` where ``features`` has shape
        ``(n_windows, frames, bands)`` and ``labels`` is an integer array of
        length ``n_windows`` filled with the class code of ``file_label``.

    Raises:
        ValueError: If ``file_label`` is not one of the known class names.
    """
    label_codes = {'other': 0, 'person': 1, 'car': 2, 'drone': 3}
    # Validate before decoding audio so a misnamed file fails fast with a
    # clear message instead of an unbound ``label_code`` further down.
    if file_label not in label_codes:
        raise ValueError(
            "Unknown label %r for file %r; expected one of %s"
            % (file_label, file_path, sorted(label_codes))
        )
    label_code = label_codes[file_label]

    # 512 matches librosa's default hop length, so a full window is expected
    # to produce exactly `frames` MFCC frames.
    window_size = 512 * (frames - 1)
    sound_clip, sample_rate = librosa.load(file_path)

    mfccs = []
    for start, end in windows(sound_clip, window_size):
        signal = sound_clip[int(start):int(end)]
        # Trailing windows that run past the end of the clip are dropped.
        if len(signal) != window_size:
            continue
        # Transpose to (frames, bands) so time is the leading axis.
        mfccs.append(librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=bands).T)

    features = np.asarray(mfccs).reshape(len(mfccs), frames, bands)
    # Builtin ``int`` replaces the ``np.int`` alias, which NumPy has removed.
    labels = np.full(len(mfccs), label_code, dtype=int)
    return features, labels

def one_hot_encode(labels, n_classes=None):
    """Convert integer class labels into a one-hot matrix.

    Args:
        labels: Sequence of non-negative integer class codes.
        n_classes: Optional number of columns. Defaults to
            ``max(labels) + 1`` so that the largest label always has a
            column, even when some smaller class codes are absent.

    Returns:
        Float array of shape ``(len(labels), n_classes)`` with a single 1
        per row at the column given by that row's label.

    Raises:
        ValueError: If a label is negative, or ``n_classes`` is too small to
            hold the largest label.
    """
    label_ids = np.asarray(labels, dtype=int).reshape(-1)
    if label_ids.size and label_ids.min() < 0:
        raise ValueError("labels must be non-negative integers")

    required = int(label_ids.max()) + 1 if label_ids.size else 0
    if n_classes is None:
        # Counting distinct labels would under-size the matrix whenever a
        # class is missing (e.g. labels [0, 2] need 3 columns, not 2).
        n_classes = required
    elif n_classes < required:
        raise ValueError(
            "n_classes=%d is too small for label %d" % (n_classes, required - 1)
        )

    encoded = np.zeros((label_ids.size, n_classes))
    encoded[np.arange(label_ids.size), label_ids] = 1
    return encoded

In [13]:
def _load_split(directory):
    """Extract MFCC windows and integer labels for every .wav file in a directory.

    The class name of each file is taken from the part of its file name
    before the first underscore.

    Returns:
        ``(file_names, features, labels)`` where ``features`` and ``labels``
        are parallel lists with one entry per extracted window.
    """
    # Skip anything that is not a .wav file (e.g. OS metadata files), which
    # would otherwise be handed to the audio loader and crash the cell.
    file_names = [name for name in os.listdir(directory)
                  if name.lower().endswith('.wav')]
    features = []
    labels = []
    for name in file_names:
        clip_features, clip_labels = extract_features(
            os.path.join(directory, name), name.split("_")[0])
        features.extend(clip_features)
        labels.extend(clip_labels)
    return file_names, features, labels


wav_file_path_training = 'C://slice_wav_data/training/'
file_list_training, tr_features, tr_labels = _load_split(wav_file_path_training)
tr_labels = one_hot_encode(tr_labels)

# Shuffle features and labels as pairs so every window keeps its own label.
shuffled_pairs = list(zip(tr_features, tr_labels))
random.shuffle(shuffled_pairs)
tr_features = np.array([pair[0] for pair in shuffled_pairs])
tr_labels = np.array([pair[1] for pair in shuffled_pairs])

wav_file_path_test = 'C://slice_wav_data/testing/'
file_list_test, ts_features, ts_labels = _load_split(wav_file_path_test)
ts_features = np.array(ts_features)
ts_labels = np.array(one_hot_encode(ts_labels))

# The one-hot width is derived from the labels present in each split, so a
# split that lacks a class would silently produce a narrower label matrix.
if tr_labels.shape[1:] != ts_labels.shape[1:]:
    raise ValueError(
        "Training and testing label widths differ: %r vs %r"
        % (tr_labels.shape, ts_labels.shape)
    )

In [14]:
# Sanity check: shape of the stacked training feature tensor.
temp_arr = np.array(tr_features)
np.shape(temp_arr)

(1188, 41, 20)

In [15]:
# MFCC extraction turned the clips into 1188 training samples and
# 216 testing samples; this shows the shape of the test label matrix.
temp_arr = np.array(ts_labels)
np.shape(temp_arr)

(216, 4)

In [16]:
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops and variables.
tf.reset_default_graph()

# Optimisation settings
learning_rate, training_iters = 0.001, 20000
batch_size, display_step = 50, 200

# Network parameters
n_input, n_steps = 20, 41   # MFCC coefficients per frame, frames per window
n_hidden, n_classes = 20, 4  # LSTM units per layer, number of sound classes

# The former is hidden * 2, the latter is n_input + n_hidden.

# Inputs: a batch of MFCC sequences and their one-hot class labels.
x = tf.placeholder(tf.float32, shape=[None, n_steps, n_input])
y = tf.placeholder(tf.float32, shape=[None, n_classes])

# Output projection applied to the last LSTM output.
weight = tf.Variable(tf.random_normal([n_hidden, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

In [17]:
def RNN(x, weight, bias, num_layers=2):
    """Build a stacked LSTM classifier and return class probabilities.

    Args:
        x: Batch-major input tensor fed to ``tf.nn.dynamic_rnn``.
        weight: Projection matrix applied to the last LSTM output.
        bias: Bias added after the projection.
        num_layers: Number of stacked LSTM layers (default 2).

    Returns:
        Softmax probabilities computed from the output of the final time step.
    """
    # Build a separate LSTMCell per layer. Repeating one instance
    # (``[cell] * 2``) puts the same cell object in every layer, which later
    # TensorFlow 1.x releases reject as an attempt to reuse the cell.
    layers = [rnn_cell.LSTMCell(n_hidden, state_is_tuple=True)
              for _ in range(num_layers)]
    stacked = rnn_cell.MultiRNNCell(layers, state_is_tuple=True)
    output, _ = tf.nn.dynamic_rnn(stacked, x, dtype=tf.float32)
    # Move time to the leading axis so the final step can be gathered by index.
    output = tf.transpose(output, [1, 0, 2])
    last = tf.gather(output, int(output.get_shape()[0]) - 1)
    return tf.nn.softmax(tf.matmul(last, weight) + bias)

In [18]:
prediction = RNN(x, weight, bias)

# Define loss and optimizer.
# Cross-entropy summed over the batch. The probabilities are clipped away
# from zero first, because log(0) would turn the loss (and every gradient
# after it) into inf/NaN once the softmax saturates.
loss_f = -tf.reduce_sum(y * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss_f)

# Evaluate model: fraction of rows whose most probable class matches the label.
correct_pred = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

In [19]:
with tf.Session() as session:
    session.run(init)

    log_every = 100
    # Number of distinct batch start offsets. Clamped to at least 1 so the
    # modulo below cannot divide by zero (training set exactly one batch
    # long) or go negative (training set shorter than one batch).
    offset_span = max(tr_labels.shape[0] - batch_size, 1)

    for itr in range(training_iters):
        offset = (itr * batch_size) % offset_span
        batch_x = tr_features[offset:offset + batch_size]
        batch_y = tr_labels[offset:offset + batch_size]
        feed = {x: batch_x, y: batch_y}
        session.run(optimizer, feed_dict=feed)

        if itr % log_every == 0:
            # Evaluate accuracy and loss on the current batch in one pass.
            acc, loss = session.run([accuracy, loss_f], feed_dict=feed)
            print("Iter " + str(itr) + ", Minibatch Loss= " + 
                  "{:.6f}".format(loss) + ", Training Accuracy= " + 
                  "{:.5f}".format(acc))

    print('Test accuracy: ',round(session.run(accuracy, feed_dict={x: ts_features, y: ts_labels}) , 3))

Iter 0, Minibatch Loss= 93.417648, Training Accuracy= 0.36000
Iter 100, Minibatch Loss= 65.404861, Training Accuracy= 0.38000
Iter 200, Minibatch Loss= 53.215546, Training Accuracy= 0.54000
Iter 300, Minibatch Loss= 37.304848, Training Accuracy= 0.68000
Iter 400, Minibatch Loss= 34.511467, Training Accuracy= 0.68000
Iter 500, Minibatch Loss= 27.462158, Training Accuracy= 0.78000
Iter 600, Minibatch Loss= 30.738861, Training Accuracy= 0.74000
Iter 700, Minibatch Loss= 23.066650, Training Accuracy= 0.82000
Iter 800, Minibatch Loss= 35.500786, Training Accuracy= 0.76000
Iter 900, Minibatch Loss= 23.836876, Training Accuracy= 0.78000
Iter 1000, Minibatch Loss= 20.383902, Training Accuracy= 0.88000
Iter 1100, Minibatch Loss= 25.005852, Training Accuracy= 0.78000
Iter 1200, Minibatch Loss= 17.075623, Training Accuracy= 0.82000
Iter 1300, Minibatch Loss= 25.335052, Training Accuracy= 0.82000
Iter 1400, Minibatch Loss= 18.794292, Training Accuracy= 0.82000
Iter 1500, Minibatch Loss= 25.813677,

KeyboardInterrupt: 