In [1]:
import glob
import os
import random
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
%matplotlib inline
plt.style.use('ggplot')

In [2]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += (window_size / 2)

# 각 33개 input, 각 6개 label
        
def extract_features(file_path, file_label, file_ext="*.wav",bands = 20, frames = 41):
    window_size = 512 * (frames - 1)
    mfccs = []
    log_specgrams = []
    features = []
    labels = []
    sound_clip, s = librosa.load(file_path)
    #print(type(sound_clip))
    if file_label=='other':
        label_code = 0
    elif file_label=='person':
        label_code = 1
    elif file_label=='car':
        label_code = 2
    elif file_label=='drone':
        label_code = 3
        
        
    #print(file_label, label_code)
    for (start,end) in windows(sound_clip,window_size):
        start = int(start)
        end = int(end)
        if(len(sound_clip[start:end]) == window_size):
            signal = sound_clip[start:end]
            
            melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
            logspec = librosa.amplitude_to_db(melspec)
            logspec = logspec.T.flatten()[:, np.newaxis].T
            #print(1, logspec.shape)
            log_specgrams.append(logspec)
            
            mfcc = librosa.feature.mfcc(y=signal, sr=s, n_mfcc = bands).T.flatten()[:, np.newaxis].T
            mfccs.append(mfcc)
            #print(2, mfcc.shape)
            features = np.hstack((mfccs, log_specgrams))
            labels.append(label_code)         
    features = np.asarray(features).reshape(len(mfccs), frames, bands*2)
    #print(features.shape)
    return np.array(features), np.array(labels,dtype = np.int)

def extract_features_for_predict(file_path, bands = 20, frames = 41):
    window_size = 512 * (frames - 1)
    mfccs = []
    log_specgrams = []
    features = []
    sound_clip, s = librosa.load(file_path)
    
    for (start,end) in windows(sound_clip,window_size):
        start = int(start)
        end = int(end)
        if(len(sound_clip[start:end]) == window_size):
            signal = sound_clip[start:end]
            
            melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
            logspec = librosa.amplitude_to_db(melspec)
            logspec = logspec.T.flatten()[:, np.newaxis].T
            log_specgrams.append(logspec)
            
            mfcc = librosa.feature.mfcc(y=signal, sr=s, n_mfcc = bands).T.flatten()[:, np.newaxis].T
            mfccs.append(mfcc)
            features = np.hstack((mfccs, log_specgrams))      
            
    features = np.asarray(features).reshape(len(mfccs), frames, bands*2)
    #print(features.shape)
    return np.array(features)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [3]:
wav_file_path_training = 'C://slice_wav_data/training/'
file_list_training = os.listdir(wav_file_path_training)

tr_features = []
tr_labels = []
for f in file_list_training:
    file_label = f.split("_")[0]
    #if file_label=='person' or file_label=='car': ## 2개씩만 하는 코드
     #   continue
    features_temp, labels_temp = extract_features(wav_file_path_training + f, file_label)
    for tr_f in features_temp:
        tr_features.append(tr_f)
    for tr_l in labels_temp:
        tr_labels.append(tr_l)
    

tmp = [[x,y] for x,y in zip(tr_features, tr_labels)]
random.shuffle(tmp)
tr_features = [n[0] for n in tmp]
tr_labels = [n[1] for n in tmp]


wav_file_path_test = 'C://slice_wav_data/testing/'
file_list_test = os.listdir(wav_file_path_test)

ts_features = []
ts_labels = []
for f in file_list_test:
    file_label = f.split("_")[0]
    #if file_label=='person' or file_label=='car': ## 2개씩만 하는 코드
     #   continue
    features_temp, labels_temp = extract_features(wav_file_path_test + f, file_label)
    for ts_f in features_temp:
        ts_features.append(ts_f)
    for ts_l in labels_temp:
        ts_labels.append(ts_l)
    
tr_labels = one_hot_encode(tr_labels)
ts_labels = one_hot_encode(ts_labels)

tr_features = np.array(tr_features)
tr_labels = np.array(tr_labels)

ts_features = np.array(ts_features)
ts_labels = np.array(ts_labels)

In [4]:
temp_arr = np.array(tr_features)
temp_arr.shape

(1188, 41, 40)

In [5]:
temp_arr = np.array(ts_labels) #mfcc를 통해 1188개 트레이닝 데이터, 216개 테스팅 데이터로 변환
temp_arr.shape

(216, 4)

In [6]:
tf.reset_default_graph()

learning_rate = 0.0004
training_iters = 20000
batch_size = 54 #1188과 216의 최대공약수는 54
display_step = 200

# Network Parameters
n_input = 40
n_steps = 41
n_hidden = 40
n_classes = 4

#앞에거는 hidden *2, 뒤에거는 n_input + n_hidden

x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, n_classes])

weight = tf.Variable(tf.random_normal([n_hidden, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
def RNN(x, weight, bias):
    cell = rnn_cell.LSTMCell(n_hidden,state_is_tuple = True)
    cell = rnn_cell.MultiRNNCell([cell] * 8, state_is_tuple=True)
    output, state = tf.nn.dynamic_rnn(cell, x, dtype = tf.float32)
    print(1, output)
    output = tf.transpose(output, [1, 0, 2])
    print(1, output)
    last = tf.gather(output, int(output.get_shape()[0]) - 1)
    print(output.get_shape())
    print(1, last)
    return tf.nn.softmax(tf.matmul(last, weight) + bias)

In [8]:
prediction = RNN(x, weight, bias)
print(prediction)

prediction_str = tf.argmax(prediction, 1)
#prediction_str = tf.argmax(prediction)

# Define loss and optimizer
loss_f = -tf.reduce_sum(y * tf.log(prediction))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss_f)

# Evaluate model
correct_pred = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
1 Tensor("rnn/transpose_1:0", shape=(?, 41, 40), dtype=float32)
1 Tensor("transpose:0", shape=(41, ?, 40), dtype=float32)
(41, ?, 40)
1 Tensor("GatherV2:0", shape=(?, 40), dtype=float32)
Tensor("Softmax:0", shape=(?, 4), dtype=float32)
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [9]:
session = tf.Session()
session.run(init)
    
training_epochs = 10000
for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(len(tr_features) / batch_size)
 
    for i in range(total_batch):
        start = ((i+1) * batch_size) - batch_size
        end = ((i+1) * batch_size)
        batch_x = tr_features[start:end]
        batch_y = tr_labels[start:end]
                
        _, c = session.run([optimizer, loss_f], feed_dict={x: batch_x, y : batch_y})
        avg_cost += c / total_batch
 
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost), end='')
    print('Test accuracy: ',round(session.run(accuracy, feed_dict={x: ts_features, y: ts_labels}) , 3))
print('Learning Finished!')
    

Epoch: 0001 cost = 89.814955625Test accuracy:  0.264
Epoch: 0002 cost = 75.129258936Test accuracy:  0.259
Epoch: 0003 cost = 71.766682712Test accuracy:  0.162
Epoch: 0004 cost = 66.888124466Test accuracy:  0.25
Epoch: 0005 cost = 63.035421718Test accuracy:  0.273
Epoch: 0006 cost = 59.330391624Test accuracy:  0.347
Epoch: 0007 cost = 58.454699430Test accuracy:  0.273
Epoch: 0008 cost = 56.160501133Test accuracy:  0.292
Epoch: 0009 cost = 55.095351999Test accuracy:  0.329
Epoch: 0010 cost = 52.418754924Test accuracy:  0.5
Epoch: 0011 cost = 47.629983729Test accuracy:  0.454
Epoch: 0012 cost = 42.585150979Test accuracy:  0.44
Epoch: 0013 cost = 39.765633323Test accuracy:  0.477
Epoch: 0014 cost = 38.018416231Test accuracy:  0.495
Epoch: 0015 cost = 36.512323293Test accuracy:  0.532
Epoch: 0016 cost = 35.387279684Test accuracy:  0.569
Epoch: 0017 cost = 35.003973181Test accuracy:  0.593
Epoch: 0018 cost = 33.552373626Test accuracy:  0.597
Epoch: 0019 cost = 32.942707322Test accuracy:  0.6

Epoch: 0156 cost = 12.639805815Test accuracy:  0.556
Epoch: 0157 cost = 12.222450712Test accuracy:  0.602
Epoch: 0158 cost = 13.182252645Test accuracy:  0.579
Epoch: 0159 cost = 13.578895547Test accuracy:  0.611
Epoch: 0160 cost = 13.279055964Test accuracy:  0.509
Epoch: 0161 cost = 12.611089663Test accuracy:  0.597
Epoch: 0162 cost = 11.823019223Test accuracy:  0.583
Epoch: 0163 cost = 10.232620543Test accuracy:  0.616
Epoch: 0164 cost = 10.751374483Test accuracy:  0.606
Epoch: 0165 cost = 11.083569538Test accuracy:  0.62
Epoch: 0166 cost = 10.611087214Test accuracy:  0.634
Epoch: 0167 cost = 10.965623032Test accuracy:  0.583
Epoch: 0168 cost = 12.214343136Test accuracy:  0.579
Epoch: 0169 cost = 13.772648681Test accuracy:  0.625
Epoch: 0170 cost = 18.260828387Test accuracy:  0.56
Epoch: 0171 cost = 16.691630970Test accuracy:  0.537
Epoch: 0172 cost = 11.492864219Test accuracy:  0.583
Epoch: 0173 cost = 10.589109898Test accuracy:  0.537
Epoch: 0174 cost = 12.548055692Test accuracy:  0

Epoch: 0313 cost = 13.057647813Test accuracy:  0.644
Epoch: 0314 cost = 15.339633508Test accuracy:  0.62
Epoch: 0315 cost = 13.518961365Test accuracy:  0.546
Epoch: 0316 cost = 9.799856273Test accuracy:  0.639
Epoch: 0317 cost = 7.779033563Test accuracy:  0.63
Epoch: 0318 cost = 7.713567994Test accuracy:  0.63
Epoch: 0319 cost = 7.657250263Test accuracy:  0.644
Epoch: 0320 cost = 7.807190256Test accuracy:  0.625
Epoch: 0321 cost = 7.867816535Test accuracy:  0.62
Epoch: 0322 cost = 7.593383442Test accuracy:  0.62
Epoch: 0323 cost = 7.751729711Test accuracy:  0.625
Epoch: 0324 cost = 7.764666850Test accuracy:  0.611
Epoch: 0325 cost = 7.753052896Test accuracy:  0.63
Epoch: 0326 cost = 7.878985134Test accuracy:  0.634
Epoch: 0327 cost = 8.041607716Test accuracy:  0.616
Epoch: 0328 cost = 10.167768327Test accuracy:  0.639
Epoch: 0329 cost = 8.448385802Test accuracy:  0.611
Epoch: 0330 cost = 8.171431715Test accuracy:  0.597
Epoch: 0331 cost = 8.665017085Test accuracy:  0.611
Epoch: 0332 co

KeyboardInterrupt: 

In [None]:
saver = tf.train.Saver()
saver.save(session, './cuav_rnn.ckpt')
print('Graph Saved! ')

In [None]:
def predict(file_source):
    print(file_source)
    data_to_predict = extract_features_for_predict(file_source)
    print(data_to_predict.shape)
    result_list = session.run(prediction_str, feed_dict={x: data_to_predict})
    print(result_list)
    result_list = list(result_list)
    label_count = {'others': 0, 'person': 0, 'car': 0, 'drone': 0,}
   
    total = 0
    for i, key in enumerate(list(label_count.keys())):
        label_count[key] = (result_list.count(i))
        print(key, int(label_count[key])/data_to_predict.shape[0])
    
    t = list(zip(list(label_count.values()), list(label_count.keys())))
    t.sort(reverse=True)
    print(t)
    print("")
        
    #for i in range(data_to_predict.shape[0]):
        #temp_output = session.run(prediction, feed_dict={x: data_to_predict[i]})
        #print(temp_output)
        
    

In [None]:
wav_file_path_test = 'C://slice_wav_data/testing/'
test_file_list = os.listdir(wav_file_path_test)
sample_test_file = wav_file_path_test + test_file_list[0]
for file in test_file_list:
    predict(wav_file_path_test + file)

In [None]:
sess = tf.InteractiveSession()
saver = tf.train.import_meta_graph('./cuav_rnn.ckpt.meta')
saver.restore(self.sess,'./cuav_rnn.ckpt.ckpt')
        
graph = tf.get_default_graph()
self.X =  self.sess.graph.get_tensor_by_name("Placeholder:0")
self.Y =  self.sess.graph.get_tensor_by_name("Placeholder_1:0")