In [10]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

import tensorflow as tf
# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd

%matplotlib inline

n_mels = 40
n_frame = 500
window_size=1024
hop_size=512

sequence_length = 50 #layer
n_unique_labels = 3

In [2]:
import glob
uav_path = '../data/0808_field-0807_train/Unloaded/*.*'
loaded_path = '../data/0808_field-0807_train/Loaded/*.*'
none_path = '../data/0808_field-0807_train/Background/*.*'

uav_files = glob.glob(uav_path)#[0]]
loaded_files = glob.glob(loaded_path)#[0]]
none_files = glob.glob(none_path)#[0]]

In [3]:
print(len(uav_files),'개\t', uav_files[0])
print(len(uav_files),'개\t', loaded_files[0])
print(len(none_files), '개\t',none_files[0])

13 개	 ../data/0808_field-0807_train/Unloaded\rpi1_1533670549.wav
13 개	 ../data/0808_field-0807_train/Loaded\rpi2_1533669810.wav
4 개	 ../data/0808_field-0807_train/Background\rpi2_1533670583.wav


In [4]:
SR = 44100

# Load Data

The reason of why SR is 44100 is that the sample rate of above files is 44.1kbps

a wav file sample has 884736. if sample is divided by sample rate, the value is time
the time is fixed by 20.06

In [5]:
def load(files, sr=SR):
    [raw, sr] = librosa.load(files[0], sr=sr)
    for f in files[1:]:
        [array, sr] = librosa.load(f, sr=sr)
        raw = np.hstack((raw, array))
    print(raw.shape)
    return raw

In [6]:
none_raw = load(none_files)
uav_raw = load(uav_files)
loaded_raw = load(loaded_files)


(86495188,)
(54636926,)
(62515322,)


In [7]:
sample_rate=SR


# Feature extraction 
## steps
#### 1. Resampling 
#### 2. *VAD*( Voice Activity Detection)
#### 3. Maybe padding with 0 to make signals be equal length
#### 4. Log spectrogram (or *MFCC*, or *PLP*)
#### 5. Features normalization with *mean* and *std*
#### 6. Stacking of a given number of frames to get temporal information



## 1. Resampling

if you see the graph, there are few at high frequency. this is mean that data is big but it's no useless. so To small the data, do Resampling. In general, use 0~8000Hz 

In [8]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [9]:
def showFreqTime(combine):
    for sample, filename, _ in combine:
        freqs, times, spectrogram = log_specgram(sample, SR)
        fig = plt.figure(figsize=(14, 10))
        ax1 = fig.add_subplot(211)
        ax1.set_title('Raw wave of ' + filename)
        ax1.set_ylabel('Amplitude')
        ax1.plot(np.linspace(0, len(sample)/SR, len(sample)), sample)

        ax2 = fig.add_subplot(212)
        ax2.imshow(spectrogram.T, aspect='auto', origin='lower', 
               extent=[times.min(), times.max(), freqs.min(), freqs.max()])
        ax2.set_yticks(freqs[::16])
        ax2.set_xticks(times[::16])
        ax2.set_title('Spectrogram of ' + filename)
        ax2.set_ylabel('Freqs in Hz')
        ax2.set_xlabel('Seconds')

In [10]:

#n_path = '../data/Test44100_20sec/Background/*.*'
#n_files = [glob.glob(n_path)[0]]
#n_raw = load(n_files)
#combine = [[n_raw,n_path,sample_rate]]
#showFreqTime(combine)

In [11]:
#base =500000
#combine = [[none_raw[:],none_path,sample_rate],[uav_raw[:],uav_path,sample_rate],[loaded_raw[:],none_path,sample_rate]]
#showFreqTime(combine)

### Data diffrence

qualiy is diffrence but, the data is almost same.

## 2. VAD

Sometimes, Files have silence. It is not necessary. So, We need to find sound of Drone except silence.

But, Not yet implemented

## 3. padding with 0 to make signals be equal length

If we have a lot of sound files, we need to pad some datas. But These files's time is longger than 1 second. So It dosn't need to pad

## 4. Log spectrogram (or MFCC, or PLP)

The upper picture is resampled data. 
The lower picture is original data.

In MFCC Feature, There is no big difference. 

In [12]:
#__, _, none_spec = log_specgram(none_raw, sample_rate)
#__, _, uav_spec = log_specgram(uav_raw, sample_rate)
#__, _, load_spec = log_specgram(loaded_raw, sample_rate)



none_spec = librosa.feature.melspectrogram(y=none_raw, sr=SR,n_fft=window_size, hop_length=hop_size, power=2.0, n_mels=40)

uav_spec = librosa.feature.melspectrogram(y=uav_raw, sr=SR,n_fft=window_size, hop_length=hop_size, power=2.0, n_mels=40)

load_spec = librosa.feature.melspectrogram(y=loaded_raw, sr=SR,n_fft=window_size, hop_length=hop_size, power=2.0, n_mels=40)


In [13]:
print( load_spec, load_spec.min(), load_spec.max())
load_spec.shape

[[2.11006605e-02 1.61835442e-02 3.03901490e-02 ... 1.20252896e-03
  6.10812950e-03 7.27907501e-03]
 [2.00451710e-02 1.64841656e-03 1.32050756e-02 ... 2.05985918e-04
  1.92671290e-04 1.80553821e-04]
 [3.15206195e-03 9.46259218e-04 4.58904355e-04 ... 3.56457019e-05
  7.91620351e-05 4.40740739e-05]
 ...
 [4.42708682e-07 6.41881814e-07 6.40017341e-07 ... 3.93027883e-07
  4.45212443e-07 5.90004649e-07]
 [4.65871136e-07 5.06076491e-07 4.45226961e-07 ... 4.36538617e-07
  3.34484985e-07 4.41376191e-07]
 [3.34930289e-07 4.93845921e-07 4.74430388e-07 ... 4.01971818e-07
  2.81076206e-07 3.48555609e-07]] 9.515359367670075e-08 5.8225694873904486


(40, 122101)

In [14]:
y_none =np.zeros(none_spec.shape[1], dtype=int)
y_uav = np.ones(uav_spec.shape[1], dtype=int)
y_loaded = np.ones(load_spec.shape[1], dtype=int)*2

print(len(none_spec),len(uav_spec),len(load_spec))
print(y_none.shape, y_none[0])
print(y_uav.shape, y_uav[0])
print(y_loaded.shape, y_loaded[0])

40 40 40
(168936,) 0
(106713,) 1
(122101,) 2


## 5. Features normalization with *mean* and *std*

## 6. Stacking of a given number of frames to get temporal information

In [15]:
X_mfcc = np.hstack((none_spec, uav_spec))
X_mfcc = np.hstack((X_mfcc, load_spec))
X_mfcc = X_mfcc.T

y = np.hstack((y_none, y_uav))
y = np.hstack((y, y_loaded))

#X = np.concatenate((mfcc_loaded, mfcc_uav, mfcc_none), axis=0)
#y = np.hstack((y_loaded, y_uav, y_none))
print(X_mfcc.shape, y.shape)


(397750, 40) (397750,)


In [16]:

n_labels = y.shape[0]
y_encoded = np.zeros((n_labels, n_unique_labels))
y_encoded[np.arange(n_labels), y] = 1
print(y_encoded.shape)
#print(y_encoded[0], y_encoded[40000],y_encoded[100000])

(397750, 3)


In [17]:
dataX = X_mfcc
dataY = y_encoded
#print(y_encoded)
print(dataX.shape, dataY.shape)

(397750, 40) (397750, 3)


In [18]:
def makeHot(dataX, dataY, sequence_length):
    X_hot_list= []
    Y_hot_tmp = dataY[sequence_length-1:]

    for i in range(0, dataX.shape[0] - sequence_length+1):
        _x = dataX[i:i + sequence_length]
        #if i<10:
            #print(_x, "->", Y_hot_tmp[i])
        X_hot_list.append(_x)

    X_hot = np.array(X_hot_list[:])
    Y_hot= Y_hot_tmp.reshape((len(Y_hot_tmp),n_unique_labels))
    return X_hot[:], Y_hot[:]

In [19]:
X_hot, Y_hot = makeHot( dataX, dataY, sequence_length)
print(X_hot.shape, Y_hot.shape)


(397701, 50, 40) (397701, 3)


In [21]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_hot, Y_hot, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [6]:
base = '../data/Xy/Spec'
class Data:
    def __init__(self,X,Y,BatchSize):
        self.X = X
        self.Y = Y
        self.len = len(Y)
        self.bs = BatchSize
        
        self.bs_i = 0
    def getBatchData(self):
        s = self.bs_i
        e = self.bs_i + self.bs
        if e> self.len:
            e -= self.len
            result =  np.vstack((self.X[s:],self.X[:e])), np.vstack((self.Y[s:],self.Y[:e]))
        else:
            result =  self.X[s:e], self.Y[s:e]
            
        self.bs_i = e
        return result
dataX = [1,2,3,4,5,6,7,8]
dataY = [11,12,13,14,15,16,17,18]
D = Data(dataX, dataY,3)
x, y = D.getBatchData()
print(x,y)
x, y = D.getBatchData()
print(x,y)

#np.save('../data/Xy/Spec/y_val2', y_val)

[1, 2, 3] [11, 12, 13]
[4, 5, 6] [14, 15, 16]


In [30]:

np.save(base+'/X_train2', X_train)
np.save(base+'/X_test2', X_test)
np.save(base+'/X_val2', X_val)
np.save(base+'/y_train2', y_train)
np.save(base+'/y_test2', y_test)
np.save(base+'/y_val2', y_val)

In [4]:
X_train = np.load(base+'/X_train2.npy')
X_test = np.load(base+'/X_test2.npy')
X_val = np.load(base+'/X_val2.npy')
y_train = np.load(base+'/y_train2.npy')
y_test = np.load(base+'/y_test2.npy')
y_val = np.load(base+'/y_val2.npy')

In [7]:
batch_size = 2048
traindata = Data(X_train,y_train,batch_size)
testdata = Data(X_test,y_test,batch_size)
valdata = Data(X_val,y_val,batch_size)

In [8]:
print(X_train.shape, X_test.shape,X_val.shape)
print(y_train.shape, y_test.shape,y_val.shape)

(254528, 50, 40) (79541, 50, 40) (63632, 50, 40)
(254528, 3) (79541, 3) (63632, 3)


# Tensorflow RNN

## Train 

In [12]:
batch_size = batch_size
num_classes = n_mels           #분류할 사전의 크기 

learning_rate = 0.001
sequence_length = sequence_length #9

output_dim = n_unique_labels
layers = 3
 
model_path = '../models/RNN/my_RNN_model_test_spec'

print(batch_size, num_classes)
print(learning_rate, sequence_length)
print(output_dim, layers)

2048 40
0.001 50
3 3


In [13]:
X = tf.placeholder(tf.float32, [None, sequence_length,num_classes], name="X")
Y = tf.placeholder(tf.float32, [None, output_dim], name="Y")

keep_prob = tf.placeholder(tf.float32)

cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_classes, state_is_tuple=True)
#cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
cell = tf.contrib.rnn.MultiRNNCell([cell]*2, state_is_tuple= True)

BatchSize = tf.placeholder(tf.int32, [], name='BatchSize')
initial_state = cell.zero_state(BatchSize, tf.float32)
outputs, _states = tf.nn.dynamic_rnn(cell, X,initial_state=initial_state,dtype=tf.float32)

dense1 = tf.layers.dense(inputs=outputs[:,-1], units=sequence_length*output_dim, activation=tf.nn.relu)

dense2 = tf.layers.dense(inputs=dense1, units=sequence_length*output_dim, activation=tf.nn.relu)
dropout2 = tf.nn.dropout(dense1, keep_prob=keep_prob)

dense3 = tf.layers.dense(inputs=dense2, units=output_dim, activation=tf.nn.relu)

Y_pred= tf.layers.dense(inputs=dense3, units=output_dim)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=Y_pred, labels=Y))
lr = tf.placeholder(tf.float32,shape=(), name='learning_rate')
train = tf.train.AdamOptimizer(lr).minimize(cost)


In [14]:
traindata.X[0]

array([[6.12215189e-03, 3.31032027e-02, 2.85044409e-02, ...,
        5.99796072e-07, 3.29981956e-07, 3.98012499e-07],
       [2.05139357e-02, 4.02045705e-02, 1.72614741e-02, ...,
        5.05759738e-07, 4.84961720e-07, 3.82044306e-07],
       [1.06363211e-02, 1.39114275e-02, 7.75138549e-03, ...,
        3.90992772e-07, 3.85307925e-07, 4.45898119e-07],
       ...,
       [4.23804717e-02, 1.24427865e-02, 3.91737217e-03, ...,
        4.79267537e-07, 5.49407258e-07, 4.10613591e-07],
       [1.01528196e-02, 1.47945278e-02, 7.25676772e-03, ...,
        4.43085639e-07, 3.64535158e-07, 3.66937756e-07],
       [2.91146717e-02, 1.66137838e-02, 7.72954848e-03, ...,
        4.63182143e-07, 4.23779524e-07, 4.40112267e-07]])

In [15]:
print(traindata.Y[:10])

[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [16]:
x, y = traindata.getBatchData()
print(y)

[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [None]:
from sklearn.metrics import accuracy_score
init = tf.global_variables_initializer()
cost_history = np.empty(shape=[1],dtype=float)
step_loss = 999999.0
saver = tf.train.Saver()
training_epochs = 1000
# Training step

sess = tf.InteractiveSession()
sess.run(init)
#learning_rate_ = [i*0.001 for i in range(20,10,-1)]
#for learning_rate in [0.02, 0.01]:
#    feed = {lr:learning_rate, BatchSize: batch_size}
N = int(len(valdata.Y) / batch_size) + 1
for i in range(training_epochs):
    feed = {lr:learning_rate, BatchSize: batch_size, keep_prob : 1.0}
    for n in range(N):
        x,y = traindata.getBatchData()
        feed[X], feed[Y] = x, y
        step_loss_prev = step_loss
        _, step_loss = sess.run([train, cost], feed_dict=feed)
        cost_history = np.append(cost_history,step_loss)
        
    y_pred = sess.run(tf.argmax(Y_pred,1),feed_dict={
        X: valdata.X, BatchSize: len(valdata.Y), keep_prob:1.0})
    y_true =  sess.run(tf.argmax(valdata.Y,1))
    accuracy_val = accuracy_score(y_pred, y_true)
    print("[step: {}] loss: {}".format(i, step_loss), "\tvalidation: {:.3f}%".format(accuracy_val * 100))
    if i%10 == 1:
        print('')
        saver.save(sess, model_path)
sess.close()
    

    


An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).



[step: 0] loss: 1.0371475219726562 	validation: 50.539%
[step: 1] loss: 0.9940330982208252 	validation: 52.939%

[step: 2] loss: 0.9908829927444458 	validation: 52.714%
[step: 3] loss: 0.9367541074752808 	validation: 53.033%
[step: 4] loss: 0.9241124391555786 	validation: 52.739%
[step: 5] loss: 0.912929892539978 	validation: 54.997%
[step: 6] loss: 0.8925859332084656 	validation: 55.221%
[step: 7] loss: 0.8585377931594849 	validation: 55.378%
[step: 8] loss: 0.8661543130874634 	validation: 65.062%
[step: 9] loss: 0.8352466821670532 	validation: 66.762%
[step: 10] loss: 0.8285336494445801 	validation: 67.106%
[step: 11] loss: 0.7982887029647827 	validation: 65.596%

[step: 12] loss: 0.7888187170028687 	validation: 66.302%
[step: 13] loss: 0.7916765809059143 	validation: 68.964%
[step: 14] loss: 0.81630539894104 	validation: 70.372%
[step: 15] loss: 0.7907871007919312 	validation: 67.461%
[step: 16] loss: 0.8414484262466431 	validation: 55.915%
[step: 17] loss: 0.7825598120689392 	valid

In [None]:
sess = tf.InteractiveSession()
saver.restore(sess, model_path)
y_pred = sess.run(tf.argmax(Y_pred,1),feed_dict=
                  {X: testdata.X, BatchSize: len(testdata.Y), keep_prob : 1.0})
y_true = sess.run(tf.argmax(testdata.Y,1))
print(y_pred.shape, y_true.shape)
sess.close()

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

fig = plt.figure(figsize=(10,8))
plt.plot(cost_history)
plt.ylabel("Cost")
plt.xlabel("Iterations") 
plt.axis([0,len(cost_history),0,np.max(cost_history)])
plt.show()


p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')
print("F-Score:", round(f,3))
print("Accuracy: ", accuracy_score(y_true, y_pred))

print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))