# cnn
- scipy.io.wavfile
- 0.5s chunk size, 0.25s window 
- mfcc
- f1 0.972

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display
import librosa.display
import numpy as np
import librosa
import tensorflow as tf
from scipy import signal
import scipy.io.wavfile

## Data load
- uav: only p2 unloaded
- none: some of the background sound and other sounds like gator/train

In [None]:
import glob
uav_path = '../data/phantom/JUNE_01_PHANTOMS/wavs/p2-unloaded/*.wav'
none_path = '../data/phantom/JUNE_02_BACKGROUND/wavs/background/use/*.wav'
uav_files = glob.glob(uav_path)
none_files = glob.glob(none_path)
print('\n'.join(uav_files))
print('\n'.join(none_files))

In [None]:
CHUNK_SIZE = 8192
SR = 44100
N_MFCC = 16

In [None]:
def load2(files):
    _, raw = scipy.io.wavfile.read(files[0])
    for f in files[1:]:
        _, array = scipy.io.wavfile.read(f)
        raw = np.hstack((raw, array))
    print(raw.shape)
    return raw

In [None]:
uav_raw = load2(uav_files)
none_raw = load2(none_files)

In [None]:
np.save('../data/Xy/uav_p2_unloaded_raw', uav_raw)
np.save('../data/Xy/none_selected_raw', none_raw)

In [None]:
uav_raw = np.load('../data/Xy/uav_p2_unloaded_raw.npy')
none_raw = np.load('../data/Xy/none_selected_raw.npy')

In [None]:
uav_raw = uav_raw.astype(float)
none_raw = none_raw.astype(float)

## Data preprocessing
- features: mfcc, delta mfcc, delta2 mfcc, log spectrogram
- 사용한피처: mfcc
- 0.5초 청크, 0.25초 윈도우 슬라이드 (50% 오버랩)

In [None]:
# chunk:한번에 처리하는 오디오 데이터 단위
# window: 슬라이드하는 윈도우 크기
# 50% overlap
# chunk size 44100/2 --> n_frame 43 (n_frame은 mfcc.shape[1]인덱스사이즈)
def mfcc5(raw, label, chunk_size=44100//2, window_size=44100//4, sr=44100, n_mfcc=16, n_frame=43):
    mfcc = np.empty((0, n_mfcc, n_frame))
    y = []
    print(raw.shape)
    i = 0
    while i+chunk_size <= len(raw):
        mfcc_slice = librosa.feature.mfcc(raw[i:i+chunk_size], sr=sr, n_mfcc=n_mfcc)
        if mfcc_slice.shape[1] < n_frame+1:
            print("small end:", mfcc_slice.shape)
            continue
        mfcc_slice = mfcc_slice[:,:-1]
        mfcc_slice = mfcc_slice.reshape((1, mfcc_slice.shape[0], mfcc_slice.shape[1]))
        mfcc = np.vstack((mfcc, mfcc_slice))
        y.append(label)
        i += window_size
    y = np.array(y)
    mfcc = mfcc.reshape(mfcc.shape[0], mfcc.shape[1], mfcc.shape[2], 1)
    y = onehot(y, 2)
    return mfcc, y

In [None]:
# delta_mfcc5
# order 1 아니면 2 
def delta_mfcc5(raw, label, order, chunk_size=44100//2, window_size=44100//4, sr=44100, n_mfcc=16, n_frame=43):
    delta2_mfcc = np.empty((0, n_mfcc, n_frame))
    y = []
    print(raw.shape)
    i = 0
    while i+chunk_size <= len(raw):
        S = librosa.feature.melspectrogram(raw[i:i+chunk_size], sr=sr, n_mels=128)
        log_S = librosa.amplitude_to_db(S, ref=np.max)
        mfcc_slice = librosa.feature.mfcc(S=log_S, sr=sr, n_mfcc=n_mfcc)
        #print(delta_mfcc.shape)
        delta2_mfcc_slice = librosa.feature.delta(mfcc_slice, order=order)
        #print(delta2_mfcc_slice.shape)

        #mfcc_slice = librosa.feature.mfcc(raw[i:i+chunk_size], sr=sr, n_mfcc=n_mfcc)
        if delta2_mfcc_slice.shape[1] < n_frame+1:
            print("small end:", delta2_mfcc_slice.shape)
            continue
        delta2_mfcc_slice = delta2_mfcc_slice[:,:-1]
        delta2_mfcc_slice = delta2_mfcc_slice.reshape((1, delta2_mfcc_slice.shape[0], delta2_mfcc_slice.shape[1]))
        #print(delta2_mfcc_slice.shape)
        #print(delta2_mfcc.shape)
        delta2_mfcc = np.vstack((delta2_mfcc, delta2_mfcc_slice))
        y.append(label)
        i += window_size
    y = np.array(y)
    delta2_mfcc = delta2_mfcc.reshape(delta2_mfcc.shape[0], delta2_mfcc.shape[1], delta2_mfcc.shape[2], 1)
    y = onehot(y, 2)
    return delta2_mfcc, y

In [None]:
# log_spectrogram 계산
# n_frame사이즈 조정
def log_spectrograms(raw, label, chunk_size=44100//2, window_size=44100//4, sr=44100, n_frame=49, n_freqs=442):
    ls = np.empty((0, n_frame, n_freqs))
    y = []
    print(raw.shape)
    i = 0

    while i+chunk_size <= len(raw):
        #(청크개수,freqs=442, time=47)
        ls_slice = log_specgram(raw[i:i+chunk_size], sample_rate=sr)[2]
        if ls_slice.shape[0] < n_frame:
            print("small end:", ls_slice.shape)
            continue
        ls_slice = ls_slice.reshape((1, ls_slice.shape[0], ls_slice.shape[1]))
        #print(ls_slice.shape)
        #print(ls.shape)
        ls = np.vstack((ls, ls_slice))
        y.append(label)
        i += window_size
    y = np.array(y)
    ls = ls.reshape(ls.shape[0], ls.shape[1], ls.shape[2], 1)
    y = onehot(y,2)
    return ls, y

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

# log specgram one pass - 한번에...안썼음
def log_specgram_one_pass(raw, label, sr=44100):
    ls = log_specgram(raw, sample_rate=sr)[2]
    ls = ls.reshape(ls.shape[0], ls.shape[1], 1)

    y = [label]*ls.shape[0]
    y = np.ones((ls.shape[0],),dtype=np.int)*label
    print(y.shape)
    y = onehot(y, 2)
    return ls, y

In [4]:
def combine(X_uav, X_none, y_uav, y_none):
    X = np.concatenate((X_uav, X_none), axis=0)
    y = np.concatenate((y_uav, y_none), axis=0)
    return X, y

def onehot(y, n_classes):
    y_encoded = np.zeros((y.shape[0], n_classes))
    y_encoded[np.arange(y.shape[0]), y] = 1
    print(y_encoded.shape)
    return y_encoded

def split_save(X, y, name, save=False):    
        from sklearn import model_selection
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.2, random_state=42)
        if save:
            np.save('../data/Xy/X_train_%s'%name, X_train)
            np.save('../data/Xy/X_test_%s'%name, X_test)
            np.save('../data/Xy/y_train_%s'%name, y_train)
            np.save('../data/Xy/y_test_%s'%name, y_test)
        return X_train, X_test, y_train, y_test

def load_Xy(name):
    X_train = np.load('../data/Xy/X_train_%s.npy'%name)
    X_test = np.load('../data/Xy/X_test_%s.npy'%name)
    y_train = np.load('../data/Xy/y_train_%s.npy'%name)
    y_test = np.load('../data/Xy/y_test_%s.npy'%name)
    return X_train, X_test, y_train, y_test

In [None]:
ls_uav = log_spectrograms(uav_raw, 1)#freq 442...
ls_none = log_spectrograms(none_raw, 0)#freq 442...
print(ls_uav[0].shape, ls_uav[1].shape)
print(ls_none[0].shape, ls_none[1].shape)

X_ls, y_ls = combine(ls_uav[0], ls_none[0], ls_uav[1], ls_none[1])

In [None]:
ls_uav = log_specgram_one_pass(uav_raw, 1)#freq 442...
ls_none = log_specgram_one_pass(none_raw, 0)#freq 442...
print(ls_uav[0].shape, ls_uav[1].shape)
print(ls_none[0].shape, ls_none[1].shape)

X_ls, y_ls = combine(ls_uav[0], ls_none[0], ls_uav[1], ls_none[1])

In [None]:
delta2_uav = delta_mfcc5(uav_raw, 1, order=2)
print(delta2_uav[0].shape, delta2_uav[1].shape)
delta2_none = delta_mfcc5(none_raw, 0, order=2)
print(delta2_none[0].shape, delta2_none[1].shape)


X_delta2, y_delta2 = combine(delta2_uav[0], delta2_none[0], delta2_uav[1], delta2_none[1])

In [None]:
delta_uav = delta_mfcc5(uav_raw, 1, order=1)
print(delta_uav[0].shape, delta_uav[1].shape)
delta_none = delta_mfcc5(none_raw, 0, order=1)
print(delta_none[0].shape, delta_none[1].shape)


X_delta, y_delta = combine(delta_uav[0], delta_none[0], delta_uav[1], delta_none[1])

In [None]:
mfcc_uav = mfcc5(uav_raw, 1)
print(mfcc_uav[0].shape, mfcc_uav[1].shape)
mfcc_none = mfcc5(none_raw, 0)
print(mfcc_none[0].shape, mfcc_none[1].shape)


X_mfcc, y_mfcc = combine(mfcc_uav[0], mfcc_none[0], mfcc_uav[1], mfcc_none[1])

In [5]:
X_train, X_test, y_train, y_test = split_save(X_mfcc, y_mfcc, 'mfcc5_longwindow_0723', save=True)

NameError: name 'X_mfcc' is not defined

In [6]:
X_train, X_test, y_train, y_test = load_Xy('mfcc5_longwindow_0723')

In [None]:
X_train, X_test, y_train, y_test = split_save(X_delta2, y_delta2, 'delta2', save=True)

In [None]:
X_train, X_test, y_train, y_test = load_Xy('delta2')

In [None]:
X_train, X_test, y_train, y_test = split_save(X_delta, y_delta, 'delta', save=True)

In [None]:
X_train, X_test, y_train, y_test = load_Xy('delta')

In [None]:
X_train, X_test, y_train, y_test = split_save(X_ls, y_ls, 'logspec', save=True)

In [None]:
X_train, X_test, y_train, y_test = load_Xy('logspec')

In [None]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Tensorflow finally!

## Training

### Parameters
- learning rate, epoch 유의

In [7]:
n_mfcc = 16
n_frame = 43
n_classes = 2
n_channels = 1

learning_rate = 0.001
training_epochs = 2500

### Experiment 1 - One convolutional layer
- filter size: [13,4] n_mfcc가 16이고 frequency 대역이 중요해서 필터는 n_mfcc에 맞게 되도록 길게

In [8]:
X = tf.placeholder(tf.float32, shape=[None,n_mfcc*n_frame*n_channels])
X = tf.reshape(X, [-1, n_mfcc, n_frame, n_channels])
Y = tf.placeholder(tf.float32, shape=[None,n_classes])

# rectangular filter
conv1 = tf.layers.conv2d(inputs=X, filters=1, kernel_size=[13, 4],
                         activation=tf.nn.relu)
print(conv1)

pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[3, 3],
                                padding='SAME', strides=1)
print(pool1)

conv2 = tf.layers.conv2d(inputs=pool1, filters=1, kernel_size=[2, 2],
                         padding="SAME", activation=tf.nn.relu)
print(conv2)
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2],
                                padding="SAME", strides=2)
print(pool2)


flat = tf.reshape(pool2, [-1, 2*20*1])
print(flat)

dense3 = tf.layers.dense(inputs=flat, units=200, activation=tf.nn.relu)
logits = tf.layers.dense(inputs=dense3, units=2)
print(dense3)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

Tensor("conv2d/Relu:0", shape=(?, 4, 40, 1), dtype=float32)
Tensor("max_pooling2d/MaxPool:0", shape=(?, 4, 40, 1), dtype=float32)
Tensor("conv2d_1/Relu:0", shape=(?, 4, 40, 1), dtype=float32)
Tensor("max_pooling2d_1/MaxPool:0", shape=(?, 2, 20, 1), dtype=float32)
Tensor("Reshape_1:0", shape=(?, 40), dtype=float32)
Tensor("dense/Relu:0", shape=(?, 200), dtype=float32)


In [9]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [10]:
model_path = '../models/cnn/cnn'
saver = tf.train.Saver()

In [11]:
batch_size = 100
cost_history = np.empty(shape=[1], dtype=float)

for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(y_train.shape[0] / batch_size)
    for i in range(0, y_train.shape[0], batch_size):
        feed_dict={X:X_train[i:i+batch_size,:,:,:], Y:y_train[i:i+batch_size,:]}
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        cost_history = np.append(cost_history,cost)
        avg_cost += c/total_batch
    if epoch % 50 == 0:
        print('Epoch:', '%04d' % (epoch+1), 'cost = ', '{:.9f}'.format(avg_cost))
saver.save(sess, model_path)

Epoch: 0001 cost =  0.786575564
Epoch: 0051 cost =  0.008409197
Epoch: 0101 cost =  0.000729695
Epoch: 0151 cost =  0.000290679
Epoch: 0201 cost =  0.000158104
Epoch: 0251 cost =  0.000096778
Epoch: 0301 cost =  0.000063447
Epoch: 0351 cost =  0.000043069
Epoch: 0401 cost =  0.000030492
Epoch: 0451 cost =  0.000022118
Epoch: 0501 cost =  0.000016419
Epoch: 0551 cost =  0.000012351
Epoch: 0601 cost =  0.000009326
Epoch: 0651 cost =  0.000007147
Epoch: 0701 cost =  0.000005513
Epoch: 0751 cost =  0.000004265
Epoch: 0801 cost =  0.000003315
Epoch: 0851 cost =  0.000002610
Epoch: 0901 cost =  0.000002030
Epoch: 0951 cost =  0.000001602
Epoch: 1001 cost =  0.000001249
Epoch: 1051 cost =  0.000000985
Epoch: 1101 cost =  0.000000775
Epoch: 1151 cost =  0.000000610
Epoch: 1201 cost =  0.000000482
Epoch: 1251 cost =  0.000000380
Epoch: 1301 cost =  0.000000305
Epoch: 1351 cost =  0.000000239
Epoch: 1401 cost =  0.000000191
Epoch: 1451 cost =  0.000000152
Epoch: 1501 cost =  0.000000122
Epoch: 1

'../models/cnn/cnn'

In [12]:
y_pred = sess.run(tf.argmax(logits,1),feed_dict={X: X_test})
y_true = sess.run(tf.argmax(y_test,1))

### Results
- mfcc
- long frame (0.5sec), long window (0.25)
- batch 100
- rectangular filter size
- epoch 2500
- learning rate 0.001

In [13]:
from sklearn.metrics import precision_recall_fscore_support
p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')
print("F-Score:", round(f,3))
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(y_true, y_pred))

from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, y_pred))

F-Score: 0.972
Accuracy:  0.9715909090909091
             precision    recall  f1-score   support

          0       0.98      0.97      0.98       113
          1       0.95      0.97      0.96        63

avg / total       0.97      0.97      0.97       176

[[110   3]
 [  2  61]]
