### 依據原始的 predict/predict.py 修改而來

#### vvv Setting-1 vvvv

In [1]:
# prediction related paths, should be consistent with /preprocess/loadData.py and /train/trainModel.py
# output_dim: size of the identity matrix that is decoded with the label binarizer
#             (presumably the number of classes the model outputs -- confirm against trainModel.py)
output_dim = 123
# pickled label binarizer, opened with pickle.load later in this notebook
labelBinarizerPath = '../data/labelBinarizer_top123_bird_frog_dog.pickle'
# trained model file passed to keras load_model
savedModelH5 = '../train/trainModel_InceptionV3_1-v5_bird_frog_dog.h5'
# species lookup table read with pandas.read_csv
spMapFile = '../preprocess/species_mapping.csv'

In [2]:
from scipy import io
from scipy.io import wavfile
import numpy as np
import os
import pandas as pd
import pickle
import datetime
import time
from matplotlib import mlab
%matplotlib inline
np.random.seed(0)

In [3]:
# Width, in spectrogram time frames, of one model input window:
#   floor(sampling_frequency / (FFT_length - FFT_overlap)) * num_of_seconds
# The wavs use a 16 kHz sampling rate, a 512-sample FFT window with 256 samples
# of overlap, and 5 seconds are examined at a time.
spectrogramWindowLength = int(np.floor(16000 / (512 - 256)) * 5)
spectrogramHeight = 200

#### vvvvv 載入 labelBinarizer

In [4]:
# NOTE: pickle.load can execute arbitrary code -- only load a label binarizer
# file that comes from a trusted source.
# The context manager closes the file handle (the bare open() never did).
with open(labelBinarizerPath, 'rb') as lb_file:
    lb = pickle.load(lb_file)

# Decode every row of an identity matrix so that classIdList[k] is the class id
# that the binarizer associates with one-hot position k.
classIdList = lb.inverse_transform(np.eye(output_dim, dtype=int))
print(len(classIdList), '\n', classIdList)

123 
 ['b00001' 'b00002' 'b00003' 'b00004' 'b00005' 'b00006' 'b00007' 'b00008'
 'b00009' 'b00010' 'b00011' 'b00012' 'b00013' 'b00014' 'b00015' 'b00016'
 'b00017' 'b00018' 'b00019' 'b00020' 'b00021' 'b00022' 'b00023' 'b00024'
 'b00025' 'b00026' 'b00027' 'b00028' 'b00029' 'b00030' 'b00031' 'b00032'
 'b00033' 'b00034' 'b00035' 'b00036' 'b00037' 'b00038' 'b00039' 'b00040'
 'b00041' 'b00042' 'b00043' 'b00044' 'b00045' 'b00046' 'b00047' 'b00048'
 'b00049' 'b00050' 'b00051' 'b00052' 'b00053' 'b00054' 'b00055' 'b00056'
 'b00057' 'b00058' 'b00059' 'b00060' 'b00061' 'b00062' 'b00063' 'b00064'
 'b00065' 'b00066' 'b00067' 'b00068' 'b00069' 'b00070' 'b00071' 'b00072'
 'b00073' 'd00001' 'd00002' 'd00003' 'd00004' 'd00005' 'd00006' 'd00007'
 'd00008' 'd00009' 'd00010' 'd00011' 'd00012' 'd00013' 'd00014' 'd00015'
 'f00001' 'f00002' 'f00004' 'f00005' 'f00006' 'f00007' 'f00008' 'f00009'
 'f00010' 'f00011' 'f00012' 'f00013' 'f00014' 'f00015' 'f00016' 'f00017'
 'f00018' 'f00019' 'f00020' 'f00021' 'f00022'

#### vvvvv 載入原始物種對照表 species_mapping.csv

In [None]:
# Load the species lookup table (tab-separated, UTF-8). The last cell of this
# notebook matches on its 'classId' column and reads its 'chineseName' column.
sp_map = pd.read_csv(spMapFile, sep='\t', encoding='utf-8')
# bare expression: displays the table as the notebook cell output
sp_map

Unnamed: 0,category,classId,chineseName,scientificName
0,frog,f00001,中國樹蟾,Hyla chinensis
1,frog,f00002,亞洲錦蛙,Kaloula pulchra
2,frog,f00003,台北樹蛙,Rhacophorus taipeianus
3,frog,f00004,台北赤蛙,Hylarana taipehensis
4,frog,f00005,史丹吉氏小雨蛙,Micryletta steinegeri
5,frog,f00006,太田樹蛙,Buergeria otai
6,frog,f00007,小雨蛙,Microhyla ornata
7,frog,f00008,巴氏小雨蛙,Microhyla butleri
8,frog,f00009,拉都希氏赤蛙,Hylarana latouchii
9,frog,f00010,斑腿樹蛙,Polypedates megacephalus


#### vvvvv 載入 train 好的模型

In [None]:
# Set CUDA_VISIBLE_DEVICES before keras is imported -- presumably so the backend
# only sees GPU 0; confirm this matches the machine the notebook runs on.
os.environ['CUDA_VISIBLE_DEVICES']='0'
from keras.models import load_model
# load the trained network from the .h5 file configured in the settings cell
model = load_model(savedModelH5)

Using TensorFlow backend.


#### vvvvv 與 loadData 相同的音波處理程式碼

In [None]:
def expandOnes(x, directions=((-1, 0), (1, 0), (0, -1), (0, 1))):
    """Grow every cell of a 2-D mask that equals 1 into its neighbours.

    Used for the mask applied to the spectrogram.

    Parameters:
      x           2-D numpy array; only entries exactly equal to 1 are expanded
      directions  sequence of (row_offset, column_offset) pairs naming the
                  neighbours to switch on; the default is the four direct
                  neighbours (up, down, left, right)

    Return value:
      x + expand, where expand has the shape of x and holds 1 at every
      in-bounds neighbour of a 1-cell and 0 elsewhere. A 1-cell that is also
      the neighbour of another 1-cell therefore comes back as 2.
    """
    expand = np.zeros(x.shape)
    # visit only the cells that are set instead of scanning the whole array
    for i, j in np.argwhere(x == 1):
        for direction in directions:
            cx = i + direction[0]
            cy = j + direction[1]
            # neighbours that fall outside the array are ignored
            if 0 <= cx < x.shape[0] and 0 <= cy < x.shape[1]:
                expand[cx, cy] = 1
    return x + expand

In [None]:
def audioToFilteredSpectrogram(data, expandByOne = True, dropZeroColumnsPercent = 0.95):
    """Compute a log spectrogram of the audio and an energy-filtered copy of it.

    Parameters:
      data                    audio data
      expandByOne             if True, the keep-mask of the spectrogram is grown
                              with expandOnes before it is applied
      dropZeroColumnsPercent  a timeslice (column) is dropped when the share of
                              zero values along the frequency axis exceeds this ratio

    Return values:
      spectrogram             log10(power + 1), limited to the first 200 frequency rows
      filtered spectrogram    the same data with masked cells zeroed and mostly-zero
                              columns removed
    """
    # Some frog recordings contain zeros in the power spectrum, which made a plain
    # np.log10(...) produce -inf; adding 1 before taking the logarithm avoids that.
    power = mlab.specgram(data, NFFT=512, noverlap=256, Fs=16000)[0]
    tempSpec = np.log10(power + 1)

    # drop higher frequencies
    tempSpec = tempSpec[0:200, :]
    filtered = np.copy(tempSpec)
    n_freq, n_time = tempSpec.shape[0], tempSpec.shape[1]

    # The spectrogram is analysed cell by cell: 20 evenly spaced row borders along
    # the frequency axis and a column border every 30 timeslices (plus the end).
    # To achieve better accuracy the size of this cell should be fine-tuned.
    row_edges = [int(v) for v in np.ceil(np.linspace(0, n_freq, 20))]
    col_edges = [int(v) for v in np.hstack((np.ceil(np.arange(0, n_time, 30)), n_time))]
    row_spans = list(zip(row_edges[:-1], row_edges[1:]))
    col_spans = list(zip(col_edges[:-1], col_edges[1:]))
    keep = np.ones((len(row_spans), len(col_spans)))

    # Build the mask: a cell is switched off when its mean is negative or when the
    # mean of its 10 largest values stays below 1.5 * (mean + std) of its whole row band.
    for r, (r0, r1) in enumerate(row_spans):
        band = tempSpec[r0:r1, :]
        band_limit = (np.mean(band) + np.std(band)) * 1.5

        for c, (c0, c1) in enumerate(col_spans):
            cell = tempSpec[r0:r1, c0:c1]
            cell_mean = np.mean(cell)
            top10_mean = np.mean(np.sort(cell, axis=None)[-10:])

            if cell_mean < 0 or top10_mean < band_limit:
                keep[r, c] = 0

    # grow the kept cells into their neighbours
    if expandByOne:
        keep = expandOnes(keep)

    # apply the mask to the spectrogram
    for r, (r0, r1) in enumerate(row_spans):
        for c, (c0, c1) in enumerate(col_spans):
            if not keep[r, c]:
                filtered[r0:r1, c0:c1] = 0

    # Drop zero columns: count the zeros along axis 0 (frequency) for every column
    # (timeslice) and delete the column when the count is above the allowed ratio,
    # e.g. with 0.95 a timeslice goes when more than 95% of its frequencies are 0.
    backup = np.copy(filtered)
    zeros_per_column = (filtered == 0).sum(axis=0)
    mostly_zero = np.nonzero(zeros_per_column > filtered.shape[0] * dropZeroColumnsPercent)
    filtered = np.delete(filtered, mostly_zero, axis=1)

    # if every column was dropped, fall back to the masked but undropped copy
    if filtered.shape[1] == 0:
        filtered = backup

    return tempSpec, filtered

In [None]:
def wavsToSpectrogramByList(path, filenames, dontFilter=False):
    """Bulk-convert the wav files found in a directory into spectrograms.

    Parameters:
      path        the source path
      filenames   the filenames in the path
      dontFilter  if True, the unfiltered spectrogram of each file is collected
                  instead of the filtered one

    Return value:
      list with one spectrogram per entry of filenames, in the same order

    Note: only the sample data returned by wavfile.read is used; the sample rate
    stored in the file is ignored.
    """
    print("wavsToSpectrogramByList...")
    data = []
    for filename in filenames:
        wav_path = os.path.join(path, filename)
        # end='' keeps the cursor on the line so the leading '\r' overwrites the
        # previous progress message; the former trailing comma after print(...)
        # was a Python 2 idiom that has no effect in Python 3.
        print('\r    Processing {}'.format(wav_path), end='', flush=True)
        samples = io.wavfile.read(wav_path)[1]
        tempSpecUnfiltered, tempSpecFiltered = audioToFilteredSpectrogram(samples, expandByOne=True)
        data.append(tempSpecUnfiltered if dontFilter else tempSpecFiltered)
    print("\nwavsToSpectrogramByList finished")
    return data

In [None]:
def spectrogramListToT4_X(slist, N=spectrogramWindowLength):
    """Cut spectrograms into fixed-width windows and stack them into one array.

    Parameters:
      slist   the spectrogram list generated by wavsToSpectrogramByList; every
              entry is a 2-D array and must have as many rows as slist[0]
      N       window width in timeslices (defaults to spectrogramWindowLength)

    Return value:
      X       the constructed input, shape (number_of_windows, 1, rows, N)

    Windowing rules per spectrogram:
      * shorter than N timeslices: one window, right-padded with zeros
      * otherwise: consecutive windows of N timeslices; a shorter remainder is
        replaced by the last N timeslices, so it overlaps the previous window
    """
    print("SpectrogramListToT4_X start...")

    rows = len(slist[0])
    # Windows are collected in a list and stacked once at the end; growing X with
    # np.vstack inside the loop copied the whole array again for every window.
    windows = []

    # process all spectrograms
    for i, spec in enumerate(slist):
        print('\r    Processing no. %d / %d' % (i, len(slist)))
        n_frames = len(spec[0])
        ranges = np.hstack((np.arange(0, n_frames, N), n_frames))

        for j in range(len(ranges)-1):
            tempSpec = np.empty((1, rows, N))

            if n_frames < N:  # data shorter than N: fill up with zeros
                tempSpec[0] = np.hstack((spec, np.zeros((rows, N - n_frames))))
            elif ranges[j+1] - ranges[j] < N:  # short remainder: take the last N timeslices
                tempSpec[0] = spec[:, -N:]
            else:  # full-width window
                tempSpec[0] = spec[:, ranges[j]:ranges[j+1]]

            windows.append(tempSpec)

    print("SpectrogramListToT4_X finished!")
    if not windows:
        # no window was produced (every spectrogram had zero timeslices)
        return np.empty((0, 1, rows, N))
    return np.stack(windows)

#### vvvvv 載入音檔並轉換

In [None]:
#M: to test other files later, just re-run the notebook from this cell downwards

fileName = '小雨蛙_1349748532197m-l.wav' #M: result: 小雨蛙 <== correct
wavdirpath = '../data/wav/'

In [None]:
# wavsToSpectrogramByList expects a list of file names; fileList was never
# defined in this notebook, so build it from the single file selected above.
fileList = [fileName]
tempSG = wavsToSpectrogramByList(wavdirpath, fileList, dontFilter=False)
# cut the spectrograms into fixed-width windows for the model
X = spectrogramListToT4_X(tempSG, N=spectrogramWindowLength)
print(type(X), X.shape)

In [None]:
# Run the loaded model on all windows in X.
# NOTE(review): presumably result has one row of class scores per window -- confirm.
result = model.predict(X)

In [None]:
# For every row of result, take the column index of its largest value.
result_argmax = np.argmax(result, axis=1)
# bare expression: displays the indices as the notebook cell output
result_argmax

In [None]:
# Distinct predicted class indices; a set has no guaranteed order, so the
# numbering printed below is not a ranking by likelihood.
sp_set = set(result_argmax)
# prints the header "Species that may be present in this recording:"
print('這段音檔裡有可能的物種有:')
for i, sp in enumerate(sp_set):
    # Series.as_matrix() was removed in pandas 1.0; .values returns the same
    # numpy array and exists in old and new pandas versions alike.
    sp_CName = sp_map[sp_map['classId']==classIdList[sp]].chineseName.values[0]
    print('  第 {} 可能為: {}'.format(i+1, sp_CName))