In [None]:
# -*- coding: utf-8 -*-
"""
ryPrepareDataset00.py
ryPrepareDataset02.py
+ryAddSilenceInTrain(allFiles)
"internet" ==> sd_GSCmdV2/ ==> fn= 'ryGsc_sil1.npz'

#--------------------
# ryPrepareDataset03.py
#--------------------

"""
# In[]
from tqdm import tqdm
import requests
import math
import os
import tarfile
import numpy as np
import librosa
import pandas as pd
import time

#import audioUtils

def _downloadFile(url, fName):
    """
    Stream ``url`` into the local file ``fName`` while showing a tqdm bar.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    fName : str
        Path of the file to write (binary mode; an existing file is
        overwritten).

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status, so that an error page
        is never stored under the archive's name.

    Notes
    -----
    A mismatch between the number of bytes written and the announced
    ``content-length`` is only reported with a printed message.
    """
    block_size = 1024

    print('Downloading {} into {}'.format(url, fName))

    # Streaming, so we can iterate over the response instead of holding the
    # whole body in memory. The "with" releases the connection afterwards.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()

        # Total size in bytes (0 when the server does not announce it).
        total_size = int(r.headers.get('content-length', 0))

        # True division is required here: with floor division the ceil() is
        # a no-op and a final partial block is missing from the bar's total.
        n_blocks = math.ceil(total_size / block_size)

        wrote = 0
        with open(fName, 'wb') as f:
            for data in tqdm(r.iter_content(block_size),
                             total=n_blocks,
                             unit='KB',
                             unit_scale=True):
                wrote += len(data)
                f.write(data)

    if total_size != 0 and wrote != total_size:
        print("ERROR, something went wrong")
        
def _extractTar(fname, folder):
    """
    Unpack the tar archive ``fname`` into the directory ``folder``.

    Names ending in "tar.gz" are opened as gzip-compressed archives and names
    ending in "tar" as uncompressed ones. For any other name only the
    "Extracting ..." message is printed and nothing is unpacked.

    NOTE(review): ``extractall`` trusts the member paths stored inside the
    archive; only use this on archives from a trusted source.
    """
    print('Extracting {} into {}'.format(fname, folder))

    if fname.endswith("tar.gz"):
        mode = "r:gz"
    elif fname.endswith("tar"):
        mode = "r:"
    else:
        return

    # The context manager closes the archive even when extraction fails.
    with tarfile.open(fname, mode) as tar:
        tar.extractall(path=folder)

def _DownloadGoogleSpeechCmdV2(basePath, forceDownload= False):
    """
    Download and unpack the Google Speech Commands dataset, version 2.

    Two archives are handled inside ``basePath``: ``test.tar.gz`` is unpacked
    into the ``test`` sub-folder and ``train.tar.gz`` into the ``train``
    sub-folder.

    Parameters
    ----------
    basePath : str
        Directory holding the archives and the extracted folders. It is
        created when missing.
    forceDownload : bool
        When True, both archives are downloaded again even if present.

    Notes
    -----
    An archive is also fetched when neither the archive nor its extracted
    folder exists, so a ``basePath`` left half-populated by an interrupted
    run is completed instead of failing on a missing archive. A folder that
    already exists is never unpacked again.
    """
    dataUrl01= 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
    dataUrl02= 'http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz'

    # (url, archive path, extraction folder); the test set comes first.
    parts= [(dataUrl02,
             os.path.join(basePath, 'test.tar.gz'),
             os.path.join(basePath, 'test')),
            (dataUrl01,
             os.path.join(basePath, 'train.tar.gz'),
             os.path.join(basePath, 'train'))]

    os.makedirs(basePath, exist_ok=True)

    # Pass 1: download what is forced or missing.
    downloaded= False
    for url, archive, folder in parts:
        present= os.path.isdir(folder) or os.path.isfile(archive)
        if forceDownload or not present:
            _downloadFile(url, archive)
            downloaded= True

    if not downloaded:
        print('Google Speech commands dataset version 2 already exists. Skipping download.')

    # Pass 2: extract every archive whose target folder does not exist yet.
    for _, archive, folder in parts:
        if not os.path.isdir(folder):
            _extractTar(archive, folder)
        
#--------------------------        
# Root folder of the dataset. Keep the trailing slash: the paths below are
# built by plain string concatenation.
basePath= '../ryDatasets/gscV2/'
_DownloadGoogleSpeechCmdV2(basePath, forceDownload= False)



# In[]
# Build the file lists of the three splits.
# The two list files name the test and validation wavs (relative to train/);
# every other wav found under train/ is treated as training data.
trainDir= basePath+'train/'

testWAVs= pd.read_csv(trainDir+'testing_list.txt', sep=" ", header=None)[0].tolist()
valWAVs=  pd.read_csv(trainDir+'validation_list.txt', sep=" ", header=None)[0].tolist()

testWAVs= [os.path.join(trainDir, name) for name in testWAVs if name.endswith('.wav')]
valWAVs=  [os.path.join(trainDir, name) for name in valWAVs if name.endswith('.wav')]

# Collect every wav below train/. Joining with '/' keeps these paths
# textually comparable with the ones built above, which the set
# difference below relies on.
allWAVs= []
for root, dirs, files in os.walk(trainDir):
    allWAVs.extend(root+'/'+name for name in files if name.endswith('.wav'))

# NOTE: going through sets means the order of trainWAVs is arbitrary.
trainWAVs= list( set(allWAVs)-set(valWAVs)-set(testWAVs) )

# Sizes of: test, validation, train, everything.
info= '{},{},{},{}'.format(
        *(len(wavs) for wavs in (testWAVs, valWAVs, trainWAVs, allWAVs)))
print(info)
# In[]

# Folder / category name -> integer class id.
# Id 0 is shared by everything that is not one of the 35 command words:
# the unknown / silence names and the '_background_noise_' folder.
ryGscDict= {
    'unknown': 0,
    'silence': 0,
    '_unknown_': 0,
    '_silence_': 0,
    '_background_noise_': 0,

    # digits
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'zero': 10,

    # commands
    'yes': 11, 'no': 12,
    'go': 13, 'stop': 14,
    'on': 15, 'off': 16,
    'up': 17, 'down': 18,
    'left': 19, 'right': 20,
    'forward': 21, 'backward': 22,

    # remaining words
    'marvin': 23, 'sheila': 24,
    'dog': 25, 'cat': 26,
    'bird': 27, 'bed': 28,
    'happy': 29, 'house': 30,
    'learn': 31, 'follow': 32,
    'tree': 33, 'visual': 34,
    'wow': 35,
}

# Inverse lookup: ryGscList[i] is the display name of class id i
# (id 0 is shown as '_silence_').
ryGscList= [
    '_silence_',
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'zero',
    'yes', 'no',
    'go', 'stop',
    'on', 'off',
    'up', 'down',
    'left', 'right',
    'forward', 'backward',
    'marvin', 'sheila',
    'dog', 'cat',
    'bird', 'bed',
    'happy', 'house',
    'learn', 'follow',
    'tree', 'visual',
    'wow',
]

def _getFileCategory(file, catDict):
    """
    Map a wav path to its integer class id.

    The category is the name of the folder that directly contains ``file``
    (the CAT part of ``.../train/CAT/filename``). Returns ``catDict[CAT]``,
    or 0 when that folder name is not a key of ``catDict``.
    """
    parentDir, _ = os.path.split(file)
    _, categ = os.path.split(parentDir)
    return catDict[categ] if categ in catDict else 0

# Integer class id of every file, one list per split.
testWAVlabels, valWAVlabels, trainWAVlabels= (
        [_getFileCategory(wav, ryGscDict) for wav in wavs]
        for wavs in (testWAVs, valWAVs, trainWAVs))

# Background noise should be used for validation as well, so keep a separate
# list of the training files whose id equals the 'silence' id.
bgWAVs= [wav
         for wav, label in zip(trainWAVs, trainWAVlabels)
         if label == ryGscDict['silence']]
bgWAVlabels= [ryGscDict['_silence_'] for _ in bgWAVs]


# In[]
# Per split: a {wav path: class id} lookup ...
testWAVlabelsDict=  {wav: label for wav, label in zip(testWAVs, testWAVlabels)}
valWAVlabelsDict=   {wav: label for wav, label in zip(valWAVs, valWAVlabels)}
trainWAVlabelsDict= {wav: label for wav, label in zip(trainWAVs, trainWAVlabels)}

bgWAVlabelsDict=    {wav: label for wav, label in zip(bgWAVs, bgWAVlabels)}


# ... bundled with the ordered file list of that split.
gscInfo= dict(
    train= dict(files= trainWAVs, labels= trainWAVlabelsDict),
    val=   dict(files= valWAVs,   labels= valWAVlabelsDict),
    test=  dict(files= testWAVs,  labels= testWAVlabelsDict),
    bg=    dict(files= bgWAVs,    labels= bgWAVlabelsDict),
)

# Per-split aliases of the entries above.
trainInfo= gscInfo['train']
valInfo=   gscInfo['val']
testInfo=  gscInfo['test']
bgInfo=    gscInfo['bg']

# (split name, number of files) for a quick sanity check.
info= [(name, len(split['files'])) for name, split in gscInfo.items()]
print(info)

# In[]
# In[]
import librosa
import pylab as pl

#import sounddevice as sd

# Data visualization for 'train': waveforms of (up to) the first 100 files
# in a 10x10 grid, each titled with its class name.
s='train'

# Materialize the (path, label) pairs once. Calling list(keys()) and
# list(values()) inside the loop copied the whole mapping on every iteration.
wavItems= list(gscInfo[s]['labels'].items())

# Shape and sampling rate of one example (the last file).
fn= wavItems[-1][0]
x, sr= librosa.load(fn, sr=None)

print(f'x.shape= {x.shape}, sr= {sr}')

pl.figure(figsize=(10,10))
# Slicing instead of range(100) also copes with fewer than 100 files.
for i, (fn, label) in enumerate(wavItems[:100]):

    x, sr= librosa.load(fn, sr=None)   # sr=None keeps the native rate

    c= ryGscList[label]

    pl.subplot(10,10,i+1)
    pl.title(c)
    pl.plot(x)

# In[]
    
# Data visualization for 'bg': one stacked subplot per background file.

s='bg'

# Shape and sampling rate of one example (the last file).
fn= list(gscInfo[s]['labels'])[-1]
x, sr= librosa.load(fn, sr=None)

print(f'x.shape= {x.shape}, sr= {sr}')


nWav= len(gscInfo[s]['labels'])

pl.figure(figsize=(10,10))
for i, (fn, label) in enumerate(gscInfo[s]['labels'].items()):

    x, sr= librosa.load(fn, sr=None)   # sr=None keeps the native rate

    c= ryGscList[label]

    pl.subplot(nWav,1,i+1)
    pl.title(c)
    pl.plot(x)
# In[]
# In[]
# load all data into memory
# Some files are not exactly 1 sec long; normalize them to 1 sec = 16000 samples.
def ryLengthNormalize(x, length=16000):
    """
    Return a version of the 1-D signal ``x`` with exactly ``length`` samples.

    * ``len(x) == length``: ``x`` itself is returned (no copy).
    * ``len(x) >  length``: a window of ``length`` samples cut at a random
      offset.
    * ``len(x) <  length``: ``x`` is placed at a random offset inside a buffer
      of very low-level noise (uniform random values scaled by 1e-10).

    Random offsets come from ``np.random``, so the result depends on NumPy's
    global random state.
    """
    nExtra= len(x) - length
    if nExtra == 0:
        return x

    if nExtra > 0:
        # +1 because randint's upper bound is exclusive; without it the last
        # valid offset could never be chosen.
        randPos= np.random.randint(nExtra + 1)
        return x[randPos:randPos+length]

    # Shorter than requested: embed x in a noise floor.
    randPos= np.random.randint(-nExtra + 1)
    X= np.random.random(length)*1e-10

    # Keep the float dtype of the input, so that stacking padded and
    # unpadded clips does not promote the whole stack to float64.
    dtype= getattr(x, 'dtype', None)
    if dtype is not None and np.issubdtype(dtype, np.floating):
        X= X.astype(dtype)

    X[randPos:randPos+len(x)]= x
    return X


# In[]
    
# Load the waveforms and labels of each split into memory.
xLL, yLL= [], []

for s in ('val', 'test', 'train'):
    aL= gscInfo[s]['files']

    # One fixed-length row per file (native sampling rate, forced to 1 sec).
    xL= []
    for fn in tqdm(aL):
        x, sr= librosa.load(fn, sr= None)
        x= ryLengthNormalize(x)
        xL.append(x)
    xL= np.vstack(xL)
    xLL.append(xL)

    # Labels are read from the dict's values, i.e. in insertion order; this
    # is assumed to match the order of 'files' -- verify where gscInfo is built.
    yL= np.array(list(gscInfo[s]['labels'].values()))
    yLL.append(yL)

(x_val, x_test, x_train), (y_val, y_test, y_train)= xLL, yLL


# In[]
# The silence / background recordings are much longer than 1 sec,
# so cut them into several 1-sec sounds.
bgFiles= gscInfo['bg']['files']

def rySplitSilenceIn1SecSoundList(bgFiles, segLen= 16000):
    """
    Cut every file in ``bgFiles`` into consecutive, non-overlapping segments.

    Parameters
    ----------
    bgFiles : list of str
        Paths of the audio files; each is loaded with ``librosa.load`` at
        its native sampling rate.
    segLen : int
        Samples per segment. The default of 16000 equals 1 sec only for
        16 kHz audio -- TODO confirm every input is 16 kHz.

    Returns
    -------
    list of arrays with ``segLen`` samples each. A trailing remainder
    shorter than ``segLen`` is dropped.
    """
    silenceL= []
    for fn in bgFiles:
        x= librosa.load(fn, sr=None)[0]
        # Integer division keeps a last segment that fits exactly; the
        # former strict "<" comparison discarded it.
        for t in range(x.size // segLen):
            silenceL.append(x[t*segLen:(t+1)*segLen])

    return silenceL

silenceL= rySplitSilenceIn1SecSoundList(bgFiles)
x_bg= silenceL= np.vstack(silenceL)
y_bg= np.zeros(len(silenceL))    # every segment gets class id 0

# Training set extended with the silence segments.
x_trainWithSil=  np.vstack((x_train, x_bg))
y_trainWithSil=  np.concatenate((y_train, y_bg))

# In[]
# Every sample matrix must have exactly one label per row.
assert len(x_train)        == len(y_train)
assert len(x_val)          == len(y_val)
assert len(x_test)         == len(y_test)
assert len(x_trainWithSil) == len(y_trainWithSil)

# Waveforms as float32, labels as integers.
x_trainWithSil= x_trainWithSil.astype(np.float32)
x_test=         x_test.astype(np.float32)
x_val=          x_val.astype(np.float32)

y_trainWithSil= y_trainWithSil.astype(int)
y_test=         y_test.astype(int)
y_val=          y_val.astype(int)


# In[]
import time
import os

t0= time.time()

#basePath= '../ryDatasets/gscV2/'

# Write all splits into one compressed archive, unless it already exists.
fn= 'gscV2_data.npz'
arrays= {
    'x_trainWithSil': x_trainWithSil,
    'y_trainWithSil': y_trainWithSil,
    'x_val':          x_val,
    'y_val':          y_val,
    'x_test':         x_test,
    'y_test':         y_test,
}
if not os.path.isfile(basePath+fn):
    np.savez_compressed(basePath+fn, **arrays)

# NOTE: the timing is printed even when the save was skipped.
dt= time.time()-t0
print(f'np.savez_compressed(), fn= {fn}, dt(sec)= {dt:.2f}')

# np.savez_compressed(), dt(sec)= 778.8924231529236

# In[]
# In[]
# In[]
# In[]
# In[]
if __name__=='__main__':
    # Sanity check: read the archive back and time it.
    t0= time.time()

    #basePath= '../ryDatasets/gscV2/'
    fn= 'gscV2_data.npz'

    # Read every array inside the "with" so the .npz file handle is closed
    # afterwards instead of staying open.
    with np.load(basePath+fn) as z:
        # x_train / y_train now refer to the set that includes silence.
        x_train=    z['x_trainWithSil']
        y_train=    z['y_trainWithSil']
        x_val=      z['x_val']
        y_val=      z['y_val']
        x_test=     z['x_test']
        y_test=     z['y_test']

    dt= time.time()-t0
    print(f'np.load({basePath+fn}), dt(sec)= {dt:.3f}')



# In[]


Google Speech commands dataset version 2 already exists. Skipping download.
11005,9981,84849,105835
[('train', 84849), ('val', 9981), ('test', 11005), ('bg', 6)]
x.shape= (16000,), sr= 16000
x.shape= (980062,), sr= 16000


100%|██████████████████████████████████████████████████████████████████████████████| 9981/9981 [03:08<00:00, 53.08it/s]
100%|████████████████████████████████████████████████████████████████████████████| 11005/11005 [03:42<00:00, 49.55it/s]
  6%|████▋                                                                        | 5130/84849 [02:01<33:29, 39.68it/s]

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 21 23:44:17 2019
@author: renyu

functionalKeras005_spchCmdNmelSpec.py
ryLab00.py

ryTrainModel03.py

"""
# In[]
#import ryPrepareDataset00

# In[]
import numpy as np
import time

basePath= '../ryDatasets/gscV2/'
fn= 'gscV2_data.npz'


t0= time.time()

# Read every array inside the "with" so the .npz file handle is closed
# afterwards instead of staying open for the rest of the session.
with np.load(basePath+fn) as z:
    # The training arrays are the ones stored with the silence segments.
    x_train=    z['x_trainWithSil']
    y_train=    z['y_trainWithSil']
    x_val=      z['x_val']
    y_val=      z['y_val']
    x_test=     z['x_test']
    y_test=     z['y_test']

# File that receives the trained model.
fnModel= 'ryModel.hdf5'

print(".... z= np.load({}) will train into {}".format(fn, fnModel))


# In[]
import tensorflow as tf

def ryFeature(x,
              sample_rate= 16000,

              frame_length= 1024,
              frame_step=    128,

              num_mel_bins=     128,
              lower_edge_hertz= 20,
              upper_edge_hertz= 16000/2,

              mfcc_dim= 13
              ):
    """
    Compute several spectral representations of ``x`` with ``tf.signal``.

    Parameters
    ----------
    x : waveform(s), passed as the signal argument of ``tf.signal.stft``.
    sample_rate : sampling rate in Hz, used for the mel filterbank.
    frame_length, frame_step : STFT window length and hop, in samples.
    num_mel_bins, lower_edge_hertz, upper_edge_hertz : mel filterbank layout.
    mfcc_dim : number of leading MFCC coefficients to keep.

    Returns
    -------
    dict of tensors:
        'spectrogram'          magnitude of the STFT
        'log_spectrogram'      log(spectrogram + 1e-10)
        'log_mel_spectrogram'  log(mel-warped spectrogram + 1e-10)
        'mfcc'                 first ``mfcc_dim`` MFCCs of the log-mel spectrogram
    """
    eps= 1e-10   # stabilizes the logarithms where the magnitude is zero

    stft_out= tf.signal.stft(x,
                             frame_length= frame_length,
                             frame_step=   frame_step,
                             pad_end=      True)
    magnitude= tf.abs(stft_out)

    # Warp the linear-frequency magnitudes onto the mel scale.
    mel_matrix= tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=         num_mel_bins,
        num_spectrogram_bins= stft_out.shape[-1],
        sample_rate=          sample_rate,
        lower_edge_hertz=     lower_edge_hertz,
        upper_edge_hertz=     upper_edge_hertz)

    mel_magnitude= tf.tensordot(magnitude, mel_matrix, 1)
    # Restore the static shape: leading dims of the input, mel bins last.
    mel_magnitude.set_shape(
        magnitude.shape[:-1].concatenate(mel_matrix.shape[-1:]))

    log_mel= tf.math.log(mel_magnitude + eps)

    return {
        'mfcc': tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :mfcc_dim],
        'log_mel_spectrogram': log_mel,
        'log_spectrogram':     tf.math.log(magnitude + eps),
        'spectrogram':         magnitude,
    }


'''
batch_size= 1000  # 預防 gpu memory 不夠， 分批作業 
x= x_train[0:batch_size].astype(np.float32)
X= ryFeature(x)['log_mel_spectrogram']
X= X.numpy()
'''


# In[]

import time

import tensorflow as tf


def get_all_fearure(all_x, batch_size= 1000):
    """
    Convert a stack of waveforms into log-mel spectrograms, batch by batch.

    ``all_x`` is cast to float32 and passed to ``ryFeature`` in slices of at
    most ``batch_size`` rows (batching guards against running out of GPU
    memory). The 'log_mel_spectrogram' outputs are concatenated along the
    first axis and returned as a single float32 numpy array. The result shape
    and the elapsed time are printed.
    """
    t0= time.time()

    x= all_x.astype(np.float32)
    nRows= x.shape[0]

    XL= []
    start= 0
    while start < nRows:
        # The last batch may be shorter than batch_size.
        stop= min(start + batch_size, nRows)

        X= ryFeature(x[start:stop])['log_mel_spectrogram']
        XL.append(X.numpy().astype(np.float32))

        start += batch_size

    XL= np.concatenate(XL)
    print('XL.shape={}'.format(XL.shape))

    dt= time.time()-t0
    print('tf.signal.stft, 執行時間 dt= {}'.format(dt))

    return XL
# In[]
print('.... get_all_fearure() .... ')

t0= time.time()

# Features of each split, computed in the order test, validation, train.
X_test, X_val, X_train= (get_all_fearure(part)
                         for part in (x_test, x_val, x_train))

dt= time.time()- t0
print('... get_all_fearure() ... dt(sec)= {:.3f}'.format(dt))

### timings noted from earlier runs:
### get_all_fearure() ... dt(sec)= 36.128026723861694
### get_all_fearure() ... dt(sec)= 52.950


# In[]

# Shape of one feature map: (time frames, frequency bins).
nTime, nFreq= X_train.shape[1:]

zzz='''
nTime, nFreq= (125, 128)
'''

# In[]
def normalize(x, axis= None):
    """
    Standardize ``x`` to zero mean and unit standard deviation.

    Parameters
    ----------
    x : numpy array
    axis : None, int or tuple of ints
        None (default) uses the mean/std of the whole array. Otherwise the
        statistics are taken along ``axis`` and every slice is standardized
        with its own mean/std.

    Returns
    -------
    A new array of the same shape; ``x`` is not modified in place.

    NOTE: a zero standard deviation is not guarded against and results in a
    division by zero.
    """
    if axis is None:
        return (x - x.mean()) / x.std()

    # keepdims keeps the reduced axes as size-1 dimensions, so the statistics
    # broadcast against x for any axis and not only for axis 0.
    mean= x.mean(axis= axis, keepdims= True)
    std=  x.std(axis= axis, keepdims= True)
    return (x - mean) / std

# In[]
print('.... normalize() ....')

# Append a channel axis of size 1: (N, nTime, nFreq) -> (N, nTime, nFreq, 1).
X_train= X_train.reshape((-1, nTime, nFreq, 1)).astype(np.float32)
X_val=   X_val.reshape((-1, nTime, nFreq, 1)).astype(np.float32)
X_test=  X_test.reshape((-1, nTime, nFreq, 1)).astype(np.float32)
#X_testREAL=  X_testREAL.reshape( -1, nTime, nFreq, 1).astype('float32')

# (original note: this seems to be done twice?!)
# Each set is standardized with its own global mean/std, taken over all
# of its utterances.
X_train= normalize(X_train)   #, axis=0)
X_val=   normalize(X_val)     #, axis=0)
X_test=  normalize(X_test)    #, axis=0)
#X_testREAL=  normalize(X_testREAL)#, axis=0)

# In[]

import tensorflow as tf

# Reset the Keras backend session before the model is built.
tf.keras.backend.clear_session()  
# For easy reset of notebook state.

from tensorflow              import keras
from tensorflow.keras        import layers, Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import AveragePooling1D

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# In[]






# In[]
# Number of classes = number of distinct labels present in y_train.
nCategs= len(set(y_train)) #36


# Small CNN, built with the functional API.
x= Input(shape= (nTime, nFreq, 1))

h= x

# Two convolution stages: conv -> 4x4 max-pool -> dropout.
for nFilters, kernelSize in ((8, (16,16)), (16, (8,8))):
    h= Conv2D(nFilters, kernelSize, activation='relu', padding='same')(h)
    h= MaxPooling2D((4,4), padding='same')(h)
    h= Dropout(0.2)(h)

# Classifier head.
h= Flatten()(h)

h= Dense(256,  activation='relu')(h)
h= Dropout(0.2)(h)

h= Dense(nCategs,  activation='softmax')(h)

y= h

m= Model(inputs= x, outputs= y)

m.summary()



# In[]
#keras.utils.plot_model(m, 'm.png', show_shapes=True)



# In[]
# Labels are plain integers, hence the sparse cross-entropy loss.
# No optimizer is passed, so Keras falls back to its default one.
m.compile(
        metrics= ['accuracy'],
        loss=    'sparse_categorical_crossentropy')


# Stop when val_loss has not improved for 10 epochs in a row.
es= EarlyStopping(
        monitor=   'val_loss',
        mode=      'min',
        min_delta= 1e-10,
        patience=  10,
        verbose=   1)


# Save to fnModel whenever val_accuracy reaches a new maximum.
mc= ModelCheckpoint(fnModel,
        monitor=    'val_accuracy',
        mode=       'max',
        save_best_only= True,
        verbose=    1)

t0= time.time()

# NOTE: h is reused here; from now on it holds the training history.
h= m.fit(X_train, y_train,
        validation_data= (X_val, y_val),
        callbacks=  [es, mc],
        epochs=     100,
        batch_size= 500)


dt= time.time()- t0
print('... h= m.fit() ... dt(sec)= {}'.format(dt))

# In[]
import numpy as np
from matplotlib import pyplot as pl
# Training vs. validation accuracy per epoch.
v0, v1= (h.history[key] for key in ('accuracy', 'val_accuracy'))

pl.plot(v0, label='accuracy')
pl.plot(v1, label='val_accuracy')
pl.grid('on')
pl.legend()
pl.show()
#keras.utils.plot_model(m, 'm.png', show_shapes=True)

# In[]


# Score the trained model on the test split.
m.evaluate(x= X_test, y= y_test, verbose= 2)


# In[]


# In[]

## for version.002

print(' ~~~ simulation session ended ~~~')

# In[]
import numpy as np
from tensorflow.keras.models import load_model
import sounddevice as sd

#import audioUtils


# Class id -> word; the position in the list is the class id
# (id 0 is shown as '_silence_').
ryGscList= [
    '_silence_',
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'zero',
    'yes', 'no',
    'go', 'stop',
    'on', 'off',
    'up', 'down',
    'left', 'right',
    'forward', 'backward',
    'marvin', 'sheila',
    'dog', 'cat',
    'bird', 'bed',
    'happy', 'house',
    'learn', 'follow',
    'tree', 'visual',
    'wow',
]


labels= ryGscList


# Load the model file written by the checkpoint callback during training.
model= load_model(fnModel)


def predict(x):
    """
    Return the label with the highest model output for the first row of ``x``.

    ``x`` is passed unchanged to ``model.predict``; only the first row of
    the result is used and its arg-max index is looked up in ``labels``.
    """
    scores= model.predict(x)
    return labels[np.argmax(scores[0])]

def recWav(x, featureOut= False):
    """
    Recognize the word in one waveform.

    ``x`` is flattened, converted to a log-mel spectrogram with ``ryFeature``,
    standardized with its own mean/std (normalization over this single
    utterance only) and passed to ``predict`` as a batch of one.

    Returns the predicted label, or ``(label, X)`` with the normalized
    feature matrix ``X`` when ``featureOut`` is True.
    """
    wave= x.flatten()

    X= ryFeature(wave)['log_mel_spectrogram'].numpy().astype(np.float32)
    X= normalize(X)

    # Add batch and channel axes: (nTime, nFreq) -> (1, nTime, nFreq, 1).
    nRows, nCols= X.shape
    y= predict(X.reshape(1, nRows, nCols, 1))

    return (y, X) if featureOut == True else y


# In[]
    
#import numpy as np
#from tensorflow.keras.models import load_model
import sounddevice as sd

import pylab as pl 
from tqdm import tqdm



# Run the single-utterance recognizer over the first 1000 test clips and
# collect a description of every mistake.
nWrong= 0
wrongL= []   # indices of the misrecognized clips

infoL= []    # one text line per mistake
yL= []       # every recognized label, in order
for n, (x, yI) in enumerate(tqdm(zip(x_test[:1000], y_test[:1000]))):

    yAns= labels[yI]

    x= x.astype(np.float32)

    # The accuracy will differ slightly from m.evaluate() because each clip
    # is normalized on its own here.
    y= recWav(x)
    yL.append(y)

    if y == yAns:
        continue

    nWrong += 1
    wrongL.append(n)

    info= '''n= {:05d}, nWrong= {:05d}, wer= {:.5f}, y=【{}】, yAns= [{}]'''.format(
             n, nWrong, nWrong/(n+1), y, yAns)
    infoL.append(info)

# Number of clips processed.
n= len(yL)
# In[]    
# Overall error rate and accuracy, appended as the last report line.
info= '''n= {:05d}, nWrong= {:05d}, wer= {:.5f}, acc= {:.5f}'''.format(
         n, nWrong, nWrong/n, 1-nWrong/n)

infoL.append(info)

print(info)

# Write the report, one entry per line.
fnInfo= 'infoL_test.txt'
with open(fnInfo,'w') as fp:
    fp.writelines(line + '\n' for line in infoL)




# In[]

# Interactive test: record 1-sec clips from the microphone and recognize them.
print('''##########
a Real-time Test..., 
press 【Enter】and speak out within 1 sec      
the words are in the list of 35 words: 

###################
{}
###################

PS: (you cannot say '_silence_', 
it just for "silence" 
or "no sound" 
or "background noise") 
'''.format(labels))



T=  1      # duration of one recording, in seconds
fs= 16000  # sample rate

xL= []     # every recorded clip, in order
for i in range(100):

    aKey= input('{}\n{}\n'.format(
                'press "q" to quit',
                'or another key to record 1 sec speech...'))
    if aKey=='q':
        print('~~~the end~~~')
        break

    x= sd.rec(int(T*fs),
              dtype=      'float32',
              channels=   1,
              samplerate= fs)
    sd.wait()  # block until the recording is finished

    y= recWav(x)
    print('y= 【{}】'.format(y))

    xL.append(x)

    # Saying "stop" ends the session as well.
    if y=='stop':
        break

# In[]
#import pickle
import compress_pickle as cpk

# Store the recorded clips with compress_pickle and read them straight back.
# NOTE(review): the compression is presumably chosen from the '.gz' suffix --
# verify against the compress_pickle documentation.
fn='rySp.gz'
cpk.dump(xL, fn)
xL= cpk.load(fn)

# In[]
    
print('... ry: Good Luck ...')












































In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 21 23:44:17 2019
@author: renyu

functionalKeras005_spchCmdNmelSpec.py
ryLab00.py

ryLab01.py   ... 
ryLab01_1.py ...

ryRecog03.py ...

第一次用 CNN 做出 能辨識 35 個 英文詞 的 語音辨識系統 ...

"""
# In[]


# In[]
#
import time

import numpy as np

import tensorflow as tf

from tensorflow.keras.models import load_model

import sounddevice as sd


# In[]


def ryFeature(x,
              sample_rate= 16000,

              frame_length= 1024,
              frame_step=    128,

              num_mel_bins=     128,
              lower_edge_hertz= 20,
              upper_edge_hertz= 16000/2,

              mfcc_dim= 13
              ):
    """
    Compute several spectral representations of ``x`` with ``tf.signal``.

    Parameters
    ----------
    x : waveform(s), passed as the signal argument of ``tf.signal.stft``.
    sample_rate : sampling rate in Hz, used for the mel filterbank.
    frame_length, frame_step : STFT window length and hop, in samples.
    num_mel_bins, lower_edge_hertz, upper_edge_hertz : mel filterbank layout.
    mfcc_dim : number of leading MFCC coefficients to keep.

    Returns
    -------
    dict of tensors:
        'spectrogram'          magnitude of the STFT
        'log_spectrogram'      log(spectrogram + 1e-10)
        'log_mel_spectrogram'  log(mel-warped spectrogram + 1e-10)
        'mfcc'                 first ``mfcc_dim`` MFCCs of the log-mel spectrogram
    """
    eps= 1e-10   # stabilizes the logarithms where the magnitude is zero

    stft_out= tf.signal.stft(x,
                             frame_length= frame_length,
                             frame_step=   frame_step,
                             pad_end=      True)
    magnitude= tf.abs(stft_out)

    # Warp the linear-frequency magnitudes onto the mel scale.
    mel_matrix= tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=         num_mel_bins,
        num_spectrogram_bins= stft_out.shape[-1],
        sample_rate=          sample_rate,
        lower_edge_hertz=     lower_edge_hertz,
        upper_edge_hertz=     upper_edge_hertz)

    mel_magnitude= tf.tensordot(magnitude, mel_matrix, 1)
    # Restore the static shape: leading dims of the input, mel bins last.
    mel_magnitude.set_shape(
        magnitude.shape[:-1].concatenate(mel_matrix.shape[-1:]))

    log_mel= tf.math.log(mel_magnitude + eps)

    return {
        'mfcc': tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :mfcc_dim],
        'log_mel_spectrogram': log_mel,
        'log_spectrogram':     tf.math.log(magnitude + eps),
        'spectrogram':         magnitude,
    }


# In[]
def normalize(x):
    """
    Return ``x`` shifted to zero mean and scaled to unit standard deviation.

    Mean and standard deviation are taken over the whole array; a new array
    is returned and ``x`` is left unchanged.
    """
    centered= x - x.mean()
    return centered / x.std()



# In[]
def predict(x, withProb= False):
    """
    Return the label with the highest model output for the first row of ``x``.

    ``x`` is passed unchanged to ``NNmodel.predict``; only the first row of
    the result is used and its arg-max index is looked up in ``LabelDic``.

    With ``withProb`` set to True the result is the tuple
    ``(label, highest output value)`` instead of the bare label.
    """
    firstRow= NNmodel.predict(x)[0]
    y= LabelDic[np.argmax(firstRow)]

    if withProb == True:
        return (y, np.max(firstRow))
    return y

def recWav(x, featureOut= False, withProb= False):
    """
    Recognize the word in one waveform.

    ``x`` is flattened, converted to a log-mel spectrogram with ``ryFeature``,
    standardized with its own mean/std and passed to ``predict`` as a batch
    of one. ``withProb`` is forwarded to ``predict``.

    Returns the result of ``predict``, or ``(result, X)`` with the normalized
    feature matrix ``X`` when ``featureOut`` is True.
    """
    wave= x.flatten()

    X= ryFeature(wave)['log_mel_spectrogram'].numpy().astype(np.float32)
    X= normalize(X)

    # Add batch and channel axes: (nTime, nFreq) -> (1, nTime, nFreq, 1).
    nRows, nCols= X.shape
    y= predict(X.reshape(1, nRows, nCols, 1), withProb)

    return (y, X) if featureOut == True else y





# In[]

def rec_long_wav(x= None, T=1, dt=.5, fs=16000, pauseByKey= False, fn= None):
    """
    Recognize the word(s) in a recording that may be longer than 1 second.

    The audio is taken from the first source that applies:
      1. ``fn`` given  -> loaded with ``np.load(fn)`` (``x`` is ignored);
      2. ``x`` given   -> used as is (its shape is printed);
      3. neither given -> ``T`` seconds are recorded from the sound device.

    Parameters
    ----------
    x : numpy array or None
        Audio samples to recognize.
    T : number
        Recording length in seconds; only used when recording.
    dt : float
        Step of the sliding 1-sec window, in seconds.
    fs : int
        Sampling rate; also the number of samples of one window.
    pauseByKey : bool
        When True, wait for a key press (``input``) before starting.
    fn : str or None
        File to load the audio from.

    Returns
    -------
    (x, y) : the audio and the recognition result, where ``y`` is
        * the label from ``recWav`` when the audio holds exactly ``fs`` samples,
        * a numpy array built from the (label, probability) result of every
          window when it is longer,
        * None when it is shorter.
    """
    if pauseByKey==True:
        input('press a key to record speech...')

    # "is None" is required here: comparing a numpy array with "== None" is
    # elementwise and made a passed-in x fail with an ambiguous truth value.
    if fn is not None:
        x= np.load(fn)
    elif x is None:
        x= sd.rec(int(T*fs),
                samplerate= fs,
                channels=   1,
                dtype=      'float32')

        sd.wait()  # Wait until recording is finished
    else:
        print('x.shape= {}'.format(x.shape))

    # From here on T is the actual duration of the audio, in seconds.
    T= x.size/fs
    if T==1:
        y= recWav(x)
    elif T>1:
        # The audio is longer than 1 sec: move a 1-sec window along it in
        # steps of dt and recognize every window, which gives about T/dt
        # results. (Original note: there is still a bug at the boundary.)
        t=0
        yL= []
        while t<T-dt:

            if int((1+t)*fs)<=T*fs:
                x1sec= x[int(t*fs) : int(t*fs)+fs]
            else:
                # The window runs past the end of the audio: put the
                # remaining samples at the start of a low-level noise buffer.
                x1sec= np.random.random(1*fs)*1e-10
                x1sec= x1sec.astype(np.float32)

                xx= x[int(t*fs): ].flatten()
                x1sec[0:xx.size]= xx

            y= recWav(x1sec, withProb= True)
            yL += [y]
            t += dt
        y= np.array(yL)
    else:
        y= None

    print('y=【{}】'.format(y))

    return x, y


'''
LabelDic= {0: '_silence_', 
           1: 'nine', 2: 'yes', 3: 'no', 4: 'up', 5: 'down', 
           6: 'left', 7: 'right', 8: 'on', 9: 'off', 10: 'stop', 
           11: 'go', 12: 'zero', 13: 'one', 14: 'two', 15: 'three', 
           16: 'four', 17: 'five', 18: 'six', 19: 'seven', 20: 'eight', 
           21: 'backward', 22: 'bed', 23: 'bird', 24: 'cat', 25: 'dog', 
           26: 'follow', 27: 'forward', 28: 'happy', 29: 'house', 30: 'learn', 
           31: 'marvin', 32: 'sheila', 33: 'tree', 34: 'visual', 35: 'wow'}
'''

# Class id -> word; the position in the list is the class id
# (id 0 is shown as '_silence_'). Both names refer to the same list.
LabelDic= ryGscList= [
    '_silence_',
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'zero',
    'yes', 'no',
    'go', 'stop',
    'on', 'off',
    'up', 'down',
    'left', 'right',
    'forward', 'backward',
    'marvin', 'sheila',
    'dog', 'cat',
    'bird', 'bed',
    'happy', 'house',
    'learn', 'follow',
    'tree', 'visual',
    'wow',
]


#nCategs= len(LabelDic) #36

# Reset the Keras backend session before the model is loaded.
tf.keras.backend.clear_session()

#fnModel= 'ry_best_model1.hdf5'

# Model file to load.
fnModel= 'ryModel.hdf5'

NNmodel= load_model(fnModel)

if __name__=='__main__':
    # Interactive demo: record fixed-length takes and recognize them with a
    # sliding window until the user quits.
    timeDuration= 10 #sec
    input('press any to record a {} sec wav...'.format(
                timeDuration))

    # .... main recognition ....
    # (this first take is not stored in xyL)
    x, y= rec_long_wav(T=timeDuration, dt=.1)

    xyL= []
    while True:
        aKey= input('press "q" to quit, or any other to record a {} sec wav...'.format(
                timeDuration))
        if aKey == 'q': break

        x, y= rec_long_wav(T=timeDuration, dt=.1)

        # Store each take as one (audio, result) pair. "xyL += (x, y)" would
        # extend the list with x and y as two separate items.
        xyL.append((x, y))

    print('ry: Good Luck, Bye...')    


