# for live audio acquisition and inferencing system, based on custom trained model
# with adaptation, 
https://www.swharden.com/wp/2016-07-19-realtime-audio-visualization-in-python/

In [1]:
import pyaudio
import time
import pylab
import numpy as np

In [2]:
import matplotlib.pyplot as plt

# In this stream, we use mel_features for audio feature extraction
This is however not very accurate for inferencing as the log mel spectrograms obtained have low Signal to Noise ratio
##### and the model was trained with audio features extracted by librosa functions instead of mel_features function

In [3]:
import mel_features

# Load model

In [5]:
import tensorflow as tf
print(tf.__version__)
import keras

1.15.0


Using TensorFlow backend.


In [6]:
#loaded_model = keras.models.load_model('trained_mobilenet_2.h5')
loaded_model = keras.models.load_model('trained_mobilenet_2.h5')












Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [7]:
loaded_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 225, 225, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 112, 112, 32)      864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 112, 112, 32)      128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 112, 112, 32)      0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 112, 112, 32)      288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 112, 112, 32)      128       
__________

# Audio is read in like a continuous tape of 2s duration
# Waveform and Log Mel Spectrogram can be plotted on 'index.html' when opened as live server with Vscode
##### Each cycle takes chunk/rate, i.e. 32768/44100 = 0.74 seconds to run. 
##### Reducing chunk reduces latency, since less data points are read each time, but increases computational cost, since more cycles are required to read data points.

In [8]:
class SWHear(object):
    """
    The SWHear class is made to provide access to continuously recorded
    (and mathematically processed) microphone data.
    """

    def __init__(self,device=None,startStreaming=True):
        """fire up the SWHear class."""
        print(" -- initializing SWHear")

        self.chunk = 32768 # number of data points to read at a time
        self.rate = 44100 # time resolution of the recording device (Hz)

        # for tape recording (continuous "tape" of recent audio)
        self.tapeLength=2 # in seconds
        self.tape=np.zeros(self.rate*self.tapeLength) 

        self.p=pyaudio.PyAudio() # start the PyAudio class
        if startStreaming:
            self.stream_start()

    ### LOWEST LEVEL AUDIO ACCESS
    # pure access to microphone and stream operations
    # keep math, plotting, FFT, etc out of here.

    def stream_read(self):
        """return values for a single chunk"""
        data = np.frombuffer(self.stream.read(self.chunk),dtype=np.int16)
        #print(data)
        return data

    def stream_start(self):
        """connect to the audio device and start a stream"""
        print(" -- stream started")
        self.stream=self.p.open(format=pyaudio.paInt16,channels=1,
                                rate=self.rate,input=True,
                                frames_per_buffer=self.chunk)

    def stream_stop(self):
        """close the stream but keep the PyAudio instance alive."""
        if 'stream' in locals():
            self.stream.stop_stream()
            self.stream.close()
        print(" -- stream CLOSED")

    def close(self):
        """gently detach from things."""
        self.stream_stop()
        self.p.terminate()

    ### TAPE METHODS
    # tape is like a circular magnetic ribbon of tape that's continously
    # recorded and recorded over in a loop. self.tape contains this data.
    # the newest data is always at the end. Don't modify data on the type,
    # but rather do math on it (like FFT) as you read from it.

    def tape_add(self):
        """add a single chunk to the tape."""
        self.tape[:-self.chunk]=self.tape[self.chunk:]
        self.tape[-self.chunk:]=self.stream_read()

    def tape_flush(self):
        """completely fill tape with new data."""
        readsInTape=int(self.rate*self.tapeLength/self.chunk) 
        print(" -- flushing %d s tape with %dx%.2f ms reads"%\
                  (self.tapeLength,readsInTape,self.chunk/self.rate))
        for i in range(readsInTape):
            self.tape_add()
            
            
    def tape_forever(self,plotSec=.25):
        t1=0
        try:
            while True:
                self.tape_add()
                
                if (time.time()-t1)>plotSec:
                    t1=time.time()
                    print()
                    self.tape_plot() # to plot waveform
                    self.tape_processing() # to plot log mel spectrogram
                    self.tape_inferencing() # to output inferencing results
    
        except:
            print(" ~~ exception (keyboard?)")
            raise
            return

    def tape_plot(self,saveAs="03.png"): # plots waveform of audio streaming
        """plot what's in the tape."""        
        pylab.plot(np.arange(len(self.tape))/self.rate,self.tape) 
        pylab.axis([0,self.tapeLength,-2**16/2,2**16/2])
        if saveAs:
            t1=time.time()
            pylab.savefig(saveAs,dpi=70)
            print("waveform plotting saving took:            %.02f ms"%((time.time()-t1)*1000))
        else:
            pylab.show()
            print() #good for IPython
        pylab.close('all') 
        
    

    # add stuff here
    def tape_processing(self,saveAs="04.png"): # plots log mel spectrogram of audio
        """plot log mel spec content in the tape"""
        
        # self.melspec = librosa.feature.melspectrogram(self.tape/65536, sr=self.rate, n_mels=128)
        # self.log_S = librosa.amplitude_to_db(self.melspec)
        self.log_S = mel_features.log_mel_spectrogram(self.tape/65536)

        
        # plot the compressed spec
        fig, ax = plt.subplots(figsize=(12,4))
        cax = ax.matshow(
            self.log_S.astype("int16"),
            interpolation="nearest",
            aspect="auto",
            cmap=plt.cm.afmhot,
            origin="lower",
        )
        fig.colorbar(cax)
        plt.title("mel Spectrogram")

        
        if saveAs: 
            t2=time.time()
            plt.savefig(saveAs, dpi=70)
            print("log mel spectrogram plotting saving took: %.02f ms"%((time.time()-t2)*1000))
        else: 
            pylab.show()
            print()
        pylab.close('all')

    

    def tape_inferencing(self):
        """inferencing content in the tape"""
        # print("running tape inferencing")

        start_index_x = (224-128)//2
        # start_index_x = 48
        end_index_x = (224-128)//2 + 128
        # end_index_x = 176
        start_index_y = (224-169)//2
        # start_index_y = 25
        end_index_y = (224-169)//2 + 169
        # end_index_y = 198
        # print(start_index_x, end_index_x, start_index_y, end_index_y)
        
        self.mean = np.mean(self.log_S) # shape is alr (128,173)
        self.spectrogram = self.mean*np.ones(shape = (224,224,3))
        for j in range(3):
            self.spectrogram[start_index_x:end_index_x, start_index_y:end_index_y,j] = self.log_S
        
        t3 = time.time()
        self.predictions = loaded_model.predict(np.expand_dims(self.spectrogram, axis=0))
        print("inferencing took:                         %.02f ms"%((time.time()-t3)*1000))
        
        
        classidx = np.argmax(self.predictions[0]) 
        
        def classify(i):
            switcher = {
                0: "fall",
                1: "cough",
                2: "shout",
                3: "speech"
            }
            return switcher.get(i, "Invalid class")
        
        classlab = self.predictions[0]
        
        thresh = 0.60
        if classlab[0] < thresh and classlab[1] < thresh and classlab[2] < thresh and classlab[3] < thresh:
            print("sound class:                              ambience sound")
        else:
            print("sound class:                             ", classify(classidx), classlab[classidx])
    
        
        
        print(np.argmax(self.predictions), self.predictions)
        

In [9]:
if __name__=="__main__":
    ear=SWHear()
    ear.tape_forever()
    ear.close()
    print("DONE")

 -- initializing SWHear
 -- stream started

waveform plotting saving took:            100.19 ms


  return 20 * np.log10(mel_spectrogram + log_offset)


log mel spectrogram plotting saving took: 125.66 ms
inferencing took:                         667.83 ms
sound class:                              fall nan
0 [[nan nan nan nan]]

waveform plotting saving took:            78.85 ms


  return 20 * np.log10(mel_spectrogram + log_offset)


log mel spectrogram plotting saving took: 150.64 ms
inferencing took:                         151.63 ms
sound class:                              fall nan
0 [[nan nan nan nan]]





waveform plotting saving took:            76.83 ms
log mel spectrogram plotting saving took: 128.66 ms
inferencing took:                         160.57 ms
sound class:                              fall 0.86574155
0 [[0.86574155 0.0199352  0.05724455 0.05707881]]

waveform plotting saving took:            72.80 ms
log mel spectrogram plotting saving took: 110.70 ms
inferencing took:                         140.57 ms ~~ exception (keyboard?)


KeyboardInterrupt: 

(np.empty(44100*2)*np.nan).shape