In [108]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,Model
from keras.layers import Activation, Dense,Conv2D,Dropout,Activation,Flatten,MaxPooling2D
from tensorflow.keras.models import Sequential
import imblearn
from imblearn.over_sampling import RandomOverSampler,SMOTE
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import librosa
from sklearn.model_selection import train_test_split

import numpy as np
import scipy.io.wavfile as wav
from scipy.io.wavfile import write
from io import BytesIO
from base64 import b64decode
from google.colab import output
import IPython.display as ipd

from IPython.display import Javascript


In [109]:
train_path='/content/sample_data/data/train2/'
test_path='/content/sample_data/data/test2/'
model_path='/content/sample_data/data/audio_classification'


In [110]:
def features_extractorcnn(file_name):
    audio, sample_rate = librosa.load(file_name) 
    # audio, sample_rate = json.load(file_name) 
    mfccs_features =librosa.feature.melspectrogram(y=audio,sr=sample_rate)
    #mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    
    return mfccs_features.reshape(128,44,1)

In [111]:
features_extractorcnn('/content/sample_data/data/test2/firefly.2v6b3rcf.ingestion-7558d88f45-lsf97.wav').shape

(128, 44, 1)

In [112]:
x=np.array([features_extractorcnn(train_path+'/'+f) for f in listdir(train_path) if isfile(join(train_path, f))])
x_test=np.array([features_extractorcnn(test_path+'/'+f) for f in listdir(test_path) if isfile(join(test_path, f))])

In [113]:
y=np.array([f.split('.')[0] for f in listdir(train_path) if isfile(join(train_path, f))])
y=np.array(pd.get_dummies(y))

# y_test=np.array([f.split('.')[0] for f in listdir(test_path) if isfile(join(test_path, f))])
# y_test=np.array(pd.get_dummies(y_test))

In [114]:
x_train,x_val,y_train,y_val=train_test_split(x,y,test_size=0.2,random_state=0)

In [115]:
oversample = RandomOverSampler(sampling_strategy='minority')
#x_over, y_over = oversample.fit_resample(x_train, y_train)
x_over, y_over = oversample.fit_resample(x_train.reshape(len(x_train),-1), y_train)
x_over=x_over.reshape(len(x_over),128,44,1)

In [116]:
x_over.shape

(384, 128, 44, 1)

In [117]:
y_over.shape

(384, 7)

In [118]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

model1=keras.Sequential([
    #cnn
    layers.Conv2D(filters=32,kernel_size=(3,3),activation='relu',input_shape=(128,44,1)),
    keras.layers.Dropout(0.25),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(filters=64,kernel_size=(3,3),activation='relu'),
    keras.layers.Dropout(0.25),
    layers.MaxPooling2D((2,2)),
    
    #dense
    layers.Flatten(),
    layers.Dense(64,activation='relu'),
    keras.layers.Dropout(0.25),
    layers.Dense(7,activation='softmax')
    
])
model1.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
model1.fit(x_over,y_over, epochs=100, validation_data=(x_val, y_val),callbacks=[callback])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


<keras.callbacks.History at 0x7f8c77afef90>

In [119]:

def build_cnn(input_shape, learning_rate, error="sparse_categorical_crossentropy") :
    #build network
  model = keras.Sequential()

    # type of NN we are building
    #conv 1 - filters=64 kernel=3,3 kernel_regularizer->L2 tackles training window to improve model
  model.add(keras.layers.Conv2D(64, (3,3), activation="relu", input_shape=input_shape, kernel_regularizer=keras.regularizers.l2(0.001)))
    # batch normalization 
  model.add(keras.layers.BatchNormalization())
    # maxpooling to downsample the output
  model.add(keras.layers.MaxPool2D((3,3), strides=(2,2), padding="same"))


    #conv 2
  model.add(keras.layers.Conv2D(32, (3,3), activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
  model.add(keras.layers.BatchNormalization())
  model.add(keras.layers.MaxPool2D((3,3), strides=(2,2), padding="same"))


    #conv 3
  model.add(keras.layers.Conv2D(32, (2,2), activation="relu", kernel_regularizer=keras.regularizers.l2(0.001)))
  model.add(keras.layers.BatchNormalization())
  model.add(keras.layers.MaxPool2D((2,2), strides=(2,2), padding="same"))

    #flatten the 3D output feed into dense layer (1D array)
  model.add(keras.layers.Flatten())
  model.add(keras.layers.Dense(64, activation="relu"))
    # tackle the fitting -> shoots down 30% of neurons into dense layer stochastically 
  model.add(keras.layers.Dropout(0.3))

    #softmax classifier -> output array of scores for the prediction of each keyword
  model.add(keras.layers.Dense(num_keywords=7, activation="softmax"))

    #compile the model
  optimiser = keras.optimizers.Adam(learning_rate=0.001)
  model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=["accuracy"])

    #print model overview
  model.summary()

  model.fit(x_train, y_train, epochs=40, batch_size=32, validation_data=(x_validation,y_validation))


    # return model

In [120]:
model1.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_16 (Conv2D)          (None, 126, 42, 32)       320       
                                                                 
 dropout_24 (Dropout)        (None, 126, 42, 32)       0         
                                                                 
 max_pooling2d_16 (MaxPoolin  (None, 63, 21, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_17 (Conv2D)          (None, 61, 19, 64)        18496     
                                                                 
 dropout_25 (Dropout)        (None, 61, 19, 64)        0         
                                                                 
 max_pooling2d_17 (MaxPoolin  (None, 30, 9, 64)        0         
 g2D)                                                 

In [121]:
model1.predict(x_test)

array([[9.72445589e-03, 6.68589532e-01, 1.23360520e-02, 2.08435277e-03,
        1.53615151e-03, 3.04835230e-01, 8.94292025e-04],
       [1.29986145e-02, 2.18370073e-02, 1.00366250e-02, 5.74935794e-01,
        5.83662316e-02, 3.72604765e-02, 2.84565270e-01],
       [3.29507282e-03, 1.17310742e-03, 5.76793274e-04, 4.10720915e-01,
        2.71099340e-02, 7.41610304e-03, 5.49708068e-01],
       [1.07661327e-02, 5.42052574e-02, 3.65929231e-02, 3.60766768e-01,
        2.89241839e-02, 3.33118170e-01, 1.75626576e-01],
       [1.63034633e-01, 1.16478167e-01, 2.11855143e-01, 9.68802050e-02,
        1.21035211e-01, 1.55682936e-01, 1.35033652e-01],
       [3.39155421e-02, 2.69206129e-02, 3.88855860e-02, 4.50929821e-01,
        5.49088567e-02, 8.38079154e-02, 3.10631573e-01],
       [6.60272529e-21, 8.63753951e-20, 6.39646835e-10, 2.47200508e-22,
        1.00000000e+00, 6.87632302e-25, 9.23865298e-23],
       [4.79694366e-01, 1.89426020e-02, 4.29785341e-01, 2.88124871e-03,
        3.57783027e-03, 4

In [126]:
# classes=['blue','change_color','green','high','low','noise','off','on','party','purple','red','sleep','unknown','wakeup','white','yellow']
classes=['firefly', 'off', 'party', 'red', 'sleep', 'white', 'yellow']
predictions=[classes[i] for i in np.argmax(model1.predict(x_test.reshape(len(x_test-1),128,44,1)),axis=1)]
test_classes=[f.split('.')[0] for f in listdir(test_path) if isfile(join(test_path, f))]


# New Section

In [127]:
mydf=pd.DataFrame(data={'real':test_classes, 'pred':predictions})

In [73]:
mydf

Unnamed: 0,real,pred
0,white,red
1,sleep,sleep
2,firefly,firefly
3,firefly,firefly
4,red,yellow
...,...,...
56,off,off
57,sleep,party
58,firefly,off
59,white,party


In [74]:
def predictSound(soundFile):
    x=features_extractorcnn(soundFile)
    i=np.argmax(model1.predict(x[np.newaxis,...]))
    return classes[i]
    

In [75]:
predictSound('/content/sample_data/data/test2/white.2v6al0un.ingestion-7558d88f45-k9jh8.wav')

'white'

In [76]:
from io import BytesIO
from base64 import b64decode
from google.colab import output
import IPython.display as ipd

from IPython.display import Javascript


def record(sec=1):
  RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""
  print("Speak Now...")
  display(Javascript(RECORD))
  # sec += 0.03
  sec += 1.0
  s = output.eval_js('record(%d)' % (sec*1000))
  print("Done Recording !")
  b = b64decode(s.split(',')[1])
  ipd.display(ipd.Audio(b))
  with open('audio.wav','wb') as f:
    f.write(b)
  audio, sample_rate = librosa.load('audio.wav')
  os.remove("audio.wav") 
  mfccs_features =librosa.feature.melspectrogram(y=audio[200:22250],sr=sample_rate)
  return mfccs_features.reshape(128,44,1)

In [128]:
def predictRecord():   
    x=record() 
    i=np.argmax(model1.predict(x[np.newaxis,...]))
    return classes[i]

In [129]:
predictRecord()

Speak Now...


<IPython.core.display.Javascript object>

Done Recording !




'yellow'