# CNN

Note that here testing is done on .wav files. For testing on spectrogram , the readDir function can be changed accordingly. However here too, the MFCC features are calculated from decibel power spectrum only.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import log, dot, e
import librosa
from numpy.random import rand
import tensorflow as tf
from tensorflow import keras

import tensorflow.keras as keras

In [None]:
def splitData(X, t, testFraction=0.2, randomize = False):
    """
    Split the data randomly into training and test sets
    Use numpy functions only
    Inputs:
        X: (np array of len Nclips) input feature vectors
        t: (np array of len Nclips) targets; one hot vectors
        testFraction: (float) Nclips_test = testFraction * Nclips
    Outputs:
        X_train: training set
        X_test: test set
        t_train: training labels
        t_test: test labels
    """


    ### WRITE YOUR CODE HERE - 5 MARKS

    
    test_samples = (int) (len(X)*testFraction )
    
    temp=np.arange(len(X))
    np.random.shuffle(temp)
    
    
    temp1 = temp[0: test_samples]
    temp2 = temp[test_samples: len(X)]
    
   
    
    X_test = np.reshape(X[temp1[0]],(-1,1))
    X_test = np.transpose(X_test)
    
    t_test = np.reshape(t[temp1[0]],(-1,1))
    t_test = np.transpose(t_test)
    for x in range(1,len(temp1)):
        i=temp1[x]
        a=np.reshape(X[i],(-1,1))
        a=np.transpose(a)
        b=np.reshape(t[i],(-1,1))
        b=np.transpose(b)
        X_test = np.concatenate((X_test,a))
        t_test = np.concatenate((t_test,b))
        
    X_train = np.reshape(X[temp2[0]],(-1,1))
    X_train = np.transpose(X_train)
    
    t_train = np.reshape(t[temp2[0]],(-1,1))
    t_train = np.transpose(t_train)
    for x in range(1,len(temp2)):
        i=temp2[x]
        a=np.reshape(X[i],(-1,1))
        a=np.transpose(a)
        b=np.reshape(t[i],(-1,1))
        b=np.transpose(b)
        X_train = np.concatenate((X_train,a))
        t_train = np.concatenate((t_train,b))
        
    return X_train, t_train, X_test, t_test

In [2]:
def predict_framewise(model,x_test):
        '''
        Framewise classification (speech or music)
        
        '''
        
        
        lis=[]   

        
        for i in range(0,len(x_test)):
            
            
            temp =np.ones((1,2))
            #print(x_test[i].shape)
            
            temp1=[]
            for l in range(0,len(x_test[i])):
                temp2=x_test[i][l]
                temp2=np.reshape(temp2,(1,60))
                temp1.append(temp2)
            data=np.array(temp1)

            data = data[..., np.newaxis]
            
            
            feat_4 = model.model.predict(data)
            
            
                
                
            for j in range(0,len(feat_4)):   
                
                if(feat_4[j][0] > 0.5):
                    temp = np.concatenate( (temp, np.transpose(np.reshape([1,0] , (-1,1) ) ) ) ) 
                else:
                    temp = np.concatenate((temp, np.transpose(np.reshape([0,1] , (-1,1) ) ) ) ) 
                    
            #print(temp.shape)
                    
            temp = temp[1:]
            
                
            
            lis.append(temp)

        

        
        
        return lis

In [3]:
def predict_aggregate(y_pred_framewise):
        '''
        Aggregate frames to give a single class label (music or speech) to the entire audio file
        Input:
            y_pred_framewise = framewise prediction
        Output:
            y_hat = frame aggregate (one-hot vectors)
            
        '''
       

        
        y_hat= np.ones((1,2))
        for i in range (0,len(y_pred_framewise)):
            
            zero=0
            one=0
            t=y_pred_framewise[i]
            for j in range(0, len(t)):
                
                if(t[j][0] == 1):
                    one+=1
                else:
                    zero+=1
            if(one>zero):
                y_hat= np.concatenate((y_hat, np.transpose(np.reshape([1,0] , (-1,1) ) ) )) 
                
            else:
                y_hat= np.concatenate((y_hat, np.transpose(np.reshape([0,1] , (-1,1) ) ) ))  

        

        y_hat=y_hat[1:]
        return y_hat

In [4]:
from glob import glob 
def readDir(dirname, Fs = 16000):
    
    '''
    This function reads all training samples, find thier power spectrogram, convert power spectrogram to mfcc features and 
    concatenate all frames of all samples along rows. Hence output matrix is frames X n_mfcc
    '''  


    no_samp_in_10_sec = 10* Fs
    files = glob(dirname + '/*.wav')
    x= np.ones((1,60))
    
    
    
    
    
    for f in files:
    
        
        samples = load_audio(f,Fs)
        n_fft = 1024
        hop_length = 512
        win_length = 1024
        X = np.abs(librosa.stft(samples, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann'))
        X = librosa.power_to_db(X**2,ref=np.max)
        
        temp =pow(10,X/10) 
        mel_spectrogram = librosa.feature.melspectrogram(y='none',S=temp, sr=16000, n_fft=n_fft, hop_length=hop_length, n_mels=21,win_length = win_length, window='hann')
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
        
        
        mfccs = librosa.feature.mfcc(y='none',S=log_mel_spectrogram, n_mfcc=21, sr=16000,hop_length = hop_length, win_length = win_length ,n_fft=n_fft,window='hann')
        mfccs= mfccs[1:]
        #print(mfccs.shape)
        delta_mfccs = librosa.feature.delta(mfccs)
        delta2_mfccs = librosa.feature.delta(mfccs, order=2)
        mfccs=np.concatenate((mfccs,delta_mfccs))
        mfccs=np.concatenate((mfccs,delta2_mfccs))
        mfccs=np.array(mfccs)
        
        mfccs=np.transpose(mfccs)
        x=np.concatenate((x,mfccs))
    
    #print(k)
    
    return x[1:]


    return x 

In [5]:
def load_audio(filename, Fs = 16000):
    '''
    Inputs: 
        filename: (str) filename
        Fs: (int) sampling rate
    Output: 
        x: 1D np array 
    '''
    

    ### WRITE YOUR CODE HERE - 2 MARKS

    x,sr=librosa.load(filename,sr=Fs)
    x=np.array(x)
    
    return x

In [6]:
def prepare_datasets(X,y,test_size, validation_size):

    

    # load data
    

    # create train, validation and test split
    X_train,  y_train, X_test, y_test = splitData(X, y, 0.2)
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    #X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size)
    X_train, y_train, X_validation, y_validation = splitData(X_train,y_train,0.1)
    # add an axis to input sets
    X_train = X_train[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

    return X_train, X_validation, X_test, y_train, y_validation, y_test


In [7]:
def audio2mfcc(x, n_mfcc = 20, Fs = 16000):
    
    '''
    Compute Mel-frequency cepstral coefficients (MFCCs)
    Inputs:
        x: np array of shape (Nclips,)
        Fs: (int) sampling rate
        n_mfcc: (int) number of MFCC features
    Output:
        X: (np array) MFCC sequence
    '''

  
    
   
    lis=[]
    if(len(x.shape) <=1):
        x= np.reshape(x, (1,len(x))) 
    
    for i in range(0, len(x)):
        
        mfccs = librosa.feature.mfcc(y=x[i], n_mfcc=60, sr=Fs,n_fft=1024,hop_length = 512, win_length = 1024)
        
        #delta_mfccs = librosa.feature.delta(mfccs)
        #delta2_mfccs = librosa.feature.delta(mfccs, order=2)
        #mfccs=np.concatenate((mfccs,delta_mfccs))
        #mfccs=np.concatenate((mfccs,delta2_mfccs))
        #mfccs=np.array(mfccs)
        mfccs=np.transpose(mfccs)
        lis.append(mfccs)
    ### WRITE YOUR CODE HERE - 3 MARKS
    
    X=np.array(lis)

     

    return X 

In [8]:
class Classifier: 
    '''
    Create a linear classifier to classify each frame
    '''
    def __init__(self):
        self.W=np.random.rand(20,)
    
    def sigmoid(self, z): return 1 / (1 + e**(-z))
    

   
    
    def cn(self,X_train, y_train,X_validation, y_validation, input_shape):
        
        model = keras.Sequential()

            # 1st conv layer
        model.add(keras.layers.Conv2D(32, (3, 3),padding='same' ,activation='relu', input_shape=input_shape))
        model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
        model.add(keras.layers.BatchNormalization())

            # 2nd conv layer
        model.add(keras.layers.Conv2D(32, (3, 3),padding='same' , activation='relu'))
        model.add(keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same'))
        model.add(keras.layers.BatchNormalization())

            # 3rd conv layer
        model.add(keras.layers.Conv2D(32, (2, 2), padding='same' ,activation='relu'))
        model.add(keras.layers.MaxPooling2D((2, 2), strides=(2, 2), padding='same'))
        model.add(keras.layers.BatchNormalization())

            # flatten output and feed it into dense layer
        model.add(keras.layers.Flatten())

        model.add(keras.layers.Dense(256, activation='relu'))
        model.add(keras.layers.Dropout(0.3))

        model.add(keras.layers.Dense(192, activation='relu'))
        model.add(keras.layers.Dropout(0.3))

        model.add(keras.layers.Dense(128, activation='relu'))
        model.add(keras.layers.Dropout(0.3))

        model.add(keras.layers.Dense(64, activation='relu'))
        model.add(keras.layers.Dropout(0.3))

            # output layer
        model.add(keras.layers.Dense(2, activation='softmax'))


        model.compile(optimizer='adam', loss=tf.losses.CategoricalCrossentropy(from_logits=True),metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_validation, y_validation), batch_size=32, epochs=100)
        
    
        
        self.model=model

    
    
    def save_model(self, save_path):
        '''
        Save the trained model on local disk
        Input:
            save_path: location at which model is to be saved
        Output:
            None
            
        '''
        ## Assuming save_path contains the file name too. If save_path contains only directory
        ## name , uncomment the below line to save file as data.npy
        
        #save_path = save_path +'/data'
        
        data=self.W
        np.save(save_path, data)
        
        ### WRITE YOUR CODE HERE - 0 MARKS
            
        return
    
    def load_model(self, load_path):
        '''
        Save the trained model on local disk
        Input:
            load_path: location from which model is to be loaded
        Output:
            None
        '''
        
        
        ### WRITE YOUR CODE HERE - 0 MARKS
        
        ## Assuming load_path also contains the name of file which has to be loaded. If load_path only contains the
        ## directory name, uncomment the below line and replace data.npy with file name.
        
        #load_path = load_path +'/data.npy'
        
        self.W = np.load(load_path)
            
        return


In [9]:
def computeCM(y, y_hat):
    '''
    Compute confusion matrix to evaluate your model
    Inputs:
        y = labels 
        y_hat = predicted output
    Output:
        confusion matrix: confusion matrix
    '''

    


    metrics = np.array([[0,0], [0,0]])

    
    for i in range(0,len(y)):
        if(y[i][0] == 1 and y_hat[i][0] == 1):
            metrics[0][0]+=1
        elif(y[i][0] == 1 and y_hat[i][0] == 0):
            metrics[0][1] +=1
        elif(y[i][0] == 0 and y_hat[i][0] == 1):
            metrics[1][0] +=1
        else:
            metrics[1][1]+=1

    confusion_matrix = metrics
    return confusion_matrix 

In [10]:
#silence vs audio training

x_silence = readDir('C:/Users/HP/Documents/test/no-audio', 16000) 
x_speech =  readDir('C:/Users/HP/Documents/test/audio', 16000)
X = np.concatenate((x_silence, x_speech))
y_silence = np.array([[1,0]]*len(x_silence))
y_speech = np.array([[0,1]]*len(x_speech))
Y = np.concatenate((y_silence, y_speech))


lis=[]
for i in range(0,len(X)):
    temp=X[i]
    temp=np.reshape(temp,(1,60))
    lis.append(temp)
X=np.array(lis)



X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(X,Y,0.25, 0.2)
input_shape = (X_train.shape[1], X_train.shape[2], 1)


model1 = Classifier()
model1.cn(X_train, y_train,X_validation, y_validation,input_shape)

Epoch 1/100


  '"`categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [11]:
#speech vs music training


x_speech = readDir('C:/Users/HP/Documents/test/speech', 16000)
x_music = readDir('C:/Users/HP/Documents/test/music', 16000) 
X = np.concatenate((x_speech, x_music))
y_speech = np.array([[1,0]]*len(x_speech))
y_music= np.array([[0,1]]*len(x_music))
Y = np.concatenate((y_speech, y_music))



lis=[]
for i in range(0,len(X)):
    temp=X[i]
    temp=np.reshape(temp,(1,60))
    lis.append(temp)
X=np.array(lis)


X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(X,Y,0.25, 0.2)
input_shape = (X_train.shape[1], X_train.shape[2], 1)


model2 = Classifier()
model2.cn(X_train, y_train,X_validation, y_validation,input_shape)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [12]:
model1.model.save('C:/Users/HP/Documents/test/model1_cnn')
model2.model.save('C:/Users/HP/Documents/test/model2_cnn')

INFO:tensorflow:Assets written to: C:/Users/HP/Documents/test/model1_cnn\assets
INFO:tensorflow:Assets written to: C:/Users/HP/Documents/test/model2_cnn\assets


In [13]:
# audio vs no audio prediction

data = readDir('C:/Users/HP/Documents/test/unknown/', 16000) # This will work for single file in folder. For multiple files, WE can run this in a loop for every unknown audio file in folder
lis=[]
for i in range(0,len(data)):
    temp=data[i]
    temp=np.reshape(temp,(1,60))
    lis.append(temp)
data=np.array(lis)

data = data[..., np.newaxis]

p = model1.model.predict(data)

for i in range(0, len(p)):
    if(p[i][1]>0.5):
        p[i][0]=0
        p[i][1]=1
    else:
        p[i][0]=1
        p[i][1]=0

In [14]:
#Time label prediction

k=0
j=0
sil=np.array([[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0]])
sp=np.array([[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0]])

def check(x):
    sum=0
    for i in range(0,len(x)):
        sum=sum+x[i][0]
    if(sum>=10):
        return 1
    else:
        return 0
    
    
    
prev=1

for i in range(0,len(p)-16):
    x=p[i:i+16]
    a=check(x)
    #print(a)
    if(a==1):
        if(prev==1):
            sil[j][1]=librosa.frames_to_time(i+22, sr=16000, hop_length=512, n_fft=1024)
        else:
            temp=i
            while(p[i][0]!=1):
                i+=1
            sp[k][1]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sil[j][0]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sil[j][1]=librosa.frames_to_time(temp+22, sr=16000, hop_length=512, n_fft=1024)
            k+=1
        
    
    else:
        if(prev==1):
            temp=i
            while(p[i][1]!=1):
                i+=1
            sil[j][1]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sp[k][0]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sp[k][1]=librosa.frames_to_time(temp+22, sr=16000, hop_length=512, n_fft=1024)
            j+=1
        
        else:
            sp[k][1]=librosa.frames_to_time(i+22, sr=16000, hop_length=512, n_fft=1024)
    prev=a      

    
sp  

array([[0.864, 2.784],
       [3.776, 6.208],
       [6.944, 9.408],
       [0.   , 0.   ],
       [0.   , 0.   ]])

In [15]:
#Extracting audio portions

lis=[]
for i in range(len(sp)):
    if(sp[i][0]==0 and sp[i][1]==0):
        continue
    else:
        lis.append(np.array([sp[i][0],sp[i][1]]))
sp=np.array(lis)


lis=[]
for i in range(0,len(sp)):
    start= librosa.time_to_frames(sp[i][0], sr=16000, hop_length=512, n_fft=1024)
    end = librosa.time_to_frames(sp[i][1], sr=16000, hop_length=512, n_fft=1024)
    print(start,end)
    lis.append(np.array(data[start:end]))

26 86
117 193
216 293


In [16]:
#Labels of extracted audio portions
frame_pred= predict_framewise(model2,lis)

agg=predict_aggregate(frame_pred)  

for i in range(len(sp)):
    print(sp[i][0],sp[i][1], 'speech' if (agg[i][0]==1) else  'music')

0.864 2.784 speech
3.776 6.208 speech
6.944 9.408 speech
