# Neural Network using Back-Propagation

Note that here testing is done on .wav files. For testing on spectrogram , the readDir function can be changed accordingly. However here too, the MFCC features are calculated from decibel power spectrum only.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from numpy import log, dot, e
import librosa
from numpy.random import rand
import tensorflow as tf
from tensorflow import keras


In [5]:
def predict_framewise(model,x_test):
        '''
        Framewise classification (speech or music)
        
        '''
        
        
        lis=[]   

        
        for i in range(0,len(x_test)):
            k=0
            
            temp =np.ones((1,2))
            #print(x_test[i].shape)
            data=np.transpose(x_test[i])
            
            data=data[np.newaxis,...]
            #print(data.shape)
            feat_4 = model.predict_framewise(data)
            
            feat_4= np.transpose(feat_4[0])    
            
            for j in range(0,len(feat_4)):   
                
                if(feat_4[j][0] > 0.5):
                    temp = np.concatenate( (temp, np.transpose(np.reshape([1,0] , (-1,1) ) ) ) ) 
                else:
                    temp = np.concatenate((temp, np.transpose(np.reshape([0,1] , (-1,1) ) ) ) ) 
                    
            #print(temp.shape)
                    
            temp = temp[1:]
            
                
            
            lis.append(temp)

        ### WRITE YOUR CODE HERE - 5 MARKS

        
        
        return lis

In [6]:
def predict_aggregate(y_pred_framewise):
        '''
        Aggregate frames to give a single class label (music or speech) to the entire audio file
        
            
        '''
       

        
        y_hat= np.ones((1,2))
        for i in range (0,len(y_pred_framewise)):
            
            zero=0
            one=0
            t=y_pred_framewise[i]
            for j in range(0, len(t)):
                
                if(t[j][0] == 1):
                    one+=1
                else:
                    zero+=1
            if(one>zero):
                y_hat= np.concatenate((y_hat, np.transpose(np.reshape([1,0] , (-1,1) ) ) )) 
                
            else:
                y_hat= np.concatenate((y_hat, np.transpose(np.reshape([0,1] , (-1,1) ) ) ))  

        ### WRITE YOUR CODE HERE - 5 MARKS

        y_hat=y_hat[1:]
        return y_hat

In [7]:
from glob import glob 
def readDir(dirname, Fs = 16000):
    
    '''
   This function reads all training samples, find thier power spectrogram, convert power spectrogram to mfcc features and 
    concatenate all frames of all samples along rows. Hence output matrix is frames X n_mfcc
    '''  


    no_samp_in_10_sec = 10* Fs
    files = glob(dirname + '/*.wav')
    x= np.ones((1,5))
    
    
    
    
    
    for f in files:
    
        
        samples = load_audio(f,Fs)
        n_fft = 1024
        hop_length = 512
        win_length = 1024
        X = np.abs(librosa.stft(samples, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann'))
        X = librosa.power_to_db(X**2,ref=np.max)
        
        temp =librosa.db_to_power(X)
        mel_spectrogram = librosa.feature.melspectrogram(y='none',S=temp, sr=16000)
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
        
        
        mfccs = librosa.feature.mfcc(y='none',S=log_mel_spectrogram, n_mfcc=6, sr=16000,hop_length = hop_length, win_length = win_length ,n_fft=n_fft,window='hann')
        mfccs= mfccs[1:]
        #mfccs = librosa.feature.mfcc(y=samples, n_mfcc=20, sr=Fs,n_fft=1024,hop_length = 512, win_length = 1024)
       
        
        mfccs=np.transpose(mfccs)
        x=np.concatenate((x,mfccs))
    
    #print(k)
    
    return x[1:]


    return x 

In [8]:
def load_audio(filename, Fs = 16000):
    '''
    Inputs: 
        filename: (str) filename
        Fs: (int) sampling rate
    Output: 
        x: 1D np array 
    '''
    

    ### WRITE YOUR CODE HERE - 2 MARKS

    x,sr=librosa.load(filename,sr=Fs)
    x=np.array(x)
    
    return x

In [9]:
def splitData(X, t, testFraction=0.2, randomize = False):
    """
    Split the data randomly into training and test sets
    Use numpy functions only
    Inputs:
        X: (np array of len Nclips) input feature vectors
        t: (np array of len Nclips) targets; one hot vectors
        testFraction: (float) Nclips_test = testFraction * Nclips
    Outputs:
        X_train: training set
        X_test: test set
        t_train: training labels
        t_test: test labels
    """


    ### WRITE YOUR CODE HERE - 5 MARKS

    
    test_samples = (int) (len(X)*testFraction )
    
    temp=np.arange(len(X))
    np.random.shuffle(temp)
    
    
    temp1 = temp[0: test_samples]
    temp2 = temp[test_samples: len(X)]
    
   
    
    X_test = np.reshape(X[temp1[0]],(-1,1))
    X_test = np.transpose(X_test)
    
    t_test = np.reshape(t[temp1[0]],(-1,1))
    t_test = np.transpose(t_test)
    for x in range(1,len(temp1)):
        i=temp1[x]
        a=np.reshape(X[i],(-1,1))
        a=np.transpose(a)
        b=np.reshape(t[i],(-1,1))
        b=np.transpose(b)
        X_test = np.concatenate((X_test,a))
        t_test = np.concatenate((t_test,b))
        
    X_train = np.reshape(X[temp2[0]],(-1,1))
    X_train = np.transpose(X_train)
    
    t_train = np.reshape(t[temp2[0]],(-1,1))
    t_train = np.transpose(t_train)
    for x in range(1,len(temp2)):
        i=temp2[x]
        a=np.reshape(X[i],(-1,1))
        a=np.transpose(a)
        b=np.reshape(t[i],(-1,1))
        b=np.transpose(b)
        X_train = np.concatenate((X_train,a))
        t_train = np.concatenate((t_train,b))
        
    return X_train, t_train, X_test, t_test

In [10]:
def audio2mfcc(x, n_mfcc = 20, Fs = 16000):
    
    '''
    Compute Mel-frequency cepstral coefficients (MFCCs)
    Inputs:
        x: np array of shape (Nclips,)
        Fs: (int) sampling rate
        n_mfcc: (int) number of MFCC features
    Output:
        X: (np array) MFCC sequence
    '''

    ### WRITE YOUR CODE HERE - 3 MARKS
    
    lis=[]
    if(len(x.shape) <=1):
        x= np.reshape(x, (1,len(x))) 
    
    for i in range(0, len(x)):
        mfccs = librosa.feature.mfcc(y=x[i], n_mfcc=n_mfcc, sr=Fs)
        #mfccs=np.transpose(mfccs)
        mfccs=np.array(mfccs)
        lis.append(mfccs)
    ### WRITE YOUR CODE HERE - 3 MARKS
    
    X=np.array(lis)

     

    return X 

In [27]:
class Classifier: 
    '''
    Create a linear classifier to classify each frame
    '''
    def __init__(self):
        self.W=np.random.rand(20,)
    
    def sigmoid(self, z): return 1 / (1 + e**(-z))
    
    def soft(self, x):

        return np.array([[e**x[0][0]/(e**x[0][0]+e**x[1][0])],[e**x[1][0]/(e**x[0][0]+e**x[1][0])]])
        
    def fit(self, X, y, epochs=100, lr=0.5):        
        loss = []
        weights = rand(X.shape[1])
        N = len(X)
                 
        for _ in range(epochs):        
            # Gradient Descent
           
            y_hat = self.sigmoid(dot(X, weights))
            de = dot(X.T,  y_hat - y) / N
            weights -= lr * dot(X.T,  y_hat - y) / N            
 
        self.de=de   
        self.W = weights
        
        # To save the model uncomment the below line and replace save_path with location at which you want to save
        self.save_model('C:/Users/HP/Desktop/data') 
   
    
    
    

    def neural(self,x,y):
        lr=50

        layer_1 = np.random.rand(6,5)*2 - np.ones((6,5))
        layer_2 = np.random.rand(6,6)*2 - np.ones((6,6))
        layer_3 = np.random.rand(2,6)*2 - np.ones((2,6))

        delta_3 = np.zeros((2,6))
        delta_2 = np.zeros((6,6))
        delta_1 = np.zeros((6,5))

        for j in range(1000):

            for i in range(0,len(x)):
                feat_1 = np.reshape(x[i],(-1,1)) 
                feat_2 = self.sigmoid(np.matmul(layer_1,feat_1) )
                feat_3 = self.sigmoid( np.matmul(layer_2,feat_2) )
                feat_4 =  self.soft(np.matmul(layer_3,feat_3) )

                er_4 = feat_4 - np.reshape(y[i],(-1,1))
                er_3 = np.multiply( np.matmul(np.transpose(layer_3) ,er_4) ,  np.multiply(feat_3 , np.ones((6,1))- feat_3) ) 
                er_2 = np.multiply( np.matmul(np.transpose(layer_2) ,er_3) ,  np.multiply(feat_2 , np.ones((6,1))- feat_2) )

                delta_3 = delta_3 + np.matmul(  er_4,np.transpose(feat_3))
                delta_2 = delta_2 + np.matmul(  er_3,np.transpose(feat_2))
                delta_1 = delta_1 + np.matmul(  er_2,np.transpose(feat_1)) 

            delta_3 = 1/len(x) * delta_3 #+ 0.1* layer_3
            delta_2 = 1/len(x) * delta_2 #+ 0.1* layer_2
            delta_1 = 1/len(x) * delta_1 #+ 0.1* layer_1

            layer_1 = layer_1 - lr * delta_1
            layer_2 = layer_2 - lr * delta_2
            layer_3 = layer_3 - lr * delta_3

        self.w1 =layer_1
        self.w2 =layer_2
        self.w3=layer_3

            
    
        
    def train(self,x_train, y_train):
        
        if(len(x_train.shape) ==2):
            x_train=np.reshape(x_train,(1,len(x_train), len(x_train[0]) ) )
            
        if(len(y_train.shape) ==1):
            y_train=np.reshape(y_train,(1,len(y_train) ) ) 
        
        x= np.transpose(x_train[0])
        y= np.array([ [ y_train[0][0]  ]  ] * x_train[0].shape[1] )
        
        #print(y.shape)
        
        for i in range(1,len(x_train)):
            x=np.concatenate((x, np.transpose(x_train[i]) )) 
            temp=np.array([[ y_train[i][0] ]]  * x_train[i].shape[1]) 
            y=np.concatenate( (y,temp))
        #self.fit(x,y)
        #print(y)
        #print(x.shape)
        self.neural(x,y)
        #print(y.shape)
        
        
         

        
        return 
    
    def save_model(self, save_path):
        '''
        Save the trained model on local disk
        Input:
            save_path: location at which model is to be saved
        Output:
            None
            
        '''
        ## Assuming save_path contains the file name too. If save_path contains only directory
        ## name , uncomment the below line to save file as data.npy
        
        #save_path = save_path +'/data'
        
        data=self.W
        np.save(save_path, data)
        
        ### WRITE YOUR CODE HERE - 0 MARKS
            
        return
    
    def load_model(self, load_path):
        '''
        Save the trained model on local disk
        Input:
            load_path: location from which model is to be loaded
        Output:
            None
        '''
        
        
        ### WRITE YOUR CODE HERE - 0 MARKS
        
        ## Assuming load_path also contains the name of file which has to be loaded. If load_path only contains the
        ## directory name, uncomment the below line and replace data.npy with file name.
        
        #load_path = load_path +'/data.npy'
        
        self.W = np.load(load_path)
            
        return


    
    def predict_framewise(self,x_test):
        '''
        Framewise classification (speech or music)
        Input:
            x_test: test set
        Output:
            y_pred_framewise = framewise prediction
        '''
        
        lis = []
        
        if(len(x_test.shape) ==2 ):
            x_test = np.reshape(x_test,(1,len(x_test) , len(x_test[0]) ))
            

        
        for i in range(0,len(x_test)):
            k=0
            t=np.transpose(x_test[i])
            temp =np.ones((1,2))
            
            for j in range(0,len(t)):
                                                
                
                
                feat_1 = np.reshape(t[j], (-1,1))
                feat_2 = self.sigmoid(np.matmul(self.w1,feat_1) )
                feat_3 = self.sigmoid( np.matmul(self.w2,feat_2) )
                feat_4 =  self.soft(np.matmul(self.w3,feat_3) )
                
                #print(feat_4)
                
                if(feat_4[0][0] > 0.5):
                    temp = np.concatenate( (temp, np.transpose(np.reshape([1,0] , (-1,1) ) ) ) ) 
                else:
                    temp = np.concatenate((temp, np.transpose(np.reshape([0,1] , (-1,1) ) ) ) ) 
                    
            #print(temp.shape)
                    
            temp = temp[1:]
            temp=np.transpose(temp)
                
            
            lis.append(temp)

        ### WRITE YOUR CODE HERE - 5 MARKS

        y_pred_framewise= np.array(lis)
        
        return y_pred_framewise 
    
    def predict_aggregate(self,y_pred_framewise):
        '''
        Aggregate frames to give a single class label (music or speech) to the entire audio file
        Input:
            y_pred_framewise = framewise prediction
        Output:
            y_hat = frame aggregate (one-hot vectors)
            
        '''
        if(len(y_pred_framewise.shape) ==2 ):
            y_pred_framewise = np.reshape(y_pred_framewise, (1,2,len(y_pred_framewise[0])))        

        
        y_hat= np.ones((1,2))
        for i in range (0,len(y_pred_framewise)):
            
            zero=0
            one=0
            t=np.transpose(y_pred_framewise[i])
            for j in range(0, len(t)):
                
                if(t[j][0] == 1):
                    one+=1
                else:
                    zero+=1
            if(one>zero):
                y_hat= np.concatenate((y_hat, np.transpose(np.reshape([1,0] , (-1,1) ) ) )) 
                
            else:
                y_hat= np.concatenate((y_hat, np.transpose(np.reshape([0,1] , (-1,1) ) ) ))  

        ### WRITE YOUR CODE HERE - 5 MARKS

        y_hat=y_hat[1:]
        return y_hat

In [12]:
def computeCM(y, y_hat):
    '''
    Compute confusion matrix to evaluate your model
    Inputs:
        y = labels 
        y_hat = predicted output
    Output:
        confusion matrix: confusion matrix
    '''

    ### WRITE YOUR CODE HERE - 5 MARKS


    metrics = np.array([[0,0], [0,0]])

    
    for i in range(0,len(y)):
        if(y[i][0] == 1 and y_hat[i][0] == 1):
            metrics[0][0]+=1
        elif(y[i][0] == 1 and y_hat[i][0] == 0):
            metrics[0][1] +=1
        elif(y[i][0] == 0 and y_hat[i][0] == 1):
            metrics[1][0] +=1
        else:
            metrics[1][1]+=1

    confusion_matrix = metrics
    return confusion_matrix 

In [13]:
def computeCM_framewise(y, y_hat):
    '''
    Compute confusion matrix to evaluate your model
    Inputs:
        y = labels 
        y_hat = predicted output
    Output:
        confusion matrix: confusion matrix
    '''
    metrics = np.array([[0,0], [0,0]])
    ### WRITE YOUR CODE HERE - 5 MARKS
    #k=0
    for i in range(0,len(y)):
        #k=0
        for j in range(0,len(y_hat[0][0])):
            #k+=1
        
            if(y[i][0] == 1 and y_hat[i][0][j] == 1):
                metrics[0][0]+=1
            elif(y[i][0] == 1 and y_hat[i][0][j] == 0):
                metrics[0][1] +=1
            elif(y[i][0] == 0 and y_hat[i][0][j] == 1):
                metrics[1][0] +=1
            else:
                metrics[1][1]+=1


    #print(k)
    return metrics

In [28]:
#silence vs audio training

x_silence = readDir('C:/Users/HP/Documents/test/less_data_no-audio', 16000) 
x_speech =  readDir('C:/Users/HP/Documents/test/less_data_audio', 16000)
X = np.concatenate((x_silence, x_speech))
y_silence = np.array([1]*len(x_silence))
y_speech = np.array([0]*len(x_speech))
Y = np.concatenate((y_silence, y_speech))

X_train,  y_train, X_test, y_test = splitData(X, Y, 0.2)
print(X_train.shape)
model1 = Classifier()
model1.neural(X_train,y_train)

(1590, 5)


  
  if sys.path[0] == '':


In [33]:
#speech vs music training


x_speech = readDir('C:/Users/HP/Documents/test/less_data_speech', 16000)
x_music = readDir('C:/Users/HP/Documents/test/less_data_music', 16000) 
X = np.concatenate((x_speech, x_music))
y_speech = np.array([1]*len(x_speech))
y_music= np.array([0]*len(x_music))
Y = np.concatenate((y_speech, y_music))

X_train,  y_train, X_test, y_test = splitData(X, Y, 0.2)

model2 = Classifier()
model2.neural(X_train,y_train)

  


In [29]:
#audio vs Silence

data = readDir('C:/Users/HP/Documents/test/unknown/', 16000) # This will work for single file in folder. For multiple files, WE can run this in a loop for every unknown audio file in folder

temp=np.transpose(data)
pred=temp[np.newaxis,...]
p=model1.predict_framewise(pred)
p=np.transpose(p[0])
p.shape  

(313, 2)

In [30]:
pred[0]

array([[68.73458862, 77.37579346, 87.50971222, ..., 92.8523407 ,
        91.17234802, 84.36462402],
       [15.05480957, 15.52451611, 17.10684586, ..., 20.72632599,
        17.13963127, 11.12570572],
       [18.00509453, 23.82581329, 26.76900101, ..., 36.16954803,
        26.89697838, 25.83516312],
       [19.87253952, 13.88354874, 19.30182266, ..., 10.70814991,
         9.18837929,  3.19696569],
       [21.02840424, 18.34971619, 18.0487957 , ...,  1.19154942,
         1.6587224 ,  0.57066733]])

In [31]:
#Time label prediction

k=0
j=0
sil=np.array([[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0]])
sp=np.array([[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0],[0.0,0.0]])

def check(x):
    sum=0
    for i in range(0,len(x)):
        sum=sum+x[i][0]
    if(sum>=10):
        return 1
    else:
        return 0
    
    
    
prev=1

for i in range(0,len(p)-17):
    x=p[i:i+16]
    a=check(x)
    #print(a)
    if(a==1):
        if(prev==1):
            sil[j][1]=librosa.frames_to_time(i+22, sr=16000, hop_length=512, n_fft=1024)
        else:
            temp=i
            while(p[i][0]!=1):
                i+=1
            sp[k][1]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sil[j][0]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sil[j][1]=librosa.frames_to_time(temp+22, sr=16000, hop_length=512, n_fft=1024)
            k+=1
        
    
    else:
        if(prev==1):
            temp=i
            while(p[i][1]!=1):
                i+=1
            sil[j][1]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sp[k][0]=librosa.frames_to_time(i, sr=16000, hop_length=512, n_fft=1024)
            sp[k][1]=librosa.frames_to_time(temp+22, sr=16000, hop_length=512, n_fft=1024)
            j+=1
        
        else:
            sp[k][1]=librosa.frames_to_time(i+22, sr=16000, hop_length=512, n_fft=1024)
    prev=a      

    
sp  

array([[ 0.032, 10.176],
       [ 0.   ,  0.   ],
       [ 0.   ,  0.   ],
       [ 0.   ,  0.   ],
       [ 0.   ,  0.   ],
       [ 0.   ,  0.   ]])

In [32]:
#Extracting audio portions

lis=[]
for i in range(len(sp)):
    if(sp[i][0]==0 and sp[i][1]==0):
        continue
    else:
        lis.append(np.array([sp[i][0],sp[i][1]]))
sp=np.array(lis)


lis=[]
for i in range(0,len(sp)):
    start= librosa.time_to_frames(sp[i][0], sr=16000, hop_length=512, n_fft=1024)
    end = librosa.time_to_frames(sp[i][1], sr=16000, hop_length=512, n_fft=1024)
    #print(start,end)
    lis.append(np.array(data[start:end]))

In [26]:
#Frame labels

frame_pred= predict_framewise(model2,lis)
agg=predict_aggregate(frame_pred)  

for i in range(len(sp)):
    print(sp[i][0],sp[i][1], 'speech' if (agg[i][0]==1) else  'music')

0.032 10.176 music


  


In [34]:
np.save('C:/Users/HP/Documents/test/backprop_model1/w1.npy',model1.w1)

In [35]:
np.save('C:/Users/HP/Documents/test/backprop_model1/w2.npy',model1.w2)
np.save('C:/Users/HP/Documents/test/backprop_model1/w3.npy',model1.w3)

In [36]:
np.save('C:/Users/HP/Documents/test/backprop_model2/w1.npy',model2.w1)
np.save('C:/Users/HP/Documents/test/backprop_model2/w2.npy',model2.w2)
np.save('C:/Users/HP/Documents/test/backprop_model2/w3.npy',model2.w3)


In [None]:
2