# EE603 Coding Assignment
- Use python3
- Submit your "rendered" ipynb, i.e., with outputs of codes (plots and printed values) visible below
- Do not change the return variables, as the evaluation is done by test cases based on the variables specified. Only add your code at "### WRITE YOUR CODE HERE"
- Use only numpy and librosa library for computing and signal processing, no other package allowed
- If you are using your mobile phone, you can use colab.research.google.com for coding
- Do not define multiple functions using same name. We will be using eval.py to auto evaluate your codes. Please check with sample test cases before submitting. We will share the evaluation test cases with you after the submission deadline.
- While submitting this file, change file name from 'YourRollNo.ipynb' to your actual roll no (Eg. 18204279.ipynb)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa

In [2]:
from glob import glob
def readDir(dirname, Fs = 16000):
    
    '''
    Each audio clip should be upto 10s long; split larger audio files into many clips (non-overlapping) 

    Use load_audio(file) 
    
    Inputs: 
        dirname: (str) directory name
        Fs: (int) sampling rate
    Output: 
        x: np arrays of shape (Nclips, Nsamples) Nsamples correspond to 10s length. Use zero-padding for shorter clips.
    '''  


    ### WRITE YOUR CODE HERE - 5 MARKS
    length_interval = 10*Fs
    x = np.empty(shape = (1, length_interval))
    for name in glob(dirname + "/*.wav"):
      audio_file = load_audio(name, Fs)
      length = len(audio_file)
      to_append = np.zeros(length_interval-length%length_interval)
      audio_file_new = np.concatenate((audio_file, to_append))
      length_new = len(audio_file_new)
      y = np.split(audio_file_new, int(length_new/length_interval))
      y = np.array(y)
      x = np.concatenate((x, y))

    x = np.delete(x, (0), axis=0)
    return x 

In [3]:
def load_audio(filename, Fs = 16000):
    '''
    Inputs: 
        filename: (str) filename
        Fs: (int) sampling rate
    Output: 
        x: 1D np array 
    '''
    

    ### WRITE YOUR CODE HERE - 2 MARKS
    if(filename.endswith(".wav")==0):
      filename = +".wav"
    x, sr = librosa.load(path = filename, sr = Fs)
    x = np.array(x)
    return x

In [4]:
def splitData(X, t, testFraction=0.2, randomize = False):
    """
    Split the data randomly into training and test sets
    Use numpy functions only
    Inputs:
        X: (np array of len Nclips) input feature vectors
        t: (np array of len Nclips) targets; one hot vectors
        testFraction: (float) Nclips_test = testFraction * Nclips
    Outputs:
        X_train: training set
        X_test: test set
        t_train: training labels
        t_test: test labels
    """


    ### WRITE YOUR CODE HERE - 5 MARKS
    Nsamples = t.shape[0]
    Ntest = int(testFraction*Nsamples)
    Ntrain = Nsamples - Ntest
    p = np.random.permutation(Nsamples)
    shuffled_X = X[p, :]
    shuffled_t = t[p, :]
    X_train = shuffled_X[:Ntrain, :]
    t_train = shuffled_t[:Ntrain, :]
    X_test = shuffled_X[Ntrain:, :]
    t_test = shuffled_t[Ntrain:, :]

    
    return X_train, t_train, X_test, t_test

In [5]:
def audio2mfcc(x, n_mfcc = 20, Fs = 16000):
    
    '''
    Compute Mel-frequency cepstral coefficients (MFCCs)
    Inputs:
        x: np array of shape (Nclips,)
        Fs: (int) sampling rate
        n_mfcc: (int) number of MFCC features
    Output:
        X: (np array) MFCC sequence
    '''

    ### WRITE YOUR CODE HERE - 3 MARKS
    x = np.array(x)
    y = librosa.feature.mfcc(y = x[0], n_mfcc = n_mfcc, sr = Fs)
    X = np.array([y])
    for i in range(1, (x.shape)[0]):
      y = librosa.feature.mfcc(y = x[i], n_mfcc = n_mfcc, sr = Fs)
      y_numpy = np.array([y])
      X = np.concatenate((X, y_numpy))
      
    return X 

In [6]:
class Classifier: 
    '''
    Create a linear classifier to classify each frame
    '''
    def __init__(self):
        self.W = np.array([])  # define model parameters here

    def sigmoid(self, x):
      return 1/(1+np.exp(-x))
    
        
    def train(self,x_train, y_train):
        '''
        Train the linear classifier
        Inputs:
            x_train: training set
            y_train: training labels
        Output:
            None
        '''

        ### WRITE YOUR CODE HERE - 0 MARKS
        self.W = np.random.randn(x_train.shape[1])
        learning_rate = 0.0001
        iterations = 100
        for i in range(iterations):
          delta = 0
          for i in range(x_train.shape[0]):
            pred = self.sigmoid(np.dot(x_train[i].T, self.W))
            actual = y_train[i, 0]*np.ones(pred.shape[0])
            delta = delta + np.dot(x_train[i], pred - actual)
        
          self.W = self.W - learning_rate*delta
        
        return 
    
    def save_model(self, save_path):
        '''
        Save the trained model on local disk
        Input:
            save_path: location at which model is to be saved
        Output:
            None
        '''
        
        ### WRITE YOUR CODE HERE - 0 MARKS
        np.save(save_path+"/weights", self.W)

        return
    
    def load_model(self, load_path):
        '''
        Save the trained model on local disk
        Input:
            load_path: location from which model is to be loaded
        Output:
            None
        '''
        
        ### WRITE YOUR CODE HERE - 0 MARKS
        self.W = np.load(load_path+"weights.npy")
        return


    
    def predict_framewise(self,x_test):
        '''
        Framewise classification (speech or music)
        Input:
            x_test: test set
        Output:
            y_pred_framewise = framewise prediction
        '''
        

        ### WRITE YOUR CODE HERE - 5 MARKS
        if(self.W.size==0):
          self.W = np.random.randn(x_test.shape[1])

        y_1 = np.array(np.sign(self.sigmoid(np.dot(x_test[0].T, self.W))))
        y_2 = np.array(1 - np.sign(self.sigmoid(np.dot(x_test[0].T, self.W))))
        y_pred_framewise = np.array([[y_1, y_2]])
        for i in range(1, x_test.shape[0]):
          pred_1 = np.array(np.sign(self.sigmoid(np.dot(x_test[i].T, self.W))))
          pred_2 = np.array(1 - np.sign(self.sigmoid(np.dot(x_test[i].T, self.W))))
          pred = np.array([[pred_1, pred_2]])
          y_pred_framewise = np.concatenate((y_pred_framewise, pred))
        return y_pred_framewise 
    
    def predict_aggregate(self,y_pred_framewise):
        '''
        Aggregate frames to give a single class label (music or speech) to the entire audio file
        Input:
            y_pred_framewise = framewise prediction
        Output:
            y_hat = frame aggregate (one-hot vectors)
        '''

        ### WRITE YOUR CODE HERE - 5 MARKS
        length = y_pred_framewise.shape[2]
        if(np.sum(y_pred_framewise[0][0]) > length/2):
          y_hat = np.array([[1, 0]])
        else:
          y_hat = np.array([[0, 1]])
        
        for i in range(1, y_pred_framewise.shape[0]):
          if(np.sum(y_pred_framewise[i][0])>length/2):
            y_hat = np.concatenate((y_hat, np.array([[1, 0]]) ))
          else:
            y_hat = np.concatenate((y_hat, np.array([[0, 1]]) ))

        return y_hat

In [7]:
def computeCM(y, y_hat):
    '''
    Compute confusion matrix to evaluate your model
    Inputs:
        y = labels 
        y_hat = predicted output
    Output:
        confusion matrix: confusion matrix
    '''

    ### WRITE YOUR CODE HERE - 5 MARKS
    confusion_matrix = np.zeros((2, 2))
    n = y.shape[0]
    for i in range(n):
      confusion_matrix[int(y[i, 0]), int(y_hat[i, 0])] = confusion_matrix[int(y[i, 0]), int(y_hat[i, 0])] + 1

    return confusion_matrix 

In [8]:
Fs = 16000

In [9]:
if __name__=="__main__":
    
    # Read audio
    x_music = readDir('music_wavs', Fs)    #change it as per your directory
    x_speech = readDir('speech_wavs', Fs)  #change it as per your directory
    X = np.concatenate((x_music, x_speech))
    
    # Create labels
    y_music = np.array([[1,0]]*len(x_music))
    y_speech = np.array([[0,1]]*len(x_speech))
    Y = np.concatenate((y_music, y_speech))
    
    
    X_train, y_train, X_test, y_test = splitData(X, Y)
    
    # TRAINING 
    x_train = audio2mfcc(X_train, Fs)    # x_train: (Nclips, N_mfcc, N_frames)
    model = Classifier()
    model.train(x_train, y_train)        # y_train: (Nclips, 2) -repeat it N_frames times inside the train
    
    # TESTING 
    x_test = audio2mfcc(X_test, Fs) 
    y_pred = model.predict_framewise(x_test)   # y_predict: (Nclips, 2, N_frames)
    y_hat = model.predict_aggregate(y_pred)    # y_hat: (Nclips, 2)
    
   # EVALUATION METRICS 
    confusion_matrix = computeCM(y_test, y_hat) 
    print(confusion_matrix) 

  if __name__ == '__main__':


[[0. 0.]
 [0. 4.]]
