# Music_speech_classifier_with_linear_network_

#### 1. Music/speech classification. Try various efforts with linear feed-forward network. Achieve an accuracy above 70%.

The number of network layers = 4 

Tanh/LogSoftmax, NLLLoss

learning rate = 0.01

SGD

Val accuracy = 81%

In [0]:
import torch
torch.manual_seed(0)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.figsize']=(15,3.4)
import IPython.display as ipd
import librosa.display
import torch.nn as nn

In [0]:
mpl.rcParams['figure.figsize'] = (14, 3.4)

In [3]:
from google.colab import drive

drive.mount('/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
import os

class_names = ['music', 'speech']
data_dir = {'music' : '/gdrive/My Drive/SGU_DeepLearning/music_speech/music_speech/music_wav', 
            'speech': '/gdrive/My Drive/SGU_DeepLearning/music_speech/music_speech/speech_wav'}

wav_files = {cls_name: [] for cls_name in class_names}
for cls_name in class_names:
    folder = data_dir[cls_name]
    filelist = os.listdir(folder)
    for filename in filelist:
        if filename[-4:] == '.wav':
            wav_files[cls_name].append(os.path.join(folder, filename))

In [0]:
np.random.seed(1)

file_list = {'train': [], 'val': []}
for class_id, c in enumerate(class_names):
    n_data = len(wav_files[c])
    rindx = np.random.permutation(n_data)
    n_validation = int(0.25*n_data)
    v_indx = rindx[:n_validation]
    t_indx = rindx[n_validation:]
    file_list['train'] += [ (wav_files[c][k], class_id) for k in t_indx ] 
    file_list['val'] += [ (wav_files[c][k], class_id) for k in v_indx ] 

In [0]:
class MSDataset(torch.utils.data.Dataset):
    """ Music/Speech Classification 
        filelist: [(file_path, class_id)]
        sample_time: time duration to sample from .wav file
                     the sample is extracted somewhere in the middle of the whole sequence
                     similar to data augmentation
                     
         Validation dataset: the first segment of the sequence is used.
                             Another option is to apply several segments and accumulate multiple inferences
    """
    def __init__(self, filelist, sample_sec=1., is_train=True):
        self.filelist = filelist
        self.time_duration = sample_sec
        self.is_train = is_train
        
        _, sf = self.load(filelist[0][0])
        self.sf = sf
        self.n_features = int(self.time_duration * sf)
        
    def __len__(self):
        return len(self.filelist)
    
    def __getitem__(self, i):
        # 1. load the file
        # 2. sample a segment of the length from the whole seq
        # 3. return segment, id
        audio_file, class_id = self.filelist[i]
        x, sf = librosa.load(audio_file)

        if self.is_train:
            k = np.random.randint(low=0, high=x.shape[0]-self.n_features) # choose the start index
        else:
            k = 0
        
        x = torch.from_numpy(x[k:k+self.n_features])
        
        # print('MSDataset(): ', x.shape)
        assert x.shape[0] == self.n_features
        
        return x, class_id
    
    def load(self, audio_file):
        return librosa.load(audio_file)

In [7]:
sample_sec = 1.5
batch_size = 2

data_loader = {tv: 
                   torch.utils.data.DataLoader(
                       MSDataset(file_list[tv], sample_sec=sample_sec, is_train=tv=='train'),
                       batch_size=batch_size,
                       shuffle=True,
                   )
               for tv in ['train', 'val']}
#
data_loader

{'train': <torch.utils.data.dataloader.DataLoader at 0x7f6916611400>,
 'val': <torch.utils.data.dataloader.DataLoader at 0x7f6916e31be0>}

In [0]:
def myNetworkModel(in_features, n_targets):
    net = nn.Sequential(
        nn.Linear(in_features=in_features, out_features=1024),
        nn.Tanh(),
        nn.Linear(in_features=1024, out_features=256),
        nn.Tanh(),
        nn.Linear(in_features=256, out_features=256),
        nn.Tanh(),
        nn.Linear(in_features=256, out_features=n_targets),
        nn.LogSoftmax(dim=1)
        )

    pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print(f'myNetworkModel()  in: {in_features}  out: {n_targets}', 'n_params (K): ', pytorch_total_params/1000)
    
    return net

In [9]:
n_features = data_loader['train'].dataset.n_features
n_targets = 2

model = myNetworkModel(n_features, n_targets)

myNetworkModel()  in: 33075  out: 2 n_params (K):  34198.53


In [0]:
def training_loop(n_epochs, optim, model, loss_fn, dl_train, dl_val, hist=None):
    if hist is not None:
        pass
    else:
        hist = {'tloss': [], 'tacc': [], 'vloss': [], 'vacc': []}
    best_acc = 0
    for epoch in range(1, n_epochs+1):
        tr_loss, tr_acc = 0., 0.
        n_data = 0
        for im_batch, label_batch in dl_train: # minibatch
            ypred = model(im_batch)
            loss_train = loss_fn(ypred, label_batch)
        
            optim.zero_grad()
            loss_train.backward()
            optim.step()
   
            # accumulate correct prediction
            tr_acc  += (torch.argmax(ypred.detach(), dim=1) == label_batch).sum().item() # number of correct predictions
            tr_loss += loss_train.item() * im_batch.shape[0]
            n_data  += im_batch.shape[0]
        #
        # statistics
        tr_loss /= n_data
        tr_acc  /= n_data
        #
        val_loss, val_acc = performance(model, loss_fn, dl_val)
        
        if epoch <= 5 or epoch % 1000 == 0 or epoch == n_epochs:
             print(f'Epoch {epoch}, tloss {tr_loss:.2f} t_acc: {tr_acc:.2f}  vloss {val_loss:.2f}  v_acc: {val_acc:.2f}')
        else:
            if best_acc < val_acc:
                best_acc = val_acc
                print(' best val accuracy updated: ', best_acc)
        #
        # record for history return
        hist['tloss'].append(tr_loss)
        hist['vloss'].append(val_loss) 
        hist['tacc'].append(tr_acc)
        hist['vacc'].append(val_acc)
        
    print ('finished training_loop().')
    return hist
#

def performance(model, loss_fn, dataloader):
    model.eval()
    with torch.no_grad():
        loss, acc, n = 0., 0., 0.
        for x, y in dataloader:
            ypred = model(x)
            loss += loss_fn(ypred, y).item() * len(y)
            p = torch.argmax(ypred, dim=1)
            acc += (p == y).sum().item()
            n += len(y)
        #
    loss /= n
    acc /= n
    return loss, acc
#
def plot_history(history):
    fig, axes = plt.subplots(1,2, figsize=(16,6))
    axes[0].set_title('Loss'); 
    axes[0].plot(history['tloss'], label='train'); axes[0].plot(history['vloss'], label='val')
    axes[0].legend()
    max_vacc = max(history['vacc'])
    axes[1].set_title(f'Acc. vbest: {max_vacc:.2f}')
    axes[1].plot(history['tacc'], label='train'); axes[1].plot(history['vacc'], label='val')
    axes[1].legend()

In [11]:
n_features = data_loader['train'].dataset.n_features
n_targets = 2
model = myNetworkModel(n_features, n_targets)

# optim
learning_rate = 0.01
#optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# loss
criterion = nn.NLLLoss()

# history
history = None

myNetworkModel()  in: 33075  out: 2 n_params (K):  34198.53


In [0]:
history = training_loop(500, optimizer, model, criterion, data_loader['train'], data_loader['val'], history)
plot_history(history)

Epoch 1, tloss 0.69 t_acc: 0.51  vloss 0.70  v_acc: 0.50
Epoch 2, tloss 0.70 t_acc: 0.49  vloss 0.69  v_acc: 0.50
Epoch 3, tloss 0.69 t_acc: 0.48  vloss 0.69  v_acc: 0.66
Epoch 4, tloss 0.69 t_acc: 0.55  vloss 0.69  v_acc: 0.62
Epoch 5, tloss 0.69 t_acc: 0.53  vloss 0.69  v_acc: 0.66
 best val accuracy updated:  0.65625
 best val accuracy updated:  0.71875
 best val accuracy updated:  0.75
 best val accuracy updated:  0.78125
 best val accuracy updated:  0.8125
