<a href="https://colab.research.google.com/github/prithvijaunjale/Engage-AI-Recruiter/blob/master/engage_audio_model_cnn_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
! pip install torchaudio

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/e9/0a/40e53c686c2af65b2a4e818d11d9b76fa79178440caf99f3ceb2a32c3b04/torchaudio-0.5.1-cp36-cp36m-manylinux1_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 2.7MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.5.1


In [None]:
import pickle
import os
import io
import glob
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import OrderedDict

import shutil
from zipfile import ZipFile

from sklearn.metrics import mean_absolute_error

import torch
from torchvision import datasets, models, transforms
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
from torch.utils.data import Dataset
from torch import optim

import librosa
from librosa.display import specshow
import torchaudio

# Google Drive locations: the model and data folders hang off the project root.
project_dir = 'drive/My Drive/projects/engage_ai_recruiter/'
models_dir = project_dir + 'models/audio/'
data_dir = project_dir + 'data/'

In [None]:
! cp drive/My\ Drive/projects/engage_ai_recruiter/data/all_wav.zip all_wav.zip

In [None]:
# Unpack the copied zip; the empty extract dir keeps member paths relative,
# so the archive's contents land next to the notebook's working directory.
shutil.unpack_archive('all_wav.zip', '', 'zip')

In [None]:
# Sanity check: list the extracted folder and show how many entries it holds.
all_wav = os.listdir('all_wav')
len(all_wav)

8000

# Data

## DALI (NVIDIA Data Loading Library)

In [None]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
! pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100

Looking in indexes: https://pypi.org/simple, https://developer.download.nvidia.com/compute/redist
Collecting nvidia-dali-cuda100
[?25l  Downloading https://developer.download.nvidia.com/compute/redist/nvidia-dali-cuda100/nvidia_dali_cuda100-0.23.0-1396139-cp36-cp36m-manylinux1_x86_64.whl (264.6MB)
[K     |████████████████████████████████| 264.6MB 55kB/s 
Installing collected packages: nvidia-dali-cuda100
Successfully installed nvidia-dali-cuda100-0.23.0


In [None]:
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.pipeline import Pipeline

In [None]:
def get_melspecgram(y):
    """Render the dB-scaled mel spectrogram of a waveform as a PNG image.

    Args:
        y: waveform tensor handed to torchaudio's ``MelSpectrogram``
           (presumably a mono 16 kHz signal -- TODO confirm with callers).

    Returns:
        io.BytesIO: in-memory PNG of the plot, rewound to position 0.
    """
    to_mel = torchaudio.transforms.MelSpectrogram(n_fft=1024,
                                                  hop_length=256,
                                                  n_mels=40,
                                                  sample_rate=16000)
    # Fixed: the original applied the transform to the undefined name
    # `y_mono` instead of the function argument.
    mel_specgram = to_mel(y)
    # librosa operates on NumPy arrays, so leave torch explicitly.
    mel_specgram = librosa.power_to_db(mel_specgram.detach().cpu().numpy(),
                                       ref=np.max)

    # Draw on a dedicated figure and close it afterwards so repeated calls
    # neither paint over each other nor accumulate open figures.
    fig = plt.figure()
    specshow(mel_specgram, fmax=8000)
    buf = io.BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    plt.close(fig)
    buf.seek(0)
    return buf

In [None]:
class ExternalInputIterator(object):
    """Endless batch source of raw waveforms and label vectors.

    Each call to ``next()`` returns ``(waveforms, labels)``: two lists of
    ``batch_size`` NumPy arrays.  The cursor wraps around at the end of the
    selected rows, so the iterator never raises ``StopIteration``; the
    consumer decides how many samples make up an epoch (see ``size``).

    Args:
        batch_size: number of samples returned per call.
        csv_file: path or buffer readable by ``pandas.read_csv``; column 0 holds
            the wav file name, the remaining columns hold the numeric labels.
        root_dir: directory that contains the wav files.
        indices: positional row indices of ``csv_file`` to serve.
        sample_rate: sampling rate used to convert ``max_duration`` to samples.
        max_duration: clips are truncated to this many seconds.  Shorter clips
            are returned as-is (no padding).
    """

    def __init__(self, batch_size, csv_file, root_dir, indices,
                 sample_rate=16000, max_duration=15):
        self.root_dir = root_dir
        self.batch_size = batch_size
        self.wav_df = pd.read_csv(csv_file)
        self.wav_df = self.wav_df.iloc[indices, :]
        self.max_samples = int(sample_rate * max_duration)
        # Initialise the cursor here as well, so next() works even when the
        # caller never went through iter() first.
        self.i = 0
        self.n = len(self.wav_df)

    def __iter__(self):
        # Rewind to the first selected row.
        self.i = 0
        self.n = len(self.wav_df)
        return self

    def __next__(self):
        b_wavs = []
        b_labels = []
        for _ in range(self.batch_size):
            wav = os.path.join(self.root_dir, self.wav_df.iloc[self.i, 0])
            y, sr = torchaudio.load(wav)
            # squeeze(0) only drops a leading axis of size 1, i.e. this
            # assumes single-channel files -- TODO confirm for the dataset.
            y = y.squeeze(0).numpy()
            y = y[:self.max_samples]
            b_wavs.append(y)

            labels = self.wav_df.iloc[self.i, 1:].values.astype(np.float32)
            b_labels.append(labels)

            # Wrap around so the source can be polled indefinitely.
            self.i = (self.i + 1) % self.n
        return (b_wavs, b_labels)

    @property
    def size(self,):
        """Number of rows (samples) served by this iterator."""
        return len(self.wav_df)

    # Python-2 style alias kept for callers that invoke .next() directly.
    next = __next__

In [None]:
class MelSpectrogramPipeline(Pipeline):
    """DALI pipeline: externally fed waveforms -> dB-scaled mel spectrograms.

    Args:
        external_data: iterable whose iterator yields ``(waveforms, labels)``
            batches; it is polled once per pipeline iteration.
        device: backend passed to every operator ('gpu' moves the waveforms
            to the GPU before the spectrogram is computed).
        batch_size: samples per pipeline iteration.
        specgram_dict: dict providing 'n_fft', 'hop_length', 'n_mels', 'sr'
            and 'f_max'.
        num_threads: forwarded to ``Pipeline``.
        device_id: forwarded to ``Pipeline``.
    """

    def __init__(self, 
                 external_data,
                 device, 
                 batch_size, 
                 specgram_dict, 
                 num_threads=1, 
                 device_id=0):
        super(MelSpectrogramPipeline, self).__init__(batch_size, num_threads, device_id)

        self.device = device
        self.data_iterator = iter(external_data)
        self.specgram_dict = specgram_dict
        
        # input
        self.input_wav = ops.ExternalSource()
        self.input_label = ops.ExternalSource()

        # audio
        self.spectrogram = ops.Spectrogram(device=self.device,
                                           nfft=self.specgram_dict['n_fft'],
                                           window_length=self.specgram_dict['n_fft'],
                                           window_step=self.specgram_dict['hop_length'])
        self.mel_fbank = ops.MelFilterBank(device=self.device,
                                           sample_rate=self.specgram_dict['sr'],
                                           nfilter = self.specgram_dict['n_mels'],
                                           freq_high = self.specgram_dict['f_max'])
        self.dB = ops.ToDecibels(device=self.device,
                                 multiplier = 10.0,
                                 cutoff_db = -80)
        
        # image (NOTE: these operators are created but never used by
        # define_graph below)
        self.decode = ops.ImageDecoder(device=self.device)
        self.res = ops.Resize(device=self.device, resize_x=224, resize_y=224)
        self.norm = ops.CropMirrorNormalize(device = self.device,
                                            mean=[0.485, 0.456, 0.406], 
                                            std=[0.229, 0.224, 0.225])
        
    def define_graph(self):
        """Wire waveform -> spectrogram -> mel filter bank -> decibels."""
        # Keep the ExternalSource outputs themselves on self: they are the
        # handles iter_setup feeds.  The GPU copy stays a local.
        self.y = self.input_wav()
        self.labels = self.input_label()
        audio = self.y.gpu() if self.device == 'gpu' else self.y

        # audio transforms
        specgram = self.spectrogram(audio)
        mel_specgram = self.mel_fbank(specgram)
        mel_specgram_db = self.dB(mel_specgram)
        return (mel_specgram_db, self.labels)

    def iter_setup(self):
        """Pull the next batch from the external iterator and feed both inputs."""
        # Built-in next() works with any iterator, not only those that also
        # expose a Python-2 style .next() alias.
        y, labels = next(self.data_iterator)
        self.feed_input(self.y, y)
        self.feed_input(self.labels, labels)

In [None]:
# Hold-out split over row positions: shuffle reproducibly, then the first
# `validation_split` share of the positions becomes the validation set.
validation_split = 0.2
shuffle_dataset = True
random_seed = 42

wav_df = pd.read_csv(data_dir + 'WAV_OCEANI.csv')
dataset_size = len(wav_df)
split = int(np.floor(validation_split * dataset_size))

indices = list(range(dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)

val_indices = indices[:split]
train_indices = indices[split:]

In [None]:
# Both sources read the same label sheet and wav folder; only the row subset differs.
eii_kwargs = dict(batch_size=32,
                  csv_file=data_dir + 'WAV_OCEANI.csv',
                  root_dir='all_wav')

train_eii = ExternalInputIterator(indices=train_indices, **eii_kwargs)
val_eii = ExternalInputIterator(indices=val_indices, **eii_kwargs)

In [None]:
# build pipeline, initialize iterator

from nvidia.dali.plugin.pytorch import DALIGenericIterator

specgram_dict = {'n_fft': 1024,
                 'hop_length': 256,
                 'n_mels': 40,
                 'sr': 16000,
                 'f_max': 8000,
                 'duration': 15}

train_pipe = MelSpectrogramPipeline(external_data=train_eii, 
                                    device='gpu',
                                    specgram_dict=specgram_dict,
                                    batch_size=32,
                                    device_id=0)
train_pipe.build()

# Fixed: the validation pipeline must read from val_eii.  It was built on
# train_eii, so validation scored training samples and both pipelines
# advanced one shared cursor.
val_pipe = MelSpectrogramPipeline(external_data=val_eii,
                                  device='gpu',
                                  specgram_dict=specgram_dict, 
                                  batch_size=32, 
                                  device_id=0)
val_pipe.build()

train_iterator = DALIGenericIterator(train_pipe, 
                                     ['mel_specgram_db', 'labels'], 
                                     size=train_eii.size)
val_iterator = DALIGenericIterator(val_pipe, 
                                   ['mel_specgram_db', 'labels'], 
                                   size=val_eii.size)



In [None]:
# Peek at a single training batch to check the spectrogram tensor shape,
# then rewind the iterator.
for item in train_iterator:
    specgram_batch = item[0]['mel_specgram_db']
    print(specgram_batch.shape)
    break
train_iterator.reset()



torch.Size([32, 40, 938])


In [None]:
# Count the batches of each iterator once, up front: measuring the length
# needs a full pass over the data, so it should not be repeated every epoch.
# Counting with a generator (instead of len(list(...))) avoids keeping every
# batch referenced while the pass runs.

train_iterator.reset()
val_iterator.reset()

len_train_iterator = sum(1 for _ in train_iterator)
len_val_iterator = sum(1 for _ in val_iterator)

print(len_train_iterator, len_val_iterator)

# reset iterators before the start of every epoch
train_iterator.reset()
val_iterator.reset()



199 50


# CNN - LSTM

In [None]:
# The model and all batches are placed on the CUDA device; report which GPU it is.
device = torch.device('cuda')
gpu_name = torch.cuda.get_device_name()
print(gpu_name)

Tesla K80


In [None]:
"""
input dimensions
[batch_size, 1, 957, 40]
975 - timesteps (in milliseconds | 15 seconds)
40 - embeddings (mel freq bins | n_mels)

conv1 out - [-1, 32, 975, 1]
pool1 out - [-1, 32, 487, 1]

conv2 out - [-1, 64, 487, 1]
pool2 out - [-1, 64, 243, 1]

conv3 out - [-1, 128, 243, 1]
pool3 out - [-1, 128, 121, 1]
"""

'\ninput dimensions\n[batch_size, 1, 957, 40]\n975 - timesteps (in milliseconds | 15 seconds)\n40 - embeddings (mel freq bins | n_mels)\n\nconv1 out - [-1, 32, 975, 1]\npool1 out - [-1, 32, 487, 1]\n\nconv2 out - [-1, 64, 487, 1]\npool2 out - [-1, 64, 243, 1]\n\nconv3 out - [-1, 128, 243, 1]\npool3 out - [-1, 128, 121, 1]\n'

In [None]:
class CnnLstmModel(nn.Module):
    """CNN + bidirectional LSTM regressor with six independent sigmoid heads.

    Input:  float tensor ``[batch, 1, time, 40]``.  ``conv1`` spans the full
            width of 40, so the width axis is 1 from the first layer on.
    Output: list of six tensors ``[batch, 1]`` with values in (0, 1), in the
            order ``o, c, e, a, n, i``.
    """

    def __init__(self):
        super().__init__()
        # conv1 mixes all 40 frequency bins per frame; conv2/conv3 convolve
        # along time only.  No padding, so each 3x1 conv shortens time by 2.
        self.conv1 = nn.Conv2d(in_channels=1, 
                               out_channels=32, 
                               kernel_size=(1, 40))
        self.conv2 = nn.Conv2d(in_channels=32, 
                               out_channels=64, 
                               kernel_size=(3, 1))
        self.conv3 = nn.Conv2d(in_channels=64, 
                               out_channels=128, 
                               kernel_size=(3, 1))
        # Halves the time axis after every conv layer.
        self.maxpool = nn.MaxPool2d(kernel_size=(2, 1), stride=2)

        # 2 directions x 64 hidden units = 128 features per time step.
        self.lstm = nn.LSTM(input_size=128, 
                            hidden_size=64, 
                            batch_first=True,
                            bidirectional=True)
        
        self.fc1 = nn.Linear(128, 16)
        # One scalar head per trait; attribute names are kept so previously
        # saved state_dicts still load.
        self.o_output = nn.Linear(16, 1)
        self.c_output = nn.Linear(16, 1)
        self.e_output = nn.Linear(16, 1)
        self.a_output = nn.Linear(16, 1)
        self.n_output = nn.Linear(16, 1)
        self.i_output = nn.Linear(16, 1)

        self.dropout = nn.Dropout(p=0.25)

    def forward(self, x):
        """Return the six per-trait predictions for a batch of spectrograms."""
        x = self.maxpool(F.relu(self.conv1(x)))
        x = self.maxpool(F.relu(self.conv2(x)))
        pool_out = self.maxpool(F.relu(self.conv3(x)))

        # [batch, 128, time', 1] -> [batch, time', 128].
        # Squeeze ONLY the trailing width axis: a bare .squeeze() would also
        # drop the batch axis for batch size 1 (or the time axis when it has
        # shrunk to 1) and make the permute below fail.
        lstm_inp = pool_out.squeeze(-1).permute(0, 2, 1)
        lstm_out, _ = self.lstm(lstm_inp)
        # NOTE(review): for a bidirectional LSTM the backward half of the last
        # time step has seen a single frame only; kept as-is so that existing
        # checkpoints keep producing the same outputs.
        x = F.relu(lstm_out[:, -1, :])

        x = F.relu(self.fc1(x))
        x = self.dropout(x)

        heads = (self.o_output, self.c_output, self.e_output,
                 self.a_output, self.n_output, self.i_output)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return [torch.sigmoid(head(x)) for head in heads]

In [None]:
# Build the network on the selected device; the bare name echoes its layer summary.
model = CnnLstmModel().to(device)
model

CnnLstmModel(
  (conv1): Conv2d(1, 32, kernel_size=(1, 40), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 1), stride=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 1), stride=(1, 1))
  (maxpool): MaxPool2d(kernel_size=(2, 1), stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=16, bias=True)
  (o_output): Linear(in_features=16, out_features=1, bias=True)
  (c_output): Linear(in_features=16, out_features=1, bias=True)
  (e_output): Linear(in_features=16, out_features=1, bias=True)
  (a_output): Linear(in_features=16, out_features=1, bias=True)
  (n_output): Linear(in_features=16, out_features=1, bias=True)
  (i_output): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [None]:
# get out shape
# sample = next(iter(train_iterator))
# b_input = sample[0]['mel_specgram_db'].unsqueeze(1).to(device)
# b_input = b_input.permute(0, 1, 3, 2)
# print(b_input.shape)
# lstm_out, (h_n, c_n) = model.forward(b_input)
# h_n.shape
# out = out.squeeze().permute(0, 2, 1)
# out.shape

In [None]:
# Run configuration
epochs = 10
early_stopping_limit = epochs
save_best_only = True
model_name = 'cnn_melspecgram_mse'

# The training loop averages these two criteria; Adam updates every model parameter.
mse_criterion, mae_criterion = nn.MSELoss(), nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def _to_model_inputs(batch):
    """Turn one DALI batch into (spectrograms [B, 1, time, mels], labels) on `device`."""
    b_input = batch[0]['mel_specgram_db'].unsqueeze(1).to(device)
    b_labels = batch[0]['labels'].to(device)
    return b_input.permute(0, 1, 3, 2), b_labels


def _mse_mae_loss(outputs, targets):
    """Return ((MSE + MAE) / 2, predictions [B, n_traits]) for one batch.

    `outputs` is the model's list of per-trait tensors [B, 1]; column k of
    `targets` is assumed to hold the label for head k (o, c, e, a, n, i) --
    TODO confirm against the column order of the label CSV.

    Fixed: the original indexed `b_labels[idx]`, i.e. the label vector of
    *sample* idx, and broadcast it against every prediction of trait idx.
    Averaging over the concatenated [B, n_traits] matrix equals the mean of
    the per-trait means the original intended.
    """
    preds = torch.cat(outputs, dim=1)
    if preds.shape != targets.shape:
        raise ValueError(f'prediction shape {tuple(preds.shape)} does not match '
                         f'label shape {tuple(targets.shape)}')
    loss = (mse_criterion(preds, targets) + mae_criterion(preds, targets)) / 2
    return loss, preds


training_losses, val_losses = [], []
min_val_loss = 9999
early_stopping_cnt = 0  # epochs since the validation loss last improved
train_iterator.reset()
val_iterator.reset()

for epoch in range(epochs):
    # Training
    model.train()
    train_loss, n_train_batches = 0.0, 0

    for batch in tqdm(train_iterator, total=len_train_iterator, desc='Epoch ' + str(epoch)):
        b_input, b_labels = _to_model_inputs(batch)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass + loss
        mse_mae_avg, _ = _mse_mae_loss(model(b_input), b_labels)

        # backward pass
        mse_mae_avg.backward()

        # update weights
        optimizer.step()

        train_loss += mse_mae_avg.item()
        n_train_batches += 1

    # Average over the batches actually seen this epoch.
    avg_train_loss = train_loss / max(n_train_batches, 1)
    training_losses.append(avg_train_loss)
    print('Avg Training MSE_MAE:', avg_train_loss)

    # Validation
    model.eval()
    val_loss, n_val_batches = 0.0, 0
    pred, true = [], []

    with torch.no_grad():
        for batch in val_iterator:
            b_input, b_labels = _to_model_inputs(batch)
            mse_mae_avg, preds = _mse_mae_loss(model(b_input), b_labels)

            val_loss += mse_mae_avg.item()
            n_val_batches += 1

            # Flatten predictions and labels in the SAME (sample-major) order
            # so the MAE below compares matching elements.
            pred.extend(preds.cpu().numpy().ravel())
            true.extend(b_labels.cpu().numpy().ravel())

    avg_val_loss = val_loss / max(n_val_batches, 1)
    val_losses.append(avg_val_loss)

    print('Avg Validation MSE_MAE:', avg_val_loss)
    print('Validation MAE:', mean_absolute_error(true, pred))

    # reset iterator after every epoch
    train_iterator.reset()
    val_iterator.reset()

    if avg_val_loss < min_val_loss:
        if save_best_only:
            save_dir = os.path.join(models_dir, model_name)
            os.makedirs(save_dir, exist_ok=True)
            torch.save(model.state_dict(), os.path.join(save_dir, model_name + '.pt'))
            print(f'--- Model Saved. Val loss: {min_val_loss} -> {avg_val_loss}')
        min_val_loss = avg_val_loss
        early_stopping_cnt = 0
    else:
        # Count only epochs WITHOUT improvement; the original also counted
        # the improving epoch itself.
        early_stopping_cnt += 1
        if early_stopping_cnt >= early_stopping_limit:
            print('\n--- Stopped Early.')
            break