<a href="https://colab.research.google.com/github/prithvijaunjale/Engage-AI-Recruiter/blob/master/engage_audio_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
! pip install torchaudio

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/e9/0a/40e53c686c2af65b2a4e818d11d9b76fa79178440caf99f3ceb2a32c3b04/torchaudio-0.5.1-cp36-cp36m-manylinux1_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 2.6MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.5.1


In [None]:
import pickle
import os
import io
import glob
import numpy as np
import pandas as pd
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import OrderedDict

import shutil
from zipfile import ZipFile

from sklearn.metrics import mean_absolute_error

import torch
from torchvision import datasets, models, transforms
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler
from torch.utils.data import Dataset
from torch import optim

import librosa
from librosa.display import specshow
import torchaudio

# Project folders (relative paths under the mounted drive).
project_dir = 'drive/My Drive/projects/engage_ai_recruiter/'
# Sub-folders are derived from the project root so the root is spelled out only once.
models_dir = project_dir + 'models/audio/'
data_dir = project_dir + 'data/'

In [None]:
! cp drive/My\ Drive/projects/engage_ai_recruiter/data/all_wav.zip all_wav.zip

In [None]:
shutil.unpack_archive('all_wav.zip', '', 'zip')

In [None]:
all_wav = os.listdir('all_wav')
len(all_wav)

8000

# Data Preprocessing

## Dataset Creation - WAV & Annotation

In [None]:
# Password handed to the validation-annotation archive.
# NOTE(review): hard-coded secret -- consider reading it from an environment variable instead.
pwd = b'zeAzLQN7DnSIexQukc9W'

# Unpack both annotation archives into the current working directory: validation first,
# then training. Only the validation archive is given a password.
for archive_name, archive_pwd in (('Chalearn Data/val-annotation-e.zip', pwd),
                                  ('Chalearn Data/train-annotation.zip', None)):
    with ZipFile(data_dir + archive_name) as zf:
        zf.extractall(pwd=archive_pwd)

In [None]:
def _load_annotations(path):
    """Unpickle one annotation file and return its contents."""
    # NOTE(review): pickle.load can execute code embedded in the file -- only use on trusted data.
    # encoding='latin1' is kept from the original call (presumably Python 2 pickles; confirm).
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='latin1')


annotation_training = _load_annotations('annotation_training.pkl')
annotation_validation = _load_annotations('annotation_validation.pkl')

In [None]:
# Merge the training and validation annotations into one {trait: {video_file: score}} mapping.
# Each per-trait dict is copied explicitly: a plain `.copy()` of the outer dict shares the inner
# dicts, so the validation scores would also be written into `annotation_training`.
annotations = {trait: dict(scores) for trait, scores in annotation_training.items()}

for trait, validation_scores in annotation_validation.items():
    # setdefault covers a trait that appears only in the validation annotations
    annotations.setdefault(trait, {}).update(validation_scores)

# example length
len(annotations['conscientiousness'])

8000

In [None]:
# Build the per-clip label table (written to WAV_OCEANI.csv by the following cell):
# one row per annotated video whose audio file is present in `all_wav`.
trait_columns = ['openness', 'conscientiousness', 'extraversion',
                 'agreeableness', 'neuroticism', 'interview']

# Set membership is O(1); testing against the raw directory listing made this loop quadratic.
available_wavs = set(all_wav)

rows = []
for vid_file in annotations['openness']:
    # annotation keys are video file names; the audio clip is looked up under the 'wav' name
    wav = vid_file.replace('mp4', 'wav')
    if wav in available_wavs:
        rows.append([wav] + [annotations[trait][vid_file] for trait in trait_columns])

# Column order matters downstream: column 0 is the file name, columns 1: are the six targets.
df = pd.DataFrame(rows, columns=['wav_id'] + trait_columns)

print(len(df))
df.head()

8000


Unnamed: 0,wav_id,openness,conscientiousness,extraversion,agreeableness,neuroticism,interview
0,J4GQm9j0JZ0.003.wav,0.488889,0.601942,0.523364,0.626374,0.552083,0.504673
1,zEyRyTnIw5I.005.wav,0.366667,0.582524,0.345794,0.472527,0.375,0.457944
2,nskJh7v6v1U.004.wav,0.511111,0.485437,0.252336,0.406593,0.291667,0.373832
3,6wHQsN5g2RM.000.wav,0.377778,0.398058,0.457944,0.505495,0.489583,0.457944
4,dQOeQYWIgm8.000.wav,0.622222,0.621359,0.607477,0.406593,0.489583,0.570093


In [None]:
df.to_csv(data_dir + 'WAV_OCEANI.csv', index=False)

## DALI (NVIDIA Data Loading Library)

In [None]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
! pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100

Looking in indexes: https://pypi.org/simple, https://developer.download.nvidia.com/compute/redist
Collecting nvidia-dali-cuda100
[?25l  Downloading https://developer.download.nvidia.com/compute/redist/nvidia-dali-cuda100/nvidia_dali_cuda100-0.23.0-1396139-cp36-cp36m-manylinux1_x86_64.whl (264.6MB)
[K     |████████████████████████████████| 264.6MB 60kB/s 
Installing collected packages: nvidia-dali-cuda100
Successfully installed nvidia-dali-cuda100-0.23.0


In [None]:
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.pipeline import Pipeline

In [None]:
def get_melspecgram(y):
    """Render the dB-scaled mel spectrogram of a waveform into an in-memory PNG.

    Args:
        y: waveform tensor handed to ``torchaudio.transforms.MelSpectrogram``
            (assumed to be a mono signal sampled at 16 kHz -- TODO confirm with callers).

    Returns:
        io.BytesIO: buffer rewound to byte 0 that contains the PNG image.
    """
    # Use the function argument; the transform was previously applied to an undefined
    # name (`y_mono`), which raised NameError unless a global of that name happened to exist.
    mel_specgram = torchaudio.transforms.MelSpectrogram(n_fft=1024,
                                                        hop_length=256,
                                                        n_mels=40,
                                                        sample_rate=16000)(y)
    # power -> decibels, relative to the loudest bin
    mel_specgram = librosa.power_to_db(mel_specgram, ref=np.max)

    specshow(mel_specgram, fmax=8000)
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight')
    # Close the figure so repeated calls do not accumulate open matplotlib figures.
    plt.close()
    buf.seek(0)
    return buf

In [None]:
class ExternalInputIterator(object):
    """Endless batch source of (waveform, label) lists read from a CSV of audio files.

    Column 0 of the CSV is joined onto ``root_dir`` to locate the audio file; the remaining
    columns are returned as a float32 label vector. Only the rows selected by ``indices``
    are kept. ``__next__`` wraps around at the end of the table, so it never raises
    ``StopIteration``; ``size`` reports the number of rows for callers that need a bound.
    """

    def __init__(self, batch_size, csv_file, root_dir, indices):
        self.root_dir = root_dir
        self.batch_size = batch_size
        # read the full table, then keep only this split's rows (positional selection)
        self.wav_df = pd.read_csv(csv_file).iloc[indices, :]

    def __iter__(self):
        # (re)start at the first row and cache the row count used for wrap-around
        self.i, self.n = 0, len(self.wav_df)
        return self

    def __next__(self):
        # hard cap on the number of samples kept per clip
        # (presumably 15 s at 16 kHz -- confirm against the audio files)
        max_samples = 16000 * 15

        b_wavs, b_labels = [], []
        for _ in range(self.batch_size):
            row = self.i

            wav_path = os.path.join(self.root_dir, self.wav_df.iloc[row, 0])
            waveform, _sr = torchaudio.load(wav_path)
            # drop dimension 0 if it has size 1, convert to numpy, then truncate
            b_wavs.append(waveform.squeeze(0).numpy()[:max_samples])

            b_labels.append(self.wav_df.iloc[row, 1:].values.astype(np.float32))

            # advance cyclically through the table
            self.i = (row + 1) % self.n
        return (b_wavs, b_labels)

    @property
    def size(self):
        """Number of rows (clips) available to this iterator."""
        return len(self.wav_df)

    # alias so the object can also be driven through a `.next()` call
    next = __next__

In [None]:
class MelSpectrogramPipeline(Pipeline):
    """DALI pipeline that turns externally supplied waveforms into dB-scaled mel spectrograms.

    Batches are pulled from ``external_data`` through two ``ExternalSource`` operators
    (waveforms and labels). The waveform goes through Spectrogram -> MelFilterBank ->
    ToDecibels; the labels are passed through unchanged.

    Args:
        external_data: iterable whose iterator exposes a ``next()`` method returning
            ``(waveforms, labels)`` for one batch.
        device: DALI device string given to every operator; ``'gpu'`` additionally moves
            the waveform input to the GPU in ``define_graph``.
        batch_size, num_threads, device_id: forwarded to ``Pipeline.__init__``.
        specgram_dict: settings dict; the keys read here are ``'n_fft'``, ``'hop_length'``,
            ``'sr'``, ``'n_mels'`` and ``'f_max'``.
    """
    def __init__(self, 
                 external_data,
                 device, 
                 batch_size, 
                 specgram_dict, 
                 num_threads=1, 
                 device_id=0):
        super(MelSpectrogramPipeline, self).__init__(batch_size, num_threads, device_id)

        self.device = device
        # iter() is called once here, so the external source is (re)started at construction
        self.data_iterator = iter(external_data)
        self.specgram_dict = specgram_dict
        
        # input
        self.input_wav = ops.ExternalSource()
        self.input_label = ops.ExternalSource()

        # audio
        # window length equals the FFT size; the hop between windows is 'hop_length'
        self.spectrogram = ops.Spectrogram(device=self.device,
                                           nfft=self.specgram_dict['n_fft'],
                                           window_length=self.specgram_dict['n_fft'],
                                           window_step=self.specgram_dict['hop_length'])
        self.mel_fbank = ops.MelFilterBank(device=self.device,
                                           sample_rate=self.specgram_dict['sr'],
                                           nfilter = self.specgram_dict['n_mels'],
                                           freq_high = self.specgram_dict['f_max'])
        self.dB = ops.ToDecibels(device=self.device,
                                 multiplier = 10.0,
                                 cutoff_db = -80)
        
        # image
        # NOTE(review): these three image operators are created but never used in define_graph.
        self.decode = ops.ImageDecoder(device=self.device)
        self.res = ops.Resize(device=self.device, resize_x=224, resize_y=224)
        self.norm = ops.CropMirrorNormalize(device = self.device,
                                            mean=[0.485, 0.456, 0.406], 
                                            std=[0.229, 0.224, 0.225])
        
    def define_graph(self):
        """Wire the operators together and return ``(mel_specgram_db, labels)``."""
        # audio transforms
        self.y = self.input_wav()
        self.labels = self.input_label()
        # NOTE(review): on 'gpu' self.y is rebound to the result of .gpu(); iter_setup later
        # passes self.y to feed_input -- confirm that still targets the ExternalSource output.
        self.y = self.y.gpu() if self.device == 'gpu' else self.y
        specgram = self.spectrogram(self.y)
        mel_specgram = self.mel_fbank(specgram)
        mel_specgram_db = self.dB(mel_specgram)
        return (mel_specgram_db, self.labels)

    def iter_setup(self):
        """Fetch the next batch from the external iterator and feed both graph inputs."""
        y, labels = self.data_iterator.next()
        self.feed_input(self.y, y)
        self.feed_input(self.labels, labels)

In [None]:
# Seeded train/validation split over the rows of WAV_OCEANI.csv.
wav_df = pd.read_csv(data_dir + 'WAV_OCEANI.csv')

validation_split, shuffle_dataset, random_seed = 0.2, True, 42

dataset_size = len(wav_df)
# number of rows reserved for validation
split = int(np.floor(validation_split * dataset_size))

indices = list(range(dataset_size))
if shuffle_dataset:
    # seeding the global NumPy RNG makes the shuffle (and therefore the split) repeatable
    np.random.seed(random_seed)
    np.random.shuffle(indices)

# the first `split` shuffled positions are validation, the remainder is training
val_indices = indices[:split]
train_indices = indices[split:]

In [None]:
# Both splits read the same CSV and audio folder; only the row indices differ.
eii_kwargs = dict(batch_size=32,
                  csv_file=data_dir + 'WAV_OCEANI.csv',
                  root_dir='all_wav')

train_eii = ExternalInputIterator(indices=train_indices, **eii_kwargs)
val_eii = ExternalInputIterator(indices=val_indices, **eii_kwargs)

In [None]:
from nvidia.dali.plugin.pytorch import DALIGenericIterator

# Spectrogram settings shared by the training and validation pipelines.
specgram_dict = {'n_fft': 1024,
                 'hop_length': 256,
                 'n_mels': 40,
                 'sr': 16000,
                 'f_max': 8000,
                 'duration': 15}

train_pipe = MelSpectrogramPipeline(external_data=train_eii,
                                    device='gpu',
                                    specgram_dict=specgram_dict,
                                    batch_size=32,
                                    device_id=0)
train_pipe.build()

# The validation pipeline has to read from the validation iterator. It was previously wired
# to `train_eii`, so every "validation" batch was made of training clips.
val_pipe = MelSpectrogramPipeline(external_data=val_eii,
                                  device='gpu',
                                  specgram_dict=specgram_dict,
                                  batch_size=32,
                                  device_id=0)
val_pipe.build()

# Output names map positionally onto the tuple returned by define_graph.
train_iterator = DALIGenericIterator(train_pipe,
                                     ['mel_specgram_db', 'labels'],
                                     size=train_eii.size)
val_iterator = DALIGenericIterator(val_pipe,
                                   ['mel_specgram_db', 'labels'],
                                   size=val_eii.size)



In [None]:
for item in train_iterator:
    print(item[0]['mel_specgram_db'].shape)
    break
train_iterator.reset()



torch.Size([32, 40, 957])


In [None]:
# Count the batches of each iterator once, up front: the DALI iterators are only measured
# by running through them, so the counts are cached for the progress bars and loss averages.

train_iterator.reset()
val_iterator.reset()

# Count by streaming. `len(list(iterator))` would keep every batch alive at the same time,
# which is exactly the GPU-memory problem this cell is trying to avoid.
len_train_iterator = sum(1 for _ in train_iterator)
len_val_iterator = sum(1 for _ in val_iterator)

print(len_train_iterator, len_val_iterator)

# reset iterators before the start of every epoch
train_iterator.reset()
val_iterator.reset()



199 50


## PyTorch Dataloader

In [None]:
# load data
specgram_df = pd.read_csv(data_dir + 'Specgram_OCEANI.csv')
# with open(data_dir + 'mfcc_7934n_13200d.pkl', 'rb') as p:
#     mfcc = pickle.load(p)
print(len(specgram_df))

7934


In [None]:
class SpecgramDataset(Dataset):
    """Image dataset driven by a CSV: column 0 is an image file name, columns 1: are targets.

    Each item is ``(image, labels)`` where ``image`` is the RGB image found under
    ``root_dir`` (after the optional ``transform``) and ``labels`` is a float32 tensor built
    from the remaining columns of that row.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        self.specgram_df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.specgram_df)

    def __getitem__(self, idx):
        # samplers may hand over tensor indices; convert them to plain Python values
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = os.path.join(self.root_dir, self.specgram_df.iloc[idx, 0])
        image = Image.open(image_path).convert('RGB')

        targets = self.specgram_df.iloc[idx, 1:].values.astype(np.float32)

        if self.transform:
            image = self.transform(image)

        return (image, torch.from_numpy(targets))

In [None]:
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                                ])

In [None]:
# Seeded train/validation split over the rows of `specgram_df`, plus one sampler per split.
# NOTE(review): this rebinds `train_indices` / `val_indices` (and the split settings) that an
# earlier cell computed for a different table -- keep that in mind when running cells out of order.
validation_split, shuffle_dataset, random_seed = 0.2, True, 42

dataset_size = len(specgram_df)
# number of rows reserved for validation
split = int(np.floor(validation_split * dataset_size))

indices = list(range(dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)

val_indices = indices[:split]
train_indices = indices[split:]

# Each sampler draws (in random order) only from its own index subset.
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

In [None]:
# One dataset object backs both loaders; the samplers decide which rows each loader sees.
specgram_data = SpecgramDataset(csv_file=os.path.join(data_dir, 'Specgram_OCEANI.csv'),
                                root_dir='specgrams1',
                                transform=transform)

# settings common to the training and validation loaders
loader_kwargs = dict(batch_size=8, num_workers=2, pin_memory=True)

train_dataloader = DataLoader(specgram_data, sampler=train_sampler, **loader_kwargs)
val_dataloader = DataLoader(specgram_data, sampler=val_sampler, **loader_kwargs)

In [None]:
len(train_dataloader), len(val_dataloader)

(794, 199)

In [None]:
def display_image(image):
    """Show a normalised image tensor on a new matplotlib axes.

    The tensor is converted to a NumPy array, moved from channel-first to channel-last
    layout, de-normalised with the per-channel mean/std below, clipped to [0, 1] and drawn.

    Returns:
        The object returned by ``ax.imshow``.
    """
    _, axes = plt.subplots()

    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # (C, H, W) -> (H, W, C) so the three statistics broadcast over the last axis
    pixels = image.numpy().transpose((1, 2, 0))
    # undo (x - mean) / std, then clamp to the displayable range
    pixels = np.clip(channel_std * pixels + channel_mean, 0, 1)

    return axes.imshow(pixels)

In [None]:
b_images, b_labels = next(iter(train_dataloader))
print(b_images.shape)
# print(b_labels.shape)
print(b_labels)

# display_image(b_images[0])

torch.Size([8, 3, 224, 224])
tensor([[0.6111, 0.8932, 0.6636, 0.6374, 0.6875, 0.7196],
        [0.9000, 0.7087, 0.7757, 0.6813, 0.8333, 0.6916],
        [0.7556, 0.7282, 0.7009, 0.7802, 0.6771, 0.6822],
        [0.6667, 0.4951, 0.5327, 0.5714, 0.6354, 0.5327],
        [0.6556, 0.5340, 0.5888, 0.5165, 0.6875, 0.6355],
        [0.4333, 0.8252, 0.1776, 0.6154, 0.6667, 0.5888],
        [0.7111, 0.6602, 0.5514, 0.6593, 0.6354, 0.5234],
        [0.3111, 0.2621, 0.2617, 0.2637, 0.2708, 0.2617]])


In [None]:
b_images, b_labels = next(iter(val_dataloader))
print(b_images.shape)
# print(b_labels.shape)
print(b_labels)

torch.Size([8, 3, 224, 224])
tensor([[0.6667, 0.5728, 0.5421, 0.6703, 0.6354, 0.5047],
        [0.6778, 0.7961, 0.4953, 0.5934, 0.6562, 0.6262],
        [0.5778, 0.5340, 0.4673, 0.4396, 0.4271, 0.5327],
        [0.4778, 0.2233, 0.2710, 0.3846, 0.3333, 0.2243],
        [0.3222, 0.1845, 0.3551, 0.4835, 0.3854, 0.3084],
        [0.6667, 0.6019, 0.5047, 0.6484, 0.6771, 0.5514],
        [0.6111, 0.3981, 0.4766, 0.5495, 0.4375, 0.4486],
        [0.3111, 0.6602, 0.4579, 0.4945, 0.4688, 0.5047]])


# Pretrained ResNeXt-101 (32x8d) + MFCCs - Late Fusion Model

In [None]:
device = torch.device('cuda')

In [None]:
resnet = models.resnext101_32x8d(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth" to /root/.cache/torch/checkpoints/resnext101_32x8d-8ba56ff5.pth


HBox(children=(FloatProgress(value=0.0, max=356082095.0), HTML(value='')))




In [None]:
resnet;

In [None]:
# trainable parameters
sum(p.numel() for p in resnet.parameters() if p.requires_grad)

88791336

In [None]:
class ResnetOut(nn.Module):
    """Pass-through module: ``forward`` returns its input unchanged.

    It has no parameters; it is meant to be dropped in where a layer should do nothing.
    """

    def forward(self, x):
        return x

In [None]:
# Turn off gradient tracking for every pretrained weight so they are left out of backprop.
for weight in resnet.parameters():
    weight.requires_grad = False

# Replace the final fully connected layer with the pass-through module.
resnet.fc = ResnetOut()

In [None]:
resnet;

In [None]:
# Regression head: Linear -> ReLU -> Dropout(0.25) for each hidden width, then a 1-unit output.
# The input width 15248 is the concatenated feature size (2048 + 13200).
layer_widths = [15248, 2048, 512, 128]

head_layers = []
for n_in, n_out in zip(layer_widths[:-1], layer_widths[1:]):
    head_layers.extend([nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(p=0.25)])
head_layers.append(nn.Linear(layer_widths[-1], 1))

regressor = nn.Sequential(*head_layers)

In [None]:
# concatenates resnet output & mfccs 
class Ensemble(nn.Module):
    """Late-fusion model: backbone features are concatenated with an extra vector, then regressed.

    Args:
        resnet: module applied to the first input.
        regressor: module applied to the concatenated features.
    """

    def __init__(self, resnet, regressor):
        super().__init__()
        self.resnet = resnet
        self.regressor = regressor

    def forward(self, x1, mfcc_vec):
        image_features = self.resnet(x1)
        # join along the feature axis (2048 + 13200 in this notebook's configuration)
        fused = torch.cat((image_features, mfcc_vec), dim=1)
        return self.regressor(fused)


In [None]:
model = Ensemble(resnet, regressor)
model.to(device);

In [None]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.regressor.parameters(), lr=0.001)
epochs = 30
save_best_only = True
early_stopping_limit = 4
model_name = 'resnet-dnn_specgram-mfcc_ensemble_6000data'

In [None]:
# Train the late-fusion model with MSE, tracking the best validation loss for checkpointing
# and early stopping.
training_losses, val_losses = [], []
min_val_loss = 9999
early_stopping_cnt = 0  # consecutive epochs without a validation improvement

for epoch in range(epochs):
    # Training
    train_loss = 0
    model.train()

    # NOTE(review): the loops below unpack (images, mfccs, labels) triples, while
    # SpecgramDataset.__getitem__ in this notebook returns (image, labels) pairs --
    # confirm which dataset version this cell is meant to run against.
    for batch in tqdm(train_dataloader, desc='Epoch ' + str(epoch)):
        b_images, b_mfccs, b_labels = batch
        b_images, b_mfccs, b_labels = b_images.to(device), b_mfccs.to(device), b_labels.to(device)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass (call the module rather than .forward so registered hooks run)
        logits = model(b_images, b_mfccs)

        # calc loss
        loss = criterion(logits, b_labels)
        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

    training_losses.append(train_loss/len(train_dataloader))
    print('Train MSE loss:', train_loss/len(train_dataloader))

    # Validation
    val_loss = 0
    pred, true = [], []

    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            b_images, b_mfccs, b_labels = batch
            b_images, b_mfccs, b_labels = b_images.to(device), b_mfccs.to(device), b_labels.to(device)

            logits = model(b_images, b_mfccs)
            val_loss += criterion(logits, b_labels).item()

            pred.extend(logits.cpu().numpy())
            true.extend(b_labels.cpu().numpy())

    avg_val_loss = val_loss/len(val_dataloader)
    improved = avg_val_loss < min_val_loss

    if improved and save_best_only:
        os.makedirs(os.path.join(models_dir, model_name), exist_ok=True)
        torch.save(model.state_dict(), os.path.join(models_dir, model_name, model_name + '.pt'))
        print(f'--- Model Saved. Val loss: {min_val_loss} -> {avg_val_loss}')

    val_losses.append(avg_val_loss)
    print('Validation MSE:', avg_val_loss)
    print('Validation MAE:', mean_absolute_error(true, pred))

    # Early stopping. The best loss is tracked whether or not checkpoints are written
    # (previously it was only updated when `save_best_only` was set), and the counter is
    # advanced only on epochs that did NOT improve, so `early_stopping_limit` is the real
    # number of non-improving epochs tolerated (it used to be one fewer).
    if improved:
        min_val_loss = avg_val_loss
        early_stopping_cnt = 0
    else:
        early_stopping_cnt += 1
        if early_stopping_cnt >= early_stopping_limit:
            print('\n--- Stopped Early.')
            break

HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=298.0, style=ProgressStyle(description_widt…


Train MSE loss: 0.3591675653478643
--- Model Saved. Val loss: 9999 -> 0.06279391385614871
Validation MSE: 0.06279391385614871
Validation MAE: 0.21216476


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=298.0, style=ProgressStyle(description_widt…


Train MSE loss: 0.20056722461927257
--- Model Saved. Val loss: 0.06279391385614871 -> 0.05182067486147086
Validation MSE: 0.05182067486147086
Validation MAE: 0.18969798


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=298.0, style=ProgressStyle(description_widt…


Train MSE loss: 0.17981759871937483
--- Model Saved. Val loss: 0.05182067486147086 -> 0.03775529527415832
Validation MSE: 0.03775529527415832
Validation MAE: 0.15726227


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=298.0, style=ProgressStyle(description_widt…


Train MSE loss: 0.15278379240342035
Validation MSE: 0.03930701235930125
Validation MAE: 0.16244192


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=298.0, style=ProgressStyle(description_widt…


Train MSE loss: 0.18453978371475166
Validation MSE: 0.03899045373002688
Validation MAE: 0.16169605


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=298.0, style=ProgressStyle(description_widt…


Train MSE loss: 0.1075637669071255
Validation MSE: 0.04673169593016307
Validation MAE: 0.1799833

--- Stopped Early.


# Pretrained ResNeXt-101 (32x8d) on Spectrograms Model

In [None]:
device = torch.device('cuda')
print(torch.cuda.get_device_name())

Tesla K80


In [None]:
resnet = models.resnext101_32x8d(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth" to /root/.cache/torch/checkpoints/resnext101_32x8d-8ba56ff5.pth


HBox(children=(FloatProgress(value=0.0, max=356082095.0), HTML(value='')))




In [None]:
resnet;

In [None]:
# trainable parameters
sum(p.numel() for p in resnet.parameters() if p.requires_grad)

88791336

In [None]:
# define our own regressor
class Regressor(nn.Module):
    """Multi-task regression head: a shared 3-layer MLP followed by six sigmoid outputs.

    Input is a 2048-wide feature vector per sample. ``forward`` returns a 6-tuple
    ``(o_out, c_out, e_out, a_out, n_out, i_out)``; each element comes from its own
    ``nn.Linear(64, 1)`` head and is squashed into (0, 1) by a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # shared trunk
        self.fc1 = nn.Linear(2048, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 64)

        # one scalar head per target
        self.o_output = nn.Linear(64, 1)
        self.c_output = nn.Linear(64, 1)
        self.e_output = nn.Linear(64, 1)
        self.a_output = nn.Linear(64, 1)
        self.n_output = nn.Linear(64, 1)
        self.i_output = nn.Linear(64, 1)

        self.dropout = nn.Dropout(p=0.25)

    def forward(self, x):
        # Linear -> ReLU -> Dropout for each trunk layer
        for layer in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(layer(x)))

        heads = (self.o_output, self.c_output, self.e_output,
                 self.a_output, self.n_output, self.i_output)
        # torch.sigmoid replaces the deprecated F.sigmoid (same values, no deprecation warning)
        return tuple(torch.sigmoid(head(x)) for head in heads)

In [None]:
# Choose between fine-tuning the whole network (True) and training only the new head (False).
FULL_FINETUNING = True

if not FULL_FINETUNING:
    # head-only mode: switch off gradients for every pretrained weight
    for weight in resnet.parameters():
        weight.requires_grad = False

# The new head is attached after the optional freeze, so its own parameters stay trainable.
resnet.fc = Regressor()

In [None]:
resnet;

In [None]:
model = resnet
model.to(device);

In [None]:
criterion = nn.MSELoss()
if FULL_FINETUNING:
    optimizer = optim.Adam(model.parameters(), lr=0.001)
else:
    optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
epochs = 20
save_best_only = True
early_stopping_limit = 5
model_name = 'resnet101_specgram_mse_multitask'

In [None]:
# Multi-task training: six sigmoid heads (O, C, E, A, N, I), one MSE term per head, summed.
trait_names = ['O', 'C', 'E', 'A', 'N', 'I']  # same order as the label columns in the CSV

training_losses, val_losses = [], []
min_val_loss = 9999
early_stopping_cnt = 0  # consecutive epochs without a validation improvement
train_iterator.reset()
val_iterator.reset()


def _per_trait_losses(outputs, b_labels):
    """Return one loss per head, each computed against its own label column.

    Assumes `b_labels` is (batch, 6) and every head output is (batch, 1) -- TODO confirm.
    Slicing with k:k+1 keeps the target (batch, 1) so it lines up with the head output.
    The previous `b_labels[k]` picked the six scores of *sample* k instead, which then
    broadcast against the whole batch (the "target size" warning in the old cell output).
    """
    return [criterion(out, b_labels[:, k:k + 1]) for k, out in enumerate(outputs)]


for epoch in range(epochs):
    # Training
    train_loss = 0
    model.train()

    for step, batch in tqdm(enumerate(train_iterator),
                            total=len_train_iterator,
                            desc='Epoch ' + str(epoch)):
        # NOTE(review): the DALI iterators built in this notebook expose the keys
        # 'mel_specgram_db' and 'labels'; confirm which pipeline provides 'images'.
        b_images, b_labels = batch[0]['images'], batch[0]['labels']
        b_images, b_labels = b_images.to(device), b_labels.to(device)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass (call the module rather than .forward so registered hooks run)
        outputs = model(b_images)

        # calc loss: sum of the six per-trait MSE terms
        loss = sum(_per_trait_losses(outputs, b_labels))
        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

    avg_train_loss = train_loss/len_train_iterator
    training_losses.append(avg_train_loss)
    print('Avg Training MSE:', avg_train_loss)

    # Validation
    val_loss = 0
    val_trait_loss = [0.0] * len(trait_names)
    pred, true = [], []

    model.eval()
    with torch.no_grad():
        for batch in val_iterator:
            b_images, b_labels = batch[0]['images'], batch[0]['labels']
            b_images, b_labels = b_images.to(device), b_labels.to(device)

            outputs = model(b_images)
            losses = _per_trait_losses(outputs, b_labels)

            val_loss += sum(losses).item()
            for k, trait_loss in enumerate(losses):
                val_trait_loss[k] += trait_loss.item()

            # Keep predictions as (batch, 6) so they stay aligned with the (batch, 6) targets.
            # The old code appended predictions trait-by-trait but targets sample-by-sample,
            # so the MAE compared unrelated pairs.
            pred.append(torch.cat(outputs, dim=1).cpu().numpy())
            true.append(b_labels.cpu().numpy())

    avg_val_loss = val_loss/len_val_iterator

    val_losses.append(avg_val_loss)
    for name, total in zip(trait_names, val_trait_loss):
        print(name + ' Validation MSE:', total/len_val_iterator)

    print('\nAvg Validation MSE:', avg_val_loss)
    print('Avg Validation MAE:', mean_absolute_error(np.concatenate(true), np.concatenate(pred)))

    # reset iterator after every epoch
    train_iterator.reset()
    val_iterator.reset()

    # Checkpointing and early stopping. The best loss is tracked whether or not checkpoints
    # are written, and the counter only advances on epochs that did not improve, so
    # `early_stopping_limit` is the real number of non-improving epochs tolerated.
    if avg_val_loss < min_val_loss:
        if save_best_only:
            os.makedirs(os.path.join(models_dir, model_name), exist_ok=True)
            torch.save(model.state_dict(), os.path.join(models_dir, model_name, model_name + '.pt'))
            print(f'--- Model Saved. Val loss: {min_val_loss} -> {avg_val_loss}')
        min_val_loss = avg_val_loss
        early_stopping_cnt = 0
    else:
        early_stopping_cnt += 1
        if early_stopping_cnt >= early_stopping_limit:
            print('\n--- Stopped Early.')
            break



HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.14050304711838463
O Validation MSE: 0.01825920883566141
C Validation MSE: 0.021923313934821637
E Validation MSE: 0.028084843379911035
A Validation MSE: 0.02674275459256023
N Validation MSE: 0.01869646178325638
I Validation MSE: 0.019377628806978464

Avg Validation MSE: 0.13308420993387698
Avg Validation MAE: 0.12073044
--- Model Saved. Val loss: 9999 -> 0.13308420993387698


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.1354415073017379
O Validation MSE: 0.023107989826239645
C Validation MSE: 0.019913340769708156
E Validation MSE: 0.01870702937245369
A Validation MSE: 0.0271879747742787
N Validation MSE: 0.01999439231585711
I Validation MSE: 0.0228176092216745

Avg Validation MSE: 0.13172833688557148
Avg Validation MAE: 0.123450235
--- Model Saved. Val loss: 0.13308420993387698 -> 0.13172833688557148


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)


KeyboardInterrupt: ignored

# CNN

In [None]:
device = torch.device('cuda')
print(torch.cuda.get_device_name())

Tesla T4


In [None]:
class CNNModel(nn.Module):
    """CNN regressor with six single-unit sigmoid output heads.

    Architecture: three 3x3 convolutions (1 -> 32 -> 64 -> 128 channels,
    padding 1), each followed by ReLU and a 2x2 max-pool; the pooled feature
    map is flattened and fed through four fully connected layers
    (-> 1024 -> 512 -> 64 -> 16) with ReLU and dropout after each one.
    Six independent ``Linear(16, 1)`` heads (``o``, ``c``, ``e``, ``a``,
    ``n``, ``i`` -- presumably the Big-Five traits plus an interview score;
    confirm against the annotation files) each emit one sigmoid-squashed value.

    Args:
        n_flat_features: Number of features per sample after the third
            pooling step (channels * height * width). Defaults to
            ``128 * 5 * 117``, the value that was previously hard-coded.
        dropout_p: Dropout probability used after every hidden fully
            connected layer. Defaults to ``0.25``.
    """

    def __init__(self, n_flat_features=128 * 5 * 117, dropout_p=0.25):
        super().__init__()
        # NOTE: attribute names and creation order are kept as-is so that
        # previously saved state_dicts keep loading and weight initialisation
        # consumes the RNG in the same order.
        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=32,
                               kernel_size=3,
                               padding=1)
        self.conv2 = nn.Conv2d(in_channels=32,
                               out_channels=64,
                               kernel_size=3,
                               padding=1)
        self.conv3 = nn.Conv2d(in_channels=64,
                               out_channels=128,
                               kernel_size=3,
                               padding=1)

        # A single stateless pooling module is shared by all three conv stages.
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(n_flat_features, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 64)
        self.fc4 = nn.Linear(64, 16)
        self.o_output = nn.Linear(16, 1)
        self.c_output = nn.Linear(16, 1)
        self.e_output = nn.Linear(16, 1)
        self.a_output = nn.Linear(16, 1)
        self.n_output = nn.Linear(16, 1)
        self.i_output = nn.Linear(16, 1)

        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Run the network on a batch of single-channel 2-D inputs.

        Args:
            x: Tensor of shape ``(batch, 1, height, width)``. After three 2x2
                poolings the per-sample feature count must equal the
                ``n_flat_features`` the model was built with.

        Returns:
            Tuple ``(o_out, c_out, e_out, a_out, n_out, i_out)`` of tensors,
            each of shape ``(batch, 1)`` with values in ``(0, 1)``.

        Raises:
            RuntimeError: If the flattened feature count does not match
                ``fc1``'s input size.
        """
        x = F.relu(self.conv1(x))
        x = self.maxpool(x)
        x = F.relu(self.conv2(x))
        x = self.maxpool(x)
        x = F.relu(self.conv3(x))
        x = self.maxpool(x)
        # Flatten everything except the batch dimension. Unlike
        # view(-1, <constant>), this can never fold samples into each other
        # when the input size is unexpected; fc1 raises a shape error instead.
        flattened = torch.flatten(x, 1)

        x = F.relu(self.fc1(flattened))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        x = F.relu(self.fc4(x))
        x = self.dropout(x)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        o_out = torch.sigmoid(self.o_output(x))
        c_out = torch.sigmoid(self.c_output(x))
        e_out = torch.sigmoid(self.e_output(x))
        a_out = torch.sigmoid(self.a_output(x))
        n_out = torch.sigmoid(self.n_output(x))
        i_out = torch.sigmoid(self.i_output(x))

        return o_out, c_out, e_out, a_out, n_out, i_out

In [None]:
# Build the CNN and move its parameters to the selected device in one step;
# Module.to() returns the module itself, so `model` is the on-device network.
model = CNNModel().to(device)
# Bare last expression so the notebook displays the layer summary.
model

CNNModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=74880, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=16, bias=True)
  (o_output): Linear(in_features=16, out_features=1, bias=True)
  (c_output): Linear(in_features=16, out_features=1, bias=True)
  (e_output): Linear(in_features=16, out_features=1, bias=True)
  (a_output): Linear(in_features=16, out_features=1, bias=True)
  (n_output): Linear(in_features=16, out_features=1, bias=True)
  (i_output): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.25, inplace=F

In [None]:
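# Debug helper: inspect the tensor shape after the last pooling layer (used to size fc1).
# Only meaningful while forward() is temporarily changed to return the pooled tensor.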
# sample = next(iter(train_iterator))
# b_input = sample[0]['mel_specgram_db'].unsqueeze(1).to(device)

# pool_out = model.forward(b_input)
# pool_out.shape

torch.Size([32, 128, 5, 117])

In [None]:
# --- Run configuration for the CNN experiment ---
# Sub-directory / file stem under models_dir where checkpoints are written.
model_name = 'cnn_melspecgram_mse'
# Upper bound on the number of passes over the training iterator.
epochs = 10
# Early-stopping limit on the epoch counter; set equal to `epochs` here.
early_stopping_limit = epochs
# Only write a checkpoint when the validation loss improves.
save_best_only = True

# --- Objective and optimiser ---
# Mean-squared error, applied separately to each of the six output heads.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train the CNN on mel-spectrogram batches and validate after every epoch.
#
# Per epoch:
#   1. one pass over train_iterator, optimising the sum of the six per-head MSE losses;
#   2. one pass over val_iterator (no gradients), accumulating total and per-head
#      losses plus flat prediction/label lists for an overall MAE;
#   3. checkpoint the model when the average validation loss improves
#      (if save_best_only), otherwise count the epoch towards early stopping.
#
# Relies on names defined in earlier cells: model, criterion, optimizer, epochs,
# save_best_only, early_stopping_limit, model_name, models_dir, device,
# train_iterator / val_iterator and their lengths len_train_iterator / len_val_iterator.
training_losses, val_losses = [], []
prev_val_loss, min_val_loss = 0, 9999
early_stopping_cnt = 0
train_iterator.reset()
val_iterator.reset()

for epoch in range(epochs):
    # Training
    train_loss = 0
    model.train()

    for batch in tqdm(train_iterator, total=len_train_iterator, desc='Epoch ' + str(epoch)):
        b_input, b_labels = batch[0]['mel_specgram_db'], batch[0]['labels']
        # unsqueeze(1) adds the single channel dimension expected by conv1
        b_input, b_labels = b_input.unsqueeze(1).to(device), b_labels.to(device)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass (call the module, not .forward(), so nn.Module hooks run)
        logits = model(b_input)
        o_logits, c_logits, e_logits, a_logits, n_logits, i_logits = logits

        # calc loss
        # NOTE(review): every head returns shape (batch, 1). If b_labels[k] is
        # 1-D, MSELoss broadcasts prediction against target and averages over a
        # 2-D grid instead of matched pairs -- confirm the layout of 'labels'
        # and reshape the targets to match the predictions.
        o_loss = criterion(o_logits, b_labels[0])
        c_loss = criterion(c_logits, b_labels[1])
        e_loss = criterion(e_logits, b_labels[2])
        a_loss = criterion(a_logits, b_labels[3])
        n_loss = criterion(n_logits, b_labels[4])
        i_loss = criterion(i_logits, b_labels[5])

        # the optimised objective is the unweighted sum of the six head losses
        loss = o_loss + c_loss + e_loss + a_loss + n_loss + i_loss
        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

    avg_train_loss = train_loss/len_train_iterator
    training_losses.append(avg_train_loss)
    print('Avg Training MSE:', avg_train_loss)

    # Validation
    val_loss = 0
    pred, true = [], []
    val_o_loss, val_c_loss, val_e_loss, val_a_loss, val_n_loss, val_i_loss = 0, 0, 0, 0, 0, 0

    model.eval()
    for batch in val_iterator:
        b_input, b_labels = batch[0]['mel_specgram_db'], batch[0]['labels']
        b_input, b_labels = b_input.unsqueeze(1).to(device), b_labels.to(device)

        with torch.no_grad():
            logits = model(b_input)
            o_logits, c_logits, e_logits, a_logits, n_logits, i_logits = logits

            # calc loss
            o_loss = criterion(o_logits, b_labels[0])
            c_loss = criterion(c_logits, b_labels[1])
            e_loss = criterion(e_logits, b_labels[2])
            a_loss = criterion(a_logits, b_labels[3])
            n_loss = criterion(n_logits, b_labels[4])
            i_loss = criterion(i_logits, b_labels[5])

            loss = o_loss + c_loss + e_loss + a_loss + n_loss + i_loss

            val_loss += loss.item()
            val_o_loss += o_loss.item()
            val_c_loss += c_loss.item()
            val_e_loss += e_loss.item()
            val_a_loss += a_loss.item()
            val_n_loss += n_loss.item()
            val_i_loss += i_loss.item()

            o_logits = o_logits.cpu().numpy()
            c_logits = c_logits.cpu().numpy()
            e_logits = e_logits.cpu().numpy()
            a_logits = a_logits.cpu().numpy()
            n_logits = n_logits.cpu().numpy()
            i_logits = i_logits.cpu().numpy()

            labels = b_labels.cpu().numpy()

            # Predictions are appended head by head; labels row by row along
            # their first axis.
            # NOTE(review): the two lists only line up element-wise if the
            # first axis of 'labels' indexes the head -- confirm.
            for logits in [o_logits, c_logits, e_logits, a_logits, n_logits, i_logits]:
                pred.extend(logits)
            for label in labels:
                true.extend(label)

    avg_val_loss = val_loss/len_val_iterator

    val_losses.append(avg_val_loss)
    print('O Validation MSE:', val_o_loss/len_val_iterator)
    print('C Validation MSE:', val_c_loss/len_val_iterator)
    print('E Validation MSE:', val_e_loss/len_val_iterator)
    print('A Validation MSE:', val_a_loss/len_val_iterator)
    print('N Validation MSE:', val_n_loss/len_val_iterator)
    print('I Validation MSE:', val_i_loss/len_val_iterator)

    print('\nAvg Validation MSE:', avg_val_loss)
    mae = mean_absolute_error(true, pred)
    print('Avg Validation MAE:', mae)

    # reset iterator after every epoch
    train_iterator.reset()
    val_iterator.reset()

    if save_best_only and avg_val_loss < min_val_loss:
        # Build the checkpoint directory the same way as the file path below.
        save_dir = os.path.join(models_dir, model_name)
        os.makedirs(save_dir, exist_ok=True)
        # The number embedded in the file name is 1 - MAE, not the MAE itself.
        model_name_ = model_name + '_' + str(1 - mae) + 'mae'
        torch.save(model.state_dict(), os.path.join(save_dir, model_name_ + '.pt'))
        print(f'--- Model Saved. Val loss: {min_val_loss} -> {avg_val_loss}')
        min_val_loss = avg_val_loss
        early_stopping_cnt = 0
    else:
        # Count only epochs that did not produce a new checkpoint. Previously the
        # counter was also incremented right after being reset, which shortened
        # the effective patience by one epoch.
        early_stopping_cnt += 1
        if early_stopping_cnt == early_stopping_limit:
            print('\n--- Stopped Early.')
            break



HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.16917446326820096
O Validation MSE: 0.02043127309065312
C Validation MSE: 0.02183017533272505
E Validation MSE: 0.024780050460249184
A Validation MSE: 0.02680873322766274
N Validation MSE: 0.016584290300961584
I Validation MSE: 0.022075224560685457

Avg Validation MSE: 0.13250974781811237
Avg Validation MAE: 0.12443929
--- Model Saved. Val loss: 9999 -> 0.13250974781811237


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.14497677858301145
O Validation MSE: 0.026184963448904454
C Validation MSE: 0.022455517950002103
E Validation MSE: 0.019825061589945108
A Validation MSE: 0.028447064300999046
N Validation MSE: 0.02409782615955919
I Validation MSE: 0.020893579094554297

Avg Validation MSE: 0.141904012337327
Avg Validation MAE: 0.124322176


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.14148164007007777
O Validation MSE: 0.02145307247294113
C Validation MSE: 0.02242790282238275
E Validation MSE: 0.017638668685685845
A Validation MSE: 0.019206058266572654
N Validation MSE: 0.018349506163503976
I Validation MSE: 0.020652128625661136

Avg Validation MSE: 0.11972733713686466
Avg Validation MAE: 0.122832015
--- Model Saved. Val loss: 0.13250974781811237 -> 0.11972733713686466


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.14309983897074383
O Validation MSE: 0.028844454644713552
C Validation MSE: 0.025404257671907543
E Validation MSE: 0.018442184305749832
A Validation MSE: 0.020445456812158227
N Validation MSE: 0.02484004546655342
I Validation MSE: 0.025784310328308493

Avg Validation MSE: 0.14376070991158485
Avg Validation MAE: 0.12072331


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.13552934801833114
O Validation MSE: 0.02233307894319296
C Validation MSE: 0.026650794523302466
E Validation MSE: 0.021273604957386852
A Validation MSE: 0.02128867489285767
N Validation MSE: 0.02489318314008415
I Validation MSE: 0.026213627671822906

Avg Validation MSE: 0.14265296459197999
Avg Validation MAE: 0.122465976


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.14253540084365024
O Validation MSE: 0.02304797018878162
C Validation MSE: 0.021010350212454797
E Validation MSE: 0.016922251721844077
A Validation MSE: 0.023649077396839857
N Validation MSE: 0.025277632602956145
I Validation MSE: 0.023364141015335918

Avg Validation MSE: 0.13327142357826233
Avg Validation MAE: 0.121858574


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=199.0, style=ProgressStyle(description_widt…

  return F.mse_loss(input, target, reduction=self.reduction)



Avg Training MSE: 0.1361441142931955
O Validation MSE: 0.033018586095422506
C Validation MSE: 0.026096164397895336
E Validation MSE: 0.02640435567125678
A Validation MSE: 0.026546490387991072
N Validation MSE: 0.028985572032397613
I Validation MSE: 0.022186574572697283

Avg Validation MSE: 0.16323774255812168
Avg Validation MAE: 0.122046955

--- Stopped Early.
