In [1]:
import pandas as pd
audio_impro_processed = pd.read_csv('audio_df_improvised.csv')

audio_impro_processed.reset_index(inplace=True)
audio_impro_processed.head()

Unnamed: 0,index,start_time,end_time,wav_file,emotion,val,act,dom
0,0,6.2901,8.2357,Ses01F_impro01_F000,neu,2.5,2.5,2.5
1,1,10.01,11.3925,Ses01F_impro01_F001,neu,2.5,2.5,2.5
2,2,14.8872,18.0175,Ses01F_impro01_F002,neu,2.5,2.5,2.5
3,3,27.46,31.49,Ses01F_impro01_F005,neu,2.5,3.5,2.0
4,4,85.27,88.02,Ses01F_impro01_F012,ang,2.0,3.5,3.5


In [114]:
from torchvision import models, transforms
my_transforms=transforms.Compose([transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406],
                                                           [0.229, 0.224, 0.225])])

In [129]:
import librosa
from PIL import Image
def get_spectrogram_image(path, my_transforms, med_duration, sr=None, n_fft=2048, hop_length=256, n_mels=128, fmin=100, fmax=15000, top_db=80):
    med_duration = int(np.floor(med_duration))
    wav, sr = librosa.load(path)
    if wav.shape[0]<(med_duration*sr):
        wav=np.pad(wav,int(np.ceil(((med_duration*sr)-wav.shape[0])/2)),mode='reflect')
    else:
        wav=wav[:(med_duration*sr)]
    spec=librosa.feature.melspectrogram(wav, sr=sr, n_fft=n_fft,
              hop_length=hop_length,n_mels=n_mels)
#     librosa.decompose.nn_filter(spec, aggregate=np.median, metric='cosine')
    spec_db=librosa.power_to_db(spec)
    img=np.stack((spec_db,)*3, axis=-1)
    PIL_image = Image.fromarray((img*255).astype(np.uint8))
    image=my_transforms(PIL_image)
    return PIL_image

In [140]:
import matplotlib.pyplot as plt

a = get_spectrogram_image('/data/home/advaitmb/datasets/sentences/' + audio_impro_processed['wav_file'][1] + '.wav', my_transforms, 3)

from IPython.display import Image as Im

display(Im(a))

TypeError: 'module' object is not callable

The Following code validates on the Male speaker of last Session 

In [116]:
# We split the dataframe into train and test segments

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

train, valid = train_test_split(audio_impro_processed, test_size=0.2, shuffle=False)
train = audio_impro_processed[:2617]
train = shuffle(train)
valid = audio_impro_processed[267:]

In [117]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

class IEMOCAP_3Channel(Dataset):
    def __init__(self, base, df, in_col, out_col):
        self.df = df
        self.data = []
        self.labels = []
        self.c2i={}
        self.i2c={}
        self.categories = sorted(df[out_col].unique())
        print(self.categories)
        for i, category in enumerate(self.categories):
            self.c2i[category]=i
            self.i2c[i]=category
        for ind in tqdm(range(len(df))):
            row = df.iloc[ind]
            
            #If all files are stored in a folder named sentences
            file_path = base + '/' + row[in_col] + '.wav'
            
            
            self.data.append(get_spectrogram_image(file_path, my_transforms, 4))
            self.labels.append(self.c2i[row['emotion']])
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]





In [118]:
train_data = IEMOCAP_3Channel('/data/home/advaitmb/datasets/sentences', train, 'wav_file', 'emotion')
valid_data = IEMOCAP_3Channel('/data/home/advaitmb/datasets/sentences', valid, 'wav_file', 'emotion')
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=16)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=True, num_workers=16)

  0%|          | 1/2617 [00:00<05:14,  8.31it/s]

['ang', 'hap', 'neu', 'sad']


100%|██████████| 2617/2617 [06:05<00:00,  7.16it/s]
  0%|          | 1/2676 [00:00<05:56,  7.50it/s]

['ang', 'hap', 'neu', 'sad']


 39%|███▉      | 1049/2676 [02:28<03:50,  7.04it/s]


KeyboardInterrupt: 

In [64]:
for i, data in enumerate(train_loader):
    print(data[0].shape)

torch.Size([5, 3, 224, 224])


In [93]:
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn.functional as F

net = models.resnet18(pretrained=True)
net.fc = nn.Linear(512, 4)

In [94]:
if torch.cuda.is_available():
    device=torch.device('cuda:0')
else:
    device=torch.device('cpu')

net = net.to(device)

In [95]:
def setlr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

def lr_decay(optimizer, epoch):
    if epoch%10==0:
        new_lr = learning_rate / (10**(epoch//10))
        optimizer = setlr(optimizer, new_lr)
        print("Changed learning rate to {}".format(new_lr))
    return optimizer

In [96]:
learning_rate = 2e-5
optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=0.01)
epochs = 20
loss_fn = nn.CrossEntropyLoss()
resnet_train_losses=[]
resnet_valid_losses=[]

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score


def train_model(model, loss_fn, train_loader, valid_loader, epochs, optimizer, train_losses, valid_losses, change_lr=None):
    
    for epoch in tqdm(range(1,epochs+1)):
        
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch)
        for i, data in enumerate(train_loader):
            x, y = data
            
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        train_losses.append(batch_losses)


        print("Epoch - {} Train-Loss : {}".format(epoch, np.mean(train_losses[-1])))
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        for i, data in enumerate(valid_loader):
            x, y = data
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())

        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
#         accuracy_unweighted = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        accuracy_unweighted = accuracy_score(trace_yhat.argmax(axis=1), trace_y)
        accuracy_weighted = balanced_accuracy_score(trace_yhat.argmax(axis=1), trace_y)
        print("Epoch - {} Valid-Loss : {} Valid-Accuracy Unweighted : {} Valid-Accuracy Weighted {}".format(epoch, np.mean(valid_losses[-1]), accuracy_unweighted, accuracy_weighted ))
        



In [97]:
train_model(net, loss_fn, train_loader, valid_loader, epochs, optimizer, resnet_train_losses, resnet_valid_losses, lr_decay)

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch - 1 Train-Loss : 1.1526133265437148


  5%|▌         | 1/20 [00:21<06:49, 21.54s/it]

Epoch - 1 Valid-Loss : 1.1260659803043713 Valid-Accuracy Unweighted : 0.49079754601226994 Valid-Accuracy Weighted 0.3808571364325547
Epoch - 2 Train-Loss : 0.8994067747418474


 10%|█         | 2/20 [00:43<06:27, 21.55s/it]

Epoch - 2 Valid-Loss : 1.0901475277814 Valid-Accuracy Unweighted : 0.5214723926380368 Valid-Accuracy Weighted 0.4270657771535581
Epoch - 3 Train-Loss : 0.7532863936773161


 15%|█▌        | 3/20 [01:04<06:07, 21.60s/it]

Epoch - 3 Valid-Loss : 1.093750693581321 Valid-Accuracy Unweighted : 0.50920245398773 Valid-Accuracy Weighted 0.39703330379559887
Epoch - 4 Train-Loss : 0.6093033301394161


 20%|██        | 4/20 [01:26<05:46, 21.66s/it]

Epoch - 4 Valid-Loss : 1.1191152334213257 Valid-Accuracy Unweighted : 0.5214723926380368 Valid-Accuracy Weighted 0.42485542724353853
Epoch - 5 Train-Loss : 0.46790071667694466


 25%|██▌       | 5/20 [01:48<05:25, 21.69s/it]

Epoch - 5 Valid-Loss : 1.247928874059157 Valid-Accuracy Unweighted : 0.44785276073619634 Valid-Accuracy Weighted 0.365725677830941
Epoch - 6 Train-Loss : 0.34444876360457116


 30%|███       | 6/20 [02:10<05:04, 21.75s/it]

Epoch - 6 Valid-Loss : 1.2078542546792463 Valid-Accuracy Unweighted : 0.5245398773006135 Valid-Accuracy Weighted 0.4239266286285205
Epoch - 7 Train-Loss : 0.23053764969837376


 35%|███▌      | 7/20 [02:32<04:43, 21.78s/it]

Epoch - 7 Valid-Loss : 1.1873490105975757 Valid-Accuracy Unweighted : 0.5184049079754601 Valid-Accuracy Weighted 0.41111636376370064
Epoch - 8 Train-Loss : 0.1565257284517695


 40%|████      | 8/20 [02:53<04:21, 21.80s/it]

Epoch - 8 Valid-Loss : 1.2037314962257037 Valid-Accuracy Unweighted : 0.50920245398773 Valid-Accuracy Weighted 0.3933656008035548
Epoch - 9 Train-Loss : 0.11399929906900336


 45%|████▌     | 9/20 [03:15<03:59, 21.82s/it]

Epoch - 9 Valid-Loss : 1.3745945475318215 Valid-Accuracy Unweighted : 0.5153374233128835 Valid-Accuracy Weighted 0.4151204955891974
Changed learning rate to 2.0000000000000003e-06
Epoch - 10 Train-Loss : 0.08669299000828731


 50%|█████     | 10/20 [03:37<03:38, 21.85s/it]

Epoch - 10 Valid-Loss : 1.3452861959284002 Valid-Accuracy Unweighted : 0.49079754601226994 Valid-Accuracy Weighted 0.3970192548137034
Epoch - 11 Train-Loss : 0.07367260377036362


 55%|█████▌    | 11/20 [03:59<03:16, 21.86s/it]

Epoch - 11 Valid-Loss : 1.416847738352689 Valid-Accuracy Unweighted : 0.49693251533742333 Valid-Accuracy Weighted 0.40094863731656183
Epoch - 12 Train-Loss : 0.07065284833675478


 60%|██████    | 12/20 [04:21<02:55, 21.90s/it]

Epoch - 12 Valid-Loss : 1.3680830543691462 Valid-Accuracy Unweighted : 0.49693251533742333 Valid-Accuracy Weighted 0.4005524116923765
Epoch - 13 Train-Loss : 0.07044486578826498


 65%|██████▌   | 13/20 [04:43<02:33, 21.92s/it]

Epoch - 13 Valid-Loss : 1.3687191876498135 Valid-Accuracy Unweighted : 0.4938650306748466 Valid-Accuracy Weighted 0.39893794638930935
Epoch - 14 Train-Loss : 0.0602983506169261


 70%|███████   | 14/20 [05:05<02:11, 21.92s/it]

Epoch - 14 Valid-Loss : 1.3817826075987383 Valid-Accuracy Unweighted : 0.49079754601226994 Valid-Accuracy Weighted 0.3952604515984798
Epoch - 15 Train-Loss : 0.06387883538334835


 75%|███████▌  | 15/20 [05:27<01:49, 21.92s/it]

Epoch - 15 Valid-Loss : 1.3463781042532488 Valid-Accuracy Unweighted : 0.4723926380368098 Valid-Accuracy Weighted 0.3852395935490901
Epoch - 16 Train-Loss : 0.05843615336570798


 80%|████████  | 16/20 [05:49<01:27, 21.92s/it]

Epoch - 16 Valid-Loss : 1.3538819768212058 Valid-Accuracy Unweighted : 0.50920245398773 Valid-Accuracy Weighted 0.40800583391251233
Epoch - 17 Train-Loss : 0.055867713898783776


 85%|████████▌ | 17/20 [06:11<01:05, 21.92s/it]

Epoch - 17 Valid-Loss : 1.4644842906431719 Valid-Accuracy Unweighted : 0.4754601226993865 Valid-Accuracy Weighted 0.3857612547807054
Epoch - 18 Train-Loss : 0.05238335202561646


 90%|█████████ | 18/20 [06:33<00:43, 21.93s/it]

Epoch - 18 Valid-Loss : 1.398373159495267 Valid-Accuracy Unweighted : 0.4815950920245399 Valid-Accuracy Weighted 0.3881888440860215
Epoch - 19 Train-Loss : 0.053146671276630426


 95%|█████████▌| 19/20 [06:55<00:21, 21.94s/it]

Epoch - 19 Valid-Loss : 1.4643620415167375 Valid-Accuracy Unweighted : 0.46319018404907975 Valid-Accuracy Weighted 0.38102583020173975
Changed learning rate to 2.0000000000000002e-07
Epoch - 20 Train-Loss : 0.050683296898879654


100%|██████████| 20/20 [07:17<00:00, 21.86s/it]

Epoch - 20 Valid-Loss : 1.3873401338403875 Valid-Accuracy Unweighted : 0.4785276073619632 Valid-Accuracy Weighted 0.3872326524643962



