In [16]:
import pickle 
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd
import numpy as np
import re
import librosa
import string
import matplotlib.pyplot as plt
import os
import noisereduce as nr
import IPython
from collections import Counter
from sklearn.metrics import mean_squared_error, accuracy_score

torch.manual_seed(10)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(10)

In [17]:
if torch.cuda.is_available():
    device=torch.device('cuda:0')
else:
    device=torch.device('cpu')

In [18]:
def setlr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

def lr_decay(optimizer, epoch, learning_rate):
    if epoch%10==0:
        new_lr = learning_rate / (10**(epoch//10))
        optimizer = setlr(optimizer, new_lr)
        print("Changed learning rate to {}".format(new_lr))
    return optimizer

In [19]:
BASE = '/data/home/advaitmb/datasets/sentences/'
vocab_size = 2255

In [20]:
with open('spectrograms.pkl', 'rb') as handle:
    (data, labels) = pickle.load(handle)
    
data = np.array(data)
labels = np.array(labels)

In [21]:
with open('text_encodings.pkl', 'rb') as handle:
    (X, y) = pickle.load(handle)
    
X = np.array(X)
y = np.array(y)

In [22]:
class Audio(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [23]:
class Text(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [24]:
import pandas as pd
audio = pd.read_csv('audio_df_improvised.csv')

audio.reset_index(inplace=True)
audio.emotion = pd.Categorical(pd.factorize(audio.emotion)[0])
audio.head()

Unnamed: 0,index,start_time,end_time,wav_file,emotion,val,act,dom
0,0,6.2901,8.2357,Ses01F_impro01_F000,0,2.5,2.5,2.5
1,1,10.01,11.3925,Ses01F_impro01_F001,0,2.5,2.5,2.5
2,2,14.8872,18.0175,Ses01F_impro01_F002,0,2.5,2.5,2.5
3,3,27.46,31.49,Ses01F_impro01_F005,0,2.5,3.5,2.0
4,4,85.27,88.02,Ses01F_impro01_F012,1,2.0,3.5,3.5


In [25]:
old_text = pd.read_csv('text_df.csv')
old_text.emotion = audio.emotion
old_text.emotion = pd.Categorical(pd.factorize(old_text.emotion)[0])

text = pd.DataFrame()
text['data'] = old_text.transcription
text['label'] = old_text.emotion

#tokenization
import spacy
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

#count number of occurences of each word
counts = Counter()
for index, row in text.iterrows():
    counts.update(tokenize(row['data']))
    
    
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [26]:
def load_glove_vectors(glove_file=".vector_cache/glove.6B.50d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [27]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """ Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1   
    return W, np.array(vocab), vocab_to_idx

In [28]:
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [49]:
class LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = True 
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 4)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [30]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__() # just run the init of parent class (nn.Module)
        self.conv1 = nn.Conv2d(1, 32, 5, 2, padding=1) # input is 1 image, 32 output channels, 5x5 kernel / window
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 5, 2, padding=1) # input is 32, bc the first layer output 32. Then we say the output will be 64 channels, 5x5 kernel / window
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 5, 2, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 128, 5, 2, padding=1)
        
        self.fc1 = nn.Linear((7*8*128), 512) #flattening.
        
        
        self.fc2 = nn.Linear(512, 4) # 512 in, 2 out bc we're doing 2 classes (dog vs cat).
        self.activation = nn.Softmax(dim=1)

    def convs(self, x):
        # max pooling over 2x2
        x = F.relu(self.conv1(x))
        x = self.bn1(x)
        x = F.relu(self.conv2(x))
        x = self.bn2(x)
        x = F.relu(self.conv3(x))
        x = self.bn3(x)
        x = F.relu(self.conv4(x))

        return x

    def forward(self, x):
        x = self.convs(x)
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x) # bc this is our output layer. No activation here.
        return self.activation(x)

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CRNN(nn.Module):
    def __init__(self, audioModel, textModel):
        
        # CNN for Audio
        super().__init__()

        self.audioModel = audioModel.eval()
        self.textModel = textModel.eval()
        
        self.linear = nn.Linear(8, 4)


    def forward(self, ax, tx, l):
        #Audio 
        ax = self.audioModel(ax)
        tx = self.textModel(tx, l)
        
        x = torch.cat((ax, tx), dim=1)
        x = self.linear(x)
        return F.softmax(x, dim=1)

In [66]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
def train_model(model, loss_fn, audio_train_loader, text_train_loader, audio_valid_loader, text_valid_loader, epochs, learning_rate, optimizer, train_losses, valid_losses, comment, change_lr=None): 
    for epoch in range(1,epochs+1):
        
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch, learning_rate)
        for a_data, t_data in tqdm(zip(audio_train_loader, text_train_loader)):
            ax, ay = a_data
            tx, ty, l = t_data
            ax = ax.to(device, dtype=torch.float32)
            ay = ay.to(device, dtype=torch.long)
            tx = tx.to(device, dtype=torch.long)
            y_hat = model(ax, tx, l)
            loss = loss_fn(y_hat, ay)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        train_losses.append(batch_losses)

        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        for a_data, t_data in zip(audio_train_loader, text_train_loader):
            ax, ay = a_data
            tx, ty, l = t_data
            ax = ax.to(device, dtype=torch.float32)
            ay = ay.to(device, dtype=torch.long)
            tx = tx.to(device, dtype=torch.long)
            y_hat = model(ax, tx, l)
            loss = loss_fn(y_hat, ay)
            trace_y.append(ay.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())

        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        unweighted_accuracy = accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        weighted_accuracy = balanced_accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        print("Epoch - {} Train-Loss : {} Valid-Loss : {}".format(epoch, 0, np.mean(valid_losses[-1])))
        print("unweighted_accuracy : {} weighted_accuracy : {}".format(unweighted_accuracy, weighted_accuracy ))
        
    return unweighted_accuracy, weighted_accuracy



In [67]:
def validate(audio_model, text_model, audio_train_loader, audio_valid_loader, text_train_loader, text_valid_loader, comment):
    net = CRNN(audio_model, text_model)
    
    for param in net.audioModel.parameters():
        param.requires_grad=False
    for param in net.textModel.parameters():
        param.requires_grad=False
        
    learning_rate = 0.01
    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
    epochs = 1
    loss_fn = nn.CrossEntropyLoss()
    
    train_losses=[]
    valid_losses=[]
    valid_accuracy = []
    
    uwa, wa = train_model(net, loss_fn, audio_train_loader, text_train_loader, audio_valid_loader, text_valid_loader, epochs, learning_rate, optimizer, train_losses, valid_losses, comment, change_lr=None)
    print("validation over")
    torch.save(net, comment+'.pth')
    del net
    
    return np.mean(valid_losses[-1]), uwa, wa  

In [68]:
ses_idx = np.array([ range(0,523), range(523, 1052), range(1052, 1680), range(1680, 2214), range(2214, len(audio)) ])

In [69]:
train_audio = []
valid_audio = []
train_text = []
valid_text = []

for i in range(ses_idx.shape[0]):
    train_audio.append(Audio(np.delete(data,ses_idx[i], axis=0), np.delete(labels,ses_idx[i], axis=0)))
    valid_audio.append(Audio(data[ses_idx[i]], labels[ses_idx[i]]))
    
    train_text.append(Text(np.delete(X,ses_idx[i], axis=0), np.delete(y,ses_idx[i], axis=0)))
    valid_text.append(Text(X[ses_idx[i]], y[ses_idx[i]]))
        
    

In [40]:
audio_model = torch.load('Audio validation : 0.pth')



In [71]:
losses = []
uwas = []
was = []

In [73]:
audio_train_loader = DataLoader(train_audio[0], batch_size=64, shuffle=False)
audio_valid_loader = DataLoader(valid_audio[0], batch_size=64, shuffle=False)
text_train_loader = DataLoader(train_text[0], batch_size=64, shuffle=False)
text_valid_loader = DataLoader(valid_text[0], batch_size=64, shuffle=False)
comment="Multimodal validation : {}".format(0)
audio_model = torch.load('Audio validation : 0.pth')
text_model = torch.load('Text validation : 0.pth')
loss, uwa, wa = validate(audio_model, text_model, audio_train_loader, audio_valid_loader, text_train_loader, text_valid_loader, comment)


losses.append(loss)
uwas.append(uwa)
was.append(wa)

38it [00:03, 11.17it/s]


Epoch - 1 Train-Loss : 0 Valid-Loss : 0.7958550123791945
unweighted_accuracy : 0.9475206611570248 weighted_accuracy : 0.9589548176744487
validation over


In [74]:
audio_train_loader = DataLoader(train_audio[1], batch_size=64, shuffle=False)
audio_valid_loader = DataLoader(valid_audio[1], batch_size=64, shuffle=False)
text_train_loader = DataLoader(train_text[1], batch_size=64, shuffle=False)
text_valid_loader = DataLoader(valid_text[1], batch_size=64, shuffle=False)
comment="Multimodal validation : {}".format(1)
audio_model = torch.load('Audio validation : 1.pth')
text_model = torch.load('Text validation : 1.pth')
loss, uwa, wa = validate(audio_model, text_model, audio_train_loader, audio_valid_loader, text_train_loader, text_valid_loader, comment)


losses.append(loss)
uwas.append(uwa)
was.append(wa)

38it [00:03, 11.87it/s]


Epoch - 1 Train-Loss : 0 Valid-Loss : 0.7992441873801382
unweighted_accuracy : 0.9362054681027341 weighted_accuracy : 0.9433392950932229
validation over


In [75]:
audio_train_loader = DataLoader(train_audio[2], batch_size=64, shuffle=False)
audio_valid_loader = DataLoader(valid_audio[2], batch_size=64, shuffle=False)
text_train_loader = DataLoader(train_text[2], batch_size=64, shuffle=False)
text_valid_loader = DataLoader(valid_text[2], batch_size=64, shuffle=False)
comment="Multimodal validation : {}".format(2)
audio_model = torch.load('Audio validation : 2.pth')
text_model = torch.load('Text validation : 2.pth')
loss, uwa, wa = validate(audio_model, text_model, audio_train_loader, audio_valid_loader, text_train_loader, text_valid_loader, comment)


losses.append(loss)
uwas.append(uwa)
was.append(wa)

37it [00:03, 11.75it/s]


Epoch - 1 Train-Loss : 0 Valid-Loss : 0.938661921668697
unweighted_accuracy : 0.8043196544276457 weighted_accuracy : 0.8551610882619295
validation over




In [76]:
audio_train_loader = DataLoader(train_audio[3], batch_size=64, shuffle=False)
audio_valid_loader = DataLoader(valid_audio[3], batch_size=64, shuffle=False)
text_train_loader = DataLoader(train_text[3], batch_size=64, shuffle=False)
text_valid_loader = DataLoader(valid_text[3], batch_size=64, shuffle=False)
comment="Multimodal validation : {}".format(3)
audio_model = torch.load('Audio validation : 3.pth')
text_model = torch.load('Text validation : 3.pth')
loss, uwa, wa = validate(audio_model, text_model, audio_train_loader, audio_valid_loader, text_train_loader, text_valid_loader, comment)


losses.append(loss)
uwas.append(uwa)
was.append(wa)

38it [00:03, 11.39it/s]


Epoch - 1 Train-Loss : 0 Valid-Loss : 0.7849988843265333
unweighted_accuracy : 0.9601494396014943 weighted_accuracy : 0.9680825636658217
validation over


In [77]:
audio_train_loader = DataLoader(train_audio[4], batch_size=64, shuffle=False)
audio_valid_loader = DataLoader(valid_audio[4], batch_size=64, shuffle=False)
text_train_loader = DataLoader(train_text[4], batch_size=64, shuffle=False)
text_valid_loader = DataLoader(valid_text[4], batch_size=64, shuffle=False)
comment="Multimodal validation : {}".format(4)
audio_model = torch.load('Audio validation : 4.pth')
text_model = torch.load('Text validation : 4.pth')
loss, uwa, wa = validate(audio_model, text_model, audio_train_loader, audio_valid_loader, text_train_loader, text_valid_loader, comment)


losses.append(loss)
uwas.append(uwa)
was.append(wa)

35it [00:03, 11.64it/s]


Epoch - 1 Train-Loss : 0 Valid-Loss : 0.8538094179970878
unweighted_accuracy : 0.8857271906052394 weighted_accuracy : 0.9171729262529535
validation over


In [82]:
from statistics import mean 
print("Average Loss : {}".format(mean(losses)))
print("Average Unweighted Accuracy : {}  Average Weighted Accuracy Score : {}" .format(mean(uwas), mean(was)))

Average Loss : 0.8290179138896039
Average Unweighted Accuracy : 0.912265305896957  Average Weighted Accuracy Score : 0.9334835761403891
