In [96]:
import pickle 
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import pandas as pd
import numpy as np
import re
import librosa
import string
import matplotlib.pyplot as plt
import os
import noisereduce as nr
import IPython
from collections import Counter
from sklearn.metrics import mean_squared_error, accuracy_score

torch.manual_seed(10)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(10)

In [97]:
from torch.utils.tensorboard import SummaryWriter

In [98]:
if torch.cuda.is_available():
    device=torch.device('cuda:0')
else:
    device=torch.device('cpu')

In [99]:
import pandas as pd
audio = pd.read_csv('audio_df_improvised.csv')

audio.reset_index(inplace=True)
audio.head()

Unnamed: 0,index,start_time,end_time,wav_file,emotion,val,act,dom
0,0,6.2901,8.2357,Ses01F_impro01_F000,neu,2.5,2.5,2.5
1,1,10.01,11.3925,Ses01F_impro01_F001,neu,2.5,2.5,2.5
2,2,14.8872,18.0175,Ses01F_impro01_F002,neu,2.5,2.5,2.5
3,3,27.46,31.49,Ses01F_impro01_F005,neu,2.5,3.5,2.0
4,4,85.27,88.02,Ses01F_impro01_F012,ang,2.0,3.5,3.5


In [100]:
audio.emotion = pd.Categorical(pd.factorize(audio.emotion)[0])

In [101]:
audio.head()

Unnamed: 0,index,start_time,end_time,wav_file,emotion,val,act,dom
0,0,6.2901,8.2357,Ses01F_impro01_F000,0,2.5,2.5,2.5
1,1,10.01,11.3925,Ses01F_impro01_F001,0,2.5,2.5,2.5
2,2,14.8872,18.0175,Ses01F_impro01_F002,0,2.5,2.5,2.5
3,3,27.46,31.49,Ses01F_impro01_F005,0,2.5,3.5,2.0
4,4,85.27,88.02,Ses01F_impro01_F012,1,2.0,3.5,3.5


In [102]:
old_text = pd.read_csv('text_df.csv')
old_text.emotion = audio.emotion
old_text.emotion = pd.Categorical(pd.factorize(old_text.emotion)[0])
old_text.head()

Unnamed: 0,wav_file,emotion,transcription
0,Ses01F_impro01_F000,0,excuse me .
1,Ses01F_impro01_F001,0,yeah .
2,Ses01F_impro01_F002,0,is there a problem ?
3,Ses01F_impro01_F005,0,well what s the problem ? let me change it .
4,Ses01F_impro01_F012,1,that s out of control .


In [103]:
text = pd.DataFrame()
text['data'] = old_text.transcription
text['label'] = old_text.emotion

text.head()

Unnamed: 0,data,label
0,excuse me .,0
1,yeah .,0
2,is there a problem ?,0
3,well what s the problem ? let me change it .,0
4,that s out of control .,1


In [104]:
#tokenization
import spacy
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [105]:
#count number of occurences of each word
counts = Counter()
for index, row in text.iterrows():
    counts.update(tokenize(row['data']))

In [106]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [107]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [108]:
text['encoded_data'] = text['data'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
text.head()

Unnamed: 0,data,label,encoded_data
0,excuse me .,0,"[[2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,yeah .,0,"[[5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,is there a problem ?,0,"[[6, 7, 8, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,well what s the problem ? let me change it .,0,"[[10, 11, 12, 13, 9, 14, 15, 3, 16, 17, 4, 0, ..."
4,that s out of control .,1,"[[18, 12, 19, 20, 21, 4, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
Counter(text['label'])

Counter({0: 1099, 1: 289, 2: 608, 3: 947})

In [109]:
X = list(text['encoded_data'])
y = list(text['label'])
from sklearn.model_selection import train_test_split
X_train = X[:2214]
X_valid = X[2214:]
y_train = y[:2214]
y_valid = y[2214:]
# X_train = X[243:]
# X_valid = X[:243]
# y_train = y[243:]
# y_valid = y[:243]
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

In [110]:
class Text(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [111]:
train_text = Text(X_train, y_train)
valid_text = Text(X_valid, y_valid)


In [112]:
batch_size = 1000
vocab_size = len(words)
text_train_loader = DataLoader(train_text, batch_size=batch_size, shuffle=False)
text_valid_loader = DataLoader(valid_text, batch_size=batch_size, shuffle=False)

In [113]:
def train_text_model(model, loss_fn, train_loader, valid_loader, epochs, learning_rate, optimizer, train_losses, valid_losses, comment, change_lr=None):
    print("Training Text Model")
#     tb = SummaryWriter(comment=comment)
    for epoch in range(1, epochs+1):
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch, learning_rate)
        for x, y, l in text_train_loader:
            x = x.to(device, dtype=torch.long)
            y = y.to(device, dtype=torch.long)
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
            y = y.cpu().detach()
        train_losses.append(batch_losses)
            
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        correct = 0
        for x, y, l in text_valid_loader:
            x = x.to(device, dtype=torch.long)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x, l)
            loss = loss_fn(y_hat, y)
            
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())
            

        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        unweighted_accuracy = accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        weighted_accuracy = balanced_accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        
#         tb.add_scalar(
#             'train_loss', np.mean(train_losses[-1]), epoch
#         )
#         tb.add_scalar(
#             'valid_loss', np.mean(valid_losses[-1]), epoch
#         )
#         tb.add_scalar(
#             'Unweighted Accuracy', unweighted_accuracy, epoch
#         )     
#         tb.add_scalar(
#             'Weighted Accuracy', weighted_accuracy, epoch
#         )   
        if epoch%5 == 0:
            print("Epoch - {} Train-Loss : {} Valid-Loss : {} Correct : {}".format(epoch, np.mean(train_losses[-1]), np.mean(valid_losses[-1]), correct))
            print("unweighted_accuracy : {} weighted_accuracy : {}".format(unweighted_accuracy, weighted_accuracy ))
#         print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

In [22]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in text_train_loader:
            x = x.to(device, dtype=torch.long)
            y = y.to(device, dtype=torch.long)
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            y = y.cpu().detach()
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, text_valid_loader)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, text_valid_loader):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in text_valid_loader:
        x = x.to(device, dtype=torch.long)
        y = y.to(device, dtype=torch.long)
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        y_hat = y_hat.cpu().detach()
        y = y.cpu().detach()
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

Fixed Length module with random embeddings 

In [114]:
class LSTM_fixed_len(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 4)
        self.dropout = nn.Dropout(0.2)
        self.activation = nn.Softmax(dim=1)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.activation(self.linear(ht[-1]))

In [90]:
model_fixed =  LSTM_fixed_len(vocab_size, 56, 64)

In [91]:
if torch.cuda.is_available():
    device=torch.device('cuda:0')
else:
    device=torch.device('cpu')

In [92]:
model_fixed.to(device)

LSTM_fixed_len(
  (embeddings): Embedding(2255, 56, padding_idx=0)
  (lstm): LSTM(56, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=4, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (activation): Softmax(dim=1)
)

In [None]:
train_model(model_fixed, epochs=500, lr=0.01)

Model with glove embeddings

In [115]:
def load_glove_vectors(glove_file=".vector_cache/glove.6B.50d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file) as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [116]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """ Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1   
    return W, np.array(vocab), vocab_to_idx

In [117]:
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [118]:
class LSTM_glove_vecs(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = True 
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 4)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [119]:
lstm = LSTM_glove_vecs(vocab_size, 50, 64, pretrained_weights)


In [120]:
lstm.to(device)

LSTM_glove_vecs(
  (embeddings): Embedding(2255, 50, padding_idx=0)
  (lstm): LSTM(50, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=4, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [123]:
learning_rate = 0.01
optimizer = optim.Adam(lstm.parameters(), lr=learning_rate)
epochs = 300
loss_fn = nn.CrossEntropyLoss()
train_losses=[]
valid_losses=[]
comment = "None"

In [None]:
# train_model(lstm, epochs=300, lr=0.01)
train_text_model(lstm, loss_fn, text_train_loader, text_valid_loader, epochs, learning_rate, optimizer, train_losses, valid_losses, comment, change_lr=None)

Training Text Model




Epoch - 5 Train-Loss : 1.3003970781962078 Valid-Loss : 1.2070510387420654 Correct : 284.0
unweighted_accuracy : 0.3895747599451303 weighted_accuracy : 0.19505494505494506
Epoch - 10 Train-Loss : 1.2977413336435955 Valid-Loss : 1.2229628562927246 Correct : 285.0
unweighted_accuracy : 0.39094650205761317 weighted_accuracy : 0.3622589531680441
Epoch - 15 Train-Loss : 1.295873721440633 Valid-Loss : 1.221449613571167 Correct : 284.0
unweighted_accuracy : 0.3895747599451303 weighted_accuracy : 0.4308437067773167
Epoch - 20 Train-Loss : 1.295275052388509 Valid-Loss : 1.221219539642334 Correct : 282.0
unweighted_accuracy : 0.3868312757201646 weighted_accuracy : 0.13001383125864455
Epoch - 25 Train-Loss : 1.2936944166819255 Valid-Loss : 1.225310206413269 Correct : 283.0
unweighted_accuracy : 0.38820301783264743 weighted_accuracy : 0.21370781322237634
Epoch - 30 Train-Loss : 1.2941944201787312 Valid-Loss : 1.2197266817092896 Correct : 283.0
unweighted_accuracy : 0.38820301783264743 weighted_accu

Epoch - 245 Train-Loss : 0.2530027727286021 Valid-Loss : 1.6727993488311768 Correct : 405.0
unweighted_accuracy : 0.5555555555555556 weighted_accuracy : 0.45577122898669364
Epoch - 250 Train-Loss : 0.23220261434713999 Valid-Loss : 1.8455308675765991 Correct : 396.0
unweighted_accuracy : 0.5432098765432098 weighted_accuracy : 0.449298220650362
Epoch - 255 Train-Loss : 0.20733735958735147 Valid-Loss : 1.908883810043335 Correct : 404.0
unweighted_accuracy : 0.5541838134430727 weighted_accuracy : 0.4530366627800955
Epoch - 260 Train-Loss : 0.25146592656771344 Valid-Loss : 2.0423483848571777 Correct : 393.0
unweighted_accuracy : 0.5390946502057613 weighted_accuracy : 0.4461297777827926
Epoch - 265 Train-Loss : 0.2869443396727244 Valid-Loss : 1.8457614183425903 Correct : 404.0
unweighted_accuracy : 0.5541838134430727 weighted_accuracy : 0.45916997512122615
Epoch - 270 Train-Loss : 0.2292718142271042 Valid-Loss : 1.8411818742752075 Correct : 402.0
unweighted_accuracy : 0.551440329218107 weigh

In [198]:
torch.save(model_fixed, 'fixed_text_model.pth')

  "type " + obj.__name__ + ". It won't be checked "


In [31]:

def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=512, n_mels=128,top_db=80):
    wav,sr = librosa.load(file_path,sr=sr)
    
    if wav.shape[0]<5*sr:
        wav=np.pad(wav,int(np.ceil((5*sr-wav.shape[0])/2)),mode='reflect')
    else:
        wav=wav[:5*sr]
    spec=librosa.feature.melspectrogram(wav, sr=sr, n_fft=n_fft,
              hop_length=hop_length,n_mels=n_mels)
    spec_db=librosa.power_to_db(spec,top_db=top_db)
    return spec_db

def normalize(spec):
    return (spec - spec.mean())/(spec.std())

In [32]:
class Audio(Dataset):
    def __init__(self, base, df, in_col, out_col):
        self.df = df
        self.data = []
        self.labels = []
        for ind in tqdm(range(len(df))):
            row = df.iloc[ind]
            file_path = base + '/' + row[in_col] + '.wav'
            self.data.append(normalize(get_melspectrogram_db(file_path))[np.newaxis,...])
            self.labels.append(row[out_col])
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [68]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
train = audio[:2214]
valid = audio[2214:]
# train  = audio[243:]
# valid = audio[:243]
# train, valid = train_test_split(audio, test_size=0.2, shuffle=False)

In [69]:
train_audio = Audio('/data/home/advaitmb/datasets/sentences', train, 'wav_file', 'emotion')
valid_audio = Audio('/data/home/advaitmb/datasets/sentences', valid, 'wav_file', 'emotion')


100%|██████████| 2214/2214 [00:21<00:00, 102.64it/s]
100%|██████████| 729/729 [00:06<00:00, 105.10it/s]


In [70]:
audio_train_loader = DataLoader(train_audio, batch_size=64, shuffle=False)
audio_valid_loader = DataLoader(valid_audio, batch_size=64, shuffle=False)

In [71]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self):
        super().__init__() # just run the init of parent class (nn.Module)
        self.conv1 = nn.Conv2d(1, 32, 5, 2, padding=1) # input is 1 image, 32 output channels, 5x5 kernel / window
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 5, 2, padding=1) # input is 32, bc the first layer output 32. Then we say the output will be 64 channels, 5x5 kernel / window
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 5, 2, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 128, 5, 2, padding=1)
        
        self.fc1 = nn.Linear((7*8*128), 512) #flattening.
        
        
        self.fc2 = nn.Linear(512, 4) # 512 in, 2 out bc we're doing 2 classes (dog vs cat).
        self.activation = nn.Softmax(dim=1)

    def convs(self, x):
        # max pooling over 2x2
        x = F.relu(self.conv1(x))
        x = self.bn1(x)
        x = F.relu(self.conv2(x))
        x = self.bn2(x)
        x = F.relu(self.conv3(x))
        x = self.bn3(x)
        x = F.relu(self.conv4(x))

        return x

    def forward(self, x):
        x = self.convs(x)
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x) # bc this is our output layer. No activation here.
        return self.activation(x)


cnn = CNN()
print(cnn)

CNN(
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (fc1): Linear(in_features=7168, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=4, bias=True)
  (activation): Softmax(dim=1)
)


In [72]:
cnn.to(device)

CNN(
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
  (fc1): Linear(in_features=7168, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=4, bias=True)
  (activation): Softmax(dim=1)
)

In [122]:
def setlr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

def lr_decay(optimizer, epoch, learning_rate):
    if epoch%10==0:
        new_lr = learning_rate / (10**(epoch//10))
        optimizer = setlr(optimizer, new_lr)
        print("Changed learning rate to {}".format(new_lr))
    return optimizer

In [74]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

def train_model(model, loss_fn, train_loader, valid_loader, epochs, optimizer, train_losses, valid_losses, change_lr=None):
    
    for epoch in range(1,epochs+1):
        
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch, learning_rate)
        for i, data in tqdm(enumerate(train_loader)):
            x, y = data
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        train_losses.append(batch_losses)


#         print("Epoch - {} Train-Loss : {}".format(epoch, np.mean(train_losses[-1])))
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        for i, data in enumerate(valid_loader):
            x, y = data
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            trace_y.append(y.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())

        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        unweighted_accuracy = accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        weighted_accuracy = balanced_accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        print("Epoch - {} Train-Loss : {} Valid-Loss : {}".format(epoch, np.mean(train_losses[-1]), np.mean(valid_losses[-1])))
        print("unweighted_accuracy : {} weighted_accuracy : {}".format(unweighted_accuracy, weighted_accuracy ))
        

In [75]:
# def lr_decay_exp(optimizer, epoch, learning_rate):
#     learning_rate = (2e-1)/(1 + epoch*)
#     optimizer = setlr(optimizer, new_lr)
#     print("Changed learning rate to {}".format(new_lr))
#     return optimizer

In [76]:
learning_rate = 2e-5
optimizer = optim.Adam(cnn.parameters(), lr=learning_rate)
epochs = 15
loss_fn = nn.CrossEntropyLoss()
train_losses=[]
valid_losses=[]

In [77]:
train_model(cnn, loss_fn, audio_train_loader, audio_valid_loader, epochs, optimizer, train_losses, valid_losses, lr_decay)

35it [00:09,  3.88it/s]
0it [00:00, ?it/s]

Epoch - 1 Train-Loss : 1.342157656805856 Valid-Loss : 1.2985920310020447
unweighted_accuracy : 0.3991769547325103 weighted_accuracy : 0.5474965229485396


35it [00:09,  3.80it/s]
0it [00:00, ?it/s]

Epoch - 2 Train-Loss : 1.2856719936643328 Valid-Loss : 1.2494985858599346
unweighted_accuracy : 0.5034293552812071 weighted_accuracy : 0.5708199549359244


35it [00:09,  3.82it/s]
0it [00:00, ?it/s]

Epoch - 3 Train-Loss : 1.227051840509687 Valid-Loss : 1.226865440607071
unweighted_accuracy : 0.522633744855967 weighted_accuracy : 0.5244472740631875


35it [00:09,  3.82it/s]
0it [00:00, ?it/s]

Epoch - 4 Train-Loss : 1.188248782498496 Valid-Loss : 1.212343047062556
unweighted_accuracy : 0.50480109739369 weighted_accuracy : 0.5158692353048849


35it [00:09,  3.82it/s]
0it [00:00, ?it/s]

Epoch - 5 Train-Loss : 1.1604122349194117 Valid-Loss : 1.1991311808427174
unweighted_accuracy : 0.5089163237311386 weighted_accuracy : 0.5284088772330916


35it [00:09,  3.84it/s]
0it [00:00, ?it/s]

Epoch - 6 Train-Loss : 1.1377275909696307 Valid-Loss : 1.1903707285722096
unweighted_accuracy : 0.522633744855967 weighted_accuracy : 0.5542925290069974


35it [00:09,  3.79it/s]
0it [00:00, ?it/s]

Epoch - 7 Train-Loss : 1.118407917022705 Valid-Loss : 1.1838748653729756
unweighted_accuracy : 0.5390946502057613 weighted_accuracy : 0.5768348666899391


35it [00:09,  3.83it/s]
0it [00:00, ?it/s]

Epoch - 8 Train-Loss : 1.1003415516444615 Valid-Loss : 1.1779148280620575
unweighted_accuracy : 0.5500685871056241 weighted_accuracy : 0.5892454041632874


35it [00:09,  3.83it/s]
0it [00:00, ?it/s]

Epoch - 9 Train-Loss : 1.0821815882410322 Valid-Loss : 1.1718366593122482
unweighted_accuracy : 0.5500685871056241 weighted_accuracy : 0.5933934152458404
Changed learning rate to 2.0000000000000003e-06


35it [00:09,  3.84it/s]
0it [00:00, ?it/s]

Epoch - 10 Train-Loss : 1.0617310081209455 Valid-Loss : 1.1630895485480626
unweighted_accuracy : 0.5651577503429356 weighted_accuracy : 0.6184124134606593


35it [00:09,  3.82it/s]
0it [00:00, ?it/s]

Epoch - 11 Train-Loss : 1.0551531655447823 Valid-Loss : 1.1586727102597554
unweighted_accuracy : 0.5706447187928669 weighted_accuracy : 0.6244586453564781


35it [00:09,  3.86it/s]
0it [00:00, ?it/s]

Epoch - 12 Train-Loss : 1.0523176414625985 Valid-Loss : 1.1578205525875092
unweighted_accuracy : 0.5720164609053497 weighted_accuracy : 0.6279733779733779


35it [00:09,  3.58it/s]
0it [00:00, ?it/s]

Epoch - 13 Train-Loss : 1.0502966557230269 Valid-Loss : 1.1573929190635681
unweighted_accuracy : 0.5747599451303155 weighted_accuracy : 0.6329283238291464


35it [00:12,  2.86it/s]
0it [00:00, ?it/s]

Epoch - 14 Train-Loss : 1.0484359008925301 Valid-Loss : 1.1569918741782506
unweighted_accuracy : 0.5761316872427984 weighted_accuracy : 0.6339598342563262


35it [00:13,  2.60it/s]


Epoch - 15 Train-Loss : 1.0466408457074847 Valid-Loss : 1.1566048661867778
unweighted_accuracy : 0.5775034293552812 weighted_accuracy : 0.6349844815760505


In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CRNN(nn.Module):
    def __init__(self, audioModel, textModel):
        
        # CNN for Audio
        super().__init__()

        self.audioModel = audioModel.eval()
        self.textModel = textModel.eval()
        
        self.linear = nn.Linear(8, 4)


    def forward(self, ax, tx, l):
        #Audio 
        ax = self.audioModel(ax)
        tx = self.textModel(tx, l)
        
        x = torch.cat((ax, tx), dim=1)
        x = self.linear(x)
        return F.softmax(x, dim=1)

crnn = CRNN(cnn, lstm)
print(crnn)

CRNN(
  (audioModel): CNN(
    (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(64, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv4): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (fc1): Linear(in_features=7168, out_features=512, bias=True)
    (fc2): Linear(in_features=512, out_features=4, bias=True)
    (activation): Softmax(dim=1)
  )
  (textModel): LSTM_glove_vecs(
    (embeddings): Embedding(2255, 50, padding_idx=0)
    (lstm): LSTM(50, 64, batch_first=True)
    (linear): Linear(in_features=64, out_features=4, bias=True)
    (dropout): Dropout(p=

In [87]:
for param in crnn.audioModel.parameters():
    param.requires_grad=False
    
for param in crnn.textModel.parameters():
    param.requires_grad=False

In [61]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class CRNN(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dim):
        
#         # CNN for Audio
#         super().__init__()
#         self.conv1 = nn.Conv2d(1, 32, 5, 2, padding=1) 
#         self.bn1 = nn.BatchNorm2d(32)
#         self.conv2 = nn.Conv2d(32, 64, 5, 2, padding=1) 
#         self.bn2 = nn.BatchNorm2d(64)
#         self.conv3 = nn.Conv2d(64, 128, 5, 2, padding=1)
#         self.bn3 = nn.BatchNorm2d(128)
#         self.conv4 = nn.Conv2d(128, 128, 5, 2, padding=1)
#         self.afc1 = nn.Linear((7*8*128), 512)
#         self.afc2 = nn.Linear(512, 128) 
        
#         # RNN for Text
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.tfc1 = nn.Linear(hidden_dim, 32)
#         self.dropout = nn.Dropout(0.2)

#         self.linear = nn.Linear(160, 4)
#     def convs(self, x):
#         x = F.relu(self.conv1(x))
#         x = self.bn1(x)
#         x = F.relu(self.conv2(x))
#         x = self.bn2(x)
#         x = F.relu(self.conv3(x))
#         x = self.bn3(x)
#         x = F.relu(self.conv4(x))

#         return x

#     def forward(self, ax, tx):
#         #Audio 
#         ax = self.convs(ax)
#         ax = ax.reshape(ax.size(0), -1)
#         ax = F.relu(self.afc1(ax))
#         ax = F.relu(self.afc2(ax)) # bc this is our output layer. No activation here.
        
#         #Text
#         tx = self.embeddings(tx)
# #         tx = self.dropout(tx)
#         lstm_out, (ht, ct) = self.lstm(tx)
#         tx = F.relu(self.tfc1(ht[-1]))
        
#         x = torch.cat((ax, tx), dim=1)
#         x = self.linear(x)
#         return F.softmax(x, dim=1)

# crnn = CRNN(vocab_size, 50, 64)
# print(crnn)

In [88]:
crnn.to(device)

CRNN(
  (audioModel): CNN(
    (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(64, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv4): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(1, 1))
    (fc1): Linear(in_features=7168, out_features=512, bias=True)
    (fc2): Linear(in_features=512, out_features=4, bias=True)
    (activation): Softmax(dim=1)
  )
  (textModel): LSTM_glove_vecs(
    (embeddings): Embedding(2255, 50, padding_idx=0)
    (lstm): LSTM(50, 64, batch_first=True)
    (linear): Linear(in_features=64, out_features=4, bias=True)
    (dropout): Dropout(p=

In [89]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
def train_model(model, loss_fn, audio_train_loader, text_train_loader, audio_valid_loader, text_valid_loader, epochs, optimizer, train_losses, valid_losses, change_lr=None): 
    for epoch in range(1,epochs+1):
        
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch, learning_rate)
        for a_data, t_data in tqdm(zip(audio_train_loader, text_train_loader)):
            ax, ay = a_data
            tx, ty, l = t_data
            ax = ax.to(device, dtype=torch.float32)
            ay = ay.to(device, dtype=torch.long)
            tx = tx.to(device, dtype=torch.long)
            y_hat = model(ax, tx, l)
            loss = loss_fn(y_hat, ay)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        train_losses.append(batch_losses)

        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []
        for a_data, t_data in zip(audio_train_loader, text_train_loader):
            ax, ay = a_data
            tx, ty, l = t_data
            ax = ax.to(device, dtype=torch.float32)
            ay = ay.to(device, dtype=torch.long)
            tx = tx.to(device, dtype=torch.long)
            y_hat = model(ax, tx, l)
            loss = loss_fn(y_hat, ay)
            trace_y.append(ay.cpu().detach().numpy())
            trace_yhat.append(y_hat.cpu().detach().numpy())      
            batch_losses.append(loss.item())

        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        unweighted_accuracy = accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        weighted_accuracy = balanced_accuracy_score( trace_yhat.argmax(axis=1), trace_y )
        print("Epoch - {} Train-Loss : {} Valid-Loss : {}".format(epoch, 0, np.mean(valid_losses[-1])))
        print("unweighted_accuracy : {} weighted_accuracy : {}".format(unweighted_accuracy, weighted_accuracy ))


In [90]:
batch_size = 64
vocab_size = len(words)
text_train_loader = DataLoader(train_text, batch_size=batch_size, shuffle=False)
text_valid_loader = DataLoader(valid_text, batch_size=batch_size, shuffle=False)

In [91]:
learning_rate = 0.01
optimizer = optim.SGD(crnn.parameters(), lr=learning_rate, momentum=0.9)
epochs = 1
loss_fn = nn.CrossEntropyLoss()
train_losses=[]
valid_losses=[]

In [92]:
print(audio_train_loader.dataset.labels[100:120])
print(text_train_loader.dataset.y[100:120])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [95]:
train_model(crnn, loss_fn, audio_train_loader, text_train_loader, audio_valid_loader, text_valid_loader, epochs, optimizer, train_losses, valid_losses, lr_decay)

35it [00:03, 11.14it/s]


Epoch - 1 Train-Loss : 0 Valid-Loss : 0.9993273683956692
unweighted_accuracy : 0.7461607949412827 weighted_accuracy : 0.8384779944910277


In [70]:
god = torch.load('god_model.pth')

In [241]:
for name, param in crnn.named_parameters():
    print(name)

audioModel.conv1.weight
audioModel.conv1.bias
audioModel.bn1.weight
audioModel.bn1.bias
audioModel.conv2.weight
audioModel.conv2.bias
audioModel.bn2.weight
audioModel.bn2.bias
audioModel.conv3.weight
audioModel.conv3.bias
audioModel.bn3.weight
audioModel.bn3.bias
audioModel.conv4.weight
audioModel.conv4.bias
audioModel.fc1.weight
audioModel.fc1.bias
audioModel.fc2.weight
audioModel.fc2.bias
textModel.embeddings.weight
textModel.lstm.weight_ih_l0
textModel.lstm.weight_hh_l0
textModel.lstm.bias_ih_l0
textModel.lstm.bias_hh_l0
textModel.linear.weight
textModel.linear.bias
linear.weight
linear.bias


In [230]:
for name, param in god.named_parameters():
    print(name)

audioModel.conv1.weight
audioModel.conv1.bias
audioModel.bn1.weight
audioModel.bn1.bias
audioModel.conv2.weight
audioModel.conv2.bias
audioModel.bn2.weight
audioModel.bn2.bias
audioModel.conv3.weight
audioModel.conv3.bias
audioModel.bn3.weight
audioModel.bn3.bias
audioModel.conv4.weight
audioModel.conv4.bias
audioModel.fc1.weight
audioModel.fc1.bias
audioModel.fc2.weight
audioModel.fc2.bias
textModel.embeddings.weight
textModel.lstm.weight_ih_l0
textModel.lstm.weight_hh_l0
textModel.lstm.bias_ih_l0
textModel.lstm.bias_hh_l0
textModel.linear.weight
textModel.linear.bias
linear.weight
linear.bias
