In [63]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.metrics import classification_report
import platform 
import sys
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import torch
from sklearn import preprocessing

# For data conversions
from torch.utils.data import DataLoader

# To get NN models in pytorch
import torch.nn as nn

# To get optimizers such as GD
import torch.optim as optim

# To get relu and tanh functions
import torch.nn.functional as F

import torch.utils.data as data_utils
from torch.utils.data import Dataset

In [64]:
has_gpu = torch.cuda.is_available()

device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print("GPU is", "available" if has_gpu else "NOT AVAILABLE")
print(f"Target device is {device}")

import warnings
warnings.filterwarnings("ignore")

Python Platform: Windows-10-10.0.22621-SP0
PyTorch Version: 2.0.0+cu117
Python 3.10.10 | packaged by Anaconda, Inc. | (main, Mar 21 2023, 18:39:17) [MSC v.1916 64 bit (AMD64)]
Pandas 1.5.2
GPU is available
Target device is cuda


In [65]:
# Reading text features that are in line with alignment data

In [66]:
text_aligned_train_features = np.load('train_features_alignment.npy')
text_aligned_test_features = np.load('test_features_alignment.npy')

In [67]:
text_aligned_train_features

array([[ 5.04321873e-01,  4.63240236e-01, -3.76627706e-02, ...,
        -1.18496947e-01,  3.52687031e-01,  3.75959247e-01],
       [ 1.97530389e-01,  2.23903388e-01,  8.67783278e-03, ...,
        -1.96769819e-01,  2.87841946e-01,  2.99373597e-01],
       [ 2.55765110e-01,  1.04705095e-01, -3.99574637e-05, ...,
        -4.20659125e-01,  5.00405431e-01,  3.90712887e-01],
       ...,
       [-5.71486726e-02,  1.16940103e-01,  7.00608790e-02, ...,
        -1.42893016e-01,  9.89775807e-02,  3.86829376e-01],
       [ 4.03575838e-01,  3.52924109e-01,  1.08982280e-01, ...,
         5.49964160e-02,  3.62594336e-01,  2.75151968e-01],
       [ 4.99402825e-03,  2.02879816e-01,  2.22124338e-01, ...,
        -3.82540077e-02,  1.75538465e-01,  1.02757707e-01]], dtype=float32)

In [68]:
text_aligned_test_features

array([[ 0.04274414, -0.0171528 , -0.10105078, ...,  0.00315719,
         0.48995572,  0.5435381 ],
       [-0.13518153,  0.2433322 ,  0.03738693, ..., -0.10703974,
         0.15525305,  0.14268635],
       [-0.05714867,  0.1169401 ,  0.07006088, ..., -0.14289302,
         0.09897758,  0.38682938],
       ...,
       [-0.05788826,  0.03145726, -0.46042058, ...,  0.2709097 ,
         0.09669909,  0.41208312],
       [ 0.08848633,  0.06102172, -0.21425134, ...,  0.46084163,
         0.5064107 ,  0.46143913],
       [ 0.14460252,  0.18129203, -0.04733727, ..., -0.19956347,
         0.87466955,  0.3786211 ]], dtype=float32)

In [69]:
# Reading audio features that are in line with alignment data

In [70]:
audio_aligned_train_features = pd.read_csv('train_features_audio_aligned.csv')
audio_aligned_test_features = pd.read_csv('test_features_audio_aligned.csv')

In [71]:
audio_aligned_train_features.head(5)

Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label
0,24551,A,otherwise theyre pretty smelly,4019.txt,220.07175,221.542875,0.058766,0.731051,0.63811,0.89939,0.884808,0.852887,sv
1,23101,B,and i think a lot of their friends feel the sa...,2709.txt,432.140875,434.44325,0.098493,0.476925,0.364297,0.851667,0.897019,0.761266,sd
2,27063,B,well now so if you were going to have a dinner...,3506.txt,3.879375,7.575625,0.119491,0.390849,0.420487,0.963859,0.968053,0.937679,qw
3,20702,A,they uh,3228.txt,55.163,55.805875,-0.028226,0.336798,0.912177,0.927696,0.921769,0.901077,%
4,5976,B,i i dont feel like theyre a benefit to society...,3247.txt,40.875875,44.6155,0.110553,0.543148,1.124164,0.841859,0.877972,0.821876,sv


In [72]:
audio_aligned_test_features.head(5)

Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label
0,7246,A,and one is uh four,3457.txt,5.724875,7.149125,0.07945,0.745006,0.44698,0.812936,0.790438,0.58193,sd
1,1323,B,yeah,2349.txt,18.924125,19.216375,-0.029854,0.196361,0.130172,0.70411,0.687018,0.650901,b
2,33043,A,okay,2608.txt,5.7905,6.1505,-0.012475,0.378718,2.083808,0.81636,0.851317,0.702781,aa
3,26715,A,regarding uh taxes i you know,4725.txt,0.65,1.9,0.033159,0.370226,0.428816,0.890855,0.886848,0.928268,"fo_o_fw_""_by_bc"
4,23466,A,and that was actually after the war was over,2253.txt,291.411,292.961,0.051122,0.643769,0.5047,0.912026,0.881303,0.915515,sd


In [73]:
# Early fusion for Multimodal model

In [74]:
multimodal_data_train = pd.DataFrame(columns=['features', 'label'])
for index, row in audio_aligned_train_features.iterrows():
    multimodal_feature = np.concatenate((
        text_aligned_train_features[index],
        audio_aligned_train_features.iloc[index][6:-1].to_numpy()),
        axis=0)
    label = audio_aligned_train_features.iloc[index]['Label']
    temp = {'features': multimodal_feature, 'label': label}
    multimodal_data_train = pd.concat([multimodal_data_train, pd.DataFrame([temp])], ignore_index=True)

multimodal_data_train.head(5)

Unnamed: 0,features,label
0,"[0.5043218731880188, 0.46324023604393005, -0.0...",sv
1,"[0.19753038883209229, 0.22390338778495789, 0.0...",sd
2,"[0.2557651102542877, 0.1047050952911377, -3.99...",qw
3,"[0.16346515715122223, 0.31099289655685425, 0.2...",%
4,"[0.40566009283065796, 0.505189836025238, -0.06...",sv


In [75]:
le = preprocessing.LabelEncoder()
le.fit(multimodal_data_train['label'])
train_encoded_labels = le.transform(multimodal_data_train['label'])

multimodal_data_train['da'] = train_encoded_labels
multimodal_data_train.head(5)

Unnamed: 0,features,label,da
0,"[0.5043218731880188, 0.46324023604393005, -0.0...",sv,38
1,"[0.19753038883209229, 0.22390338778495789, 0.0...",sd,37
2,"[0.2557651102542877, 0.1047050952911377, -3.99...",qw,33
3,"[0.16346515715122223, 0.31099289655685425, 0.2...",%,0
4,"[0.40566009283065796, 0.505189836025238, -0.06...",sv,38


In [76]:
multimodal_data_test = pd.DataFrame(columns=['features', 'label'])
for index, row in audio_aligned_test_features.iterrows():
    multimodal_feature = np.concatenate((
        text_aligned_test_features[index],
        audio_aligned_test_features.iloc[0][6:-1].to_numpy()),
        axis=0)
    label = audio_aligned_test_features.iloc[index]['Label']
    temp = {'features': multimodal_feature, 'label': label}
    multimodal_data_test = pd.concat([multimodal_data_test, pd.DataFrame([temp])], ignore_index=True)

multimodal_data_test.head(5)

Unnamed: 0,features,label
0,"[0.042744144797325134, -0.017152801156044006, ...",sd
1,"[-0.13518153131008148, 0.24333220720291138, 0....",b
2,"[-0.05714867264032364, 0.11694010347127914, 0....",aa
3,"[0.008160506375133991, 0.3085247278213501, -0....","fo_o_fw_""_by_bc"
4,"[0.023708511143922806, 0.05153898894786835, -0...",sd


In [77]:
test_encoded_labels = le.transform(multimodal_data_test['label'])

multimodal_data_test['da'] = test_encoded_labels
multimodal_data_test.head(5)

Unnamed: 0,features,label,da
0,"[0.042744144797325134, -0.017152801156044006, ...",sd,37
1,"[-0.13518153131008148, 0.24333220720291138, 0....",b,10
2,"[-0.05714867264032364, 0.11694010347127914, 0....",aa,5
3,"[0.008160506375133991, 0.3085247278213501, -0....","fo_o_fw_""_by_bc",20
4,"[0.023708511143922806, 0.05153898894786835, -0...",sd,37


In [78]:
# Deep DA Classification

In [79]:
x_train_torch = torch.Tensor(list(multimodal_data_train['features']))
x_test_torch = torch.Tensor(list(multimodal_data_test['features']))

In [80]:
y_train = torch.LongTensor(list(multimodal_data_train['da']))
y_test = torch.LongTensor(list(multimodal_data_test['da']))

In [81]:
zipper = lambda x,y : list(zip(x,y))

In [82]:
x_train_torch = zipper(x_train_torch,y_train)
x_test_torch = zipper(x_test_torch,y_test)

In [83]:
# Hyperparameters
num_classes = 41
learning_rate = 0.05
batch_size = 16
num_epochs = 100
num_workers = 1
input_size = 774

In [84]:
# DataLoader
train_loader = torch.utils.data.DataLoader(x_train_torch, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(x_test_torch, batch_size=batch_size)

In [85]:
## Model 01: FNN

In [86]:
class FeedForwardNeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.input_size = input_size
        self.feedforwardNN = nn.Sequential(
        nn.Linear(input_size, 500),
        nn.ReLU(),
        nn.Linear(500, 200),
        nn.ReLU(),
        nn.Linear(200, 100),
        nn.ReLU(),
        nn.Linear(100, num_classes),
        nn.Softmax()
    )

    def forward(self, X):
        return self.feedforwardNN(X)

    def predict(self, X):
        Y_pred = self.forward(X)
        return Y_pred

In [87]:
# Initialize model
feedforwardModel = FeedForwardNeuralNetwork(input_size = input_size,
                                            num_classes = num_classes).to(device)

In [88]:
# Specify criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(feedforwardModel.parameters(), lr=learning_rate)

In [89]:
# Function to get accuracy scores
def accuracy(y_pred, y_test):
    pred = torch.argmax(y_pred, dim=1)
    return (pred == y_test).float().mean()

In [90]:
# Function to get accuracy scores
def f1_score_NN(y_pred, y_test):
    pred = torch.argmax(y_pred, dim=1).cpu()
    y_test_cpu = y_test.cpu()
    return f1_score(y_test_cpu, pred, average='micro')

In [91]:
### Training Feedforward NN

In [92]:
# Start Training
def train_FF(model, num_epochs, train_loader, test_loader, modelName):
    test_loss_min = np.Inf
    for epoch in range(num_epochs):

        train_loss = 0.0
        test_loss = 0.0

        # Prepare the model for training
        model.train()

        for data, target in train_loader:
            data = data.to(device=device)
            target = target.type(torch.LongTensor).to(device=device)

            data = data.reshape(data.shape[0], -1)

            # forward pass: compute predicted outputs by passing
            # inputs to the model
            scores = model(data)

            # calculate the loss
            loss = criterion(scores, target)

            # clear the gradients of all optimized variables
            optimizer.zero_grad()

            # backward pass: compute gradient of the loss
            # with respect to model parameters
            loss.backward()

            # perform a single optimization step (parameter update)
            optimizer.step()

            train_loss += loss.item()*data.size(0)

        # Prepare model for evaluation
        model.eval()

        with torch.no_grad():
            for data, target in test_loader:
                data = data.to(device=device)
                target = target.type(torch.LongTensor).to(device=device)

                scores = model(data)
                loss = criterion(scores, target)

                test_loss += loss.item()*data.size(0)

        train_loss = train_loss/len(train_loader.dataset)
        test_loss = test_loss/len(test_loader.dataset)

        print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
            epoch+1,
            train_loss,
            test_loss
            ))

        # save model if validation loss has decreased
        if test_loss <= test_loss_min:
            print('Test loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            test_loss_min,
            test_loss))
            torch.save(model.state_dict(), modelName + '.pt')
            test_loss_min = test_loss

In [93]:
train_FF(feedforwardModel, num_epochs=num_epochs, train_loader=train_loader, test_loader=test_loader, modelName='feedforwardModel')

Epoch: 1 	Training Loss: 3.398824 	Test Loss: 3.333682
Test loss decreased (inf --> 3.333682).  Saving model ...
Epoch: 2 	Training Loss: 3.333351 	Test Loss: 3.333670
Test loss decreased (3.333682 --> 3.333670).  Saving model ...
Epoch: 3 	Training Loss: 3.333344 	Test Loss: 3.333667
Test loss decreased (3.333670 --> 3.333667).  Saving model ...
Epoch: 4 	Training Loss: 3.333341 	Test Loss: 3.333665
Test loss decreased (3.333667 --> 3.333665).  Saving model ...
Epoch: 5 	Training Loss: 3.333340 	Test Loss: 3.333665
Test loss decreased (3.333665 --> 3.333665).  Saving model ...
Epoch: 6 	Training Loss: 3.333339 	Test Loss: 3.333664
Test loss decreased (3.333665 --> 3.333664).  Saving model ...
Epoch: 7 	Training Loss: 3.333339 	Test Loss: 3.333664
Test loss decreased (3.333664 --> 3.333664).  Saving model ...
Epoch: 8 	Training Loss: 3.333338 	Test Loss: 3.333664
Test loss decreased (3.333664 --> 3.333664).  Saving model ...
Epoch: 9 	Training Loss: 3.333338 	Test Loss: 3.333663
Test l

In [94]:
# Load model with best test loss
feedforwardModel.load_state_dict(torch.load('feedforwardModel.pt'))

<All keys matched successfully>

In [181]:
# Reference:
# https://www.kaggle.com/code/tauseef6462/simple-feedforward-neural-network-using-pytorch
def prepare_for_accuracy(model,
                         test_dataset_tensor,
                         test_target_tensor):
    test_dataset_tensor = test_dataset_tensor.to(device=device)
    Y_pred_test = model.predict(test_dataset_tensor)

    Y_pred_test = Y_pred_test.to(device=device)
    test_target_tensor = test_target_tensor.to(device=device)
    accuracy_test = accuracy(Y_pred_test, test_target_tensor)
    f1_score_val = f1_score_NN(Y_pred_test, test_target_tensor)
    print("Test accuracy of Network",(accuracy_test))
    print("F1 score of Network",(f1_score_val))

In [96]:
x_train_torch = torch.Tensor(list(multimodal_data_train['features']))
x_test_torch = torch.Tensor(list(multimodal_data_test['features']))
y_train = torch.LongTensor(list(multimodal_data_train['da']))
y_test = torch.LongTensor(list(multimodal_data_test['da']))

In [97]:
print('Accuracy for FeedForward Network :: ')
prepare_for_accuracy(feedforwardModel,
                     test_dataset_tensor=x_test_torch,
                     test_target_tensor=y_test)

Accuracy for FeedForward Network :: 
Test accuracy of Network tensor(0.4210, device='cuda:0')
F1 score of Network 0.4209650582362729


In [98]:
### Using VGGish embeddings

In [99]:
cleaned_train_data__with_embeddings = pd.read_csv('train_features_audio_aligned__with_wav_files__embeddings.csv')
cleaned_test_data__with_embeddings = pd.read_csv('test_features_audio_aligned__with_wav_files__embeddings.csv')

In [100]:
# Early fusion for Multimodal model

In [101]:
multimodal_data_train = pd.DataFrame(columns=['features', 'label'])
for index, row in cleaned_train_data__with_embeddings.iterrows():
    audio_embeddings = list()
    with open('temp.npy', encoding='utf-8', mode='w') as npyf:
        temp = cleaned_train_data__with_embeddings.iloc[index]['audio_embeddings']
        temp = temp.replace('\n', '')[1: -1]
        npyf.write(temp)
    with open('temp.npy', encoding='utf-8', mode='r') as npyf:
        lines = npyf.readlines()[0].split(' ')
        for line in lines:
            if len(line) > 0:
                audio_embeddings.append(float(line))
    multimodal_feature = np.concatenate((
        text_aligned_train_features[index],
        audio_embeddings),
        axis=0)
    label = cleaned_train_data__with_embeddings.iloc[index]['Label']
    temp = {'features': multimodal_feature, 'label': label}
    multimodal_data_train = pd.concat([multimodal_data_train, pd.DataFrame([temp])], ignore_index=True)

multimodal_data_train.head(5)

Unnamed: 0,features,label
0,"[0.5043218731880188, 0.46324023604393005, -0.0...",sv
1,"[0.19753038883209229, 0.22390338778495789, 0.0...",sd
2,"[0.2557651102542877, 0.1047050952911377, -3.99...",^q
3,"[0.16346515715122223, 0.31099289655685425, 0.2...",sd
4,"[0.40566009283065796, 0.505189836025238, -0.06...",sd


In [102]:
multimodal_data_test = pd.DataFrame(columns=['features', 'label'])
for index, row in cleaned_test_data__with_embeddings.iterrows():
    audio_embeddings = list()
    with open('temp.npy', encoding='utf-8', mode='w') as npyf:
        temp = cleaned_test_data__with_embeddings.iloc[index]['audio_embeddings']
        temp = temp.replace('\n', '')[1: -1]
        npyf.write(temp)
    with open('temp.npy', encoding='utf-8', mode='r') as npyf:
        lines = npyf.readlines()[0].split(' ')
        for line in lines:
            if len(line) > 0:
                audio_embeddings.append(float(line))
    multimodal_feature = np.concatenate((
        text_aligned_test_features[index],
        audio_embeddings),
        axis=0)
    label = cleaned_test_data__with_embeddings.iloc[index]['Label']
    temp = {'features': multimodal_feature, 'label': label}
    multimodal_data_test = pd.concat([multimodal_data_test, pd.DataFrame([temp])], ignore_index=True)

multimodal_data_test.head(5)

Unnamed: 0,features,label
0,"[0.042744144797325134, -0.017152801156044006, ...",sd
1,"[-0.13518153131008148, 0.24333220720291138, 0....","fo_o_fw_""_by_bc"
2,"[-0.05714867264032364, 0.11694010347127914, 0....",sd
3,"[0.008160506375133991, 0.3085247278213501, -0....",sd
4,"[0.023708511143922806, 0.05153898894786835, -0...",sv


In [103]:
le = preprocessing.LabelEncoder()
le.fit(multimodal_data_train['label'])
train_encoded_labels = le.transform(multimodal_data_train['label'])

multimodal_data_train['da'] = train_encoded_labels
multimodal_data_train.head(5)

Unnamed: 0,features,label,da
0,"[0.5043218731880188, 0.46324023604393005, -0.0...",sv,37
1,"[0.19753038883209229, 0.22390338778495789, 0.0...",sd,36
2,"[0.2557651102542877, 0.1047050952911377, -3.99...",^q,3
3,"[0.16346515715122223, 0.31099289655685425, 0.2...",sd,36
4,"[0.40566009283065796, 0.505189836025238, -0.06...",sd,36


In [104]:
multimodal_data_test = multimodal_data_test[multimodal_data_test['label'] != '^g']
multimodal_data_test.head(5)

Unnamed: 0,features,label
0,"[0.042744144797325134, -0.017152801156044006, ...",sd
1,"[-0.13518153131008148, 0.24333220720291138, 0....","fo_o_fw_""_by_bc"
2,"[-0.05714867264032364, 0.11694010347127914, 0....",sd
3,"[0.008160506375133991, 0.3085247278213501, -0....",sd
4,"[0.023708511143922806, 0.05153898894786835, -0...",sv


In [105]:
test_encoded_labels = le.transform(multimodal_data_test['label'])

multimodal_data_test['da'] = test_encoded_labels
multimodal_data_test.head(5)

Unnamed: 0,features,label,da
0,"[0.042744144797325134, -0.017152801156044006, ...",sd,36
1,"[-0.13518153131008148, 0.24333220720291138, 0....","fo_o_fw_""_by_bc",19
2,"[-0.05714867264032364, 0.11694010347127914, 0....",sd,36
3,"[0.008160506375133991, 0.3085247278213501, -0....",sd,36
4,"[0.023708511143922806, 0.05153898894786835, -0...",sv,37


In [106]:
# Deep DA Classification

In [168]:
x_train_torch = torch.Tensor(list(multimodal_data_train['features']))
x_test_torch = torch.Tensor(list(multimodal_data_test['features']))

In [169]:
y_train = torch.LongTensor(list(multimodal_data_train['da']))
y_test = torch.LongTensor(list(multimodal_data_test['da']))

In [170]:
zipper = lambda x,y : list(zip(x,y))

In [171]:
x_train_torch = zipper(x_train_torch,y_train)
x_test_torch = zipper(x_test_torch,y_test)

In [172]:
# DataLoader
train_loader = torch.utils.data.DataLoader(x_train_torch, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(x_test_torch, batch_size=batch_size)

In [173]:
## Model 01: FNN

In [174]:
# Hyperparameters
# num_classes = 41
num_classes = 40
learning_rate = 0.05
batch_size = 16
num_epochs = 500
num_workers = 1
input_size = 896

In [175]:
# Initialize model
vggishFeedforwardModel = FeedForwardNeuralNetwork(input_size = input_size,
                                            num_classes = num_classes).to(device)

In [176]:
# Specify criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(feedforwardModel.parameters(), lr=learning_rate)

In [177]:
train_FF(vggishFeedforwardModel, num_epochs=num_epochs, train_loader=train_loader, test_loader=test_loader, modelName='vggishFeedforwardModel')

Epoch: 1 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (inf --> 3.689347).  Saving model ...
Epoch: 2 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 3 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 4 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 5 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 6 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 7 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 8 	Training Loss: 3.689340 	Test Loss: 3.689347
Test loss decreased (3.689347 --> 3.689347).  Saving model ...
Epoch: 9 	Training Loss: 3.689340 	Test Loss: 3.689347
Test l

In [183]:
# Load model with best test loss
vggishFeedforwardModel.load_state_dict(torch.load('vggishFeedforwardModel.pt'))

<All keys matched successfully>

In [184]:
x_train_torch = torch.Tensor(list(multimodal_data_train['features']))
x_test_torch = torch.Tensor(list(multimodal_data_test['features']))
y_train = torch.LongTensor(list(multimodal_data_train['da']))
y_test = torch.LongTensor(list(multimodal_data_test['da']))

In [185]:
print('Accuracy for FeedForward Network :: ')
prepare_for_accuracy(vggishFeedforwardModel,
                     test_dataset_tensor=x_test_torch,
                     test_target_tensor=y_test)

Accuracy for FeedForward Network :: 
Test accuracy of Network tensor(0.4835, device='cuda:0')
F1 score of Network 0.48348348348348347


In [120]:
### Model 02: A different FFN

In [121]:
# Hyperparameters
# num_classes = 41
num_classes = 40
learning_rate = 0.08
batch_size = 16
num_epochs = 100
num_workers = 1
input_size = 1000
audio_size = 128
text_size = 768

In [122]:
class FeedForwardNeuralNetwork_2(nn.Module):
    def __init__(self, audio_size, text_size, input_size, num_classes):
        super().__init__()
        self.input_size = input_size
        self.feedforwardNN_text = nn.Sequential(
            nn.Linear(text_size, 500)
        )
        self.feedforwardNN_audio = nn.Sequential(
            nn.Linear(audio_size, 500)
        )

        self.feedforwardNN = nn.Sequential(
        nn.Linear(input_size, 500),
        nn.ReLU(),
        nn.Linear(500, 200),
        nn.ReLU(),
        nn.Linear(200, 100),
        nn.ReLU(),
        nn.Linear(100, num_classes),
        nn.Softmax()
        )

    def forward(self, text_embds, audio_embds):
        text_weights = self.feedforwardNN_text(text_embds)
        audio_weights = self.feedforwardNN_audio(audio_embds)
        merged = torch.concat((text_weights, audio_weights), dim=1)
        return self.feedforwardNN(merged)

    def predict(self, text_embds, audio_embds):
        text_weights = self.feedforwardNN_text(text_embds)
        audio_weights = self.feedforwardNN_audio(audio_embds)
        merged = torch.concat((text_weights, audio_weights) ,dim=1)
        Y_pred = self.feedforwardNN(merged)
        return Y_pred

In [123]:
train_audio_embeddings = list()
train_text_embeddings = list()
for index, row in cleaned_train_data__with_embeddings.iterrows():
    temp2 = list()
    with open('temp.npy', encoding='utf-8', mode='w') as npyf:
        temp = cleaned_train_data__with_embeddings.iloc[index]['audio_embeddings']
        temp = temp.replace('\n', '')[1: -1]
        npyf.write(temp)
    with open('temp.npy', encoding='utf-8', mode='r') as npyf:
        lines = npyf.readlines()[0].split(' ')
        for line in lines:
            if len(line) > 0:
                temp2.append(float(line))
    train_text_embeddings.append(text_aligned_train_features[index])
    train_audio_embeddings.append(temp2)

In [124]:
test_audio_embeddings = list()
test_text_embeddings = list()
for index, row in cleaned_test_data__with_embeddings.iterrows():
    temp2 = list()
    with open('temp.npy', encoding='utf-8', mode='w') as npyf:
        temp = cleaned_test_data__with_embeddings.iloc[index]['audio_embeddings']
        temp = temp.replace('\n', '')[1: -1]
        npyf.write(temp)
    with open('temp.npy', encoding='utf-8', mode='r') as npyf:
        lines = npyf.readlines()[0].split(' ')
        for line in lines:
            if len(line) > 0:
                temp2.append(float(line))
    if row['Label'] == '^g':
        continue
    test_text_embeddings.append(text_aligned_test_features[index])
    test_audio_embeddings.append(temp2)

In [125]:
# x_train_torch_text = torch.Tensor(list(text_aligned_train_features))
# x_test_torch_text = torch.Tensor(list(text_aligned_test_features))

x_train_torch_text = torch.Tensor(train_text_embeddings)
x_test_torch_text = torch.Tensor(test_text_embeddings)

x_train_torch_audio = torch.Tensor(train_audio_embeddings)
x_test_torch_audio = torch.Tensor(test_audio_embeddings)

In [126]:
le = preprocessing.LabelEncoder()
le.fit(cleaned_train_data__with_embeddings['Label'])
train_encoded_labels = le.transform(cleaned_train_data__with_embeddings['Label'])

cleaned_train_data__with_embeddings['da'] = train_encoded_labels

cleaned_test_data__with_embeddings = cleaned_test_data__with_embeddings[cleaned_test_data__with_embeddings['Label'] != '^g']
test_encoded_labels = le.transform(cleaned_test_data__with_embeddings['Label'])

cleaned_test_data__with_embeddings['da'] = test_encoded_labels
cleaned_train_data__with_embeddings.head(5)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc,audio_embeddings,da
0,0,0,24551,A,otherwise theyre pretty smelly,4019.txt,220.07175,221.542875,0.058766,0.731051,0.63811,0.89939,0.884808,0.852887,sv,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04019.sph_0...,[-7.4466288e-01 -5.5393863e-01 -5.2604270e-01 ...,37
1,1,12,1375,B,she got the treatments,3057.txt,238.462375,239.7735,0.049801,0.327706,0.288991,0.833201,0.821875,0.72311,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03057.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03057.sph_1...,[-0.7556931 -0.18065795 -0.32506776 -0.808879...,36
2,2,14,28359,B,this is the reality,2418.txt,207.996125,209.286125,0.09164,0.485693,0.503992,0.841941,0.762634,0.675776,^q,Audio\swb1_LDC97S62\swb1_d1\data\sw02418.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02418.sph_1...,[-5.1421905e-01 -5.2362168e-01 -4.5810127e-01 ...,3
3,3,15,5278,A,so we kind of looked around,3252.txt,126.830125,128.693375,0.062573,0.717063,0.323278,0.864236,0.876196,0.871295,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03252.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03252.sph_0...,[-0.82808685 -0.5073712 -0.84148455 -0.899432...,36
4,4,20,310,A,hes in in florida jail now,3334.txt,156.163125,157.872375,0.136232,0.20211,0.324408,0.822553,0.860201,0.623045,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw03334.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw03334.sph_0...,[-1.27045846e+00 -5.35477519e-01 -2.04725593e-...,36


In [127]:
cleaned_test_data__with_embeddings.head(5)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Channel,Utterance,file,start,end,pitch0,pitch1,pitch2,fb00,fb01,fb02,Label,raw_audio_file_loc,split_audio_file_loc,audio_embeddings,da
0,0,0,7246,A,and one is uh four,3457.txt,5.724875,7.149125,0.07945,0.745006,0.44698,0.812936,0.790438,0.58193,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw03457.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw03457.sph_0...,[-0.8501383 -0.6682229 -0.7700393 -0.899509...,36
1,1,3,26715,A,regarding uh taxes i you know,4725.txt,0.65,1.9,0.033159,0.370226,0.428816,0.890855,0.886848,0.928268,"fo_o_fw_""_by_bc",Audio\swb1_LDC97S62\swb1_d3\data\sw04725.sph,Audio\swb1_LDC97S62\swb1_d3\data\sw04725.sph_0...,[-0.29266676 -0.39997765 -0.05619848 -0.805616...,19
2,2,4,23466,A,and that was actually after the war was over,2253.txt,291.411,292.961,0.051122,0.643769,0.5047,0.912026,0.881303,0.915515,sd,Audio\swb1_LDC97S62\swb1_d1\data\sw02253.sph,Audio\swb1_LDC97S62\swb1_d1\data\sw02253.sph_0...,[-0.57869494 -0.02115735 -0.22905743 -1.103605...,36
3,3,7,30504,A,oh well he made it at fifty,2623.txt,577.62475,579.52525,0.117541,0.304582,0.497606,0.903913,0.885161,0.875586,sd,Audio\swb1_LDC97S62\swb1_d2\data\sw02623.sph,Audio\swb1_LDC97S62\swb1_d2\data\sw02623.sph_0...,[-7.7571785e-01 -3.5373911e-01 -3.1975806e-01 ...,36
4,4,9,8670,B,and another thing is cost,4649.txt,135.940125,137.436,0.075767,0.416207,0.655162,0.90832,0.937884,0.858942,sv,Audio\swb1_LDC97S62\swb1_d4\data\sw04649.sph,Audio\swb1_LDC97S62\swb1_d4\data\sw04649.sph_1...,[-0.6750778 -0.38885894 -0.5354514 -0.978235...,37


In [128]:
y_train = torch.LongTensor(list(cleaned_train_data__with_embeddings['da']))
y_test = torch.LongTensor(list(cleaned_test_data__with_embeddings['da']))

In [129]:
zipper = lambda x,y,z : list(zip(x,y,z))

In [130]:
x_train_torch = zipper(x_train_torch_text, x_train_torch_audio, y_train)
x_test_torch = zipper(x_test_torch_text, x_test_torch_audio, y_test)

In [131]:
# DataLoader
train_loader = torch.utils.data.DataLoader(x_train_torch, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(x_test_torch, batch_size=batch_size)

In [132]:
# Initialize model
feedforwardModel_multi = FeedForwardNeuralNetwork_2(
    audio_size=audio_size,
    text_size=text_size,
    input_size = input_size,
    num_classes = num_classes).to(device)

In [133]:
# Specify criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(feedforwardModel_multi.parameters(), lr=learning_rate)

In [134]:
# Function to get accuracy scores
def accuracy(y_pred, y_test):
    pred = torch.argmax(y_pred, dim=1)
    return (pred == y_test).float().mean()

In [135]:
### Training Feedforward NN

In [136]:
# Start Training
def train_FF_Multi(model, num_epochs, train_loader, test_loader, modelName):
    test_loss_min = np.Inf
    for epoch in range(num_epochs):

        train_loss = 0.0
        test_loss = 0.0

        # Prepare the model for training
        model.train()

        for text_data, audio_data, target in train_loader:
            text_data = text_data.to(device=device)
            audio_data = audio_data.to(device=device)
            target = target.type(torch.LongTensor).to(device=device)

            text_data = text_data.reshape(text_data.shape[0], -1)
            audio_data = audio_data.reshape(audio_data.shape[0], -1)

            # forward pass: compute predicted outputs by passing
            # inputs to the model
            scores = model(text_data, audio_data)

            # calculate the loss
            loss = criterion(scores, target)

            # clear the gradients of all optimized variables
            optimizer.zero_grad()

            # backward pass: compute gradient of the loss
            # with respect to model parameters
            loss.backward()

            # perform a single optimization step (parameter update)
            optimizer.step()

            # train_loss += loss.item()*data.size(0)
            train_loss += loss.item() * 16 # Batch size

        # Prepare model for evaluation
        model.eval()

        with torch.no_grad():
            for text_data, audio_data, target in test_loader:
                text_data = text_data.to(device=device)
                audio_data = audio_data.to(device=device)
                target = target.type(torch.LongTensor).to(device=device)

                scores = model(text_data, audio_data)
                loss = criterion(scores, target)

                # test_loss += loss.item()*data.size(0)
                test_loss += loss.item() * 16 # Batch size

        train_loss = train_loss/len(train_loader.dataset)
        test_loss = test_loss/len(test_loader.dataset)

        print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
            epoch+1,
            train_loss,
            test_loss
            ))

        # save model if validation loss has decreased
        if test_loss <= test_loss_min:
            print('Test loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            test_loss_min,
            test_loss))
            torch.save(model.state_dict(), modelName + '.pt')
            test_loss_min = test_loss

In [137]:
train_FF_Multi(feedforwardModel_multi, num_epochs=num_epochs, train_loader=train_loader, test_loader=test_loader, modelName='feedforwardModel__Multi')

Epoch: 1 	Training Loss: 3.572009 	Test Loss: 3.276504
Test loss decreased (inf --> 3.276504).  Saving model ...
Epoch: 2 	Training Loss: 3.223663 	Test Loss: 3.276261
Test loss decreased (3.276504 --> 3.276261).  Saving model ...
Epoch: 3 	Training Loss: 3.223590 	Test Loss: 3.276239
Test loss decreased (3.276261 --> 3.276239).  Saving model ...
Epoch: 4 	Training Loss: 3.223577 	Test Loss: 3.276232
Test loss decreased (3.276239 --> 3.276232).  Saving model ...
Epoch: 5 	Training Loss: 3.223572 	Test Loss: 3.276228
Test loss decreased (3.276232 --> 3.276228).  Saving model ...
Epoch: 6 	Training Loss: 3.223568 	Test Loss: 3.276226
Test loss decreased (3.276228 --> 3.276226).  Saving model ...
Epoch: 7 	Training Loss: 3.223567 	Test Loss: 3.276224
Test loss decreased (3.276226 --> 3.276224).  Saving model ...
Epoch: 8 	Training Loss: 3.223565 	Test Loss: 3.276223
Test loss decreased (3.276224 --> 3.276223).  Saving model ...
Epoch: 9 	Training Loss: 3.223564 	Test Loss: 3.276222
Test l

In [138]:
# Load model with best test loss
feedforwardModel_multi.load_state_dict(torch.load('feedforwardModel__Multi.pt'))

<All keys matched successfully>

In [139]:
# Reference:
# https://www.kaggle.com/code/tauseef6462/simple-feedforward-neural-network-using-pytorch
def prepare_for_accuracy(model,
                         test_dataset_tensor_text,
                         test_dataset_tensor_audio,
                         test_target_tensor):
    test_dataset_tensor_text = test_dataset_tensor_text.to(device=device)
    test_dataset_tensor_audio = test_dataset_tensor_audio.to(device=device)
    Y_pred_test = model.predict(test_dataset_tensor_text, test_dataset_tensor_audio)

    Y_pred_test = Y_pred_test.to(device=device)
    test_target_tensor = test_target_tensor.to(device=device)
    accuracy_test = accuracy(Y_pred_test, test_target_tensor)
    f1_score_val = f1_score_NN(Y_pred_test, test_target_tensor)
    print("Test accuracy of Network",(accuracy_test))
    print("F1 score of Network",(f1_score_val))

In [140]:
# x_test_torch = torch.Tensor(list(multimodal_data_test['features']))
# y_test = torch.LongTensor(list(multimodal_data_test['da']))

In [141]:
print('Accuracy for FeedForward Network :: ')
prepare_for_accuracy(feedforwardModel_multi,
                     test_dataset_tensor_audio=x_test_torch_audio,
                     test_dataset_tensor_text=x_test_torch_text,
                     test_target_tensor=y_test)

Accuracy for FeedForward Network :: 
Test accuracy of Network tensor(0.4835, device='cuda:0')
F1 score of Network 0.48348348348348347


In [142]:
## With audio features such as pitch

In [143]:
# Hyperparameters
# num_classes = 41
num_classes = 40
learning_rate = 0.08
batch_size = 16
num_epochs = 100
num_workers = 1
input_size = 506
audio_size = 6
text_size = 768

In [144]:
class FeedForwardNeuralNetwork_2(nn.Module):
    def __init__(self, audio_size, text_size, input_size, num_classes):
        super().__init__()
        self.input_size = input_size
        self.feedforwardNN_text = nn.Sequential(
            nn.Linear(text_size, 500)
        )
        self.feedforwardNN_audio = nn.Sequential(
            nn.Linear(audio_size, 6)
        )

        self.feedforwardNN = nn.Sequential(
        nn.Linear(input_size, 500),
        nn.ReLU(),
        nn.Linear(500, 200),
        nn.ReLU(),
        nn.Linear(200, 100),
        nn.ReLU(),
        nn.Linear(100, num_classes),
        nn.Softmax()
        )

    def forward(self, text_embds, audio_embds):
        text_weights = self.feedforwardNN_text(text_embds)
        audio_weights = self.feedforwardNN_audio(audio_embds)
        merged = torch.concat((text_weights, audio_weights), dim=1)
        return self.feedforwardNN(merged)

    def predict(self, text_embds, audio_embds):
        text_weights = self.feedforwardNN_text(text_embds)
        audio_weights = self.feedforwardNN_audio(audio_embds)
        merged = torch.concat((text_weights, audio_weights) ,dim=1)
        Y_pred = self.feedforwardNN(merged)
        return Y_pred

In [145]:
train_audio_embeddings = list()
train_text_embeddings = list()
for index, row in cleaned_train_data__with_embeddings.iterrows():
    train_text_embeddings.append(text_aligned_train_features[index])
    train_audio_embeddings.append(cleaned_train_data__with_embeddings.iloc[index][8:14].to_numpy())

In [146]:
test_audio_embeddings = list()
test_text_embeddings = list()
for index, row in cleaned_test_data__with_embeddings.iterrows():
    if row['Label'] == '^g':
        continue
    test_text_embeddings.append(text_aligned_test_features[index])
    test_audio_embeddings.append(row[8: 14].to_numpy())

In [147]:
# x_train_torch_text = torch.Tensor(list(text_aligned_train_features))
# x_test_torch_text = torch.Tensor(list(text_aligned_test_features))

x_train_torch_text = torch.Tensor(train_text_embeddings)
x_test_torch_text = torch.Tensor(test_text_embeddings)

x_train_torch_audio = torch.Tensor(train_audio_embeddings)
x_test_torch_audio = torch.Tensor(test_audio_embeddings)

In [148]:
y_train = torch.LongTensor(list(cleaned_train_data__with_embeddings['da']))
y_test = torch.LongTensor(list(cleaned_test_data__with_embeddings['da']))

In [149]:
zipper = lambda x,y,z : list(zip(x,y,z))

In [150]:
x_train_torch = zipper(x_train_torch_text, x_train_torch_audio, y_train)
x_test_torch = zipper(x_test_torch_text, x_test_torch_audio, y_test)

In [151]:
# DataLoader
train_loader = torch.utils.data.DataLoader(x_train_torch, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(x_test_torch, batch_size=batch_size)

In [152]:
# Initialize model
feedforwardModel_multi = FeedForwardNeuralNetwork_2(
    audio_size=audio_size,
    text_size=text_size,
    input_size = input_size,
    num_classes = num_classes).to(device)

In [153]:
# Specify criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(feedforwardModel_multi.parameters(), lr=learning_rate)

In [154]:
# Function to get accuracy scores
def accuracy(y_pred, y_test):
    pred = torch.argmax(y_pred, dim=1)
    return (pred == y_test).float().mean()

In [155]:
train_FF_Multi(feedforwardModel_multi, num_epochs=num_epochs, train_loader=train_loader, test_loader=test_loader, modelName='feedforwardModel__Multi_pitch')

Epoch: 1 	Training Loss: 3.505320 	Test Loss: 3.276381
Test loss decreased (inf --> 3.276381).  Saving model ...
Epoch: 2 	Training Loss: 3.223637 	Test Loss: 3.276253
Test loss decreased (3.276381 --> 3.276253).  Saving model ...
Epoch: 3 	Training Loss: 3.223588 	Test Loss: 3.276236
Test loss decreased (3.276253 --> 3.276236).  Saving model ...
Epoch: 4 	Training Loss: 3.223576 	Test Loss: 3.276230
Test loss decreased (3.276236 --> 3.276230).  Saving model ...
Epoch: 5 	Training Loss: 3.223571 	Test Loss: 3.276226
Test loss decreased (3.276230 --> 3.276226).  Saving model ...
Epoch: 6 	Training Loss: 3.223568 	Test Loss: 3.276224
Test loss decreased (3.276226 --> 3.276224).  Saving model ...
Epoch: 7 	Training Loss: 3.223566 	Test Loss: 3.276223
Test loss decreased (3.276224 --> 3.276223).  Saving model ...
Epoch: 8 	Training Loss: 3.223565 	Test Loss: 3.276222
Test loss decreased (3.276223 --> 3.276222).  Saving model ...
Epoch: 9 	Training Loss: 3.223564 	Test Loss: 3.276221
Test l

In [156]:
# Load model with best test loss
feedforwardModel_multi.load_state_dict(torch.load('feedforwardModel__Multi_pitch.pt'))

<All keys matched successfully>

In [157]:
print('Accuracy for FeedForward Network :: ')
prepare_for_accuracy(feedforwardModel_multi,
                     test_dataset_tensor_audio=x_test_torch_audio,
                     test_dataset_tensor_text=x_test_torch_text,
                     test_target_tensor=y_test)

Accuracy for FeedForward Network :: 
Test accuracy of Network tensor(0.4835, device='cuda:0')
F1 score of Network 0.48348348348348347


In [158]:
### Late Fusion

In [159]:
text_probs = np.load('test_audio_probabilities_text.npy')
text_probs

array([[ 5.329866  ,  0.2828937 , -1.5951992 , ...,  4.385116  ,
        -1.2062347 , -0.7415899 ],
       [ 3.1725254 ,  0.7356104 , -1.9743685 , ...,  3.4874892 ,
        -1.576004  , -0.4736683 ],
       [-0.52005005,  1.0725409 , -1.5971717 , ...,  5.938665  ,
        -1.2389355 , -1.245855  ],
       ...,
       [-0.11853296,  1.2770503 , -1.637255  , ...,  4.3278975 ,
        -1.4869914 , -0.7658975 ],
       [ 4.426511  ,  2.9620278 , -1.4006051 , ...,  6.0137463 ,
        -1.6051922 , -0.77653944],
       [ 3.9398794 , -1.5151525 , -2.057766  , ...,  5.8977346 ,
        -1.2377753 , -0.78777707]], dtype=float32)

In [160]:
text_probs = np.delete(text_probs, 2, axis=1)
text_probs

array([[ 5.329866  ,  0.2828937 , -0.43800756, ...,  4.385116  ,
        -1.2062347 , -0.7415899 ],
       [ 3.1725254 ,  0.7356104 ,  0.37866458, ...,  3.4874892 ,
        -1.576004  , -0.4736683 ],
       [-0.52005005,  1.0725409 ,  0.813379  , ...,  5.938665  ,
        -1.2389355 , -1.245855  ],
       ...,
       [-0.11853296,  1.2770503 , -0.11502191, ...,  4.3278975 ,
        -1.4869914 , -0.7658975 ],
       [ 4.426511  ,  2.9620278 ,  0.2917903 , ...,  6.0137463 ,
        -1.6051922 , -0.77653944],
       [ 3.9398794 , -1.5151525 ,  1.1465577 , ...,  5.8977346 ,
        -1.2377753 , -0.78777707]], dtype=float32)

In [161]:
audio_probs = pd.read_csv('test_audio_probabilities_audio.csv', header=None).to_numpy()
audio_probs

array([[0.06666667, 0.        , 0.        , ..., 0.04444444, 0.        ,
        0.        ],
       [0.08888889, 0.02222222, 0.        , ..., 0.11111111, 0.        ,
        0.        ],
       [0.06666667, 0.02222222, 0.        , ..., 0.11111111, 0.        ,
        0.        ],
       ...,
       [0.08888889, 0.        , 0.        , ..., 0.11111111, 0.        ,
        0.        ],
       [0.02222222, 0.        , 0.        , ..., 0.15555556, 0.        ,
        0.        ],
       [0.02222222, 0.04444444, 0.        , ..., 0.08888889, 0.        ,
        0.        ]])

In [162]:
late_fusion = text_probs + audio_probs
late_fusion

array([[ 5.3965326 ,  0.28289369, -0.43800756, ...,  4.42956054,
        -1.20623469, -0.7415899 ],
       [ 3.26141429,  0.75783265,  0.37866458, ...,  3.59860033,
        -1.57600403, -0.47366831],
       [-0.45338338,  1.0947631 ,  0.81337899, ...,  6.04977602,
        -1.23893547, -1.24585497],
       ...,
       [-0.02964407,  1.27705026, -0.11502191, ...,  4.43900866,
        -1.48699141, -0.76589751],
       [ 4.44873303,  2.96202779,  0.29179031, ...,  6.16930182,
        -1.60519218, -0.77653944],
       [ 3.96210164, -1.47070801,  1.14655769, ...,  5.98662353,
        -1.23777533, -0.78777707]])

In [163]:
test_preds = np.argmax(late_fusion, axis=1)
test_preds

array([36, 36, 36, 36, 36, 11, 36, 36, 36, 36, 36, 22, 37, 36, 36, 36, 36,
       36, 37, 34, 36, 36,  4, 36, 36, 36,  0, 11,  0, 36, 36, 11, 36, 36,
       37, 37, 37, 36, 36, 36, 36,  0, 36, 36, 36, 36, 34, 36, 36, 36, 36,
       36, 36, 36, 36, 11, 36, 37, 36, 36, 36, 36, 32, 36, 36, 36, 36, 36,
       36,  0, 36, 32, 36, 36, 36,  0, 36, 30,  4, 36, 36, 36, 36, 36, 36,
       36, 11,  0,  0, 37, 36, 36, 36, 15, 36, 37, 36, 36, 36, 37, 30, 36,
       34, 15, 36, 36, 36, 36, 36, 36, 37, 36, 36, 34, 36, 34,  0,  0, 36,
       14, 34, 36, 36, 36,  9, 36, 36, 36, 36, 32, 36, 36, 34, 36, 32,  9,
       34, 36, 34, 36, 36, 37, 36,  4, 36, 34, 36, 32, 37,  0, 37,  0, 36,
       37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37,  0,  9, 36, 36, 34,
       36, 36, 36, 36, 36, 37, 32, 36, 36, 36, 36, 36,  0, 36, 36, 34, 36,
       36, 37, 36, 36,  0, 36, 37, 36, 36, 37, 36, 36, 37, 27, 36, 36, 36,
       36, 34, 36, 36, 36, 36, 37, 36, 36, 36, 36, 36, 36, 36, 36, 36, 18,
       18, 36, 36, 36,  0

In [164]:
real_test_labels = cleaned_test_data__with_embeddings['da']
real_test_labels

0      36
1      19
2      36
3      36
4      37
       ..
329    37
330    36
331    36
332    36
333     0
Name: da, Length: 333, dtype: int32

In [165]:
accuracy_score(real_test_labels, test_preds)

0.6636636636636637

In [166]:
f1_score(real_test_labels, test_preds, average='micro')

0.6636636636636637

In [167]:
report = classification_report(real_test_labels, test_preds, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.5,0.473684,0.486486,19.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,5.0
4,0.5,0.4,0.444444,5.0
6,0.0,0.0,0.0,4.0
9,0.666667,0.666667,0.666667,6.0
10,0.0,0.0,0.0,1.0
11,0.75,0.666667,0.705882,9.0
13,0.0,0.0,0.0,4.0
