In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, random_split, Subset
from torch import nn

import torchvision.transforms.functional as F
from torchaudio import load
from torchaudio.transforms import MelSpectrogram, Spectrogram

from CNNs import CNN, CNN_2channel, CNN_horizontal
#from data_dir.directivityDataset import DirectivityDataset
from Directivity.directivityDataset import DirectivityDataset

In [2]:
# Alternative data directory / annotation file, kept for reference:
#DATA_DIR = os.getcwd()+'/data_dir/'
#DIRECTIVITY_ANNOTATION = os.path.join(DATA_DIR, "Directivity_Labels_only0_2dirBinary.csv")

# Active dataset: everything lives under ./Directivity/ relative to the
# notebook's working directory (trailing slash kept on purpose).
DATA_DIR = f"{os.getcwd()}/Directivity/"
DIRECTIVITY_ANNOTATION = os.path.join(DATA_DIR, "MatrixLabelsDirectivityCategoriesBalanced.csv")

# Load the annotation table and preview its first five rows.
df = pd.read_csv(DIRECTIVITY_ANNOTATION)
df.head()

Unnamed: 0.1,Unnamed: 0,audio_filename,class,event x (m),event y (m),event z (m),event orientation (x),event orientation (y),event orientation (z),listener (x),...,distance (m),theta,l_0,l_1,l_2,l_3,l_4,l_5,l_6,l_7
0,0,directivityEval00001.wav,Speech,7.8,1.8,-5.0,-1,0,0,5,...,2.8,0,0,0,0,0,0,0,0,1
1,1,directivityEval00002.wav,Speech,5.0,1.8,-7.7,0,0,-1,5,...,2.7,-90,0,1,0,0,0,0,0,0
2,2,directivityEval00003.wav,Speech,7.3,1.8,-5.0,1,0,0,5,...,2.3,0,0,0,0,0,0,0,1,0
3,3,directivityEval00004.wav,Speech,8.1,1.8,-5.0,-1,0,0,5,...,3.1,0,0,0,0,0,0,0,0,1
4,4,directivityEval00005.wav,Speech,2.9,1.8,-5.0,1,0,0,5,...,2.1,180,0,0,1,0,0,0,0,0


In [7]:
# Audio front-end configuration shared by the dataset and the model input.
TARGET_SAMPLE_RATE = 44100
NUM_SAMPLES = 200000  # passed to DirectivityDataset; presumably the clip length in samples — TODO confirm
# Use the GPU when one is visible, otherwise fall back to CPU so the
# notebook still runs end-to-end on a machine without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Mel-spectrogram transform handed to the dataset; moved to DEVICE so it
# lives on the same device the rest of the pipeline uses.
mel_spec = MelSpectrogram(
    sample_rate=TARGET_SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
).to(DEVICE)

In [8]:
# Named `directivity_dataset` rather than `dir` so the builtin dir() is not shadowed.
directivity_dataset = DirectivityDataset(
    DIRECTIVITY_ANNOTATION, DATA_DIR, mel_spec, TARGET_SAMPLE_RATE, NUM_SAMPLES, DEVICE
)

# First 80% of the rows is the training slice; validation is everything after it.
# Taking the remainder (instead of int(0.2 * n)) guarantees no trailing samples
# are silently dropped when the dataset size is not a multiple of 5.
# NOTE(review): assumes the training run used the same sequential 80/20 split — confirm.
train_len = int(0.8 * len(directivity_dataset))
val_len = len(directivity_dataset) - train_len

val_dataset = Subset(directivity_dataset, range(train_len, train_len + val_len))

# One sample per batch, in dataset order (no shuffling) for evaluation.
loader = DataLoader(val_dataset, batch_size=1)

In [9]:
# Pull a single batch from the validation loader to discover the input size.
batch_iter = iter(loader)
data, label = next(batch_iter)

# Shape of one channel of the first sample; used below to size the CNN.
input_dims = data[0, 0].shape
input_dims

torch.Size([64, 391])

In [10]:
# Build the two-channel CNN sized from the probed spectrogram dimensions.
cnn_model = CNN_2channel(W=input_dims[0], H=input_dims[1]).to(DEVICE)

# The checkpoint is a plain state dict, so load it with weights_only=True:
# this restricts unpickling to tensors/primitive containers instead of
# executing arbitrary pickled code. map_location lets a checkpoint saved on
# GPU load on whatever DEVICE this session is using.
state_dict = torch.load('./Weights/cnn-melmodel.pth', map_location=DEVICE, weights_only=True)

# Last expression on purpose: displays the key-matching report.
cnn_model.load_state_dict(state_dict)

<All keys matched successfully>

In [11]:
# Put the model in evaluation mode before running predictions. eval() returns
# the module itself, so this cell also displays the network architecture.
cnn_model.eval()

CNN_2channel(
  (conv1): Sequential(
    (0): Conv2d(2, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (dense1): Sequential(
    (0): Linear(in_features=22528, out_features=64, bias=True)
    (1): ReLU()
  )
  (dense2): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
 

In [None]:
# Sanity check on the single batch fetched earlier.
# no_grad: inference only, so skip building the autograd graph.
with torch.no_grad():
    pred = cnn_model(data)

# argmax over the class dimension gives the predicted class index
# (avoids assigning to `_`, which IPython uses for the last cell output).
predicted = pred.argmax(dim=1)

# Show prediction next to the target as class indices; the target is decoded
# with argmax over dim 1, the same way the evaluation loop decodes it.
predicted, label.argmax(dim=1)

In [None]:
# Run the model over the whole validation loader and collect predicted and
# true class indices as flat 1-D arrays (one entry per sample).
batch_preds = []
batch_labels = []

# no_grad: evaluation only, so no autograd graph is built per batch.
with torch.no_grad():
    for input_data, target in loader:
        logits = cnn_model(input_data)
        # Predicted class = index of the largest output along the class dim.
        batch_preds.append(logits.argmax(dim=1).cpu().numpy())
        # Target is decoded the same way (argmax over dim 1); a separate local
        # list is used so the `label` tensor from the earlier cell is not overwritten.
        batch_labels.append(target.argmax(dim=1).cpu().numpy())

# Concatenate per-batch results so the arrays are valid for any batch size,
# not only batch_size=1.
preds = np.concatenate(batch_preds)
labels = np.concatenate(batch_labels)


In [19]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 on the validation predictions.
# print() is used so the report's newlines render as a table.
report = classification_report(y_true=labels, y_pred=preds)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        41
           1       0.95      1.00      0.97        35
           2       1.00      1.00      1.00        42
           3       1.00      1.00      1.00        42

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



In [18]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_true=labels, y_pred=preds)

array([[39,  2,  0,  0],
       [ 0, 35,  0,  0],
       [ 0,  0, 42,  0],
       [ 0,  0,  0, 42]])