In [2]:

import pandas as pd
import numpy as np


def load_label(file_path):
    """Read the label table stored as CSV at ``file_path``.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the CSV file.

    Returns:
        pandas.DataFrame: The parsed CSV contents, exactly as ``read_csv`` yields them.
    """
    return pd.read_csv(file_path)

def load_data(file_path):
    """Load the ``audio_id`` and ``features`` arrays from a NumPy ``.npz`` archive.

    Args:
        file_path: Path of the ``.npz`` archive to open.

    Returns:
        tuple: ``(list_audio_ids, list_features)`` - the arrays stored under the
        keys ``'audio_id'`` and ``'features'``.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
    # execute arbitrary code - only point this at archives you trust.
    with np.load(file_path, allow_pickle=True) as archive:
        # Both arrays are materialised while the archive is still open.
        audio_ids, features = (archive[key] for key in ('audio_id', 'features'))

    return audio_ids, features

In [None]:

from torch.utils.data import random_split

import torch.nn.functional as F
from torch.nn import init
import torch
import torch.nn as nn


# Locations of the label table (CSV) and the feature archive (.npz).
labels_path = './data/label_59.csv'
data_path = './data/vggish_final.npz'

# Fixed seed for the train/validation split. Without it every kernel restart
# draws a different validation set, so a checkpoint reloaded later could be
# evaluated on samples it was trained on.
SPLIT_SEED = 42

print('start preparing data ...')
# load data and labels
list_audio_ids, list_features = load_data(data_path)
print(list_audio_ids.shape)
# zip() below would silently drop trailing items if the two arrays disagreed.
assert len(list_audio_ids) == len(list_features), "audio ids and features differ in length"

dic_label = load_label(labels_path)
dic_label = dic_label.set_index('audio_Id')
# One label row per audio id, in the same order as the feature array.
labels = [dic_label.loc[int(audio_id)].values for audio_id in list_audio_ids]

samples = list(zip(list_features, labels))
print(f"There are {len(samples)} samples in the dataset.")
num_items = len(samples)


# Random (but reproducible) split of 80:20 between training and validation
num_train = round(num_items * 0.8)
num_val = num_items - num_train
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, val_ds = random_split(samples, [num_train, num_val], generator=split_generator)


train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [22]:
import torch.nn.functional as F
from torch.nn import init
import torch
import torch.nn as nn


class AudioClassifier (nn.Module):
    """Four-stage 2-D CNN followed by global average pooling and a linear head.

    Every stage is ``Conv2d (stride 2) -> ReLU -> BatchNorm2d``; the channel
    count goes 1 -> 16 -> 32 -> 64 -> 128. The head maps the 128 pooled
    channels to 59 outputs. ``forward`` returns the raw linear outputs (no
    sigmoid/softmax is applied here).
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, square kernel size, padding) per stage.
        stage_specs = (
            (1, 16, 5, 2),
            (16, 32, 3, 1),
            (32, 64, 3, 1),
            (64, 128, 3, 1),
        )

        stages = []
        for idx, (c_in, c_out, k, pad) in enumerate(stage_specs, start=1):
            conv = nn.Conv2d(c_in, c_out, kernel_size=(k, k), stride=(2, 2), padding=(pad, pad))
            act = nn.ReLU()
            norm = nn.BatchNorm2d(c_out)

            # Register under conv<i>/relu<i>/bn<i> so the module names (and
            # therefore the state_dict keys) are conv1, relu1, bn1, ...
            setattr(self, f"conv{idx}", conv)
            setattr(self, f"relu{idx}", act)
            setattr(self, f"bn{idx}", norm)

            # Kaiming-normal weights (a=0.1 is the negative-slope argument), zero bias.
            init.kaiming_normal_(conv.weight, a=0.1)
            conv.bias.data.zero_()

            stages.extend((conv, act, norm))

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=128, out_features=59)

        # The same stage modules, chained for use in forward().
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Return the 59 raw outputs per sample for a batch ``x`` with one input channel."""
        feature_maps = self.conv(x)

        # Global average pool, then drop the trailing 1x1 spatial dims.
        pooled = self.ap(feature_maps)
        flat = pooled.view(pooled.shape[0], -1)

        return self.lin(flat)

# Build the network and place it on the CPU; the trailing expression echoes
# the device of the first parameter as the cell output.
device = torch.device("cpu")
audioModel = AudioClassifier().to(device)
next(audioModel.parameters()).device

device(type='cpu')

In [None]:
##### Training
def training(model, train_dl, num_epochs, threshold=0.5, input_shape=(128, 1407)):
    """Train ``model`` on ``train_dl`` and print loss / accuracy once per epoch.

    Uses ``MultiLabelSoftMarginLoss``, Adam (lr=0.01) and a linear one-cycle
    learning-rate schedule that is stepped after every batch.

    Args:
        model: ``nn.Module`` to optimise; batches are moved to the device its
            parameters live on.
        train_dl: Iterable of ``(features, labels)`` batches with a ``len()``.
        num_epochs: Number of passes over ``train_dl``.
        threshold: Sigmoid probability above which a label counts as predicted
            (only affects the reported accuracy, not the loss).
        input_shape: ``(height, width)`` each sample is reshaped to; the batch
            becomes ``(batch, -1, height, width)`` before it is fed to the model.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    # Make sure BatchNorm/Dropout layers are in training mode even if the
    # model was previously switched to eval().
    model.train()
    height, width = input_shape

    # Loss Function, Optimizer and Scheduler
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01,
                                                    steps_per_epoch=len(train_dl),
                                                    epochs=num_epochs,
                                                    anneal_strategy='linear')

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        for inputs, labels in train_dl:
            # Cast to float and restore the (batch, channel, height, width) layout.
            inputs = inputs.to(device=device, dtype=torch.float)
            inputs = inputs.reshape(inputs.shape[0], -1, height, width)
            labels = labels.to(device=device, dtype=torch.float)

            optimizer.zero_grad()

            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()  # OneCycleLR is stepped once per batch

            running_loss += loss.item()

            with torch.no_grad():
                # Map outputs to the range 0-1 and binarise them. Predictions
                # and labels stay on the same device for the comparison.
                prediction = torch.sigmoid(outputs) > threshold
                correct_prediction += (prediction == labels).sum().item()
                # Count every (sample, label) decision, matching the
                # element-wise numerator, so the accuracy stays within [0, 1].
                total_prediction += prediction.numel()

        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        acc = correct_prediction / total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.3f}, Accuracy: {acc:.3f}')

    print('Finished Training')
  
# Number of full passes over the training loader.
num_epochs = 10

training(audioModel, train_dl, num_epochs=num_epochs)

In [26]:
# Persist only the weights (state_dict), not the whole module object.
torch.save(obj=audioModel.state_dict(), f="./code/models/cnn/cnn_h/audioModel_aug_59.pth")

In [None]:
##### Inference

from sklearn.metrics import f1_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_auc_score


def inference(model, val_dl, threshold=0.5, input_shape=(128, 1407)):
    """Evaluate ``model`` on ``val_dl`` and print multi-label metrics.

    Prints macro/micro F1 (on thresholded predictions), micro/macro average
    precision and ROC-AUC (on sigmoid probabilities), and element-wise accuracy.

    Args:
        model: ``nn.Module`` to evaluate; batches are moved to the device its
            parameters live on.
        val_dl: Iterable of ``(features, labels)`` batches.
        threshold: Sigmoid probability above which a label counts as predicted.
        input_shape: ``(height, width)`` each sample is reshaped to; the batch
            becomes ``(batch, -1, height, width)`` before it is fed to the model.

    Returns:
        None. Metrics are reported via ``print``.
    """
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    # Evaluation must use BatchNorm running statistics, not batch statistics.
    model.eval()
    height, width = input_shape

    label_batches = []
    prob_batches = []

    with torch.no_grad():
        for inputs, labels in val_dl:
            # Cast to float and restore the (batch, channel, height, width) layout.
            inputs = inputs.to(device=device, dtype=torch.float)
            inputs = inputs.reshape(inputs.shape[0], -1, height, width)

            probs = torch.sigmoid(model(inputs)).cpu()

            prob_batches.append(probs)
            label_batches.append(labels.to(torch.float).cpu())

    # Y: ground truth, P: predicted probabilities, T_Y: thresholded predictions.
    Y = torch.cat(label_batches).numpy().astype(int)
    P = torch.cat(prob_batches).numpy()
    T_Y = (P > threshold).astype(int)

    f1_macro = f1_score(Y, T_Y, average='macro', zero_division='warn')
    print("cnn f1 macro: ", f1_macro)

    f1_micro = f1_score(Y, T_Y, average='micro', zero_division='warn')
    print("cnn f1 micro: ", f1_micro)

    # Average precision and ROC-AUC are ranking metrics: they need the
    # continuous scores, not the already-thresholded 0/1 decisions.
    average_precision_score_micro = average_precision_score(Y, P, average="micro")
    print("cnn average_precision_score_micro: ", average_precision_score_micro)

    roc_auc_score_micro = roc_auc_score(Y, P, average="micro")
    print("cnn roc_auc_score_micro: ", roc_auc_score_micro)

    average_precision_score_macro = average_precision_score(Y, P, average="macro")
    print("cnn average_precision_score_macro: ", average_precision_score_macro)

    # NOTE(review): macro ROC-AUC raises if a label column of Y holds a single
    # class only - confirm every label occurs in the validation split.
    roc_auc_score_macro = roc_auc_score(Y, P, average="macro")
    print("cnn roc_auc_score_macro: ", roc_auc_score_macro)

    # Element-wise accuracy over every (sample, label) decision, so the value
    # stays within [0, 1]; "Total items" remains the number of samples.
    total_prediction = Y.shape[0]
    acc = float((T_Y == Y).mean())
    print(f'Accuracy: {acc:.3f}, Total items: {total_prediction}')


model = AudioClassifier()
# Load the checkpoint this notebook writes in its save cell
# (audioModel_aug_59.pth). The previous file name, audioModel_aug_5.pth, is
# never produced here. map_location keeps the load on CPU, where this freshly
# built model lives.
model.load_state_dict(
    torch.load("./code/models/cnn/cnn_h/audioModel_aug_59.pth", map_location="cpu")
)
model.eval()
inference(model, val_dl)