In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F

import torchvision.models



In [2]:
def load_weights(weight_file):
    """Load a pickled dictionary of layer parameters from a NumPy file.

    Args:
        weight_file: Path of a file written with ``np.save`` that holds a
            single pickled Python object, or ``None``.

    Returns:
        The object stored in the file (obtained via ``.item()``), or ``None``
        when ``weight_file`` is ``None``.

    Note:
        The payload is a pickled Python object, so ``allow_pickle=True`` is
        required (recent NumPy versions refuse to unpickle by default).
        Unpickling can execute arbitrary code: only load weight files that
        come from a trusted source.
    """
    if weight_file is None:
        return None

    try:
        weights_dict = np.load(weight_file, allow_pickle=True).item()
    except Exception:
        # Best-effort fallback kept from the original code: retry with
        # encoding='bytes'. A failure here propagates to the caller.
        weights_dict = np.load(weight_file, allow_pickle=True, encoding='bytes').item()

    return weights_dict

In [3]:
class KitModel(nn.Module):
    """Stack of 3x3 convolutions with batch normalisation, ReLU and max pooling.

    Layer names, creation order and hyper-parameters are kept exactly as in
    the original definition so that existing ``state_dict`` checkpoints still
    load. ``batch_normalization_1`` is created (and therefore present in the
    ``state_dict``) but is not applied in ``forward``.

    NOTE(review): every batch-norm layer is built with ``momentum=0.99``.
    PyTorch's ``momentum`` weights the *new* batch statistic, which looks like
    the inverse of the convention the value was probably exported with. This
    only affects running statistics in training mode. Left unchanged; confirm
    before fine-tuning.
    """

    def __init__(self, weight_file):
        """Build the layers and optionally initialise them from ``weight_file``.

        Args:
            weight_file: Path handed to ``load_weights``. When it is ``None``
                (``load_weights`` then returns ``None``) the layers keep
                PyTorch's default initialisation, e.g. because a full
                ``state_dict`` is loaded afterwards.
        """
        super(KitModel, self).__init__()
        self.__weights_dict = load_weights(weight_file)

        self.batch_normalization_1 = self.__batch_normalization(2, 'batch_normalization_1', num_features=1, eps=0.001, momentum=0.99)
        self.conv2d_1 = self.__conv(2, name='conv2d_1', in_channels=1, out_channels=64, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_2 = self.__batch_normalization(2, 'batch_normalization_2', num_features=64, eps=0.001, momentum=0.99)
        self.conv2d_2 = self.__conv(2, name='conv2d_2', in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_3 = self.__batch_normalization(2, 'batch_normalization_3', num_features=64, eps=0.001, momentum=0.99)
        self.conv2d_3 = self.__conv(2, name='conv2d_3', in_channels=64, out_channels=128, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_4 = self.__batch_normalization(2, 'batch_normalization_4', num_features=128, eps=0.001, momentum=0.99)
        self.conv2d_4 = self.__conv(2, name='conv2d_4', in_channels=128, out_channels=128, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_5 = self.__batch_normalization(2, 'batch_normalization_5', num_features=128, eps=0.001, momentum=0.99)
        self.conv2d_5 = self.__conv(2, name='conv2d_5', in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_6 = self.__batch_normalization(2, 'batch_normalization_6', num_features=256, eps=0.001, momentum=0.99)
        self.conv2d_6 = self.__conv(2, name='conv2d_6', in_channels=256, out_channels=256, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_7 = self.__batch_normalization(2, 'batch_normalization_7', num_features=256, eps=0.001, momentum=0.99)
        self.conv2d_7 = self.__conv(2, name='conv2d_7', in_channels=256, out_channels=512, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)
        self.batch_normalization_8 = self.__batch_normalization(2, 'batch_normalization_8', num_features=512, eps=0.001, momentum=0.99)
        self.audio_embedding_layer = self.__conv(2, name='audio_embedding_layer', in_channels=512, out_channels=512, kernel_size=(3, 3), stride=(1, 1), groups=1, bias=True)

    def forward(self, x):
        """Run the convolutional stack.

        Args:
            x: 4-D tensor whose channel dimension (dim 1) has size 1, as
                required by ``conv2d_1``.

        Returns:
            Tensor with 512 channels. The spatial size is reduced by three
            2x2 max-pools followed by a final (4, 8) max-pool.
        """
        x = self.__conv_bn_relu(x, self.conv2d_1, self.batch_normalization_2)
        x = self.__conv_bn_relu(x, self.conv2d_2, self.batch_normalization_3)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2), padding=0, ceil_mode=False)

        x = self.__conv_bn_relu(x, self.conv2d_3, self.batch_normalization_4)
        x = self.__conv_bn_relu(x, self.conv2d_4, self.batch_normalization_5)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2), padding=0, ceil_mode=False)

        x = self.__conv_bn_relu(x, self.conv2d_5, self.batch_normalization_6)
        x = self.__conv_bn_relu(x, self.conv2d_6, self.batch_normalization_7)
        x = F.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2), padding=0, ceil_mode=False)

        x = self.__conv_bn_relu(x, self.conv2d_7, self.batch_normalization_8)

        # The embedding convolution is followed directly by pooling: no batch
        # norm or ReLU is applied after it.
        x = self.audio_embedding_layer(F.pad(x, (1, 1, 1, 1)))
        return F.max_pool2d(x, kernel_size=(4, 8), stride=(4, 8), padding=0, ceil_mode=False)

    @staticmethod
    def __conv_bn_relu(x, conv, bn):
        """Zero-pad by one on each side of the last two dims, then conv -> BN -> ReLU."""
        return F.relu(bn(conv(F.pad(x, (1, 1, 1, 1)))))

    def __batch_normalization(self, dim, name, **kwargs):
        """Create a batch-norm layer and, if weights were loaded, initialise it.

        Args:
            dim: 0 or 1 selects ``BatchNorm1d``, 2 ``BatchNorm2d``, 3 ``BatchNorm3d``.
            name: Key of this layer in the loaded weight dictionary.
            **kwargs: Forwarded to the batch-norm constructor.

        Raises:
            NotImplementedError: If ``dim`` is not one of 0, 1, 2, 3.
        """
        if   dim == 0 or dim == 1:  layer = nn.BatchNorm1d(**kwargs)
        elif dim == 2:  layer = nn.BatchNorm2d(**kwargs)
        elif dim == 3:  layer = nn.BatchNorm3d(**kwargs)
        else:           raise NotImplementedError()

        # No weight file was given: keep the default initialisation.
        if self.__weights_dict is None:
            return layer

        params = self.__weights_dict[name]
        with torch.no_grad():
            if 'scale' in params:
                layer.weight.copy_(torch.from_numpy(params['scale']))
            else:
                layer.weight.fill_(1)

            if 'bias' in params:
                layer.bias.copy_(torch.from_numpy(params['bias']))
            else:
                layer.bias.fill_(0)

            layer.running_mean.copy_(torch.from_numpy(params['mean']))
            layer.running_var.copy_(torch.from_numpy(params['var']))
        return layer

    def __conv(self, dim, name, **kwargs):
        """Create a convolution layer and, if weights were loaded, initialise it.

        Args:
            dim: 1, 2 or 3 selects ``Conv1d``, ``Conv2d`` or ``Conv3d``.
            name: Key of this layer in the loaded weight dictionary.
            **kwargs: Forwarded to the convolution constructor.

        Raises:
            NotImplementedError: If ``dim`` is not one of 1, 2, 3.
        """
        if   dim == 1:  layer = nn.Conv1d(**kwargs)
        elif dim == 2:  layer = nn.Conv2d(**kwargs)
        elif dim == 3:  layer = nn.Conv3d(**kwargs)
        else:           raise NotImplementedError()

        # No weight file was given: keep the default initialisation.
        if self.__weights_dict is None:
            return layer

        params = self.__weights_dict[name]
        with torch.no_grad():
            layer.weight.copy_(torch.from_numpy(params['weights']))
            if 'bias' in params:
                layer.bias.copy_(torch.from_numpy(params['bias']))
        return layer

In [4]:
class Task5Model(nn.Module):
    """Classifier made of an input batch norm, a ``KitModel`` backbone and an MLP head."""

    def __init__(self, num_classes, openl3_weights='./openl3_no_mel_layer_pytorch_weights'):
        """Build the model.

        Args:
            num_classes: Number of outputs of the final linear layer.
            openl3_weights: Weight file handed to ``KitModel``. Defaults to
                the path that used to be hard-coded here, so existing calls
                behave as before.
        """
        super().__init__()

        # Normalises the single input channel before the backbone.
        self.bn = nn.BatchNorm2d(1)

        self.openl3 = KitModel(openl3_weights)

        # Head: 512 pooled backbone features -> 256 hidden units -> num_classes.
        self.final = nn.Sequential(
            nn.BatchNorm1d(512),
            nn.Linear(512, 256), nn.ReLU(), nn.BatchNorm1d(256),
            nn.Linear(256, num_classes))

    def forward(self, x):
        """Compute raw (pre-sigmoid) class scores.

        Args:
            x: 4-D tensor with a single channel in dim 1; the last two
                dimensions are swapped before the backbone is applied.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        x = x.permute(0, 1, 3, 2) # keras model had (92, 128, 199, 1) like shape
        x = self.bn(x)
        x = self.openl3(x)
        # Global max over the two trailing (spatial) dimensions.
        x = x.max(dim=-1)[0].max(dim=-1)[0]
        x = self.final(x)
        return x

In [5]:
# Base names of all evaluation clips. glob() returns paths in arbitrary,
# filesystem-dependent order, so sort them to make the row order of the
# predictions (and of the exported CSV) reproducible across machines.
eval_files = sorted(
    os.path.basename(path) for path in glob('../../data/audio-eval/*.wav'))

In [6]:
# Load one pre-computed array per evaluation clip, transpose it and keep its
# first 635 rows (presumably time frames of a log-mel spectrogram - confirm),
# then stack all clips along a new leading axis.
X = np.stack(
    [np.load('../../data/logmelspec-eval/{}.npy'.format(fname)).T[:635, :]
     for fname in eval_files],
    axis=0)
# Insert a singleton axis at position 1 (used as the channel axis by the model).
X = np.expand_dims(X, axis=1)

In [7]:
# Standardise the inputs with statistics stored on disk; both arrays are
# broadcast against X.
channel_stds = np.load('../../data/channel_stds.npy')
channel_means = np.load('../../data/channel_means.npy')
X = np.divide(np.subtract(X, channel_means), channel_stds)

In [8]:
class AudioDataset(Dataset):
    """Dataset that returns each sample circularly shifted by a random offset."""

    def __init__(self, X):
        """Keep a reference to ``X``; samples are indexed along its first axis."""
        self.X = X

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` rotated along its dim 1.

        The offset is drawn from NumPy's global random state on every call,
        so repeated reads of the same index generally differ.
        """
        clip = self.X[idx, ...]
        offset = int(np.random.randint(clip.shape[1]))
        # Rolling by -offset moves element ``offset`` to the front and wraps
        # the leading part around to the end.
        return torch.roll(clip, shifts=-offset, dims=1)

In [9]:
# Wrap the normalised array as a float32 tensor and iterate over it in fixed
# order, 16 clips per batch.
dataset = AudioDataset(torch.as_tensor(X, dtype=torch.float32))
loader = DataLoader(dataset=dataset, batch_size=16, shuffle=False)

In [10]:
# Use the first GPU when one is available, otherwise fall back to the CPU
# instead of failing later when tensors are moved to a missing device.
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print('Device: ', device)

Device:  cuda:0


In [11]:
# Build the 31-output model on the selected device and restore the trained
# parameters. map_location remaps the checkpoint's tensors onto `device`, so
# a checkpoint saved from a GPU can also be loaded on a CPU-only machine.
model = Task5Model(31).to(device)
model.load_state_dict(torch.load('./model_system1', map_location=device))

<All keys matched successfully>

In [12]:
# Test-time augmentation: the dataset applies a fresh random circular shift on
# every read, so the whole evaluation set is scored several times and the
# per-pass probabilities are averaged.
N_TTA_PASSES = 10

# Inference mode is loop-invariant: switch once instead of once per batch.
model.eval()

all_preds = []
with torch.no_grad():
    for _ in range(N_TTA_PASSES):
        # torch.sigmoid is numerically stable, unlike 1 / (1 + exp(-x)),
        # which overflows for large negative scores.
        batch_probs = [
            torch.sigmoid(model(inputs.to(device))).cpu().numpy()
            for inputs in loader]
        all_preds.append(np.concatenate(batch_probs, axis=0))

# Average over passes without modifying any entry of all_preds in place.
preds = np.mean(all_preds, axis=0)

In [13]:
# One column per model output, named in the order of the model's outputs
# (8 coarse labels followed by 23 fine labels), plus the file name each row
# belongs to.
output_df = pd.DataFrame(
    preds,
    columns=[
        # coarse labels
        '1_engine',
        '2_machinery-impact',
        '3_non-machinery-impact',
        '4_powered-saw',
        '5_alert-signal',
        '6_music',
        '7_human-voice',
        '8_dog',
        # fine labels
        '1-1_small-sounding-engine',
        '1-2_medium-sounding-engine',
        '1-3_large-sounding-engine',
        '2-1_rock-drill',
        '2-2_jackhammer',
        '2-3_hoe-ram',
        '2-4_pile-driver',
        '3-1_non-machinery-impact',
        '4-1_chainsaw',
        '4-2_small-medium-rotating-saw',
        '4-3_large-rotating-saw',
        '5-1_car-horn',
        '5-2_car-alarm',
        '5-3_siren',
        '5-4_reverse-beeper',
        '6-1_stationary-music',
        '6-2_mobile-music',
        '6-3_ice-cream-truck',
        '7-1_person-or-small-group-talking',
        '7-2_person-or-small-group-shouting',
        '7-3_large-crowd',
        '7-4_amplified-speech',
        '8-1_dog-barking-whining',
    ],
).assign(audio_filename=eval_files)

In [14]:
# Column order of the exported file: file name first, then the fine labels,
# then the coarse labels.
cols_in_order = [
    "audio_filename",
    # fine labels
    "1-1_small-sounding-engine",
    "1-2_medium-sounding-engine",
    "1-3_large-sounding-engine",
    "2-1_rock-drill",
    "2-2_jackhammer",
    "2-3_hoe-ram",
    "2-4_pile-driver",
    "3-1_non-machinery-impact",
    "4-1_chainsaw",
    "4-2_small-medium-rotating-saw",
    "4-3_large-rotating-saw",
    "5-1_car-horn",
    "5-2_car-alarm",
    "5-3_siren",
    "5-4_reverse-beeper",
    "6-1_stationary-music",
    "6-2_mobile-music",
    "6-3_ice-cream-truck",
    "7-1_person-or-small-group-talking",
    "7-2_person-or-small-group-shouting",
    "7-3_large-crowd",
    "7-4_amplified-speech",
    "8-1_dog-barking-whining",
    # coarse labels
    "1_engine",
    "2_machinery-impact",
    "3_non-machinery-impact",
    "4_powered-saw",
    "5_alert-signal",
    "6_music",
    "7_human-voice",
    "8_dog",
]
output_df = output_df[cols_in_order]

# Write the predictions without the DataFrame index column.
output_df.to_csv('submission-system-1.csv', index=False)