In [None]:
import sys

# Raise the interpreter's recursion ceiling well above CPython's default.
# NOTE(review): nothing visible in this notebook recurses deeply — confirm
# this is still needed; very high limits can crash the interpreter instead of
# raising RecursionError.
sys.setrecursionlimit(10 ** 5)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available in the read-only input directory.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import glob
from torchvision import models, transforms
import librosa
from PIL import Image
import time
from sklearn.model_selection import train_test_split

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Training hyperparameters.
learning_rate = 0.001  # step size handed to the optimizer
num_epoch = 10         # number of full passes over the training loader

In [None]:
!unzip  ../input/train_curated.zip -d /kaggle/working/train_curated
!unzip ../input/test.zip -d /kaggle/working/test

In [None]:
# Label tables shipped with the competition data.
curated_csv = pd.read_csv('../input/train_curated.csv')
noisy_csv = pd.read_csv('../input/train_noisy.csv')

# Curated and noisy rows stacked into one frame with a fresh 0..n-1 index;
# sort=True orders the union of the columns.
merged = pd.concat(
    [curated_csv, noisy_csv],
    ignore_index=True,
    sort=True,
)

# Directories holding the extracted audio.
# NOTE(review): no cell visible here extracts a train_noisy archive, so
# noisy_path may not exist at this point — confirm before using it.
test_path = '/kaggle/working/test/'
noisy_path = '/kaggle/working/train_noisy'
curated_path = '/kaggle/working/train_curated'

In [None]:
# Every curated clip, in lexicographic path order.
filelist = sorted(glob.glob('%s/*.wav' %curated_path))

# Multi-hot label matrix: one column per distinct tag in the comma-separated
# `labels` column, one row per CSV row.
labels = curated_csv.labels.str.get_dummies(sep=',').values

# Row i of `labels` is later paired with filelist[i].
# NOTE(review): this pairing assumes the CSV rows are in the same order as the
# sorted file paths — verify against the CSV's filename column.
assert len(filelist) == labels.shape[0], (
    'found %d wav files but %d label rows' % (len(filelist), labels.shape[0]))

# Summarise instead of dumping the whole path list into the cell output.
print('%d clips, %d label columns' % (len(filelist), labels.shape[1]))
print(filelist[:5])

In [None]:
class Config(object):
    """Audio front-end settings shared by the dataset and the model."""

    # Rate (Hz) every clip is resampled to when loaded.
    sample_rate = 16000
    # Length, in seconds, each clip is trimmed or zero-padded to.
    audio_duration = 2

    # STFT window size and hop, in samples.
    n_fft = 512
    hop_length = 256

    # Mel filter bank: number of bands and frequency range (Hz).
    n_mels = 64
    f_min = 0
    # A signal sampled at `sample_rate` carries no content above
    # sample_rate / 2 (Nyquist); the previous value of 16000 was out of range
    # and would leave the upper mel bands empty.
    f_max = sample_rate // 2

In [None]:
class AudioDataset(Dataset):
    """Dataset yielding (log-mel "image", label row) pairs for audio clips.

    Args:
        root: indexable sequence of audio file paths (item ``i`` is the clip
            that belongs to ``labels[i]``).
        labels: array whose first axis indexes clips; row ``i`` is returned
            unchanged as the target of clip ``i``.
        config: object exposing ``sample_rate``, ``audio_duration``, ``n_fft``,
            ``hop_length``, ``n_mels``, ``f_min`` and ``f_max``.
    """

    def __init__(self, root, labels, config):
        self.labels = labels
        self.root_dir = root
        self.config = config
        # Fixed clip length in samples.
        self.audio_length = config.sample_rate * config.audio_duration
        self.filelist = root

    def __len__(self):
        """Number of clips, taken from the first axis of ``labels``."""
        return self.labels.shape[0]

    @staticmethod
    def _fix_length(signal, length):
        """Trim or right-pad ``signal`` with zeros to exactly ``length`` samples."""
        signal = signal[:length]
        missing = length - len(signal)
        if missing > 0:
            # Pad in the signal's own dtype so float32 audio is not promoted.
            padding = np.zeros((missing,), dtype=signal.dtype)
            signal = np.concatenate([signal, padding], axis=0)
        return signal

    @staticmethod
    def _logmel_tensor(power_spec):
        """Convert a power spectrogram to a (3, H, W) float32 tensor in [0, 1].

        The spectrogram is put on a dB scale, min-max normalised per clip and
        replicated over three channels so it matches the input layout of an
        RGB image model.
        """
        power_spec = np.asarray(power_spec, dtype=np.float32)
        log_spec = 10.0 * np.log10(np.maximum(power_spec, 1e-10))
        lowest, highest = log_spec.min(), log_spec.max()
        # The epsilon keeps a constant spectrogram (e.g. silence) finite.
        scaled = (log_spec - lowest) / (highest - lowest + 1e-6)
        plane = torch.from_numpy(scaled.astype(np.float32))
        return plane.unsqueeze(0).repeat(3, 1, 1)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(tensor, label)``.

        The previous implementation passed the float spectrogram to
        ``Image.fromarray(..., mode='RGB')``, which reinterprets the raw float
        bytes as RGB pixels; the tensor is now built numerically instead.
        """
        cfg = self.config
        data, _ = librosa.load(self.filelist[idx], sr=cfg.sample_rate)
        data = self._fix_length(data, self.audio_length)
        mel = librosa.feature.melspectrogram(
            y=data,  # keyword form works on old and new librosa releases
            sr=cfg.sample_rate,
            n_fft=cfg.n_fft,
            hop_length=cfg.hop_length,
            n_mels=cfg.n_mels,
            fmin=cfg.f_min,
            # Never ask for mel bands above the Nyquist frequency.
            fmax=min(cfg.f_max, cfg.sample_rate / 2),
        )
        return self._logmel_tensor(mel), self.labels[idx]

In [None]:
# Hold out 20% of the clips for validation. The fixed random_state makes the
# split identical on every Restart & Run All, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    filelist, labels, test_size=0.2, random_state=42)
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

In [None]:
# Training data: reshuffled every epoch.
trainset = AudioDataset(X_train, y_train, Config())
trainloader = DataLoader(trainset, batch_size=16, shuffle=True)

# Held-out data: kept in a fixed order so that evaluating the first few
# batches always looks at the same clips.
testset = AudioDataset(X_test, y_test, Config())
testloader = DataLoader(testset, batch_size=16, shuffle=False)

In [None]:
class STFT(nn.Module):
    """Image backbone whose final ``fc`` layer is replaced by a new classifier.

    Args:
        config: configuration object; stored on the module, not otherwise used.
        pretrained_model: module exposing a linear ``fc`` head (for example a
            torchvision ResNet). Its ``fc`` attribute is replaced in place.
        num_classes: number of output logits. Defaults to 80, the value that
            was previously hard-coded.
    """

    def __init__(self, config, pretrained_model, num_classes=80):
        super(STFT, self).__init__()

        # Size the new head from the backbone instead of assuming 512 inputs,
        # so backbones with a different feature width also work.
        in_features = pretrained_model.fc.in_features
        self.classifier = nn.Linear(in_features, num_classes)
        self.config = config
        self.pretrained = pretrained_model
        # `classifier`, `pretrained.fc` and `image_model.fc` are one shared layer.
        self.pretrained.fc = self.classifier
        self.image_model = self.pretrained

    def forward(self, x):
        """Return the raw (pre-sigmoid) logits produced by the backbone for ``x``."""
        return self.image_model(x)

In [None]:
# Backbone network: torchvision's ResNet-18, requested with pretrained weights
# (pretrained=True) rather than random initialisation.
resnet = models.resnet18(pretrained=True)

In [None]:
# Wrap the backbone in the classifier module, then move it to the selected
# device. `.to` is the cell's last expression, so the notebook also displays
# the module it returns.
model = STFT(config=Config(), pretrained_model=resnet)
model.to(device)

In [None]:
# Adam over every parameter of the model.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(num_epoch):
    # ---- training pass ----
    model.train()
    t = time.time()
    for i, (data, label) in enumerate(trainloader):
        data = data.to(device)
        # The loss expects float targets; the label rows are cast here.
        label = label.to(device).float()

        # Call the module itself (not .forward) so registered hooks run.
        out = model(data)
        # Multi-label objective on raw logits.
        loss = F.binary_cross_entropy_with_logits(out, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i % 5) == 0:
            # Time covers everything since the previous report (or epoch start).
            print('Epoch %d  Step %d  Loss %0.4f  Time %0.2f s' %(epoch+1, i+1, loss.item(), time.time() - t))
            t = time.time()

    # ---- validation pass: one mean loss per epoch instead of raw logits ----
    model.eval()
    val_loss_sum = 0.0
    val_count = 0
    with torch.no_grad():
        for data, label in testloader:
            data = data.to(device)
            label = label.to(device).float()
            out = model(data)
            batch_size = data.size(0)
            # Weight by batch size so a short final batch does not skew the mean.
            val_loss_sum += F.binary_cross_entropy_with_logits(out, label).item() * batch_size
            val_count += batch_size
    print('Epoch %d  Validation loss %0.4f' % (epoch + 1, val_loss_sum / max(val_count, 1)))

In [None]:
from sklearn.metrics import label_ranking_average_precision_score

# Score a handful of held-out batches. The previous version iterated the
# (shuffled) training loader, so the reported number did not measure
# generalisation, and it loaded every batch even though only five were used.
max_eval_batches = 5

# NOTE(review): `labels` rebinds the name that held the full multi-hot label
# matrix built earlier in the notebook; re-running the split cell after this
# one will use these values instead.
prediction = []
labels = []
model.eval()
with torch.no_grad():
    for i, (data, label) in enumerate(testloader):
        if i >= max_eval_batches:
            break
        # Sigmoid turns the logits into independent per-class probabilities.
        out = torch.sigmoid(model(data.to(device)))
        prediction.append(out.cpu().numpy())
        labels.append(label.numpy())

prediction = np.concatenate(prediction, axis=0)
labels = np.concatenate(labels, axis=0)
print(prediction)
print(labels)

# Label-ranking average precision of the predicted scores against the targets.
score = label_ranking_average_precision_score(labels, prediction)
print(score)