In [1]:
import numpy as np
import pandas as pd

import os
import shutil
from pathlib import Path
import psutil
import pickle
from multiprocessing import Process, Manager

import librosa
import random

import datetime

import torch as torch
from torch.utils.data import WeightedRandomSampler

from fastai import *
from fastai.vision import *
from fastai.callbacks import *

import functools

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score

import inspect

In [2]:
# Kaggle kernel input and working directories.
INPUT_PATH = Path('/kaggle/input/')
WORKING_PATH = Path('/kaggle/working/')

# Audio clip folders of the competition dataset.
NOISY_PATH = INPUT_PATH / 'freesound-audio-tagging-2019' / 'train_noisy'
CURATED_PATH = INPUT_PATH / 'freesound-audio-tagging-2019' / 'train_curated'
TEST_PATH = INPUT_PATH / 'freesound-audio-tagging-2019' / 'test'

# Scratch folders: spectrogram pickles written by wav_to_spec, and model checkpoints.
PICKLE_PATH = WORKING_PATH / 'pickles'
MODEL_PATH = WORKING_PATH / 'models'

# Create the scratch folders; an already existing folder is not an error.
PICKLE_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)

In [3]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN so that repeated runs select the same
    algorithms.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Only influences processes started after this point: the hash seed of the
    # already running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    # With autotuning on, cuDNN may pick a different convolution algorithm on
    # each run, so it has to be switched off for results to be repeatable.
    torch.backends.cudnn.benchmark = False

In [4]:
# Seed all random number generators with the default seed (1234).
seed_everything()

In [5]:
# Passed as `slice_len` to wav_to_spec by open_wav_image: the number of seconds
# of audio covered by one n_mels-wide spectrogram slice.
g_slice_len = 4

### Model

In [6]:
class SEDenseLayer(nn.Module):
    """Dense layer whose newly produced features pass through a squeeze-and-excitation gate.

    The input (``nf_in`` channels) goes through BN -> ReLU -> 1x1 conv -> BN -> ReLU ->
    3x3 conv to produce ``nf_add`` new channels. Each new channel is rescaled by a
    sigmoid gate computed from its global average, and the result is concatenated
    onto the unchanged input. The output therefore has ``nf_in + nf_add`` channels
    and the same spatial size as the input.
    """

    def __init__(self, nf_in, nf_add):
        super().__init__()
        self.nf_in = nf_in
        self.nf_add = nf_add

        # Produces the nf_add new feature maps; padding keeps the spatial size.
        self.dense_layers = nn.Sequential(
            nn.BatchNorm2d(nf_in),
            nn.ReLU(inplace=True),
            nn.Conv2d(nf_in, nf_in, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(nf_in),
            nn.ReLU(inplace=True),
            nn.Conv2d(nf_in, nf_add, kernel_size=3, stride=1, padding=1, bias=False),
        )

        # Squeeze step: reduce every feature map to a single value.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        # Excitation step: two bias-free linear layers ending in a sigmoid gate.
        self.se_layers = nn.Sequential(
            nn.Linear(nf_add, nf_add // 2, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(nf_add // 2, nf_add, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``x`` concatenated (on the channel axis) with the gated new features."""
        new_features = self.dense_layers(x)

        batch, channels, _, _ = new_features.size()
        gate = self.avg_pool(new_features).view(batch, channels)
        gate = self.se_layers(gate).view(batch, channels, 1, 1)
        gated_features = new_features * gate.expand_as(new_features)

        return torch.cat([x, gated_features], 1)

class SEDenseNet(nn.Module):
    """Stack of eight SEDenseLayer + 2x max-pool stages followed by eight averaged linear heads.

    The single-channel input is concatenated with 15 convolutional feature maps
    (16 channels in total) before entering the stages. The final stage outputs
    1536 + 512 = 2048 channels. ``forward`` squeezes the two spatial axes, so
    they are expected to have been pooled down to 1x1 (e.g. a 256x256 input
    after eight 2x poolings). Each head maps the 2048 features to 80 outputs.
    """

    def __init__(self):
        super().__init__()

        self.first_conv = nn.Conv2d(1, 15, kernel_size=3, stride=1, padding=1, bias=False)

        # (input channels, added channels) of each stage; every stage is followed by a 2x max-pool.
        stage_channels = [
            (16, 16),
            (32, 32),
            (64, 64),
            (128, 128),
            (256, 256),
            (512, 512),
            (512 + 512, 512),
            (512 + 512 + 512, 512),
        ]
        stages = []
        for nf_in, nf_add in stage_channels:
            stages.append(SEDenseLayer(nf_in, nf_add))
            stages.append(nn.MaxPool2d(2))
        self.se_dense_layers = nn.Sequential(*stages)

        # Eight independent heads over the same features; forward() averages their outputs.
        self.linears = nn.ModuleList([nn.Linear(2048, 80) for _ in range(8)])

    def forward(self, x):
        """Return the mean of the eight head outputs for input ``x``."""
        stacked_input = torch.cat([x, self.first_conv(x)], 1)

        features = self.se_dense_layers(stacked_input).squeeze(dim=3).squeeze(dim=2)

        head_outputs = [head(features) for head in self.linears]

        return torch.mean(torch.stack(head_outputs), dim=0)

### Loss

In [7]:
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    # Builtin int/bool/float are used as dtypes: the np.int / np.bool / np.float
    # aliases were removed in NumPy 1.24 and raise AttributeError there.
    class_rankings = np.zeros(num_classes, dtype=int)
    class_rankings[retrieved_classes] = np.arange(num_classes)
    pos_class_rankings = class_rankings[pos_class_indices]
    # Which of these is a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=bool)
    retrieved_class_true[pos_class_rankings] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
            retrieved_cumulative_hits[pos_class_rankings] /
            (1 + pos_class_rankings.astype(float)))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Calculate label-weighted label-ranking average precision.

    Arguments:
      truth: np.array of (num_samples, num_classes) giving boolean ground-truth
        of presence of that class in that sample.
      scores: np.array of (num_samples, num_classes) giving the classifier-under-
        test's real-valued score for each class for each sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,) giving the lwlrap for each
        class.
      weight_per_class: np.array of (num_classes,) giving the prior of each
        class within the truth labels.  Then the overall unbalanced lwlrap is
        simply np.sum(per_class_lwlrap * weight_per_class)

    Raises:
      AssertionError: if truth and scores do not have the same shape.
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape
    # Space to store a distinct precision value for each class on each sample.
    # Only the classes that are true for each sample will be filled in.
    precisions_for_samples_by_classes = np.zeros((num_samples, num_classes))
    for sample_num in range(num_samples):
        pos_class_indices, precision_at_hits = (
            _one_sample_positive_class_precisions(scores[sample_num, :],
                                                  truth[sample_num, :]))
        precisions_for_samples_by_classes[sample_num, pos_class_indices] = (
            precision_at_hits)
    labels_per_class = np.sum(truth > 0, axis=0)
    weight_per_class = labels_per_class / float(np.sum(labels_per_class))
    # Form average of each column, i.e. all the precisions assigned to labels in
    # a particular class. Classes without any label get 0 instead of 0/0.
    per_class_lwlrap = (np.sum(precisions_for_samples_by_classes, axis=0) /
                        np.maximum(1, labels_per_class))
    # overall_lwlrap = simple average of all the actual per-class, per-sample precisions
    #                = np.sum(precisions_for_samples_by_classes) / np.sum(precisions_for_samples_by_classes > 0)
    #           also = weighted mean of per-class lwlraps, weighted by class label prior across samples
    #                = np.sum(per_class_lwlrap * weight_per_class)
    return per_class_lwlrap, weight_per_class


# Wrapper for fast.ai library - thanks @daisukelab
def lwlrap(scores, truth, **kwargs):
    """Return the overall lwlrap of ``scores`` against ``truth`` as a one-element tensor.

    Both arguments are converted with ``to_np`` before being handed to
    ``calculate_per_class_lwlrap``; extra keyword arguments are accepted and ignored.
    """
    per_class, class_weight = calculate_per_class_lwlrap(to_np(truth), to_np(scores))
    overall = (per_class * class_weight).sum()
    return torch.Tensor([overall])

### Helper Functions

In [8]:
# Dictionary proxy served by a multiprocessing Manager. wav_to_spec uses it as an
# in-memory cache of spectrograms keyed by str(file path).
# NOTE(review): presumably a Manager dict so that data-loader worker processes
# share one cache — confirm.
manager = Manager()

mem_dic = manager.dict()


def _compute_pcen(fn, sr, n_mels, hop_length, fmin, fmax):
    """Load audio file ``fn``, trim it with librosa.effects.trim and return its PCEN-scaled mel spectrogram."""
    # `sr` and `y` are passed by keyword: they are keyword-only in librosa >= 0.10.
    y = librosa.effects.trim(librosa.load(fn, sr=sr)[0])[0]

    mels = librosa.feature.melspectrogram(y=y,
                                          sr=sr,
                                          n_mels=n_mels,
                                          hop_length=hop_length,
                                          n_fft=n_mels*20,
                                          fmin=fmin,
                                          fmax=fmax).astype(np.float32)

    return librosa.core.pcen(mels,
                             sr=sr,
                             hop_length=hop_length,
                             gain=0.6,
                             bias=0.1,
                             power=0.2,
                             time_constant=0.4,
                             eps=1e-9)


def _random_crop_or_pad(spec, out_width):
    """Return ``spec`` with exactly ``out_width`` columns, zero-padded or cropped at a random offset."""
    width = spec.shape[1]
    if width < out_width:
        offset = random.randint(0, out_width - width)
        return np.pad(spec, ((0, 0), (offset, (out_width - width) - offset)), 'constant')
    offset = random.randint(0, width - out_width)
    return spec[:, offset:offset + out_width]


def wav_to_spec(fn, slice_len = 2, slice_count=1, sr = 44100, n_mels = 256, fmin =20):
    """Return a randomly cropped or padded PCEN mel spectrogram of audio file ``fn``.

    The full-length spectrogram is looked up in the shared ``mem_dic`` cache,
    then in a pickle file under ``PICKLE_PATH``, and only computed when both
    miss. A freshly obtained spectrogram is stored in ``mem_dic`` while memory
    usage is below 90 %; otherwise a computed one is written to disk as long
    as more than 500 MB are free.

    Note: cache entries are keyed by file name only, so entries produced with
    different ``sr`` / ``n_mels`` / ``slice_len`` / ``fmin`` values are not
    distinguished.

    Args:
        fn: Path of the audio file.
        slice_len: Seconds of audio covered by ``n_mels`` spectrogram columns.
        slice_count: Number of ``n_mels``-wide slices in the returned array.
        sr: Sample rate the audio is loaded at.
        n_mels: Number of mel bands (rows of the spectrogram).
        fmin: Lowest frequency of the mel filter bank; the highest is ``sr // 2``.

    Returns:
        Array with ``n_mels`` rows and ``n_mels * slice_count`` columns. Shorter
        clips are zero-padded on both sides at a random offset, longer ones are
        cropped at a random offset.
    """
    # set melspectrogram parameters to achieve a output size of n_mels, n_mels per slice_len seconds
    hop_length = int(sr/(n_mels/slice_len))  # ensures slice_len seconds per height
    fmax = sr//2

    key = str(fn)
    pickle_file = Path(PICKLE_PATH/(Path(fn).parent.name + '_' + Path(fn).name +'.pkl'))

    # Debug switch: when True, recompute the spectrogram and compare it with the pickled one.
    assert_equality = False
    pcen_pickle = None

    # A single lookup decides between cache hit and miss. Separate `in` checks on
    # the shared dict can disagree when another process inserts the key in
    # between, which would skip both the load and the compute branch.
    pcen = mem_dic.get(key)

    mem = psutil.virtual_memory().percent
    disk = shutil.disk_usage("/kaggle/working").free/1e6

    if pcen is None:
        if pickle_file.exists():
            # pickle.load can execute arbitrary code; the files read here are
            # expected to be only the ones this function wrote itself.
            with open(pickle_file, 'rb') as pf:
                cached = pickle.load(pf)
            if assert_equality:
                pcen_pickle = cached
            else:
                pcen = cached
                if mem < 90:
                    mem_dic[key] = pcen

        if pcen is None:
            pcen = _compute_pcen(fn, sr, n_mels, hop_length, fmin, fmax)

            if assert_equality and pcen_pickle is not None:
                print(f'checking pickle {fn}')
                if not np.allclose(pcen_pickle, pcen, rtol=1e-7,atol=1e-11,equal_nan=True):
                    print(f'{fn} new mels does not match pickle')

            if mem < 90:
                mem_dic[key] = pcen
            elif disk > 500 and not pickle_file.exists():
                # Memory is (nearly) full: spill to disk instead. This also covers
                # mem == 90 exactly, which was previously cached nowhere.
                with open(pickle_file, 'wb') as pf:
                    pickle.dump(pcen, pf)

    # Occasional resource report (about one call in a thousand).
    if random.random() < 0.001 :
        print(f'disk free: {disk} mem used: {mem}')

    return _random_crop_or_pad(pcen, n_mels*slice_count)

def open_wav_image(fn, convert_mode, after_open)->Image:
    """Open audio file ``fn`` as a one-channel ``Image`` holding its spectrogram.

    ``convert_mode`` and ``after_open`` are accepted but not used.
    """
    spectrogram = wav_to_spec(
        fn,
        slice_len=g_slice_len,
        slice_count=1,
        sr=44100,
        n_mels=256,
        fmin=0,
    )
    # Add a leading channel axis in front of the two spectrogram axes.
    pixels = torch.Tensor(spectrogram).unsqueeze(0)
    return Image(pixels)

# Replace fastai's `vision.data.open_image` with the wav-based opener above.
# NOTE(review): presumably this is what makes ImageList items load .wav files as
# spectrogram images — confirm against the fastai version in use.
vision.data.open_image = open_wav_image


In [9]:
def fold_from_fn(fn, k):
    """Return the fold in ``range(k)`` that file ``fn`` belongs to.

    The fold is derived only from the file stem (name without directory and
    extension), so the same clip always lands in the same fold.

    A private generator is used instead of reseeding the global ``random``
    module: reseeding it here would discard the state set by
    ``seed_everything`` and make every later global draw (such as the random
    crops in ``wav_to_spec``) depend on the last file name seen. The fold
    returned is the same as with global reseeding.
    """
    rng = random.Random(Path(fn).stem.encode())
    return rng.choice(range(k))

# Default number of folds.
k=3

def valid_fn(fn, k, validation_fold):
    """Return True when ``fn`` belongs to ``validation_fold`` of a ``k``-fold split."""
    return (validation_fold == fold_from_fn(fn, k))

### Dataframes

In [10]:
# CSV tables of the two training subsets. Each gets a `noisy_curated` column
# naming its subset and a `fname_path` column with the full path of every clip.
train_noisy_df = (
    pd.read_csv(INPUT_PATH/'freesound-audio-tagging-2019/train_noisy.csv')
    .assign(noisy_curated='noisy')
    .assign(fname_path=lambda df: str(NOISY_PATH) + "/" + df.fname)
)
train_curated_df = (
    pd.read_csv(INPUT_PATH/'freesound-audio-tagging-2019/train_curated.csv')
    .assign(noisy_curated='curated')
    .assign(fname_path=lambda df: str(CURATED_PATH) + "/" + df.fname)
)

# Curated rows first, then noisy rows; each part keeps its own row index.
train_combined_df = pd.concat([train_curated_df, train_noisy_df])

### Updated TTA

In [11]:
from fastai.basic_train import _loss_func2activ

def no_flip_tta_only(learn:Learner, ds_type:DatasetType=DatasetType.Valid, scale:float=1.35) -> Iterator[List[Tensor]]:
    """Yield predictions for eight test-time-augmentation passes without any flipping.

    Each pass keeps the training transforms except crop_pad, flip_lr, dihedral
    and zoom, then appends a non-random zoom by ``scale`` and a crop_pad anchored
    at one of the four corners. Bits 0 and 1 of the pass index select the row
    and column of the corner, so passes 4-7 use the same corners as passes 0-3.
    The dataset's original transforms are restored when the generator finishes
    or is closed.
    """
    loader = learn.dl(ds_type)
    dataset = loader.dataset
    saved_tfms = dataset.tfms
    excluded = (crop_pad, flip_lr, dihedral, zoom)
    kept_tfms = [t for t in learn.data.train_ds.tfms if t.tfm not in excluded]
    try:
        progress = master_bar(range(8))
        for step in progress:
            corner = {
                'row_pct': 1 if step & 1 else 0,
                'col_pct': 1 if step & 2 else 0,
                'is_random': False,
            }
            dataset.tfms = [*kept_tfms, zoom(scale=scale, **corner), crop_pad(**corner)]
            activation = _loss_func2activ(learn.loss_func)
            yield get_preds(learn.model, loader, pbar=progress, activ=activation)[0]
    finally:
        dataset.tfms = saved_tfms

### Make Predictions

In [12]:
ls ../input/6foldnoisydirected

dir_noisy_fold_0_lwlrap_0.816.pkl  dir_noisy_fold_3_lwlrap_0.802.pkl
dir_noisy_fold_1_lwlrap_0.807.pkl  dir_noisy_fold_4_lwlrap_0.788.pkl
dir_noisy_fold_2_lwlrap_0.81.pkl   dir_noisy_fold_5_lwlrap_0.805.pkl


In [13]:
# Exported learners of the `6foldnoisydirected` dataset, in the order they are run.
model_pkls = """
    dir_noisy_fold_0_lwlrap_0.816.pkl
    dir_noisy_fold_3_lwlrap_0.802.pkl
    dir_noisy_fold_1_lwlrap_0.807.pkl
    dir_noisy_fold_4_lwlrap_0.788.pkl
    dir_noisy_fold_2_lwlrap_0.81.pkl
    dir_noisy_fold_5_lwlrap_0.805.pkl
""".split()

# Collects one prediction tensor per model; filled by the inference loops below.
pred_list = []

In [14]:
# Run TTA inference on the test set with every exported learner in model_pkls
# and append each result to pred_list.
# Note: re-running this cell appends the same predictions again, because
# pred_list is only reset in the cell that defines it.
for mdl in model_pkls:
    # Test items are all .wav files under TEST_PATH; they are attached to the learner on load.
    test = ImageList.from_folder(TEST_PATH, extensions='.wav')
    learn = load_learner(INPUT_PATH/'6foldnoisydirected', mdl, test=test)
    # Round-trip the weights through a temporary checkpoint in MODEL_PATH so that
    # they end up in a SEDenseNet built from this notebook's class definition.
    # NOTE(review): presumably done to stop depending on the unpickled model object — confirm.
    learn.model_dir = MODEL_PATH
    learn.save('weights')
    learn.model=SEDenseNet().cuda()
    learn.load('weights')
    os.remove(MODEL_PATH/'weights.pth')
    
    learn.data.batch_size = 50
    # Swap in the flip-free TTA generator, with this learner bound as its `learn` argument.
    learn.tta_only = partial(no_flip_tta_only,learn=learn)

    # Only the predictions are kept; the second return value is discarded.
    preds, _ = learn.TTA(scale=1.1, ds_type=DatasetType.Test)
    pred_list.append(preds)
    

disk free: 4938.051584 mem used: 29.1
disk free: 4938.051584 mem used: 29.1
disk free: 4938.051584 mem used: 29.1
disk free: 4938.051584 mem used: 29.1
disk free: 4938.051584 mem used: 29.1
disk free: 4938.051584 mem used: 29.1
disk free: 4938.051584 mem used: 29.1


In [15]:
ls ../input/curatedthencombined6fold/

finetuned_model_fold_0.pkl  finetuned_model_fold_3.pkl
finetuned_model_fold_1.pkl  finetuned_model_fold_4.pkl
finetuned_model_fold_2.pkl  finetuned_model_fold_5.pkl


In [16]:
# Exported learners of the `curatedthencombined6fold` dataset, in the order they are run.
model_pkls = [f'finetuned_model_fold_{fold}.pkl' for fold in (0, 3, 1, 4, 2, 5)]

In [17]:
# Same inference procedure as for the first model group, now for the learners
# exported in `curatedthencombined6fold`; results are appended to the same pred_list.
# Note: re-running this cell appends the same predictions again.
for mdl in model_pkls:
    # Test items are all .wav files under TEST_PATH; they are attached to the learner on load.
    test = ImageList.from_folder(TEST_PATH, extensions='.wav')
    learn = load_learner(INPUT_PATH/'curatedthencombined6fold', mdl, test=test)
    # Round-trip the weights through a temporary checkpoint in MODEL_PATH so that
    # they end up in a SEDenseNet built from this notebook's class definition.
    # NOTE(review): presumably done to stop depending on the unpickled model object — confirm.
    learn.model_dir = MODEL_PATH
    learn.save('weights')
    learn.model=SEDenseNet().cuda()
    learn.load('weights')
    os.remove(MODEL_PATH/'weights.pth')
    
    learn.data.batch_size = 50
    # Swap in the flip-free TTA generator, with this learner bound as its `learn` argument.
    learn.tta_only = partial(no_flip_tta_only,learn=learn)

    # Only the predictions are kept; the second return value is discarded.
    preds, _ = learn.TTA(scale=1.1, ds_type=DatasetType.Test)
    pred_list.append(preds)
    

disk free: 4938.047488 mem used: 29.1
disk free: 4938.047488 mem used: 29.2
disk free: 4938.047488 mem used: 29.2
disk free: 4938.047488 mem used: 29.2
disk free: 4938.047488 mem used: 29.1
disk free: 4938.047488 mem used: 29.2
disk free: 4938.047488 mem used: 29.2
disk free: 4938.047488 mem used: 29.1


In [18]:
# Ensemble: element-wise mean over the predictions of every model in pred_list.
mean_preds = torch.stack(pred_list).mean(0)

In [19]:
# File names of the test items, taken from the learner left over from the last
# loop iteration.
test_names = [item.name for item in learn.data.test_ds.items]

# One column of averaged scores per class, keyed by class name.
scores_dic = {
    learn.data.classes[class_idx]: mean_preds.numpy()[:, class_idx]
    for class_idx in range(mean_preds.shape[1])
}

# Submission table: file name column followed by the class score columns.
submission_df = pd.DataFrame({'fname': test_names, **scores_dic})
submission_df.to_csv('submission.csv', index=False)

In [20]:
!rm -r pickles