## **<div id="I">I. Importing Tools</div>**

### **<div id="I1">1. Dependencies</div>**

In [None]:
import time
start_time = time.time()

# Data analysis and wrangling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
import IPython.display
import librosa
import librosa.display
import random
from tqdm import tqdm_notebook
from fastai import *
from fastai.vision import *
from fastai.vision.data import *
from fastai.imports import *
from fastai.callback import *
from fastai.callbacks import *

# Machine learning
from sklearn import preprocessing
import sklearn.metrics
from sklearn.metrics import label_ranking_average_precision_score

# File handling
from pathlib import Path
import gc
import os
print(os.listdir("../input"))

### **<div id="I2">2. lwlrap</div>**

In [None]:
# from official code https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8#scrollTo=cRCaCIb9oguU
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)
    # Which of these is a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
            retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
            (1 + class_rankings[pos_class_indices].astype(np.float)))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Compute per-class label-weighted label-ranking average precision.

    Arguments:
      truth: np.array of (num_samples, num_classes); entries > 0 mark the
        classes present in each sample.
      scores: np.array of (num_samples, num_classes) with the real-valued
        score the classifier assigned to each class for each sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,), the mean precision over
        all positive labels of each class (0 for classes with no labels).
      weight_per_class: np.array of (num_classes,), each class's share of
        all positive labels. The overall lwlrap is
        np.sum(per_class_lwlrap * weight_per_class).
    """
    assert truth.shape == scores.shape
    n_rows, n_cols = scores.shape
    # One precision slot per (sample, class); only slots belonging to true
    # classes are ever written, the rest stay at zero.
    precision_table = np.zeros((n_rows, n_cols))
    for row in range(n_rows):
        true_cols, true_precisions = _one_sample_positive_class_precisions(
            scores[row, :], truth[row, :])
        precision_table[row, true_cols] = true_precisions
    # Number of positive labels per class, and each class's share of them.
    positives_per_class = np.sum(truth > 0, axis=0)
    total_positives = float(np.sum(positives_per_class))
    weight_per_class = positives_per_class / total_positives
    # Column mean over the positive labels only; the max(1, .) guard avoids
    # dividing by zero for classes that never occur.
    column_totals = np.sum(precision_table, axis=0)
    per_class_lwlrap = column_totals / np.maximum(1, positives_per_class)
    return per_class_lwlrap, weight_per_class

# Wrapper for fast.ai library
def lwlrap(scores, truth, **kwargs):
    """Metric wrapper: overall lwlrap as a one-element torch tensor.

    Both arguments are converted with ``to_np`` before being handed to
    ``calculate_per_class_lwlrap``; extra keyword arguments are accepted
    and ignored.
    """
    truth_np = to_np(truth)
    scores_np = to_np(scores)
    per_class, class_weight = calculate_per_class_lwlrap(truth_np, scores_np)
    # Weighted sum of the per-class values gives the overall metric.
    overall = (per_class * class_weight).sum()
    return torch.Tensor([overall])

## **<div id="II">II. Gather data</div>**

In [None]:
def load_pkl(filename):
    """Read one pickled object from ``filename`` and return it.

    NOTE: unpickling can execute arbitrary code; only use this on files
    from a trusted source.
    """
    with open(filename, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
# Each frame is an independent read of the sample submission file.
SAMPLE_SUBMISSION_CSV = '../input/freesound-audio-tagging-2019/sample_submission.csv'
testing_df_X, testing_df_LH, testing_df = (
    pd.read_csv(SAMPLE_SUBMISSION_CSV) for _ in range(3))

## **<div id="III">III. Preprocessing sound</div>**

### **<div id="III1">1. EasyDict dependency</div>**

In [None]:
# EasyDict lets you access dict values as attributes (works recursively): JavaScript-like dot notation for Python dicts.
# It is required by the preprocessing utilities below.
# Special thanks to https://github.com/makinacorpus/easydict/blob/master/easydict/__init__.py
class EasyDict(dict):
    """A dict whose entries can also be read and written as attributes.

    Every assignment (``obj.key = v`` or ``obj['key'] = v``) stores the value
    both as an instance attribute and as a dict item. Nested dicts are
    converted to ``EasyDict`` recursively; lists and tuples are stored as
    lists whose dict elements are converted the same way.
    """

    def __init__(self, d=None, **kwargs):
        """Build from an optional mapping ``d`` plus keyword overrides."""
        # Work on a copy so the caller's mapping is never mutated.
        d = {} if d is None else dict(d)
        d.update(kwargs)
        for k, v in d.items():
            setattr(self, k, v)
        # Mirror non-dunder class attributes (e.g. defaults declared on a
        # subclass) into the instance so they show up as dict items too.
        for k in self.__class__.__dict__.keys():
            if not (k.startswith('__') and k.endswith('__')) and k not in ('update', 'pop'):
                setattr(self, k, getattr(self, k))

    def __setattr__(self, name, value):
        """Store ``value`` as both attribute and item, wrapping nested dicts."""
        if isinstance(value, (list, tuple)):
            value = [self.__class__(x)
                     if isinstance(x, dict) else x for x in value]
        elif isinstance(value, dict) and not isinstance(value, self.__class__):
            value = self.__class__(value)
        super(EasyDict, self).__setattr__(name, value)
        super(EasyDict, self).__setitem__(name, value)

    # Item assignment goes through the same conversion logic.
    __setitem__ = __setattr__

    def update(self, e=None, **f):
        """Merge mapping ``e`` and keyword arguments ``f`` into this object."""
        # Copy first: the caller's mapping must not pick up the keyword args.
        merged = dict(e) if e else {}
        merged.update(f)
        for k, v in merged.items():
            setattr(self, k, v)

    def pop(self, k, d=None):
        """Remove key ``k`` and return its value, or ``d`` if it is absent."""
        # Only drop the mirrored attribute when it exists; otherwise a missing
        # key would raise AttributeError instead of returning the default.
        if k in self.__dict__:
            delattr(self, k)
        return super(EasyDict, self).pop(k, d)

### **<div id="III2">2. Utilities for preprocessing (Librosa, custom preprocessing...)</div>**

In [None]:
#-------------------------------------------------------------------------------------------
#Librosa functions
#-------------------------------------------------------------------------------------------
#Thanks to https://github.com/daisukelab/ml-sound-classifier
def read_audio(conf, pathname, trim_long_data):
    """Load an audio file, trim it, and bring short clips up to ``conf.samples``.

    Args:
      conf: configuration object providing ``sampling_rate`` and ``samples``.
      pathname: path of the audio file, passed to ``librosa.load``.
      trim_long_data: when True, clips longer than ``conf.samples`` are cut to
        their first ``conf.samples`` samples; when False they are returned at
        full (trimmed) length.

    Returns:
      The audio samples. Clips not longer than ``conf.samples`` are
      zero-padded on both sides to exactly ``conf.samples`` samples.
    """
    signal, _rate = librosa.load(pathname, sr=conf.sampling_rate)
    # librosa.effects.trim is skipped for empty input (it errors on length 0).
    if len(signal) > 0:
        signal, _ = librosa.effects.trim(signal)  # default top_db
    target = conf.samples
    current = len(signal)
    if current > target:
        # Long clip: optionally keep only the leading `target` samples.
        return signal[:target] if trim_long_data else signal
    # Short (or exact-length) clip: centre it inside a zero-padded buffer.
    missing = target - current
    left = missing // 2
    return np.pad(signal, (left, missing - left), 'constant')

def audio_to_melspectrogram(conf, audio):
    """Convert an audio time series to a dB-scaled mel-spectrogram.

    Args:
      conf: configuration object providing ``sampling_rate``, ``n_mels``,
        ``hop_length``, ``n_fft``, ``fmin`` and ``fmax``.
      audio: the audio samples to analyse.

    Returns:
      float32 array holding the mel-spectrogram after ``power_to_db``
      (presumably of shape (conf.n_mels, number_of_frames); confirm against
      the librosa documentation).
    """
    # The signal is passed as `y=`: recent librosa releases accept it by
    # keyword only, and older releases accept the keyword as well.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

def melspectrogram_to_delta(mels):
    """Return ``librosa.feature.delta`` applied to ``mels``."""
    delta_features = librosa.feature.delta(mels)
    return delta_features

def show_melspectrogram(conf, mels, title='Log-frequency power spectrogram'):
    """Plot ``mels`` with librosa's ``specshow`` and display the figure.

    Axis scaling uses ``conf.sampling_rate``, ``conf.hop_length``,
    ``conf.fmin`` and ``conf.fmax``; ``title`` is used as the plot title.
    """
    axis_options = dict(x_axis='time', y_axis='mel',
                        sr=conf.sampling_rate, hop_length=conf.hop_length,
                        fmin=conf.fmin, fmax=conf.fmax)
    librosa.display.specshow(mels, **axis_options)
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()

def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    """Load an audio file and return its mel-spectrogram.

    Args:
      conf: configuration object forwarded to the audio helpers.
      pathname: path of the audio file.
      trim_long_data: forwarded to ``read_audio``.
      debug_display: when True, also shows an audio player and plots the
        spectrogram together with its first and second deltas.

    Returns:
      The mel-spectrogram produced by ``audio_to_melspectrogram``.
    """
    audio = read_audio(conf, pathname, trim_long_data)
    mels = audio_to_melspectrogram(conf, audio)
    if not debug_display:
        return mels
    # Debug view: audio widget followed by the three spectrogram panels.
    first_delta = melspectrogram_to_delta(mels)
    second_delta = melspectrogram_to_delta(first_delta)
    IPython.display.display(IPython.display.Audio(audio, rate=conf.sampling_rate))
    for panel in (mels, first_delta, second_delta):
        show_melspectrogram(conf, panel)
    return mels


#-------------------------------------------------------------------------------------------
#Spectrogram preprocessing
#-------------------------------------------------------------------------------------------
"""
The mels_preprocessing function takes as input the mel-spectrogram of a sound (a 2-D array, see above).
It stacks the array three times along a new last axis, so that it has the same shape as a classic RGB image.
Then it standardizes the array (shifts and scales it so that its mean is 0 and its variance is 1). This improves performance.
Finally it rescales the values to the range 0-255 and casts them to uint8 (8-bit grey levels).
"""

#def mels_preprocessing(X1, X2, X3, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
def mels_preprocessing(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Turn a spectrogram into a 3-channel uint8 image-like array.

    Args:
      X: spectrogram array; it is stacked three times along a new last axis.
      mean, std: statistics used for standardization. When None they are
        computed from the stacked array. Explicit zeros are honoured.
      norm_max, norm_min: clipping bounds applied to the standardized values
        before scaling. When None the observed max/min are used. Explicit
        zeros are honoured.
      eps: added to ``std`` to avoid division by zero; also the minimum value
        range below which the output is all zeros.

    Returns:
      uint8 array of shape ``X.shape + (3,)`` with values scaled to [0, 255],
      or all zeros when the standardized data is (almost) constant.
    """
    # Stack X as [X, X, X] so the result has three identical channels.
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` (not `or`) so that a caller-supplied 0 is kept.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min
    if (_max - _min) > eps:
        # Clip to the requested bounds, then scale to [0, 255].
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: nothing to scale, return zeros.
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

#-------------------------------------------------------------------------------------------
#High resolution conversion with no cut
#-------------------------------------------------------------------------------------------
def convert_wav_to_image(df, source, img_dest, conf):
    """Convert every file listed in ``df.fname`` to a full-length image array.

    Args:
      df: DataFrame with an ``fname`` column.
      source: directory (Path) containing the audio files.
      img_dest: accepted for interface compatibility; not used here.
      conf: configuration object forwarded to ``read_as_melspectrogram``.

    Returns:
      ``(df, images)`` where ``df`` is the very same object that was passed in
      and ``images`` is a list with one preprocessed array per row, in row
      order. Long clips are not trimmed.
    """
    images = []
    for _, record in tqdm_notebook(df.iterrows()):
        wav_path = source/str(record.fname)
        mels = read_as_melspectrogram(conf, wav_path, trim_long_data=False)
        images.append(mels_preprocessing(mels))
    return df, images

#-------------------------------------------------------------------------------------------
#Low resolution conversion with splits
#-------------------------------------------------------------------------------------------
def split_long_data(conf, X):
    """Yield fixed-width, slightly overlapping slices of ``X`` along axis 1.

    The slice width is ``conf['dims'][1]`` and consecutive slices start 90%
    of that width apart. When the remainder at a step is shorter than a full
    slice but at least 20% of the width, the last full-width window of ``X``
    is yielded instead. Shorter remainders are skipped.
    """
    total = X.shape[1]
    window = conf['dims'][1]
    stride = int(window * 0.9)
    shortest = int(window * 0.2)
    n_steps = total // stride
    step = 0
    while step < n_steps:
        start = stride * step
        remaining = total - start
        step += 1
        if remaining >= window:
            yield X[:, start:start + window]
            continue
        if remaining >= shortest:
            # Not enough data left for a full window: take the tail instead.
            tail = total - window
            yield X[:, tail:tail + window]
            
def convert_X(df, conf, datapath):
    """Convert labelled audio files to fixed-width spectrogram chunks.

    Every file listed in ``df.fname`` is converted to a preprocessed
    mel-spectrogram and cut with ``split_long_data``. For each chunk an empty
    placeholder file named ``"<j>-<fname>"`` is created under
    ``/kaggle/working/trn_curated_X/`` (presumably so that file-based data
    loaders can list the chunks while the pixel data stays in memory; verify
    against the training code).

    Args:
      df: DataFrame with ``fname`` and ``labels`` columns.
      conf: configuration object for the audio helpers.
      datapath: directory (Path) containing the audio files.

    Returns:
      ``(df_x, X)``: ``df_x`` has one row per chunk with columns ``fname``
      (the chunk name) and ``labels`` (copied from the source row); ``X`` is
      the list of chunk arrays in the same order.
    """
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append is gone from recent pandas and was quadratic.
    rows = []
    X = []
    for _, row in tqdm_notebook(df.iterrows()):
        fname = row['fname']
        mels = read_as_melspectrogram(conf, datapath / fname, trim_long_data=False)
        mels = mels_preprocessing(mels)
        # Chunks of one file are numbered from 1.
        for j, chunk in enumerate(split_long_data(conf, mels), start=1):
            chunk_name = str(j) + '-' + fname
            X.append(np.expand_dims(chunk, axis=-1))
            rows.append({'fname': chunk_name, 'labels': row['labels']})
            # Create (or truncate) the placeholder and close it right away.
            with open('/kaggle/working/trn_curated_X/' + chunk_name, "w+"):
                pass
    df_x = pd.DataFrame(rows, columns=['fname', 'labels'])
    return df_x, X

def convert_X_test(df, conf, datapath):
    """Convert unlabelled audio files to fixed-width spectrogram chunks.

    Every file listed in ``df.fname`` is converted to a preprocessed
    mel-spectrogram and cut with ``split_long_data``. For each chunk an empty
    placeholder file named ``"<j>-<fname>"`` is created under
    ``/kaggle/working/test_X/``.

    Args:
      df: DataFrame with an ``fname`` column.
      conf: configuration object for the audio helpers.
      datapath: directory (Path) containing the audio files.

    Returns:
      ``(df_x, X)``: ``df_x`` has one row per chunk with a single ``fname``
      column (the chunk name); ``X`` is the list of chunk arrays in the same
      order.
    """
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append is gone from recent pandas and was quadratic.
    rows = []
    X = []
    for _, row in tqdm_notebook(df.iterrows()):
        fname = row['fname']
        mels = read_as_melspectrogram(conf, datapath / fname, trim_long_data=False)
        mels = mels_preprocessing(mels)
        # Chunks of one file are numbered from 1.
        for j, chunk in enumerate(split_long_data(conf, mels), start=1):
            chunk_name = str(j) + '-' + fname
            X.append(np.expand_dims(chunk, axis=-1))
            rows.append({'fname': chunk_name})
            # Create (or truncate) the placeholder and close it right away.
            with open('/kaggle/working/test_X/' + chunk_name, "w+"):
                pass
    df_x = pd.DataFrame(rows, columns=['fname'])
    return df_x, X

### **<div id="III3">3. Configuration values</div>**

In [None]:
def _make_conf(duration):
    """Build the spectrogram configuration for clips of ``duration`` seconds."""
    conf = EasyDict()
    conf.sampling_rate = 44100
    conf.duration = duration
    # 347 * duration keeps samples / hop_length constant, so every
    # configuration ends up with 128 time steps in `dims`.
    conf.hop_length = 347 * conf.duration
    conf.fmin = 20
    conf.fmax = conf.sampling_rate // 2
    conf.n_mels = 128
    conf.n_fft = conf.n_mels * 20
    conf.samples = conf.sampling_rate * conf.duration
    conf.dims = (conf.n_mels, 1 + int(np.floor(conf.samples/conf.hop_length)), 1)
    return conf


# Approach LH: 2-second clips (highest time resolution).
conf_LH = _make_conf(2)

# Approach X: 8-second clips (longer sound, coarser time resolution).
conf_X = _make_conf(8)

### **<div id="III5">4. Converting audio to image</div>**

In [None]:
# Convert the test clips once per configuration.
# NOTE: convert_wav_to_image returns the DataFrame it was given, so after
# these calls testing_df_X and testing_df_LH both refer to testing_df.
TEST_AUDIO_DIR = Path('../input/freesound-audio-tagging-2019/test')
testing_df_X, X_test_X = convert_wav_to_image(
    testing_df, source=TEST_AUDIO_DIR, img_dest=Path('test'), conf=conf_X)
testing_df_LH, X_test_LH = convert_wav_to_image(
    testing_df, source=TEST_AUDIO_DIR, img_dest=Path('test'), conf=conf_LH)

print(f"Finished data conversion at {(time.time()-start_time)/3600} hours")
# Sanity check: number of images and of rows for each approach.
for converted in (X_test_X, testing_df_X, X_test_LH, testing_df_LH):
    print (len(converted))

## **<div id="IV">IV. Modeling</div>**

### **<div id="III">1. Utilities for modeling</div>**

In [None]:
# File names and in-memory images currently served by open_fat2019_image.
CUR_X_FILES, CUR_X = list(testing_df_X.fname.values), X_test_X


def open_fat2019_image(fn, convert_mode, after_open)->Image:
    """Replacement for fast.ai's ``open_image`` that serves images from memory.

    The base name of ``fn`` is looked up in the module-level ``CUR_X_FILES``
    and the array at the same position in ``CUR_X`` is used as pixel data.
    A square window whose side equals the image height is cropped at a random
    horizontal offset, which acts as an augmentation. ``convert_mode`` and
    ``after_open`` are accepted for signature compatibility and ignored.

    Returns:
      A fast.ai ``Image`` with float32 values scaled to [0, 1].
    """
    # Look up the in-memory array by file name.
    basename = fn.split('/')[-1]
    pixels = CUR_X[CUR_X_FILES.index(basename)]
    picture = PIL.Image.fromarray(pixels)
    # PIL reports size as (width, height); width is the time axis here.
    width, height = picture.size
    left = random.randint(0, width - height)
    window = picture.crop([left, 0, left+height, height])
    # Scale pixel values to [0, 1].
    return Image(pil2tensor(window, np.float32).div_(255))


# Make fast.ai use the in-memory loader instead of reading files from disk.
vision.data.open_image = open_fat2019_image

In [None]:
#Adding TTA
def _tta_only(learn:Learner, ds_type:DatasetType=DatasetType.Valid, num_pred:int=5) -> Iterator[List[Tensor]]:
    """Yield ``num_pred`` prediction tensors computed with training-time transforms.

    The transforms of the ``ds_type`` dataset are temporarily replaced by
    those of the training set and restored once the generator finishes or
    is closed.
    """
    loader = learn.dl(ds_type)
    dataset = loader.dataset
    saved_tfms = dataset.tfms
    train_tfms = list(learn.data.train_ds.tfms)
    try:
        progress = master_bar(range(num_pred))
        for _ in progress:
            dataset.tfms = train_tfms
            yield get_preds(learn.model, loader, pbar=progress)[0]
    finally:
        # Always put the original transforms back.
        dataset.tfms = saved_tfms

Learner.tta_only = _tta_only

def _TTA(learn:Learner, beta:float=0, ds_type:DatasetType=DatasetType.Valid, num_pred:int=5, with_loss:bool=False) -> Tensors:
    """Predict on ``ds_type`` with test-time augmentation.

    Plain predictions are blended with the mean of ``num_pred`` augmented
    predictions as ``preds*beta + avg_preds*(1-beta)``. With ``beta=None``
    the unblended ``(preds, avg_preds, y)`` triple is returned instead.
    When ``with_loss`` is True the loss of the blended predictions is
    appended to the result.
    """
    preds,y = learn.get_preds(ds_type)
    augmented = list(learn.tta_only(ds_type=ds_type, num_pred=num_pred))
    avg_preds = torch.stack(augmented).mean(0)
    if beta is None:
        return preds,avg_preds,y
    final_preds = preds*beta + avg_preds*(1-beta)
    if not with_loss:
        return final_preds, y
    with NoneReduceOnCPU(learn.loss_func) as lf:
        loss = lf(final_preds, y)
    return final_preds, y, loss

Learner.TTA = _TTA

### **<div id="V2">2. Modeling long chunks</div>**

In [None]:
# Point the in-memory image loader at the conf_X spectrograms. open_fat2019_image
# reads CUR_X_FILES / CUR_X at call time, so this rebinding must happen before
# any prediction is made in this cell.
CUR_X_FILES, CUR_X = list(testing_df_X.fname.values), X_test_X
test_X = ImageList.from_csv(Path('/kaggle/working'), Path('../input/freesound-audio-tagging-2019/sample_submission.csv'), folder=Path('../input/freesound-audio-tagging-2019/test'))

# Running sum of the per-fold predictions.
# NOTE(review): (3361, 80) is presumably (number of test clips, number of classes);
# it is hard-coded and must match the shape returned by learn_X.TTA - confirm.
predictions_X = torch.from_numpy(np.zeros((3361,80))).float()
print(predictions_X.shape)
num_folds = 5

# Load each fold's exported learner, predict on the test set with TTA and
# accumulate the predictions.
for i in range(num_folds):
    learn_X = load_learner(path = '.', file=Path('/kaggle/input/fat-curated-ensemble-mixup-tta/FAT2019_X_1D_mixup_TTA_Kfold_'+str(i)+'.pkl'), test=test_X)
    preds_X, _ = learn_X.TTA(ds_type=DatasetType.Test)
    print(preds_X.shape)
    predictions_X = predictions_X + preds_X
    
# Average over folds and store one column per class (class names are taken
# from the last learner loaded).
predictions_X = predictions_X/num_folds
testing_df_X[learn_X.data.classes] = predictions_X
testing_df_X.head()

In [None]:
# CUR_X_FILES, CUR_X = list(testing_df_X.fname.values), X_test_X

# test_X = ImageList.from_csv(Path('/kaggle/working'), Path('../input/freesound-audio-tagging-2019/sample_submission.csv'), folder=Path('../input/freesound-audio-tagging-2019/test'))
# learn_X = load_learner(path = '.', file=Path('/kaggle/input/fat-curated-ensemble-mixup-tta/FAT2019_X_1D_mixup_TTA.pkl'), test=test_X)
# # preds_X, _ = learn_X.get_preds(ds_type=DatasetType.Test)
# preds_X, _ = learn_X.TTA(ds_type=DatasetType.Test)

# testing_df_X[learn_X.data.classes] = preds_X
# testing_df_X.head()

# #Removing fake files
# #shutil.rmtree('trn_curated_X')

# print(preds_X.type)
# print(preds_X.shape)
# testing_df_X.head()

### **<div id="V3">3. Modeling beginning of sound</div>**

In [None]:
# Point the in-memory image loader at the conf_LH spectrograms. open_fat2019_image
# reads CUR_X_FILES / CUR_X at call time, so this rebinding must happen before
# any prediction is made in this cell.
CUR_X_FILES, CUR_X = list(testing_df_LH.fname.values), X_test_LH
test_LH = ImageList.from_csv(Path('/kaggle/working'), Path('../input/freesound-audio-tagging-2019/sample_submission.csv'), folder=Path('../input/freesound-audio-tagging-2019/test'))

# Running sum of the per-fold predictions.
# NOTE(review): (3361, 80) is presumably (number of test clips, number of classes);
# it is hard-coded and must match the shape returned by learn_LH.TTA - confirm.
predictions_LH = torch.from_numpy(np.zeros((3361,80))).float()
print(predictions_LH.shape)
num_folds = 5

# Load each fold's exported learner, predict on the test set with TTA and
# accumulate the predictions.
for i in range(num_folds):
    learn_LH = load_learner(path = '.', file=Path('/kaggle/input/fat-curated-ensemble-mixup-tta/FAT2019_LH_1D_mixup_TTA_Kfold_'+str(i)+'.pkl'), test=test_LH)
    preds_LH, _ = learn_LH.TTA(ds_type=DatasetType.Test)
    print(preds_LH.shape)
    predictions_LH = predictions_LH + preds_LH
    
# Average over folds and store one column per class (class names are taken
# from the last learner loaded).
predictions_LH = predictions_LH/num_folds
testing_df_LH[learn_LH.data.classes] = predictions_LH
testing_df_LH.head()

In [None]:
# CUR_X_FILES, CUR_X = list(testing_df_LH.fname.values), X_test_LH

# test_LH = ImageList.from_csv(Path('/kaggle/working'), Path('../input/freesound-audio-tagging-2019/sample_submission.csv'), folder=Path('../input/freesound-audio-tagging-2019/test'))
# learn_LH = load_learner(path = '.', file=Path('/kaggle/input/fat-curated-ensemble-mixup-tta/FAT2019_LH_1D_mixup_TTA.pkl'), test=test_LH)
# # preds_LH, _ = learn_LH.get_preds(ds_type=DatasetType.Test)
# preds_LH, _ = learn_LH.TTA(ds_type=DatasetType.Test)

# testing_df_LH[learn_LH.data.classes] = preds_LH

# print(preds_LH.type)
# print(preds_LH.shape)
# testing_df_LH.head()

## **<div id="V">V. Ensembling</div>**

In [None]:
# Alternatives that were tried on the single-model predictions (kept for reference):
# #Taking the geometric average of preds_X and preds_LH
# preds = (preds_X * preds_LH)**(.5)
# #Taking the arithmetic average (because we are using TTA) of preds_X and preds_LH
# preds = (preds_X + preds_LH) / 2
# Final ensemble: arithmetic mean of the fold-averaged predictions of both approaches.
preds = (predictions_X + predictions_LH) / 2

# Write one column per class (names from learn_X) and save the submission file.
testing_df[learn_X.data.classes] = preds
testing_df.to_csv('submission.csv', index=False)
testing_df.head(20)