## Fit FastAI models


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
print(os.listdir("."))

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai import *
from fastai.vision import *
from fastai.vision.data import *
import librosa
import librosa.display
import torch

In [None]:
# Check whether a CUDA device is visible to PyTorch.
torch.cuda.is_available()

In [None]:
# Batch size; passed as databunch(bs=bs) when the DataBunch objects are built.
bs = 150

In [None]:
# Root of the competition input data and the directory used for model files.
path = Path("../input/freesound-audio-tagging-2019")
model_path = Path(".")

# Audio folders.
train_path = path/'train_curated'
train2_path = path/'train_noisy'
test_path = path/'test'

# Metadata tables: curated labels, noisy labels, and the submission template
# (which also serves as the list of test clips).
train_df = pd.read_csv(path/'train_curated.csv')
train2_df = pd.read_csv(path/'train_noisy.csv')
test_df = pd.read_csv(path/'sample_submission.csv')

In [None]:
# Curated clips excluded from training (presumably corrupted or empty
# recordings - TODO confirm the reason for each file).
EXCLUDED_TRAIN_FILES = ['f76181c4.wav', '77b925c2.wav', '6a1f682a.wav', 'c7db12aa.wav', '7752cc8a.wav']
# `reindex()` without arguments leaves the index unchanged; `reset_index(drop=True)`
# gives the filtered frame a contiguous 0..n-1 index.
train_df = train_df[~train_df.fname.isin(EXCLUDED_TRAIN_FILES)].reset_index(drop=True)

# lwlrap metric definition

In [None]:
# from official code https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8#scrollTo=cRCaCIb9oguU
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample, best score first.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    # Builtin int/bool/float are used as dtypes: the np.int / np.bool / np.float
    # aliases no longer exist in current NumPy releases.
    class_rankings = np.zeros(num_classes, dtype=int)
    class_rankings[retrieved_classes] = np.arange(num_classes)
    # Which rank positions hold a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    pos_ranks = class_rankings[pos_class_indices]
    precision_at_hits = retrieved_cumulative_hits[pos_ranks] / (1 + pos_ranks.astype(float))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Compute per-class label-weighted label-ranking average precision.

    Arguments:
      truth: np.array of (num_samples, num_classes); entries > 0 mark the
        classes present in each sample.
      scores: np.array of (num_samples, num_classes) with the real-valued
        classifier score for each class of each sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,) with the lwlrap of each
        class.
      weight_per_class: np.array of (num_classes,) with each class's share of
        all positive labels. The overall lwlrap is
        np.sum(per_class_lwlrap * weight_per_class).
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape
    # One precision per (sample, class); only the true classes of each sample
    # are filled in, everything else stays 0.
    precision_matrix = np.zeros((num_samples, num_classes))
    for row, (sample_scores, sample_truth) in enumerate(zip(scores, truth)):
        hit_classes, hit_precisions = _one_sample_positive_class_precisions(
            sample_scores, sample_truth)
        precision_matrix[row, hit_classes] = hit_precisions
    class_counts = np.sum(truth > 0, axis=0)
    weight_per_class = class_counts / float(np.sum(class_counts))
    # Column mean over the labelled samples only; max(1, .) avoids dividing by
    # zero for classes that never occur.
    per_class_lwlrap = (np.sum(precision_matrix, axis=0) /
                        np.maximum(1, class_counts))
    return per_class_lwlrap, weight_per_class


# Wrapper for fast.ai library
def lwlrap(scores, truth, **kwargs):
    """Metric adapter: return the overall lwlrap as a one-element tensor.

    `scores` and `truth` are converted with `to_np` and handed to
    `calculate_per_class_lwlrap` (which takes truth first); extra keyword
    arguments are accepted and ignored.
    """
    per_class, class_weight = calculate_per_class_lwlrap(to_np(truth), to_np(scores))
    overall = (per_class * class_weight).sum()
    return torch.Tensor([overall])


## Functions to read audio and convert to image

In [None]:
def read_audio(conf, pathname, trim_long_data):
    """Load a clip, trim silence, and pad it up to `conf.samples` samples.

    Args:
      conf: settings object providing `sampling_rate` and `samples`.
      pathname: path of the audio file to load.
      trim_long_data: when true, clips longer than `conf.samples` are cut to
        their first `conf.samples` samples; otherwise they are returned whole.

    Returns:
      1-D sample array with at least `conf.samples` entries.
    """
    y, _sr = librosa.load(pathname, sr=conf.sampling_rate)
    # Trimming an empty signal raises, so only trim non-empty audio
    # (librosa default top_db=60).
    if len(y) > 0:
        y, _ = librosa.effects.trim(y)

    n_target = conf.samples
    if len(y) > n_target:
        return y[:n_target] if trim_long_data else y

    # Short (or exactly sized) clip: zero-pad on both sides, with the extra
    # sample of an odd gap going to the right.
    n_missing = n_target - len(y)
    left = n_missing // 2
    return np.pad(y, (left, n_missing - left), 'constant')


In [None]:
def audio_to_melspectrogram(conf, audio):
    """Convert a waveform to a log-scaled (dB) mel spectrogram.

    Args:
      conf: settings object providing `sampling_rate`, `n_mels`, `hop_length`,
        `n_fft`, `fmin` and `fmax`.
      audio: 1-D array of audio samples.

    Returns:
      float32 array holding the mel power spectrogram converted to dB.
    """
    # Pass the signal as `y=`: recent librosa releases accept the audio only as
    # a keyword argument, and older releases accept the keyword as well.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    return spectrogram.astype(np.float32)

def show_melspectrogram(conf, mels, title='Log-frequency power spectrogram'):
    """Draw `mels` with time / mel axes, a dB colorbar and `title`, then show it."""
    axis_opts = dict(x_axis='time', y_axis='mel')
    signal_opts = dict(sr=conf.sampling_rate, hop_length=conf.hop_length,
                       fmin=conf.fmin, fmax=conf.fmax)
    librosa.display.specshow(mels, **axis_opts, **signal_opts)
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()

def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    """Read an audio file and return its log-mel spectrogram.

    Args:
      conf: preprocessing settings forwarded to `read_audio` and
        `audio_to_melspectrogram`.
      pathname: path of the audio file.
      trim_long_data: forwarded to `read_audio`.
      debug_display: when true, also show an audio player and the spectrogram
        plot in the notebook.

    Returns:
      The spectrogram produced by `audio_to_melspectrogram`.
    """
    x = read_audio(conf, pathname, trim_long_data)
    mels = audio_to_melspectrogram(conf, x)
    if debug_display:
        # Imported here because no cell of the notebook imports IPython
        # explicitly; only the debug path needs it.
        import IPython.display
        IPython.display.display(IPython.display.Audio(x, rate=conf.sampling_rate))
        show_melspectrogram(conf, mels)
    return mels


class conf:
    """Preprocessing settings for producing 224-pixel-high spectrogram images."""
    # Audio framing
    sampling_rate = 44100
    duration = 4
    samples = int(sampling_rate * duration)   # samples in one `duration`-second clip

    # Mel spectrogram geometry
    n_mels = 224
    n_fft = n_mels * 20
    hop_length = n_fft // 6   # alternative once considered: 347*duration (128 time steps)

    # Frequency band covered by the mel filter bank
    fmin = 20
    fmax = sampling_rate // 2

In [None]:

def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is stacked three times along a new last axis, standardized, and
    rescaled to the 0..255 range.

    Args:
      X: numeric array (e.g. a spectrogram).
      mean, std: standardization statistics; computed from the data when None.
      norm_max, norm_min: clipping bounds applied to the standardized values;
        the data's own max / min are used when None.
      eps: added to `std` to avoid division by zero, and used as the threshold
        below which the image is considered constant.

    Returns:
      uint8 array of shape X.shape + (3,); all zeros when the standardized
      data is (almost) constant.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Explicit `is None` checks keep legitimate zero values
    # (e.g. mean=0 or norm_min=0) instead of silently replacing them.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min
    if (_max - _min) > eps:
        # Clip to [norm_min, norm_max] and scale to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        return V.astype(np.uint8)
    # Constant input: just zero
    return np.zeros_like(Xstd, dtype=np.uint8)

def convert_wav_to_image(df, source):
    """Convert every clip listed in `df.fname` to a 3-channel spectrogram image.

    Args:
      df: DataFrame with an `fname` column of audio file names.
      source: directory (path object supporting `/`) that contains the files.

    Returns:
      List of uint8 image arrays, in the same order as the rows of `df`.
    """
    images = []
    # Iterate the fname column directly rather than df.loc[i] per row: this
    # does not depend on index labels being unique and avoids building a row
    # Series for every clip.
    for fname in progress_bar(list(df.fname)):
        mels = read_as_melspectrogram(conf, source/str(fname), trim_long_data=False)
        images.append(mono_to_color(mels))
    return images


In [None]:

# Precompute one 3-channel spectrogram image per curated training clip (kept in memory).
X_train = convert_wav_to_image(train_df, source=train_path)

In [None]:
#data_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
# At this stage only the curated training rows are served to fastai.
data_df=train_df

In [None]:
# Hack: make fastai open the precomputed spectrogram images from memory
# rather than from a file. CUR_X_FILES[i] is the file name of image CUR_X[i].
CUR_X_FILES, CUR_X = list(data_df.fname.values), (X_train)
def open_fat2019_image(fn, convert_mode, after_open)->Image:
    """Return a random square crop of the in-memory spectrogram for `fn`.

    `fn` is the item path handed over by fastai; only its last component is
    used to look the image up in CUR_X_FILES. `convert_mode` and `after_open`
    are accepted for signature compatibility and ignored.
    """
    # open
    idx = CUR_X_FILES.index(fn.split('/')[-1])
    x = PIL.Image.fromarray(CUR_X[idx])
    # crop a base_dim x base_dim window at a random time offset
    time_dim, base_dim = x.size
    # Clamp the upper bound so an image narrower than it is tall does not make
    # randint raise; the crop then starts at 0 (PIL presumably pads the part
    # outside the image with zeros - verify if such clips can occur).
    crop_x = random.randint(0, max(0, time_dim - base_dim))
    x = x.crop([crop_x, 0, crop_x+base_dim, base_dim])
    # standardize to [0, 1] floats
    return Image(pil2tensor(x, np.float32).div_(255))

vision.data.open_image = open_fat2019_image

In [None]:
class MixUpCallback(LearnerCallback):
    "Callback that creates the mixed-up input and target."
    def __init__(self, learn:Learner, alpha:float=0.4, stack_x:bool=False, stack_y:bool=True):
        # alpha: parameter of the Beta(alpha, alpha) distribution the mixing weights are drawn from.
        # stack_x: return [input, shuffled input, weights] instead of the blended input.
        # stack_y: stack both targets and the weights (handled by MixUpLoss) instead of blending targets.
        super().__init__(learn)
        self.alpha,self.stack_x,self.stack_y = alpha,stack_x,stack_y

    def on_train_begin(self, **kwargs):
        "Wrap the learner's loss in `MixUpLoss` when targets are stacked."
        if self.stack_y: self.learn.loss_func = MixUpLoss(self.learn.loss_func)

    def on_batch_begin(self, last_input, last_target, train, **kwargs):
        "Applies mixup to `last_input` and `last_target` if `train`."
        if not train: return
        batch_size = last_target.size(0)
        # One weight per sample; taking max(lambda, 1-lambda) keeps the weight
        # of the un-shuffled sample at >= 0.5.
        lambd = np.random.beta(self.alpha, self.alpha, batch_size)
        lambd = np.concatenate([lambd[:,None], 1-lambd[:,None]], 1).max(1)
        lambd = last_input.new(lambd)
        shuffle = torch.randperm(batch_size).to(last_input.device)
        x1, y1 = last_input[shuffle], last_target[shuffle]
        if self.stack_x:
            new_input = [last_input, x1, lambd]
        else:
            # Broadcast the per-sample weight over every non-batch dimension,
            # whatever the rank of the input (4-D image batches included).
            bshape = [lambd.size(0)] + [1] * (last_input.dim() - 1)
            new_input = (last_input * lambd.view(bshape) + x1 * (1-lambd).view(bshape))
        if self.stack_y:
            new_target = torch.cat([last_target[:,None].float(), y1[:,None].float(), lambd[:,None].float()], 1)
        else:
            # 2-D targets (one column per class) need the weight as a column vector.
            if len(last_target.shape) == 2:
                lambd = lambd.unsqueeze(1).float()
            new_target = last_target.float() * lambd + y1.float() * (1-lambd)
        return {'last_input': new_input, 'last_target': new_target}

    def on_train_end(self, **kwargs):
        "Restore the original loss function if it was wrapped."
        if self.stack_y: self.learn.loss_func = self.learn.loss_func.get_old()
        

class MixUpLoss(nn.Module):
    "Adapt the loss function `crit` to go with mixup."

    def __init__(self, crit, reduction='mean'):
        # crit: the loss to adapt; either an object with a `reduction`
        #   attribute or a callable accepting a `reduction` keyword.
        # reduction: 'mean', 'sum', or anything else for per-sample losses.
        super().__init__()
        if hasattr(crit, 'reduction'):
            # Switch the criterion to per-sample losses in place and remember
            # its old setting so get_old() can restore it.
            self.crit = crit
            self.old_red = crit.reduction
            setattr(self.crit, 'reduction', 'none')
        else:
            self.crit = partial(crit, reduction='none')
            self.old_crit = crit
        self.reduction = reduction

    def forward(self, output, target):
        "Blend the losses of both stacked targets, then apply `self.reduction`."
        if len(target.size()) == 2:
            # target columns: [first label, second label, mixing weight]
            loss1, loss2 = self.crit(output,target[:,0].long()), self.crit(output,target[:,1].long())
            # Keep per-sample values here; reducing is left to the branch
            # below so that 'sum' and 'none' are honoured as well.
            d = loss1 * target[:,2] + loss2 * (1-target[:,2])
        else:  d = self.crit(output, target)
        if self.reduction == 'mean': return d.mean()
        elif self.reduction == 'sum':            return d.sum()
        return d

    def get_old(self):
        "Return the original criterion, restoring its `reduction` if it was changed."
        if hasattr(self, 'old_crit'):  return self.old_crit
        elif hasattr(self, 'old_red'):
            setattr(self.crit, 'reduction', self.old_red)
            return self.crit

def mixup(learn:Learner, alpha:float=0.4, stack_x:bool=False, stack_y:bool=True) -> Learner:
    "Register a `MixUpCallback` (https://arxiv.org/abs/1710.09412) on `learn` and return `learn`."
    callback_factory = partial(MixUpCallback, alpha=alpha, stack_x=stack_x, stack_y=stack_y)
    learn.callback_fns.append(callback_factory)
    return learn
# Expose the helper as a Learner method so it can be chained after cnn_learner(...).
Learner.mixup = mixup

In [None]:


# Augmentation: lighting changes only (flip, rotation, zoom and warp are disabled).
tfms = get_transforms(do_flip=False, max_rotate=0, max_lighting=0.2, max_zoom=0, max_warp=0)

# Items come from the curated metadata; 20% is held out at random for
# validation and labels are split on ','.
train = ImageList.from_df(path=".", df=train_df, folder='train_curated')
src = train.split_by_rand_pct(0.2).label_from_df(label_delim=',')

data = src.transform(tfms, size=224).databunch(bs=bs)

In [None]:
# Build a fresh learner. The commented-out variant below uses fp16 (per the
# original note: fp16 for local models, fp32 for Kaggle models).
# Drop the reference to any previous learner and release cached memory first.
learn=None
gc.collect()
torch.cuda.empty_cache()
np.random.seed(42)
#learn = cnn_learner(data=data, base_arch=models.resnet18, pretrained=False, metrics=[lwlrap]).to_fp16()
# ResNet-34 without pretrained weights, lwlrap as metric; mixup with
# stack_y=False blends the targets directly instead of wrapping the loss.
learn = cnn_learner(data=data, base_arch=models.resnet34, pretrained=False,metrics=[lwlrap]).mixup(stack_y=False)

# Fitting code (when running on Kaggle Kernel)

In [None]:
# Stage 1: freeze, then 10 one-cycle epochs.
learn.freeze()
learn.fit_one_cycle(10,max_lr=slice(1e-6,1e-2))

In [None]:
# Checkpoint after stage 1.
learn.save('res34-kaggle-1')


In [None]:
# Unfreeze all layer groups and plot the learning-rate finder curve.
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()

In [None]:
# Stage 2: 25 one-cycle epochs on the unfrozen model.
learn.fit_one_cycle(25,max_lr=slice(5e-4,1e-2))

In [None]:
# Checkpoint after stage 2.
learn.save('res34-kaggle-2')

In [None]:
learn.recorder.plot_losses()

In [None]:
# Stage 3: 30 more one-cycle epochs with a lower minimum learning rate.
learn.fit_one_cycle(30,max_lr=slice(1e-5,1e-2))

In [None]:
learn.recorder.plot_losses()

In [None]:
# Checkpoint after stage 3; reloaded by the misclassification analysis and
# by the second training phase.
learn.save('res34-kaggle-3')

# Train some more with noisy data based on misclassified classes of first model

## Identify misclassified classes

In [None]:
# Reload the stage-3 checkpoint and collect the classes involved in the
# worst misclassified samples into `poor_classes`.
learn.load('res34-kaggle-3')
#learn.to_fp32()
interp = ClassificationInterpretation.from_learner(learn)
losses,idxs = interp.top_losses()
# Upper bound on the number of mismatched samples inspected below.
samples=20
# figsize is only referenced by the commented-out plotting code further down.
figsize=(8,8)
    
#losses, idxs = self.top_losses(self.data.c)
# l_dim distinguishes a 1-D loss vector from a per-class (2-D) loss matrix.
l_dim = len(losses.size())
# NOTE(review): this repeats the identical top_losses() call made above.
if l_dim == 1: losses, idxs = interp.top_losses()
infolist, ordlosses_idxs, mismatches_idxs, mismatches, losses_mismatches, mismatchescontainer = [],[],[],[],[],[]
truthlabels = np.asarray(interp.y_true, dtype=int)
# (index, class name) pairs.
classes_ids = [k for k in enumerate(interp.data.classes)]
predclass = np.asarray(interp.pred_class)
# NOTE(review): `losses` comes from top_losses(), which presumably returns the
# losses sorted by magnitude together with `idxs`; below it is indexed by the
# sample position `i` and `idxs` is never used - confirm that losses[i] really
# is the loss of sample i.
for i,pred in enumerate(predclass):
    # Indices of the classes that are true for sample i.
    where_truth = np.nonzero((truthlabels[i]>0))[0]
    # A mismatch: the predicted class is none of the true classes.
    mismatch = np.all(pred!=where_truth)
    if mismatch:
        mismatches_idxs.append(i)
        if l_dim > 1 : losses_mismatches.append((losses[i][pred], i))
        else: losses_mismatches.append((losses[i], i))
    # Per-sample record: (index, predicted class, true classes, loss, rounded probability of the prediction, mismatch flag).
    if l_dim > 1: infotup = (i, pred, where_truth, losses[i][pred], np.round(interp.probs[i], decimals=3)[pred], mismatch)
    else: infotup = (i, pred, where_truth, losses[i], np.round(interp.probs[i], decimals=3)[pred], mismatch)
    infolist.append(infotup)
ds = interp.data.dl(interp.ds_type).dataset
mismatches = ds[mismatches_idxs]
# Mismatched sample indices ordered by decreasing loss.
ordlosses = sorted(losses_mismatches, key = lambda x: x[0], reverse=True)
for w in ordlosses: ordlosses_idxs.append(w[1])
mismatches_ordered_byloss = ds[ordlosses_idxs]
print(f'{str(len(mismatches))} misclassified samples over {str(len(interp.data.valid_ds))} samples in the validation set.')
samples = min(samples, len(mismatches))
# Images of the mismatches (not used further in this cell).
for ima in range(len(mismatches_ordered_byloss)):
    mismatchescontainer.append(mismatches_ordered_byloss[ima][0])
    
# Names of every true class and of the predicted class for the `samples`
# highest-loss mismatches.
poor_classes=[]
# p_i is not used below.
p_i=0
for sampleN in range(samples):
    actualclasses = ''
    for clas in infolist[ordlosses_idxs[sampleN]][2]:
        actualclasses = f'{actualclasses} -- {str(classes_ids[clas][1])}'
        poor_classes= poor_classes + [str(classes_ids[clas][1])]
        
    
    poor_classes= poor_classes + [str(classes_ids[infolist[ordlosses_idxs[sampleN]][1]][1])]
    
    
    #imag = mismatches_ordered_byloss[sampleN][0]
    #imag = show_image(imag, figsize=figsize)
    #imag.set_title(f"""Predicted: {classes_ids[infolist[ordlosses_idxs[sampleN]][1]][1]} \nActual: {actualclasses}\nLoss: {infolist[ordlosses_idxs[sampleN]][3]}\nProbability: {infolist[ordlosses_idxs[sampleN]][4]}""",
    #                loc='left')
    # With the image code above commented out, nothing is drawn in this cell before this call.
    plt.show()

    print(f"""Predicted: {classes_ids[infolist[ordlosses_idxs[sampleN]][1]][1]} Actual: {actualclasses}Loss: {infolist[ordlosses_idxs[sampleN]][3]} Probability: {infolist[ordlosses_idxs[sampleN]][4]}""")

# De-duplicate; set() does not preserve the order in which classes were collected.
poor_classes=list(set(poor_classes))
    
    
print(poor_classes)

In [None]:
# Keep only the noisy clips whose `labels` field holds a single label
# (i.e. contains no comma).
df = train2_df.copy()
df['singled'] = ~df.labels.str.contains(',')
singles_df = df.loc[df.singled]

In [None]:
# For each poorly classified label, take at most MAX_NOISY_PER_CLASS
# single-label noisy clips, in the order of `poor_classes`.
MAX_NOISY_PER_CLASS = 150
selected = [singles_df[singles_df.labels == p][:MAX_NOISY_PER_CLASS] for p in poor_classes]
# Concatenate once instead of growing a frame inside the loop. The leading
# empty slice keeps the column dtypes of singles_df and still yields an empty
# frame when `poor_classes` is empty.
train_addition_df = pd.concat([singles_df.iloc[:0], *selected])

In [None]:
# Spectrogram images for the selected noisy clips, read from the noisy audio folder.
X_train2 = convert_wav_to_image(train_addition_df, source=train2_path)


In [None]:
# Curated rows followed by the selected noisy rows, with a fresh 0..n-1 index;
# the row order matches X_train + X_train2.
data_df = pd.concat([train_df, train_addition_df], ignore_index=True, sort=False)


In [None]:
# Hack: make fastai open the precomputed spectrogram images from memory
# rather than from a file. CUR_X_FILES[i] is the file name of image CUR_X[i];
# the pool now holds the curated images followed by the selected noisy ones.
CUR_X_FILES, CUR_X = list(data_df.fname.values), (X_train + X_train2)

def open_fat2019_image(fn, convert_mode, after_open)->Image:
    """Return a random square crop of the in-memory spectrogram for `fn`.

    `fn` is the item path handed over by fastai; only its last component is
    used to look the image up in CUR_X_FILES. `convert_mode` and `after_open`
    are accepted for signature compatibility and ignored.
    """
    # open
    idx = CUR_X_FILES.index(fn.split('/')[-1])
    x = PIL.Image.fromarray(CUR_X[idx])
    # crop a base_dim x base_dim window at a random time offset
    time_dim, base_dim = x.size
    # Clamp the upper bound so an image narrower than it is tall does not make
    # randint raise; the crop then starts at 0 (PIL presumably pads the part
    # outside the image with zeros - verify if such clips can occur).
    crop_x = random.randint(0, max(0, time_dim - base_dim))
    x = x.crop([crop_x, 0, crop_x+base_dim, base_dim])
    # standardize to [0, 1] floats
    return Image(pil2tensor(x, np.float32).div_(255))

vision.data.open_image = open_fat2019_image

## Fit the additional data

In [None]:
# Augmentation: lighting changes only (flip, rotation, zoom and warp are disabled).
tfms = get_transforms(do_flip=False, max_rotate=0, max_lighting=0.2, max_zoom=0, max_warp=0)

# Items now come from the combined curated + noisy table; 20% is held out at
# random for validation and labels are split on ','.
train = ImageList.from_df(path=".", df=data_df, folder='train_curated')
src = train.split_by_rand_pct(0.2).label_from_df(label_delim=',')

data = src.transform(tfms, size=224).databunch(bs=bs)

In [None]:
# Rebuild the learner on the enlarged DataBunch: drop the old learner and
# release cached memory first.
learn=None
gc.collect()
torch.cuda.empty_cache()
np.random.seed(42)
#learn = cnn_learner(data=data, base_arch=models.resnet18, pretrained=False, metrics=[lwlrap]).to_fp16()
# Same architecture and mixup settings as in the first phase, so the stage-3
# checkpoint can be loaded into it.
learn = cnn_learner(data=data, base_arch=models.resnet34, pretrained=False,metrics=[lwlrap]).mixup(stack_y=False)

In [None]:
# Start from the weights saved at the end of the curated-only training.
learn.load('res34-kaggle-3')

In [None]:
# Stage 4: 25 one-cycle epochs on curated + selected noisy data.
learn.fit_one_cycle(25,max_lr=slice(1e-4,1e-2))

In [None]:
# Checkpoint after stage 4.
learn.save('res34-kaggle-4')

In [None]:
learn.recorder.plot_losses()

In [None]:
# Stage 5: 25 more one-cycle epochs with a higher minimum learning rate.
learn.fit_one_cycle(25,max_lr=slice(5e-4,1e-2))

In [None]:
# Checkpoint after stage 5.
learn.save('res34-kaggle-5')

In [None]:
learn.recorder.plot_losses()

In [None]:
# Print the configured metrics and evaluate on the validation set.
print(str(learn.metrics))
learn.validate()

# Prediction code

In [None]:
# Drop the names of the training image lists before building the test images.
# NOTE(review): CUR_X was built from X_train + X_train2 and still references
# the same arrays until it is reassigned, so this may not free memory yet.
del X_train, X_train2

# One spectrogram image per clip listed in the sample submission.
X_test = convert_wav_to_image(test_df, source=test_path)

# Register the test items (file names from sample_submission.csv) on the DataBunch.
test = ImageList.from_csv(path, "sample_submission.csv", folder='test')
data.add_test(test)

In [None]:
X_test = convert_wav_to_image(test_df, source=test, img_dest=img_test)

In [None]:
data_df = test_df

In [None]:
# Hack: make fastai open the precomputed spectrogram images from memory
# rather than from a file. CUR_X_FILES[i] is the file name of image CUR_X[i];
# the pool now holds the test images.
CUR_X_FILES, CUR_X = list(data_df.fname.values), (X_test)

def open_fat2019_image(fn, convert_mode, after_open)->Image:
    """Return a random square crop of the in-memory spectrogram for `fn`.

    `fn` is the item path handed over by fastai; only its last component is
    used to look the image up in CUR_X_FILES. `convert_mode` and `after_open`
    are accepted for signature compatibility and ignored.
    """
    # open
    idx = CUR_X_FILES.index(fn.split('/')[-1])
    x = PIL.Image.fromarray(CUR_X[idx])
    # crop a base_dim x base_dim window at a random time offset
    time_dim, base_dim = x.size
    # Clamp the upper bound so an image narrower than it is tall does not make
    # randint raise; the crop then starts at 0 (PIL presumably pads the part
    # outside the image with zeros - verify if such clips can occur).
    crop_x = random.randint(0, max(0, time_dim - base_dim))
    x = x.crop([crop_x, 0, crop_x+base_dim, base_dim])
    # standardize to [0, 1] floats
    return Image(pil2tensor(x, np.float32).div_(255))

vision.data.open_image = open_fat2019_image

In [None]:
def _tta_only(learn:Learner, ds_type:DatasetType=DatasetType.Valid, num_pred:int=5) -> Iterator[List[Tensor]]:
    "Yield `num_pred` sets of predictions on `ds_type`, each computed with the training-set transforms."
    loader = learn.dl(ds_type)
    dataset = loader.dataset
    saved_tfms = dataset.tfms
    train_tfms = list(learn.data.train_ds.tfms)
    try:
        progress = master_bar(range(num_pred))
        for _ in progress:
            # Re-applied on every pass, right before predicting.
            dataset.tfms = train_tfms
            yield get_preds(learn.model, loader, pbar=progress)[0]
    finally:
        # Always put the dataset's own transforms back.
        dataset.tfms = saved_tfms

Learner.tta_only = _tta_only

def _TTA(learn:Learner, beta:float=0, ds_type:DatasetType=DatasetType.Valid, num_pred:int=5, with_loss:bool=False) -> Tensors:
    "Predict on `ds_type`, blending plain predictions (weight `beta`) with the mean of `num_pred` augmented passes."
    preds,y = learn.get_preds(ds_type)
    augmented = list(learn.tta_only(ds_type=ds_type, num_pred=num_pred))
    avg_preds = torch.stack(augmented).mean(0)
    # beta=None: hand back both prediction sets un-blended.
    if beta is None: return preds,avg_preds,y
    final_preds = preds*beta + avg_preds*(1-beta)
    if not with_loss: return final_preds, y
    with NoneReduceOnCPU(learn.loss_func) as lf: loss = lf(final_preds, y)
    return final_preds, y, loss

Learner.TTA = _TTA

In [None]:

from shutil import copyfile

#os.mkdir("models")

#copyfile("../input/res18stage4/res18-stage-4.pth", "models/res18-stage4.pth")
#learn.load('res18-stage4')

In [None]:
#learn.load('res18-kaggle-2')
#learn.to_fp32()
# NOTE(review): called without arguments, so ds_type falls back to the default
# of _TTA (DatasetType.Valid), and the returned predictions are not stored.
# Confirm whether the submission was meant to use TTA on the test set.
learn.TTA()

In [None]:
# Plain (non-TTA) predictions on the test set, written into one column per
# class of the submission frame and saved as submission.csv.
preds, _ = learn.get_preds(ds_type=DatasetType.Test)
test_df[learn.data.classes] = preds
test_df.to_csv('submission.csv', index=False)
test_df.head()