- Single model
- No external data

## Model
ResNet50 + Attention Block

## Data Augmentation
- mixup
- spec augmentation
- pitch & tempo augmentation
- melspectrogram cropping 

## Secondary labels
I only used secondary labels from data whose duration is < 40.

## Balancing data
For classes with many samples, downsample to 80 samples per class (removing the long-duration samples).

For classes with few samples, upsample to 60 samples per class (splitting the long-duration samples).

## Inference
segmentwise_output.

Predicting 5 second periods with 2.5 seconds intervals. (Half overlapped each period)

In [None]:
# !pip install ../input/tensorflow-io0150/tensorflow_io-0.15.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
!pip install -q tensorflow_io
# tfp stable version won't work on TPU. https://github.com/tensorflow/tensorflow/issues/40584
# !pip install -q tfp-nightly

In [None]:
# # There seems to be a problems in Kaggle TPU environment...
# # hack to avoid errors
# !pip install cloud-tpu-client

# import tensorflow as tf
# from cloud_tpu_client import Client
# print(tf.__version__)

# Client().configure_tpu_version(tf.__version__, restart_type='ifNeeded')

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow_addons as tfa
import tensorflow_io as tfio
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
# from keras.utils import to_categorical
from tensorflow.keras.utils import to_categorical
from kaggle_datasets import KaggleDatasets
import os
import numpy as np
import pandas as pd
import yaml
from IPython.display import display, Audio
from sklearn.model_selection import StratifiedKFold

print(tf.__version__)

In [None]:
# Detect hardware, return appropriate distribution strategy
# Try to resolve a TPU cluster; a ValueError from the resolver is treated as
# "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None

if not tpu:
    # Fall back to TensorFlow's default distribution strategy.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

# Let tf.data choose parallelism / prefetch sizes.
AUTO = tf.data.experimental.AUTOTUNE

# Global batch size: 32 samples for every replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 32

In [None]:
# Seed Python hashing, NumPy and TensorFlow with the same value.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
SEED = 777
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

USE_FOLD=2  # fold id used as the validation split
N_SPLIT=7  # number of StratifiedKFold splits
SAMPLE_RATE=32000  # passed as `rate` to the mel scale; presumably samples per second
NUM_CLASSES = 264  # width of the label vectors and of the model outputs
PERIOD = 5  # crop length factor: split_audio keeps SAMPLE_RATE * PERIOD samples

## Load data

In [None]:
# Training metadata table. Later cells read its sci_name, ebird_code, background,
# secondary_labels, duration, filename and resampled_filename columns.
train = pd.read_csv('../input/birdsong-resampled-train-audio-00/train_mod.csv')

## Process secondary labels

In [None]:
# Maps each ebird code (species identifier string) to its integer class index.
BIRD_CODE = {
    'aldfly': 0, 'ameavo': 1, 'amebit': 2, 'amecro': 3, 'amegfi': 4,
    'amekes': 5, 'amepip': 6, 'amered': 7, 'amerob': 8, 'amewig': 9,
    'amewoo': 10, 'amtspa': 11, 'annhum': 12, 'astfly': 13, 'baisan': 14,
    'baleag': 15, 'balori': 16, 'banswa': 17, 'barswa': 18, 'bawwar': 19,
    'belkin1': 20, 'belspa2': 21, 'bewwre': 22, 'bkbcuc': 23, 'bkbmag1': 24,
    'bkbwar': 25, 'bkcchi': 26, 'bkchum': 27, 'bkhgro': 28, 'bkpwar': 29,
    'bktspa': 30, 'blkpho': 31, 'blugrb1': 32, 'blujay': 33, 'bnhcow': 34,
    'boboli': 35, 'bongul': 36, 'brdowl': 37, 'brebla': 38, 'brespa': 39,
    'brncre': 40, 'brnthr': 41, 'brthum': 42, 'brwhaw': 43, 'btbwar': 44,
    'btnwar': 45, 'btywar': 46, 'buffle': 47, 'buggna': 48, 'buhvir': 49,
    'bulori': 50, 'bushti': 51, 'buwtea': 52, 'buwwar': 53, 'cacwre': 54,
    'calgul': 55, 'calqua': 56, 'camwar': 57, 'cangoo': 58, 'canwar': 59,
    'canwre': 60, 'carwre': 61, 'casfin': 62, 'caster1': 63, 'casvir': 64,
    'cedwax': 65, 'chispa': 66, 'chiswi': 67, 'chswar': 68, 'chukar': 69,
    'clanut': 70, 'cliswa': 71, 'comgol': 72, 'comgra': 73, 'comloo': 74,
    'commer': 75, 'comnig': 76, 'comrav': 77, 'comred': 78, 'comter': 79,
    'comyel': 80, 'coohaw': 81, 'coshum': 82, 'cowscj1': 83, 'daejun': 84,
    'doccor': 85, 'dowwoo': 86, 'dusfly': 87, 'eargre': 88, 'easblu': 89,
    'easkin': 90, 'easmea': 91, 'easpho': 92, 'eastow': 93, 'eawpew': 94,
    'eucdov': 95, 'eursta': 96, 'evegro': 97, 'fiespa': 98, 'fiscro': 99,
    'foxspa': 100, 'gadwal': 101, 'gcrfin': 102, 'gnttow': 103, 'gnwtea': 104,
    'gockin': 105, 'gocspa': 106, 'goleag': 107, 'grbher3': 108, 'grcfly': 109,
    'greegr': 110, 'greroa': 111, 'greyel': 112, 'grhowl': 113, 'grnher': 114,
    'grtgra': 115, 'grycat': 116, 'gryfly': 117, 'haiwoo': 118, 'hamfly': 119,
    'hergul': 120, 'herthr': 121, 'hoomer': 122, 'hoowar': 123, 'horgre': 124,
    'horlar': 125, 'houfin': 126, 'houspa': 127, 'houwre': 128, 'indbun': 129,
    'juntit1': 130, 'killde': 131, 'labwoo': 132, 'larspa': 133, 'lazbun': 134,
    'leabit': 135, 'leafly': 136, 'leasan': 137, 'lecthr': 138, 'lesgol': 139,
    'lesnig': 140, 'lesyel': 141, 'lewwoo': 142, 'linspa': 143, 'lobcur': 144,
    'lobdow': 145, 'logshr': 146, 'lotduc': 147, 'louwat': 148, 'macwar': 149,
    'magwar': 150, 'mallar3': 151, 'marwre': 152, 'merlin': 153, 'moublu': 154,
    'mouchi': 155, 'moudov': 156, 'norcar': 157, 'norfli': 158, 'norhar2': 159,
    'normoc': 160, 'norpar': 161, 'norpin': 162, 'norsho': 163, 'norwat': 164,
    'nrwswa': 165, 'nutwoo': 166, 'olsfly': 167, 'orcwar': 168, 'osprey': 169,
    'ovenbi1': 170, 'palwar': 171, 'pasfly': 172, 'pecsan': 173, 'perfal': 174,
    'phaino': 175, 'pibgre': 176, 'pilwoo': 177, 'pingro': 178, 'pinjay': 179,
    'pinsis': 180, 'pinwar': 181, 'plsvir': 182, 'prawar': 183, 'purfin': 184,
    'pygnut': 185, 'rebmer': 186, 'rebnut': 187, 'rebsap': 188, 'rebwoo': 189,
    'redcro': 190, 'redhea': 191, 'reevir1': 192, 'renpha': 193, 'reshaw': 194,
    'rethaw': 195, 'rewbla': 196, 'ribgul': 197, 'rinduc': 198, 'robgro': 199,
    'rocpig': 200, 'rocwre': 201, 'rthhum': 202, 'ruckin': 203, 'rudduc': 204,
    'rufgro': 205, 'rufhum': 206, 'rusbla': 207, 'sagspa1': 208, 'sagthr': 209,
    'savspa': 210, 'saypho': 211, 'scatan': 212, 'scoori': 213, 'semplo': 214,
    'semsan': 215, 'sheowl': 216, 'shshaw': 217, 'snobun': 218, 'snogoo': 219,
    'solsan': 220, 'sonspa': 221, 'sora': 222, 'sposan': 223, 'spotow': 224,
    'stejay': 225, 'swahaw': 226, 'swaspa': 227, 'swathr': 228, 'treswa': 229,
    'truswa': 230, 'tuftit': 231, 'tunswa': 232, 'veery': 233, 'vesspa': 234,
    'vigswa': 235, 'warvir': 236, 'wesblu': 237, 'wesgre': 238, 'weskin': 239,
    'wesmea': 240, 'wessan': 241, 'westan': 242, 'wewpew': 243, 'whbnut': 244,
    'whcspa': 245, 'whfibi': 246, 'whtspa': 247, 'whtswi': 248, 'wilfly': 249,
    'wilsni1': 250, 'wiltur': 251, 'winwre3': 252, 'wlswar': 253, 'wooduc': 254,
    'wooscj2': 255, 'woothr': 256, 'y00475': 257, 'yebfly': 258, 'yebsap': 259,
    'yehbla': 260, 'yelwar': 261, 'yerwar': 262, 'yetvir': 263
}

# Reverse lookup: integer class index -> ebird code.
INV_BIRD_CODE = {v: k for k, v in BIRD_CODE.items()}

In [None]:
# https://www.kaggle.com/hawkey/birdsong-multi-label-dataset
import re
# Lookup table: scientific name -> ebird code, built from the metadata rows.
SPS_CODE = dict(zip(train['sci_name'], train['ebird_code']))
# background
def get_sps_from_bg(s):
    """Extract the parenthesised names from a ``background`` entry.

    Args:
        s: Raw value of the ``background`` column. Non-string values (for
            example NaN for a missing entry) are converted with ``str`` first.

    Returns:
        A list with the text found inside each ``(...)`` pair, in order of
        appearance; an empty list when there are no parentheses.
    """
    if not isinstance(s, str):
        s = str(s)
    # Raw string: "\(" inside a plain literal is an invalid escape sequence.
    return re.findall(r'\((.*?)\)', s)

# secondary
def get_sps_from_sec(l):
    """Extract the leading names from a stringified ``secondary_labels`` list.

    Args:
        l: String of the form ``"['Name a_Other a', 'Name b_Other b']"``.
            The first and last characters (the brackets) are dropped, the
            rest is split on commas, and each item loses its first and last
            character (the quotes).

    Returns:
        For every item, the part before the first underscore. An input of
        ``"[]"`` yields ``['']``; callers are expected to skip empty names.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    items = re.split(r',\s*', l[1:-1])
    return [item[1:-1].split('_', 1)[0] for item in items]

def map_sps_to_code(l):
    """Translate scientific names into ebird codes via ``SPS_CODE``.

    Empty names are skipped, and so are names missing from ``SPS_CODE``
    (species that are not classification targets have no ebird code).

    Args:
        l: Iterable of scientific-name strings.

    Returns:
        List of ebird codes for the names that could be mapped, in input order.
    """
    return [SPS_CODE[name] for name in l if name != '' and name in SPS_CODE]

# ebird codes of the species named in the `background` column.
train['labels_bg'] = train['background'].apply(
    lambda raw: map_sps_to_code(get_sps_from_bg(raw))
)
# ebird codes of the species named in the `secondary_labels` column.
train['labels_sec'] = train['secondary_labels'].apply(
    lambda raw: map_sps_to_code(get_sps_from_sec(raw))
)
# Union of both sources, de-duplicated per row.
train['secondary_codes'] = [
    list(set(sec_codes + bg_codes))
    for sec_codes, bg_codes in zip(train['labels_sec'], train['labels_bg'])
]

In [None]:
# Filter the secondary codes: they are not considered reliable for long recordings.
# NOTE(review): the notes at the top of this file say "duration < 40" while the
# limit used here is 30 — confirm which value is intended.
filter_limit = 30

# Rows with duration < filter_limit keep their secondary codes, converted to
# BIRD_CODE class indices (codes without an index are dropped). Every other row,
# including rows with a missing duration, gets a real empty list. The previous
# version substituted the string '[]' and only ended up with an empty list
# because its characters are not keys of BIRD_CODE.
train['secondary_codes'] = pd.Series(
    [
        [BIRD_CODE[code] for code in codes if code in BIRD_CODE]
        if duration < filter_limit else []
        for codes, duration in zip(train['secondary_codes'], train['duration'])
    ],
    index=train.index,
)

## Dealing with Imbalanced Data

In [None]:
# Downsample large classes to at most 80 samples per species (dropping the longest samples).
def downsampling(df, max_per_class=80):
    """Keep only the shortest recordings of every species.

    Args:
        df: DataFrame with ``ebird_code`` and ``duration`` columns.
        max_per_class: Maximum number of rows kept per ``ebird_code``.

    Returns:
        A new DataFrame with the groups in sorted ``ebird_code`` order, each
        sorted by ascending ``duration`` and cut to ``max_per_class`` rows.
        An input without rows yields an empty frame with the same columns.
    """
    # One concat over all groups instead of growing a frame inside the loop.
    per_class = [
        group.sort_values('duration')[:max_per_class]
        for _, group in df.groupby('ebird_code')
    ]
    if not per_class:
        # pd.concat rejects an empty list; keep the column layout instead.
        return df.iloc[:0].copy()
    return pd.concat(per_class)

# Upsample small classes towards 60 samples per species by repeating their long ("duration > 40") samples.
def upsampling(df, target=60, min_duration=40, n_rounds=5):
    """Duplicate long recordings of under-represented species.

    In each round, every species with fewer than ``target`` rows (counted as
    non-null ``filename`` values at the start of the round) gets copies of its
    longest rows with ``duration > min_duration`` appended, at most as many
    as it is missing. Copies made in earlier rounds can be copied again, so
    a species with few long recordings still grows over the rounds. Species
    without any long recording are left unchanged.

    Args:
        df: DataFrame with ``ebird_code``, ``duration`` and ``filename`` columns.
        target: Desired minimum number of rows per species.
        min_duration: Only rows with a strictly larger duration are duplicated.
        n_rounds: Number of duplication passes.

    Returns:
        DataFrame containing the original rows followed by the appended copies.
    """
    for _ in range(n_rounds):
        counts = df.groupby('ebird_code')['filename'].count()
        for ebird_code, count in counts.items():
            missing = target - count
            if missing <= 0:
                continue
            is_candidate = (df['ebird_code'] == ebird_code) & (df['duration'] > min_duration)
            longest = df[is_candidate].sort_values('duration', ascending=False)[:missing]
            if not longest.empty:
                df = pd.concat([df, longest])
    return df

In [None]:
import re
# birdsong-resampled-train-audio
# GCS locations of the five resampled-audio datasets, resolved in order 00..04.
GCS_PATH0, GCS_PATH1, GCS_PATH2, GCS_PATH3, GCS_PATH4 = (
    KaggleDatasets().get_gcs_path(dataset_name)
    for dataset_name in (
        'birdsong-resampled-train-audio-00',
        'birdsong-resampled-train-audio-01',
        'birdsong-resampled-train-audio-02',
        'birdsong-resampled-train-audio-03',
        'birdsong-resampled-train-audio-04',
    )
)


# Work on a copy of the metadata; file_path is filled in below.
train_all = train.copy()
train_all['file_path'] = ''

def f(x):
    """Return the GCS directory prefix for an ebird code.

    The dataset is chosen from the first character of the code:
    a-b -> GCS_PATH0, c-f -> GCS_PATH1, g-m -> GCS_PATH2, n-r -> GCS_PATH3,
    s-y -> GCS_PATH4.

    Args:
        x: ebird code string.

    Returns:
        ``<dataset path>/<ebird code>/``.

    Raises:
        Exception: if the first character falls outside all of the ranges.
    """
    initial = x[0]
    if 'a' <= initial <= 'b':
        dataset_path = GCS_PATH0
    elif 'c' <= initial <= 'f':
        dataset_path = GCS_PATH1
    elif 'g' <= initial <= 'm':
        dataset_path = GCS_PATH2
    elif 'n' <= initial <= 'r':
        dataset_path = GCS_PATH3
    elif 's' <= initial <= 'y':
        dataset_path = GCS_PATH4
    else:
        raise Exception('no matching code')
    return dataset_path + '/' + x + '/'
# Full object path = per-species directory prefix + resampled file name.
train_all['file_path'] = train_all['ebird_code'].map(f) +  train_all['resampled_filename']

# Cap the large classes before the folds are drawn.
train_all = downsampling(train_all)

skf = StratifiedKFold(n_splits=N_SPLIT, shuffle=True, random_state=1)

# Assign every row to the fold in which it is a validation sample. The column is
# located by name so the assignment does not depend on "fold" being the last column.
train_all["fold"] = -1
fold_col = train_all.columns.get_loc("fold")
for fold_id, (train_index, val_index) in enumerate(skf.split(train_all, train_all["ebird_code"])):
    train_all.iloc[val_index, fold_col] = fold_id

# Upsampling runs after the fold assignment, so every duplicated row keeps the
# fold of the row it was copied from.
print(len(train_all))
train_all = upsampling(train_all)
print(len(train_all))

use_fold = USE_FOLD
# .copy(): these frames get columns reassigned later; independent copies avoid
# pandas' chained-assignment (SettingWithCopy) warning.
train_file_list = train_all.query("fold != @use_fold")[["file_path", "ebird_code", "secondary_codes"]].copy()
val_file_list = train_all.query("fold == @use_fold")[["file_path", "ebird_code", "secondary_codes"]].copy()

print("[fold {}] train: {}, val: {}".format(use_fold, len(train_file_list), len(val_file_list)))

In [None]:
# Replace the primary ebird code string by its class index.
train_file_list['ebird_code'] = train_file_list['ebird_code'].map(BIRD_CODE)
val_file_list['ebird_code'] = val_file_list['ebird_code'].map(BIRD_CODE)

# Add the primary class to each row's secondary classes (duplicates removed),
# so `secondary_codes` lists every class present in the recording.
train_file_list['secondary_codes'] = [
    list(set(secondary + [primary]))
    for secondary, primary in zip(train_file_list['secondary_codes'], train_file_list['ebird_code'])
]
val_file_list['secondary_codes'] = [
    list(set(secondary + [primary]))
    for secondary, primary in zip(val_file_list['secondary_codes'], val_file_list['ebird_code'])
]

In [None]:
# hoge  = train_file_list.secondary_codes.apply(lambda x: sum(list(map(lambda x: to_categorical(x, NUM_CLASSES), x)))).tolist()

In [None]:
import functools
import tensorflow_probability as tfp

def mixup(batch_size, alpha, images, labels):
    """Apply Mixup to a batch with probability 0.5.

    [1] Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz
      Mixup: Beyond Empirical Risk Minimization.
      ICLR'18, https://arxiv.org/abs/1710.09412

    Arguments:
      batch_size: The input batch size for images and labels.
      alpha: Float parameter of the Beta(alpha, alpha) distribution the
        mixing weights are sampled from.
      images: A batch of images of shape [batch_size, height, width, channels].
      labels: A batch of labels of shape [batch_size, num_classes].

    Returns:
      A tuple ``(images, (zeros, labels))``. ``zeros`` has shape
      [batch_size, 1, NUM_CLASSES] — presumably a placeholder target for a
      model output that carries no loss. Images and labels are either the
      unchanged inputs or their mixed versions.
    """
    # Half of the batches pass through untouched.
    if  tf.random.uniform([]) < 0.5:
        return images, (tf.zeros([batch_size, 1, NUM_CLASSES]), labels)

    # One weight per sample, folded into [0.5, 1] so the original sample dominates.
    lam = tfp.distributions.Beta(alpha, alpha).sample([batch_size, 1])
    lam = tf.maximum(lam, 1. - lam)
    lam_for_images = tf.reshape(lam, [batch_size, 1, 1, 1])

    # Each sample is blended with its counterpart from the reversed batch.
    mixed_images = images * lam_for_images + images[::-1] * (1. - lam_for_images)
    mixed_labels = labels * lam + labels[::-1] * (1. - lam)
    return  mixed_images, (tf.zeros([batch_size, 1, NUM_CLASSES]), mixed_labels)

def augment_freq_time_mask(spectrogram,
                           frequency_masking_para=15,
                           time_masking_para=15,
                           frequency_mask_num=1,
                           time_mask_num=1):
    """Zero out random bands of a spectrogram along its last two axes.

    The input is indexed as [1, time, frequency] (axis 1 is used as the time
    axis, axis 2 as the frequency axis). Tensors do not support item
    assignment, so each band is removed by multiplying with a 0/1 mask.

    Args:
        spectrogram: Rank-3 tensor with a leading axis of size 1.
        frequency_masking_para: Exclusive upper bound of a frequency band's width.
        time_masking_para: Exclusive upper bound of a time band's width.
        frequency_mask_num: Number of frequency bands to mask.
        time_mask_num: Number of time bands to mask.

    Returns:
        The masked spectrogram, same shape as the input.
    """
    n_time = tf.shape(spectrogram)[1]
    n_freq = tf.shape(spectrogram)[2]

    # Frequency bands: ones before the band, zeros inside it, ones after it.
    for _ in range(frequency_mask_num):
        width = tf.random.uniform(shape=(), minval=0, maxval=frequency_masking_para, dtype=tf.dtypes.int32)
        start = tf.random.uniform(shape=(), minval=0, maxval=n_freq - width, dtype=tf.dtypes.int32)
        keep_before = tf.ones(shape=[1, n_time, start])
        drop_band = tf.zeros(shape=[1, n_time, width])
        keep_after = tf.ones(shape=[1, n_time, n_freq - (start + width)])
        band_mask = tf.concat([keep_before, drop_band, keep_after], axis=2)
        spectrogram = spectrogram * band_mask

    # Time bands: same construction along axis 1.
    for _ in range(time_mask_num):
        width = tf.random.uniform(shape=(), minval=0, maxval=time_masking_para, dtype=tf.dtypes.int32)
        start = tf.random.uniform(shape=(), minval=0, maxval=n_time - width, dtype=tf.dtypes.int32)
        keep_before = tf.ones(shape=[1, start, n_freq])
        drop_band = tf.zeros(shape=[1, width, n_freq])
        keep_after = tf.ones(shape=[1, n_time - (start + width), n_freq])
        band_mask = tf.concat([keep_before, drop_band, keep_after], axis=1)
        spectrogram = spectrogram * band_mask

    return spectrogram

def augment_pitch_and_tempo(spectrogram,
                            max_tempo=1.2,
                            max_pitch=1.1,
                            min_pitch=0.95):
    """Randomly rescale a spectrogram along its last two axes.

    The input is indexed as [1, time, frequency] (axis 1 is used as the time
    axis and axis 2 as the frequency axis, going by the variable names).

    Args:
        spectrogram: Rank-3 tensor with a leading axis of size 1.
        max_tempo: Upper bound of the tempo factor, drawn uniformly from 1 upwards.
        max_pitch: Upper bound of the uniformly drawn pitch factor.
        min_pitch: Lower bound of the uniformly drawn pitch factor.

    Returns:
        Rank-3 tensor whose axis 1 has length ``int(time / tempo)`` and whose
        axis 2 has the original length.
    """
    original_shape = tf.shape(spectrogram)
    choosen_pitch = tf.random.uniform(shape=(), minval=min_pitch, maxval=max_pitch)
    choosen_tempo = tf.random.uniform(shape=(), minval=1, maxval=max_tempo)
    # Axis 2 is scaled by the pitch factor; axis 1 is divided by the tempo factor.
    new_freq_size = tf.cast(tf.cast(original_shape[2], tf.float32)*choosen_pitch, tf.int32)
    new_time_size = tf.cast(tf.cast(original_shape[1], tf.float32)/(choosen_tempo), tf.int32)
    # tf.image ops work on [batch, height, width, channels], hence the extra channel axis.
    spectrogram_aug = tf.image.resize(tf.expand_dims(spectrogram, -1), [new_time_size, new_freq_size])
    # If axis 2 grew, cut it back to its original length ...
    spectrogram_aug = tf.image.crop_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0, target_height=tf.shape(spectrogram_aug)[1], target_width=tf.minimum(original_shape[2], new_freq_size))
    # ... and if it shrank (pitch factor < 1), pad it back up to the original length.
    spectrogram_aug = tf.cond(choosen_pitch < 1,
                              lambda: tf.image.pad_to_bounding_box(spectrogram_aug, offset_height=0, offset_width=0,
                                                                   target_height=tf.shape(spectrogram_aug)[1], target_width=original_shape[2]),
                              lambda: spectrogram_aug)
    # Drop the channel axis again.
    return spectrogram_aug[:, :, :, 0]

def augment_speed_up(spectrogram,
                     speed_std=0.1):
    """Shorten axis 1 of a [1, time, frequency] spectrogram by a random factor.

    The speed factor is ``1 + |N(0, speed_std)|``; taking the absolute value
    means the result is never longer than the input.

    Args:
        spectrogram: Rank-3 tensor with a leading axis of size 1.
        speed_std: Standard deviation of the normal draw.

    Returns:
        Rank-3 tensor resized to ``int(time / speed)`` along axis 1; axis 2
        keeps its length.
    """
    input_shape = tf.shape(spectrogram)
    speed = 1 + tf.math.abs(tf.random.normal(shape=(), stddev=speed_std))
    target_freq = tf.cast(tf.cast(input_shape[2], tf.float32), tf.int32)
    target_time = tf.cast(tf.cast(input_shape[1], tf.float32) / speed, tf.int32)
    # tf.image.resize needs a channel axis; add it, resize, then drop it again.
    with_channel = tf.expand_dims(spectrogram, -1)
    resized = tf.image.resize(with_channel, [target_time, target_freq])
    return resized[:, :, :, 0]

def augment_dropout(spectrogram,
                    keep_prob=0.9):
    """Apply ``tf.nn.dropout`` to the spectrogram with rate ``1 - keep_prob``."""
    drop_rate = 1-keep_prob
    return tf.nn.dropout(spectrogram, rate=drop_rate)

def transform_spectrogram(spectrogram,y):
    """Randomly augment one spectrogram; the label ``y`` passes through unchanged.

    Each augmentation is gated by its own uniform draw from [0, 1):
      * frequency/time masking when the draw is above 0.7,
      * dropout when the draw is below 0.3, with the draw itself as drop rate,
      * pitch/tempo rescaling when the draw is below 0.2.
    """
    # The augmentation helpers expect a leading axis of size 1.
    spectrogram = tf.expand_dims(spectrogram, axis=0)

    if  tf.random.uniform([]) > 0.7:
        spectrogram = augment_freq_time_mask(spectrogram)

    drop_draw = tf.random.uniform([])
    if  drop_draw < 0.3:
        spectrogram = augment_dropout(spectrogram, keep_prob=1-drop_draw)

    if  tf.random.uniform([]) < 0.2:
        spectrogram = augment_pitch_and_tempo(spectrogram)

    return tf.squeeze(spectrogram, axis=0), y

In [None]:
def paths_and_labels_to_dataset(audio_paths, labels, sec_labels):
    """Construct a dataset of (decoded audio, multi-hot label vector) pairs.

    Args:
        audio_paths: List of audio file paths, decoded with ``path_to_audio``.
        labels: Primary class indices. Unused: the targets are built from
            ``sec_labels`` only. Kept so existing callers keep working.
        sec_labels: pandas Series holding, per file, the list of class indices
            present in it (every listed class gets a weight of 1).

    Returns:
        ``tf.data.Dataset`` yielding ``(audio, label)`` where ``label`` is a
        float32 vector of length ``NUM_CLASSES``.
    """
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(path_to_audio, num_parallel_calls=AUTO)

    # Build the multi-hot matrix directly. Each occurrence of a class adds 1,
    # like the previous sum of one-hot vectors, and a row with no classes
    # stays all-zero instead of collapsing to a scalar.
    multi_hot = np.zeros((len(sec_labels), NUM_CLASSES), dtype=np.float32)
    for row, codes in enumerate(sec_labels):
        for code in codes:
            multi_hot[row, int(code)] += 1.0
    label_ds = tf.data.Dataset.from_tensor_slices(multi_hot)

    return tf.data.Dataset.zip((audio_ds, label_ds))

def path_to_audio(path):
    """Read a WAV file and return its samples decoded to a single channel."""
    raw_bytes = tf.io.read_file(path)
    # The decoded sample rate is not needed here.
    waveform, _ = tf.audio.decode_wav(raw_bytes, 1)
    return waveform

def split_audio(audio, y):
    """Bring a clip to exactly ``SAMPLE_RATE * PERIOD`` samples.

    Clips that are not longer than the target are zero-padded at the end;
    longer clips are cropped at a random start offset. ``audio`` is indexed
    as [samples, 1]; the label ``y`` passes through unchanged.
    """
    n_samples = len(audio)
    target_len = SAMPLE_RATE * PERIOD
    if n_samples <= target_len:
        silence = tf.expand_dims(tf.zeros([target_len - n_samples]), 1)
        audio = tf.concat([audio, silence], axis=0)
    else:
        offset = tf.random.uniform([1], maxval = n_samples - target_len, dtype = tf.int32)[0]
        audio = audio[offset:offset + target_len]
    return audio, y

# def split_audio_val(audio, y):
#     len_audio = len(audio)
#     effective_length = SAMPLE_RATE * PERIOD 
#     if len_audio <= effective_length:
#         audio = tf.concat([audio, tf.expand_dims(tf.zeros([effective_length - len_audio]), 1)], axis=0)
#     else:
#         # split down the middle
#         # start = tf.math.floordiv(len_audio - effective_length, 2)
#         start = 0
#         audio = audio[start: start + effective_length]
#     return audio, y

def audio_to_fft(audio, y):
    """Return the magnitude of the positive-frequency half of the clip's FFT.

    ``tf.signal.fft`` transforms the innermost dimension, so the trailing
    channel axis is squeezed away before the transform and added back after.
    The label ``y`` passes through unchanged.
    """
    samples = tf.squeeze(audio, axis=-1)
    as_complex = tf.cast(tf.complex(real=samples, imag=tf.zeros_like(samples)), tf.complex64)
    spectrum = tf.expand_dims(tf.signal.fft(as_complex), axis=-1)

    # The first half of the FFT holds the positive frequencies.
    n_positive = samples.shape[0] // 2
    return tf.math.abs(spectrum[:n_positive, :]), y

def audio_to_mel_spectrogram(audio, y):
    """Convert a clip to a dB-scaled mel spectrogram using ``tfio.audio``.

    Steps: spectrogram (nfft 2048, window 2048, stride 320), then a
    500-band mel scale between 50 and 14000 at ``SAMPLE_RATE``, then dB
    scaling with ``top_db=80``. The label ``y`` passes through unchanged.
    """
    waveform = tf.squeeze(audio, axis=-1)
    linear = tfio.audio.spectrogram(
        waveform, nfft=2048, window=2048, stride=320)
    mel = tfio.audio.melscale(
        linear, rate=SAMPLE_RATE, mels=500, fmin=50, fmax=14000)
    return tfio.audio.dbscale(mel, top_db=80), y

def mono_to_color(audio, y):
    """Turn a 2-D spectrogram into a 224x224 3-channel model input.

    Despite its name, ``audio`` is the spectrogram produced earlier in the
    pipeline. It is copied into three channels, standardised, rescaled to
    [0, 255] (or set to zeros when its value range is at most ``eps``),
    resized to 224x224 and passed through ResNet50's ``preprocess_input``.
    The label ``y`` passes through unchanged.
    """
    eps=1e-6
    img_size = 224

    # Replicate the single channel three times along a new last axis.
    stacked = tf.stack([audio, audio, audio], axis=-1)

    # Standardise to zero mean and (approximately) unit standard deviation.
    centered = stacked - tf.math.reduce_mean(stacked)
    standardized = centered / (tf.math.reduce_std(centered) + eps)

    lowest = tf.math.reduce_min(standardized)
    highest = tf.math.reduce_max(standardized)
    if (highest - lowest) > eps:
        # Min-max normalise to [0, 255].
        scaled = 255 * (standardized - lowest) / (highest - lowest)
    else:
        # Constant input: nothing to scale.
        scaled = tf.zeros_like(standardized)

    resized = tf.image.resize(scaled, (img_size,img_size))
    return preprocess_input(resized), y


In [None]:
# for hoge in paths_and_labels_to_dataset(train_file_list['file_path'].values.tolist(), train_file_list['ebird_code'].values.tolist(), train_file_list['secondary_codes']).take(1):
#     print(hoge)

In [None]:
# Training pipeline, built one stage at a time.
train_ds = paths_and_labels_to_dataset(
    train_file_list['file_path'].values.tolist(),
    train_file_list['ebird_code'].values.tolist(),
    train_file_list['secondary_codes'],
)
# Cache the decoded audio; the random crop and augmentations below are
# re-drawn on every pass.
train_ds = train_ds.cache()
train_ds = train_ds.map(split_audio, num_parallel_calls=AUTO)
train_ds = train_ds.map(audio_to_mel_spectrogram, num_parallel_calls=AUTO)
train_ds = train_ds.map(transform_spectrogram, num_parallel_calls=AUTO)
train_ds = train_ds.map(mono_to_color, num_parallel_calls=AUTO)
train_ds = train_ds.shuffle(1024)
train_ds = train_ds.repeat()
train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True)
# mixup is disabled here; to enable it, map
# functools.partial(mixup, BATCH_SIZE, 0.2) over the batches at this point.
train_ds = train_ds.prefetch(AUTO)


# Validation pipeline: no augmentation, and the cache sits after preprocessing,
# so the crop chosen on the first pass is reused afterwards.
valid_ds = paths_and_labels_to_dataset(
    val_file_list['file_path'].values.tolist(),
    val_file_list['ebird_code'].values.tolist(),
    val_file_list['secondary_codes'],
)
valid_ds = valid_ds.map(split_audio, num_parallel_calls=AUTO)
valid_ds = valid_ds.map(audio_to_mel_spectrogram, num_parallel_calls=AUTO)
valid_ds = valid_ds.map(mono_to_color, num_parallel_calls=AUTO)
valid_ds = valid_ds.cache()
valid_ds = valid_ds.batch(BATCH_SIZE, drop_remainder=True)
valid_ds = valid_ds.prefetch(AUTO)

In [None]:
# https://www.kaggle.com/itsuki9180/birdcall-using-tpu-train
from keras.callbacks import Callback

class F1Callback(Callback):
    """Print and record a validation F1 score at the end of every epoch.

    The score is computed from the logged validation counts of the
    ``clipwise_output`` head (true, possible and predicted positives).
    """

    def __init__(self):
        # Initialise the Keras base class so its own attributes are set up.
        super().__init__()
        # One F1 value per finished epoch.
        self.f1s = []

    def on_epoch_end(self, epoch, logs=None):
        """Compute F1 from the epoch's logs and append it to ``self.f1s``.

        Args:
            epoch: Index of the finished epoch (unused).
            logs: Metric dict supplied by Keras. It must contain the three
                ``val_clipwise_output_*`` counts read below.

        Raises:
            KeyError: if one of the expected validation metrics is missing.
        """
        logs = logs or {}
        # Small epsilon keeps the divisions defined when a count is zero.
        eps = np.finfo(np.float32).eps
        true_pos = logs["val_clipwise_output_true_positives"]
        recall = true_pos / (logs["val_clipwise_output_possible_positives"] + eps)
        precision = true_pos / (logs["val_clipwise_output_predicted_positives"] + eps)
        f1 = 2*precision*recall / (precision+recall+eps)
        print("f1_val (from log) =", f1)
        self.f1s.append(f1)

def true_positives(y_true, y_pred):
    """Count entries where the rounded product of target and prediction is 1."""
    agreement = K.clip(y_true * y_pred, 0, 1)
    return K.sum(K.round(agreement))

def possible_positives(y_true, y_pred):
    """Count the positive entries of the targets; ``y_pred`` is ignored."""
    targets = K.clip(y_true, 0, 1)
    return K.sum(K.round(targets))

def predicted_positives(y_true, y_pred):
    """Count the predictions that round to 1; ``y_true`` is ignored."""
    predictions = K.clip(y_pred, 0, 1)
    return K.sum(K.round(predictions))

def F1(y_true, y_pred):
    """F1 score as 2*TP / ((TP + FN) + (TP + FP)), with an epsilon in the denominator."""
    actual = possible_positives(y_true, y_pred)      # TP + FN
    predicted = predicted_positives(y_true, y_pred)  # TP + FP
    hits = true_positives(y_true, y_pred)            # TP
    denominator = actual + predicted + K.epsilon()
    return  (hits * 2) / denominator

# Callback instance handed to model.fit; it collects the per-epoch validation F1 in f1cb.f1s.
f1cb = F1Callback()


In [None]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    # ResNet50 backbone with ImageNet weights and no classification head.
    base_model = ResNet50(include_top=False, weights='imagenet')
    # for inference
#     base_model = ResNet50(include_top=False, weights=None)
    x = base_model.output

    # https://gist.github.com/hengck23/6ebe1c75f8b3bcc953c0599ac76bad45
    # Average over axis 2 of the feature map, leaving [batch, steps, channels];
    # presumably axis 2 is the frequency axis — TODO confirm.
    x = tf.reduce_mean(x, axis=2)
    # Smooth along the remaining axis with summed max and average pooling.
    x1 = L.MaxPooling1D(pool_size=3, strides=1, padding='same')(x)
    x2 = L.AveragePooling1D(pool_size=3, strides=1, padding='same')(x)
    x = x1 + x2 
    x = L.Dropout(0.5)(x)
    x = L.Dense(1024, activation='relu')(x)
    x = L.Dropout(0.5)(x)

    # Attention weights per class: squashed into (-10, 10) with tanh, then
    # normalised with a softmax over the steps axis.
    norm_att = L.Conv1D(filters=NUM_CLASSES, kernel_size=1, padding='same')(x)
    norm_att = tf.keras.activations.tanh(norm_att/10)*10
    norm_att = tf.keras.activations.softmax(norm_att, axis=-2)
    # Per-step class probabilities.
    segmentwise_output = L.Conv1D(filters=NUM_CLASSES, kernel_size=1, padding='same', activation='sigmoid', name='segmentwise_output')(x)
    # Clip-level probability = attention-weighted sum of the per-step probabilities.
    clipwise_output = tf.math.reduce_sum(norm_att * segmentwise_output, axis=1)
    # Identity layer, only there to give the output (and its metrics) a name.
    clipwise_output = L.Lambda(lambda x: x, name="clipwise_output")(clipwise_output)
    output = [segmentwise_output, clipwise_output]

    model = tf.keras.models.Model(inputs=base_model.input, outputs=output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer= tfa.optimizers.RectifiedAdam(
        learning_rate=1e-3,
        total_steps=10000,
        warmup_proportion=0.1,
        min_lr=1e-8,
    )

    # Only the clipwise output is trained and measured; the segmentwise output
    # has no loss and zero loss weight.
    model.compile(optimizer, loss=[None, "binary_crossentropy"],loss_weights=[0,1], metrics=[[],["accuracy", F1,true_positives,possible_positives,predicted_positives]])
    model.summary()

In [None]:
%%time
# Whole batches per pass over the training and validation file lists.
STEPS_PER_EPOCH = len(train_file_list) // BATCH_SIZE
VALIDATION_STEP = len(val_file_list) // BATCH_SIZE

# Halve the learning rate when the training loss stops improving.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    verbose=1,
    factor=0.5,
    patience=2,
    min_delta=0.0001,
    cooldown=1,
    min_lr=1e-7,
)
# Stop after 10 epochs without improvement of the training loss.
es = tf.keras.callbacks.EarlyStopping(monitor='loss', verbose=1, patience=10)
# Keep the weights of the epoch with the best validation loss.
sv = tf.keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

history = model.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=100,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALIDATION_STEP,
    callbacks=[reduce_lr, es, sv, f1cb],
)

# Inference

In [None]:
# import tensorflow as tf
# import tensorflow.keras.backend as K
# import tensorflow.keras.layers as L
# import tensorflow_io as tfio
# from keras.applications.resnet50 import ResNet50
# from keras.applications.resnet50 import preprocess_input, decode_predictions
# import os
# import numpy as np
# import pandas as pd
# import yaml
# import librosa
# from pathlib import Path

# TEST_PATH = Path('../input/birdsong-recognition') if os.path.exists('../input/birdsong-recognition/test_audio') else Path('../input/birdcall-check')

# TEST_AUDIO_PATH = TEST_PATH/'test_audio'
# test = pd.read_csv(TEST_PATH/'test.csv')

# model.load_weights('../input/birdcallweight/model.h5')

In [None]:
# def audio_to_mel_spectrogram(audio):
#     spectrogram = tfio.experimental.audio.spectrogram(
#         audio, nfft=2048, window=2048, stride=320)
#     mel_spectrogram = tfio.experimental.audio.melscale(
#         spectrogram, rate=SAMPLE_RATE, mels=500, fmin=50, fmax=14000)
    
#     dbscale_mel_spectrogram = tfio.experimental.audio.dbscale(
#         mel_spectrogram, top_db=80)
#     return dbscale_mel_spectrogram

# def mono_to_color(audio):
#     eps=1e-6
#     X = audio
#     X = tf.stack([X, X, X], axis=-1)

#     mean = tf.math.reduce_mean(X)
#     X = X - mean
#     std = tf.math.reduce_std(X)
#     Xstd = X / (std + eps)
#     _min, _max = tf.math.reduce_min(Xstd), tf.math.reduce_max(Xstd)
#     norm_max = _max
#     norm_min = _min
#     if (_max - _min) > eps:
#         V = Xstd
#         V = 255 * (V - norm_min) / (norm_max - norm_min)
#     else:
#         V = tf.zeros_like(Xstd)
#     img_size = 224
#     image = tf.image.resize(V, (img_size,img_size))
#     return preprocess_input(image)

In [None]:
# def upsample(x):
#     ratio = 72
#     (time_steps, classes_num) = x.shape
#     upsampled = np.repeat(x, ratio, axis=0)
#     upsampled = upsampled[2:-2]
#     return upsampled

# def prediction_for_clip(test_df: pd.DataFrame,
#                         clip: np.ndarray, 
#                         model: tf.keras.models.Model,
#                         threshold=0.5):
#     PERIOD = 5
#     INTERVAL_RATE = 0.5
#     OFFSET_LEGNTH = 0.01
#     audios = []
#     LENGTH_THRESHOLD = 0.1
#     y = clip.astype(np.float32)
#     len_y = len(y)
#     start = 0
#     end = PERIOD * SAMPLE_RATE
#     while True:
#         y_batch = y[start:end].astype(np.float32)
#         if len(y_batch) != PERIOD * SAMPLE_RATE:
#             y_pad = np.zeros(PERIOD * SAMPLE_RATE, dtype=np.float32)
#             y_pad[:len(y_batch)] = y_batch
#             audios.append(y_pad)
#             break
#         start = end - int(PERIOD * (1.0-INTERVAL_RATE) * SAMPLE_RATE)
#         end = start + PERIOD * SAMPLE_RATE
#         audios.append(y_batch)
        
#     array = np.asarray(audios)

#     estimated_event_list = []
#     global_time = 0.0
#     site = test_df["site"].values[0]
#     audio_id = test_df["audio_id"].values[0]
#     for audio in array:
#         melspec = audio_to_mel_spectrogram(audio)
#         image = mono_to_color(melspec)
#         image = tf.expand_dims(image, axis=0)

#         framewise_outputs, _ = model.predict(image)
#         framewise_outputs = upsample(framewise_outputs[0])
        
#         thresholded = framewise_outputs >= threshold
#         for target_idx in range(thresholded.shape[1]):
#             if thresholded[:, target_idx].mean() == 0:
#                 pass
#             else:
#                 detected = np.argwhere(thresholded[:, target_idx]).reshape(-1)
#                 head_idx = 0
#                 tail_idx = 0
#                 while True:
#                     if (tail_idx + 1 == len(detected)) or (
#                             detected[tail_idx + 1] - 
#                             detected[tail_idx] != 1):
                                
#                         onset = OFFSET_LEGNTH * detected[
#                             head_idx] + global_time
#                         offset = OFFSET_LEGNTH * detected[
#                             tail_idx] + global_time
#                         onset_idx = detected[head_idx]
#                         offset_idx = detected[tail_idx]
#                         max_confidence = framewise_outputs[
#                             onset_idx:offset_idx, target_idx].max()
#                         mean_confidence = framewise_outputs[
#                             onset_idx:offset_idx, target_idx].mean()
                                                    
#                         estimated_event = {
#                             "site": site,
#                             "audio_id": audio_id,
#                             "ebird_code": INV_BIRD_CODE[target_idx],
#                             "onset": onset,
#                             "offset": offset,
#                             "max_confidence": max_confidence,
#                             "mean_confidence": mean_confidence
#                         }
#                         if offset-onset > LENGTH_THRESHOLD or max_confidence > threshold * 1.5:
#                             estimated_event_list.append(estimated_event)
#                         else:
#                             None
# #                             print(estimated_event)
#                         head_idx = tail_idx + 1
#                         tail_idx = tail_idx + 1
#                         if head_idx >= len(detected):
#                             break
#                     else:
#                         tail_idx += 1
#         global_time += PERIOD*INTERVAL_RATE
        
#     prediction_df = pd.DataFrame(estimated_event_list)
#     return prediction_df

In [None]:
# import warnings

# warnings.filterwarnings("ignore", category=UserWarning)
# prediction_dfs = []


# unique_audio_id = test.audio_id.unique()
# for audio_id in unique_audio_id:
#     print(audio_id)
#     audio_path = TEST_AUDIO_PATH/f'{audio_id}.mp3'
#     clip, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True, res_type="kaiser_fast")
#     test_df_for_audio_id = test.query(f"audio_id == '{audio_id}'").reset_index(drop=True)
#     prediction_df = prediction_for_clip(test_df_for_audio_id,
#                                           clip=clip,
#                                           model=model,  
#                                           threshold=0.85)
#     prediction_dfs.append(prediction_df)
                                        
# prediction_df = pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)
# prediction_df

In [None]:
# labels = {}

# for audio_id, sub_df in prediction_df.groupby("audio_id"):
#     events = sub_df[["ebird_code", "onset", "offset", "max_confidence", "site"]].values
#     n_events = len(events)
#     removed_event = []
    
#     site = events[0][4]
#     for i in range(n_events):
#         if i in removed_event:
#             continue
#         event = events[i][0] 
#         onset = events[i][1] + 0.35
#         offset = events[i][2] - 0.35
#         if site in {"site_1", "site_2"}:
#             start_section = int((onset // 5) * 5) + 5
#             end_section = int((offset // 5) * 5) + 5
#             cur_section = start_section

#             row_id = f"{site}_{audio_id}_{start_section}"
#             if labels.get(row_id) is not None:
#                 labels[row_id].add(event)
#             else:
#                 labels[row_id] = set()
#                 labels[row_id].add(event)

#             while cur_section != end_section:
#                 cur_section += 5
#                 row_id = f"{site}_{audio_id}_{cur_section}"
#                 if labels.get(row_id) is not None:
#                     labels[row_id].add(event)
#                 else:
#                     labels[row_id] = set()
#                     labels[row_id].add(event)
#         else:
#             row_id = f"{site}_{audio_id}"
#             if labels.get(row_id) is not None:
#                 labels[row_id].add(event)
#             else:
#                 labels[row_id] = set()
#                 labels[row_id].add(event)

# for key in labels:
#     labels[key] = " ".join(sorted(list(labels[key])))
    
    
# row_ids = list(labels.keys())
# birds = list(labels.values())
# post_processed = pd.DataFrame({
#     "row_id": row_ids,
#     "birds": birds
# })
# post_processed.head()

In [None]:
# all_row_id = test[["row_id"]]
# submission = all_row_id.merge(post_processed, on="row_id", how="left")
# submission = submission.fillna("nocall")
# submission.to_csv("submission.csv", index=False)
# submission.values