In [None]:
import os

import numpy as np
import pandas as pd
import librosa
import tensorflow as tf


import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import IPython.display as ipd

In [None]:
# Root of the competition dataset and the sub-directories read by this notebook.
BASE_INPUT_DIR = '/kaggle/input/rfcx-species-audio-detection/'
TRAIN_INPUT_DIR = os.path.join(BASE_INPUT_DIR, 'train')  # training recordings
TEST_INPUT_DIR = os.path.join(BASE_INPUT_DIR, 'test')  # test recordings
TRAIN_TF_RECORDS = os.path.join(BASE_INPUT_DIR, 'tfrecords', 'train')  # training TFRecord shards

In [None]:
# Annotation tables: true-positive and false-positive labels, plus the
# submission template.
train_tp = pd.read_csv(os.path.join(BASE_INPUT_DIR, 'train_tp.csv'))
# Fixed: this previously re-read 'train_tp.csv', so `train_fp` was a silent
# copy of the true-positive table.
train_fp = pd.read_csv(os.path.join(BASE_INPUT_DIR, 'train_fp.csv'))
submission = pd.read_csv(os.path.join(BASE_INPUT_DIR, 'sample_submission.csv'))

In [None]:
# Full paths of every entry in the train, TFRecord and test directories,
# in the order os.listdir returns them.
train_files = [os.path.join(TRAIN_INPUT_DIR, entry) for entry in os.listdir(TRAIN_INPUT_DIR)]

train_tf_records = [os.path.join(TRAIN_TF_RECORDS, entry) for entry in os.listdir(TRAIN_TF_RECORDS)]

test_files = [os.path.join(TEST_INPUT_DIR, entry) for entry in os.listdir(TEST_INPUT_DIR)]

# Model 1

In [None]:
# Config

# Frequency bounds in Hz. NOTE(review): not referenced by the cells visible
# here -- confirm whether the mel spectrogram was meant to use them.
FMIN = 40.0
FMAX = 24000.0

# Sample rate used when loading audio, and number of mel bands.
SR = 48000
N_MELS = 224

# Target spectrogram image size as (height, width).
IMG_SIZE = (224, 512)
IMG_HEIGHT, IMG_WIDTH = IMG_SIZE

# Durations in seconds: full recording and the segment cut around an event.
CLIP_DURATION = 60
SEGMENT_DURATION = 10

In [None]:
# Constants

# Number of species classes and their integer ids (0 .. NUM_SPECIES - 1).
NUM_SPECIES = 24
SPECIES_ID = [*range(NUM_SPECIES)]

# Model 1 Prep

In [None]:
# Create the output directory for the generated spectrogram arrays.
# Unlike `!mkdir train`, this is idempotent, so re-running the cell does not error.
os.makedirs('train', exist_ok=True)

In [None]:
def load_audio(recording_id, train=True):
    """Load one recording as a waveform resampled to ``SR``.

    Args:
        recording_id: File stem of the recording; ``'.flac'`` is appended.
        train: Read from ``TRAIN_INPUT_DIR`` when true, else ``TEST_INPUT_DIR``.

    Returns:
        The audio samples returned by ``librosa.load`` (the sample rate it
        also returns is discarded).
    """
    input_dir = TRAIN_INPUT_DIR if train else TEST_INPUT_DIR
    waveform, _sample_rate = librosa.load(os.path.join(input_dir, recording_id + '.flac'), sr=SR)
    return waveform


def cut_audio(audio_data, tmin, tmax, sr=SR, segment_duration=SEGMENT_DURATION):
    """Cut a window of roughly ``segment_duration`` seconds around an event.

    The event ``[tmin, tmax]`` is padded equally on both sides up to
    ``segment_duration``. Padding that would fall outside the clip is moved
    to the opposite side, and the result is clamped to the clip bounds.
    Events longer than ``segment_duration`` are returned unpadded.

    Args:
        audio_data: 1-D sequence of audio samples.
        tmin: Event start time in seconds.
        tmax: Event end time in seconds.
        sr: Sample rate of ``audio_data``.
        segment_duration: Target window length in seconds.

    Returns:
        The slice of ``audio_data`` covering the window; the end sample is
        included.
    """
    total_seconds = len(audio_data) / sr
    padding = max(0, segment_duration - (tmax - tmin)) / 2
    pad_left = padding
    pad_right = padding

    # Padding that would start before the clip is added to the right side.
    if tmin - pad_left < 0:
        pad_right += pad_left - tmin
    # Padding that would run past the clip end is added to the left side.
    if tmax + pad_right > total_seconds:
        pad_left += tmax + pad_right - total_seconds

    window_start = max(0, tmin - pad_left)
    window_end = min(total_seconds, tmax + pad_right)

    first_sample, last_sample = librosa.time_to_samples([window_start, window_end], sr=sr)
    return audio_data[first_sample:last_sample + 1]
    

def get_mel_spec_img(audio_data):
    """Convert a waveform to a standardized log-mel spectrogram image.

    Args:
        audio_data: Audio samples at sample rate ``SR``.

    Returns:
        A tuple ``(img, log_mel_spec)``: ``img`` is the log-mel spectrogram
        with a trailing channel axis, resized to ``IMG_SIZE`` and standardized
        per image; ``log_mel_spec`` is the unresized dB-scaled spectrogram.
    """
    # Pass the waveform as `y=`: newer librosa releases make the audio argument
    # keyword-only, so the positional form raises a TypeError there.
    mel_spec = librosa.feature.melspectrogram(y=audio_data, sr=SR, n_mels=N_MELS)
    log_mel_spec = librosa.power_to_db(mel_spec)
    # Add a channel axis so the tf.image ops can treat it as an image.
    img = tf.expand_dims(log_mel_spec, -1)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.image.per_image_standardization(img)
    return img, log_mel_spec


def cut_and_get_mel_spec_img(sample):
    """Load a sample's recording, cut around its event, and build the image.

    Args:
        sample: Mapping with ``'recording_id'``, ``'t_min'`` and ``'t_max'``
            entries (presumably a row of ``train_tp`` -- verify against caller).

    Returns:
        Whatever ``get_mel_spec_img`` returns for the cut segment.
    """
    full_clip = load_audio(sample['recording_id'])
    segment = cut_audio(full_clip, sample['t_min'], sample['t_max'])
    return get_mel_spec_img(segment)
    

def cut_and_save_image(sample):
    """Build the spectrogram image for ``sample`` and save it as a ``.npy`` file.

    The file is written to ``./train/<recording_id>_<sample.name>.npy``.

    Args:
        sample: Object exposing ``'recording_id'``, ``'t_min'``, ``'t_max'``
            and a ``.name`` attribute (presumably a DataFrame row whose
            ``.name`` is its index label -- verify against caller).
    """
    img, _ = cut_and_get_mel_spec_img(sample)
    idx = sample.name
    output_dir = './train/'
    # Create the directory here so the function does not depend on a separate
    # notebook cell having run first; exist_ok makes repeated calls harmless.
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, sample['recording_id'] + f'_{idx}.npy')
    np.save(output_file, img.numpy())
   
    
def get_rgb_spec_img(img):
    """Min-max scale a single-channel image to [0, 255] and convert it to RGB.

    Args:
        img: Image whose last axis is a single channel (required by
            ``tf.image.grayscale_to_rgb``).

    Returns:
        The rescaled image converted with ``tf.image.grayscale_to_rgb``.
    """
    lowest = tf.reduce_min(img)
    value_range = tf.reduce_max(img) - lowest
    scaled = (img - lowest) / value_range * 255
    return tf.image.grayscale_to_rgb(scaled)


def time_to_img_pos(sample):
    """Map an event's start and end times to x positions in the image.

    The segment bounds are computed the same way ``cut_audio`` computes them:
    padding that would fall outside the clip is moved to the other side
    before clamping. Previously the segment was only clamped, so positions
    were wrong for events near the start or end of the clip.

    The clip length is taken from ``CLIP_DURATION``, whereas ``cut_audio``
    measures the actual audio length; the two agree only for recordings of
    exactly ``CLIP_DURATION`` seconds.

    Args:
        sample: Mapping with ``'t_min'`` and ``'t_max'`` in seconds.

    Returns:
        ``(tmin_img_pos, tmax_img_pos)``: horizontal positions of the event
        start and end, in units of image columns (``IMG_WIDTH`` wide).
    """
    tmin = sample['t_min']
    tmax = sample['t_max']

    # Mirror cut_audio's padding logic so the positions match the saved image.
    extra_time = max(0, SEGMENT_DURATION - (tmax - tmin)) / 2
    left_extension = right_extension = extra_time
    if tmin - left_extension < 0:
        right_extension += left_extension - tmin
    if tmax + right_extension > CLIP_DURATION:
        left_extension += tmax + right_extension - CLIP_DURATION
    seg_min = max(0, tmin - left_extension)
    seg_max = min(CLIP_DURATION, tmax + right_extension)

    tmin_frame, tmax_frame = librosa.time_to_frames([tmin, tmax], sr=SR)
    segmin_frame, segmax_frame = librosa.time_to_frames([seg_min, seg_max], sr=SR)
    # Spectrogram frames in the segment are stretched to IMG_WIDTH columns.
    img_scale_factor = IMG_WIDTH/(segmax_frame - segmin_frame)

    tmin_img_pos = (tmin_frame - segmin_frame)*img_scale_factor
    tmax_img_pos = (tmax_frame - segmin_frame)*img_scale_factor
    return tmin_img_pos, tmax_img_pos


def random_time_crop(img, sample):
    """Randomly crop the image along time, keeping the event, then resize back.

    The left edge is drawn from the columns up to the event start and the
    right edge from the columns from the event end onwards, so the labelled
    event always stays inside the crop. The crop is resized to the input's
    original height and width.

    Two defects are fixed here. The crop bounds are floats, which an int32
    ``tf.random.uniform`` does not accept, so they are now rounded outward to
    integers. And when the event touched an image edge the sampling range was
    empty (``minval == maxval``); the ranges are now inclusive and clamped.

    Args:
        img: Image tensor/array of shape (height, width, channels).
        sample: Mapping with ``'t_min'`` and ``'t_max'``, as expected by
            ``time_to_img_pos``.

    Returns:
        The cropped image resized with ``tf.image.resize`` to the input's
        (height, width).
    """
    img_size = tf.shape(img)[:2]
    width = int(img_size[1])
    tmin_img_pos, tmax_img_pos = time_to_img_pos(sample)

    # Round outward so the crop never cuts into the event, then clamp so both
    # sampling ranges are non-empty and left < right always holds.
    max_left = min(max(int(np.floor(tmin_img_pos)), 0), width - 1)
    min_right = min(max(int(np.ceil(tmax_img_pos)), max_left + 1), width)

    # maxval is exclusive for integer dtypes, hence the + 1.
    left = tf.random.uniform([], minval=0, maxval=max_left + 1, dtype=tf.int32)
    right = tf.random.uniform([], minval=min_right, maxval=width + 1, dtype=tf.int32)
    return tf.image.resize(img[:, left:right, :], img_size)

In [None]:
# sample = train_tp.loc[9, :]
# cut_and_save_image(sample)
# spec_img = np.load(f'./train/{sample.recording_id}_{sample.name}.npy')

# fig, ax = plt.subplots(figsize=(25, 5))
# plt.imshow(get_rgb_spec_img(spec_img).numpy()/255)

# print(sample['t_min'], sample['t_max'])
# tmin_img_pos, tmax_img_pos = time_to_img_pos(sample)
# ax.axvline(tmin_img_pos, color='g')
# ax.axvline(tmax_img_pos, color='r')

In [None]:
# img = random_time_crop(spec_img, sample)

# fig, ax = plt.subplots(figsize=(25, 5))
# plt.imshow(get_rgb_spec_img(img).numpy()/255)

## Generate Numpy files

In [None]:
# import dask.dataframe as dd

# train_tp_dd = dd.from_pandas(train_tp, npartitions=8)
# train_tp_dd.map_partitions(lambda df: df.apply(cut_and_save_image, axis=1), meta=(None, object)).compute()

In [None]:
# !zip -r melspec_img_raw.zip ./train

## Generate TFRecords

In [None]:
# def generator():
#     for i in range(train_tp.shape[0]):
#         sample = train_tp.loc[i, :]
#         spec_img = np.load(f'./train/{sample.recording_id}_{sample.name}.npy')
#         spec_img = tf.constant(spec_img, dtype=tf.float32)
#         yield spec_img

In [None]:
# melspec_ds = tf.data.Dataset.from_generator(generator, tf.float32).prefetch(1).map(tf.io.serialize_tensor)

In [None]:
# filename = 'train.melspec-img.tfrecord'
# writer = tf.data.experimental.TFRecordWriter(filename)
# writer.write(melspec_ds)

In [None]:
# raw_dataset = tf.data.TFRecordDataset(filename)

# for raw_record in raw_dataset.take(1):
#     spec_img = tf.io.parse_tensor(raw_record, tf.float32)
#     plt.imshow(get_rgb_spec_img(spec_img).numpy()/255)

# Model 2

In [None]:
def model_2_generator():
    """Yield one fixed-length audio cut per true-positive annotation.

    For each recording in ``train_tp``, the audio is loaded once. For each
    annotated event, a ``SEGMENT_DURATION``-second cut is centred on the event
    and shifted to stay inside the clip. Every annotated event of the same
    recording that overlaps the cut is then recorded with it.

    The overlap test now compares intervals. Previously it only checked
    whether an event's *start* fell inside the cut, so events that began
    before the cut but extended into it were dropped. Window starts are
    clamped to the cut start accordingly.

    Yields:
        dict with:
            ``'idx'``: index label of the annotation row (int32 scalar).
            ``'audio_wav'``: the cut waveform, ``SR * SEGMENT_DURATION`` samples.
            ``'recording_id'``: recording id (string scalar).
            ``'cut_tmin'``: cut start within the recording, in seconds.
            ``'windows'``: ``[start, end]`` pairs, in seconds relative to the
                cut start, for every overlapping event.
            ``'species'``: species id of each overlapping event, aligned
                with ``'windows'``.

    Raises:
        AssertionError: if a recording is not exactly ``CLIP_DURATION`` long.
    """
    sr = SR
    segment_duration = SEGMENT_DURATION
    segment_samples = segment_duration * sr

    for rec_id, samples in train_tp.groupby('recording_id'):
        audio_wav = load_audio(rec_id)
        clip_duration = len(audio_wav)/sr
        assert clip_duration == CLIP_DURATION, (
            f'{rec_id}: expected a {CLIP_DURATION}s clip, got {clip_duration}s'
        )

        # Read the annotation rows once; they are reused for every cut below.
        events = [
            (row_idx, row['t_min'], row['t_max'], row['species_id'])
            for row_idx, row in samples.iterrows()
        ]

        for idx, tmin, tmax, _ in events:
            # Centre the cut on the event, then shift it left if it would
            # run past the end of the clip.
            extra_time = max(0.0, segment_duration - (tmax - tmin)) / 2
            cut_tmin = max(0.0, tmin - extra_time)
            if tmax + extra_time > clip_duration:
                cut_tmin = max(0.0, cut_tmin - (tmax + extra_time - clip_duration))
            cut_tmax = cut_tmin + segment_duration

            windows = []
            species = []
            for _, ev_tmin, ev_tmax, ev_species in events:
                # Interval overlap, so events that start before the cut but
                # reach into it are kept too.
                if ev_tmin <= cut_tmax and ev_tmax >= cut_tmin:
                    windows.append([
                        max(ev_tmin, cut_tmin) - cut_tmin,
                        min(ev_tmax, cut_tmax) - cut_tmin,
                    ])
                    species.append(ev_species)

            cut_tmin_sample = int(np.floor(cut_tmin * sr))
            cut_tmax_sample = cut_tmin_sample + segment_samples
            yield {
                'idx': tf.constant(idx, dtype=tf.int32),
                # The reshape also verifies that the cut has the exact expected length.
                'audio_wav': tf.reshape(audio_wav[cut_tmin_sample:cut_tmax_sample], [SR*SEGMENT_DURATION]),
                'recording_id': tf.constant(rec_id, dtype=tf.string),
                'cut_tmin': tf.constant(cut_tmin, dtype=tf.float32),
                'windows': tf.constant(windows, dtype=tf.float32),
                'species': tf.constant(species, dtype=tf.int32)
            }

In [None]:
# Dtype of each field in the dicts yielded by model_2_generator.
features_dtypes = dict(
    idx=tf.int32,
    audio_wav=tf.float32,
    recording_id=tf.string,
    cut_tmin=tf.float32,
    windows=tf.float32,
    species=tf.int32,
)

# Wrap the generator in a tf.data pipeline that keeps up to 10 elements prefetched.
parsed_dataset = tf.data.Dataset.from_generator(
    model_2_generator,
    output_types=features_dtypes,
).prefetch(10)

In [None]:
def _tensor_feature(value):
    """Wrap a tensor as a single-entry bytes ``tf.train.Feature``.

    The tensor is serialized with ``tf.io.serialize_tensor`` and the resulting
    bytes are stored as the only item of a ``BytesList``.
    """
    raw_bytes = tf.io.serialize_tensor(value).numpy()
    bytes_list = tf.train.BytesList(value=[raw_bytes])
    return tf.train.Feature(bytes_list=bytes_list)


def serialize_example(idx, audio_wav, recording_id, cut_tmin, windows, species):
    """Serialize one sample's tensors into a ``tf.train.Example`` byte string.

    Each argument is stored under its own name as a serialized-tensor bytes
    feature (see ``_tensor_feature``).

    Returns:
        The serialized ``tf.train.Example`` protocol buffer.
    """
    named_tensors = (
        ('idx', idx),
        ('audio_wav', audio_wav),
        ('recording_id', recording_id),
        ('cut_tmin', cut_tmin),
        ('windows', windows),
        ('species', species),
    )
    features = tf.train.Features(
        feature={key: _tensor_feature(tensor) for key, tensor in named_tensors}
    )
    return tf.train.Example(features=features).SerializeToString()


def tf_serialize_example(data):
    """Dataset-map wrapper that runs ``serialize_example`` via ``tf.py_function``.

    Args:
        data: Dict with the keys ``'idx'``, ``'audio_wav'``, ``'recording_id'``,
            ``'cut_tmin'``, ``'windows'`` and ``'species'``.

    Returns:
        A scalar string tensor holding the serialized example.
    """
    # Order must match serialize_example's positional parameters.
    field_order = ('idx', 'audio_wav', 'recording_id', 'cut_tmin', 'windows', 'species')
    positional_inputs = tuple(data[field] for field in field_order)
    serialized = tf.py_function(serialize_example, positional_inputs, tf.string)
    # Reshape to a scalar so the output shape is fully defined.
    return tf.reshape(serialized, ())


# Serialize every parsed sample to a tf.train.Example byte string,
# keeping up to 50 serialized records prefetched.
serialized_dataset = (
    parsed_dataset
    .map(tf_serialize_example)
    .prefetch(50)
)

In [None]:
# Write all serialized examples to a single TFRecord file.
filename = 'train.cut_audio.tfrecord'
# tf.data.experimental.TFRecordWriter is deprecated in favour of
# tf.io.TFRecordWriter; the context manager also guarantees the file is
# flushed and closed, even if iteration over the dataset fails part-way.
with tf.io.TFRecordWriter(filename) as writer:
    for serialized_example in serialized_dataset:
        writer.write(serialized_example.numpy())