In [None]:
import os

import numpy as np
import pandas as pd
import librosa
import tensorflow as tf


import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import IPython.display as ipd

# Check file formats

In [None]:
# Root directory of the competition data; the train/ and test/ audio folders
# are derived from it.
BASE_INPUT_DIR = '/kaggle/input/rfcx-species-audio-detection/'
TRAIN_INPUT_DIR, TEST_INPUT_DIR = (
    os.path.join(BASE_INPUT_DIR, split) for split in ('train', 'test')
)

In [None]:
# Load the label tables and the sample submission template.
train_tp = pd.read_csv(os.path.join(BASE_INPUT_DIR, 'train_tp.csv'))
# The false-positive labels live in their own file; reading 'train_tp.csv'
# here would make every FP analysis below a silent repeat of the TP one.
train_fp = pd.read_csv(os.path.join(BASE_INPUT_DIR, 'train_fp.csv'))
submission = pd.read_csv(os.path.join(BASE_INPUT_DIR, 'sample_submission.csv'))

In [None]:
# Full paths of all training audio files. os.listdir returns entries in
# arbitrary order, so sort them to make train_files[0] refer to the same file
# on every run.
train_files = [
    os.path.join(TRAIN_INPUT_DIR, f) for f in sorted(os.listdir(TRAIN_INPUT_DIR))
]

In [None]:
# Frequency bounds (passed as fmin/fmax to the librosa calls below).
FMIN = 40.0
FMAX = 24000.0

# Sample rate used when loading audio, and number of mel bands.
SR = 48000
N_MELS = 224

# Spectrogram image size as (height, width).
IMG_SIZE = (224, 512)
IMG_HEIGHT, IMG_WIDTH = IMG_SIZE
# NOTE(review): presumably the number of spectrogram frames of a full clip
# (60 s * 48000 Hz / hop 512 + 1 = 5626) — TODO confirm.
FRAME_MAX = 5626

# Default length of a cut audio segment (compared against t_max - t_min).
SEGMENT_DURATION = 10

In [None]:
# Preview the first rows of train_tp.
train_tp.head()

In [None]:
# Preview the first rows of train_fp.
train_fp.head()

In [None]:
# Preview the first rows of the sample submission.
submission.head()

In [None]:
# Species ids 0..23 as a plain list.
SPECIES_ID = list(range(24))

# Summary

In [None]:
# (rows, columns) of train_tp.
train_tp.shape

In [None]:
# Summary statistics of train_tp.
train_tp.describe()

In [None]:
# Column dtypes and non-null counts of train_tp.
train_tp.info()

In [None]:
# Column dtypes and non-null counts of train_fp.
train_fp.info()

In [None]:
# Summary statistics of train_fp.
train_fp.describe()

In [None]:
# Column dtypes and non-null counts of the sample submission.
submission.info()

In [None]:
# List the test audio files via the shared TEST_INPUT_DIR constant rather than
# a second hard-coded copy of the path.
test_files = os.listdir(TEST_INPUT_DIR)

In [None]:
# Number of entries in the test directory.
len(test_files)

# Time Window

In [None]:
# Load the first training file at sample rate SR; the returned rate is discarded.
data, _ = librosa.load(train_files[0], sr=SR)

In [None]:
# Number of samples in the loaded clip.
len(data)

In [None]:
# Play the clip. It was loaded with sr=SR and the returned rate was discarded,
# so SR (not an as-yet-undefined `sr`) is the correct playback rate.
ipd.Audio(data, rate=SR)

In [None]:
# Record (number of samples, native sample rate) for every training file.
# sr=None keeps each file's own sample rate instead of resampling.
# Note: `data` and `sr` keep the values of the last file after the loop and
# are read again by a later cell (len(data)/sr).
train_file_info = []
for train_file in train_files:
    data, sr = librosa.load(train_file, sr=None)
    train_file_info.append((len(data), sr))

In [None]:
# Number of training files scanned.
len(train_file_info)

In [None]:
# Number of distinct recordings referenced in train_tp.
train_tp['recording_id'].nunique()

In [None]:
# Stack the (num_samples, sample_rate) tuples into an array so each field can
# be selected as a column.
train_info = np.array(train_file_info)

In [None]:
# Distinct clip lengths (column 0) and distinct sample rates (column 1),
# printed one per line.
print(*(np.unique(train_info[:, col]) for col in (0, 1)), sep='\n')

In [None]:
# Duration in seconds of the clip currently held in `data` / `sr`
# (the last file loaded by the loop over train_files).
len(data)/sr

In [None]:
# Length (t_max - t_min) of every annotated window in train_tp:
# summary statistics followed by a histogram.
train_tp_tdiff = train_tp['t_max'].sub(train_tp['t_min'])
print(train_tp_tdiff.describe())
train_tp_tdiff.hist()

In [None]:
# Length (t_max - t_min) of every annotated window in train_fp:
# summary statistics followed by a histogram.
train_fp_tdiff = train_fp['t_max'].sub(train_fp['t_min'])
print(train_fp_tdiff.describe())
train_fp_tdiff.hist()

## Overlapping time windows

In [None]:
# Report annotated time windows that overlap within the same recording.
# Rows are sorted by start time inside each recording, so a window overlaps an
# earlier one exactly when it starts before the latest end time seen so far.
# (Comparing against the previous *start* time can never trigger on sorted
# data, and the reference window must advance as rows are consumed.)
train_tp_rec_grps = train_tp.sort_values(['recording_id', 't_min', 't_max']).groupby('recording_id')

for name, grp in train_tp_rec_grps:
    prev_window = None  # earlier window with the latest end time so far

    for _, row in grp.iterrows():
        cur_window = (row['t_min'], row['t_max'])
        if prev_window is not None and cur_window[0] < prev_window[1]:
            print(f'Overlap in {name}: {prev_window} and {cur_window}')
        if prev_window is None or cur_window[1] > prev_window[1]:
            prev_window = cur_window

**Note: No overlaps in TP**

In [None]:
# Report annotated time windows that overlap within the same recording.
# Rows are sorted by start time inside each recording, so a window overlaps an
# earlier one exactly when it starts before the latest end time seen so far.
# (Comparing against the previous *start* time can never trigger on sorted
# data, and the reference window must advance as rows are consumed.)
train_fp_rec_grps = train_fp.sort_values(['recording_id', 't_min', 't_max']).groupby('recording_id')

for name, grp in train_fp_rec_grps:
    prev_window = None  # earlier window with the latest end time so far

    for _, row in grp.iterrows():
        cur_window = (row['t_min'], row['t_max'])
        if prev_window is not None and cur_window[0] < prev_window[1]:
            print(f'Overlap in {name}: {prev_window} and {cur_window}')
        if prev_window is None or cur_window[1] > prev_window[1]:
            prev_window = cur_window

**Note: No overlaps in FP**

# Univariate

In [None]:
# Sorted distinct species ids present in train_tp.
np.sort(train_tp.species_id.unique())

In [None]:
# Sorted distinct species ids present in train_fp.
np.sort(train_fp.species_id.unique())

In [None]:
# Bar chart of the number of train_tp rows per species.
train_tp.groupby('species_id').size().plot(kind='bar')

In [None]:
# Row counts per (species, song type) in train_tp, stored in an 'obs' column
# and drawn as grouped bars.
tp_obs_cnts = (
    train_tp
    .groupby(['species_id', 'songtype_id'])
    .size()
    .reset_index(name='obs')
)

plt.figure(figsize=(10, 7))
sns.barplot(data=tp_obs_cnts, x='species_id', y='obs', hue='songtype_id')

In [None]:
# Bar chart of the number of train_fp rows per species.
train_fp.groupby('species_id').size().plot(kind='bar')

In [None]:
# Row counts per (species, song type) in train_fp, stored in an 'obs' column
# and drawn as grouped bars.
fp_obs_cnts = (
    train_fp
    .groupby(['species_id', 'songtype_id'])
    .size()
    .reset_index(name='obs')
)

plt.figure(figsize=(10, 7))
sns.barplot(data=fp_obs_cnts, x='species_id', y='obs', hue='songtype_id')

# Spectrogram

In [None]:
# Load the first training file at its native sample rate (sr=None) and keep
# that rate in `sr` for the spectrogram cells below.
data, sr = librosa.load(train_files[0], sr=None)

In [None]:
# Log-scaled (dB) mel spectrogram of the clip, restricted to [FMIN, FMAX].
# The frequency constants are named FMIN/FMAX (F_MIN/F_MAX do not exist), and
# the audio is passed as `y=` because newer librosa releases no longer accept
# it positionally.
mel_spec = librosa.power_to_db(
    librosa.feature.melspectrogram(y=data, sr=sr, n_mels=256, fmin=FMIN, fmax=FMAX)
)

In [None]:
# Shape of the mel spectrogram array.
mel_spec.shape

In [None]:
# Display the mel spectrogram using the same frequency bounds it was computed
# with (the constants are FMIN/FMAX; F_MIN/F_MAX are undefined).
librosa.display.specshow(mel_spec, x_axis='time', y_axis='mel', sr=sr, fmin=FMIN, fmax=FMAX)

In [None]:
# Plot the raw waveform of the clip.
# NOTE(review): `waveplot` appears to have been removed from newer librosa
# releases in favour of `waveshow` — confirm against the installed version.
librosa.display.waveplot(data, sr=sr)

In [None]:
# Magnitude STFT converted to decibels. np.abs(stft) is an amplitude, not a
# power, so amplitude_to_db is the matching conversion (power_to_db would
# halve the dB scale).
stft = librosa.stft(data, hop_length=512)
stft = librosa.amplitude_to_db(np.abs(stft))

In [None]:
# Display the dB-scaled STFT on a log-frequency axis (the frequency constants
# are FMIN/FMAX; F_MIN/F_MAX are undefined).
librosa.display.specshow(stft, x_axis='time', y_axis='log', sr=sr, fmin=FMIN, fmax=FMAX)

# Spectrogram with targets

In [None]:
def load_audio(recording_id, train=True):
    """Load one recording's waveform at sample rate SR.

    Args:
        recording_id: File stem of the recording; '.flac' is appended.
        train: Read from TRAIN_INPUT_DIR when true, otherwise TEST_INPUT_DIR.

    Returns:
        The audio samples returned by librosa.load (the sample rate is dropped).
    """
    source_dir = TRAIN_INPUT_DIR if train else TEST_INPUT_DIR
    samples, _ = librosa.load(os.path.join(source_dir, recording_id + '.flac'), sr=SR)
    return samples


def cut_audio(audio_data, tmin, tmax, sr=SR, segment_duration=SEGMENT_DURATION):
    """Cut a segment of the clip centred on the window [tmin, tmax].

    The window is padded equally on both sides so that it spans
    segment_duration seconds; windows already longer than that are not padded.
    The padded bounds are clamped to the clip, so a segment touching either
    end of the recording can come out shorter than segment_duration.

    Args:
        audio_data: 1-D sequence of audio samples.
        tmin: Window start in seconds.
        tmax: Window end in seconds.
        sr: Sample rate of audio_data.
        segment_duration: Target segment length in seconds.

    Returns:
        The slice of audio_data covering the padded window (end sample included).
    """
    total_seconds = len(audio_data) / sr
    padding = max(0, segment_duration - (tmax - tmin)) / 2
    start_time = max(0, tmin - padding)
    end_time = min(total_seconds, tmax + padding)

    first, last = librosa.time_to_samples([start_time, end_time], sr=sr)
    return audio_data[first:last + 1]
    

def get_mel_spec_img(audio_data):
    """Turn an audio clip into a standardized mel-spectrogram image.

    The mel filter bank is limited to [FMIN, FMAX] so that the spectrogram rows
    line up with freq_to_mel_bin and with the fmin/fmax passed to specshow,
    which both assume that range.

    Args:
        audio_data: 1-D array of audio samples at sample rate SR.

    Returns:
        A tuple (img, log_mel_spec): img is the dB mel spectrogram with a
        trailing channel axis, resized to IMG_SIZE and standardized per image;
        log_mel_spec is the un-resized dB mel spectrogram.
    """
    # `y=` keyword: newer librosa releases reject the audio as a positional arg.
    mel_spec = librosa.feature.melspectrogram(
        y=audio_data, sr=SR, n_mels=N_MELS, fmin=FMIN, fmax=FMAX
    )
    log_mel_spec = librosa.power_to_db(mel_spec)
    img = tf.expand_dims(log_mel_spec, -1)
    img = tf.image.resize(img, IMG_SIZE)
    img = tf.image.per_image_standardization(img)
    return img, log_mel_spec


def get_displayable_img(spec_img):
    """Min-max scale a single-channel image to [0, 1] and replicate it to RGB.

    Args:
        spec_img: Image data convertible with np.asarray (e.g. an eager
            TensorFlow tensor, a NumPy array or nested lists). Size-1 axes
            are squeezed away before stacking.

    Returns:
        A NumPy array with a trailing axis of length 3 holding three copies of
        the scaled image. A constant image maps to all zeros instead of
        dividing by zero.
    """
    # np.asarray accepts tensors and arrays alike, so no `.numpy()` call is
    # needed (that call fails on plain ndarrays).
    img = np.squeeze(np.asarray(spec_img))
    img_min = img.min()
    img_range = img.max() - img_min
    # Guard the constant-image case: 0/0 would fill the output with NaN.
    denom = img_range if img_range > 0 else 1.0
    scaled = (img - img_min) / denom
    return np.stack([scaled] * 3, axis=-1)


def freq_to_mel_bin(freqs, n_mels=N_MELS, fmin=FMIN, fmax=FMAX):
    """Map frequencies in Hz to mel-bin indices.

    The mel range [hz_to_mel(fmin), hz_to_mel(fmax)] is split into n_mels
    equal-width steps and each frequency is assigned the index of the step it
    falls into.

    Args:
        freqs: A frequency or sequence of frequencies in Hz.
        n_mels: Number of mel bins.
        fmin: Lower frequency bound of the mel range in Hz.
        fmax: Upper frequency bound of the mel range in Hz.

    Returns:
        A list of int bin indices, clipped to [0, n_mels - 1].
    """
    min_mel = librosa.hz_to_mel(fmin)
    max_mel = librosa.hz_to_mel(fmax)
    mel_step = (max_mel - min_mel) / n_mels
    mel_freqs = np.atleast_1d(librosa.hz_to_mel(freqs))
    # Bin 0 starts at min_mel, so offset by it before dividing by the step;
    # without the offset every index is shifted up by min_mel / mel_step.
    bins = np.floor((mel_freqs - min_mel) / mel_step).astype(int)
    # Frequencies at or beyond the range edges land on the first/last bin.
    return [int(b) for b in np.clip(bins, 0, n_mels - 1)]


def time_to_img_bin(times):
    """Map times in seconds to image column indices.

    Times are converted to frame indices at sample rate SR, rescaled by
    IMG_WIDTH / FRAME_MAX and floored.

    Args:
        times: Sequence of times in seconds.

    Returns:
        A list of int column indices.
    """
    frame_idx = librosa.time_to_frames(times, sr=SR)
    columns = frame_idx * IMG_WIDTH / FRAME_MAX
    return [int(np.floor(col)) for col in columns]


def show_spectrogram(sample, ax, is_tp=True, showlabel=False):
    """Draw a recording's mel spectrogram on `ax` and outline its annotation.

    Args:
        sample: Label row providing 'recording_id', 'species_id', 'f_min',
            'f_max', 't_min' and 't_max'.
        ax: Matplotlib axes to draw on.
        is_tp: Truthy for a true-positive label (green box), falsy for a
            false-positive one (blue box).
        showlabel: When true, write the species id and 'tp'/'fp' at the
            top-left corner of the box.
    """
    audio_data = load_audio(sample['recording_id'])
    _, mel_spec = get_mel_spec_img(audio_data)
    # Pass ax explicitly; otherwise specshow draws on whatever axes is current,
    # which need not be the one the box and title are added to.
    librosa.display.specshow(
        mel_spec, x_axis='time', y_axis='mel', sr=SR, fmin=FMIN, fmax=FMAX, ax=ax
    )
    ax.set(title=f'Mel-frequency spectrogram of {sample["recording_id"]}')

    sid, fmin, fmax, tmin, tmax = (sample["species_id"], sample["f_min"], sample["f_max"], sample["t_min"], sample["t_max"])
    # Use plain truthiness so the box colour and the 'tp'/'fp' label agree.
    ec = '#00ff00' if is_tp else '#0000ff'
    ax.add_patch(
        patches.Rectangle(xy=(tmin, fmin), width=tmax-tmin, height=fmax-fmin, ec=ec, fill=False)
    )

    if showlabel:
        ax.text(tmin, fmax,
        f"{sid} {'tp' if is_tp else 'fp'}",
        horizontalalignment='left', verticalalignment='bottom', color=ec, fontsize=16)

    
def show_cut_spec_img(sample, ax, cut_duration=10):
    """Show the spectrogram image of the segment cut around a label.

    Args:
        sample: Label row providing 'recording_id', 'f_min', 'f_max', 't_min'
            and 't_max'.
        ax: Matplotlib axes to draw on.
        cut_duration: Length in seconds of the segment cut around the label.
    """
    audio_data = load_audio(sample['recording_id'])
    fmin, fmax, tmin, tmax = (sample["f_min"], sample["f_max"], sample["t_min"], sample["t_max"])
    print(sample)

    # Honour cut_duration; it was previously accepted but never forwarded, so
    # the segment length always fell back to cut_audio's default.
    audio_data = cut_audio(audio_data, tmin, tmax, segment_duration=cut_duration)
    img, _ = get_mel_spec_img(audio_data)
    ax.imshow(get_displayable_img(img))

    # Mark the label's frequency bounds as horizontal lines at their mel bins
    # (red = f_min, green = f_max).
    fmin, fmax = freq_to_mel_bin([fmin, fmax])
    ax.axhline(fmin, color='r', lw=0.5)
    ax.axhline(fmax, color='g', lw=0.5)

In [None]:
# Show the full-clip spectrogram of the train_tp row at index label 2 with its
# annotation box and label.
print(train_tp.loc[2, :])
fig, ax = plt.subplots(figsize=(15, 3))
show_spectrogram(train_tp.loc[2, :], ax, is_tp=True, showlabel=True)

In [None]:
# Show the cut-segment spectrogram image for the same train_tp row.
fig, ax = plt.subplots()
show_cut_spec_img(train_tp.loc[2, :], ax)