I have noticed some training notebooks that take a **random 5 sec** from the clips as training data. 

I think this can be a misleading strategy because **the birds are not singing for the whole clip**. 
Furthermore, we should be able to classify when there is no bird singing at all in a sequence, so we need training data for that as well. 

That means that being able to identify whether or not a bird is singing in a section of a clip is essential to this competition.

This notebook explores preprocessing strategies that split clips into silence (background noise) and actual birdsong parts.



There is too much data to process on Kaggle, so I used a Google Cloud instance to do it ([code on GitHub](https://github.com/LuisBlanche/kaggle-birdsong-split-silence)).
I am now uploading the datasets to Kaggle; you can find them here:

https://www.kaggle.com/luisblanche/birdcall-singing-0

https://www.kaggle.com/luisblanche/birdcall-singing-1

https://www.kaggle.com/luisblanche/birdcall-singing-2

https://www.kaggle.com/luisblanche/birdcall-singing-3

https://www.kaggle.com/luisblanche/birdcall-singing-4

https://www.kaggle.com/luisblanche/birdcall-background (this one contains 5-second samples of background sound for each clip of the training data)

## Import modules and data

In [None]:
import os
import numpy as np

import librosa
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
from IPython.display import Audio, IFrame, display
import librosa.display
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition's training metadata; the 'date' column is parsed as datetimes.
train = pd.read_csv("../input/birdsong-recognition/train.csv", parse_dates=['date'])

## Remove silences 

Strategy : 
* Select long clips, which are more likely to contain silence while the recorder waits for another birdcall
* Use librosa to remove the silent parts, using **mean dB - std dB** as the threshold for what counts as silence

In [None]:
# Index labels of the rows whose 'duration' exceeds 60 (presumably seconds - confirm
# against the dataset description), followed by how many such rows there are.
long_clips = train[train['duration'] > 60].index 
len(long_clips)

In [None]:
# Root directory of the resampled .wav files read by the helper functions below.
# NOTE(review): judging by the "-00" suffix this is only one shard of the resampled
# audio, so not every species may be available here - confirm.
path="../input/birdsong-resampled-train-audio-00"


def get_audio_path(row_number):
    """Return the species and the .wav path for a row of ``train``.

    Args:
        row_number: Positional row index into the module-level ``train``
            DataFrame (looked up with ``iloc``).

    Returns:
        A ``(species, audio_path)`` tuple, where ``audio_path`` is
        ``<path>/<ebird_code>/<filename without extension>.wav``.
    """
    row = train.iloc[row_number]
    # splitext strips only the final extension, so a file stem that itself
    # contains dots stays intact (split('.')[0] would truncate it).
    stem, _ = os.path.splitext(row['filename'])
    audio_path = os.path.join(path, row['ebird_code'], stem + '.wav')
    return row['species'], audio_path
    
def get_audio(row_number):
    """Print the species of a ``train`` row and show an audio player for it.

    Returns whatever ``display`` returns for the ``Audio`` widget.
    """
    label, wav_path = get_audio_path(row_number)
    print(label)
    player = Audio(wav_path)
    return display(player)

# Columns of `train` that describe_audio returns alongside the audio player.
sound_variables = ['pitch', 'speed', 'number_of_notes', 'type', 'volume', 'length']
def describe_audio(row_number):
    """Play a ``train`` row's audio and fetch its sound-description columns.

    Returns an ``(audio, metadata)`` pair: the value returned by
    ``get_audio`` and the row's ``sound_variables`` entries.
    """
    played = get_audio(row_number)
    row = train.iloc[row_number]
    return played, row[sound_variables]

def plot_wave(row_number):
    """Draw the waveform of a ``train`` row, titled with its species."""
    title, wav_path = get_audio_path(row_number)
    samples, rate = librosa.load(wav_path)
    librosa.display.waveplot(samples, sr=rate)
    axes = plt.gca()
    axes.set_title(title)


Select an example clip

In [None]:
# Use the second entry of long_clips as the running example.
clip = long_clips[1]

In [None]:
# Play row 0 of train.
# NOTE(review): the neighbouring cells all work on `clip`; confirm row 0 is intended here.
get_audio(0)

In [None]:
# Play the example clip and show its sound-description metadata.
describe_audio(clip)

In [None]:
# Plot the waveform of the example clip.
plot_wave(clip)

Listening to this clip and looking at the waveplot, we understand that a big part of it is just "white noise" with no bird singing. Let's try to use [librosa.effects.split](https://librosa.org/librosa/0.7.1/generated/librosa.effects.split.html#librosa-effects-split) to remove the silent parts and reduce the amount of information to process.

In [None]:
 def split_sound(row_number):
    """Load a ``train`` row's audio and locate its non-silent intervals.

    The ``top_db`` threshold handed to ``librosa.effects.split`` is the mean
    absolute dB level of the signal minus the standard deviation of its dB
    level.

    Returns the sample array, the sample rate and the intervals found.
    """
    _, wav_path = get_audio_path(row_number)
    samples, rate = librosa.load(wav_path)
    levels = librosa.core.amplitude_to_db(samples)
    threshold = np.abs(levels).mean() - levels.std()
    intervals = librosa.effects.split(y=samples, top_db=threshold)
    return samples, rate, intervals

In [None]:
def remove_silence(clip):
    """Return a recording's audio with the silent stretches cut out.

    Args:
        clip: Row number of the recording in ``train``; forwarded to
            ``split_sound``.

    Returns:
        A ``(silence_removed, sr)`` tuple: the non-silent intervals joined
        back to back, and the sample rate reported by ``split_sound``.
    """
    sound, sr, intervals = split_sound(clip)
    if len(intervals) == 0:
        # Nothing was detected as non-silent; np.concatenate rejects an
        # empty list, so return an empty signal explicitly.
        return np.array([], dtype=sound.dtype), sr
    # Join whole slices at once instead of extending a Python list one
    # sample at a time.
    silence_removed = np.concatenate(
        [sound[start:end] for start, end in intervals]
    )
    return silence_removed, sr

In [None]:
# Strip the silent parts from the example clip and play the result.
silence_removed, sr = remove_silence(clip)
display(Audio(silence_removed, rate=sr))

In [None]:
# Waveform of the example clip once the silent parts are removed.
librosa.display.waveplot(silence_removed, sr=sr)

We can see that the clip has been reduced from 2min 7s to 51s, and that we still have all the bird sounds from the initial clip; we merely removed the parts of the clip where we could hear waves in the background.

## Create Silent/Noise clips 

In [None]:
# Samples, sample rate and non-silent intervals of the example clip.
x, sr, split = split_sound(clip)

In [None]:
def gather_silence(clip):
    """Return a recording's background: everything outside the non-silent intervals.

    Args:
        clip: Row number of the recording in ``train``; forwarded to
            ``split_sound``.

    Returns:
        A ``(silence, sr)`` tuple: the stretches before, between and after
        the non-silent intervals joined back to back, and the sample rate
        reported by ``split_sound``.
    """
    sound, sr, intervals = split_sound(clip)
    if len(intervals) == 0:
        # No non-silent interval was found: the whole recording is background.
        return sound, sr
    # Each gap runs from the end of one interval (or the start of the
    # recording) to the start of the next interval (or the end of the recording).
    gap_starts = [0] + [end for _, end in intervals]
    gap_ends = [start for start, _ in intervals] + [len(sound)]
    # A single concatenate avoids re-copying the growing array on every gap.
    silence = np.concatenate(
        [sound[lo:hi] for lo, hi in zip(gap_starts, gap_ends)]
    )
    return silence, sr

In [None]:
# Collect the background parts of the example clip and play them.
silence, sr = gather_silence(clip)
display(Audio(silence, rate=sr))

In [None]:
# Waveform of the background-only audio.
librosa.display.waveplot(silence, sr=sr)

Just like that we get the background noise, which could be used to train another model to recognize birds in the background (that could be very useful for prediction in clip 3, for instance).

## Create Dataset

In [None]:
from pathlib import Path
# Input locations, all resolved relative to the parent of the current
# working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT.joinpath("input")
RAW_DATA = INPUT_ROOT.joinpath("birdsong-recognition")
TRAIN_AUDIO_DIR = RAW_DATA.joinpath("train_audio")
# Five resampled-audio directories, suffixed -00 through -04.
TRAIN_RESAMPLED_AUDIO_DIRS = [
    INPUT_ROOT.joinpath(f"birdsong-resampled-train-audio-{shard:02d}")
    for shard in range(5)
]


In [None]:
# Display the list of resampled-audio directories.
TRAIN_RESAMPLED_AUDIO_DIRS

In [None]:
# Output roots (relative to the working directory): one for the singing
# samples, one for the background samples.
TRAIN_SINGING_DIR = Path("processed_data/train_audio_singing")
TRAIN_BACKGROUND_DIR = Path("processed_data/train_audio_background")
TRAIN_SINGING_DIR.mkdir(exist_ok=True, parents=True)
TRAIN_BACKGROUND_DIR.mkdir(exist_ok=True, parents=True)
# Create one sub-folder per ebird_code under each output root.
for ebird_code in train["ebird_code"].unique():
    ebird_dir = TRAIN_SINGING_DIR.joinpath(ebird_code)
    background_dir = TRAIN_BACKGROUND_DIR.joinpath(ebird_code)
    ebird_dir.mkdir(exist_ok=True)
    background_dir.mkdir(exist_ok=True)

In [None]:
import random
def split_sound(clip):
    """Locate the non-silent intervals of an audio signal.

    Here ``clip`` is the sample array itself. The ``top_db`` threshold
    passed to ``librosa.effects.split`` is the mean absolute dB level of the
    signal minus the standard deviation of its dB level.

    NOTE: this rebinds the name ``split_sound`` defined earlier in the
    notebook, which takes a row number and returns three values.

    Returns the intervals reported by ``librosa.effects.split``.
    """
    levels = librosa.core.amplitude_to_db(clip)
    threshold = np.abs(levels).mean() - levels.std()
    return librosa.effects.split(y=clip, top_db=threshold)

def take_random_sample(clip, sample_len=5, sample_rate=32000):
    """Return a random window of ``sample_len`` seconds taken from ``clip``.

    Args:
        clip: Sequence of audio samples (anything supporting ``len`` and
            slicing).
        sample_len: Window length in seconds.
        sample_rate: Number of samples per second of ``clip``.

    Returns:
        A slice of exactly ``sample_len * sample_rate`` samples starting at
        a random offset, or ``clip`` itself when it is not longer than that.
    """
    n_samples = int(sample_len * sample_rate)
    if len(clip) <= n_samples:
        return clip
    # The offset bound is measured in samples (seconds * rate) so that the
    # window always fits entirely inside the clip.
    start = random.randint(0, len(clip) - n_samples)
    return clip[start:start + n_samples]
    

def split_singing_background(clip):
    """Split an audio signal into a singing sample and a background sample.

    The non-silent intervals found by ``split_sound`` are joined to form the
    singing part; everything outside them forms the background. Each part is
    then reduced to a random window with ``take_random_sample``.

    Args:
        clip: Array of audio samples, expected to be 1-D (the caller in this
            notebook loads mono audio).

    Returns:
        A ``(singing, background)`` tuple of sample arrays.
    """
    intervals = split_sound(clip)
    if len(intervals) == 0:
        # Nothing non-silent: no singing at all, the whole clip is background.
        return clip[:0], take_random_sample(clip)

    # Background = the gaps before, between and after the intervals.
    gap_starts = [0] + [end for _, end in intervals]
    gap_ends = [start for start, _ in intervals] + [len(clip)]
    background = np.concatenate(
        [clip[lo:hi] for lo, hi in zip(gap_starts, gap_ends)]
    )
    background = take_random_sample(background)

    singing = np.concatenate([clip[start:end] for start, end in intervals])
    singing = take_random_sample(singing)

    # Return the background computed here, not the notebook-level `silence`
    # variable left over from the example cells.
    return singing, background


def remove_silence_from_file(ebird_code: str, filename: str, source_dir: "str | Path", target_sr: int = 32000):
    """Split one recording into singing/background samples and save both.

    The file ``<source_dir>/<ebird_code>/<filename>`` (with ``.mp3`` replaced
    by ``.wav``) is loaded as mono audio at ``target_sr``, split with
    ``split_singing_background``, and the two parts are written under
    ``TRAIN_SINGING_DIR/<ebird_code>`` and ``TRAIN_BACKGROUND_DIR/<ebird_code>``
    using the same file name.

    Any exception raised while loading, splitting or writing is printed and
    appended to ``skipped.txt`` instead of being propagated.

    Args:
        ebird_code: Species code; used as the sub-folder name on both the
            input and the output side.
        filename: File name of the recording; ``.mp3`` is replaced by ``.wav``.
        source_dir: Directory containing the per-species input folders.
        target_sr: Sample rate used for loading and for writing.
    """
    filename = filename.replace('.mp3', '.wav')
    # Coerce so that a plain string directory also works with the `/` operator.
    source_file = Path(source_dir) / ebird_code / filename
    ebird_dir = TRAIN_SINGING_DIR / ebird_code
    background_dir = TRAIN_BACKGROUND_DIR / ebird_code
    try:
        y, _ = librosa.load(
            source_file,
            sr=target_sr, mono=True, res_type="kaiser_fast")
        sound, background = split_singing_background(y)
        # `sf` is the soundfile module; it must be imported before this
        # function is called.
        sf.write(str(ebird_dir / filename), sound, target_sr)
        sf.write(str(background_dir / filename), background, target_sr)
    except Exception as e:
        # Best effort: report the failure and record the file as skipped.
        print(e)
        with open("skipped.txt", "a") as f:
            f.write(str(source_file) + ' ' + str(e) + "\n")

In [None]:
# Split `train` into five groups by the first letter of ebird_code; group i is
# later processed together with TRAIN_RESAMPLED_AUDIO_DIRS[i].
train_list = [train[train['ebird_code'].str.startswith(('a',))],  # Remove 'b' to save space
              train[train['ebird_code'].str.startswith(('c', 'd', 'e', 'f'))],
              # The comma after 'g' matters: 'g' 'h' would concatenate to 'gh'
              # and drop every other g* and all h* species.
              train[train['ebird_code'].str.startswith(('g', 'h', 'i', 'j', 'k', 'l', 'm'))],
              train[train['ebird_code'].str.startswith(('n', 'o', 'p', 'q', 'r'))],
              train[train['ebird_code'].str.startswith(('s', 't', 'u', 'v', 'w', 'x', 'y', 'z'))]
             ]

In [None]:
from joblib import delayed, Parallel
import soundfile as sf
# Silence warnings for the batch run.
warnings.simplefilter("ignore")
# Group i of train_list is paired with resampled-audio directory i.
for i in range(1): ## Change to 5 for complete dataset (does not work on kaggle because HDD is too small)
    # (ebird_code, filename) pairs of the recordings in this group.
    train_audio_infos = train_list[i][["ebird_code", "filename"]].values.tolist()
    source_dir = TRAIN_RESAMPLED_AUDIO_DIRS[i]
    # The processing itself is left disabled here; uncomment to split every listed file in parallel.
    #Parallel(n_jobs=-1, verbose=5)(
    #     delayed(remove_silence_from_file)(ebird_code, file_name, source_dir) for ebird_code, file_name in train_audio_infos)
