# Birds Book - EDA

A collection of my observations regarding the data we have for this competition. The EDA may grow in the future.

In [None]:
from __future__ import annotations
from pathlib import Path
import json

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
from IPython.display import HTML
import torchaudio
import joblib
import torch
from joblib import Parallel, delayed
from torchaudio.transforms import Resample

from tqdm.notebook import tqdm
import multiprocessing


def inline(text, tag):
    """Display ``text`` in the notebook output wrapped in an HTML ``tag`` element.

    Args:
        text: Content placed between the opening and closing tag. It is
            inserted as-is, without HTML escaping.
        tag: Element name without angle brackets, e.g. ``"h3"``.
    """
    markup = f"<{tag}>{text}</{tag}>"
    ipd.display(HTML(markup))

In [None]:
import sys
# Extend the module search path before importing ``demo`` below.
# NOTE(review): ``demo`` presumably lives in this directory — confirm.
sys.path.append('../input/birdclef2022-code')

from demo import foobar

# Print the helper's result for two sample arguments.
print(foobar(0))
print(foobar(1))

In [None]:
# Import a second helper from ``demo`` and print its result.
from demo import answer
print(answer())

In [None]:
# Cache slot for the per-file audio metadata table. The training EDA cell only
# recomputes the table while this is None, so running this cell forces a rebuild.
info_df = None

In [None]:
# Root of the competition input data; every other path is derived from it.
root_dir = Path('/kaggle/input/birdclef-2022/')
# Directories holding the audio files.
training_dir = root_dir / 'train_audio'
test_dir = root_dir / 'test_soundscapes'

# Individual metadata files under the root directory.
training_path = root_dir / 'train_metadata.csv'
test_path = root_dir / 'test.csv'
submission_path = root_dir / 'submission.csv'
scored_birds_path = root_dir / 'scored_birds.json'

# Training data EDA

In [None]:
# Columns that are dropped before data frames are displayed in later cells.
too_long_columns = ['license', 'secondary_labels', 'author', 'type', 'url']

@delayed
def load_info(name):
    """Collect the audio metadata of one training file as a dict.

    Because of the ``@delayed`` decorator, the function is meant to be called
    inside a ``joblib.Parallel`` run rather than directly.

    Args:
        name: File name relative to ``training_dir``.

    Returns:
        The attributes reported by ``torchaudio.info`` plus three extra keys:
        ``seconds`` (``num_frames / sample_rate``), ``minutes`` and
        ``filename`` (the ``name`` argument).
    """
    metadata = vars(torchaudio.info(training_dir / name))
    duration_s = metadata['num_frames'] / metadata['sample_rate']
    metadata['seconds'] = duration_s
    metadata['minutes'] = duration_s / 60
    metadata['filename'] = name
    return metadata


# Labels listed in scored_birds.json; use the path constant defined with the
# other input paths instead of rebuilding it here.
with open(scored_birds_path) as fh:
    scored_birds = json.load(fh)


training_df = pd.read_csv(training_path)
inline("Training dataset", "h3")
ipd.display(training_df.head(3))
inline("Training information", "h3")
print(f"Training shape: {training_df.shape}")
print(f"Unique primary labels count: {training_df.primary_label.nunique()}")


# The audio metadata is only computed while no cached ``info_df`` exists, so
# re-running this cell skips the pass over all training files.
if info_df is None:
    file_infos = Parallel(n_jobs=-1)(load_info(path) for path in tqdm(training_df.filename))
    info_df = pd.DataFrame.from_records(file_infos)

inline("Audio files informations", "h3")
ipd.display(info_df.head(3))
# Attach the audio metadata to the training rows via the shared file name.
training_df = pd.merge(training_df, info_df, on='filename')

inline("Full training data", "h3")
ipd.display(training_df.head(3).drop(['license', 'url', 'author'], axis=1))

# Subset of the training rows whose primary label is in ``scored_birds``.
scored_df = training_df[training_df.primary_label.isin(scored_birds)].copy()

# Stack both frames with a ``dataset`` marker so plots can compare them by hue.
# Scored rows therefore appear twice in ``combined_df`` (once per marker).
training_df['dataset'] = 'training'
scored_df['dataset'] = 'scored'
combined_df = pd.concat([training_df, scored_df]).reset_index()

### Distribution of targets

There are some (huge) imbalances in the classes. It may be a problem to predict whether a given bird is on a recording, as some birds are simply not represented well enough. The question is: how much will we lose if we don't learn them correctly? Open question.

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
# Labels ordered by number of recordings (``value_counts`` sorts descending by
# default), so the bars go from the most to the least frequent label.
order = scored_df['primary_label'].value_counts().index
sns.countplot(data=scored_df, y='primary_label', ax=ax, order=order)

# The last five entries of ``order`` are the labels with the fewest recordings.
least_represented_df = scored_df[scored_df.primary_label.isin(order[-5:])]
inline("All rows of the least representative birds", "h4")
ipd.display(least_represented_df.drop(too_long_columns, axis=1))

## Distribution of rating

In [None]:
# Two side-by-side views of the rating column, split by the ``dataset`` marker.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

ax[0].set_title("KDE plot")
# common_norm=False: each dataset's density is normalized on its own.
sns.kdeplot(data=combined_df, x='rating', hue='dataset', common_norm=False, ax=ax[0])

ax[1].set_title("Boxen plot")
sns.boxenplot(data=combined_df, y='rating', x='dataset', ax=ax[1])

There are many recordings with a rating of 0.0. I would say that it is reasonable to remove these recordings from the training set, as:

> Garbage in, garbage out

Our models shouldn't learn from this noise, as it may hinder performance. We can even go further: I would say that it is safe to try training a model without the recordings rated under 1. That is because the rarest birds all have ratings above 1.0 (the smallest is 1.5, but I would keep it as this bird has only 2 samples).

**Cleaning Rule**

> Remove all samples with rating below 1.5 (smallest rating for a rare bird we care about)

## The distributions of the sound file information

In [None]:
inline("Sound file information statistics", "h4")
# Render two describe() tables next to each other: the inline display style lets
# the concatenated HTML of both tables share one row.
ipd.display_html(
    info_df.describe(percentiles=[.25, .5, .75, .95, .99, .999])
        .style.set_table_attributes("style='display:inline'")
        .set_caption('For full data')._repr_html_() 
    + 
    scored_df[info_df.columns].describe(percentiles=[.25, .5, .75, .95, .99, .999])
        .style.set_table_attributes("style='display:inline'")
        .set_caption('For scored only')._repr_html_(), 
    raw=True)

# One violin plot per column and per view: the first half of the axes shows all
# data on a log scale, the second half only recordings shorter than five minutes.
info_columns = ['num_frames', 'seconds']
fig, axs = plt.subplots(1, len(info_columns) * 2, figsize=(16, 4))
sub_5min_df = combined_df.query(f"seconds < {60*5}")

for ax, column in zip(axs[:len(info_columns)], info_columns):
    ax.set_title(f"All data")
    sns.violinplot(data=combined_df, x='dataset', y=column, ax=ax)
    # The y axis is switched to a log scale, which the label spells out.
    ax.set_ylabel(f"log({column})")
    ax.set_yscale('log')
        
for ax, column in zip(axs[len(info_columns):], info_columns):
    ax.set_title(f"Recordings under 5 min")
    sns.violinplot(data=sub_5min_df, x='dataset', y=column, ax=ax)
        
fig.tight_layout()

In [None]:
# NOTE(review): the heading says "Top 5", but the rows are selected by the
# ``minutes > 16`` threshold, so the number of rows shown depends on the data.
inline("Top 5 longest recordings", "h4")
ipd.display(training_df.query("minutes > 16").sort_values('minutes', ascending=False).drop(too_long_columns, axis=1))

I would say that for the training we can drop the really long files. Why? Because:

* a bigger file takes longer to load, and we have plenty of other data, so we don't really need these files
* above ~16.5 minutes there are no recordings of birds that we care about
* the problem with longer files is that we don't know at what point the bird is heard: we must predict the presence of a bird in 5-second windows, and we only know that the bird is heard somewhere in the recording (not in which time frame). This actually could be another topic, because if we somehow obtain a "silence" or "no bird" label then we could have a better model (needs more thinking).

**Cleaning Rule**

> Remove all samples longer than 17 minutes

# Training set cleanup

In [None]:
def drop_poor_quality_data(df, min_rating=1.5):
    """Return a copy of ``df`` restricted to rows rated at least ``min_rating``.

    Args:
        df: DataFrame with a numeric ``rating`` column.
        min_rating: Lowest rating that is kept (inclusive). Defaults to 1.5.

    Returns:
        A new DataFrame; ``df`` itself is not modified. Rows with a missing
        rating are dropped because the comparison is False for NaN.
    """
    return df[df.rating >= min_rating].copy()

def drop_long_recordings(df, max_minutes=17):
    """Return a copy of ``df`` restricted to recordings shorter than ``max_minutes``.

    Args:
        df: DataFrame with a numeric ``minutes`` column.
        max_minutes: Exclusive upper bound on the duration. Defaults to 17.

    Returns:
        A new DataFrame; ``df`` itself is not modified. Rows with a missing
        duration are dropped because the comparison is False for NaN.
    """
    return df[df.minutes < max_minutes].copy()

def clean(df, rules=None):
    """Apply cleaning rules to ``df`` in order and report how many rows were removed.

    Args:
        df: DataFrame to clean.
        rules: Optional iterable of callables, each taking a DataFrame and
            returning the filtered DataFrame. When omitted, the notebook's two
            rules are used: ``drop_poor_quality_data`` followed by
            ``drop_long_recordings``.

    Returns:
        The DataFrame produced by the last rule (``df`` itself when ``rules``
        is empty).
    """
    if rules is None:
        # Looked up at call time so the default always refers to the current
        # definitions of the two rule functions.
        rules = (drop_poor_quality_data, drop_long_recordings)

    initial_rows = df.shape[0]
    for rule in rules:
        df = rule(df)

    final_rows = df.shape[0]
    print(f"Rows before: {initial_rows}, rows after: {final_rows}, difference: {initial_rows - final_rows}")
    return df

In [None]:
# Apply the cleaning rules to both frames; ``clean`` prints the row counts
# before and after for each call.
clean_training_df = clean(training_df)
clean_scored_df = clean(scored_df)

inline("Clean training statistics", "h4")
ipd.display(clean_training_df.describe())

inline("Clean scored statistics", "h4")
ipd.display(clean_scored_df.describe())

# Example test set

The full test file will be provided during submission. One recording can require multiple predictions to be made using 5-second windows. As far as I know, there *should* be a single bird per file (or at least only a few birds per file). If that is true, we can average the scores (or, for example, take the max per bird). That is hard to say, because in the provided recording the bird is heard only at the end, so we may want to apply the model to each section (as a soundscape can have a bird in the 5-10 split, but no bird noises in the 10-15 split, etc.).

But as we are required to predict on 5-second windows, we can only create a network that does exactly that. Note that we need padding in case the sound is shorter than 5 seconds, as in the description:

>  These are each within a few milliseconds of 1 minute long and in the ogg audio format.

Then we need to pad, as a few milliseconds is not equal to 5 seconds. Also, we don't know whether the final window will contain exactly 5 seconds (perhaps not).

In [None]:
# Load the example test table and preview its first rows.
test_df = pd.read_csv(test_path)
# test_df.loc[1, 'bird'] = 'other'
ipd.display(test_df.head())

# Print the metadata of one example soundscape and create an audio player for
# it. As the last expression of the cell, the player is what the notebook renders.
test_ogg_path = test_dir / 'soundscape_453028782.ogg'
print(torchaudio.info(test_ogg_path))
ipd.Audio(test_ogg_path)

# Test set after submission

In [None]:
# t2 = test_df.copy()
def unique_birds(x):
    """Return the distinct values of ``x``.

    Used as a ``groupby(...).agg`` aggregator, where the function name becomes
    the name of the resulting column.
    """
    distinct = x.unique()
    return distinct

def unique_birds_count(x):
    """Return the number of distinct values in ``x`` (missing values are not counted).

    Used as a ``groupby(...).agg`` aggregator, where the function name becomes
    the name of the resulting column.
    """
    n_distinct = x.nunique()
    return n_distinct

# One row per test file: the distinct birds asked about and how many there are.
file_df = test_df.groupby('file_id')['bird'].agg([unique_birds, unique_birds_count])

# Gather audio metadata for every listed file that is actually present on disk.
file_infos = []
for file in file_df.index:
    path = test_dir / f'{file}.ogg'
    if not path.exists():
        continue
    info = vars(torchaudio.info(path))
    info.update(index=file, duration_s=info['num_frames'] / info['sample_rate'])
    file_infos.append(info)

# Only join when at least one file was found; otherwise ``file_df`` keeps just
# the two aggregate columns.
if file_infos:
    file_info_df = pd.DataFrame.from_records(file_infos).set_index('index')
    file_df = file_df.join(file_info_df)

# Files for which more than one distinct bird is listed.
anomaly_df = file_df[file_df['unique_birds_count'] > 1]

### Test dataframe

In [None]:
# Show the per-file table, followed by summary statistics of its numeric columns.
ipd.display(file_df)
print("Statistics")
ipd.display(file_df.describe())

### Test dataframe where there is more than one bird in the recording

In [None]:
# Show only the files with several distinct birds, followed by summary
# statistics of the numeric columns.
ipd.display(anomaly_df)
print("Statistics")
ipd.display(anomaly_df.describe())

# Dummy submission

With test for data loading

In [None]:
# Rebuild the scored subset from the current ``training_df``.
scored_df = training_df[training_df.primary_label.isin(scored_birds)].copy()

# Number of non-null ``type`` entries per scored label, sorted ascending.
lookup = scored_df.groupby('primary_label')['type'].count().sort_values()

# Split the sorted labels at the midpoint: the less frequent half is always
# predicted negative, the rest always positive. ``.iloc`` makes it explicit
# that the split is positional rather than label-based.
half = len(lookup) // 2
always_negative = set(lookup.iloc[:half].index)
always_positive = set(lookup.iloc[half:].index)

# Sanity check: every scored label landed in one of the two sets.
assert set(lookup.index) == (always_negative | always_positive)

In [None]:
def rechannel(recording, channels):
    """Convert ``recording`` to the requested number of channels.

    Supported conversions:

    * same channel count: the input tensor is returned unchanged;
    * fewer channels than the input: the leading ``channels`` channels are
      kept (no averaging);
    * mono input to any larger count: the single channel is repeated;
    * stereo to three channels: both channels plus their mean.

    Args:
        recording: Tensor whose first dimension indexes channels; a second
            dimension is required by the slicing below (assumed to be
            ``(channels, frames)`` — TODO confirm against callers).
        channels: Target number of channels.

    Returns:
        A tensor with ``channels`` entries along the first dimension.

    Raises:
        ValueError: If the conversion is not one of the supported ones,
            e.g. stereo to four channels or a target below one.
    """
    current_channels = recording.shape[0]
    if current_channels == channels:
        return recording

    # ``channels == 1`` is kept as its own case so that it behaves exactly as
    # before for every input, including an input without any channel.
    if channels == 1 or 1 < channels < current_channels:
        return recording[:channels, :]

    if channels > 1 and current_channels == 1:
        return torch.cat([recording] * channels)

    if channels == 3 and current_channels == 2:
        # The third channel is the per-frame mean of the two existing ones.
        return torch.cat([recording, recording.mean(dim=0, keepdim=True)])

    raise ValueError(f"Unupported target channels: {channels} and audio channels {current_channels}")
    

# for in_c, out_c in product(range(1, 4), repeat=2):
#     out = rechannel(torch.randn(in_c, 50), out_c)
#     assert out.shape[0] == out_c

In [None]:
# Dummy prediction: a row is positive exactly when its bird is in the
# ``always_positive`` set. ``isin`` does the membership test in one vectorized
# call instead of a Python-level loop.
test_df['target'] = test_df['bird'].isin(always_positive)
submission_df = test_df[['row_id', 'target']]
ipd.display(submission_df)
# Written to the working directory, without the index column.
submission_df.to_csv('submission.csv', index=False)

# Testing time required to load test set:


# for file_id in tqdm(test_df.file_id):
# files = [f+'.ogg' for f in test_df.file_id]

# files = list(test_dir.glob('*.ogg'))
# for file_id in tqdm(files):
#     audio, rate = torchaudio.load(test_dir / file_id)
#     resampler = Resample(rate, 1000)

#     audo = rechannel(audio, 2)
#     audio = resampler(audio)
    
#     del audio

# submission_df = pd.merge(test_df, file_df.reset_index(), on='file_id', how='outer')
# ipd.display(submission_df)
# submission_df.to_csv('submission.csv', index=False)