# Training data (Short audio)

The training data for this competition consists of a collection of so-called “focal recordings”. These recordings were made using semi-professional equipment (often using highly directional microphones) and primarily focus on one single species. All recordings were contributed by Xeno-canto (https://www.xeno-canto.org), one of the largest digital archives for bird sounds. Each recording comes with metadata specifying things like recording date, recording location, and (of course) the bird species that was recorded.

To get a better understanding of the metadata, let’s look at a few entries.

In [None]:
import pandas as pd
import os
import warnings
warnings.filterwarnings(action='ignore')
import IPython.display as ipd
import numpy as np
import folium
import matplotlib.pyplot as plt
import seaborn as sns
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
import geopandas as gpd
import geopy.distance
from shapely.geometry import Point

from datetime import datetime, timedelta

from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, LinearAxis, Range1d
from bokeh.models.tools import HoverTool
from bokeh.palettes import BuGn4
from bokeh.plotting import figure, output_notebook, show
from bokeh.transform import cumsum

output_notebook()

In [None]:
import time
import librosa
import sklearn
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from tqdm import tqdm
tqdm.pandas()

In [None]:
## configuring setup, constants and parameters
PATH_TRAIN = "../input/birdclef-2021/train_metadata.csv"
PATH_TRAIN_SOUNDSCAPE = "../input/birdclef-2021/train_soundscape_labels.csv"
PATH_TEST = "../input/birdclef-2021/test.csv"
PATH_TEST_RECORDING_DATE_LOC = "../input/birdclef-2021/test_soundscapes/test_set_recording_dates.csv"
#PATH_EBIRD_CLEMENTS = "../input/ebirdclements-checklist/eBird-Clements-v2019-integrated-checklist-August-2019.csv"
PATH_AUDIO = "../input/birdclef-2021/train_short_audio"

In [None]:
train = pd.read_csv(PATH_TRAIN,)
train.head()

In [None]:
print(" The data has ",train.shape[0]," rows")
print("\n The data has ",train.shape[1]," columns. \n The columns are: ",train.columns.values)

In [None]:
train.info()

In [None]:
# train["date"] = train["date"].astype('datetime64[ns]')

In [None]:
train.dtypes

## TODO Correct/Impute Date if anything is wrong.

In [None]:
train.groupby("date")["common_name"].count().reset_index()

In [None]:
train.dtypes

In [None]:
# Derive calendar and clock features from the raw "date" / "time" strings.

# "date" is split on "-"; pieces 0, 1 and 2 become year, month and day of month.
for piece_index, feature_name in enumerate(("year", "month", "day_of_month")):
    train[feature_name] = (
        train["date"]
        .apply(lambda stamp, piece_index=piece_index: stamp.split("-")[piece_index])
        .astype(int)
    )

# Text before the first ":" of "time"; anything non-numeric is coerced to NaN.
hour_text = train["time"].str.split(":", expand=True)[0]
train["hour"] = pd.to_numeric(hour_text, errors="coerce")

In [None]:
train["year"].value_counts()

In [None]:
# Replace the implausible year values seen in the value counts above with the
# substitutes chosen for them (0 -> 2015, 199 -> 1990, 201 -> 2010,
# 202 -> 2020, 2104 -> 2014).
# A single mapping on the column avoids chained assignment
# (`train["year"][mask] = ...`), which may write to a temporary copy and is a
# no-op under pandas copy-on-write.
train["year"] = train["year"].replace(
    {0: 2015, 199: 1990, 201: 2010, 202: 2020, 2104: 2014}
)

In [None]:
train["month"].value_counts()

In [None]:
# Month 0 is replaced with 6. Use .loc so the write hits `train` itself
# rather than a possibly temporary copy (chained assignment).
train.loc[train["month"] == 0, "month"] = 6

In [None]:
train["day_of_month"].value_counts()

In [None]:
# Day 0 is replaced with 16. Use .loc so the write hits `train` itself
# rather than a possibly temporary copy (chained assignment).
train.loc[train["day_of_month"] == 0, "day_of_month"] = 16

In [None]:
train

In [None]:
#Recreate train["date"] with imputed values
train['date'] = pd.to_datetime(pd.DataFrame({'year':train['year'],
                             'month':train['month'],
                             'day':train['day_of_month']}))

In [None]:
train['latitude'].value_counts()

In [None]:
train['longitude'].value_counts()

In [None]:
train.head()

In [None]:
train.dtypes



# Test data

If you’re already familiar with the training soundscapes, the hidden test set should not be a surprise. It contains 20 soundscape recordings of 10-minute duration for each of the four recording sites. Again, you need to predict audible species for 5-second chunks of the audio data. The submission file needs to contain the ID of the processed audio chunk  (fileID_site_time) and all audible species as a list of space-delimited eBird codes.

Let’s look at one example:

When analyzing file “*1234_SSW_20170101.ogg*” (that’s a mock filename), the audio chunk ending at second *00:00:35* of the entire file would have the unique ID “*1234_SSW_35*”. If your classifier thinks that species “bluwa1” and “redwa2” (again, mock codes) vocalize during this time, the final submission entry should look like this:

*1234_SSW_35 bluwa1 redwa2*

A submission for this file should include **ALL** segments, starting at 5 seconds. Like this:
 
*1234_SSW_5 nocall*  
*1234_SSW_10 bluwa1*  
*1234_SSW_15 nocall*  
*1234_SSW_20 bluwa1 redwa2*  
*1234_SSW_25 nocall*  
*1234_SSW_30 nocall* 

And so on...


Make yourself familiar with the training and test data, also make sure to check out our other notebooks, let us know if you have any comments and - of course - don’t hesitate to start a forum thread if you have any questions.

In [None]:
test_csv = pd.read_csv(PATH_TEST)
sample_sub= pd.read_csv("../input/birdclef-2021/sample_submission.csv")

In [None]:
test_csv.columns

In [None]:
test_csv.head()

In [None]:
sample_sub.head()

In [None]:
# df_ebird_clements = pd.read_csv(PATH_EBIRD_CLEMENTS, encoding = "ISO-8859-1")
# df_ebird_clements

In [None]:
#df_ebird_clements.columns

In [None]:
#Use this information later if required https://www.kaggle.com/jmreuter/a-birder-s-eye-view-of-the-metadata-with-taxonomy/

## TRAINING & TEST RECORDING LOCATIONS  

We can extract location data from txt located in the test_soundscapes folder.

In [None]:
# latitude, longitude
COL = ['COL', 5.57,-75.85,200,'Jardín, Departamento de Antioquia','Colombia']
COR = ['COR',10.12,-84.51,200,'Alajuela, San Ramón','Costa Rica']
SNE = ['SNE', 38.49,-119.95,200,'Sierra Nevada, California','USA'] 
SSW = ['SSW', 42.47,-76.45,200,'Ithaca, New York','USA']
alias = ['COL','COR','SNE','SSW']
columns = ['alias','latitude','longitude','size','location','country']
data = [COL, COR, SNE, SSW]

df_recording_loc = pd.DataFrame(data,columns=columns)
df_recording_loc

 ## Test recording location with date  
 Description of test site parameters, such as location and date.  
 Explore which birds were at the right place and in the right time to have a chance of making into a test set recording.
 All birds were likely to be observed at the test sites, but were they there at the right time?  
 
 Answer : Each recording site is assigned a circular region of 200 km in radius. If any of the birds were previously 
recorded in the same month as the test recordings on that site, the bird could potentially be present in test recordings.


Let's retrieve all months in which recordings took place in the **test set**. Together with location, let's check all species that were observed in the right place and at the right time.

In [None]:
# Load the test recording dates and derive the calendar month of each one.
test_recording_date_loc = pd.read_csv(PATH_TEST_RECORDING_DATE_LOC)

# Dates are stored as yyyymmdd; cast to str so the explicit format can parse them.
raw_dates = test_recording_date_loc["date"].astype(str)
test_recording_date_loc["date"] = pd.to_datetime(raw_dates, format="%Y%m%d")
test_recording_date_loc["month"] = test_recording_date_loc["date"].dt.month.astype(int)

test_recording_date_loc.head()

In [None]:
df_recording_loc.values

In [None]:
# Build, for every test site, the months in which test recordings were made
# plus the site's coordinates and search radius.
# (The loop variable is deliberately not called `row`, which would overwrite
# the `row` layout helper imported from bokeh.layouts.)
site_params = {}
for site_name, month in zip(test_recording_date_loc["site"], test_recording_date_loc["month"]):
    site_params.setdefault(site_name, set()).add(int(month))

# Freeze each month set into a sorted list so the displayed output is stable.
site_params = {
    site_name: {"months": sorted(months)}
    for site_name, months in site_params.items()
}

# Attach location data. A site listed in df_recording_loc but absent from the
# test recording dates raises KeyError here.
for site_row in df_recording_loc.itertuples(index=False):
    site_params[site_row.alias]["latlon"] = (site_row.latitude, site_row.longitude)
    # Radius compared against the geodesic distance in km by right_place_time.
    site_params[site_row.alias]["R"] = 200.0

site_params

In [None]:
site_params.items()

In [None]:
def right_place_time(lat, lon, month):
    """
    Check whether an observation falls within any test site's region and months.

    Reads the module-level ``site_params`` mapping, whose values provide
    ``"latlon"`` (site coordinates), ``"R"`` (radius in km) and ``"months"``
    (months in which test recordings were made at that site).

    Arguments:
        lat -- Latitude of the observation
        lon -- Longitude of the observation
        month -- Month of the observation

    Returns:
        bool -- True if, for at least one site, the month is one of the site's
        months and the geodesic distance to the site is below its radius
    """
    for params in site_params.values():
        # Cheap membership test first, so the geodesic distance is only
        # computed for sites whose months match.
        if month not in params["months"]:
            continue
        if geopy.distance.distance(params["latlon"], (lat, lon)).km < params["R"]:
            # One matching site is enough; no need to look at the others.
            return True

    return False

In [None]:
right_place_time(42.3005, -72.5877, 9)

In [None]:
right_place_time(42.47, -76.5877, 1)

In [None]:
right_place_time(42.47, -76.5877, 2)

In [None]:
train.columns

In [None]:
train.dtypes

In [None]:
train.head()

In [None]:
train["latitude"].isna().any()

In [None]:
train["hour"].isna().any()

In [None]:
train["right_place_time"] = train.progress_apply(lambda r: right_place_time(r['latitude'], r['longitude'], r["month"]), axis=1)

In [None]:
train.head()

In [None]:
train["right_place_time"].value_counts()

In [None]:
# The mean of the boolean column is the fraction of records flagged True.
print("Percentage of records within test sites at matching times of year: {:.2f}%".format(100 * train["right_place_time"].mean()))

In [None]:
print("Of {} species {} were observed within sites at the same time of year".format(train["primary_label"].nunique(), train[train["right_place_time"]]["primary_label"].nunique()))

## TRAINING SOUNDSCAPE : COUNTS INFORMATION  

Let's view the value counts difference between nocall and call labels in the training soundscape data.
Splitting the whole soundscape recording into 5 second segments, let's also see how many segments exist in each recording, in the training soundscape data.

In [None]:
# Training Given Environment Recordings
train_soundscape = pd.read_csv(PATH_TRAIN_SOUNDSCAPE)
train_soundscape.head()

In [None]:
train_soundscape.shape

In [None]:
#tdf0 = pd.read_csv('../input/birdclef-2021/train_metadata.csv')
val = train_soundscape.birds.value_counts() 
y = val.to_list() 
x = val.index.to_list()

print('CALLS vs NOCALLS INFO in All Recordings')
print('*****************************************')
print(f"Training Soundscape Identifiers: {train_soundscape[train_soundscape.birds!='nocall'].shape[0]}")
print(f"Training Soundscapes Nocalls: {train_soundscape[train_soundscape.birds=='nocall'].shape[0]}")

print('\nTRAINING SOUNDSCAPE RECORDINGS:')
print('***********************************')
train_soundscape.site.value_counts() # 2/4 TEST LOCATIONS

## 1. Primary species

Most importantly, the metadata specifies the audible species for each recording. The primary species annotation consists of three data fields: *primary_label*, *scientific_name*, and *common_name*. All labels have to be considered as “weak labels” since we do know which species is audible in the recording, but we do not know the exact timestamps of the vocalizations. Training with weakly labeled data is one of the core challenges of this competition.

Let’s look at the number of different species.

In [None]:
train['primary_label'].value_counts()

In [None]:
len(train['primary_label'].value_counts())

Our dataset contains recordings for **397** different *primary* species, all of them defined by their **eBird code** (the codes that we use as primary label). Just as Xeno-canto is a digital platform that collects audio recordings, eBird (https://ebird.org) is a citizen science project that collects observations of birds. eBird uses unique species codes to reference birds. You can access additional information on each bird species by combining the base URL “https://ebird.org/species/” with a species code from the *primary_label* columns of the metadata.

Here are a few examples:

Golden-crowned Kinglet: https://ebird.org/species/gockin  
Red-winged Blackbird: https://ebird.org/species/rewbla  
American Goldfinch: https://ebird.org/species/amegfi

Let’s take a look at the number of recordings for each species in the training data:

## Bird (or Species)  
primary_label is the target variable that needs to be predicted. Let's look at its distribution. There are two features with the same information: common_name is just a prettier (and complete) version of primary_label. Note that the values from primary_label are used for predictions.

In [None]:
df_bird = train.groupby("common_name")["filename"].count().reset_index().rename(columns = {"filename": "recordings"}).sort_values("recordings")

source = ColumnDataSource(df_bird)
tooltips = [
    ("Bird Species", "@common_name"),
    ("Recordings", "@recordings")
]

v = figure(plot_width = 1000, plot_height = 6000, y_range = df_bird.common_name.values, tooltips = tooltips, title = "Count of Bird Species")
v.hbar("common_name", right = "recordings", source = source, height = 0.75, color = "steelblue", alpha = 0.6)

v.xaxis.axis_label = "Count"
v.yaxis.axis_label = "Species"

show(v)
# df_bird.head()



## 2. Background species

The metadata for each recording lists the number of audible background species. The data field “*secondary_labels*” contains lists of eBird codes (i.e., primary labels) that recordists annotated. It is important to note that these lists might be incomplete, and you might be able to hear background species, although none are specified in the metadata. Therefore, lists of secondary labels are not very reliable, but they might still be useful for multi-label training (e.g., through loss masking for background species).

Let's look at some values:

In [None]:
train['secondary_labels'].value_counts()

We can see that the majority of recordings does not have an annotation of background species. Yet, it is highly likely that most of them actually contain one or more additional species. The data also shows us that the Red-winged Blackbird (rewbla), American Robin (amerob), House Sparrow (houspa), and Northern Cardinal (norcar) appear to be some of the most common background species.

**Please note, secondary labels only contain labels of species that are actually represented in the data set.**

## 3. Location, location, location

Each recording comes with a recording location specified in the metadata. Data fields “*latitude*” and “*longitude*” contain GPS coordinates as provided by the recordist. In combination with the recording data (data field “*date*”), this information can be very useful to map distribution and migration patterns. Why is it important? Not all birds occur at all locations at all times! 

Let's look at a few examples:


In [None]:
# Code adapted from: https://www.kaggle.com/andradaolteanu/birdcall-recognition-eda-and-audio-fe
# Make sure to check out the entire notebook. It's brilliant.



# SHP file
world_map = gpd.read_file("../input/world-shapefile/world_shapefile.shp")

# Coordinate reference system.
# The authority string replaces the deprecated {"init": "epsg:4326"} dict form.
crs = "EPSG:4326"

# Lat and Long need to be of type float, not object
species_list = ['norcar', 'houspa', 'wesblu', 'banana']
# .copy() so the casts below write to an independent frame, not a slice of train
data = train[train['primary_label'].isin(species_list)].copy()
data["latitude"] = data["latitude"].astype(float)
data["longitude"] = data["longitude"].astype(float)

# Create geometry (x = longitude, y = latitude)
geometry = [Point(xy) for xy in zip(data["longitude"], data["latitude"])]

# Geo Dataframe
geo_df = gpd.GeoDataFrame(data, crs=crs, geometry=geometry)

# Create ID for species: one row per species actually present in the subset
species_id = geo_df["primary_label"].value_counts().reset_index()
species_id.insert(0, 'ID', range(len(species_id)))

species_id.columns = ["ID", "primary_label", "count"]

# Add ID to geo_df
geo_df = geo_df.merge(species_id, how="left", on="primary_label")

# === PLOT ===
fig, ax = plt.subplots(figsize = (16, 10))
world_map.plot(ax=ax, alpha=0.4, color="grey")

# Iterate over the species found in the data (not over species_list), so a
# listed species without any recording cannot cause an out-of-range lookup or
# exhaust the palette.
palette = sns.hls_palette(len(species_id))
for species_number, species_code, colour in zip(species_id["ID"], species_id["primary_label"], palette):
    geo_df[geo_df["ID"] == species_number].plot(ax=ax,
                                                markersize=20,
                                                color=colour,
                                                marker="o",
                                                label=species_code)

ax.legend()

As we can see, different species occur over different spatial scales. According to the recording locations, the House Sparrow (houspa) has occurrences around the globe, the Northern Cardinal (norcar) appears to be a typical East coast species of the U.S., the Western Bluebird (wesblu) a West coast species. The Bananaquit (banana) seems to only occur in Central and South America. 

Location data can help us to create subsets of the training data for each of the four test data recording locations (which we will explore later). But be aware: The range of certain species may not be fully reflected by recording location data, and the actual range may differ from what we can see in the data. Yet, recording locations are a good starting point.

Please note that the training data only contains species that are likely to occur at the recording locations of the test data, even though sometimes the majority of the recordings were made in Europe. If you want to know more about the range of a certain species, please take a look at the associated eBird entry.



## 4. Time of the Recording

In [None]:
# Bar chart of the number of recordings per year.
plt.figure(figsize=(16, 6))
# Note: this reorders `train` by year and resets its index.
train = train.sort_values(['year']).reset_index(drop=True)
# Pass the column as `x=`: newer seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
ax = sns.countplot(x=train['year'], palette="hls")


plt.title("Audio Files Registration per Year Made", fontsize=16)
plt.xticks(rotation=90, fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
plt.xlabel("");

Majority of the data was registered between 2013 and 2020, and from March to July

0000, 0199, 0201, 0202, 2104 are likely wrong years

In [None]:
# Bar chart of the number of recordings per month.
plt.figure(figsize=(16, 6))

# Note: this reorders `train` by month and resets its index.
train = train.sort_values(['month']).reset_index(drop=True)
# Pass the column as `x=`: newer seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
ax = sns.countplot(x=train['month'], palette="hls")

plt.title("Audio Files Registration per Month Made", fontsize=16)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
plt.xlabel("");

Majority of the data was registered between March and July

A month value of 00 marks recordings whose month is not known

## Datetime  
Let's look at when these recordings were taken.

In [None]:
df_date = train.groupby("date")["common_name"].count().reset_index().rename(columns = {"common_name": "recordings"})
df_date.head(10)

In [None]:
df_date.date = pd.to_datetime(df_date.date, errors = "coerce")
df_date.dropna(inplace = True)
df_date["weekday"] = df_date.date.dt.day_name()
source_1 = ColumnDataSource(df_date)

## 5. Rating

Xeno-canto has a rating system for the quality of each recording. Ratings are assigned by users, and we adapted this rating scheme for the training data. In our case, ratings range from 0.5 to 5.0 (the latter being the best possible rating) and reflect the overall quality assigned by users and the number of background species. A value of “0” means that this particular recording does not have a rating; it therefore serves as the fallback value.

Let's see how rating values are distributed across the training data:

## Recording quality ratings by species

Here's the description for ratings from xeno-canto:

Use the following general guidelines when rating recordings on xeno-canto. Ratings are obviously subjective, and will inevitably vary slightly between different individuals, but these guidelines should improve consistency.  
  
A: Loud and Clear  
B: Clear, but bird a bit distant, or some interference with other sound sources  
C: Moderately clear, or quite some interference  
D: Faint recording, or much interference  
E: Barely audible  
Note that the A-E character classifications described on the xeno-canto website were sensibly converted to numeric classifications for the metadata, with 1 being the worst and 5 being the Best.

In [None]:
# Code adapted from https://www.kaggle.com/shahules/bird-watch-complete-eda-fe
# Again, make sure to check out the entire notebook.
import plotly.graph_objects as go

hist_data = train['rating'].values.tolist()
fig = go.Figure(data=[go.Histogram(x=hist_data)], 
                layout=go.Layout(margin=go.layout.Margin(l=0, r=0, b=10, t=50)))
fig.update_layout(title='Number of recordings per rating')

fig.show()

Overall, the training data contains high-quality recordings and the majority of samples is rated with 3.5 or higher. Whenever we had to limit the amount of recordings per species for the training data, we used the 500 top-rated samples. Sub-sampling training data based on user rating might help to extract high-quality training samples.

Other data fields of the metadata might be of value at some point during development, here is a brief description for each of them:

* **type**: Represents the type of the vocalization, with “song” and “call” as the most common. Excluding or including recordings of certain call types might help to diversify training data. Learn more about how and why birds vocalize here: https://academy.allaboutbirds.org/birdsong/

* **author**: Acknowledgement to the recordists who contributed the recording. Some recordists focus on specific subsets of species, so there might be some value in these data.

* **filename**: A reference to the sound file in the training data.

* **license**: All recordings have an open source license which is noted in this field. Make sure to respect the license when sharing the data.

* **time**: Time of recording as stated by the recordist. Might be of value to distinguish between birds that vocalize during the day and those which only vocalize during the night. Can be used to diversify the training data.

* **url**: A link to the original recording on Xeno-canto.



**Recordings**  
The main data is the audio files of the bird recordings. Let's hear the first sample from few of the species.

In [None]:
# df_bird_map = train[["primary_label", "common_name"]].drop_duplicates()

# for primary_label in os.listdir(PATH_AUDIO)[:20]:
#     species = df_bird_map[df_bird_map.primary_label == primary_label].common_name.values[0]
#     audio_file = os.listdir(f"{PATH_AUDIO}/{primary_label}")[0]
#     audio_path = f"{PATH_AUDIO}/{primary_label}/{audio_file}"
#     ipd.display(ipd.HTML(f"<h2>{primary_label} ({species})</h2>"))
#     ipd.display(ipd.Audio(audio_path))

# Training data (Soundscapes)

One of the major obstacles in this competition is the significant gap between training and test recordings. There is a distinct shift in acoustic domains between the two and it can be very challenging to train classifiers that generalize well enough to bridge the gap. Yet, training with target samples (i.e., soundscapes) is often not possible - somebody has to annotate the data for each new deployment, for each new recording location. However, we decided to include some examples of soundscape recordings (i.e., test recordings) that can be used for validation, or even for training. These 20 recordings represent 2 of the 4 test recording locations. Yet, they might not be 100% representative, some species might be missing and only audible in the hidden test set, recording equipment might differ. But they should nonetheless provide a good overview of what to expect in the hidden test data.

Let’s take a look at the label data for this set of recordings.

In [None]:
# Load the soundscape labels through the shared path constant instead of
# repeating the literal path.
train_soundscapes = pd.read_csv(PATH_TRAIN_SOUNDSCAPE)
train_soundscapes

In [None]:
train_soundscapes["site"].value_counts()

We can see a few data fields and here’s a brief description for each of them:

* **row_id**: Unique identifier of a 5-second segment of each soundscape file. Use this to create the submission file entry.

* **site**: Recording site of the soundscape data. In this competition, we included recordings from 4 different sites (COL = Colombia, COR = Costa Rica, SNE = Sierra Nevada, SSW = Sapsucker Woods). **Make sure to take a look at the “test_soundscape_metadata” which contains more information on each location**. Training soundscapes only represent two of the four locations (COR and SSW).

* **audio_id**: Identifier used to reference audio recordings. Filenames contain the file ID, recording site and recording date (yyyymmdd).

* **seconds**: End time of the 5-second segment for which this entry states the label. A value of 85 would mean that this particular segment starts at 00:01:20 and lasts until 00:01:25 of the audio file.

* **birds**: primary label (i.e., eBird code) of the audible species of this segment. “nocall” references a segment without any bird vocalization. Segments can have more than one bird, in that case, eBird codes are separated by space. “nocall” can never appear together with other codes.

Let’s look at the most common entries for “birds”:

In [None]:
train_soundscapes.shape, train_soundscapes.describe()

In [None]:
print(train_soundscapes['birds'].value_counts())

“Nocall” seems to be the most common, which is no surprise: Birds only vocalize occasionally during a recording. Yet, some recordings contain very dense acoustic scenes with multiple birds vocalizing at the same time. Why is “nocall” important? There’s a simple reason: Your classifier should be able to suppress false positives for these segments, which is important for ornithologists when confronted with the detections. One of the core challenges of this competition is to reduce the number of false positives (precision) without losing too many true positives (recall).

It is up to you if you use training soundscapes for validation (since they represent the hidden test set) or if you use annotated segments for training (to cope with the shift in acoustic domains). But be aware: Training with soundscape data for a few species might introduce unwanted biases when overfitting to one recording site.

In [None]:
train

In [None]:
print("Of {} species {} were observed within sites at the same time of year".format(train["primary_label"].nunique(), train[train["right_place_time"]]["primary_label"].nunique()))

In [None]:
list_of_test_site_birds = train[train["right_place_time"]]["primary_label"].unique()

In [None]:
list_of_test_site_birds, len(list_of_test_site_birds)

In [None]:
train.loc[train['primary_label'].isin(list_of_test_site_birds)]

In [None]:
train_filtered = train.loc[train['primary_label'].isin(list_of_test_site_birds)]

In [None]:
train_filtered.head()

In [None]:
train_filtered["primary_label"].nunique()

In [None]:
train_filtered[train_filtered.rating.isin([0, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])]["primary_label"].nunique()

In [None]:
train_filtered = train_filtered[train_filtered.rating.isin([0, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])]

In [None]:
hist_data = train_filtered['rating'].values.tolist()
fig = go.Figure(data=[go.Histogram(x=hist_data)], 
                layout=go.Layout(margin=go.layout.Margin(l=0, r=0, b=10, t=50)))
fig.update_layout(title='Number of recordings per rating')

fig.show()

In [None]:
df_bird = train_filtered.groupby("common_name")["filename"].count().reset_index().rename(columns = {"filename": "recordings"}).sort_values("recordings")

source = ColumnDataSource(df_bird)
tooltips = [
    ("Bird Species", "@common_name"),
    ("Recordings", "@recordings")
]

v = figure(plot_width = 1000, plot_height = 4000, y_range = df_bird.common_name.values, tooltips = tooltips, title = "Count of Bird Species")
v.hbar("common_name", right = "recordings", source = source, height = 0.75, color = "steelblue", alpha = 0.6)

v.xaxis.axis_label = "Count"
v.yaxis.axis_label = "Species"

show(v)

In [None]:
train_filtered["primary_label"].nunique()

In [None]:
!nvidia-smi

In [None]:
!pip install -q pysndfx SoundFile audiomentations pretrainedmodels efficientnet_pytorch resnest

In [None]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

import torch
from torch import nn, optim
from  torch.utils.data import Dataset, DataLoader

from resnest.torch import resnest50

from matplotlib import pyplot as plt

import os, random, gc
import re, time, json
from  ast import literal_eval


from IPython.display import Audio
from sklearn.metrics import label_ranking_average_precision_score

from tqdm.notebook import tqdm
import joblib

In [None]:
from efficientnet_pytorch import EfficientNet
import pretrainedmodels
import resnest.torch as resnest_torch

In [None]:
def seed_everything(seed=42):
    """
    Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Keyword Arguments:
        seed {int} -- Seed applied to every generator (default: {42})
    """
    random.seed(seed)
    # Only affects Python processes started after this point (e.g. workers);
    # the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not just the current device;
    # it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different kernels from run to run, so disable it.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
# NOTE(review): hard-coded class count; presumably has to match the number of
# entries in LABEL_IDS used for training — confirm.
NUM_CLASSES = 273
# Presumably the audio sample rate in Hz — confirm.
SR = 32_000
# Presumably the clip length in seconds (the mel dataset names contain "d7") — confirm.
DURATION = 7

MAX_READ_SAMPLES = 7 # NOTE(review): an earlier note said "10 melspecs at most" per record, but the value is 7 — confirm the intended cap; it can be raised on a high-memory runtime (e.g. Colab)
# Root folder of the competition data.
DATA_ROOT = Path("../input/birdclef-2021")
# Metadata CSVs and label-id JSON files of every "part?" mel-spectrogram dataset, in sorted order.
MEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/rich_train_metadata.csv"))
TRAIN_LABEL_PATHS = sorted(Path("../input").glob("kkiller-birdclef-mels-computer-d7-part?/LABEL_IDS.json"))

# Current working directory; presumably where model files are written — confirm.
MODEL_ROOT = Path(".")

In [None]:
MEL_PATHS, TRAIN_LABEL_PATHS

In [None]:
TRAIN_BATCH_SIZE = 50
TRAIN_NUM_WORKERS = 2

VAL_BATCH_SIZE = 64
VAL_NUM_WORKERS = 2

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device:", DEVICE)

In [None]:
temp = pd.read_csv(str('../input/kkiller-birdclef-mels-computer-d7-part1/rich_train_metadata.csv'), index_col=0)
temp.head()

In [None]:
def get_df(mel_paths=MEL_PATHS, train_label_paths=TRAIN_LABEL_PATHS):
    """
    Load and combine the mel-spectrogram metadata files and the label-id maps.

    Arguments:
        mel_paths -- Paths (pathlib.Path) of the metadata CSV files; each file's
            first column is used as the index
        train_label_paths -- Paths of JSON files holding label-to-id mappings

    Returns:
        tuple -- ``(LABEL_IDS, df)``: the merged mapping from all JSON files
        (later files override earlier keys) and the concatenated metadata with
        an added ``impath`` column and ``secondary_labels`` parsed from their
        string form into Python objects

    Raises:
        ValueError -- If ``mel_paths`` is empty
    """
    frames = []
    for file_path in mel_paths:
        temp = pd.read_csv(str(file_path), index_col=0)
        # The spectrogram of each row lives next to its metadata file under
        # audio_images/<primary_label>/<filename>.npy
        image_dir = file_path.parent / "audio_images"
        temp["impath"] = temp.apply(
            lambda row: image_dir / "{}/{}.npy".format(row.primary_label, row.filename),
            axis=1,
        )
        frames.append(temp)

    if not frames:
        raise ValueError("get_df: no metadata files were provided in mel_paths")

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0);
    # the original row indexes are kept, exactly as append did.
    df = pd.concat(frames)

    # secondary_labels is stored as the string form of a Python literal.
    df["secondary_labels"] = df["secondary_labels"].apply(literal_eval)

    LABEL_IDS = {}
    for file_path in train_label_paths:
        with open(str(file_path)) as f:
            LABEL_IDS.update(json.load(f))

    return LABEL_IDS, df

In [None]:
LABEL_IDS, df = get_df()

print(df.shape)
df.head()

In [None]:
df

In [None]:
train_filtered

In [None]:
# Create some time features (same steps as applied to `train` earlier)

df['year'] = df['date'].apply(lambda x: x.split('-')[0]).astype(int)
df['month'] = df['date'].apply(lambda x: x.split('-')[1]).astype(int)
df['day_of_month'] = df['date'].apply(lambda x: x.split('-')[2]).astype(int)
df["hour"] = pd.to_numeric(df.time.str.split(":", expand = True)[0], errors = "coerce")

# Repair implausible values. A column-level mapping and .loc writes replace
# the chained assignments (`df["year"][mask] = ...`), which may write to a
# temporary copy and are a no-op under pandas copy-on-write.
df["year"] = df["year"].replace(
    {0: 2015, 199: 1990, 201: 2010, 202: 2020, 2104: 2014}
)

df.loc[df["month"] == 0, "month"] = 6
df.loc[df["day_of_month"] == 0, "day_of_month"] = 16
# Recreate df["date"] with imputed values
df['date'] = pd.to_datetime(pd.DataFrame({'year':df['year'],
                             'month':df['month'],
                             'day':df['day_of_month']}))


In [None]:
df["right_place_time"] = df.progress_apply(lambda r: right_place_time(r['latitude'], r['longitude'], r["month"]), axis=1)

In [None]:
df["right_place_time"].value_counts()

In [None]:
df_filtered = df.loc[df['primary_label'].isin(list_of_test_site_birds)]

In [None]:
df_filtered["primary_label"].nunique()

In [None]:
# Keep unrated recordings (0) and ratings of 2.0 and above.
# .copy() makes df_filtered an independent frame, so columns added to it
# later are not writes to a slice of df.
df_filtered = df_filtered[df_filtered.rating.isin([0, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])].copy()

In [None]:
hist_data = df_filtered['rating'].values.tolist()
fig = go.Figure(data=[go.Histogram(x=hist_data)], 
                layout=go.Layout(margin=go.layout.Margin(l=0, r=0, b=10, t=50)))
fig.update_layout(title='Number of recordings per rating')

fig.show()


In [None]:
df_bird = df_filtered.groupby("common_name")["filename"].count().reset_index().rename(columns = {"filename": "recordings"}).sort_values("recordings")

source = ColumnDataSource(df_bird)
tooltips = [
    ("Bird Species", "@common_name"),
    ("Recordings", "@recordings")
]

v = figure(plot_width = 1000, plot_height = 4000, y_range = df_bird.common_name.values, tooltips = tooltips, title = "Count of Bird Species")
v.hbar("common_name", right = "recordings", source = source, height = 0.75, color = "steelblue", alpha = 0.6)

v.xaxis.axis_label = "Count"
v.yaxis.axis_label = "Species"

show(v)

In [None]:
LABEL_IDS = {label: label_id for label_id,label in enumerate(sorted(df_filtered["primary_label"].unique()))}
INV_LABEL_CODE = {val: key for key,val in LABEL_IDS.items()}

In [None]:
LABEL_IDS

In [None]:
LABEL_IDS['acafly']

In [None]:
import gc

# Unbind intermediate objects that are no longer used below, then ask the
# garbage collector to reclaim them right away.
del df, train, train_filtered, df_bird
gc.collect()

In [None]:
# Translate each primary label into its integer id via LABEL_IDS.
df_filtered["label_id"] = df_filtered['primary_label'].map(LABEL_IDS)

In [None]:
# Sanity check: smallest and largest assigned label id.
df_filtered["label_id"].min(), df_filtered["label_id"].max()

In [None]:
# Show the id next to the label it was derived from.
df_filtered[["label_id","primary_label"]]

In [None]:
# Renumber the rows 0..n-1 in place and discard the old index; the training
# code further down addresses rows of df_filtered by position.
df_filtered.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first rows of the filtered metadata.
df_filtered.head()

In [None]:
# Number of recordings assigned to each value of the "fold" column.
df_filtered["fold"].value_counts()

In [None]:
def get_model(name, num_classes=NUM_CLASSES):
    """
    Loads a pretrained model and replaces its final linear layer.
    Supports ResNest, ResNext-wsl, EfficientNet, ResNext and ResNet; any other
    name is looked up in ``pretrainedmodels``.

    Arguments:
        name {str} -- Name of the model to load

    Keyword Arguments:
        num_classes {int} -- Number of outputs of the new final layer
            (default: {NUM_CLASSES})

    Returns:
        torch model -- Pretrained model whose head outputs ``num_classes`` values

    Raises:
        ValueError -- If the loaded model has none of the head attributes
            ``fc``, ``_fc``, ``classifier`` or ``last_linear``
    """
    if "resnest" in name:
        # Weights come from a local file instead of a download.
        # map_location="cpu" makes loading independent of the device the
        # checkpoint was saved from; the caller moves the model afterwards.
        # NOTE(review): the file is the resnest50 checkpoint whatever `name`
        # is -- other resnest variants presumably fail in load_state_dict.
        pretrained_weights = torch.load(
            '../input/timm-resnest-weights/resnest50-528c19ca.pth',
            map_location="cpu",
        )
        model = getattr(resnest_torch, name)(pretrained=False)
        model.load_state_dict(pretrained_weights)
    elif "wsl" in name:
        model = torch.hub.load("facebookresearch/WSL-Images", name)
    elif name.startswith(("resnext", "resnet")):
        model = torch.hub.load("pytorch/vision:v0.6.0", name, pretrained=True)
    elif name.startswith("tf_efficientnet_b"):
        model = getattr(timm.models.efficientnet, name)(pretrained=True)
    elif "efficientnet-b" in name:
        model = EfficientNet.from_pretrained(name)
    else:
        model = pretrainedmodels.__dict__[name](pretrained='imagenet')

    # The libraries above name their final layer differently; swap the first
    # one that exists (same priority order as before) for a fresh nn.Linear.
    for head_attr in ("fc", "_fc", "classifier", "last_linear"):
        if hasattr(model, head_attr):
            nb_ft = getattr(model, head_attr).in_features
            setattr(model, head_attr, nn.Linear(nb_ft, num_classes))
            break
    else:
        # Returning the model untouched would keep its original output size
        # and only fail later, far away from the cause.
        raise ValueError(
            f"Cannot replace the classification head of model {name!r}: "
            "no fc, _fc, classifier or last_linear attribute found"
        )

    return model

In [None]:
# Load one precomputed audio-image array from disk and display its contents.
np.load("../input/kkiller-birdclef-mels-computer-d7-part1/audio_images/acafly/XC109605.ogg.npy")

In [None]:
# Length of the first axis of the same array.
len(np.load("../input/kkiller-birdclef-mels-computer-d7-part1/audio_images/acafly/XC109605.ogg.npy"))

In [None]:
def load_data(df, n_jobs=4):
    """
    Load the ``.npy`` array of every row of ``df`` into memory, in parallel.

    Arguments:
        df {pandas.DataFrame} -- Metadata with at least the columns
            ``filename`` and ``impath`` (path of the array to load)

    Keyword Arguments:
        n_jobs {int} -- Number of joblib workers (default: {4})

    Returns:
        dict -- Maps ``filename`` to the loaded array, truncated to its first
        ``MAX_READ_SAMPLES`` entries along axis 0. Rows sharing a filename
        collapse to one entry (the last one wins).
    """
    def load_row(row):
        return row.filename, np.load(str(row.impath))[:MAX_READ_SAMPLES]

    tasks = [joblib.delayed(load_row)(row) for row in df.itertuples(index=False)]
    # tqdm wraps the task list, so the bar advances as tasks are handed to joblib.
    loaded = joblib.Parallel(n_jobs)(tqdm(tasks))
    return dict(loaded)

In [None]:
# Write the filtered, enriched metadata to CSV (without the index column) so it
# can be reused at inference time.
df_filtered.to_csv("train_metadata_filtered_rich.csv", index=False)

In [None]:
# Cache every training array in memory up front so that training does not
# have to read from disk again.

audio_image_store = load_data(df_filtered)
# Number of distinct filenames that were loaded.
len(audio_image_store)

In [None]:
# Shape of the first cached array, then a plot of its entry at index 0.
# NOTE(review): `lbd` is presumably an alias for librosa.display -- confirm.
print("shape:", next(iter(audio_image_store.values())).shape)
lbd.specshow(next(iter(audio_image_store.values()))[0])

In [None]:
# Same array as above, this time plotting its entry at index 1.
print("shape:", next(iter(audio_image_store.values())).shape)
lbd.specshow(next(iter(audio_image_store.values()))[1])

In [None]:
class BirdClefDataset(Dataset):
    """
    Dataset serving one randomly chosen cached image per recording.

    Each item is a pair ``(image, target)``:
      * ``image`` -- one entry of the recording's cached array, scaled by
        1/255 and repeated three times along a new leading axis.
      * ``target`` -- float32 vector of length ``num_classes`` holding 0.0025
        everywhere except 0.995 at the row's ``label_id``.

    Arguments:
        audio_image_store {dict} -- Maps filename to an indexable array of images
        meta {pandas.DataFrame} -- Rows with ``filename`` and ``label_id``

    Keyword Arguments:
        sr {int} -- Sample rate (default: {SR})
        is_train {bool} -- Stored on the instance; not used by ``__getitem__``
            (default: {True})
        num_classes {int} -- Length of the target vector (default: {NUM_CLASSES})
        duration {int} -- Clip duration (default: {DURATION})
    """

    def __init__(self, audio_image_store, meta, sr=SR, is_train=True, num_classes=NUM_CLASSES, duration=DURATION):

        self.audio_image_store = audio_image_store
        # Private copy with a 0..n-1 index so positional lookups are safe.
        self.meta = meta.copy().reset_index(drop=True)
        self.sr = sr
        self.is_train = is_train
        self.num_classes = num_classes
        self.duration = duration
        self.audio_length = self.duration*self.sr

    @staticmethod
    def normalize(image):
        """Scale ``image`` by 1/255 as float32 and stack three copies on axis 0."""
        image = image.astype("float32", copy=False) / 255.0
        image = np.stack([image, image, image])
        return image

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        row = self.meta.iloc[idx]
        image = self.audio_image_store[row.filename]

        # Draw the index from torch's RNG rather than NumPy's: DataLoader
        # seeds torch separately in every worker process, whereas the NumPy
        # global state can be duplicated across workers (and across epochs),
        # which makes them pick identical "random" segments.
        segment = torch.randint(len(image), (1,)).item()
        image = self.normalize(image[segment])

        t = np.zeros(self.num_classes, dtype=np.float32) + 0.0025 # Label smoothing
        t[row.label_id] = 0.995

        return image, t

In [None]:
# Build a dataset over the whole filtered metadata to try it out interactively.
ds = BirdClefDataset(audio_image_store, meta=df_filtered, sr=SR, duration=DURATION, is_train=True)
# Number of items, i.e. number of metadata rows.
len(ds)

In [None]:
# Display the dataset object itself.
ds

In [None]:
# Fetch one random item and show the image shape, the target shape and the
# index of the entry above 0.5 (the labelled class).
x, y = ds[np.random.choice(len(ds))]
# x, y = ds[0]
x.shape, y.shape, np.where(y >= 0.5)

In [None]:
# First five entries of the smoothed target vector.
y[:5]

In [None]:
# Plot the first of the three stacked copies of the sampled image.
lbd.specshow(x[0])

In [None]:
def one_step(xb, yb, net, criterion, optimizer, scheduler=None):
  """
  Run a single optimisation step on one batch and compute batch metrics.

  Arguments:
      xb -- Input batch (moved to DEVICE here)
      yb -- Target batch (moved to DEVICE here)
      net -- Model to train
      criterion -- Loss applied to the raw model outputs and ``yb``
      optimizer -- Optimizer stepped once

  Keyword Arguments:
      scheduler -- Optional LR scheduler stepped once after the update
          (default: {None})

  Returns:
      tuple -- (loss, lrap, f1, recall, precision) for this batch
  """
  inputs = xb.to(DEVICE)
  targets = yb.to(DEVICE)

  optimizer.zero_grad()
  logits = net(inputs)
  loss = criterion(logits, targets)
  loss.backward()
  optimizer.step()

  # Metrics only: no gradients needed from here on.
  with torch.no_grad():
    loss_value = loss.item()

    probs = logits.sigmoid()
    # Smoothed targets are turned back into hard 0/1 labels.
    hard_targets = (targets > 0.5) * 1.0
    lrap = label_ranking_average_precision_score(hard_targets.cpu().numpy(), probs.cpu().numpy())

    hard_preds = (probs > 0.5) * 1.0
    hits = (hard_preds * hard_targets).sum()

    # 1e-6 keeps the ratios defined when there are no predictions / positives.
    prec = hits / (1e-6 + hard_preds.sum())
    rec = hits / (1e-6 + hard_targets.sum())
    f1 = 2 * prec * rec / (1e-6 + prec + rec)

  if scheduler is not None:
    scheduler.step()

  return loss_value, lrap, f1.item(), rec.item(), prec.item()

In [None]:
@torch.no_grad()
def evaluate(net, criterion, val_laoder):
    """
    Score ``net`` on a whole validation loader.

    The model is switched to eval mode and is not switched back. All outputs
    and targets are gathered on DEVICE first, then the loss and the metrics
    are computed once over the concatenated tensors.

    Arguments:
        net -- Model to evaluate
        criterion -- Loss applied to the raw outputs and the targets
        val_laoder -- Iterable of (inputs, targets) batches supporting ``len``

    Returns:
        tuple -- (loss, lrap, f1, recall, precision)
    """
    net.eval()

    logit_batches, target_batches = [], []
    for xb, yb in tqdm(val_laoder, leave=False, total=len(val_laoder)):
        target_batches.append(yb.to(DEVICE))
        logit_batches.append(net(xb.to(DEVICE)))

    targets = torch.cat(target_batches)
    logits = torch.cat(logit_batches)

    loss_value = criterion(logits, targets).item()

    probs = logits.sigmoid()
    # Smoothed targets are turned back into hard 0/1 labels.
    hard_targets = (targets > 0.5) * 1.0

    lrap = label_ranking_average_precision_score(hard_targets.cpu().numpy(), probs.cpu().numpy())

    hard_preds = (probs > 0.5) * 1.0
    hits = (hard_preds * hard_targets).sum()

    # 1e-6 keeps the ratios defined when there are no predictions / positives.
    prec = (hits / (1e-6 + hard_preds.sum())).item()
    rec = (hits / (1e-6 + hard_targets.sum())).item()
    f1 = 2 * prec * rec / (1e-6 + prec + rec)

    return loss_value, lrap, f1, rec, prec


In [None]:
def one_epoch(net, criterion, optimizer, scheduler, train_laoder, val_laoder):
  """
  Train for one pass over ``train_laoder`` and then validate.

  ``one_step`` is called without a scheduler, so ``scheduler`` advances
  exactly once per epoch, after the last training batch.

  Returns:
      tuple -- Five (train, validation) pairs in the order
      loss, lrap, f1, recall, precision. Training values are the mean of the
      per-batch values.
  """
  net.train()
  epoch_bar = tqdm(train_laoder, leave=False)

  # Running sums, in the order one_step returns them: loss, lrap, f1, rec, prec.
  totals = [0., 0., 0., 0., 0.]
  icount = 0

  for xb, yb in epoch_bar:
    step_stats = one_step(xb, yb, net, criterion, optimizer)
    totals = [total + value for total, value in zip(totals, step_stats)]
    icount += 1

    # Refresh the running averages on the bar every 10 batches.
    if icount % 10 == 0 and hasattr(epoch_bar, "set_postfix"):
      l, lrap, f1, rec, prec = (total / icount for total in totals)
      epoch_bar.set_postfix(
        loss="{:.6f}".format(l),
        lrap="{:.3f}".format(lrap),
        prec="{:.3f}".format(prec),
        rec="{:.3f}".format(rec),
        f1="{:.3f}".format(f1),
      )

  scheduler.step()

  train_means = [total / icount for total in totals]
  val_stats = evaluate(net, criterion, val_laoder)

  # Pair every training mean with the validation value of the same metric.
  return tuple(zip(train_means, val_stats))

In [None]:
class AutoSave:
  """
  Keep the ``top_k`` best checkpoints of a run on disk, plus a JSON log.

  "Best" always means the highest value of ``metrics[metric]``: the tracked
  values are kept in ascending order and the smallest one is evicted first.
  ``mode`` is stored on the instance but is not consulted by the ranking.

  Arguments:
      top_k {int} -- Number of checkpoints to keep (default: {2})
      metric {str} -- Key of the metrics dict used for ranking (default: {"f1"})
      mode {str} -- Stored only (default: {"min"})
      root -- Existing output directory; falls back to MODEL_ROOT (default: {None})
      name {str} -- Prefix of checkpoint and log file names (default: {"ckpt"})
  """

  def __init__(self, top_k=2, metric="f1", mode="min", root=None, name="ckpt"):
    self.top_k = top_k
    self.logs = []
    self.metric = metric
    self.mode = mode
    self.root = Path(root or MODEL_ROOT)
    assert self.root.exists()
    self.name = name

    # Parallel lists, both ordered from worst (index 0) to best.
    self.top_models = []
    self.top_metrics = []

  def log(self, model, metrics):
    """Record ``metrics`` (must contain ``self.metric`` and "epoch") and save
    ``model`` if it ranks among the current top_k."""
    metric = metrics[self.metric]
    rank = self.rank(metric)

    self.top_metrics.insert(rank+1, metric)
    if len(self.top_metrics) > self.top_k:
      self.top_metrics.pop(0)

    self.logs.append(metrics)
    self.save(model, metric, rank, metrics["epoch"])

  def save(self, model, metric, rank, epoch):
    """Write the checkpoint if it made the top_k, drop the one it displaced,
    and refresh the JSON log."""
    t = time.strftime("%Y%m%d%H%M%S")
    name = "{}_epoch_{:02d}_{}_{:.04f}_{}".format(self.name, epoch, self.metric, metric, t)
    name = re.sub(r"[^\w_-]", "", name) + ".pth"
    path = self.root.joinpath(name)

    self.top_models.insert(rank+1, name)
    evicted = None
    if len(self.top_models) > self.top_k:
      evicted = self.top_models.pop(0)

    # When the new checkpoint is itself the one that falls off the list,
    # writing it would only be followed by deleting it again -- skip both.
    if evicted != name:
      torch.save(model.state_dict(), path.as_posix())
      if evicted is not None:
        self.root.joinpath(evicted).unlink()

    self.to_json()

  def rank(self, val):
    """Return the index of the last tracked value strictly below ``val``
    (-1 if there is none); ``val`` is inserted right after that index."""
    for position, top_val in enumerate(self.top_metrics):
      if val <= top_val:
        return position - 1

    return len(self.top_metrics) - 1

  def to_json(self):
    """Dump every logged metrics dict to ``<name>_logs.json`` in ``root``."""
    name = "{}_logs".format(self.name)
    name = re.sub(r"[^\w_-]", "", name) + ".json"
    path = self.root.joinpath(name)

    with path.open("w") as f:
      json.dump(self.logs, f, indent=2)

In [None]:
def one_fold(model_name, fold, train_set, val_set, epochs=20, save=True, save_root=None):
  """
  Train and validate one model on a single train/validation split.

  Arguments:
      model_name {str} -- Name passed to get_model
      fold -- Fold identifier, used only in the checkpoint name
      train_set -- Positional row indices of df_filtered used for training
      val_set -- Positional row indices of df_filtered used for validation

  Keyword Arguments:
      epochs {int} -- Number of epochs, also T_max of the cosine schedule (default: {20})
      save {bool} -- Whether to log metrics and checkpoints through AutoSave (default: {True})
      save_root -- Output directory; falls back to MODEL_ROOT (default: {None})
  """
  # Pick the fallback before building the Path: Path(None) raises TypeError,
  # so the former `Path(save_root) or MODEL_ROOT` crashed for the default.
  save_root = Path(save_root or MODEL_ROOT)

  saver = AutoSave(root=save_root, name=f"birdclef_{model_name}_fold{fold}", metric="f1_val")

  net = get_model(model_name).to(DEVICE)

  criterion = nn.BCEWithLogitsLoss()

  optimizer = optim.Adam(net.parameters(), lr=8e-4)
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=epochs)

  train_data = BirdClefDataset(audio_image_store, meta=df_filtered.iloc[train_set].reset_index(drop=True),
                           sr=SR, duration=DURATION, is_train=True)
  train_laoder = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, num_workers=TRAIN_NUM_WORKERS, shuffle=True, pin_memory=True)

  val_data = BirdClefDataset(audio_image_store, meta=df_filtered.iloc[val_set].reset_index(drop=True),  sr=SR, duration=DURATION, is_train=False)
  val_laoder = DataLoader(val_data, batch_size=VAL_BATCH_SIZE, num_workers=VAL_NUM_WORKERS, shuffle=False)

  epochs_bar = tqdm(list(range(epochs)), leave=False)
  for epoch in epochs_bar:
    epochs_bar.set_description(f"--> [EPOCH {epoch:02d}]")
    net.train()

    (l, l_val), (lrap, lrap_val), (f1, f1_val), (rec, rec_val), (prec, prec_val) = one_epoch(
        net=net,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        train_laoder=train_laoder,
        val_laoder=val_laoder,
      )

    # "(train, validation)" text per metric, formatted once and reused for
    # both the progress bar and the printed summary line.
    shown = {
        "loss": "({:.6f}, {:.6f})".format(l, l_val),
        "prec": "({:.3f}, {:.3f})".format(prec, prec_val),
        "rec": "({:.3f}, {:.3f})".format(rec, rec_val),
        "f1": "({:.3f}, {:.3f})".format(f1, f1_val),
        "lrap": "({:.3f}, {:.3f})".format(lrap, lrap_val),
    }

    epochs_bar.set_postfix(**shown)

    print(
        "[{epoch:02d}] loss: {loss} lrap: {lrap} f1: {f1} rec: {rec} prec: {prec}".format(
            epoch=epoch, **shown
        )
    )

    if save:
      metrics = {
          "loss": l, "lrap": lrap, "f1": f1, "rec": rec, "prec": prec,
          "loss_val": l_val, "lrap_val": lrap_val, "f1_val": f1_val, "rec_val": rec_val, "prec_val": prec_val,
          "epoch": epoch,
      }

      saver.log(net, metrics)

In [None]:
def train(model_name, epochs=20, save=True, n_splits=5, seed=177, save_root=None, suffix="", folds=None):
  """
  Train ``model_name`` on the folds defined by the "fold" column of df_filtered.

  For every fold value, the rows carrying that value form the validation set
  and all remaining rows form the training set.

  Arguments:
      model_name {str} -- Name passed on to one_fold / get_model

  Keyword Arguments:
      epochs {int} -- Epochs per fold (default: {20})
      save {bool} -- Forwarded to one_fold (default: {True})
      n_splits {int} -- Accepted but not used here (default: {5})
      seed {int} -- Accepted but not used here (default: {177})
      save_root -- Output directory, str or Path; defaults to
          MODEL_ROOT/"<model_name><suffix>" (default: {None})
      suffix {str} -- Appended to the default directory name (default: {""})
      folds -- Fold values to run; None or empty runs every fold (default: {None})
  """
  gc.collect()
  torch.cuda.empty_cache()

  # Accept a plain string as well: .mkdir below needs a Path, and one_fold
  # already takes either type.
  save_root = Path(save_root) if save_root else MODEL_ROOT/f"{model_name}{suffix}"
  save_root.mkdir(exist_ok=True, parents=True)

  # One (fold value, list of row indices) pair per fold.
  fold_bar = tqdm(df_filtered.reset_index().groupby("fold").index.apply(list).items(), total=df_filtered.fold.max()+1)

  for fold, val_set in fold_bar:
      if folds and fold not in folds:
        continue

      print(f"\n############################### [FOLD {fold}]")
      fold_bar.set_description(f"[FOLD {fold}]")
      # Everything that is not in the validation fold is used for training.
      train_set = np.setdiff1d(df_filtered.index, val_set)

      one_fold(model_name, fold=fold, train_set=train_set , val_set=val_set , epochs=epochs, save=save, save_root=save_root)

      gc.collect()
      torch.cuda.empty_cache()

In [None]:
# Architectures to train; each name is handed to train() by the loop below.
MODEL_NAMES = [
      "efficientnet-b5",
]

In [None]:
# Train every configured architecture on fold 0 only.
for model_name in MODEL_NAMES:
  print("\n\n###########################################", model_name.upper())
  try:
    train(model_name, epochs=20, suffix=f"_sr{SR}_d{DURATION}_v1_v1", folds=[0])
  except Exception as e:
    # Abort the run, but say which model failed; the original error stays
    # attached as the cause.
    raise ValueError(f"Training failed for model {model_name!r}") from e