In [None]:
!pip install chart_studio

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
%matplotlib inline

# Preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import datetime as dt
from datetime import datetime   

# Visualisation libraries
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.figure_factory as ff
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# Plot styling shared by the matplotlib/seaborn figures in this notebook.
plt.style.use('fivethirtyeight')


import os
from collections import Counter

# List the attached input files. Non-audio files are printed in full, but mp3
# recordings are only counted per dataset folder: printing one line per
# recording would bury the notebook under tens of thousands of paths.
INPUT_ROOT = '/kaggle/input'
mp3_counts = Counter()
for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        if filename.lower().endswith('.mp3'):
            # First path component below INPUT_ROOT identifies the dataset.
            dataset = os.path.relpath(full_path, INPUT_ROOT).split(os.sep)[0]
            mp3_counts[dataset] += 1
        else:
            print(full_path)
for dataset, n_files in sorted(mp3_counts.items()):
    print(f'{dataset}: {n_files} mp3 files')

import warnings
# Silences every warning for the rest of the session, deprecation notices
# from the plotting and audio libraries included.
warnings.filterwarnings('ignore')

# Improving the accuracy of soundscape analyses.

## Objective

It is often easier to hear birds than see them. With proper sound detection and classification, researchers could automatically intuit factors about an area’s quality of life based on a changing bird population. The objective of the competition, as stated on the competition page, is to identify a wide variety of bird vocalizations in soundscape recordings. However, the recordings are complex and may contain anthropogenic sounds (e.g., airplane overflights) or other bird and non-bird (e.g., chipmunk) calls in the background, with a particular labeled bird species in the foreground.

## Understanding the Evaluation Metric

The metric in this competition is the row-wise micro-averaged F1 score. The F1 score, or F-measure, is a measure of a test’s accuracy.

The F score is defined as the weighted harmonic mean of the test’s precision and recall.
![](https://imgur.com/nC4QwrO.png)


## Data
Following files have been provided to the participants:

* `train_audio` : Training data consisting of short recordings
* `train.csv` : metadata for the training data
* `test_audio` : The hidden test set audio consists of approximately 150 recordings in mp3 format, each roughly 10 minutes long.
* `test.csv` : metadata for the test set. It is important to note that only the first three rows are available for download; the full test.csv is in the hidden test set.

## Exploring the Training metadata

To begin with let's explore the training metadata file to gather some information

In [None]:
# Read the competition's training metadata and preview the first rows.
train = pd.read_csv('../input/birdsong-recognition/train.csv')
train.head()

In [None]:
train.info()

The file contains a lot of columns but we shall focus on some of the ones which are directly related to our problem

## e-bird code

a code for the bird species. You can review detailed information about the bird codes by appending the code to https://ebird.org/species/, such as https://ebird.org/species/amecro for the American Crow.

In [None]:
# Number of distinct eBird codes present in the training metadata.
train['ebird_code'].nunique()

In [None]:
# Build one eBird species-page URL per code, ordered as value_counts() returns
# them (most frequently recorded code first).
# The codes get a descriptive name instead of `x`, because `x` is reused
# further down for the audio samples.
ebird_codes = train['ebird_code'].value_counts().index.to_list()
e_code_path = 'https://ebird.org/species/'
species = [e_code_path + code for code in ebird_codes]


Let's check out a few

In [None]:
from IPython.display import IFrame
IFrame(species[0], width=800, height=450)


In [None]:
IFrame(species[100], width=800, height=450)


In [None]:
IFrame(species[200], width=800, height=450)

## Recordist

Recordist is the user who provided the recordings.

In [None]:
# Total number of people who provided the recordings
train['recordist'].nunique()

In [None]:
# Ten recordists with the most recordings, re-sorted ascending for the
# horizontal bar chart.
top_recordists = train['recordist'].value_counts().head(10)
top_recordists.sort_values().iplot(kind='barh', color='#3780BF')

## Playback_used

Whether playback has been used or not.

In [None]:
# Replace missing playback flags with an explicit category. The result is
# assigned back to the column: fillna(..., inplace=True) on `train[col]` acts
# on an intermediate object and is not guaranteed to update `train` (pandas
# warns about this chained assignment, and it has no effect under
# copy-on-write).
train['playback_used'] = train['playback_used'].fillna('Not Defined')
train['playback_used'].value_counts()

In [None]:
# Share of recordings per playback category, drawn as a pie chart.
playback_counts = train['playback_used'].value_counts()
labels = playback_counts.index
values = playback_counts.values
colors = ['#3795bf', '#bfbfbf']

pie = go.Pie(labels=labels, values=values, textinfo='label+percent',
             insidetextorientation='radial', marker=dict(colors=colors))
fig = go.Figure(data=[pie])
fig.show()

## Ratings

In [None]:
train['rating'].value_counts().iplot(kind='bar',color='#3780BF')

## Date of recordings

In [None]:
# Convert the date strings to datetime64 in one vectorised call; values that
# do not match the format become NaT (errors='coerce').
train['date'] = pd.to_datetime(train['date'], format='%Y-%m-%d', errors='coerce')
# Recordings per calendar day, in chronological order.
ax = train['date'].value_counts().sort_index().plot(figsize=(12, 8))
ax.set(xlabel='Recording date', ylabel='Number of recordings');

## Countries where recordings have been made

In [None]:
%%HTML
<div class='tableauPlaceholder' id='viz1592397692077' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Bi&#47;Birds_15923974075490&#47;Dashboard1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Birds_15923974075490&#47;Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Bi&#47;Birds_15923974075490&#47;Dashboard1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1592397692077');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.minWidth='420px';vizElement.style.maxWidth='650px';vizElement.style.width='100%';vizElement.style.minHeight='587px';vizElement.style.maxHeight='887px';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.minWidth='420px';vizElement.style.maxWidth='650px';vizElement.style.width='100%';vizElement.style.minHeight='587px';vizElement.style.maxHeight='887px';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';} else { vizElement.style.width='100%';vizElement.style.height='727px';}                     var scriptElement = document.createElement('script');   
                 scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

## Species

In [None]:
# Total number of unique species in the dataset
print(train['species'].nunique())


In [None]:
train['species'].value_counts()

In [None]:
train['species'].value_counts().iplot()

## Distribution of Recorded Species

In [None]:
%%HTML
<div class='tableauPlaceholder' id='viz1592442148007' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;ZN&#47;ZNDRZCHNN&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='path' value='shared&#47;ZNDRZCHNN' /> <param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;ZN&#47;ZNDRZCHNN&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1592442148007');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.minWidth='420px';vizElement.style.maxWidth='650px';vizElement.style.width='100%';vizElement.style.minHeight='587px';vizElement.style.maxHeight='887px';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.minWidth='420px';vizElement.style.maxWidth='650px';vizElement.style.width='100%';vizElement.style.minHeight='587px';vizElement.style.maxHeight='887px';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';} else { vizElement.style.width='100%';vizElement.style.height='727px';}                     var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    
vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

## Additional Datasets

It is interesting to note here that the number of training recordings per species has been limited to 100 to keep the dataset at a reasonable size. However, an external dataset is available, and [Rohan Rao](https://www.kaggle.com/rohanrao) has been kind enough to extract it and share it in usable form with the community.

The remaining recordings (with some exclusions) have been published as datasets (split in two by first letter due to Kaggle's 20GB size limitation):
http://www.kaggle.com/rohanrao/xeno-canto-bird-recordings-extended-a-m
http://www.kaggle.com/rohanrao/xeno-canto-bird-recordings-extended-n-z

Please read [this discussion post](https://www.kaggle.com/c/birdsong-recognition/discussion/159970) for more information.

### Analysing the external datasets

In [None]:
# Metadata for the extended xeno-canto recordings.
# NOTE(review): only the CSV shipped with the "a-m" dataset is read here;
# confirm it also lists the species stored in the "n-z" dataset, otherwise
# that dataset's CSV needs to be loaded and concatenated as well.
TRAIN_EXT_PATH = "../input/xeno-canto-bird-recordings-extended-a-m/train_extended.csv"
train_ext = pd.read_csv(TRAIN_EXT_PATH)
train_ext.head()

In [None]:
len(train_ext['ebird_code'].value_counts())

In [None]:
len(train_ext)

The original train data has 21,375 recordings and the extended train data has an additional 22,015 recordings for 253 out of 264 species which more than doubles the training data size.

In [None]:
# Number of recordings per species in the original and the extended metadata.
df_original = (
    train.groupby("species")["filename"].count()
    .reset_index()
    .rename(columns={"filename": "original_recordings"})
)
df_extended = (
    train_ext.groupby("species")["filename"].count()
    .reset_index()
    .rename(columns={"filename": "extended_recordings"})
)

# The left join keeps every species of the original data; species without
# extended recordings get 0. The merge introduces NaN (hence floats), so the
# counts are cast back to integers.
df = df_original.merge(df_extended, on="species", how="left").fillna(0)
df["extended_recordings"] = df["extended_recordings"].astype(int)
df["total_recordings"] = df["original_recordings"] + df["extended_recordings"]

# Sort once, largest first, and renumber the rows without keeping the old
# index as an extra column.
df = df.sort_values("total_recordings", ascending=False).reset_index(drop=True)
df.head()

In [None]:
# Two horizontal bar charts share one tall axes: the totals are drawn first,
# then the original counts are drawn over them.
f, ax = plt.subplots(figsize=(10, 50))

# Total recordings per species
sns.set_color_codes("pastel")
sns.barplot(data=df, x="total_recordings", y="species",
            color="r", label="total_recordings", ax=ax)

# Original recordings per species
sns.set_color_codes("muted")
sns.barplot(data=df, x="original_recordings", y="species",
            color="g", label="original_recordings", ax=ax)

# Legend, axis limits and labels
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 2000), ylabel="", xlabel="Count")
sns.despine(left=True, bottom=True)

Clearly, the additional recordings (in red) contain far more observations than the existing data. Since external data is allowed in the competition, they can prove to be highly beneficial during training.

# Properties of an audio signal


A sound is represented in the form of an audio signal having parameters such as frequency, bandwidth, decibel etc. A typical audio signal can be expressed as a function of Amplitude and Time.
![](https://miro.medium.com/max/1400/1*akRbhl8739UEDuKHkOUR1Q.png)

source: https://docs.google.com/presentation/d/1zzgNu_HbKL2iPkHS8-qhtDV20QfWt9lC3ZwPVZo8Rw0/pub?start=false&loop=false&delayms=3000&slide=id.g5a7a9806e_0_84


These sounds are available in many formats which makes it possible for the computer to read and analyse them. Some examples are:
* mp3 format
* WMA (Windows Media Audio) format
* wav (Waveform Audio File) format

## Loading an audio file

I'll be using a Python Library called Librosa for analysing the audio file.

In [None]:

audio_path = '../input/birdsong-recognition/train_audio/nutwoo/XC462016.mp3'
x , sr = librosa.load(audio_path)


In [None]:
print(type(x), type(sr))

In [None]:
print(x.shape, sr)

This returns an audio time series as a numpy array with a default sampling rate (sr) of 22.05 kHz, mono. sr stands for **Sample Rate**, which is the number of samples of audio carried per second, measured in Hz or kHz.

We can also resample the audio sample to **44.1KHz** by:

In [None]:
librosa.load(audio_path, sr=44100)

Or we can disable resampling altogether and keep the file's native sampling rate by:

In [None]:
librosa.load(audio_path, sr=None)

## Playing Audio

`IPython.display.Audio` lets you play audio directly in a jupyter notebook.

In [None]:
import IPython.display as ipd
ipd.Audio(audio_path)

## Visualizing Audio

We can plot the audio array using librosa.display.waveplot. Let's plot the  amplitude envelope of a waveform.


In [None]:
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)

Here, we have the plot of the amplitude envelope of a waveform.

### 1. Spectrogram
A spectrogram is a visual representation of the spectrum of frequencies of sound or other signals as they vary with time. Spectrograms are sometimes called sonographs, voiceprints, or voicegrams. When the data is represented in a 3D plot, they may be called waterfalls. In 2-dimensional arrays, the first axis is frequency while the second axis is time.
We can display a spectrogram using `librosa.display.specshow`.

In [None]:
# Short-time Fourier transform of the clip, with magnitudes converted to dB.
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(np.abs(X))

# Spectrogram on a linear frequency axis, with a colour scale for the dB values.
plt.figure(figsize=(14, 5))
spec_img = librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar(spec_img)

The vertical axis shows frequencies (from 0 to 10kHz), and the horizontal axis shows the time of the clip. Since we see that all action is taking place at the bottom of the spectrum, we can convert the frequency axis to a logarithmic one.

In [None]:
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()

### 2.Feature extraction

Every audio signal consists of many features. However, we must extract the characteristics that are relevant to the problem we are trying to solve. The process of extracting features to use them for analysis is called feature extraction. Let us study a few of the features in detail.

###  Zero Crossing Rate
The zero crossing rate is the rate of sign-changes along a signal, i.e., the rate at which the signal changes from positive to negative or back. 
Let us calculate the zero crossing rate for our example audio clip.

In [None]:
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)

In [None]:
# Zooming in
n0 = 9000
n1 = 9100
plt.figure(figsize=(14, 5))
plt.plot(x[n0:n1])
plt.grid()

In [None]:
# Count the sign changes inside the zoomed window only.
zero_crossings = librosa.zero_crossings(x[n0:n1], pad=False)
print(zero_crossings.sum())

### Spectral Centroid
It indicates where the “centre of mass” for a sound is located and is calculated as the weighted mean of the frequencies present in the sound. Consider two songs, one from a blues genre and the other belonging to metal: the one with more high-frequency energy (typically the metal track) will have the higher spectral centroid.


In [None]:
# Spectral centroid of each analysis frame. The audio is passed by keyword
# (y=): newer librosa releases no longer accept it positionally.
spectral_centroids = librosa.feature.spectral_centroid(y=x, sr=sr)[0]
spectral_centroids.shape


In [None]:
# Computing the time variable for visualization
frames = range(len(spectral_centroids))
# Pass the clip's sampling rate explicitly so the time axis stays correct even
# if the audio is not loaded at librosa's default rate.
t = librosa.frames_to_time(frames, sr=sr)


def normalize(x, axis=0):
    """Min-max scale `x` along `axis` so a feature can be overlaid on the waveform.

    Parameters
    ----------
    x : array-like
        Values to rescale.
    axis : int, default 0
        Axis along which the minimum and maximum are taken.

    Returns
    -------
    The rescaled values, as returned by `sklearn.preprocessing.minmax_scale`.
    """
    return sklearn.preprocessing.minmax_scale(x, axis=axis)


# Plotting the spectral centroid (red) along the waveform
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_centroids), color='r')

### Spectral Rolloff
It is a measure of the shape of the signal. It represents the frequency below which a specified percentage of the total spectral energy, e.g. 85%, lies.

In [None]:
# Spectral rolloff of each analysis frame. The audio is passed by keyword
# (y=): newer librosa releases no longer accept it positionally.
# NOTE(review): the 0.01 offset presumably avoids all-zero (silent) frames — confirm.
spectral_rolloff = librosa.feature.spectral_rolloff(y=x + 0.01, sr=sr)[0]
# Rolloff (red, min-max scaled) over the waveform; reuses the frame times `t`
# computed for the spectral centroid.
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_rolloff), color='r')

### Mel-Frequency Cepstral Coefficients
The Mel frequency cepstral coefficients (MFCCs) of a signal are a small set of features (usually about 10–20) which concisely describe the overall shape of a spectral envelope. It models the characteristics of the human voice.

In [None]:
# Load a second recording from the same species folder for the MFCC example.
x, fs = librosa.load('../input/birdsong-recognition/train_audio/nutwoo/XC161356.mp3')
# Plot with the rate returned for this clip (fs), not the `sr` left over from
# the previously loaded file.
librosa.display.waveplot(x, sr=fs)


In [None]:
# MFCC matrix of the clip (coefficients x frames). The audio is passed by
# keyword (y=): newer librosa releases no longer accept it positionally.
mfccs = librosa.feature.mfcc(y=x, sr=fs)
print(mfccs.shape)


In [None]:
# Displaying the MFCCs, using the sampling rate of the clip they were computed from (fs).
librosa.display.specshow(mfccs, sr=fs, x_axis='time')


Here mfcc computed 20 MFCCs over 1062 frames.
We can also perform feature scaling such that each coefficient dimension has zero mean and unit variance:

In [None]:
# Standardise each coefficient (row) across frames, then print the per-row
# mean and variance to check the scaling.
mfccs = sklearn.preprocessing.scale(mfccs, axis=1)
print(mfccs.mean(axis=1))
print(mfccs.var(axis=1))
# Use the sampling rate of the clip the MFCCs were computed from (fs).
librosa.display.specshow(mfccs, sr=fs, x_axis='time')