In [None]:
import os 
import math
import numpy as np

#mapping:
import geopandas as gpd 
import pandas as pd 
import folium
from folium import Marker
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster

#plotting
import matplotlib.pyplot as plt 
import seaborn as sns 

#Audio
import librosa
import librosa.display as ld
from IPython.display import Audio

In [None]:
# Read the per-recording metadata table from the competition input folder.
metadata_csv='../input/birdclef-2021/train_metadata.csv'
train_metadata=pd.read_csv(metadata_csv)

# Preview the first rows (rendered as the cell's output).
train_metadata.head()

In [None]:
# How many distinct species labels does the metadata contain?
n_species=train_metadata['primary_label'].nunique()
print(f'Number of species in the data {n_species}')

# Bar chart of how many recordings received each rating value.
plt.figure(figsize=(16,8))
sns.countplot(x='rating',data=train_metadata)
plt.title('Recordings Ratings')

**The 20 most recorded and the 20 least recorded birds**

In [None]:
# Recording counts per common name; keep 20 entries from each end of the ranking.
name_counts=train_metadata['common_name'].value_counts()
highest_recorded=name_counts.sort_values(ascending=False)[:20]
least_recorded=name_counts.sort_values()[:20]

# One panel per ranking, stacked vertically in a single figure.
fig,axes=plt.subplots(2,1,figsize=(16,16))
panels=[(highest_recorded,'Most Recorded birds'),
        (least_recorded,'Least Recorded birds')]

for ax,(counts,panel_title) in zip(axes,panels):
    ax.bar(x=counts.index,height=counts.values)
    # tilt the species names so neighbouring labels do not overlap
    plt.setp(ax.get_xticklabels(),rotation=45)
    ax.set_ylabel('Count')
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


**Heatmap of the recording locations**

In [None]:

# Centre the map on the mean recording position.
start_loc=(train_metadata['latitude'].mean(),train_metadata['longitude'].mean())

# Base map (HeatMap itself is imported in the notebook's import cell).
m_1=folium.Map(location=start_loc,
              tiles='Open Street Map',
              zoom_start=2,
              min_zoom=1.5)

# Drop recordings without coordinates before building the heat layer: the
# marker maps in this notebook apply the same NaN guard, and folium's
# location validation rejects NaN values.
heat_points=train_metadata[['latitude','longitude']].dropna().values.tolist()

# NOTE(review): HeatMap's documented opacity argument is `min_opacity`;
# `opacity` is forwarded as an extra keyword -- confirm which was intended.
HeatMap(data=heat_points,
        radius=7,opacity=.1).add_to(m_1)
print('Recordings Heatmap')
m_1

**This heatmap shows that most of the recordings come from the Americas (North and South),
Western Europe and Scandinavia. There are also recordings around South Africa, Russia, Japan, Western India, China and New Zealand.**

**Most Common birds**

This is a plot of the most represented birds in the data. The marker colors will be unique for all except 2 birds.

In [None]:
# Marker colours, paired in order with the species being plotted.
# The list has 20 entries but only 18 distinct values: 'red' and 'blue'
# are repeated at the end, so two pairs of species share a colour.
colors = (
    'red blue gray darkred lightred orange beige green darkgreen lightgreen '
    'darkblue lightblue purple darkpurple pink cadetblue lightgray black '
    'red blue'
).split()

In [None]:
# Keep only the recordings whose species is among the 20 most recorded.
is_top_species=train_metadata['common_name'].isin(highest_recorded.index)
most_common=train_metadata[is_top_species]

# one marker colour per species, paired in ranking order with the colour list
color_dict={name:shade for name,shade in zip(highest_recorded.index,colors)}

m_2=folium.Map(location=start_loc,
              tiles='Open Street Map',
              zoom_start=2,
              min_zoom=1.5)

mc=MarkerCluster()

for _,record in most_common.iterrows():
    lat,lon=record['latitude'],record['longitude']

    # a recording without coordinates cannot be placed on the map
    if math.isnan(lat) or math.isnan(lon):
        continue

    bird_name=record['common_name']
    marker_icon=folium.Icon(color=color_dict[bird_name])
    mc.add_child(Marker((lat,lon),tooltip=f'<b>{bird_name}</b>',
                        icon=marker_icon))

# add_child returns the map, so the map is rendered as the cell output
m_2.add_child(mc)

**Most common birds seem to be the birds that have a long range.**

**Least Common birds**

The locations of the least recorded birds in the data set. The same 20-entry color list is reused, so the marker colors are again unique for all except 2 birds.

In [None]:
# Keep only the recordings whose species is among the 20 least recorded.
is_rare_species=train_metadata['common_name'].isin(least_recorded.index)
least_common=train_metadata[is_rare_species]

# one marker colour per species, paired in ranking order with the colour list
color_dict={name:shade for name,shade in zip(least_recorded.index,colors)}

m_2=folium.Map(location=start_loc,
              tiles='Open Street Map',
              zoom_start=2,
              min_zoom=1.5)

mc=MarkerCluster()

for _,record in least_common.iterrows():
    lat,lon=record['latitude'],record['longitude']

    # a recording without coordinates cannot be placed on the map
    if math.isnan(lat) or math.isnan(lon):
        continue

    bird_name=record['common_name']
    marker_icon=folium.Icon(color=color_dict[bird_name])
    mc.add_child(Marker((lat,lon),tooltip=f'<b>{bird_name}</b>',
                        icon=marker_icon))

# add_child returns the map, so the map is rendered as the cell output
m_2.add_child(mc)

**The least recorded birds seem to be endemic to their specific locations.**

# Audio EDA

In [None]:
# Let's listen to some audio samples.
# 1) amepip (species code = the directory name in the path below)
path_1='../input/birdclef-2021/train_short_audio/amepip/XC111040.ogg'
Audio(path_1)

In [None]:
# 2) bncfly (species code = the directory name in the path below)
path_2='../input/birdclef-2021/train_short_audio/bncfly/XC113984.ogg'
Audio(path_2)

**Plot waveplot and power spectrum**

In [None]:
def plot_waveplot(path,sr=None):
    '''Plot the waveform and the one-sided magnitude spectrum of an audio file.

    Parameters
    ----------
    path : str
        Audio file to load with ``librosa.load``.
    sr : int or None, optional
        Sampling rate passed to ``librosa.load``. ``None`` (the default)
        keeps the file's native rate.

    Returns
    -------
    None
        A two-panel matplotlib figure is drawn and shown.
    '''

    #loading audio
    signal,sr=librosa.load(path,sr=sr)

    plt.figure(figsize=(16,10))

    #waveform:
    plt.subplot(2,1,1)
    # librosa 0.10 removed display.waveplot in favour of display.waveshow;
    # use whichever the installed version provides.
    draw_wave=getattr(ld,'waveshow',None) or ld.waveplot
    draw_wave(signal,sr=sr)
    plt.ylabel('Magnitude')
    plt.title('Waveplot')

    #fast fourier transform:
    # The signal is real, so its spectrum is mirrored around sr/2. rfft keeps
    # only the non-negative frequencies and rfftfreq gives the exact bin
    # centres (k*sr/N) instead of an endpoint-inclusive linspace.
    mag=np.abs(np.fft.rfft(signal))
    freq=np.fft.rfftfreq(len(signal),d=1.0/sr)
    plt.subplot(2,1,2)
    plt.plot(freq,mag)
    plt.xlabel('Frequency')
    plt.ylabel('Magnitude')
    plt.title('Power Spectrum')

    plt.tight_layout()
    plt.show()

plot_waveplot(path='../input/birdclef-2021/train_short_audio/blbthr1/XC119226.ogg')

**Short-time Fourier transform (STFT) and Mel spectrograms**

In [None]:
def plot_stft(path,sr,hop_len=512,n_fft=2048):
    '''Plot the dB-scaled STFT spectrogram of an audio file.

    Parameters
    ----------
    path : str
        Audio file to load with ``librosa.load``.
    sr : int or None
        Sampling rate passed to ``librosa.load``; ``None`` keeps the file's
        native rate.
    hop_len : int, optional
        Hop length in samples between successive STFT frames.
    n_fft : int, optional
        FFT window size in samples.

    Notes
    -----
    ``hop_len`` and ``n_fft`` are explicit keyword arguments so the function
    does not depend on module-level names being assigned before it is called.

    Returns
    -------
    None
        The spectrogram is drawn with matplotlib and shown.
    '''

    #loading audio
    signal,sr=librosa.load(path,sr=sr)

    #short time fourier transform:
    stft=librosa.stft(y=signal,hop_length=hop_len,n_fft=n_fft)
    # magnitude -> decibels relative to the loudest bin
    spectrogram=librosa.amplitude_to_db(np.abs(stft),ref=np.max)

    #display_spectrogram:
    plt.figure(figsize=(16,4))

    # sr and hop_length are passed so the time/frequency axes match the data
    img=ld.specshow(spectrogram,sr=sr,hop_length=hop_len,x_axis='time',y_axis='log')
    plt.xlabel('Time')
    plt.ylabel('Freq')
    plt.colorbar(img)
    plt.title('STFT')
    plt.show()


def plot_spectrogram(path,sr=None):
    '''Plot the dB-scaled Mel spectrogram of an audio file.

    Parameters
    ----------
    path : str
        Audio file to load with ``librosa.load``.
    sr : int or None, optional
        Sampling rate passed to ``librosa.load``. ``None`` (the default)
        keeps the file's native rate.

    Returns
    -------
    None
        The spectrogram is drawn with matplotlib and shown.
    '''

    #loading audio
    signal,sr=librosa.load(path,sr=sr)

    fig,ax=plt.subplots(figsize=(16,4))
    M = librosa.feature.melspectrogram(y=signal, sr=sr)
    # power -> decibels relative to the loudest bin
    M_db = librosa.power_to_db(M, ref=np.max)
    # Pass the actual sampling rate: without it specshow assumes its own
    # default rate and mislabels both the time and the Mel-frequency axes
    # whenever the audio was loaded at a different (e.g. native) rate.
    img = ld.specshow(M_db, sr=sr, y_axis='mel', x_axis='time', ax=ax)
    fig.colorbar(img, ax=ax)
    ax.set(title='Mel spectrogram display')
    plt.show()

# STFT frame settings: hop length and FFT window size.
hop_len=512
n_fft=2048

# Show both views of the same recording at its native sampling rate.
sample_ogg='../input/birdclef-2021/train_short_audio/blbthr1/XC119226.ogg'
plot_stft(path=sample_ogg,sr=None)
plot_spectrogram(path=sample_ogg,sr=None)

**Let's compare spectrograms of the same species**

In [None]:
def compare_specs(path,n_files=2):
    '''Plot Mel spectrograms for the first few recordings in a species folder.

    Parameters
    ----------
    path : str
        Directory holding the recordings of one species; its last path
        component is printed as the bird label.
    n_files : int, optional
        Number of recordings to plot (default 2).

    Returns
    -------
    None
        One labelled spectrogram is shown per selected recording.
    '''
    # normpath strips a trailing separator so the label is never empty
    species=os.path.basename(os.path.normpath(path))

    # os.listdir returns entries in arbitrary order; sorting makes the
    # selection of recordings reproducible between runs.
    for fname in sorted(os.listdir(path))[:n_files]:
        print('bird :{}'.format(species))
        plot_spectrogram(path=os.path.join(path,fname))

compare_specs(path='../input/birdclef-2021/train_short_audio/amecro')

In [None]:
# same comparison for the 'casfin' recordings folder
compare_specs(path='../input/birdclef-2021/train_short_audio/casfin')

In [None]:
# same comparison for the 'houspa' recordings folder
compare_specs(path='../input/birdclef-2021/train_short_audio/houspa')

In [None]:
# same comparison for the 'macwar' recordings folder
compare_specs(path='../input/birdclef-2021/train_short_audio/macwar')

**There are similar patterns in the spectrograms of the same species. There can be differences, as the quality of the recordings is not uniform and some contain a lot of noise.**

**That's it for this notebook. I will do the modelling and prediction in the notebook linked below.**
[ https://www.kaggle.com/virajkadam/birdclef-bird-sound-classification ]