# BirdCLEF EDA + Audio Visualization

In [None]:
!pip install reverse_geocode

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import os
import matplotlib.pyplot as plt
import plotly.express as px
import reverse_geocode
import librosa
import librosa.display
%matplotlib inline

train_md = pd.read_csv('../input/birdclef-2022/train_metadata.csv')

In [None]:
# train_md is already loaded by the setup cell (same CSV path), so it is not
# read from disk a second time here.
# Count the missing values in every metadata column.
train_md.isna().sum()

#### Thank Goodness! No missing data. 

In [None]:
# Bar chart of how the training recordings are split across primary labels.
# The shares are computed once so that bar heights and tick labels are
# guaranteed to come from the same (descending) ordering.
label_share = train_md.primary_label.value_counts(normalize=True)

plt.figure(figsize=(20, 10))
sns.barplot(x=label_share.index, y=label_share.values, palette='pastel')
plt.xticks(rotation=90)
# The heights are normalized counts (fractions summing to 1), not raw counts.
plt.ylabel('Fraction of recordings')
plt.title('Primary Label Composition in Training Data')
plt.show()

In [None]:
# Pie chart of the ten most frequent primary labels; all remaining labels are
# merged into a single 'others' wedge.
top_n = 10  # must match the "Top 10" promised in the title
label_share = train_md.primary_label.value_counts(normalize=True)

count_val = label_share.values[:top_n].tolist()
labels = label_share.index[:top_n].tolist()
# Everything after the first top_n entries is summed into one wedge.
count_val.append(label_share.values[top_n:].sum())
labels.append('others')

colors = sns.color_palette('pastel')
plt.figure(figsize=(10, 10))
plt.title('Top 10 Primary Labels by count in Training Data')
plt.pie(count_val, labels=labels, colors=colors, autopct='%0.1f%%')
plt.show()

In [None]:
# Pie chart of the ten most frequent bird common names; all remaining names
# are merged into a single 'others' wedge.
top_n = 10  # must match the "Top 10" promised in the title
name_share = train_md.common_name.value_counts(normalize=True)

count_val = name_share.values[:top_n].tolist()
labels = name_share.index[:top_n].tolist()
# Everything after the first top_n entries is summed into one wedge.
count_val.append(name_share.values[top_n:].sum())
labels.append('others')

colors = sns.color_palette('pastel')
plt.figure(figsize=(10, 10))
plt.title('Top 10 Birds (Common Name) by count in Training Data')
plt.pie(count_val, labels=labels, colors=colors, autopct='%0.1f%%')
plt.show()

#### Seems like Barn Owl is the most common bird

In [None]:
# Box plot summarising the distribution of the 'rating' column.
sns.catplot(data=train_md, y='rating', kind='box', palette='pastel')
# catplot draws on a new figure; label the y axis of its current axes.
plt.gca().set_ylabel('Rating')
plt.show()

#### Our median rating is 4. This is good. At least 50% of the data is of high quality.

In [None]:
# Scatter every recording on a world map, coloured by the bird's common name.
world_map_options = {
    "lat": "latitude",
    "lon": "longitude",
    "color": "common_name",
    "width": 1_000,
    "height": 500,
    "title": "BirdCLEF 2022 Recording Geographical Locations",
}
fig = px.scatter_geo(train_md, **world_map_options)
fig.show()

In [None]:
def city_state_country(row):
    """Add a ``country`` entry to a metadata row via reverse geocoding.

    Despite the name, only the country is looked up and stored; no city or
    state information is written to the row.

    Args:
        row: A mapping-like record (for example one DataFrame row) exposing
            ``'latitude'`` and ``'longitude'`` entries.

    Returns:
        The same ``row`` object, with ``row['country']`` set to the
        ``'country'`` field of the first ``reverse_geocode.search`` result.
    """
    # search() is given a collection of (lat, lon) pairs. Wrap the single
    # point in a one-element list instead of padding the query with a dummy
    # (0, 0) coordinate that would be geocoded and then thrown away.
    coords = [(row['latitude'], row['longitude'])]
    row['country'] = reverse_geocode.search(coords)[0]['country']
    return row

# Run the country lookup once per row (axis='columns' is the named form of
# axis=1) and keep the resulting frame, which gains a 'country' column.
train_md = train_md.apply(city_state_country, axis='columns')


In [None]:
# Bar chart of the number of recordings per country, most frequent first.
# The counts are computed once so bar heights and tick labels stay aligned.
country_counts = train_md.country.value_counts()

plt.figure(figsize=(20, 10))
# x and y are passed as plain vectors, so no `data=` frame is supplied.
sns.barplot(x=country_counts.index, y=country_counts.values, palette='pastel')
plt.xticks(rotation=90)
plt.ylabel('Number of recordings')
plt.title('Recordings per Country in Training Data')
plt.show()

#### Most of our data comes from the United States. If the recordings contain location-specific features, this imbalance may cause poor generalization when we build and train our models.  

In [None]:
# Pie chart of the ten countries with the most recordings; all remaining
# countries are merged into a single 'others' wedge.
top_n = 10  # must match the "Top 10" promised in the title
country_share = train_md.country.value_counts(normalize=True)

count_val = country_share.values[:top_n].tolist()
labels = country_share.index[:top_n].tolist()
# Everything after the first top_n entries is summed into one wedge.
count_val.append(country_share.values[top_n:].sum())
labels.append('others')

colors = sns.color_palette('pastel')
plt.figure(figsize=(10, 10))
plt.title('Top 10 Countries by count in Training Data')
plt.pie(count_val, labels=labels, colors=colors, autopct='%0.1f%%')
plt.show()

In [None]:
# Same recording map as the world view, restricted to the 'usa' scope and
# showing each point's country on hover.
usa_map_options = {
    "lat": "latitude",
    "lon": "longitude",
    "color": "common_name",
    "width": 1_000,
    "height": 500,
    "title": "BirdCLEF 2022 Recording USA",
    "scope": 'usa',
    "hover_name": 'country',
}
fig = px.scatter_geo(train_md, **usa_map_options)
fig.show()

## Audio Files

In [None]:
# Directory holding the training audio clips.
AUDIO_DIR = '../input/birdclef-2022/train_audio'
# Pick 10 distinct row positions. Sampling without replacement ensures the
# same recording is never selected (and plotted) twice.
idx = np.random.choice(len(train_md), size=10, replace=False)

In [None]:
# Draw the waveform of each sampled recording in its own row; all rows share
# one time axis.
fig, ax = plt.subplots(figsize=(10, 10), nrows=10, sharex=True)
for row_no, panel in enumerate(ax):
    record = train_md.iloc[idx[row_no]]
    audio_file = f"{AUDIO_DIR}/{record.filename}"
    # librosa.load returns the samples together with their sampling rate.
    signal, sr = librosa.load(audio_file)
    librosa.display.waveshow(signal, sr=sr, ax=panel, color='blue', alpha=0.5)

#### We have to be cautious while preparing the data. Our inputs will have varying lengths. Either we trim the signals to a minimum length or pad them to a maximum length.  

In [None]:
# Linear-frequency spectrograms of the sampled recordings, one per row, in dB
# relative to each clip's own peak magnitude.
top_db = 80.0  # dynamic range kept below the peak by amplitude_to_db
fig, ax = plt.subplots(nrows=10, figsize=(10, 20))
for i in range(10):
    audio_file = AUDIO_DIR + '/' + train_md.iloc[idx[i]].filename
    signal, sr = librosa.load(audio_file)
    # Magnitude of the STFT converted to dB; with ref=np.max the peak is 0 dB
    # and values are clipped at -top_db.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max, top_db=top_db)
    # Pin the colour limits so every panel uses the same dB-to-colour mapping.
    # Otherwise each image auto-scales, and the single colorbar below (built
    # from the last image) would not describe the other panels.
    img = librosa.display.specshow(
        D, y_axis='linear', x_axis='time', sr=sr, ax=ax[i], cmap='cool',
        vmin=-top_db, vmax=0,
    )
fig.colorbar(img, ax=ax, format="%+2.f dB")

#### We can already see some patterns in the spectrograms. If these are distinct enough for different birds, then we can build a good model. 