# Let's study the chirping of birds
<img src='https://media.giphy.com/media/NfzOqaLHp7j44/giphy.gif' height=300 width=500/>

In [None]:
!pip install pydub

In [None]:
#Libraries Required
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly_express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import wave
import os
import requests
import re
from pydub import AudioSegment
import IPython.display as ipd
import struct
from scipy.io import wavfile as wav
from colorama import Fore, Back, Style
import requests
import json
from bs4 import BeautifulSoup
import ipywidgets as widgets
from itertools import product

# Read the dataset

In [None]:
# Load the competition train/test metadata and the extra bird-details table
# in one pass; the generator's loop variable does not leak into the notebook.
df_train, df_test, df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/birdsong-recognition/train.csv',
        '/kaggle/input/birdsong-recognition/test.csv',
        '/kaggle/input/birdsongrecognitiondetails/bird_details.csv',
    )
)
# Root folder of the training audio (one sub-folder per ebird_code).
media_path = '/kaggle/input/birdsong-recognition/train_audio/'

In [None]:
def get_elevation(val):
    """Derive a numeric elevation from a free-text elevation string.

    The first number found in ``val`` is used. When it is immediately
    followed by a second number (optionally separated by ``-``), the value
    is treated as a range and the midpoint is returned. Negative values are
    kept, since a location below sea level is also a possibility.

    Args:
        val: Elevation text such as ``"120 m"``, ``"~1,200"`` or
            ``"100-200"``. Non-string input (e.g. a missing value) is
            converted with ``float`` when possible.

    Returns:
        The elevation as a float, or NaN when no number can be extracted.
    """
    if not isinstance(val, str):
        # Missing cells are not strings; the regex would raise TypeError.
        try:
            return float(val)
        except (TypeError, ValueError):
            return float('nan')

    # Optional "~"/"?" prefix, a (possibly negative) number, then an optional
    # second number forming a range. Both numbers may carry a "," or "."
    # separator so that "2,500-3,000" is read as 2500 and 3000.
    match = re.search(r'[~?]?(-?\d+[.,]?\d*)-?(\d+[.,]?\d*)?', val)
    if match is None:
        return float('nan')

    low = float(match.group(1).replace(',', ''))
    upper_text = match.group(2)
    if upper_text is None:
        return low

    # Decide "is this a range?" from the presence of the second number, not
    # from its value, so ranges with a zero endpoint are averaged as well.
    high = float(upper_text.replace(',', ''))
    return (low + high) / 2
# Replace the raw elevation text with its parsed numeric value.
df_train['elevation'] = df_train['elevation'].apply(get_elevation)

# Let's listen to the birds chirp!
Since there are around 200 species, each with multiple recordings, I'll show the sound plot and details of one random recording for a few of the species. The same function can be used to plot any other record.

In [None]:
def wav_plotter(full_path,data):
    """Print a recording's details, plot its waveform and return a player.

    Args:
        full_path: Path to the WAV file to read.
        data: Record for the recording; the keys 'title', 'sci_name',
            'country', 'author' and 'rating' are read from it.

    Returns:
        An ``IPython.display.Audio`` object for ``full_path``.
    """
    rate, wav_sample = wav.read(full_path)

    # Read only the start of the header and close the file straight away
    # (the original left the handle open).
    with open(full_path,"rb") as wave_file:
        riff_fmt = wave_file.read(36)
    # Bytes 34-35 hold the bits-per-sample field when the 'fmt ' chunk directly
    # follows the RIFF header. NOTE(review): assumes that canonical layout.
    # WAV fields are little-endian, so the byte order is pinned with "<".
    bit_depth = struct.unpack("<H",riff_fmt[34:36])[0]

    print(Fore.CYAN+data['title'].upper())
    print('_'*len(data['title']))
    print('')
    print('Scientific Name:',data['sci_name'])
    print('Country recorded: ',data['country'])
    print('Recordist: ',data['author'])
    print('Rating: ',data['rating'])
    print('Sampling rate: ',rate,'Hz')
    print('Bit depth: ',bit_depth)
    # A 1-D sample array means mono; otherwise the second axis is the channel count.
    print('Number of channels: ',wav_sample.shape[1] if len(wav_sample.shape)>1 else 1)
    print('Duration: ',wav_sample.shape[0]/rate,' second')
    print('Number of samples: ',len(wav_sample))
    plt.figure(figsize=(12, 4))
    plt.plot(wav_sample)
    return ipd.Audio(full_path)

def plot_wav(sp):
    """Pick a random recording of a species, convert it to WAV and plot it.

    Args:
        sp: Species name as it appears in ``df_train['species']``.

    Returns:
        Whatever ``wav_plotter`` returns for the converted file.

    Raises:
        ValueError: If ``df_train`` has no recording for ``sp``.
    """
    data = df_train[df_train['species']==sp]
    if data.empty:
        # np.random.choice would also raise ValueError here, but with a
        # message that does not mention the species.
        raise ValueError('No recordings found for species {!r}'.format(sp))
    idx = np.random.choice(data.index,1)[0]
    sound_data = data.loc[idx,:]
    # Use the shared audio root instead of repeating the literal path.
    src = os.path.join(media_path,sound_data['ebird_code'],sound_data['filename'])
    sound_mp3 = AudioSegment.from_mp3(src)
    # splitext drops only the final extension, so names containing extra dots
    # are not truncated. The WAV is written to the current working directory.
    filename = os.path.splitext(sound_data['filename'])[0]+'.wav'
    sound_mp3.export(filename,format='wav')
    return wav_plotter(filename,sound_data)

## Alder Flycatcher
<img src='https://test.cdn.download.ams.birds.cornell.edu/api/v1/asset/59858041/1800' width=500 height=300/>
Entirely black. Large, long-legged, thick-necked bird with heavy, straight bill. In flight, wings are fairly broad and rounded with wingtip feathers spread like fingers. Short tail, squared off at the end. Very social, sometimes forming noisy flocks in the thousands. Aggressive, often chase away hawks and owls. Common in fields, woods, and cities. Thrives around people.

In [None]:
# Print details, plot the waveform and play a randomly chosen 'Alder Flycatcher' recording.
plot_wav('Alder Flycatcher')

## House Wren
<img src='https://test.cdn.download.ams.birds.cornell.edu/api/v1/asset/59860711/1800' width=500 height=300/>
At home in suburbs, parks, rural farmland, and other open areas with thick tangles. In the breeding season, it is often singing its effervescent song from the top of a bush or fence post. Plumage doesn't show much contrast: plain brown overall, slightly paler below, with some dark barring on the wings and tail. Lacks white eyebrow. In winter they are more secretive, preferring brushy tangles, thickets, and hedgerows.

In [None]:
# Print details, plot the waveform and play a randomly chosen 'House Wren' recording.
plot_wav('House Wren')

## American Robin
<img src='https://test.cdn.download.ams.birds.cornell.edu/api/v1/asset/60412911/1800' width=500 height=300/>
Fairly large songbird with round body, long legs, and longish tail. Gray above with warm orange underparts and blackish head. Hops across lawns and stands erect with its bill often tilted upward. In fall and winter, forms large flocks and gathers in trees to roost or eat berries. Common across North America in gardens, parks, yards, golf courses, fields, pastures, and many other wooded habitats.

In [None]:
# Print details, plot the waveform and play a randomly chosen 'American Robin' recording.
plot_wav('American Robin')

## Ovenbird
<img src='https://test.cdn.download.ams.birds.cornell.edu/api/v1/asset/64439011/1800' width=500 height=300/>
Secretive warbler that lacks vibrant colors, but compensates with its enormous voice. Olive-brown above with black streaks on white breast and bold white eyering. Orange crown bordered by black on either side. Forages for insects by walking along branches and on the ground. Breeds primarily in areas with extensive forest. Listen for its loud song that builds in volume: "tea-cher, TEA-cher, TEA-CHER!" Possible to confuse with thrushes, but smaller, and walks instead of hops. Winters in Central America.

In [None]:
# Print details, plot the waveform and play a randomly chosen 'Ovenbird' recording.
plot_wav('Ovenbird')

## Northern Flicker
<img src='https://test.cdn.download.ams.birds.cornell.edu/api/v1/asset/60403261/1800' width=500 height=300/>
Large, brown woodpecker with black barring on the back and black spots on the belly. Easily recognized in flight by its bright white rump. Also note large black crescent-shaped mark on breast. Underwings are yellow or red, depending on the subspecies. Generally "Yellow-shafted" is found in eastern and northern North America, and "Red-shafted" in the West south through Mexico. Often seen feeding on the ground in lawns, where they eat lots of ants and worms. Nests in cavities.

In [None]:
# Print details, plot the waveform and play a randomly chosen 'Northern Flicker' recording.
plot_wav('Northern Flicker')

# Bird Chirp Rating system
So the site https://www.xeno-canto.org/ rates the chirp of the bird. On further analysis of the dataset and the site, I found that the birds are rated on a scale of 0-5 with 0 being the least and 5 being the highest rating. In this section, I'll be analysing the ratings and what factors are actually affecting the rating of the bird chirp.

## How are the birds rated?
Let's see how many recordings received each rating. This can be analysed using a bar plot.

In [None]:
# Common backdrop to be used for all the plots: the same white colour for
# both the figure paper and the plotting area.
PAPER_BGCOLOR='rgb(255,255,255)'
PLOT_BGCOLOR='rgb(255,255,255)'

In [None]:
# Number of recordings per rating value, in ascending rating order.
ratings = (
    df_train.groupby('rating', as_index=False)['title']
    .count()
    .sort_values('rating')
)

# One bar per rating, labelled with its count.
rating_bars = go.Bar(
    x=ratings['rating'],
    y=ratings['title'],
    text=ratings['title'],
    textposition='auto',
    marker=dict(line=dict(color='black', width=1.5)),
)
fig = go.Figure(data=[rating_bars])
fig.update_layout(
    template='seaborn',
    height=300,
    title='Ratings count',
    paper_bgcolor=PAPER_BGCOLOR,
    plot_bgcolor=PLOT_BGCOLOR,
    xaxis={'title': 'Ratings', 'nticks': 20, 'mirror': True,
           'linewidth': 2, 'linecolor': 'black'},
    yaxis={'title': 'Counts', 'mirror': True, 'linewidth': 2,
           'linecolor': 'black', 'gridcolor': 'darkgrey'},
)
fig.show()

**Seems like the people really find all the chirps pretty pleasing :P as most of the birds are rated above 3.5.** 

In [None]:
# The year is the part of the date string before the first '-'.
df_train['year'] = df_train['date'].map(lambda date_str: int(date_str.partition('-')[0]))

# Mean rating and number of recordings for every (year, species) pair.
group = (
    df_train.groupby(['year', 'species'], as_index=False)
    .agg({'rating': 'mean', 'ebird_code': 'count'})
    .sort_values(['year', 'ebird_code'])
)
group = group[group['year'] >= 1979]

# Build the full year x species grid so every animation frame lists every
# species; pairs with no recordings are filled with 0.
years = group['year'].unique().tolist()
species_names = group['species'].unique().tolist()
df_data = (
    pd.DataFrame(list(product(years, species_names)), columns=['year', 'species'])
    .merge(group, on=['year', 'species'], how='left')
    .fillna(0)
    .rename(columns={'ebird_code': 'Recordings'})
    .astype({'Recordings': int})
)

fig = px.bar(df_data, x='Recordings', y='species', orientation='h', animation_frame='year')
fig.update_layout(
    template='seaborn',
    height=800,
    width=700,
    title='Recordings registered per year',
    paper_bgcolor=PAPER_BGCOLOR,
    plot_bgcolor=PLOT_BGCOLOR,
    xaxis={'range': [0, 48], 'title': 'Number of Recordings', 'mirror': True,
           'linewidth': 2, 'linecolor': 'black', 'gridcolor': 'darkgrey'},
    yaxis={'title': 'Bird Species', 'mirror': True, 'linewidth': 2, 'linecolor': 'black'},
)
fig.show()

## What really affects these ratings?
I have done an extensive analysis of the rating using the following features:-
- Number of Recordists
- Number of Countries where the bird is found
- Mean Duration of the chirp in seconds
- Mean Elevation of the bird in meters

>In the dashboard below the X-axis of all the scatter plots shows the ratings scale.

In [None]:
# Per-species summary: mean rating, distinct countries, distinct recordists,
# mean duration and mean elevation. The column order matters because the
# dashboard table reads the columns positionally.
ratings_species = (
    df_train.groupby('species', as_index=False)
    .agg({
        'rating': 'mean',
        'country': 'nunique',
        'author': 'nunique',
        'duration': 'mean',
        'elevation': 'mean',
    })
    .round({'rating': 2, 'duration': 2, 'elevation': 2})
    .sort_values('rating', ascending=False)
)

In [None]:
# Layout: a table spanning the top row, then a 2x2 grid of scatter plots.
fig = make_subplots(
    rows=3,
    cols=2,
    specs=[[{'type': 'table', 'colspan': 2}, None], [{}, {}], [{}, {}]],
    vertical_spacing=0.03,
    horizontal_spacing=0.03,
)

summary_table = go.Table(
    columnorder=[1, 2, 3, 4, 5, 6],
    columnwidth=[170, 60, 130, 90, 130, 120],
    header={
        'values': [
            "<b>Species</b>",
            "<b>Mean Rating</b>",
            "<b>No. of Countries</b><br>where bird is found",
            "<b>No. of Recordists</b>",
            '<b>Mean Duration</b><br>of chirp in seconds',
            "<b>Mean Elevation</b><br>in meters",
        ],
        'line_color': 'darkslategray',
        'fill_color': 'royalblue',
        'font': {'color': 'white', 'size': 10},
        'align': ['center'],
    },
    cells={
        'values': [ratings_species[column].tolist() for column in ratings_species.columns],
        'align': "center",
        'line_color': 'darkslategray',
        'fill': {'color': ['paleturquoise', 'white']},
    },
)
fig.add_trace(summary_table, row=1, col=1)

# Axis styling shared by every scatter panel.
axis_frame = dict(linecolor='black', linewidth=2, showline=True, showgrid=False,
                  mirror=True, ticks='inside', tickfont=dict(size=10))

# (trace name, y column, row, col, y-axis title, extra y-axis options);
# the right-hand panels put their y-axis on the right side.
scatter_panels = [
    ('Author', 'author', 2, 1, 'No. of Recordists', {}),
    ('Duration', 'duration', 2, 2, 'Mean Duration', {'side': 'right'}),
    ('Country', 'country', 3, 1, 'No. of Countries', {}),
    ('Elevation', 'elevation', 3, 2, 'Mean Elevation', {'side': 'right'}),
]
for trace_name, y_column, panel_row, panel_col, y_title, y_extra in scatter_panels:
    fig.add_trace(
        go.Scatter(
            name=trace_name,
            x=ratings_species['rating'],
            y=ratings_species[y_column],
            mode='markers',
            marker=dict(size=10, line=dict(width=1)),
            text=ratings_species['species'],
            textposition='bottom center',
        ),
        row=panel_row,
        col=panel_col,
    )
    fig.update_xaxes(row=panel_row, col=panel_col, **axis_frame)
    fig.update_yaxes(title_text=y_title, row=panel_row, col=panel_col, **axis_frame, **y_extra)

fig.update_layout(
    template='seaborn',
    width=700,
    height=800,
    title='Bird Chirp Rating Analysis',
    showlegend=False,
    plot_bgcolor=PLOT_BGCOLOR,
    paper_bgcolor=PAPER_BGCOLOR,
)
fig.show()

In [None]:
# Print the marker settings of every Bar/Scatter trace in the current figure
# (other trace kinds, such as the table, are skipped).
for trace in fig.data:
    if type(trace).__name__ in ('Bar', 'Scatter'):
        print(trace.marker)
    

**Looks like a positive correlation between all the features and rating.**

# Bird Chirp Spatial Analysis
In this section, I'll be doing a spatial analysis of the bird chirp recordings. I'll be answering the questions such as:
- `Which countries had the most number of recordings?`
- `Which countries produced the best recordings?`
- `Was the bird seen or not?`
- `Where were the recordings made?`

## Country wise Analysis
Here, I'll be answering the first three questions using a bar plot. Below is some info about the plot:-
- The size of the bar represents the number of recordings for a particular country.
- The colour of the bar represents if the bird was seen or not while recording. Green stands for 'yes' while Red stands for 'no'.
- The text represents the average rating for the country for both the cases: bird seen or not. It's on a scale of 0-5 with 0 being the lowest and 5 being the highest.
- Just hover along the y-axis to know all the information about a particular country.
- I will be representing the top 50 countries in terms of number of recordings. In total, there are 95 distinct countries where recordings have been made.

In [None]:
# Recordings and mean rating for every (country, bird_seen) pair.
country_seen = df_train.groupby(['country','bird_seen'],as_index=False)\
    .agg({'title':'count','rating':'mean'})
# Total recordings per country, repeated on each of that country's rows.
country_seen['total'] = country_seen.groupby('country')['title'].transform('sum')

# Keep the 50 countries with the most recordings overall. Slicing the first
# rows of the per-(country, bird_seen) table instead would keep 51 *pairs*,
# so a country could lose one of its two bars or be dropped entirely.
top_countries = country_seen.groupby('country')['title'].sum().nlargest(50).index
countries = country_seen[country_seen['country'].isin(top_countries)]\
    .sort_values(['total','title'],ascending=False).reset_index(drop=True)

seen_color = {'yes':'rgb(93, 217, 93)','no':'rgb(239, 58, 56)'}
fig = go.Figure()
for seen in ['yes','no']:
    # Rows for this answer only; one stacked bar segment per country.
    subset = countries[countries['bird_seen']==seen]
    fig.add_trace(go.Bar(name=seen,y=subset['country'],
                         x=subset['title'],orientation='h',
                         marker_line_color='black',marker_line_width=1.5,
                         text=np.round(subset['rating'],2),textposition='outside',
                         marker_color=seen_color[seen]))
fig.update_layout(height=800,template='seaborn',paper_bgcolor=PAPER_BGCOLOR,plot_bgcolor=PLOT_BGCOLOR,barmode='stack',
                  hovermode='y unified',width=700,
                 xaxis=dict(title='Number of Recordings',type='log',mirror='allticks',linewidth=2,linecolor='black',
                            showgrid=True,gridcolor='darkgray'),
                 yaxis=dict(mirror=True,linewidth=2,linecolor='black',tickfont=dict(size=8)),
                 legend=dict(title='<b>Was the bird seen?</b>',x=0.71,y=0.95,bgcolor='rgba(255, 255, 255, 0)',
                             bordercolor='rgba(255, 255, 255, 0)'),
                 title='<b>Number of Recordings per Country [Top 50]</b><br>(Along with average ratings)')
fig.show()

**Analysis from the plot above:-**
- The most recordings have come from the USA, with Canada a distant second at less than 1/5th of the USA's count.
- In majority of cases the bird is seen while recording. 
- Iceland has the worst average rating of 1.2 from 103 recordings. Bird was seen in all the recordings.
- Slovakia has the best average rating of 4.97 from 15 records. Bird was seen in all the recordings.

## Where were the recordings made?
Since I have the latitude and longitude of each recording location, I'll plot the exact locations on a map.

In [None]:
from kaggle_secrets import UserSecretsClient
# Fetch the API tokens stored as Kaggle user secrets.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("gmaps")
secret_value_1 = user_secrets.get_secret("mapboxtoken")
# Parse the coordinates as float64. Unparseable entries such as
# 'Not specified' become NaN. float16 was far too coarse for geographic
# coordinates (steps of 0.0625-0.125 degrees at typical magnitudes), and
# to_numeric also works if the cell is re-run on already-numeric columns.
df_train['latitude'] = pd.to_numeric(df_train['latitude'],errors='coerce')
df_train['longitude'] = pd.to_numeric(df_train['longitude'],errors='coerce')

In [None]:
px.set_mapbox_access_token(secret_value_1)

# One marker per recording: sized by duration, coloured by rating.
map_options = dict(
    lat='latitude',
    lon='longitude',
    size='duration',
    color='rating',
    hover_name='species',
    hover_data=['duration', 'country', 'elevation'],
    color_continuous_scale=px.colors.sequential.Viridis,
    mapbox_style='open-street-map',
    zoom=0.5,
)
fig = px.scatter_mapbox(df_train, **map_options)

# NOTE(review): update_geos targets `geo` subplots; presumably it has no
# effect on a mapbox figure - confirm before relying on these settings.
fig.update_geos(fitbounds="locations", visible=True, projection_type="mercator")
fig.update_layout(
    height=500,
    width=700,
    margin={"r": 0, "t": 50, "l": 0, "b": 0},
    title='<b>Recording Locations</b>',
    template='seaborn',
    hoverlabel={'bgcolor': "white", 'font_size': 16, 'font_family': "Rockwell"},
)
fig.show()

**As seen above, most of the recordings were made in North America.**

# Recordist Analysis
Now let's analyse the recordists. A recordist is the person who records the bird chirp. In this dataset we have two columns, `author` and `recordist`, but they are actually the same. Therefore, the person recording the chirp and the person posting it online are the same.

In the plot below, I'll use a table to show details about all the recordists and a bar plot showing the top 30 recordists. I'm not selecting the top 30 recordists just by sorting on their ratings, because a recordist with a single recording rated 5 would then look better than a recordist with 100 recordings, of which 99 are rated 5 and one is rated 4. To avoid this bias, I'll take the number of countries and species covered by the recordist into consideration along with the average rating. Below is the formula I used:-

**score = 0.5x(Number of Species Covered) + 0.3x(Number of Countries covered) + 0.2x(Average rating)**

> Please note: All these values are normalized.

So, the top 30 recordists will be chosen on the basis of this score. I believe a recordist who has covered a large number of species is better than one who hasn't covered many, so the weight given to species is 50%. Countries and rating have weights of 30% and 20% respectively.

In [None]:
def normalize(x):
    """Min-max scale ``x`` into the range [0, 1].

    Args:
        x: Array-like supporting ``.min()``, ``.max()`` and element-wise
            arithmetic (e.g. a pandas Series or a numpy array).

    Returns:
        ``(x - min) / (max - min)``. When every value is identical the range
        is zero, so all zeros are returned instead of dividing by zero
        (which would turn the whole result into NaN).
    """
    xmin = x.min()
    xmax = x.max()
    span = xmax - xmin
    if span == 0:
        # Constant input: multiply by 0.0 to keep a float result of the same shape.
        return (x - xmin) * 0.0
    return (x - xmin) / span

# Per-recordist summary: mean rating, distinct species and distinct countries.
recordists = (
    df_train.groupby('author', as_index=False)
    .agg({'rating': 'mean', 'species': 'nunique', 'country': 'nunique'})
    .sort_values(['species', 'rating'], ascending=False)
)
recordists['rating'] = recordists['rating'].round(2)
# Weighted score on the normalised columns: 50% species, 30% countries, 20% rating.
recordists['total'] = (
    normalize(recordists['species']) * 0.5
    + normalize(recordists['country']) * 0.3
    + normalize(recordists['rating']) * 0.2
)

# Table of all recordists on top, bar chart of the top 30 below.
fig = make_subplots(
    rows=2,
    cols=1,
    specs=[[{'type': 'table', 'rowspan': 1}], [{'rowspan': 1}]],
    vertical_spacing=0.03,
    horizontal_spacing=0.03,
    shared_xaxes=True,
)

# The score column is used for ranking only and is left out of the table.
table_columns = ['author', 'rating', 'species', 'country']
recordist_table = go.Table(
    columnorder=[1, 2, 3, 4],
    columnwidth=[250, 150, 150, 150],
    header={
        'values': ["<b>Recordist</b>", "<b>Mean Rating</b>", "<b>Species Covered</b>",
                   "<b>Countries Covered</b>"],
        'line_color': 'darkslategray',
        'fill_color': 'royalblue',
        'font': {'color': 'white', 'size': 12},
        'align': ['center'],
    },
    cells={
        'values': [recordists[column].tolist() for column in table_columns],
        'align': "center",
        'line_color': 'darkslategray',
        'fill': {'color': ['paleturquoise', 'white']},
    },
)
fig.add_trace(recordist_table, row=1, col=1)

# Top 30 recordists by score; species and country counts are stacked and the
# mean rating is printed above the 'Countries' segment.
rec = recordists.nlargest(30, 'total')
species_bars = go.Bar(
    name='Species',
    x=rec['author'],
    y=rec['species'],
    marker=dict(color='#F1EA49', line=dict(width=1.5, color='black')),
)
country_bars = go.Bar(
    name='Countries',
    x=rec['author'],
    y=rec['country'],
    marker=dict(color='#3893D2', line=dict(width=1.5, color='black')),
    text=rec['rating'],
    textposition='outside',
)
for bars in (species_bars, country_bars):
    fig.add_trace(bars, row=2, col=1)

bar_axis_frame = dict(linecolor='black', linewidth=2, showline=True, showgrid=False,
                      mirror=True, ticks='outside', tickfont=dict(size=10))
fig.update_xaxes(row=2, col=1, **bar_axis_frame)
fig.update_yaxes(title='Species + Countries covered', row=2, col=1, **bar_axis_frame)
fig.update_layout(
    template='seaborn',
    width=700,
    height=800,
    title='Recordist Analysis',
    legend={'title': '<b>   Top 30<br>Recordists</b>', 'x': 0.1, 'y': 0.49,
            'bgcolor': 'rgba(255, 255, 255, 0)',
            'bordercolor': 'rgba(255, 255, 255, 0)', 'orientation': 'h'},
    plot_bgcolor=PLOT_BGCOLOR,
    paper_bgcolor=PAPER_BGCOLOR,
    barmode='stack',
    hovermode='x unified',
)
fig.show()

**So, as per my scoring, Mike Nelson is the top recordist, as he has covered the right mix of species and countries, although his recordings have an average rating of 3.95, as can be seen from the plot.**

In [None]:
"""Covert mp3 to wav format"""
# NOTE: the batch conversion below is left commented out, so nothing here
# runs. As written, it would export a .wav copy of every mp3 listed in
# df_train into out_dir/<ebird_code>/, changing the working directory to each
# destination folder before exporting.
# path = '/kaggle/input/birdsong-recognition/train_audio/'
# out_dir = '/kaggle/working/train_audio_wav'
# if not os.path.exists(out_dir):
#     os.makedirs(out_dir)
# for i in range(df_train.shape[0]):
#     src = os.path.join(path,df_train.loc[i,'ebird_code'],df_train.loc[i,'filename'])
#     dst = os.path.join(out_dir,df_train.loc[i,'ebird_code'])
#     if not os.path.exists(dst):
#         os.makedirs(dst)
#     sound = AudioSegment.from_mp3(src)
#     os.chdir(dst)
#     sound.export(df_train.loc[i,'filename'].split('.')[0]+'.wav', format='wav')    

### Just an initial commit. Lot more of EDA to come. Do leave an upvote if you liked my work. It encourages me to produce more quality content.