# Import libraries

NOTE: This code was run on my local machine, it will not run on Google Colab without adjusting the file paths first. Also the duration extraction will take a long time to run on Colab.

In [1]:
# Standard libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 300)

# for grouping the location of birds
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# for audio
import librosa


# Load train data csv file

In [2]:
df = pd.read_csv('train_metadata.csv')

df.head()

Unnamed: 0,primary_label,common_name,scientific_name,species_rank,filename,dataset,duration_secs,duration_mins,duration_hrs,species_total_files,...,rating,latitude,longitude,author,url,license,class_weight_file,class_weight_duration,class_weight_combined,orig_index
0,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC110335.ogg,train,119.275102,1.987918,0.033132,499,...,4.5,58.5264,13.8637,Patrik Åberg,https://www.xeno-canto.org/110335,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13845
1,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC110336.ogg,train,122.044127,2.034069,0.033901,499,...,5.0,59.1763,15.4038,Patrik Åberg,https://www.xeno-canto.org/110336,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13846
2,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC118260.ogg,train,44.382041,0.739701,0.012328,499,...,5.0,61.565,29.565,Steve Klasan,https://www.xeno-canto.org/118260,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13848
3,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC120947.ogg,train,189.231066,3.153851,0.052564,499,...,4.5,52.443,21.094,Lars Lachmann,https://www.xeno-canto.org/120947,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13849
4,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC125024.ogg,train,45.312018,0.7552,0.012587,499,...,2.5,54.577,11.9226,Louis A. Hansen,https://www.xeno-canto.org/125024,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13850


In [3]:
len(df)

3278

Keep only columns of interest

In [4]:
df.columns

Index(['primary_label', 'common_name', 'scientific_name', 'species_rank',
       'filename', 'dataset', 'duration_secs', 'duration_mins', 'duration_hrs',
       'species_total_files', 'species_train_files', 'species_test_files',
       'species_total_duration_hrs', 'species_train_duration_hrs',
       'species_test_duration_hrs', 'total_files', 'train_files', 'test_files',
       'total_duration_hrs', 'train_duration_hrs', 'test_duration_hrs',
       'filename_npy', 'type', 'secondary_labels', 'rating', 'latitude',
       'longitude', 'author', 'url', 'license', 'class_weight_file',
       'class_weight_duration', 'class_weight_combined', 'orig_index'],
      dtype='object')

In [5]:
columns = ['primary_label', 'filename', 'type', 'filename_npy', 'rating', 'latitude','longitude']
df = df[columns]
df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,latitude,longitude
0,thrnig1,thrnig1/XC110335.ogg,['song'],thrnig1/XC110335.npy,4.5,58.5264,13.8637
1,thrnig1,thrnig1/XC110336.ogg,['song'],thrnig1/XC110336.npy,5.0,59.1763,15.4038
2,thrnig1,thrnig1/XC118260.ogg,"['male', 'song']",thrnig1/XC118260.npy,5.0,61.565,29.565
3,thrnig1,thrnig1/XC120947.ogg,['song'],thrnig1/XC120947.npy,4.5,52.443,21.094
4,thrnig1,thrnig1/XC125024.ogg,"['call', 'song']",thrnig1/XC125024.npy,2.5,54.577,11.9226


Keep only species of interest (based on the duration slide, these 3 have similar total durations)

In [6]:
species = ['eaywag1', 'comsan', 'barswa']

In [7]:
df = df[df['primary_label'].isin(species)]

In [8]:
len(df)

1044

# Find duration of all samples

In [9]:
filenames = df['filename']
filenames

699     eaywag1/XC118267.ogg
700     eaywag1/XC133264.ogg
701     eaywag1/XC133266.ogg
702     eaywag1/XC134075.ogg
703     eaywag1/XC138503.ogg
                ...         
1738     barswa/XC746605.ogg
1739     barswa/XC746612.ogg
1740     barswa/XC746615.ogg
1741     barswa/XC747404.ogg
1742     barswa/XC749307.ogg
Name: filename, Length: 1044, dtype: object

In [10]:
durations = []

for filename in filenames:
  audio, _ = librosa.load(filename, sr=32000)
  duration = librosa.get_duration(y=audio, sr=32000)
  durations.append(duration)

In [11]:
len(durations)

1044

In [12]:
df['duration_secs_32000'] = durations

# change the type to either blank, 'song', 'call', or 'both'

In [13]:
df['type'].value_counts().head(10)

['flight call']                                                       151
['call']                                                              138
['']                                                                  112
['song']                                                              102
['nocturnal flight call']                                              90
['call', 'flight call']                                                46
['male', 'song']                                                       30
['life stage uncertain', 'nocturnal flight call', 'sex uncertain']     22
['flight call', 'nocturnal flight call']                               22
['alarm call']                                                         18
Name: type, dtype: int64

In [14]:
df.loc[df['type'].str.contains('call') & df['type'].str.contains('song'), 'type'] = 'both'
df.loc[df['type'].str.contains('call'), 'type'] = 'call'
df.loc[df['type'].str.contains('song|Song'), 'type'] = 'song'
df.loc[df['type'] == "['']", 'type'] = 'blank'

In [15]:
df['type'].value_counts().head(10)

call                         688
song                         162
blank                        112
both                          76
['uncertain']                  3
['Single Note Calls']          1
['Adults feeding babies']      1
['juvenile']                   1
Name: type, dtype: int64

In [16]:
df.loc[(df['type'] != 'call') & (df['type'] != 'song') & (df['type'] != 'blank') & (df['type'] != 'both'), 'type'] = 'blank'

In [17]:
df['type'].value_counts().head(10)

call     688
song     162
blank    118
both      76
Name: type, dtype: int64

# change rating to binary

In [18]:
df['rating'].value_counts()

4.0    278
5.0    230
3.0    185
3.5     94
4.5     93
2.0     60
2.5     52
0.0     23
1.0     17
1.5     11
0.5      1
Name: rating, dtype: int64

In [19]:
df['rating'] = np.where(df['rating'] >= 3.0, 'good', 'poor')

In [20]:
df['rating'].value_counts()

good    880
poor    164
Name: rating, dtype: int64

# Save the updated csv file

In [21]:
df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,latitude,longitude,duration_secs_32000
699,eaywag1,eaywag1/XC118267.ogg,call,eaywag1/XC118267.npy,good,14.754,-17.411,11.885719
700,eaywag1,eaywag1/XC133264.ogg,call,eaywag1/XC133264.npy,good,56.1285,47.3607,5.459594
701,eaywag1,eaywag1/XC133266.ogg,song,eaywag1/XC133266.npy,good,56.1286,47.3598,42.971438
702,eaywag1,eaywag1/XC134075.ogg,call,eaywag1/XC134075.npy,good,47.1147,20.0626,6.34775
703,eaywag1,eaywag1/XC138503.ogg,call,eaywag1/XC138503.npy,good,54.5689,11.9426,69.746938


In [22]:
df.to_csv('train_3_species_metadata_32000sr.csv', index=False)