# Import libraries

In [35]:
# Standard libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 300)

# for grouping the location of birds
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
!pip install pycountry_convert
import pycountry_convert as pc

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# for audio
import librosa

# For drive access
from google.colab import drive
import os
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load train data csv file

In [36]:
# Read the training metadata CSV from the mounted Drive folder and preview it.
metadata_csv = '/content/drive/MyDrive/207/207-Project/notebooks/RG/3_species/train_3_species_metadata_32000sr.csv'
df = pd.read_csv(metadata_csv)

df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,latitude,longitude,duration_secs_32000
0,eaywag1,eaywag1/XC118267.ogg,call,eaywag1/XC118267.npy,good,14.754,-17.411,11.885719
1,eaywag1,eaywag1/XC133264.ogg,call,eaywag1/XC133264.npy,good,56.1285,47.3607,5.459594
2,eaywag1,eaywag1/XC133266.ogg,song,eaywag1/XC133266.npy,good,56.1286,47.3598,42.971438
3,eaywag1,eaywag1/XC134075.ogg,call,eaywag1/XC134075.npy,good,47.1147,20.0626,6.34775
4,eaywag1,eaywag1/XC138503.ogg,call,eaywag1/XC138503.npy,good,54.5689,11.9426,69.746938


In [37]:
# Number of rows (one per recording) in the metadata frame.
df.shape[0]

1044

# Add a new column to df with the country of each sample

In [38]:
# How many rows have no latitude recorded?
df['latitude'].isna().sum()

16

In [39]:
# How many rows have no longitude recorded?
df['longitude'].isna().sum()

16

In [40]:
# One (latitude, longitude) pair per row, in the same order as df.
locations = df[['latitude', 'longitude']].to_numpy()

In [41]:
# Reverse-geocode every (latitude, longitude) pair to an upper-cased country
# code, producing one entry in `countries` per row of `locations`.
#
# The geocoder and its rate limiter are created ONCE, outside the loop.
# RateLimiter tracks the time of its previous call per instance, so building a
# fresh one on every iteration means min_delay_seconds is never enforced.
geolocator = Nominatim(user_agent="<APP_NAME>", timeout=10)
geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)

countries = []

for location in locations:
  if np.isnan(location[0]) or np.isnan(location[1]):
    # Missing coordinates: nothing to look up.
    country = 'Unknown'
  else:
    locate = geocode(location, language='en')
    if locate is None:
      # NOTE(review): a None result is labelled 'Antarctica' (kept from the
      # original). This is not a 2-letter code, so confirm that the
      # continent-lookup step can handle it before relying on it.
      country = 'Antarctica'
    else:
      address = locate.raw.get('address')
      if isinstance(address, dict) and 'country_code' in address:
        country = address['country_code']
      else:
        # No usable country code in the response. The previous code assigned
        # the whole address dict here, which then crashed on `.upper()`.
        country = 'Unknown'
  countries.append(country.upper())


In [42]:
# Sanity check: exactly one country entry was produced per row of df.
len(countries) == len(df)

True

In [43]:
# Attach the country codes as a new column. `countries` was built by iterating
# over the latitude/longitude rows of df, so the list order matches row order.
df['country'] = countries

In [44]:
# Number of samples per country code, most frequent first
# ('UNKNOWN' marks rows whose country could not be determined).
df['country'].value_counts()

FR         134
ES         133
RU         101
GB          94
PL          57
DE          48
PT          46
SE          46
NL          40
IE          28
FI          28
US          26
BE          26
UA          22
UNKNOWN     16
MX          15
AT          14
KZ          13
DK          13
IT          12
EE          12
CN          11
NO          10
HR          10
UZ           9
CA           7
ZA           7
IN           5
GR           4
TR           4
TH           4
TW           3
BR           3
GE           3
HU           3
MN           3
KR           2
SA           2
SN           2
AR           2
IR           2
CH           2
IL           2
ZM           2
MA           2
SG           1
ID           1
LA           1
BO           1
BA           1
CY           1
CV           1
CD           1
KW           1
MY           1
OM           1
JP           1
BG           1
AE           1
LV           1
RO           1
Name: country, dtype: int64

In [45]:
# Show the rows whose country is 'UNKNOWN', alongside their coordinates.
df.loc[df['country'] == 'UNKNOWN', ['primary_label', 'latitude', 'longitude', 'country']]


Unnamed: 0,primary_label,latitude,longitude,country
554,comsan,,,UNKNOWN
710,barswa,,,UNKNOWN
712,barswa,,,UNKNOWN
713,barswa,,,UNKNOWN
723,barswa,,,UNKNOWN
724,barswa,,,UNKNOWN
726,barswa,,,UNKNOWN
727,barswa,,,UNKNOWN
728,barswa,,,UNKNOWN
742,barswa,,,UNKNOWN


In [46]:
# Confirm, row by row, that a sample has an 'UNKNOWN' country exactly when its
# latitude or longitude is missing. Comparing only the two counts (and only
# longitude) could pass even if the sets of rows were different.
missing_coords = df[['latitude', 'longitude']].isnull().any(axis=1)
(df['country'].eq('UNKNOWN') == missing_coords).all()

True

# Add a new column to df with the continent of each sample

In [48]:
# Map every country code to a continent code, producing one entry in
# `continents` per entry in `countries`.
continents = []

for country in countries:
  if country == 'UNKNOWN':
    continent = 'UNKNOWN'
  else:
    try:
      continent = pc.country_alpha2_to_continent_code(country)
    except KeyError:
      # pycountry_convert raises KeyError for values it has no continent
      # mapping for (e.g. a placeholder that is not a valid alpha-2 code).
      # Record them as unknown instead of aborting the whole cell.
      continent = 'UNKNOWN'
  continents.append(continent)

In [49]:
# Sanity check: one continent entry per country entry.
len(countries) == len(continents)

True

In [50]:
# Sanity check: one continent entry per row of df.
len(df) == len(continents)

True

In [59]:
# Attach the continent codes as a new column. `continents` was built by
# iterating over `countries`, so it is in the same order as the rows of df.
df['continent'] = continents

In [60]:
# Rows with an unknown country should also show an unknown continent.
df.loc[df['country'] == 'UNKNOWN', ['primary_label', 'country', 'continent']]

Unnamed: 0,primary_label,country,continent
554,comsan,UNKNOWN,UNKNOWN
710,barswa,UNKNOWN,UNKNOWN
712,barswa,UNKNOWN,UNKNOWN
713,barswa,UNKNOWN,UNKNOWN
723,barswa,UNKNOWN,UNKNOWN
724,barswa,UNKNOWN,UNKNOWN
726,barswa,UNKNOWN,UNKNOWN
727,barswa,UNKNOWN,UNKNOWN
728,barswa,UNKNOWN,UNKNOWN
742,barswa,UNKNOWN,UNKNOWN


In [61]:
# Number of samples per continent code, most frequent first
# (here 'NA' is the string code for North America, not a missing value).
df['continent'].value_counts()

EU         887
AS          72
NA          48
UNKNOWN     16
AF          15
SA           6
Name: continent, dtype: int64

In [62]:
# Replace the two-letter continent codes with readable names. North America
# (NA) and South America (SA) each have only a handful of samples, so both are
# merged into a single 'AMERICAS' group. Values not listed (e.g. 'UNKNOWN')
# are left untouched.
continent_names = {
  'NA': 'AMERICAS',
  'SA': 'AMERICAS',
  'EU': 'EUROPE',
  'AS': 'ASIA',
  'AF': 'AFRICA',
}
df['continent'] = df['continent'].replace(continent_names)

In [63]:
# Re-check the distribution after relabelling the continent codes.
df['continent'].value_counts()

EUROPE      887
ASIA         72
AMERICAS     54
UNKNOWN      16
AFRICA       15
Name: continent, dtype: int64

# Confirm the df is as expected and drop the latitude/longitude columns

In [64]:
# Preview the frame with the new country and continent columns.
df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,latitude,longitude,duration_secs_32000,country,continent
0,eaywag1,eaywag1/XC118267.ogg,call,eaywag1/XC118267.npy,good,14.754,-17.411,11.885719,SN,AFRICA
1,eaywag1,eaywag1/XC133264.ogg,call,eaywag1/XC133264.npy,good,56.1285,47.3607,5.459594,RU,EUROPE
2,eaywag1,eaywag1/XC133266.ogg,song,eaywag1/XC133266.npy,good,56.1286,47.3598,42.971438,RU,EUROPE
3,eaywag1,eaywag1/XC134075.ogg,call,eaywag1/XC134075.npy,good,47.1147,20.0626,6.34775,HU,EUROPE
4,eaywag1,eaywag1/XC138503.ogg,call,eaywag1/XC138503.npy,good,54.5689,11.9426,69.746938,DK,EUROPE


In [65]:
# Drop the raw coordinates now that country and continent have been derived.
# Rebinding df (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['latitude', 'longitude'], errors='ignore')

In [66]:
# Preview the frame after the latitude/longitude columns were dropped.
df.head()

Unnamed: 0,primary_label,filename,type,filename_npy,rating,duration_secs_32000,country,continent
0,eaywag1,eaywag1/XC118267.ogg,call,eaywag1/XC118267.npy,good,11.885719,SN,AFRICA
1,eaywag1,eaywag1/XC133264.ogg,call,eaywag1/XC133264.npy,good,5.459594,RU,EUROPE
2,eaywag1,eaywag1/XC133266.ogg,song,eaywag1/XC133266.npy,good,42.971438,RU,EUROPE
3,eaywag1,eaywag1/XC134075.ogg,call,eaywag1/XC134075.npy,good,6.34775,HU,EUROPE
4,eaywag1,eaywag1/XC138503.ogg,call,eaywag1/XC138503.npy,good,69.746938,DK,EUROPE


# Save the updated df to csv

In [67]:
# Write the metadata with the location columns to Drive, without the row index.
output_csv = '/content/drive/MyDrive/207/207-Project/notebooks/RG/3_species/train_3_species_metadata_32000sr_w_location.csv'
df.to_csv(output_csv, index=False)