# Google Drive Interface Setup

In [1]:
from google.colab import drive, auth
import sys

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Pull down github repo, store in colab runtime memory
!git clone https://github.com/andrew-loeber/proj_utils_207.git

Cloning into 'proj_utils_207'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 34 (delta 10), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (34/34), 8.70 KiB | 1.09 MiB/s, done.


In [4]:
# Tell Python to also look in this repo when running an import
sys.path.insert(1, "/content/proj_utils_207")

In [5]:
# Authenticate google account and give back the session access token
auth.authenticate_user()
gcloud_token = !gcloud auth print-access-token
gcloud_token

['ya29.a0AbVbY6Pw2tLLBPqEmu65IJZlKmr8FLI99rHcO20E-f-pCrikaZ-GndbTJVDCWCTo38m3yGkEEv4Quulwu-I_q8XrrP1QOQCZ7nHqSHuhsnD6ZmsljsY6R3vwkOzM1bXzxjcWoW7om7rZRJgA7BHIbPoyMHyGaCgYKAUISARASFQFWKvPliU_M-GdgO6KheEc55FOuRg0163']

In [6]:
from gdriveinterface import GDriveInterface
import proj_ref

In [7]:
# Show email address, username, and the path of the shared project folder on
# your Google Drive instance
gdi = GDriveInterface(gcloud_token)
shared_folder_path = gdi.shared_folder_path
gdi.__dict__

{'email': 'aloeber@berkeley.edu',
 'account': 'aloeber',
 'shared_folder_path': '/content/drive/MyDrive/207-Project'}

In [8]:
# Show name keys and paths for tracked files
proj_ref.files

{'taxonomy': 'BirdCLEF/eBird_Taxonomy_v2021.csv',
 'orig_metadata': 'BirdCLEF/train_metadata_with_duration.csv',
 'sample_metadata': 'data/sample_metadata.csv',
 'species_metadata': 'data/species_metadata.csv',
 'train_metadata': 'data/train_metadata.csv',
 'test_metadata': 'data/test_metadata.csv'}

In [9]:
# Show name keys and paths for tracked directories
proj_ref.dirs

{'orig_audio': {'path': 'BirdCLEF/train_audio'},
 'train_audio': {'path': 'data/train/audio_files'},
 'test_audio': {'path': 'data/test/audio_files'},
 'train_npy_full': {'path': 'data/train/librosa_loaded'},
 'test_npy_full': {'path': 'data/test/librosa_loaded'},
 'train_npy_loud5s': {'path': 'data/train/librosa_loaded_loudest_5sec'},
 'test_npy_loud5s': {'path': 'data/test/librosa_loaded_loudest_5sec'}}

### **Usage examples**

##### Query specific file based on named key
```
command: gdi.get_file_path('taxonomy')
returns: '/content/drive/MyDrive/207-Project/BirdCLEF/eBird_Taxonomy_v2021.csv'
```

##### Query specific directory based on named key
```
command: gdi.get_dir_path('orig_audio')
returns: '/content/drive/MyDrive/207-Project/BirdCLEF/train_audio'
```

##### Query specific sample record based on named keys for directory and file
```
command: gdi.get_sample_path('train_audio', 'barswa/XC132406.ogg')
returns: '/content/drive/MyDrive/207-Project/data/train/audio_files/barswa/XC132406.ogg'
```

##### Provide full path given a custom partial path
```
command: gdi.join_to_shared('images/sandpiper.jpeg')
returns: '/content/drive/MyDrive/207-Project/images/sandpiper.jpeg'
```

# Setup

In [10]:
import os
import shutil
import numpy as np
import pandas as pd

In [75]:
pd.set_option('display.max_columns', None)

In [11]:
# Derive new file paths
orig_data_path = f'{shared_folder_path}/BirdCLEF'
orig_trainaudio_path = gdi.get_dir_path('orig_audio')
new_data_path = f'{shared_folder_path}/data'

# Load existing train and test metadata files

In [12]:
train_metadata = pd.read_csv(gdi.get_file_path('train_metadata'))
train_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,librosa_loaded
0,thrnig1,[],['song'],58.5264,13.8637,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/110335,thrnig1/XC110335.ogg,119.275102,500,61606.560907,17.112934,0,train,thrnig1/XC110335.npy
1,thrnig1,[],['song'],59.1763,15.4038,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/110336,thrnig1/XC110336.ogg,122.044127,500,61606.560907,17.112934,0,train,thrnig1/XC110336.npy
2,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Steve Klasan,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/118260,thrnig1/XC118260.ogg,44.382041,500,61606.560907,17.112934,0,train,thrnig1/XC118260.npy
3,thrnig1,[],['song'],52.443,21.094,Luscinia luscinia,Thrush Nightingale,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/120947,thrnig1/XC120947.ogg,189.231066,500,61606.560907,17.112934,0,train,thrnig1/XC120947.npy
4,thrnig1,[],"['call', 'song']",54.577,11.9226,Luscinia luscinia,Thrush Nightingale,Louis A. Hansen,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/125024,thrnig1/XC125024.ogg,45.312018,500,61606.560907,17.112934,0,train,thrnig1/XC125024.npy


In [13]:
train_filenames = list(train_metadata['filename'])
train_filenames[:5]

['thrnig1/XC110335.ogg',
 'thrnig1/XC110336.ogg',
 'thrnig1/XC118260.ogg',
 'thrnig1/XC120947.ogg',
 'thrnig1/XC125024.ogg']

In [14]:
test_metadata = pd.read_csv(gdi.get_file_path('test_metadata'))
test_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,librosa_loaded
0,thrnig1,[],['song'],59.851,17.623,Luscinia luscinia,Thrush Nightingale,Sander Bot,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/112492,thrnig1/XC112492.ogg,58.540408,500,61606.560907,17.112934,0,test,thrnig1/XC112492.npy
1,thrnig1,[],"['male', 'song']",56.0889,47.2543,Luscinia luscinia,Thrush Nightingale,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132295,thrnig1/XC132295.ogg,73.769796,500,61606.560907,17.112934,0,test,thrnig1/XC132295.npy
2,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Stuart Fisher,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/133558,thrnig1/XC133558.ogg,78.759229,500,61606.560907,17.112934,0,test,thrnig1/XC133558.npy
3,thrnig1,[],['song'],51.3506,23.0467,Luscinia luscinia,Thrush Nightingale,Jarek Matusiak,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/134265,thrnig1/XC134265.ogg,394.187755,500,61606.560907,17.112934,0,test,thrnig1/XC134265.npy
4,thrnig1,[],"['male', 'song']",56.7542,46.845,Luscinia luscinia,Thrush Nightingale,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/135792,thrnig1/XC135792.ogg,42.344535,500,61606.560907,17.112934,0,test,thrnig1/XC135792.npy


In [15]:
test_filenames = list(test_metadata['filename'])
test_filenames[:5]

['thrnig1/XC112492.ogg',
 'thrnig1/XC132295.ogg',
 'thrnig1/XC133558.ogg',
 'thrnig1/XC134265.ogg',
 'thrnig1/XC135792.ogg']

In [16]:
len(train_filenames + test_filenames)

4680

# Pre-filter data and find top 10 species

In [171]:
meta_df = pd.read_csv(gdi.get_file_path('orig_metadata'), delimiter=',')
meta_df

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,45.609796
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,18.677596
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg,38.765760
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg,29.257188
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg,42.344535
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/703472,yewgre1/XC703472.ogg,20.924127
16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/703485,yewgre1/XC703485.ogg,21.707755
16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/704433,yewgre1/XC704433.ogg,80.875102
16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/752974,yewgre1/XC752974.ogg,51.800816


In [172]:
meta_df.rename({'duration': 'duration_secs'}, axis=1, inplace=True)
meta_df['over_5_sec'] = meta_df['duration_secs'] >= 5.0
meta_df['duration_mins'] = meta_df['duration_secs'] / (60)
meta_df['duration_hrs'] = meta_df['duration_secs'] / (60 * 60)
meta_df

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,over_5_sec,duration_mins,duration_hrs
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,45.609796,True,0.760163,0.012669
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,18.677596,True,0.311293,0.005188
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg,38.765760,True,0.646096,0.010768
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg,29.257188,True,0.487620,0.008127
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg,42.344535,True,0.705742,0.011762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/703472,yewgre1/XC703472.ogg,20.924127,True,0.348735,0.005812
16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/703485,yewgre1/XC703485.ogg,21.707755,True,0.361796,0.006030
16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/704433,yewgre1/XC704433.ogg,80.875102,True,1.347918,0.022465
16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/752974,yewgre1/XC752974.ogg,51.800816,True,0.863347,0.014389


In [173]:
filenames_to_filter = [
  'thrnig1/XC728016.ogg',
  'barswa/XC575749.ogg',
  'barswa/XC664977.ogg',
  'barswa/XC671721.ogg',
  'barswa/XC747232.ogg',
  'eaywag1/XC645113.ogg',
  'eaywag1/XC669616.ogg',
  'comsan/XC586330.ogg',
  'comsan/XC586332.ogg',
  'comsan/XC613128.ogg',
  'combuz1/XC144258.ogg',
  'combuz1/XC647787.ogg',
  'woosan/XC647845.ogg',
  'eubeat1/XC392184.ogg',
  'eubeat1/XC392191.ogg',
  'eubeat1/XC392193.ogg',
  'hoopoe/XC417570.ogg',
  'cohmar1/XC558316.ogg',
  'cohmar1/XC748727.ogg'
]

meta_df['passes_manual_filter'] = True

for i in range(len(meta_df)):
  filename = meta_df.iloc[i]['filename']
  if filename in filenames_to_filter:
    meta_df.at[i, 'passes_manual_filter'] = False

meta_df[~meta_df['passes_manual_filter']]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,over_5_sec,duration_mins,duration_hrs,passes_manual_filter
1204,barswa,[],"['call', 'song', 'various calls']",58.7542,23.8439,Hirundo rustica,Barn Swallow,Stanislas Wroza,Creative Commons Attribution-NonCommercial-Sha...,0.0,https://www.xeno-canto.org/575749,barswa/XC575749.ogg,36.12,True,0.602,0.010033,False
1290,barswa,[],"['adult', 'song']",52.8858,23.8293,Hirundo rustica,Barn Swallow,Ireneusz Oleksik,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/664977,barswa/XC664977.ogg,64.704036,True,1.078401,0.017973,False
1318,barswa,[],['call'],51.3672,5.8406,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,3.0,http://xeno-canto.org/671721,barswa/XC671721.ogg,10.057007,True,0.167617,0.002794,False
1390,barswa,[],[''],56.6346,9.7837,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/747232,barswa/XC747232.ogg,10.109025,True,0.168484,0.002808,False
3865,cohmar1,[],['flight call'],50.4639,3.6821,Delichon urbicum,Common House-Martin,Alain Malengreau,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/558316,cohmar1/XC558316.ogg,17.423039,True,0.290384,0.00484,False
4072,cohmar1,[],[''],48.2741,10.8188,Delichon urbicum,Common House-Martin,johannes buhl,Creative Commons Attribution-NonCommercial-Sha...,1.0,https://xeno-canto.org/748727,cohmar1/XC748727.ogg,6.817007,True,0.113617,0.001894,False
4570,combuz1,[],['flight call'],55.2375,47.6393,Buteo buteo,Common Buzzard,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,2.0,https://www.xeno-canto.org/144258,combuz1/XC144258.ogg,38.739637,True,0.645661,0.010761,False
4938,combuz1,[],"['call', 'courtship/copulation calls (?)']",52.2321,-8.6701,Buteo buteo,Common Buzzard,Irish Wildlife Sounds,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/647787,combuz1/XC647787.ogg,77.714014,True,1.295234,0.021587,False
5288,comsan,[],['nocturnal flight call'],51.4309,-2.8518,Actitis hypoleucos,Common Sandpiper,Paul Williams,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/586330,comsan/XC586330.ogg,7.026032,True,0.117101,0.001952,False
5289,comsan,[],['nocturnal flight call'],51.4309,-2.8518,Actitis hypoleucos,Common Sandpiper,Paul Williams,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/586332,comsan/XC586332.ogg,7.026032,True,0.117101,0.001952,False


In [174]:
meta_df_filt = meta_df[meta_df['over_5_sec'] & meta_df['passes_manual_filter']]
meta_df_filt.reset_index(drop=False, names='orig_index', inplace=True)
meta_df_filt.drop(['over_5_sec', 'passes_manual_filter'], axis=1, inplace=True)
meta_df_filt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df_filt.drop(['over_5_sec', 'passes_manual_filter'], axis=1, inplace=True)


Unnamed: 0,orig_index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,duration_mins,duration_hrs
0,0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,45.609796,0.760163,0.012669
1,1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,18.677596,0.311293,0.005188
2,2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg,38.765760,0.646096,0.010768
3,3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg,29.257188,0.487620,0.008127
4,4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg,42.344535,0.705742,0.011762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16040,16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/703472,yewgre1/XC703472.ogg,20.924127,0.348735,0.005812
16041,16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/703485,yewgre1/XC703485.ogg,21.707755,0.361796,0.006030
16042,16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/704433,yewgre1/XC704433.ogg,80.875102,1.347918,0.022465
16043,16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/752974,yewgre1/XC752974.ogg,51.800816,0.863347,0.014389


In [175]:
meta_df_filt.drop_duplicates(
    subset=['duration_secs', 'type', 'primary_label', 'author']
    , keep='first'
    , inplace=True
)
meta_df_filt.reset_index(drop=True, inplace=True)
meta_df_filt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df_filt.drop_duplicates(


Unnamed: 0,orig_index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,duration_mins,duration_hrs
0,0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,45.609796,0.760163,0.012669
1,1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,18.677596,0.311293,0.005188
2,2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg,38.765760,0.646096,0.010768
3,3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg,29.257188,0.487620,0.008127
4,4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg,42.344535,0.705742,0.011762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/703472,yewgre1/XC703472.ogg,20.924127,0.348735,0.005812
15995,16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/703485,yewgre1/XC703485.ogg,21.707755,0.361796,0.006030
15996,16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/704433,yewgre1/XC704433.ogg,80.875102,1.347918,0.022465
15997,16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/752974,yewgre1/XC752974.ogg,51.800816,0.863347,0.014389


In [176]:
# Aggregate file count and audio duration by species
meta_species_agg = meta_df_filt.groupby('primary_label')['duration_hrs'].agg([len, sum])
meta_species_agg.rename(
    {'len': 'species_total_files', 'sum': 'species_total_duration_hrs'}
    , axis=1
    , inplace=True
)
meta_species_agg.sort_values(['species_total_files', 'species_total_duration_hrs'], ascending=[False, False], inplace=True)
meta_species_agg['species_rank'] = range(len(meta_species_agg))
meta_species_agg

Unnamed: 0_level_0,species_total_files,species_total_duration_hrs,species_rank
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
thrnig1,499,17.078735,0
wlwwar,499,9.310583,1
eaywag1,498,4.649309,2
comsan,497,4.297684,3
barswa,496,5.308204,4
...,...,...,...
brtcha1,1,0.007670,259
whhsaw1,1,0.002888,260
whctur2,1,0.002641,261
golher1,1,0.002300,262


In [177]:
meta_agg_top10 = meta_species_agg.iloc[:10]
meta_agg_top10

Unnamed: 0_level_0,species_total_files,species_total_duration_hrs,species_rank
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
thrnig1,499,17.078735,0
wlwwar,499,9.310583,1
eaywag1,498,4.649309,2
comsan,497,4.297684,3
barswa,496,5.308204,4
woosan,474,3.730303,5
combuz1,473,6.874921,6
hoopoe,419,6.955822,7
eubeat1,417,7.826683,8
cohmar1,408,5.656965,9


In [178]:
species_info = meta_df_filt[['primary_label', 'scientific_name', 'common_name']].drop_duplicates()
species_info.set_index('primary_label', drop=True, inplace=True)
species_info

Unnamed: 0_level_0,scientific_name,common_name
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1
abethr1,Turdus tephronotus,African Bare-eyed Thrush
abhori1,Oriolus larvatus,African Black-headed Oriole
abythr1,Turdus abyssinicus,Abyssinian Thrush
afbfly1,Elminia longicauda,African Blue Flycatcher
afdfly1,Muscicapa adusta,African Dusky Flycatcher
...,...,...
yertin1,Pogoniulus bilineatus,Yellow-rumped Tinkerbird
yesbar1,Buccanodon duchaillui,Yellow-spotted Barbet
yespet1,Gymnoris pyrgita,Yellow-spotted Bush Sparrow
yetgre1,Atimastillas flavicollis,Yellow-throated Greenbul


In [179]:
species_info_top10 = meta_agg_top10.join(species_info, on='primary_label')
species_info_top10.reset_index(inplace=True)
species_info_top10.set_index('species_rank', inplace=True)
species_info_top10

Unnamed: 0_level_0,primary_label,species_total_files,species_total_duration_hrs,scientific_name,common_name
species_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,thrnig1,499,17.078735,Luscinia luscinia,Thrush Nightingale
1,wlwwar,499,9.310583,Phylloscopus trochilus,Willow Warbler
2,eaywag1,498,4.649309,Motacilla flava,Western Yellow Wagtail
3,comsan,497,4.297684,Actitis hypoleucos,Common Sandpiper
4,barswa,496,5.308204,Hirundo rustica,Barn Swallow
5,woosan,474,3.730303,Tringa glareola,Wood Sandpiper
6,combuz1,473,6.874921,Buteo buteo,Common Buzzard
7,hoopoe,419,6.955822,Upupa epops,Eurasian Hoopoe
8,eubeat1,417,7.826683,Merops apiaster,European Bee-eater
9,cohmar1,408,5.656965,Delichon urbicum,Common House-Martin


In [180]:
top_10_species_labels = list(species_info_top10['primary_label'])
top_10_species_labels

['thrnig1',
 'wlwwar',
 'eaywag1',
 'comsan',
 'barswa',
 'woosan',
 'combuz1',
 'hoopoe',
 'eubeat1',
 'cohmar1']

In [181]:
label_in_top10 = [True if label in top_10_species_labels else False for label in meta_df_filt['primary_label']]
meta_df_top10 = meta_df_filt[label_in_top10].copy()
meta_df_top10.reset_index(drop=True, inplace=True)
meta_df_top10

Unnamed: 0,orig_index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,duration_mins,duration_hrs
0,895,barswa,[],"['call', 'flight call']",46.4605,6.3914,Hirundo rustica,Barn Swallow,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/113914,barswa/XC113914.ogg,57.077596,0.951293,0.015855
1,896,barswa,[],['song'],35.0307,-120.6205,Hirundo rustica,Barn Swallow,Thomas G. Graves,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/129647,barswa/XC129647.ogg,106.728027,1.778800,0.029647
2,897,barswa,[],['song'],45.3675,-73.8566,Hirundo rustica,Barn Swallow,Patrick Turgeon,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132406,barswa/XC132406.ogg,10.788571,0.179810,0.002997
3,898,barswa,[],"['call', 'female', 'male', 'song']",56.1559,47.4939,Hirundo rustica,Barn Swallow,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133096,barswa/XC133096.ogg,11.572290,0.192872,0.003215
4,899,barswa,[],['song'],55.9937,-3.5605,Hirundo rustica,Barn Swallow,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133802,barswa/XC133802.ogg,15.490658,0.258178,0.004303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,16261,woosan,[],[''],60.2183,25.1208,Tringa glareola,Wood Sandpiper,Hannu Varkki,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://xeno-canto.org/754173,woosan/XC754173.ogg,50.016009,0.833600,0.013893
4676,16262,woosan,[],[''],64.1910,21.1130,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754702,woosan/XC754702.ogg,58.546440,0.975774,0.016263
4677,16263,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754763,woosan/XC754763.ogg,78.560499,1.309342,0.021822
4678,16264,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754765,woosan/XC754765.ogg,71.276780,1.187946,0.019799


In [182]:
# meta_df_filt = meta_df_filt.join(meta_species_agg, on='primary_label')
# meta_df_filt

In [183]:
# # Get only files in top 10 species
# meta_df_top10 = meta_df_filt[(meta_df_filt['species_rank'] <= 9)]
# meta_df_top10.sort_values(['species_rank', 'filename'], inplace=True)
# meta_df_top10.reset_index(drop=True, inplace=True)
# meta_df_top10

# Assign files to train or test dataset

In [193]:
# def assign_datasets(meta_df, test_ratio=0.3, random_seed=207):
#   rng = np.random.default_rng(seed=random_seed)
#   df_list = []

#   # Loop through each species' corresponding DataFrame
#   for species_group in meta_df.groupby('primary_label'):
#     species_df = species_group[1].copy()
#     species_df['dataset'] = 'train'
#     # Determine total file number & target test file number
#     n_files = len(species_df)
#     n_test_files = int(n_files * test_ratio)
#     # Make random choices for the test set
#     test_indices = rng.choice(n_files, n_test_files, replace=False)
#     test_indices.sort()
#     species_df['dataset'].iloc[test_indices] = 'test'
#     df_list.append(species_df)

#   return pd.concat(df_list)


In [194]:
meta_df_top10['dataset'] = ''

for i in range(len(meta_df_top10)):
  if meta_df_top10.at[i, 'filename'] in test_filenames:
    meta_df_top10.at[i, 'dataset'] = 'test'
  else:
    meta_df_top10.at[i, 'dataset'] = 'train'

meta_df_top10

Unnamed: 0,orig_index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,duration_mins,duration_hrs,dataset
0,895,barswa,[],"['call', 'flight call']",46.4605,6.3914,Hirundo rustica,Barn Swallow,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/113914,barswa/XC113914.ogg,57.077596,0.951293,0.015855,test
1,896,barswa,[],['song'],35.0307,-120.6205,Hirundo rustica,Barn Swallow,Thomas G. Graves,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/129647,barswa/XC129647.ogg,106.728027,1.778800,0.029647,test
2,897,barswa,[],['song'],45.3675,-73.8566,Hirundo rustica,Barn Swallow,Patrick Turgeon,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132406,barswa/XC132406.ogg,10.788571,0.179810,0.002997,train
3,898,barswa,[],"['call', 'female', 'male', 'song']",56.1559,47.4939,Hirundo rustica,Barn Swallow,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133096,barswa/XC133096.ogg,11.572290,0.192872,0.003215,test
4,899,barswa,[],['song'],55.9937,-3.5605,Hirundo rustica,Barn Swallow,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133802,barswa/XC133802.ogg,15.490658,0.258178,0.004303,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,16261,woosan,[],[''],60.2183,25.1208,Tringa glareola,Wood Sandpiper,Hannu Varkki,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://xeno-canto.org/754173,woosan/XC754173.ogg,50.016009,0.833600,0.013893,test
4676,16262,woosan,[],[''],64.1910,21.1130,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754702,woosan/XC754702.ogg,58.546440,0.975774,0.016263,train
4677,16263,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754763,woosan/XC754763.ogg,78.560499,1.309342,0.021822,test
4678,16264,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754765,woosan/XC754765.ogg,71.276780,1.187946,0.019799,train


In [195]:
meta_df_top10['dataset'].value_counts()

train    3278
test     1402
Name: dataset, dtype: int64

In [196]:
# meta_df_top20_datasets = assign_datasets(meta_df_top20)
# meta_df_top20_datasets.sort_values(['species_rank', 'filename'], inplace=True)
# meta_df_top20_datasets.drop('under_5_sec', axis=1, inplace=True)
# meta_df_top20_datasets

In [197]:
# meta_df_top10.sort_values(['species_rank', 'filename'], inplace=True)
# meta_df_top10.reset_index(drop=True, inplace=True)
# meta_df_top10

In [198]:
train_test_durs_top10 = meta_df_top10.groupby(
    ['primary_label', 'dataset']
)['duration_hrs'].sum().unstack()
train_test_durs_top10.rename({'test': 'species_test_duration_hrs', 'train': 'species_train_duration_hrs'}, axis=1, inplace=True)
train_test_durs_top10

dataset,species_test_duration_hrs,species_train_duration_hrs
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1
barswa,1.675853,3.632351
cohmar1,1.553775,4.103189
combuz1,2.240165,4.634755
comsan,1.433354,2.864329
eaywag1,1.420284,3.229026
eubeat1,2.178028,5.648654
hoopoe,1.781583,5.174239
thrnig1,6.339658,10.739077
wlwwar,2.577132,6.733451
woosan,1.026231,2.704072


In [199]:
train_test_counts_top10 = meta_df_top10.groupby('primary_label')['dataset'].value_counts().unstack()
train_test_counts_top10.rename({'test': 'species_test_files', 'train': 'species_train_files'}, axis=1, inplace=True)
train_test_counts_top10

dataset,species_test_files,species_train_files
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1
barswa,149,347
cohmar1,122,286
combuz1,141,332
comsan,148,349
eaywag1,150,348
eubeat1,125,292
hoopoe,126,293
thrnig1,150,349
wlwwar,149,350
woosan,142,332


In [207]:
species_info_top10_full = species_info_top10.join(train_test_durs_top10, on='primary_label')
species_info_top10_full = species_info_top10_full.join(train_test_counts_top10, on='primary_label')
species_info_top10_full

Unnamed: 0_level_0,primary_label,species_total_files,species_total_duration_hrs,scientific_name,common_name,species_test_duration_hrs,species_train_duration_hrs,species_test_files,species_train_files
species_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,thrnig1,499,17.078735,Luscinia luscinia,Thrush Nightingale,6.339658,10.739077,150,349
1,wlwwar,499,9.310583,Phylloscopus trochilus,Willow Warbler,2.577132,6.733451,149,350
2,eaywag1,498,4.649309,Motacilla flava,Western Yellow Wagtail,1.420284,3.229026,150,348
3,comsan,497,4.297684,Actitis hypoleucos,Common Sandpiper,1.433354,2.864329,148,349
4,barswa,496,5.308204,Hirundo rustica,Barn Swallow,1.675853,3.632351,149,347
5,woosan,474,3.730303,Tringa glareola,Wood Sandpiper,1.026231,2.704072,142,332
6,combuz1,473,6.874921,Buteo buteo,Common Buzzard,2.240165,4.634755,141,332
7,hoopoe,419,6.955822,Upupa epops,Eurasian Hoopoe,1.781583,5.174239,126,293
8,eubeat1,417,7.826683,Merops apiaster,European Bee-eater,2.178028,5.648654,125,292
9,cohmar1,408,5.656965,Delichon urbicum,Common House-Martin,1.553775,4.103189,122,286


In [208]:
species_info_top10_full['total_files'] = np.sum(species_info_top10_full['species_total_files'])
species_info_top10_full['train_files'] = np.sum(species_info_top10_full['species_train_files'])
species_info_top10_full['test_files'] = np.sum(species_info_top10_full['species_test_files'])
species_info_top10_full['total_duration_hrs'] = np.sum(species_info_top10_full['species_total_duration_hrs'])
species_info_top10_full['train_duration_hrs'] = np.sum(species_info_top10_full['species_train_duration_hrs'])
species_info_top10_full['test_duration_hrs'] = np.sum(species_info_top10_full['species_test_duration_hrs'])
species_info_top10_full

Unnamed: 0_level_0,primary_label,species_total_files,species_total_duration_hrs,scientific_name,common_name,species_test_duration_hrs,species_train_duration_hrs,species_test_files,species_train_files,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs
species_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,thrnig1,499,17.078735,Luscinia luscinia,Thrush Nightingale,6.339658,10.739077,150,349,4680,3278,1402,71.689209,49.463144,22.226065
1,wlwwar,499,9.310583,Phylloscopus trochilus,Willow Warbler,2.577132,6.733451,149,350,4680,3278,1402,71.689209,49.463144,22.226065
2,eaywag1,498,4.649309,Motacilla flava,Western Yellow Wagtail,1.420284,3.229026,150,348,4680,3278,1402,71.689209,49.463144,22.226065
3,comsan,497,4.297684,Actitis hypoleucos,Common Sandpiper,1.433354,2.864329,148,349,4680,3278,1402,71.689209,49.463144,22.226065
4,barswa,496,5.308204,Hirundo rustica,Barn Swallow,1.675853,3.632351,149,347,4680,3278,1402,71.689209,49.463144,22.226065
5,woosan,474,3.730303,Tringa glareola,Wood Sandpiper,1.026231,2.704072,142,332,4680,3278,1402,71.689209,49.463144,22.226065
6,combuz1,473,6.874921,Buteo buteo,Common Buzzard,2.240165,4.634755,141,332,4680,3278,1402,71.689209,49.463144,22.226065
7,hoopoe,419,6.955822,Upupa epops,Eurasian Hoopoe,1.781583,5.174239,126,293,4680,3278,1402,71.689209,49.463144,22.226065
8,eubeat1,417,7.826683,Merops apiaster,European Bee-eater,2.178028,5.648654,125,292,4680,3278,1402,71.689209,49.463144,22.226065
9,cohmar1,408,5.656965,Delichon urbicum,Common House-Martin,1.553775,4.103189,122,286,4680,3278,1402,71.689209,49.463144,22.226065


In [209]:
species_info_top10_full.reset_index(inplace=True)
species_info_top10_full.set_index('primary_label', inplace=True)
species_info_top10_full

Unnamed: 0_level_0,species_rank,species_total_files,species_total_duration_hrs,scientific_name,common_name,species_test_duration_hrs,species_train_duration_hrs,species_test_files,species_train_files,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
thrnig1,0,499,17.078735,Luscinia luscinia,Thrush Nightingale,6.339658,10.739077,150,349,4680,3278,1402,71.689209,49.463144,22.226065
wlwwar,1,499,9.310583,Phylloscopus trochilus,Willow Warbler,2.577132,6.733451,149,350,4680,3278,1402,71.689209,49.463144,22.226065
eaywag1,2,498,4.649309,Motacilla flava,Western Yellow Wagtail,1.420284,3.229026,150,348,4680,3278,1402,71.689209,49.463144,22.226065
comsan,3,497,4.297684,Actitis hypoleucos,Common Sandpiper,1.433354,2.864329,148,349,4680,3278,1402,71.689209,49.463144,22.226065
barswa,4,496,5.308204,Hirundo rustica,Barn Swallow,1.675853,3.632351,149,347,4680,3278,1402,71.689209,49.463144,22.226065
woosan,5,474,3.730303,Tringa glareola,Wood Sandpiper,1.026231,2.704072,142,332,4680,3278,1402,71.689209,49.463144,22.226065
combuz1,6,473,6.874921,Buteo buteo,Common Buzzard,2.240165,4.634755,141,332,4680,3278,1402,71.689209,49.463144,22.226065
hoopoe,7,419,6.955822,Upupa epops,Eurasian Hoopoe,1.781583,5.174239,126,293,4680,3278,1402,71.689209,49.463144,22.226065
eubeat1,8,417,7.826683,Merops apiaster,European Bee-eater,2.178028,5.648654,125,292,4680,3278,1402,71.689209,49.463144,22.226065
cohmar1,9,408,5.656965,Delichon urbicum,Common House-Martin,1.553775,4.103189,122,286,4680,3278,1402,71.689209,49.463144,22.226065


In [211]:
species_info_top10_full['class_weight_file'] = \
  np.mean(species_info_top10_full['species_total_files']) / species_info_top10_full['species_total_files']

species_info_top10_full['class_weight_duration'] = \
  np.mean(species_info_top10_full['species_total_duration_hrs']) / species_info_top10_full['species_total_duration_hrs']

species_info_top10_full['class_weight_combined'] = \
  np.sqrt(species_info_top10_full['class_weight_file'] * species_info_top10_full['class_weight_duration'])

species_info_top10_full

Unnamed: 0_level_0,species_rank,species_total_files,species_total_duration_hrs,scientific_name,common_name,species_test_duration_hrs,species_train_duration_hrs,species_test_files,species_train_files,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,class_weight_file,class_weight_duration,class_weight_combined
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
thrnig1,0,499,17.078735,Luscinia luscinia,Thrush Nightingale,6.339658,10.739077,150,349,4680,3278,1402,71.689209,49.463144,22.226065,0.937876,0.419757,0.627439
wlwwar,1,499,9.310583,Phylloscopus trochilus,Willow Warbler,2.577132,6.733451,149,350,4680,3278,1402,71.689209,49.463144,22.226065,0.937876,0.769975,0.849789
eaywag1,2,498,4.649309,Motacilla flava,Western Yellow Wagtail,1.420284,3.229026,150,348,4680,3278,1402,71.689209,49.463144,22.226065,0.939759,1.541932,1.203763
comsan,3,497,4.297684,Actitis hypoleucos,Common Sandpiper,1.433354,2.864329,148,349,4680,3278,1402,71.689209,49.463144,22.226065,0.94165,1.66809,1.253298
barswa,4,496,5.308204,Hirundo rustica,Barn Swallow,1.675853,3.632351,149,347,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
woosan,5,474,3.730303,Tringa glareola,Wood Sandpiper,1.026231,2.704072,142,332,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.37749
combuz1,6,473,6.874921,Buteo buteo,Common Buzzard,2.240165,4.634755,141,332,4680,3278,1402,71.689209,49.463144,22.226065,0.989429,1.042764,1.015747
hoopoe,7,419,6.955822,Upupa epops,Eurasian Hoopoe,1.781583,5.174239,126,293,4680,3278,1402,71.689209,49.463144,22.226065,1.116945,1.030636,1.072923
eubeat1,8,417,7.826683,Merops apiaster,European Bee-eater,2.178028,5.648654,125,292,4680,3278,1402,71.689209,49.463144,22.226065,1.122302,0.915959,1.013895
cohmar1,9,408,5.656965,Delichon urbicum,Common House-Martin,1.553775,4.103189,122,286,4680,3278,1402,71.689209,49.463144,22.226065,1.147059,1.267273,1.205669


# Prepare metadata & write to file

In [212]:
species_info_top10_full.columns

Index(['species_rank', 'species_total_files', 'species_total_duration_hrs',
       'scientific_name', 'common_name', 'species_test_duration_hrs',
       'species_train_duration_hrs', 'species_test_files',
       'species_train_files', 'total_files', 'train_files', 'test_files',
       'total_duration_hrs', 'train_duration_hrs', 'test_duration_hrs',
       'class_weight_file', 'class_weight_duration', 'class_weight_combined'],
      dtype='object')

In [213]:
species_info_top10_full = species_info_top10_full[[
    'scientific_name'
    , 'common_name'
    , 'species_rank'
    , 'species_total_files'
    , 'species_train_files'
    , 'species_test_files'
    , 'species_total_duration_hrs'
    , 'species_train_duration_hrs'
    , 'species_test_duration_hrs'
    , 'total_files'
    , 'train_files'
    , 'test_files'
    , 'total_duration_hrs'
    , 'train_duration_hrs'
    , 'test_duration_hrs'
    , 'class_weight_file'
    , 'class_weight_duration'
    , 'class_weight_combined'
]]

species_info_top10_full

Unnamed: 0_level_0,scientific_name,common_name,species_rank,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,class_weight_file,class_weight_duration,class_weight_combined
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
thrnig1,Luscinia luscinia,Thrush Nightingale,0,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,0.937876,0.419757,0.627439
wlwwar,Phylloscopus trochilus,Willow Warbler,1,499,350,149,9.310583,6.733451,2.577132,4680,3278,1402,71.689209,49.463144,22.226065,0.937876,0.769975,0.849789
eaywag1,Motacilla flava,Western Yellow Wagtail,2,498,348,150,4.649309,3.229026,1.420284,4680,3278,1402,71.689209,49.463144,22.226065,0.939759,1.541932,1.203763
comsan,Actitis hypoleucos,Common Sandpiper,3,497,349,148,4.297684,2.864329,1.433354,4680,3278,1402,71.689209,49.463144,22.226065,0.94165,1.66809,1.253298
barswa,Hirundo rustica,Barn Swallow,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
woosan,Tringa glareola,Wood Sandpiper,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.37749
combuz1,Buteo buteo,Common Buzzard,6,473,332,141,6.874921,4.634755,2.240165,4680,3278,1402,71.689209,49.463144,22.226065,0.989429,1.042764,1.015747
hoopoe,Upupa epops,Eurasian Hoopoe,7,419,293,126,6.955822,5.174239,1.781583,4680,3278,1402,71.689209,49.463144,22.226065,1.116945,1.030636,1.072923
eubeat1,Merops apiaster,European Bee-eater,8,417,292,125,7.826683,5.648654,2.178028,4680,3278,1402,71.689209,49.463144,22.226065,1.122302,0.915959,1.013895
cohmar1,Delichon urbicum,Common House-Martin,9,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,1.147059,1.267273,1.205669


In [216]:
species_info_top10_full.to_csv(f'{new_data_path}/species_metadata.csv')

In [217]:
species_info_top10_joinable = species_info_top10_full.drop(
    ['scientific_name', 'common_name']
    , axis=1
)
species_info_top10_joinable

Unnamed: 0_level_0,species_rank,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,class_weight_file,class_weight_duration,class_weight_combined
primary_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
thrnig1,0,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,0.937876,0.419757,0.627439
wlwwar,1,499,350,149,9.310583,6.733451,2.577132,4680,3278,1402,71.689209,49.463144,22.226065,0.937876,0.769975,0.849789
eaywag1,2,498,348,150,4.649309,3.229026,1.420284,4680,3278,1402,71.689209,49.463144,22.226065,0.939759,1.541932,1.203763
comsan,3,497,349,148,4.297684,2.864329,1.433354,4680,3278,1402,71.689209,49.463144,22.226065,0.94165,1.66809,1.253298
barswa,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
woosan,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.37749
combuz1,6,473,332,141,6.874921,4.634755,2.240165,4680,3278,1402,71.689209,49.463144,22.226065,0.989429,1.042764,1.015747
hoopoe,7,419,293,126,6.955822,5.174239,1.781583,4680,3278,1402,71.689209,49.463144,22.226065,1.116945,1.030636,1.072923
eubeat1,8,417,292,125,7.826683,5.648654,2.178028,4680,3278,1402,71.689209,49.463144,22.226065,1.122302,0.915959,1.013895
cohmar1,9,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,1.147059,1.267273,1.205669


In [218]:
meta_df_top10_full = meta_df_top10.join(species_info_top10_joinable, on='primary_label')
meta_df_top10_full

Unnamed: 0,orig_index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,duration_mins,duration_hrs,dataset,species_rank,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,class_weight_file,class_weight_duration,class_weight_combined
0,895,barswa,[],"['call', 'flight call']",46.4605,6.3914,Hirundo rustica,Barn Swallow,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/113914,barswa/XC113914.ogg,57.077596,0.951293,0.015855,test,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
1,896,barswa,[],['song'],35.0307,-120.6205,Hirundo rustica,Barn Swallow,Thomas G. Graves,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/129647,barswa/XC129647.ogg,106.728027,1.778800,0.029647,test,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
2,897,barswa,[],['song'],45.3675,-73.8566,Hirundo rustica,Barn Swallow,Patrick Turgeon,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132406,barswa/XC132406.ogg,10.788571,0.179810,0.002997,train,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
3,898,barswa,[],"['call', 'female', 'male', 'song']",56.1559,47.4939,Hirundo rustica,Barn Swallow,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133096,barswa/XC133096.ogg,11.572290,0.192872,0.003215,test,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
4,899,barswa,[],['song'],55.9937,-3.5605,Hirundo rustica,Barn Swallow,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133802,barswa/XC133802.ogg,15.490658,0.258178,0.004303,train,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,16261,woosan,[],[''],60.2183,25.1208,Tringa glareola,Wood Sandpiper,Hannu Varkki,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://xeno-canto.org/754173,woosan/XC754173.ogg,50.016009,0.833600,0.013893,test,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490
4676,16262,woosan,[],[''],64.1910,21.1130,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754702,woosan/XC754702.ogg,58.546440,0.975774,0.016263,train,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490
4677,16263,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754763,woosan/XC754763.ogg,78.560499,1.309342,0.021822,test,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490
4678,16264,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754765,woosan/XC754765.ogg,71.276780,1.187946,0.019799,train,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490


In [219]:
meta_df_top10_full['filename_npy'] = meta_df_top10_full['filename'].apply(
    lambda x: x.replace('ogg', 'npy')
)
meta_df_top10_full

Unnamed: 0,orig_index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration_secs,duration_mins,duration_hrs,dataset,species_rank,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,class_weight_file,class_weight_duration,class_weight_combined,filename_npy
0,895,barswa,[],"['call', 'flight call']",46.4605,6.3914,Hirundo rustica,Barn Swallow,Bram Piot,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/113914,barswa/XC113914.ogg,57.077596,0.951293,0.015855,test,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847,barswa/XC113914.npy
1,896,barswa,[],['song'],35.0307,-120.6205,Hirundo rustica,Barn Swallow,Thomas G. Graves,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/129647,barswa/XC129647.ogg,106.728027,1.778800,0.029647,test,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847,barswa/XC129647.npy
2,897,barswa,[],['song'],45.3675,-73.8566,Hirundo rustica,Barn Swallow,Patrick Turgeon,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132406,barswa/XC132406.ogg,10.788571,0.179810,0.002997,train,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847,barswa/XC132406.npy
3,898,barswa,[],"['call', 'female', 'male', 'song']",56.1559,47.4939,Hirundo rustica,Barn Swallow,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133096,barswa/XC133096.ogg,11.572290,0.192872,0.003215,test,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847,barswa/XC133096.npy
4,899,barswa,[],['song'],55.9937,-3.5605,Hirundo rustica,Barn Swallow,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/133802,barswa/XC133802.ogg,15.490658,0.258178,0.004303,train,4,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,0.943548,1.350536,1.128847,barswa/XC133802.npy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,16261,woosan,[],[''],60.2183,25.1208,Tringa glareola,Wood Sandpiper,Hannu Varkki,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://xeno-canto.org/754173,woosan/XC754173.ogg,50.016009,0.833600,0.013893,test,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490,woosan/XC754173.npy
4676,16262,woosan,[],[''],64.1910,21.1130,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754702,woosan/XC754702.ogg,58.546440,0.975774,0.016263,train,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490,woosan/XC754702.npy
4677,16263,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754763,woosan/XC754763.ogg,78.560499,1.309342,0.021822,test,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490,woosan/XC754763.npy
4678,16264,woosan,[],[''],58.7396,17.8657,Tringa glareola,Wood Sandpiper,Alan Dalton,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/754765,woosan/XC754765.ogg,71.276780,1.187946,0.019799,train,5,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,0.987342,1.921806,1.377490,woosan/XC754765.npy


In [220]:
for col in meta_df_top10_full.columns:
  print(col)

orig_index
primary_label
secondary_labels
type
latitude
longitude
scientific_name
common_name
author
license
rating
url
filename
duration_secs
duration_mins
duration_hrs
dataset
species_rank
species_total_files
species_train_files
species_test_files
species_total_duration_hrs
species_train_duration_hrs
species_test_duration_hrs
total_files
train_files
test_files
total_duration_hrs
train_duration_hrs
test_duration_hrs
class_weight_file
class_weight_duration
class_weight_combined
filename_npy


In [221]:
meta_df_top10_full = meta_df_top10_full[[

  'primary_label'
  , 'common_name'
  , 'scientific_name'
  , 'species_rank'
  , 'filename'
  , 'dataset'
  , 'duration_secs'
  , 'duration_mins'
  , 'duration_hrs'

  , 'species_total_files'
  , 'species_train_files'
  , 'species_test_files'
  , 'species_total_duration_hrs'
  , 'species_train_duration_hrs'
  , 'species_test_duration_hrs'

  , 'total_files'
  , 'train_files'
  , 'test_files'
  , 'total_duration_hrs'
  , 'train_duration_hrs'
  , 'test_duration_hrs'

  , 'filename_npy'
  , 'type'
  , 'secondary_labels'
  , 'rating'
  , 'latitude'
  , 'longitude'
  , 'author'
  , 'url'
  , 'license'

  , 'class_weight_file'
  , 'class_weight_duration'
  , 'class_weight_combined'
  , 'orig_index'
]]

meta_df_top10_full

Unnamed: 0,primary_label,common_name,scientific_name,species_rank,filename,dataset,duration_secs,duration_mins,duration_hrs,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,filename_npy,type,secondary_labels,rating,latitude,longitude,author,url,license,class_weight_file,class_weight_duration,class_weight_combined,orig_index
0,barswa,Barn Swallow,Hirundo rustica,4,barswa/XC113914.ogg,test,57.077596,0.951293,0.015855,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,barswa/XC113914.npy,"['call', 'flight call']",[],2.5,46.4605,6.3914,Bram Piot,https://www.xeno-canto.org/113914,Creative Commons Attribution-NonCommercial-Sha...,0.943548,1.350536,1.128847,895
1,barswa,Barn Swallow,Hirundo rustica,4,barswa/XC129647.ogg,test,106.728027,1.778800,0.029647,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,barswa/XC129647.npy,['song'],[],4.5,35.0307,-120.6205,Thomas G. Graves,https://www.xeno-canto.org/129647,Creative Commons Attribution-NonCommercial-Sha...,0.943548,1.350536,1.128847,896
2,barswa,Barn Swallow,Hirundo rustica,4,barswa/XC132406.ogg,train,10.788571,0.179810,0.002997,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,barswa/XC132406.npy,['song'],[],3.0,45.3675,-73.8566,Patrick Turgeon,https://www.xeno-canto.org/132406,Creative Commons Attribution-NonCommercial-Sha...,0.943548,1.350536,1.128847,897
3,barswa,Barn Swallow,Hirundo rustica,4,barswa/XC133096.ogg,test,11.572290,0.192872,0.003215,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,barswa/XC133096.npy,"['call', 'female', 'male', 'song']",[],3.0,56.1559,47.4939,Albert Lastukhin,https://www.xeno-canto.org/133096,Creative Commons Attribution-NonCommercial-Sha...,0.943548,1.350536,1.128847,898
4,barswa,Barn Swallow,Hirundo rustica,4,barswa/XC133802.ogg,train,15.490658,0.258178,0.004303,496,347,149,5.308204,3.632351,1.675853,4680,3278,1402,71.689209,49.463144,22.226065,barswa/XC133802.npy,['song'],[],3.0,55.9937,-3.5605,Mike Nelson,https://www.xeno-canto.org/133802,Creative Commons Attribution-NonCommercial-Sha...,0.943548,1.350536,1.128847,899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,woosan,Wood Sandpiper,Tringa glareola,5,woosan/XC754173.ogg,test,50.016009,0.833600,0.013893,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,woosan/XC754173.npy,[''],[],3.5,60.2183,25.1208,Hannu Varkki,https://xeno-canto.org/754173,Creative Commons Attribution-NonCommercial-Sha...,0.987342,1.921806,1.377490,16261
4676,woosan,Wood Sandpiper,Tringa glareola,5,woosan/XC754702.ogg,train,58.546440,0.975774,0.016263,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,woosan/XC754702.npy,[''],[],5.0,64.1910,21.1130,Alan Dalton,https://xeno-canto.org/754702,Creative Commons Attribution-NonCommercial-Sha...,0.987342,1.921806,1.377490,16262
4677,woosan,Wood Sandpiper,Tringa glareola,5,woosan/XC754763.ogg,test,78.560499,1.309342,0.021822,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,woosan/XC754763.npy,[''],[],5.0,58.7396,17.8657,Alan Dalton,https://xeno-canto.org/754763,Creative Commons Attribution-NonCommercial-Sha...,0.987342,1.921806,1.377490,16263
4678,woosan,Wood Sandpiper,Tringa glareola,5,woosan/XC754765.ogg,train,71.276780,1.187946,0.019799,474,332,142,3.730303,2.704072,1.026231,4680,3278,1402,71.689209,49.463144,22.226065,woosan/XC754765.npy,[''],[],5.0,58.7396,17.8657,Alan Dalton,https://xeno-canto.org/754765,Creative Commons Attribution-NonCommercial-Sha...,0.987342,1.921806,1.377490,16264


In [222]:
meta_df_top10_full.sort_values(
    by=['species_rank', 'filename']
    , inplace=True
)
meta_df_top10_full.reset_index(drop=True, inplace=True)
meta_df_top10_full

Unnamed: 0,primary_label,common_name,scientific_name,species_rank,filename,dataset,duration_secs,duration_mins,duration_hrs,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,filename_npy,type,secondary_labels,rating,latitude,longitude,author,url,license,class_weight_file,class_weight_duration,class_weight_combined,orig_index
0,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC110335.ogg,train,119.275102,1.987918,0.033132,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC110335.npy,['song'],[],4.5,58.5264,13.8637,Patrik Åberg,https://www.xeno-canto.org/110335,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13845
1,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC110336.ogg,train,122.044127,2.034069,0.033901,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC110336.npy,['song'],[],5.0,59.1763,15.4038,Patrik Åberg,https://www.xeno-canto.org/110336,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13846
2,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC112492.ogg,test,58.540408,0.975673,0.016261,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC112492.npy,['song'],[],3.0,59.8510,17.6230,Sander Bot,https://www.xeno-canto.org/112492,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13847
3,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC118260.ogg,train,44.382041,0.739701,0.012328,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC118260.npy,"['male', 'song']",[],5.0,61.5650,29.5650,Steve Klasan,https://www.xeno-canto.org/118260,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13848
4,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC120947.ogg,train,189.231066,3.153851,0.052564,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC120947.npy,['song'],[],4.5,52.4430,21.0940,Lars Lachmann,https://www.xeno-canto.org/120947,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749483.ogg,train,20.950249,0.349171,0.005820,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749483.npy,[''],[],5.0,46.2621,-0.2092,Jack Berteau,https://xeno-canto.org/749483,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4073
4676,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749488.ogg,test,71.915102,1.198585,0.019976,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749488.npy,[''],[],4.0,46.2621,-0.2092,Jack Berteau,https://xeno-canto.org/749488,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4074
4677,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749637.ogg,test,5.040000,0.084000,0.001400,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749637.npy,[''],[],2.5,64.5626,26.8394,Hannu Varkki,https://xeno-canto.org/749637,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4075
4678,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749638.ogg,train,6.984036,0.116401,0.001940,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749638.npy,[''],[],3.5,64.5626,26.8394,Hannu Varkki,https://xeno-canto.org/749638,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4076


In [223]:
meta_df_top10_full.to_csv(f'{new_data_path}/sample_metadata.csv', index=False)

In [224]:
meta_df_train = meta_df_top10_full[meta_df_top10_full['dataset'] == 'train']
meta_df_train

Unnamed: 0,primary_label,common_name,scientific_name,species_rank,filename,dataset,duration_secs,duration_mins,duration_hrs,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,filename_npy,type,secondary_labels,rating,latitude,longitude,author,url,license,class_weight_file,class_weight_duration,class_weight_combined,orig_index
0,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC110335.ogg,train,119.275102,1.987918,0.033132,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC110335.npy,['song'],[],4.5,58.5264,13.8637,Patrik Åberg,https://www.xeno-canto.org/110335,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13845
1,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC110336.ogg,train,122.044127,2.034069,0.033901,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC110336.npy,['song'],[],5.0,59.1763,15.4038,Patrik Åberg,https://www.xeno-canto.org/110336,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13846
3,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC118260.ogg,train,44.382041,0.739701,0.012328,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC118260.npy,"['male', 'song']",[],5.0,61.5650,29.5650,Steve Klasan,https://www.xeno-canto.org/118260,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13848
4,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC120947.ogg,train,189.231066,3.153851,0.052564,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC120947.npy,['song'],[],4.5,52.4430,21.0940,Lars Lachmann,https://www.xeno-canto.org/120947,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13849
5,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC125024.ogg,train,45.312018,0.755200,0.012587,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC125024.npy,"['call', 'song']",[],2.5,54.5770,11.9226,Louis A. Hansen,https://www.xeno-canto.org/125024,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4672,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC748724.ogg,train,54.909025,0.915150,0.015253,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC748724.npy,[''],[],1.0,48.2741,10.8188,johannes buhl,https://xeno-canto.org/748724,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4069
4674,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC748726.ogg,train,6.974694,0.116245,0.001937,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC748726.npy,[''],[],1.0,48.2741,10.8188,johannes buhl,https://xeno-canto.org/748726,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4071
4675,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749483.ogg,train,20.950249,0.349171,0.005820,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749483.npy,[''],[],5.0,46.2621,-0.2092,Jack Berteau,https://xeno-canto.org/749483,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4073
4678,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749638.ogg,train,6.984036,0.116401,0.001940,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749638.npy,[''],[],3.5,64.5626,26.8394,Hannu Varkki,https://xeno-canto.org/749638,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4076


In [225]:
meta_df_test = meta_df_top10_full[meta_df_top10_full['dataset'] == 'test']
meta_df_test

Unnamed: 0,primary_label,common_name,scientific_name,species_rank,filename,dataset,duration_secs,duration_mins,duration_hrs,species_total_files,species_train_files,species_test_files,species_total_duration_hrs,species_train_duration_hrs,species_test_duration_hrs,total_files,train_files,test_files,total_duration_hrs,train_duration_hrs,test_duration_hrs,filename_npy,type,secondary_labels,rating,latitude,longitude,author,url,license,class_weight_file,class_weight_duration,class_weight_combined,orig_index
2,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC112492.ogg,test,58.540408,0.975673,0.016261,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC112492.npy,['song'],[],3.0,59.8510,17.6230,Sander Bot,https://www.xeno-canto.org/112492,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13847
11,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC132295.ogg,test,73.769796,1.229497,0.020492,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC132295.npy,"['male', 'song']",[],3.0,56.0889,47.2543,Albert Lastukhin,https://www.xeno-canto.org/132295,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13856
14,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC133558.ogg,test,78.759229,1.312654,0.021878,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC133558.npy,"['male', 'song']",[],4.5,61.5650,29.5650,Stuart Fisher,https://www.xeno-canto.org/133558,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13859
15,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC134265.ogg,test,394.187755,6.569796,0.109497,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC134265.npy,['song'],[],5.0,51.3506,23.0467,Jarek Matusiak,https://www.xeno-canto.org/134265,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13860
21,thrnig1,Thrush Nightingale,Luscinia luscinia,0,thrnig1/XC135792.ogg,test,42.344535,0.705742,0.011762,499,349,150,17.078735,10.739077,6.339658,4680,3278,1402,71.689209,49.463144,22.226065,thrnig1/XC135792.npy,"['male', 'song']",[],3.0,56.7542,46.8450,Albert Lastukhin,https://www.xeno-canto.org/135792,Creative Commons Attribution-NonCommercial-Sha...,0.937876,0.419757,0.627439,13866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4668,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC748720.ogg,test,224.261043,3.737684,0.062295,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC748720.npy,[''],[],2.0,48.2741,10.8188,johannes buhl,https://xeno-canto.org/748720,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4065
4669,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC748721.ogg,test,15.360000,0.256000,0.004267,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC748721.npy,[''],[],1.0,48.2741,10.8188,johannes buhl,https://xeno-canto.org/748721,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4066
4673,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC748725.ogg,test,13.975510,0.232925,0.003882,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC748725.npy,[''],[],1.0,48.2741,10.8188,johannes buhl,https://xeno-canto.org/748725,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4070
4676,cohmar1,Common House-Martin,Delichon urbicum,9,cohmar1/XC749488.ogg,test,71.915102,1.198585,0.019976,408,286,122,5.656965,4.103189,1.553775,4680,3278,1402,71.689209,49.463144,22.226065,cohmar1/XC749488.npy,[''],[],4.0,46.2621,-0.2092,Jack Berteau,https://xeno-canto.org/749488,Creative Commons Attribution-NonCommercial-Sha...,1.147059,1.267273,1.205669,4074


In [227]:
meta_df_train.to_csv(f'{new_data_path}/train_metadata.csv', index=False)
meta_df_test.to_csv(f'{new_data_path}/test_metadata.csv', index=False)

# Create & populate train & test directories

In [None]:
train_dir = f'{new_data_path}/train'
test_dir = f'{new_data_path}/test'

In [None]:
def reset_train_test_directories(train_dir, test_dir, meta_df):

  try:
    shutil.rmtree(train_dir)
  except:
    pass

  try:
    shutil.rmtree(test_dir)
  except:
    pass

  primary_labels = set(meta_df['primary_label'].values)

  for dir1 in [train_dir, test_dir]:
    os.mkdir(dir1)
    os.mkdir(f'{dir1}/audio_files')
    for dir2 in primary_labels:
      dir = f'{dir1}/audio_files/{dir2}'
      os.mkdir(dir)


def populate_train_test_directories(
    train_dir
    , test_dir
    , orig_trainaudio_path
    , meta_df
    , copy_progress_iters=100):

  n_files = len(meta_df)
  print(f'Initiating copy of {n_files:,} files...')
  failed_copies = 0

  for i in range(n_files):
    record = meta_df.loc[i]
    filename_orig = record['filename']
    filepath_orig = f'{orig_trainaudio_path}/{filename_orig}'
    dataset = record['dataset']
    if dataset == 'train':
      filepath_new = f'{train_dir}/audio_files/{filename_orig}'
    elif dataset == 'test':
      filepath_new = f'{test_dir}/audio_files/{filename_orig}'
    else:
      filepath_new = None
      failed_copies += 1
      print(f'{filepath_orig} has invalid dataset type \'{dataset}\'')

    if i % copy_progress_iters == 0:
      print(f'Copying {filename_orig} - {i:,} of {n_files:,}')

    try:
      shutil.copyfile(filepath_orig, filepath_new)
    except:
      failed_copies += 1
      print(f'Failed to copy {filename_orig} to {filepath_new}')

  print(f'Successfully copied {(n_files - failed_copies):,} of {n_files:,}.')


In [None]:
# DELETES **EVERYTHING** IN TRAIN AND TEST DIRECTORIES
# ONLY RUN IF YOU ARE 100% CONFIDENT IT CAN BE RESET
# Takes >1 hr to complete

# reset_train_test_directories(train_dir, test_dir, meta_df_top20_datasets)
# populate_train_test_directories(train_dir, test_dir, orig_trainaudio_path, meta_df_top20_datasets)