# Google Drive Interface Setup

In [2]:
from google.colab import drive, auth
import sys

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Pull down github repo, store in colab runtime memory
!git clone https://github.com/andrew-loeber/proj_utils_207.git

Cloning into 'proj_utils_207'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 31 (delta 9), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (31/31), 7.98 KiB | 743.00 KiB/s, done.


In [5]:
# Tell Python to also look in this repo when running an import. Guarded so that
# re-running the cell does not stack duplicate entries onto sys.path.
if "/content/proj_utils_207" not in sys.path:
  sys.path.insert(1, "/content/proj_utils_207")

In [6]:
# Authenticate google account and give back the session access token
auth.authenticate_user()
gcloud_token = !gcloud auth print-access-token
gcloud_token

['<REDACTED: access token removed from saved notebook output>']

In [7]:
from gdriveinterface import GDriveInterface
import proj_ref

In [8]:
# Build the Drive interface from the session token, then display its instance
# attributes: email address, username, and the path of the shared project
# folder on your Google Drive instance.
gdi = GDriveInterface(gcloud_token)
vars(gdi)

{'email': 'aloeber@berkeley.edu',
 'account': 'aloeber',
 'shared_folder_path': '/content/drive/MyDrive/207-Project'}

In [9]:
# Show name keys and paths for tracked files (a key such as 'taxonomy' is what
# gets passed to gdi.get_file_path)
proj_ref.files

{'taxonomy': 'BirdCLEF/eBird_Taxonomy_v2021.csv',
 'orig_metadata': 'BirdCLEF/train_metadata.csv',
 'sample_metadata': 'data/sample_metadata.csv',
 'species_metadata': 'data/species_metadata.csv',
 'train_metadata': 'data/train_metadata.csv',
 'test_metadata': 'data/test_metadata.csv'}

In [10]:
# Show name keys and paths for tracked directories (a key such as 'orig_audio'
# is what gets passed to gdi.get_dir_path / gdi.get_sample_path)
proj_ref.dirs

{'orig_audio': {'path': 'BirdCLEF/train_audio'},
 'train_audio': {'path': 'data/train/audio_files'},
 'test_audio': {'path': 'data/test/audio_files'},
 'train_npy_full': {'path': 'data/train/librosa_loaded'},
 'test_npy_full': {'path': 'data/test/librosa_loaded'},
 'train_npy_loud5s': {'path': 'data/train/librosa_loaded_loudest_5sec'},
 'test_npy_loud5s': {'path': 'data/test/librosa_loaded_loudest_5sec'}}

### **Usage examples**

##### Query specific file based on named key
```
command: gdi.get_file_path('taxonomy')
returns: '/content/drive/MyDrive/207-Project/BirdCLEF/eBird_Taxonomy_v2021.csv'
```

##### Query specific directory based on named key
```
command: gdi.get_dir_path('orig_audio')
returns: '/content/drive/MyDrive/207-Project/BirdCLEF/train_audio'
```

##### Query specific sample record based on a named directory key and the sample's relative file path
```
command: gdi.get_sample_path('train_audio', 'barswa/XC132406.ogg')
returns: '/content/drive/MyDrive/207-Project/data/train/audio_files/barswa/XC132406.ogg'
```

##### Provide full path given a custom partial path
```
command: gdi.join_to_shared('images/sandpiper.jpeg')
returns: '/content/drive/MyDrive/207-Project/images/sandpiper.jpeg'
```


# Read in metadata files

In [11]:
import pandas as pd
import numpy as np

In [15]:
# Resolve the tracked 'train_metadata' file to its full path, load it as a
# DataFrame, and preview the first rows.
train_metadata_csv = gdi.get_file_path('train_metadata')
train_metadata = pd.read_csv(train_metadata_csv)
train_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,librosa_loaded
0,thrnig1,[],['song'],58.5264,13.8637,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/110335,thrnig1/XC110335.ogg,119.275102,500,61606.560907,17.112934,0,train,thrnig1/XC110335.npy
1,thrnig1,[],['song'],59.1763,15.4038,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/110336,thrnig1/XC110336.ogg,122.044127,500,61606.560907,17.112934,0,train,thrnig1/XC110336.npy
2,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Steve Klasan,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/118260,thrnig1/XC118260.ogg,44.382041,500,61606.560907,17.112934,0,train,thrnig1/XC118260.npy
3,thrnig1,[],['song'],52.443,21.094,Luscinia luscinia,Thrush Nightingale,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/120947,thrnig1/XC120947.ogg,189.231066,500,61606.560907,17.112934,0,train,thrnig1/XC120947.npy
4,thrnig1,[],"['call', 'song']",54.577,11.9226,Luscinia luscinia,Thrush Nightingale,Louis A. Hansen,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/125024,thrnig1/XC125024.ogg,45.312018,500,61606.560907,17.112934,0,train,thrnig1/XC125024.npy


In [16]:
# Resolve the tracked 'test_metadata' file to its full path, load it as a
# DataFrame, and preview the first rows.
test_metadata_csv = gdi.get_file_path('test_metadata')
test_metadata = pd.read_csv(test_metadata_csv)
test_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,librosa_loaded
0,thrnig1,[],['song'],59.851,17.623,Luscinia luscinia,Thrush Nightingale,Sander Bot,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/112492,thrnig1/XC112492.ogg,58.540408,500,61606.560907,17.112934,0,test,thrnig1/XC112492.npy
1,thrnig1,[],"['male', 'song']",56.0889,47.2543,Luscinia luscinia,Thrush Nightingale,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/132295,thrnig1/XC132295.ogg,73.769796,500,61606.560907,17.112934,0,test,thrnig1/XC132295.npy
2,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Stuart Fisher,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/133558,thrnig1/XC133558.ogg,78.759229,500,61606.560907,17.112934,0,test,thrnig1/XC133558.npy
3,thrnig1,[],['song'],51.3506,23.0467,Luscinia luscinia,Thrush Nightingale,Jarek Matusiak,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/134265,thrnig1/XC134265.ogg,394.187755,500,61606.560907,17.112934,0,test,thrnig1/XC134265.npy
4,thrnig1,[],"['male', 'song']",56.7542,46.845,Luscinia luscinia,Thrush Nightingale,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/135792,thrnig1/XC135792.ogg,42.344535,500,61606.560907,17.112934,0,test,thrnig1/XC135792.npy


In [19]:
# Load the sample metadata and discard its 'Unnamed: 0' column in one chained
# expression (no in-place mutation), then preview the first rows.
sample_metadata = (
    pd.read_csv(gdi.get_file_path('sample_metadata'))
    .drop(columns='Unnamed: 0')
)
sample_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset
0,thrnig1,[],['song'],58.5264,13.8637,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/110335,thrnig1/XC110335.ogg,119.275102,500,61606.560907,17.112934,0,train
1,thrnig1,[],['song'],59.1763,15.4038,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/110336,thrnig1/XC110336.ogg,122.044127,500,61606.560907,17.112934,0,train
2,thrnig1,[],['song'],59.851,17.623,Luscinia luscinia,Thrush Nightingale,Sander Bot,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/112492,thrnig1/XC112492.ogg,58.540408,500,61606.560907,17.112934,0,test
3,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Steve Klasan,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/118260,thrnig1/XC118260.ogg,44.382041,500,61606.560907,17.112934,0,train
4,thrnig1,[],['song'],52.443,21.094,Luscinia luscinia,Thrush Nightingale,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/120947,thrnig1/XC120947.ogg,189.231066,500,61606.560907,17.112934,0,train


# Find duplicates in sample_metadata file

In [20]:
# Collect the 'filename' value of every train_metadata row as a plain list and
# peek at the first five.
train_samples = train_metadata['filename'].tolist()
train_samples[:5]

['thrnig1/XC110335.ogg',
 'thrnig1/XC110336.ogg',
 'thrnig1/XC118260.ogg',
 'thrnig1/XC120947.ogg',
 'thrnig1/XC125024.ogg']

In [21]:
# Total vs. distinct train filenames; equal numbers mean no filename repeats.
n_train = len(train_samples)
n_train_unique = len(set(train_samples))
print(n_train, n_train_unique)

3278 3278


In [22]:
# Collect the 'filename' value of every test_metadata row as a plain list and
# peek at the first five.
test_samples = test_metadata['filename'].tolist()
test_samples[:5]

['thrnig1/XC112492.ogg',
 'thrnig1/XC132295.ogg',
 'thrnig1/XC133558.ogg',
 'thrnig1/XC134265.ogg',
 'thrnig1/XC135792.ogg']

In [23]:
# Total vs. distinct test filenames; equal numbers mean no filename repeats.
n_test = len(test_samples)
n_test_unique = len(set(test_samples))
print(n_test, n_test_unique)

1402 1402


In [24]:
# Pool the train and test filenames into one list, then compare the total
# count with the distinct count to see whether any filename repeats in the
# pooled list.
train_and_test_samples = [*train_samples, *test_samples]
n_pooled = len(train_and_test_samples)
n_pooled_unique = len(set(train_and_test_samples))
print(n_pooled, n_pooled_unique)

4680 4680


In [25]:
# Filenames listed in sample_metadata: total count vs. distinct count.
sample_md_samples = sample_metadata['filename'].tolist()
n_sample_md = len(sample_md_samples)
n_sample_md_unique = len(set(sample_md_samples))
print(n_sample_md, n_sample_md_unique)

6937 6937


In [26]:
# Derive a separate frame from sample_metadata carrying an extra 'keep' column,
# initialised to empty strings, and preview the first rows. `assign` returns a
# new frame, so sample_metadata itself is left untouched.
sample_metadata_new = sample_metadata.assign(keep='')
sample_metadata_new.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,keep
0,thrnig1,[],['song'],58.5264,13.8637,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/110335,thrnig1/XC110335.ogg,119.275102,500,61606.560907,17.112934,0,train,
1,thrnig1,[],['song'],59.1763,15.4038,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/110336,thrnig1/XC110336.ogg,122.044127,500,61606.560907,17.112934,0,train,
2,thrnig1,[],['song'],59.851,17.623,Luscinia luscinia,Thrush Nightingale,Sander Bot,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/112492,thrnig1/XC112492.ogg,58.540408,500,61606.560907,17.112934,0,test,
3,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Steve Klasan,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/118260,thrnig1/XC118260.ogg,44.382041,500,61606.560907,17.112934,0,train,
4,thrnig1,[],['song'],52.443,21.094,Luscinia luscinia,Thrush Nightingale,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/120947,thrnig1/XC120947.ogg,189.231066,500,61606.560907,17.112934,0,train,


In [27]:
# Flag each row by whether its filename appears in the pooled train + test
# filename list. `isin` performs the membership test for the whole column in
# one vectorized call, replacing a Python loop that rescanned the list once
# per row and addressed the same row positionally (`iloc[i]`) and by label
# (`.at[i]`), which only agree on a default RangeIndex. The resulting column
# is a true boolean column rather than an object column seeded with ''.
sample_metadata_new['keep'] = sample_metadata_new['filename'].isin(
    train_and_test_samples
)

sample_metadata_new.head(10)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,keep
0,thrnig1,[],['song'],58.5264,13.8637,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/110335,thrnig1/XC110335.ogg,119.275102,500,61606.560907,17.112934,0,train,True
1,thrnig1,[],['song'],59.1763,15.4038,Luscinia luscinia,Thrush Nightingale,Patrik Åberg,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/110336,thrnig1/XC110336.ogg,122.044127,500,61606.560907,17.112934,0,train,True
2,thrnig1,[],['song'],59.851,17.623,Luscinia luscinia,Thrush Nightingale,Sander Bot,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/112492,thrnig1/XC112492.ogg,58.540408,500,61606.560907,17.112934,0,test,True
3,thrnig1,[],"['male', 'song']",61.565,29.565,Luscinia luscinia,Thrush Nightingale,Steve Klasan,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/118260,thrnig1/XC118260.ogg,44.382041,500,61606.560907,17.112934,0,train,True
4,thrnig1,[],['song'],52.443,21.094,Luscinia luscinia,Thrush Nightingale,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/120947,thrnig1/XC120947.ogg,189.231066,500,61606.560907,17.112934,0,train,True
5,thrnig1,[],"['call', 'song']",54.577,11.9226,Luscinia luscinia,Thrush Nightingale,Louis A. Hansen,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/125024,thrnig1/XC125024.ogg,45.312018,500,61606.560907,17.112934,0,train,True
6,thrnig1,[],['song'],52.7167,23.8334,Luscinia luscinia,Thrush Nightingale,Mathias Ritschard,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/129509,thrnig1/XC129509.ogg,108.288027,500,61606.560907,17.112934,0,train,True
7,thrnig1,[],"['male', 'song']",52.4606,20.7392,Luscinia luscinia,Thrush Nightingale,Jarek Matusiak,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/131153,thrnig1/XC131153.ogg,106.161633,500,61606.560907,17.112934,0,train,True
8,thrnig1,[],"['male', 'song']",56.1379,47.349,Luscinia luscinia,Thrush Nightingale,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/132228,thrnig1/XC132228.ogg,86.491474,500,61606.560907,17.112934,0,train,True
9,thrnig1,[],"['male', 'song']",56.1396,47.3507,Luscinia luscinia,Thrush Nightingale,Albert Lastukhin,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/132244,thrnig1/XC132244.ogg,164.336327,500,61606.560907,17.112934,0,train,True


In [28]:
# Count how many rows were flagged keep=True (filename found in the train/test
# lists) versus keep=False
sample_metadata_new['keep'].value_counts()

True     4680
False    2257
Name: keep, dtype: int64

In [31]:
# Select the rows whose species_rank is 9 or lower and whose filename was not
# found in the train/test lists (keep is False).
rank_at_most_9 = sample_metadata_new['species_rank'].le(9)
not_kept = sample_metadata_new['keep'].eq(False)
dup_rows = sample_metadata_new.loc[rank_at_most_9 & not_kept]
dup_rows

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,duration,total_files,total_duration_secs,total_duration_hrs,species_rank,dataset,keep
485,thrnig1,[],[''],53.0902,14.3647,Luscinia luscinia,Thrush Nightingale,Sven Kransel,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/728016,thrnig1/XC728016.ogg,123.115011,500,61606.560907,17.112934,0,train,False
809,barswa,[],"['call', 'song', 'various calls']",58.7542,23.8439,Hirundo rustica,Barn Swallow,Stanislas Wroza,Creative Commons Attribution-NonCommercial-Sha...,0.0,https://www.xeno-canto.org/575749,barswa/XC575749.ogg,36.12,500,19230.524263,5.341812,1,train,False
895,barswa,[],"['adult', 'song']",52.8858,23.8293,Hirundo rustica,Barn Swallow,Ireneusz Oleksik,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/664977,barswa/XC664977.ogg,64.704036,500,19230.524263,5.341812,1,test,False
923,barswa,[],['call'],51.3672,5.8406,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,3.0,http://xeno-canto.org/671721,barswa/XC671721.ogg,10.057007,500,19230.524263,5.341812,1,train,False
995,barswa,[],[''],56.6346,9.7837,Hirundo rustica,Barn Swallow,Ad Hilders,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/747232,barswa/XC747232.ogg,10.109025,500,19230.524263,5.341812,1,train,False
1310,eaywag1,[],['flight call'],44.1062,1.8827,Motacilla flava,Western Yellow Wagtail,Cedric Mroczko,Creative Commons Attribution-NonCommercial-Sha...,0.5,https://www.xeno-canto.org/645113,eaywag1/XC645113.ogg,6.072018,500,16751.867029,4.653296,2,train,False
1345,eaywag1,[],['flight call'],53.5337,-1.7832,Motacilla flava,Western Yellow Wagtail,David Pennington,Creative Commons Attribution-NonCommercial-Sha...,3.0,http://xeno-canto.org/669616,eaywag1/XC669616.ogg,8.280816,500,16751.867029,4.653296,2,train,False
1759,comsan,[],['nocturnal flight call'],51.4309,-2.8518,Actitis hypoleucos,Common Sandpiper,Paul Williams,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/586330,comsan/XC586330.ogg,7.026032,500,15526.777506,4.312994,3,test,False
1760,comsan,[],['nocturnal flight call'],51.4309,-2.8518,Actitis hypoleucos,Common Sandpiper,Paul Williams,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/586332,comsan/XC586332.ogg,7.026032,500,15526.777506,4.312994,3,test,False
1794,comsan,[],['call'],-13.101,31.7949,Actitis hypoleucos,Common Sandpiper,Meena Haribal,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/613128,comsan/XC613128.ogg,41.064535,500,15526.777506,4.312994,3,train,False


In [34]:
# Filenames of the selected rows as a plain Python list
dup_rows['filename'].tolist()

['thrnig1/XC728016.ogg',
 'barswa/XC575749.ogg',
 'barswa/XC664977.ogg',
 'barswa/XC671721.ogg',
 'barswa/XC747232.ogg',
 'eaywag1/XC645113.ogg',
 'eaywag1/XC669616.ogg',
 'comsan/XC586330.ogg',
 'comsan/XC586332.ogg',
 'comsan/XC613128.ogg',
 'combuz1/XC144258.ogg',
 'combuz1/XC647787.ogg',
 'woosan/XC647845.ogg',
 'eubeat1/XC392184.ogg',
 'eubeat1/XC392191.ogg',
 'eubeat1/XC392193.ogg',
 'hoopoe/XC417570.ogg',
 'cohmar1/XC558316.ogg',
 'cohmar1/XC748727.ogg']