# Google Drive Interface Setup

In [1]:
from google.colab import drive, auth
import sys

In [2]:
# Attach the user's Google Drive to the Colab filesystem under /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Pull down github repo, store in colab runtime memory
!git clone https://github.com/andrew-loeber/proj_utils_207.git

fatal: destination path 'proj_utils_207' already exists and is not an empty directory.


In [4]:
# Tell Python to also look in this repo when running an import.
# Guarded so that re-running the cell does not stack duplicate entries on
# sys.path; position 1 keeps the repo right after the first search entry.
PROJ_UTILS_PATH = "/content/proj_utils_207"
if PROJ_UTILS_PATH not in sys.path:
    sys.path.insert(1, PROJ_UTILS_PATH)

In [5]:
# Authenticate google account and give back the session access token
auth.authenticate_user()
gcloud_token = !gcloud auth print-access-token
gcloud_token

['<REDACTED: access token removed from saved output>']

In [6]:
from gdriveinterface import GDriveInterface
import proj_ref

In [7]:
# Build the Drive interface from the captured gcloud token and keep a handle on
# the shared project folder path for later cells. The final expression displays
# the interface's attributes: email address, account name, and the shared
# folder path on this Google Drive instance.
gdi = GDriveInterface(gcloud_token)
shared_folder_path = gdi.shared_folder_path
vars(gdi)

{'email': 'aloeber@berkeley.edu',
 'account': 'aloeber',
 'shared_folder_path': '/content/drive/MyDrive/207-Project'}

In [8]:
# Show name keys and paths for tracked files.
# Each key maps to a path relative to the shared project folder; a key can be
# resolved to a full path with gdi.get_file_path(<key>).
proj_ref.files

{'taxonomy': 'BirdCLEF/eBird_Taxonomy_v2021.csv',
 'orig_metadata': 'BirdCLEF/train_metadata_with_duration.csv',
 'sample_metadata': 'data/sample_metadata.csv',
 'species_metadata': 'data/species_metadata.csv',
 'train_metadata': 'data/train_metadata.csv',
 'test_metadata': 'data/test_metadata.csv'}

In [9]:
# Show name keys and paths for tracked directories.
# Each key maps to a dict whose 'path' entry is relative to the shared project
# folder; a key can be resolved to a full path with gdi.get_dir_path(<key>).
proj_ref.dirs

{'orig_audio': {'path': 'BirdCLEF/train_audio'},
 'train_audio': {'path': 'data/train/audio_files'},
 'test_audio': {'path': 'data/test/audio_files'},
 'train_npy_full': {'path': 'data/train/librosa_loaded'},
 'test_npy_full': {'path': 'data/test/librosa_loaded'},
 'train_npy_loud5s': {'path': 'data/train/librosa_loaded_loudest_5sec'},
 'test_npy_loud5s': {'path': 'data/test/librosa_loaded_loudest_5sec'}}

### **Usage examples**

##### Query specific file based on named key
```
command: gdi.get_file_path('taxonomy')
returns: '/content/drive/MyDrive/207-Project/BirdCLEF/eBird_Taxonomy_v2021.csv'
```

##### Query specific directory based on named key
```
command: gdi.get_dir_path('orig_audio')
returns: '/content/drive/MyDrive/207-Project/BirdCLEF/train_audio'
```

##### Query specific sample file based on a directory's named key and the file's path relative to that directory
```
command: gdi.get_sample_path('train_audio', 'barswa/XC132406.ogg')
returns: '/content/drive/MyDrive/207-Project/data/train/audio_files/barswa/XC132406.ogg'
```

##### Provide full path given a custom partial path
```
command: gdi.join_to_shared('images/sandpiper.jpeg')
returns: '/content/drive/MyDrive/207-Project/images/sandpiper.jpeg'
```


# Your work below

In [11]:
import pandas as pd
import numpy as np
import librosa
import time

In [28]:
# Resolve the JSON export's location inside the shared project folder, read it
# back into a DataFrame, and preview the first five rows.
json_path = gdi.join_to_shared(
    'data/train/Json_loudest_5sec_audio_appended_all_birds.json'
)
df_reload = pd.read_json(path_or_buf=json_path)
df_reload.head(n=5)

Unnamed: 0,audio_data,labels
0,"[[-798.6386108398, -492.040222168, -338.189941...",XC132406
1,"[[-297.9124450684, -256.0943908691, -250.26420...",XC133802
2,"[[-252.2703094482, -244.6105651855, -279.55819...",XC134349
3,"[[-314.9478149414, -279.9412536621, -278.54443...",XC135474
4,"[[-502.5234069824, -375.5750427246, -333.17370...",XC139171


In [16]:
# Pull the first row's `audio_data` entry and materialise it as a float32
# array (presumably MFCC features, going by the variable name — confirm
# against the code that produced the JSON file).
mfcc_sample = np.array(df_reload['audio_data'].iloc[0], dtype=np.float32)
mfcc_sample

array([[-7.98638611e+02, -4.92040222e+02, -3.38189941e+02, ...,
        -2.68975586e+02, -2.58449066e+02, -2.48451630e+02],
       [ 0.00000000e+00,  2.64158173e+01,  2.78944511e+01, ...,
         1.46622486e+01,  1.18767738e+01,  3.57308722e+00],
       [ 0.00000000e+00, -2.96921349e+01, -2.40511513e+01, ...,
        -1.40458126e+01, -1.82061920e+01, -2.79340878e+01],
       ...,
       [ 0.00000000e+00,  9.72716331e+00,  2.24974394e+00, ...,
         1.14183024e-01, -8.30849409e-01,  1.66273117e-03],
       [ 0.00000000e+00, -4.49927425e+00, -6.46138716e+00, ...,
        -1.52785063e+00, -3.50012255e+00,  1.47831440e+00],
       [ 0.00000000e+00,  1.19867764e+01,  7.32835293e+00, ...,
         1.50572252e+00,  5.64457464e+00,  1.09916935e+01]], dtype=float32)

In [17]:
# Dimensions of the sample array extracted above.
np.shape(mfcc_sample)

(20, 313)

In [11]:
# Resolve the full Drive path of one training recording: the first argument is
# the tracked-directory key, the second is the file's path inside it.
sample_ogg = gdi.get_sample_path(
    'train_audio',
    'barswa/XC361235.ogg',
)
sample_ogg

'/content/drive/MyDrive/207-Project/data/train/audio_files/barswa/XC361235.ogg'

In [18]:
def time_librosa_load(path, n_iters=50, **load_kwargs):
    """Measure the mean wall-clock time of `librosa.load` on one file.

    Only the `librosa.load` call itself is timed, using `time.perf_counter`.

    Args:
        path: Path of the audio file to load.
        n_iters: Number of timed repetitions; must be at least 1.
        **load_kwargs: Forwarded unchanged to `librosa.load`
            (e.g. `sr`, `mono`, `duration`).

    Returns:
        A `(mean_seconds, loaded)` tuple, where `mean_seconds` is the total
        timed duration divided by `n_iters` and `loaded` is whatever the last
        `librosa.load` call returned.

    Raises:
        ValueError: If `n_iters` is less than 1.
    """
    if n_iters < 1:
        raise ValueError('n_iters must be at least 1')

    total_time = 0
    loaded = None
    for _ in range(n_iters):
        start = time.perf_counter()
        loaded = librosa.load(path, **load_kwargs)
        total_time += time.perf_counter() - start
    return total_time / n_iters, loaded


n_iters = 50

# The path lookup does not change between iterations, so resolve it once.
sample_ogg = gdi.get_sample_path('train_audio', 'barswa/XC379571.ogg')

# (printed label, keyword arguments passed to librosa.load)
# NOTE(review): the configurations differ in more than resampling — the first
# also truncates to 5 seconds and the second also sets mono=False — and the
# first configuration is the one that runs before any other read of the file
# in this cell. Keep that in mind when comparing the printed numbers.
load_configs = [
    ('32 KHz resample, 5 sec clip', {'sr': 32000, 'duration': 5.0}),
    ('No resample', {'sr': None, 'mono': False}),
    ('32 KHz resample', {'sr': 32000}),
]

for label, load_kwargs in load_configs:
    # sample_ogg_loaded ends up holding the result of the final configuration,
    # as it did when the three loops were written out by hand.
    mean_sec, sample_ogg_loaded = time_librosa_load(
        sample_ogg, n_iters, **load_kwargs
    )
    print(f'{label}: {mean_sec} sec')

32 KHz resample, 5 sec clip: 0.026331552560004638 sec
No resample: 0.3584645011998509 sec
32 KHz resample: 0.4254391048198886 sec
