In [148]:
import os
import boto3
from botocore.handlers import disable_signing

In [149]:
# Show the installed boto3 version.
boto3.__version__

'1.15.18'

In [150]:
# Build an S3 resource and register botocore's `disable_signing` handler on
# the S3 signer-selection event, so requests are sent unsigned.
# NOTE(review): presumably the bucket is publicly readable and no AWS
# credentials are required -- confirm.
s3 = boto3.resource('s3')
s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
# Handle to the bucket the recordings are downloaded from later on.
bucket = s3.Bucket('congo8khz-pnnn')

In [197]:
import pandas as pd
import numpy as np

import os
from tqdm import tqdm

from scipy.io import wavfile
import math

In [57]:
# Load the tab-separated annotation table for the training recordings.
training_set = pd.read_csv('./TrainingSet_rumble_info.txt', sep = '\t')

In [59]:
def get_parent(col: str) -> str:
    """Return the part of ``col`` that comes before its first underscore.

    If ``col`` contains no underscore the whole string is returned unchanged.
    """
    prefix, _separator, _remainder = col.partition('_')
    return prefix

In [64]:
# Derive the parent folder of each row from the prefix of its file name.
training_set['Parent'] = training_set['filename'].map(get_parent)

In [69]:
# Tag every row with the split it belongs to (the scalar is broadcast to all rows).
training_set['Folder'] = 'Training'

In [67]:
# Count the values present in 'marginals'; value_counts() skips NaN by
# default, so unlabelled rows are not included in the tally.
training_set['marginals'].value_counts()

marginal    945
Name: marginals, dtype: int64

In [211]:
# Display the training table, including the derived 'Parent' and 'Folder' columns.
training_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.9,48865.100,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.0640,,nn01a,Training
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
...,...,...,...,...,...,...,...,...
3175,3176,44.1,86263.733,nn10b_20180728_000000.wav,2.2700,,nn10b,Training
3176,3177,33.6,86264.870,nn10b_20180728_000000.wav,6.4400,,nn10b,Training
3177,3178,44.1,86269.957,nn10b_20180728_000000.wav,6.3800,,nn10b,Training
3178,3179,29.0,86300.628,nn10b_20180728_000000.wav,3.4700,,nn10b,Training


In [92]:
# Load the tab-separated annotation table for the test recordings.
test_set = pd.read_csv('./GeneralTest_rumble_info.txt', sep = '\t')

In [93]:
# Derive the parent folder of each row from the prefix of its file name.
test_set['Parent'] = test_set['filename'].map(get_parent)

In [94]:
# Tag every row with the split it belongs to (the scalar is broadcast to all rows).
test_set['Folder'] = 'Testing'

In [95]:
# List the distinct values of 'marginals' in the test table.
# NOTE(review): the output below contains both 'marginal' and the misspelling
# 'maginal'; normalise them before relying on this column.
test_set['marginals'].unique()

array([nan, 'maginal', 'DUMMY_NoEles', 'marginal'], dtype=object)

In [96]:
# Remove the 'DUMMY_NoEles' placeholder rows. A boolean filter that rebinds
# the name replaces the in-place drop, so the frame object created by
# read_csv is not mutated behind the reader's back. Rows whose 'marginals'
# is NaN compare as "not equal" and are therefore kept, and the original
# index labels are preserved, exactly as with the drop.
test_set = test_set[test_set['marginals'] != 'DUMMY_NoEles'].copy()

In [97]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index;
# concat keeps the union of the columns (its default outer join).
join_set = pd.concat((training_set, test_set), ignore_index=True)

In [98]:
# Display the combined training + testing annotation table.
join_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.700,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.900,48865.100,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.700,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.100,48876.017,nn01a_20180126_000000.wav,4.0640,,nn01a,Training
4,5,57.100,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
...,...,...,...,...,...,...,...,...
3932,754,39.400,3530.350,nn10b_20180907_000000.wav,4.3590,maginal,nn10b,Testing
3933,755,46.600,3533.675,nn10b_20180907_000000.wav,2.1390,maginal,nn10b,Testing
3934,756,49.300,3547.271,nn10b_20180907_000000.wav,3.5100,maginal,nn10b,Testing
3935,757,43.624,52488.090,nn10b_20180907_000000.wav,4.4920,,nn10b,Testing


In [199]:
import shutil

In [200]:
# Rebuild the output tree from scratch on every run: an existing ./Data entry
# (and everything inside it) is deleted first, then the four leaf folders are
# created. The folder list is written once instead of being duplicated in
# both branches of the if/else.
if 'Data' in os.listdir():
    shutil.rmtree('./Data')
for split_name in ('Training', 'Testing'):
    for class_name in ('Rumbles', 'Noise'):
        os.makedirs('./Data/{}/{}'.format(split_name, class_name))

In [115]:
# Distinct parent folders across both splits; unique() keeps them in order of
# first appearance in join_set rather than sorting them.
parent_folders = join_set['Parent'].unique()

In [201]:
# Cut every annotated rumble out of its source recording and write it as its
# own clip. One counter per split keeps the output names contiguous
# (rumble_1.wav, rumble_2.wav, ...) inside each folder.
train_rumble_counter = 1
test_rumble_counter = 1
for folder in tqdm(parent_folders):
    # All annotation rows that belong to this parent folder.
    parent_df = join_set.query('Parent == @folder')
    files = parent_df['filename'].unique()
    for file in files:
        file_df = parent_df.query('filename == @file')
        # Download the recording into the working directory; the same path is
        # used for reading and for the clean-up below.
        local_path = './' + file
        bucket.download_file('recordings/wav/' + folder + '/' + file, local_path)
        try:
            sr, og_audio = wavfile.read(local_path)
            for offset, duration, save_folder in zip(file_df['File Offset (s)'], file_df['duration'], file_df['Folder']):
                # Convert the annotation (seconds) to sample indices, rounding
                # outwards so the clip never starts late or ends early.
                start_sample = math.floor(sr*offset)
                end_sample = math.ceil(sr*(offset+duration))
                # Slices exclude their stop index, so +1 keeps end_sample itself.
                out_audio = og_audio[start_sample:end_sample+1]
                if save_folder == 'Training':
                    wavfile.write('./Data/Training/Rumbles/rumble_{}.wav'.format(train_rumble_counter), sr, out_audio)
                    train_rumble_counter+=1
                else:
                    wavfile.write('./Data/Testing/Rumbles/rumble_{}.wav'.format(test_rumble_counter), sr, out_audio)
                    test_rumble_counter+=1
        finally:
            # Always delete the downloaded recording, even when reading or
            # slicing fails, so an interrupted run does not leave it behind in
            # the working directory.
            os.remove(local_path)

100%|██████████| 46/46 [3:10:45<00:00, 248.81s/it]  


In [142]:
# Display the parent folders the extraction loop iterates over.
parent_folders

array(['nn01a', 'nn01b', 'nn01c', 'nn01d', 'nn01e', 'nn01f', 'nn01g',
       'nn02a', 'nn02b', 'nn02c', 'nn02d', 'nn02e', 'nn02f', 'nn02g',
       'nn03a', 'nn03b', 'nn03c', 'nn03d', 'nn03e', 'nn03f', 'nn03g',
       'nn04a', 'nn04b', 'nn04c', 'nn04d', 'nn04f', 'nn05a', 'nn05b',
       'nn05c', 'nn05d', 'nn05e', 'nn06a', 'nn06b', 'nn06c', 'nn06d',
       'nn06e', 'nn06f', 'nn07b', 'nn07c', 'nn08a', 'nn08b', 'nn08c',
       'nn09b', 'nn10a', 'nn10b', 'nn07a'], dtype=object)

In [145]:
# Inspect parent_df, which is assigned inside the extraction loop.
# NOTE(review): its contents depend on which loop iteration ran last, so this
# cell only works after that loop has been (at least partly) executed.
parent_df

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.9,48865.100,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.0640,,nn01a,Training
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
...,...,...,...,...,...,...,...,...
76,77,41.3,39030.894,nn01a_20180706_000000.wav,1.9751,,nn01a,Training
77,78,32.4,39031.826,nn01a_20180706_000000.wav,3.0186,,nn01a,Training
78,79,45.8,39058.435,nn01a_20180706_000000.wav,3.1677,,nn01a,Training
79,80,35.7,39080.609,nn01a_20180706_000000.wav,1.5652,,nn01a,Training


In [146]:
# Inspect files, the per-parent list of recording names assigned inside the
# extraction loop (leftover loop state -- depends on the last iteration run).
files

array(['nn01a_20180126_000000.wav', 'nn01a_20180203_000000.wav',
       'nn01a_20180220_000000.wav', 'nn01a_20180305_000000.wav',
       'nn01a_20180324_000000.wav', 'nn01a_20180330_000000.wav',
       'nn01a_20180619_000000.wav', 'nn01a_20180706_000000.wav'],
      dtype=object)

In [147]:
# Inspect file_df, the annotation rows of a single recording assigned inside
# the extraction loop (leftover loop state -- depends on the last iteration run).
file_df

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.9,48865.1,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.064,,nn01a,Training
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
5,6,57.1,49017.105,nn01a_20180126_000000.wav,3.5837,,nn01a,Training
6,7,51.5,51113.721,nn01a_20180126_000000.wav,2.9927,marginal,nn01a,Training
7,8,46.5,51117.157,nn01a_20180126_000000.wav,3.1034,,nn01a,Training
8,9,40.801,51405.606,nn01a_20180126_000000.wav,2.433,marginal,nn01a,Training


In [207]:
# Sanity check: the number of entries written to the training folder equals
# the number of training annotation rows.
len(os.listdir('./Data/Training/Rumbles')) == len(training_set)

True

In [210]:
# Sanity check: the number of entries written to the testing folder equals
# the number of test annotation rows (test_set no longer contains the
# 'DUMMY_NoEles' rows at this point).
len(os.listdir('./Data/Testing/Rumbles')) == len(test_set)

True