In [31]:
import os
import boto3
from botocore.handlers import disable_signing
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from scipy.io import wavfile
import math

In [32]:
boto3.__version__

'1.15.18'

In [33]:
np.__version__

'1.18.5'

In [34]:
s3 = boto3.resource('s3')
s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
bucket = s3.Bucket('congo8khz-pnnn')

In [35]:
training_set = pd.read_csv('../mubin/TrainingSet_rumble_info.txt', sep = '\t')

In [36]:
training_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,
1,2,47.9,48865.100,nn01a_20180126_000000.wav,3.8974,
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.0640,
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,
...,...,...,...,...,...,...
3175,3176,44.1,86263.733,nn10b_20180728_000000.wav,2.2700,
3176,3177,33.6,86264.870,nn10b_20180728_000000.wav,6.4400,
3177,3178,44.1,86269.957,nn10b_20180728_000000.wav,6.3800,
3178,3179,29.0,86300.628,nn10b_20180728_000000.wav,3.4700,


In [37]:
def get_parent(col):
    return col.split('_')[0]

In [38]:
training_set['Parent'] = training_set['filename'].apply(get_parent)

In [39]:
training_set['Folder'] = ['Training']*len(training_set)

In [40]:
training_set['marginals'].value_counts()

marginal    945
Name: marginals, dtype: int64

In [41]:
training_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.9,48865.100,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.0640,,nn01a,Training
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
...,...,...,...,...,...,...,...,...
3175,3176,44.1,86263.733,nn10b_20180728_000000.wav,2.2700,,nn10b,Training
3176,3177,33.6,86264.870,nn10b_20180728_000000.wav,6.4400,,nn10b,Training
3177,3178,44.1,86269.957,nn10b_20180728_000000.wav,6.3800,,nn10b,Training
3178,3179,29.0,86300.628,nn10b_20180728_000000.wav,3.4700,,nn10b,Training


In [42]:
test_set = pd.read_csv('../mubin/GeneralTest_rumble_info.txt', sep = '\t')

In [43]:
test_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals
0,1,36.600,29314.080,nn01d_20180127_000000.wav,4.407,
1,2,31.200,32552.884,nn01d_20180127_000000.wav,3.954,
2,3,39.100,40181.978,nn01d_20180127_000000.wav,6.526,
3,4,40.000,22722.894,nn02e_20180906_000000.wav,5.320,
4,5,30.300,22726.736,nn02e_20180906_000000.wav,3.029,
...,...,...,...,...,...,...
753,754,39.400,3530.350,nn10b_20180907_000000.wav,4.359,maginal
754,755,46.600,3533.675,nn10b_20180907_000000.wav,2.139,maginal
755,756,49.300,3547.271,nn10b_20180907_000000.wav,3.510,maginal
756,757,43.624,52488.090,nn10b_20180907_000000.wav,4.492,


In [44]:
test_set['Parent'] = test_set['filename'].apply(get_parent)

In [45]:
test_set['Folder'] = ['Testing']*len(test_set)

In [46]:
test_set['marginals'].unique()

array([nan, 'maginal', 'DUMMY_NoEles', 'marginal'], dtype=object)

In [47]:
test_set.drop(test_set[test_set['marginals'] == 'DUMMY_NoEles'].index, inplace = True)

In [48]:
join_set = pd.concat([training_set, test_set], ignore_index = True, join = 'outer')

In [49]:
join_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.700,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.900,48865.100,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.700,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.100,48876.017,nn01a_20180126_000000.wav,4.0640,,nn01a,Training
4,5,57.100,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
...,...,...,...,...,...,...,...,...
3932,754,39.400,3530.350,nn10b_20180907_000000.wav,4.3590,maginal,nn10b,Testing
3933,755,46.600,3533.675,nn10b_20180907_000000.wav,2.1390,maginal,nn10b,Testing
3934,756,49.300,3547.271,nn10b_20180907_000000.wav,3.5100,maginal,nn10b,Testing
3935,757,43.624,52488.090,nn10b_20180907_000000.wav,4.4920,,nn10b,Testing


In [50]:
parent_folders = join_set['Parent'].unique()
parent_folders

array(['nn01a', 'nn01b', 'nn01c', 'nn01d', 'nn01e', 'nn01f', 'nn01g',
       'nn02a', 'nn02b', 'nn02c', 'nn02d', 'nn02e', 'nn02f', 'nn02g',
       'nn03a', 'nn03b', 'nn03c', 'nn03d', 'nn03e', 'nn03f', 'nn03g',
       'nn04a', 'nn04b', 'nn04c', 'nn04d', 'nn04f', 'nn05a', 'nn05b',
       'nn05c', 'nn05d', 'nn05e', 'nn06a', 'nn06b', 'nn06c', 'nn06d',
       'nn06e', 'nn06f', 'nn07b', 'nn07c', 'nn08a', 'nn08b', 'nn08c',
       'nn09b', 'nn10a', 'nn10b', 'nn07a'], dtype=object)

In [51]:
sorted_join_set = join_set.copy()
sorted_join_set.sort_values(['filename','File Offset (s)'])
sorted_join_set.head(12)

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.9,48865.1,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.064,,nn01a,Training
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
5,6,57.1,49017.105,nn01a_20180126_000000.wav,3.5837,,nn01a,Training
6,7,51.5,51113.721,nn01a_20180126_000000.wav,2.9927,marginal,nn01a,Training
7,8,46.5,51117.157,nn01a_20180126_000000.wav,3.1034,,nn01a,Training
8,9,40.801,51405.606,nn01a_20180126_000000.wav,2.433,marginal,nn01a,Training
9,10,40.2,47942.93,nn01a_20180203_000000.wav,1.9582,marginal,nn01a,Training


In [53]:
# Custom Append
def newAppend(ar,left,right,window,target,cuts):
    j = left
    while right-j>=window and len(target)<cuts:
        target.append(ar[j:j+window])
        j+=window


# Audio file doesnt't contain neither noise nor rumble
startingPoint = 1000

# Iterators for train and test noise set
train_noise_iterator = 1
test_noise_iterator = 1

for folder in tqdm(parent_folders):
    
    sorted_parent_df = sorted_join_set.query('Parent == @folder')
    sorted_files = sorted_parent_df['filename'].unique()
    
    for file in sorted_files:
        file_df = sorted_parent_df.query('filename == @file')
        bucket.download_file('recordings/wav/' + folder + '/' + file, './'+file)
        sr, og_audio = wavfile.read(file)
        
        rumble_tuple = []
        
        for offset, duration in zip(file_df['File Offset (s)'], file_df['duration']):
            start_sample = math.floor(sr*offset)
            end_sample = math.ceil(sr*(offset+duration))
            rumble_tuple.append((start_sample,end_sample))
        
        newAudio = []
        
        # 4 seconds cut
        cut_length = 4*sr
        total_cuts = len(file_df)
        
        i = 0
        
        while i<len(rumble_tuple)-1 and len(newAudio)<total_cuts:
            if i == 0:
                wasteSound = startingPoint*sr
                firstEnd = rumble_tuple[i][0]
                if firstEnd>wasteSound:
                    newAppend(og_audio,wasteSound+1,firstEnd,cut_length,newAudio,total_cuts)

            start = rumble_tuple[i][1]
            end = rumble_tuple[i+1][0]
            newAppend(og_audio,start+1,end,cut_length,newAudio,total_cuts)
            
            i+=1

        if len(newAudio)<total_cuts:
            endPart = rumble_tuple[len(rumble_tuple)-1][1]
            terminating_point = (len(og_audio)-1)*sr
            newAppend(og_audio,endPart,terminating_point,cut_length,newAudio,total_cuts)

        folder_to_save = file_df['Folder']
        
        newAudio_counter = 0
        print(len(newAudio),total_cuts)
        
        for save_folder in file_df['Folder']:
            if newAudio_counter < len(newAudio):
                out_audio = newAudio[newAudio_counter]

                if save_folder == 'Training':
                    wavfile.write('../mubin/Data/Training/Noise/noise_{}.wav'.format(train_noise_iterator), sr, out_audio)
                    train_noise_iterator+=1
                else:
                    wavfile.write('../mubin/Data/Testing/Noise/noise_{}.wav'.format(test_noise_iterator), sr, out_audio)
                    test_noise_iterator+=1

                newAudio_counter+=1
        os.remove('./'+file)


  0%|          | 0/46 [00:00<?, ?it/s]

9 9
3 3
1 1
47 47
3 3
1 1
9 9
8 8


  2%|▏         | 1/46 [08:32<6:24:09, 512.21s/it]

4 4
1 1
13 13
1 1
16 16
60 60


  4%|▍         | 2/46 [14:54<5:19:35, 435.80s/it]

1 1
1 1
4 4
4 4
3 3
10 10
1 1
45 45
14 14
10 10
24 24
22 22
18 18


  7%|▋         | 3/46 [28:34<7:18:03, 611.24s/it]

2 2
19 19
28 28
3 3


  9%|▊         | 4/46 [32:40<5:26:53, 467.00s/it]

47 47
33 33


 11%|█         | 5/46 [34:38<3:53:02, 341.05s/it]

1 1
48 48
33 33


 13%|█▎        | 6/46 [37:52<3:14:07, 291.20s/it]

15 15
18 18


 15%|█▌        | 7/46 [40:09<2:36:30, 240.79s/it]

10 10
18 18


 17%|█▋        | 8/46 [42:24<2:11:10, 207.12s/it]

19 19
6 6


 20%|█▉        | 9/46 [44:36<1:53:16, 183.69s/it]

19 19
56 56


 22%|██▏       | 10/46 [47:00<1:42:46, 171.30s/it]

35 35
15 15
17 17


 26%|██▌       | 12/46 [51:31<1:26:35, 152.82s/it]

39 39
30 30


 28%|██▊       | 13/46 [53:58<1:22:59, 150.90s/it]

24 24
25 25


 30%|███       | 14/46 [56:32<1:21:01, 151.93s/it]

81 81
102 102


 33%|███▎      | 15/46 [59:29<1:22:23, 159.48s/it]

13 13
6 6


 35%|███▍      | 16/46 [1:01:58<1:18:11, 156.39s/it]

28 28
5 5


 37%|███▋      | 17/46 [1:04:36<1:15:50, 156.91s/it]

19 19
11 11


 39%|███▉      | 18/46 [1:07:05<1:12:03, 154.40s/it]

32 32
40 40


 41%|████▏     | 19/46 [1:09:39<1:09:23, 154.19s/it]

18 18
20 20


 43%|████▎     | 20/46 [1:12:09<1:06:18, 153.02s/it]

23 23
61 61


 46%|████▌     | 21/46 [1:14:47<1:04:19, 154.38s/it]

47 47
13 13


 48%|████▊     | 22/46 [1:17:24<1:02:08, 155.36s/it]

1 1
31 31


 50%|█████     | 23/46 [1:19:57<59:16, 154.63s/it]  

23 23
2 2
5 5
1 1
7 7
2 2
5 5


 52%|█████▏    | 24/46 [1:28:15<1:34:29, 257.71s/it]

4 4
2 2


 54%|█████▍    | 25/46 [1:30:35<1:17:49, 222.35s/it]

1 1


 57%|█████▋    | 26/46 [1:31:45<58:51, 176.59s/it]  

9 9
18 18
2 2
8 8
12 12
32 32
21 21
2 2
15 15


 59%|█████▊    | 27/46 [1:42:22<1:39:39, 314.71s/it]

62 62
11 11
19 19
14 14
2 2
12 12


 61%|██████    | 28/46 [1:48:51<1:41:05, 336.96s/it]

1 1
3 3


 63%|██████▎   | 29/46 [1:50:58<1:17:39, 274.07s/it]

1 1
7 7
5 5
13 13
2 2
28 28
1 1


 65%|██████▌   | 30/46 [1:58:43<1:28:20, 331.31s/it]

1 1
2 2
5 5


 67%|██████▋   | 31/46 [2:01:58<1:12:38, 290.55s/it]

28 28
64 64
161 161
44 44
67 67
42 42
58 58
8 8
8 8
64 64
37 37
26 26
25 25
57 57


 70%|██████▉   | 32/46 [2:17:37<1:53:10, 485.05s/it]

47 47
26 26
20 20
54 54
28 28
1 1
41 41
12 12
33 33
17 17
29 29
39 39
17 17


 72%|███████▏  | 33/46 [2:32:53<2:13:06, 614.31s/it]

36 36
27 27


 74%|███████▍  | 34/46 [2:35:26<1:35:08, 475.74s/it]

1 1
15 15


 76%|███████▌  | 35/46 [2:37:53<1:09:09, 377.19s/it]

4 4
7 7
1 1
23 23


 78%|███████▊  | 36/46 [2:42:58<59:17, 355.70s/it]  

13 13


 80%|████████  | 37/46 [2:44:14<40:45, 271.77s/it]

17 17
35 35


 83%|████████▎ | 38/46 [2:46:57<31:52, 239.04s/it]

19 19
75 75


 85%|████████▍ | 39/46 [2:49:45<25:24, 217.71s/it]

8 8
15 15


 87%|████████▋ | 40/46 [2:52:17<19:47, 197.85s/it]

97 97
47 47


 89%|████████▉ | 41/46 [2:54:59<15:35, 187.12s/it]

228 228
72 72


 91%|█████████▏| 42/46 [2:58:10<12:33, 188.26s/it]

8 8
18 18


 93%|█████████▎| 43/46 [3:01:06<09:13, 184.67s/it]

113 113
40 40


 96%|█████████▌| 44/46 [3:04:22<06:16, 188.16s/it]

45 45
28 28
1 1
19 19


 98%|█████████▊| 45/46 [3:10:41<04:05, 245.37s/it]

15 15


100%|██████████| 46/46 [3:12:09<00:00, 250.65s/it]


In [54]:
training_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,49.7,48860.426,nn01a_20180126_000000.wav,6.2622,,nn01a,Training
1,2,47.9,48865.100,nn01a_20180126_000000.wav,3.8974,,nn01a,Training
2,3,49.7,48869.829,nn01a_20180126_000000.wav,4.2672,,nn01a,Training
3,4,58.1,48876.017,nn01a_20180126_000000.wav,4.0640,,nn01a,Training
4,5,57.1,48880.487,nn01a_20180126_000000.wav,4.3227,,nn01a,Training
...,...,...,...,...,...,...,...,...
3175,3176,44.1,86263.733,nn10b_20180728_000000.wav,2.2700,,nn10b,Training
3176,3177,33.6,86264.870,nn10b_20180728_000000.wav,6.4400,,nn10b,Training
3177,3178,44.1,86269.957,nn10b_20180728_000000.wav,6.3800,,nn10b,Training
3178,3179,29.0,86300.628,nn10b_20180728_000000.wav,3.4700,,nn10b,Training


In [55]:
len(os.listdir('../mubin/Data/Training/Noise')) == len(training_set)

True

In [56]:
test_set

Unnamed: 0,Selection,High Freq (Hz),File Offset (s),filename,duration,marginals,Parent,Folder
0,1,36.600,29314.080,nn01d_20180127_000000.wav,4.407,,nn01d,Testing
1,2,31.200,32552.884,nn01d_20180127_000000.wav,3.954,,nn01d,Testing
2,3,39.100,40181.978,nn01d_20180127_000000.wav,6.526,,nn01d,Testing
3,4,40.000,22722.894,nn02e_20180906_000000.wav,5.320,,nn02e,Testing
4,5,30.300,22726.736,nn02e_20180906_000000.wav,3.029,,nn02e,Testing
...,...,...,...,...,...,...,...,...
753,754,39.400,3530.350,nn10b_20180907_000000.wav,4.359,maginal,nn10b,Testing
754,755,46.600,3533.675,nn10b_20180907_000000.wav,2.139,maginal,nn10b,Testing
755,756,49.300,3547.271,nn10b_20180907_000000.wav,3.510,maginal,nn10b,Testing
756,757,43.624,52488.090,nn10b_20180907_000000.wav,4.492,,nn10b,Testing


In [58]:
len(os.listdir('../mubin/Data/Testing/Noise')) == len(test_set)

True