## Getting audio clips of required classes from google audioset data #### 
https://research.google.com/audioset/dataset/index.html
* Since the dataset is too huge, shortlisted a few classes to work on
    * Natural sounds: Fire vs Wind/Water/Storm.
* To get maximum data for the above classes, have used all the data available from the audioset data
    * Refer https://research.google.com/audioset//download.html#split for more details
    * ID to Class mapping is available in this link - https://github.com/audioset/ontology/blob/master/ontology.json

In [1]:
import os
import subprocess
import youtube_dl
import pandas as pd
import glob
import numpy as np
import tqdm

In [2]:
!pwd

/home/sramirez/git/FeuerFreiKiller/notebooks


In [3]:
os.chdir('/home/sramirez/git/FeuerFreiKiller/notebooks/')

In [6]:
!wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv -O '../data/external/unbalanced_data.csv'

--2019-08-17 12:22:06--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
Resolviendo storage.googleapis.com (storage.googleapis.com)... 216.58.201.176, 2a00:1450:4003:803::2010
Conectando con storage.googleapis.com (storage.googleapis.com)[216.58.201.176]:80... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 101468408 (97M) [application/octet-stream]
Grabando a: “../data/external/unbalanced_data.csv”


2019-08-17 12:22:37 (3,29 MB/s) - “../data/external/unbalanced_data.csv” guardado [101468408/101468408]



In [7]:
!wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv -O '../data/external/balanced_train_data.csv'

--2019-08-17 12:22:37--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv
Resolviendo storage.googleapis.com (storage.googleapis.com)... 216.58.201.176, 2a00:1450:4003:803::2010
Conectando con storage.googleapis.com (storage.googleapis.com)[216.58.201.176]:80... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 1211931 (1,2M) [application/octet-stream]
Grabando a: “../data/external/balanced_train_data.csv”


2019-08-17 12:22:38 (3,13 MB/s) - “../data/external/balanced_train_data.csv” guardado [1211931/1211931]



In [8]:
!wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv -O '../data/external/eval_data.csv'

--2019-08-17 12:22:38--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
Resolviendo storage.googleapis.com (storage.googleapis.com)... 216.58.201.176, 2a00:1450:4003:803::2010
Conectando con storage.googleapis.com (storage.googleapis.com)[216.58.201.176]:80... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 1143389 (1,1M) [application/octet-stream]
Grabando a: “../data/external/eval_data.csv”


2019-08-17 12:22:39 (3,62 MB/s) - “../data/external/eval_data.csv” guardado [1143389/1143389]



## Merging the csv annotation files - Eval, Balanced and Unbalanced to get maximum data

In [None]:
#os.chdir('../data/external/')

In [9]:
path = '../data/external/'
all_files = glob.glob(path + "/*.csv")
li = []

for filename in all_files:
    print(filename)
    df = pd.read_csv(filename, skiprows=2, quotechar='"', engine='python', skipinitialspace=True)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

../data/external/eval_data.csv
../data/external/unbalanced_data.csv
../data/external/balanced_train_data.csv


In [10]:
df.shape,  2042985 + 22176 + 20383

((2084320, 4), 2085544)

In [11]:
df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels
0,--4gqARaEJE,0.0,10.0,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
1,--BfvyPmVMo,20.0,30.0,/m/03l9g
2,--U7joUcTCo,0.0,10.0,/m/01b_21
3,--i-y1v8Hy8,0.0,9.0,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"
4,-0BIyqJj9ZU,30.0,40.0,"/m/07rgt08,/m/07sq110,/t/dd00001"


In [12]:
df.duplicated().sum()

0

In [14]:
df['label'] = df.positive_labels.map(lambda x: 'single' if len(x.split(',')) == 1 else 'multi')
print(df.label.value_counts())

multi     1183535
single     900785
Name: label, dtype: int64


In [16]:
df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label
0,--4gqARaEJE,0.0,10.0,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk",multi
1,--BfvyPmVMo,20.0,30.0,/m/03l9g,single
2,--U7joUcTCo,0.0,10.0,/m/01b_21,single
3,--i-y1v8Hy8,0.0,9.0,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005",multi
4,-0BIyqJj9ZU,30.0,40.0,"/m/07rgt08,/m/07sq110,/t/dd00001",multi


## Filtering clips with required classes

In [17]:
# Use the corressponding ids based for the classes as available in json file mentioned at the start 
str2id = {'Fire': '/m/02_41', 'Wind': '/m/03m9d0z', 'Water': '/m/0838f', 'Thunderstorm': '/m/0jb2l'}
id2str = {v: k for k, v in str2id.items()}

for k in str2id.keys():
    df[k] = df.positive_labels.map(lambda x: 1 if (str2id[k] in x.split(',')) else 0)

In [18]:
for k in str2id.keys():
    print('Category: {}, # of elements: {}'.format(k, df[k].sum()))
    

Category: Thunderstorm, # of elements: 1262
Category: Wind, # of elements: 6805
Category: Fire, # of elements: 1445
Category: Water, # of elements: 8994


In [19]:
list(str2id.keys())


['Thunderstorm', 'Wind', 'Fire', 'Water']

In [27]:
# Filter out those registers with value for either classes considered (output class: fire)
columns = list(str2id.keys())
fw_df = df[df[columns].sum(axis=1) > 0]
fw_df.shape

(18082, 9)

In [28]:
fw_df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Thunderstorm,Wind,Fire,Water
16,-1EXhfqLLwQ,150.0,160.0,"/m/03dnzn,/m/068hy,/m/07p7b8y,/m/07ptzwd,/m/08...",multi,0,0,0,1
25,-1pPw9zZopA,190.0,200.0,"/m/01jt3m,/m/0838f",multi,0,0,0,1
34,-3-4qmWSJXU,30.0,40.0,"/m/06mb1,/m/0jb2l,/m/0ngt1,/t/dd00038",multi,1,0,0,0
41,-3rHVsIj1M8,30.0,40.0,"/m/019jd,/m/03m9d0z,/m/06q74,/t/dd00092",multi,0,1,0,0
100,-DSNfC2EJhU,20.0,30.0,"/m/01jt3m,/m/0838f",multi,0,0,0,1


In [29]:
# Translate after selection
fw_df['translated_labels'] = fw_df.positive_labels.map(lambda x: ','.join([id2str[y] for y in x.split(',') if y in id2str.keys()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
fw_df.translated_labels.value_counts()

Water                 8978
Wind                  6388
Thunderstorm          1233
Fire                  1059
Fire,Wind              381
Wind,Thunderstorm       27
Wind,Water               9
Fire,Water               5
Water,Thunderstorm       2
Name: translated_labels, dtype: int64

In [31]:
# Let's check an example for fire-water tuple
fw_df[fw_df.translated_labels == 'Fire,Water']

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Thunderstorm,Wind,Fire,Water,translated_labels
796,1009ux1xbkg,0.0,10.0,"/m/02_41,/m/0838f",multi,0,0,1,1,"Fire,Water"
3089,7Zdx0YrzHVk,20.0,30.0,"/m/02_41,/m/0838f",multi,0,0,1,1,"Fire,Water"
1204763,ToKqR2NHqwQ,200.0,210.0,"/m/02_41,/m/0838f,/m/09x0r",multi,0,0,1,1,"Fire,Water"
1215371,UB7upK3ZBsA,30.0,40.0,"/m/02_41,/m/07p9k1k,/m/0838f",multi,0,0,1,1,"Fire,Water"
1872047,s6dbv2C2N8M,30.0,40.0,"/m/02_41,/m/0838f,/m/09x0r",multi,0,0,1,1,"Fire,Water"


In [32]:
fw_df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Thunderstorm,Wind,Fire,Water,translated_labels
16,-1EXhfqLLwQ,150.0,160.0,"/m/03dnzn,/m/068hy,/m/07p7b8y,/m/07ptzwd,/m/08...",multi,0,0,0,1,Water
25,-1pPw9zZopA,190.0,200.0,"/m/01jt3m,/m/0838f",multi,0,0,0,1,Water
34,-3-4qmWSJXU,30.0,40.0,"/m/06mb1,/m/0jb2l,/m/0ngt1,/t/dd00038",multi,1,0,0,0,Thunderstorm
41,-3rHVsIj1M8,30.0,40.0,"/m/019jd,/m/03m9d0z,/m/06q74,/t/dd00092",multi,0,1,0,0,Wind
100,-DSNfC2EJhU,20.0,30.0,"/m/01jt3m,/m/0838f",multi,0,0,0,1,Water


In [33]:
## Create subset with only around 1K from each type of sound, and all that mix several types.
size = 1100        # sample size
replace = False  # with replacement
subset = fw_df.groupby('translated_labels').apply(lambda x: x.sample(min(size, x.shape[0]), replace = replace)).reset_index(drop=True)
subset.translated_labels.value_counts()

Wind                  1100
Thunderstorm          1100
Water                 1100
Fire                  1059
Fire,Wind              381
Wind,Thunderstorm       27
Wind,Water               9
Fire,Water               5
Water,Thunderstorm       2
Name: translated_labels, dtype: int64

In [35]:
fw_df.to_csv('../data/interim/firewind_dataset_links.csv')
fw_df.to_pickle('../data/interim/firewind_dataset_links.pickle')
subset.to_pickle('../data/interim/stratified_dataset_links.pickle')
subset.to_csv('../data/interim/stratified_dataset_links.csv')

In [36]:
assert(fw_df[fw_df['# YTID'] == "1icE61afXcY"].start_seconds.values[0] == 270)

## Download the 10 sec audio snippets for the filtered classes from youtube videos
* Using ffmpeg to get the audio and extract the requried 10s clip
* This part takes up lot of time as it involved downloading the entire video. Coudnt figure out a way to extract only the audio for a predefined time period. Any suggestions here would be very helpful

In [4]:
meta_df = pd.read_pickle('../data/interim/stratified_dataset_links.pickle')
clipsmeta = list(zip(meta_df['# YTID'].values, meta_df.start_seconds.values))

In [6]:
meta_df.columns

Index(['# YTID', 'start_seconds', 'end_seconds', 'positive_labels', 'label',
       'Water', 'Thunderstorm', 'Wind', 'Fire', 'translated_labels'],
      dtype='object')

In [8]:
meta_df.head() # positive labels can include other beyond nature sounds

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Water,Thunderstorm,Wind,Fire,translated_labels
0,90DSJfFfvio,200.0,210.0,"/m/012f08,/m/012ndj,/m/02_41,/m/07r04,/m/07yv9",multi,0,0,0,1,Fire
1,2H1h6nizICI,20.0,30.0,"/m/02_41,/m/07yv9",multi,0,0,0,1,Fire
2,6JF0IbMBFkQ,220.0,230.0,"/m/02_41,/m/07yv9,/m/09x0r",multi,0,0,0,1,Fire
3,fn5gRQ87Ga0,100.0,110.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire
4,1Emd2moPfRk,20.0,30.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire


In [25]:
meta_df.columns = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels', 'label',
       'Water', 'Thunderstorm', 'Wind', 'Fire', 'translated_nature_labels']
meta_df.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels,label,Water,Thunderstorm,Wind,Fire,translated_nature_labels
0,90DSJfFfvio,200.0,210.0,"/m/012f08,/m/012ndj,/m/02_41,/m/07r04,/m/07yv9",multi,0,0,0,1,Fire
1,2H1h6nizICI,20.0,30.0,"/m/02_41,/m/07yv9",multi,0,0,0,1,Fire
2,6JF0IbMBFkQ,220.0,230.0,"/m/02_41,/m/07yv9,/m/09x0r",multi,0,0,0,1,Fire
3,fn5gRQ87Ga0,100.0,110.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire
4,1Emd2moPfRk,20.0,30.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire


In [101]:
len(clipsmeta)

4783

In [102]:
processed_path = '../data/processed/'
os.chdir(processed_path)

In [103]:
!pwd

/home/sramirez/git/FeuerFreiKiller/data/processed


In [107]:
def download_audioset(metadata):
    i, start = metadata
    dur = 10
    #print(i, start, dur)
    ydl_opts = {'format': 'bestaudio/best',
                'outtmpl': './{}'.format(i+ '.mp4'),
                'postprocessors': [{'key': 'FFmpegExtractAudio','preferredcodec': 'wav','preferredquality': '192'}],
                'prefer_ffmpeg': True,
                'keepvideo': True,
                'info_dict': {'start_time': start,
                'end_time': start + dur}}
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            #ydl.download(['http://www.youtube.com/watch?v={}'.format(i)]);
            url = 'http://www.youtube.com/watch?v={}'.format(i)
            #command = "ffmpeg -k -ss {} -t {} -i ./{}.wav ./{}.wav".format(start, dur, i,i+'_seg')
            command = "./ffmpeg -ss {} -i $(youtube-dl -f mp4 --get-url {}) -t {} -vn -c:v copy -c:a copy {}.aac".format(int(start), url, dur, i)
            print(command)
            subprocess.call(command, shell=True)
            #os.remove('./{}.wav'.format(i))
            #os.rename('./{}_seg.wav'.format(i),'./{}.wav'.format(i))
    except:
        print('Video not available{}'.format(i))

In [None]:
import multiprocessing as mp
mp.cpu_count()

with mp.Pool(8) as pool:
    list(tqdm.tqdm(pool.imap(download_audioset, clipsmeta), total=len(clipsmeta)))

In [133]:
# some clips will be missing due to unavailability of video
print(len(os.listdir('./')), len(clipsmeta))

4396 4783


In [76]:
# Let's see what is the most affected class

files = os.listdir('../data/processed/')
result = map(lambda x: x.split('.')[0], files)
saved_id = list(result)
not_present = set(meta_df.YTID).difference(set(saved_id))
len(not_present)

389

In [77]:
meta_df[meta_df.YTID.isin(not_present)].translated_nature_labels.value_counts()

Water           126
Thunderstorm     92
Fire             85
Wind             60
Fire,Wind        25
Wind,Water        1
Name: translated_nature_labels, dtype: int64