## Getting audio clips of required classes from google audioset data #### 
https://research.google.com/audioset/dataset/index.html
* Since the dataset is too huge, shortlisted a few classes to work on
    * Natural sounds: Fire vs Wind/Water/Storm.
* To get maximum data for the above classes, have used all the data available from the audioset data
    * Refer https://research.google.com/audioset//download.html#split for more details
    * ID to Class mapping is available in this link - https://github.com/audioset/ontology/blob/master/ontology.json

In [1]:
import os
import subprocess
import youtube_dl
import pandas as pd
import glob
import numpy as np
import tqdm

In [2]:
!pwd

/home/sramirez/git/FeuerFreiKiller/notebooks


In [3]:
os.chdir('/home/sramirez/git/FeuerFreiKiller/notebooks/')

In [4]:
!wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv -O '../data/external/unbalanced_data.csv'

--2019-08-18 16:34:21--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.201.176, 2a00:1450:4003:803::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.201.176|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 101468408 (97M) [application/octet-stream]
Saving to: ‘../data/external/unbalanced_data.csv’


2019-08-18 16:34:34 (7,75 MB/s) - ‘../data/external/unbalanced_data.csv’ saved [101468408/101468408]



In [5]:
!wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv -O '../data/external/balanced_train_data.csv'

--2019-08-18 16:34:35--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.201.176, 2a00:1450:4003:803::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.201.176|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1211931 (1,2M) [application/octet-stream]
Saving to: ‘../data/external/balanced_train_data.csv’


2019-08-18 16:34:35 (7,48 MB/s) - ‘../data/external/balanced_train_data.csv’ saved [1211931/1211931]



In [6]:
!wget http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv -O '../data/external/eval_data.csv'

--2019-08-18 16:34:35--  http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.201.176, 2a00:1450:4003:803::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.201.176|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1143389 (1,1M) [application/octet-stream]
Saving to: ‘../data/external/eval_data.csv’


2019-08-18 16:34:36 (8,18 MB/s) - ‘../data/external/eval_data.csv’ saved [1143389/1143389]



## Merging the csv annotation files - Eval, Balanced and Unbalanced to get maximum data

In [7]:
#os.chdir('../data/external/')

In [8]:
path = '../data/external/'
all_files = glob.glob(path + "/*.csv")
li = []

for filename in all_files:
    print(filename)
    df = pd.read_csv(filename, skiprows=2, quotechar='"', engine='python', skipinitialspace=True)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

../data/external/balanced_train_data.csv
../data/external/eval_data.csv
../data/external/unbalanced_data.csv


In [9]:
df.shape,  2042985 + 22176 + 20383

((2084320, 4), 2085544)

In [10]:
df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels
0,--PJHxphWEs,30.0,40.0,"/m/09x0r,/t/dd00088"
1,--ZhevVpy1s,50.0,60.0,/m/012xff
2,--aE2O5G5WE,0.0,10.0,"/m/03fwl,/m/04rlf,/m/09x0r"
3,--aO5cdqSAg,30.0,40.0,"/t/dd00003,/t/dd00005"
4,--aaILOrkII,200.0,210.0,"/m/032s66,/m/073cg4"


In [11]:
df.duplicated().sum()

0

In [12]:
df['label'] = df.positive_labels.map(lambda x: 'single' if len(x.split(',')) == 1 else 'multi')
print(df.label.value_counts())

multi     1183535
single     900785
Name: label, dtype: int64


In [13]:
df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label
0,--PJHxphWEs,30.0,40.0,"/m/09x0r,/t/dd00088",multi
1,--ZhevVpy1s,50.0,60.0,/m/012xff,single
2,--aE2O5G5WE,0.0,10.0,"/m/03fwl,/m/04rlf,/m/09x0r",multi
3,--aO5cdqSAg,30.0,40.0,"/t/dd00003,/t/dd00005",multi
4,--aaILOrkII,200.0,210.0,"/m/032s66,/m/073cg4",multi


## Filtering clips with required classes

In [14]:
# Use the corressponding ids based for the classes as available in json file mentioned at the start 
str2id = {'Fire': '/m/02_41', 'Wind': '/m/03m9d0z', 'Water': '/m/0838f', 'Thunderstorm': '/m/0jb2l'}
id2str = {v: k for k, v in str2id.items()}

for k in str2id.keys():
    df[k] = df.positive_labels.map(lambda x: 1 if (str2id[k] in x.split(',')) else 0)

In [15]:
for k in str2id.keys():
    print('Category: {}, # of elements: {}'.format(k, df[k].sum()))
    

Category: Fire, # of elements: 1445
Category: Wind, # of elements: 6805
Category: Water, # of elements: 8994
Category: Thunderstorm, # of elements: 1262


In [16]:
list(str2id.keys())


['Fire', 'Wind', 'Water', 'Thunderstorm']

In [17]:
# Filter out those registers with value for either classes considered (output class: fire)
columns = list(str2id.keys())
fw_df = df[df[columns].sum(axis=1) > 0]
fw_df.shape

(18082, 9)

In [18]:
fw_df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Fire,Wind,Water,Thunderstorm
8,-0DdlOuIFUI,50.0,60.0,"/m/0130jx,/m/02jz0l,/m/0838f",multi,0,0,1,0
38,-5GhUbDLYkQ,16.0,26.0,"/m/0130jx,/m/02jz0l,/m/0838f",multi,0,0,1,0
71,-8_HpHg6nCw,30.0,40.0,"/m/07ptzwd,/m/0838f",multi,0,0,1,0
76,-99daJhXYJY,30.0,40.0,"/m/019jd,/m/02rlv9,/m/03m9d0z,/t/dd00092",multi,0,1,0,0
156,-JKLmqDk9p8,490.0,500.0,"/m/06mb1,/m/07r10fb,/m/0jb2l,/m/0ngt1,/t/dd00038",multi,0,0,0,1


In [19]:
# Translate after selection
fw_df['translated_labels'] = fw_df.positive_labels.map(lambda x: ','.join([id2str[y] for y in x.split(',') if y in id2str.keys()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
fw_df.translated_labels.value_counts()

Water                 8978
Wind                  6388
Thunderstorm          1233
Fire                  1059
Fire,Wind              381
Wind,Thunderstorm       27
Wind,Water               9
Fire,Water               5
Water,Thunderstorm       2
Name: translated_labels, dtype: int64

In [21]:
# Let's check an example for fire-water tuple
fw_df[fw_df.translated_labels == 'Fire,Water']

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Fire,Wind,Water,Thunderstorm,translated_labels
22956,1009ux1xbkg,0.0,10.0,"/m/02_41,/m/0838f",multi,1,0,1,0,"Fire,Water"
25249,7Zdx0YrzHVk,20.0,30.0,"/m/02_41,/m/0838f",multi,1,0,1,0,"Fire,Water"
1226923,ToKqR2NHqwQ,200.0,210.0,"/m/02_41,/m/0838f,/m/09x0r",multi,1,0,1,0,"Fire,Water"
1237531,UB7upK3ZBsA,30.0,40.0,"/m/02_41,/m/07p9k1k,/m/0838f",multi,1,0,1,0,"Fire,Water"
1894207,s6dbv2C2N8M,30.0,40.0,"/m/02_41,/m/0838f,/m/09x0r",multi,1,0,1,0,"Fire,Water"


In [22]:
fw_df.head()

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Fire,Wind,Water,Thunderstorm,translated_labels
8,-0DdlOuIFUI,50.0,60.0,"/m/0130jx,/m/02jz0l,/m/0838f",multi,0,0,1,0,Water
38,-5GhUbDLYkQ,16.0,26.0,"/m/0130jx,/m/02jz0l,/m/0838f",multi,0,0,1,0,Water
71,-8_HpHg6nCw,30.0,40.0,"/m/07ptzwd,/m/0838f",multi,0,0,1,0,Water
76,-99daJhXYJY,30.0,40.0,"/m/019jd,/m/02rlv9,/m/03m9d0z,/t/dd00092",multi,0,1,0,0,Wind
156,-JKLmqDk9p8,490.0,500.0,"/m/06mb1,/m/07r10fb,/m/0jb2l,/m/0ngt1,/t/dd00038",multi,0,0,0,1,Thunderstorm


In [23]:
## Create subset with only around 1K from each type of sound, and all that mix several types.
size = 1100        # sample size
replace = False  # with replacement
subset = fw_df.groupby('translated_labels').apply(lambda x: x.sample(min(size, x.shape[0]), replace = replace)).reset_index(drop=True)
subset.translated_labels.value_counts()

Wind                  1100
Water                 1100
Thunderstorm          1100
Fire                  1059
Fire,Wind              381
Wind,Thunderstorm       27
Wind,Water               9
Fire,Water               5
Water,Thunderstorm       2
Name: translated_labels, dtype: int64

In [24]:
fw_df.to_csv('../data/interim/firewind_dataset_links.csv')
fw_df.to_pickle('../data/interim/firewind_dataset_links.pickle')
subset.to_pickle('../data/interim/stratified_dataset_links.pickle')
subset.to_csv('../data/interim/stratified_dataset_links.csv')

In [25]:
assert(fw_df[fw_df['# YTID'] == "1icE61afXcY"].start_seconds.values[0] == 270)

## Download the 10 sec audio snippets for the filtered classes from youtube videos
* Using ffmpeg to get the audio and extract the requried 10s clip
* This part takes up lot of time as it involved downloading the entire video. Coudnt figure out a way to extract only the audio for a predefined time period. Any suggestions here would be very helpful

In [4]:
meta_df = pd.read_pickle('../data/interim/stratified_dataset_links.pickle')
clipsmeta = list(zip(meta_df['# YTID'].values, meta_df.start_seconds.values))

In [6]:
meta_df.columns

Index(['# YTID', 'start_seconds', 'end_seconds', 'positive_labels', 'label',
       'Water', 'Thunderstorm', 'Wind', 'Fire', 'translated_labels'],
      dtype='object')

In [8]:
meta_df.head() # positive labels can include other beyond nature sounds

Unnamed: 0,# YTID,start_seconds,end_seconds,positive_labels,label,Water,Thunderstorm,Wind,Fire,translated_labels
0,90DSJfFfvio,200.0,210.0,"/m/012f08,/m/012ndj,/m/02_41,/m/07r04,/m/07yv9",multi,0,0,0,1,Fire
1,2H1h6nizICI,20.0,30.0,"/m/02_41,/m/07yv9",multi,0,0,0,1,Fire
2,6JF0IbMBFkQ,220.0,230.0,"/m/02_41,/m/07yv9,/m/09x0r",multi,0,0,0,1,Fire
3,fn5gRQ87Ga0,100.0,110.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire
4,1Emd2moPfRk,20.0,30.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire


In [25]:
meta_df.columns = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels', 'label',
       'Water', 'Thunderstorm', 'Wind', 'Fire', 'translated_nature_labels']
meta_df.head()

Unnamed: 0,YTID,start_seconds,end_seconds,positive_labels,label,Water,Thunderstorm,Wind,Fire,translated_nature_labels
0,90DSJfFfvio,200.0,210.0,"/m/012f08,/m/012ndj,/m/02_41,/m/07r04,/m/07yv9",multi,0,0,0,1,Fire
1,2H1h6nizICI,20.0,30.0,"/m/02_41,/m/07yv9",multi,0,0,0,1,Fire
2,6JF0IbMBFkQ,220.0,230.0,"/m/02_41,/m/07yv9,/m/09x0r",multi,0,0,0,1,Fire
3,fn5gRQ87Ga0,100.0,110.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire
4,1Emd2moPfRk,20.0,30.0,"/m/02_41,/m/09x0r",multi,0,0,0,1,Fire


In [101]:
len(clipsmeta)

4783

In [102]:
processed_path = '../data/processed/'
os.chdir(processed_path)

In [103]:
!pwd

/home/sramirez/git/FeuerFreiKiller/data/processed


In [107]:
def download_audioset(metadata):
    i, start = metadata
    dur = 10
    #print(i, start, dur)
    ydl_opts = {'format': 'bestaudio/best',
                'outtmpl': './{}'.format(i+ '.mp4'),
                'postprocessors': [{'key': 'FFmpegExtractAudio','preferredcodec': 'wav','preferredquality': '192'}],
                'prefer_ffmpeg': True,
                'keepvideo': True,
                'info_dict': {'start_time': start,
                'end_time': start + dur}}
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            #ydl.download(['http://www.youtube.com/watch?v={}'.format(i)]);
            url = 'http://www.youtube.com/watch?v={}'.format(i)
            #command = "ffmpeg -k -ss {} -t {} -i ./{}.wav ./{}.wav".format(start, dur, i,i+'_seg')
            command = "./ffmpeg -ss {} -i $(youtube-dl -f mp4 --get-url {}) -t {} -vn -c:v copy -c:a copy {}.aac".format(int(start), url, dur, i)
            print(command)
            subprocess.call(command, shell=True)
            #os.remove('./{}.wav'.format(i))
            #os.rename('./{}_seg.wav'.format(i),'./{}.wav'.format(i))
    except:
        print('Video not available{}'.format(i))

In [None]:
import multiprocessing as mp
mp.cpu_count()

with mp.Pool(8) as pool:
    list(tqdm.tqdm(pool.imap(download_audioset, clipsmeta), total=len(clipsmeta)))

In [133]:
# some clips will be missing due to unavailability of video
print(len(os.listdir('./')), len(clipsmeta))

4396 4783


In [76]:
# Let's see what is the most affected class

files = os.listdir('../data/processed/')
result = map(lambda x: x.split('.')[0], files)
saved_id = list(result)
not_present = set(meta_df.YTID).difference(set(saved_id))
len(not_present)

389

In [77]:
meta_df[meta_df.YTID.isin(not_present)].translated_nature_labels.value_counts()

Water           126
Thunderstorm     92
Fire             85
Wind             60
Fire,Wind        25
Wind,Water        1
Name: translated_nature_labels, dtype: int64