In [1]:
import pandas as pd

## Load in the list of downloaded files

In [2]:
records = pd.read_csv("audiosetTargetsDLedBy_2020-06-14_14h11.csv", names=["raw"], header=0)
records.head(5)

Unnamed: 0,raw
0,data/audiofiles/--BfvyPmVMo_20000_30000.wav
1,data/audiofiles/--U7joUcTCo_0_10000.wav
2,data/audiofiles/--i-y1v8Hy8_0_9000.wav
3,data/audiofiles/-0BIyqJj9ZU_30000_40000.wav
4,data/audiofiles/-0CamVQdP_Y_0_6000.wav


## Parse out the pieces we care about

In [3]:
records['ytid'] = records['raw'].str.replace('data/audiofiles/','').str.rsplit('_', n=2, expand=True)[0]
records['start'] = records['raw'].str.replace('data/audiofiles/','').str.rsplit('_', n=2, expand=True)[1]
records['end'] = records['raw'].str.replace('data/audiofiles/','').str.rsplit('_', n=2, expand=True)[2].str.replace('.wav','')
records.head(5)

Unnamed: 0,raw,ytid,start,end
0,data/audiofiles/--BfvyPmVMo_20000_30000.wav,--BfvyPmVMo,20000,30000
1,data/audiofiles/--U7joUcTCo_0_10000.wav,--U7joUcTCo,0,10000
2,data/audiofiles/--i-y1v8Hy8_0_9000.wav,--i-y1v8Hy8,0,9000
3,data/audiofiles/-0BIyqJj9ZU_30000_40000.wav,-0BIyqJj9ZU,30000,40000
4,data/audiofiles/-0CamVQdP_Y_0_6000.wav,-0CamVQdP_Y,0,6000


## How many files did we get?

In [4]:
print(f'''We have {len(records)} clips from {len(records['ytid'].unique())} videos''')

We have 4240 clips from 4240 videos


## Let's grab the list of tags to see what all we've got to work with

In [5]:
tags = pd.read_csv('./reference/audioset-human-readable-id-mapping.csv', names=['label_id', 'legible_id'])
tags.head(5)

Unnamed: 0,label_id,legible_id
0,/m/0dgw9r,Human sounds
1,/m/09l8g,Human voice
2,/m/09x0r,Speech
3,/m/05zppz,"Male speech, man speaking"
4,/m/02zsn,"Female speech, woman speaking"


### Convert those series into a dict we can use for remapping values in the full dataset

In [6]:
tag_map = dict(zip(tags['label_id'], tags['legible_id']))
tag_map['None'] = ''
tag_map['nan'] = ''

## Let's pull in the list of labels for each video in our dataset

In [7]:
# !ls -lah ../data/audioset
# ! head -20 ../data/audioset/eval_segments.csv
eval_segments = pd.read_csv('../data/audioset/eval_segments.csv',
                            engine='python',
                            names=['ytid','start','end','labels'],
                            skiprows=3,
                            skip_blank_lines=True,
                            quotechar='"',
                            quoting=2,
                            sep=', ')
eval_segments.head(5)

Unnamed: 0,ytid,start,end,labels
0,--4gqARaEJE,0.0,10.0,"""/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"""
1,--BfvyPmVMo,20.0,30.0,"""/m/03l9g"""
2,--U7joUcTCo,0.0,10.0,"""/m/01b_21"""
3,--i-y1v8Hy8,0.0,9.0,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"""
4,-0BIyqJj9ZU,30.0,40.0,"""/m/07rgt08,/m/07sq110,/t/dd00001"""


## Merge with our DLed data
There are about 5x as many records in the eval_segments frame as we have successfully DLed. If we merge first and then ignore everything that didn't match up, we can drastically reduce the number of labels we'll need to work with.

**N.B.**: our eval_segments start/end times are in seconds, not milliseconds. We'll need to convert them first.

### Cast the records start/end columns to numeric

In [8]:
records[['start','end']] = records[['start','end']].astype(int)
print(f'Start type: {records["start"].dtype}\nEnd type: {records["end"].dtype}')

Start type: int64
End type: int64


### While we're at it, let's also drop the 'raw' column, since it's not doing anything for us.

In [9]:
records.drop(columns=['raw'], inplace=True)
records.head(5)

Unnamed: 0,ytid,start,end
0,--BfvyPmVMo,20000,30000
1,--U7joUcTCo,0,10000
2,--i-y1v8Hy8,0,9000
3,-0BIyqJj9ZU,30000,40000
4,-0CamVQdP_Y,0,6000


### Let's do the same for the eval_segments, and multiply by 1000

In [10]:
eval_segments[['start','end']] = eval_segments[['start','end']].astype(int) * 1000
print(f'Start type: {records["start"].dtype}\nEnd type: {records["end"].dtype}')
eval_segments.head(5)

Start type: int64
End type: int64


Unnamed: 0,ytid,start,end,labels
0,--4gqARaEJE,0,10000,"""/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"""
1,--BfvyPmVMo,20000,30000,"""/m/03l9g"""
2,--U7joUcTCo,0,10000,"""/m/01b_21"""
3,--i-y1v8Hy8,0,9000,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"""
4,-0BIyqJj9ZU,30000,40000,"""/m/07rgt08,/m/07sq110,/t/dd00001"""


## Merge the `records` with the `eval_segments` so we have labels for our vids

In [11]:
labeled_vids = pd.merge(left=records,
                        right=eval_segments,
                        how='left',
                        on=['ytid', 'start', 'end'])

In [12]:
labeled_vids.head(10)

Unnamed: 0,ytid,start,end,labels
0,--BfvyPmVMo,20000,30000,"""/m/03l9g"""
1,--U7joUcTCo,0,10000,"""/m/01b_21"""
2,--i-y1v8Hy8,0,9000,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"""
3,-0BIyqJj9ZU,30000,40000,"""/m/07rgt08,/m/07sq110,/t/dd00001"""
4,-0CamVQdP_Y,0,6000,"""/m/04rlf,/m/07pbtc8,/m/09x0r"""
5,-0Gj8-vB1q4,30000,40000,"""/m/0140xf,/m/02cjck,/m/04rlf"""
6,-0RWZT-miFs,420000,430000,"""/m/03v3yw,/m/0k4j"""
7,-0YUDn-1yII,30000,40000,"""/m/02cjck,/m/04rlf"""
8,-0jeONf82dE,21000,31000,"""/m/03k3r,/m/04rlf,/m/07q5rw0,/m/09x0r,/m/0jbk"""
9,-0nqfRcnAYE,370000,380000,"""/m/04brg2"""


## Expand out the labels to individual columns

In [13]:
to_tally = pd.concat([labeled_vids, labeled_vids['labels'].str.strip('"') \
                      .str.split(',', expand=True)], axis=1)

After the splitting, the column names are actually integers. That'll trip us up later. Let's rename them here before we move on.

In [14]:
to_tally = to_tally.rename(columns = {x: f'label_{x+1}' for x in range(0,10)})

### What would it look like to remap the labels from label_id to legible_id?

In [15]:
to_tally.replace(tag_map)  # N.B.: Not persistent b/c no inplace=True

Unnamed: 0,ytid,start,end,labels,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10
0,--BfvyPmVMo,20000,30000,"""/m/03l9g""",Hammer,,,,,,,,,
1,--U7joUcTCo,0,10000,"""/m/01b_21""",Cough,,,,,,,,,
2,--i-y1v8Hy8,0,9000,"""/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005""",Music,Speech,Female singing,Child singing,,,,,,
3,-0BIyqJj9ZU,30000,40000,"""/m/07rgt08,/m/07sq110,/t/dd00001""","Chuckle, chortle",Belly laugh,Baby laughter,,,,,,,
4,-0CamVQdP_Y,0,6000,"""/m/04rlf,/m/07pbtc8,/m/09x0r""",Music,"Walk, footsteps",Speech,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,H692PyayBt4,0,10000,"""/m/0192l,/m/085jw""",Bagpipes,"Wind instrument, woodwind instrument",,,,,,,,
4236,H6BsnEYfyKg,140000,150000,"""/m/02x984l""",Mechanical fan,,,,,,,,,
4237,H6H3B1LLWw8,440000,450000,"""/m/0284vy3""",Train horn,,,,,,,,,
4238,H6IHH1g5dOc,30000,40000,"""/m/07qb_dv,/m/09x0r""",Scratch,Speech,,,,,,,,


## Categorize by containing human speech or no human speech

The AudioSet ontology is at least three levels deep, so we'll need to some unpacking to get the full list of human sounds (and then choose to tag only those sounds which are actual speech of one kind or another).

In [16]:
# !cat ../data/audioset/eval_segments.csv
!cat ../data/audioset/audioset-ontology.json

[
  {
    "id": "/m/0dgw9r",
    "name": "Human sounds",
    "description": "Sounds produced by the human body through the actions of the individual.",
    "citation_uri": "",
    "positive_examples": [],
    "child_ids": ["/m/09l8g", "/m/01w250", "/m/09hlz4", "/m/0bpl036", "/m/0160x5", "/m/0k65p", "/m/01jg02", "/m/04xp5v", "/t/dd00012"],
    "restrictions": ["abstract"]
  },
  {
    "id": "/m/09l8g",
    "name": "Human voice",
    "description": "The human voice consists of sound made by a human being using the vocal folds for talking, singing, laughing, crying, screaming, etc. The human voice is specifically a part of human sound production in which the vocal folds are the primary sound source.",
    "citation_uri": "http://en.wikipedia.org/wiki/Human_voice",
    "positive_examples": [],
    "child_ids": ["/m/09x0r", "/m/07p6fty", "/m/03qc9zr", "/m/02rtxlg", "/m/01j3sz", "/m/0463cq4", "/m/07qw_06", "/m/07plz5l", "/m/015lz1", "/m/02fxyj", "/m/07s2xch", "/m/07r4k75", "/

    "positive_examples": ["youtu.be/McKxt245g4w?start=30&end=40", "youtu.be/GBNj5Bsy6Fk?start=30&end=40", "youtu.be/X31JvVFCg-w?start=510&end=520", "youtu.be/gLdEFc0LNhI?start=100&end=110", "youtu.be/pd1CH9ukqEw?start=140&end=150", "youtu.be/V1F-0D3G0wc?start=220&end=230", "youtu.be/r2zp3mOVp0g?start=30&end=40", "youtu.be/XZzicSdJnIE?start=10&end=20", "youtu.be/JjGjX6lI-V8?start=230&end=240", "youtu.be/WTKUjbur9_c?start=30&end=40", "youtu.be/cU9AJygC17I?start=130&end=140", "youtu.be/B1TLRcWkYEI?start=20&end=30", "youtu.be/CLlaHYLiThk?start=30&end=40"],
    "child_ids": ["/m/09b5t", "/m/01rd7k", "/m/09ddx", "/m/0dbvp"],
    "restrictions": []
  },
  {
    "id": "/m/09b5t",
    "name": "Chicken, rooster",
    "description": "Sounds of the very common and widespread domesticated fowl, raised for its eggs and meat.",
    "citation_uri": "http://en.wikipedia.org/wiki/Chicken",
    "positive_examples": ["youtu.be/jmcpNukZnxA?start=400&end=410", "youtu.be/ky48d3dPjuE?start=20&end=30"

    "description": "Sounds of a music instrument with a piano-style keyboard, where pressing keys causes mechanical hammers to strike metal strings, metal reeds, or wire tines, leading to vibrations which are converted into electrical signals by magnetic pickups.",
    "citation_uri": "http://en.wikipedia.org/wiki/Electric_piano",
    "positive_examples": ["youtu.be/skeFawZe__U?start=30&end=40", "youtu.be/m484xjK104Y?start=20&end=30", "youtu.be/ExBXCryXOak?start=80&end=90", "youtu.be/-DDiBs4JIxc?start=140&end=150", "youtu.be/-YATTKBtmRA?start=190&end=200"],
    "child_ids": ["/m/025cbm", "/m/0bxl5"],
    "restrictions": []
  },
  {
    "id": "/m/025cbm",
    "name": "Clavinet",
    "description": "Sounds of an electrically-amplified keyboard instrument with a distinctive bright staccato sound, popular in funk, jazz-funk, rock, and soul songs.",
    "citation_uri": "http://en.wikipedia.org/wiki/Clavinet",
    "positive_examples": [],
    "child_ids": [],
    "restrictions":

    "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=tick",
    "positive_examples": ["youtu.be/aTcKrIlKI9o?start=30&end=40", "youtu.be/y-hd-Ol4ES4?start=30&end=40", "youtu.be/GgpH6htMmQQ?start=30&end=40", "youtu.be/xNZ0aU2X5uQ?start=30&end=40", "youtu.be/R6VUgJzDwBQ?start=30&end=40", "youtu.be/WupBieE-8Us?start=30&end=40", "youtu.be/YKtbQkTmvnY?start=30&end=40", "youtu.be/t6Hlse8lKmg?start=20&end=30", "youtu.be/8nS-KGITte8?start=30&end=40"],
    "child_ids": [],
    "restrictions": []
  },
  {
    "id": "/m/07qjznl",
    "name": "Tick-tock",
    "description": "A steady recurrent ticking sound as made by a clock.",
    "citation_uri": "http://wordnetweb.princeton.edu/perl/webwn?s=ticktock",
    "positive_examples": ["youtu.be/gD5Pft3FKdU?start=8&end=18", "youtu.be/wRnYhBhofVY?start=30&end=40", "youtu.be/pDjj6MiFNuc?start=30&end=40", "youtu.be/4ftDFi4684Y?start=30&end=40", "youtu.be/y5cRJZB83ZA?start=4&end=14"],
    "child_ids": [],
    "restrictions": []
  },

## Read in the ontology as-is

In [17]:
ontology = pd.read_json('../data/audioset/audioset-ontology.json')
ontology.head()

Unnamed: 0,id,name,description,citation_uri,positive_examples,child_ids,restrictions
0,/m/0dgw9r,Human sounds,Sounds produced by the human body through the ...,,[],"[/m/09l8g, /m/01w250, /m/09hlz4, /m/0bpl036, /...",[abstract]
1,/m/09l8g,Human voice,The human voice consists of sound made by a hu...,http://en.wikipedia.org/wiki/Human_voice,[],"[/m/09x0r, /m/07p6fty, /m/03qc9zr, /m/02rtxlg,...",[abstract]
2,/m/09x0r,Speech,Speech is the vocalized form of human communic...,http://en.wikipedia.org/wiki/Speech,"[youtu.be/8uI9H5jGRV8?start=30&end=40, youtu.b...","[/m/05zppz, /m/02zsn, /m/0ytgt, /m/01h8n0, /m/...",[]
3,/m/05zppz,"Male speech, man speaking",Speech uttered by an adult male human.,,"[youtu.be/6niRPYpLOpQ?start=30&end=40, youtu.b...",[],[]
4,/m/02zsn,"Female speech, woman speaking",Speech uttered by an adult female human.,,"[youtu.be/4l05nCOnIRg?start=30&end=40, youtu.b...",[],[]


### Create a data frame of all the tags we care about

We'll start by creating a placeholder. Once we've done that, we'll systematically work through the levels of the ontology to see which of the children also need to be dug into and pulled out. Once we've done _that_ we'll go ahead and revert to working with the broader ontology. (We want a fine-grained perspective on _sounds of interest_ - we can be grossly generic with the others sound labels.)

In [18]:
sounds_of_interest = pd.DataFrame(data = [['/m/0dgw9r', 'Human sounds', '-']],
                            columns=['label_id', 'legible_id', 'parent_legible'])
sounds_of_interest

Unnamed: 0,label_id,legible_id,parent_legible
0,/m/0dgw9r,Human sounds,-


### We'll add the children of the 'Human sounds' class

In [19]:
human_sounds_lvl2 = pd.DataFrame([c for c in ontology[ontology['name'] == 'Human sounds']['child_ids'][0]],
                                 columns=['label_id'])
human_sounds_lvl2['parent_legible'] = 'Human sounds'                                

In [20]:
sounds_of_interest = sounds_of_interest.append(human_sounds_lvl2)
sounds_of_interest['legible_id'] = sounds_of_interest['label_id'].map(tag_map)

### Then we'll add the children of the 'Speech' class

In [21]:
speech_sounds_lvl2 = pd.DataFrame([c for c in ontology[ontology['name'] == 'Speech']['child_ids']][0], columns=['label_id'])
speech_sounds_lvl2['parent_legible'] = 'Speech'

In [22]:
sounds_of_interest = sounds_of_interest.append(speech_sounds_lvl2)
sounds_of_interest['legible_id'] = sounds_of_interest['label_id'].map(tag_map)

In [23]:
sounds_of_interest

Unnamed: 0,label_id,legible_id,parent_legible
0,/m/0dgw9r,Human sounds,-
0,/m/09l8g,Human voice,Human sounds
1,/m/01w250,Whistling,Human sounds
2,/m/09hlz4,Respiratory sounds,Human sounds
3,/m/0bpl036,Human locomotion,Human sounds
4,/m/0160x5,Digestive,Human sounds
5,/m/0k65p,Hands,Human sounds
6,/m/01jg02,"Heart sounds, heartbeat",Human sounds
7,/m/04xp5v,Otoacoustic emission,Human sounds
8,/t/dd00012,Human group actions,Human sounds


### Then we'll add the children of the 'Human voice' class

In [24]:
human_voice_lvl3 = pd.DataFrame([c for c in ontology[ontology['name'] == 'Human voice']['child_ids']][0],
                                columns=['label_id'])
human_voice_lvl3['parent_legible'] = 'Human voice'

In [25]:
sounds_of_interest = sounds_of_interest.append(human_voice_lvl3)
sounds_of_interest['legible_id'] = sounds_of_interest['label_id'].map(tag_map)

### We'll also add children of the 'Human group actions' class

In [26]:
human_group_actions_lvl3 = pd.DataFrame([c for c in ontology[ontology['name'] == 'Human group actions']['child_ids']][0],
                                columns=['label_id'])
human_group_actions_lvl3['parent_legible'] = 'Human group actions'

In [27]:
sounds_of_interest = sounds_of_interest.append(human_group_actions_lvl3)
sounds_of_interest['legible_id'] = sounds_of_interest['label_id'].map(tag_map)

### The 'silence' tag would be useful - it lives in the 'source_ambiguous' children

In [28]:
source_ambiguous_lvl2 = pd.DataFrame([c for c in ontology[ontology['name']=='Source-ambiguous sounds']['child_ids']][0],
                                columns=['label_id'])
source_ambiguous_lvl2['parent_legible'] = 'Source-ambiguous sounds'

In [29]:
sounds_of_interest = sounds_of_interest.append(source_ambiguous_lvl2)
sounds_of_interest['legible_id'] = sounds_of_interest['label_id'].map(tag_map)

By default, labels are not-speech. So we'll make that our base condition.

In [30]:
sounds_of_interest['marvin_class'] = 'not_speech'

If the parent classes are 'Human voice' or 'Speech', we want to give it a 'speech' tag. We'll get more specific later.
(Recall that the 'where' syntax keeps the original EXCEPT where the condition is false, hence the negation.)

In [31]:
sounds_of_interest['marvin_class'].where(
    ~sounds_of_interest['parent_legible'].isin(['Human voice', 'Speech']
                                             ), 'speech', inplace=True)

If the specific class labels are speech-like, we'll also give them a 'speech' tag

In [32]:
sounds_of_interest['marvin_class'].where(
    ~sounds_of_interest['legible_id'].isin([
                        'Human voice',
                        'Cheering',
                        'Chatter',
                        'Hubbub, speech noise, speech babble',
                        'Crowd',                               # Not sure...we should discuss
                        'Booing']), 'speech', inplace=True)

Now we'll override the speech tags with more specific ones

In [33]:
sounds_of_interest['marvin_class'].where(
    sounds_of_interest['legible_id']!='Female speech, woman speaking',
                                      'female_speech', inplace=True)
sounds_of_interest['marvin_class'].where(
    sounds_of_interest['legible_id']!='Male speech, man speaking',
                                      'male_speech', inplace=True)
sounds_of_interest['marvin_class'].where(
    ~sounds_of_interest['legible_id'].isin(
        ['Children playing',
         'Children shouting',
         'Child speech, kid speaking']),
         'child_speech', inplace=True)
sounds_of_interest['marvin_class'].where(
    ~sounds_of_interest['legible_id'].isin(['silence']),
        'silence', inplace=True)

...and let's examine our handiwork:

In [34]:
sounds_of_interest

Unnamed: 0,label_id,legible_id,parent_legible,marvin_class
0,/m/0dgw9r,Human sounds,-,not_speech
0,/m/09l8g,Human voice,Human sounds,speech
1,/m/01w250,Whistling,Human sounds,not_speech
2,/m/09hlz4,Respiratory sounds,Human sounds,not_speech
3,/m/0bpl036,Human locomotion,Human sounds,not_speech
4,/m/0160x5,Digestive,Human sounds,not_speech
5,/m/0k65p,Hands,Human sounds,not_speech
6,/m/01jg02,"Heart sounds, heartbeat",Human sounds,not_speech
7,/m/04xp5v,Otoacoustic emission,Human sounds,not_speech
8,/t/dd00012,Human group actions,Human sounds,not_speech


It looks good!

We need to be able to map those crazy codes to speech classes. We probably want to be able to do the same with the human-readable labels as well.

In [35]:
speech_label_map = dict(zip(sounds_of_interest['label_id'], sounds_of_interest['marvin_class']))
speech_legible_map = dict(zip(sounds_of_interest['legible_id'], sounds_of_interest['marvin_class']))

Now we just need to make sure that anything _not_ on that list gets flagged as `not_speech` too.

In [36]:
ontology['marvin_class'] = 'not_speech'

Now we update the ontology's `'marvin_class'` field with the values we defined in our `sounds_of_interest` dataframe...

In [37]:
ontology['marvin_class'].where(
    ~ontology['id'].isin(
        sounds_of_interest['label_id']),
        ontology['id'].map(speech_label_map), inplace=True)

Verifying it looks as we'd expect...

In [38]:
ontology[['id','name','marvin_class']]

Unnamed: 0,id,name,marvin_class
0,/m/0dgw9r,Human sounds,not_speech
1,/m/09l8g,Human voice,speech
2,/m/09x0r,Speech,speech
3,/m/05zppz,"Male speech, man speaking",male_speech
4,/m/02zsn,"Female speech, woman speaking",female_speech
...,...,...,...
627,/m/025l19,Recording,not_speech
628,/m/07hvw1,Field recording,not_speech
629,/m/0174nj,Gramophone record,not_speech
630,/m/01www,Compact disc,not_speech


It looks good! Now let's ensure we're able to take those `marvin_class` mappings and use them to update our actual data frame of interest...

In [39]:
ontology_label_map = dict(zip(ontology['id'], ontology['marvin_class']))

We'll create a new dataframe to hold the results of our summing of these labels

In [40]:
tallied = to_tally.replace(ontology_label_map)

In [41]:
tallied['tot_speech_ct'] = tallied.loc[:,'label_1':'label_10'] \
                        .isin(['speech',
                               'child_speech',
                               'female_speech',
                               'male_speech']).sum(axis=1)
tallied['female_speech_ct'] = tallied.loc[:,'label_1':'label_10'] \
                        .isin(['female_speech']).sum(axis=1)
tallied['male_speech_ct'] = tallied.loc[:,'label_1':'label_10'] \
                        .isin(['male_speech']).sum(axis=1)
tallied['child_speech_ct'] = tallied.loc[:,'label_1':'label_10'] \
                        .isin(['child_speech']).sum(axis=1)
tallied['other_speech_ct'] = tallied.loc[:,'label_1':'label_10'] \
                        .isin(['speech']).sum(axis=1)
tallied['silence_ct'] = tallied.loc[:,'label_1':'label_10'] \
                        .isin(['silence']).sum(axis=1)

In [42]:
tallied[['female_speech_ct',
         'male_speech_ct',
         'child_speech_ct',
         'other_speech_ct',
         'tot_speech_ct',
         'silence_ct']].describe()

Unnamed: 0,female_speech_ct,male_speech_ct,child_speech_ct,other_speech_ct,tot_speech_ct,silence_ct
count,4240.0,4240.0,4240.0,4240.0,4240.0,4240.0
mean,0.003066,0.00283,0.014858,0.332075,0.35283,0.0
std,0.055293,0.05313,0.135704,0.541384,0.593303,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,2.0,4.0,5.0,0.0


In [43]:
tallied['labels'] = [[tag_map[tag] for tag in str(label).split(',')] for label in to_tally['labels'].str.strip('"')]

In [44]:
tallied[tallied['tot_speech_ct'] >= 1].loc[:,'labels':'tot_speech_ct']

Unnamed: 0,labels,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,tot_speech_ct
2,"[Music, Speech, Female singing, Child singing]",not_speech,speech,not_speech,not_speech,,,,,,,1
4,"[Music, Walk, footsteps, Speech]",not_speech,not_speech,speech,,,,,,,,1
8,"[Horse, Music, Neigh, whinny, Speech, Animal]",not_speech,not_speech,not_speech,speech,not_speech,,,,,,1
10,"[Firecracker, Speech]",not_speech,speech,,,,,,,,,1
13,"[Crackle, Speech]",not_speech,speech,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4226,"[Babbling, Tearing, Speech]",speech,not_speech,speech,,,,,,,,2
4228,"[Singing, Banjo, Guitar, Music, Mandolin, Musi...",speech,not_speech,not_speech,not_speech,not_speech,not_speech,not_speech,,,,1
4231,"[Crack, Whack, thwack, Wood, Speech, Outside, ...",not_speech,not_speech,not_speech,speech,not_speech,,,,,,1
4233,"[Laughter, Giggle, Speech, Baby laughter]",speech,not_speech,speech,not_speech,,,,,,,2


In [45]:
# tallied = tallied.replace(speech_label_map)
tallied

Unnamed: 0,ytid,start,end,labels,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,tot_speech_ct,female_speech_ct,male_speech_ct,child_speech_ct,other_speech_ct,silence_ct
0,--BfvyPmVMo,20000,30000,[Hammer],not_speech,,,,,,,,,,0,0,0,0,0,0
1,--U7joUcTCo,0,10000,[Cough],not_speech,,,,,,,,,,0,0,0,0,0,0
2,--i-y1v8Hy8,0,9000,"[Music, Speech, Female singing, Child singing]",not_speech,speech,not_speech,not_speech,,,,,,,1,0,0,0,1,0
3,-0BIyqJj9ZU,30000,40000,"[Chuckle, chortle, Belly laugh, Baby laughter]",not_speech,not_speech,not_speech,,,,,,,,0,0,0,0,0,0
4,-0CamVQdP_Y,0,6000,"[Music, Walk, footsteps, Speech]",not_speech,not_speech,speech,,,,,,,,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,H692PyayBt4,0,10000,"[Bagpipes, Wind instrument, woodwind instrument]",not_speech,not_speech,,,,,,,,,0,0,0,0,0,0
4236,H6BsnEYfyKg,140000,150000,[Mechanical fan],not_speech,,,,,,,,,,0,0,0,0,0,0
4237,H6H3B1LLWw8,440000,450000,[Train horn],not_speech,,,,,,,,,,0,0,0,0,0,0
4238,H6IHH1g5dOc,30000,40000,"[Scratch, Speech]",not_speech,speech,,,,,,,,,1,0,0,0,1,0


In [46]:
# tallied['marvin_classes'] = ' '
tallied['marvin_classes'] = [[label for label in labels if str(label) != 'None'] for labels in zip(tallied['label_1'],
                                tallied['label_2'],
                                tallied['label_3'],
                                tallied['label_4'],
                                tallied['label_5'],
                                tallied['label_6'],
                                tallied['label_7'],
                                tallied['label_9'],
                                tallied['label_10'])]                            

In [50]:
tallied.to_csv('summary_of_available_audioset_training_data.csv', columns=['labels',
                        'tot_speech_ct',
                        'child_speech_ct',
                        'female_speech_ct',
                        'male_speech_ct',
                        'other_speech_ct',
                        'silence_ct','marvin_classes'])

In [49]:
tallied

Unnamed: 0,ytid,start,end,labels,label_1,label_2,label_3,label_4,label_5,label_6,...,label_8,label_9,label_10,tot_speech_ct,female_speech_ct,male_speech_ct,child_speech_ct,other_speech_ct,silence_ct,marvin_classes
0,--BfvyPmVMo,20000,30000,[Hammer],not_speech,,,,,,...,,,,0,0,0,0,0,0,[not_speech]
1,--U7joUcTCo,0,10000,[Cough],not_speech,,,,,,...,,,,0,0,0,0,0,0,[not_speech]
2,--i-y1v8Hy8,0,9000,"[Music, Speech, Female singing, Child singing]",not_speech,speech,not_speech,not_speech,,,...,,,,1,0,0,0,1,0,"[not_speech, speech, not_speech, not_speech]"
3,-0BIyqJj9ZU,30000,40000,"[Chuckle, chortle, Belly laugh, Baby laughter]",not_speech,not_speech,not_speech,,,,...,,,,0,0,0,0,0,0,"[not_speech, not_speech, not_speech]"
4,-0CamVQdP_Y,0,6000,"[Music, Walk, footsteps, Speech]",not_speech,not_speech,speech,,,,...,,,,1,0,0,0,1,0,"[not_speech, not_speech, speech]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,H692PyayBt4,0,10000,"[Bagpipes, Wind instrument, woodwind instrument]",not_speech,not_speech,,,,,...,,,,0,0,0,0,0,0,"[not_speech, not_speech]"
4236,H6BsnEYfyKg,140000,150000,[Mechanical fan],not_speech,,,,,,...,,,,0,0,0,0,0,0,[not_speech]
4237,H6H3B1LLWw8,440000,450000,[Train horn],not_speech,,,,,,...,,,,0,0,0,0,0,0,[not_speech]
4238,H6IHH1g5dOc,30000,40000,"[Scratch, Speech]",not_speech,speech,,,,,...,,,,1,0,0,0,1,0,"[not_speech, speech]"


In [55]:
100*len(tallied[tallied['female_speech_ct']>1])/len(tallied)

0.0