In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import itertools

import numpy as np
import seaborn as sns
import pandas as pd

from paderbox.visualization import plot, context_manager
from padercontrib.database.keys import *
from padercontrib.database.iterator import AudioReader
from paderbox.transform import stft
from paderbox.array import interval
from padercontrib.visualization.database_to_html import display_database_html

from padercontrib.database.fearless import Fearless

In [3]:
db = Fearless()
db

Fearless(PosixPath('/net/vol/jenkins/jsons/fearless.json'))

In [4]:
db.dataset_names

('Dev_SID',
 'Dev_segment',
 'Dev_stream',
 'Eval_SID',
 'Eval_segment',
 'Eval_stream',
 'Train_SID',
 'Train_segment',
 'Train_stream')

In [5]:
datasets = [dataset for dataset in db.dataset_names if "segment" in dataset]
datasets

['Dev_segment', 'Eval_segment', 'Train_segment']

In [6]:
# Working with ['Dev_segment', 'Eval_segment', 'Train_segment'] Sunday 9.39am

### 1. Dev Segment

In [7]:
dataset_Dev_seg = db.get_dataset('Dev_segment')
df_dev_seg = pd.DataFrame(dataset_Dev_seg)

In [8]:
df_dev_seg = df_dev_seg.drop(columns=['audio_path'])
df_dev_seg['totalwords'] = df_dev_seg['transcription'].str.split().str.len()
WL = df_dev_seg['transcription']
df_dev_seg['totl_wrd_lngth'] = WL.str.split().explode().str.len().sum(level=0)
df_dev_seg['time_in_sec'] = df_dev_seg['num_samples']/8000
df_dev_seg.head()

Unnamed: 0,num_samples,speaker_id,transcription,example_id,dataset,totalwords,totl_wrd_lngth,time_in_sec
0,37280,GNC1,ROG AND WED LIKE UH ZERO AND FOUR ONES [unk],FS02_dev_0001,Dev_segment,10,35,4.66
1,56720,CAPCOM1,COLUMBIA THIS IS HOUSTON DID YOU COPY L O S A ...,FS02_dev_0002,Dev_segment,17,53,7.09
2,49520,BUZZ,THE UH PANORAMA ILL BE TAKING IS ABOUT THIRTY ...,FS02_dev_0003,Dev_segment,15,58,6.19
3,54880,GNC4,TEST WOULD BE CYCLING AT ONE COUNT PER FIVE SE...,FS02_dev_0004,Dev_segment,22,107,6.86
4,47120,RETRO1,DID UH [unk] SOMETHING ABOUT P TWENTY DATA THA...,FS02_dev_0005,Dev_segment,14,56,5.89


In [9]:
df_dev_seg['transcription'][0]

'ROG AND WED LIKE UH ZERO AND FOUR ONES [unk]'

In [10]:
print(df_dev_seg['transcription'])

0            ROG AND WED LIKE UH ZERO AND FOUR ONES [unk]
1       COLUMBIA THIS IS HOUSTON DID YOU COPY L O S A ...
2       THE UH PANORAMA ILL BE TAKING IS ABOUT THIRTY ...
3       TEST WOULD BE CYCLING AT ONE COUNT PER FIVE SE...
4       DID UH [unk] SOMETHING ABOUT P TWENTY DATA THA...
                              ...                        
9198                                              WERE GO
9199    ROGER AFTER UH THIS FIRST P FIFTY SEVEN YOU WA...
9200                    ROGER ONE TWO THREE THREE TWO ONE
9201                               CHARLIE LETS GO TO ONE
9202                                   I THINK HE WILL BE
Name: transcription, Length: 9203, dtype: object


In [11]:
#1. Average speech activity per speaker in seconds, dataset = Dev_segment
avg_speech_activity = (df_dev_seg
   .groupby(['speaker_id'], as_index=False)
   .time_in_sec.sum()
   .to_dict('r'))
avg_speech_activity

[{'speaker_id': 'AFD1', 'time_in_sec': 51.92999999999999},
 {'speaker_id': 'AFD3', 'time_in_sec': 4.27},
 {'speaker_id': 'AFD5', 'time_in_sec': 26.160000000000004},
 {'speaker_id': 'AGC', 'time_in_sec': 4.02},
 {'speaker_id': 'ALDS', 'time_in_sec': 21.339999999999996},
 {'speaker_id': 'ALSEP', 'time_in_sec': 29.599999999999998},
 {'speaker_id': 'ANTIGUA', 'time_in_sec': 1.25},
 {'speaker_id': 'ARIA', 'time_in_sec': 12.79},
 {'speaker_id': 'ASCENSION', 'time_in_sec': 11.719999999999999},
 {'speaker_id': 'ASCENSIONCOMMTECH', 'time_in_sec': 43.47999999999999},
 {'speaker_id': 'BERMUDA', 'time_in_sec': 2.29},
 {'speaker_id': 'BOB', 'time_in_sec': 21.08},
 {'speaker_id': 'BOOSTER1', 'time_in_sec': 36.94},
 {'speaker_id': 'BOOSTER2', 'time_in_sec': 4.9399999999999995},
 {'speaker_id': 'BTU', 'time_in_sec': 18.35},
 {'speaker_id': 'BUZZ', 'time_in_sec': 3703.0899999999983},
 {'speaker_id': 'CALIFORNIA', 'time_in_sec': 2.23},
 {'speaker_id': 'CANARY', 'time_in_sec': 17.06},
 {'speaker_id': 'CA

In [12]:
#2. no. of speakers in a dataset, dataset = Dev_segment
speakers_dev_seg = df_dev_seg['speaker_id']
speakers_dev_seg = set(speakers_dev_seg)
len(speakers_dev_seg)

201

In [13]:
#3. No. of words per dataset, dataset = Dev_segment
number_of_words_dev_seg = [len(x.split()) for x in df_dev_seg['transcription'].tolist()]
print('number of unique words',sum(set(number_of_words_dev_seg)))

number of unique words 3626


In [14]:
#4. Avg word length per dataset, dataset = Dev_segment
sum(df_dev_seg.totl_wrd_lngth)/sum(df_dev_seg.totalwords)

3.8697010930514226

### 2. Train Segment

In [15]:
dataset_Train_seg = db.get_dataset('Train_segment')
df_tr_seg = pd.DataFrame(dataset_Train_seg)

In [16]:
#df_tr_seg = df_tr_seg.drop(columns=['audio_path'])
df_tr_seg['totalwords'] = df_tr_seg['transcription'].str.split().str.len()
WL_tr_seg = df_tr_seg['transcription']
df_tr_seg['totl_wrd_lngth'] = WL_tr_seg.str.split().explode().str.len().sum(level=0)
df_tr_seg['time_in_sec'] = df_tr_seg['num_samples']/8000
df_tr_seg.head()

Unnamed: 0,audio_path,num_samples,speaker_id,transcription,example_id,dataset,totalwords,totl_wrd_lngth,time_in_sec
0,{'observation': '/net/db/fearless/Audio/Segmen...,7760,NETWORK5,ROGER,FS02_train_00001,Train_segment,1,5.0,0.97
1,{'observation': '/net/db/fearless/Audio/Segmen...,16720,CONTROL2,OKAY WERE CLEAN UH ON [unk],FS02_train_00002,Train_segment,6,22.0,2.09
2,{'observation': '/net/db/fearless/Audio/Segmen...,23360,COLLINS,UPTEL I U TO ACCEPT,FS02_train_00003,Train_segment,5,15.0,2.92
3,{'observation': '/net/db/fearless/Audio/Segmen...,11760,GNC2,ROG,FS02_train_00004,Train_segment,1,3.0,1.47
4,{'observation': '/net/db/fearless/Audio/Segmen...,6240,UNK,[unk] ON THE,FS02_train_00005,Train_segment,3,10.0,0.78


In [17]:
#1. Average speech activity per speaker in seconds, dataset = Train_segment
avg_speech_activity_tr_seg = (df_tr_seg
   .groupby(['speaker_id'], as_index=False)
   .time_in_sec.sum()
   .to_dict('r'))
avg_speech_activity_tr_seg

[{'speaker_id': 'AFD1', 'time_in_sec': 176.57999999999998},
 {'speaker_id': 'AFD2', 'time_in_sec': 18.51},
 {'speaker_id': 'AFD3', 'time_in_sec': 62.23},
 {'speaker_id': 'AFD4', 'time_in_sec': 7.08},
 {'speaker_id': 'AFD5', 'time_in_sec': 688.3400000000001},
 {'speaker_id': 'AGC', 'time_in_sec': 74.89999999999999},
 {'speaker_id': 'ALDS', 'time_in_sec': 20.32},
 {'speaker_id': 'ALSEP', 'time_in_sec': 40.370000000000005},
 {'speaker_id': 'ANTIGUA', 'time_in_sec': 21.69},
 {'speaker_id': 'ARIA', 'time_in_sec': 49.40999999999999},
 {'speaker_id': 'ASCENSION', 'time_in_sec': 765.4399999999997},
 {'speaker_id': 'ASCENSIONCOMMTECH', 'time_in_sec': 91.24000000000001},
 {'speaker_id': 'BERMUDA', 'time_in_sec': 44.89000000000001},
 {'speaker_id': 'BOOSTER1', 'time_in_sec': 458.35999999999996},
 {'speaker_id': 'BUZZ', 'time_in_sec': 6496.479999999987},
 {'speaker_id': 'CALIFORNIA', 'time_in_sec': 2.41},
 {'speaker_id': 'CANARY', 'time_in_sec': 212.71999999999994},
 {'speaker_id': 'CANARYCOMMTECH

In [18]:
#2. no. of speakers in a dataset, dataset = Train_segment
speakers_tr_seg = df_tr_seg['speaker_id']
speakers_tr_seg = set(speakers_tr_seg)
len(speakers_tr_seg)

256

In [19]:
#3. No. of words per dataset, dataset = Train_segment
number_of_words_tr_seg = [len(x.split()) for x in df_tr_seg['transcription'].tolist()]
sum(number_of_words_tr_seg)

262545

In [20]:
#4. Avg word length per dataset, dataset = Train_segment
sum(df_tr_seg.totl_wrd_lngth)/sum(df_tr_seg.totalwords)

3.9228627473385513

In [21]:
#5. Speaker intersection between datasets, dataset = Dev_segment
speaker_intersection_between_datasets = np.intersect1d(df_dev_seg.speaker_id, 
                                        np.intersect1d(df_tr_seg.speaker_id, df_dev_seg.speaker_id))
print(speaker_intersection_between_datasets)

['AFD1' 'AFD3' 'AFD5' 'AGC' 'ALDS' 'ALSEP' 'ANTIGUA' 'ARIA' 'ASCENSION'
 'ASCENSIONCOMMTECH' 'BERMUDA' 'BOOSTER1' 'BUZZ' 'CALIFORNIA' 'CANARY'
 'CAPCOM1' 'CAPCOM2' 'CARNARVON' 'CCATSCOMMAND' 'CCATSTELEMETRY' 'CLTC'
 'COLLINS' 'COMM' 'COMMANAGER1' 'COMMANAGER3' 'COMMCONTROL1'
 'COMMCONTROL2' 'COMMCONTROL3' 'COMMCONTROL4' 'COMPUTERMNO' 'COMPUTERTM'
 'CONSUMABLES' 'CONTROL1' 'CONTROL2' 'CPSPK' 'CPSS' 'CTSC' 'CVTS'
 'DATACORE' 'DEPUTYLOM' 'DISPLAY' 'ECS1' 'ECS2' 'EECOM1' 'EECOM2' 'EECOM3'
 'EECOM5' 'EMU' 'EPE' 'EPS2' 'EPS3' 'EVA' 'EXPERIMENTS' 'FAO1' 'FAO2'
 'FAO3' 'FD1' 'FD2' 'FD3' 'FEMALE2' 'FIDO1' 'FIDO3' 'FORTYEIGHT' 'GBM'
 'GCC' 'GMIL' 'GNC1' 'GNC4' 'GNN1' 'GNN2' 'GODDARDOPS' 'GODDARDVOICE'
 'GOLDSTONE' 'GOLDSTONECOMMTECH' 'GOLDSTONEVIDEO' 'GUAM' 'GUAYMAS'
 'GUIDANCE1' 'GUIDANCE2' 'GUIDANCE3' 'GUIDANCE4' 'HAWAII' 'HONEYSUCKLE'
 'HOUSTONCOMMTECH1' 'HOUSTONCOMMTECH2' 'HOUSTONCOMMTECH3' 'HOUSTONTV1'
 'HUNTSVILLE' 'INCO1' 'INCO3' 'INCO4' 'LC' 'LMPROPULSION' 'LOADCONTROL'
 'MADRID' 'MADRID

In [22]:
len(speaker_intersection_between_datasets)

167

In [None]:
#6. Length of silence between utterances as histogram:

### 3. Train_SID

In [23]:
dataset_Train_SID = db.get_dataset('Train_SID')
df_tr_sid = pd.DataFrame(dataset_Train_SID)
df_tr_sid['time_in_sec'] = df_tr_sid['num_samples']/8000
df_tr_sid.head()

Unnamed: 0,audio_path,num_samples,speaker_id,example_id,dataset,time_in_sec
0,{'observation': '/net/db/fearless/Audio/Segmen...,27040,FD1,FS02_SID_train_00001,Train_SID,3.38
1,{'observation': '/net/db/fearless/Audio/Segmen...,32720,BUZZ,FS02_SID_train_00002,Train_SID,4.09
2,{'observation': '/net/db/fearless/Audio/Segmen...,29520,COMMCONTROL3,FS02_SID_train_00003,Train_SID,3.69
3,{'observation': '/net/db/fearless/Audio/Segmen...,34240,EECOM5,FS02_SID_train_00004,Train_SID,4.28
4,{'observation': '/net/db/fearless/Audio/Segmen...,30640,EECOM3,FS02_SID_train_00005,Train_SID,3.83


In [24]:
#1. Average speech activity per speaker in seconds, dataset = Train_SID
avg_speech_activity_tr_sid = (df_tr_sid
   .groupby(['speaker_id'], as_index=False)
   .time_in_sec.sum()
   .to_dict('r'))
avg_speech_activity_tr_sid

[{'speaker_id': 'AFD1', 'time_in_sec': 190.31999999999996},
 {'speaker_id': 'AFD3', 'time_in_sec': 64.99},
 {'speaker_id': 'AFD5', 'time_in_sec': 428.3900000000002},
 {'speaker_id': 'AGC', 'time_in_sec': 89.32000000000001},
 {'speaker_id': 'AGCSUPPORT', 'time_in_sec': 12.25},
 {'speaker_id': 'ALDS', 'time_in_sec': 26.270000000000003},
 {'speaker_id': 'ALSEP', 'time_in_sec': 115.1},
 {'speaker_id': 'ANTIGUA', 'time_in_sec': 15.950000000000001},
 {'speaker_id': 'ARIA', 'time_in_sec': 80.26},
 {'speaker_id': 'ASCENSION', 'time_in_sec': 694.7799999999999},
 {'speaker_id': 'ASCENSIONCOMMTECH', 'time_in_sec': 101.42},
 {'speaker_id': 'BERMUDA', 'time_in_sec': 52.97},
 {'speaker_id': 'BERMUDACOMMTECH', 'time_in_sec': 11.01},
 {'speaker_id': 'BOB', 'time_in_sec': 7.78},
 {'speaker_id': 'BOOSTER1', 'time_in_sec': 534.1599999999996},
 {'speaker_id': 'BUZZ', 'time_in_sec': 8986.029999999988},
 {'speaker_id': 'CALIFORNIA', 'time_in_sec': 10.27},
 {'speaker_id': 'CANARY', 'time_in_sec': 179.5},
 {'

In [25]:
#2. no. of speakers in a dataset, dataset = Train_SID
speakers_tr_sid = df_tr_sid['speaker_id']
speakers_tr_sid = set(speakers_tr_sid)
len(speakers_tr_sid)

218

### 4. Dev_SID

In [26]:
dataset_Dev_SID = db.get_dataset('Dev_SID')
df_dev_sid = pd.DataFrame(dataset_Dev_SID)
df_dev_sid['time_in_sec'] = df_dev_sid['num_samples']/8000
df_dev_sid.head()

Unnamed: 0,audio_path,num_samples,speaker_id,example_id,dataset,time_in_sec
0,{'observation': '/net/db/fearless/Audio/Segmen...,25760,NETWORK5,FS02_SID_dev_0001,Dev_SID,3.22
1,{'observation': '/net/db/fearless/Audio/Segmen...,33440,CAPCOM1,FS02_SID_dev_0002,Dev_SID,4.18
2,{'observation': '/net/db/fearless/Audio/Segmen...,27760,FD3,FS02_SID_dev_0003,Dev_SID,3.47
3,{'observation': '/net/db/fearless/Audio/Segmen...,33760,FIDO1,FS02_SID_dev_0004,Dev_SID,4.22
4,{'observation': '/net/db/fearless/Audio/Segmen...,35760,FIDO1,FS02_SID_dev_0005,Dev_SID,4.47


In [27]:
#1. Average speech activity per speaker in seconds, dataset = Train_SID
avg_speech_activity_dev_sid = (df_dev_sid
   .groupby(['speaker_id'], as_index=False)
   .time_in_sec.sum()
   .to_dict('r'))
avg_speech_activity_dev_sid

[{'speaker_id': 'AFD1', 'time_in_sec': 41.02},
 {'speaker_id': 'AFD3', 'time_in_sec': 17.4},
 {'speaker_id': 'AFD5', 'time_in_sec': 97.66999999999997},
 {'speaker_id': 'AGC', 'time_in_sec': 22.63},
 {'speaker_id': 'AGCSUPPORT', 'time_in_sec': 3.44},
 {'speaker_id': 'ALDS', 'time_in_sec': 10.04},
 {'speaker_id': 'ALSEP', 'time_in_sec': 26.489999999999995},
 {'speaker_id': 'ANTIGUA', 'time_in_sec': 8.64},
 {'speaker_id': 'ARIA', 'time_in_sec': 18.93},
 {'speaker_id': 'ASCENSION', 'time_in_sec': 171.13999999999996},
 {'speaker_id': 'ASCENSIONCOMMTECH', 'time_in_sec': 27.54},
 {'speaker_id': 'BERMUDA', 'time_in_sec': 16.009999999999998},
 {'speaker_id': 'BERMUDACOMMTECH', 'time_in_sec': 3.76},
 {'speaker_id': 'BOB', 'time_in_sec': 7.699999999999999},
 {'speaker_id': 'BOOSTER1', 'time_in_sec': 124.89999999999999},
 {'speaker_id': 'BUZZ', 'time_in_sec': 2060.560000000001},
 {'speaker_id': 'CALIFORNIA', 'time_in_sec': 4.17},
 {'speaker_id': 'CANARY', 'time_in_sec': 40.489999999999995},
 {'spe

In [28]:
#2. no. of speakers in a dataset, dataset = Dev_SID
speakers_dev_sid = df_dev_sid['speaker_id']
speakers_dev_sid = set(speakers_dev_sid)
len(speakers_dev_sid)

218

In [30]:
set(speakers_tr_sid).issubset(speakers_dev_sid)

True

In [31]:
#5. Speaker intersection between datasets, dataset = Dev_SID & Train_SID
speaker_intersection_between_datasets_SID = np.intersect1d(df_dev_sid.speaker_id, 
                                        np.intersect1d(df_tr_sid.speaker_id, df_dev_sid.speaker_id))
print(speaker_intersection_between_datasets_SID, 'length: ',len(speaker_intersection_between_datasets_SID))

['AFD1' 'AFD3' 'AFD5' 'AGC' 'AGCSUPPORT' 'ALDS' 'ALSEP' 'ANTIGUA' 'ARIA'
 'ASCENSION' 'ASCENSIONCOMMTECH' 'BERMUDA' 'BERMUDACOMMTECH' 'BOB'
 'BOOSTER1' 'BUZZ' 'CALIFORNIA' 'CANARY' 'CANARYCOMMTECH' 'CAPCOM1'
 'CAPCOM2' 'CAPCOM3' 'CAPCOM4' 'CARNARVON' 'CCATSCOMMAND' 'CCATSTELEMETRY'
 'CLTC' 'COLLINS' 'COMM' 'COMMANAGER1' 'COMMCONTROL1' 'COMMCONTROL2'
 'COMMCONTROL3' 'COMMCONTROL4' 'COMPUTERMNO' 'CONSUMABLES' 'CONTROL1'
 'CONTROL2' 'CPSPK' 'CPSS' 'CSANINE' 'CSAT' 'CTSC' 'CVTS' 'DEPUTYLOM'
 'DISPLAY' 'DOCUMENTATION' 'DONHUGHES' 'DYNAMICS' 'ECS1' 'ECS2' 'ECS3'
 'ECS4' 'EECOM1' 'EECOM2' 'EECOM3' 'EECOM5' 'EMU' 'EPE' 'EPS1' 'EPS2'
 'EPS3' 'ERNIE' 'EVA' 'EXPERIMENTS' 'FAO1' 'FAO2' 'FAO3' 'FD1' 'FD2' 'FD3'
 'FEMALE2' 'FEMALE3' 'FIDO1' 'FIDO2' 'FIDO3' 'FLIGHTPLAN1' 'FLIGHTPLAN2'
 'FLIGHTPLANSUPPORT' 'FORTYEIGHT' 'FRANK' 'GBM' 'GCC' 'GEORGE' 'GMIL'
 'GNC1' 'GNC2' 'GNC3' 'GNC4' 'GNN1' 'GNN2' 'GNN3' 'GNN4' 'GODDARDOPS'
 'GODDARDVOICE' 'GOLDSTONE' 'GOLDSTONECOMMTECH' 'GOLDSTONEVIDEO' 'GUAM'
 'GUAYM

In [32]:
set(speaker_intersection_between_datasets).issubset(speaker_intersection_between_datasets_SID)

False

### Eval_SID

In [None]:
dataset_Eval_SID = db.get_dataset('Eval_SID')
df_eval_sid = pd.DataFrame(dataset_Eval_SID)
df_eval_sid['time_in_sec'] = df_eval_sid['num_samples']/8000
df_eval_sid.head()

## Streams

In [None]:
datasets = [
 'Dev_stream',
 'Eval_stream',
 'Train_stream',
]
datasets

### Dev Stream

In [None]:
dataset_Dev_str = db.get_dataset('Dev_stream')
df_dev_str = pd.DataFrame(dataset_Dev_str)
#df_dev_str = df_dev_str.drop(columns=['audio_path', 'example_id'])
df_dev_str.head()

In [None]:
diff = []
for i in range(len(df_dev_str)):
    diff.append((np.subtract(df_dev_str['end'][i], df_dev_str['start'][i])/8000))
df_dev_str['speech_time'] = diff 

In [None]:
#1. Average speech activity per speaker in seconds, dataset = Dev_stream
speech_diary = []
for i in range(len(df_dev_str)):
    tempDF = df_dev_str.loc[[i],['speaker_id','speech_time']]
    speaker_id = tempDF['speaker_id'].to_list()
    speech_time = tempDF['speech_time'].to_list()
    tempdf = pd.DataFrame({'speaker_id': speaker_id[0],
                   'speech_time': speech_time[0]})

    avg_speech_activity = (tempdf
       .groupby(['speaker_id'], as_index=False)
       .speech_time.sum()
       .to_dict('r'))
    
    speech_diary+=avg_speech_activity

In [None]:
speech_diary_df = pd.DataFrame(speech_diary)
avg_speech_activity = (speech_diary_df
       .groupby(['speaker_id'], as_index=False)
       .speech_time.sum()
       .to_dict('r'))
avg_speech_activity

In [None]:
#2. no. of speakers in a dataset, dataset = Dev_stream
len(avg_speech_activity)


In [None]:
#3. no. of words per dataset, dataset = Dev_stream
df_dev_str['liststring'] = [' '.join(map(str, l)) for l in df_dev_str['transcription']]
df_dev_str['totalwords'] = df_dev_str['liststring'].str.split().str.len()

df_dev_str.head()

In [None]:
sum(df_dev_str['totalwords'])

In [None]:
#4. Avg word length per dataset, dataset = Dev_stream
df_dev_str['totalwords'] = df_dev_str['liststring'].str.split().str.len()
WL = df_dev_str['liststring']
df_dev_str['totl_wrd_lngth'] = WL.str.split().explode().str.len().sum(level=0)
sum(df_dev_str.totl_wrd_lngth)/sum(df_dev_str.totalwords)

### Train Stream

In [None]:
dataset_Train_str = db.get_dataset('Train_stream')
df_tr_str = pd.DataFrame(dataset_Train_str)
df_tr_str = df_tr_str.drop(columns=['audio_path', 'example_id'])
df_tr_str.head()

In [None]:
diff = []
for i in range(len(df_tr_str)):
    diff.append((np.subtract(df_tr_str['end'][i], df_tr_str['start'][i])/8000))
df_tr_str['speech_time'] = diff 

In [None]:
#1. Average speech activity per speaker in seconds, dataset = Train_stream
speech_diary = []
for i in range(len(df_tr_str)):
    tempDF = df_tr_str.loc[[i],['speaker_id','speech_time']]
    speaker_id = tempDF['speaker_id'].to_list()
    speech_time = tempDF['speech_time'].to_list()
    tempdf = pd.DataFrame({'speaker_id': speaker_id[0],
                   'speech_time': speech_time[0]})

    avg_speech_activity = (tempdf
       .groupby(['speaker_id'], as_index=False)
       .speech_time.sum()
       .to_dict('r'))
    
    speech_diary+=avg_speech_activity

In [None]:
speech_diary_df = pd.DataFrame(speech_diary)
avg_speech_activity = (speech_diary_df
       .groupby(['speaker_id'], as_index=False)
       .speech_time.sum()
       .to_dict('r'))
avg_speech_activity

In [None]:
#2. no. of speakers in a dataset, dataset = Train_segment
len(avg_speech_activity)

In [None]:
#3. No. of words per dataset, dataset = Train_segment

In [None]:
df_tr_str['liststring'] = [' '.join(map(str, l)) for l in df_tr_str['transcription']]
df_tr_str['totalwords'] = df_tr_str['liststring'].str.split().str.len()

df_tr_str.head()

In [None]:
sum(df_tr_str['totalwords'])

In [None]:
#4. Avg word length per dataset, dataset = Dev_stream
df_tr_str['totalwords'] = df_tr_str['liststring'].str.split().str.len()
WL = df_tr_str['liststring']
df_tr_str['totl_wrd_lngth'] = WL.str.split().explode().str.len().sum(level=0)
print('Average word lengh for Dev_stream dataset is: ', sum(df_tr_str.totl_wrd_lngth)/sum(df_tr_str.totalwords))

In [None]:
#5. Speaker intersection between datasets, dataset = Dev_stream, Train_stream
result_between_dataset = []
for i in range(len(dataset_Dev_str)):
    for j in range(len(dataset_Train_str)):
        result_between_dataset += set(dataset_Dev_str[i]['speaker_id']).intersection(dataset_Train_str[j]['speaker_id'])
print('Speaker intersection between Dev_stream and Train_stream: ', len(set(result_between_dataset)))

In [None]:
#6. Length of silence between utterances, dataset = Dev_stream

In [None]:
import matplotlib.pyplot as plt

In [None]:
append_ = []
for i in range(len(df_dev_str)):
    activity = db.get_activity(dataset_Dev_str[i])
    no_speech = interval.zeros(activity.shape[0])
    intervals = ((0, 0), *activity.intervals, (activity.shape[0], activity.shape[0]))
    no_speech.intervals = [
        (start[1], end[0]) for start, end in zip(intervals[:-1], intervals[1:])]
    append_.append(no_speech.intervals)

for j in range(len(df_dev_str)):
    left_values = []
    duration_values = []
    for start, end in append_[j]:
        left_values.append(start)
        duration_values.append(end-start)
    x = duration_values 
    bins = left_values
    fig, ax = plt.subplots(figsize=(20, 10))
    plt.hist(np.clip(x, bins[0], bins[-1]), bins=bins, label='%s x' % j)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
append_ = []
for i in range(len(df_tr_str)):
    activity = db.get_activity(dataset_Train_str[i])
    no_speech = interval.zeros(activity.shape[0])
    intervals = ((0, 0), *activity.intervals, (activity.shape[0], activity.shape[0]))
    no_speech.intervals = [
        (start[1], end[0]) for start, end in zip(intervals[:-1], intervals[1:])]
    append_.append(no_speech.intervals)

for j in range(len(df_tr_str)):
    left_values = []
    duration_values = []
    for start, end in append_[j]:
        left_values.append(start)
        duration_values.append(end-start)
    x = duration_values 
    bins = left_values
    fig, ax = plt.subplots(figsize=(20, 10))
    plt.hist(np.clip(x, bins[0], bins[-1]), bins=bins, label='%s x' % j)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
#activity = db.get_activity(dataset_Dev_str[1])
#30 min long audio recordings 125
#activity