In [1]:
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Load the tab-separated word table from disk and display it.
# read_table uses a tab delimiter by default, so no explicit sep is needed.
corpus = pd.read_table('./data_corpus/buckeye_words.tsv')
corpus

Unnamed: 0,id,file,speaker,start,word,word_dur,phonemes,phones,pos
0,0,s0101a,s01,0.000,{B_TRANS},0.102385,B,B,
1,1,s0101a,s01,0.102,<SIL>,4.173359,S,S,
2,2,s0101a,s01,4.276,<NOISE>,4.237774,U,U,
3,3,s0101a,s01,8.514,<IVER>,23.703057,S,S,
4,4,s0101a,s01,32.217,okay,0.405470,ow k ey,k ay,NN
...,...,...,...,...,...,...,...,...,...
348399,348399,s4004a,s40,85.555,<SIL>,1.227988,S,S,
348400,348400,s4004a,s40,86.783,<IVER>,3.400615,S,S,
348401,348401,s4004a,s40,90.183,all,0.294289,aa l,ow,DT
348402,348402,s4004a,s40,90.478,right,0.253473,r ay t,er eh tq,NN


In [3]:
# Write the frequency of every distinct value in the `word` column to disk.
# The target is named .tsv, so emit tabs explicitly: to_csv defaults to commas,
# which would produce a comma-separated file with a misleading extension.
corpus.word.value_counts().to_csv('counts.tsv', sep='\t')


In [4]:
# Show every row whose word contains the '<HES' marker.
corpus[corpus['word'].str.contains('<HES')]

Unnamed: 0,id,file,speaker,start,word,word_dur,phonemes,phones,pos
52,52,s0101a,s01,61.143,<HES-I>,0.156922,U,U,
326,326,s0101a,s01,161.918,<HES-uh>,0.131541,U,U,
399,399,s0101a,s01,208.458,<HES-I>,0.402080,U,U,
565,565,s0101a,s01,274.678,<HES-familiarity>,1.185220,U,U,
584,584,s0101a,s01,281.413,<HES-uh>,0.602781,U,U,
...,...,...,...,...,...,...,...,...,...
345565,345565,s4002b,s40,528.831,<HES-um>,0.649959,U,U,
345571,345571,s4002b,s40,531.174,<HES-know>,0.388934,U,U,
345605,345605,s4002b,s40,544.761,<HES-um>,0.470049,U,U,
345666,345666,s4002b,s40,583.265,<HES-um>,0.439891,U,U,


In [5]:
# Start with no row marked as an utterance break; breaks are flagged afterwards.
corpus = corpus.assign(utt_break='no')

In [6]:
# Markers that break an utterance only when the row's word_dur exceeds 1
# (presumably seconds -- confirm against the corpus documentation).
break_targets = ['<SIL', '<IVER']
# Markers that always break an utterance, regardless of duration.
break_targets_strict = ['{B_', '{E_']

# The markers are literal substrings, not patterns. regex=False makes that
# explicit instead of relying on the regex engine treating '{' as a literal.
for marker in break_targets:
    is_long_pause = (
        corpus['word'].str.contains(marker, regex=False)
        & (corpus['word_dur'] > 1)
    )
    corpus.loc[is_long_pause, 'utt_break'] = 'yes'
for marker in break_targets_strict:
    is_boundary = corpus['word'].str.contains(marker, regex=False)
    corpus.loc[is_boundary, 'utt_break'] = 'yes'

In [7]:
# Display the frame to inspect the utt_break column.
corpus

Unnamed: 0,id,file,speaker,start,word,word_dur,phonemes,phones,pos,utt_break
0,0,s0101a,s01,0.000,{B_TRANS},0.102385,B,B,,yes
1,1,s0101a,s01,0.102,<SIL>,4.173359,S,S,,yes
2,2,s0101a,s01,4.276,<NOISE>,4.237774,U,U,,no
3,3,s0101a,s01,8.514,<IVER>,23.703057,S,S,,yes
4,4,s0101a,s01,32.217,okay,0.405470,ow k ey,k ay,NN,no
...,...,...,...,...,...,...,...,...,...,...
348399,348399,s4004a,s40,85.555,<SIL>,1.227988,S,S,,yes
348400,348400,s4004a,s40,86.783,<IVER>,3.400615,S,S,,yes
348401,348401,s4004a,s40,90.183,all,0.294289,aa l,ow,DT,no
348402,348402,s4004a,s40,90.478,right,0.253473,r ay t,er eh tq,NN,no


In [8]:
# Build a simplified token column: `tok` starts as a copy of `word`, then
# annotation markers are collapsed to one of two placeholder symbols.
corpus['tok'] = corpus['word']
# Markers rewritten to '#'.
pound_targets = ['<SIL', '<IVER', '{B_', '{E_']
# Any remaining token containing '<' or '{' is rewritten to '*'.
star_targets = ['<', '{']

# Order matters: the '#' pass must run first, because the '*' markers are
# substrings of the '#' markers and would otherwise swallow them.
# regex=False matches the markers literally; '{' is a regex metacharacter.
for marker in pound_targets:
    corpus.loc[corpus['tok'].str.contains(marker, regex=False), 'tok'] = '#'

for marker in star_targets:
    corpus.loc[corpus['tok'].str.contains(marker, regex=False), 'tok'] = '*'

In [9]:
# Split the token stream into utterances (lists of tokens).
# A row with utt_break == 'yes' closes the current utterance and becomes the
# first token of the next one.
utts = []

cur_sent = []
# Iterate the two columns directly rather than doing a label lookup per row.
row_iter = zip(corpus['utt_break'] == 'yes', corpus['tok'])
for is_break, tok in tqdm(row_iter, total=len(corpus)):
    if is_break:
        # Skip the empty buffer that exists before the first token is seen.
        if cur_sent:
            utts.append(cur_sent)
        cur_sent = [tok]
    else:
        cur_sent.append(tok)

# Flush the trailing utterance; without this the last one is silently dropped.
if cur_sent:
    utts.append(cur_sent)

100%|████████████████████████████████████████████████████████████████████████████████████| 348404/348404 [00:06<00:00, 52116.08it/s]


In [10]:
# Filler tokens; a later filter drops utterances whose last token is one of these.
filler = ['uh', 'um']

In [27]:
# Keep only the utterances that contain an 'uh' or 'um' token, then show the count.
disfluent_utts = [
    utt for utt in utts
    if any(tok in ('uh', 'um') for tok in utt)
]
len(disfluent_utts)

2680

In [28]:
# Drop utterances whose space-joined text contains "uh huh" or "uh-huh",
# then show the remaining count.
disfluent_utts = [
    utt for utt in disfluent_utts
    if not any(form in ' '.join(utt) for form in ('uh huh', 'uh-huh'))
]
len(disfluent_utts)

2640

In [33]:
# Keep utterances with 11 to 24 tokens whose last token is not a filler,
# then show the remaining count.
disfluent_utts = [
    utt for utt in disfluent_utts
    if 10 < len(utt) < 25 and utt[-1] not in filler
]
len(disfluent_utts)

237

In [32]:
# Preview the first ten utterances.
# NOTE(review): the execution counts show this cell last ran (In [32]) before
# the length filter (In [33]), so a saved output may predate that filter --
# re-run top to bottom before relying on it.
disfluent_utts[:10]

[['#',
  "it's",
  '*',
  '*',
  'no',
  "it's",
  'a',
  'uh',
  '*',
  'clinical',
  'study',
  'where',
  "we're",
  'seeing',
  'how',
  'treadmill',
  'training',
  '*',
  'effects',
  'ambulation',
  'in',
  'people',
  'with',
  'spinal',
  'cord',
  'injury'],
 ['#',
  'so',
  'family',
  '#',
  'um',
  '#',
  'familiarity',
  'and',
  '#',
  'um',
  '#',
  'just',
  'the',
  'community',
  'i',
  'think'],
 ['#',
  'it',
  'was',
  'on',
  'national',
  'news',
  '#',
  'you',
  'see',
  "i'm",
  '#',
  'not',
  'that',
  'informed',
  'on',
  'that',
  'um',
  '*'],
 ['#',
  'i',
  'i',
  'uh',
  '#',
  'i',
  'would',
  'find',
  'that',
  'hard',
  'to',
  'believe',
  '*',
  '#',
  'personally'],
 ['#',
  'along',
  'the',
  'lassie',
  'line',
  'and',
  'uh',
  '*',
  'uh',
  'leave',
  'it',
  'to',
  'beaver'],
 ['#',
  'let',
  'me',
  'focus',
  'on',
  '*',
  "you're",
  'question',
  "i'm",
  'not',
  '*',
  'that',
  '#',
  'i',
  '*',
  'that',
  'she',
  'was',


In [150]:
# Write the selected utterances to disk, one per line, tokens separated by spaces.
with open('./data_corpus/real_disf_en.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(' '.join(utt) + '\n' for utt in disfluent_utts)