In [21]:
import pandas as pd
from tqdm import tqdm
import re
import random

random.seed(1991)

In [42]:
corpus = pd.read_csv('mcdc_disfl_annotated.csv', sep = ',', index_col = 0)
corpus = corpus[corpus['word_start'] != -1.000]
corpus


Unnamed: 0,speaker,file,tok,pinyin,IPA,duration,word_length,word_start
0,MCDC_01,MCDC_01_L_001,我,wo3,w o,0.115000,1,0.000
1,MCDC_01,MCDC_01_L_001,姓,xing4,ɕ i ŋ,0.221004,1,0.115
2,MCDC_01,MCDC_01_L_001,溫,wen1,w e n,0.348996,1,0.336
3,MCDC_01,MCDC_01_L_001,溫暖,wen1-nuan3,w e n#n w a n,0.350000,2,0.685
4,MCDC_01,MCDC_01_L_001,的,di4,t i,0.088748,1,1.035
...,...,...,...,...,...,...,...,...
107619,MCDC_26,MCDC_26_R_303,現在兵,xian4-zai4-bing1,ɕ j a n#ts a j#p i ŋ,0.405054,3,0.666
107620,MCDC_26,MCDC_26_R_303,已經,yi3-jing1,j i#tɕ i ŋ,0.179947,2,1.071
107621,MCDC_26,MCDC_26_R_303,越來越,yue4-lai2-yue4,j w e#l a j#j w e,0.300000,3,1.251
107622,MCDC_26,MCDC_26_R_303,少,shao3,ʂ a w,0.264385,1,1.551


In [43]:
corpus['tok'] = corpus['tok'].str.replace("NE GE", "NEGE")
corpus['tok'] = corpus['tok'].str.replace("NEI GE", "NEIGE")
corpus['tok'] = corpus['tok'].str.replace("ZE GE", "ZEGE")
corpus['tok'] = corpus['tok'].str.replace("ZEI GE", "ZEIGE")

In [44]:
utt_ids = corpus.file.unique()
utt_ids

array(['MCDC_01_L_001', 'MCDC_01_L_003', 'MCDC_01_L_004', ...,
       'MCDC_26_R_301', 'MCDC_26_R_302', 'MCDC_26_R_303'], dtype=object)

In [45]:
corpus[corpus.file == utt_ids[2]].reset_index(drop=True)

corpus['tok'] = corpus['tok'].fillna("#")

In [46]:
utts = []
for turn in tqdm(utt_ids):
    temp = corpus[corpus.file == turn].reset_index(drop=True)
    utt = []
    for i in temp.index:
        utt.append(temp.tok[i])
    utts.append(utt)

100%|█████████████████████████████████████████████████████████████████████████████| 4336/4336 [00:37<00:00, 116.34it/s]


In [59]:
DISFLs = ['uhm', 'mhm', 'uhn', 'en', 'E', 'EI', 'ein', 'NEGE', 'NEIGE', 'nhn']

disfluent_utts = []

for d in DISFLs:
    disfluent_utts =disfluent_utts + [x for x in range(len(utts)) if d in utts[x]]

disfluent_utts = list(set(disfluent_utts))
disfluent_utts = [utts[x] for x in disfluent_utts]
len(disfluent_utts)

1062

In [60]:
disfluent_utts = [x for x in disfluent_utts if len(x) < 30 and len(set(x)) > 4]
len(disfluent_utts)

346

In [61]:
disfluent_utts

[['對', '有', '可能', '是', '這樣子', 'mhm', 'mhm'],
 ['啊', '*', '是', '進口', 'en', '出口', '嗎'],
 ['股票', '都', '會', 'EI', '*', '都', '*', '慘跌', '呀'],
 ['#', 'E', '*', '業務', '本身', '的', '職責', '會', '有'],
 ['是',
  '對',
  '對',
  '那',
  '是',
  '本省',
  '的',
  '*',
  '對',
  '對',
  '對',
  '對',
  '對',
  '#',
  'EI',
  '會',
  '會',
  '起',
  '衝突',
  '衝突'],
 ['*', '新', '生活', '#', '新', '形象', '*', 'EI'],
 ['對', '*', 'EI', '好像', '是', '他們', '是', '說', '是', '竹聯', '的', '嘛', '*', 'ho'],
 ['平常',
  '的',
  '話',
  '就是',
  '看電影',
  '嘛',
  '*',
  '逛街',
  '啦',
  '或是',
  '說',
  '*',
  'uhn',
  'mhm',
  '在',
  '家',
  '看',
  '看',
  '電視',
  '就',
  '這樣',
  '這樣'],
 ['*',
  '可是',
  '烏來',
  '也',
  '很',
  '塞',
  'EI',
  '上',
  '次',
  '是',
  '我們',
  '去',
  '也',
  '是',
  '一路',
  '塞',
  '上去',
  '然後',
  '再',
  '塞下',
  '來'],
 ['沒有', '四十八', '歲', '啦', '#', '四十', '歲', '四十一', '歲', '啦', '*', 'EI'],
 ['我們',
  '家',
  '是',
  '如果',
  '要',
  '出去',
  '的',
  '話',
  '都',
  '會',
  '跑',
  '比較',
  '遠',
  '*',
  '跑到',
  'NEGE',
  '*',
  '基隆',
  '宜蘭',
  '那邊'

In [62]:

random.shuffle(disfluent_utts)

In [63]:
mapping = {
 #'E':'欸',
 #'EI':'欸',
 #'NE':'那',
 #'NEGE':'那個',  'NE GE':'那個',
 #'NEI':'那',
 #'NEIGE':'那個',  'NEI GE':'那個',
 #'NEIN':'那',
 #'ZHE':'這',
 #'ZHEGE':'這個',
 #'ZHEIGE':'這個',  'ZHE GE':'這個',  'ZHE':'這',  
     #'ZHEI GE':'這個', 'ZHEI': '這',
      #'ZHEI':'這'

}

In [64]:
with open('mandarin_fp_base.txt', 'w', encoding = 'utf-8') as f:
    for i in disfluent_utts:
        text = ' '.join(i)
        f.write(text + '\n')

In [53]:
SP = ['#']

disfluent_utts = []

for d in SP:
    disfluent_utts =disfluent_utts + [x for x in range(len(utts)) if d in utts[x]]

disfluent_utts = list(set(disfluent_utts))
disfluent_utts = [utts[x] for x in disfluent_utts]

disfluent_utts = [x for x in disfluent_utts if len(x) < 30 and len(set(x)) > 4]
len(disfluent_utts)

557

In [54]:
random.shuffle(disfluent_utts)

with open('mandarin_sp_base.txt', 'w', encoding = 'utf-8') as f:
    for i in disfluent_utts:
        text = ' '.join(i)
        f.write(text + '\n')

In [65]:
cand_utts = [x for x in utts if len(x) < 30 and len(set(x)) > 4]
len(cand_utts)

1829

In [66]:
repetition_pool = []
for c in cand_utts:
    temp = [x for x in c]
    for j in range(0,len(temp)-1):
        if temp[j] not in DISFLs+SP+['*'] and temp[j] == temp[j+1]:
            repetition_pool.append(c)
            break

In [67]:
len(repetition_pool)

791

In [68]:
random.shuffle(repetition_pool)

with open('mandarin_repeats_base.txt', 'w', encoding = 'utf-8') as f:
    for i in repetition_pool:
        text = ' '.join(i)
        f.write(text + '\n')