In [186]:
import re
import os
import pandas as pd
import random
import json

random.seed(20250409)

In [187]:
# MODIFYING GENUINE EXAMPLES
def _relocate_tokens(text, movable, max_attempts=1000):
    """Take every token of ``text`` found in ``movable`` and re-insert it at a random slot.

    Shared implementation of ``move_fp`` and ``move_sp``.

    Args:
        text: whitespace-tokenised utterance.
        movable: collection of tokens to pull out and re-insert.
        max_attempts: upper bound on random placements tried before giving up.

    Returns:
        ``(new_text, targets, moved_targets)``. ``targets`` holds one
        ``(left, token, right)`` triple per movable token at its original
        position; ``moved_targets`` holds the same triple at insertion time for
        the accepted placement. A missing neighbour is ``''``.

    Raises:
        ValueError: if the utterance holds no movable token, if fewer than three
            other tokens remain (no interior slot exists), or if no acceptable
            placement was found within ``max_attempts`` tries.
    """
    tokens = text.split()
    positions = [idx for idx, tok in enumerate(tokens) if tok in movable]
    items = [tokens[idx] for idx in positions]

    def neighbour(idx):
        return tokens[idx] if 0 <= idx < len(tokens) else ''

    targets = [(neighbour(idx - 1), tokens[idx], neighbour(idx + 1)) for idx in positions]
    skeleton = [tok for tok in tokens if tok not in movable]

    # Without these guards the original retry loop either never terminated
    # (nothing to move) or died inside randrange with "empty range".
    if not items:
        raise ValueError(f"no token from {list(movable)!r} to move in {text!r}")
    if len(skeleton) < 3:
        raise ValueError(f"utterance too short to move a token inside it: {text!r}")

    for _ in range(max_attempts):
        candidate = list(skeleton)
        moved_targets = []
        for item in items:
            # Slots 1 .. len-2: never first, never directly before/after the last token.
            slot = random.randrange(1, len(candidate) - 1)
            candidate.insert(slot, item)
            moved_targets.append((candidate[slot - 1], item, candidate[slot + 1]))
        # Accept only a placement that differs from the input and has no doubled pause.
        if candidate != tokens and '# #' not in ' '.join(candidate):
            return ' '.join(candidate), targets, moved_targets

    raise ValueError(f"could not find a new position for the moved tokens in {text!r}")


def move_fp(row, fp_items=None):
    """Move the filled pauses of ``row['text']`` to random new positions.

    Args:
        row: mapping / DataFrame row with a ``'text'`` entry.
        fp_items: filled-pause tokens; defaults to the module-level ``FP_ITEMS``.

    Returns:
        ``(modified_text, targets, moved_targets)`` as described in ``_relocate_tokens``.

    Raises:
        ValueError: see ``_relocate_tokens``.
    """
    return _relocate_tokens(row['text'], FP_ITEMS if fp_items is None else fp_items)


def move_sp(row, sp_items=None):
    """Move the silent-pause markers of ``row['text']`` to random new positions.

    Args:
        row: mapping / DataFrame row with a ``'text'`` entry.
        sp_items: pause tokens; defaults to the module-level ``SP_ITEMS``.

    Returns:
        ``(modified_text, targets, moved_targets)`` as described in ``_relocate_tokens``.

    Raises:
        ValueError: see ``_relocate_tokens``.
    """
    return _relocate_tokens(row['text'], SP_ITEMS if sp_items is None else sp_items)



def move_rep(row, sp_items=None, fp_items=None):
    """Remove one immediate word repetition and create a repetition elsewhere.

    One repeated token (a token equal to its predecessor, pauses excluded) is
    picked at random and deleted; then a random other token is doubled instead,
    so the utterance keeps its length.

    Args:
        row: mapping / DataFrame row with a ``'text'`` entry.
        sp_items: pause tokens; defaults to the module-level ``SP_ITEMS``.
        fp_items: filled-pause tokens; defaults to the module-level ``FP_ITEMS``.

    Returns:
        ``(modified_text, targets, moved_targets)``. ``targets`` is
        ``[(token before the original pair, repeated token, token after the pair)]``
        and ``moved_targets`` is the same triple for the newly created pair.
        Missing neighbours are ``''``.

    Raises:
        IndexError: if the utterance has no repetition or no token that may be
            doubled (the same exception type ``random.choice`` raised before,
            now with an explanatory message).
    """
    pauses = list(SP_ITEMS if sp_items is None else sp_items)
    fillers = list(FP_ITEMS if fp_items is None else fp_items)
    tokens = row['text'].split()

    repeated = [i for i in range(1, len(tokens))
                if tokens[i] == tokens[i - 1] and tokens[i] not in pauses]
    if not repeated:
        raise IndexError(f"no repeated token found in {row['text']!r}")

    rep_idx = random.choice(repeated)
    rep_token = tokens[rep_idx]
    left_context = tokens[rep_idx - 2] if rep_idx > 1 else ''
    right_context = tokens[rep_idx + 1] if rep_idx < len(tokens) - 1 else ''

    # Keep a placeholder so the candidate insertion indices below still refer
    # to positions in the original utterance.
    tokens[rep_idx] = ''

    # Inserting at `loc` doubles tokens[loc - 1]. Never double the placeholder,
    # never recreate the original pair, never double a pause, filler or '*'.
    # NOTE(review): range(len(tokens) - 1) leaves the final token unchecked, so
    # a trailing pause/filler can still be doubled - kept as in the original so
    # the random draws are unchanged; confirm whether that is intended.
    blocked = pauses + fillers + ['*']
    avoid = [rep_idx + 1, rep_idx] + [i + 1 for i in range(len(tokens) - 1) if tokens[i] in blocked]
    candidates = [loc for loc in range(1, len(tokens) + 1) if loc not in avoid]
    if not candidates:
        raise IndexError(f"no position left to place the repetition in {row['text']!r}")

    location = random.choice(candidates)
    rep_moved = tokens[location - 1]
    tokens.insert(location, rep_moved)

    # Index of the inserted copy once the placeholder is dropped.
    new_idx = location if location < rep_idx else location - 1
    tokens = [tok for tok in tokens if tok != '']

    # Read the context from the final utterance. The original looked at the
    # list while it still held the '' placeholder and reported '' as the left
    # neighbour when the new pair directly followed the removed token.
    left_moved = tokens[new_idx - 2] if new_idx > 1 else ''
    right_moved = tokens[new_idx + 1] if new_idx < len(tokens) - 1 else ''

    return ' '.join(tokens), [(left_context, rep_token, right_context)], [(left_moved, rep_moved, right_moved)]



def swap_sp_fp(row):
    """Exchange one randomly chosen filled pause with one randomly chosen pause marker.

    Reads the module-level ``FP_ITEMS`` and ``SP_ITEMS``.

    Returns:
        ``(modified_text, targets, targets_moved)``: ``targets`` is the single
        triple ``(left, filler, right)`` around the filler's original slot and
        ``targets_moved`` the triple ``(left, pause, right)`` around the
        pause's original slot. Neighbours are read after the swap; a missing
        neighbour is ``''``.

    Raises:
        IndexError: from ``random.choice`` when the utterance lacks a filler or a pause.
    """
    tokens = row['text'].split()

    filler_slots = [idx for idx, tok in enumerate(tokens) if tok in FP_ITEMS]
    pause_slots = [idx for idx, tok in enumerate(tokens) if tok in SP_ITEMS]

    # Draw order (filler first, pause second) is part of the reproducible random stream.
    fp_pos = random.choice(filler_slots)
    sp_pos = random.choice(pause_slots)
    fp_to_switch, sp_to_switch = tokens[fp_pos], tokens[sp_pos]

    tokens[fp_pos], tokens[sp_pos] = sp_to_switch, fp_to_switch

    def neighbour(idx):
        # Token at idx in the swapped utterance, '' beyond either end.
        return tokens[idx] if 0 <= idx < len(tokens) else ''

    targets = [(neighbour(fp_pos - 1), fp_to_switch, neighbour(fp_pos + 1))]
    targets_moved = [(neighbour(sp_pos - 1), sp_to_switch, neighbour(sp_pos + 1))]

    return ' '.join(tokens), targets, targets_moved


def shuffle_utterance(row):
    """Return ``row['text']`` with its tokens in random order.

    The first shuffle consumes the random stream exactly as before. If it
    happens to reproduce the original order, the tokens are shuffled again so
    that the "bad" sentence actually differs from the "good" one. Utterances
    whose tokens are all identical (or that have fewer than two tokens) cannot
    differ and are returned as they are.

    Returns:
        ``(shuffled_text, [], [])`` - the two empty lists keep the return shape
        aligned with the other corruption functions (no target metadata).
    """
    original = row['text'].split()
    shuffled = list(original)
    random.shuffle(shuffled)

    if len(set(original)) > 1:
        while shuffled == original:
            random.shuffle(shuffled)

    return ' '.join(shuffled), [], []

# EN
def lowercase_except_I_and_contractions(text):
    """Lower-case capitalised words, leaving the pronoun "I" (and I'm, I've, ...) untouched.

    Only words shaped as one upper-case ASCII letter followed by lower-case
    ASCII letters are affected, so all-caps tokens such as "USA" stay as they are.
    """
    def _to_lower(match):
        return match.group(0).lower()

    return re.sub(r"\b(?!I\b|I'm\b|I've\b|I'll\b|I'd\b)[A-HIJ-Z][a-z]*\b", _to_lower, text)


def replace_dm(row,dmlist,sep = ' # '):
    """Swap the discourse marker that opens ``row['right']`` for a different one.

    Args:
        row: mapping / DataFrame row with ``'left'``, ``'dm'`` and ``'right'``;
            the first token of ``'right'`` is treated as the marker to replace.
        dmlist: either a sequence of markers (any marker other than the current
            one may be chosen, compared case-insensitively) or a mapping from
            the current marker to its allowed replacements.
        sep: text inserted between the left context and the new marker.

    Returns:
        ``(new_text, targets, moved_targets)`` where ``targets`` is
        ``[(last left token, original marker, first token after the marker)]``
        and ``moved_targets`` the same triple with the replacement marker.
        A missing neighbour is ``''``.

    Raises:
        IndexError: if no replacement candidate is available.
        KeyError: if ``dmlist`` is a mapping without an entry for the current marker.
    """
    current_dm = row['dm'].strip()

    if isinstance(dmlist, (list, tuple, set, frozenset)):
        # Case-insensitive so that "So" is never "replaced" by "so".
        dm_candidates = [dm for dm in dmlist if dm.lower() != current_dm.lower()]
    else:
        dm_candidates = list(dmlist[current_dm])

    if not dm_candidates:
        raise IndexError(f"no replacement candidate for discourse marker {current_dm!r}")

    # shuffle + first element (rather than random.choice) keeps the random
    # stream identical to earlier runs with the same seed.
    random.shuffle(dm_candidates)
    replacement = dm_candidates[0]

    right_tokens = row['right'].split()
    original_dm = right_tokens[0] if right_tokens else ''
    right_as_list = right_tokens[1:]

    left_as_list = row['left'].split()
    if left_as_list and left_as_list[0] == '#':
        left_as_list = left_as_list[1:]

    left_neighbour = left_as_list[-1] if left_as_list else ''
    right_neighbour = right_as_list[0] if right_as_list else ''

    output = ' '.join(left_as_list).strip() + sep + replacement + ' ' + ' '.join(right_as_list).strip()
    return output, [(left_neighbour, original_dm, right_neighbour)], [(left_neighbour, replacement, right_neighbour)]


def get_dm(row):
    """Return the first whitespace-delimited token of ``row['right']``.

    Raises:
        IndexError: if ``row['right']`` contains no token.
    """
    head_and_rest = row['right'].split(None, 1)
    return head_and_rest[0]


In [188]:
# Scratch check (appears to be a development leftover): shows that
# isinstance(..., list) is True for a list - the same test replace_dm uses to
# tell a list of markers from a dict of markers. `b` is never used.
a = []
b = {}
isinstance(a, list) 

True

In [189]:
# Token -> replacement table applied by clean_zh_space, in insertion order.
# Order matters: two-word keys such as 'NE GE' come before their parts
# ('NE', 'GE') so that the longer form is rewritten first.
mapping = {
    'NEGE': '那個', 'NEINGE': '那個', 'NE GE': '那個',
    'ZHEGE': '這個', 'ZHEI GE': '這個', 'ZHEIGE': '這個', 'ZHE GE': '這個',
    'NEIGE': '那個', 'NEI GE': '那個', 'NAGE': '那個',
    'NEIN': '那', 'NA': '那',
    'ZHE': '這', 'GE': '個',
    'ZHEI': '這', 'ein': 'eh', 'en': '嗯',
    'E': 'eh',
    'EI': 'eh',
    'NE': '那', 'NEI': '那',
    'mhm': 'mhm', 'nhn': '嗯', 'uhm': '嗯', 'uhn': '嗯',
}


# Define a more comprehensive Chinese character range covering all CJK blocks
CHINESE_RANGE = r'\u4e00-\u9fff\uf900-\ufaff\u3400-\u4dbf\U00020000-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F\U0002B820-\U0002CEAF'

# Define Chinese punctuation range (including 「」【】 and others)
# Both strings are spliced into regex character classes, hence the escaped brackets.
PUNCTUATION_RANGE = r'().,!?;，。！？；：:︰、\[\]（）〔〕【】「」『』《》”“－…<＞/%％*@"'
# Despite its name this class also contains ASCII digits and letters.
CHINESE_PUNCTUATION_RANGE = r'()，。！？；：︰、\[\]（）〔〕【】「」『』《》”“－…<＞0123456789"A-Za-z*@'

def clean_zh_space(text):
    """Normalise a space-tokenised Mandarin utterance for written output.

    Steps, in order:
      1. replace every whole token listed in ``mapping`` with its value;
      2. drop the space between two CJK characters (or '*');
      3. drop the space between a CJK character and an adjacent character of
         the punctuation classes (which include ASCII digits and letters);
      4. drop all spaces around '#' and '*'.

    Args:
        text: utterance whose tokens are separated by single spaces.

    Returns:
        The cleaned string.
    """
    for key, replacement in mapping.items():
        # Whole-token match: preceded by start/space, followed by space/end.
        # Lookarounds do not consume the separating spaces, so consecutive
        # occurrences are all caught in a single pass.
        token = r'(?<![^ ])' + re.escape(key) + r'(?= |$)'
        text = re.sub(token, lambda _match: replacement, text)

    # Built outside the f-strings: reusing the f-string's own quote type inside
    # {...} is a SyntaxError before Python 3.12.
    cjk_or_star = CHINESE_RANGE + '*'
    text = re.sub(rf'(?<=[{cjk_or_star}]) (?=[{cjk_or_star}])', '', text)
    text = re.sub(rf'(?<=[{CHINESE_PUNCTUATION_RANGE}]) (?=[{CHINESE_RANGE}])', '', text)
    text = re.sub(rf'(?<=[{CHINESE_RANGE}]) (?=[{CHINESE_PUNCTUATION_RANGE}])', '', text)
    text = re.sub(rf'(?<=[{CHINESE_RANGE}]) (?=[{PUNCTUATION_RANGE}])', '', text)

    text = re.sub(' *# *', '#', text)
    text = re.sub(' *\\* *', '*', text)
    return text

In [202]:
# Dispatch table: task name -> function applied to each DataFrame row to build
# the corrupted ("bad") sentence. run_gen looks the function up by task name.
# All *_shuffled tasks share shuffle_utterance; all *_replaced tasks share
# replace_dm, which run_gen calls with the extra `dmlist` keyword.
task_dict = {
    'filler_moved': move_fp,
    'filler_swapped': swap_sp_fp,
    'pause_moved': move_sp,
    'rep_moved': move_rep,
    'filler_shuffled': shuffle_utterance,
    'pause_shuffled': shuffle_utterance,
    'rep_shuffled': shuffle_utterance,   
    'dm_dial_sem_shuffled': shuffle_utterance,
    'dm_dial_att_shuffled': shuffle_utterance,
    'dm_mono_sem_shuffled': shuffle_utterance,
    'dm_mono_att_shuffled': shuffle_utterance,  
    'dm_dial_sem_replaced': replace_dm,
    'dm_dial_att_replaced': replace_dm,
    'dm_mono_sem_replaced': replace_dm,
    'dm_mono_att_replaced': replace_dm,  
}

def filter_for_swap(EXAMPLES, fp_list, sp_list):
    """Keep the utterances in which a filler and a pause can be swapped.

    An utterance is kept when, after removing one trailing '#' token, it
    contains at least one token from ``fp_list`` and one from ``sp_list``, no
    filler is directly adjacent to a pause, and the text still contains '#'.

    Args:
        EXAMPLES: list of space-tokenised utterances; it is not modified.
        fp_list: filled-pause tokens.
        sp_list: pause tokens.

    Returns:
        A new list with the retained utterances (trailing '#' removed), in
        their original order.
    """
    kept = []
    for example in EXAMPLES:
        # A final pause marker is dropped before positions are computed.
        pieces = example.split(' ')
        if pieces[-1] == '#':
            example = ' '.join(pieces[:-1])

        tokens = example.split()
        fp_locs = [idx for idx, tok in enumerate(tokens) if tok in fp_list]
        sp_locs = [idx for idx, tok in enumerate(tokens) if tok in sp_list]
        if not fp_locs or not sp_locs:
            continue

        # Swapping neighbours would only reorder the same two-token sequence.
        if any(abs(a - b) == 1 for a in fp_locs for b in sp_locs):
            continue

        if '#' in example:
            kept.append(example)

    return kept



def _join_french_elisions(text):
    """Re-attach an elided French clitic to the following word ("l' ami" -> "l'ami")."""
    return re.sub(r"\b(c|d|j|l|m|n|s|t|qu)'\s+(?=[aeiouyhAEIOUYHàâäéèêëîïôöùûüÿœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸŒÆ])", r"\1'", text)


def _load_base_frame(task, lan, base_file, fp_list, sp_list):
    """Read ``base_file`` and build the DataFrame of genuine examples for ``task``."""
    with open(base_file, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    if 'dm' in task:
        fields = [line.split('<$>') for line in lines]
        if lan == 'fr':
            # French files carry three '<$>'-separated fields per line.
            frame = pd.DataFrame(fields, columns=['left', 'dm', 'right'])
            frame['left'] = frame['left'].str.replace("#( #)+", "# ", regex=True)
            frame['right'] = frame['right'].str.replace("#( #)+", "# ", regex=True)
        else:
            # Other languages carry two fields; the marker is the first token of 'right'.
            frame = pd.DataFrame(fields, columns=['left', 'right'])
            frame['dm'] = frame.apply(get_dm, axis=1)
        frame['text'] = frame['left'].str.strip() + ' # ' + frame['right'].str.strip()
    else:
        # rstrip instead of [:-1]: a final line without newline keeps its last character.
        examples = [line.rstrip('\n') for line in lines]
        examples = [x for x in examples if '# #' not in x]
        if 'swap' in task:
            examples = filter_for_swap(examples, fp_list, sp_list)
        frame = pd.DataFrame(examples, columns=['text'])

    if lan == 'en':
        # replace_dm builds the bad sentence from 'left'/'right'/'dm', so those
        # columns get the same lower-casing as 'text'.
        for column in ('text', 'left', 'right', 'dm'):
            if column in frame.columns:
                frame[column] = frame[column].apply(lowercase_except_I_and_contractions)

    return frame


def run_gen(task, lan, base_file, fp_list, sp_list, out_folder, comma_pattern, n_exps, dm_list=[]):
    """Generate ``n_exps`` files of good/bad sentence pairs for one task.

    For each experiment index the corruption function registered for ``task``
    in ``task_dict`` is applied to every example, pause markers ('#') are
    rewritten with ``comma_pattern`` and the pairs are written as JSON lines to
    ``<out_folder>/<task>_<lan>_<index>.json``.

    Args:
        task: key of ``task_dict``; the substrings 'dm', 'replaced' and 'swap'
            also select the input format and filtering.
        lan: language tag ('fr', 'en', 'zh', ...) used for language-specific
            clean-up and in file names / metadata.
        base_file: path of the UTF-8 corpus file with one example per line.
        fp_list: filled-pause tokens (passed on to ``filter_for_swap``).
        sp_list: pause tokens (passed on to ``filter_for_swap``).
        out_folder: output directory, created if missing.
        comma_pattern: replacement for '#' together with the spaces before it.
        n_exps: number of output files (random variants) to produce.
        dm_list: markers passed to ``replace_dm`` for '*_replaced' DM tasks.
            The default list is never modified.

    Returns:
        None. Results are written to disk.
    """
    os.makedirs(out_folder, exist_ok=True)

    # The corpus does not change between experiments: read and prepare it once.
    base_frame = _load_base_frame(task, lan, base_file, fp_list, sp_list)
    corrupt = task_dict[task]
    needs_dm_list = 'dm' in task and 'replaced' in task

    for i_exp in range(n_exps):
        frame = base_frame.copy()
        if needs_dm_list:
            frame[task] = frame.apply(corrupt, dmlist=dm_list, axis=1)
        else:
            frame[task] = frame.apply(corrupt, axis=1)

        exp_tag = lan + '_' + str(i_exp)

        result = []
        for pair_id, (_, row) in enumerate(frame.iterrows()):
            good = row['text']
            # Each corruption function returns (sentence, targets, moved_targets).
            bad = row[task][0]

            if lan == 'zh':
                good = clean_zh_space(good)
                bad = clean_zh_space(bad)
            elif lan == 'fr':
                good = _join_french_elisions(good)
                bad = _join_french_elisions(bad)

            good = re.sub('( *)#', comma_pattern, good)
            bad = re.sub('( *)#', comma_pattern, bad)
            result.append({"sentence_good": good, "sentence_bad": bad, "field": task,
                           # `lan` (the argument), not the notebook-level LAN global
                           "linguistics_term": task + "_" + lan, "UID": task + "_" + exp_tag,
                           "simple_LM_method": True,
                           "one_prefix_method": False, "two_prefix_method": False,
                           "lexically_identical": True, "pair_id": str(pair_id),
                           'targets': row[task][1], 'moved_targets': row[task][2]})

        out_path = os.path.join(out_folder, task + '_' + exp_tag + '.json')
        with open(out_path, 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')


In [205]:
# French: filler, repetition and pause tasks.
# LAN, FP_ITEMS and SP_ITEMS are notebook-level globals: the move_*/swap
# functions read FP_ITEMS/SP_ITEMS directly and run_gen reads LAN for the
# "linguistics_term" field, so they must be set before the calls below.
LAN = 'fr'
FP_ITEMS= ['euh']
SP_ITEMS = ['#']
FILE_FP = 'data_corpus/french_fp_base_500_checked.txt'
# Replacement for '#' (and any spaces before it) in the written sentences.
comma = ','

# Each call writes 10 files (n_exps=10), one per random variant.
run_gen('filler_moved', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('filler_shuffled', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('filler_swapped', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)

FILE_REP = 'data_corpus/french_repeats_500.txt'

run_gen('rep_moved', LAN, FILE_REP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('rep_shuffled', LAN, FILE_REP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)

FILE_SP = 'data_corpus/french_sp_base_500_checked.txt'

run_gen('pause_moved', LAN, FILE_SP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/',comma, 10)
run_gen('pause_shuffled', LAN, FILE_SP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/',comma, 10)





In [206]:
# French discourse-marker tasks. Relies on LAN, FP_ITEMS, SP_ITEMS and `comma`
# still holding the values set in the French filler/pause cell.
# The marker lists are only used by the *_replaced tasks (run_gen passes
# dm_list to replace_dm); the *_shuffled calls receive them but ignore them.
DM_SEM = ['donc','mais','alors','après'] 
DM_ATT =  ["ah", "ben", "oh", "enfin", "bon"]#["ah", "ben"]

FILE_DM_DIAL_SEM = 'data_corpus/french_DM_sem_dial.txt'
FILE_DM_MONO_SEM = 'data_corpus/french_DM_sem_mono.txt'
FILE_DM_DIAL_ATT = 'data_corpus/french_DM_att_dial.txt'
FILE_DM_MONO_ATT = 'data_corpus/french_DM_att_mono.txt'

run_gen('dm_dial_sem_replaced', LAN, FILE_DM_DIAL_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)
run_gen('dm_dial_sem_shuffled', LAN, FILE_DM_DIAL_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)

run_gen('dm_mono_sem_replaced', LAN, FILE_DM_MONO_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)
run_gen('dm_mono_sem_shuffled', LAN, FILE_DM_MONO_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)

run_gen('dm_dial_att_replaced', LAN, FILE_DM_DIAL_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)
run_gen('dm_dial_att_shuffled', LAN, FILE_DM_DIAL_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)

run_gen('dm_mono_att_replaced', LAN, FILE_DM_MONO_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)
run_gen('dm_mono_att_shuffled', LAN, FILE_DM_MONO_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)

In [207]:

# English: filler, repetition and pause tasks.
# NOTE: `comma` is not set in this cell; it keeps whatever value an earlier
# cell assigned (',' from the French cell, or the full-width comma if the
# Mandarin cell ran last).
LAN = 'en'
FP_ITEMS= ['uh', 'um']
SP_ITEMS = ['#']
FILE_FP = 'data_corpus/english_fp_base_500_checked.txt'

run_gen('filler_moved', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('filler_shuffled', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('filler_swapped', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)

FILE_REP = 'data_corpus/english_repeats_base_500_checked.txt'

run_gen('rep_moved', LAN, FILE_REP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('rep_shuffled', LAN, FILE_REP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)

FILE_SP = 'data_corpus/english_sp_base_500_checked.txt'

run_gen('pause_moved', LAN, FILE_SP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('pause_shuffled', LAN, FILE_SP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)


In [208]:
# English discourse-marker tasks. Relies on LAN, FP_ITEMS, SP_ITEMS and `comma`
# set by earlier cells. For non-French input run_gen expects two
# '<$>'-separated fields per line and takes the marker from the first token
# of the right-hand field.
FILE_DM_DIAL_SEM = 'data_corpus/english_DM_SEM_dial.txt'
FILE_DM_MONO_SEM = 'data_corpus/english_DM_SEM_mono.txt'
FILE_DM_DIAL_ATT = 'data_corpus/english_DM_ATT_dial.txt'
FILE_DM_MONO_ATT = 'data_corpus/english_DM_ATT_mono.txt'

DM_ATT = ['oh','like','well']
DM_SEM = ['so','but',"because", 'then']#then

run_gen('dm_dial_sem_replaced', LAN, FILE_DM_DIAL_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)
run_gen('dm_dial_sem_shuffled', LAN, FILE_DM_DIAL_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)

run_gen('dm_mono_sem_replaced', LAN, FILE_DM_MONO_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)
run_gen('dm_mono_sem_shuffled', LAN, FILE_DM_MONO_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)

run_gen('dm_dial_att_replaced', LAN, FILE_DM_DIAL_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)
run_gen('dm_dial_att_shuffled', LAN, FILE_DM_DIAL_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)

run_gen('dm_mono_att_replaced', LAN, FILE_DM_MONO_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)
run_gen('dm_mono_att_shuffled', LAN, FILE_DM_MONO_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)

In [209]:
# Mandarin: filler, repetition and pause tasks.
# With lan == 'zh' run_gen passes every sentence through clean_zh_space, which
# rewrites the romanised filler tokens below via `mapping` and removes spaces.
LAN = 'zh'
FP_ITEMS= ['uhm', 'uhn', 'en', 'NEGE', 'NEIGE', 'nhn', 'NAGE']
SP_ITEMS= ['#']

# Full-width comma; this reassignment stays in effect for any cell run afterwards.
comma = '，'

FILE_FP = 'data_corpus/mandarin_fp_base_updated_checked.txt'

run_gen('filler_moved', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('filler_shuffled', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('filler_swapped', LAN, FILE_FP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)

FILE_REP = 'data_corpus/mandarin_repeats_base_updated_checked.txt'

run_gen('rep_moved', LAN, FILE_REP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('rep_shuffled', LAN, FILE_REP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)

FILE_SP = 'data_corpus/mandarin_sp_base_updated_checked.txt'

run_gen('pause_moved', LAN, FILE_SP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)
run_gen('pause_shuffled', LAN, FILE_SP, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10)


In [210]:
# Mandarin discourse-marker tasks. Relies on LAN, FP_ITEMS, SP_ITEMS and
# `comma` set by the Mandarin filler/pause cell.
FILE_DM_DIAL_SEM = 'data_corpus/mandarin_DM_SEM_dial.txt'
FILE_DM_MONO_SEM = 'data_corpus/mandarin_DM_SEM_mono.txt'
FILE_DM_DIAL_ATT = 'data_corpus/mandarin_DM_ATT_dial.txt'
FILE_DM_MONO_ATT = 'data_corpus/mandarin_DM_ATT_mono.txt'

# These two flat lists are dead assignments: both names are rebound to the
# dictionaries below before any run_gen call uses them.
DM_SEM = ['所以','但是','因為','然後', '可是', '不過', '而且'] #'and'
DM_ATT = ['喔', '哦', '就是', '像', '就', '就是說']

# Dict form: current marker -> markers allowed as its replacement. replace_dm
# indexes the dict with the marker found in the row, so a marker that is not a
# key raises KeyError.
DM_ATT = {'喔': ['就是', '像', '就', '就是說'],
          '哦': ['就是', '像', '就', '就是說'],
          '就是': ['喔', '哦', '像',],
          '像': ['喔', '哦', '就是', '就', '就是說'],
          '就': ['喔', '哦', '像'],
          '就是說': ['喔', '哦', '像'],}


DM_SEM = {'所以':  ['但是','因為','然後', '可是', '不過', '而且'],
          '但是':  ['所以','因為','然後',  '而且'],
          '因為':  ['所以','但是','然後', '可是', '不過', '而且'],
          '然後':  ['所以','但是','因為', '可是', '不過', '而且'],
          '可是':  ['所以','因為','然後', '而且'],
          '不過':  ['因為','然後', '所以', '而且'],
          '而且':  ['所以','但是','因為','然後', '可是', '不過']    
}


run_gen('dm_dial_sem_replaced', LAN, FILE_DM_DIAL_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)
run_gen('dm_dial_sem_shuffled', LAN, FILE_DM_DIAL_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)

run_gen('dm_mono_sem_replaced', LAN, FILE_DM_MONO_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)
run_gen('dm_mono_sem_shuffled', LAN, FILE_DM_MONO_SEM, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_SEM)

run_gen('dm_dial_att_replaced', LAN, FILE_DM_DIAL_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)
run_gen('dm_dial_att_shuffled', LAN, FILE_DM_DIAL_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)

run_gen('dm_mono_att_replaced', LAN, FILE_DM_MONO_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)
run_gen('dm_mono_att_shuffled', LAN, FILE_DM_MONO_ATT, FP_ITEMS, SP_ITEMS, './data_benchmark/disfl_comma/', comma, 10, DM_ATT)

In [211]:
# Bare string expression: the notebook echoes it as this cell's output.
'done'

'done'

# DISCOURSE

## French

In [140]:

# FR
# French dialogue data, semantic discourse markers: writes "replaced" and
# "shuffled" pair files per entry of `alts` and per experiment index.
# `alts` and `N_EXPS` are not defined in the cells shown here and must come
# from earlier kernel state; alt[0] is used as output-folder prefix, alt[1] as
# the replacement for '#' (with the spaces before it).
# NOTE(review): replace_dm and shuffle_utterance, as defined at the top of this
# notebook, return (sentence, targets, moved_targets) tuples, so handing
# row['replaced_dm'] / row['shuffled'] to re.sub would fail. This cell looks
# like it predates that change and is superseded by run_gen - confirm before re-running.
LAN = 'fr'
FILE_DM_DIAL_SEM = 'data_corpus/french_DM_sem_dial.txt'
FILE_DM_DIAL_ATT = 'data_corpus/french_DM_att_dial.txt'

DM_SEM = ['donc','mais','alors','après'] 
DM_ATT =  ["ah", "ben", "oh", "enfin", "bon"]#["ah", "ben"]


for alt in alts:
    folder = alt[0]    
    if not os.path.exists(folder):
        os.makedirs(folder)


    # Each line is split into the fields left / dm / right. The file handle is
    # never closed explicitly.
    DM_EXAMPLES = [u.split('<$>') for u in open(FILE_DM_DIAL_SEM,'r', encoding = 'utf-8').readlines()]

    

    for i in range(N_EXPS):
                                                 
        DM_DF = pd.DataFrame(DM_EXAMPLES,columns=['left','dm','right'])
        #print(DM_DF['dm'])

        # Collapse runs of pause markers ("# # #") into a single one.
        DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex = True)
        DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex = True)    
        
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # '+  DM_DF['right'].str.strip()
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm,dmlist=DM_SEM,axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance,axis=1)

        
        # Computed before the inner loops below reuse (and overwrite) `i`.
        exp_tag = LAN+'_'+str(i)
        
        #REPLACED
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['replaced_dm'])            
            item_replaced = {"sentence_good":good,"sentence_bad":bad, "field": "discourse_sem_dial", 
                             "linguistics_term": "replaced_"+LAN, "UID": "replaced_"+exp_tag, "simple_LM_method": True, 
                             "one_prefix_method": False, "two_prefix_method": False, "lexically_identical": False, 
                             "pair_id": str(cnt)}
            result.append(item_replaced)
            cnt +=1
                
        # One JSON object per line; folder and file name are concatenated directly.
        with open(folder + 'replaced_dm_dial_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')
                        
        # SHUFFLED    
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['shuffled'])            
            item_moved = {"sentence_good":good, "sentence_bad":bad, "field": "discourse_sem_dial", 
                             "linguistics_term": "shuffled_"+LAN, "UID": "shuffled_"+exp_tag, "simple_LM_method": True,
                             "one_prefix_method": False, "two_prefix_method": False, 
                             "lexically_identical": True, "pair_id": str(cnt)}
            result.append(item_moved)
            cnt +=1
                        
        with open(folder + 'shuffled_dm_dial_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')


    



# French dialogue data, attitudinal discourse markers: same procedure as the
# semantic-marker loop, reading FILE_DM_DIAL_ATT and replacing from DM_ATT.
# `alts` and `N_EXPS` are not defined in the cells shown here; alt[0] is the
# output-folder prefix, alt[1] the replacement for '#'.
# NOTE(review): with the current tuple-returning replace_dm / shuffle_utterance
# the re.sub calls on row['replaced_dm'] / row['shuffled'] would fail; this
# loop appears to be superseded by run_gen - confirm before re-running.
for alt in alts:
    folder = alt[0]    
    if not os.path.exists(folder):
        os.makedirs(folder)


    # The file handle is never closed explicitly.
    DM_EXAMPLES = [u.split('<$>') for u in open(FILE_DM_DIAL_ATT,'r', encoding = 'utf-8').readlines()]
    
    

    for i in range(N_EXPS):
        DM_DF = pd.DataFrame(DM_EXAMPLES,columns=['left','dm','right'])

        
        # Collapse runs of pause markers ("# # #") into a single one.
        DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex = True)
        DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex = True)    
        
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # '+  DM_DF['right'].str.strip()
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm,dmlist=DM_ATT,axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance,axis=1)
        # 'mistyped_dm' (replacement drawn from DM_SEM) is computed but never
        # written out in this loop; it still consumes random draws.
        DM_DF['mistyped_dm'] = DM_DF.apply(replace_dm,dmlist=DM_SEM,axis=1)

        # Computed before the inner loops below reuse (and overwrite) `i`.
        exp_tag = LAN+'_'+str(i)
        
        #REPLACED
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['replaced_dm'])            
            item_replaced = {"sentence_good":good,"sentence_bad":bad, "field": "discourse_att_dial", 
                             "linguistics_term": "replaced_"+LAN, "UID": "replaced_"+exp_tag, "simple_LM_method": True, 
                             "one_prefix_method": False, "two_prefix_method": False, "lexically_identical": False, 
                             "pair_id": str(cnt)}
            result.append(item_replaced)
            cnt +=1
                
        with open(folder + 'replaced_dm_dial_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')

        # SHUFFLED    
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['shuffled'])            
            item_moved = {"sentence_good":good, "sentence_bad":bad, "field": "discourse_att_dial", 
                             "linguistics_term": "shuffled_"+LAN, "UID": "shuffled_"+exp_tag, "simple_LM_method": True,
                             "one_prefix_method": False, "two_prefix_method": False, 
                             "lexically_identical": True, "pair_id": str(cnt)}
            result.append(item_moved)
            cnt +=1
                        
        with open(folder + 'shuffled_dm_dial_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')



In [141]:
# French monologue data, semantic discourse markers: writes "replaced" and
# "shuffled" pair files per entry of `alts` and per experiment index.
# LAN is not set in this cell and keeps its value from an earlier cell.
# `alts` and `N_EXPS` are not defined in the cells shown here; alt[0] is the
# output-folder prefix, alt[1] the replacement for '#'.
# NOTE(review): with the current tuple-returning replace_dm / shuffle_utterance
# the re.sub calls on row['replaced_dm'] / row['shuffled'] would fail; this
# cell appears to be superseded by run_gen - confirm before re-running.
FILE_DM_MONO_SEM = 'data_corpus/french_DM_sem_mono.txt'
FILE_DM_MONO_ATT = 'data_corpus/french_DM_att_mono.txt'

DM_SEM = ['donc','mais','alors','après'] #['donc','mais'] 
DM_ATT = ["ah", "ben", "oh", "enfin", "bon"]


for alt in alts:
    folder = alt[0]    
    if not os.path.exists(folder):
        os.makedirs(folder)


    # The file handle is never closed explicitly.
    DM_EXAMPLES = [u.split('<$>') for u in open(FILE_DM_MONO_SEM,'r', encoding = 'utf-8').readlines()]

    

    for i in range(N_EXPS):
                                                 
        DM_DF = pd.DataFrame(DM_EXAMPLES,columns=['left','dm','right'])

        #print(DM_DF['dm'])

        # Collapse runs of pause markers ("# # #") into a single one.
        DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex = True)
        DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex = True)    
        
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # '+  DM_DF['right'].str.strip()
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm,dmlist=DM_SEM,axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance,axis=1)
        
        # Computed before the inner loops below reuse (and overwrite) `i`.
        exp_tag = LAN+'_'+str(i)

        #REPLACED
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['replaced_dm'])            
            item_replaced = {"sentence_good":good,"sentence_bad":bad, "field": "discourse_sem_mono", 
                             "linguistics_term": "replaced_"+LAN, "UID": "replaced_"+exp_tag, "simple_LM_method": True, 
                             "one_prefix_method": False, "two_prefix_method": False, "lexically_identical": False, 
                             "pair_id": str(cnt)}
            result.append(item_replaced)
            cnt +=1
                
        with open(folder + 'replaced_dm_mono_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')
            
        # SHUFFLED    
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['shuffled'])            
            item_moved = {"sentence_good":good, "sentence_bad":bad, "field": "discourse_sem_mono", 
                             "linguistics_term": "shuffled_"+LAN, "UID": "shuffled_"+exp_tag, "simple_LM_method": True,
                             "one_prefix_method": False, "two_prefix_method": False, 
                             "lexically_identical": True, "pair_id": str(cnt)}
            result.append(item_moved)
            cnt +=1
                        
        with open(folder + 'shuffled_dm_mono_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')

# French monologue data, attitudinal discourse markers: same procedure as the
# semantic-marker loop, reading FILE_DM_MONO_ATT and replacing from DM_ATT.
# `alts` and `N_EXPS` are not defined in the cells shown here; alt[0] is the
# output-folder prefix, alt[1] the replacement for '#'.
# NOTE(review): with the current tuple-returning replace_dm / shuffle_utterance
# the re.sub calls on row['replaced_dm'] / row['shuffled'] would fail; this
# loop appears to be superseded by run_gen - confirm before re-running.
for alt in alts:
    folder = alt[0]    
    if not os.path.exists(folder):
        os.makedirs(folder)


    # The file handle is never closed explicitly.
    DM_EXAMPLES = [u.split('<$>') for u in open(FILE_DM_MONO_ATT,'r', encoding = 'utf-8').readlines()]
    
    

    for i in range(N_EXPS):
        DM_DF = pd.DataFrame(DM_EXAMPLES,columns=['left','dm','right'])

        
        # Collapse runs of pause markers ("# # #") into a single one.
        DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex = True)
        DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex = True)    
        
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # '+  DM_DF['right'].str.strip()
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm,dmlist=DM_ATT,axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance,axis=1)
        # 'mistyped_dm' (replacement drawn from DM_SEM) is computed but never
        # written out in this loop; it still consumes random draws.
        DM_DF['mistyped_dm'] = DM_DF.apply(replace_dm,dmlist=DM_SEM,axis=1)
        
        # Computed before the inner loops below reuse (and overwrite) `i`.
        exp_tag = LAN+'_'+str(i)

        
        #REPLACED
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['replaced_dm'])            
            item_replaced = {"sentence_good":good,"sentence_bad":bad, "field": "discourse_att_mono", 
                             "linguistics_term": "replaced_"+LAN, "UID": "replaced_"+exp_tag, "simple_LM_method": True, 
                             "one_prefix_method": False, "two_prefix_method": False, "lexically_identical": False, 
                             "pair_id": str(cnt)}
            result.append(item_replaced)
            cnt +=1
                
        with open(folder + 'replaced_dm_mono_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')
            
        # SHUFFLED    
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['shuffled'])            
            item_moved = {"sentence_good":good, "sentence_bad":bad, "field": "discourse_att_mono", 
                             "linguistics_term": "shuffled_"+LAN, "UID": "shuffled_"+exp_tag, "simple_LM_method": True,
                             "one_prefix_method": False, "two_prefix_method": False, 
                             "lexically_identical": True, "pair_id": str(cnt)}
            result.append(item_moved)
            cnt +=1
                        
        with open(folder + 'shuffled_dm_mono_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')

## English

In [142]:

# EN
# English dialogue data, semantic discourse markers: writes "replaced" and
# "shuffled" pair files per entry of `alts` and per experiment index.
# `alts` and `N_EXPS` are not defined in the cells shown here; alt[0] is the
# output-folder prefix, alt[1] the replacement for '#'.
# NOTE(review): with the current tuple-returning replace_dm / shuffle_utterance
# the re.sub calls on row['replaced_dm'] / row['shuffled'] would fail; this
# cell appears to be superseded by run_gen - confirm before re-running.
LAN = 'en'
FILE_DM_DIAL_SEM = 'data_corpus/english_DM_SEM_dial.txt'
FILE_DM_MONO_SEM = 'data_corpus/english_DM_SEM_mono.txt'
FILE_DM_DIAL_ATT = 'data_corpus/english_DM_ATT_dial.txt'
FILE_DM_MONO_ATT = 'data_corpus/english_DM_ATT_mono.txt'
DM_ATT = ['oh','like','well']
DM_SEM = ['so','but',"because", 'then']#then

for alt in alts:
    folder = alt[0]    
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Two fields per line here (left / right). The file handle is never closed explicitly.
    DM_EXAMPLES = [u.split('<$>') for u in open(FILE_DM_DIAL_SEM,'r', encoding = 'utf-8').readlines()]
        
    DM_DF = pd.DataFrame(DM_EXAMPLES,columns=['left','right'])
    # The marker is taken to be the first token of the right-hand field.
    DM_DF['dm'] = DM_DF.apply(get_dm,axis=1)
    
    for i in range(N_EXPS):
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # '+  DM_DF['right'].str.strip()
        # Only 'text' is lower-cased; replace_dm below works on the untouched
        # 'left' / 'right' columns.
        DM_DF['text'] = DM_DF['text'].apply(lowercase_except_I_and_contractions)

        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm,dmlist=DM_SEM,axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance,axis=1)

        
        # Computed before the inner loops below reuse (and overwrite) `i`.
        exp_tag = LAN+'_'+str(i)


        #REPLACED
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#" , alt[1], row['replaced_dm'])
            item_replaced = {"sentence_good":good,"sentence_bad":bad, "field": "discourse_sem_dial", 
                             "linguistics_term": "replaced_"+LAN, "UID": "replaced_"+exp_tag, "simple_LM_method": True, 
                             "one_prefix_method": False, "two_prefix_method": False, "lexically_identical": False, 
                             "pair_id": str(cnt)}
            result.append(item_replaced)
            cnt +=1
                
        with open(folder + 'replaced_dm_dial_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')
            
        # SHUFFLED    
        result = []
        cnt=0
        for i,row in DM_DF.iterrows():
            good = re.sub("( *)#", alt[1], row['text'])
            bad = re.sub("( *)#", alt[1], row['shuffled'])

            
            item_moved = {"sentence_good":good, "sentence_bad":bad, "field": "discourse_sem_dial", 
                             "linguistics_term": "shuffled_"+LAN, "UID": "shuffled_"+exp_tag, "simple_LM_method": True,
                             "one_prefix_method": False, "two_prefix_method": False, 
                             "lexically_identical": True, "pair_id": str(cnt)}
            result.append(item_moved)
            cnt +=1
                        
        with open(folder + 'shuffled_dm_dial_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
            for entry in result:
                json.dump(entry, outfile, ensure_ascii=False)
                outfile.write('\n')

# Build the English "discourse_att_dial" pairs from FILE_DM_DIAL_ATT.
# For every entry of `alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on '<$>' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    with open(FILE_DM_DIAL_ATT, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split('<$>') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)

    for i in range(N_EXPS):
        # Join the two parts around a ' # ' placeholder.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        DM_DF['text'] = DM_DF['text'].apply(lowercase_except_I_and_contractions)

        # Recomputed on every run, in this order (replace first, then shuffle).
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm, dmlist=DM_ATT, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)

        exp_tag = LAN + '_' + str(i)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub("( *)#", alt[1], row['text'])
                bad = re.sub("( *)#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_att_dial",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_dial_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')

In [143]:
# Build the English "discourse_sem_mono" pairs from FILE_DM_MONO_SEM.
# For every entry of `alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on '<$>' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    with open(FILE_DM_MONO_SEM, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split('<$>') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)

    for i in range(N_EXPS):
        # Join the two parts around a ' # ' placeholder.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        DM_DF['text'] = DM_DF['text'].apply(lowercase_except_I_and_contractions)

        # Recomputed on every run, in this order (replace first, then shuffle).
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm, dmlist=DM_SEM, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)

        exp_tag = LAN + '_' + str(i)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub("( *)#", alt[1], row['text'])
                bad = re.sub("( *)#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_sem_mono",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_mono_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')

# Build the English "discourse_att_mono" pairs from FILE_DM_MONO_ATT.
# For every entry of `alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on '<$>' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    with open(FILE_DM_MONO_ATT, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split('<$>') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)

    for i in range(N_EXPS):
        # Join the two parts around a ' # ' placeholder.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        DM_DF['text'] = DM_DF['text'].apply(lowercase_except_I_and_contractions)

        # Recomputed on every run, in this order (replace first, then shuffle).
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm, dmlist=DM_ATT, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)

        exp_tag = LAN + '_' + str(i)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub("( *)#", alt[1], row['text'])
                bad = re.sub("( *)#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_att_mono",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_mono_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')

## Mandarin

In [144]:

# ZH -- settings read by the Mandarin generation loops in this notebook.
LAN = 'zh'
FILE_DM_DIAL_SEM = 'data_corpus/mandarin_DM_SEM_dial.txt'
FILE_DM_MONO_SEM = 'data_corpus/mandarin_DM_SEM_mono.txt'
FILE_DM_DIAL_ATT = 'data_corpus/mandarin_DM_ATT_dial.txt'
FILE_DM_MONO_ATT = 'data_corpus/mandarin_DM_ATT_mono.txt'

# Marker tables passed as `dmlist` to replace_dm_dict. Each key maps to a list
# of other markers; no key appears in its own list.
# NOTE(review): presumably the list holds the substitutes allowed for the key --
# confirm against replace_dm_dict.
DM_ATT = {'喔': ['就是', '像', '就', '就是說'],
          '哦': ['就是', '像', '就', '就是說'],
          '就是': ['喔', '哦', '像',],
          '像': ['喔', '哦', '就是', '就', '就是說'],
          '就': ['喔', '哦', '像'],
          '就是說': ['喔', '哦', '像'],}


DM_SEM = {'所以':  ['但是','因為','然後', '可是', '不過', '而且'],
          '但是':  ['所以','因為','然後',  '而且'],
          '因為':  ['所以','但是','然後', '可是', '不過', '而且'],
          '然後':  ['所以','但是','因為', '可是', '不過', '而且'],
          '可是':  ['所以','因為','然後', '而且'],
          '不過':  ['因為','然後', '所以', '而且'],
          '而且':  ['所以','但是','因為','然後', '可是', '不過']
}

# Build the Mandarin "discourse_sem_dial" pairs from FILE_DM_DIAL_SEM.
# For every entry of `zh_alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in zh_alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on ' <$> ' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    with open(FILE_DM_DIAL_SEM, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split(' <$> ') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)
    # Collapse a run of space-separated '#' marks ("# # ...") into a single "# ".
    DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex=True)
    DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex=True)

    for i in range(N_EXPS):
        # Rebuilt from 'left'/'right' on every run because 'text' is overwritten
        # by its cleaned version further down.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        # Recomputed on every run, in this order (replace first, then shuffle).
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm_dict, dmlist=DM_SEM, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)

        exp_tag = LAN + '_' + str(i)

        # Pass every sentence column through clean_zh_space before export.
        for column in ('text', 'replaced_dm', 'shuffled'):
            DM_DF[column] = DM_DF.apply(clean_zh_space, col=column, axis=1)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub(" *#", alt[1], row['text'])
                bad = re.sub(" *#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_sem_dial",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_dial_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')


# Build the Mandarin "discourse_att_dial" pairs from FILE_DM_DIAL_ATT.
# For every entry of `zh_alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in zh_alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on '<$>' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    # NOTE(review): the other Mandarin loops split on ' <$> ' (with spaces);
    # confirm which separator matches this corpus file and what get_dm expects.
    with open(FILE_DM_DIAL_ATT, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split('<$>') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)
    # Collapse a run of space-separated '#' marks ("# # ...") into a single "# ".
    DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex=True)
    DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex=True)

    for i in range(N_EXPS):
        # Rebuilt from 'left'/'right' on every run because 'text' is overwritten
        # by its cleaned version further down.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        # Recomputed on every run, in this order.
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm_dict, dmlist=DM_ATT, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)
        # NOTE(review): 'mistyped_dm' is never exported by this block. It is kept
        # because replace_dm presumably draws from the shared `random` state, so
        # removing the call would change every file generated after it -- confirm
        # before deleting. DM_SEM is the dict table here, not a flat list.
        DM_DF['mistyped_dm'] = DM_DF.apply(replace_dm, dmlist=DM_SEM, axis=1)

        exp_tag = LAN + '_' + str(i)

        # Pass every sentence column through clean_zh_space before export.
        for column in ('text', 'replaced_dm', 'shuffled', 'mistyped_dm'):
            DM_DF[column] = DM_DF.apply(clean_zh_space, col=column, axis=1)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub(" *#", alt[1], row['text'])
                bad = re.sub(" *#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_att_dial",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_dial_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')


In [145]:
# Build the Mandarin "discourse_sem_mono" pairs from FILE_DM_MONO_SEM.
# For every entry of `zh_alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in zh_alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on ' <$> ' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    with open(FILE_DM_MONO_SEM, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split(' <$> ') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)
    # Collapse a run of space-separated '#' marks ("# # ...") into a single "# ".
    DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex=True)
    DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex=True)

    for i in range(N_EXPS):
        # Rebuilt from 'left'/'right' on every run because 'text' is overwritten
        # by its cleaned version further down.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        # Recomputed on every run, in this order (replace first, then shuffle).
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm_dict, dmlist=DM_SEM, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)

        exp_tag = LAN + '_' + str(i)

        # Pass every sentence column through clean_zh_space before export.
        for column in ('text', 'replaced_dm', 'shuffled'):
            DM_DF[column] = DM_DF.apply(clean_zh_space, col=column, axis=1)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub(" *#", alt[1], row['text'])
                bad = re.sub(" *#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_sem_mono",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_mono_sem_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')


# Build the Mandarin "discourse_att_mono" pairs from FILE_DM_MONO_ATT.
# For every entry of `zh_alts` and every run index in range(N_EXPS), two
# JSON-lines files are written: one pairing each sentence with its
# 'replaced_dm' version and one pairing it with its 'shuffled' version.
for alt in zh_alts:
    folder = alt[0]  # used as a plain string prefix for the output paths below
    os.makedirs(folder, exist_ok=True)

    # Split each line on ' <$> ' into its 'left' and 'right' parts. The `with`
    # block closes the corpus file instead of leaving the handle open.
    with open(FILE_DM_MONO_ATT, 'r', encoding='utf-8') as infile:
        DM_EXAMPLES = [u.split(' <$> ') for u in infile]

    DM_DF = pd.DataFrame(DM_EXAMPLES, columns=['left', 'right'])
    DM_DF['dm'] = DM_DF.apply(get_dm, axis=1)
    # Collapse a run of space-separated '#' marks ("# # ...") into a single "# ".
    DM_DF['left'] = DM_DF['left'].str.replace("#( #)+", "# ", regex=True)
    DM_DF['right'] = DM_DF['right'].str.replace("#( #)+", "# ", regex=True)

    for i in range(N_EXPS):
        # Rebuilt from 'left'/'right' on every run because 'text' is overwritten
        # by its cleaned version further down.
        DM_DF['text'] = DM_DF['left'].str.strip() + ' # ' + DM_DF['right'].str.strip()
        # Recomputed on every run, in this order (replace first, then shuffle).
        DM_DF['replaced_dm'] = DM_DF.apply(replace_dm_dict, dmlist=DM_ATT, axis=1)
        DM_DF['shuffled'] = DM_DF.apply(shuffle_utterance, axis=1)

        exp_tag = LAN + '_' + str(i)

        # Pass every sentence column through clean_zh_space before export.
        for column in ('text', 'replaced_dm', 'shuffled'):
            DM_DF[column] = DM_DF.apply(clean_zh_space, col=column, axis=1)

        # (file/UID prefix, column holding the bad sentence, "lexically_identical" flag)
        for variant, bad_column, same_words in (('replaced', 'replaced_dm', False),
                                                ('shuffled', 'shuffled', True)):
            result = []
            # enumerate() supplies pair_id without reusing the run index `i`.
            for pair_id, (_, row) in enumerate(DM_DF.iterrows()):
                # Every '#' together with the spaces before it becomes alt[1].
                good = re.sub(" *#", alt[1], row['text'])
                bad = re.sub(" *#", alt[1], row[bad_column])
                result.append({"sentence_good": good, "sentence_bad": bad, "field": "discourse_att_mono",
                               "linguistics_term": variant + "_" + LAN, "UID": variant + "_" + exp_tag,
                               "simple_LM_method": True, "one_prefix_method": False,
                               "two_prefix_method": False, "lexically_identical": same_words,
                               "pair_id": str(pair_id)})

            # One JSON object per line.
            with open(folder + variant + '_dm_mono_att_' + exp_tag + '.json', 'w', encoding='utf-8') as outfile:
                for entry in result:
                    json.dump(entry, outfile, ensure_ascii=False)
                    outfile.write('\n')

In [146]:
# Scratch cell: a bare literal; it assigns nothing and calls nothing.
1

1

In [147]:
# Scratch check of the marker-stripping regex: each '@' or '*' is removed
# together with any spaces that directly follow it.
bad = re.sub(
    '(@|\\*) *',
    '',
    "導致他們現在都不敢出來說*一講什麼，就吵啊把他那個解讀啊解成不一樣害他們都嚇到不是嗎",
)
bad

'導致他們現在都不敢出來說一講什麼，就吵啊把他那個解讀啊解成不一樣害他們都嚇到不是嗎'