In [35]:
import os
import re
import random
from collections import defaultdict
import argparse

In [16]:
def extractErrors(seq):
    """Lazily yield one ``(err_ch, cor_ch)`` tuple per annotation in ``seq``.

    An annotation is any single character immediately followed by one
    parenthesised character; for example ``'b(c)'`` yields ``('b', 'c')``.
    Matches are reported left to right and never overlap.
    """
    for hit in re.finditer(r'.\(.\)', seq):
        marked = hit.group()
        # `marked` is always four characters long: X ( Y )
        yield marked[0], marked[2]

In [17]:
def dataAction(seq, action):
    """Add ``seq`` to, or remove it from, the spelling-error bookkeeping.

    The module-level containers are updated in place:

    * ``testData_output[seq]`` - list of ``(err_ch, cor_ch)`` found in ``seq``.
    * ``testData_pair[pair]``  - set of sequences containing ``pair``, where
      ``pair`` is the two characters ordered so that ``pair[0] <= pair[1]``.
    * ``error_type[pair]``     - ``[n_forward, n_reverse]``; slot 0 counts
      errors written as ``pair[0]`` and corrected to ``pair[1]``, slot 1 the
      opposite direction.
    * ``discard``              - sequences not kept as spelling errors.

    Both actions are idempotent: appending a sequence that is already stored,
    or removing one that is not, leaves the counters untouched.

    Args:
        seq: Annotated sentence, e.g. ``'ab(c)d'``.
        action: ``'append'`` or ``'remove'``.

    Raises:
        ValueError: if ``action`` is neither ``'append'`` nor ``'remove'``.
    """
    global testData_pair, testData_output, error_type, discard

    if action == 'append':
        if seq in testData_output:
            # Already counted; a second pass would inflate error_type.
            return

        errors = []
        for err_ch, cor_ch in extractErrors(seq):
            if cor_ch > err_ch:
                pair, direction = (err_ch, cor_ch), 0
            else:
                pair, direction = (cor_ch, err_ch), 1
            error_type[pair][direction] += 1
            testData_pair[pair].add(seq)
            errors.append((err_ch, cor_ch))

        testData_output[seq] = errors
        # A sequence is either kept or discarded, never both.
        discard.discard(seq)

    elif action == 'remove':
        if seq in testData_output:
            for err_ch, cor_ch in extractErrors(seq):
                if cor_ch > err_ch:
                    pair, direction = (err_ch, cor_ch), 0
                else:
                    pair, direction = (cor_ch, err_ch), 1
                error_type[pair][direction] -= 1
                # set.discard tolerates a pair that occurs twice in `seq`.
                testData_pair[pair].discard(seq)
            del testData_output[seq]

        discard.add(seq)

    else:
        raise ValueError(
            "action must be 'append' or 'remove', got {!r}".format(action))

In [18]:
def loadData(labelFile):
    """Read the labelled sentence file and populate the global containers.

    Each non-blank line is ``<label><sentence>`` where the first character is
    the label. A first-seen sentence labelled ``'S'`` is registered through
    ``dataAction(seq, 'append')``; one labelled ``'E'`` is added to
    ``discard``. When a sentence reappears with a label different from the
    one most recently seen, it is recorded as confused, and if the previous
    label was ``'S'`` it is removed again via ``dataAction(seq, 'remove')``.

    Args:
        labelFile: Path to the UTF-8 label file.

    Returns:
        list: Sentences that appeared with conflicting labels, in file order.
        A sentence is listed once per label flip, so duplicates are possible.
    """
    confused = []
    last_label = dict()
    with open(labelFile, 'r', encoding='utf8') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Blank line: there is no label character to read.
                continue
            label, seq = line[0], line[1:]

            if seq in last_label:
                previous = last_label[seq]
                if label != previous:
                    confused.append(seq)
                    if previous == 'S':
                        dataAction(seq, 'remove')
            elif label == 'S':
                dataAction(seq, 'append')
            elif label == 'E':
                discard.add(seq)

            last_label[seq] = label

    return confused

In [19]:
# Path of an action log. NOTE: `logFile` is reassigned to
# './UDN_benchmark/UDN_action.log' in a later cell before loadLog() is called.
logFile = './UDN_benchmark/UDN_benchmark45.md'

def loadLog(logFile):
    """Replay a recorded action log against the global containers.

    Lines starting with ``'#'`` are skipped. Every other line is split on
    ``'|||'`` and dispatched on its first field:

    * ``append``      -> ``dataAction(field1, 'append')``
    * ``kick_remove`` -> ``dataAction(field1, 'remove')``
    * ``rank``        -> ``pairRanking[pair] = int(field2)``
    * ``unrank``      -> ``pairRanking[pair] = 0``
    * ``remove``, ``process``, ``kick_pair`` -> ignored

    Any other first field prints the line number and the fields, then stops
    reading the file.
    """
    no_op_actions = ('remove', 'process', 'kick_pair')

    with open(logFile, 'r', encoding='utf8') as fp:
        for lineno, raw in enumerate(fp, 1):
            if raw[0] == '#':
                continue

            fields = raw.strip().split('|||')
            op = fields[0]

            if op in no_op_actions:
                continue

            if op == 'append':
                dataAction(fields[1], 'append')
            elif op == 'kick_remove':
                dataAction(fields[1], 'remove')
            elif op == 'rank':
                ranked = tuple(fields[1].split(','))
                pairRanking[ranked] = int(fields[2])
            elif op == 'unrank':
                unranked = tuple(fields[1].split(','))
                pairRanking[unranked] = 0
            else:
                # Unknown record: report where it is and stop replaying.
                print(lineno, fields)
                break

In [20]:
def infoPrint():
    """Print summary statistics of the currently loaded benchmark data.

    Reads the module-level ``testData_output``, ``testData_pair``,
    ``error_type``, ``discard`` and ``pairRanking``; nothing is modified.
    With no spelling-error sentences loaded the two averages are reported as
    0.00 instead of raising ``ZeroDivisionError``.
    """
    n_spelling = len(testData_output)
    n_editor = len(discard)
    print('Total number of sequence = {}'.format(n_spelling + n_editor))
    print('Number of Spelling errors = {}'.format(n_spelling))
    print('Number of editor errors = {}'.format(n_editor))

    if n_spelling:
        # Lengths are those of the raw keys, i.e. including the "(x)" markup.
        avg_len = sum(len(seq) for seq in testData_output) / n_spelling
        avg_err = sum(len(errs) for errs in testData_output.values()) / n_spelling
    else:
        avg_len = avg_err = 0.0
    print('Average length of spelling error sentences = {:.2f}'.
          format(avg_len))
    print('Average error in spelling error sentence = {:.2f}'.
          format(avg_err))

    # Pairs seen in both directions count as two distinct directed pairs.
    mutual_case = sum(1 for count in error_type.values()
                      if count[0] != 0 and count[1] != 0)

    error_sort = sorted(error_type.items(), key=lambda x: sum(x[1]), reverse=True)

    # NOTE(review): len(testData_pair) also counts pairs whose set is empty;
    # confirm whether fully removed pairs should still be counted here.
    print('Number of unique pairs = {}'.format(len(testData_pair) + mutual_case))
    for idx, t in enumerate(error_sort[:10], 1):
        print('Top {} count in pair {di[0]}/{di[1]}'.format(idx, di=t))

    print('===== Ranking =====')
    if not pairRanking:
        print('Ranking not load...')
    else:
        print('Number of unique pairs = {}'.format(len(pairRanking)))
        sc = list(pairRanking.values())
        # sorted() makes the listing order independent of set iteration order.
        for r in sorted(set(sc)):
            print('Case {} have {}'.format(r, sc.count(r)))
            

In [21]:
def confusedLabelClassify(confused):
    """Interactively decide what to do with each conflicting-label sentence.

    For every sentence in ``confused`` the user is prompted until they answer
    ``;`` (keep: ``dataAction(seq, 'append')`` and an ``append|||`` record is
    logged) or ``'`` (drop: only a ``remove|||`` record is logged). Records
    are appended to ``UDN_benchmark_log`` under a section header.

    Args:
        confused: Iterable of annotated sentences, as returned by loadData().
    """
    # `with` guarantees the log is flushed and closed even if the session is
    # interrupted (e.g. KeyboardInterrupt inside input()).
    with open(UDN_benchmark_log, 'a', encoding='utf8') as fid:
        fid.write('### Conufsed Label Classify \n')

        for seq in confused:
            while True:
                ans = input('{} want to add? y;/n\'\t'.format(seq)).lower()
                if ans == ';':
                    dataAction(seq, 'append')
                    fid.write('append|||{}\n'.format(seq))
                    break
                elif ans == '\'':
                    fid.write('remove|||{}\n'.format(seq))
                    break

In [22]:
def mutualCase():
    """Interactively review pairs that occur as errors in both directions.

    A pair is "mutual" when both counters in ``error_type[pair]`` are
    non-zero. For each such pair its sentences and both counts are printed
    and the user answers:

    * ``;`` - process the pair: a ``process|||`` record is logged, then each
      sentence is confirmed individually (``'`` removes it, ``;`` keeps it).
    * ``'`` - kick the pair: a ``kick_pair|||`` record is logged and every
      sentence of the pair is removed.

    A free-text comment is logged as ``#### <comment>`` right after the
    decision record. All records are appended to ``UDN_benchmark_log``.
    """
    # Snapshot of the mutual pairs; `count` is still the live counter list.
    mutual_case = [(pair, count) for pair, count in error_type.items()
                   if count[0] != 0 and count[1] != 0]

    # `with` closes the log even when the interactive session is interrupted.
    with open(UDN_benchmark_log, 'a', encoding='utf8') as fid:
        fid.write('### Mutual Error Case \n')

        for pair, count in mutual_case:
            allSeqs = list(testData_pair[pair])
            print('==========')
            print('\n'.join(allSeqs))
            print('{}->{} {}'.format(pair[0], pair[1], count[0]))
            print('{}->{} {}'.format(pair[1], pair[0], count[1]))

            while True:
                action = input('Want to process? Y;/N\'\t').lower()
                if action == ';':
                    keep_pair = True
                    fid.write('process|||{}\n'.format(pair))
                    break
                elif action == '\'':
                    keep_pair = False
                    fid.write('kick_pair|||{}\n'.format(pair))
                    break

            fid.write('#### {}\n'.format(input('Comment: ')))

            if keep_pair:
                # Pair kept: confirm every sentence one by one.
                for seq in allSeqs:
                    while True:
                        action = input('{} Store? Y;/N\'\t'.format(seq)).lower()
                        if action == '\'':
                            dataAction(seq, 'remove')
                            fid.write('kick_remove|||{}\n'.format(seq))
                            break
                        elif action == ';':
                            break
            else:
                # Pair rejected: drop all of its sentences.
                for seq in allSeqs:
                    dataAction(seq, 'remove')
                    fid.write('kick_remove|||{}\n'.format(seq))

In [23]:
def rankPair():
    """Interactively assign a rank (0-5) to every unranked directed pair.

    For each pair in ``error_type`` and each direction with a non-zero count,
    the directed pair ``p`` is skipped when it is already in ``pairRanking``.
    Otherwise up to five example sentences are printed and the user enters a
    number:

    * ``0``   - an ``unrank|||`` record is logged, every sentence containing
      ``p`` is removed (``kick_remove|||`` records) and ``pairRanking[p]`` is
      set to 0, matching what loadLog() does when it replays ``unrank``.
    * ``1-5`` - a ``rank|||`` record is logged and ``pairRanking[p]`` is set.
    * other input - the prompt is repeated.

    An optional comment is logged as ``#### <pair> <comment>``. The log is
    reopened for every pair, so each decision is on disk before the next
    prompt appears.
    """
    with open(UDN_benchmark_log, 'a', encoding='utf8') as fid:
        fid.write('### Rank Pair \n')

    # Snapshot the items so the dict is not iterated while dataAction() runs.
    for pair, nums in list(error_type.items()):
        for idx in range(2):
            # `nums` is the live counter list, so removals made for an
            # earlier direction are reflected here.
            if nums[idx] == 0:
                continue

            p = (pair[0], pair[1]) if idx == 0 else (pair[1], pair[0])
            if p in pairRanking:
                continue

            seqs = [seq for seq in testData_pair[pair] if p in testData_output[seq]]
            print('======')
            print(p, nums[idx])
            print('\n'.join(seqs[:5]))

            # The log is opened only once a decision is needed and is always
            # closed, including on the early exits above.
            with open(UDN_benchmark_log, 'a', encoding='utf8') as fid:
                while True:
                    action = input('Rank {}-{} 0(Remove)-5(Good)\t'.format(p[0], p[1]))
                    if not action.isdigit():
                        continue

                    action = int(action)
                    if action <= 0:
                        fid.write('unrank|||{}\n'.format(','.join(p)))
                        for seq in seqs:
                            fid.write('kick_remove|||{}\n'.format(seq))
                            dataAction(seq, 'remove')
                        # Keep the live state identical to a later log replay.
                        pairRanking[p] = 0
                        break
                    elif action <= 5:
                        fid.write('rank|||{}|||{}\n'.format(','.join(p), action))
                        pairRanking[p] = action
                        break
                    # Values above 5 fall through and the prompt repeats.

                action = input('Need comment? Y; ')
                if action == ';':
                    fid.write('#### {} {}\n'.format(pair, input('comment: ')))

In [24]:
def outputSeperate(seq):
    """Strip the ``(x)`` correction markers from an annotated sentence.

    Args:
        seq: Annotated sentence such as ``'ab(c)de(f)g'``.

    Returns:
        tuple: ``(clean, error_info)`` where ``clean`` is ``seq`` without the
        markers (``'abdeg'``) and ``error_info`` is a list of
        ``(position, corrected_char)``. ``position`` is the number of clean
        characters up to and including the wrong one, i.e. its 1-based index
        in ``clean`` (``[(2, 'c'), (4, 'f')]``).
    """
    markers = re.findall(r'\(.\)', seq)
    # Split on the markers themselves rather than via a '|||' placeholder, so
    # a sentence that already contains '|||' is not mis-segmented.
    segments = re.split(r'\(.\)', seq)

    error_info = []
    position = 0
    # zip() stops at the last marker; the trailing segment carries no error.
    for segment, marker in zip(segments, markers):
        position += len(segment)
        error_info.append((position, marker[1]))

    return ''.join(segments), error_info

In [183]:
def generateTestData(pairRanking, select):
    """Collect clean test sentences for every pair whose rank is selected.

    For each directed pair in ``pairRanking`` with a rank in ``select``, the
    sentences stored under that pair (in either key order) in the global
    ``testData_pair`` are scanned. A sentence is kept when it contains the
    directed pair, is longer than ``SEQLEN`` characters and contains neither
    ``'\\xa0'`` nor ``'http'``.

    Args:
        pairRanking: Mapping ``(err_ch, cor_ch) -> rank``.
        select: Container of ranks to include.

    Returns:
        dict: ``clean_sentence -> error_info`` as produced by
        outputSeperate(). Sentences with the same clean text overwrite each
        other.
    """
    SEQLEN = 5

    output_seq = dict()
    for pair, r in pairRanking.items():
        if r not in select:
            continue

        # testData_pair is keyed by the ordered pair; try both orientations.
        key = pair if pair in testData_pair else (pair[1], pair[0])
        # .get() avoids inserting an empty entry into the defaultdict when
        # neither orientation is present.
        seqs = testData_pair.get(key, ())

        for seq in list(seqs):
            if len(seq) <= SEQLEN or '\xa0' in seq or 'http' in seq:
                continue
            if pair not in extractErrors(seq):
                continue
            clean_seq, error_info = outputSeperate(seq)
            output_seq[clean_seq] = error_info

    print('Output {} sentences with {} errors'.format(
        len(output_seq), sum(len(es) for es in output_seq.values()) ))

    return output_seq

In [195]:
def dumpTest(output_seq, output_file):
    """Write test items to ``output_file``, one ``sentence|||info`` per line.

    Args:
        output_seq: Iterable whose items are either
            ``(sentence, [(position, char), ...])`` pairs, written as
            ``sentence|||pos, char, pos, char``, or anything else (typically
            a plain correct sentence), written as ``item|||``.
        output_file: Destination path; overwritten, UTF-8 encoded.
    """
    with open(output_file, 'w', encoding='utf8') as wp:
        for item in output_seq:
            # A plain string of length 2 must not be mistaken for a
            # (sentence, error_info) pair, hence the explicit type check.
            if not isinstance(item, str) and len(item) == 2:
                seq, error_info = item
                error_infoStr = ', '.join('{}, {}'.format(idx, ch)
                                          for idx, ch in error_info)
                wp.write('{}|||{}\n'.format(seq, error_infoStr))
            else:
                wp.write('{}|||\n'.format(item))

In [26]:
def garbageDump(filename):
    """Write every sentence in the global ``discard`` set to ``filename``.

    The count is printed first. Sentences are written one per line in sorted
    order, so the file is identical from run to run; there is no trailing
    newline.

    Args:
        filename: Destination path; overwritten, UTF-8 encoded.
    """
    print('Editor error sentences = {}'.format(len(discard)))
    with open(filename, 'w', encoding='utf8') as fp:
        # Set iteration order is not stable across interpreter runs.
        fp.write('\n'.join(sorted(discard)))

In [27]:
def testDataErrorInfo(logFile, error_infoFile):
    """Write a tab-separated per-pair summary (rank, count, comment).

    Ranks and counts come from the global ``pairRanking`` / ``error_type``;
    comments are recovered from the ``####`` lines of the action log. Rows
    are sorted by rank, then count, both descending. Each row is
    ``err_ch<TAB>cor_ch<TAB>rank<TAB>count[<TAB>comment]``.

    Args:
        logFile: Path of the action log to scan for comments.
        error_infoFile: Destination path; overwritten, UTF-8 encoded.
    """
    def errorInfo():
        """Return ``{pair: [rank, count // 2]}`` for every pair of rank != 0."""
        output = dict()
        for pair, r in pairRanking.items():
            if r == 0:
                continue
            if pair in testData_pair:
                # `pair` is in key order, so slot 0 is its direction.
                cnt = error_type.get(pair, (0, 0))[0]
            else:
                p = (pair[1], pair[0])
                # .get() avoids inserting a new key into the defaultdict.
                cnt = error_type.get(p, (0, 0))[1]

            # NOTE(review): the reason the count is halved is not visible in
            # this file - confirm it is intended.
            output[pair] = [r, cnt//2]

        return output

    def commentInfo(logFile):
        """Return ``{pair: comment}`` parsed from the log file."""
        comment_dict = dict()
        with open(logFile, 'r', encoding='utf8') as fp:
            all_seqs = fp.readlines()
        total = len(all_seqs)

        for cur_idx in range(total):
            if all_seqs[cur_idx][0] == '#':
                continue

            lst = all_seqs[cur_idx].strip().split('|||')

            if lst[0] == 'process' or lst[0] == 'kick_pair':
                # lst[1] is the repr of a 2-tuple of single characters,
                # "('a', 'b')": the characters sit at offsets 2 and 7.
                pair = (lst[1][2], lst[1][7])

                # The comment is the following line minus its leading '####';
                # a record on the very last line simply has no comment.
                if cur_idx + 1 < total:
                    comment = ' '.join(all_seqs[cur_idx+1].strip().split()[1:])
                else:
                    comment = ''

                comment_dict[pair] = comment
                comment_dict[(pair[1], pair[0])] = comment

            elif lst[0][0:2] == 'ra' or lst[0][0:2] == 'un':
                pair = tuple(lst[1].split(','))

                check_ptn = ['#', 'r', 'u']

                # Find the next line starting with '#', 'r' or 'u'. The scan
                # is bounded so a record with no such follower ends cleanly
                # at end of file instead of raising IndexError.
                next_line = None
                for scan_idx in range(cur_idx + 1, total):
                    if all_seqs[scan_idx][0] in check_ptn:
                        next_line = all_seqs[scan_idx]
                        break

                # NOTE(review): any '#' line is accepted here, including the
                # '### ...' section headers - confirm that is intended.
                if next_line is not None and next_line[0] == '#':
                    # Drop '####' and the two tokens of the printed pair.
                    comment = ' '.join(next_line.strip().split()[3:])

                    if pair in comment_dict:
                        comment_dict[pair] += comment
                    else:
                        comment_dict[pair] = comment

        return comment_dict

    error_info = errorInfo()
    comment_info = commentInfo(logFile)

    for pair, comment in comment_info.items():
        if pair in error_info:
            error_info[pair].append(comment)
        else:
            # Commented pair with no non-zero rank: rank 0, count 0.
            error_info[pair] = [0, 0, comment]

    _sort = sorted(error_info.items(), key=lambda x: (-x[1][0], -x[1][1]))

    with open(error_infoFile, 'w', encoding='utf8') as wp:
        for pair, info in _sort:
            wp.write('{p[0]}\t{p[1]}\t{s}\n'.format(
                p=pair, s='\t'.join([str(i) for i in info])))

In [28]:
### Global state shared (and mutated in place) by the functions above.
# Ordered pair (pair[0] <= pair[1]) -> set of annotated sentences containing it.
testData_pair = defaultdict(set)
# Annotated sentence -> list of (err_ch, cor_ch) tuples extracted from it.
testData_output = dict()
# Ordered pair -> [count in pair[0]->pair[1] direction, count in reverse direction].
error_type = defaultdict(lambda :[0,0])
# Sentences labelled 'E' or removed again by dataAction(seq, 'remove').
discard = set()
# Directed pair (err_ch, cor_ch) -> integer rank; 0 is stored for "unrank".
pairRanking = dict()
# Log file that the interactive helpers append their decisions to.
UDN_benchmark_log = './UDN_benchmark/UDN_benchmark45.md'
# Input file read by loadData(): one "<label><sentence>" per line.
labelFile = 'withError_label.txt'

In [29]:
# Load the labelled sentences; `confused` holds those with conflicting labels.
confused = loadData(labelFile)
# Overrides the `logFile` value assigned in an earlier cell.
logFile = './UDN_benchmark/UDN_action.log'
# Replay the recorded decisions, then print the resulting statistics.
loadLog(logFile)
infoPrint()

Total number of sequence = 3626
Number of Spelling errors = 1332
Number of editor errors = 2294
Average length of spelling error sentences = 20.48
Average error in spelling error sentence = 1.03
Number of unique pairs = 660
Top 1 count in pair ('劃', '畫')/[133, 0]
Top 2 count in pair ('周', '週')/[0, 87]
Top 3 count in pair ('佈', '布')/[84, 0]
Top 4 count in pair ('份', '分')/[77, 2]
Top 5 count in pair ('愈', '越')/[0, 63]
Top 6 count in pair ('佔', '占')/[50, 0]
Top 7 count in pair ('台', '臺')/[0, 49]
Top 8 count in pair ('祕', '秘')/[0, 41]
Top 9 count in pair ('汙', '污')/[0, 31]
Top 10 count in pair ('作', '做')/[19, 11]
===== Ranking =====
Number of unique pairs = 633
Case 0 have 162
Case 1 have 36
Case 2 have 16
Case 3 have 144
Case 4 have 268
Case 5 have 7


In [184]:
# Ranks to include in the benchmark.
select = list(range(2, 6))
output_file = 'UDN_benchmark/UDN_testdata.txt'
output_seq = generateTestData(pairRanking, select)

# Optional manual addition ("Make 650"), kept disabled:
# output_seq['專注的表清帶著感恩，'] = [(5,'情')]

# Flatten the mapping into a list of (sentence, error_info) tuples.
output_error_lst = list(output_seq.items())

Output 1217 sentences with 1255 errors


In [186]:
def correctSelect(correct_filename, max_seqs=2500):
    """Collect up to ``max_seqs`` usable sentences from ``correct_filename``.

    Each line is split on ``', '`` and the second field (stripped) is taken
    as the sentence; lines without a second field are skipped. A sentence is
    kept when it is longer than 8 characters and contains none of ``'●'``,
    ``'【'``, ``'\\xa0'`` or ``'http'``.

    Reading stops at the limit or at end of file, whichever comes first, so a
    file with too few usable lines no longer spins forever.

    Args:
        correct_filename: Path to the UTF-8 input file.
        max_seqs: Maximum number of sentences to return (default 2500).

    Returns:
        list: The selected sentences, in file order.
    """
    correct_seqs = []
    with open(correct_filename, 'r', encoding='utf8') as fp:
        for raw in fp:
            if len(correct_seqs) >= max_seqs:
                break

            fields = raw.strip().split(', ')
            if len(fields) < 2:
                continue
            line = fields[1].strip()

            if ('●' not in line and '【' not in line and len(line) > 8
                    and '\xa0' not in line and 'http' not in line):
                correct_seqs.append(line)

    return correct_seqs

In [187]:
# Use as many correct sentences as there are error sentences.
CORRECTCHOOSE = len(output_error_lst)
correct_filename = './extractUDN/withError_correct.txt'
# Fixed seed on a private generator, so Restart & Run All reproduces the
# same sample without touching the state of the global `random` module.
CORRECT_SAMPLE_SEED = 42
output_correct_lst = correctSelect(correct_filename)
random.Random(CORRECT_SAMPLE_SEED).shuffle(output_correct_lst)
output_correct_lst = output_correct_lst[:CORRECTCHOOSE]

In [190]:
# New list: all correct sentences followed by all (sentence, error_info) items.
output_lst = [*output_correct_lst, *output_error_lst]

In [191]:
# Mix correct and erroneous items in place. A private, seeded generator keeps
# the written order reproducible across kernel restarts.
OUTPUT_ORDER_SEED = 42
random.Random(OUTPUT_ORDER_SEED).shuffle(output_lst)

In [196]:
# Write the mixed list to `output_file`, one "sentence|||error info" per line.
dumpTest(output_lst, output_file)

In [197]:
# Write the sentences held in the global `discard` set to their own file.
garbage_file = 'UDN_benchmark/UDN_editorSequence.txt'
garbageDump(garbage_file)

Editor error sentences = 2294


In [198]:
# Write the per-pair summary; `logFile` is the action log assigned in an earlier cell.
error_infoFile = 'UDN_benchmark/UDN_errorInfo.txt'
testDataErrorInfo(logFile, error_infoFile)