In [1]:
from collections import defaultdict
import re
import csv
import pandas as pd
import os

In [2]:
def outputprepost(ptable, select_col):
    """Count how often each combination of ``select_col`` values occurs.

    Parameters
    ----------
    ptable : pandas.DataFrame
        Table to aggregate.
    select_col : list of str
        Column names to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct group, indexed by the group keys, with a single
        ``'count'`` column holding the group size, ordered from the most to
        the least frequent group.
    """
    group_sizes = ptable.groupby(select_col).size()
    counts = group_sizes.to_frame(name='count')
    return counts.sort_values(by='count', ascending=False)

In [21]:
def extract_prepost(input_filename, groundTruth_filename, kick_ptn='', specialSelect=False):
    """Collect every annotated error character together with its context.

    The two files are read in parallel, one line per sequence:

    * ``input_filename`` lines have the form ``<id>|||<sequence>``.
    * ``groundTruth_filename`` lines have the form
      ``<id>|||<pos>|||<correction>|||<pos>|||<correction>...`` where ``pos``
      is the 1-based position of the erroneous character in the sequence.

    For each (position, correction) pair the erroneous character and the two
    characters on either side of it are recorded. A pair is skipped when

    * the position lies outside the sequence (a diagnostic line is printed),
    * the character at that position already equals the correction,
    * any of the four context characters is missing (error too close to an
      edge of the sequence) or the correction is the literal ``'NAN'``.

    Parameters
    ----------
    input_filename : str
        Path of the UTF-8 sequence file.
    groundTruth_filename : str
        Path of the UTF-8 ground-truth file.
    kick_ptn : str, optional
        Regular expression tested against the error character and the
        correction. The empty string (default) disables the filter.
    specialSelect : bool, optional
        ``False`` keeps only the pairs that do NOT match ``kick_ptn``;
        ``True`` keeps only the pairs that DO match it.

    Returns
    -------
    tuple
        ``(pTable, (n_sequences, n_pairs))`` where ``pTable`` is a DataFrame
        with the columns ``pre``, ``post``, ``corr``, ``error``, ``pre2`` and
        ``post2``, ``n_sequences`` is the number of sequence lines processed
        and ``n_pairs`` is ``len(pTable)``.

    Raises
    ------
    ValueError
        If a sequence line does not split into exactly two ``|||`` fields or
        a position is not an integer.
    """
    tmpdict = {'pre':[],'post':[],'corr':[],'error':[],'pre2':[],'post2':[]}
    # An empty pattern matches every string, so it stands for "no filter".
    KICKPTN = re.compile(kick_ptn) if kick_ptn else None
    # Counted explicitly: the loop index is undefined for empty input and is
    # one short of the real number of sequences after a complete run.
    n_sequences = 0

    with open(input_filename, 'r', encoding='utf8') as ip, \
        open(groundTruth_filename, 'r', encoding='utf8') as gtp:
        for line_idx, (seq_line, gt_line) in enumerate(zip(ip, gtp)):
            seqID, seq = seq_line.strip('\n').split('|||')
            # ^M case: the sequence field holds a single character and the
            # sequence itself is taken from the following line of the file.
            if len(seq) == 1:
                seq = ip.readline().strip('\n')

            gtlst = gt_line.strip('\n').split('|||')
            gtID, gt_info = gtlst[0], gtlst[1:]

            # The files are no longer aligned: report the pair and stop.
            if seqID != gtID:
                print(seq_line, gt_line)
                break
            n_sequences += 1

            for idx, corr_ch in zip(gt_info[::2], gt_info[1::2]):
                error_idx = int(idx)-1
                # Reject out-of-range positions up front. A negative index
                # would silently wrap to the end of the sequence, and a too
                # large one must not fall through with values left over from
                # the previous pair.
                if not 0 <= error_idx < len(seq):
                    print(line_idx, seq, error_idx)
                    continue

                error_ch = seq[error_idx]

                # Same fix
                if error_ch == corr_ch:
                    print('Same on {}: {}'.format(seqID, error_ch))
                    continue

                pre2_ch  = str(seq[error_idx-2]) if error_idx-2 >= 0 else 'NAN'
                pre_ch   = str(seq[error_idx-1]) if error_idx-1 >= 0 else 'NAN'
                post_ch  = str(seq[error_idx+1]) if error_idx+1 < len(seq) else 'NAN'
                post2_ch = str(seq[error_idx+2]) if error_idx+2 < len(seq) else 'NAN'

                # Remove NAN case
                if 'NAN' in (pre_ch, pre2_ch, post_ch, post2_ch, error_ch, corr_ch):
                    continue

                # KICK pattern
                ptnflag = KICKPTN is not None and bool(
                    KICKPTN.search(error_ch) or KICKPTN.search(corr_ch))

                # Keep the pair when the flag agrees with the selection mode.
                if not(specialSelect ^ ptnflag):
                    tmpdict['pre2'].append(pre2_ch)
                    tmpdict['pre'].append(pre_ch)
                    tmpdict['post'].append(post_ch)
                    tmpdict['post2'].append(post2_ch)
                    tmpdict['corr'].append(corr_ch)
                    tmpdict['error'].append(error_ch)

    pTable = pd.DataFrame(tmpdict)
    print('= Processing {} sequences'.format(n_sequences))
    print('= Processing {} pairs'.format(len(pTable)))
    print('Done.')

    return (pTable, (n_sequences, len(pTable)))


In [28]:
def main(input_filename, groundTruth_filename, kick_ptn, special_select, case_token):
    """Extract error/context pairs and write one frequency table per context window.

    The output folder is ``./extractUDN_new/<case_token>``. It receives
    ``info.txt`` with the two counts returned by ``extract_prepost`` and one
    tab-separated file ``<case_token>_<window>.csv`` per context window,
    each produced by ``outputprepost`` on the window's columns plus ``corr``.

    Parameters
    ----------
    input_filename : str
        Path of the sequence file, forwarded to ``extract_prepost``.
    groundTruth_filename : str
        Path of the ground-truth file, forwarded to ``extract_prepost``.
    kick_ptn : str
        Regular expression forwarded to ``extract_prepost``.
    special_select : bool
        Selection mode forwarded to ``extract_prepost``.
    case_token : str
        Name of the output sub-folder and prefix of every CSV file name.
    """
    case_folder = './extractUDN_new/{}'.format(case_token)
    print('File output to {}'.format(case_folder))
    # makedirs also creates a missing parent folder, and exist_ok avoids the
    # gap between a separate existence check and the creation.
    os.makedirs(case_folder, exist_ok=True)

    df, df_info = extract_prepost(input_filename, groundTruth_filename, kick_ptn, special_select)
    with open(os.path.join(case_folder, 'info.txt'), 'w') as wp:
        wp.write('Processing {} sequences\n'.format(df_info[0]))
        wp.write('Processing {} pairs'.format(df_info[1]))

    outputfilename = '{}_{}.csv'.format
    # Columns ordered by position around the error character; every window
    # below is a contiguous slice of this list that contains 'error'.
    ALLCOLUMN = ['pre2', 'pre', 'error', 'post', 'post2']
    column_select_dict = {
        'error':[ALLCOLUMN[2]],
        'preError':ALLCOLUMN[1:3],
        'errorPost':ALLCOLUMN[2:4],
        'pre2Error':ALLCOLUMN[:3],
        'preErrorPost':ALLCOLUMN[1:4],
        'errorPost2':ALLCOLUMN[2:],
        'pre2ErrorPost':ALLCOLUMN[:4],
        'preErrorPost2':ALLCOLUMN[1:],
        'pre2ErrorPost2':ALLCOLUMN
    }

    for key, select_columns in column_select_dict.items():
        # 'corr' is always part of the grouping key.
        newdf = outputprepost(df, select_columns+['corr'])
        newdf.to_csv(
            os.path.join(case_folder, outputfilename(case_token, key)),
            sep='\t')
    print('Done.')

In [30]:
if __name__ == '__main__':
    # Earlier, broader kick pattern kept for reference:
    # ptn = re.compile('[0-9A-Za-z：\–－\-•%％&（\(\）)\.\*\,、\/／\:\?_~∼˙‘’“”《「〞※」+＋＞→■○●─°・★〇℃éＯ．=＝…\s]')

    # Run configuration: pattern, selection mode and output folder name.
    kick_ptn = '[0-9]'
    case_token = 'num'
    special_select = False

    # Raw sequence file and its ground-truth annotations.
    groundTruth_filename = './extractUDN_new/all_gtraw.txt'
    input_filename = './extractUDN_new/all_seqraw.txt'

    main(
        input_filename=input_filename,
        groundTruth_filename=groundTruth_filename,
        kick_ptn=kick_ptn,
        special_select=special_select,
        case_token=case_token,
    )

File output to ./extractUDN_new/remove_numeng
= Processing 135927 sequences
= Processing 86858 pairs
Done.
Done.
