## Collect all confusion set

In [1]:
import os 
import pickle
import pandas as pd
import csv
from collections import defaultdict
from bs4 import BeautifulSoup
import xlrd
import multiprocessing

import random
import time 

## Big unihan

In [2]:
def bigUnihan_extract(filename):
    df = pd.read_csv(filename, sep='|', low_memory=False)
    df = df[['char','kFrequency']].set_index('char')
    df = df[~pd.isnull(df.kFrequency)]
    
    return df.to_dict()['kFrequency']

## SIGHAN_char_information

In [3]:
def shape_compare_SIGHAN(ch_x, ch_y):
    '''
    return (similar, 同部首同筆畫數)
    '''
    cands1 = shape_SIGHAN.get(ch_x, [])
    try:
        cands2 = sound_SIGHAN.loc[ch_x].同部首同筆畫數
        if type(cands2)==float:
            cands2 = []
    except KeyError:
        cands2 = []
    
    out1 = 1 if ch_y in cands1 else 0
    out2 = 1 if ch_y in cands2 else 0
    
    return (out1, out2)

In [77]:
def sound_compare_SIGHAN(ch_x, ch_y):
    '''
    4. 同音同調
    3. 同音異調
    2. 近音同調
    1. 近音異調
    0 Not Found  
    '''
    try:
        row = sound_SIGHAN.loc[ch_x]
        for idx, col in enumerate(row[:-1]):
            if type(col)==str and col.find(ch_y)!=-1:
                return 4-idx
                break
        else:
            return 0
    except KeyError:
        return 0

In [78]:
sound_SIGHAN.loc['相']

同音同調                                         湘像襄嚮鄉鑲橡瓖廂巷象香箱項向
同音異調                               祥詳襄降翔鑲橡巷廂享餉響像湘嚮鄉庠瓖象饗香箱想項向
近音同調       信心彊燼芯馨降筋縉漿進韁訢醬巾新津噤莘儘匠疆覲斤近昕今欣江鋅矜薑勁襟觔絳僵盡金晉禁僅薪浸辛姜釁將強
近音異調       信彊心燼馨芯降筋縉漿謹進韁獎醬訢巾新噤津儘莘匠疆覲斤講槳近錦昕今欣江鋅矜薑勁襟觔瑾絳僵盡饉金...
同部首同筆畫數                                              眉看省盼盾盹眇
Name: 相, dtype: object

In [79]:
sound_compare_SIGHAN('相','向')

4

## unihan.csv (注音跟倉頡)

In [5]:
def unihan_extract(unihan_filename):
    global dicBPMF, dicPhone, dicCangjie, dicCangjie2char
    with open(unihan_filename, 'r', encoding='utf8') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in spamreader:
            row = [cell for cell in row] # unicode
            char, bpmf, cangjie, components, jp, kr, name, pinyin_chs, pinyin_cht, char_strokes_count, radical, radical_name, radical_strokes_count = row
            for ph in bpmf.split(): # 發音
                dicBPMF[char] += [ph]
                dicPhone[ph] += [char]
            for cj in cangjie.split(): # 倉頡碼
                if u"難" in cj: continue
                for i in range(0, 3):
                    if i == len(cj): continue
                    # dicBPMF[char]['cangjie'] += [cj[:i]]
                    dicCangjie[char] += [cj[i:]]
                    # dicCangjie[cj[:i]] += [char]
                    dicCangjie2char[cj[i:]] += [char]

## Big Unihan.csv

## zwt.titles.txt (字典)

In [6]:
def zwtTitle_train(lines):
    d = defaultdict(lambda: 0)
    for word in lines:
        d[word.strip()] += 1
    #d[word.strip().decode('utf-8')[:2]] += 1
    #print word.strip().decode('utf-8')[:2]
    return d

## radical.txt (部首)

In [7]:
def radicalDic(lines):
    dicRadicalnum = defaultdict(list)
    dicRadical = defaultdict(list)
    for line in lines:
        for char in line[5:].strip().split('|'):
            dicRadical[char] += [line[:4]]
            dicRadicalnum[line[:4]] += [char]
    return dicRadicalnum, dicRadical

In [8]:
def shape_similar(char):
    return list(set(ch for rnum in dicRadical[char] for ch in dicRadicalnum[rnum]))

# Tone extraction

In [9]:
def sound_extract_same(char):
    '''
    Same neutral and tone
    '''
    return list(set(ch for ph in dicBPMF[char] for ch in dicPhone[ph]))

In [10]:
def sound_extract_tone(char):
    '''
    the char of different tone 
    '''
    output = set()
    tones = ['ˊ', 'ˇ', 'ˋ', '˙']
    for ph in dicBPMF[char]:
        if ph[-1] in tones:
            for t in tones:
                if t == ph[-1]: continue
                output = output.union(dicPhone[ph[:-1]+t])
        else:
            for t in tones:
                output = output.union(dicPhone[ph+t])
    return output
        

In [11]:
def sound_extract_finalConsonant(char, toneKeep=True):
    '''
    單：ㄚㄛㄜㄝ
    複：ㄞㄟㄠㄡ
    鼻：ㄢㄣㄤㄥ
    捲舌：ㄦ
    '''
    output = set()
    tones = ['','ˊ', 'ˇ', 'ˋ', '˙']
    consonants = [
        ['ㄚ','ㄛ','ㄜ','ㄝ'],
        ['ㄞ','ㄟ','ㄠ','ㄡ'],
        ['ㄢ','ㄣ','ㄤ','ㄥ']
    ]
    
    for ph in dicBPMF[char]:
        # Tone delete
        if ph[-1] in tones:
            neutral, tone = ph[:-1], ph[-1]
        else:
            neutral, tone = ph,''
        
        # Add relevent consonant
        for cons in consonants:
            if neutral[-1] in cons:
                new_neutrals = set(neutral[:-1] + c for c in cons if c!=neutral[-1])
                for n in new_neutrals:
                    if toneKeep:
                        output = output.union(dicPhone[n+tone])
                    else:
                        for t in tones:
                            output = output.union(dicPhone[n+t])
                break
                               
    return output

In [12]:
def sound_extract_similartConsonant(char, toneKeep=True):
    '''
    一次只針對一種，不會並用
    Initial
    ㄈㄏ
    ㄋㄌ
    ㄓㄗ
    ㄔㄘ
    Final:
    ㄢㄤ
    ㄜㄦ
    ㄣㄥ
    Intermediate:
    ㄧㄩ
    '''
    new_neutrals = set()
    output = set()
    tones = ['','ˊ', 'ˇ', 'ˋ', '˙']
    initial_pairs = [
        ['ㄈ','ㄏ'],
        ['ㄋ','ㄌ'],
        ['ㄓ','ㄗ'],
        ['ㄔ','ㄘ']
    ]
    final_pairs = [
        ['ㄢ','ㄤ'],
        ['ㄜ','ㄦ'],
        ['ㄣ','ㄥ']
    ]
    inter_pairs = [['ㄧ','ㄩ']]
    
    for ph in dicBPMF[char]:        
        # Tone delete
        if ph[-1] in tones:
            neutral, tone = ph[:-1], ph[-1]
        else:
            neutral, tone = ph, ''
            
        # Initial-consonant, just pick one 
        for cons in initial_pairs:
            if neutral[0] in cons:
                new_neutrals = new_neutrals.union(c + neutral[1:] + tone for c in cons if c!=neutral[0])
                break        
        
                    
        # Final-consonant       
        for cons in final_pairs:
#             print(neutral[-1], cons)
            if neutral[-1] in cons:
#                 print('i', cons)
#                 print(neutral[:-1])
#                 print(list(neutral[:-1] + c for c in cons if c!=neutral[-1]))
                new_neutrals = new_neutrals.union(neutral[:-1] + c + tone for c in cons if c!=neutral[-1])
                break
        
        # Inter_
        for cons in inter_pairs:
            for idx, tmp in enumerate(neutral):
                if tmp in cons:
                    new_neutrals = new_neutrals.union(neutral[:idx] + c + neutral[idx+1:] + tone for c in cons if c!=tmp)
                    break
    
    ######## fIX TOne pRoblEm
#     print(new_neutrals)
    # Get candidate based on new_neutrals
    for n in new_neutrals:
        if toneKeep:
            output = output.union(dicPhone[n])
        else:
            tmp = n[:-1] if n[-1] in tones else n            
            for t in tones:
                output = output.union(dicPhone[n+t])
#         print(n,' '.join(output))
        
#     print(len(output))                     
    return output if len(output)>0 else []

# Cangjie Extraction (from UNIHAN)

In [13]:
def cangjie_extract_same(char):
    cang = dicCangjie[char]
    if len(cang) > 0:
        output = set(dicCangjie2char[cang[0]])
        output.remove(char)
    else:
        output = set()
    
    return list(output)

In [14]:
def cangjie_compare_unihan(ch_x,ch_y):
    '''
    Compare the cangjie between two character
    applied LCS to check whether the two chars have similar cangjie code 
    '''
    
    def lcs(xstr, ystr):
        """
        >>> lcs('thisisatest', 'testing123testing')
        'tsitest'
        """
        if not xstr or not ystr:
            return ""
        x, xs, y, ys = xstr[0], xstr[1:], ystr[0], ystr[1:]
        if x == y:
            return x + lcs(xs, ys)
        else:
            return max(lcs(xstr, ys), lcs(xs, ystr), key=len)
    
    cang_x = dicCangjie.get(ch_x,[])
    cang_y = dicCangjie.get(ch_y,[])
    
    if len(cang_x)==0 or len(cang_y)==0:
        return 0
    else:
        cang_x == cang_x[0]
        cang_y == cang_y[0]
    
    if cang_x == cang_y:
        return 2
    else:
        lcs_length = len(lcs(cang_x, cang_y))
        if len(cang_x) == 2:
            if (lcs_length == 1 and len(cang_y)==2)\
            or (lcs_length == 2 and len(cang_y)==3):
                return 1
        elif len(cang_x) == 3:
            if lcs_length == 2 and len(cang_y)<=4:
                return 1
        elif len(cang_x) == 4:
            if lcs_length == 3 and len(cang_y)>=3:
                return 1
        elif len(cang_y) == 5:
            if lcs_length == 4 and len(cang_y)==4:
                return 1
    
    return 0     

## Error_correct pair

In [206]:
def extractPairs(filelist):
    for filename, path in filelist.items():
        print('== Filename: {}'.format(filename))
        '''
        QQQQQQ 有兩個以上的錯誤在一個詞裡面，但更正只有一項
        把上方例子放棄不取
        有 duplicate 
        '''
        # 1新編常用錯別字門診.txt OR 4教育部錯別字表.txt
        if filename.startswith('1') or filename.startswith('4'):
            df = pd.read_csv(path, sep='\t')
        # 2東東錯別字.txt OR 3常見錯別字一覽表.txt
        elif filename.startswith('2') or filename.startswith('3'):        
            df = pd.read_csv(path, sep='\t', header=None, names = ['正確詞','錯誤詞','正確字','錯誤字'])
        elif filename.startswith('udn_common'):
            table = xlrd.open_workbook(path).sheet_by_index(0)
            ch_dict = defaultdict(set)
            # Have multierros (error_word to correct_word)
            word_dict = defaultdict(set)
            for idx in range(1,table.nrows):
                row = table.row_values(idx)[:5]
                # Consider the priority of pairs 
                if row[2].strip():            
                    chs = row[2].split()
                    if len(chs)==1:
                        continue
                    for i in range(1,len(chs)):
                        freq = row[1] if type(row[1])==float else 1.0                
                        ch_dict[chs[i]].add((int(freq),chs[0]))
                elif row[3].strip():
                    corr_seq = row[3].strip()
                    error_seq = row[4].strip()
                    word_dict[error_seq] = corr_seq
            yield (filename, ch_dict, word_dict)
            continue
        elif filename.startswith('udn_pairs'):
            ch_dict = defaultdict(set)
            with open(path, 'r', encoding='utf8') as fp:
                for line in fp:
                    tt = line.split()
                    if int(tt[2])>10:
                        ch_dict[tt[0]].add((int(tt[2]), tt[1]))
                    
            yield (filename, ch_dict, dict())
            continue
        
        print(filename)
        
        # For 1,2,3,4
        if len(df)>0:
            df = df.dropna()
            df['idx'] = df.apply(lambda x:x['錯誤詞'].find(x['錯誤字']), axis=1)
            df['pair'] = tuple(zip(df['idx'], df['錯誤字']))
            df['noMultiErrors'] = df.apply(lambda x:x['正確詞']==x['錯誤詞'].replace(x['錯誤字'],x['正確字']), axis=1)
            
            # Remove multi-errors for the lack of right answer 
            preCnt = len(df)
            df = df[df['noMultiErrors']==True]
            postCnt = len(df)
            
            print('Original:{}\tPost:{}'.format(preCnt,postCnt))
            
            df = df.set_index('錯誤詞')
            
            # Output DICT{'error_word':'(idx, corr_ch)'}
#             df_slice = df[['pair']]
#             word_dict = df_slice.to_dict()['pair']
            word_dict = df[['正確詞']].to_dict()['正確詞']

            # output DICT{'error_ch':set(cands)}
            ch_dict = defaultdict(lambda :set())
            pairs = tuple(zip(df['錯誤字'], df['正確字']))
            for error_ch, corr_ch in pairs:
                ch_dict[error_ch].add(corr_ch)

            yield (filename, ch_dict, word_dict)

## Confusion Sentnece (from SIGHAN)

1. Bakeoff-2013 not work
2. sequence error not append 

In [16]:
def extractSentence(filelist):
    for filename, path in filelist.items():
        print('== Filename: {}'.format(filename))
        
        with open(path,'r',encoding='utf8') as fp:
            soup = BeautifulSoup(fp, 'lxml')
        
        ch_dict = defaultdict(set)
        word_dict = defaultdict(set)
        seq_dict  = defaultdict(set)
        
        # Different label
        if filename.startswith('Bakeoff'):
            pass 
            ############## NOT FIX
            for idx,element in enumerate(soup.find_all('DOC')):  
                # Text
                text = dict()
                for pas in element.find('p').find_all('passage'):
                    text[pas.get('id')] = pas.string

                # Mistake
                for mistake in element.find_all('mistake'):
                    mis_id = mistake.get('id')
                    mis_loc = mistake.get('location')
                    mis_wrong = mistake.find('wrong').string.strip()
                    mis_corr  = mistake.find('correction').string.strip()
                    cur_seq = text.get(mis_id, '')

                    pairs =  [(mis_wrong,idx,x,y) for idx, (x,y) in enumerate(zip(mis_wrong, mis_corr)) if x!=y]

                    # error-corr
                    for mis_wrong,idx,error_ch,corr_ch in pairs:
                        # char-based
                        ch_dict[error_ch].add(corr_ch)

                        # word-based
                        word_dict[mis_wrong].add((idx,corr_ch))

        else:            
            for idx,element in enumerate(soup.find_all('essay')):  
                # Text
                text = dict()
                for pas in element.find('text').find_all('passage'):
                    text[pas.get('id')] = pas.string

                # Mistake
                for mistake in element.find_all('mistake'):
                    mis_id = mistake.get('id')
                    mis_loc = mistake.get('location')
                    mis_wrong = mistake.find('wrong').string.strip()
                    mis_corr  = mistake.find('correction').string.strip()
                    cur_seq = text.get(mis_id, '')

                    pairs =  [(mis_wrong,idx,x,y) for idx, (x,y) in enumerate(zip(mis_wrong, mis_corr)) if x!=y]

                    # error-corr
                    for mis_wrong,idx,error_ch,corr_ch in pairs:
                        # char-based
                        ch_dict[error_ch].add(corr_ch)

                        # word-based
                        word_dict[mis_wrong].add((idx,corr_ch))

                        # sequence-based 
                        ### Have problem with multiple errors in single word 
            #             seq_dict[cur_seq].add((int(mis_loc)-1,corr_ch))
    
        yield (filename, ch_dict, word_dict, seq_dict)

## The other 

# ALL

# * Char information

In [17]:
dataroot = '/home/kiwi/udn_data/training_confusion/char_information/'
filelist = dict((file,dataroot+file) for file in os.listdir(dataroot))

In [18]:
# Global variables
dicBPMF = defaultdict(list)
dicPhone = defaultdict(list)
dicCangjie = defaultdict(list)
dicCangjie2char = defaultdict(list)
unihan_extract(filelist['unihan.csv'])

sound_SIGHAN = pd.read_csv(filelist['Bakeoff2013_CharacterSet_SimilarPronunciation.txt'], sep='\t', index_col=0)
shape_SIGHAN = pd.read_csv(filelist['Bakeoff2013_CharacterSet_SimilarShape.txt'], \
                           sep=',', index_col=0, names=['cands']).to_dict()['cands']

voc = zwtTitle_train(open(filelist['zwt.titles.txt']).readlines())

dicRadicalnum, dicRadical = radicalDic(open(filelist['radical.txt'], 'r', encoding='utf8').readlines())

dicFreq = bigUnihan_extract(filelist['unihan_utf8_new.csv'])

# * Error_corr_pair

In [201]:
dataroot = '/home/kiwi/udn_data/training_confusion/error_corr_pair/'
filelist = dict((file,dataroot+file) for file in os.listdir(dataroot))
# special = filelist.pop('udn_common.xls')

In [207]:
confusion_pairs = dict()
# confusion_pairs[special] = extractPairs_udn
for filename, ch_dict, word_dict in extractPairs(filelist):
    print('ch_dict:{}\tword_dict:{}\n'.format(len(ch_dict),len(word_dict)))
    confusion_pairs[filename] = (ch_dict,word_dict)

== Filename: 4教育部錯別字表.txt
4教育部錯別字表.txt
Original:490	Post:470
ch_dict:416	word_dict:470

== Filename: 3常見錯別字一覽表.txt
3常見錯別字一覽表.txt
Original:1364	Post:1265
ch_dict:803	word_dict:1172

== Filename: 1新編常用錯別字門診.txt
1新編常用錯別字門診.txt
Original:490	Post:470
ch_dict:416	word_dict:470

== Filename: udn_common.xls
ch_dict:384	word_dict:1057

== Filename: udn_pairs.csv
ch_dict:273	word_dict:0

== Filename: 2東東錯別字.txt
2東東錯別字.txt
Original:57924	Post:38353
ch_dict:3385	word_dict:37478



# * Error_corr_sentence

In [21]:
dataroot = '/home/kiwi/udn_data/training_confusion/error_corr_sentence/'
filelist = dict((file,dataroot+file) for file in os.listdir(dataroot))
unwated_file = filelist.pop('big5')

confusion_sentences = dict()
for filename, ch_dict, word_dict, seq_dict in extractSentence(filelist):
    print('ch_dict:{}\tword_dict:{}\n'.format(len(ch_dict),len(word_dict)))
    confusion_sentences[filename] = (ch_dict,word_dict)

== Filename: SIGHAN15_CSC_B2_Training_utf8.sgml
ch_dict:715	word_dict:1478

== Filename: C1_training.sgml
ch_dict:237	word_dict:369

== Filename: Bakeoff2013_SampleSet_WithError_utf8.txt
ch_dict:0	word_dict:0

== Filename: SIGHAN15_CSC_A2_Training.sgml
ch_dict:521	word_dict:794

== Filename: B1_training_utf8.sgml
ch_dict:1165	word_dict:3608



# Confusion_training 

In [22]:
def sound_compare_unihan(ch_x,ch_y):
    if ch_y in sound_extract_same(ch_x):
        return 4
    elif ch_y in sound_extract_tone(ch_x):
        return 3
    elif ch_y in sound_extract_similartConsonant(ch_x, toneKeep=True):
        return 2
    elif ch_y in sound_extract_similartConsonant(ch_x, toneKeep=False):
        return 1
    else:
        return 0
    
def shape_compare_unihan(ch_x,ch_y):
    if ch_y in shape_similar(ch_x):
        return 1
    else:
        return 0

In [62]:
def comparison4confusion(ch_chunk):
    ch_x = ch_chunk[0]
    ch_y = ch_chunk[1]
    
    log = list()
    score = 0.0
    
    # MaxScore = 36 (pair/sentence count:5)
    
    tmp = sound_compare_unihan(ch_x,ch_y)
    log.append(tmp)
    score += tmp
#     print('1',score)
    
    tmp = shape_compare_unihan(ch_x,ch_y) * 3
    log.append(tmp)
    score += tmp
#     print('2',score)
    
    tmp = cangjie_compare_unihan(ch_x,ch_y) * 2
    log.append(tmp)
    score += tmp
#     print('3',score)

    tmp = sound_compare_SIGHAN(ch_x,ch_y)
    log.append(tmp)
    score += tmp
#     print('4',score)

    
    tmp = shape_compare_SIGHAN(ch_x,ch_y)
    log.append(tmp)
    score = score + tmp[0]*2 + tmp[1] * 4
#     print('5',score)
    
    
    
    for filename, (ch_dict,_) in confusion_pairs.items():
        if ch_y in ch_dict.get(ch_x,[]):
            log.append(filename)
            score += 1

    for filename, (ch_dict,_) in confusion_sentences.items():
        if ch_y in ch_dict.get(ch_x,[]):
            log.append(filename)
            score += 1
            
    tmp = (5.0-dicFreq.get(ch_y,5))
    if score!=0:
        score += tmp
    log.append(tmp)

    return (score,log)

# Char_comparison

In [24]:
def comparison(ch_x, ch_y):
    print('=== Comparison')
    print(ch_x, ch_y)
    '''
    相似音的處理
    sound_extract_same
    sound_extract_tone
    sound_extract_similartConsonant
    sound_extract_finalConsonant

    同音同調 同音異調 異音同調 異音異調
    '''
        
        
    print('\n=== Sound similar (from unihan)')
    print(sound_compare_unihan(ch_x,ch_y))
    
    '''
    shape_similar
    '''
    print('\n=== Shape similar (from radical)')
    print(shape_compare_unihan(ch_x,ch_y))

    '''
    cangjie_compare
    2 same cangjie code 
    1 similar cangjie code 
    0 nothing special
    '''
    print('\n=== Cangjie (from unihan)')
    print( cangjie_compare_unihan(ch_x,ch_y))
        
    '''
    SIGHAN
    '''
    print('\n=== SIGHAN Data (sound)')
    print(sound_compare_SIGHAN(ch_x, ch_y))
    

    print('\n=== SIGHAN Data (shape)')
    print(shape_compare_SIGHAN(ch_x,ch_y))
#     if t[0]==1:
#         print('Similar shape')
#     if t[1]==1:
#         print('同部首同筆畫數')
    

    '''
    Error-correct pair 
    'filename': (ch_dict,word_dict)
    '''
    print('\n=== Error-correct pair')
    for filename, (ch_dict,_) in confusion_pairs.items():
        if ch_y in ch_dict.get(ch_x,[]):
            print(filename)

    '''
    Error-correct sentence 
    'filename': (ch_dict, word_dict)
    '''
    print('\n=== Error-correct sentence')
    for filename, (ch_dict,_) in confusion_sentences.items():
        if ch_y in ch_dict.get(ch_x,[]):
            print(filename)

In [25]:
# ch_x = '相'

# ch_y = random.choice(list(rr))

# comparison(ch_x,ch_y)

# ch_x = random.choice(sound_SIGHAN.index)
# ch_y = random.choice(sound_SIGHAN.index)

# comparison(ch_x,ch_y)


In [81]:
if __name__=="__main__":
    ch_label = set(dicBPMF.keys()).union(set(sound_SIGHAN.index))

    ch_n_label = random.choices(list(ch_label),k=50)

    # %%timeit -n 1 -r 1

    # pool_size=multiprocessing.cpu_count()

    bigDict = defaultdict(dict)

    start_time = time.clock()
    with multiprocessing.Pool(processes=4) as pool:
        for ch_x in ch_n_label:
            ch_n_inside = list(ch_n_label)

            ch_n_inside.remove(ch_x)

            ch_chunk = [(ch_x, ch_y) for ch_y in ch_n_inside]

            scores = pool.map(comparison4confusion, ch_chunk)

            for idx,(_,ch_y) in enumerate(ch_chunk):
                if scores[idx][0]>0.0:
    #             if scores[idx] >= 0:
                    bigDict[ch_x][ch_y] = scores[idx]
            bigDict[ch_x][ch_x] = (30,[])

#     with open('./tt.pkl', 'wb') as fp:
#         pickle.dump(bigDict,fp)
    
    print(time.clock()-start_time)

0.12344599999999772


In [88]:
with open('/home/kiwi/udn_data/training_confusion/confu.pkl','rb') as fp:
    bigDict = pickle.load(fp)

In [188]:
# Find best correct score 
# Prune (not much candidates)
idx = 0
for ch, candsValue in bigDict.items():
#     print(ch, candsValue)
    
    idx += 1
    
    if idx%100==0: print(idx)
#     if idx>1: break
    total_score = sum(score for cand, (score, _) in candsValue.items())
    
    ttttt = candsValue.items()
    tmp = sorted(candsValue.items(), key=lambda x:x[1][0], reverse=True)
    tmp.insert(0, (ch,(35,[])))
    
    if total_score > 1000:
        
        # pick 20
        
        tmp = tmp[:20]
        
    elif total_score > 500:
        # pick 10
        tmp = tmp[:10]
        
    elif total_score > 300:
        # pick 5
        tmp = tmp[:5]
    elif total_score >100:
        # pick 3 
        tmp = tmp[:3]
    else:
        # pick 1 
        tmp = tmp[0:0]
        
    tmp_score  = sum(val for _,(val,_) in tmp)
    confus[ch] = dict((cand, value/tmp_score) for cand, (value,_) in tmp)
    
    
    
#     confus[ch] = dict((cand, score/total_score) for cand, (score,_) in candsValue.items())
    
#     print(confus[ch])

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000


In [223]:
confu = defaultdict(set)
for ch, candsValue in bigDict.items():
#     print(ch, candsValue)
    
    idx += 1
    
    if idx%100==0: print(idx)
    if idx>3: break
#     total_score = sum(score for cand, (score, _) in candsValue.items())
    
    print(candsValue)
    ttttt = candsValue.items()
    tmp = sorted(candsValue.items(), key=lambda x:x[1][0], reverse=True)
    
    print(tmp)
    for cand, (cnt_score,_) in tmp:
        print(cand, cnt_score)
        if cnt_score>7:
            confu[ch].add(cand)
    
    

In [218]:
confu

defaultdict(set, {})

In [211]:
tmp

[('球', (10.0, [4, 0, 0, 0, (0, 0), 6.0])),
 ('求', (10.0, [4, 0, 0, 0, (0, 0), 6.0])),
 ('鰌', (7.0, [4, 3, 0, 0, (0, 0), 0.0])),
 ('遒', (7.0, [4, 3, 0, 0, (0, 0), 0.0])),
 ('崷', (7.0, [4, 3, 0, 0, (0, 0), 0.0])),
 ('湭', (7.0, [4, 3, 0, 0, (0, 0), 0.0])),
 ('酋', (7.0, [4, 3, 0, 0, (0, 0), 0.0])),
 ('蝤', (7.0, [4, 3, 0, 0, (0, 0), 0.0])),
 ('仇', (6.0, [4, 0, 0, 0, (0, 0), 2.0])),
 ('尊', (5.0, [0, 3, 0, 0, (0, 0), 2.0])),
 ('猶', (5.0, [0, 3, 0, 0, (0, 0), 2.0])),
 ('汓', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('釓', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('唒', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('浗', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('訄', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('叴', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('泅', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('賕', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('逑', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('蛷', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('頯', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('艽', (4.0, [4, 0, 0, 0, (0, 0), 0.0])),
 ('肍', (4.0, [4, 0, 0, 0, (0, 0)