In [None]:
import  confusionTable_extractFeature as CTF

In [1]:
import math

In [2]:
import sys

In [3]:
from multiprocessing import Pool

In [4]:
import pickle
import numpy as np
from sklearn import svm

In [57]:
from sklearn.preprocessing import Normalizer, StandardScaler

In [6]:
from sklearn.naive_bayes import GaussianNB

In [7]:
from collections import defaultdict
import matplotlib.pyplot as plt

In [8]:

POS = 1
NEG = -1
UNK = 2


In [65]:
PROCESSCNT = 2
MULTIPROCESS = True

In [62]:
results = []
with Pool(processes=PROCESSCNT) as pool:
    for x in range(5):
        r = pool.apply_async(lambda i:i*2, (x, ))
        results.append(r)
    for r in results:
        r.wait()

In [63]:
results

[<multiprocessing.pool.ApplyResult at 0x7f1ec5543a90>,
 <multiprocessing.pool.ApplyResult at 0x7f1ec5543a20>,
 <multiprocessing.pool.ApplyResult at 0x7f1ec5543860>,
 <multiprocessing.pool.ApplyResult at 0x7f1ec55430f0>,
 <multiprocessing.pool.ApplyResult at 0x7f1ec55435c0>]

In [60]:
ttt = results[0]

In [61]:
ttt.wait?

In [9]:
def extract(bigDict):
    '''Extract (feature/label) from filename
    Args:
        bigDict (nested dict): the dict of feature (*.pkl) from confusionTable_extractFeature
    Return:
        train_feature (np.array): feature array
        train_label (np.array): the label of feature 
            (1 for positive, 2 for negative )
    '''
    
    # Initial 
    label = list()
    feature = list()

    for error_ch, (cands_val) in bigDict.items():
        # two situation for error: (error-pair) or (higher-score)
        for cand, (score, log) in cands_val.items():                        
            feature.append(log[:-2])
            
            
            if log[-2] != 0:
                label.append(POS)
            else:
                label.append(UNK)
    
    # Combine
    train_feature = np.asarray(feature, dtype='float')            
    tmp_label = np.asarray(label, dtype='int')
    pos = np.where(tmp_label == POS )[0]
    neg = np.where(tmp_label == UNK )[0]
    train_label = {POS:pos, UNK:neg}        

    print('The number of sample = {}'.format(train_feature.shape))
    print('Positive case (candidate) = {}'.format(len(train_label[POS])))
    print('Negative case (uncandidate) = {}'.format(len(train_label[UNK])))
    
    return (train_feature, train_label)

In [10]:
def preprocess(data, method):
    '''Preprocessing of feature
    Args:
        data (np.array): feature 
        method (str): 'normal'-normalization // 'standard'-standardization
    Return:
        new_data (np.array): after preprocessing 
        pre (scikit.preprocess): transform model for feature     
    '''
    if method == 'normal':
        pre = Normalizer().fit(data)
        new_data = pre.transform(data)
    elif method == 'standard':
        pre = Normalizer().fit(data)
        new_data = pre.transform(data)
    else:
        print('Unknown method')
        sys.exit(0)
    
    return (new_data, pre)

In [79]:
rn = RN(new_feature, label)

In [None]:
rn.runrun()

In [77]:
class RN:
    def __init__(self, feature, label):
        nbf_idx = np.concatenate((label[POS],label[UNK]))
        nbf_label = np.concatenate(
            (np.full(len(label[POS]), POS, dtype=int)
             , np.full(len(label[UNK]), UNK, dtype=int)))
        nbf_feature = feature[nbf_idx]
        self.nbf = GaussianNB().fit(nbf_feature, nbf_label)
        self.chunk = [(idx, feature[idx]) for idx in label[UNK]]
    
    def runrun(self):
        
        if MULTIPROCESS:
            with Pool(processes=PROCESSCNT) as pool:
                self.rn_neg = pool.map(self.rnBatch, self.chunk)
            
    
    def rnBatch(self, chunk):
        if self.nbf.predict(chunk[1].reshape(1,-1)) == UNK:
            return chunk[0]
        else:
            return -1
        

In [66]:
def rnSelect(feature, label):
    def rnBatch(chunk):
        if nbf.predict(chunk[1].reshape(1,-1)) == UNK:
            return chunk[0]
        else:
            return -1
        
    nbf_idx = np.concatenate((label[POS],label[UNK]))
    nbf_label = np.concatenate(
        (np.full(len(label[POS]), POS, dtype=int)
         , np.full(len(label[UNK]), UNK, dtype=int)))
    nbf_feature = feature[nbf_idx]
    nbf = GaussianNB().fit(nbf_feature, nbf_label)

#     print('Start RN step select')
    chunk = [(idx, feature[idx]) for idx in label[UNK]]
    
    if MULTIPROCESS:
        with Pool(processes=PROCESSCNT) as pool:
            rn_neg = pool.map(rnBatch, chunk)
    else:
        rn_neg = [rnBatch(c) for c in chunk]
        
        
    label[NEG] = np.where(np.asarray(rn_neg) != -1)[0]
    
    return label

In [67]:
l = rnSelect(new_feature, label)

AttributeError: Can't pickle local object 'rnSelect.<locals>.rnBatch'

In [12]:
def train_svm(feature, label, train_cnt, test_cnt=0, cross_validation=False):
    '''Train SVM model from feature
    Args:
        feature (np.array): feature array
        label (np.array): the label of feature 
        train_cnt (int): the number of training samples 
        test_cnt (int): (default=samples-train_cnt) the number of testing samples
    Return:
        xxx    
    '''
    
    assert train_cnt < len(label[POS]), 'Train count must less than positive samples'
    
    # If test_cnt not declare, use all the remain set as test set    
    if train_cnt==0:
        assert False, 'sample count cannot be zero'
        
    if test_cnt==0:
        test_cnt = len(label[POS]) - train_cnt
        
    # Picke feature/label to train & test set    
    np.random.shuffle(label[POS])
    np.random.shuffle(label[NEG])

    train_idx = np.concatenate(
        (label[POS][:train_cnt],
         label[NEG][:train_cnt]))
    train_label = np.concatenate(
        (np.full(train_cnt, POS, dtype=int), np.full(train_cnt, NEG, dtype=int)))
    train_feature = feature[train_idx]

    test_idx = np.concatenate(
        (label[POS][train_cnt:train_cnt+test_cnt], 
         label[NEG][train_cnt:train_cnt+test_cnt]))
    test_label = np.concatenate(
        (np.full(test_cnt, POS, dtype=int), np.full(test_cnt, NEG, dtype=int)))
    test_feature = feature[test_idx]

    # Training 
    clf = svm.SVC(kernel='rbf', probability=True)
    clf.fit(train_feature,train_label)
    
    # Testing 
    accuracy = clf.score(test_feature, test_label)
    pos_acc = clf.score(test_feature[:test_cnt], test_label[:test_cnt])
    print('Accuracy ({}/{}): {}'.format(test_cnt, len(test_label), accuracy))
    print('Positive Accuracy ({}): {}'.format(test_cnt, pos_acc))
    
#     output = [accuracy, train_cnt, test_cnt*2]
#     return output
    return clf

In [13]:
def create(clf, bigDict, pre_filter, threshold):
    '''Construct basic NCM
    
    '''
    confusion = defaultdict(dict)
    c_t, c_tf, sk = 0,0,0
    
    idx =0
    for ch, cands in bigDict.items():
#         if idx>5:break
#         idx+=1
#         print(ch)
        for ca, (score,log) in cands.items():            
            tmp = pre_filter.transform(np.asarray(log[:-2]).reshape(1,-1))
            f = clf.predict_proba(tmp).tolist()[0]            
            
            # Known confusion pair
            if log[-2] != 0:
                c_t += 1
                ###########
                # MAY have bad score
                ###########
                
            # Unknown confusion pair but predict 
            elif f[0] > f[1] and f[0] > threshold:
                c_tf += 1
            else:
                sk += 1
                continue
            
            confusion[ch][ca] = f[0]                        
        
    print('Original Pair: {}\tNew Pair: {}\tSkip Pair:{}'.format(c_t, c_tf, sk))
    return confusion

In [14]:
def info(confusion):
    '''Get basic information about NCM 
    
    '''
    total_error = len(confusion)
    cands = [(ch, len(cand)) for (ch, cand) in 
        sorted(confusion.items(), key= lambda x:len(x[1]), reverse=True)]

    sum_cands = sum(cnt for _, cnt in cands)
    max_cands = max(cnt for _, cnt in cands)
    min_cands = min(cnt for _, cnt in cands)
    mean_cands = sum_cands/total_error
    
    if total_error%2 == 0:
        mid_cands = cands[total_error//2]
    else:
        mid_cands = cands[(total_error+1)//2]
    
    print('Total_error: {}\nSum_cands: {}'.format(total_error, sum_cands))
    print('Max_cands: {}\nMin_cands: {}'.format(max_cands, min_cands))
    print('Mean_cands: {:.2f}\nMid_cands: {}'.format(mean_cands, mid_cands))
    print('Top 20 cands:\n {}'.format(cands[:20]))
    
    plt.plot([cnt for _, cnt in cands])
    plt.xlabel('index')
    plt.ylabel('candidates number')
    fig = plt.gcf()
    plt.show()
    fig.savefig('./confusionTable/confu_info.png',dpi=100)

In [15]:
def outputPKL(confusion, ncm_correct, pkl_file):
    '''Assign local probability and output to NCM file
    '''
    output = defaultdict(dict)
    
    for ch, cands in confusion.items():
        total_cand_val = sum(cands.values())*(1-ncm_correct) + ncm_correct
        output[ch] = {ca:(p*(1-ncm_correct))/total_cand_val for ca,p in cands.items()}
        output[ch][ch] = ncm_correct/total_cand_val        
    
    with open('./confusionTable/{}'.format(pkl_file), 'wb') as fp:
        pickle.dump(output, fp)
    
    return output

In [16]:
if __name__ == '__main__':
    feature_file = './confusionTable/feature.pkl'
    output_file = './confusionTable/confu_{}'.format
    ncm_correct = 0.95
        
    with open(feature_file, 'rb') as fp:
        dataset = pickle.load(fp)
    
    

In [17]:
feature,label = extract(dataset)


The number of sample = (2592820, 10)
Positive case (candidate) = 9357
Negative case (uncandidate) = 2583463


In [18]:
(feature, pre) = preprocess(feature, 'normal')


In [None]:
label = rnSelect(feature, label)

In [None]:
feature,label = extract(dataset)

(new_feature, pre) = preTrain(feature, 'normal')

clf = train(new_feature, label, 4000)

confu_raw = create(clf, dataset, pre, 0.85)

info(confu_raw)

In [None]:
confu = outputPKL(confu_raw, ncm_correct, output_file('xxx'))