In [1]:
import optuna
import numpy as np
import pandas as pd
import sklearn.metrics

np.random.seed(2020)

In [2]:
def macro_recall(true, pred):
    """Return the macro-averaged recall of ``pred`` against ``true``.

    Thin wrapper around ``sklearn.metrics.recall_score`` with
    ``average='macro'``; both arguments are forwarded unchanged.
    """
    return sklearn.metrics.recall_score(true, pred, average='macro')

def calc_score(true, pred):
    """Score a 2-D score matrix: take the arg-max of each row as the
    predicted label and return ``macro_recall`` of those labels."""
    return macro_recall(true, np.argmax(pred, axis=1))

def macro_recall_v2(true, pred):
    """Macro-averaged recall computed with ``np.bincount``.

    Parameters
    ----------
    true : array-like of non-negative ints
        Ground-truth class labels.
    pred : array-like of non-negative ints
        Predicted class labels, same length as ``true``.

    Returns
    -------
    float
        Mean of the per-class recalls over classes ``0 .. max(true)``.

    Notes
    -----
    * The denominator is smoothed with ``1e-8``, so the result differs from
      an exact recall by a tiny amount, and a class index that never occurs
      in ``true`` contributes a recall of 0 to the average.
    * ``minlength`` keeps the hit counts aligned with the class counts even
      when the highest class has no correct prediction at all; without it the
      two arrays have different lengths and the division fails.
    """
    true = np.asarray(true)
    pred = np.asarray(pred)

    true_bin_count = np.bincount(true)
    # Count correct predictions per class, padded to the full class range.
    correct_bin_count = np.bincount(true[true == pred],
                                    minlength=len(true_bin_count))

    recalls = correct_bin_count / (true_bin_count + 1e-8)
    return np.average(recalls)

def calc_score_v2(true, pred):
    """Same as ``calc_score`` but scored with the bincount-based
    ``macro_recall_v2``: row-wise arg-max of ``pred`` is the predicted label."""
    return macro_recall_v2(true, np.argmax(pred, axis=1))
        


In [3]:
def read_data(_path):
    """Load one prediction CSV and split it into labels and scores.

    The first CSV column is consumed as the index.  Of the remaining
    columns, the first is returned as the true label and the second as the
    predicted label (both cast to ``int``); everything from the third column
    onwards is returned unchanged as the score matrix.

    Prints the shape of the loaded value array as a side effect.

    Returns
    -------
    (true_label, pred_label, scores)
    """
    table = pd.read_csv(_path, index_col=0).values
    print(table.shape)

    true_col, pred_col, scores = table[:, 0], table[:, 1], table[:, 2:]
    return true_col.astype('int'), pred_col.astype('int'), scores

def make_true_label_pred_logit(_paths):
    """Read several prediction files and average their score matrices.

    For every path the file is loaded with ``read_data`` and its individual
    ``calc_score`` is printed; afterwards the element-wise mean of all score
    matrices is scored and printed as the ensemble result.

    Parameters
    ----------
    _paths : iterable of str
        Prediction CSV files.  All files must describe the same samples in
        the same order (identical true-label column).

    Returns
    -------
    (true_label, ave_logit)
        The shared true labels and the averaged score matrix.

    Raises
    ------
    ValueError
        If ``_paths`` is empty, or if a file's true labels differ from those
        of the first file (averaging misaligned rows would be meaningless).
    """
    _paths = list(_paths)
    if not _paths:
        raise ValueError('make_true_label_pred_logit needs at least one path')

    print('read files')

    _true_label = None
    _sum_logit = None

    for _path in _paths:
        print(_path)
        _labels, _, _pred_logit = read_data(_path)
        print('score ', calc_score(_labels, _pred_logit))

        if _sum_logit is None:
            # First file defines the reference labels.
            _true_label, _sum_logit = _labels, _pred_logit
            continue

        if not np.array_equal(_labels, _true_label):
            raise ValueError(
                'true labels in {0} do not match those of {1}'.format(_path, _paths[0]))
        # Out-of-place add so the array returned by read_data is never mutated.
        _sum_logit = _sum_logit + _pred_logit

    _ave_logit = _sum_logit / len(_paths)
    print('ensemble')
    print('score ', calc_score(_true_label, _ave_logit))

    return _true_label, _ave_logit

In [4]:
# Experiment selectors; together they decide which prediction files are ensembled.
kw = 'w_da'   # one of: 'wo_da', 'w_da'
kw2 = 'vl_'   # one of: 'tr_', 'vl_'
kw3 = 'con_'  # one of: 'gra_', 'vow_', 'con_'

# One prediction CSV per model directory; only the directory name varies.
data_paths = ['./pred/{0}/{1}/{2}{3}pred.csv'.format(model_dir, kw, kw2, kw3)
              for model_dir in ('model_v1_0_0', 'model_v1_0_1',
                                'model_v1_0_5', 'model_v1_0_8')]

In [5]:
# Average the score matrices of every file in data_paths; the call also prints
# each file's individual score and the score of the averaged ensemble.
true_label, pred_logit = make_true_label_pred_logit(data_paths)
# Filename prefix for everything saved below ('con_w_da_' with the settings above).
output_file = kw3 + kw + '_'

read files
./pred/model_v1_0_0/w_da/vl_con_pred.csv
(200840, 9)
score  0.9859636050371242
./pred/model_v1_0_1/w_da/vl_con_pred.csv
(200840, 9)
score  0.986669545264819
./pred/model_v1_0_5/w_da/vl_con_pred.csv
(200840, 9)
score  0.9859905969352206
./pred/model_v1_0_8/w_da/vl_con_pred.csv
(200840, 9)
score  0.9848894423557721
ensemble
score  0.989326057683655


In [6]:
def top_k_mask(pred_logit, k):
    """Boolean mask marking the ``k`` largest entries of every row.

    Parameters
    ----------
    pred_logit : 2-D array-like, shape (n_rows, n_cols)
        Score matrix.
    k : int
        Number of entries to keep per row, ``1 <= k <= n_cols``.

    Returns
    -------
    np.ndarray of bool, shape (n_rows, n_cols)
        ``True`` at exactly ``k`` positions in every row.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1 .. n_cols``.

    Notes
    -----
    The selection is index based rather than threshold based, so ties (or
    values within rounding distance of the k-th score) can never yield more
    than ``k`` marked entries per row -- callers reshape the masked values to
    ``(n_rows, k)`` and rely on that.  Ties are resolved in favour of the
    lower column index.
    """
    pred_logit = np.asarray(pred_logit)
    n_rows, n_cols = pred_logit.shape
    if not 1 <= k <= n_cols:
        raise ValueError(
            'k must be between 1 and {0}, got {1}'.format(n_cols, k))

    # Stable sort of the negated scores = descending order with
    # deterministic tie-breaking by column index.
    top_idx = np.argsort(-pred_logit, axis=1, kind='stable')[:, :k]

    tk_mask = np.zeros(pred_logit.shape, dtype=bool)
    tk_mask[np.arange(n_rows)[:, None], top_idx] = True

    return tk_mask

# Number of best-scoring classes kept per sample.
k = 3
tk_mask = top_k_mask(pred_logit, k)

# Sanity check: both printed numbers must equal k, because the code that
# follows reshapes the masked entries to (n_samples, k).
kept_per_row = tk_mask.sum(axis=1)
print(kept_per_row.max())
print(kept_per_row.min())

3
3


In [7]:
# Per-class sample counts of the ground truth, computed once and read by
# macro_recall_v3 as a module-level cache.
true_bin_count = np.bincount(true_label)

_n, _j = true_label.shape[0], pred_logit.shape[1]
_k = k
# Class index of every entry selected by tk_mask, arranged as one row of
# _k candidate classes per sample.
tk_mask_pred_label = np.tile(np.arange(_j), (_n, 1))[tk_mask].reshape([_n, _k])
# Row i of this identity matrix is a boolean one-hot vector for candidate slot i.
tk_eye = np.eye(_k, dtype='bool')

def macro_recall_v3(true, pred):
    """Macro-averaged recall using the module-level ``true_bin_count``.

    The per-class totals are NOT recomputed from ``true``; they are read from
    the global ``true_bin_count``.  Only the per-class hit counts are derived
    from the arguments, and they are zero-padded up to the length of
    ``true_bin_count`` when the highest classes have no correct prediction.
    The denominator is smoothed with ``1e-8``.
    """
    hits_per_class = np.bincount(true[true == pred])

    n_missing = len(true_bin_count) - len(hits_per_class)
    if n_missing != 0:
        padding = np.zeros(n_missing, dtype='int')
        hits_per_class = np.concatenate([hits_per_class, padding])

    return np.average(hits_per_class / (true_bin_count + 1e-8))

def calc_score_v3(true, pred):
    """Fast macro recall restricted to the pre-selected top-k candidates.

    Instead of taking the arg-max over every class, only the entries of
    ``pred`` selected by the module-level ``tk_mask`` are compared, and the
    winning candidate slot is mapped back to its class index through
    ``tk_mask_pred_label`` / ``tk_eye``.

    Parameters
    ----------
    true : np.ndarray of int, shape (n,)
        Ground-truth labels.  Must belong to the same samples the
        module-level ``tk_mask`` and ``true_bin_count`` were built from,
        because ``macro_recall_v3`` takes its per-class totals from that
        cache.
    pred : np.ndarray, same shape as ``tk_mask``
        Score matrix (for example the ensemble scores plus a per-class bias).

    Returns
    -------
    float
        Macro-averaged recall as computed by ``macro_recall_v3``.

    Notes
    -----
    Scoring goes through ``macro_recall_v3``, which pads the per-class hit
    counts; this keeps the function from failing when the highest class ends
    up with no correct prediction for some candidate bias.
    """
    _n = true.shape[0]
    _k = k
    # Scores of the k candidate classes of every sample, one row per sample.
    candidate_scores = pred[tk_mask].reshape([_n, _k])
    msk_maxarg = np.argmax(candidate_scores, axis=1)
    # One-hot row lookup picks, per sample, the class index of the winning slot.
    pred_label = tk_mask_pred_label[tk_eye[msk_maxarg]]
    return macro_recall_v3(true, pred_label)

In [8]:
# The three scorers are compared on the same top-k restricted problem.
# Multiplying by the boolean mask zeroes every score outside the top k;
# calc_score_v3 applies the mask itself and therefore gets the raw scores.
masked_logit = pred_logit * tk_mask
print(calc_score(true_label, masked_logit))
print(calc_score_v2(true_label, masked_logit))
print(calc_score_v3(true_label, pred_logit))

0.989326057683655
0.9893260576799612
0.9893260576799612


In [9]:
import time

# Average runtime of one call for each scorer.  time.perf_counter() is used
# because it is a monotonic, high-resolution clock intended for measuring
# short durations, whereas time.time() follows the adjustable system clock.
# The masking multiplication stays inside the timed call for the first two
# scorers, since they cannot be used on the top-k problem without it.
for _timed_call in (
    lambda: calc_score(true_label, pred_logit * tk_mask),
    lambda: calc_score_v2(true_label, pred_logit * tk_mask),
    lambda: calc_score_v3(true_label, pred_logit),
):
    start = time.perf_counter()
    for _ in range(10):
        _timed_call()
    elapsed_time = time.perf_counter() - start
    print("elapsed_time:{0}".format(elapsed_time / 10) + "[sec]")

elapsed_time:0.0721437931060791[sec]
elapsed_time:0.020857620239257812[sec]
elapsed_time:0.017766308784484864[sec]


In [10]:
import RealCodedGeneticAlgorithm as rcga

def ev_func(_genes):
    """Evaluate a batch of candidate bias vectors.

    For each gene ``g`` the value is the negated ``calc_score_v3`` of
    ``pred_logit + g`` (the gene is added to every row) plus a penalty of
    ``0.3 * mean(g) ** 2``.  Reads the module-level ``true_label`` and
    ``pred_logit``.

    Returns
    -------
    np.ndarray
        One value per gene, in input order.
    """
    def penalized_loss(gene):
        recall = calc_score_v3(true_label, pred_logit + gene[None, :])
        return -recall + 0.3 * np.mean(gene) ** 2

    return np.array([penalized_loss(g) for g in _genes])
    
# Optimiser instance used by run().  gene_num gives one gene per score column;
# the meaning of the remaining settings is defined by the
# RealCodedGeneticAlgorithm module, which is not part of this notebook.
ga = rcga.RealCodecGA_JGG_AREX(
    gene_num=pred_logit.shape[1],
    evaluation_func=ev_func,
    initial_min=-1.0,
    initial_max=1.0,
    population=600,
    crossover_num=20,
    child_num=100,
    initial_expantion_rate=1.0,
    learning_rate=0.01,
    seed=2020,
)

def __summary(best_evals, ga):
    """Print a short report: the last entry of ``best_evals`` and the
    ``best_gene`` attribute of ``ga``."""
    print('<summary>')
    print(' best_evals, {0}'.format(best_evals[-1]))
    print(' best_gene, {0}'.format(ga.best_gene))

def run(step_num, print_evaluation=True):
    """Advance the module-level ``ga`` by ``step_num`` generations.

    After every generation the current best evaluation is recorded, the
    diversity is queried, and the current best gene is written as a single
    CSV row to ``output_file + 'best_bias.csv'`` (overwriting the previous
    one).  When ``print_evaluation`` is true a header and one progress line
    per generation are printed.  A summary is printed at the end.

    Returns
    -------
    (best_evals, best_gene, best_evaluation)
        The per-generation history and the final values read from ``ga``.
    """
    if print_evaluation:
        print('generation, best_evaluation, diversity')

    best_evals = []
    for generation in range(1, step_num + 1):
        ga.generation_step()
        best_evals.append(ga.best_evaluation)
        # Queried on every step, whether or not it is printed.
        diversity = ga.calc_diversity()

        if print_evaluation:
            print('{0}, {1}, {2}'.format(generation, ga.best_evaluation, diversity))

        # Rewritten after each generation so the file always holds the latest best gene.
        np.savetxt(output_file + 'best_bias.csv', ga.best_gene[None, :], delimiter=',')

    __summary(best_evals, ga)

    return best_evals, ga.best_gene, ga.best_evaluation

# Run 400 generations; run() prints one progress line per generation and
# rewrites the best-bias CSV after each of them.
best_evals, best_gene, best_evaluation = run(400)

generation, best_evaluation, diversity
1, -0.991033797514218, 0.5801678058042138
2, -0.9910945746522588, 0.5764619659381794
3, -0.9913496191307962, 0.5738912231135764
4, -0.9913496191307962, 0.5713456128797374
5, -0.9913496191307962, 0.5698224692484325
6, -0.9913496191307962, 0.565628079057159
7, -0.9913496191307962, 0.5643855663725436
8, -0.9913496191307962, 0.5625532156589761
9, -0.9913496191307962, 0.5605173840271999
10, -0.9913496191307962, 0.557886735640466
11, -0.9913496191307962, 0.5562542824290647
12, -0.9913496191307962, 0.555165421075029
13, -0.9913496191307962, 0.5531736899515206
14, -0.9913496191307962, 0.553049632433894
15, -0.9913496191307962, 0.5509068171793597
16, -0.9913496191307962, 0.548842580011258
17, -0.9913496191307962, 0.5473804168157824
18, -0.9913496191307962, 0.5443477607282057
19, -0.9913496191307962, 0.5439620730228892
20, -0.9913496191307962, 0.5408089735398206
21, -0.9913496191307962, 0.5388367834229371
22, -0.9913496191307962, 0.5385154155269571
23, -0.9

183, -0.9918713245247952, 0.2567704045732705
184, -0.9918713245247952, 0.2555482011377364
185, -0.9918713245247952, 0.2537018814667547
186, -0.9918713245247952, 0.25222402928499127
187, -0.9918713245247952, 0.25084798484011744
188, -0.9918713245247952, 0.24992935095075158
189, -0.9918713245247952, 0.24875362705446727
190, -0.9918713245247952, 0.24618909944137668
191, -0.9918713245247952, 0.24539139167178345
192, -0.9918713245247952, 0.24387227937052697
193, -0.9918713245247952, 0.24231722723206714
194, -0.9918713245247952, 0.23972777723714314
195, -0.9918713245247952, 0.2380625457628452
196, -0.9918713245247952, 0.23646696599669162
197, -0.9918713245247952, 0.23517551835302394
198, -0.9918713245247952, 0.2324291626033947
199, -0.9918713245247952, 0.23178841744613515
200, -0.9918713245247952, 0.22926590174115244
201, -0.9918713245247952, 0.22733156219128256
202, -0.9918713245247952, 0.2260643367337279
203, -0.9918713245247952, 0.22467437515970984
204, -0.9918713245247952, 0.223594149055

362, -0.9919232143663491, 0.09155176408631394
363, -0.9919232143663491, 0.09097526250799472
364, -0.9919232143663491, 0.09016651715446036
365, -0.9919232143663491, 0.08969998334800719
366, -0.9919232143663491, 0.0900903380595246
367, -0.9919232143663491, 0.08975615477664431
368, -0.9919232143663491, 0.08950685364499222
369, -0.9919232143663491, 0.08952175223843828
370, -0.9919232143663491, 0.08883486313578695
371, -0.9919232143663491, 0.08927015220892683
372, -0.9919232143663491, 0.08947807264327211
373, -0.9919232143663491, 0.08975113121870235
374, -0.9919232143663491, 0.08995216786859363
375, -0.9919232143663491, 0.08995828421657656
376, -0.9919232143663491, 0.0898279623538135
377, -0.9919232143663491, 0.08977705969750939
378, -0.9919232143663491, 0.08941679117396738
379, -0.9919232143663491, 0.08946409804970797
380, -0.9919232143663491, 0.08966328990868845
381, -0.9919232143663491, 0.08922079906776002
382, -0.9919232143663491, 0.0891703845657406
383, -0.9919232143663491, 0.088548077

In [11]:
# Compare the plain ensemble with the ensemble after adding the bias found by
# the GA.  calc_score takes the arg-max over all classes, not only over the
# top-k candidates that calc_score_v3 used during the search.
print('rcga')
print(calc_score(true_label, pred_logit))
print(calc_score(true_label, pred_logit + best_gene[None,:]))

rcga
0.989326057683655
0.9919275720087402


In [12]:
# Write the final best gene as a single comma-separated row; this is the same
# path run() already writes to after every generation.
np.savetxt(output_file + 'best_bias.csv', best_gene[None,:], delimiter=',')