In [1]:
%matplotlib inline
from sklearn.neighbors import NearestNeighbors, KDTree

from mcmatch import analyze
from mcmatch.db.pg_database import PgFunDB
from mcmatch.util import extract_funname, signature_to_fname_heuristic
from mcmatch.feature import all_features, textlength_features, counter_sum_features, counter_features, relative_counter_features, relative_counter_sum_features
from mcmatch.feature import cyclo_feature
from mcmatch.feature.aggregator import FeatureAggregator
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
from metric_learn import LMNN
import timeit

In [2]:
def run_test(k, mtr_n, norm, tfm, output_function):
    """Run one k-nearest-neighbour evaluation and emit one CSV result line.

    The classifier is built on the module-level ``train_set`` repositories and
    evaluated against the ``'t-dietlibc'`` repository.  A test function counts
    as a hit when at least one of its returned neighbours resolves (through
    ``signature_to_fname_heuristic``) to the same function name.

    Reads the module-level names ``mtm``, ``fdb`` and ``train_set``; they must
    be defined before this function is called.

    Parameters
    ----------
    k : int
        Neighbour count forwarded to ``analyze.KNearestNeighbors``.
    mtr_n : str
        Key into ``mtm`` selecting the feature set; also written to the line.
    norm : str
        Distance norm name forwarded to ``analyze.KNearestNeighbors``.
    tfm :
        Transform mode forwarded as ``transform=``; written with ``%r``.
    output_function : callable
        Called exactly once with the finished result line
        ``k;feature;transform;norm;hits;misses;percent%``.

    Raises
    ------
    KeyError
        If ``mtr_n`` is not a key of ``mtm``.
    """
    # Build the prefix first so that malformed arguments fail before any
    # database work is started.
    line = '%d;%s;%r;%s;' % (k, mtr_n, tfm, norm)

    used_features = FeatureAggregator(mtm[mtr_n].values())

    di = analyze.KNearestNeighbors(fdb, used_features, training_repositories=train_set,
                                   transform=tfm, k=k, norm=norm)

    # The distances are returned alongside the neighbours but are not needed
    # for the hit/miss statistic.
    test_info, _distances, train_info = di.test(fdb, in_repositories=['t-dietlibc'])

    # Element [1] of every info entry is the signature the name is derived from.
    test_names = [signature_to_fname_heuristic(entry[1]) for entry in test_info]

    totalup = 0
    totaldown = 0
    for idx, name in enumerate(test_names):
        # Functions for which di.has_function() is false are left out of both
        # counters (presumably they have no counterpart in the training set
        # and therefore cannot be matched -- confirm against mcmatch.analyze).
        if not di.has_function(name):
            continue

        if any(signature_to_fname_heuristic(neighbour[1]) == name
               for neighbour in train_info[idx]):
            totalup += 1
        else:
            totaldown += 1

    # Guard the ratio: with no judged function the original expression
    # divided by zero and the run aborted without producing a line.
    judged = totalup + totaldown
    percent = totalup * 100.0 / judged if judged else 0.0

    line += "%d;%d;%5f%%" % (totalup, totaldown, percent)
    output_function(line)

In [8]:
# Database handle; run_test() reads it from the module namespace.
fdb = PgFunDB()

# Feature sets to sweep over, keyed by the label that ends up in the
# "feature" column of the result lines.  Uncomment entries to widen the sweep.
mtm = {
    'all' : all_features,
#    'textlength' : textlength_features,
#    'counter': counter_features,
#    'counter_sum': counter_sum_features,
#    'rel_counter': relative_counter_features,
#    'rel_counter_sum' : relative_counter_sum_features,
#    'cyclo' : cyclo_feature,
#    'textlen+rel_counter_sum' : dict(
#          list(textlength_features.items())
#        + list(relative_counter_sum_features.items()))
}

# Optional: one entry per individual counter feature.
#for m in counter_features:
#    mtm['counter_' + m] = {'m': counter_features[m]}
#for m in relative_counter_features:
#    mtm['rel_counter_' + m] = {'m' : relative_counter_features[m]}

# Distance norms passed through to analyze.KNearestNeighbors via run_test().
norms = [ 'euclidean', 'cityblock', 'cosine' ]
    #'cityblock', 'euclidean', 'cosine']

# Values handed to run_test() as `tfm` (forwarded as transform=).
transform_modes = [ 0, 1, 2, 4 ]

# Repositories the classifier is trained on; read by run_test().
train_set = ['t-glibc', 'musl-1.1.6']
#train_set = ['t-musl']

# Neighbour counts to evaluate.
#ks = [1,5,25,50]
ks = [ 5, 25 ]


def doublewrap(line):
    """Emit one result line on stdout.

    The mirror write to `outfile` is currently disabled; re-enable the two
    commented lines to also persist results to the CSV file.
    """
    #outfile.write(line + "\n")
    #outfile.flush()
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(line)


# NOTE: mode "w" truncates the file on every run even while the writes in
# doublewrap() are disabled.
outfile = open("shell-knn-timeit.out.csv", "w")
try:
    doublewrap("k;feature;tranformation_mode;norm;in_knn;not_in_knn;percent")
    for k, mtr_n, norm, tfm in product(ks, mtm, norms, transform_modes):
        # Each configuration is executed once; its wall-clock time in seconds
        # is printed on the line following the result line.
        print(timeit.timeit(lambda: run_test(k, mtr_n, norm, tfm, doublewrap), number=1))

        # Legacy partition-size analysis, kept for reference.  It refers to
        # names (di, testset_infos, pairwise_d) that are not defined at this
        # scope, so it cannot be re-enabled as is.
        #training_infos = di.get_trainingset_infos()

        #em = analyze.DistanceInfo.make_equivalence_map(testset_infos, training_infos)

        #good, bad, other = 0, 0, 0
        #for i in range(0, len(em)):
        #    res = analyze.DistanceInfo.get_partition_sizes(pairwise_d[i], None, em[i])
        #    for el in res:
        #        if el[0] < el[2]:
        #            good += 1
        #        elif el[0] > el[2]:
        #            bad += 1
        #try:
        #    print good, bad, other, 1.0 * good/bad
        #except ZeroDivisionError, e:
        #    # zomg x/0
        #    print good, bad, other, "---"


        #di.make_aggregate_graph(pairwise_d, testset_infos, em, title=label)
        #plt.savefig("glibc_dietlibc_default_aggr.pdf")
        #plt.close()
finally:
    # Release the file handle even when a configuration raises mid-sweep.
    outfile.close()


k;feature;tranformation_mode;norm;in_knn;not_in_knn;percent
5;all;0;euclidean;34;385;8.114558%
6.04579806328
5;all;1;euclidean;4;415;0.954654%
5.92357587814
5;all;2;euclidean;0;419;0.000000%
7.14539790154
5;all;4;euclidean;0;419;0.000000%
24.4634330273
5;all;0;cityblock;39;380;9.307876%
6.14201402664
5;all;1;cityblock;3;416;0.715990%
6.11756706238
5;all;2;cityblock;0;419;0.000000%
7.37689900398
5;all;4;cityblock;0;419;0.000000%
24.1733560562
5;all;0;cosine;31;388;7.398568%
7.77216887474
5;all;1;cosine;4;415;0.954654%
7.76878213882
5;all;2;cosine;0;419;0.000000%
9.08852982521
5;all;4;cosine;0;419;0.000000%
25.1514749527
25;all;0;euclidean;59;360;14.081146%
6.00383210182
25;all;1;euclidean;9;410;2.147971%
5.98672294617
25;all;2;euclidean;0;419;0.000000%
7.69231820107
25;all;4;euclidean;0;419;0.000000%
24.7314100266
25;all;0;cityblock;71;348;16.945107%
6.27079296112
25;all;1;cityblock;7;412;1.670644%
6.21936988831
25;all;2;cityblock;3;416;0.715990%
7.33002400398
25;all;4;cityblock;3;416;0