In [2]:
%matplotlib inline
from sklearn.neighbors import NearestNeighbors, KDTree

from mcmatch import cluster
from mcmatch.db.pg_database import PgFunDB
from mcmatch.util import extract_funname, signature_to_fname_heuristic
from mcmatch.metric import all_metrics, textlength_metrics, counter_sum_metrics, counter_metrics, relative_counter_metrics, relative_counter_sum_metrics
from mcmatch.metric import cyclo_metric
from mcmatch.metric.aggregator import MetricAggregator
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
from metric_learn import LMNN


In [4]:
# Database handle; it is handed to the classifier and to its test run below.
fdb = PgFunDB()

# Union of the text-length metrics and the relative counter-sum metrics,
# evaluated together as one combined metric set.
textlen_plus_rel_counter_sum = dict(
    list(textlength_metrics.items())
    + list(relative_counter_sum_metrics.items())
)

# Metric sets to evaluate, keyed by the label that ends up in the CSV
# "metric" column.  Each value is a mapping whose values are the metric
# objects passed on to MetricAggregator.
mtm = {
    'all' : all_metrics,
    'textlength' : textlength_metrics,
    'counter': counter_metrics,
    'counter_sum': counter_sum_metrics,
    'rel_counter': relative_counter_metrics,
    'rel_counter_sum' : relative_counter_sum_metrics,
    'cyclo' : cyclo_metric,
    'textlen+rel_counter_sum' : textlen_plus_rel_counter_sum,
}

# Also evaluate every (relative) counter metric on its own, as a
# single-entry metric set.
for m, single_metric in counter_metrics.items():
    mtm['counter_' + m] = {'m': single_metric}
for m, single_metric in relative_counter_metrics.items():
    mtm['rel_counter_' + m] = {'m': single_metric}

# Distance norms handed to the k-nearest-neighbour classifier.
norms = ['euclidean', 'cityblock', 'cosine']

# Values passed as the classifier's `transform` argument.
transform_modes = [0]

# Repositories used for training (alternative tried earlier: ['t-musl']).
train_set = ['t-glibc', 'musl-1.1.6']

# Neighbourhood sizes to test (alternative tried earlier: [1, 5, 25, 50]).
ks = [5, 25]

# Result file for the whole experiment; it is closed once the evaluation
# loop further down has finished.
outfile = open("shell-knn.out.csv", "w")
def doublewrap(line):
    """Record one result row in the CSV file and echo it to the notebook.

    line -- the text of the row, without a trailing newline.
    """
    outfile.write(line + "\n")
    # Flush after every row so each line reaches the file immediately
    # instead of sitting in the write buffer during the long-running loop.
    outfile.flush()
    # Parenthesised form: prints exactly the same on Python 2 for a single
    # argument and, unlike the bare print statement, is valid on Python 3.
    print(line)

def _count_knn_hits(di, test_names, train_info):
    """Count how often a test function's namesake is among its neighbours.

    di          -- classifier object; only its has_function(name) method is
                   used here (presumably it reports whether the training
                   set contains a function of that name -- TODO confirm).
    test_names  -- function names of the test set, one per test function.
    train_info  -- train_info[i] is the sequence of neighbours returned for
                   test function i; element [1] of each neighbour is fed to
                   signature_to_fname_heuristic to obtain its name.

    Returns a tuple (found, missed, skipped): test functions with at least
    one same-named neighbour, test functions with none, and test functions
    left out because di.has_function() rejected them.
    """
    found = missed = skipped = 0
    for idx, name in enumerate(test_names):
        if not di.has_function(name):
            skipped += 1
            continue
        # A single same-named neighbour is enough to count as a hit.
        if any(signature_to_fname_heuristic(neighbour[1]) == name
               for neighbour in train_info[idx]):
            found += 1
        else:
            missed += 1
    return found, missed, skipped


# Evaluate every combination of k, metric set, norm and transform mode and
# emit one CSV row per combination.  The try/finally guarantees the result
# file is closed even when one of the runs raises.
try:
    # NOTE: the header spelling ("tranformation_mode") is kept unchanged,
    # since anything that already parses this CSV may key on it.
    doublewrap("k;metric;tranformation_mode;norm;in_knn;not_in_knn;percent")
    for k, mtr_n, norm, tfm in product(ks, mtm, norms, transform_modes):
        line = '%d;%s;%r;%s;' % (k, mtr_n, tfm, norm)

        mtr_o = mtm[mtr_n]
        used_metrics = MetricAggregator(mtr_o.values())

        di = cluster.KNearestNeighbors(fdb, used_metrics,
                                       training_repositories=train_set,
                                       transform=tfm, k=k, norm=norm)

        test_info, distances, train_info = di.test(fdb, in_repositories=['t-dietlibc'])

        # Reduce each test entry to a function name.  A real list (rather
        # than map()) stays reusable on Python 3 as well.
        test_info = [signature_to_fname_heuristic(entry[1]) for entry in test_info]

        # totalnf (functions skipped by has_function) is kept for interactive
        # inspection; it is not part of the CSV row.
        totalup, totaldown, totalnf = _count_knn_hits(di, test_info, train_info)

        # Guard against an empty evaluation (every test function skipped or
        # no test functions at all), which used to raise ZeroDivisionError.
        evaluated = totalup + totaldown
        percent = totalup * 100.0 / evaluated if evaluated else 0.0

        line += "%d;%d;%5f%%" % (totalup, totaldown, percent)
        doublewrap(line)
finally:
    outfile.close()


k;metric;tranformation_mode;norm;in_knn;not_in_knn;percent
5;all;0;euclidean;35;384;8.353222%
5;all;0;cityblock;40;379;9.546539%
5;all;0;cosine;29;390;6.921241%
5;counter;0;euclidean;35;384;8.353222%
5;counter;0;cityblock;41;378;9.785203%
5;counter;0;cosine;29;390;6.921241%
5;counter_logical;0;euclidean;13;406;3.102625%
5;counter_logical;0;cityblock;13;406;3.102625%
5;counter_logical;0;cosine;6;413;1.431981%
5;rel_counter_r_jumps;0;euclidean;10;409;2.386635%
5;rel_counter_r_jumps;0;cityblock;10;409;2.386635%
5;rel_counter_r_jumps;0;cosine;10;409;2.386635%
5;rel_counter_r_expensive;0;euclidean;1;418;0.238663%
5;rel_counter_r_expensive;0;cityblock;1;418;0.238663%
5;rel_counter_r_expensive;0;cosine;2;417;0.477327%
5;rel_counter_r_arith;0;euclidean;10;409;2.386635%
5;rel_counter_r_arith;0;cityblock;10;409;2.386635%
5;rel_counter_r_arith;0;cosine;8;411;1.909308%
5;cyclo;0;euclidean;12;407;2.863962%
5;cyclo;0;cityblock;12;407;2.863962%
5;cyclo;0;cosine;10;409;2.386635%
5;rel_counter_sum;0;eu