In [3]:
%matplotlib inline
from sklearn.neighbors import NearestNeighbors, KDTree

from mcmatch import cluster
from mcmatch.db.pg_database import PgFunDB
from mcmatch.util import extract_funname, signature_to_fname_heuristic
from mcmatch.metric import all_metrics, textlength_metrics, counter_sum_metrics, counter_metrics, relative_counter_metrics, relative_counter_sum_metrics
from mcmatch.metric import cyclo_metric
from mcmatch.metric.aggregator import MetricAggregator
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
import pprint


In [4]:
# Handle to the function database; it is handed to cluster.DistanceInfo both
# for building the training data and for running the test set.
fdb = PgFunDB()

# Union of the text-length and relative-counter-sum families. Entries of the
# second family are applied last, so they win if a key exists in both.
textlen_and_rel_counter_sum = dict(textlength_metrics.items())
textlen_and_rel_counter_sum.update(relative_counter_sum_metrics.items())

# Metric configurations to evaluate: label -> mapping of metrics. Only the
# values of each mapping are used later (they are fed to MetricAggregator).
mtm = dict([
    ('all', all_metrics),
    ('textlength', textlength_metrics),
    ('counter', counter_metrics),
    ('counter_sum', counter_sum_metrics),
    ('rel_counter', relative_counter_metrics),
    ('rel_counter_sum', relative_counter_sum_metrics),
    ('cyclo', cyclo_metric),
    ('textlen+rel_counter_sum', textlen_and_rel_counter_sum),
])

# Additionally evaluate every single counter metric on its own, wrapped in a
# one-entry mapping so it has the same shape as the families above.
# NOTE(review): a metric named e.g. 'sum' would overwrite the 'counter_sum' /
# 'rel_counter_sum' family entries -- confirm no such key exists.
for prefix, family in (('counter_', counter_metrics),
                       ('rel_counter_', relative_counter_metrics)):
    for m in family:
        mtm[prefix + m] = {'m': family[m]}

# Distance norms passed to cluster.DistanceInfo(norm=...).
norms = ['cityblock', 'euclidean', 'cosine']

# Values passed to cluster.DistanceInfo(transform=...).
transform_modes = [0]

# Repositories used as the training set.
train_set = ['t-glibc', 'musl-1.1.6']

# CSV sink for the result rows; opened once here and shared by doublewrap().
outfile = open("shell-dist.out.csv", "w")
def doublewrap(line):
    """Emit ``line`` to both the CSV file and the notebook output.

    The line is written to ``outfile`` with a trailing newline and flushed
    immediately, so partial results are on disk even if a later iteration
    fails; it is then echoed to stdout.

    :param line: one complete CSV row, without the trailing newline.
    """
    outfile.write(line + "\n")
    outfile.flush()
    # Parenthesised single-argument form: identical output under the Python 2
    # print statement and the Python 3 print function.
    print(line)

def _k_statistics(ks):
    """Return ``[mean, median, stddev]`` of ``ks`` as values for the CSV row.

    When ``ks`` is empty no statistic is defined. The placeholders returned
    in that case reproduce the strings this script has always emitted: the
    literal "NaN" for the mean, and "nan" for median and standard deviation
    (what NumPy's results for an empty sequence stringify to) -- without
    triggering a ZeroDivisionError or NumPy's empty-slice warnings.
    """
    if not ks:
        return ["NaN", "nan", "nan"]
    return [1.0 * sum(ks) / len(ks), np.median(ks), np.std(ks)]


# Evaluate every (metric set, norm, transform) combination and emit one CSV
# row per combination. The try/finally guarantees that the CSV file opened
# earlier in this cell is closed once the sweep ends, even if it fails midway.
try:
    # The header is kept byte-for-byte, including the "tranformation_mode"
    # spelling, because renaming a column would change the emitted CSV format.
    doublewrap("metric;tranformation_mode;norm;better_than_avg;worse_than_avg;other;num_fns_in_both_sets;mean_k;median_k;stddev_k")
    for mtr_n, norm, tfm in product(mtm, norms, transform_modes):
        line = '%s;%r;%s;' % (mtr_n, tfm, norm)
        mtr_o = mtm[mtr_n]
        used_metrics = MetricAggregator(mtr_o.values())

        # Train on train_set, then compute distances for the test repository.
        di = cluster.DistanceInfo(fdb, used_metrics, training_repositories=train_set, transform=tfm, norm=norm)
        pairwise_d, testset_infos = di.test(fdb, in_repositories=['t-dietlibc'])
        training_infos = di.get_trainingset_infos()

        # Pair up test and training functions via the name derived from
        # element 1 of each info record.
        em = cluster.DistanceInfo.make_equivalence_map(testset_infos, training_infos,
                                                      key=lambda z: signature_to_fname_heuristic(z[1]))

        # Tallies reported as better_than_avg / worse_than_avg / other.
        good, bad, other = 0, 0, 0

        # One entry per test function that produced at least one result:
        # the smallest el[0] among its results.
        ks = []
        for i in range(len(em)):
            res = cluster.DistanceInfo.get_partition_sizes(pairwise_d[i], None, em[i])
            closests = []
            for el in res:
                # NOTE(review): presumably el[0] / el[2] are the partition
                # sizes on either side of the matching function -- confirm
                # against get_partition_sizes.
                closests.append(el[0])
                if el[0] < el[2]:
                    good += 1
                elif el[0] > el[2]:
                    bad += 1
                else:
                    other += 1
            if closests:
                ks.append(min(closests))

        line += ";".join(map(str, [good, bad, other, len(ks)] + _k_statistics(ks)))
        doublewrap(line)

        # Optional per-configuration plot (disabled):
        #di.make_aggregate_graph(pairwise_d, testset_infos, em, title=label)
        #plt.savefig("glibc_dietlibc_default_aggr.pdf")
        #plt.close()

        # Drop the per-configuration objects before the next iteration
        # (presumably to limit peak memory across the sweep).
        del di.train_data
        del di.trainingset_idx_to_ftid
        del pairwise_d
        del testset_infos
        del training_infos
        del em
        del di
        #gc.collect()
finally:
    outfile.close()

metric;tranformation_mode;norm;better_than_avg;worse_than_avg;other;num_fns_in_both_sets;mean_k;median_k;stddev_k
all;0;cityblock;2828;1308;0;419;3442.92840095;1034.0;4689.96208234
all;0;euclidean;2820;1316;0;419;3429.52505967;1061.0;4643.23611034
all;0;cosine;2665;1471;0;419;3071.94033413;1173.0;4065.5129111
counter;0;cityblock;3016;1120;0;419;3126.28639618;373.0;5086.09234204
counter;0;euclidean;3014;1122;0;419;3303.55131265;400.0;5167.71293999
counter;0;cosine;2739;1199;198;419;2186.42720764;645.0;3391.63064763
counter_logical;0;cityblock;2462;1674;0;419;2214.28162291;0.0;4774.91742416
counter_logical;0;euclidean;2487;1649;0;419;2250.82100239;0.0;4785.69548981
counter_logical;0;cosine;919;903;2314;419;768.957040573;0.0;1970.33727606
rel_counter_r_jumps;0;cityblock;2528;1608;0;419;2884.08591885;696.0;4324.51626909
rel_counter_r_jumps;0;euclidean;2478;1658;0;419;3173.29594272;971.0;4433.58149974
rel_counter_r_jumps;0;cosine;1780;1097;1259;419;2176.86873508;390.0;3239.28827143
rel_coun