In [13]:
%matplotlib inline
from sklearn.neighbors import NearestNeighbors, KDTree

from mcmatch import cluster
from mcmatch.db.pg_database import PgFunDB
from mcmatch.util import extract_funname, signature_to_fname_heuristic
from mcmatch.feature.counter import counter_features, relative_counter_features
from mcmatch.feature import all_features, textlength_features
from mcmatch.feature.aggregator import MetricAggregator
from sklearn import preprocessing
import pprint
import matplotlib.pyplot as plt

metr_aggr = MetricAggregator([counter_features[m] for m in ['arithmetic']] + [textlength_features[m] for m in textlength_features])
metr_aggr_rel = MetricAggregator([relative_counter_features[m] for m in relative_counter_features] + [textlength_features[m] for m in textlength_features])
metr_aggr_tl = MetricAggregator([textlength_features[m] for m in textlength_features])
fdb = PgFunDB()


In [14]:
def print_analysis(predictions, test_fun_fids, fid_names):
    good = 0
    nf = 0
    f = 0
    for i, v in enumerate(predictions):
        #if v == 'i2a' or v == 'dummy':
        #    continue
        fb = signature_to_fname_heuristic(test_fun_fids[i][1])
        if v == fb:
            good += 1
            print "**********", i, v, fb
        elif fb not in fid_names:
            nf += 1
            #print "!!", fb, "not in fid_names"
        #print i, v, fb
    
    print good, "of", len(predictions), float(good)/len(predictions)
    if nf != len(predictions):
        print good, "of", len(predictions)-nf, float(good)/(len(predictions)-nf)

In [15]:
def run_test(fdb, metr_aggr, clf, trainset, transformMode=0):
    print "start"
    fids, train_data = fdb.get_features_np(metr_aggr, in_repositories=trainset)
    fid_names = map(lambda z: signature_to_fname_heuristic(z[1]), fids)
    test_fun_fids, test_fun_data = fdb.get_features_np(metr_aggr, in_repositories=['t-dietlibc'])
    test_fun_names = map(lambda z: signature_to_fname_heuristic(z[1]), test_fun_fids)
    train_data, fid_names = cluster.filter_classes(train_data, fid_names, test_fun_names)
    ppl = cluster.TransformPipeline(transformMode)
    train_data = ppl.transform_trainingset(train_data, labels=fid_names)
    print "Fit"
    clf.fit(train_data, fid_names)
    del train_data
    test_fun_data = ppl.transform_testset(test_fun_data)
    predictions = clf.predict(test_fun_data)
    print_analysis(predictions, test_fun_fids, fid_names)


In [16]:
trainset1 = ['glibc', 'musl-1.1.6']
trainset2 = ['musl-1.1.6']
#trainset_tmu = ['t-musl']
#trainset = ['glibc']
#trainset2 = trainset_tmu

from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [17]:
run_test(fdb, metr_aggr, svm.SVC(), trainset2, 16)

start
['dl_iterate_phdr' 'getdtablesize' 'ftw' ..., 'strerror' 'atexit'
 'posix_fallocate']
[   0.    1.    2. ...,  335.  338.  339.]


AssertionError: not enough class labels for specified k (smallest class has 3)

In [5]:
run_test(fdb, metr_aggr, svm.SVC(), trainset2, 16)

start


MemoryError: 

In [5]:
run_test(fdb, metr_aggr, tree.DecisionTreeClassifier(criterion="gini"), trainset2, 16)

start
Fit
********** 274 getpriority getpriority
1 of 596 0.00167785234899
1 of 371 0.00269541778976


In [6]:
run_test(fdb, metr_aggr, tree.DecisionTreeClassifier(criterion="entropy"), trainset2, 16)

start
Fit
********** 44 getgrent getgrent
1 of 596 0.00167785234899
1 of 371 0.00269541778976


In [7]:
run_test(fdb, metr_aggr, RandomForestClassifier(n_estimators=6), trainset2, 16)

start
Fit
********** 274 getpriority getpriority
********** 293 cfmakeraw cfmakeraw
********** 301 tcsetattr tcsetattr
********** 526 iconv iconv
********** 553 wcscmp wcscmp
5 of 596 0.00838926174497
5 of 371 0.0134770889488


In [9]:
run_test(fdb, metr_aggr, RandomForestClassifier(n_estimators=10), trainset2, 16)

start
Fit
********** 274 getpriority getpriority
********** 328 get_current_dir_name get_current_dir_name
********** 526 iconv iconv
********** 553 wcscmp wcscmp
4 of 596 0.00671140939597
4 of 371 0.010781671159


In [10]:
run_test(fdb, metr_aggr, RandomForestClassifier(n_estimators=14), trainset2, 16)

start
Fit
********** 146 siginterrupt siginterrupt
********** 274 getpriority getpriority
********** 293 cfmakeraw cfmakeraw
********** 377 wcscpy wcscpy
********** 526 iconv iconv
5 of 596 0.00838926174497
5 of 371 0.0134770889488


In [8]:
run_test(fdb, metr_aggr, RandomForestClassifier(n_estimators=10), trainset2, 0)

start
Fit
********** 14 isalnum isalnum
********** 51 iconv_open iconv_open
********** 274 getpriority getpriority
********** 293 cfmakeraw cfmakeraw
********** 544 ldiv ldiv
5 of 596 0.00838926174497
5 of 371 0.0134770889488


----------- NEW STUFF --------------

In [14]:
from itertools import product

tfms = [0, 16]
metra = {'ctr': metr_aggr, 'rel': metr_aggr_rel}
trainset = {'musl': trainset2}
dtfc = ['gini', 'entropy']

for tfm, metr_key, dtf, ts in product(tfms, metra, dtfc, trainset):
    print tfm, metr_key, dtf, ts
    run_test(fdb, metra[metr_key], tree.DecisionTreeClassifier(criterion=dtf), trainset[ts], tfm)

0 rel gini full
start
Fit
********** 2 isblank isblank
********** 144 isalnum isalnum
********** 197 abs abs
********** 253 isblank isblank
********** 307 iconv_open iconv_open
********** 340 flbf flbf
********** 361 mkfifo mkfifo
********** 380 usleep usleep
********** 456 isalpha isalpha
********** 484 sigdelset sigdelset
********** 501 bsearch bsearch
********** 526 cfmakeraw cfmakeraw
********** 535 getopt getopt
********** 537 lockf lockf
********** 549 wcswidth wcswidth
********** 571 to64 to64
16 of 596 0.0268456375839
16 of 419 0.0381861575179
0 rel gini musl
start
Fit
********** 6 getgrent getgrent
********** 278 iconv iconv
2 of 596 0.00335570469799
2 of 371 0.00539083557951
0 rel entropy full
start
Fit
********** 2 isblank isblank
********** 31 sigaddset sigaddset
********** 71 cfsetospeed cfsetospeed
********** 144 isalnum isalnum
********** 197 abs abs
********** 253 isblank isblank
********** 340 flbf flbf
********** 361 mkfifo mkfifo
********** 456 isalpha isalpha
******

MemoryError: 

In [5]:
from itertools import product

tfms = [0, 16]
metra = {'ctr': metr_aggr, 'rel': metr_aggr_rel}
trainset = {'musl': trainset2}
dtfc = ['gini', 'entropy']

for tfm, metr_key, dtf, ts in product(tfms, metra, dtfc, trainset):
    print tfm, metr_key, dtf, ts
    run_test(fdb, metra[metr_key], tree.DecisionTreeClassifier(criterion=dtf), trainset[ts], tfm)

0 rel gini musl
start
Fit
********** 209 strtok_r strtok_r
********** 278 iconv iconv
2 of 596 0.00335570469799
2 of 371 0.00539083557951
0 rel entropy musl
start
Fit
0 of 596 0.0
0 of 371 0.0
0 ctr gini musl
start
Fit
********** 279 gai_strerror gai_strerror
********** 526 iconv iconv
2 of 596 0.00335570469799
2 of 371 0.00539083557951
0 ctr entropy musl
start
Fit
********** 44 getgrent getgrent
1 of 596 0.00167785234899
1 of 371 0.00269541778976
16 rel gini musl
start
Fit
********** 144 isalnum isalnum
1 of 596 0.00167785234899
1 of 371 0.00269541778976
16 rel entropy musl
start
Fit
0 of 596 0.0
0 of 371 0.0
16 ctr gini musl
start
Fit
********** 51 iconv_open iconv_open
1 of 596 0.00167785234899
1 of 371 0.00269541778976
16 ctr entropy musl
start
Fit
********** 44 getgrent getgrent
********** 315 localtime_r localtime_r
2 of 596 0.00335570469799
2 of 371 0.00539083557951


In [6]:
from itertools import product

tfms = [0, 16]
metra = {'ctr': metr_aggr, 'rel': metr_aggr_rel}
trainset = {'full': trainset1}
dtfc = ['gini', 'entropy']

for tfm, metr_key, dtf, ts in product(tfms, metra, dtfc, trainset):
    print tfm, metr_key, dtf, ts
    run_test(fdb, metra[metr_key], tree.DecisionTreeClassifier(criterion=dtf), trainset[ts], tfm)

0 rel gini full
start
Fit
********** 2 isblank isblank
********** 144 isalnum isalnum
********** 197 abs abs
********** 253 isblank isblank
********** 307 iconv_open iconv_open
********** 340 flbf flbf
********** 361 mkfifo mkfifo
********** 456 isalpha isalpha
********** 501 bsearch bsearch
********** 524 ftime ftime
********** 526 cfmakeraw cfmakeraw
********** 535 getopt getopt
********** 549 wcswidth wcswidth
********** 571 to64 to64
14 of 596 0.0234899328859
14 of 419 0.0334128878282
0 rel entropy full
start
Fit
********** 2 isblank isblank
********** 31 sigaddset sigaddset
********** 71 cfsetospeed cfsetospeed
********** 144 isalnum isalnum
********** 197 abs abs
********** 253 isblank isblank
********** 340 flbf flbf
********** 361 mkfifo mkfifo
********** 456 isalpha isalpha
********** 484 sigdelset sigdelset
********** 526 cfmakeraw cfmakeraw
11 of 596 0.0184563758389
11 of 419 0.0262529832936
0 ctr gini full
start
Fit
********** 14 isalnum isalnum
********** 15 isalpha isalph

MemoryError: 

In [14]:
from itertools import product

tfms = [16]
metra = {'ctr': metr_aggr}
trainset = {'full': trainset1}
dtfc = ['gini', 'entropy']

for tfm, metr_key, dtf, ts in product(tfms, metra, dtfc, trainset):
    print tfm, metr_key, dtf, ts
    run_test(fdb, metra[metr_key], tree.DecisionTreeClassifier(criterion=dtf), trainset[ts], tfm)

16 ctr gini full
start


MemoryError: 