In [1]:
# Configs

# Selects which dataset variant is loaded: the value is interpolated into the
# dataset path ("../../dataset/<embedding_type>/dataset.csv") in the loading cell.
embedding_type = "perf" # expected values: "time" or "perf"

In [2]:
import pandas as pd
import matplotlib
import numpy as np
from sklearn import tree

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Print NumPy floats with 3 decimal places and in fixed-point notation
# (suppress=True avoids scientific notation for small values).
np.set_printoptions(precision=3, suppress=True)

In [3]:
# Load the dataset for the configured embedding type and one-hot encode its
# object/category columns with get_dummies so every column is numeric.
# NOTE(review): the feature table displayed further down in this notebook has an
# "Unnamed: 0" column, which looks like a saved row index being read back as a
# feature — confirm whether it should be excluded (e.g. read_csv(index_col=0)).
dataset = pd.get_dummies(
    pd.read_csv(f"../../dataset/{embedding_type}/dataset.csv")
)

# Dataset columns used as prediction targets; they are split off from the
# feature columns when the train/test frames are built.
labels = [
    "label_strings",
    "label_implementation",
    "label_greedy",
    "label_brute_force",
    "label_dp",
    "label_divide_and_conquer",
    "label_graphs",
    "label_binary_search",
    "label_math",
    "label_sortings",
    "label_shortest_paths",
]

# Display names for the reports: drop everything up to the first underscore
# (the "label" prefix) and turn the remaining underscores into spaces.
print_labels = [
    column.split('_', 1)[1].replace('_', ' ')
    for column in labels
]

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# shuffled split identical between runs.
train, test = train_test_split(dataset, test_size=0.33, random_state=42, shuffle=True)

# Separate the target columns (`labels`) from the feature columns.
# `drop` and list-based column selection both return new DataFrames, so
# `train` and `test` themselves are left unmodified.
train_dataset_features = train.drop(columns=labels)
train_dataset_labels = train[labels]

test_dataset_features = test.drop(columns=labels)
test_dataset_labels = test[labels]



In [4]:
# Display the held-out feature rows ordered by their row index (the split
# shuffles rows, so the frame itself is not in index order).
test_dataset_features.sort_index()

Unnamed: 0,branch-misses_FEATURE_CONFIG,branch-misses_INTERCEPT,branch-misses_R-VAL,branches_FEATURE_CONFIG,branches_INTERCEPT,branches_R-VAL,context-switches_FEATURE_CONFIG,context-switches_INTERCEPT,context-switches_R-VAL,cpu-migrations_FEATURE_CONFIG,...,stalled-cycles-frontend_FEATURE_TYPE_LOGLOG_POLYNOMIAL,stalled-cycles-frontend_FEATURE_TYPE_LOG_POLYNOMIAL,stalled-cycles-frontend_FEATURE_TYPE_POLYNOMIAL,stalled-cycles-frontend_FEATURE_TYPE_POWER,task-clock_FEATURE_TYPE_FACTORIAL,task-clock_FEATURE_TYPE_FRACTIONAL_POWER,task-clock_FEATURE_TYPE_LOGLOG_POLYNOMIAL,task-clock_FEATURE_TYPE_LOG_POLYNOMIAL,task-clock_FEATURE_TYPE_POLYNOMIAL,task-clock_FEATURE_TYPE_POWER
8,2.0,12378.326531,-209.163265,0.0,356091.016439,1.403287e+01,0,0.0,0.0,0,...,0,1,0,0,0,0,0,1,0,0
12,1.0,12213.758169,3.625786,1.0,353659.222153,3.252067e+02,0,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,0
14,0.0,12116.890361,60.779827,1.0,356370.899267,7.348262e+01,0,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,0
15,1.0,12371.937143,2.987563,1.0,356527.551020,6.530074e+01,0,0.0,0.0,0,...,0,1,0,0,0,0,0,0,0,1
17,0.0,12247.457746,52.703847,1.0,356387.498915,1.423683e+02,0,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5941,1.0,12314.822883,1.704246,1.0,354975.643034,6.615757e+02,0,0.0,0.0,0,...,0,1,0,0,0,0,0,0,1,0
5943,2.0,12300.387768,-0.000003,4.0,359984.733725,-2.879984e-13,0,0.0,0.0,0,...,0,0,1,0,0,0,0,1,0,0
5944,0.0,12233.320504,75.226898,1.0,359885.417516,2.033561e+03,0,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,0
5945,0.9,12262.235739,89.969619,2.0,355167.687085,1.331121e-03,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# Fit a single decision tree on all label columns at once (the target is a
# multi-column frame, so this is a multi-output classifier).
# random_state makes the fitted tree reproducible across runs: without it the
# order in which features are considered at each split is randomised. 42 matches
# the seed already used for the train/test split.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(train_dataset_features, train_dataset_labels)

DecisionTreeClassifier()

In [6]:
from graphviz import Source

# Export the fitted tree as DOT text, naming split nodes after the feature columns.
dot_source = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=train_dataset_features.columns,
)

# Render the DOT text to dtree_render.png; view=True also opens the rendered
# file with the system's default viewer.
graph = Source(dot_source)
graph.format = 'png'
graph.render('dtree_render', view=True)


dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.670864 to fit


'dtree_render.png'

In [7]:
# Per-label precision / recall / F1 on the training split.
# zero_division=0 produces the same scores as the default "warn" setting
# (undefined ratios are counted as 0) but without the UndefinedMetricWarning
# raised when a sample has no predicted or no true labels.
train_predictions = model.predict(train_dataset_features)
print(classification_report(train_dataset_labels, train_predictions,
                            target_names=print_labels, zero_division=0))

                    precision    recall  f1-score   support

           strings       1.00      1.00      1.00      1522
    implementation       1.00      1.00      1.00      2788
            greedy       1.00      1.00      1.00      1060
       brute force       1.00      1.00      1.00       645
                dp       1.00      1.00      1.00        72
divide and conquer       1.00      1.00      1.00        67
            graphs       1.00      1.00      1.00       159
     binary search       1.00      1.00      1.00        67
              math       1.00      1.00      1.00       711
          sortings       1.00      1.00      1.00       334
    shortest paths       1.00      1.00      1.00       159

         micro avg       1.00      1.00      1.00      7584
         macro avg       1.00      1.00      1.00      7584
      weighted avg       1.00      1.00      1.00      7584
       samples avg       0.99      0.99      0.99      7584



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Per-label precision / recall / F1 on the held-out test split.
# zero_division=0 produces the same scores as the default "warn" setting
# (undefined ratios are counted as 0) but without the UndefinedMetricWarning
# raised when a sample has no predicted or no true labels.
test_predictions = model.predict(test_dataset_features)
print(classification_report(test_dataset_labels, test_predictions,
                            target_names=print_labels, zero_division=0))

                    precision    recall  f1-score   support

           strings       0.87      0.87      0.87       756
    implementation       0.94      0.93      0.93      1387
            greedy       0.78      0.78      0.78       523
       brute force       0.78      0.79      0.79       311
                dp       0.87      0.77      0.82        35
divide and conquer       0.91      0.65      0.75        31
            graphs       0.91      0.83      0.87        83
     binary search       0.91      0.65      0.75        31
              math       0.90      0.85      0.87       301
          sortings       0.66      0.64      0.65       176
    shortest paths       0.91      0.83      0.87        83

         micro avg       0.87      0.85      0.86      3717
         macro avg       0.86      0.78      0.81      3717
      weighted avg       0.87      0.85      0.86      3717
       samples avg       0.89      0.88      0.87      3717



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
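# Disabled: alternative visualisation of the fitted tree using pybaobabdt.
# NOTE(review): pybaobabdt is not imported in the visible import cell; add the import before re-enabling.
# ax = pybaobabdt.drawTree(model, size=10, dpi=72, features=train_dataset_features.keys())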

In [10]:
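# Disabled: would save the figure behind `ax` (from the commented-out drawTree call in the previous cell) to tree.png.
# ax.get_figure().savefig('tree.png', format='png', dpi=300, transparent=True)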