In [2]:
import copy
import math
import os
import time
import xml.etree.ElementTree as ET

import numpy as np
import tqdm

import data_process.self_supervised.data_reader as data_reader
import inference.InferCode_Inference as II

# Dictionary files handed to II.InferCode_Inference when a model is loaded.
# Presumably they map tokens / node types to integer ids (going by the file
# names) -- TODO confirm.
# NOTE(review): every path in this cell is an absolute, machine-specific path.
token_dict_path = "/home/stanley/Desktop/dictionaries/v2_dic/token2id.json"
type_dict_path = "/home/stanley/Desktop/dictionaries/v2_dic/type2id.json"
subtree_count = 633916 # 633916 for the SS-PTM v2
# Presumably the length of a code vector (the example vector printed at the
# end of the notebook has 64 entries) -- TODO confirm.
dimension = 64

# Directory of XML code examples that are parsed and embedded further down.
file_path = "/home/stanley/Desktop/test_20k"
# Alternative, larger dataset:
# file_path = "/home/stanley/Desktop/test_dataset_100k"

In [3]:

def reduce(tree, percentage):
    """Return a copy of ``tree`` with roughly ``percentage`` of its nodes removed.

    The element whose number of descendants is closest to
    ``ceil(node_count * percentage)`` is collapsed: its children, attributes,
    text and tail are dropped and the element itself is relabelled as an
    ``unknown_type`` / ``unknown_token`` placeholder. ``tree`` itself is never
    modified.

    Args:
        tree: an ``xml.etree.ElementTree.ElementTree``.
        percentage: fraction of nodes to remove, e.g. ``0.1`` for 10%.

    Returns:
        A new ``ElementTree``. When no element qualifies for removal (for
        example the root has no descendants at all) an unmodified copy is
        returned instead of raising.
    """
    reduced_tree = copy.deepcopy(tree)
    elements = list(reduced_tree.iter())  # pre-order; elements[0] is the root

    # Count the descendants of every element in a single pass. Walking the
    # pre-order list backwards visits each child before its parent, so the
    # children's counts are always available when the parent is processed.
    descendant_count = {}
    for element in reversed(elements):
        descendant_count[element] = sum(
            descendant_count[child] + 1 for child in element
        )

    node_count = descendant_count[elements[0]]
    threshold = math.ceil(node_count * percentage)

    target = None
    difference = node_count
    for element in elements:
        gap = abs(descendant_count[element] - threshold)
        # Strict '<' keeps the first element (in document order) on ties.
        if gap < difference:
            target = element
            difference = gap

    if target is None:
        # Nothing can be collapsed (e.g. a tree consisting of the root only).
        return reduced_tree

    # Collapse the chosen subtree into a single placeholder node.
    target.clear()
    target.tag = "unknown_type"
    target.text = "unknown_token"

    return reduced_tree


In [2]:
# Run the project's Data_Reader over `file_path` and keep its processed dataset.
# NOTE(review): `dataset` is not referenced by any other cell shown in this
# notebook, and this cell needs `file_path` / `data_reader` from the first cell.
dataset = data_reader.Data_Reader(file_path).processed_dataset

Data Processing : 100%|██████████| 20705/20705 [00:52<00:00, 394.97it/s]


In [4]:
def collect_file_paths(path):
    """Recursively collect the path of every file located under ``path``.

    No extension filter is applied: every regular file found by ``os.walk`` is
    included, not only XML files.

    Args:
        path: directory to walk.

    Returns:
        A sorted list of file paths (each joined with the directory it was
        found in). Sorting makes the order independent of the filesystem, so
        index-aligned lists built from it are reproducible between runs.
        A missing directory yields an empty list.
    """
    file_paths = [
        os.path.join(root, filename)
        for root, _directories, filenames in os.walk(path)
        for filename in filenames
    ]
    file_paths.sort()
    return file_paths

def get_node_count(tree):
    """Count the nodes of ``tree`` below its root (the root is not counted)."""
    root = tree.getroot()
    return sum(1 for node in root.iter() if node is not root)

In [5]:
# Parse every example once and build, per file, the original tree plus two
# reduced variants. All six lists below are index-aligned with `paths`.
paths = collect_file_paths(file_path)
full_tree = [] # the original parsed trees
tree_10 = [] # trees with about 10% of the nodes removed
tree_30 = [] # trees with about 30% of the nodes removed

len_full = [] # the number of nodes for each original tree
len_10 = [] # the number of nodes for each tree with 10% less nodes
len_30 = [] # the number of nodes for each tree with 30% less nodes

# Passing the list itself (rather than enumerate(...)) lets tqdm know the total
# and render a real progress bar.
for path in tqdm.tqdm(paths):
    original_tree = ET.parse(path)
    # reduce() works on its own deep copy, so the freshly parsed tree can be
    # stored directly without copying it again.
    reduced_10 = reduce(original_tree, 0.1)
    reduced_30 = reduce(original_tree, 0.3)

    full_tree.append(original_tree)
    tree_10.append(reduced_10)
    tree_30.append(reduced_30)

    len_full.append(get_node_count(original_tree))
    len_10.append(get_node_count(reduced_10))
    len_30.append(get_node_count(reduced_30))

20705it [02:04, 166.02it/s]


In [13]:
# For every saved model, check how often the vector of the original tree is
# closer (Euclidean distance) to its 10%-reduced variant than to its
# 30%-reduced variant.
model_direct = "/home/stanley/Desktop/batch_32_SS_PTM_v2"
model_paths = collect_file_paths(model_direct)
difference_threshold = 0.1 # 10%
for model_path in model_paths:
    inferencer = II.InferCode_Inference(model_path, token_dict_path, type_dict_path, subtree_count, dimension)

    positive_count = 0
    negative_count = 0
    total = len(paths)
    total_diff = 0
    for idx in tqdm.tqdm(range(len(paths))):
        if len_full[idx] == 0:
            # A tree without any node below the root has no meaningful
            # reduction ratio; drop it instead of dividing by zero.
            total -= 1
            continue

        difference = len_10[idx]/len_full[idx] - len_30[idx]/len_full[idx]

        if difference < difference_threshold:
            # remove examples that have a small difference in removed number of nodes
            total -= 1
            continue

        # Copies are handed to code2vec -- presumably because it may modify the
        # tree it receives; TODO confirm.
        code_vector = inferencer.code2vec(copy.deepcopy(full_tree[idx]), tree = True)
        code_vector_10 = inferencer.code2vec(copy.deepcopy(tree_10[idx]), tree = True)
        code_vector_30 = inferencer.code2vec(copy.deepcopy(tree_30[idx]), tree = True)

        # if cos(code_vector, code_vector_10) > cos(code_vector, code_vector_30):
        if np.linalg.norm(code_vector-code_vector_10) < np.linalg.norm(code_vector-code_vector_30):
            positive_count += 1
        else:
            # for the negative ones, accumulate the node-ratio difference so
            # its average can be reported below
            negative_count += 1
            total_diff += difference

    # Guard both ratios: a model may end up with no kept examples or with no
    # negative pairs at all, which used to raise ZeroDivisionError.
    accuracy = positive_count / total if total else float("nan")
    average_negative_diff = total_diff / negative_count if negative_count else float("nan")
    print(model_path, " has accuracy :", accuracy, " The total number is ", total)
    print("average difference for negative pairs", average_negative_diff)
    # SS-PTM-v1 with momentum, epoch 59, accuracy: 88%
    # SS-PTM-v2-pos with momentum, epoch 1, accuracy: 88.6%
    # SS-PTM-v2 with momentum, epoch 11, accuracy: 87.1%
    # batch_32_SS_PTM_v2/epoch_6, 87%

20705it [07:06, 48.57it/s]
/home/stanley/Desktop/batch_32_SS_PTM_v2/epoch_3.pkl  has accuracy : 0.8436698271054099  The total number is  17930
average difference for negative pairs 0.18001563379638125
20705it [06:33, 52.65it/s]
/home/stanley/Desktop/batch_32_SS_PTM_v2/epoch_9.pkl  has accuracy : 0.8140546569994422  The total number is  17930
average difference for negative pairs 0.18559996546153235
20705it [06:16, 55.05it/s]
/home/stanley/Desktop/batch_32_SS_PTM_v2/epoch_8.pkl  has accuracy : 0.8241494701617401  The total number is  17930
average difference for negative pairs 0.17818164047995083
20705it [06:32, 52.77it/s]
/home/stanley/Desktop/batch_32_SS_PTM_v2/epoch_11.pkl  has accuracy : 0.7946458449525934  The total number is  17930
average difference for negative pairs 0.18290712443291796
20705it [07:19, 47.09it/s]
/home/stanley/Desktop/batch_32_SS_PTM_v2/epoch_1.pkl  has accuracy : 0.8386503067484663  The total number is  17930
average difference for negative pairs 0.179992161481

In [4]:
# generate code vector examples
# Load one specific checkpoint (epoch 6) and point at three sample XML files
# whose vectors are compared pairwise below.
# NOTE(review): absolute, machine-specific paths.
model2 = II.InferCode_Inference("/home/stanley/Desktop/batch_32_SS_PTM_v2/epoch_6.pkl", token_dict_path, type_dict_path, subtree_count, dimension)
code1 = "/home/stanley/Desktop/code1.xml"
code2 = "/home/stanley/Desktop/code2.xml"
code3 = "/home/stanley/Desktop/code3.xml"

def distance(vec1, vec2):
    """Return the norm (``np.linalg.norm``) of the difference ``vec1 - vec2``."""
    delta = vec1 - vec2
    return np.linalg.norm(delta)

def cos(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    Computed as ``dot(x, y)`` divided by the product of the square roots of
    ``dot(x, x)`` and ``dot(y, y)``.
    """
    inner = np.dot(x, y)
    norm_x = np.sqrt(np.dot(x, x))
    norm_y = np.sqrt(np.dot(y, y))
    return inner / (norm_x * norm_y)

def dot(x, y):
    """Return ``np.dot(x, y)``."""
    product = np.dot(x, y)
    return product

# Embed the three sample files; here code2vec is given a file path (the
# evaluation loop above passes parsed trees with tree=True instead).
vec_1 = model2.code2vec(code1)
vec_2 = model2.code2vec(code2)
vec_3 = model2.code2vec(code3)

# Pairwise distances between the three vectors, via distance() defined above.
d12 = distance(vec_1, vec_2)
d13 = distance(vec_1, vec_3)
d23 = distance(vec_2, vec_3)

# Show one raw vector for inspection.
print(vec_3)

print("distance between function 1 and 2", d12)
print("distance between function 1 and 3", d13)
print("distance between function 2 and 3", d23)


[ 0.9999982   0.99998236  0.46020326  0.9999997   0.9999998   0.9999957
 -0.5291352   0.99999076  0.9999909   0.99999917  0.9999896   0.9999915
  0.9999956   0.9996745   1.          0.99999857  0.99999034  0.99999976
  0.99989307  0.9999901   0.99994284  0.9999986   1.          0.9982172
  0.9998947   0.9998592   0.99999654  0.9999362   0.99998933  0.99999076
  0.9999972   1.          0.9999859   0.9999971   0.999994    1.
  0.99945176  0.99999803  0.9999988   0.92809904  0.9999935   0.9999959
  0.9999914  -0.1701388   0.9999992   0.9999931   0.99938476  1.
  0.9999988   0.9995378   1.          0.99999535  0.9999631   0.99978536
  0.9999957   0.9999979   0.9999793   0.99999887  0.9999999   0.999998
  0.9999999   0.999997    0.99999094  0.9999833 ]
distance between function 1 and 2 1.102961
distance between function 1 and 3 1.1660444
distance between function 2 and 3 1.1417463
