### This notebook generates all the DirectProbe analysis results presented in the paper.
#### The exceptions are the number of clusters and the distances between clusters, which are found in DirectProbe's log files.

In [None]:
from utils import load_codesearchnet, get_max_edges, draw_map

import os
import json
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
from sklearn.manifold import TSNE

### DirectProbe Accuracy

In [3]:
def get_label_accuracy(prediction_file, labels):
    """Compute per-label accuracy from a prediction file.

    Each non-blank line of ``prediction_file`` is tab-separated. The first
    field is the gold label. The second field is a comma-separated list of
    predictions, of which only the first is scored; the predicted label is
    everything after the first ``'-'`` in that entry (the part before it is
    presumably a cluster id -- verify against the DirectProbe output format).

    Args:
        prediction_file: Path of the prediction file to read.
        labels: Sequence of label strings to report on, in output order.

    Returns:
        A list of ``(label, accuracy)`` tuples in the order of ``labels``.
        Accuracy is rounded to two decimals. A label that never occurs as a
        gold label in the file gets ``float('nan')`` instead of raising
        ``ZeroDivisionError``.

    Raises:
        ValueError: If a gold label is not in ``labels`` or the first
            prediction contains no ``'-'``.
        IndexError: If a non-blank line has no tab-separated second field.
    """
    # Map each label to its first position once, instead of scanning the
    # list for every line of the file.
    label_to_index = {}
    for position, name in enumerate(labels):
        label_to_index.setdefault(name, position)

    correct = [0] * len(labels)
    total = [0] * len(labels)

    with open(prediction_file) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line.strip():
                # Tolerate blank / trailing empty lines.
                continue
            fields = line.split('\t')
            label, pred = fields[0], fields[1].split(',')[0]
            # Split on the FIRST hyphen only so that negative labels such as
            # '-1' (entry of the form '3--1') are kept intact.
            predicted_label = pred[pred.index('-') + 1:]

            if label not in label_to_index:
                raise ValueError(f'{label!r} is not in labels')
            index_of_label = label_to_index[label]
            total[index_of_label] += 1
            if label == predicted_label:
                correct[index_of_label] += 1

    accuracy = [
        round(cor / tot, 2) if tot else float('nan')
        for cor, tot in zip(correct, total)
    ]
    return list(zip(labels, accuracy))

In [5]:
# Root directory of the DirectProbe result files read below
# (<res_dir>/<task>/<model>/<layer>/prediction.txt).
res_dir = '../DirectProbe/results/'

# Layers to report for every model.
model_and_layers = {
    'plbart': [3, 6],
    'codebert': [5, 9, 12],
    'graphcodebert': [5, 9, 12],
    'unixcoder': [5, 9, 12],
    'codet5': [5, 9, 12],
    'codet5p_220': [5, 12],
    'codet5_musu': [5, 12],
    'codet5p_220_bi': [5, 12],
    'coderl': [12, 24],
    'codet5_lntp': [12, 24],
    'codet5p_770': [12, 24],
    'codegen': [8, 16],
    'codet5p_2b': [10, 20],
    'codet5p_2b_dec': [16, 32],
}
labels = ['2.0', '3.0', '4.0', '5.0', '6.0']

# One entry per report: (heading, task sub-directory, layer overrides that
# are written into model_and_layers before the report runs).
passes = (
    ('For distance combined', 'distance', {}),
    ('For distance identifiers', 'distance_id', {'codet5p_2b_dec': [32]}),
)

for pass_number, (heading, task, layer_overrides) in enumerate(passes):
    if pass_number > 0:
        # Separator between consecutive reports (none after the last one).
        print('=============================================================')
    print(heading)
    model_and_layers.update(layer_overrides)
    for model in model_and_layers:
        layers = model_and_layers[model]
        print(f'{model} : ')
        for layer in layers:
            prediction_file = os.path.join(
                res_dir, task, model, str(layer), 'prediction.txt'
            )
            accuracy = get_label_accuracy(prediction_file, labels)
            print(f'{layer} : {accuracy}')
        print('-------------------------------------------')

For distance combined
plbart : 
3 : [('2.0', 0.79), ('3.0', 0.77), ('4.0', 0.62), ('5.0', 0.7), ('6.0', 0.57)]
6 : [('2.0', 0.83), ('3.0', 0.83), ('4.0', 0.77), ('5.0', 0.7), ('6.0', 0.6)]
-------------------------------------------
codebert : 
5 : [('2.0', 0.87), ('3.0', 0.85), ('4.0', 0.74), ('5.0', 0.72), ('6.0', 0.62)]
9 : [('2.0', 0.89), ('3.0', 0.81), ('4.0', 0.72), ('5.0', 0.72), ('6.0', 0.61)]
12 : [('2.0', 0.85), ('3.0', 0.75), ('4.0', 0.73), ('5.0', 0.68), ('6.0', 0.55)]
-------------------------------------------
graphcodebert : 
5 : [('2.0', 0.88), ('3.0', 0.84), ('4.0', 0.75), ('5.0', 0.7), ('6.0', 0.63)]
9 : [('2.0', 0.83), ('3.0', 0.81), ('4.0', 0.69), ('5.0', 0.68), ('6.0', 0.62)]
12 : [('2.0', 0.84), ('3.0', 0.78), ('4.0', 0.67), ('5.0', 0.67), ('6.0', 0.57)]
-------------------------------------------
unixcoder : 
5 : [('2.0', 0.86), ('3.0', 0.82), ('4.0', 0.72), ('5.0', 0.71), ('6.0', 0.66)]
9 : [('2.0', 0.77), ('3.0', 0.77), ('4.0', 0.69), ('5.0', 0.63), ('6.0', 0.6

In [7]:
# Root directory of the DirectProbe result files read below
# (<res_dir>/<task>/<model>/<layer>/prediction.txt).
res_dir = '../DirectProbe/results/'

# Layers to report for every model.
model_and_layers = {
    'plbart': [3, 6],
    'codebert': [5, 9, 12],
    'graphcodebert': [5, 9, 12],
    'unixcoder': [5, 9, 12],
    'codet5': [5, 9, 12],
    'codet5p_220': [5, 12],
    'codet5_musu': [5, 12],
    'codet5p_220_bi': [5, 12],
    'coderl': [12, 24],
    'codet5_lntp': [12, 24],
    'codet5p_770': [12, 24],
    'codegen': [8, 16],
    'codet5p_2b': [10, 20],
    'codet5p_2b_dec': [16, 32],
}
labels = ['0', '1']

# One entry per report: (heading, task sub-directory).
passes = (
    ('For siblings combined', 'siblings'),
    ('For siblings identifiers', 'siblings_id'),
)

for pass_number, (heading, task) in enumerate(passes):
    if pass_number > 0:
        # Separator between consecutive reports (none after the last one).
        print('=============================================================')
    print(heading)
    for model in model_and_layers:
        layers = model_and_layers[model]
        print(f'{model} : ')
        for layer in layers:
            prediction_file = os.path.join(
                res_dir, task, model, str(layer), 'prediction.txt'
            )
            accuracy = get_label_accuracy(prediction_file, labels)
            print(f'{layer} : {accuracy}')
        print('-------------------------------------------')

For siblings combined
plbart : 
3 : [('0', 0.83), ('1', 0.86)]
6 : [('0', 0.88), ('1', 0.88)]
-------------------------------------------
codebert : 
5 : [('0', 0.87), ('1', 0.94)]
9 : [('0', 0.87), ('1', 0.93)]
12 : [('0', 0.87), ('1', 0.88)]
-------------------------------------------
graphcodebert : 
5 : [('0', 0.87), ('1', 0.91)]
9 : [('0', 0.84), ('1', 0.92)]
12 : [('0', 0.76), ('1', 0.87)]
-------------------------------------------
unixcoder : 
5 : [('0', 0.86), ('1', 0.91)]
9 : [('0', 0.8), ('1', 0.88)]
12 : [('0', 0.61), ('1', 0.64)]
-------------------------------------------
codet5 : 
5 : [('0', 0.84), ('1', 0.85)]
9 : [('0', 0.86), ('1', 0.89)]
12 : [('0', 0.82), ('1', 0.91)]
-------------------------------------------
codet5p_220 : 
5 : [('0', 0.91), ('1', 0.89)]
12 : [('0', 0.78), ('1', 0.94)]
-------------------------------------------
codet5_musu : 
5 : [('0', 0.87), ('1', 0.83)]
12 : [('0', 0.8), ('1', 0.87)]
-------------------------------------------
codet5p_220_bi :

In [9]:
# Root directory of the DirectProbe result files read below
# (<res_dir>/<task>/<model>/<layer>/prediction.txt).
res_dir = '../DirectProbe/results/'

# Layers to report for every model.
model_and_layers = {
    'plbart': [3, 6],
    'codebert': [5, 9, 12],
    'graphcodebert': [5, 9, 12],
    'unixcoder': [5, 9, 12],
    'codet5': [5, 9, 12],
    'codet5p_220': [5, 12],
    'codet5_musu': [5, 12],
    'codet5p_220_bi': [5, 12],
    'coderl': [12, 24],
    'codet5_lntp': [12, 24],
    'codet5p_770': [12, 24],
    'codegen': [8, 16],
    'codet5p_2b': [10, 20],
    'codet5p_2b_dec': [16, 32],
}
labels = ['-1', '0', '1']

print('for dataflow')
task = 'dfg'
# Every prediction file of this report lives under the same task directory.
task_dir = os.path.join(res_dir, task)
for model in model_and_layers:
    layers = model_and_layers[model]
    print(f'{model} : ')
    for layer in layers:
        prediction_file = os.path.join(task_dir, model, str(layer), 'prediction.txt')
        accuracy = get_label_accuracy(prediction_file, labels)
        print(f'{layer} : {accuracy}')
    print('-------------------------------------------')

for dataflow
plbart : 
3 : [('-1', 0.83), ('0', 0.68), ('1', 0.9)]
6 : [('-1', 0.94), ('0', 0.62), ('1', 0.91)]
-------------------------------------------
codebert : 
5 : [('-1', 0.94), ('0', 0.7), ('1', 0.95)]
9 : [('-1', 0.94), ('0', 0.7), ('1', 0.95)]
12 : [('-1', 0.9), ('0', 0.69), ('1', 0.91)]
-------------------------------------------
graphcodebert : 
5 : [('-1', 0.94), ('0', 0.68), ('1', 0.94)]
9 : [('-1', 0.95), ('0', 0.73), ('1', 0.95)]
12 : [('-1', 0.93), ('0', 0.71), ('1', 0.94)]
-------------------------------------------
unixcoder : 
5 : [('-1', 0.91), ('0', 0.66), ('1', 0.93)]
9 : [('-1', 0.88), ('0', 0.64), ('1', 0.9)]
12 : [('-1', 0.79), ('0', 0.54), ('1', 0.72)]
-------------------------------------------
codet5 : 
5 : [('-1', 0.81), ('0', 0.69), ('1', 0.92)]
9 : [('-1', 0.91), ('0', 0.63), ('1', 0.9)]
12 : [('-1', 0.9), ('0', 0.57), ('1', 0.86)]
-------------------------------------------
codet5p_220 : 
5 : [('-1', 0.86), ('0', 0.75), ('1', 0.89)]
12 : [('-1', 0.88)

### DirectProbe Cluster stats - Appendix

In [10]:
# For every (model, layer) listed below, print one row per cluster found in
# the DirectProbe output of the distance task: cluster id, label and size.
results_dir = '../DirectProbe/results/distance'
train_dir = '../DirectProbe/data/distance'
model_and_layers = {
    'plbart': [6],
    'codebert': [12],
    'graphcodebert': [12],
    'unixcoder': [12],
    'codet5': [12],
    'codet5p_220': [12],
    'codet5_musu': [12],
    'codet5p_220_bi': [12],
    'coderl': [24],
    'codet5_lntp': [24],
    'codet5p_770': [24],
    'codegen': [16],
    'codet5p_2b_dec': [32]
}
for model, layers in model_and_layers.items():
    for layer in layers:
        res_dir = os.path.join(results_dir, model, str(layer))
        clusters = os.path.join(res_dir, 'clusters.txt')
        dis = os.path.join(res_dir, 'dis.txt')
        # The components must not start with '/': os.path.join drops every
        # earlier component when it meets an absolute one, which previously
        # turned this path into '/entities/train.txt'. (Not read in this cell.)
        train_data = os.path.join(train_dir, model, 'entities', 'train.txt')

        # clusters.txt holds one integer cluster id per line; the size of a
        # cluster is the number of lines carrying its id.
        cluster_size = {}
        cluster_list = []
        with open(clusters) as cluster:
            for line in cluster:
                cluster_num = int(line.rstrip('\n'))
                cluster_list.append(cluster_num)
                cluster_size[cluster_num] = cluster_size.get(cluster_num, 0) + 1

        # dis.txt lines start with "(<id>-<label>,<id>-<label>)" followed by
        # ':'; only that leading pair is used here. Clusters that never show
        # up in dis.txt keep the placeholder label -1.
        cluster_labels = [-1 for _ in cluster_size]
        with open(dis) as cluster_dist:
            for line in cluster_dist:
                lab_1, lab_2 = line.rstrip('\n').split(':')[0].lstrip('(').rstrip(')').split(',')
                for entry in (lab_1, lab_2):
                    # Split on the first '-' only, so the id may have any
                    # number of digits and the label may itself be negative.
                    clus, lab = entry.strip().split('-', 1)
                    clus, lab = int(clus), float(lab)
                    cluster_labels[clus] = lab

        print(f'{model}  : Layer {layer}')
        print('Cluster  Label  Size')
        for i, cluster_label in enumerate(cluster_labels):
            print(f'{i}         {cluster_label}    {cluster_size[i]}')

        print('____________________________________________________')

plbart  : Layer 6
Cluster  Label  Size
0         2.0    105
1         2.0    942
2         3.0    614
3         3.0    417
4         6.0    227
5         5.0    183
6         6.0    813
7         4.0    1042
8         5.0    857
____________________________________________________
codebert  : Layer 12
Cluster  Label  Size
0         3.0    178
1         2.0    806
2         3.0    453
3         5.0    225
4         2.0    241
5         3.0    400
6         6.0    683
7         6.0    357
8         4.0    1042
9         5.0    815
____________________________________________________
graphcodebert  : Layer 12
Cluster  Label  Size
0         2.0    48
1         3.0    386
2         5.0    94
3         3.0    645
4         2.0    999
5         6.0    921
6         5.0    946
7         6.0    119
8         4.0    1042
____________________________________________________
unixcoder  : Layer 12
Cluster  Label  Size
0         3.0    334
1         4.0    377
2         6.0    225
3         4.0    3

In [11]:
# For every (model, layer) listed below, print one row per cluster found in
# the DirectProbe output of the siblings task: cluster id, label and size.
results_dir = '../DirectProbe/results/siblings'
train_dir = '../DirectProbe/data/siblings'
model_and_layers = {
    'plbart': [6],
    'codebert': [12],
    'graphcodebert': [12],
    'unixcoder': [12],
    'codet5': [12],
    'codet5p_220': [12],
    'codet5_musu': [12],
    'codet5p_220_bi': [12],
    'coderl': [24],
    'codet5_lntp': [24],
    'codet5p_770': [24],
    'codegen': [16],
    'codet5p_2b_dec': [32]
}
for model, layers in model_and_layers.items():
    for layer in layers:
        res_dir = os.path.join(results_dir, model, str(layer))
        clusters = os.path.join(res_dir, 'clusters.txt')
        dis = os.path.join(res_dir, 'dis.txt')
        # The components must not start with '/': os.path.join drops every
        # earlier component when it meets an absolute one, which previously
        # turned this path into '/entities/train.txt'. (Not read in this cell.)
        train_data = os.path.join(train_dir, model, 'entities', 'train.txt')

        # clusters.txt holds one integer cluster id per line; the size of a
        # cluster is the number of lines carrying its id.
        cluster_size = {}
        cluster_list = []
        with open(clusters) as cluster:
            for line in cluster:
                cluster_num = int(line.rstrip('\n'))
                cluster_list.append(cluster_num)
                cluster_size[cluster_num] = cluster_size.get(cluster_num, 0) + 1

        # dis.txt lines start with "(<id>-<label>,<id>-<label>)" followed by
        # ':'; only that leading pair is used here. Clusters that never show
        # up in dis.txt keep the placeholder label -1.
        cluster_labels = [-1 for _ in cluster_size]
        with open(dis) as cluster_dist:
            for line in cluster_dist:
                lab_1, lab_2 = line.rstrip('\n').split(':')[0].lstrip('(').rstrip(')').split(',')
                for entry in (lab_1, lab_2):
                    # Split on the first '-' only, so the id may have any
                    # number of digits and the label may itself be negative.
                    clus, lab = entry.strip().split('-', 1)
                    clus, lab = int(clus), float(lab)
                    cluster_labels[clus] = lab

        print(f'{model}  : Layer {layer}')
        print('Cluster  Label  Size')
        for i, cluster_label in enumerate(cluster_labels):
            print(f'{i}         {cluster_label}    {cluster_size[i]}')

        print('____________________________________________________')

plbart  : Layer 6
Cluster  Label  Size
0         1.0    610
1         1.0    126
2         0.0    33
3         1.0    454
4         0.0    1177
____________________________________________________
codebert  : Layer 12
Cluster  Label  Size
0         1.0    411
1         1.0    779
2         0.0    1210
____________________________________________________
graphcodebert  : Layer 12
Cluster  Label  Size
0         1.0    1
1         0.0    53
2         0.0    1157
3         1.0    1189
____________________________________________________
unixcoder  : Layer 12
Cluster  Label  Size
0         0.0    2
1         1.0    1153
2         0.0    1208
3         1.0    37
____________________________________________________
codet5  : Layer 12
Cluster  Label  Size
0         1.0    664
1         0.0    458
2         0.0    135
3         1.0    157
4         1.0    365
5         1.0    4
6         0.0    617
____________________________________________________
codet5p_220  : Layer 12
Cluster  Label  Size

In [12]:
# For every (model, layer) listed below, print one row per cluster found in
# the DirectProbe output of the dfg task: cluster id, label and size.
results_dir = '../DirectProbe/results/dfg/'
train_dir = '../DirectProbe/data/dfg/'
model_and_layers = {
    'plbart': [6],
    'codebert': [12],
    'graphcodebert': [12],
    'unixcoder': [12],
    'codet5': [12],
    'codet5p_220': [12],
    'codet5_musu': [12],
    'codet5p_220_bi': [12],
    'coderl': [24],
    'codet5_lntp': [24],
    'codet5p_770': [24],
    'codegen': [16],
    'codet5p_2b_dec': [32]
}
for model, layers in model_and_layers.items():
    for layer in layers:
        res_dir = os.path.join(results_dir, model, str(layer))
        clusters = os.path.join(res_dir, 'clusters.txt')
        dis = os.path.join(res_dir, 'dis.txt')
        # The components must not start with '/': os.path.join drops every
        # earlier component when it meets an absolute one, which previously
        # turned this path into '/entities/train.txt'. (Not read in this cell.)
        train_data = os.path.join(train_dir, model, 'entities', 'train.txt')

        # clusters.txt holds one integer cluster id per line; the size of a
        # cluster is the number of lines carrying its id.
        cluster_size = {}
        cluster_list = []
        with open(clusters) as cluster:
            for line in cluster:
                cluster_num = int(line.rstrip('\n'))
                cluster_list.append(cluster_num)
                cluster_size[cluster_num] = cluster_size.get(cluster_num, 0) + 1

        # dis.txt lines start with "(<id>-<label>,<id>-<label>)" followed by
        # ':'; only that leading pair is used here. Clusters that never show
        # up in dis.txt keep the placeholder label -1.
        cluster_labels = [-1 for _ in cluster_size]
        with open(dis) as cluster_dist:
            for line in cluster_dist:
                lab_1, lab_2 = line.rstrip('\n').split(':')[0].lstrip('(').rstrip(')').split(',')
                for entry in (lab_1, lab_2):
                    # Fixed-position slicing only worked for single-digit
                    # cluster ids. Splitting on the first '-' handles ids of
                    # any width and keeps negative labels ('1--1' -> '-1').
                    clus, lab = entry.strip().split('-', 1)
                    clus, lab = int(clus), int(lab)
                    cluster_labels[clus] = lab

        print(f'{model}  : Layer {layer}')
        print('Cluster  Label  Size')
        for i, cluster_label in enumerate(cluster_labels):
            print(f'{i}         {cluster_label}    {cluster_size[i]}')

        print('____________________________________________________')

plbart  : Layer 6
Cluster  Label  Size
0         0    1
1         -1    1185
2         0    1208
3         1    1206
____________________________________________________
codebert  : Layer 12
Cluster  Label  Size
0         0    1
1         0    1208
2         1    1206
3         -1    1185
____________________________________________________
graphcodebert  : Layer 12
Cluster  Label  Size
0         -1    1
1         0    1
2         -1    1008
3         0    549
4         -1    176
5         0    659
6         1    1206
____________________________________________________
unixcoder  : Layer 12
Cluster  Label  Size
0         0    1
1         -1    1185
2         0    1208
3         1    1206
____________________________________________________
codet5  : Layer 12
Cluster  Label  Size
0         0    1
1         -1    1185
2         0    1208
3         1    1206
____________________________________________________
codet5p_220  : Layer 12
Cluster  Label  Size
0         0    1
1         -1    