# Experiment on CFGs of new dataset by utilizing 500 nodes to run it on devign

In [1]:
import pandas as pd
from tabulate import tabulate
import json, configs, gc, torch
import src.process as process
import src.data as data_util
import numpy as np

In [2]:
def pad_x(x, nodes_dim, w2v_size):
    new_x = torch.zeros(nodes_dim, w2v_size).float()
    new_x[:x.x.size(0), :] = x.x
    x.x = new_x
    return x

In [3]:
data = pd.read_pickle('new_data.pkl')

In [4]:
w2v_size  = 200
#nodes_dim = int(group.apply(lambda g: nx.number_of_nodes(g.graph),axis=1).describe()['max'])
nodes_dim = 500
idx = [True if row['input'].x.shape[0]<=nodes_dim else False for index, row in data.iterrows()]
data = data[idx]

print("No of samples in dataset: {} ".format(len(data)))
print('\n'*3,"*"*40,'\n')

node_size_group = data.apply(lambda g: g.input.num_nodes,axis=1).describe()[['min', 'max','mean','std']]

print(tabulate(node_size_group.to_frame(),
               tablefmt="grid", stralign='left', numalign='left',
               headers=['Node stats']))


print('\n'*3)

edge_size_group = data.apply(lambda g: g.input.num_edges ,axis=1).describe()[['min', 'max','mean','std']]
print(tabulate(edge_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Edge stats']))

print('\n\n')
print("1 = Vulnerable, 0 = Not Vulnerable")
print('\n')
for name, group in data.groupby('target'):
    
    node_size_group = group.apply(lambda g: g.input.num_nodes, axis=1).describe()[['min', 'max','mean','std']]
    edge_size_group = group.apply(lambda g: g.input.num_edges ,axis=1).describe()[['min', 'max','mean','std']]
    
    print(tabulate(node_size_group.to_frame(),
               tablefmt="grid", stralign='left', numalign='left',
               headers=['Class {} Node stats'.format(name)]))
    
    print('\n'*3)
    print(tabulate(edge_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Class {} Edge stats'.format(name)]))
    print('\n'*3)
    

No of samples in dataset: 12723 



 **************************************** 

+------+--------------+
|      | Node stats   |
| min  | 2            |
+------+--------------+
| max  | 500          |
+------+--------------+
| mean | 89.2446      |
+------+--------------+
| std  | 98.0444      |
+------+--------------+




+------+--------------+
|      | Edge stats   |
| min  | 1            |
+------+--------------+
| max  | 737          |
+------+--------------+
| mean | 99.3421      |
+------+--------------+
| std  | 111.119      |
+------+--------------+



1 = Vulnerable, 0 = Not Vulnerable


+------+----------------------+
|      | Class 0 Node stats   |
| min  | 2                    |
+------+----------------------+
| max  | 500                  |
+------+----------------------+
| mean | 92.2276              |
+------+----------------------+
| std  | 99.0916              |
+------+----------------------+




+------+----------------------+
|      | Class 0 Edge stats   |
| min  |

In [5]:
data['input'] = data['input'].apply(lambda x: pad_x(x, nodes_dim, w2v_size) )

In [6]:
with open('configs.json') as f:
    json_config = json.load(f)
    
json_config['devign']['model']['conv_args']['conv1d_1']['in_channels'] = nodes_dim
json_config['embed']['nodes_dim']  = nodes_dim
json_config['devign']['model']['emb_size']  = w2v_size
json_config['embed']['word2vec_args']['size']  = w2v_size

with open('configs.json', 'w') as f:
    json.dump(json_config,f, indent=4)
    
PATHS = configs.Paths()
FILES = configs.Files()
DEVICE = FILES.get_device()

context = configs.Process()
devign = configs.Devign()
model_path = PATHS.model + FILES.model

In [None]:
model_path = PATHS.model + FILES.model
model = process.Devign(path=model_path, device=DEVICE, model=devign.model, learning_rate=devign.learning_rate,
                       weight_decay=devign.weight_decay,
                       loss_lambda=devign.loss_lambda)
train = process.Train(model, context.epochs)
input_dataset = data[["input", "target"]]
# split the dataset and pass to DataLoader with batch size
train_loader, val_loader, test_loader = list(
    map(lambda x: x.get_loader(context.batch_size, shuffle=context.shuffle),
        data_util.train_val_test_split(input_dataset, shuffle=context.shuffle)))
train_loader_step = process.LoaderStep("Train", train_loader, DEVICE)
val_loader_step = process.LoaderStep("Validation", val_loader, DEVICE)
train(train_loader_step, val_loader_step)
print('Finish Training for {}'.format(name))
spinner.stop()
spinner.clear()

new fc1 2000 new fc2 size 1000
The model has 560,272 trainable parameters
Splitting Dataset





Epoch 1; - Train Loss: 0.3661; Acc: 0.1813; - Validation Loss: 3.8542; Acc: 0.1497; - Time: 529.6525275707245

Epoch 2; - Train Loss: 0.452; Acc: 0.1724; - Validation Loss: 2.9757; Acc: 0.1557; - Time: 773.9536793231964

Epoch 3; - Train Loss: 0.4939; Acc: 0.1741; - Validation Loss: 9.9673; Acc: 0.1796; - Time: 1018.9731566905975

Epoch 4; - Train Loss: 0.4721; Acc: 0.1975; - Validation Loss: 18.1449; Acc: 0.2754; - Time: 1265.5171356201172

Epoch 5; - Train Loss: 0.6153; Acc: 0.1679; - Validation Loss: 2.9816; Acc: 0.1497; - Time: 1511.0969188213348

Epoch 6; - Train Loss: 0.4408; Acc: 0.1885; - Validation Loss: 15.5872; Acc: 0.2575; - Time: 1756.733506679535

Epoch 7; - Train Loss: 0.5048; Acc: 0.1616; - Validation Loss: 2.9873; Acc: 0.1437; - Time: 2002.692902803421

Epoch 8; - Train Loss: 0.4175; Acc: 0.2092; - Validation Loss: 41.5218; Acc: 0.4618; - Time: 2301.779297351837

Epoch 9; - Train Loss: 2.5825; Acc: 0.1921; - Validation Loss: 2.986; Acc: 0.1557; - Time: 2853.7011320590

In [9]:
data.x

AttributeError: 'DataFrame' object has no attribute 'x'

In [None]:
[row['input'].x for index, row in data.iterrows()]

In [11]:
2853/60

47.55

In [23]:
sum(idx)

12723

In [25]:
12723/13503

0.9422350588758054

In [26]:
12723-13503

-780

## Code for GIN

In [None]:
class NodesEmbedding:
    def __init__(self, nodes_dim: int, w2v_keyed_vectors: Word2VecKeyedVectors):
        self.w2v_keyed_vectors = w2v_keyed_vectors
        self.kv_size = w2v_keyed_vectors.vector_size
        self.nodes_dim = nodes_dim

        assert self.nodes_dim >= 0

        # Buffer for embeddings with padding
        self.target = torch.zeros(self.nodes_dim, self.kv_size).float()

    def __call__(self, nodes):
        embedded_nodes = self.embed_nodes(nodes)

        nodes_tensor = torch.from_numpy(embedded_nodes).float()

        self.target[:nodes_tensor.size(0), :] = nodes_tensor

        return self.target

    def embed_nodes(self, G):
        embeddings = []

        for (n,d) in G.nodes(data=True):
            # Get node's code
            node_code = d
            # Tokenize the code
            tokenized_code = tokenizer("".join(d.values()))
            if not tokenized_code:
                # print(f"Dropped node {node}: tokenized code is empty.")
                msg = f"Empty TOKENIZED from node CODE {node_code}"
                print(msg)
            # Get each token's learned embedding vector
            vectorized_code = np.array(self.get_vectors(tokenized_code))
            # The node's source embedding is the average of it's embedded tokens
            source_embedding = np.mean(vectorized_code, 0)
            # The node representation is the concatenation of label and source embeddings
            #embedding = np.concatenate((np.array([node.type]), source_embedding), axis=0)
            embeddings.append(source_embedding)
        # print(node.label, node.properties.properties.get("METHOD_FULL_NAME"))
        
        return np.array(embeddings)

    # fromTokenToVectors
    def get_vectors(self, tokenized_code):
        vectors = []
        for token in tokenized_code:
            if token in self.w2v_keyed_vectors.key_to_index:
                vectors.append(self.w2v_keyed_vectors[token])
            else:
                # print(node.label, token, node.get_code(), tokenized_code)
                vectors.append(np.zeros(self.kv_size))
        return vectors



def nodes_to_input(G, target, nodes_dim, keyed_vectors):
    nodes_embedding = NodesEmbedding(nodes_dim, keyed_vectors)
    edge_index, edge_attr = convert.from_scipy_sparse_matrix(nx.adjacency_matrix(G))
    label = torch.tensor([target]).float()

    return Data(x=nodes_embedding(G), edge_index=edge_index, edge_attr=edge_attr ,y=label) 

In [26]:
print('Writing to file/pandas')
pd.to_pickle(data[['graph','target']], 'new_data_nx')

Writing to file/pandas


In [11]:
print(json.dumps(dict_e, indent=4))
data = pd.DataFrame(pd_list)
data = data[['target', 'project', 'graph','func_code','index']]
data = data.rename(columns={'func_code': 'func'})

{
    "dataset1/positive_cfg total": 10786,
    "dataset1/positive_cfg removed": 3360,
    "dataset1/positive_cfg used": 7426,
    "dataset1/negative_cfg total": 10786,
    "dataset1/negative_cfg removed": 2850,
    "dataset1/negative_cfg used": 7936
}


In [29]:
c = data.graph[0]

In [38]:
for d, n in c.nodes(data=True):
    c[d]['x'] = 0
    

TypeError: 'AdjacencyView' object does not support item assignment

In [10]:
print("No of samples in dataset: {} ".format(len(data)))
print('\n'*3,"*"*40,'\n')

node_size_group = data.apply(lambda g: nx.number_of_nodes(g.graph),axis=1).describe()[['min', 'max','mean','std']]

print(tabulate(node_size_group.to_frame(),
               tablefmt="grid", stralign='left', numalign='left',
               headers=['Node stats']))


print('\n'*3)
print("1 = Vulnerable, 0 = Not Vulnerable")
edge_size_group = data.apply(lambda g: nx.number_of_edges(g.graph),axis=1).describe()[['min', 'max','mean','std']]
print(tabulate(edge_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Edge stats']))

for name in group, data.groupby('target'):
    print(tabulate(node_size_group.to_frame(),
               tablefmt="grid", stralign='left', numalign='left',
               headers=['Class {} Node stats'.format(name)]))
    
    edge_size_group = data.apply(lambda g: nx.number_of_edges(g.graph),axis=1).describe()[['min', 'max','mean','std']]
    print(tabulate(edge_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Class {} Edge stats'.format(name)]))

NameError: name 'data' is not defined

In [7]:
data = pd.read_pickle('new_data.pkl')

In [7]:
data["input"] = data.apply(lambda row: nodes_to_input(row.graph, row.target, nx.number_of_nodes(row.graph),
                                                                                    w2vmodel.wv), axis=1)
print('Writing to file/pandas')
pd.to_pickle(data[['input','target']], 'new_data')

In [6]:
pd.to_pickle(data[['input','target']], 'new_data')

OSError: [Errno 28] No space left on device

In [6]:
data = pd.DataFrame(pd_list)
edge_size_group = data.apply(lambda g: nx.number_of_edges(g.graph),axis=1).describe()[['min', 'max','mean','std']]
print(tabulate(edge_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Edge stats']))

+------+--------------+
|      | Edge stats   |
| min  | 1            |
+------+--------------+
| max  | 11993        |
+------+--------------+
| mean | 163.733      |
+------+--------------+
| std  | 352.179      |
+------+--------------+


In [25]:
data.head()

Unnamed: 0,index,project,func_code,graph,is_connected,dot_string,target
0,1351,positive_cfg,static int vmci_transport_dgram_dequeue(struct...,"(1000117, 1000119, 1000123, 1000124, 1000127, ...",True,"digraph vmci_transport_dgram_dequeue {\n""10001...",1
1,2623,positive_cfg,"static const char *parse_array( cJSON *item, c...","(1000113, 1000114, 1000118, 1000121, 1000123, ...",True,"digraph parse_array {\n""1000113"" [label = ""(<o...",1
2,2903,positive_cfg,"asmlinkage int arm_syscall(int no, struct pt_r...","(1000113, 1000115, 1000118, 1000119, 1000122, ...",True,"digraph arm_syscall {\n""1000113"" [label = ""(<o...",1
3,5323,positive_cfg,"static ssize_t aio_run_iocb(struct kiocb *req,...","(1000111, 1000113, 1000115, 1000124, 1000133, ...",True,"digraph aio_run_iocb {\n""1000111"" [label = ""(<...",1
4,1469,positive_cfg,cib_remote_connection_destroy(gpointer user_d...,"(1000104, 1000108, 1000112, 1000113, 1000115, ...",True,"digraph cib_remote_connection_destroy {\n""1000...",1


In [9]:
data.iloc[8].input

Data(x=[172, 200], edge_index=[2, 216], edge_attr=[216], y=[1])