## Script purpose

- Load the dataset (using cfg_pkl, our dataset)
- Split it into projects (ffmpeg and qemu in our case)
- Tokenize the source code using their tokenizer
- Generate word2vec representations using their trained model and CFG node-level information
- Convert the data into torch_geometric format
- Train their model on the data

## Important things
- Their source code is using AST for classification
- The word2vec model is different because we are using CFG-based node information rather than the AST
- I am using their defined hyperparameters for word2vec and their model
- Their adjacency-encoded graph representation is somewhat odd. First of all, it is disconnected: they have multiple isolated nodes. In addition, every value in their adjacency matrices is 2 instead of 1 (ignoring multi-edge graphs).
- There is an error when converting graphs to their custom representation: I was not able to convert some of their instances from torch_geometric to a simple adjacency matrix.
- Slightly irrelevant: there are some issues in torch_geometric as well. It saves the edge list and edge weight (sum of edges in our case) in another attribute called edge_attr, which can't be used during calculations.

## Defining functions

In [1]:
import configs, gc
from tabulate import tabulate
from halo import HaloNotebook as Halo
import pandas as pd
import numpy as np
import networkx as nx
import src.data as data_util
import src.process as process
from src.utils.functions.parse import tokenizer
import torch
from gensim.models.word2vec import Word2Vec
from torch_geometric.data import Data
from torch_geometric.utils import convert
from gensim.models.keyedvectors import Word2VecKeyedVectors

class NodesEmbedding:
    """Embed the nodes of a graph into a fixed-size, zero-padded matrix.

    For every node, the values of its attribute dict are joined into one
    string, tokenized with the project ``tokenizer`` and embedded as the mean
    of the word2vec vectors of its tokens. The per-node vectors fill the first
    rows of a ``(nodes_dim, kv_size)`` tensor; the remaining rows stay zero.

    Args:
        nodes_dim: Number of rows of the padded output; must be >= 0.
        w2v_keyed_vectors: Keyed vectors providing ``vector_size``,
            ``key_to_index`` and lookup of a vector by token.
    """

    # The annotation is quoted so that defining the class does not require the
    # gensim type to be resolvable at definition time.
    def __init__(self, nodes_dim: int, w2v_keyed_vectors: "Word2VecKeyedVectors"):
        self.w2v_keyed_vectors = w2v_keyed_vectors
        self.kv_size = w2v_keyed_vectors.vector_size
        self.nodes_dim = nodes_dim

        assert self.nodes_dim >= 0

        # Most recently produced padded embedding (all zeros until first call).
        self.target = torch.zeros(self.nodes_dim, self.kv_size).float()

    def __call__(self, nodes):
        """Return the padded ``(nodes_dim, kv_size)`` float tensor for a graph.

        A new buffer is allocated on every call, so a tensor returned earlier
        is never overwritten and no rows from a previous, larger graph leak
        into the padding. A graph with more than ``nodes_dim`` nodes does not
        fit the buffer and makes the row assignment fail.
        """
        embedded_nodes = self.embed_nodes(nodes)
        nodes_tensor = torch.from_numpy(embedded_nodes).float()

        self.target = torch.zeros(self.nodes_dim, self.kv_size).float()
        self.target[:nodes_tensor.size(0), :] = nodes_tensor

        return self.target

    def embed_nodes(self, G):
        """Return one embedding row per node of ``G`` as a 2-D numpy array.

        ``G`` must provide ``nodes(data=True)`` yielding ``(node, attrs)``
        pairs whose attribute values are strings. Nodes whose code tokenizes
        to nothing are reported on stdout and embedded as a zero vector.
        """
        embeddings = []

        for _, attrs in G.nodes(data=True):
            # The node's code is the concatenation of all attribute values.
            tokenized_code = tokenizer("".join(attrs.values()))
            if not tokenized_code:
                msg = f"Empty TOKENIZED from node CODE {attrs}"
                print(msg)
            embeddings.append(self._embed_tokens(tokenized_code))

        if not embeddings:
            # Keep the result 2-D so it can still be copied into the buffer.
            return np.zeros((0, self.kv_size))
        return np.array(embeddings)

    def _embed_tokens(self, tokenized_code):
        """Average the token vectors; an empty token list gives a zero vector."""
        if not tokenized_code:
            # np.mean over an empty array would yield a scalar NaN and make
            # the stacked embeddings ragged.
            return np.zeros(self.kv_size)
        return np.mean(np.array(self.get_vectors(tokenized_code)), 0)

    def get_vectors(self, tokenized_code):
        """Return the vector of every token; unknown tokens map to zeros."""
        known_tokens = self.w2v_keyed_vectors.key_to_index
        return [
            self.w2v_keyed_vectors[token] if token in known_tokens
            else np.zeros(self.kv_size)
            for token in tokenized_code
        ]



def nodes_to_input(G, target, nodes_dim, keyed_vectors):
    """Bundle a graph and its label into a torch_geometric ``Data`` object.

    Args:
        G: networkx graph whose nodes are embedded and whose adjacency matrix
            supplies the edges.
        target: Label of the sample; stored as a one-element float tensor.
        nodes_dim: Row count of the padded node-feature matrix.
        keyed_vectors: word2vec keyed vectors used to embed node tokens.

    Returns:
        ``Data`` with ``x`` (padded node embeddings), ``edge_index`` and
        ``edge_attr`` (both derived from the adjacency matrix) and ``y``.
    """
    embedder = NodesEmbedding(nodes_dim, keyed_vectors)

    # Sparse adjacency matrix -> (edge_index, edge_attr) pair.
    adjacency = nx.adjacency_matrix(G)
    edge_index, edge_attr = convert.from_scipy_sparse_matrix(adjacency)

    y = torch.tensor([target]).float()
    features = embedder(G)

    return Data(x=features, edge_index=edge_index, edge_attr=edge_attr, y=y)



def process_task(stopping, cpg_dataset):
    """Train the Devign model on ``cpg_dataset`` and run prediction on the test split.

    Relies on the module-level ``PATHS``, ``FILES`` and ``DEVICE`` objects.

    Args:
        stopping: When truthy, train with ``process.EarlyStopping`` and call
            ``model.load()`` afterwards; otherwise train without it and call
            ``model.save()``.
        cpg_dataset: Input dataset handed to
            ``data_util.train_val_test_split``.
    """
    context = configs.Process()
    devign = configs.Devign()
    model_path = PATHS.model + FILES.model
    model = process.Devign(path=model_path, device=DEVICE, model=devign.model,
                           learning_rate=devign.learning_rate,
                           weight_decay=devign.weight_decay,
                           loss_lambda=devign.loss_lambda)
    train = process.Train(model, context.epochs)

    # Split the dataset, then turn every split into a batched loader. The
    # splits themselves are datasets: the per-project loop in this notebook
    # calls ``get_loader`` on each of them before building a LoaderStep, and
    # this function has to do the same.
    splits = data_util.train_val_test_split(cpg_dataset, shuffle=context.shuffle)
    train_loader, val_loader, test_loader = (
        split.get_loader(context.batch_size, shuffle=context.shuffle)
        for split in splits
    )
    train_loader_step = process.LoaderStep("Train", train_loader, DEVICE)
    val_loader_step = process.LoaderStep("Validation", val_loader, DEVICE)
    test_loader_step = process.LoaderStep("Test", test_loader, DEVICE)

    if stopping:
        early_stopping = process.EarlyStopping(model, patience=context.patience)
        train(train_loader_step, val_loader_step, early_stopping)
        # NOTE(review): presumably restores the checkpoint written by early
        # stopping -- confirm against process.EarlyStopping.
        model.load()
    else:
        train(train_loader_step, val_loader_step)
        model.save()

    process.predict(model, test_loader_step)
    




## Change word2vec hyperparameters and max nodes dimension

In [None]:
# Rewrite config.json so the model and embedding settings match the dataset:
# the word2vec vector size and the maximum number of nodes per graph.
import json  # imported here because the notebook's import cell does not provide it

w2v_size = 150
# NOTE(review): `group` is only bound by the per-project loop further down, so
# this cell works only after that loop has run at least once, and then uses
# the last project's graphs -- confirm this is the intended source.
nodes_dim = int(group.apply(lambda g: nx.number_of_nodes(g.graph), axis=1).describe()['max'])

with open('config.json') as f:
    json_config = json.load(f)

# The first Conv1d layer's in_channels and the embedding padding both follow
# the node count; the model embedding size follows the word2vec vector size.
json_config['devign']['model']['conv_args']['conv1d_1']['in_channels'] = nodes_dim
json_config['embed']['nodes_dim'] = nodes_dim
json_config['devign']['model']['emb_size'] = w2v_size
json_config['embed']['word2vec_args']['vector_size'] = w2v_size

with open('config.json', 'w') as f:
    json.dump(json_config, f, indent=4)

In [None]:
# Module-level handles shared by the functions and cells of this notebook.
PATHS = configs.Paths()
FILES = configs.Files()
# Device handle passed to the model and the loader steps.
DEVICE = FILES.get_device()

context = configs.Process()
devign = configs.Devign()
# Model file location: model directory concatenated with the model file name.
model_path = PATHS.model + FILES.model
# Rebinds `context` to the Embed configuration; the Process configuration
# created above is discarded.
context = configs.Embed()

## Reading dataset

In [2]:
# Load the pickled CFG dataset, keep only samples flagged as connected whose
# graph has at least one node, reduce it to the columns used below and expose
# the function source under the name 'func'.
data = (
    pd.read_pickle('../cfg_data.pkl')
    .loc[lambda df: (df['is_connected'] == True)
         & np.array([G.number_of_nodes() > 0 for G in df['graph'].values])]
    [['target', 'project', 'graph', 'func_code']]
    .rename(columns={'func_code': 'func'})
)

## Tokenizing code, training word2vec, changing graph representation to torch_geometric and training devign model

In [None]:
# Per-project pipeline: print graph statistics, tokenize, train word2vec,
# convert every sample to a torch_geometric input, persist the results and
# train the Devign model.

# The embedding configuration gets its own name: `context` is rebound to the
# Process configuration inside the loop, so reading `context.w2v_args` on a
# later iteration would hit the wrong configuration object.
embed_context = configs.Embed()

for name, group in data.groupby('project'):
    # Independent copy, so the "input" column added below is written to this
    # frame only and not into a slice of `data`.
    group = group.copy()

    print('\n'*3,"*"*40,'\n')
    node_size_group = group.apply(lambda g: nx.number_of_nodes(g.graph), axis=1).describe()[['min', 'max','mean','std']]
    print("No of samples in dataset {}: {} ".format(name, len(group)))
    print(tabulate(node_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Node stats for: {}'.format(name)]) ,'\n')

    # Pad every graph of this project to the size of its largest graph.
    nodes_dim = int(node_size_group['max'])

    edge_size_group = group.apply(lambda g: nx.number_of_edges(g.graph), axis=1).describe()[['min', 'max','mean','std']]
    print(tabulate(edge_size_group.to_frame(),
                   tablefmt="grid", stralign='left', numalign='left',
                   headers=['Edge stats for: {}'.format(name)]))

    spinner = Halo(text='Tokenizing source code', spinner='dots')
    spinner.start()
    tokens_dataset = data_util.tokenize(group)
    spinner.stop()
    spinner.clear()

    # A previously saved word2vec model could be loaded here instead of
    # training a new one.
    spinner = Halo(text='Training word2vec on tokens code', spinner='dots')
    spinner.start()  # a spinner shows nothing until it is started
    w2vmodel = Word2Vec(**embed_context.w2v_args)
    w2vmodel.build_vocab(tokens_dataset.tokens)
    w2vmodel.train(tokens_dataset.tokens, total_examples=w2vmodel.corpus_count, epochs=1)
    spinner.stop()

    spinner = Halo(text='Converting tokens to word2vec representation', spinner='dots')
    spinner.start()
    group["input"] = group.apply(
        lambda row: nodes_to_input(row.graph, row.target, nodes_dim, w2vmodel.wv),
        axis=1)
    spinner.stop()
    spinner.clear()

    print(f"Saving input dataset {name} with size {len(group)}.")
    pd.to_pickle(group[["input", "target"]], f'data/input/{name}.pkl')
    gc.collect()
    print("Saving w2vmodel.")
    w2vmodel.save(f"{PATHS.w2v}/{name}+word2vec.model")

    spinner = Halo(text='Training and Testing for {}'.format(name), spinner='dots')
    spinner.start()
    stopping = False
    context = configs.Process()
    devign = configs.Devign()
    model_path = PATHS.model + FILES.model
    model = process.Devign(path=model_path, device=DEVICE, model=devign.model,
                           learning_rate=devign.learning_rate,
                           weight_decay=devign.weight_decay,
                           loss_lambda=devign.loss_lambda)
    train = process.Train(model, context.epochs)
    input_dataset = group[["input", "target"]]
    # Split the dataset and wrap every split in a batched loader.
    train_loader, val_loader, test_loader = list(
        map(lambda x: x.get_loader(context.batch_size, shuffle=context.shuffle),
            data_util.train_val_test_split(input_dataset, shuffle=context.shuffle)))
    train_loader_step = process.LoaderStep("Train", train_loader, DEVICE)
    val_loader_step = process.LoaderStep("Validation", val_loader, DEVICE)
    # NOTE(review): the test split is created but never evaluated here.
    train(train_loader_step, val_loader_step)
    print('Finish Training for {}'.format(name))
    spinner.stop()
    spinner.clear()




 **************************************** 

No of samples in dataset FFmpeg: 9576 
+------+--------------------------+
|      | Node stats for: FFmpeg   |
| min  | 2                        |
+------+--------------------------+
| max  | 2659                     |
+------+--------------------------+
| mean | 146.661                  |
+------+--------------------------+
| std  | 209.337                  |
+------+--------------------------+ 

+------+--------------------------+
|      | Edge stats for: FFmpeg   |
| min  | 1                        |
+------+--------------------------+
| max  | 2864                     |
+------+--------------------------+
| mean | 162.551                  |
+------+--------------------------+
| std  | 234.159                  |
+------+--------------------------+


Output()

Saving input dataset FFmpeg with size 9576.
Saving w2vmodel.
new fc1 1500 new fc2 size 1000
The model has 1,098,872 trainable parameters
Splitting Dataset

Epoch 1; - Train Loss: 0.2138; Acc: 0.1778; - Validation Loss: 4.4038; Acc: 0.13; - Time: 978.7943940162659

Epoch 2; - Train Loss: 0.2605; Acc: 0.1565; - Validation Loss: 3.8768; Acc: 0.13; - Time: 1942.0359108448029

Epoch 3; - Train Loss: 0.2813; Acc: 0.179; - Validation Loss: 3.553; Acc: 0.13; - Time: 2905.7445142269135

Epoch 4; - Train Loss: 0.2762; Acc: 0.1778; - Validation Loss: 3.5249; Acc: 0.13; - Time: 3869.289839744568

Epoch 5; - Train Loss: 0.3602; Acc: 0.1885; - Validation Loss: 3.3769; Acc: 0.1379; - Time: 4835.737769842148

Epoch 6; - Train Loss: 0.4419; Acc: 0.1636; - Validation Loss: 3.4929; Acc: 0.1379; - Time: 5799.603452682495

Epoch 7; - Train Loss: 0.3888; Acc: 0.1802; - Validation Loss: 3.2758; Acc: 0.13; - Time: 6767.889722108841

Epoch 8; - Train Loss: 0.3936; Acc: 0.1923; - Validation Loss: 3.5199; Acc: 0