### Make sure the following packages are installed
- Python
    - pandas
    - numpy
    - tqdm
    - networkx
    - pygraphviz
    - scipy
    - sklearn
    - spektral
    - tensorflow
    - tokenizer_c (tokenizer code)
- System
    - Joern
    - graphviz

 ## 1. Generate CFG using the following method
- Make a directory for each project; in our case there will be two folders, one for FFmpeg and one for qemu, created under the directory defined by the variable path_code
- define your path where joern is installed using variable joern_path
- read the dataset and Extract all instances of source code and group them based on their project
- iterate over each project group and save the source code in a .c file. The name of the file is the index where the source file is saved

### 1.1. Save source code from JSON file to individual .c file

In [1]:
import pathlib, glob, time, re, tokenizer_c, subprocess
import pickle
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import networkx as nx
import pygraphviz
import scipy.sparse as sp
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from spektral.data import Dataset, DisjointLoader, Graph
from spektral.data import DisjointLoader
from spektral.layers import GINConv, GlobalAvgPool

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Dump every function of the JSON dataset into its own folder:
#   code/<project>/<row index>/<row index>.c
dataset_file = 'function.json'
path_code = pathlib.Path('code')                         # root folder that receives all .c, CPG and CFG files
path_code.mkdir(parents=True, exist_ok=True)             # exist_ok: do not fail if the folder is already there
joern_path  = str(Path.home())+"/bin/joern/joern-cli/"   # joern installation folder (used by the export cell)
pd_code  = pd.read_json(dataset_file)                    # read the dataset available as JSON
#pd_code  = pd_code[:100]                                # uncomment to try the pipeline on a small subset
pd_group = pd_code.groupby('project')

# iterate over each group/project
for group_name, df_group in pd_group:
    path_group = path_code / group_name                  # one folder per project
    path_group.mkdir(parents=True, exist_ok=True)

    for row_index, row in df_group.iterrows():
        path_src = path_group / str(row_index)           # one folder per function, named after its row index
        path_src.mkdir(parents=True, exist_ok=True)
        code = row['func']                               # 'func' holds the whole function as a single string
        file_name = str(path_src / (str(row_index) + '.c'))
        # `with` guarantees the handle is closed even if encoding/writing fails
        with open(file_name, 'wb') as f:
            f.write(code.encode())

### 1.2. Extract CFG from the .c files using Joern
- Joern generates the CFG from source code in two steps:
    1. Export the CPG file from the source code
    2. Parse the extracted CPG file in any supported representation: AST, CFG, PDG, CPG, etc.
- We have saved CPGs for the entire dataset and now when extracting any required graph representation we only have to run step 2
- The parallelization doesn't work that great with joern. To speed up the process run Joern on each separate group
- The Entire process takes around 2-3 Hours on sir's machine

In [None]:
# Run Joern on every dumped function: build the CPG if it is missing, then
# export the CFG as DOT files into <sample folder>/cfg/.
start_time = time.time()

project_path = glob.glob(str(path_code)+"/*")   # get all the projects in the dataset

# Joern executables; built once so both branches use exactly the same path
joern_parse  = str(Path(joern_path) / "joern-parse")
joern_export = str(Path(joern_path) / "joern-export")

for i in project_path:
    project = glob.glob(i+'/*')                  # one folder per function generated by the previous cell

    for path_src in tqdm(project):
        sample_name = Path(path_src).name
        src_file = (Path(path_src) / (sample_name + '.c')).resolve()
        # with_suffix only touches the extension; str.replace('.c', '.cpg')
        # would also rewrite any other ".c" inside the absolute path
        # (e.g. ".cache" or ".config").
        cpg_file = src_file.with_suffix('.cpg')
        out_path = str(Path(path_src).resolve())
        cfg_dir  = Path(path_src, 'cfg').resolve()

        has_cpg = cpg_file.is_file()

        # CPG and CFG are both present: nothing to generate
        if has_cpg and cfg_dir.is_dir():
            continue

        # CPG missing: parse the source first.
        # DEVNULL keeps Joern silent; console output slows the run down.
        if not has_cpg:
            subprocess.call([joern_parse, str(src_file), "--out", str(cpg_file)],
                            stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)

        # export the CFG from the (existing or freshly built) CPG
        subprocess.call([joern_export, str(cpg_file), "--repr", "cfg", '--out', out_path+'/cfg/'],
                        stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)

print("--- %s miuntes ---" % ((time.time() - start_time)/60))

## 2. Compiling CFGs into graph object and storing them in Pandas Dataframe

In [None]:
# Collect, for every dumped function, its label, source text, CFG (as a
# NetworkX graph) and raw DOT string, and store everything in cfg_data.pkl.
check_empty  = r'digraph .*\{ *\n*\}' # matches a DOT file whose graph body is empty
pd_code      = pd.read_json(dataset_file)

pd_list = []
for i in project_path:
    path = pathlib.PurePath(i)
    project_name = path.name
    project = glob.glob(i+'/*')

    for path_src in tqdm(project):
        new_dict   = dict()
        src_file   = str(Path(path_src+'/'+pathlib.PurePath(path_src).name+'.c').resolve())
        cfg_folder = str(Path(path_src+'/'+'cfg/').resolve())
        index      = int(pathlib.PurePath(src_file).stem)   # file name without ".c" is the dataset row index
        target     = pd_code.loc[index]["target"]
        with open(src_file, 'r') as f:
            src_code = f.read()
        dot_arr = []
        for file in glob.glob(cfg_folder+"/*"):
            with open(file,'r') as f:
                dot_arr.append(f.read())
        dot_arr = [x for x in dot_arr if not re.search(check_empty, x)] # removes empty graphs
        if (len(dot_arr) == 1):                                         # exactly one non-empty graph was exported
            is_connected = True
            # NOTE(review): the graph is always read from 0-cfg.dot, which is
            # assumed to be the single non-empty file found above - confirm.
            with open(Path(cfg_folder).joinpath("0-cfg.dot")) as f:
                dotFormat = f.read()
            new_str = dotFormat.replace('\\"', '')                        # drop escaped quotes so the DOT parser does not choke
            G = nx.drawing.nx_agraph.from_agraph(pygraphviz.AGraph(new_str)) # convert graph into Networkx object
        else:
            # zero or several non-empty graphs: store an empty graph
            is_connected = False
            G = nx.Graph()
            # Reset the DOT string too; otherwise it is undefined on the first
            # sample or silently carries over the previous sample's graph.
            new_str = ''

        # creating attributes for the dataset to store them in pandas
        new_dict['index']         = index
        new_dict['project']       = project_name
        new_dict['target']        = target
        new_dict['func_code']     = src_code
        new_dict['graph']         = G
        new_dict['is_connected']  = is_connected
        new_dict['dot_string']    = new_str
        pd_list.append(new_dict)

data = pd.DataFrame(pd_list)
data.set_index("index",inplace=True, verify_integrity=True)
data.to_pickle("cfg_data.pkl")

### 2.1. Extracting unique operation keywords in the dataset for node embedding
- I have extracted all the unique C-tokens that are used in the dataset
- I also have extracted all the unique operation words used in the dataset
- As you can see I have hardcoded the unique tokens and operations the reasons are:
    - There are actually 84 unique tokens in the tokenizer dictionary but in our dataset, there are only 64 unique tokens. That's why I discarded the other tokens to reduce the dimension of node embedding. You can get all the available tokens in the tokenizer by running the code tokenizer_c.tokens
    - The reason I have also hardcoded the unique operations is that I didn't want to compute all the operators every time. You can get all the unique operation words by running the following function
- To encode the node features (source code) I have done the following:
    - I have tokenized the whole code available at the node and selected only unique tokens from it
    - I have extracted the operation word at each node and compared it with the operation types I have. A node's source code may be a function call that has no operation type; in that case the operation-type encoding is a zero vector
- I have concatenated the one-hot encodings of both representations
- I have stored the embeddings in a pickle file and I will integrate them into the graphs in the next step

In [None]:
# C token types that are one-hot encoded as node features (64 entries).
tok_arr = [
    'EQUALS', 'LE', 'LNOT', 'RETURN', 'SEMI', 'EQ', 'PLUSPLUS', 'STATIC',
    'TIMES', 'ARROW', 'SIZEOF', 'MINUS', 'LOR', 'LAND', 'GE', 'AND',
    'OR', 'INT', 'COLON', 'PLUS', 'OREQUAL', 'NE', 'MOD', 'LT',
    'CONDOP', 'PERIOD', 'RSHIFT', 'LSHIFT', 'GT', 'NOT', 'ANDEQUAL', 'PLUSEQUAL',
    'MINUSEQUAL', 'LONG', 'UNSIGNED', 'TIMESEQUAL', 'CHAR', 'RSHIFTEQUAL', 'DOUBLE', 'MINUSMINUS',
    'STRUCT', 'LSHIFTEQUAL', 'IF', 'REGISTER', 'FOR', 'DIVEQUAL', 'XOR', 'AUTO',
    'WHILE', 'CONTINUE', 'SHORT', 'XOREQUAL', 'FLOAT', 'DEFAULT', 'ELLIPSIS', 'MODEQUAL',
    'UNION', 'DO', 'ENUM', 'BREAK', 'SIGNED', 'ELSE', 'CASE', 'VOLATILE',
]

# Node operation types that are one-hot encoded as node features (41 entries).
op_arr = [
    'RETURN', 'METHOD', 'METHOD_RETURN',
    '<operator>.indirectFieldAccess', '<operator>.assignment',
    '<operator>.indirectIndexAccess', '<operator>.fieldAccess',
    '<operator>.addressOf', '<operator>.addition',
    '<operator>.lessThan', '<operator>.equals',
    '<operator>.multiplication', '<operator>.and',
    '<operator>.logicalNot', '<operator>.postIncrement',
    '<operator>.subtraction', '<operator>.logicalAnd',
    '<operator>.minus', '<operator>.notEquals',
    '<operator>.indirection', '<operator>.cast',
    '<operator>.arithmeticShiftRight', '<operator>.assignmentPlus',
    '<operator>.logicalOr', '<operator>.sizeOf',
    '<operator>.shiftLeft', '<operator>.greaterThan',
    '<operator>.or', '<operator>.division',
    '<operator>.conditional', '<operator>.greaterEqualsThan',
    '<operator>.lessEqualsThan', '<operator>.assignmentMinus',
    '<operator>.postDecrement', '<operator>.modulo',
    '<operator>.preIncrement', '<operator>.not',
    '<operator>.assignmentMultiplication', '<operator>.assignmentDivision',
    '<operator>.preDecrement', '<operator>.plus',
]

# Build one feature vector per CFG node:
#   [one-hot of the node's operation type | flattened one-hots of its unique C tokens]
# and store them as {sample index: {node id: vector}} in node_feature.pkl.
# NOTE(review): recent scikit-learn releases renamed `sparse` to
# `sparse_output`; adjust the keyword if the installed version rejects it.
token_enc = OneHotEncoder(handle_unknown='ignore', sparse=False) #encoding tokens using one hot
op_enc    = OneHotEncoder(handle_unknown='ignore', sparse=False) #encoding operation types using one hot

token_enc.fit(np.array(tok_arr).reshape(-1,1))
op_enc.fit(np.array(op_arr).reshape(-1,1))

with open('cfg_data.pkl', 'rb') as f:
    data = pickle.load(f)

unique_word = {}
expr = r'^(.+?),'       # everything of the label up to its first comma

#these words are for paranthesis or constants like (3,4). If they occur just ignore them
ignore_words = ['ICONST','CONST', 'ID', 'LBRACKET', 'RBRACKET', 'LPAREN', 'RPAREN', 'LBRACE', 'RBRACE' ]
attr_dict    = {}
max_len      = 64*13    # token part of the vector: room for 13 token one-hots of width 64
for idx, dot_string in tqdm(data['dot_string'].items()):
    attr_dict[idx] = {}
    for line in dot_string.split("\n"):
        if "label = " not in line:
            continue
        split_line  = line.split("label = ")
        c           = split_line[1]
        node_id     = split_line[0].split('"')[1]
        keyword     = re.search(expr,c).group(0)[2:-1] #operation type
        # NOTE(review): only the text between the first and second comma of
        # the label is tokenized, so code containing commas is cut short.
        source_code = c.split(',')[1].split(')" ]')[0]
        tokens      = list(map(str, tokenizer_c.run_tokenizer_c(source_code)))

        # Unique token types of this node. A separate name is used so the
        # global op_arr list is not overwritten, and the set is sorted so the
        # flattened encoding does not depend on set iteration order.
        node_tokens = sorted({
            tok.split('(')[1].split(',')[0]
            for tok in tokens
            if not any(x in tok for x in ignore_words)
        })

        if node_tokens:
            token_ohe = token_enc.transform(np.array(node_tokens).reshape(-1, 1)).flatten()
        else:
            # no usable token: 'DEAD' is unknown to the encoder, which yields an all-zero row
            token_ohe = token_enc.transform(np.array(['DEAD']).reshape(1, -1)).flatten()

        # Keep at most max_len values; a longer vector would make the pad
        # width below negative and crash.
        token_ohe    = token_ohe[:max_len]
        token_ohe    = np.pad(token_ohe, [(0, max_len-len(token_ohe))], mode='constant')
        op_ohe       = op_enc.transform(np.array([keyword]).reshape(1, -1)).flatten()
        feature_vect = np.concatenate((op_ohe,token_ohe))
        attr_dict[idx][node_id]  = feature_vect


with open('node_feature.pkl', 'wb') as f:
    pickle.dump(attr_dict,f)

## 2.2 Merging node features into graph
- I have merged the one-hot encoded node features into the graphs in the following code

In [16]:
# Attach the node feature vectors to their graphs and save the training table.
with open('node_feature.pkl', 'rb') as f:
    data = pickle.load(f)          # {sample index: {node id: feature vector}}

with open('cfg_data.pkl', 'rb') as f:
    cfg_data = pickle.load(f)

torch_dict = {}
for idx, node_feature in tqdm(data.items()):
    row = cfg_data.loc[idx]
    G   = row['graph']
    nx.set_node_attributes(G, node_feature, name='x') #setting the one hot encoding for each node in the graph
    torch_dict[idx] = {
        'graph':        G,
        'target':       row['target'],
        'project':      row['project'],
        'is_connected': row['is_connected'],
    }

# Built from a dict of dicts, so every sample is a COLUMN here; the training
# cell transposes the frame after loading it.
df = pd.DataFrame(torch_dict)
# The original called pd.to_pickle('all_dataset.pkl',), which passes the file
# name as the object to pickle and no path at all, so nothing was saved.
df.to_pickle('all_dataset.pkl')

84

## 3 Training on both projects using GIN
- Please configure the model's parameters before running
- You can provide training and testing set ratio
- Could someone also confirm that the GIN code is correct? I am using the implementation from spektral

In [None]:
# The whole code except for dataset generation is used from the spektral source code

################################################################################
# Config
################################################################################
learning_rate = 1e-3  # Learning rate
channels = 128  # Hidden units
layers = 3  # GIN layers
epochs = 100  # Number of training epochs
batch_size = 32  # Batch size
es_patience = 10  # Patience for early stopping

# Parameters
# Node feature width = token part (64 token types x 13 slots) + one-hot of the
# operation type. The operation list has 41 entries, not 13.
F = 64 * 13 + 41  # Dimension of node features
n_out = 2  # Dimension of the target




################################################################################
# Build model
################################################################################
class GIN0(Model):
    """Graph classifier: stacked GINConv layers, average pooling, dense head.

    Args:
        channels: Width of every GINConv layer, of its hidden MLP layers and
            of the first dense layer.
        n_layers: Total number of GINConv layers (the first one included).

    The size of the softmax output is read from the module-level ``n_out``.
    """

    def __init__(self, channels, n_layers):
        super().__init__()
        # First convolution, then the remaining n_layers - 1 convolutions.
        self.conv1 = GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
        self.convs = [
            GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
            for _ in range(n_layers - 1)
        ]
        # Readout and classification head.
        self.pool = GlobalAvgPool()
        self.dense1 = Dense(channels, activation="relu")
        self.dropout = Dropout(0.5)
        self.dense2 = Dense(n_out, activation="softmax")

    def call(self, inputs):
        """Run the model on an ``(x, a, i)`` triple and return class probabilities."""
        features, adjacency, graph_ids = inputs
        hidden = self.conv1([features, adjacency])
        for layer in self.convs:
            hidden = layer([hidden, adjacency])
        pooled = self.pool([hidden, graph_ids])
        return self.dense2(self.dropout(self.dense1(pooled)))


def evaluate(loader, model=None, loss_fn=None):
    """Return the batch-size-weighted mean loss and accuracy over one pass of ``loader``.

    Args:
        loader: Batch loader exposing ``steps_per_epoch`` and yielding
            ``(inputs, target)`` pairs through the iterator protocol.
        model: Model to evaluate. Defaults to the module-level ``model`` so
            existing ``evaluate(loader)`` calls keep working.
        loss_fn: Loss callable ``loss_fn(target, pred)``. Defaults to the
            module-level ``loss_fn``.

    Returns:
        Array ``[loss, accuracy]``, or ``None`` when ``loader.steps_per_epoch``
        is 0.

    Raises:
        KeyError: If ``model`` / ``loss_fn`` is omitted and no module-level
            variable of that name exists.
    """
    # A model built inside another function is not visible here as a global,
    # so callers can pass it explicitly.
    if model is None:
        model = globals()["model"]
    if loss_fn is None:
        loss_fn = globals()["loss_fn"]

    output = []
    for _ in range(loader.steps_per_epoch):
        inputs, target = next(loader)
        pred = model(inputs, training=False)
        output.append(
            (
                loss_fn(target, pred),
                tf.reduce_mean(categorical_accuracy(target, pred)),
                len(target),  # Keep track of batch size
            )
        )

    if not output:
        return None
    output = np.array(output)
    # Weight each batch by its size so a smaller last batch does not skew the mean.
    return np.average(output[:, :-1], 0, weights=output[:, -1])

    
class MyDataset(Dataset):
    """Spektral dataset built from a DataFrame with 'graph' and 'target' columns.

    Args:
        data: DataFrame with one row per sample. ``row['graph']`` must offer
            ``to_undirected()`` and carry the node features in the node
            attribute ``"x"``; ``row['target']`` is the class label.
        **kwargs: Forwarded to ``Dataset.__init__``.

    Attributes set here:
        n_samples: Number of rows, decremented for every empty graph skipped.
        unique / inverse: Result of ``np.unique`` on the target column.
        onehot: One-hot label matrix, one row per DataFrame row (by position).
        zero_array: Receives one entry per skipped empty graph.
    """

    def __init__(self, data, **kwargs):
        self.n_samples = len(data)
        self.data     = data
        self.unique, self.inverse = np.unique(data['target'].to_numpy(), return_inverse=True)
        self.onehot = np.eye(self.unique.shape[0])[self.inverse]
        self.zero_array = []
        # Called last on purpose: read() below relies on the attributes above.
        super().__init__(**kwargs)

    def read(self):
        """Return one ``Graph`` per row whose graph has at least one node."""
        graphs = []
        # self.onehot is ordered by row POSITION, so use enumerate() instead
        # of the DataFrame label, which need not be 0..n-1.
        for pos, (_, row) in enumerate(self.data.iterrows()):
            G = row['graph'].to_undirected()

            if len(G.nodes()) == 0:
                # empty graph: nothing to learn from, skip it
                self.zero_array.append(1)
                self.n_samples -= 1
                continue

            # Node features
            # NOTE(review): assumes every node has an "x" attribute; otherwise
            # x has fewer rows than the adjacency matrix - confirm upstream.
            x = np.array(list(nx.get_node_attributes(G, "x").values()))

            # Edges (kept sparse, no dense round-trip)
            a = sp.csr_matrix(nx.adjacency_matrix(G))

            # Labels, one-hot encoded
            y = self.onehot[pos]

            graphs.append(Graph(x=x, a=a, y=y))

        return graphs

In [None]:

def _evaluate_split(loader, model, loss_fn):
    """Return the batch-size-weighted mean [loss, accuracy] over one pass of ``loader``."""
    stats = []
    for _ in range(loader.steps_per_epoch):
        inputs, target = next(loader)
        pred = model(inputs, training=False)
        stats.append(
            (
                loss_fn(target, pred),
                tf.reduce_mean(categorical_accuracy(target, pred)),
                len(target),  # batch size, used as the averaging weight
            )
        )
    if not stats:
        raise ValueError("cannot evaluate on an empty split")
    stats = np.array(stats)
    return np.average(stats[:, :-1], 0, weights=stats[:, -1])


def train_spk(data1, train_size=0.65, val_size=0.10, test_size=0.25, k_fold=5):
    """Train and test a GIN0 model on ``k_fold`` independent random splits.

    Each round draws a fresh random permutation, so the rounds are repeated
    random splits rather than disjoint cross-validation folds.

    Args:
        data1: Graph dataset that supports ``len()`` and indexing with an
            integer array, and that ``DisjointLoader`` accepts (presumably a
            spektral ``Dataset`` such as ``MyDataset`` - confirm at the call
            site).
        train_size: Fraction of samples used for training.
        val_size: Fraction of samples used for early stopping.
        test_size: Fraction of samples used for the final test. Only used to
            validate that the three fractions sum to 1; the test split is
            whatever remains after the other two.
        k_fold: Number of independent training rounds.

    Returns:
        ``{round: {'test_loss': ..., 'test_acc': ...}}``, or ``None`` (after
        printing a message) when the fractions do not sum to 1.
    """
    # isclose: exact float equality would reject valid splits such as 0.7/0.2/0.1
    if not np.isclose(train_size + val_size + test_size, 1.0):
        print('Wrong test, train, val size. Must be equal to 1')
        return

    acc_dict1 = {}
    for i in range(k_fold):
        idxs = np.random.permutation(len(data1))
        split_va, split_te = int(train_size * len(data1)), int((train_size+val_size) * len(data1))
        idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
        data_tr = data1[idx_tr]
        data_va = data1[idx_va]
        data_te = data1[idx_te]

        # Data loaders
        loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs)
        loader_va = DisjointLoader(data_va, batch_size=batch_size)
        loader_te = DisjointLoader(data_te, batch_size=batch_size)

        ################################################################################
        # Fit model
        ################################################################################
        # model, loss_fn and optimizer are bound further down, before the first call.
        @tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
        def train_step(inputs, target):
            with tf.GradientTape() as tape:
                predictions = model(inputs, training=True)
                loss = loss_fn(target, predictions) + sum(model.losses)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            acc = tf.reduce_mean(categorical_accuracy(target, predictions))
            return loss, acc

        # Build model
        model = GIN0(channels, layers)
        optimizer = Adam(learning_rate)
        loss_fn = CategoricalCrossentropy()
        epoch = step = 0
        best_val_loss = np.inf
        best_weights = None
        patience = es_patience
        results = []
        for batch in loader_tr:
            step += 1
            loss, acc = train_step(*batch)
            results.append((loss, acc))
            if step == loader_tr.steps_per_epoch:
                step = 0
                epoch += 1

                # Compute validation loss and accuracy. The model and loss are
                # passed explicitly because they are locals of this function.
                val_loss, val_acc = _evaluate_split(loader_va, model, loss_fn)
                print(
                    "Ep. {} - Loss: {:.3f} - Acc: {:.3f} - Val loss: {:.3f} - Val acc: {:.3f}".format(
                        epoch, *np.mean(results, 0), val_loss, val_acc
                    )
                )

                # Check if loss improved for early stopping
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience = es_patience
                    print("New best val_loss {:.3f}".format(val_loss))
                    best_weights = model.get_weights()
                else:
                    patience -= 1
                    if patience == 0:
                        print("Early stopping (best val_loss: {})".format(best_val_loss))
                        break
                results = []

        ################################################################################
        # Evaluate model
        ################################################################################
        # best_weights stays None when no epoch ever improved (e.g. NaN loss);
        # in that case keep the current weights instead of crashing.
        if best_weights is not None:
            model.set_weights(best_weights)  # Load best model
        test_loss, test_acc = _evaluate_split(loader_te, model, loss_fn)
        print("Done. Test loss: {:.4f}. Test acc: {:.2f}".format(test_loss, test_acc))
        acc_dict1[i] = {'test_loss': test_loss, 'test_acc': test_acc}
    return acc_dict1

In [None]:
#saving acc in dict:acc one might want to print and calculate average accuracy.
# I am working on it. 
# Train one model per project and store the per-round test loss/accuracy.
data = pd.read_pickle('all_dataset.pkl')
data = data.T                      # the pickle stores samples as columns; make them rows

# Select the projects by name instead of relying on the order of the groups.
by_project = data.groupby('project')
data_ff = by_project.get_group('FFmpeg').reset_index(drop=True)
data_qe = by_project.get_group('qemu').reset_index(drop=True)
del data, by_project               # release the combined frame before training

# train_spk indexes its input with integer arrays and feeds it to
# DisjointLoader, so the frames are wrapped in the graph dataset first.
acc_qe = train_spk(MyDataset(data_qe))
acc_ff = train_spk(MyDataset(data_ff))
acc = {"qemu": acc_qe, "FFmpeg": acc_ff}
with open('acc_results.pkl', 'wb') as f:   # pickle needs a binary, writable handle
    pickle.dump(acc, f)