## Build BERT model

In [1]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.models import Model
import bert
from bert import tokenization
import numpy as np
from tqdm import tqdm
import gc
from tensorflow.keras import backend as K
import pandas as pd



In [2]:
# Fixed length of every input sequence fed to the model.
max_seq_length = 128


def _int32_input(name):
    """Create an int32 Keras input of shape (max_seq_length,) with the given name."""
    return tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name=name
    )


# The three inputs are created in the same order as they are passed to the layer.
input_word_ids = _int32_input("input_word_ids")
input_mask = _int32_input("input_mask")
segment_ids = _int32_input("segment_ids")

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)

# The hub layer returns a (pooled_output, sequence_output) pair for the inputs.
pooled_output, sequence_output = bert_layer(
    [input_word_ids, input_mask, segment_ids]
)

# Wrap the layer in a Keras model exposing both outputs.
model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled_output, sequence_output],
)


# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token list.

    Returns a list of length ``max_seq_length`` holding 1 for every entry of
    ``tokens`` followed by 0 for every padded position.

    Raises:
        IndexError: if ``tokens`` has more than ``max_seq_length`` entries.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask


def get_segments(tokens, max_seq_length):
    """Build segment ids for a token list.

    Tokens up to and including the first "[SEP]" get id 0; every token after
    it gets id 1. The result is padded with 0 up to ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` has more than ``max_seq_length`` entries.
    """
    n_padding = max_seq_length - len(tokens)
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")

    seg_ids = []
    past_first_sep = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        seg_ids.append(1 if past_first_sep else 0)
        past_first_sep = past_first_sep or tok == "[SEP]"

    seg_ids.extend([0] * n_padding)
    return seg_ids


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, zero-padded to ``max_seq_length``.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids: the converted tokens followed
        by 0 for each padded position.

    Raises:
        IndexError: if ``tokens`` has more than ``max_seq_length`` entries.
            Without this check the negative pad count would yield no padding
            and the function would silently return an over-long list; the
            sibling helpers get_masks / get_segments raise the same error.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    return token_ids + [0] * (max_seq_length - len(token_ids))


# Build the tokenizer from the vocabulary file and casing flag that ship with
# the loaded hub module, so it is configured from the same object as bert_layer.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

## Apply BERT tokenizer

In [3]:
# Alternative input (LO_SARD102), kept for reference:
#df = pd.read_parquet("/mnt/md0/user/scheuererra68323/LO_SARD102/LO_SARD102_TokenAnon_wExtFuncCalls_Labeled.parquet")
_jtt_parquet_path = "/mnt/md0/user/scheuererra68323/JTT/JTT_TokenAnon_wExtFuncCalls_Labeled.parquet"
df = pd.read_parquet(_jtt_parquet_path)
df.head()

Unnamed: 0_level_0,path,line_start,line_stop,code_snippet,external_function_names,dump_tokens_output,token_anon,CWE-327,CWE-114,CWE-121,...,CWE-775,CWE-780,CWE-785,CWE-789,CWE-078,CWE-832,CWE-835,CWE-843,CWE-090,is_vulnerable
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83452,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,22,39,"b'[""unsigned int data;"", "" data = 0;"", "" ...","b'[""printUnsignedLine"", ""globalReturnsTrue""]'",unsigned 'unsigned'\t [StartOfLine]\tLoc=<<std...,"b'[""unsigned"", ""int"", ""identifier0"", "";"", ""ide...",,,,...,,,,,,,,,,0.0
248990,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,199,203,"b'[""goodG2B();"", "" goodB2G();""]'",b'[]',identifier 'goodG2B'\t [StartOfLine]\tLoc=<<st...,"b'[""identifier0"", ""("", "")"", "";"", ""identifier1""...",,,,...,,,,,,,,,,0.0
220067,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,56,67,"b'[""int64_t * data;"", "" /* Initialize data*...","b'[""CWE762_Mismatched_Memory_Management_Routin...",identifier 'int64_t'\t [StartOfLine]\tLoc=<<st...,"b'[""identifier0"", ""*"", ""identifier1"", "";"", ""id...",,,,...,,,,,,,,,,0.0
283612,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,52,57,"b'[""printIntLine(data->intOne);"", "" /* POTE...",b'[]',identifier 'printIntLine'\t [StartOfLine]\tLoc...,"b'[""identifier0"", ""("", ""identifier1"", ""->"", ""i...",,,,...,,,,,,,,,,0.0
142227,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,46,52,"b'[""if(GLOBAL_CONST_FIVE==5)"", "" {"", "" ...","b'[""signal""]'",if 'if'\t [StartOfLine]\tLoc=<<stdin>:1:1>\nl_...,"b'[""if"", ""("", ""identifier0"", ""=="", ""<numeric_c...",,,,...,,,,,,,,,,0.0


In [None]:
# Set to True to print the BERT tokens produced for every row.
debug = False

def bert_tokenize(token_anon):
    """Turn one ``token_anon`` cell into a BERT token list.

    The anonymized source tokens are joined with single spaces, run through
    the module-level ``tokenizer`` and wrapped in "[CLS]" ... "[SEP]".

    Args:
        token_anon: the anonymized tokens of one sample. Accepted forms:
            bytes or str holding a JSON list of tokens (the dataframe preview
            shows values like b'["unsigned", "int", ...]' -- assumed to be the
            stored format, TODO confirm), an already-decoded sequence of
            tokens, or a plain string that is not a JSON list (used as-is).

    Returns:
        list of str: "[CLS]", the BERT word pieces, then "[SEP]".
    """
    import json  # local import keeps this cell self-contained

    raw = token_anon
    if isinstance(raw, (bytes, bytearray)):
        raw = bytes(raw).decode("utf-8", errors="replace")

    if isinstance(raw, str):
        # Joining the string itself would iterate over its *characters* and
        # put a space between every single one; parse the token list first.
        try:
            decoded = json.loads(raw)
        except ValueError:
            decoded = None
        pieces = decoded if isinstance(decoded, list) else [raw]
    else:
        try:
            pieces = list(raw)
        except TypeError:
            # Non-iterable value (e.g. a missing cell): fall back to its text form.
            pieces = [raw]

    glued = " ".join(str(piece) for piece in pieces)
    bert_tokens = ["[CLS]"] + tokenizer.tokenize(glued) + ["[SEP]"]
    if debug:
        print(bert_tokens)

    return bert_tokens
    
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Tokenize every row's token_anon value; the new column is added to df itself.
df["bert_tokens"] = df["token_anon"].progress_apply(bert_tokenize)

print(df.shape)
df.head()

  from pandas import Panel
 82%|████████▏ | 20441/24999 [00:50<00:15, 292.63it/s]

In [None]:
# bert_tokens is a second name for df (no copy is made here).
bert_tokens = df
# Distribution of BERT token counts per row, before any length filtering.
bert_tokens["bert_tokens"].map(len).describe()

In [None]:
# Keep only the rows whose token list fits into the model's input length.
_fits_model = bert_tokens["bert_tokens"].map(len) <= max_seq_length
bert_tokens = bert_tokens.loc[_fits_model]
bert_tokens["bert_tokens"].map(len).describe()

## Predict BERT embeddings

In [None]:
def bert_embed(bert_tokens):
    """Run one BERT token list through ``model`` and return its sequence output.

    The ids, padding mask and segment ids are built with get_ids / get_masks /
    get_segments for ``max_seq_length`` and passed to the model as a batch
    containing this single sample. The pooled output is discarded.

    Returns:
        The model's sequence output for the sample with size-1 axes removed
        by ``squeeze()``.
    """
    ids = get_ids(bert_tokens, tokenizer, max_seq_length)
    mask = get_masks(bert_tokens, max_seq_length)
    segments = get_segments(bert_tokens, max_seq_length)

    # Each feature list is wrapped in an outer list: a batch of exactly one.
    single_sample_batch = [[ids], [mask], [segments]]
    _pooled, per_token = model.predict_on_batch(single_sample_batch)
    #K.clear_session()
    return per_token.squeeze()

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# One model call per remaining row; the result keeps the index of bert_tokens.
bert_embeddings = bert_tokens["bert_tokens"].progress_apply(bert_embed)

In [None]:
# Token-count distribution of the rows that were embedded.
bert_tokens["bert_tokens"].map(len).describe()

In [None]:
# Summary of len() of each stored embedding, i.e. the size of its first axis
# (presumably max_seq_length for every row -- verify against the output).
bert_embeddings.map(len).describe()

## Save arrays into files

In [None]:
# Stack the per-row embedding arrays along a new leading axis.
_per_row_arrays = list(bert_embeddings.values)
X = np.stack(_per_row_arrays)
print(X.shape)

In [None]:
# Columns that are removed so that only the remaining columns end up in y.
_non_label_columns = [
    "code_snippet", "external_function_names", "dump_tokens_output",
    "path", "line_start", "line_stop", "token_anon", 'bert_tokens',
]
y = bert_tokens.drop(_non_label_columns, axis=1)
# Missing entries become 0 before everything is cast to int.
y = y.where(pd.notnull(y), 0).astype(int)
print(y.shape)
print(y.dtypes)

In [None]:
# shuffle data before saving
#rng_state = np.random.get_state()
#np.random.shuffle(X)
#np.random.set_state(rng_state)
#np.random.shuffle(y)

In [None]:
# Output name for the alternative dataset, kept for reference:
#np.save('LOSARD102_bert128_X.npy', X)
_x_out_path = 'JTT_bert128_X.npy'
np.save(_x_out_path, X)

In [None]:
# Output name for the alternative dataset, kept for reference:
#y.to_hdf('LOSARD102_bert128_y.h5', key='y')
_y_out_path = 'JTT_bert128_y.h5'
y.to_hdf(_y_out_path, key='y')