# Reproduce LineVul

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import commons

In [4]:
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [5]:
# The transformers library emits verbose warnings while loading models;
# raise its log threshold to 40 (ERROR level) to silence them.

from transformers import logging
logging.set_verbosity(40)

In [6]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

In [7]:
import numpy as np

In [8]:
import pandas as pd
from tqdm.autonotebook import tqdm

In [9]:
from linevul_model import Model
from linevul_helpers import TextDataset
from linevul_extra import extract_line_attention, linevul_predict

In [10]:
from project_dataset import load_dataset

In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to this process.
n_gpu = torch.cuda.device_count()

In [12]:
# Start from the published CodeBERT configuration, then override two fields.
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
# NOTE(review): presumably the LineVul checkpoint expects a single-output head — confirm against linevul_model.Model.
config.num_labels = 1
# Same value as Args.num_attention_heads and the "12heads" in the checkpoint filename.
config.num_attention_heads = 12

In [13]:
# Fine-tuned LineVul weights, taken from the LineVul repo's saved_models/checkpoint-best-f1 directory.
# NOTE(review): absolute, user-specific path — must be edited to run on another machine.
checkpoint = '/home/hqn650/LineVul/linevul/saved_models/checkpoint-best-f1/12heads_linevul_model.bin'

In [14]:
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [15]:
pre_train = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', 
                                                             config=config, 
                                                             ignore_mismatched_sizes=True)

In [16]:
from dataclasses import dataclass

@dataclass
class Args:
    """Run-time options shared by the model, the dataset helpers and this notebook.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field. Without annotations the decorator sees no fields at
    all: the generated ``__init__`` accepts no arguments and ``repr`` shows
    nothing. With them, ``Args()`` still yields the same defaults, individual
    options can be overridden (``Args(eval_batch_size=8)``), and ``repr(args)``
    lists the active settings.
    """

    # Device used for checkpoint loading, model placement and prediction.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Number of visible CUDA devices; the notebook wraps the model in DataParallel when > 1.
    n_gpu: int = torch.cuda.device_count()

    # Options below are consumed by the linevul_* helpers rather than read
    # directly in this notebook — see those modules for their exact meaning.
    use_non_pretrained_model: bool = False
    block_size: int = 512
    # CSV file read with pd.read_csv when inspecting test samples.
    # NOTE(review): absolute, user-specific path.
    test_data_file: str = '/home/hqn650/LineVul/data/big-vul_dataset/test.csv'
    code_length: int = 256
    do_local_explanation: bool = True
    reasoning_method: str = 'attention'
    seed: int = 42
    num_attention_heads: int = 12
    do_sorting_by_line_scores: bool = False
    do_sorting_by_pred_prob: bool = False
    top_k_constant: int = 10
    use_word_level_tokenizer: bool = False
    # Batch size of the DataLoader built in find_tp.
    eval_batch_size: int = 512

    # Dataset name passed to project_dataset.load_dataset.
    task: str = "attack_vector"


args = Args()

In [17]:
model = Model(pre_train, config, tokenizer, args)

In [18]:
# Restore the fine-tuned weights from `checkpoint`, mapping stored tensors onto args.device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source;
# consider passing weights_only=True if the installed torch version supports it.
model.load_state_dict(torch.load(checkpoint, map_location=args.device))
# Last expression of the cell: the module returned by .to() is displayed as the cell output.
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [19]:
from linevul_helpers import TextDataset, convert_examples_to_features

class ExtendTextDataset(TextDataset):
    """TextDataset variant whose examples come from an in-memory DataFrame.

    Each string in the ``processed_func`` column is converted with
    ``convert_examples_to_features``; every sample is given the fixed label 1.
    The parent ``__init__`` is intentionally not invoked — only
    ``self.examples`` is populated here.
    """

    def __init__(self, tokenizer, args, data_frame):
        self.examples = []
        sources = data_frame["processed_func"].tolist()
        # Iterate the functions directly; the progress bar still knows the total.
        for source in tqdm(sources, desc='ExtendTextDataset'):
            feature = convert_examples_to_features(source, 1, tokenizer, args)
            self.examples.append(feature)

In [20]:
# Multi-GPU evaluation: when more than one CUDA device is visible, wrap the model in DataParallel.
# NOTE: this cell rebinds `model` to a wrapper around itself, so re-running it on a
# multi-GPU machine wraps the already-wrapped model again.
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [21]:
# to find TP
def find_tp(model, tokenizer, args, data_frame=None):
    """Run prediction over a dataset and locate its true-positive samples.

    Args:
        model: model forwarded to ``linevul_predict``.
        tokenizer: tokenizer used to build the dataset.
        args: options object; ``eval_batch_size`` and ``device`` are read here
            and the whole object is forwarded to the dataset constructor.
        data_frame: optional DataFrame. When given, the dataset is an
            ``ExtendTextDataset`` built from it; otherwise
            ``TextDataset(..., file_type='test')`` is used.

    Returns:
        ``(result, tp_indices)`` — the first value returned by
        ``linevul_predict`` and a list of positions where the prediction
        equals the label and the label is 1.
    """
    if data_frame is None:
        eval_set = TextDataset(tokenizer, args, file_type='test')
    else:
        eval_set = ExtendTextDataset(tokenizer, args, data_frame)

    loader = DataLoader(
        eval_set,
        sampler=SequentialSampler(eval_set),
        batch_size=args.eval_batch_size,
        num_workers=0,
    )
    result, y_trues, y_preds = linevul_predict(model, loader, args.device)

    # True positive: prediction agrees with the label and the label is 1.
    is_correct = y_trues == y_preds
    is_positive = y_trues == 1
    hit_rows = np.where(is_correct & is_positive)[0]
    return result, list(hit_rows)
    
result, correct_indices = find_tp(model, tokenizer, args)

Convert example to features:   0%|          | 0/18864 [00:00<?, ?it/s]



In [22]:
# test_dataset = TextDataset(tokenizer, args, file_type='test')

In [23]:
# test_sampler = SequentialSampler(test_dataset)
# test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size, num_workers=0)

In [24]:
# result, y_trues, y_preds = linevul_predict(model, test_dataloader, args.device)

In [25]:
result

{'test_accuracy': 0.9909351145038168,
 'test_recall': 0.8635071090047394,
 'test_precision': 0.9712153518123667,
 'test_f1': 0.9141996989463121,
 'test_threshold': 0.5}

In [26]:
correct_indices[:10]

[66, 99, 103, 108, 120, 129, 130, 142, 153, 167]

In [27]:
# correct_indices = np.where((y_trues == y_preds))
# correct_indices = list(correct_indices[0])

In [28]:
# tp_indices = np.where((y_trues == y_preds) & (y_trues == 1))
# tp_indices = list(tp_indices[0])

In [29]:
# After identifying the true-positive samples, create a new loader with batch_size = 1 for explanation

# dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, num_workers=0)

In [30]:
# df = pd.read_csv(args.test_data_file)

In [31]:
# top_k_constant = [args.top_k_constant]

In [32]:
# model.eval()
# index = 0
# progress_bar = tqdm(dataloader, total=len(dataloader))
# extract_list = []
# for mini_batch in progress_bar:
#     if index in tp_indices:
#         (input_ids, labels) = mini_batch
#         ids = input_ids[0].detach().tolist()
#         all_tokens = tokenizer.convert_ids_to_tokens(ids)
#         all_tokens = [token.replace("Ġ", "") for token in all_tokens]
#         all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]
#         with torch.no_grad():
#             prob, attentions = model(input_ids=input_ids, output_attentions=True)
#         lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
#         extract_list.append((index, lines_with_score, n_lines))
#     index += 1

In [33]:
def explain(model, tokenizer, explain_indices, data_frame=None):
    """Collect per-line attention scores for selected samples of a dataset.

    The dataset is walked one sample at a time in its original order. For each
    position listed in ``explain_indices`` the model is run with
    ``output_attentions=True`` and the attentions are turned into per-line
    scores by ``extract_line_attention``.

    Args:
        model: model to query; it is switched to eval mode.
        tokenizer: tokenizer used to build the dataset and to map ids back to tokens.
        explain_indices: iterable of dataset positions to explain.
        data_frame: optional DataFrame. When given, the dataset is an
            ``ExtendTextDataset`` built from it; otherwise
            ``TextDataset(..., file_type='test')`` is used.

    Returns:
        A list of ``(index, lines_with_score, n_lines)`` tuples, one per
        explained sample, in dataset order.

    Note:
        ``args`` is read from the notebook's global scope, not passed in.
    """
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    else:
        dataset = TextDataset(tokenizer, args, file_type='test')
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=0)

    # A set gives constant-time membership tests; scanning a list for every
    # sample made the loop quadratic in practice.
    wanted = set(explain_indices)

    model.eval()
    # The loader yields CPU tensors. Send them to wherever the model's weights
    # live so a model on a single GPU does not hit a device mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    extract_list = []
    for index, mini_batch in enumerate(tqdm(data_loader, total=len(data_loader))):
        if index not in wanted:
            continue
        input_ids, _ = mini_batch
        ids = input_ids[0].detach().tolist()
        # Strip the tokenizer's space marker and normalise the tab marker to the
        # newline marker before the tokens are handed to extract_line_attention.
        all_tokens = [
            token.replace("Ġ", "").replace("ĉ", "Ċ")
            for token in tokenizer.convert_ids_to_tokens(ids)
        ]
        if model_device is not None:
            input_ids = input_ids.to(model_device)
        with torch.no_grad():
            _, attentions = model(input_ids=input_ids, output_attentions=True)
        lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
        extract_list.append((index, lines_with_score, n_lines))
    return extract_list

extract_list = explain(model, tokenizer, correct_indices)

Convert example to features:   0%|          | 0/18864 [00:00<?, ?it/s]

  0%|          | 0/18864 [00:00<?, ?it/s]

In [34]:
len(extract_list), len(correct_indices)

(911, 911)

In [35]:
extract_list[-1]

(18822,
 [(2,
   'command_line->AppendSwitch(switches::kEnableThreadedCompositing);',
   1153.4075927734375),
  (1, 'GpuFeatureTest::SetUpCommandLine(command_line);', 963.8869018554688),
  (0,
   '<s>virtualvoidSetUpCommandLine(CommandLine*command_line){',
   850.890869140625),
  (3, '}', 151.55853271484375)],
 4)

In [36]:
import codecs

# Load the raw test split so the original source of an explained sample can be inspected.
df = pd.read_csv(args.test_data_file)
# 18822 is the sample index shown by extract_list[-1] in the previous cell
# (assumes dataset order matches the CSV row order — TODO confirm).
context = df.iloc[18822]['func_before']
# Interpret backslash escape sequences stored in the text.
# NOTE(review): 'unicode_escape' is a Latin-1-oriented codec, so non-ASCII characters
# in the source may come out garbled — confirm on a sample that contains them.
modified_context = codecs.decode(context, 'unicode_escape')

# Turn any remaining literal backslash-n pairs into real newlines.
new_variable = modified_context.replace(r'\n', '\n')

print(new_variable)

  virtual void SetUpCommandLine(CommandLine* command_line) {
    GpuFeatureTest::SetUpCommandLine(command_line);
    command_line->AppendSwitch(switches::kEnableThreadedCompositing);
  }



# Attack vector

In [37]:
attack_vector = load_dataset(args.task)

In [38]:
attack_vector_test = attack_vector['test']
attack_vector_test

Dataset({
    features: ['CVE ID', 'explain', 'func_before'],
    num_rows: 1350
})

In [39]:
attack_vector_test

Dataset({
    features: ['CVE ID', 'explain', 'func_before'],
    num_rows: 1350
})

In [40]:
attack_vector_test_df = attack_vector_test.to_pandas()
attack_vector_test_df

Unnamed: 0,CVE ID,explain,func_before
0,CVE-2017-3731,sending specially crafted truncated packets,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,CVE-2013-2867,via a crafted web site .,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,CVE-2012-5131,via unknown vectors .,static void unregisterBlobURLTask(void* conte...
3,CVE-2016-5842,persuading a victim to open a specially crafte...,}static inline void WriteResourceLong(unsigned...
4,CVE-2012-2875,via a crafted document .,void PromoResourceService::PromoResourceState...
...,...,...,...
1345,CVE-2012-6657,by leveraging the ability to create a raw sock...,"int sock_setsockopt(struct socket *sock, int l..."
1346,CVE-2016-6787,using a specially-crafted application,static void perf_event_for_each_child(struct p...
1347,CVE-2016-3951,inserting a USB device with an invalid USB des...,"static int cdc_ncm_bind(struct usbnet *dev, s..."
1348,CVE-2016-1621,using a specially crafted media file,static unsigned int subpel_variance_ref(const ...


In [41]:
# func_before doesn't have the same format as processed_func from lineVul
attack_vector_test_df.iloc[0]['func_before']

'static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {  EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,c);  switch (type) {  case EVP_CTRL_INIT:  gctx->key_set = 0;  gctx->iv_set = 0;  gctx->ivlen = EVP_CIPHER_CTX_iv_length(c);  gctx->iv = EVP_CIPHER_CTX_iv_noconst(c);  gctx->taglen = -1;  gctx->iv_gen = 0;  gctx->tls_aad_len = -1;  return 1;   case EVP_CTRL_AEAD_SET_IVLEN:  if (arg <= 0)  return 0;  /* Allocate memory for IV if needed */  if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {  if (gctx->iv != EVP_CIPHER_CTX_iv_noconst(c))  OPENSSL_free(gctx->iv);  gctx->iv = OPENSSL_malloc(arg);  if (gctx->iv == NULL)  return 0;  }  gctx->ivlen = arg;  return 1;   case EVP_CTRL_AEAD_SET_TAG:  if (arg <= 0 || arg > 16 || EVP_CIPHER_CTX_encrypting(c))  return 0;  memcpy(EVP_CIPHER_CTX_buf_noconst(c), ptr, arg);  gctx->taglen = arg;  return 1;   case EVP_CTRL_AEAD_GET_TAG:  if (arg <= 0 || arg > 16 || !EVP_CIPHER_CTX_encrypting(c)  || gctx->taglen < 0)  return 0

In [42]:
df.iloc[18859]['processed_func']

'DisplayNameValueList(char * buffer, int bufsize)\n{\n    struct NameValueParserData pdata;\n    struct NameValue * nv;\n    ParseNameValue(buffer, bufsize, &pdata);\n    for(nv = pdata.l_head;\n        nv != NULL;\n        nv = nv->l_next)\n    {\n        printf("%s = %s\\n", nv->name, nv->value);\n    }\n    ClearNameValueList(&pdata);\n}\n'

In [43]:
# Convert func_before into the newline-delimited format LineVul expects (compare processed_func above)
attack_vector_test_df['processed_func'] = commons.dfmp(attack_vector_test_df, commons.format_code, "func_before", cs=200, desc="Format string to LineVul expectation")

(6 Workers) Format string to LineVul expectation:   0%|          | 0/1350 [00:00<?, ?it/s]

In [44]:
attack_vector_test_df

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2017-3731,sending specially crafted truncated packets,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int...","static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,CVE-2013-2867,via a crafted web site .,void BluetoothDeviceChromeOS::OnUnregisterAgen...,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,CVE-2012-5131,via unknown vectors .,static void unregisterBlobURLTask(void* conte...,static void unregisterBlobURLTask(void* contex...
3,CVE-2016-5842,persuading a victim to open a specially crafte...,}static inline void WriteResourceLong(unsigned...,}\nstatic inline void WriteResourceLong(unsign...
4,CVE-2012-2875,via a crafted document .,void PromoResourceService::PromoResourceState...,void PromoResourceService::PromoResourceStateC...
...,...,...,...,...
1345,CVE-2012-6657,by leveraging the ability to create a raw sock...,"int sock_setsockopt(struct socket *sock, int l...","int sock_setsockopt(struct socket *sock, int l..."
1346,CVE-2016-6787,using a specially-crafted application,static void perf_event_for_each_child(struct p...,static void perf_event_for_each_child(struct p...
1347,CVE-2016-3951,inserting a USB device with an invalid USB des...,"static int cdc_ncm_bind(struct usbnet *dev, s...","static int cdc_ncm_bind(struct usbnet *dev, st..."
1348,CVE-2016-1621,using a specially crafted media file,static unsigned int subpel_variance_ref(const ...,static unsigned int subpel_variance_ref(const ...


In [45]:
# to find TP
# def find_tp(model, tokenizer, args, data_frame):
#     extend_dataset = ExtendTextDataset(tokenizer, args, data_frame)
#     sampler = SequentialSampler(extend_dataset)
#     data_loader = DataLoader(extend_dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=0)
#     result, y_trues, y_preds = linevul_predict(model, data_loader, args.device)
#     tp_indices = np.where((y_trues == y_preds) & (y_trues == 1))
#     tp_indices = list(tp_indices[0])
#     return result, tp_indices
# result, tp_indices = find_tp(model, tokenizer, args, attack_vector_test_df)

In [47]:
result, explain_indices = find_tp(model, tokenizer, args, attack_vector_test_df)

ExtendTextDataset:   0%|          | 0/1350 [00:00<?, ?it/s]



In [48]:
result, len(explain_indices)

({'test_accuracy': 0.10444444444444445,
  'test_recall': 0.10444444444444445,
  'test_precision': 1.0,
  'test_f1': 0.18913480885311873,
  'test_threshold': 0.5},
 141)

In [49]:
extract_list_attack_vector = explain(model, tokenizer, explain_indices, attack_vector_test_df)

ExtendTextDataset:   0%|          | 0/1350 [00:00<?, ?it/s]

  0%|          | 0/1350 [00:00<?, ?it/s]

In [50]:
len(extract_list_attack_vector), len(explain_indices)

(141, 141)

In [51]:
extract_list_attack_vector[0]

(8,
 [(15,
   'NOTREACHED()<<"Unsupportedaudiobitdepthincrossfade.";',
   675.3397216796875),
  (12, 'DoCheckFakeData<uint8>(audio_data,length);', 492.43896484375),
  (9, 'DoCheckFakeData<int16>(audio_data,length);', 439.83258056640625),
  (0,
   '<s>voidCheckFakeData(uint8*audio_data,intframes_written,',
   434.6465759277344),
  (6, 'DoCheckFakeData<int32>(audio_data,length);', 428.0029602050781),
  (2,
   'size_tlength=(frames_written*algorithm_.bytes_per_frame())/',
   411.04876708984375),
  (1, 'doubleplayback_rate){', 327.9273681640625),
  (3, 'algorithm_.bytes_per_channel();', 314.5769958496094),
  (4, 'switch(algorithm_.bytes_per_channel()){', 278.3238830566406),
  (11, 'case1:', 156.80673217773438),
  (8, 'case2:', 143.63552856445312),
  (13, 'break;', 141.71417236328125),
  (10, 'break;', 131.60609436035156),
  (14, 'default:', 120.43937683105469),
  (5, 'case4:', 116.97069549560547),
  (7, 'break;', 116.07525634765625),
  (16, '}', 72.8502197265625)],
 17)