# Reproduce LineVul

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import commons

In [3]:
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [4]:
# transformers emits many verbose warnings while the models are loaded;
# raise the log threshold to ERROR (numeric level 40) to silence them

from transformers import logging
logging.set_verbosity(40)

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

In [6]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader

In [7]:
import pandas as pd
from tqdm.autonotebook import tqdm

In [8]:
from linevul_model import Model
from linevul_helpers import TextDataset
from linevul_extra import extract_line_attention, linevul_predict

In [9]:
from project_dataset import load_dataset

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [11]:
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 1
config.num_attention_heads = 12

In [12]:
# path to the 12-head model checkpoint taken from the LineVul repository
checkpoint = '/home/hqn650/LineVul/linevul/saved_models/checkpoint-best-f1/12heads_linevul_model.bin'

In [13]:
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [14]:
pre_train = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', 
                                                             config=config, 
                                                             ignore_mismatched_sizes=True).to(device)

In [15]:
from dataclasses import dataclass

@dataclass
class Args:
    """Settings object handed to the LineVul ``Model`` and dataset helpers.

    Every attribute carries a type annotation so that ``@dataclass`` really
    registers it as a field: the values show up in ``repr(args)`` and can be
    overridden per instance, e.g. ``Args(eval_batch_size=64)``.  Calling
    ``Args()`` without arguments yields the same values as before.
    """
    device: torch.device = device  # defaults to the notebook-level device
    n_gpu: int = n_gpu  # defaults to the notebook-level GPU count
    use_non_pretrained_model: bool = False
    block_size: int = 512
    test_data_file: str = '/home/hqn650/LineVul/data/big-vul_dataset/test.csv'
    code_length: int = 256
    do_local_explanation: bool = True
    reasoning_method: str = 'attention'
    seed: int = 42
    num_attention_heads: int = 12
    do_sorting_by_line_scores: bool = False
    do_sorting_by_pred_prob: bool = False
    top_k_constant: int = 10
    use_word_level_tokenizer: bool = False
    eval_batch_size: int = 512  # batch size of the prediction DataLoader

    # name of the project dataset / aspect that is evaluated in this notebook
    task: str = "root_cause"

args = Args()

In [16]:
model = Model(pre_train, config, tokenizer, args)

In [18]:
import transformers
print(transformers.__version__)

4.30.2


In [19]:
# map_location remaps tensors that were saved on another device (e.g. a GPU)
# onto the active one, so the checkpoint also loads on a CPU-only machine
state_dict = torch.load(checkpoint, map_location=args.device)

In [20]:
state_dict

OrderedDict([('roberta.embeddings.position_ids',
              tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                        98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
                     

In [17]:
model.load_state_dict(torch.load(checkpoint, map_location=args.device))
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [18]:
from linevul_helpers import TextDataset, convert_examples_to_features

class ExtendTextDataset(TextDataset):
    """``TextDataset`` subclass whose examples come from a DataFrame.

    The parent initialiser is not called; ``self.examples`` is filled here
    directly from the ``processed_func`` column of ``data_frame``.
    """

    def __init__(self, tokenizer, args, data_frame):
        sources = data_frame["processed_func"].tolist()
        # The constant 1 is passed as the second argument for every function
        # (presumably the label -- confirm against linevul_helpers).
        self.examples = [
            convert_examples_to_features(source, 1, tokenizer, args)
            for source in tqdm(sources, desc='ExtendTextDataset')
        ]

In [19]:
# multi-gpu evaluate
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [20]:
# locate the true positives (label 1 predicted as 1)
def find_tp(model, tokenizer, args, data_frame=None):
    """Run prediction and report which samples are true positives.

    Args:
        model: model forwarded to ``linevul_predict``.
        tokenizer: tokenizer used to build the dataset.
        args: settings object; ``eval_batch_size`` and ``device`` are read here.
        data_frame: optional DataFrame; when given the dataset is an
            ``ExtendTextDataset`` built from it, otherwise the ``'test'``
            ``TextDataset`` is used.

    Returns:
        ``(result, tp_indices)`` where ``result`` is the first value returned
        by ``linevul_predict`` and ``tp_indices`` lists the positions at which
        the label equals the prediction and the label is 1.
    """
    if data_frame is None:
        dataset = TextDataset(tokenizer, args, file_type='test')
    else:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)

    data_loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.eval_batch_size,
        num_workers=0,
    )
    result, y_trues, y_preds = linevul_predict(model, data_loader, args.device)

    is_true_positive = (y_trues == y_preds) & (y_trues == 1)
    hit_positions = np.where(is_true_positive)[0]
    return result, list(hit_positions)
    
# result, correct_indices = find_tp(model, tokenizer, args)

In [21]:
def explain(model, tokenizer, explain_indices, data_frame=None):
    """Extract per-line attention scores for the selected samples.

    The notebook-level ``args`` object is used to build the dataset and to
    pick the device the inputs are moved to.

    Args:
        model: called as ``model(input_ids=..., output_attentions=True)`` and
            expected to return ``(prob, attentions)``; it is switched to eval
            mode here.
        tokenizer: tokenizer used to build the dataset and to map ids back to
            tokens.
        explain_indices: positions (in dataset order) of the samples to
            explain.
        data_frame: optional DataFrame; when given the dataset is an
            ``ExtendTextDataset`` built from it, otherwise the ``'test'``
            ``TextDataset`` is used.

    Returns:
        A list of ``(sample_idx, lines_with_score, n_lines)`` tuples, one per
        selected sample, in dataset order.
    """
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    else:
        dataset = TextDataset(tokenizer, args, file_type='test')
    sampler = SequentialSampler(dataset)
    # batch_size=1 with a sequential sampler: batch position == sample index
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=0)
    model.eval()

    # set membership is O(1); a list would be scanned once per sample
    wanted = set(explain_indices)
    extract_list = []
    for index, mini_batch in enumerate(tqdm(data_loader, total=len(data_loader))):
        if index not in wanted:
            continue
        input_ids, _labels = mini_batch
        ids = input_ids[0].detach().tolist()
        # In the byte-level BPE vocabulary "Ġ" stands for a space, "ĉ" for a
        # tab and "Ċ" for a newline: drop the space marker and turn tab
        # markers into newline markers.
        all_tokens = [
            token.replace("Ġ", "").replace("ĉ", "Ċ")
            for token in tokenizer.convert_ids_to_tokens(ids)
        ]
        # The DataLoader yields CPU tensors; move them to the model's device.
        # Without this the forward pass only works when a wrapper such as
        # DataParallel transfers the inputs itself.
        input_ids = input_ids.to(args.device)
        with torch.no_grad():
            _prob, attentions = model(input_ids=input_ids, output_attentions=True)
        lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
        extract_list.append((index, lines_with_score, n_lines))
    return extract_list

# extract_list = explain(model, tokenizer, correct_indices)

# Root Cause

In [22]:
attack_vector = load_dataset(args.task)

In [23]:
attack_vector_test = attack_vector['test']

In [24]:
attack_vector_test

Dataset({
    features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
    num_rows: 954
})

In [25]:
attack_vector_test_df = attack_vector_test.to_pandas()
attack_vector_test_df

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2013-1796,does not ensure a required time_page alignment...,"int kvm_set_msr_common(struct kvm_vcpu *vcpu, ...","int kvm_set_msr_common(struct kvm_vcpu *vcpu, ..."
1,CVE-2017-13009,has a buffer over-read,"mobility_print(netdissect_options *ndo, const...","mobility_print(netdissect_options *ndo, const ..."
2,CVE-2016-7798,applying the same attack one would use in a tw...,"ossl_cipher_pkcs5_keyivgen(int argc, VALUE *ar...","ossl_cipher_pkcs5_keyivgen(int argc, VALUE *ar..."
3,CVE-2015-8126,failed to check for an out-of-range palette,"png_set_text_2(png_structp png_ptr, png_infop ...","png_set_text_2(png_structp png_ptr, png_infop ..."
4,CVE-2012-3552,improper synchronization,static struct sock * tcp_v6_syn_recv_sock(stru...,static struct sock *tcp_v6_syn_recv_sock(struc...
...,...,...,...,...
949,CVE-2016-9559,a null pointer,"static void TIFFGetProperties(TIFF *tiff,Image...","static void TIFFGetProperties(TIFF *tiff, Imag..."
950,CVE-2012-5148,does not properly validate file names,void BrowserCommandController::RemoveIntersti...,void BrowserCommandController::RemoveInterstit...
951,CVE-2018-14036,insufficient path check in user_change_icon_fi...,user_change_icon_file_authorized_cb (Daemon *d...,user_change_icon_file_authorized_cb(Daemon *da...
952,CVE-2018-1000880,improper input validation,"_warc_read(struct archive_read *a, const void ...","_warc_read(struct archive_read *a, const void ..."


In [26]:
result, explain_indices = find_tp(model, tokenizer, args, attack_vector_test_df)

ExtendTextDataset:   0%|          | 0/954 [00:00<?, ?it/s]



In [27]:
result, len(explain_indices)

({'test_accuracy': 0.07337526205450734,
  'test_recall': 0.07337526205450734,
  'test_precision': 1.0,
  'test_f1': 0.13671875,
  'test_threshold': 0.5},
 70)

In [28]:
# (sample_idx, lines, n_lines)
extract_list_attack_vector = explain(model, tokenizer, explain_indices, attack_vector_test_df)

ExtendTextDataset:   0%|          | 0/954 [00:00<?, ?it/s]

  0%|          | 0/954 [00:00<?, ?it/s]

In [29]:
def extract_top(top_k, extracted_list):
    """Keep the leading lines of every explained sample and join them.

    Args:
        top_k: fraction of a sample's lines to keep; at least three lines are
            always requested.
        extracted_list: ``[(index, lines, n_lines)]`` where ``lines`` is
            already sorted and each item is ``(line_idx, content, score)``.

    Returns:
        A list of ``(index, text)`` pairs; ``text`` is the content of the kept
        lines joined with newlines.
    """
    summaries = []
    for record in extracted_list:
        sample_idx, ranked_lines, n_lines = record[0], record[1], record[2]
        keep = max(3, int(top_k * n_lines))  # never fewer than three lines
        contents = [entry[1] for entry in ranked_lines[:keep]]
        summaries.append((sample_idx, '\n'.join(contents)))
    return summaries

In [30]:
sub_infos = extract_top(0.1, extract_list_attack_vector)
sub_infos[:3]

[(14,
  '\\if(tile_image!=(Image*)NULL)\\tile_image=DestroyImage(tile_image);\n(void)LogMagickEvent(TraceEvent,GetMagickModule(),"%s",\nstatus=OpenBlob(image_info,image,ReadBinaryBlobMode,exception);'),
 (19,
  '//policy->SetTokenLevel(sandbox::USER_RESTRICTED_SAME_ACCESS,\n//policy->SetDelayedIntegrityLevel(sandbox::INTEGRITY_LEVEL_LOW);\n//cmd_line->HasSwitch(switches::kReduceGpuSandbox)){'),
 (24,
  'size_tbits_allocated,bytes_per_pixel,colors,depth,height,length,mask,\nstatus=OpenBlob(image_info,image,ReadBinaryBlobMode,exception);\nint*bluemap,datum,*greenmap,*graymap,index,*redmap;')]

In [31]:
# Pair every top-line excerpt with the reference explanation and the original
# function taken from the same row of the test DataFrame.
explain_features = []
for sample_idx, top_lines in sub_infos:
    source_row = attack_vector_test_df.iloc[sample_idx]
    record = {
        'func_before': top_lines,  # named func_before to fit the aspect model input
        'explain': source_row['explain'],
        'orginal_fun': source_row['func_before'],
    }
    explain_features.append(record)
explain_features_df = pd.DataFrame.from_records(explain_features)

In [32]:
explain_features_dataset = Dataset.from_pandas(explain_features_df)

In [51]:
from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class AspectArgs:
    """Settings for the explanation (seq2seq) model evaluated below.

    Every attribute carries a type annotation so that ``@dataclass`` really
    registers it as a field: the values show up in ``repr(aspect_args)`` and
    can be overridden per instance, e.g. ``AspectArgs(batch_size=8)``.
    ``AspectArgs()`` without arguments yields the same values as before.
    """
    # directory the tokenizer and model are loaded from, derived from the task
    model_name: str = f'tf_board/{args.task}/'
    num_proc: int = 4
    batch_size: int = 5
    max_src_length: int = 1200
    max_des_length: int = 146
    # a mutable default has to be supplied through default_factory once the
    # attribute is a real dataclass field; each instance gets its own list
    data_cols: List[str] = field(
        default_factory=lambda: ['func_before', 'explain', 'orginal_fun'])
    save_dir: str = 'tf_board'
    epochs: int = 100
    grad_acc_steps: int = 4
    lr: float = 5e-5
    log_freq: int = 10
    local_rank: int = -1
    deepspeed: Any = None
    fp16: bool = False
    lr_warmup_steps: int = 200
    weight_decay: float = 0.05
    task: str = args.task

aspect_args = AspectArgs()

In [52]:
from transformers import AutoTokenizer
codet5p_tokenizer = AutoTokenizer.from_pretrained(aspect_args.model_name)

In [53]:
import torch
from transformers import AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

explain_model = AutoModelForSeq2SeqLM.from_pretrained(aspect_args.model_name).to(device)

In [54]:
def preprocess_function(examples):
    """Tokenize one batch of examples for the explanation model.

    Args:
        examples: batch mapping with the columns ``"func_before"`` (model
            input) and ``"explain"`` (reference text).

    Returns:
        A dict with ``"input_ids"`` and ``"attention_mask"`` of the tokenized
        input and ``"labels"`` holding the token ids of the reference, all
        padded/truncated to the lengths configured in ``aspect_args``.
    """
    def _encode(texts, max_length):
        # NOTE(review): every element is joined with ' '; when the elements
        # are plain strings this puts a space between all characters --
        # confirm this matches how the model was trained before changing it.
        spaced = [' '.join(text) for text in texts]
        return codet5p_tokenizer(spaced, max_length=max_length, padding="max_length", truncation=True)

    encoded_source = _encode(examples["func_before"], aspect_args.max_src_length)
    encoded_target = _encode(examples["explain"], aspect_args.max_des_length)

    return {
        "input_ids": encoded_source["input_ids"],
        "attention_mask": encoded_source["attention_mask"],
        "labels": encoded_target["input_ids"].copy(),
    }


tokenized_ds = explain_features_dataset.map(
  preprocess_function,
  remove_columns=aspect_args.data_cols,
  batched=True,
  num_proc=aspect_args.num_proc,
  batch_size=aspect_args.batch_size)

tokenized_ds

Map (num_proc=4):   0%|          | 0/70 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 70
})

In [55]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  codet5p_tokenizer,
  model=explain_model,
  return_tensors="pt")

In [56]:
# Batches of tokenized samples for generation; the collator turns the
# per-sample lists into padded tensors.
sample_dataloader = DataLoader(
  tokenized_ds.with_format("torch"),
  collate_fn=data_collator,
  batch_size=30,
  num_workers=aspect_args.num_proc)

rouge_list = []
gens = []        # generated token-id sequences, one tensor per sample
references = []  # reference token-id sequences, one tensor per sample

for batch in sample_dataloader:
    with torch.no_grad():
        preds = explain_model.generate(
          batch["input_ids"].to(device),
          # The inputs are padded to max_length, so hand over the padding
          # mask explicitly instead of relying on generate() to infer it
          # from the pad token id.
          attention_mask=batch["attention_mask"].to(device),
          max_length=aspect_args.max_des_length,
        ).cpu()
    labels = batch["labels"].cpu()
    # extending a list with a 2-D tensor appends one row per sample
    gens.extend(preds)
    references.extend(labels)

In [57]:
from commons import clean_generated_str

In [58]:
gens = codet5p_tokenizer.batch_decode(gens, skip_special_tokens=True)
references = codet5p_tokenizer.batch_decode(references, skip_special_tokens=True)

gens = [clean_generated_str(i) for i in gens]
references = [clean_generated_str(i) for i in references]

In [59]:
import evaluate

rouge_metric = evaluate.load("rouge")

results = rouge_metric.compute(predictions=gens, references=references)
results



{'rouge1': 0.3512394012935808,
 'rouge2': 0.3275109320504057,
 'rougeL': 0.34988929714935907,
 'rougeLsum': 0.3512686545603396}

In [60]:
gens[10]

'creating/chowning files inside user owned directories'

In [61]:
references[10]

'has a buffer over-read'