In [2]:
%load_ext autoreload
%autoreload 2

In [17]:
import torch
import numpy as np
import pandas as pd
from project_dataset import load_dataset
from dataclasses import dataclass
from tqdm.autonotebook import tqdm
from torch.utils.data import DataLoader, SequentialSampler
from datasets import Dataset, DatasetDict

In [4]:
@dataclass
class Args:
    """Runtime configuration shared by the model wrapper and the evaluation helpers.

    Every setting is an annotated dataclass field, so it shows up in
    ``repr(args)`` and can be overridden per instance, e.g.
    ``Args(eval_batch_size=64)``. Defaults are evaluated once, when the class
    is defined.
    """
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Number of CUDA devices reported by torch; checked before wrapping the model in DataParallel.
    n_gpu: int = torch.cuda.device_count()
    use_word_level_tokenizer: bool = False
    # NOTE(review): presumably the maximum token length per example -- confirm in linevul_helpers.
    block_size: int = 512
    # Batch size of the DataLoader built in find_tp.
    eval_batch_size: int = 512
    num_attention_heads: int = 12
    # Passed to linevul_predict as `threshold` by find_tp.
    decision_threshold: float = 0.00001
args = Args()

In [5]:
from transformers import logging
# Silence everything below CRITICAL from transformers; the named constant
# replaces the bare numeric level 50.
logging.set_verbosity(logging.CRITICAL)

In [6]:
# Load model directly
from linevul_model import Model
from transformers import (RobertaTokenizer, 
                          RobertaForSequenceClassification, 
                          RobertaConfig)

# Single place to change the pretrained checkpoint used for config, tokenizer and encoder.
PRETRAINED_NAME = 'microsoft/codebert-base'

config = RobertaConfig.from_pretrained(PRETRAINED_NAME)
config.num_labels = 1
# Take the head count from Args so it is configured in exactly one place.
config.num_attention_heads = args.num_attention_heads

tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_NAME)
encoder = RobertaForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    config=config,
    ignore_mismatched_sizes=True,
).to(args.device)
# Project wrapper around the encoder; its weights are loaded from a checkpoint in the next cell.
linevul_model = Model(encoder, config, tokenizer, args)

In [7]:
checkpoint = '12heads_linevul_model.bin'
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
state_dict = torch.load(checkpoint, map_location=args.device)
# strict=False tolerates key mismatches; report them so a checkpoint that does not
# fit the model is noticed instead of leaving weights silently un-loaded.
load_report = linevul_model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    print("Model keys missing from checkpoint:", load_report.missing_keys)
    print("Checkpoint keys not used by model:", load_report.unexpected_keys)
linevul_model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [8]:
from linevul_helpers import TextDataset, convert_examples_to_features
from linevul_extra import extract_line_attention, linevul_predict

class ExtendTextDataset(TextDataset):
    """TextDataset variant that builds its examples from an in-memory DataFrame.

    The parent ``__init__`` is not called; only ``self.examples`` is populated.

    Args:
        tokenizer: forwarded to ``convert_examples_to_features``.
        args: forwarded to ``convert_examples_to_features``.
        data_frame: pandas DataFrame with a ``processed_func`` column.
    """
    def __init__(self, tokenizer, args, data_frame):
        # as data has vulnerable only, every example is given the label 1
        # Iterating the column directly avoids the per-row Series that iterrows() builds.
        self.examples = [
            convert_examples_to_features(processed_func, 1, tokenizer, args)
            for processed_func in data_frame['processed_func']
        ]

# to find TP
def find_tp(model, tokenizer, args, data_frame=None):
    """Run prediction over a dataset and locate the true-positive samples.

    Args:
        model: model handed to ``linevul_predict``.
        tokenizer: tokenizer used to build the dataset.
        args: configuration object; ``eval_batch_size``, ``device`` and
            ``decision_threshold`` are read here.
        data_frame: optional DataFrame; when given, an ``ExtendTextDataset`` is
            built from it, otherwise the ``TextDataset`` with ``file_type='test'``.

    Returns:
        ``(result, tp_indices)``: the first value returned by
        ``linevul_predict`` and the list of positions where label and
        prediction are both 1.
    """
    if data_frame is None:
        eval_set = TextDataset(tokenizer, args, file_type='test')
    else:
        eval_set = ExtendTextDataset(tokenizer, args, data_frame)

    # Sequential sampling keeps returned positions aligned with dataset order.
    loader = DataLoader(
        eval_set,
        sampler=SequentialSampler(eval_set),
        batch_size=args.eval_batch_size,
        num_workers=0,
    )
    metrics, labels, predictions = linevul_predict(
        model, loader, args.device, threshold=args.decision_threshold
    )

    is_true_positive = (labels == predictions) & (labels == 1)
    return metrics, list(np.where(is_true_positive)[0])


def explain(model, tokenizer, explain_indices, data_frame=None): 
    """Extract line-level attention scores for the selected samples.

    Reads the module-level ``args`` to build the dataset.

    Args:
        model: called as ``model(input_ids=..., output_attentions=True)`` and
            expected to return ``(prob, attentions)``.
        tokenizer: used to build the dataset and to map ids back to tokens.
        explain_indices: iterable of dataset positions to explain; all other
            samples are skipped.
        data_frame: optional DataFrame; when given, an ``ExtendTextDataset`` is
            built from it, otherwise the ``TextDataset`` with ``file_type='test'``.

    Returns:
        List of ``(sample_idx, lines, n_lines)`` tuples in dataset order, where
        ``lines`` and ``n_lines`` come from ``extract_line_attention``.
    """
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    else:
        dataset = TextDataset(tokenizer, args, file_type='test')
    sampler = SequentialSampler(dataset)
    # batch_size=1 so each mini-batch corresponds to exactly one dataset position.
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=0)
    # A set gives O(1) membership checks inside the loop.
    wanted = set(explain_indices)
    model.eval()
    extract_list = []
    progress_bar = tqdm(data_loader, total=len(data_loader))
    for index, mini_batch in enumerate(progress_bar):
        if index not in wanted:
            continue
        input_ids, _ = mini_batch
        ids = input_ids[0].detach().tolist()
        # Strip the "Ġ" marker and rewrite "ĉ" to "Ċ" before line attribution.
        # NOTE(review): presumably extract_line_attention splits lines on "Ċ" -- confirm.
        all_tokens = [
            token.replace("Ġ", "").replace("ĉ", "Ċ")
            for token in tokenizer.convert_ids_to_tokens(ids)
        ]
        with torch.no_grad():
            _, attentions = model(input_ids=input_ids, output_attentions=True)
        lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
        extract_list.append((index, lines_with_score, n_lines))
    return extract_list

def extract_linevul_ranking(linevul_output):
    """Map each sample index to the tuple of first elements of its line entries.

    Args:
        linevul_output: iterable of items where ``item[0]`` is the sample index
            and ``item[1]`` is an iterable of indexable line entries.

    Returns:
        dict ``{sample_idx: (entry[0], ...)}`` preserving the order of the
        entries; a repeated index keeps the ranking of its last occurrence.
    """
    return {
        sample[0]: tuple(entry[0] for entry in sample[1])
        for sample in linevul_output
    }

In [9]:
# multi-gpu evaluate
# The isinstance guard keeps this cell idempotent: re-running it must not wrap an
# already wrapped model in a second DataParallel layer.
if args.n_gpu > 1 and not isinstance(linevul_model, torch.nn.DataParallel):
    linevul_model = torch.nn.DataParallel(linevul_model)

## attack_vector

In [26]:
task = 'attack_vector'
dataset = load_dataset(task)

# For every split: find the true positives, rank their lines by attention, and
# store the ranking as a new column before converting back to a Dataset.
data_dict = {}
for dset in ['train', 'test', 'validation']:
    workind_data = dataset[dset].to_pandas()
    result, tp_indices = find_tp(linevul_model, tokenizer, args, workind_data)
    print(f"TP result in {dset}:", result)
    linevul_output = explain(linevul_model, tokenizer, tp_indices, workind_data)
    linevul_ranking = extract_linevul_ranking(linevul_output)
    # Only true positives are explained, so other rows have no entry; give them
    # an empty ranking instead of failing with KeyError.
    workind_data['linevul_ranking'] = [linevul_ranking.get(idx, ()) for idx in range(len(workind_data))]
    data_dict[dset] = Dataset.from_pandas(workind_data)

new_dataset = DatasetDict(data_dict)
new_dataset.save_to_disk(f"./aspect_bigvul_new/dataset_{task}")

TP result in train: {'test_accuracy': 1.0, 'test_recall': 1.0, 'test_precision': 1.0, 'test_f1': 1.0, 'test_threshold': 1e-05}


  0%|          | 0/4858 [00:00<?, ?it/s]

TP result in test: {'test_accuracy': 1.0, 'test_recall': 1.0, 'test_precision': 1.0, 'test_f1': 1.0, 'test_threshold': 1e-05}


  0%|          | 0/1350 [00:00<?, ?it/s]

TP result in validation: {'test_accuracy': 1.0, 'test_recall': 1.0, 'test_precision': 1.0, 'test_f1': 1.0, 'test_threshold': 1e-05}


  0%|          | 0/540 [00:00<?, ?it/s]

# root_cause

In [None]:
task = 'root_cause'
dataset = load_dataset(task)

# For every split: find the true positives, rank their lines by attention, and
# store the ranking as a new column before converting back to a Dataset.
data_dict = {}
for dset in ['train', 'test', 'validation']:
    workind_data = dataset[dset].to_pandas()
    result, tp_indices = find_tp(linevul_model, tokenizer, args, workind_data)
    print(f"TP result in {dset}:", result)
    linevul_output = explain(linevul_model, tokenizer, tp_indices, workind_data)
    linevul_ranking = extract_linevul_ranking(linevul_output)
    # Only true positives are explained, so other rows have no entry; give them
    # an empty ranking instead of failing with KeyError.
    workind_data['linevul_ranking'] = [linevul_ranking.get(idx, ()) for idx in range(len(workind_data))]
    data_dict[dset] = Dataset.from_pandas(workind_data)

new_dataset = DatasetDict(data_dict)
new_dataset.save_to_disk(f"./aspect_bigvul_new/dataset_{task}")

In [None]:
# NOTE(review): this cell repeats the 'root_cause' task of the preceding cell and
# writes to the same output path -- confirm whether a different task was intended
# or whether the cell can be removed.
task = 'root_cause'
dataset = load_dataset(task)

data_dict = {}
for dset in ['train', 'test', 'validation']:
    workind_data = dataset[dset].to_pandas()
    result, tp_indices = find_tp(linevul_model, tokenizer, args, workind_data)
    print(f"TP result in {dset}:", result)
    linevul_output = explain(linevul_model, tokenizer, tp_indices, workind_data)
    linevul_ranking = extract_linevul_ranking(linevul_output)
    # Only true positives are explained, so other rows have no entry; give them
    # an empty ranking instead of failing with KeyError.
    workind_data['linevul_ranking'] = [linevul_ranking.get(idx, ()) for idx in range(len(workind_data))]
    data_dict[dset] = Dataset.from_pandas(workind_data)

new_dataset = DatasetDict(data_dict)
new_dataset.save_to_disk(f"./aspect_bigvul_new/dataset_{task}")

# vulnerability_impact

In [None]:
task = 'impact'
dataset = load_dataset(task)

# For every split: find the true positives, rank their lines by attention, and
# store the ranking as a new column before converting back to a Dataset.
data_dict = {}
for dset in ['train', 'test', 'validation']:
    workind_data = dataset[dset].to_pandas()
    result, tp_indices = find_tp(linevul_model, tokenizer, args, workind_data)
    print(f"TP result in {dset}:", result)
    linevul_output = explain(linevul_model, tokenizer, tp_indices, workind_data)
    linevul_ranking = extract_linevul_ranking(linevul_output)
    # Only true positives are explained, so other rows have no entry; give them
    # an empty ranking instead of failing with KeyError.
    workind_data['linevul_ranking'] = [linevul_ranking.get(idx, ()) for idx in range(len(workind_data))]
    data_dict[dset] = Dataset.from_pandas(workind_data)

new_dataset = DatasetDict(data_dict)
new_dataset.save_to_disk(f"./aspect_bigvul_new/dataset_{task}")

# vulnerability_type

In [None]:
task = 'vulnerability_type'
dataset = load_dataset(task)

# For every split: find the true positives, rank their lines by attention, and
# store the ranking as a new column before converting back to a Dataset.
data_dict = {}
for dset in ['train', 'test', 'validation']:
    workind_data = dataset[dset].to_pandas()
    result, tp_indices = find_tp(linevul_model, tokenizer, args, workind_data)
    print(f"TP result in {dset}:", result)
    linevul_output = explain(linevul_model, tokenizer, tp_indices, workind_data)
    linevul_ranking = extract_linevul_ranking(linevul_output)
    # Only true positives are explained, so other rows have no entry; give them
    # an empty ranking instead of failing with KeyError.
    workind_data['linevul_ranking'] = [linevul_ranking.get(idx, ()) for idx in range(len(workind_data))]
    data_dict[dset] = Dataset.from_pandas(workind_data)

new_dataset = DatasetDict(data_dict)
new_dataset.save_to_disk(f"./aspect_bigvul_new/dataset_{task}")