In [1]:
from captum.attr import LayerIntegratedGradients

In [2]:
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [3]:
# The transformers library emits many warnings; they are too verbose for this
# notebook, so raise its log threshold so that only errors are reported.

from transformers import logging
logging.set_verbosity_error()  # same level as the numeric value 40 (ERROR)

In [17]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

In [25]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [5]:
from linevul_model import Model
from linevul_helpers import TextDataset

In [6]:
# Count the CUDA devices torch reports, and pick the GPU when CUDA is usable,
# falling back to the CPU otherwise.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# Start from the configuration published under 'microsoft/codebert-base', then
# override two fields on the loaded object before it is passed to the models
# created further down.
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
# NOTE(review): the evaluation cell reads column 1 of the model output, so the
# head used there must emit at least two columns despite num_labels = 1 --
# presumably linevul_model.Model defines its own head; confirm.
config.num_labels = 1
# Presumably matches the "12heads" checkpoint loaded later -- confirm.
config.num_attention_heads = 12

In [8]:
# Weights file produced by LineVul (its saved_models/checkpoint-best-f1 folder);
# it is loaded into `model` further down.
# The default is the original author's absolute path. Set the
# LINEVUL_CHECKPOINT environment variable to run the notebook on another
# machine without editing this cell.
import os

checkpoint = os.environ.get(
    'LINEVUL_CHECKPOINT',
    '/home/hqn650/LineVul/linevul/saved_models/checkpoint-best-f1/12heads_linevul_model.bin',
)

In [9]:
# Tokenizer loaded from the same 'microsoft/codebert-base' identifier that the
# config and the pretrained weights in the neighbouring cells use.
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [10]:
# Pretrained CodeBERT network instantiated with the customised `config`.
# ignore_mismatched_sizes=True lets loading continue when a checkpoint tensor's
# shape differs from the model built from `config`, instead of raising.
pre_train = RobertaForSequenceClassification.from_pretrained(
    'microsoft/codebert-base',
    config=config,
    ignore_mismatched_sizes=True,
)

In [19]:
from dataclasses import dataclass


@dataclass
class Args:
    """Run settings handed to ``Model`` and ``TextDataset`` as ``args``.

    Every attribute carries a type annotation so that ``@dataclass`` treats it
    as a real field: values can be overridden per instance
    (``Args(eval_batch_size=64)``) and show up in the repr. Without the
    annotations the decorator sees no fields at all and ``Args`` only accepts
    a bare ``Args()`` call.

    Within this notebook ``device`` is used to place the model and batches,
    ``n_gpu`` decides whether the model is wrapped in ``DataParallel``, and
    ``eval_batch_size`` sizes the test ``DataLoader``. The remaining fields are
    consumed inside ``linevul_model`` / ``linevul_helpers``, which are not
    visible here.
    """

    # Evaluated once, when the class body runs.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu: int = torch.cuda.device_count()
    use_non_pretrained_model: bool = False
    block_size: int = 512
    # Absolute path on the original author's machine; override per instance
    # with Args(test_data_file=...) when running elsewhere.
    test_data_file: str = '/home/hqn650/LineVul/data/big-vul_dataset/test.csv'
    code_length: int = 256
    do_local_explanation: bool = True
    reasoning_method: str = 'attention'
    seed: int = 42
    num_attention_heads: int = 12
    do_sorting_by_line_scores: bool = False
    do_sorting_by_pred_prob: bool = False
    top_k_constant: int = 10
    use_word_level_tokenizer: bool = False
    eval_batch_size: int = 512


args = Args()

In [12]:
# Build the Model class imported from linevul_model, giving it the pretrained
# network, its config, the tokenizer and the run settings.
model = Model(
    pre_train,
    config,
    tokenizer,
    args,
)

In [13]:
# Read the saved state dict, mapping its tensors onto args.device, and copy it
# into `model`.
# NOTE(review): torch.load unpickles the file -- only point `checkpoint` at
# files from a trusted source.
model.load_state_dict(
    torch.load(checkpoint, map_location=args.device)
)
# Kept as the cell's last expression, so the notebook prints the module tree.
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [14]:
# Build the test split with TextDataset from linevul_helpers.
# Presumably it reads args.test_data_file -- confirm in linevul_helpers.
test_dataset = TextDataset(
    tokenizer,
    args,
    file_type='test',
)

  0%|          | 0/18864 [00:00<?, ?it/s]

In [15]:
# Cut-off applied to column 1 of the model output when turning scores into
# binary predictions in the evaluation cell.
best_threshold = 0.5

In [20]:
# Walk the test set in its stored order, args.eval_batch_size examples at a
# time, loading in the main process (num_workers=0).
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset,
    sampler=test_sampler,
    batch_size=args.eval_batch_size,
    num_workers=0,
)

In [21]:
# Multi-GPU evaluation: when args.n_gpu reports more than one CUDA device,
# rebind `model` to a torch.nn.DataParallel wrapper around it.
# NOTE(review): re-running this cell on a multi-GPU machine wraps the already
# wrapped model a second time.
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [26]:
# Evaluate `model` on the test dataloader and collect classification metrics.
model.eval()
nb_eval_steps = 0
logits, y_trues = [], []

# Gradients are not needed for evaluation, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in test_dataloader:
        # Each batch unpacks into (input ids, labels); move both to args.device.
        inputs_ids, labels = (x.to(args.device) for x in batch)
        # The model returns a (loss, output) pair; only the output is kept.
        lm_loss, logit = model(input_ids=inputs_ids, labels=labels)
        logits.append(logit.cpu().numpy())
        y_trues.append(labels.cpu().numpy())
        nb_eval_steps += 1

# Stack the per-batch arrays along the example axis.
logits = np.concatenate(logits, axis=0)
y_trues = np.concatenate(y_trues, axis=0)

# Column 1 is compared against the threshold to obtain binary predictions.
# Assumes column 1 holds the positive-class score -- TODO confirm against
# linevul_model.Model.
y_preds = logits[:, 1] > best_threshold

# Compute the scores in the same order as before: accuracy, recall, precision, F1.
acc, recall, precision, f1 = (
    score(y_trues, y_preds)
    for score in (accuracy_score, recall_score, precision_score, f1_score)
)

result = {
    "test_accuracy": float(acc),
    "test_recall": float(recall),
    "test_precision": float(precision),
    "test_f1": float(f1),
    "test_threshold": best_threshold,
}

