In [1]:
from captum.attr import LayerIntegratedGradients

In [2]:
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [3]:
# The transformers library emits a number of warnings while the model is loaded.
# They are too verbose for this notebook, so only errors (and above) are logged.

from transformers import logging
logging.set_verbosity_error()

In [17]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

In [25]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [38]:
import pandas as pd
from tqdm.autonotebook import tqdm

In [5]:
from linevul_model import Model
from linevul_helpers import TextDataset

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to this process.
n_gpu = torch.cuda.device_count()

In [7]:
# Start from the published CodeBERT configuration and override the two fields
# this notebook sets explicitly.
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 1
config.num_attention_heads = 12

In [8]:
# Path to the fine-tuned weights obtained from LineVul.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook elsewhere.
checkpoint = '/home/hqn650/LineVul/linevul/saved_models/checkpoint-best-f1/12heads_linevul_model.bin'

In [9]:
# Tokenizer loaded from the same CodeBERT identifier as the configuration above.
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [10]:
# Load the CodeBERT weights into a sequence-classification model built from the
# customised `config`. `ignore_mismatched_sizes=True` asks transformers not to
# fail when a checkpoint tensor has a different shape than the configured model.
pre_train = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', 
                                                             config=config, 
                                                             ignore_mismatched_sizes=True)

In [19]:
from dataclasses import dataclass

@dataclass
class Args:
    """Run-time options handed to `Model`, `TextDataset` and the explanation code.

    Every attribute carries a type annotation so that `@dataclass` registers it
    as a real field: individual options can then be overridden at construction
    time (e.g. `Args(reasoning_method="lig")`) and show up in the repr, while
    `Args()` keeps yielding exactly the defaults below.
    """
    # Device / parallelism, resolved once when the class body is executed.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu: int = torch.cuda.device_count()
    use_non_pretrained_model: bool = False
    block_size: int = 512
    # NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
    test_data_file: str = '/home/hqn650/LineVul/data/big-vul_dataset/test.csv'
    code_length: int = 256
    do_local_explanation: bool = True
    # Selects the branch taken inside line_level_localization_tp.
    reasoning_method: str = 'attention'
    seed: int = 42
    num_attention_heads: int = 12
    do_sorting_by_line_scores: bool = False
    do_sorting_by_pred_prob: bool = False
    # k used for the top-k localisation accuracy.
    top_k_constant: int = 10
    use_word_level_tokenizer: bool = False
    # Batch size of the test DataLoader.
    eval_batch_size: int = 512

args = Args()

In [12]:
# Build the LineVul `Model` (imported from linevul_model) from the pretrained
# classifier, its configuration, the tokenizer and the run-time options.
model = Model(pre_train, config, tokenizer, args)

In [13]:
# Restore the fine-tuned weights, mapping the tensors onto the selected device,
# then move the whole model there.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(checkpoint, map_location=args.device))
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [14]:
# Build the test split with the project's TextDataset helper (imported from linevul_helpers).
test_dataset = TextDataset(tokenizer, args, file_type='test')

  0%|          | 0/18864 [00:00<?, ?it/s]

In [15]:
# Decision threshold: the evaluation cell marks a sample as positive when its
# class-1 score is strictly greater than this value.
best_threshold=0.5

In [20]:
# Iterate the test set in its stored order so that positions in the prediction
# arrays can later be used as dataset indices.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size, num_workers=0)

In [21]:
# Multi-GPU evaluation: replicate the model across devices when more than one GPU is visible.
# NOTE(review): after wrapping, the original model is reachable as `model.module`;
# code that accesses attributes such as `model.encoder` directly may need adjusting.
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [26]:
# Score the whole test set once, without tracking gradients, and keep the raw
# model outputs together with the ground-truth labels.
model.eval()
nb_eval_steps = 0
logits = []
y_trues = []
with torch.no_grad():
    for batch in test_dataloader:
        inputs_ids, labels = (tensor.to(args.device) for tensor in batch)
        lm_loss, logit = model(input_ids=inputs_ids, labels=labels)
        logits.append(logit.cpu().numpy())
        y_trues.append(labels.cpu().numpy())
        nb_eval_steps += 1

# Stack the per-batch arrays and threshold the class-1 column to get predictions.
logits = np.concatenate(logits, axis=0)
y_trues = np.concatenate(y_trues, axis=0)
y_preds = logits[:, 1] > best_threshold

# Function-level metrics.
acc = accuracy_score(y_trues, y_preds)
recall = recall_score(y_trues, y_preds)
precision = precision_score(y_trues, y_preds)
f1 = f1_score(y_trues, y_preds)
result = {
    "test_accuracy": float(acc),
    "test_recall": float(recall),
    "test_precision": float(precision),
    "test_f1": float(f1),
    "test_threshold": best_threshold,
}



In [27]:
result

{'test_accuracy': 0.9909351145038168,
 'test_recall': 0.8635071090047394,
 'test_precision': 0.9712153518123667,
 'test_f1': 0.9141996989463121,
 'test_threshold': 0.5}

In [30]:
# Dataset positions where the prediction agrees with the label.
correct_indices = list(np.nonzero(y_trues == y_preds)[0])

In [31]:
# True positives: the prediction agrees with the label and the label is 1.
is_true_positive = (y_trues == y_preds) & (y_trues == 1)
tp_indices = list(np.nonzero(is_true_positive)[0])

In [33]:
# After identifying the true-positive samples, create a new loader for the
# explanation step that yields one sample per batch, in the same sequential order.

dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, num_workers=0)

In [36]:
# Raw test CSV; later cells read its "flaw_line", "flaw_line_index",
# "processed_func" and "func_before" columns.
df = pd.read_csv(args.test_data_file)

In [37]:
# line_level_evaluation iterates over a list of k values, so wrap the single configured k.
top_k_constant = [args.top_k_constant]

In [50]:
def line_level_localization_tp(flaw_lines: str, tokenizer, model, mini_batch, original_func: str, args, top_k_loc: list, top_k_constant: list, reasoning_method: str, index: int, write_invalid_data: bool):
    """Score every line of one sample and evaluate how well the flaw lines are ranked.

    Args:
        flaw_lines: ground-truth flaw lines joined by the "/~/" separator.
        tokenizer: tokenizer used to decode `input_ids` and to encode the flaw lines.
        model: the classifier to explain.
        mini_batch: `(input_ids, labels)` pair holding a single sample.
        original_func: unused; kept so existing callers keep working.
        args: options object; only `args.device` is read here.
        top_k_loc: fractions of lines forwarded to `line_level_evaluation`.
        top_k_constant: absolute k values forwarded to `line_level_evaluation`.
        reasoning_method: one of "attention", "lig", "deeplift", "deeplift_shap",
            "gradient_shap" or "saliency".
        index: dataset index of the sample, forwarded to `line_level_evaluation`.
        write_invalid_data: when True, samples whose flaw lines cannot be found in
            the encoded input are appended to "../invalid_data/invalid_line_lev_data.txt".

    Returns:
        A dict with the evaluation results, or the string "NA" when no flaw line
        can be located in the encoded input.

    Raises:
        ValueError: if a flaw line was verified but `reasoning_method` is not
            supported (this previously surfaced as an UnboundLocalError).
    """
    # Forward function handed to captum's LayerIntegratedGradients.
    def lig_forward(input_ids):
        logits = model(input_ids=input_ids)[0]
        y_pred = 1 # for positive attribution, y_pred = 0 for negative attribution
        pred_prob = logits[y_pred].unsqueeze(-1)
        return pred_prob

    flaw_line_seperator = "/~/"
    (input_ids, labels) = mini_batch
    ids = input_ids[0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(ids)
    # Drop the "Ġ" marker and fold "ĉ" into the newline marker "Ċ".
    all_tokens = [token.replace("Ġ", "") for token in all_tokens]
    all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]

    # Flaw line verification: keep only the ground-truth flaw lines whose encoded
    # form appears in the encoded input.
    flaw_lines = get_all_flaw_lines(flaw_lines=flaw_lines, flaw_line_seperator=flaw_line_seperator)
    flaw_tokens_encoded = encode_all_lines(all_lines=flaw_lines, tokenizer=tokenizer)
    encoded_all = ''.join(all_tokens)
    verified_flaw_lines = [encoded for encoded in flaw_tokens_encoded if ''.join(encoded) in encoded_all]

    # Nothing to explain if no flaw line exists in the encoded input.
    if not verified_flaw_lines:
        if write_invalid_data:
            with open("../invalid_data/invalid_line_lev_data.txt", "a") as f:
                f.write("--- ALL TOKENS ---")
                f.write("\n")
                for tok in encoded_all.split("Ċ"):
                    f.write(tok)
                    f.write("\n")
                f.write("--- FLAW ---")
                f.write("\n")
                for encoded in flaw_tokens_encoded:
                    f.write(''.join(encoded))
                    f.write("\n")
                f.write("\n")
                f.write("\n")
        return "NA"

    if reasoning_method == "attention":
        input_ids = input_ids.to(args.device)
        prob, attentions = model(input_ids=input_ids, output_attentions=True)
        # NOTE(review): if `attentions` is the per-layer tuple of
        # (batch, heads, seq, seq) tensors, [0][0] selects the first entry of the
        # tuple for the only sample in the batch and the loop below walks its
        # first axis - confirm against linevul_model.Model.
        attentions = attentions[0][0]
        attention = None
        for i in range(len(attentions)):
            # Collapse one (seq, seq) matrix along its first axis: one value per token.
            layer_attention = sum(attentions[i])
            if attention is None:
                attention = layer_attention
            else:
                attention += layer_attention
        # Zero the scores of <s> and </s>.
        attention = clean_special_token_values(attention, padding=True)
        word_scores = get_word_att_scores(all_tokens=all_tokens, att_scores=attention)
    elif reasoning_method == "lig":
        # NOTE(review): create_ref_input_ids, summarize_attributions and
        # clean_word_attr_scores are not defined in the visible part of this
        # notebook; they must be in scope before this branch is used.
        ref_token_id, sep_token_id, cls_token_id = tokenizer.pad_token_id, tokenizer.sep_token_id, tokenizer.cls_token_id
        ref_input_ids = create_ref_input_ids(input_ids, ref_token_id, sep_token_id, cls_token_id)
        # Send data to device.
        input_ids = input_ids.to(args.device)
        ref_input_ids = ref_input_ids.to(args.device)
        lig = LayerIntegratedGradients(lig_forward, model.encoder.roberta.embeddings)
        attributions, _ = lig.attribute(inputs=input_ids,
                                        baselines=ref_input_ids,
                                        internal_batch_size=32,
                                        return_convergence_delta=True)
        attr_scores = summarize_attributions(attributions).tolist()
        # Each token should have one score.
        assert len(all_tokens) == len(attr_scores)
        word_scores = get_word_att_scores(all_tokens=all_tokens, att_scores=attr_scores)
        # Remove <s>, </s>, <unk>, <pad>.
        word_scores = clean_word_attr_scores(word_attr_scores=word_scores)
    elif reasoning_method in ("deeplift", "deeplift_shap", "gradient_shap", "saliency"):
        # Imported locally because the notebook's import cells only pull in
        # LayerIntegratedGradients from captum.
        from captum.attr import DeepLift, DeepLiftShap, GradientShap, Saliency
        # Send data to device.
        input_ids = input_ids.to(args.device)
        input_embed = model.encoder.roberta.embeddings(input_ids).to(args.device)
        if reasoning_method == "saliency":
            attributions = Saliency(model).attribute(input_embed, target=1)
        else:
            # (explainer class, number of all-zero baselines)
            explainer_cls, num_baselines = {
                "deeplift": (DeepLift, 1),
                "deeplift_shap": (DeepLiftShap, 16),
                "gradient_shap": (GradientShap, 16),
            }[reasoning_method]
            baselines = torch.zeros(num_baselines, 512, 768, requires_grad=True).to(args.device)
            attributions = explainer_cls(model).attribute(input_embed, baselines=baselines, target=1)
        attr_scores = summarize_attributions(attributions).tolist()
        # Each token should have one score.
        assert len(all_tokens) == len(attr_scores)
        word_scores = get_word_att_scores(all_tokens=all_tokens, att_scores=attr_scores)
        # Remove <s>, </s>, <unk>, <pad>.
        word_scores = clean_word_attr_scores(word_attr_scores=word_scores)
    else:
        raise ValueError("unsupported reasoning_method: {!r}".format(reasoning_method))

    # Shared tail: aggregate token scores per line, then evaluate the ranking.
    all_lines_score, flaw_line_indices = get_all_lines_score(word_scores, verified_flaw_lines)
    # Return if no flaw line could be matched to a scored line.
    if len(flaw_line_indices) == 0:
        return "NA"
    (total_lines, num_of_flaw_lines, all_correctly_predicted_flaw_lines,
     min_clean_lines_inspected, max_clean_lines_inspected, all_correctly_localized_func,
     top_10_correct_idx, top_10_not_correct_idx) = line_level_evaluation(
        all_lines_score=all_lines_score, flaw_line_indices=flaw_line_indices,
        top_k_loc=top_k_loc, top_k_constant=top_k_constant,
        true_positive_only=True, index=index)

    results = {"total_lines": total_lines,
               "num_of_flaw_lines": num_of_flaw_lines,
               "all_correctly_predicted_flaw_lines": all_correctly_predicted_flaw_lines,
               "all_correctly_localized_function": all_correctly_localized_func,
               "min_clean_lines_inspected": min_clean_lines_inspected,
               "max_clean_lines_inspected": max_clean_lines_inspected,
               "top_10_correct_idx": top_10_correct_idx,
               "top_10_not_correct_idx": top_10_not_correct_idx}
    return results

In [62]:
def get_all_flaw_lines(flaw_lines: str, flaw_line_seperator: str) -> list:
    """Split a separator-joined string of flaw lines into a list of stripped lines.

    Leading and trailing occurrences of the whole separator are removed first.
    `str.strip(separator)` must not be used for this: it treats its argument as a
    set of characters, so with the "/~/" separator it would also eat the "//" of
    a leading comment or the "/" of a trailing "*/".

    Args:
        flaw_lines: the joined flaw lines; any non-string value (e.g. NaN from
            pandas) is treated as "no flaw lines".
        flaw_line_seperator: the separator placed between flaw lines.

    Returns:
        The individual flaw lines with surrounding whitespace removed, or an
        empty list when `flaw_lines` is not a string.
    """
    if not isinstance(flaw_lines, str):
        return []
    separator = flaw_line_seperator
    # An empty separator is left for str.split to reject, as before.
    if separator:
        while flaw_lines.startswith(separator):
            flaw_lines = flaw_lines[len(separator):]
        while flaw_lines.endswith(separator):
            flaw_lines = flaw_lines[:-len(separator)]
    return [line.strip() for line in flaw_lines.split(separator)]
def encode_all_lines(all_lines: list, tokenizer) -> list:
    """Encode every line with `encode_one_line`, preserving the input order."""
    return [encode_one_line(line=single_line, tokenizer=tokenizer) for single_line in all_lines]
def encode_one_line(line, tokenizer):
    """Tokenize one source line and return its tokens without the "Ġ" marker.

    The line is prefixed with "@ " before tokenizing so that its first word is
    encoded the same way as a word in the middle of a text (e.g. "previous"
    stays one token instead of becoming "pre" + "vious"). Tokens equal to "@"
    are dropped from the result.
    """
    pieces = tokenizer.tokenize("@ " + line)
    cleaned = []
    for piece in pieces:
        if piece == "@":
            continue
        cleaned.append(piece.replace("Ġ", ""))
    return cleaned
def clean_special_token_values(all_values, padding=False):
    """Zero the scores that belong to the leading and trailing special tokens.

    The sequence is modified in place and also returned.

    Args:
        all_values: indexable, mutable sequence of per-token scores.
        padding: when True, the trailing special token is taken to be the last
            non-zero entry, searched from the end so that trailing zeros are
            skipped; when False it is simply the last entry.

    Returns:
        `all_values`, with the first entry and the trailing special-token entry
        set to 0. An empty sequence, or one with no non-zero entry left, is
        returned without raising.
    """
    if len(all_values) == 0:
        return all_values
    # Special token at the beginning of the sequence.
    all_values[0] = 0
    if not padding:
        # Special token at the end of the sequence.
        all_values[-1] = 0
        return all_values
    # Walk backwards and stop at the first non-zero score instead of collecting
    # every non-zero position.
    for idx in range(len(all_values) - 1, -1, -1):
        if all_values[idx] != 0:
            all_values[idx] = 0
            break
    return all_values
def get_word_att_scores(all_tokens: list, att_scores: list) -> list:
    """Pair each token with the score at the same position.

    Returns a list of `[token, score]` lists, one per entry of `all_tokens`.
    """
    return [[all_tokens[position], att_scores[position]] for position in range(len(all_tokens))]

In [64]:
def get_all_lines_score(word_att_scores: list, verified_flaw_lines: list):
    """Aggregate token scores into line scores and locate the flaw lines.

    Args:
        word_att_scores: `[[token, score], ...]` in input order.
        verified_flaw_lines: token lists, one per flaw line; each list is joined
            into a string and compared with the reconstructed line text.

    Returns:
        `(all_lines_score, flaw_line_indices)`: the summed score of every line
        that was closed with a non-zero running score, and the positions in that
        list whose line text equals one of the flaw lines.
    """
    line_breaks = ["Ċ", " Ċ", "ĊĊ", " ĊĊ"]
    flaw_texts = [''.join(tokens) for tokens in verified_flaw_lines]
    all_lines_score = []
    flaw_line_indices = []
    running_score = 0
    current_line = ""
    last_position = len(word_att_scores) - 1
    for position, entry in enumerate(word_att_scores):
        token = entry[0]
        is_break = token in line_breaks
        # A line is closed by a line-break token or by the final token, but only
        # when something has been accumulated for it.
        if (is_break or position == last_position) and running_score != 0:
            running_score += entry[1]
            all_lines_score.append(running_score)
            if current_line in flaw_texts:
                flaw_line_indices.append(len(all_lines_score) - 1)
            current_line = ""
            running_score = 0
        elif not is_break:
            # Ordinary token: extend the current line and its score.
            current_line += token
            running_score += entry[1]
    return all_lines_score, flaw_line_indices

In [66]:
def line_level_evaluation(all_lines_score: list, flaw_line_indices: list, top_k_loc: list, top_k_constant: list, true_positive_only: bool, index=None):
    """Evaluate how well the line scores rank the ground-truth flaw lines.

    Args:
        all_lines_score: one score per line; a higher score ranks earlier.
        flaw_line_indices: positions in `all_lines_score` of the flaw lines.
        top_k_loc: fractions of the lines; for each one the number of flaw lines
            found among the first `int(total_lines * fraction)` ranked lines is reported.
        top_k_constant: absolute k values for the top-k localisation check.
        true_positive_only: selects the return shape, see below.
        index: identifier of the sample, recorded in the top-k index lists.

    Returns:
        When `true_positive_only` is True, the tuple
        `(total_lines, num_of_flaw_lines, all_correctly_predicted_flaw_lines,
        min_clean_lines_inspected, max_clean_lines_inspected,
        all_correctly_localized_func, top_10_correct_idx, top_10_not_correct_idx)`.
        Otherwise a list of `[score, label]` pairs with label 1 for flaw lines.

    Raises:
        ValueError: when `true_positive_only` is True and `flaw_line_indices` is
            empty or holds a position that is not a valid line index.
    """
    if not true_positive_only:
        # all_lines_score_with_label: [[line score, line level label], ...]
        flaw_set = set(flaw_line_indices)
        return [[all_lines_score[i], 1 if i in flaw_set else 0] for i in range(len(all_lines_score))]

    # Line indices ranked by score, best first.
    ranking = sorted(range(len(all_lines_score)), key=lambda i: all_lines_score[i], reverse=True)
    num_of_flaw_lines = len(flaw_line_indices)
    # Clean lines + flaw lines.
    total_lines = len(all_lines_score)
    if num_of_flaw_lines == 0:
        raise ValueError("flaw_line_indices must contain at least one line index")

    ### IFA / All Effort ###
    # Rank position of every flaw line: a position of 3 means three lines are
    # inspected before it. Computed once, independently of top_k_loc, so the
    # values exist even when top_k_loc is empty.
    flaw_positions = [ranking.index(indice) for indice in flaw_line_indices]
    min_clean_lines_inspected = min(flaw_positions)
    max_clean_lines_inspected = max(flaw_positions)

    ### TopK% Recall ###
    all_correctly_predicted_flaw_lines = []
    for top_k in top_k_loc:
        k = int(total_lines * top_k)
        top_lines = set(ranking[:k])
        all_correctly_predicted_flaw_lines.append(
            sum(1 for indice in flaw_line_indices if indice in top_lines))

    ### Top-k Accuracy ###
    all_correctly_localized_func = []
    top_10_correct_idx = []
    top_10_not_correct_idx = []
    for k in top_k_constant:
        top_lines = set(ranking[:k])
        hits = [1 if indice in top_lines else 0 for indice in flaw_line_indices]
        all_correctly_localized_func.extend(hits)
        # Decided per k: a hit for one k no longer leaks into the next one.
        if any(hits):
            top_10_correct_idx.append(index)
        else:
            top_10_not_correct_idx.append(index)
    return total_lines, num_of_flaw_lines, all_correctly_predicted_flaw_lines, min_clean_lines_inspected, max_clean_lines_inspected, all_correctly_localized_func, \
           top_10_correct_idx, top_10_not_correct_idx

In [88]:
# Fractions of the ranked lines inspected for the Top-k% recall.
top_k_locs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
index = 0
# True positives for which line_level_localization_tp returned "NA".
na_explanation_case_01 = 0
# True positives without string-valued "flaw_line" / "flaw_line_index" entries.
na_explanation_case_02 = 0
explain_list = []
# Set lookup: the membership test runs once per test sample, so avoid scanning
# the tp_indices list every time.
tp_index_set = set(tp_indices)
progress_bar = tqdm(dataloader, total=len(dataloader))
for mini_batch in progress_bar:
    # The loader is sequential with batch size 1, so `index` is the dataset position.
    if index in tp_index_set:
        if isinstance(df["flaw_line"][index], str) and isinstance(df["flaw_line_index"][index], str):  
            line_eval_results = \
                        line_level_localization_tp(flaw_lines=df["flaw_line"][index],
                                                tokenizer=tokenizer, 
                                                model=model, 
                                                mini_batch=mini_batch, 
                                                original_func=df["processed_func"][index], 
                                                args=args,
                                                top_k_loc=top_k_locs,
                                                top_k_constant=top_k_constant,
                                                reasoning_method=args.reasoning_method,
                                                index=index,
                                                write_invalid_data=False)
            if line_eval_results != "NA":
                explain_list.append((index, line_eval_results))
            else:
                na_explanation_case_01 +=1
        else:
            na_explanation_case_02 +=1
    index += 1

  0%|          | 0/18864 [00:00<?, ?it/s]

In [89]:
len(explain_list), len(tp_indices), na_explanation_case_01, na_explanation_case_02

(628, 911, 104, 179)

In [105]:
explain_list[99]

(2410,
 {'total_lines': 4,
  'num_of_flaw_lines': 2,
  'all_correctly_predicted_flaw_lines': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2],
  'all_correctly_localized_function': [1, 1],
  'min_clean_lines_inspected': 1,
  'max_clean_lines_inspected': 3,
  'top_10_correct_idx': [2410],
  'top_10_not_correct_idx': []})

In [151]:
df.iloc[99]['func_before']

'xmlXPathNextPrecedingInternal(xmlXPathParserContextPtr ctxt,\n                               xmlNodePtr cur)\n {\n     if ((ctxt == NULL) || (ctxt->context == NULL)) return(NULL);\n    if ((ctxt->context->node->type == XML_ATTRIBUTE_NODE) ||\n\t(ctxt->context->node->type == XML_NAMESPACE_DECL))\n\treturn(NULL);\n     if (cur == NULL) {\n         cur = ctxt->context->node;\n         if (cur == NULL)\n             return (NULL);\n         ctxt->ancestor = cur->parent;\n     }\n     if ((cur->prev != NULL) && (cur->prev->type == XML_DTD_NODE))\n\tcur = cur->prev;\n    while (cur->prev == NULL) {\n        cur = cur->parent;\n        if (cur == NULL)\n            return (NULL);\n        if (cur == ctxt->context->doc->children)\n            return (NULL);\n        if (cur != ctxt->ancestor)\n            return (cur);\n        ctxt->ancestor = cur->parent;\n    }\n    cur = cur->prev;\n    while (cur->last != NULL)\n        cur = cur->last;\n    return (cur);\n}\n'

In [186]:
# Recompute the per-line attention scores for one true-positive sample
# (dataset index 99) and keep the text of every line next to its score.
index = 0
progress_bar = tqdm(dataloader, total=len(dataloader))
with torch.no_grad():
    for mini_batch in progress_bar:
        if index == 99 and index in tp_indices:
            (input_ids, labels) = mini_batch
            ids = input_ids[0].detach().tolist()
            all_tokens = tokenizer.convert_ids_to_tokens(ids)
            all_tokens = [token.replace("Ġ", "") for token in all_tokens]
            all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]

            # Move the input to the model's device, as line_level_localization_tp does.
            input_ids = input_ids.to(args.device)
            prob, attentions = model(input_ids=input_ids, output_attentions=True)
            # NOTE(review): [0][0] takes the first entry of the returned structure
            # for the only sample in the batch; confirm what Model returns here.
            attentions = attentions[0][0]
            attention = None
            for i in range(len(attentions)):
                layer_attention = attentions[i]
                # Collapse one (seq, seq) matrix along its first axis: one value per token.
                layer_attention = sum(layer_attention)
                if attention is None:
                    attention = layer_attention
                else:
                    attention += layer_attention
            # Zero the attention scores of <s> and </s>.
            attention = clean_special_token_values(attention, padding=True)
            # word_att_scores -> [[token, att_value], [token, att_value], ...]
            word_att_scores = get_word_att_scores(all_tokens=all_tokens, att_scores=attention)

            # Walk the tokens and close a line at every line-break token.
            separator = ["Ċ", " Ċ", "ĊĊ", " ĊĊ"]
            score_sum = 0
            line = ""
            lines_with_score = []
            line_idx = 0
            for i in range(len(word_att_scores)):
                score_sum += word_att_scores[i][1]
                if word_att_scores[i][0] not in separator:
                    line += word_att_scores[i][0]
                else:
                    lines_with_score.append((line_idx, line, score_sum.detach().item()))
                    line = ""
                    score_sum = 0
                    line_idx += 1
            break
        index += 1

  0%|          | 0/18864 [00:00<?, ?it/s]

In [192]:
line_idx

32

In [191]:
# Rank the (line index, text, score) tuples by score, highest first, and show
# the top 15% of the lines (int() truncates the count).
sorted_lines = sorted(lines_with_score, key=lambda x: x[2], reverse=True)
sorted_lines[:int(0.15*line_idx)]

[(15,
  'if((cur->prev!=NULL)&&(cur->prev->type==XML_DTD_NODE))',
  480.56585693359375),
 (3,
  'if((ctxt==NULL)||(ctxt->context==NULL))return(NULL);',
  379.0490417480469),
 (4,
  'if((ctxt->context->node->type==XML_ATTRIBUTE_NODE)||',
  357.1581115722656),
 (6, '(ctxt->context->node->type==XML_NAMESPACE_DECL))', 304.76922607421875)]

In [165]:
import codecs

context = df.iloc[99]['func_before']
# The column already holds real newline and tab characters (the "\n" seen when
# the cell value is displayed is only its repr), so it can be printed as is.
# Decoding it with 'unicode_escape' would rewrite escape sequences inside the C
# source (e.g. "\n" in a string literal, or a trailing line-continuation
# backslash) and can mangle non-ASCII characters.
modified_context = context

new_variable = modified_context

print(new_variable)


xmlXPathNextPrecedingInternal(xmlXPathParserContextPtr ctxt,
                               xmlNodePtr cur)
 {
     if ((ctxt == NULL) || (ctxt->context == NULL)) return(NULL);
    if ((ctxt->context->node->type == XML_ATTRIBUTE_NODE) ||
	(ctxt->context->node->type == XML_NAMESPACE_DECL))
	return(NULL);
     if (cur == NULL) {
         cur = ctxt->context->node;
         if (cur == NULL)
             return (NULL);
         ctxt->ancestor = cur->parent;
     }
     if ((cur->prev != NULL) && (cur->prev->type == XML_DTD_NODE))
	cur = cur->prev;
    while (cur->prev == NULL) {
        cur = cur->parent;
        if (cur == NULL)
            return (NULL);
        if (cur == ctxt->context->doc->children)
            return (NULL);
        if (cur != ctxt->ancestor)
            return (cur);
        ctxt->ancestor = cur->parent;
    }
    cur = cur->prev;
    while (cur->last != NULL)
        cur = cur->last;
    return (cur);
}



In [168]:
len(lines_with_score)

32

In [172]:
df.iloc[99]

index                                                                      183429
Access Gained                                                                 NaN
Attack Origin                                                                 NaN
Authentication Required                                                       NaN
Availability                                                                  NaN
CVE ID                                                                        NaN
CVE Page                                                                      NaN
CWE ID                                                                        NaN
Complexity                                                                    NaN
Confidentiality                                                               NaN
Integrity                                                                     NaN
Known Exploits                                                                NaN
Publish Date    