# Reproduce LineVul

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import commons

  from tqdm.autonotebook import tqdm


In [3]:
from transformers import AutoTokenizer, AutoModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [4]:
# transformers emits a number of verbose warnings;
# raise the log threshold so only errors are shown.

from transformers import logging
logging.set_verbosity(40)  # 40 is the numeric ERROR level in Python's logging — presumably identical to transformers.logging.ERROR; confirm

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler

In [91]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader

In [7]:
import pandas as pd
from tqdm.autonotebook import tqdm

In [8]:
from linevul_model import Model
from linevul_helpers import TextDataset
from linevul_extra import extract_line_attention, linevul_predict

In [9]:
from project_dataset import load_dataset

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [11]:
# Start from the published CodeBERT configuration and adjust it for this notebook.
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 1  # single output label — presumably what the LineVul checkpoint was trained with; confirm
config.num_attention_heads = 12  # the checkpoint loaded below is named "12heads_linevul_model.bin"

In [12]:
# get from LineVul
checkpoint = '/home/hqn650/LineVul/linevul/saved_models/checkpoint-best-f1/12heads_linevul_model.bin'

In [13]:
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [14]:
pre_train = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', 
                                                             config=config, 
                                                             ignore_mismatched_sizes=True)

In [15]:
from dataclasses import dataclass

@dataclass
class Args:
    """Settings passed to the LineVul ``Model`` and to the dataset helpers.

    Every attribute carries a type annotation so that ``@dataclass`` actually
    registers it as a field. Without annotations the decorator generates no
    ``__init__`` parameters and an empty ``repr``; with them, ``Args()`` keeps
    the same defaults while individual values can be overridden, e.g.
    ``Args(eval_batch_size=64)``.
    """
    # Run on the GPU when one is available, otherwise on the CPU.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu: int = torch.cuda.device_count()
    use_non_pretrained_model: bool = False
    block_size: int = 512
    # NOTE(review): absolute, machine-specific path — consider a configurable data directory.
    test_data_file: str = '/home/hqn650/LineVul/data/big-vul_dataset/test.csv'
    code_length: int = 256
    do_local_explanation: bool = True
    reasoning_method: str = 'attention'
    seed: int = 42
    num_attention_heads: int = 12
    do_sorting_by_line_scores: bool = False
    do_sorting_by_pred_prob: bool = False
    top_k_constant: int = 10
    use_word_level_tokenizer: bool = False
    eval_batch_size: int = 512

    # Name handed to project_dataset.load_dataset later in the notebook.
    task: str = "attack_vector"

args = Args()

In [16]:
model = Model(pre_train, config, tokenizer, args)

In [17]:
# Load the saved state dict with its tensors mapped to args.device, then move the model there.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(checkpoint, map_location=args.device))
model.to(args.device)

Model(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [18]:
from linevul_helpers import TextDataset, convert_examples_to_features

class ExtendTextDataset(TextDataset):
    """``TextDataset`` variant that is filled from an in-memory DataFrame.

    Every entry of the ``processed_func`` column is converted with
    ``convert_examples_to_features`` using a constant label of 1.
    ``TextDataset.__init__`` is not called; only ``self.examples`` is set here.
    """

    def __init__(self, tokenizer, args, data_frame):
        self.examples = []
        sources = data_frame["processed_func"].tolist()
        for source in tqdm(sources, desc='ExtendTextDataset'):
            features = convert_examples_to_features(source, 1, tokenizer, args)
            self.examples.append(features)

In [19]:
# multi-gpu evaluate: wrap the model in DataParallel only when more than one GPU is visible
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [20]:
# to find TP
def find_tp(model, tokenizer, args, data_frame=None):
    """Run LineVul prediction and locate the true-positive samples.

    When ``data_frame`` is given, the samples come from ``ExtendTextDataset``;
    otherwise the default ``TextDataset`` test split is used.

    Returns:
        (result, tp_indices): the metrics returned by ``linevul_predict`` and
        the positions where the prediction equals the label and the label is 1.
    """
    dataset = (
        TextDataset(tokenizer, args, file_type='test')
        if data_frame is None
        else ExtendTextDataset(tokenizer, args, data_frame)
    )
    data_loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.eval_batch_size,
        num_workers=0,
    )
    result, y_trues, y_preds = linevul_predict(model, data_loader, args.device)
    is_true_positive = (y_trues == y_preds) & (y_trues == 1)
    positions = np.where(is_true_positive)[0]
    return result, list(positions)
    
result, correct_indices = find_tp(model, tokenizer, args)

Convert example to features:   0%|          | 0/18864 [00:00<?, ?it/s]



In [21]:
# test_dataset = TextDataset(tokenizer, args, file_type='test')

In [22]:
# test_sampler = SequentialSampler(test_dataset)
# test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size, num_workers=0)

In [23]:
# result, y_trues, y_preds = linevul_predict(model, test_dataloader, args.device)

In [24]:
result

{'test_accuracy': 0.9909351145038168,
 'test_recall': 0.8635071090047394,
 'test_precision': 0.9712153518123667,
 'test_f1': 0.9141996989463121,
 'test_threshold': 0.5}

In [25]:
correct_indices[:10]

[66, 99, 103, 108, 120, 129, 130, 142, 153, 167]

In [26]:
# correct_indices = np.where((y_trues == y_preds))
# correct_indices = list(correct_indices[0])

In [27]:
# tp_indices = np.where((y_trues == y_preds) & (y_trues == 1))
# tp_indices = list(tp_indices[0])

In [28]:
# after identifying the true-positive samples, create a new loader with batch_size = 1 for explanation

# dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, num_workers=0)

In [29]:
# df = pd.read_csv(args.test_data_file)

In [30]:
# top_k_constant = [args.top_k_constant]

In [31]:
# model.eval()
# index = 0
# progress_bar = tqdm(dataloader, total=len(dataloader))
# extract_list = []
# for mini_batch in progress_bar:
#     if index in tp_indices:
#         (input_ids, labels) = mini_batch
#         ids = input_ids[0].detach().tolist()
#         all_tokens = tokenizer.convert_ids_to_tokens(ids)
#         all_tokens = [token.replace("Ġ", "") for token in all_tokens]
#         all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]
#         with torch.no_grad():
#             prob, attentions = model(input_ids=input_ids, output_attentions=True)
#         lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
#         extract_list.append((index, lines_with_score, n_lines))
#     index += 1

In [33]:
def explain(model, tokenizer, explain_indices, data_frame=None):
    """Extract line-level attention scores for selected samples.

    Args:
        model: model called as ``model(input_ids=..., output_attentions=True)``
            and returning ``(prob, attentions)``.
        tokenizer: tokenizer used to turn input ids back into tokens.
        explain_indices: positions (in dataset order) of the samples to explain.
        data_frame: optional DataFrame; when given, samples come from
            ``ExtendTextDataset``, otherwise from the default test split.

    Returns:
        list of ``(sample_idx, lines_with_score, n_lines)`` tuples, one per
        requested sample, in dataset order.
    """
    if data_frame is not None:
        dataset = ExtendTextDataset(tokenizer, args, data_frame)
    else:
        dataset = TextDataset(tokenizer, args, file_type='test')
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=0)

    # Set membership is O(1); scanning the original list for every batch made
    # the loop quadratic in practice.
    wanted = {int(i) for i in explain_indices}

    model.eval()
    extract_list = []
    progress_bar = tqdm(data_loader, total=len(data_loader))
    for index, mini_batch in enumerate(progress_bar):
        if index not in wanted:
            continue
        (input_ids, labels) = mini_batch
        # Keep the input on the same device as the model; without DataParallel
        # a CPU tensor cannot be fed to a CUDA model.
        input_ids = input_ids.to(args.device)
        ids = input_ids[0].detach().tolist()
        all_tokens = tokenizer.convert_ids_to_tokens(ids)
        # Strip the "Ġ" marker and normalise "ĉ" to "Ċ" before the tokens are grouped into lines.
        all_tokens = [token.replace("Ġ", "") for token in all_tokens]
        all_tokens = [token.replace("ĉ", "Ċ") for token in all_tokens]
        with torch.no_grad():
            prob, attentions = model(input_ids=input_ids, output_attentions=True)
        lines_with_score, n_lines = extract_line_attention(attentions, all_tokens)
        extract_list.append((index, lines_with_score, n_lines))
    return extract_list

extract_list = explain(model, tokenizer, correct_indices)

Convert example to features:   0%|          | 0/18864 [00:00<?, ?it/s]

  0%|          | 0/18864 [00:00<?, ?it/s]

In [34]:
len(extract_list), len(correct_indices)

(911, 911)

In [35]:
extract_list[-1]

(18822,
 [(2,
   'command_line->AppendSwitch(switches::kEnableThreadedCompositing);',
   1153.4075002670288),
  (1, 'GpuFeatureTest::SetUpCommandLine(command_line);', 963.8868770599365),
  (0,
   'virtualvoidSetUpCommandLine(CommandLine*command_line){',
   850.8908805847168),
  (3, '}', 151.55852222442627)],
 4)

In [36]:
import codecs

# Print the source of row 18822, the sample shown as the last entry of extract_list above.
df = pd.read_csv(args.test_data_file)
context = df.iloc[18822]['func_before']
# Decode backslash escape sequences that are stored literally in the text.
modified_context = codecs.decode(context, 'unicode_escape')

# Turn any remaining literal backslash-n pairs into real line breaks.
new_variable = modified_context.replace(r'\n', '\n')

print(new_variable)

  virtual void SetUpCommandLine(CommandLine* command_line) {
    GpuFeatureTest::SetUpCommandLine(command_line);
    command_line->AppendSwitch(switches::kEnableThreadedCompositing);
  }



# Attack vector

In [37]:
attack_vector = load_dataset(args.task)

In [38]:
attack_vector_test = attack_vector['test']
attack_vector_test

Dataset({
    features: ['CVE ID', 'explain', 'func_before'],
    num_rows: 1350
})

In [39]:
attack_vector_test

Dataset({
    features: ['CVE ID', 'explain', 'func_before'],
    num_rows: 1350
})

In [40]:
attack_vector_test_df = attack_vector_test.to_pandas()
attack_vector_test_df

Unnamed: 0,CVE ID,explain,func_before
0,CVE-2017-3731,sending specially crafted truncated packets,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,CVE-2013-2867,via a crafted web site .,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,CVE-2012-5131,via unknown vectors .,static void unregisterBlobURLTask(void* conte...
3,CVE-2016-5842,persuading a victim to open a specially crafte...,}static inline void WriteResourceLong(unsigned...
4,CVE-2012-2875,via a crafted document .,void PromoResourceService::PromoResourceState...
...,...,...,...
1345,CVE-2012-6657,by leveraging the ability to create a raw sock...,"int sock_setsockopt(struct socket *sock, int l..."
1346,CVE-2016-6787,using a specially-crafted application,static void perf_event_for_each_child(struct p...
1347,CVE-2016-3951,inserting a USB device with an invalid USB des...,"static int cdc_ncm_bind(struct usbnet *dev, s..."
1348,CVE-2016-1621,using a specially crafted media file,static unsigned int subpel_variance_ref(const ...


In [41]:
# func_before doesn't have the same format as processed_func from lineVul
attack_vector_test_df.iloc[0]['func_before']

'static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {  EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,c);  switch (type) {  case EVP_CTRL_INIT:  gctx->key_set = 0;  gctx->iv_set = 0;  gctx->ivlen = EVP_CIPHER_CTX_iv_length(c);  gctx->iv = EVP_CIPHER_CTX_iv_noconst(c);  gctx->taglen = -1;  gctx->iv_gen = 0;  gctx->tls_aad_len = -1;  return 1;   case EVP_CTRL_AEAD_SET_IVLEN:  if (arg <= 0)  return 0;  /* Allocate memory for IV if needed */  if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {  if (gctx->iv != EVP_CIPHER_CTX_iv_noconst(c))  OPENSSL_free(gctx->iv);  gctx->iv = OPENSSL_malloc(arg);  if (gctx->iv == NULL)  return 0;  }  gctx->ivlen = arg;  return 1;   case EVP_CTRL_AEAD_SET_TAG:  if (arg <= 0 || arg > 16 || EVP_CIPHER_CTX_encrypting(c))  return 0;  memcpy(EVP_CIPHER_CTX_buf_noconst(c), ptr, arg);  gctx->taglen = arg;  return 1;   case EVP_CTRL_AEAD_GET_TAG:  if (arg <= 0 || arg > 16 || !EVP_CIPHER_CTX_encrypting(c)  || gctx->taglen < 0)  return 0

In [42]:
df.iloc[18859]['processed_func']

'DisplayNameValueList(char * buffer, int bufsize)\n{\n    struct NameValueParserData pdata;\n    struct NameValue * nv;\n    ParseNameValue(buffer, bufsize, &pdata);\n    for(nv = pdata.l_head;\n        nv != NULL;\n        nv = nv->l_next)\n    {\n        printf("%s = %s\\n", nv->name, nv->value);\n    }\n    ClearNameValueList(&pdata);\n}\n'

In [43]:
# convert func_before to the same format as LineVul's processed_func
attack_vector_test_df['processed_func'] = commons.dfmp(attack_vector_test_df, commons.format_code, "func_before", cs=200, desc="Format string to LineVul expectation")

(6 Workers) Format string to LineVul expectation:   0%|          | 0/1350 [00:00<?, ?it/s]

In [44]:
attack_vector_test_df

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2017-3731,sending specially crafted truncated packets,"static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int...","static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int..."
1,CVE-2013-2867,via a crafted web site .,void BluetoothDeviceChromeOS::OnUnregisterAgen...,void BluetoothDeviceChromeOS::OnUnregisterAgen...
2,CVE-2012-5131,via unknown vectors .,static void unregisterBlobURLTask(void* conte...,static void unregisterBlobURLTask(void* contex...
3,CVE-2016-5842,persuading a victim to open a specially crafte...,}static inline void WriteResourceLong(unsigned...,}\nstatic inline void WriteResourceLong(unsign...
4,CVE-2012-2875,via a crafted document .,void PromoResourceService::PromoResourceState...,void PromoResourceService::PromoResourceStateC...
...,...,...,...,...
1345,CVE-2012-6657,by leveraging the ability to create a raw sock...,"int sock_setsockopt(struct socket *sock, int l...","int sock_setsockopt(struct socket *sock, int l..."
1346,CVE-2016-6787,using a specially-crafted application,static void perf_event_for_each_child(struct p...,static void perf_event_for_each_child(struct p...
1347,CVE-2016-3951,inserting a USB device with an invalid USB des...,"static int cdc_ncm_bind(struct usbnet *dev, s...","static int cdc_ncm_bind(struct usbnet *dev, st..."
1348,CVE-2016-1621,using a specially crafted media file,static unsigned int subpel_variance_ref(const ...,static unsigned int subpel_variance_ref(const ...


In [45]:
# to find TP
# def find_tp(model, tokenizer, args, data_frame):
#     extend_dataset = ExtendTextDataset(tokenizer, args, data_frame)
#     sampler = SequentialSampler(extend_dataset)
#     data_loader = DataLoader(extend_dataset, sampler=sampler, batch_size=args.eval_batch_size, num_workers=0)
#     result, y_trues, y_preds = linevul_predict(model, data_loader, args.device)
#     tp_indices = np.where((y_trues == y_preds) & (y_trues == 1))
#     tp_indices = list(tp_indices[0])
#     return result, tp_indices
# result, tp_indices = find_tp(model, tokenizer, args, attack_vector_test_df)

In [46]:
result, explain_indices = find_tp(model, tokenizer, args, attack_vector_test_df)

ExtendTextDataset:   0%|          | 0/1350 [00:00<?, ?it/s]



In [47]:
result, len(explain_indices)

({'test_accuracy': 0.10444444444444445,
  'test_recall': 0.10444444444444445,
  'test_precision': 1.0,
  'test_f1': 0.18913480885311873,
  'test_threshold': 0.5},
 141)

In [48]:
# (sample_idx, lines, n_lines)
extract_list_attack_vector = explain(model, tokenizer, explain_indices, attack_vector_test_df)

ExtendTextDataset:   0%|          | 0/1350 [00:00<?, ?it/s]

  0%|          | 0/1350 [00:00<?, ?it/s]

In [49]:
len(extract_list_attack_vector), len(explain_indices)

(141, 141)

In [50]:
explain_indices[0]

8

In [51]:
extract_list_attack_vector[1]

(33,
 [(1, ':algo(0),settings(NULL),settings_len(0){}', 1466.052661895752),
  (0,
   'ContentEncoding::ContentCompression::ContentCompression()',
   1061.0095748901367)],
 2)

In [52]:
attack_vector_test_df.iloc[33]['processed_func']

'ContentEncoding::ContentCompression::ContentCompression()\n    : algo(0), settings(NULL), settings_len(0) {}'

In [53]:
def extract_top(top_k, extracted_list, min_lines=3):
    """Keep only the leading lines of every explained sample.

    Args:
        top_k: fraction of a sample's lines to keep (0.1 keeps 10 percent).
        extracted_list: iterable of ``(index, lines, n_lines)`` where ``lines``
            is a list of ``(line_idx, content, score)`` items that is already
            sorted, best line first.
        min_lines: lower bound on the number of lines kept per sample. The
            default of 3 matches the previously hard-coded value.

    Returns:
        list of ``(index, text)`` where ``text`` is the kept line contents
        joined by newline characters.
    """
    result = []
    for entry in extracted_list:
        index, lines, n_lines = entry[0], entry[1], entry[2]
        # Keep top_k of the lines, but never fewer than min_lines.
        keep = max(min_lines, int(top_k * n_lines))
        text = '\n'.join(line[1] for line in lines[:keep])
        result.append((index, text))
    return result

In [54]:
extract_list_attack_vector[1]

(33,
 [(1, ':algo(0),settings(NULL),settings_len(0){}', 1466.052661895752),
  (0,
   'ContentEncoding::ContentCompression::ContentCompression()',
   1061.0095748901367)],
 2)

In [75]:
sub_infos = extract_top(0.1, extract_list_attack_vector)
sub_infos[:3]

[(8,
  'NOTREACHED()<<"Unsupportedaudiobitdepthincrossfade.";\nDoCheckFakeData<uint8>(audio_data,length);\nDoCheckFakeData<int16>(audio_data,length);'),
 (33,
  ':algo(0),settings(NULL),settings_len(0){}\nContentEncoding::ContentCompression::ContentCompression()'),
 (37,
  '::libvpx_test::Encoder*encoder){\ntimebase_=static_cast<double>(tb.num)/tb.den;\nvirtualvoidPreEncodeFrameHook(::libvpx_test::VideoSource*video,')]

In [132]:
# Pair each top-lines snippet with the reference explanation and the full function it came from.
explain_features = []
for sample_idx, top_lines in sub_infos:
    source_row = attack_vector_test_df.iloc[sample_idx]
    explain_features.append({
        'func_before': top_lines,  # reuses the func_before name so it fits the aspect model's input
        'explain': source_row['explain'],
        'orginal_fun': source_row['func_before'],
    })
explain_features_df = pd.DataFrame.from_records(explain_features)

In [133]:
explain_features_df

Unnamed: 0,func_before,explain,orginal_fun
0,"NOTREACHED()<<""Unsupportedaudiobitdepthincross...",via vectors involving seek operations on video...,"void CheckFakeData(uint8* audio_data, int fra..."
1,":algo(0),settings(NULL),settings_len(0){}\nCon...",using a specially crafted media file,ContentEncoding::ContentCompression::ContentC...
2,::libvpx_test::Encoder*encoder){\ntimebase_=st...,using a specially crafted media file,virtual void PreEncodeFrameHook(::libvpx_test...
3,"dissect_ac_if_hdr_body(tvbuff_t*tvb,gintoffset...",persuading a victim to open a specially-crafte...,"dissect_ac_if_hdr_body(tvbuff_t *tvb, gint off..."
4,"constuint8_t*refs[]={GetReference(0),GetRefere...",using a specially crafted media file,void SADs(unsigned int *results) { const uin...
...,...,...,...
136,content::ResourceDispatcherHost::Get()->SetDel...,via unknown vectors .,void SetDelegateOnIO(content::ResourceDispatch...
137,"UMA_PAIRING_METHOD_DISPLAY_PASSKEY,\nif(entere...",via a crafted web site .,void BluetoothDeviceChromeOS::DisplayPasskey( ...
138,[4][5][6][7][8]*/\n/*percentspecifier:%s%c%p%u...,creating a symbolic link from a temporary file...,"int main(int argc, char** argv) { /* Kernel s..."
139,"if(LocaleNCompare((char*)text,MagickID,strlen(...",because the end-of-file condition is not cons...,static Image *ReadTXTImage(const ImageInfo *im...


In [94]:
explain_features_dataset = Dataset.from_pandas(explain_features_df)

In [95]:
explain_features_dataset

Dataset({
    features: ['func_before', 'explain'],
    num_rows: 141
})

In [141]:
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class AspectArgs:
    """Settings for the explanation (seq2seq) model used in the cells below.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    field; un-annotated attributes are ignored by the decorator. ``data_cols``
    uses a ``default_factory`` so each instance gets its own list instead of
    sharing one mutable class-level list.
    """
    # Directory the tokenizer and model are loaded from via from_pretrained.
    model_name: str = "tf_board/attack_vector/"
    num_proc: int = 4
    batch_size: int = 5
    # Maximum token lengths for the source text and for the explanation text.
    max_src_length: int = 1200
    max_des_length: int = 146
    data_cols: List[str] = field(default_factory=lambda: ['func_before', 'explain'])
    save_dir: str = 'tf_board'
    epochs: int = 100
    grad_acc_steps: int = 4
    lr: float = 5e-5
    log_freq: int = 10
    local_rank: int = -1
    deepspeed: Optional[str] = None  # presumably a DeepSpeed config path when set — confirm
    fp16: bool = False
    lr_warmup_steps: int = 200
    weight_decay: float = 0.05
    task: str = "attack_vector"

aspect_args = AspectArgs()

In [142]:
from transformers import AutoTokenizer
codet5p_tokenizer = AutoTokenizer.from_pretrained(aspect_args.model_name)

In [143]:
import torch
from transformers import AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

explain_model = AutoModelForSeq2SeqLM.from_pretrained(aspect_args.model_name).to(device)

In [144]:
def preprocess_function(examples):
    """Tokenize one batch of ``func_before`` / ``explain`` pairs.

    Returns a dict with ``input_ids`` and ``attention_mask`` for the source
    text and ``labels`` holding the token ids of the target text, all padded
    to the configured maximum lengths.
    """
    # NOTE(review): the entries look like plain strings, so ' '.join puts a
    # space between every character (the decoded text shown later in the
    # notebook is character-spaced). Presumably this mirrors how the model was
    # trained — confirm before changing.
    sources = [' '.join(text) for text in examples["func_before"]]
    targets = [' '.join(text) for text in examples["explain"]]

    encoded_sources = codet5p_tokenizer(
        sources,
        max_length=aspect_args.max_src_length,
        padding="max_length",
        truncation=True)
    encoded_targets = codet5p_tokenizer(
        targets,
        max_length=aspect_args.max_des_length,
        padding="max_length",
        truncation=True)

    return {
        "input_ids": encoded_sources["input_ids"],
        "attention_mask": encoded_sources["attention_mask"],
        "labels": encoded_targets["input_ids"].copy(),
    }


# Drop every original column, not just aspect_args.data_cols: the DataFrame
# also carries the 'orginal_fun' text column, and any leftover string column
# cannot be padded by the seq2seq data collator.
tokenized_ds = explain_features_dataset.map(
  preprocess_function,
  remove_columns=explain_features_dataset.column_names,
  batched=True,
  num_proc=aspect_args.num_proc,
  batch_size=aspect_args.batch_size)

tokenized_ds

Map (num_proc=4):   0%|          | 0/141 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 141
})

In [145]:
import evaluate
import numpy as np

rouge_metric = evaluate.load("rouge")

# define function for custom tokenization
def tokenize_sentence(arg):
  """Tokenize text with codet5p_tokenizer and return the token strings."""
  token_ids = codet5p_tokenizer(arg).input_ids
  return codet5p_tokenizer.convert_ids_to_tokens(token_ids)

# define function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  """Compute ROUGE for one batch of predictions against its labels.

  ``eval_arg`` is a ``(preds, labels)`` pair of token-id arrays. All decoded
  predictions of the batch are joined into a single newline-separated text
  (and likewise the labels), so ROUGE is computed once per batch rather than
  once per sample.
  """
  preds, labels = eval_arg
  # -100 marks ignored label positions; swap in the pad id so decoding works.
  pad_id = codet5p_tokenizer.pad_token_id
  labels = np.where(labels != -100, labels, pad_id)
  # Token ids -> text.
  decoded_preds = codet5p_tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = codet5p_tokenizer.batch_decode(labels, skip_special_tokens=True)
  # One newline-separated document per side.
  joined_preds = "\n".join(decoded_preds)
  joined_labels = "\n".join(decoded_labels)
  # ROUGE with the custom tokenizer defined above.
  return rouge_metric.compute(
    predictions=[joined_preds],
    references=[joined_labels],
    tokenizer=tokenize_sentence,
  )

In [146]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  codet5p_tokenizer,
  model=explain_model,
  return_tensors="pt")

In [158]:
# Batch the tokenized samples for generation.
sample_dataloader = DataLoader(
  tokenized_ds.with_format("torch"),
  collate_fn=data_collator,
  batch_size=aspect_args.batch_size,
  num_workers=aspect_args.num_proc)

# One ROUGE result dict per batch.
rouge_list = []

for batch in sample_dataloader:
    with torch.no_grad():
        # Inputs are padded to max_length, so pass the attention mask
        # explicitly instead of relying on generate() to infer it.
        preds = explain_model.generate(
          batch["input_ids"].to(device),
          attention_mask=batch["attention_mask"].to(device),
          max_length=aspect_args.max_des_length,
        ).cpu()
    labels = batch["labels"].cpu()

    rouge_result = metrics_func([preds, labels])
    print(rouge_result)
    rouge_list.append(rouge_result)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
{'rouge1': 0.8495238095238096, 'rouge2': 0.6577437858508605, 'rougeL': 0.6361904761904762, 'rougeLsum': 0.8217636022514071}
{'rouge1': 0.8324324324324325, 'rouge2': 0.5815217391304348, 'rougeL': 0.4702702702702703, 'rougeLsum': 0.7777777777777778}
{'rouge1': 0.7723785166240409, 'rouge2': 0.5809768637532133, 'rougeL': 0.5882352941176471, 'rougeLsum': 0.7619047619047619}
{'rouge1': 0.867816091954023, 'rouge2': 0.7687861271676302, 'rougeL': 0.82183908045977, 'rougeLsum': 0.8707865168539326}
{'rouge1': 0.6925207756232686, 'rouge2': 0.4958217270194986, 'rougeL': 0.5429362880886427, 'rougeLsum': 0.6666666666666667}
{'rouge1': 0.9287598944591029, 'rouge2': 0.8063660477453581, 'rougeL': 0.8179419525065963, 'rougeLsum': 0.8940568475452197}
{'rouge1': 0.8933717579250721, 'rouge2': 0.8347826086956521, 'rougeL': 0.8472622478386167, 'rougeLsum': 0.8845070422535211}
{'rouge1': 0.8604118993135011, 'rouge2': 0.7908045977011493, 'rougeL': 

In [162]:
# rouge1_values = [d['rouge1'] for d in rouge_list]
# rouge2_values = [d['rouge2'] for d in rouge_list]
# rougeL_values = [d['rougeL'] for d in rouge_list]
# average_rouge1 = sum(rouge1_values) / len(rouge1_values)
# average_rouge2 = sum(rouge2_values) / len(rouge2_values)
# average_rougeL = sum(rougeL_values) / len(rougeL_values)

# average_rouge1, average_rouge2, average_rougeL

# Unweighted mean of the per-batch ROUGE scores collected in rouge_list.
# The per-batch lists keep their names (previously rouge1_values was rebound
# to a float, so the cell failed when run a second time), and each average is
# divided by the length of its own list.
rouge1_values = [d['rouge1'] for d in rouge_list]
rouge2_values = [d['rouge2'] for d in rouge_list]
rougeL_values = [d['rougeL'] for d in rouge_list]
average_rouge1 = sum(rouge1_values) / len(rouge1_values)
average_rouge2 = sum(rouge2_values) / len(rouge2_values)
average_rougeL = sum(rougeL_values) / len(rougeL_values)

average_rouge1, average_rouge2, average_rougeL

(0.8500055158910367, 0.7002298141982961, 0.7082808602669106)

In [149]:
p = codet5p_tokenizer.batch_decode(preds[0], skip_special_tokens=True)
''.join(p).strip()

'v i a   v e c t o r s   i n v o l v i n g   s e e k   o p e r a t i o n s   o n   v i d e o   d a t a  .'

In [150]:
l = codet5p_tokenizer.batch_decode(labels[0], skip_special_tokens=True)
''.join(l)

'v i a   v e c t o r s   i n v o l v i n g   s e e k   o p e r a t i o n s   o n   v i d e o   d a t a  .'

In [151]:
preds.shape

torch.Size([5, 68])

In [152]:
explain_features_df.iloc[0]['orginal_fun']

' void CheckFakeData(uint8* audio_data, int frames_written,  double playback_rate) {  size_t length =  (frames_written * algorithm_.bytes_per_frame())  / algorithm_.bytes_per_channel();  switch (algorithm_.bytes_per_channel()) {  case 4:  DoCheckFakeData<int32>(audio_data, length);  break;  case 2:  DoCheckFakeData<int16>(audio_data, length);  break;  case 1:  DoCheckFakeData<uint8>(audio_data, length);  break;  default:  NOTREACHED() << "Unsupported audio bit depth in crossfade.";  }  } '

In [153]:
explain_features_df.iloc[0]['func_before']

'NOTREACHED()<<"Unsupportedaudiobitdepthincrossfade.";\nDoCheckFakeData<uint8>(audio_data,length);\nDoCheckFakeData<int16>(audio_data,length);'