In [1]:
%load_ext autoreload
%autoreload

In [2]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from pathlib import Path
from project_dataset import load_dataset
from code2nl.model import Seq2Seq

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from dataclasses import dataclass

@dataclass
class Args:
    model_name = "microsoft/codebert-base"
    num_proc = 4
    batch_size = 50
    max_source_length = 512  
    max_target_length = 53 
    data_cols = ["CVE ID", "explain", "func_before"]
    save_dir = 'tf_board'
    epochs = 100
    grad_acc_steps = 4
    lr = 5e-5
    log_freq = 10
    local_rank = -1
    deepspeed = None
    fp16 = False
    lr_warmup_steps = 200
    weight_decay = 0.05
    task = "vulnerability_type"
    prefix = 'codebert'
    do_lower_case = False
    beam_size = 10
    
args = Args()

In [5]:
ds = load_dataset(args.task)

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 3870
    })
    validation: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 431
    })
    test: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 1076
    })
})

In [7]:
df_train = ds['train']
df_train = df_train.to_pandas()

df_val = ds['validation']
df_val = df_val.to_pandas()

df_test = ds['test']
df_test = df_test.to_pandas()

In [8]:
df_train.head(3)

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2018-16066,out-of-bounds read,Node::InsertionNotificationRequest SVGStyleEl...,Node::InsertionNotificationRequest SVGStyleEle...
1,CVE-2015-2695,pointer type,spnego_gss_init_sec_context( OM_uint32 *min...,spnego_gss_init_sec_context(OM_uint32 *minor_s...
2,CVE-2011-2861,string read,void RenderThread::Init() { TRACE_EVENT_BEGIN...,void RenderThread::Init() {\n TRACE_EVENT_BEG...


In [9]:
import os

os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(f'tmp_data/{args.task}', exist_ok=True)
os.makedirs(f'{args.save_dir}/{args.prefix}_{args.task}', exist_ok=True)

In [10]:
import json

df_train['code_tokens'] = df_train.func_before.apply(lambda x: x.split())
df_train['docstring_tokens'] = df_train.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/train.jsonl','w') as f:
    for _, row in df_train.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_val['code_tokens'] = df_val.func_before.apply(lambda x: x.split())
df_val['docstring_tokens'] = df_val.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/valid.jsonl','w') as f:
    for _, row in df_val.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_test['code_tokens'] = df_test.func_before.apply(lambda x: x.split())
df_test['docstring_tokens'] = df_test.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/test.jsonl','w') as f:
    for _, row in df_test.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

In [11]:
lr = args.lr
batch_size = args.batch_size # change depending on the GPU Colab gives you
beam_size = args.beam_size
source_length = args.max_source_length
target_length = args.max_target_length
data_dir = 'tmp_data'
output_dir = f'{args.save_dir}/{args.prefix}_{args.task}'
train_file = f'{data_dir}/{args.task}/train.jsonl'
dev_file = f'{data_dir}/{args.task}/valid.jsonl'
epochs = args.epochs 
pretrained_model = args.model_name

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_train \
    --do_eval \
    --do_lower_case \
    --model_type roberta \
    --model_name_or_path {pretrained_model} \
    --train_filename {train_file} \
    --dev_filename {dev_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --train_batch_size {batch_size} \
    --eval_batch_size {batch_size} \
    --learning_rate {lr} \
    --num_train_epochs {epochs}

07/20/2023 23:29:42 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='tf_board/codebert_vulnerability_type', load_model_path=None, train_filename='tmp_data/vulnerability_type/train.jsonl', dev_filename='tmp_data/vulnerability_type/valid.jsonl', test_filename=None, config_name='', tokenizer_name='', max_source_length=512, max_target_length=53, do_train=True, do_eval=True, do_test=False, do_lower_case=True, no_cuda=False, train_batch_size=50, eval_batch_size=50, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=100, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
07/20/2023 23:29:46 - INFO - __main__ -   *** Example ***
07/20/2023 23:29:46 - INFO - __main__ -   idx: 0
07/20/2023 23:29:46 - INFO - __main__ -   source_tokens: ['<s>', 'Node', '::', 'Insert', 'ion', 'Not', 'ification', 'Request', '_SV', 'G

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



epoch 86 loss 0.0036: 100%|█████████████████████| 78/78 [00:57<00:00,  1.36it/s]
07/21/2023 01:20:19 - INFO - __main__ -   
***** Running evaluation *****
07/21/2023 01:20:19 - INFO - __main__ -     Num examples = 431
07/21/2023 01:20:19 - INFO - __main__ -     Batch size = 50
07/21/2023 01:20:21 - INFO - __main__ -     eval_ppl = 2.09806
07/21/2023 01:20:21 - INFO - __main__ -     global_step = 6787
07/21/2023 01:20:21 - INFO - __main__ -     train_loss = 0.0036
07/21/2023 01:20:21 - INFO - __main__ -     ********************
Total: 431
07/21/2023 01:20:37 - INFO - __main__ -     bleu-4 = 68.48 
07/21/2023 01:20:37 - INFO - __main__ -     ********************
epoch 87 loss 0.0034: 100%|█████████████████████| 78/78 [00:57<00:00,  1.36it/s]
07/21/2023 01:21:34 - INFO - __main__ -   
***** Running evaluation *****
07/21/2023 01:21:34 - INFO - __main__ -     Num examples = 431
07/21/2023 01:21:34 - INFO - __main__ -     Batch size = 50
07/21/2023 01:21:37 - INFO - __main__ -     eval_ppl 

In [12]:
batch_size=64
dev_file= f'{data_dir}/{args.task}/valid.jsonl'
test_file=f"{data_dir}/{args.task}/test.jsonl"
test_model=f"{output_dir}/checkpoint-best-bleu/pytorch_model.bin" #checkpoint for test

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_test \
    --model_type roberta \
    --model_name_or_path microsoft/codebert-base \
    --load_model_path {test_model} \
    --dev_filename {dev_file} \
    --test_filename {test_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --eval_batch_size {batch_size}

07/21/2023 14:35:15 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='tf_board/codebert_vulnerability_type', load_model_path='tf_board/codebert_vulnerability_type/checkpoint-best-bleu/pytorch_model.bin', train_filename=None, dev_filename='tmp_data/vulnerability_type/valid.jsonl', test_filename='tmp_data/vulnerability_type/test.jsonl', config_name='', tokenizer_name='', max_source_length=512, max_target_length=53, do_train=False, do_eval=False, do_test=True, do_lower_case=False, no_cuda=False, train_batch_size=8, eval_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
07/21/2023 14:35:17 - INFO - __main__ -   reload model from tf_board/codebert_vulnerability_type/checkpoint-best-bleu/pytorch_model.bin
07/21/2023 14:35:18 - INFO - __main_

In [13]:
from transformers import RobertaTokenizer, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)

In [14]:
from transformers import RobertaConfig, RobertaModel

config = RobertaConfig.from_pretrained(pretrained_model)
encoder = RobertaModel.from_pretrained(pretrained_model, config = config)    
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(encoder = encoder,decoder = decoder,config=config,
                beam_size=beam_size,max_length=target_length,
                sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
model.load_state_dict(torch.load(Path(output_dir)/"checkpoint-best-bleu/pytorch_model.bin"))
model.to(device)

Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [15]:
idx = 0
TEXT_TO_SUMMARIZE = df_test.func_before.values[idx]
print('Code:', TEXT_TO_SUMMARIZE)
print('Original Comment:', df_val.explain.values[idx])

Code: static ssize_t read_mem(struct file *file, char __user *buf,    size_t count, loff_t *ppos) {  phys_addr_t p = *ppos;  ssize_t read, sz;  void *ptr;   if (p != *ppos)   return 0;   if (!valid_phys_addr_range(p, count))   return -EFAULT;  read = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED  /* we don't have page 0 mapped on sparc and m68k.. */  if (p < PAGE_SIZE) {   sz = size_inside_page(p, count);   if (sz > 0) {    if (clear_user(buf, sz))     return -EFAULT;    buf += sz;    p += sz;    count -= sz;    read += sz;   }  } #endif  while (count > 0) {  unsigned long remaining;  sz = size_inside_page(p, count);  if (!range_is_allowed(p >> PAGE_SHIFT, count))  return -EPERM;  /*    * On ia64 if a page has been mapped somewhere as uncached, then    * it must also be accessed uncached by the kernel or data    * corruption may occur.    */   ptr = xlate_dev_mem_ptr(p);   if (!ptr)    return -EFAULT;  remaining = copy_to_user(buf, ptr, sz);   unxlate_dev_mem_ptr(p, ptr);  if (remaining)  r

In [16]:
import pandas as pd
from code2nl.run import convert_examples_to_features, Example

def get_preds(df: pd.DataFrame):
    ps = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        examples = [
            Example(idx, source = row.func_before, target = row.explain)
        ]
        eval_features = convert_examples_to_features(
            examples, tokenizer, args, stage='test'
        )
        source_ids = torch.tensor(eval_features[0].source_ids, dtype = torch.long).unsqueeze(0).to('cuda')
        source_mask = torch.tensor(eval_features[0].source_mask, dtype = torch.long).unsqueeze(0).to('cuda')

        with torch.no_grad():
            preds = model(source_ids = source_ids, source_mask = source_mask)  
            for pred in preds:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                ps.append(text)
    
    return ps

In [17]:
import evaluate
rouge = evaluate.load('rouge')

In [18]:
# df_test = df_test.reset_index()
preds = get_preds(df_test)
references = []
for idx, row in df_test.iterrows():
    # print('Code:', row.func_before)
    # print('Original Comment:', row.explain)
    # print('Generated Comment:', preds[idx])
    # print('='*40)
    references.append(row.explain)

results = rouge.compute(predictions=preds, references=references)
results

  0%|          | 0/1076 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1064 > 512). Running this sequence through the model will result in indexing errors
07/21/2023 14:38:36 - INFO - absl -   Using default tokenizer.


{'rouge1': 0.6258336149600091,
 'rouge2': 0.6047906709152062,
 'rougeL': 0.6257911027427756,
 'rougeLsum': 0.6263246176582606}

In [19]:
for idx, row in df_test.head(3).iterrows():
    print('Code:', row.func_before)
    print('Original Comment:', row.explain)
    print('Generated Comment:', preds[idx])
    print('='*40)

Code: static ssize_t read_mem(struct file *file, char __user *buf,    size_t count, loff_t *ppos) {  phys_addr_t p = *ppos;  ssize_t read, sz;  void *ptr;   if (p != *ppos)   return 0;   if (!valid_phys_addr_range(p, count))   return -EFAULT;  read = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED  /* we don't have page 0 mapped on sparc and m68k.. */  if (p < PAGE_SIZE) {   sz = size_inside_page(p, count);   if (sz > 0) {    if (clear_user(buf, sz))     return -EFAULT;    buf += sz;    p += sz;    count -= sz;    read += sz;   }  } #endif  while (count > 0) {  unsigned long remaining;  sz = size_inside_page(p, count);  if (!range_is_allowed(p >> PAGE_SHIFT, count))  return -EPERM;  /*    * On ia64 if a page has been mapped somewhere as uncached, then    * it must also be accessed uncached by the kernel or data    * corruption may occur.    */   ptr = xlate_dev_mem_ptr(p);   if (!ptr)    return -EFAULT;  remaining = copy_to_user(buf, ptr, sz);   unxlate_dev_mem_ptr(p, ptr);  if (remaining)  r

In [1]:
import pandas as pd

In [2]:
df_1 = pd.read_csv('tf_board/codebert_vulnerability_type/test_1.gold', sep='\t', header=None)

In [3]:
references = df_1[1].values.tolist()

In [4]:
df_2 = pd.read_csv('tf_board/codebert_vulnerability_type/test_1.output', sep='\t', header=None)

In [5]:
preds = df_2[1].values.tolist()

In [6]:
import evaluate
rouge = evaluate.load('rouge')

df = []
for i, v in enumerate(zip(preds, references)):
    r_ = rouge.compute(predictions=[v[0]], references=[v[1]])
    df.append((i, r_['rouge1'], r_['rouge2'], r_['rougeL']))

In [7]:
df_ = pd.DataFrame(df, columns=['id', 'rouge1', 'rouge2', 'rougeL'])

In [8]:
df_.to_csv(f"vulnerability_type.tsv", sep='\t')