In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%env WANDB_PROJECT=codebert_attack_vector

env: WANDB_PROJECT=codebert_attack_vector


In [3]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from pathlib import Path
from project_dataset import load_dataset
from code2nl.model import Seq2Seq

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from dataclasses import dataclass

@dataclass
class Args:
    model_name = "neulab/codebert-cpp"
    num_proc = 4
    batch_size = 5
    max_source_length = 512  
    max_target_length = 146 
    data_cols = ["CVE ID", "explain", "func_before"]
    save_dir = 'tf_board'
    epochs = 100
    grad_acc_steps = 4
    lr = 5e-5
    log_freq = 10
    local_rank = -1
    deepspeed = None
    fp16 = False
    lr_warmup_steps = 200
    weight_decay = 0.05
    task = "attack_vector"
    prefix = 'neulab'
    do_lower_case = False
    
args = Args()

In [5]:
ds = load_dataset(args.task)

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 4858
    })
    validation: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 540
    })
    test: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 1350
    })
})

In [7]:
df_train = ds['train']
df_train = df_train.to_pandas()

df_val = ds['validation']
df_val = df_val.to_pandas()

df_test = ds['test']
df_test = df_test.to_pandas()

In [8]:
df_train.head(3)

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2013-4483,reference counter might not be adjusted properly,static int semctl_down(struct ipc_namespace *n...,static int semctl_down(struct ipc_namespace *n...
1,CVE-2017-13009,sending a specially crafted request,"mobility_print(netdissect_options *ndo, const...","mobility_print(netdissect_options *ndo, const ..."
2,CVE-2016-3839,via a crafted application that sends a signal ...,void close_uinput (void) { BTIF_TRACE_DEBUG...,void close_uinput(void) {\n BTIF_TRACE_DEBUG(...


In [9]:
import os

os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(f'tmp_data/{args.task}', exist_ok=True)
os.makedirs(f'{args.save_dir}/{args.prefix}_{args.task}', exist_ok=True)

In [10]:
import json

df_train['code_tokens'] = df_train.func_before.apply(lambda x: x.split())
df_train['docstring_tokens'] = df_train.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/train.jsonl','w') as f:
    for _, row in df_train.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_val['code_tokens'] = df_val.func_before.apply(lambda x: x.split())
df_val['docstring_tokens'] = df_val.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/valid.jsonl','w') as f:
    for _, row in df_val.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_test['code_tokens'] = df_test.func_before.apply(lambda x: x.split())
df_test['docstring_tokens'] = df_test.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/test.jsonl','w') as f:
    for _, row in df_test.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

In [11]:
lr = 5e-5
batch_size = 50 # change depending on the GPU Colab gives you
beam_size = 10
source_length = args.max_source_length
target_length = args.max_target_length
data_dir = 'tmp_data'
output_dir = f'{args.save_dir}/{args.prefix}_{args.task}'
train_file = f'{data_dir}/{args.task}/train.jsonl'
dev_file = f'{data_dir}/{args.task}/valid.jsonl'
epochs = args.epochs 
pretrained_model = args.model_name

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_train \
    --do_eval \
    --do_lower_case \
    --model_type roberta \
    --model_name_or_path {pretrained_model} \
    --train_filename {train_file} \
    --dev_filename {dev_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --train_batch_size {batch_size} \
    --eval_batch_size {batch_size} \
    --learning_rate {lr} \
    --num_train_epochs {epochs}

07/27/2023 00:35:34 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='neulab/codebert-cpp', output_dir='tf_board/neulab_attack_vector', load_model_path=None, train_filename='tmp_data/attack_vector/train.jsonl', dev_filename='tmp_data/attack_vector/valid.jsonl', test_filename=None, config_name='', tokenizer_name='', max_source_length=512, max_target_length=146, do_train=True, do_eval=True, do_test=False, do_lower_case=True, no_cuda=False, train_batch_size=50, eval_batch_size=50, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=100, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initiali

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



epoch 20 loss 0.0881: 100%|█████████████████████| 98/98 [01:19<00:00,  1.23it/s]
07/27/2023 01:16:10 - INFO - __main__ -   
***** Running evaluation *****
07/27/2023 01:16:10 - INFO - __main__ -     Num examples = 540
07/27/2023 01:16:10 - INFO - __main__ -     Batch size = 50
07/27/2023 01:16:14 - INFO - __main__ -     eval_ppl = 2.6222
07/27/2023 01:16:14 - INFO - __main__ -     global_step = 2059
07/27/2023 01:16:14 - INFO - __main__ -     train_loss = 0.0881
07/27/2023 01:16:14 - INFO - __main__ -     ********************
Total: 540
07/27/2023 01:16:44 - INFO - __main__ -     bleu-4 = 69.09 
07/27/2023 01:16:44 - INFO - __main__ -     ********************
07/27/2023 01:16:44 - INFO - __main__ -     Best bleu:69.09
07/27/2023 01:16:44 - INFO - __main__ -     ********************
epoch 21 loss 0.0723: 100%|█████████████████████| 98/98 [01:20<00:00,  1.22it/s]
07/27/2023 01:18:07 - INFO - __main__ -   
***** Running evaluation *****
07/27/2023 01:18:07 - INFO - __main__ -     Num exam

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



epoch 79 loss 0.004: 100%|██████████████████████| 98/98 [01:20<00:00,  1.22it/s]
07/27/2023 03:09:16 - INFO - __main__ -   
***** Running evaluation *****
07/27/2023 03:09:16 - INFO - __main__ -     Num examples = 540
07/27/2023 03:09:16 - INFO - __main__ -     Batch size = 50
07/27/2023 03:09:19 - INFO - __main__ -     eval_ppl = 2.91081
07/27/2023 03:09:19 - INFO - __main__ -     global_step = 7841
07/27/2023 03:09:19 - INFO - __main__ -     train_loss = 0.004
07/27/2023 03:09:19 - INFO - __main__ -     ********************
Total: 540
07/27/2023 03:09:52 - INFO - __main__ -     bleu-4 = 71.76 
07/27/2023 03:09:52 - INFO - __main__ -     ********************
epoch 80 loss 0.0045: 100%|█████████████████████| 98/98 [01:20<00:00,  1.22it/s]
07/27/2023 03:11:12 - INFO - __main__ -   
***** Running evaluation *****
07/27/2023 03:11:12 - INFO - __main__ -     Num examples = 540
07/27/2023 03:11:12 - INFO - __main__ -     Batch size = 50
07/27/2023 03:11:15 - INFO - __main__ -     eval_ppl =

In [12]:
batch_size=64
dev_file= f'{data_dir}/{args.task}/valid.jsonl'
test_file=f"{data_dir}/{args.task}/test.jsonl"
test_model=f"{output_dir}/checkpoint-best-bleu/pytorch_model.bin" #checkpoint for test

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_test \
    --model_type roberta \
    --model_name_or_path microsoft/codebert-base \
    --load_model_path {test_model} \
    --dev_filename {dev_file} \
    --test_filename {test_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --eval_batch_size {batch_size}

07/27/2023 03:48:18 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='tf_board/neulab_attack_vector', load_model_path='tf_board/neulab_attack_vector/checkpoint-best-bleu/pytorch_model.bin', train_filename=None, dev_filename='tmp_data/attack_vector/valid.jsonl', test_filename='tmp_data/attack_vector/test.jsonl', config_name='', tokenizer_name='', max_source_length=512, max_target_length=146, do_train=False, do_eval=False, do_test=True, do_lower_case=False, no_cuda=False, train_batch_size=8, eval_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
07/27/2023 03:48:19 - INFO - __main__ -   reload model from tf_board/neulab_attack_vector/checkpoint-best-bleu/pytorch_model.bin
07/27/2023 03:48:21 - INFO - __main__ -   Test file: tmp_data/atta

In [13]:
from transformers import RobertaTokenizer, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)

In [16]:
from transformers import RobertaConfig, RobertaModel

config = RobertaConfig.from_pretrained(pretrained_model)
encoder = RobertaModel.from_pretrained(pretrained_model, config = config)    
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(encoder = encoder,decoder = decoder,config=config,
                beam_size=beam_size,max_length=target_length,
                sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
model.load_state_dict(torch.load(Path(output_dir)/"checkpoint-best-bleu/pytorch_model.bin"))
model.to(device)

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [17]:
idx = 0
TEXT_TO_SUMMARIZE = df_test.func_before.values[idx]
print('Code:', TEXT_TO_SUMMARIZE)
print('Original Comment:', df_val.explain.values[idx])

Code: static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {  EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,c);  switch (type) {  case EVP_CTRL_INIT:  gctx->key_set = 0;  gctx->iv_set = 0;  gctx->ivlen = EVP_CIPHER_CTX_iv_length(c);  gctx->iv = EVP_CIPHER_CTX_iv_noconst(c);  gctx->taglen = -1;  gctx->iv_gen = 0;  gctx->tls_aad_len = -1;  return 1;   case EVP_CTRL_AEAD_SET_IVLEN:  if (arg <= 0)  return 0;  /* Allocate memory for IV if needed */  if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {  if (gctx->iv != EVP_CIPHER_CTX_iv_noconst(c))  OPENSSL_free(gctx->iv);  gctx->iv = OPENSSL_malloc(arg);  if (gctx->iv == NULL)  return 0;  }  gctx->ivlen = arg;  return 1;   case EVP_CTRL_AEAD_SET_TAG:  if (arg <= 0 || arg > 16 || EVP_CIPHER_CTX_encrypting(c))  return 0;  memcpy(EVP_CIPHER_CTX_buf_noconst(c), ptr, arg);  gctx->taglen = arg;  return 1;   case EVP_CTRL_AEAD_GET_TAG:  if (arg <= 0 || arg > 16 || !EVP_CIPHER_CTX_encrypting(c)  || gctx->taglen < 0)  ret

In [18]:
import pandas as pd
from code2nl.run import convert_examples_to_features, Example

def get_preds(df: pd.DataFrame):
    ps = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        examples = [
            Example(idx, source = row.func_before, target = row.explain)
        ]
        eval_features = convert_examples_to_features(
            examples, tokenizer, args, stage='test'
        )
        source_ids = torch.tensor(eval_features[0].source_ids, dtype = torch.long).unsqueeze(0).to('cuda')
        source_mask = torch.tensor(eval_features[0].source_mask, dtype = torch.long).unsqueeze(0).to('cuda')

        with torch.no_grad():
            preds = model(source_ids = source_ids, source_mask = source_mask)  
            for pred in preds:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                ps.append(text)
    
    return ps

In [19]:
import evaluate
rouge = evaluate.load('rouge')

In [20]:
# df_test = df_test.reset_index()
preds = get_preds(df_test)
references = []
for idx, row in df_test.iterrows():
    # print('Code:', row.func_before)
    # print('Original Comment:', row.explain)
    # print('Generated Comment:', preds[idx])
    # print('='*40)
    references.append(row.explain)

results = rouge.compute(predictions=preds, references=references)
results

  0%|          | 0/1350 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1597 > 512). Running this sequence through the model will result in indexing errors
07/27/2023 04:35:32 - INFO - absl -   Using default tokenizer.


{'rouge1': 0.7591651925483067,
 'rouge2': 0.7005250615956675,
 'rougeL': 0.7577446031994383,
 'rougeLsum': 0.7568914274597773}

In [21]:
for idx, row in df_test.head(3).iterrows():
    print('Code:', row.func_before)
    print('Original Comment:', row.explain)
    print('Generated Comment:', preds[idx])
    print('='*40)

Code: static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {  EVP_AES_GCM_CTX *gctx = EVP_C_DATA(EVP_AES_GCM_CTX,c);  switch (type) {  case EVP_CTRL_INIT:  gctx->key_set = 0;  gctx->iv_set = 0;  gctx->ivlen = EVP_CIPHER_CTX_iv_length(c);  gctx->iv = EVP_CIPHER_CTX_iv_noconst(c);  gctx->taglen = -1;  gctx->iv_gen = 0;  gctx->tls_aad_len = -1;  return 1;   case EVP_CTRL_AEAD_SET_IVLEN:  if (arg <= 0)  return 0;  /* Allocate memory for IV if needed */  if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {  if (gctx->iv != EVP_CIPHER_CTX_iv_noconst(c))  OPENSSL_free(gctx->iv);  gctx->iv = OPENSSL_malloc(arg);  if (gctx->iv == NULL)  return 0;  }  gctx->ivlen = arg;  return 1;   case EVP_CTRL_AEAD_SET_TAG:  if (arg <= 0 || arg > 16 || EVP_CIPHER_CTX_encrypting(c))  return 0;  memcpy(EVP_CIPHER_CTX_buf_noconst(c), ptr, arg);  gctx->taglen = arg;  return 1;   case EVP_CTRL_AEAD_GET_TAG:  if (arg <= 0 || arg > 16 || !EVP_CIPHER_CTX_encrypting(c)  || gctx->taglen < 0)  ret