In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%env WANDB_PROJECT=codebert_attack_vector

env: WANDB_PROJECT=codebert_attack_vector


In [3]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from pathlib import Path
from project_dataset import load_dataset
from code2nl.model import Seq2Seq

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from dataclasses import dataclass

@dataclass
class Args:
    model_name = "neulab/codebert-cpp"
    num_proc = 4
    batch_size = 40
    max_source_length = 512  
    max_target_length = 153 
    data_cols = ["CVE ID", "explain", "func_before"]
    save_dir = 'tf_board'
    epochs = 100
    grad_acc_steps = 4
    lr = 5e-5
    log_freq = 10
    local_rank = -1
    deepspeed = None
    fp16 = False
    lr_warmup_steps = 200
    weight_decay = 0.05
    task = "root_cause"
    prefix = 'neulab'
    do_lower_case = False
    beam_size = 10
    
args = Args()

In [6]:
ds = load_dataset(args.task)

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 3431
    })
    validation: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 382
    })
    test: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 954
    })
})

In [8]:
df_train = ds['train']
df_train = df_train.to_pandas()

df_val = ds['validation']
df_val = df_val.to_pandas()

df_test = ds['test']
df_test = df_test.to_pandas()

In [9]:
df_train.head(3)

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2016-2546,uses an incorrect type of mutex,static int snd_timer_user_tselect(struct file ...,static int snd_timer_user_tselect(struct file ...
1,CVE-2016-1683,mishandles namespace nodes,"xsltCopyOf(xsltTransformContextPtr ctxt, xmlNo...","xsltCopyOf(xsltTransformContextPtr ctxt, xmlNo..."
2,CVE-2016-3078,No boundary checking,static void php_zip_get_from(INTERNAL_FUNCTION...,static void php_zip_get_from(INTERNAL_FUNCTION...


In [10]:
import os

os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(f'tmp_data/{args.task}', exist_ok=True)
os.makedirs(f'{args.save_dir}/{args.prefix}_{args.task}', exist_ok=True)

In [11]:
import json

df_train['code_tokens'] = df_train.func_before.apply(lambda x: x.split())
df_train['docstring_tokens'] = df_train.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/train.jsonl','w') as f:
    for _, row in df_train.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_val['code_tokens'] = df_val.func_before.apply(lambda x: x.split())
df_val['docstring_tokens'] = df_val.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/valid.jsonl','w') as f:
    for _, row in df_val.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_test['code_tokens'] = df_test.func_before.apply(lambda x: x.split())
df_test['docstring_tokens'] = df_test.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/test.jsonl','w') as f:
    for _, row in df_test.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

In [12]:
lr = 5e-5
batch_size = args.batch_size # change depending on the GPU Colab gives you
beam_size = args.beam_size
source_length = args.max_source_length
target_length = args.max_target_length
data_dir = 'tmp_data'
output_dir = f'{args.save_dir}/{args.prefix}_{args.task}'
train_file = f'{data_dir}/{args.task}/train.jsonl'
dev_file = f'{data_dir}/{args.task}/valid.jsonl'
epochs = args.epochs 
pretrained_model = args.model_name


! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_train \
    --do_eval \
    --do_lower_case \
    --model_type roberta \
    --model_name_or_path {pretrained_model} \
    --train_filename {train_file} \
    --dev_filename {dev_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --train_batch_size {batch_size} \
    --eval_batch_size {batch_size} \
    --learning_rate {lr} \
    --num_train_epochs {epochs}

In [13]:
batch_size=64
dev_file= f'{data_dir}/{args.task}/valid.jsonl'
test_file=f"{data_dir}/{args.task}/test.jsonl"
test_model=f"{output_dir}/checkpoint-best-bleu/pytorch_model.bin" #checkpoint for test

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_test \
    --model_type roberta \
    --model_name_or_path microsoft/codebert-base \
    --load_model_path {test_model} \
    --dev_filename {dev_file} \
    --test_filename {test_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --eval_batch_size {batch_size}

07/28/2023 18:23:53 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='tf_board/neulab_root_cause', load_model_path='tf_board/neulab_root_cause/checkpoint-best-bleu/pytorch_model.bin', train_filename=None, dev_filename='tmp_data/root_cause/valid.jsonl', test_filename='tmp_data/root_cause/test.jsonl', config_name='', tokenizer_name='', max_source_length=512, max_target_length=153, do_train=False, do_eval=False, do_test=True, do_lower_case=False, no_cuda=False, train_batch_size=8, eval_batch_size=64, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
07/28/2023 18:23:55 - INFO - __main__ -   reload model from tf_board/neulab_root_cause/checkpoint-best-bleu/pytorch_model.bin
07/28/2023 18:23:56 - INFO - __main__ -   Test file: tmp_data/root_cause/valid.js

In [14]:
from transformers import RobertaTokenizer, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)

In [15]:
from transformers import RobertaConfig, RobertaModel

config = RobertaConfig.from_pretrained(pretrained_model)
encoder = RobertaModel.from_pretrained(pretrained_model, config = config)    
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(encoder = encoder,decoder = decoder,config=config,
                beam_size=beam_size,max_length=target_length,
                sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
model.load_state_dict(torch.load(Path(output_dir)/"checkpoint-best-bleu/pytorch_model.bin"))
model.to(device)

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [16]:
idx = 0
TEXT_TO_SUMMARIZE = df_test.func_before.values[idx]
print('Code:', TEXT_TO_SUMMARIZE)
print('Original Comment:', df_val.explain.values[idx])

Code: int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) {  bool pr = false;  u32 msr = msr_info->index;  u64 data = msr_info->data;   switch (msr) {  case MSR_AMD64_NB_CFG:  case MSR_IA32_UCODE_REV:  case MSR_IA32_UCODE_WRITE:  case MSR_VM_HSAVE_PA:  case MSR_AMD64_PATCH_LOADER:  case MSR_AMD64_BU_CFG2:   break;   case MSR_EFER:   return set_efer(vcpu, data);  case MSR_K7_HWCR:   data &= ~(u64)0x40; /* ignore flush filter disable */   data &= ~(u64)0x100; /* ignore ignne emulation enable */   data &= ~(u64)0x8; /* ignore TLB cache disable */   if (data != 0) {    vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",      data);    return 1;   }   break;  case MSR_FAM10H_MMIO_CONF_BASE:   if (data != 0) {    vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "      "0x%llx\n", data);    return 1;   }   break;  case MSR_IA32_DEBUGCTLMSR:   if (!data) {    /* We support the non-activated case already */    break;   } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCT

In [17]:
import pandas as pd
from code2nl.run import convert_examples_to_features, Example

def get_preds(df: pd.DataFrame):
    ps = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        examples = [
            Example(idx, source = row.func_before, target = row.explain)
        ]
        eval_features = convert_examples_to_features(
            examples, tokenizer, args, stage='test'
        )
        source_ids = torch.tensor(eval_features[0].source_ids, dtype = torch.long).unsqueeze(0).to('cuda')
        source_mask = torch.tensor(eval_features[0].source_mask, dtype = torch.long).unsqueeze(0).to('cuda')

        with torch.no_grad():
            preds = model(source_ids = source_ids, source_mask = source_mask)  
            for pred in preds:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                ps.append(text)
    
    return ps

In [18]:
import evaluate
rouge = evaluate.load('rouge')



In [19]:
# df_test = df_test.reset_index()
preds = get_preds(df_test)
references = []
for idx, row in df_test.iterrows():
    # print('Code:', row.func_before)
    # print('Original Comment:', row.explain)
    # print('Generated Comment:', preds[idx])
    # print('='*40)
    references.append(row.explain)

results = rouge.compute(predictions=preds, references=references)
results

  0%|          | 0/954 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2771 > 512). Running this sequence through the model will result in indexing errors
07/28/2023 18:27:05 - INFO - absl -   Using default tokenizer.


{'rouge1': 0.623554735169465,
 'rouge2': 0.5963898806527652,
 'rougeL': 0.6222143396636503,
 'rougeLsum': 0.622308346074455}

In [20]:
for idx, row in df_test.head(3).iterrows():
    print('Code:', row.func_before)
    print('Original Comment:', row.explain)
    print('Generated Comment:', preds[idx])
    print('='*40)

Code: int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) {  bool pr = false;  u32 msr = msr_info->index;  u64 data = msr_info->data;   switch (msr) {  case MSR_AMD64_NB_CFG:  case MSR_IA32_UCODE_REV:  case MSR_IA32_UCODE_WRITE:  case MSR_VM_HSAVE_PA:  case MSR_AMD64_PATCH_LOADER:  case MSR_AMD64_BU_CFG2:   break;   case MSR_EFER:   return set_efer(vcpu, data);  case MSR_K7_HWCR:   data &= ~(u64)0x40; /* ignore flush filter disable */   data &= ~(u64)0x100; /* ignore ignne emulation enable */   data &= ~(u64)0x8; /* ignore TLB cache disable */   if (data != 0) {    vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",      data);    return 1;   }   break;  case MSR_FAM10H_MMIO_CONF_BASE:   if (data != 0) {    vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "      "0x%llx\n", data);    return 1;   }   break;  case MSR_IA32_DEBUGCTLMSR:   if (!data) {    /* We support the non-activated case already */    break;   } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCT

In [27]:
import pandas as pd

In [30]:
df_1 = pd.read_csv('tf_board/neulab_root_cause/test_1.gold', sep='\t', header=None)

In [36]:
references = df_1[1].values.tolist()

In [37]:
df_2 = pd.read_csv('tf_board/neulab_root_cause/test_1.output', sep='\t', header=None)

In [38]:
preds = df_2[1].values.tolist()

In [39]:
import evaluate
rouge = evaluate.load('rouge')

df = []
for i, v in enumerate(zip(preds, references)):
    r_ = rouge.compute(predictions=[v[0]], references=[v[1]])
    df.append((i, r_['rouge1'], r_['rouge2'], r_['rougeL']))

In [40]:
df_ = pd.DataFrame(df, columns=['id', 'rouge1', 'rouge2', 'rougeL'])

In [41]:
df_.to_csv(f"root_cause_cpp.tsv", sep='\t')