In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from pathlib import Path
from project_dataset import load_dataset
from code2nl.model import Seq2Seq

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
from dataclasses import dataclass

@dataclass
class Args:
    model_name = "neulab/codebert-cpp"
    num_proc = 4
    batch_size = 40
    max_source_length = 512  
    max_target_length = 167 
    data_cols = ["CVE ID", "explain", "func_before"]
    save_dir = 'tf_board'
    epochs = 100
    grad_acc_steps = 4
    lr = 5e-5
    log_freq = 10
    local_rank = -1
    deepspeed = None
    fp16 = False
    lr_warmup_steps = 200
    weight_decay = 0.05
    task = "impact"
    prefix = 'neulab'
    do_lower_case = False
    beam_size = 10
    
args = Args()

In [7]:
ds = load_dataset(args.task)

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 7032
    })
    validation: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 782
    })
    test: Dataset({
        features: ['CVE ID', 'explain', 'func_before', 'processed_func'],
        num_rows: 1954
    })
})

In [9]:
df_train = ds['train']
df_train = df_train.to_pandas()

df_val = ds['validation']
df_val = df_val.to_pandas()

df_test = ds['test']
df_test = df_test.to_pandas()

In [10]:
df_train.head(3)

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2017-15537,read the FPU registers of other processes on t...,"int xstateregs_set(struct task_struct *target,...","int xstateregs_set(struct task_struct *target,..."
1,CVE-2011-2875,cause a denial of service or possibly have uns...,WebRTCSessionDescriptionDescriptor MockWebRTCP...,WebRTCSessionDescriptionDescriptor\nMockWebRTC...
2,CVE-2016-4303,overflow a buffer and execute arbitrary code o...,int cJSON_GetArraySize( cJSON *array ) { cJSO...,int cJSON_GetArraySize(cJSON *array) {\n cJSO...


In [11]:
import os

os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(f'tmp_data/{args.task}', exist_ok=True)
os.makedirs(f'{args.save_dir}/{args.prefix}_{args.task}', exist_ok=True)

In [12]:
import json

df_train['code_tokens'] = df_train.func_before.apply(lambda x: x.split())
df_train['docstring_tokens'] = df_train.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/train.jsonl','w') as f:
    for _, row in df_train.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_val['code_tokens'] = df_val.func_before.apply(lambda x: x.split())
df_val['docstring_tokens'] = df_val.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/valid.jsonl','w') as f:
    for _, row in df_val.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

df_test['code_tokens'] = df_test.func_before.apply(lambda x: x.split())
df_test['docstring_tokens'] = df_test.explain.apply(lambda x: x.split())
with open(f'tmp_data/{args.task}/test.jsonl','w') as f:
    for _, row in df_test.iterrows():
        f.write(json.dumps(row.to_dict()) + '\n')

In [13]:
lr = args.lr
batch_size = args.batch_size # change depending on the GPU Colab gives you
beam_size = args.beam_size
source_length = args.max_source_length
target_length = args.max_target_length
data_dir = 'tmp_data'
output_dir = f'{args.save_dir}/{args.prefix}_{args.task}'
train_file = f'{data_dir}/{args.task}/train.jsonl'
dev_file = f'{data_dir}/{args.task}/valid.jsonl'
epochs = args.epochs 
pretrained_model = args.model_name

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_train \
    --do_eval \
    --do_lower_case \
    --model_type roberta \
    --model_name_or_path {pretrained_model} \
    --train_filename {train_file} \
    --dev_filename {dev_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --train_batch_size {batch_size} \
    --eval_batch_size {batch_size} \
    --learning_rate {lr} \
    --num_train_epochs {epochs}

In [14]:
batch_size=64
dev_file= f'{data_dir}/{args.task}/valid.jsonl'
test_file=f"{data_dir}/{args.task}/test.jsonl"
test_model=f"{output_dir}/checkpoint-best-bleu/pytorch_model.bin" #checkpoint for test

! python CodeXGLUE/Code-Text/code-to-text/code/run.py \
    --do_test \
    --model_type roberta \
    --model_name_or_path microsoft/codebert-base \
    --load_model_path {test_model} \
    --dev_filename {dev_file} \
    --test_filename {test_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --eval_batch_size {batch_size}

In [15]:
from transformers import RobertaTokenizer, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)

In [16]:
from transformers import RobertaConfig, RobertaModel

config = RobertaConfig.from_pretrained(pretrained_model)
encoder = RobertaModel.from_pretrained(pretrained_model, config = config)    
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(encoder = encoder,decoder = decoder,config=config,
                beam_size=beam_size,max_length=target_length,
                sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
model.load_state_dict(torch.load(Path(output_dir)/"checkpoint-best-bleu/pytorch_model.bin"))
model.to(device)

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [17]:
idx = 0
TEXT_TO_SUMMARIZE = df_test.func_before.values[idx]
print('Code:', TEXT_TO_SUMMARIZE)
print('Original Comment:', df_val.explain.values[idx])

Code: static int cg_getattr(const char *path, struct stat *sb) {  struct timespec now;  struct fuse_context *fc = fuse_get_context();  char * cgdir = NULL;  char *fpath = NULL, *path1, *path2;  struct cgfs_files *k = NULL;  const char *cgroup;  const char *controller = NULL;  int ret = -ENOENT;    if (!fc)   return -EIO;   memset(sb, 0, sizeof(struct stat));   if (clock_gettime(CLOCK_REALTIME, &now) < 0)   return -EINVAL;   sb->st_uid = sb->st_gid = 0;  sb->st_atim = sb->st_mtim = sb->st_ctim = now;  sb->st_size = 0;   if (strcmp(path, "/cgroup") == 0) {   sb->st_mode = S_IFDIR | 00755;   sb->st_nlink = 2;   return 0;  }   controller = pick_controller_from_path(fc, path);  if (!controller)   return -EIO;  cgroup = find_cgroup_in_path(path);  if (!cgroup) {   /* this is just /cgroup/controller, return it as a dir */   sb->st_mode = S_IFDIR | 00755;   sb->st_nlink = 2;   return 0;  }   get_cgdir_and_path(cgroup, &cgdir, &fpath);   if (!fpath) {   path1 = "/";   path2 = cgdir;  } else {  

In [18]:
import pandas as pd
from code2nl.run import convert_examples_to_features, Example

def get_preds(df: pd.DataFrame):
    ps = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        examples = [
            Example(idx, source = row.func_before, target = row.explain)
        ]
        eval_features = convert_examples_to_features(
            examples, tokenizer, args, stage='test'
        )
        source_ids = torch.tensor(eval_features[0].source_ids, dtype = torch.long).unsqueeze(0).to('cuda')
        source_mask = torch.tensor(eval_features[0].source_mask, dtype = torch.long).unsqueeze(0).to('cuda')

        with torch.no_grad():
            preds = model(source_ids = source_ids, source_mask = source_mask)  
            for pred in preds:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                ps.append(text)
    
    return ps

In [19]:
import evaluate
rouge = evaluate.load('rouge')



In [20]:
# df_test = df_test.reset_index()
preds = get_preds(df_test)
references = []
for idx, row in df_test.iterrows():
    # print('Code:', row.func_before)
    # print('Original Comment:', row.explain)
    # print('Generated Comment:', preds[idx])
    # print('='*40)
    references.append(row.explain)

results = rouge.compute(predictions=preds, references=references)
results

  0%|          | 0/1954 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1011 > 512). Running this sequence through the model will result in indexing errors
07/29/2023 03:58:59 - INFO - absl -   Using default tokenizer.


{'rouge1': 0.7052975024475849,
 'rouge2': 0.6633614370542575,
 'rougeL': 0.6998883135246539,
 'rougeLsum': 0.6993792951482853}

In [21]:
for idx, row in df_test.head(3).iterrows():
    print('Code:', row.func_before)
    print('Original Comment:', row.explain)
    print('Generated Comment:', preds[idx])
    print('='*40)

Code: static int cg_getattr(const char *path, struct stat *sb) {  struct timespec now;  struct fuse_context *fc = fuse_get_context();  char * cgdir = NULL;  char *fpath = NULL, *path1, *path2;  struct cgfs_files *k = NULL;  const char *cgroup;  const char *controller = NULL;  int ret = -ENOENT;    if (!fc)   return -EIO;   memset(sb, 0, sizeof(struct stat));   if (clock_gettime(CLOCK_REALTIME, &now) < 0)   return -EINVAL;   sb->st_uid = sb->st_gid = 0;  sb->st_atim = sb->st_mtim = sb->st_ctim = now;  sb->st_size = 0;   if (strcmp(path, "/cgroup") == 0) {   sb->st_mode = S_IFDIR | 00755;   sb->st_nlink = 2;   return 0;  }   controller = pick_controller_from_path(fc, path);  if (!controller)   return -EIO;  cgroup = find_cgroup_in_path(path);  if (!cgroup) {   /* this is just /cgroup/controller, return it as a dir */   sb->st_mode = S_IFDIR | 00755;   sb->st_nlink = 2;   return 0;  }   get_cgdir_and_path(cgroup, &cgdir, &fpath);   if (!fpath) {   path1 = "/";   path2 = cgdir;  } else {  