In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from pathlib import Path
from project_dataset import load_dataset
from icse.model import Seq2Seq

In [3]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [11]:
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Args:
    """Run configuration for this notebook.

    Every attribute carries a type annotation so that ``@dataclass`` actually
    registers it as a field: un-annotated class attributes are ignored by the
    decorator, which leaves the generated ``__init__`` without parameters
    (``Args(batch_size=8)`` raises ``TypeError``) and the repr empty.

    The training cell forwards a subset of these values to ``icse/run.py``;
    the remaining ones are not read by the cells visible in this notebook.
    """

    # model_name = "neulab/codebert-cpp"
    num_proc: int = 8
    batch_size: int = 35
    max_source_length: int = 512
    max_target_length: int = 53
    # default_factory gives each instance its own list instead of one list
    # shared (and mutable) across all instances.
    data_cols: List[str] = field(
        default_factory=lambda: ["CVE ID", "explain", "func_before"]
    )
    save_dir: str = 'tf_board'
    epochs: int = 100
    grad_acc_steps: int = 4
    lr: float = 5e-5
    log_freq: int = 10
    local_rank: int = -1
    # NOTE(review): only the None default is visible here; the intended type
    # (presumably a config path) should be confirmed.
    deepspeed: Optional[str] = None
    fp16: bool = False
    lr_warmup_steps: int = 200
    weight_decay: float = 0.05
    # Selects the dataset to load and the sub-directory names used below.
    task: str = "impact"
    prefix: str = 'icse'
    do_lower_case: bool = True
    beam_size: int = 10


args = Args()

In [5]:
# Load the dataset for the task selected in `args.task` through the project's
# `load_dataset` helper (imported from `project_dataset`). Later cells index
# the result by 'train', 'validation' and 'test'.
ds = load_dataset(args.task)

In [6]:
# Convert each split to a pandas DataFrame, in train / validation / test order.
df_train, df_val, df_test = (
    ds[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

In [7]:
# Preview the first three rows of the training split.
df_train.head(3)

Unnamed: 0,CVE ID,explain,func_before,processed_func
0,CVE-2016-2546,uses an incorrect type of mutex,static int snd_timer_user_tselect(struct file ...,static int snd_timer_user_tselect(struct file ...
1,CVE-2016-1683,mishandles namespace nodes,"xsltCopyOf(xsltTransformContextPtr ctxt, xmlNo...","xsltCopyOf(xsltTransformContextPtr ctxt, xmlNo..."
2,CVE-2016-3078,No boundary checking,static void php_zip_get_from(INTERNAL_FUNCTION...,static void php_zip_get_from(INTERNAL_FUNCTION...


In [8]:
import os

# Root folder for the JSONL files handed to the training script.
data_folder = 'tmp_data_icse'

# Create the log root, the per-task data folder and the per-task output
# folder; directories that already exist are left untouched.
for directory in (
    args.save_dir,
    f'{data_folder}/{args.task}',
    f'{args.save_dir}/{args.prefix}_{args.task}',
):
    os.makedirs(directory, exist_ok=True)

In [9]:
import json


def write_split_jsonl(df, path):
    """Add whitespace-token columns to ``df`` and dump it to ``path`` as JSONL.

    Three columns are added to ``df`` in place:

    * ``contents`` and ``code_tokens`` -- ``func_before`` split on whitespace
    * ``docstring_tokens`` -- ``explain`` split on whitespace

    Each row is then written as one JSON object per line.

    All token columns are derived from ``df`` itself. The previous copy-pasted
    version built ``code_tokens`` for the validation and test frames from
    ``df_train.func_before``; pandas aligns such an assignment on the index,
    so those splits were exported with training-set code in that column.
    """
    df['contents'] = df.func_before.apply(lambda text: text.split())
    df['code_tokens'] = df.func_before.apply(lambda text: text.split())
    df['docstring_tokens'] = df.explain.apply(lambda text: text.split())
    with open(path, 'w') as f:
        for _, row in df.iterrows():
            f.write(json.dumps(row.to_dict()) + '\n')


# File stems match the names the training cell passes to icse/run.py.
for split_df, split_stem in (
    (df_train, 'train'),
    (df_val, 'valid'),
    (df_test, 'test'),
):
    write_split_jsonl(split_df, f'{data_folder}/{args.task}/{split_stem}.jsonl')

In [12]:
lr = 5e-5
batch_size = args.batch_size # change depending on the GPU Colab gives you
beam_size = 10
source_length = args.max_source_length
target_length = args.max_target_length
data_dir = data_folder
output_dir = f'{args.save_dir}/{args.prefix}_{args.task}'
train_file = f'{data_dir}/{args.task}/train.jsonl'
dev_file = f'{data_dir}/{args.task}/valid.jsonl'
test_file=f"{data_dir}/{args.task}/test.jsonl"
epochs = args.epochs 

! python icse/run.py \
    --do_train \
    --do_eval \
    --do_test \
    --do_lower_case \
    --train_filename {train_file} \
    --dev_filename {dev_file} \
    --test_filename {test_file} \
    --output_dir {output_dir} \
    --max_source_length {source_length} \
    --max_target_length {target_length} \
    --beam_size {beam_size} \
    --train_batch_size {batch_size} \
    --eval_batch_size {batch_size} \
    --learning_rate {lr} \
    --num_train_epochs {epochs}

09/13/2023 22:09:53 - INFO - __main__ -   Namespace(model_type='roberta', model_name_or_path='microsoft/codebert-base', output_dir='tf_board/icse_root_cause', load_model_path=None, train_filename='tmp_data_icse/root_cause/train.jsonl', dev_filename='tmp_data_icse/root_cause/valid.jsonl', test_filename='tmp_data_icse/root_cause/test.jsonl', config_name='', tokenizer_name='', max_source_length=512, max_target_length=53, do_train=True, do_eval=True, do_test=True, do_lower_case=True, no_cuda=False, train_batch_size=35, eval_batch_size=35, gradient_accumulation_steps=1, learning_rate=5e-05, beam_size=10, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=100, max_steps=-1, eval_steps=-1, train_steps=-1, warmup_steps=0, local_rank=-1, seed=42)
  metric = load_metric("rouge")
09/13/2023 22:09:57 - INFO - __main__ -   *** Example ***
09/13/2023 22:09:57 - INFO - __main__ -   idx: 0
09/13/2023 22:09:57 - INFO - __main__ -   source_tokens: ['<s>', 'static', '_int', '_s', '

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Total: 382
09/14/2023 01:03:30 - INFO - __main__ -     bleu-4 = 61.17 
09/14/2023 01:03:30 - INFO - __main__ -     ********************
epoch 88 loss 0.0096: 100%|█████████████████████| 99/99 [01:23<00:00,  1.18it/s]
09/14/2023 01:04:54 - INFO - __main__ -   
***** Running evaluation *****
09/14/2023 01:04:54 - INFO - __main__ -     Num examples = 382
09/14/2023 01:04:54 - INFO - __main__ -     Batch size = 35
Total: 382
09/14/2023 01:05:27 - INFO - __main__ -     bleu-4 = 61.38 
09/14/2023 01:05:27 - INFO - __main__ -     ********************
epoch 89 loss 0.009: 100%|██████████████████████| 99/99 [01:23<00:00,  1.18it/s]
09/14/2023 01:06:51 - INFO - __main__ -   
***** Running evaluation *****
09/14/2023 01:06:51 - INFO - __main__ -     Num examples = 382
09/14/2023 01:06:51 - INFO - __main__ -     Batch size = 35
09/14/2023 01:06:54 - INFO - __main__ -     eval_ppl = 3.32506
09/14/2023 01:06:54 - INFO - __main__ -     global_step = 8911
09/14/2023 01:06:54 - INFO - __main__ -     tr