In [1]:
# !pip3 uninstall -y apex

In [2]:
import os
import sys

# Environment for this kernel. These are set before the cell that imports
# torch and the project packages.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ["PYTHONPATH"] = "/home/shariff/AVATAR"
# A `!export VAR=...` line runs in a short-lived subshell and never reaches the
# kernel process, so the variable is set on the kernel's own environment here.
os.environ["PYTHONIOENCODING"] = "utf-8"
# PYTHONPATH is only consulted when an interpreter starts; extend sys.path too
# so the project imports resolve in the already-running kernel.
if "/home/shariff/AVATAR" not in sys.path:
    sys.path.insert(0, "/home/shariff/AVATAR")

In [3]:
from codet5.configs import add_args, set_seed, set_dist
from codet5.utils import get_filenames, get_elapse_time, load_and_cache_gen_data
from codet5.models import build_or_load_gen_model
import multiprocessing
import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from tqdm import tqdm
from evaluation import smooth_bleu
from evaluation.bleu import _bleu
from evaluation.CodeBLEU import calc_code_bleu
import numpy as np
import time

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class Args:
    """Static run configuration standing in for the command-line arguments.

    Every setting is a plain class attribute; the notebook passes a single
    instance (``args``) to the project helpers and to ``eval_bleu_epoch``.
    """

    # --- run mode / task selection ---
    do_test = True
    task = "translate"
    sub_task = "java-python"
    add_task_prefix = True

    # --- model and checkpoint ---
    model_type = "codet5"
    config_name = "Salesforce/codet5-base"
    tokenizer_name = "Salesforce/codet5-base"
    model_name_or_path = "Salesforce/codet5-base"
    load_model_path = "/home/shariff/AVATAR/codet5/program/java2python/checkpoint-best-ppl/pytorch_model.bin"

    # --- data, cache and output locations ---
    data_dir = "/home/shariff/AVATAR/data"
    cache_path = "/home/shariff/AVATAR/codet5/program/java2python"
    output_dir = "/home/shariff/AVATAR/codet5/program/java2python"
    res_dir = "/home/shariff/AVATAR/codet5/program/java2python"
    res_fn = ''  # optional extra results file; empty string disables it

    # --- sequence lengths and decoding ---
    max_source_length = 510
    max_target_length = 510
    beam_size = 10
    eval_batch_size = 4

    # --- runtime ---
    local_rank = -1
    no_cuda = False
    seed = 1234
    data_num = -1  # -1 is the value eval_bleu_epoch checks to enable worker processes


args = Args()

In [5]:
def eval_bleu_epoch(args, eval_data, eval_examples, model, tokenizer, split_tag):
    """Generate predictions for ``eval_data`` and score them against ``eval_examples``.

    Predictions, references and sources are written to ``<split_tag>.output``,
    ``<split_tag>.gold`` and ``<split_tag>.src`` inside ``args.res_dir``; the
    scores are printed and returned.

    Args:
        args: Run configuration. Reads ``eval_batch_size``, ``data_num``,
            ``device``, ``model_type``, ``beam_size``, ``task``,
            ``max_target_length`` and ``res_dir``.
        eval_data: Dataset whose batches carry the source token ids at index 0.
        eval_examples: Examples paired positionally with the predictions;
            each exposes ``source`` and ``target`` (and ``idx`` for the
            ``summarize`` task).
        model: Model to evaluate; switched to eval mode here.
        tokenizer: Supplies ``pad_token_id`` and ``decode``.
        split_tag: Label used in progress messages and output file names.

    Returns:
        dict: ``{'em', 'bleu'}``. For the ``defect`` task ``em`` is a fraction
        (not a percentage), ``bleu`` is 0 and an extra ``'codebleu': 0`` entry
        is included.
    """
    print("  ***** Running bleu evaluation on {} data*****".format(split_tag))
    print("  Num examples = ", len(eval_examples))
    print("  Batch size = ", args.eval_batch_size)
    eval_sampler = SequentialSampler(eval_data)
    if args.data_num == -1:
        eval_dataloader = DataLoader(
            eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True
        )
    else:
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    pred_ids = []
    for batch in tqdm(eval_dataloader, total=len(eval_dataloader), desc="Eval bleu for {} set".format(split_tag)):
        source_ids = batch[0].to(args.device)
        # Attend to every position that is not padding.
        source_mask = source_ids.ne(tokenizer.pad_token_id)
        with torch.no_grad():
            if args.model_type == 'roberta':
                preds = model(source_ids=source_ids, source_mask=source_mask)
                # Keeps the first candidate per example -- presumably the top beam; TODO confirm.
                top_preds = [pred[0].cpu().numpy() for pred in preds]
            else:
                preds = model.generate(source_ids,
                                       attention_mask=source_mask,
                                       use_cache=True,
                                       num_beams=args.beam_size,
                                       early_stopping=args.task == 'summarize',
                                       max_length=args.max_target_length)
                top_preds = list(preds.cpu().numpy())
            pred_ids.extend(top_preds)
    pred_nls = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for ids in pred_ids
    ]

    output_fn = os.path.join(args.res_dir, "{}.output".format(split_tag))
    gold_fn = os.path.join(args.res_dir, "{}.gold".format(split_tag))
    src_fn = os.path.join(args.res_dir, "{}.src".format(split_tag))

    if args.task in ['defect']:
        target_dict = {0: 'false', 1: 'true'}
        golds = [target_dict[ex.target] for ex in eval_examples]
        eval_acc = np.mean([int(p == g) for p, g in zip(pred_nls, golds)])
        result = {'em': eval_acc, 'bleu': 0, 'codebleu': 0}

        # Explicit utf8, matching the other branch, so non-ASCII source text
        # does not depend on the platform's default encoding.
        with open(output_fn, 'w', encoding='utf8') as f, \
                open(gold_fn, 'w', encoding='utf8') as f1, \
                open(src_fn, 'w', encoding='utf8') as f2:
            for pred_nl, gold in zip(pred_nls, eval_examples):
                f.write(pred_nl.strip() + '\n')
                f1.write(target_dict[gold.target] + '\n')
                f2.write(gold.source.strip() + '\n')
            # print() does not interpolate logger-style arguments; format explicitly.
            print("Save the predictions into %s" % output_fn)
    else:
        dev_accs, predictions = [], []
        with open(output_fn, 'w', encoding='utf8') as f, \
                open(gold_fn, 'w', encoding='utf8') as f1, \
                open(src_fn, 'w', encoding='utf8') as f2:
            for pred_nl, gold in zip(pred_nls, eval_examples):
                dev_accs.append(pred_nl.strip() == gold.target.strip())
                if args.task in ['summarize']:
                    # Summarize files are keyed by example index (tab-separated).
                    predictions.append(str(gold.idx) + '\t' + pred_nl)
                    f.write(str(gold.idx) + '\t' + pred_nl.strip() + '\n')
                    f1.write(str(gold.idx) + '\t' + gold.target.strip() + '\n')
                    f2.write(str(gold.idx) + '\t' + gold.source.strip() + '\n')
                else:
                    print("INPUT:", gold.source.strip())
                    print()
                    print("OUTPUT:", pred_nl.strip())
                    print()
                    print("GROUND TRUTH:", gold.target.strip())
                    f.write(pred_nl.strip() + '\n')
                    f1.write(gold.target.strip() + '\n')
                    f2.write(gold.source.strip() + '\n')

        # BLEU is computed from the files written above, after they are closed.
        if args.task in ['summarize']:
            (goldMap, predictionMap) = smooth_bleu.computeMaps(predictions, gold_fn)
            bleu = round(smooth_bleu.bleuFromMaps(goldMap, predictionMap)[0], 2)
        else:
            bleu = round(_bleu(gold_fn, output_fn), 2)

        em = np.mean(dev_accs) * 100
        result = {'em': em, 'bleu': bleu}

    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  {} = {}".format(key, str(round(result[key], 4))))

    return result

In [6]:
# Start the wall-clock timer used for the elapsed-time entries in the logs.
t0 = time.time()

# Both helpers receive the shared config object; they run before the model is
# built because args.device and args.cpu_cont are read further down.
set_dist(args)
set_seed(args)

config, model, tokenizer = build_or_load_gen_model(args)
model.to(args.device)

# Worker pool handed to the data-loading helper in the next cell.
pool = multiprocessing.Pool(args.cpu_cont)

(args.train_filename,
 args.dev_filename,
 args.test_filename) = get_filenames(args.data_dir, args.task, args.sub_task)

# Summary log is opened in append mode; the next cell writes to it and closes it.
summary_log_path = os.path.join(args.output_dir, 'summary.log')
fa = open(summary_log_path, 'a+')


Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, cpu count: 128


In [7]:
# Load the test split, evaluate the model on it and record the scores in the
# summary log opened by the previous cell.
eval_examples, eval_data = load_and_cache_gen_data(
    args, args.test_filename, pool, tokenizer, 'test', only_src=True, is_sample=False
)
# Use the underlying model when it is wrapped in an object exposing `.module`.
model = model.module if hasattr(model, 'module') else model
result = eval_bleu_epoch(args, eval_data, eval_examples, model, tokenizer, 'test')
test_bleu, test_em = result['bleu'], result['em']
# 'codebleu' is only present for some tasks; report 0 otherwise.
test_codebleu = result.get('codebleu', 0)
result_str = "bleu-4: %.2f, em: %.4f, codebleu: %.4f\n" % (test_bleu, test_em, test_codebleu)
print("result_str", result_str)
fa.write(result_str)
if args.res_fn:
    with open(args.res_fn, 'a+') as f:
        # `file` was never defined in this notebook (NameError on Python 3);
        # record the checkpoint that was evaluated instead.
        f.write('[Time: {}] {}\n'.format(get_elapse_time(t0), args.load_model_path))
        f.write(result_str)
# Terminate the line so successive appended runs do not run together.
fa.write("Finish and take {}\n".format(get_elapse_time(t0)))
fa.close()
