In [1]:
import os

# Environment for this kernel and for any child process it starts.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# NOTE: changing PYTHONPATH here only reaches child processes; imports made by
# this notebook itself are resolved through sys.path.
os.environ["PYTHONPATH"] = "/home/shariff/AVATAR"
# `!export VAR=...` runs in a throw-away subshell, so the variable never reaches
# the kernel or later `!` commands; set it through os.environ instead.
os.environ["PYTHONIOENCODING"] = "utf-8"

In [2]:
from __future__ import absolute_import

import sys

sys.path.append("..")
import os
import json
import random
import logging
import argparse

from io import open
from itertools import cycle

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer
)

from evaluation.bleu import _bleu
from codebert.model import Seq2Seq

# Registry: model-type key -> (config class, model class, tokenizer class).
MODEL_CLASSES = dict(
    roberta=(RobertaConfig, RobertaModel, RobertaTokenizer),
)

# Root logging: INFO and above, written to stdout with a timestamped line format.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
# Module-level logger used by the rest of the notebook.
logger = logging.getLogger(__name__)

In [3]:
# class Args:
#     do_test = True
#     model_type = "codet5"
#     config_name = "Salesforce/codet5-base" 
#     tokenizer_name = "Salesforce/codet5-base"
#     model_name_or_path = "Salesforce/codet5-base"
#     load_model_path = "/home/shariff/AVATAR/codet5/program/java2python/checkpoint-best-ppl/pytorch_model.bin"
#     task = "translate"
#     sub_task = "java-python"
#     data_dir = "/home/shariff/AVATAR/data"
#     cache_path = "/home/shariff/AVATAR/codet5/program/java2python"
#     output_dir = "/home/shariff/AVATAR/codet5/program/java2python"
#     res_dir = "/home/shariff/AVATAR/codet5/program/java2python"
#     max_source_length = 510 
#     max_target_length = 510 
#     beam_size = 10
#     eval_batch_size = 8
#     local_rank = -1
#     no_cuda=False
#     seed = 1234
#     data_num = -1
#     add_task_prefix=True
#     res_fn = ''
    
# args = Args()

In [3]:
class Args:
    """Static run configuration for the CodeBERT Java-to-Python run.

    Every option is a plain class attribute; the module-level ``args`` instance
    below exposes them through attribute access.
    """

    # --- model / tokenizer selection ---
    model_type = "roberta"
    model_name_or_path = "microsoft/codebert-base"
    config_name = "roberta-base"
    tokenizer_name = "roberta-base"
    do_lower_case = True

    # --- filesystem locations ---
    data_dir = "/home/shariff/AVATAR/data"
    output_dir = "/home/shariff/AVATAR/codebert/program/java2python"
    load_model_path = "/home/shariff/AVATAR/codebert/program/java2python/checkpoint-best-ppl/pytorch_model.bin"
    log_file = "/home/shariff/AVATAR/codebert/program/java2python/finetune.log"
    # train_filename / dev_filename / test_filename are not set here.

    # --- translation direction ---
    source = "java"
    target = "python"

    # --- which phases to run ---
    do_train = False
    do_eval = False
    do_test = True

    # --- sequence lengths and decoding ---
    max_source_length = 510
    max_target_length = 510
    beam_size = 10

    # --- batching ---
    train_batch_size = 8
    eval_batch_size = 16
    gradient_accumulation_steps = 1

    # --- optimisation hyper-parameters ---
    learning_rate = 5e-5
    weight_decay = 0.0
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    warmup_steps = 0

    # --- training schedule ---
    num_train_epochs = 30
    max_steps = -1
    train_steps = -1
    eval_steps = 1
    max_patience = 10

    # --- runtime ---
    no_cuda = False
    local_rank = -1
    seed = 42


args = Args()

In [4]:
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators from ``args.seed``.

    When ``args.n_gpu`` is positive, the CUDA generators are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

In [5]:
# Mirror log records to args.log_file (when configured) and record the run settings.
if args.log_file is not None:
    # FileHandler opens the file right away, so its directory has to exist first;
    # the output directory that holds it is only created further down.
    log_dir = os.path.dirname(args.log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.abspath(args.log_file)
    # Reuse a handler already attached for this file so that re-running the cell
    # does not duplicate every line written to the log.
    fh = next(
        (
            handler for handler in logger.handlers
            if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
        ),
        None,
    )
    if fh is None:
        fh = logging.FileHandler(args.log_file)
        logger.addHandler(fh)
# Logging the bare object would only show its default repr; log the public
# option names and values instead.
logger.info({name: getattr(args, name) for name in dir(args) if not name.startswith("_")})

# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
    # Single-process run: first visible GPU unless CUDA is unavailable or disabled.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda:0" if use_cuda else "cpu")
    # Report zero GPUs when running on the CPU so that the later CUDA seeding and
    # DataParallel wrapping are not triggered for a model that stays on the CPU.
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend='nccl')
    args.n_gpu = 1
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
               args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
# Keep the chosen device on args for code that only receives the config object.
args.device = device

# Set seed
set_seed(args)
# Create the output directory if it is missing; exist_ok makes this safe to
# re-run and removes the gap between a separate existence check and the create.
os.makedirs(args.output_dir, exist_ok=True)

# Look up the config/model/tokenizer classes registered for the selected model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
# Fall back to the model name when no separate config name is given.
config = config_class.from_pretrained(args.config_name or args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name,
    do_lower_case=args.do_lower_case,
)

# Build the model: encoder loaded via from_pretrained plus a 6-layer
# nn.TransformerDecoder sized from the encoder config.
encoder = model_class.from_pretrained(args.model_name_or_path, config=config)
decoder_layer = nn.TransformerDecoderLayer(
    d_model=config.hidden_size,
    nhead=config.num_attention_heads,
)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model = Seq2Seq(
    encoder=encoder, decoder=decoder, config=config,
    beam_size=args.beam_size, max_length=args.max_target_length,
    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id,
)

if args.load_model_path is not None:
    logger.info("reload model from {}".format(args.load_model_path))
    # The model is still on the CPU at this point (it is moved to `device` later),
    # so map the stored tensors to the CPU. Without map_location, torch.load tries
    # to restore each tensor onto the device it was saved from, which fails when
    # that device is not available in this process.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    model.load_state_dict(torch.load(args.load_model_path, map_location="cpu"))

###### Count and print number of parameters ######
# One pass over the parameters: total element count, and the part that requires gradients.
total_params = 0
total_trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        total_trainable_params += param.numel()
logger.info("num. model params: {} (num. trained: {})".format(total_params, total_trainable_params))

model.to(device)
if args.local_rank == -1:
    # Single process: wrap in DataParallel only when several GPUs are reported.
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
else:
    # Distributed run: wrap the model with apex's DistributedDataParallel.
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError(
            "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
        )
    model = DDP(model)

In [7]:
def convert_examples_to_features(examples, tokenizer, args, stage=None):
    """Encode examples into fixed-length id/mask features.

    Each text is tokenized, wrapped in the tokenizer's cls/sep tokens, converted
    to ids and right-padded with ``pad_token_id`` up to ``args.max_source_length``
    (source) or ``args.max_target_length`` (target). Masks hold 1 for real tokens
    and 0 for padding. When ``stage == "test"`` the example's target is ignored
    and the literal text "None" is encoded in its place.

    Returns a list with one ``InputFeatures`` per example, in input order.
    """

    def wrap_and_pad(tokens, max_length):
        # cls + tokens + sep -> ids, then extend ids and mask out to max_length.
        wrapped = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        ids = tokenizer.convert_tokens_to_ids(wrapped)
        mask = [1] * len(ids)
        pad_count = max_length - len(ids)
        return ids + [tokenizer.pad_token_id] * pad_count, mask + [0] * pad_count

    # Two positions are reserved for the cls/sep tokens.
    source_budget = args.max_source_length - 2
    target_budget = args.max_target_length - 2
    is_test = stage == "test"

    features = []
    for index, example in enumerate(examples):
        source_tokens = tokenizer.tokenize(example.source)[:source_budget]
        source_ids, source_mask = wrap_and_pad(source_tokens, args.max_source_length)

        if is_test:
            target_tokens = tokenizer.tokenize("None")
        else:
            target_tokens = tokenizer.tokenize(example.target)[:target_budget]
        target_ids, target_mask = wrap_and_pad(target_tokens, args.max_target_length)

        features.append(
            InputFeatures(
                example_id=index,
                source_ids=source_ids,
                target_ids=target_ids,
                source_mask=source_mask,
                target_mask=target_mask,
            )
        )
    return features

In [8]:
class Example:
    """One source/target text pair used for training or testing."""

    def __init__(self, source, target):
        self.source, self.target = source, target


In [9]:
def read_examples(data_dir, source, target, part):
    """Load parallel examples for one data split.

    Reads ``<part>.java-python.<source>`` and ``<part>.java-python.<target>`` from
    ``data_dir`` as UTF-8 and pairs them line by line, stripping surrounding
    whitespace from each line. Pairing stops at the end of the shorter file.

    Returns a list of ``Example`` objects in file order.
    """
    prefix = part + '.java-python.'
    source_path = os.path.join(data_dir, prefix + source)
    target_path = os.path.join(data_dir, prefix + target)
    with open(source_path, encoding='utf8') as source_file, \
            open(target_path, encoding='utf8') as target_file:
        return [
            Example(source=source_line.strip(), target=target_line.strip())
            for source_line, target_line in zip(source_file, target_file)
        ]


In [10]:
class InputFeatures:
    """Encoded features of a single example: ids and masks for source and target."""

    def __init__(self, example_id, source_ids, target_ids, source_mask, target_mask):
        # Identifier of the example these features were built from.
        self.example_id = example_id
        # Source-side ids and mask.
        self.source_ids, self.source_mask = source_ids, source_mask
        # Target-side ids and mask.
        self.target_ids, self.target_mask = target_ids, target_mask


In [6]:
# Load the test split and encode its source side for generation.
eval_examples = read_examples(args.data_dir, args.source, args.target, 'test')
eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_source_ids, all_source_mask)

# Calculate bleu
# Sequential sampling keeps the predictions in the same order as eval_examples,
# which the zip() below relies on.
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

# Remember the current mode so it is restored afterwards rather than
# unconditionally forcing the model into training mode.
was_training = model.training
model.eval()
p = []
for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
    batch = tuple(t.to(device) for t in batch)
    source_ids, source_mask = batch
    with torch.no_grad():
        preds = model(source_ids=source_ids, source_mask=source_mask)
        for pred in preds:
            # First candidate for this input (presumably the best beam - confirm in Seq2Seq).
            t = pred[0].cpu().numpy()
            t = list(t)
            # Cut at the first id 0 (presumably the filler Seq2Seq appends - confirm).
            if 0 in t:
                t = t[:t.index(0)]
            text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
            p.append(text)
model.train(was_training)

# Write hypotheses and references side by side and track exact matches.
accs = []
with open(os.path.join(args.output_dir, "test.output"), 'w', encoding='utf-8') as f, \
        open(os.path.join(args.output_dir, "test.gold"), 'w', encoding='utf-8') as f1:
    for hyp, gold in zip(p, eval_examples):
        # Each hypothesis must occupy exactly one line: a line break inside the
        # decoded text would shift every following line relative to test.gold.
        hyp_line = hyp.replace('\r', ' ').replace('\n', ' ')
        f.write(hyp_line + '\n')
        f1.write(gold.target + '\n')
        # Exact match is judged on the unmodified decoded text.
        accs.append(hyp == gold.target)
dev_bleu = round(_bleu(
    os.path.join(args.output_dir, "test.gold"),
    os.path.join(args.output_dir, "test.output")
), 2)
logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
logger.info("  %s = %s " % ("xMatch", str(round(np.mean(accs) * 100, 4))))
logger.info("  " + "*" * 20)
