In [None]:
# Install all requirements
!pip install -r requirements.txt

In [1]:
# Necessary Imports
import os
import sys
import pickle
import torch
import json
import random
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer)
from xglue.utils import *

In [2]:
# Setup Arguments
# All run settings live in a single argparse.Namespace so they can be handed
# to the helper functions as `args`. `argsdict` is kept as a dict view of the
# same namespace (vars(args)), so writes through either name stay in sync.
args = argparse.Namespace(
    # --- Model / tokenizer identifiers and output location ---
    model_type="roberta",
    model_name_or_path="microsoft/codebert-base",
    tokenizer_name="roberta-base",
    output_dir="outputs",

    # --- Checkpoint and data files ---
    # Each *_filename value holds two comma-separated paths
    # (the ".buggy" file first, the ".fixed" file second).
    load_model_path=None,
    train_filename="./data/small/train.buggy-fixed.buggy,./data/small/train.buggy-fixed.fixed",
    dev_filename="./data/small/valid.buggy-fixed.buggy,./data/small/valid.buggy-fixed.fixed",
    test_filename="./data/small/test.buggy-fixed.buggy,./data/small/test.buggy-fixed.fixed",
    config_name="roberta-base",
    max_source_length=256,
    max_target_length=256,

    # --- Run-mode switches ---
    do_train=True,
    do_eval=False,
    do_test=False,
    do_lower_case=False,
    no_cuda=False,

    # --- Batching ---
    train_batch_size=16,
    eval_batch_size=16,
    gradient_accumulation_steps=1,

    # --- Optimisation / decoding hyper-parameters ---
    learning_rate=5e-5,
    beam_size=5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    num_train_epochs=3.0,
    max_steps=-1,
    eval_steps=5000,
    train_steps=100000,

    warmup_steps=1000,
    local_rank=-1,
    seed=123456,

    mlm=False,
    mlm_probability=0.15,

    cache_dir="",
    max_seq_length=200,

    # --- Logging / checkpointing ---
    logging_steps=2,
    save_steps=2,
    save_total_limit=None,
    eval_all_checkpoints=True,

    overwrite_output_dir=False,
    overwrite_cache=True,
    fp16=False,
    # Apex AMP optimisation levels are spelled with the capital letter O
    # ("O0".."O3"); the previous value '01' used the digit zero.
    fp16_opt_level="O1",

    server_ip="",
    server_port="",
    test_result_dir="test_results.tsv",
    prediction_file="predictions.txt",

    # --- Device placement ---
    # NOTE(review): device is pinned to "cpu" even though no_cuda is False and
    # n_gpu is 1 — confirm whether the training/testing helpers override it.
    per_gpu_train_batch_size=16,
    n_gpu=1,
    device="cpu",

    # --- Resume bookkeeping ---
    start_epoch=0,
    start_step=0,
)
argsdict = vars(args)

cpu_cont = 16

# set_seed is not defined in this notebook; presumably it is provided by the
# `from xglue.utils import *` import — TODO confirm. It receives the whole
# namespace, which carries `seed` (and `n_gpu`).
set_seed(args)

In [3]:
# Registry mapping a model-type key to its (config class, model class,
# tokenizer class) triple; only the RoBERTa family is registered here.
MODEL_CLASSES = {
    'roberta': (
        RobertaConfig,
        RobertaModel,
        RobertaTokenizer,
    ),
}

In [4]:
# Config & Tokenizer from Hugging Face Pretrained
# Look up the (config, model, tokenizer) classes registered for this model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# Both the config and the tokenizer fall back to `model_name_or_path` when
# their dedicated name is empty/None, so clearing either setting does not
# break loading. (Previously only the config had this fallback.)
config = config_class.from_pretrained(args.config_name or args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name or args.model_name_or_path,
    do_lower_case=args.do_lower_case,
)

In [5]:
# Assemble the sequence-to-sequence model.
# Encoder: pretrained weights loaded from `args.model_name_or_path` using the
# config created above.
encoder = model_class.from_pretrained(args.model_name_or_path, config=config)

# Decoder: a freshly constructed 6-layer torch.nn Transformer decoder whose
# width and head count are taken from the encoder config.
decoder_layer = nn.TransformerDecoderLayer(
    d_model=config.hidden_size,
    nhead=config.num_attention_heads,
)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

# Seq2Seq is not defined in this notebook (presumably it comes from the
# `from xglue.utils import *` import — TODO confirm). The tokenizer's CLS/SEP
# token ids are passed as the start/end-of-sequence ids.
model = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    config=config,
    beam_size=args.beam_size,
    max_length=args.max_target_length,
    sos_id=tokenizer.cls_token_id,
    eos_id=tokenizer.sep_token_id,
)

In [6]:
# Optionally restore weights from a previously saved checkpoint.
# Skipped when `args.load_model_path` is None.
if args.load_model_path is not None:
    # map_location remaps the stored tensors onto the configured device, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only machine.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    model.load_state_dict(
        torch.load(args.load_model_path, map_location=args.device)
    )

In [None]:
# Train the model.
# Hands the argument namespace, the tokenizer and the Seq2Seq model built in
# the earlier cells to train_model. train_model is not defined in this
# notebook (presumably it comes from the `from xglue.utils import *` import —
# TODO confirm).
# NOTE(review): the call is unconditional; whether train_model itself checks
# `args.do_train` cannot be seen from here.
train_model(args, tokenizer, model)

In [None]:
# Test the model.
# Results are presumably written to the file named by `args.prediction_file`
# ('predictions.txt' in the argument cell) — verify against test_model, which
# is not defined in this notebook (presumably it comes from the
# `from xglue.utils import *` import).
# NOTE(review): this call runs unconditionally even though `args.do_test` is
# set to False in the argument cell — confirm that is intended.
test_model(args, tokenizer, model)