In [None]:
# Install all requirements
!pip install -r requirements.txt

In [None]:
# Download all necessary data
# Runs download_and_preprocess_data.sh with bash; the path is relative, so the
# script must be in the notebook's working directory.
# NOTE(review): presumably this populates ./data used by later cells — confirm.
!bash download_and_preprocess_data.sh

In [None]:
# Add Required Imports
from transformers import AutoTokenizer, AutoModel, pipeline
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, pipeline
import json
import argparse
from xglue.utils import *
from xglue.model import *
import multiprocessing
import torch

from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer)


In [None]:
# Setup Arguments
# Build the run configuration as an argparse.Namespace without parsing a
# command line. `argsdict` is the namespace's own attribute dict, so every key
# written to it is readable as `args.<key>`.
args = argparse.Namespace()
argsdict = vars(args)

argsdict.update({
    # Data locations
    "data_dir": "./data",
    "train_file": "train_codesearchnet_7.json",
    "output_dir": "./outputs",
    "dev_file": "dev_codesearchnet.json",
    "test_file": "test.txt",

    # Model selection
    "model_type": "roberta",
    "pn_weight": 1.0,
    "encoder_name_or_path": "microsoft/codebert-base",
    "checkpoint_path": "",

    # Masked-language-modelling options
    "mlm": False,
    "mlm_probability": 0.15,

    # Config / tokenizer sources
    "config_name": "roberta-base",
    "tokenizer_name": "roberta-base",
    "cache_dir": "",
    "max_seq_length": 200,

    # Which phases to run
    "do_train": True,
    "do_eval": False,
    "do_predict": False,

    # Optimisation and bookkeeping
    "evaluate_during_training": True,
    "do_lower_case": False,
    "per_gpu_train_batch_size": 16,
    "per_gpu_eval_batch_size": 16,
    "gradient_accumulation_steps": 1,
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "num_train_epochs": 3,
    "max_steps": -1,
    "warmup_steps": 1000,
    "logging_steps": 2,
    "save_steps": 2,
    "save_total_limit": None,
    "eval_all_checkpoints": True,
    "no_cuda": False,
    "overwrite_output_dir": False,
    "overwrite_cache": True,
    "seed": 123456,
    "fp16": False,
    # Letter "O" followed by one, not the digits zero-one: mixed-precision
    # optimisation levels are named O0..O3.
    # NOTE(review): presumably consumed by the training code when fp16 is
    # enabled — confirm.
    "fp16_opt_level": 'O1',
    "local_rank": -1,
    "server_ip": '',
    "server_port": '',
    "test_result_dir": 'test_results.tsv',
    "prediction_file": 'predictions.txt',

    # Hardware
    # NOTE(review): the device is pinned to CPU although no_cuda is False and
    # n_gpu is 1 — confirm this combination is intended.
    "n_gpu": 1,
    "device": "cpu",
})

# Training starts from the very beginning.
args.start_epoch = 0
args.start_step = 0

# Number of worker processes used when the multiprocessing pool is created.
cpu_cont = 16

# Seed the random number generators before any model or data work.
set_seed(args.seed)

In [None]:
# Start a pool of `cpu_cont` worker processes.
pool = multiprocessing.Pool(processes=cpu_cont)

In [None]:
# Model Classes Supported
# Maps each supported `model_type` string to its (config, model, tokenizer) classes.
MODEL_CLASSES = {
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
}

In [None]:
# Configuration & Tokenizer
# Look up the class triple registered for the selected model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# An explicit config name wins; otherwise fall back to the encoder path.
# An empty cache_dir is passed on as None.
config = config_class.from_pretrained(
    args.config_name or args.encoder_name_or_path,
    cache_dir=args.cache_dir or None,
)
config.num_labels = 2

# Same fallback rule for the tokenizer source.
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name or args.encoder_name_or_path,
    do_lower_case=args.do_lower_case,
    cache_dir=args.cache_dir or None,
)

In [None]:
# Setup Max Sequence Length for the model
# Cap the configured length at the tokenizer's single-sentence limit.
if args.max_seq_length > tokenizer.max_len_single_sentence:
    args.max_seq_length = tokenizer.max_len_single_sentence

In [None]:
# Download a pretrained huggingface model
# Weights are read as a TensorFlow checkpoint when the encoder path contains
# '.ckpt'; an empty cache_dir is passed on as None.
model = model_class.from_pretrained(
    args.encoder_name_or_path,
    from_tf='.ckpt' in args.encoder_name_or_path,
    config=config,
    cache_dir=args.cache_dir or None,
)

In [None]:
# Create a model
# Wrap the pretrained encoder in the project's Model class. The name `model` is
# rebound here, so without the guard re-running this cell would wrap an
# existing Model inside a second Model; the guard keeps the cell idempotent.
if not isinstance(model, Model):
    model = Model(model, config, tokenizer, args)

In [None]:
# Check for checkpoints
# When a checkpoint directory is configured, restore its saved weights into the model.
if args.checkpoint_path:
    checkpoint_file = os.path.join(args.checkpoint_path, 'pytorch_model.bin')
    # map_location remaps the stored tensors onto the configured device, so a
    # checkpoint saved from a GPU still loads when running on CPU.
    model.load_state_dict(torch.load(checkpoint_file, map_location=args.device))

In [None]:
# Perform Training
# Build the training dataset from <data_dir>/<train_file>, then run training.
train_data_path = os.path.join(args.data_dir, args.train_file)
train_dataset = TextDataset(
    tokenizer,
    args,
    train_data_path,
    type='train',
)
train(args, train_dataset, model, tokenizer)

In [None]:
# Evaluate the model
# Reload the weights and tokenizer stored under <output_dir>/checkpoint-best-ever
# and evaluate them.
# NOTE(review): presumably this directory is written by the training step — confirm.
checkpoint_prefix = 'checkpoint-best-ever'
output_dir = os.path.join(args.output_dir, checkpoint_prefix)

# map_location remaps the stored tensors onto the configured device, so a
# checkpoint saved from a GPU still loads when running on CPU.
model.load_state_dict(
    torch.load(os.path.join(output_dir, 'pytorch_model.bin'), map_location=args.device)
)
tokenizer = tokenizer.from_pretrained(output_dir)
model.to(args.device)
results = evaluate(args, model, tokenizer)

# Print top ten results
# NOTE(review): slicing assumes evaluate() returns a sequence — confirm.
print(results[:10])