## VARIABLE NAME PREDICTION WITH CODE-TRANSFORMER


In [1]:
# Switch the working directory to the project checkout before anything is imported.
# NOTE(review): absolute, machine-specific path — adjust it to your local checkout.
%cd /home/pojer/adv-ml/code-transformer/
# Enable the autoreload extension; mode 2 reloads modules before each cell is executed.
%reload_ext autoreload
%autoreload 2

/home/pojer/adv-ml/code-transformer


In [2]:
from code_transformer.preprocessing.datamanager.preprocessed import CTPreprocessedDataManager
from code_transformer.preprocessing.graph.binning import ExponentialBinning
from code_transformer.preprocessing.graph.distances import PersonalizedPageRank, ShortestPaths, \
    AncestorShortestPaths, SiblingShortestPaths, DistanceBinning
from code_transformer.preprocessing.graph.transform_var import DistancesTransformerVar
from code_transformer.preprocessing.nlp.vocab import VocabularyTransformer, CodeSummarizationVocabularyTransformer

from code_transformer.preprocessing.pipeline.stage1var import CTStage1VarPreprocessor 

from code_transformer.preprocessing.pipeline.stage2var import CTStage2VarMultiLanguageSample
from code_transformer.utils.inference_var import get_model_manager, make_batch_from_sample, decode_predicted_tokens


from code_transformer.env import DATA_PATH_STAGE_2

%reload_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


## Load Model

Load the model by specifying its type and the name of the folder (run ID) in which its snapshots are stored.

In [3]:
# Selects which model manager get_model_manager() returns below.
model_type = 'code_transformer'  # code_transformer, great or xl_net
# model_type = 'great' 
run_id = 'CT-188-var'  # Name of folder in which snapshots are stored
# run_id = 'GT-2'
snapshot = 'latest'  # Use 'latest' for the last stored snapshot
model_manager = get_model_manager(model_type)
# Configuration stored for this run; later cells read the training language from it
# and pass it on when building the input batch.
model_config = model_manager.load_config(run_id)

# Printed so the reader can check that the snippet language matches the training language.
language = model_config['data_setup']['language']
print(f"Model was trained on: {language}")

Model was trained on: python


## Load the model state and set it to evaluation mode

In [4]:
# Load the requested snapshot with gpu=False and switch it to evaluation mode in a single step.
model = model_manager.load_model(run_id, snapshot, gpu=False).eval()

## Write the function and set the variable to predict
Provide the code snippet to send to the transformer. To predict a specific variable inside the function, replace every occurrence of that variable with `a`.

In [5]:
# Example snippet: the parameter whose name should be predicted is written as the placeholder `a`.
# Running this cell replaces whatever snippet was previously stored in `code_snippet`.
code_snippet = """
def send_command(self, a, as_list=False):
  action = actions.Action({
      'Command': a,
      'Action': 'Command'},
    as_list=as_list)
  return self.send_action(action)
"""
# Language of the snippet above; it is handed to the stage 1 preprocessor.
code_snippet_language = 'python'

In [6]:
# Example snippet: the parameter whose name should be predicted is written as the placeholder `a`.
# Running this cell replaces whatever snippet was previously stored in `code_snippet`.
code_snippet = """
def to_bytes(a, encoding='utf-8'):
  if not a:
    return a
  if not isinstance(a, bytes_type):
    a = a.encode(encoding)
  return a
"""
# Language of the snippet above; it is handed to the stage 1 preprocessor.
code_snippet_language = 'python'

In [7]:
# Example snippet: the local variable whose name should be predicted is written as the placeholder `a`.
# NOTE(review): the un-masked ground-truth variable name is not recorded in this cell.
code_snippet = """
def ensure_directory(path):
  a = os.path.dirname(path)
  if not os.path.isdir(a):
    os.makedirs(a)
"""
# Language of the snippet above; it is handed to the stage 1 preprocessor.
code_snippet_language = 'python'

In [56]:
# Ground truth: the un-masked function, kept under its own name so the real
# variable name (`line`) can be compared with the model's predictions.
original_code_snippet = """
def read_data(self, f):
  '''
  Read the data and populate the vocabulary
  '''
  file = open(f, 'r', encoding='utf-8').readlines()
  r_file = []
  for line in file:
    tmp = line.strip().split()
    r_file.append(tmp)
    # if there are more words in test and valid we add to voc
    self.voc.add_word_sent(tmp)

  return r_file
"""

# Model input: the same function with the loop variable replaced by the placeholder `a`.
code_snippet = """
def read_data(self, f):
  '''
  Read the data and populate the vocabulary
  '''
  file = open(f, 'r', encoding='utf-8').readlines()
  r_file = []
  for a in file:
    tmp = a.strip().split()
    r_file.append(tmp)
    # if there are more words in test and valid we add to voc
    self.voc.add_word_sent(tmp)

  return r_file
"""
# Language of the snippet above; it is handed to the stage 1 preprocessor.
code_snippet_language = 'python'

In [37]:
# Ground truth: the un-masked function, kept under its own name so the real
# variable name (`weight`) can be compared with the model's predictions.
original_code_snippet = """
def init_weights(self):
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
        weight.data.uniform_(-stdv, stdv)
"""

# Model input: the same function with the loop variable replaced by the placeholder `a`.
code_snippet = """
def init_weights(self):
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for a in self.parameters():
        a.data.uniform_(-stdv, stdv)
"""

# Language of the snippet above; it is handed to the stage 1 preprocessor.
code_snippet_language = 'python'

# Preprocess the input

## STAGE 1

In [49]:
# Stage 1 preprocessor for the snippet's language, constructed with allow_empty_methods=True.
preprocessor = CTStage1VarPreprocessor(code_snippet_language, allow_empty_methods=True)
# The batch holds one ("f", "", code_snippet) tuple — presumably (function name, docstring, code); TODO confirm.
# "a" matches the placeholder used in the snippet cells — presumably the variable to predict; verify against process().
# The result is indexed with [0] by the stage 2 cell, i.e. it is a sequence of samples.
stage1_sample = preprocessor.process([("f", "", code_snippet)], 0, "a", interactive=True)
# print(stage1_sample[0].variable_name)

## STAGE 2

In [50]:
# Everything below is read from the stored configuration of the dataset for the
# language the model was trained on, so the snippet is transformed with the same settings.
model_language = model_config['data_setup']['language']
data_manager = CTPreprocessedDataManager(DATA_PATH_STAGE_2, model_language, partition='train', shuffle=True)
data_config = data_manager.load_config()

# --- How distances are computed ---
distances_config = data_config['distances']

PPR_ALPHA, PPR_USE_LOG, PPR_THRESHOLD = (
    distances_config[key]
    for key in ('ppr_alpha', 'ppr_use_log', 'ppr_threshold')
)

SP_THRESHOLD = distances_config['sp_threshold']

(ANCESTOR_SP_FORWARD,
 ANCESTOR_SP_BACKWARD,
 ANCESTOR_SP_NEGATIVE_REVERSE_DISTS,
 ANCESTOR_SP_THRESHOLD) = (
    distances_config[key]
    for key in ('ancestor_sp_forward',
                'ancestor_sp_backward',
                'ancestor_sp_negative_reverse_dists',
                'ancestor_sp_threshold')
)

(SIBLING_SP_FORWARD,
 SIBLING_SP_BACKWARD,
 SIBLING_SP_NEGATIVE_REVERSE_DISTS,
 SIBLING_SP_THRESHOLD) = (
    distances_config[key]
    for key in ('sibling_sp_forward',
                'sibling_sp_backward',
                'sibling_sp_negative_reverse_dists',
                'sibling_sp_threshold')
)

# --- How distances are binned ---
binning_config = data_config['binning']
EXPONENTIAL_BINNING_GROWTH_FACTOR, N_FIXED_BINS, NUM_BINS = (
    binning_config[key]
    for key in ('exponential_binning_growth_factor', 'n_fixed_bins', 'num_bins')
)

# --- Token-level preprocessing ---
preprocessing_config = data_config['preprocessing']
REMOVE_PUNCTUATION = preprocessing_config['remove_punctuation']

# One entry per distance metric, each configured from the constants above.
distance_metrics = [
    PersonalizedPageRank(
        alpha=PPR_ALPHA,
        log=PPR_USE_LOG,
        threshold=PPR_THRESHOLD,
    ),
    ShortestPaths(threshold=SP_THRESHOLD),
    AncestorShortestPaths(
        forward=ANCESTOR_SP_FORWARD,
        backward=ANCESTOR_SP_BACKWARD,
        negative_reverse_dists=ANCESTOR_SP_NEGATIVE_REVERSE_DISTS,
        threshold=ANCESTOR_SP_THRESHOLD,
    ),
    SiblingShortestPaths(
        forward=SIBLING_SP_FORWARD,
        backward=SIBLING_SP_BACKWARD,
        negative_reverse_dists=SIBLING_SP_NEGATIVE_REVERSE_DISTS,
        threshold=SIBLING_SP_THRESHOLD,
    ),
]

db = DistanceBinning(
    NUM_BINS,
    N_FIXED_BINS,
    ExponentialBinning(EXPONENTIAL_BINNING_GROWTH_FACTOR),
)

distances_transformer = DistancesTransformerVar(distance_metrics, db)

# Exactly four stored vocabularies select the code-summarization transformer;
# any other count falls back to the plain vocabulary transformer.
vocabs = data_manager.load_vocabularies()
vocabulary_transformer = (
    CodeSummarizationVocabularyTransformer if len(vocabs) == 4 else VocabularyTransformer
)(*vocabs)

In [51]:
# Turn the single stage 1 sample into a stage 2 sample: optionally strip punctuation,
# then apply the vocabulary transformer followed by the distances transformer.
stage2_sample = stage1_sample[0]
if REMOVE_PUNCTUATION:
    stage2_sample.remove_punctuation()
stage2_sample = distances_transformer(vocabulary_transformer(stage2_sample))

# A comma in the configured language is treated as the multi-language setting,
# in which the snippet's own language is stored on the sample as well.
if ',' in model_language:
    stage2_sample = CTStage2VarMultiLanguageSample(
        stage2_sample.tokens,
        stage2_sample.graph_sample,
        stage2_sample.token_mapping,
        stage2_sample.stripped_code_snippet,
        stage2_sample.func_name,
        stage2_sample.docstring,
        code_snippet_language,
        # Not every sample has an encoded function name; pass None when it is absent.
        encoded_func_name=getattr(stage2_sample, 'encoded_func_name', None),
    )

## Prepare the model

In [52]:
# Build the model input from the single stage 2 sample, using the run's configuration and model type.
batch = make_batch_from_sample(stage2_sample, model_config, model_type)

## Output

In [53]:
# Run the model on the prepared batch; the next cell reads `output.logits`.
output = model.forward_batch(batch)

In [54]:
# Number of top-scoring candidates to keep.
k = 5
# Indices of the k largest logits along the last axis, with size-1 axes dropped and the result transposed.
# Presumably this leaves one row per candidate rank — TODO confirm against the logits' shape.
# NOTE(review): squeeze() without an explicit axis drops every size-1 axis, not only the batch axis.
predictions = output.logits.topk(k, axis=-1).indices.squeeze().T

In [55]:
# Decode each row of `predictions` and print the candidates as a 1-based ranked list.
print('Predicted variable names:')
for rank, prediction in enumerate(predictions, start=1):
    predicted_var_name = decode_predicted_tokens(prediction, batch, data_manager)
    print(f"  ({rank}) ", ' '.join(predicted_var_name))

Predicted variable names:
  (1)  line
  (2)  f sent body
  (3)  chunk file splitted
  (4)  r name
  (5)  file line 256
