In [1]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob
import re
import torch

# Sentence similarity model
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
from sentence_transformers import SentenceTransformer

# Baseline model 1
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline

In [2]:
# Check if GPU acceleration is available
if torch.cuda.is_available():
    device_num = torch.cuda.current_device()
else:
    # CPU
    device_num = -1

### Helper methods

In [3]:
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, orient='records', compression='gzip', lines=True)[columns] for f in file_list], 
                     sort=False)
def get_dfs(path):
    """Grabs the different data splits and converts them into dataframes"""
    dfs = []
    for split in ["train", "valid", "test"]:
        files = sorted(glob.glob(path+"/"+split+"**/*.gz"))
        df = jsonl_list_to_dataframe(files, ["func_name", "code", "code_tokens", "repo"])
        dfs.append(df)
    return dfs

In [4]:
# For saving the original files into pickle files.
# df_train, df_valid, df_test = get_dfs("data/codenet/python/final/jsonl")

# df_train.to_pickle("train.pickle")
# df_valid.to_pickle("valid.pickle")
# df_test.to_pickle("test.pickle")

df_train = pd.read_pickle("train.pickle").reset_index(drop=True)
df_valid = pd.read_pickle("valid.pickle").reset_index(drop=True)
df_test = pd.read_pickle("test.pickle").reset_index(drop=True)

### Helper methods for baseline models testing

In [5]:
def output_print(input_sequence, unmasker, true_labels=None, top_k=2, mask_token="<mask>"):
    mask_num = input_sequence.count(mask_token)
    output = unmasker(input_sequence, top_k=top_k)
    if mask_num == 1:
        print("-" * 50)
        if true_labels:
            print(f"True label: {true_labels[0]}")
            print("")
        for candidate in output:
            print(f"Predicted_word: {candidate['token_str']}")
            print(f"Probability: {round(candidate['score'], 3)}")
        print("-" * 50)
        print("")
        
    else:
        for index, word_prediction in enumerate(output):
            print("-" * 50)
            print(f"Mask number: {index}")
            if true_labels:
                print(f"True label: {true_labels[index]}")
                print("")
            for candidate in word_prediction:
                print(f"Predicted_word: {candidate['token_str']}")
                print(f"Probability: {round(candidate['score'], 3)}")
            print("-" * 50)
            print("")

In [92]:
# Variable masker testing
test = """
def learn(env,
          network,
          seed=None,
          callback=None,
          load_path=None,
          **network_kwargs
            ):
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space
    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
"""

pattern = r"(\bdef\s\w*\(.*?\)):|(#\s*.*?\n)|(return\s*.*?\n)|(\b[\w,\s]*=\s*.*?\n)"
matches = [str().join(x) for x in re.findall(pattern, test, flags=re.DOTALL)]
matches

['def learn(env,\n          network,\n          seed=None,\n          lr=5e-4,\n          total_timesteps=100000,\n          buffer_size=50000,\n          exploration_fraction=0.1,\n          exploration_final_eps=0.02,\n          train_freq=1,\n          batch_size=32,\n          print_freq=100,\n          checkpoint_freq=10000,\n          checkpoint_path=None,\n          learning_starts=1000,\n          gamma=1.0,\n          target_network_update_freq=500,\n          prioritized_replay=False,\n          prioritized_replay_alpha=0.6,\n          prioritized_replay_beta0=0.4,\n          prioritized_replay_beta_iters=None,\n          prioritized_replay_eps=1e-6,\n          param_noise=False,\n          callback=None,\n          load_path=None,\n          **network_kwargs\n            )',
 'return a latent variable tensor, which\n',
 'returns true training stops.\n',
 '# Create all the functions necessary to train the model\n',
 'sess = get_session()\n',
 'q_func = build_q_func(network, *

In [94]:
# Basic assumption: The same line of code never occurs twice.
def mask_variable_names(code, mask_prob):
    """
    Mask the values of variables in a code with a certain probability.
    """
    # Regular expression pattern to match variable assignments
    # Function signature (to be filtered out later) | common variable definitions
    pattern = r"(\bdef\s\w*\(.*?\)):|(#\s*.*?\n)|(return\s*.*?\n)|(\b[\w,\s]*=\s*.*?\n)"
    matches = [str().join(x) for x in re.findall(pattern, code, flags=re.DOTALL)]
    var_indices = list()
    var_labels = list()
    # characters that should not exist in the first sub part of a found match.
    invalid_list = ["(", ")", "def", "#", "return"]
    
    # If there is a variable found
    if matches:
        for match in matches:
            # Split the match into sub-parts by the equal sign, and check if the first sub-part contain any parenthesis
            # or "def" (implies function signature).
            # If not, then the first sub-part is variable(s).
            first_sub_part = match.split("=")[0]
            if not any([invalid_character in first_sub_part for invalid_character in invalid_list]):
                variables = set(re.split(",|=", first_sub_part))
                
                # Masking variables based on the mask_prob
                masked_match = str(match)
                match_begin_index = code.find(masked_match)
                for var in variables:
                    # If beginning of the function call, then process no further.
                    if "(" in var:
                        break
                    if np.random.uniform() < mask_prob:
                        var_begin_index = masked_match.find(var.strip())
                        var_index = (var_begin_index + match_begin_index, var_begin_index + match_begin_index + len(var.strip()))
                        var_indices.append(var_index)
                        var_labels.append(var.strip())
            else:
                continue
        
        return var_indices, var_labels
    
    # If no variable is found
    else:
        return code, list()
        
def mask_variable_df(df, code_column_name="code", mask_prob=0.5, return_df=True):
    variable_indices_list = list()
    variable_labels_list = list()
    
    for index, row in df.iterrows():
        variable_indices, variable_labels = mask_variable_names(row["code"], mask_prob)
        variable_indices_list.append(variable_indices)
        variable_labels_list.append(variable_labels)
        
    if return_df:
        return pd.DataFrame({"variable_indices" : variable_indices_list, "variable_labels" : variable_labels_list})
    else:
        return variable_indices_list, variable_labels_list

In [7]:
model_se = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def cosine_similarity(sentences, model=model_se):
    embeddings = model.encode(sentences)
    return np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))

In [8]:
def remove_docstring(code):
    pattern = r'(""".*?""")|(\'\'\'.*?\'\'\')'
    return re.sub(pattern, '', code, flags=re.DOTALL)

In [9]:
def find_substring_indices(text, substring):
    pattern = re.compile(f'{substring}')
    indices = [(match.start(), match.end()-1) for match in pattern.finditer(text)]
    return indices

In [99]:
def split_into_windows(row, window_size, mask_token):
    windows = list()
    
    for variable_indices, variable_labels in zip(row["variable_indices"], row["variable_labels"]):
        # Window indices
        begin_index = variable_indices[0] - window_size if variable_indices[0] - window_size > 0 else 0
        end_index = variable_indices[1] + window_size if variable_indices[1] + window_size < len(row["code"]) else len(row["code"])
        
        current_window = row["code"][begin_index : variable_indices[0]] + mask_token + row["code"][variable_indices[1] : end_index]
        windows.append(current_window)
        
#         print(current_window)
#         print("--------")
    return windows

In [11]:
# Generates the prediction to the given masked code. If top_k is bigger than 1, then the top_k predictions
# will be concatenated by the given top_k_connection. Each prediction(s) will be stripped to remove unnecessary whitespaces.
def mask_prediction(row, top_k, unmasker, top_k_connection, mask_token, window_size):
    mask_num = len(row["variable_indices"])
    predictions = list()
    
    if mask_num == 0:
        return predictions
    
#     elif mask_num == 1:
#         window = split_into_windows(masked_code, window_size, mask_token, true_labels)
#         output = unmasker(window, top_k=top_k)
#         candidate_concat = top_k_connection.join([candidate['token_str'].strip() for candidate in output])
#         predictions.append(candidate_concat)
#         return predictions
    
    else:
        windows = split_into_windows(row, window_size, mask_token)
        for window in windows:
            output = unmasker(window, top_k=top_k)
            candidate_concat = top_k_connection.join([candidate['token_str'].strip() for candidate in output])
            predictions.append(candidate_concat)
            
        return predictions

In [12]:
# For the given code dataframe, it automatically masks the codes and fill the masks by the supplied unmasker.
# The predicted results are then compared with the true labels, with cosine similarity.
# If top_k is set bigger than 1, then top_k number of predictions will be concatenated to form a single predictions
# by the top_k_connection (default to the underscore).
# For example, if the predictions are: "A", "B", and "C", then top_k = 2, the final prediction will be "A_B".

# Currently runtime errors will be ignored. Runtime errors happen when the given code is longer than the maximum
# size of the unmasker model (512 tokens)

# Pre-trained transformers typically can take up to 512 tokens. Thus, if the given code is larger than this,
# then a RuntimeError will be raised. To avoid this, the window_size variable is added. It regulates the amount of
# context which will be give to the unmasker. If it is set to 100, total 200 characters will be given to the unmasker:
# 100 characters before the mask token, and 100 characters after the mask token.
# For example, 100 characters <mask> 100 characters
def baseline_test(code_df, unmasker, mask_token="<mask>", mask_prob=0.5, top_k=1, top_k_connection="_", 
                  code_column_name="code", window_size=100):
    
    masked_code_df = mask_variable_df(code_df, mask_prob=mask_prob, code_column_name=code_column_name)
    merged_code_df = pd.concat([code_df, masked_code_df], axis="columns")
    
    similarity_scores_list = list()
    predictions_list = list()
    true_labels_list = list()
    
    total_size = len(code_df)
    for index, row in merged_code_df.iterrows():
        if index % 1000 == 0:
            print(f"Progress: {round(index / total_size, 3) * 100}%")
        
        true_labels = row["variable_labels"]
        true_labels_list.append(true_labels)
        
        # If the current code snippet is longer than the maximum input size of the given unmasker 
        # then the runtime error will be raised. Try to reduce window_size.
        try:
            predictions = mask_prediction(row, top_k, unmasker, top_k_connection, mask_token, window_size)
            predictions_list.append(predictions)
        except RuntimeError:
            raise RuntimeError("The given input size is bigger than the maximum model input. Reduce the window size.")
        
        similarity_scores = list()
        for prediction, true_label in zip(predictions, true_labels):
            similarity_scores.append(cosine_similarity([prediction, true_label]))
        similarity_scores_list.append(similarity_scores)
        
    return predictions_list, true_labels_list, similarity_scores_list

### Baseline score 1: 

Source: https://huggingface.co/microsoft/codebert-base-mlm <br>
As stated in https://github.com/microsoft/CodeBERT, the basic CobeBERT is not suitable for filling-mask task.

In [13]:
model_b1 = RobertaForMaskedLM.from_pretrained('microsoft/codebert-base-mlm')
tokenizer_b1 = RobertaTokenizer.from_pretrained('microsoft/codebert-base-mlm')
fill_mask_b1 = pipeline('fill-mask', model=model_b1, tokenizer=tokenizer_b1, device=device_num)

In [14]:
code_example = "if (x is not None) <mask> (x>1)"
output_print(code_example, fill_mask_b1, top_k=3)

--------------------------------------------------
Predicted_word:  and
Probability: 0.724
Predicted_word:  &
Probability: 0.106
Predicted_word: and
Probability: 0.022
--------------------------------------------------



In [15]:
sentences = ["This is an example sentence", "Each sentence is converted"]
model_ss = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
cosine_similarity(sentences)

0.40455922

In [100]:
# It takes a lot of time, even though GPU acceleration is applied. It is because of many variables in each code snippet
# and using the dataframe, not the huggingface api dataset. It should be refactored soon.
# I do recommend to load the pickle files I saved below, instead of running this again.

# You can just implement another metric and use it instead (implementation of perplexity should be straightforward).

# top_k = 1
b1_result_k1 = baseline_test(df_valid, fill_mask_b1, top_k=1)
print("Finished")

# top_k = 2
b1_result_k2 = baseline_test(df_valid, fill_mask_b1, top_k=2, mask_prob=1)
print("Finished")

# top_k = 3
b1_result_k3 = baseline_test(df_valid, fill_mask_b1, top_k=3)
print("Finished")

Progress: 0.0%




Finished


In [None]:
with open("b1_result_k1.pickle","wb") as fw:
    pickle.dump(b1_result_k1, fw)
    
with open("b1_result_k2","wb") as fw:
    pickle.dump(b1_result_k2, fw)
    
with open("b1_result_k3","wb") as fw:
    pickle.dump(b1_result_k3, fw)

### For debug (Printing out all windows)

To use, uncomment the following two lines in the function "split_into_windows" <br>
print(current_window) <br>
print("--------")

In [98]:
# baseline_test(df_valid.head(1), fill_mask_b1, top_k=1, mask_prob=1)

Progress: 0.0%
tails on the act function.
    """
    # Create all the functions necessary to train the model

    <mask> = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

 
--------
he functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    <mask> = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the 
--------
ure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    <mask> = env.observation_space
    def make_obs_ph(name):
        return ObservationInput(observation_spac
--------
space
    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    <mask>, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q
--------
ame):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, <mask> = deepq.build_



([['#',
   'q',
   '*',
   'obs',
   '_',
   'target',
   'log',
   'fn',
   'q',
   'n',
   'model',
   'gamma',
   'beta',
   'noise',
   'env',
   'act',
   'buff',
   'beta',
   'schedule',
   'beta',
   'beta',
   'b',
   'p',
   'schedule',
   'p',
   'steps',
   '/',
   'obs',
   '#',
   'obs',
   'td',
   '+',
   'train',
   'result',
   't',
   't',
   '/',
   'threshold',
   'p',
   'action',
   'reset',
   'done',
   '_',
   '_',
   '_',
   '_',
   '/',
   '#',
   'obs',
   '2',
   't',
   'experience',
   'rewards',
   'losses',
   'weights',
   'weights',
   '#',
   '_',
   'acc',
   'ba',
   '2',
   '/',
   'pr',
   '2',
   '2',
   'result',
   '*']],
 [['sess',
   'q_func',
   'observation_space',
   'act',
   'debug',
   'train',
   'update_target',
   'make_obs_ph',
   'q_func',
   'num_actions',
   'optimizer',
   'gamma',
   'grad_norm_clipping',
   'param_noise',
   'act_params',
   'act',
   'replay_buffer',
   'prioritized_replay_beta_iters',
   'beta_schedule',
 

In [97]:
# print(df_valid.loc[0, 'code'])

def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs
            ):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the 

In [18]:
# Final testing (in progress)

# print("Total mean value of all cosine similarities")
# for index, result in enumerate([b1_result_k1, b1_result_k2, b1_result_k3]):
#     similarity_scores_list = result[2]
#     total_values = 0
#     total_mask_num = 0
#     for similarity_scores in similarity_scores_list:
#         if similarity_scores:
#             total_values += np.sum(similarity_scores)
#             total_mask_num += len(similarity_scores)
    
#     total_average = total_values / total_mask_num
#     print(f"Top_k={index+1}\n Total mask number: {total_mask_num}, Total average cosine similarity: {round(total_average, 3)}")

### Baseline score 2: 