In [36]:
import pickle
import pandas as pd
import numpy as np
import os
import glob
import re
import torch

# Helpers
from testing import *
# Huggingface dataset
from datasets import Dataset
# Baseline model 1
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline

In [37]:
# Unpack the three frames returned by loading_pickles(), which comes from the
# `testing` wildcard import and is not shown in this notebook.
# NOTE(review): presumably the train/validation/test splits read from pickle files;
# confirm paths and provenance in testing.py.
df_train, df_valid, df_test = loading_pickles()

In [39]:
# Bare expression: renders the validation frame as the cell output
# (columns func_name, code, code_tokens, repo in the output below).
df_valid

Unnamed: 0,func_name,code,code_tokens,repo
0,learn,"def learn(env,\n network,\n ...","[def, learn, (, env, ,, network, ,, seed, =, N...",openai/baselines
1,ActWrapper.save_act,"def save_act(self, path=None):\n """"""Sav...","[def, save_act, (, self, ,, path, =, None, ), ...",openai/baselines
2,nature_cnn,"def nature_cnn(unscaled_images, **conv_kwargs)...","[def, nature_cnn, (, unscaled_images, ,, *, *,...",openai/baselines
3,mlp,"def mlp(num_layers=2, num_hidden=64, activatio...","[def, mlp, (, num_layers, =, 2, ,, num_hidden,...",openai/baselines
4,lstm,"def lstm(nlstm=128, layer_norm=False):\n """"...","[def, lstm, (, nlstm, =, 128, ,, layer_norm, =...",openai/baselines
...,...,...,...,...
23102,Clifier.show_version,"def show_version(self):\n """""" custom co...","[def, show_version, (, self, ), :, class, Show...",xnuinside/clifier
23103,Clifier.check_path_action,"def check_path_action(self):\n """""" cust...","[def, check_path_action, (, self, ), :, class,...",xnuinside/clifier
23104,new_user,def new_user(yaml_path):\n '''\n Return ...,"[def, new_user, (, yaml_path, ), :, print, 'Re...",tklovett/PyShirtsIO
23105,_AddPropertiesForExtensions,"def _AddPropertiesForExtensions(descriptor, cl...","[def, _AddPropertiesForExtensions, (, descript...",ibelie/typy


In [38]:
# Inspect the first validation row as a Series to see one value per column.
df_valid.iloc[0]

func_name                                                  learn
code           def learn(env,\n          network,\n          ...
code_tokens    [def, learn, (, env, ,, network, ,, seed, =, N...
repo                                            openai/baselines
Name: 0, dtype: object

In [12]:
def baseline_test(code_df, unmasker, mask_token="<mask>", mask_prob=0.5, top_k=1, top_k_connection="_",
                  code_column_name="code", window_size=100):
    """
    Mask variables in every code snippet of ``code_df``, let ``unmasker`` fill the masks, and score the result.

    The frame is first passed to ``mask_variable_df`` and the result is concatenated column-wise with the
    original frame. Each merged row is then handed to ``mask_prediction``; every prediction is paired with
    its true label and scored with ``cosine_similarity``.

    If top_k is bigger than 1, the top_k predictions for a mask are concatenated into a single prediction
    with top_k_connection (default: underscore). For example, if the candidates are "A", "B" and "C" and
    top_k = 2, the final prediction is "A_B".

    Pre-trained transformers typically accept up to 512 tokens, so feeding a whole long snippet can raise a
    RuntimeError. window_size limits the context given to the unmasker: with window_size = 100, at most
    200 characters are passed, i.e. 100 characters before the mask token and 100 characters after it
    ("100 characters <mask> 100 characters").

    Args:
        code_df: DataFrame holding the code snippets.
        unmasker: callable used by ``mask_prediction`` to fill a masked window.
        mask_token: mask string inserted in place of each variable.
        mask_prob: forwarded to ``mask_variable_df``.
        top_k: number of candidates requested per mask.
        top_k_connection: separator used to join the top_k candidates.
        code_column_name: forwarded to ``mask_variable_df``.
        window_size: number of context characters kept on each side of a mask.

    Returns:
        Tuple ``(predictions_list, true_labels_list, similarity_scores_list)``; each list has one entry
        per row of the merged frame, and each entry is itself a list.

    Raises:
        RuntimeError: if ``mask_prediction`` raises a RuntimeError; the original error is attached as
            ``__cause__`` so the real failure stays visible in the traceback.
    """
    # NOTE(review): mask_variable_df is expected to supply the "variable_labels" and "variable_indices"
    # columns read below; confirm against its definition in the helpers module.
    masked_code_df = mask_variable_df(code_df, mask_prob=mask_prob, code_column_name=code_column_name)
    merged_code_df = pd.concat([code_df, masked_code_df], axis="columns")

    similarity_scores_list = list()
    predictions_list = list()
    true_labels_list = list()

    total_size = len(merged_code_df)
    # Track progress by position, not by index label: labels of a sliced or re-indexed frame
    # are not guaranteed to be the integers 0..n-1.
    for position, (_, row) in enumerate(merged_code_df.iterrows()):
        if position % 1000 == 0:
            # Percent formatting avoids float artifacts such as "2.9000000000000004%".
            print(f"Progress: {position / total_size:.1%}")

        true_labels = row["variable_labels"]
        true_labels_list.append(true_labels)

        # If the current code snippet is longer than the maximum input size of the given unmasker
        # then the runtime error will be raised. Try to reduce window_size.
        try:
            predictions = mask_prediction(row, top_k, unmasker, top_k_connection, mask_token, window_size)
        except RuntimeError as err:
            # Chain explicitly so an unrelated RuntimeError is not hidden behind this hint.
            raise RuntimeError(
                "The given input size is bigger than the maximum model input. Reduce the window size."
            ) from err
        predictions_list.append(predictions)

        similarity_scores_list.append(
            [cosine_similarity([prediction, true_label])
             for prediction, true_label in zip(predictions, true_labels)]
        )

    return predictions_list, true_labels_list, similarity_scores_list

In [20]:
# Smoke-test the masking helper on the first 10 validation rows, then attach its
# output columns next to the original ones (column-wise concat aligns on the index).
# NOTE(review): mask_variable_df presumably masks at random (see mask_prob in
# baseline_test); no seed is set in this notebook, so results may differ between runs.
df_valid_masked = mask_variable_df(df_valid[:10])
df_valid_merged = pd.concat([df_valid[:10], df_valid_masked], axis="columns")

In [None]:
def split_into_windows(row, window_size, mask_token, testing=False):
    """
    Build one masked context string per variable occurrence in ``row``.

    For each (start, end) pair in row["variable_indices"], the characters of row["code"] in
    [start, end) are replaced by ``mask_token``, and at most ``window_size`` characters of context
    are kept on either side, clamped to the bounds of the code string.

    Args:
        row: mapping-like object with "code", "variable_indices" and "variable_labels" entries.
        window_size: number of context characters kept before and after the mask.
        mask_token: string inserted in place of the variable.
        testing: when True, print every window followed by a separator line.

    Returns:
        List of window strings, in the order of row["variable_indices"].
    """
    contexts = []

    # The labels are unused here; they stay in the zip so the pairing (and its truncation
    # to the shorter of the two sequences) is unchanged.
    for span, _label in zip(row["variable_indices"], row["variable_labels"]):
        source = row["code"]

        # Clamp the window to the code boundaries.
        left_edge = max(span[0] - window_size, 0)
        right_edge = min(span[1] + window_size, len(source))

        before_mask = source[left_edge:span[0]]
        after_mask = source[span[1]:right_edge]
        context = before_mask + mask_token + after_mask
        contexts.append(context)

        if testing:
            print(context)
            print("--------")

    return contexts


def mask_prediction(row, top_k, unmasker, top_k_connection, mask_token, window_size):
    """
    Predict a replacement for every masked variable of ``row``.

    One context window per entry of row["variable_indices"] is built with ``split_into_windows`` and
    passed to ``unmasker`` with ``top_k``. The 'token_str' of each returned candidate is stripped of
    surrounding whitespace, and the candidates are joined with ``top_k_connection`` into a single
    prediction string.

    Returns:
        List with one prediction string per window; an empty list when the row has no masked variables.
    """
    # Nothing was masked in this row, so there is nothing to predict.
    if len(row["variable_indices"]) == 0:
        return list()

    def _merge_candidates(candidates):
        # Join the stripped token strings of all candidates for one mask.
        return top_k_connection.join([entry['token_str'].strip() for entry in candidates])

    return [
        _merge_candidates(unmasker(context, top_k=top_k))
        for context in split_into_windows(row, window_size, mask_token)
    ]