In [1]:
import pickle
import pandas as pd
import numpy as np
import torch

# Helpers
from testing import *
# Baseline model 1
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline

# Device index handed to the fill-mask pipelines below: the current CUDA
# device when GPU acceleration is available, otherwise -1 (CPU).
device_num = torch.cuda.current_device() if torch.cuda.is_available() else -1

In [2]:
# If the pickle files have been removed, uncomment and run saving_pickles()
# first (presumably it recreates them; the helper is not defined in this
# notebook and likely comes from the `testing` star import — TODO confirm).
# saving_pickles()

# Unpack the three splits returned by loading_pickles(): train, validation, test.
train_df, valid_df, test_df = loading_pickles()

In [3]:
# Evaluation settings (for testing)

# Seed passed to the masking step. A different seed may lead to a different
# masking result, so do not change it unless necessary.
rng_seed = 42

mask_prob = 0.5
window_size, batch_size = 100, 50

# Only this proportion of the validation dataset is used, for performance
size_limit_proportion_valid = 0.5

In [4]:
# Number of leading validation rows to keep for this evaluation run.
valid_df_size = int(size_limit_proportion_valid * len(valid_df))

# Mask the selected rows, then place the masking output next to the original
# columns. The slice is taken twice on purpose so that each consumer gets its
# own object.
masked_code_df = mask_variable_df(
    valid_df[:valid_df_size],
    mask_prob=mask_prob,
    rng_seed=rng_seed,
)
merged_code_df = pd.concat(
    [valid_df[:valid_df_size], masked_code_df],
    axis="columns",
)

Masking:   0%|          | 0/11553 [00:00<?, ?it/s]

### Baseline model 1

Source: https://huggingface.co/microsoft/codebert-base-mlm
As stated in https://github.com/microsoft/CodeBERT, the base CodeBERT model is not suitable for the fill-mask task, so the MLM checkpoint (`microsoft/codebert-base-mlm`) is used here instead.

In [5]:
# Baseline 1: tokenizer and masked-LM weights come from the same checkpoint,
# wrapped in a fill-mask pipeline on the selected device.
b1_checkpoint = 'microsoft/codebert-base-mlm'
tokenizer_b1 = RobertaTokenizer.from_pretrained(b1_checkpoint)
model_b1 = RobertaForMaskedLM.from_pretrained(b1_checkpoint)
fill_mask_b1 = pipeline(
    'fill-mask',
    model=model_b1,
    tokenizer=tokenizer_b1,
    device=device_num,
)

In [None]:
# Known failure modes of the runs below:
#   - An input is bigger than the maximum model input -> reduce window_size.
#   - There is not enough GPU memory                  -> reduce batch_size.

# Arguments shared by every baseline 1 run; only top_k varies.
b1_test_kwargs = dict(
    merged_code_df=merged_code_df,
    unmasker=fill_mask_b1,
    window_size=window_size,
    batch_size=batch_size,
)

print("Top k = 1")
b1_result_k1 = model_test(top_k=1, **b1_test_kwargs)

print("Top k = 2")
b1_result_k2 = model_test(top_k=2, **b1_test_kwargs)

print("Top k = 3")
b1_result_k3 = model_test(top_k=3, **b1_test_kwargs)

Top k = 1


Window split:   0%|          | 0/11553 [00:00<?, ?it/s]

In [None]:
# Saving is commented out to prevent accidentally overwriting existing result files.
# Uncomment to (re)write them; the Testing section reads these pickles back.

# with open("./baseline_results/b1_result_k1.pickle", "wb") as fw:
#     pickle.dump(b1_result_k1, fw)
#
# with open("./baseline_results/b1_result_k2.pickle", "wb") as fw:
#     pickle.dump(b1_result_k2, fw)
#
# with open("./baseline_results/b1_result_k3.pickle", "wb") as fw:
#     pickle.dump(b1_result_k3, fw)

### Baseline model 2

Reference: https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaForMaskedLM

In [None]:
# Baseline 2: tokenizer and masked-LM weights come from the same checkpoint,
# wrapped in a fill-mask pipeline on the selected device.
b2_checkpoint = 'roberta-base'
tokenizer_b2 = RobertaTokenizer.from_pretrained(b2_checkpoint)
model_b2 = RobertaForMaskedLM.from_pretrained(b2_checkpoint)
fill_mask_b2 = pipeline(
    'fill-mask',
    model=model_b2,
    tokenizer=tokenizer_b2,
    device=device_num,
)

In [None]:
# Known failure modes of the runs below:
#   - An input is bigger than the maximum model input -> reduce window_size.
#   - There is not enough GPU memory                  -> reduce batch_size.

# Arguments shared by every baseline 2 run; only top_k varies.
b2_test_kwargs = dict(
    merged_code_df=merged_code_df,
    unmasker=fill_mask_b2,
    window_size=window_size,
    batch_size=batch_size,
)

print("Top k = 1")
b2_result_k1 = model_test(top_k=1, **b2_test_kwargs)

print("Top k = 2")
b2_result_k2 = model_test(top_k=2, **b2_test_kwargs)

print("Top k = 3")
b2_result_k3 = model_test(top_k=3, **b2_test_kwargs)

In [None]:
# Saving is commented out to prevent accidentally overwriting existing result files.
# Uncomment to (re)write them; the Testing section reads these pickles back.

# with open("./baseline_results/b2_result_k1.pickle", "wb") as fw:
#     pickle.dump(b2_result_k1, fw)
#
# with open("./baseline_results/b2_result_k2.pickle", "wb") as fw:
#     pickle.dump(b2_result_k2, fw)
#
# with open("./baseline_results/b2_result_k3.pickle", "wb") as fw:
#     pickle.dump(b2_result_k3, fw)

### Testing

In [None]:
# Reload the saved baseline results from disk, in the order
# baseline 1 (k = 1, 2, 3) followed by baseline 2 (k = 1, 2, 3).
results = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "./baseline_results/b1_result_k1.pickle",
        "./baseline_results/b1_result_k2.pickle",
        "./baseline_results/b1_result_k3.pickle",
        "./baseline_results/b2_result_k1.pickle",
        "./baseline_results/b2_result_k2.pickle",
        "./baseline_results/b2_result_k3.pickle",
    )
]

# Keep the individual names bound to the reloaded objects as well.
(
    b1_result_k1,
    b1_result_k2,
    b1_result_k3,
    b2_result_k1,
    b2_result_k2,
    b2_result_k3,
) = results

In [None]:
def result_print(model_name, result_df):
    """Print the mean of ``result_df['similarity']`` labelled with ``model_name``.

    Args:
        model_name: Label printed before the colon.
        result_df: Object indexable by ``'similarity'`` whose values can be
            averaged by ``np.mean``.

    Returns:
        None. One line of the form ``<model_name>: <mean>`` is written to stdout.
    """
    mean_similarity = np.mean(result_df['similarity'])
    print(f"{model_name}: {mean_similarity}")

In [None]:
# Labels are generated in the same order as `results`:
# baseline 1 (top_k 1..3) first, then baseline 2 (top_k 1..3).
model_names = [
    f"B{model_num}, Top_k:{top_k}"
    for model_num in range(1, 3)
    for top_k in range(1, 4)
]

print("Average cosine similarity:")
for model_name, result in zip(model_names, results):
    result_print(model_name, result)