In [1]:
import pickle
import pandas as pd
import numpy as np
import torch

# Helpers
from testing import *
# Baseline model 1
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline

# Device index handed to the fill-mask pipeline below: the current CUDA
# device when GPU acceleration is available, otherwise -1 (CPU).
device_num = torch.cuda.current_device() if torch.cuda.is_available() else -1

In [2]:
# Run this (uncomment the next line) if you removed the pickle files
# saving_pickles()

# Unpack the three objects returned by the `loading_pickles` helper
# (presumably the train / validation / test DataFrames — confirm in testing.py).
train_df, valid_df, test_df = loading_pickles()

In [3]:
# Constants

# Passed to mask_variable_df as `mask_prob`
# (presumably the probability of masking each variable — confirm in testing.py).
mask_prob = 0.5
# Passed to baseline_test as `window_size`; reduce it if an input is bigger
# than the maximum model input.
window_size = 100
# Passed to baseline_test as `batch_size`; reduce it if there is not enough GPU memory.
batch_size = 50

In [4]:
# Mask the validation dataframe once and reuse the result in every test run,
# so that all models are compared fairly on the same masked variables
# (masked with the probability given by `mask_prob`).
masked_code_df = mask_variable_df(valid_df, mask_prob=mask_prob)
# Place the original and the masked frames side by side (column-wise concat).
merged_code_df = pd.concat((valid_df, masked_code_df), axis=1)

Masking: 100%|██████████| 23107/23107 [00:07<00:00, 3174.12it/s]


### Baseline model 1

Source: https://huggingface.co/microsoft/codebert-base-mlm
As stated in https://github.com/microsoft/CodeBERT, the base CodeBERT model is not suitable for the fill-mask task, so the MLM variant (`codebert-base-mlm`) is used here instead.

In [5]:
# Baseline 1: load the model and tokenizer from the same checkpoint and wrap
# them in a fill-mask pipeline running on the device selected above.
b1_checkpoint = 'microsoft/codebert-base-mlm'
model_b1 = RobertaForMaskedLM.from_pretrained(b1_checkpoint)
tokenizer_b1 = RobertaTokenizer.from_pretrained(b1_checkpoint)
fill_mask_b1 = pipeline(
    'fill-mask',
    model=model_b1,
    tokenizer=tokenizer_b1,
    device=device_num,
)

In [6]:
# Running this cell may raise an exception in the following situations:
# 1. An input is bigger than the maximum model input -> reduce window_size.
# 2. There is not enough GPU memory -> reduce batch_size.
def _run_b1(top_k):
    """Announce the run and call baseline_test for Baseline 1 with the given top_k.

    All other arguments (merged dataframe, unmasker, window and batch size)
    are the shared notebook-level settings.
    """
    print(f"Top k = {top_k}")
    return baseline_test(
        merged_code_df=merged_code_df,
        unmasker=fill_mask_b1,
        top_k=top_k,
        window_size=window_size,
        batch_size=batch_size,
    )


b1_result_k1 = _run_b1(1)
b1_result_k2 = _run_b1(2)
b1_result_k3 = _run_b1(3)

Baseline model 1

Top k = 1


Window split: 100%|██████████| 23107/23107 [00:02<00:00, 8428.76it/s]
Prediction: 100%|██████████| 67732/67732 [32:09<00:00, 35.09it/s]
Similarity: 100%|██████████| 67732/67732 [13:09<00:00, 85.77it/s]


Top k = 2


Window split: 100%|██████████| 23107/23107 [00:03<00:00, 7549.28it/s]
Prediction: 100%|██████████| 67732/67732 [34:20<00:00, 32.87it/s] 
Similarity: 100%|██████████| 67732/67732 [11:58<00:00, 94.25it/s]


Top k = 3


Window split: 100%|██████████| 23107/23107 [00:02<00:00, 8980.06it/s]
Prediction: 100%|██████████| 67732/67732 [38:29<00:00, 29.32it/s]  
Similarity: 100%|██████████| 67732/67732 [12:43<00:00, 88.75it/s]


In [13]:
# Commented out to prevent accidentally overwriting these files.

# with open("./baseline_results/b1_result_k1.pickle", "wb") as fw:
#     pickle.dump(b1_result_k1, fw)
# 
# with open("./baseline_results/b1_result_k2.pickle", "wb") as fw:
#     pickle.dump(b1_result_k2, fw)
#
# with open("./baseline_results/b1_result_k3.pickle", "wb") as fw:
#     pickle.dump(b1_result_k3, fw)

FileNotFoundError: [Errno 2] No such file or directory: '../baseline_results/b1_result_k1.pickle'

### Baseline model 2

In [None]:
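# TODO: run Baseline model 2 and save its results here (the Testing section below loads b2_result_k1.pickle, b2_result_k2.pickle and b2_result_k3.pickle).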

### Testing

In [None]:
# Load the pickled results of both baselines, ordered B1 (k=1..3) then B2 (k=1..3).
# NOTE(review): these paths are relative to the working directory, whereas the
# commented-out save cell writes to ./baseline_results/ — confirm where the files live.
results = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "b1_result_k1.pickle",
        "b1_result_k2.pickle",
        "b1_result_k3.pickle",
        "b2_result_k1.pickle",
        "b2_result_k2.pickle",
        "b2_result_k3.pickle",
    )
]

# Keep the individual names available alongside the combined list.
(
    b1_result_k1, b1_result_k2, b1_result_k3,
    b2_result_k1, b2_result_k2, b2_result_k3,
) = results

In [None]:
def result_print(model_name, result_df):
    """Print the mean of the 'similarity' values of a result, labelled by model.

    Args:
        model_name: Label printed in front of the value.
        result_df: Object indexable by 'similarity' (e.g. a DataFrame) whose
            values np.mean can average.
    """
    mean_similarity = np.mean(result_df['similarity'])
    print(f"{model_name}: {mean_similarity}")

In [None]:
# Labels in the same order as `results`: B1 (k=1..3) followed by B2 (k=1..3).
model_names = [
    f"B{model_num}, Top_k:{top_k}"
    for model_num in range(1, 3)
    for top_k in range(1, 4)
]

print("Average cosine similarity:\n")
for model_name, result in zip(model_names, results):
    # Pass the loop variable `result`; the previous `result_df` is not defined
    # at notebook scope and raised a NameError on a fresh kernel.
    result_print(model_name, result)