In [3]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from transformers import pipeline

from transformers import AutoTokenizer, AutoModelForMaskedLM
# Load the tokenizer and the masked-LM weights from one checkpoint name so the
# two can never drift apart when the checkpoint is swapped.
MODEL_NAME = "neulab/codebert-python"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

In [4]:
# Fill-mask pipeline wrapping the `model` and `tokenizer` objects created earlier in the notebook.
# Model link: https://huggingface.co/neulab/codebert-python?text=def+more_frequent_entry%28%3Cmask%3E%29%3A
unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [5]:
def output_print(input_sequence, true_labels=None, top_k=2, model=unmasker, mask_token="<mask>"):
    """Print the top-k fill-mask candidates for every mask in ``input_sequence``.

    Args:
        input_sequence: Text containing one or more mask tokens.
        true_labels: Optional sequence of expected fills, one per mask, in the
            order the masks appear. Extra labels are ignored.
        top_k: Number of candidates requested from ``model`` for each mask.
        model: Callable invoked as ``model(input_sequence, top_k=top_k)``. It
            must return either a flat list of candidate dicts (single mask) or
            a list of such lists (several masks); each candidate dict needs
            the keys ``'token_str'`` and ``'score'``.
        mask_token: Kept for backward compatibility. The layout of the model
            output, not a count of this token, decides how many masks were
            filled, so a mismatching value can no longer select the wrong
            printing path.

    Raises:
        ValueError: If ``true_labels`` is given but holds fewer entries than
            there are masks in the model output.
    """
    output = model(input_sequence, top_k=top_k)

    # Normalise both output layouts to "one candidate list per mask" so a
    # single printing loop serves the single- and multi-mask cases.
    single_mask = bool(output) and isinstance(output[0], dict)
    predictions = [output] if single_mask else output

    # Fail before printing anything instead of raising IndexError half-way
    # through the report.
    if true_labels and len(true_labels) < len(predictions):
        raise ValueError(
            f"Got {len(true_labels)} true label(s) for {len(predictions)} mask(s)."
        )

    separator = "-" * 50
    for index, candidates in enumerate(predictions):
        print(separator)
        # The mask index is only shown when there are several masks,
        # matching the original single-mask layout.
        if not single_mask:
            print(f"Mask number: {index}")
        if true_labels:
            print(f"True label: {true_labels[index]}")
            print("")
        for candidate in candidates:
            print(f"Predicted_word: {candidate['token_str']}")
            print(f"Probability: {round(candidate['score'], 3)}")
        print(separator)
        print("")

In [7]:
# Prompt 1: the function name is masked; the body scans a list for an extreme value.
# NOTE(review): max_value starts at float("inf"), so for numeric lists the test
# `max_value < list[i]` is never true and the snippet as written returns list[0].
# Confirm whether float("-inf") was intended before changing the prompt.
code_snippet_1 = """
def <mask>(list):
    max_value = float("inf")
    max_index = 0
    for i in range(len(list)):
        if max_value < list[i]:
            max_value = list[i]
            max_index = i
    return list[max_index]"""

In [8]:
# Print the candidates (default top_k=2) for the masked function name in code_snippet_1.
output_print(code_snippet_1)

--------------------------------------------------
Predicted_word:  max
Probability: 0.738
Predicted_word:  min
Probability: 0.156
--------------------------------------------------



In [9]:
# Prompt 2: masked name of a helper that draws an image with matplotlib calls.
# NOTE(review): the snippet calls plt.savefig() without a file name - confirm
# whether that is intentional for this prompt.
code_snippet_2 = """
def <mask>(img):
    plt.figure(figsize=(10, 10))
    plt.imshow(img)
    plt.tight_layout()
    plt.savefig()
"""

In [10]:
# Print the candidates (default top_k=2) for the masked function name in code_snippet_2.
output_print(code_snippet_2)

--------------------------------------------------
Predicted_word:  show
Probability: 0.686
Predicted_word:  plot
Probability: 0.111
--------------------------------------------------



In [11]:
# Prompt 3: merging two lists into a single dictionary; the function name is masked.
# NOTE(review): the parameters are named key_list/value_list but the body zips
# keys_list/values_list - confirm whether the mismatch is intended.
code_snippet_3 = """
def <mask>(key_list, value_list):
    return dict(zip(keys_list, values_list))
"""

In [12]:
# Print the candidates (default top_k=2) for the masked function name in code_snippet_3.
output_print(code_snippet_3)

--------------------------------------------------
Predicted_word:  dict
Probability: 0.302
Predicted_word:  map
Probability: 0.085
--------------------------------------------------



In [13]:
# Prompt 4: same functionality as code_snippet_3, implemented with an explicit
# loop that keeps the first value seen for each key.
# NOTE(review): as in code_snippet_3, the parameters are key_list/value_list
# while the body zips keys_list/values_list.
code_snippet_4 = """
def <mask>(key_list, value_list):
    items_tuples = zip(keys_list, values_list) 
    merged_dict = dict()
    for key, value in items_tuples: 
        if key in merged_dict: 
            pass
        else: 
            merged_dict[key] = value
            
    return merged_dict
"""

In [23]:
# Print the five highest-scoring candidates for the masked function name in code_snippet_4.
output_print(code_snippet_4, top_k=5)

--------------------------------------------------
Predicted_word:  merge
Probability: 0.833
Predicted_word:  union
Probability: 0.018
Predicted_word:  add
Probability: 0.017
Predicted_word:  combine
Probability: 0.015
Predicted_word:  update
Probability: 0.013
--------------------------------------------------



In [16]:
# Prompt 5: same code as code_snippet_4 with the parameters renamed to
# list_1/list_2 and the dictionary renamed to final_dict.
# NOTE(review): the zip() call still uses keys_list/values_list, so the body was
# not renamed along with the parameters - confirm whether that is intended.
code_snippet_5 = """
def <mask>(list_1, list_2):
    items_tuples = zip(keys_list, values_list) 
    final_dict = dict()
    for key, value in items_tuples: 
        if key in final_dict: 
            pass
        else: 
            final_dict[key] = value
            
    return final_dict
"""

In [24]:
# Print the five highest-scoring candidates for the masked function name in code_snippet_5.
output_print(code_snippet_5, top_k=5)

--------------------------------------------------
Predicted_word:  merge
Probability: 0.449
Predicted_word:  union
Probability: 0.052
Predicted_word:  add
Probability: 0.043
Predicted_word:  intersect
Probability: 0.038
Predicted_word:  update
Probability: 0.037
--------------------------------------------------



In [18]:
# Prompt 6: linear regression for the given datasets, from the scikit-learn website.
# Two function names are masked in one input: the first function fits a
# LinearRegression and prints its score and coefficients, the second calls
# model.predict.
code_snippet_6 = """
def <mask>(X, y):
    reg = LinearRegression().fit(X, y)
    print(reg.score(X, y), reg.coef_)
    return reg
    
def <mask>(X, model):
    return model.predict(X)
"""

In [19]:
# Print the candidates (default top_k=2) for each of the two masks in code_snippet_6.
output_print(code_snippet_6)

--------------------------------------------------
Mask number: 0
Predicted_word:  fit
Probability: 0.599
Predicted_word:  train
Probability: 0.137
--------------------------------------------------

--------------------------------------------------
Mask number: 1
Predicted_word:  predict
Probability: 0.823
Predicted_word:  evaluate
Probability: 0.042
--------------------------------------------------



Testing with code from my project

In [26]:
# Prompt with five masks: the function name, its second parameter, and the
# targets of three assignments inside the body (a timedelta, a resampled frame,
# and a lambda). The unmasked references further down in the body
# (resample_timedelta, df_resampled, target_column_name, label_check) show the
# original identifiers.
code_plot = """
def <mask>(df, <mask>, resample_minutes, colour=None, draw_plot=True, title=None, fill=True, label=None):
    if draw_plot:
        fig = plt.figure(figsize=(20, 8))
        
    <mask> = datetime.timedelta(minutes=resample_minutes)
    <mask> = df.resample(resample_timedelta, on='timestamp').mean(numeric_only=True)
    
    <mask> = lambda x: target_column_name if label is None else label
    if fill and not colour:
        plt.fill_between(df_resampled.index, df_resampled[target_column_name], label = label_check(label))
    elif not fill and not colour:
        plt.plot(df_resampled.index, df_resampled[target_column_name], linewidth = 2, label = label_check(label))
    else:
        plt.plot(df_resampled.index, df_resampled[target_column_name], linewidth = 2, color = colour, label = label_check(label))
        
    plt.xlabel("Time", fontsize=18)
    plt.ylabel(f'{target_column_name} count', fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    
    if title is None:
        plt.title(f"Distribution of {target_column_name} over time", fontsize=18) 
    else:
        plt.title(title, fontsize=20)"""

In [27]:
# Ground-truth identifiers for the five masks in code_plot, in order of appearance.
# The third label is spelled "resample_timedelta" to match the identifier the
# prompt itself passes to df.resample().
# TODO(review): the original note here read "Evaluation by Perplexity, BLEU, and
# cosine difference by word2vec"; no such metric is computed in this cell yet.
code_plot_labels = ["draw_time_dist", "target_column_name", "resample_timedelta", "df_resampled", "label_check"]
output_print(code_plot, code_plot_labels)

--------------------------------------------------
Mask number: 0
True label: draw_time_dist

Predicted_word:  plot
Probability: 0.366
Predicted_word:  hist
Probability: 0.116
--------------------------------------------------

--------------------------------------------------
Mask number: 1
True label: target_column_name

Predicted_word:  fig
Probability: 0.104
Predicted_word:  target
Probability: 0.078
--------------------------------------------------

--------------------------------------------------
Mask number: 2
True label: resample_timedelat

Predicted_word: dt
Probability: 0.355
Predicted_word: t
Probability: 0.174
--------------------------------------------------

--------------------------------------------------
Mask number: 3
True label: df_resampled

Predicted_word: df
Probability: 0.522
Predicted_word: mean
Probability: 0.113
--------------------------------------------------

--------------------------------------------------
Mask number: 4
True label: label_check

P