In [1]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from transformers import pipeline

from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("neulab/codebert-python")
model = AutoModelForMaskedLM.from_pretrained("neulab/codebert-python")

In [2]:
# Model link: https://huggingface.co/neulab/codebert-python?text=def+more_frequent_entry%28%3Cmask%3E%29%3A
unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [3]:
def output_print(output, true_labels):
    for index, word_prediction in enumerate(output):
        print("-" * 50)
        print(f"Mask number: {index}")
        print(f"True label: {true_labels[index]}")
        for candidate in word_prediction:
            print("")
            print(f"Predicted_word: {candidate['token_str']}")
            print(f"Probability: {round(candidate['score'], 3)}")
        print("-" * 50)
        print("")

In [4]:
code_snippet_1 = """
def <mask>(list):
    max_value = float("inf")
    max_index = 0
    for i in range(len(list)):
        if max_value < list[i]:
            max_value = list[i]
            max_index = i
    return list[max_index]"""

In [5]:
unmasker(code_snippet_1, top_k=5)

[{'score': 0.7382919788360596,
  'token': 19220,
  'token_str': ' max',
  'sequence': '\ndef max(list):\n    max_value = float("inf")\n    max_index = 0\n    for i in range(len(list)):\n        if max_value < list[i]:\n            max_value = list[i]\n            max_index = i\n    return list[max_index]'},
 {'score': 0.15644575655460358,
  'token': 5251,
  'token_str': ' min',
  'sequence': '\ndef min(list):\n    max_value = float("inf")\n    max_index = 0\n    for i in range(len(list)):\n        if max_value < list[i]:\n            max_value = list[i]\n            max_index = i\n    return list[max_index]'},
 {'score': 0.029951967298984528,
  'token': 4532,
  'token_str': ' maximum',
  'sequence': '\ndef maximum(list):\n    max_value = float("inf")\n    max_index = 0\n    for i in range(len(list)):\n        if max_value < list[i]:\n            max_value = list[i]\n            max_index = i\n    return list[max_index]'},
 {'score': 0.0072595286183059216,
  'token': 275,
  'token_str':

In [6]:
code_snippet_2 = """
def <mask>(img):
    plt.figure(figsize=(10, 10))
    plt.imshow(img)
    plt.tight_layout()
    plt.savefig()
"""

In [7]:
unmasker(code_snippet_2, top_k=5)

[{'score': 0.6859515905380249,
  'token': 311,
  'token_str': ' show',
  'sequence': '\ndef show(img):\n    plt.figure(figsize=(10, 10))\n    plt.imshow(img)\n    plt.tight_layout()\n    plt.savefig()\n'},
 {'score': 0.11137401312589645,
  'token': 6197,
  'token_str': ' plot',
  'sequence': '\ndef plot(img):\n    plt.figure(figsize=(10, 10))\n    plt.imshow(img)\n    plt.tight_layout()\n    plt.savefig()\n'},
 {'score': 0.021623341366648674,
  'token': 2451,
  'token_str': ' draw',
  'sequence': '\ndef draw(img):\n    plt.figure(figsize=(10, 10))\n    plt.imshow(img)\n    plt.tight_layout()\n    plt.savefig()\n'},
 {'score': 0.017904477193951607,
  'token': 1296,
  'token_str': ' test',
  'sequence': '\ndef test(img):\n    plt.figure(figsize=(10, 10))\n    plt.imshow(img)\n    plt.tight_layout()\n    plt.savefig()\n'},
 {'score': 0.012548865750432014,
  'token': 2274,
  'token_str': ' image',
  'sequence': '\ndef image(img):\n    plt.figure(figsize=(10, 10))\n    plt.imshow(img)\n    

In [8]:
# Merging two lists into a single dictionary
code_snippet_3 = """
def <mask>(key_list, value_list):
    return dict(zip(keys_list, values_list))
"""

In [9]:
unmasker(code_snippet_3, top_k=5)

[{'score': 0.3016514480113983,
  'token': 28700,
  'token_str': ' dict',
  'sequence': '\ndef dict(key_list, value_list):\n    return dict(zip(keys_list, values_list))\n'},
 {'score': 0.08486815541982651,
  'token': 5456,
  'token_str': ' map',
  'sequence': '\ndef map(key_list, value_list):\n    return dict(zip(keys_list, values_list))\n'},
 {'score': 0.06300310790538788,
  'token': 19388,
  'token_str': ' merge',
  'sequence': '\ndef merge(key_list, value_list):\n    return dict(zip(keys_list, values_list))\n'},
 {'score': 0.05066296085715294,
  'token': 146,
  'token_str': ' make',
  'sequence': '\ndef make(key_list, value_list):\n    return dict(zip(keys_list, values_list))\n'},
 {'score': 0.033224646002054214,
  'token': 2935,
  'token_str': ' update',
  'sequence': '\ndef update(key_list, value_list):\n    return dict(zip(keys_list, values_list))\n'}]

In [10]:
# The same functionality compared to the previous one, but with a different implementation
code_snippet_4 = """
def <mask>(key_list, value_list):
    items_tuples = zip(keys_list, values_list) 
    merged_dict = dict()
    for key, value in items_tuples: 
        if key in merged_dict: 
            pass
        else: 
            merged_dict[key] = value
            
    return merged_dict
"""

In [11]:
unmasker(code_snippet_4, top_k=5)

[{'score': 0.8331116437911987,
  'token': 19388,
  'token_str': ' merge',
  'sequence': '\ndef merge(key_list, value_list):\n    items_tuples = zip(keys_list, values_list) \n    merged_dict = dict()\n    for key, value in items_tuples: \n        if key in merged_dict: \n            pass\n        else: \n            merged_dict[key] = value\n            \n    return merged_dict\n'},
 {'score': 0.018189143389463425,
  'token': 2918,
  'token_str': ' union',
  'sequence': '\ndef union(key_list, value_list):\n    items_tuples = zip(keys_list, values_list) \n    merged_dict = dict()\n    for key, value in items_tuples: \n        if key in merged_dict: \n            pass\n        else: \n            merged_dict[key] = value\n            \n    return merged_dict\n'},
 {'score': 0.017174458131194115,
  'token': 1606,
  'token_str': ' add',
  'sequence': '\ndef add(key_list, value_list):\n    items_tuples = zip(keys_list, values_list) \n    merged_dict = dict()\n    for key, value in items_tupl

In [12]:
# The same code compared to the previous one, but with different variable names
code_snippet_5 = """
def <mask>(list_1, list_2):
    items_tuples = zip(keys_list, values_list) 
    final_dict = dict()
    for key, value in items_tuples: 
        if key in final_dict: 
            pass
        else: 
            final_dict[key] = value
            
    return final_dict
"""

In [13]:
unmasker(code_snippet_5, top_k=5)

[{'score': 0.4490654766559601,
  'token': 19388,
  'token_str': ' merge',
  'sequence': '\ndef merge(list_1, list_2):\n    items_tuples = zip(keys_list, values_list) \n    final_dict = dict()\n    for key, value in items_tuples: \n        if key in final_dict: \n            pass\n        else: \n            final_dict[key] = value\n            \n    return final_dict\n'},
 {'score': 0.051676101982593536,
  'token': 2918,
  'token_str': ' union',
  'sequence': '\ndef union(list_1, list_2):\n    items_tuples = zip(keys_list, values_list) \n    final_dict = dict()\n    for key, value in items_tuples: \n        if key in final_dict: \n            pass\n        else: \n            final_dict[key] = value\n            \n    return final_dict\n'},
 {'score': 0.043209727853536606,
  'token': 1606,
  'token_str': ' add',
  'sequence': '\ndef add(list_1, list_2):\n    items_tuples = zip(keys_list, values_list) \n    final_dict = dict()\n    for key, value in items_tuples: \n        if key in fin

In [14]:
# Linear regression for the given datasets, from the scikit learn website
code_snippet_6 = """
def <mask>(X, y):
    reg = LinearRegression().fit(X, y)
    print(reg.score(X, y), reg.coef_)
    return reg
    
def <mask>(X, model):
    return model.predict(X)
"""

In [15]:
unmasker(code_snippet_6, top_k=3)

[[{'score': 0.5985538363456726,
   'token': 2564,
   'token_str': ' fit',
   'sequence': '<s>\ndef fit(X, y):\n    reg = LinearRegression().fit(X, y)\n    print(reg.score(X, y), reg.coef_)\n    return reg\n    \ndef<mask>(X, model):\n    return model.predict(X)\n</s>'},
  {'score': 0.13737018406391144,
   'token': 2341,
   'token_str': ' train',
   'sequence': '<s>\ndef train(X, y):\n    reg = LinearRegression().fit(X, y)\n    print(reg.score(X, y), reg.coef_)\n    return reg\n    \ndef<mask>(X, model):\n    return model.predict(X)\n</s>'},
  {'score': 0.020674511790275574,
   'token': 1421,
   'token_str': ' model',
   'sequence': '<s>\ndef model(X, y):\n    reg = LinearRegression().fit(X, y)\n    print(reg.score(X, y), reg.coef_)\n    return reg\n    \ndef<mask>(X, model):\n    return model.predict(X)\n</s>'}],
 [{'score': 0.8229965567588806,
   'token': 7006,
   'token_str': ' predict',
   'sequence': '<s>\ndef<mask>(X, y):\n    reg = LinearRegression().fit(X, y)\n    print(reg.scor

Testing with codes from my project

In [16]:
code_plot = """
def <mask>(df, target_column_name, resample_minutes, colour=None, draw_plot=True, title=None, fill=True, label=None):
    if draw_plot:
        fig = plt.figure(figsize=(20, 8))
        
    <mask> = datetime.timedelta(minutes=resample_minutes)
    <mask> = df.resample(resample_timedelta, on='timestamp').mean(numeric_only=True)
    
    <mask> = lambda x: target_column_name if label is None else label
    if fill and not colour:
        plt.fill_between(df_resampled.index, df_resampled[target_column_name], label = label_check(label))
    elif not fill and not colour:
        plt.plot(df_resampled.index, df_resampled[target_column_name], linewidth = 2, label = label_check(label))
    else:
        plt.plot(df_resampled.index, df_resampled[target_column_name], linewidth = 2, color = colour, label = label_check(label))
        
    plt.xlabel("Time", fontsize=18)
    plt.ylabel(f'{target_column_name} count', fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    
    if title is None:
        plt.title(f"Distribution of {target_column_name} over time", fontsize=18) 
    else:
        plt.title(title, fontsize=20)"""

In [17]:
code_plot_labels = ["draw_time_dist", "resample_timedelat", "df_resampled", "label_check"]
output_print(unmasker(code_plot, top_k=2), code_plot_labels)

--------------------------------------------------
Mask number: 0
True label: draw_time_dist

Predicted_word:  plot
Probability: 0.24

Predicted_word:  hist
Probability: 0.19
--------------------------------------------------

--------------------------------------------------
Mask number: 1
True label: resample_timedelat

Predicted_word: dt
Probability: 0.377

Predicted_word: t
Probability: 0.16
--------------------------------------------------

--------------------------------------------------
Mask number: 2
True label: df_resampled

Predicted_word: df
Probability: 0.401

Predicted_word: mean
Probability: 0.184
--------------------------------------------------

--------------------------------------------------
Mask number: 3
True label: label_check

Predicted_word: label
Probability: 0.878

Predicted_word: check
Probability: 0.067
--------------------------------------------------



In [18]:
codeNAT1 = '''
def <mask>(epsilon):
    ranks = []
    i = 0
    for i in range(len(epsilon)):
        e = epsilon[i]
        <mask> = np.array([[2,0],[2,e]])
        ranks.append(np.linalg.matrix_rank(C))
    return ranks
'''

Nat_LABELS = ['ranklist','C']

In [19]:
output_print(unmasker(codeNAT1, top_k=2), Nat_LABELS)

--------------------------------------------------
Mask number: 0
True label: ranklist

Predicted_word:  rank
Probability: 0.623

Predicted_word:  ranks
Probability: 0.065
--------------------------------------------------

--------------------------------------------------
Mask number: 1
True label: C

Predicted_word: C
Probability: 0.981

Predicted_word:  C
Probability: 0.013
--------------------------------------------------



In [20]:
Natlabel2='''
<mask> = sp.interpolate.interp1d( x0, y0,'quadratic')
'''

labels2=['cubic']

output_print(unmasker(Natlabel2, top_k=2), labels2)

--------------------------------------------------
Mask number: 0
True label: cubic



TypeError: string indices must be integers

In [21]:
unmasker(Natlabel2, top_k=2)

[{'score': 0.07793566584587097,
  'token': 102,
  'token_str': 'a',
  'sequence': "\na = sp.interpolate.interp1d( x0, y0,'quadratic')\n"},
 {'score': 0.04366392642259598,
  'token': 329,
  'token_str': 'z',
  'sequence': "\nz = sp.interpolate.interp1d( x0, y0,'quadratic')\n"}]

In [23]:
def output_print(output, true_labels):
    for index, word_prediction in enumerate(output):
        print("-" * 50)
        print(f"Mask number: {index}")
        print(f"True label: {true_labels[index]}")
        for candidate in word_prediction:
            print("")
            print(f"Predicted_word: {candidate['token_str']}")
            print(f"Probability: {round(candidate['score'], 3)}")
        print("-" * 50)
        print("")

In [None]:
for index, word_prediction in enumerate(unmasker(Natlabel2, top_k=2)):
    print(index)

In [22]:
Natlabel2='''
<mask> = sp.interpolate.interp1d( x0, y0,'quadratic')
'''

labels2=['cubic']

output_print(unmasker(Natlabel2, top_k=2), labels2)

--------------------------------------------------
Mask number: 0
True label: cubic



TypeError: string indices must be integers