In [None]:
'''
This notebook tokenizes each method (using a Python tokenizer), flattens it, and replaces the target if statement with the token <MASK>
for the training, validation, and test datasets.
'''

In [None]:
import pandas as pd

In [None]:
# Load the three dataset splits (train / validation / test) from CSV files
# located in the current working directory.
df_train, df_valid, df_test = [
    pd.read_csv(csv_path)
    for csv_path in ("ft_train.csv", "ft_valid.csv", "ft_test.csv")
]

In [None]:
from pygments.lexers import PythonLexer
from pygments.token import Token
from pygments import lex

In [None]:
def tokenize_code_with_tab(code):
    """Tokenize Python source with pygments, keeping indentation as <TAB> tokens.

    Every token whose type falls under Token.Text is treated as whitespace:
      * if its length is a multiple of 4 it must consist solely of spaces and
        is emitted as ONE token made of "<TAB>" repeated once per 4 spaces
        (e.g. 8 spaces -> "<TAB><TAB>");
      * otherwise (single spaces, newlines, ...) it is dropped.
    All other tokens are emitted with their text unchanged.

    Args:
        code: Python source code to tokenize.

    Returns:
        A list of token strings in source order.

    Raises:
        AssertionError: If a Token.Text value has a length divisible by 4 but
            is not made up exclusively of space characters.
    """
    spaces_per_tab = 4  # one indentation level is assumed to be 4 spaces
    result = []
    for kind, text in lex(code, PythonLexer()):
        if kind not in Token.Text:
            # Real code token: keep verbatim.
            result.append(text)
            continue

        width = len(text)
        if width % spaces_per_tab != 0:
            # Ordinary spacing / newline: not indentation, so skip it.
            continue

        tab_count = width // spaces_per_tab
        # Sanity check: the run must really be spaces only.
        assert text == " " * width
        result.append("<TAB>" * tab_count)
    return result

In [None]:
def tokenize_code_no_tab(code):
    """Tokenize Python source with pygments, discarding all whitespace tokens.

    Args:
        code: Python source code to tokenize.

    Returns:
        A list with the text of every token whose type is not under
        Token.Text (spaces, indentation and newlines are dropped), in source
        order.
    """
    return [
        text
        for kind, text in lex(code, PythonLexer())
        if kind not in Token.Text
    ]

In [None]:
def mask_if_statement(method, target, with_tab):
    """Flatten `method` into a space-separated token string with `target` masked.

    The method and the target are both tokenized, and every occurrence of the
    target's token sequence inside the method's token list is replaced by the
    single token "<MASK>". Matching is done on whole tokens, so a target such
    as `if x :` is never matched inside a longer token sequence like
    `elif x :` or `if x := ...` (which a plain substring search on the joined
    string would do, corrupting the neighbouring token).

    Args:
        method: Source code of the method that contains the target.
        target: Source code of the statement to mask.
        with_tab: If true, the method is tokenized with
            tokenize_code_with_tab (indentation kept as "<TAB>" tokens);
            otherwise with tokenize_code_no_tab. The target is always
            tokenized without tab tokens.

    Returns:
        The method's tokens joined by single spaces, with each token-aligned
        occurrence of the target replaced by "<MASK>".

    Raises:
        AssertionError: If the target yields no tokens, or if its token
            sequence does not occur in the tokenized method.
    """
    # flatten cleaned method
    tokenize_method = tokenize_code_with_tab if with_tab else tokenize_code_no_tab
    method_tokens = tokenize_method(method)
    target_tokens = tokenize_code_no_tab(target)

    # An empty target would trivially "match" everywhere and produce garbage,
    # so reject it up front.
    assert target_tokens, "target produced no tokens; nothing to mask"

    width = len(target_tokens)
    masked_tokens = []
    found = False
    pos = 0
    while pos < len(method_tokens):
        if method_tokens[pos:pos + width] == target_tokens:
            # Collapse the whole matched token run into one mask token.
            masked_tokens.append("<MASK>")
            pos += width
            found = True
        else:
            masked_tokens.append(method_tokens[pos])
            pos += 1

    # make sure the target if statement is found in the cleaned method
    assert found, "target token sequence not found in tokenized method"
    return " ".join(masked_tokens)  # join tokens into a string separated by a space

In [None]:
# Mask and flatten every training row twice: once keeping indentation as
# <TAB> tokens and once without it.
masked_method_with_tab_list = []
masked_method_no_tab_list = []
for i in range(len(df_train)):
    try:
        row = df_train.iloc[i]
        method, target = row["cleaned_method"], row["target_block"]

        # variant with <TAB> tokens
        masked_method_with_tab_list.append(mask_if_statement(method, target, True))

        # variant without <TAB> tokens
        masked_method_no_tab_list.append(mask_if_statement(method, target, False))
    except Exception as e:
        # Show the positional index of the offending row, then propagate.
        print(i)
        raise e

In [None]:
# Add the masked and flattened methods as two new columns of the training
# DataFrame: one variant built with <TAB> tokens, one built without them.
# Each list must have exactly one entry per row of df_train.
df_train["masked_with_tab"] = masked_method_with_tab_list
df_train["masked_no_tab"] = masked_method_no_tab_list

In [None]:
# Save the augmented training DataFrame to CSV in the working directory.
# to_csv is called with its defaults, so the DataFrame index is written too
# (as an unnamed leading column).
df_train.to_csv("ft_train_masked.csv")

In [None]:
# Mask and flatten every validation row twice: once keeping indentation as
# <TAB> tokens and once without it. The result lists are reset first.
masked_method_with_tab_list = []
masked_method_no_tab_list = []
for i in range(len(df_valid)):
    row = df_valid.iloc[i]
    method, target = row["cleaned_method"], row["target_block"]
    masked_method_with_tab_list.append(mask_if_statement(method, target, True))
    masked_method_no_tab_list.append(mask_if_statement(method, target, False))

In [None]:
# Attach both masked/flattened variants to the validation DataFrame as new
# columns, then write it to CSV. to_csv runs with its defaults, so the index
# is written as an unnamed leading column.
df_valid["masked_with_tab"] = masked_method_with_tab_list
df_valid["masked_no_tab"] = masked_method_no_tab_list
df_valid.to_csv("ft_valid_masked.csv")

In [None]:
# Mask and flatten every test row twice: once keeping indentation as
# <TAB> tokens and once without it. The result lists are reset first.
masked_method_with_tab_list = []
masked_method_no_tab_list = []
for i in range(len(df_test)):
    row = df_test.iloc[i]
    method, target = row["cleaned_method"], row["target_block"]
    masked_method_with_tab_list.append(mask_if_statement(method, target, True))
    masked_method_no_tab_list.append(mask_if_statement(method, target, False))

In [None]:
# Attach both masked/flattened variants to the test DataFrame as new columns.
# Each list must have exactly one entry per row of df_test.
df_test["masked_with_tab"] = masked_method_with_tab_list
df_test["masked_no_tab"] = masked_method_no_tab_list

In [None]:
# Save the augmented test DataFrame to CSV in the working directory.
# to_csv is called with its defaults, so the DataFrame index is written too
# (as an unnamed leading column).
df_test.to_csv("ft_test_masked.csv")