In [1]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob
import re

from transformers import pipeline
import torch

### Helper methods

In [72]:
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    Parameters
    ----------
    file_list : iterable of str
        Paths of gzip-compressed JSON-lines files.
    columns : list of str, optional
        Columns to keep from every file. ``None`` (the default) keeps all
        columns.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in ``file_list`` order. Each file's
        own row index is kept, so index values can repeat across files.

    Raises
    ------
    ValueError
        If ``file_list`` is empty.
    """
    file_list = list(file_list)
    if not file_list:
        # pd.concat would otherwise fail with an opaque "No objects to concatenate".
        raise ValueError("file_list is empty: no jsonl.gz files to load")

    frames = []
    for f in file_list:
        frame = pd.read_json(f, orient='records', compression='gzip', lines=True)
        # Indexing with None raises a KeyError, so only subset when columns are given.
        frames.append(frame if columns is None else frame[columns])
    return pd.concat(frames, sort=False)
def get_dfs(path):
    """Load the train, valid and test splits found under ``path``.

    For each split name, every ``*.gz`` file inside a directory whose name
    starts with that split name is read (in sorted path order) and reduced to
    the ``func_name``, ``code``, ``code_tokens`` and ``repo`` columns.

    Returns a list of three DataFrames ordered ``[train, valid, test]``.
    """
    wanted_columns = ["func_name", "code", "code_tokens", "repo"]

    def _load_split(split_name):
        # Sorting keeps the row order independent of the filesystem listing order.
        pattern = path + "/" + split_name + "**/*.gz"
        return jsonl_list_to_dataframe(sorted(glob.glob(pattern)), wanted_columns)

    return [_load_split(split_name) for split_name in ("train", "valid", "test")]

def mask_variable_names(code, mask_prob, mask_token):
    """
    Mask assigned variable names in a piece of code with a given probability.

    A "statement" is any text matched by the assignment pattern below (from a
    word character up to the end of the line or a ``#``, containing ``=``).
    The text before the first ``=`` is split on commas into candidate names;
    statements whose left part contains a parenthesis (calls, ``def`` lines,
    keyword arguments) are left untouched.  Every candidate is masked
    independently with probability ``mask_prob``, and a masked name is
    replaced everywhere it appears as a whole token inside that statement.

    Parameters
    ----------
    code : str
        Source code to process.
    mask_prob : float
        Probability of masking each candidate name (0 masks nothing, 1 masks
        every candidate).  Draws come from NumPy's global random state.
    mask_token : str
        Text inserted in place of a masked name.

    Returns
    -------
    tuple
        ``(code, None)`` when the pattern matches nothing, otherwise
        ``(masked_code, masked_labels)`` where ``masked_labels`` lists the
        masked names in order of appearance (possibly empty).

    Notes
    -----
    The detection is a heuristic: because the split happens on the first
    ``=``, comparisons such as ``if x == 1`` and augmented assignments such
    as ``x += 1`` are still treated as assignments.
    """
    # Regular expression pattern to match variable assignments
    pattern = r"\b(\w.*\s*=\s*[^#\n]*)"

    # If no variable is found
    if re.search(pattern, code) is None:
        return code, None

    masked_labels = list()

    def _mask_statement(found):
        """Return the matched statement with the randomly chosen names masked."""
        statement = found.group(0)

        # Split the statement by the equal sign, and check if the first sub-part
        # contains any parenthesis. If not, then the first sub-part is variable(s).
        first_sub_part = statement.split("=")[0]
        if "(" in first_sub_part or ")" in first_sub_part:
            return statement

        # Ordered de-duplication keeps the random draws reproducible under a seed;
        # empty fragments (e.g. from a trailing comma) are never valid names.
        names = list(dict.fromkeys(
            piece.strip() for piece in first_sub_part.split(",") if piece.strip()
        ))

        # Masking variables based on the mask_prob
        chosen = [name for name in names if np.random.uniform() < mask_prob]
        if not chosen:
            return statement
        masked_labels.extend(chosen)

        # One pass over the statement with all chosen names, longest first, so a
        # freshly inserted mask token is never rescanned and a short name cannot
        # pre-empt a longer one.  The lookarounds restrict hits to whole tokens,
        # e.g. masking "x" leaves "max" intact.
        alternatives = "|".join(
            re.escape(name) for name in sorted(chosen, key=len, reverse=True)
        )
        token_pattern = r"(?<!\w)(?:" + alternatives + r")(?!\w)"
        # A callable replacement inserts mask_token literally (no escape processing).
        return re.sub(token_pattern, lambda _hit: mask_token, statement)

    # Substituting per match edits each statement in place, so every recorded
    # label corresponds to a statement that was really masked.
    masked_code = re.sub(pattern, _mask_statement, code)
    return masked_code, masked_labels
        
def mask_variable_df(df, code_column_name="code", mask_prob=0.5, mask_token="<mask>", return_df=True):
    """Apply ``mask_variable_names`` to every row of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one code snippet per row.
    code_column_name : str
        Name of the column that contains the code (default ``"code"``).
    mask_prob : float
        Probability passed on to ``mask_variable_names`` (default 0.5).
    mask_token : str
        Mask text passed on to ``mask_variable_names`` (default ``"<mask>"``).
    return_df : bool
        Selects the return type, see below.

    Returns
    -------
    pandas.DataFrame or tuple of lists
        With ``return_df=True`` a new DataFrame with columns ``masked_code``
        and ``masked_code_label`` and a fresh 0..n-1 index (the index of
        ``df`` is not carried over); otherwise the two lists
        ``(masked_code_list, variable_labels_list)``.
    """
    masked_code_list = list()
    variable_labels_list = list()

    # Read from the requested column (previously "code" was hard-coded, so
    # code_column_name had no effect). Iterating the column directly also
    # avoids building a Series per row as iterrows() does.
    for code in df[code_column_name]:
        masked_code, variable_labels = mask_variable_names(code, mask_prob, mask_token)
        masked_code_list.append(masked_code)
        variable_labels_list.append(variable_labels)

    if return_df:
        return pd.DataFrame({"masked_code" : masked_code_list, "masked_code_label" : variable_labels_list})
    else:
        return masked_code_list, variable_labels_list

In [73]:
# One-off conversion of the original jsonl.gz splits into pickle files
# (kept commented out; uncomment to regenerate the pickles).
# df_train, df_valid, df_test = get_dfs("data/codenet/python/final/jsonl")

# df_train.to_pickle("train.pickle")
# df_valid.to_pickle("valid.pickle")
# df_test.to_pickle("test.pickle")

# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
# reset_index(..., inplace=True) returns None, which would leave these names bound
# to None, so use the non-inplace form and keep the returned DataFrame.
df_train = pd.read_pickle("train.pickle").reset_index(drop=True)
df_valid = pd.read_pickle("valid.pickle").reset_index(drop=True)
df_test = pd.read_pickle("test.pickle").reset_index(drop=True)

### Variable masker examples

Adjust `mask_prob` to control the probability that each detected variable is masked. It is currently set to 1, so every detected variable is masked (100%).

In [74]:
# Mask every detected variable (mask_prob=1) in the first rows of the training split.
df_train_head = df_train.head()
result = mask_variable_df(df_train_head, mask_prob=1)

# A bare last expression renders the frame as the cell output; an assignment alone shows nothing.
result

Unnamed: 0,masked_code,masked_code_label
0,"def train(train_dir, model_save_path=None, n_n...","[X, y, image, face_bounding_boxes, n_neighbors..."
1,"def predict(X_img_path, knn_clf=None, model_pa...","[knn_clf, X_img, X_face_locations, faces_encod..."
2,"def show_prediction_labels_on_image(img_path, ...","[pil_image, draw, name, text_height, text_width]"
3,"def _rect_to_css(rect):\n """"""\n Convert ...",
4,"def _trim_css_to_bounds(css, image_shape):\n ...",


In [75]:
# print() is used so the newlines in the masked code of row 0 render as line breaks.
print(result.loc[0, "masked_code"])

def predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):
    """
    Recognizes faces in given image using a trained KNN classifier

    :param X_img_path: path to image to be recognized
    :param knn_clf: (optional) a knn classifier object. if not specified, model_save_path must be specified.
    :param model_path: (optional) path to a pickled knn classifier. if not specified, model_save_path must be knn_clf.
    :param distance_threshold: (optional) distance threshold for face classification. the larger it is, the more chance
           of mis-classifying an unknown person as a known one.
    :return: a list of names and face locations for the recognized faces in the image: [(name, bounding box), ...].
        For faces of unrecognized persons, the name 'unknown' will be returned.
    """
    if not os.path.isfile(X_img_path) or os.path.splitext(X_img_path)[1][1:] not in ALLOWED_EXTENSIONS:
        raise Exception("Invalid image path: {}".format(X_img_path))



In [76]:
# Variable names recorded as masked for the same row (index 0).
print(result.loc[0, "masked_code_label"])

['knn_clf', 'X_img', 'X_face_locations', 'faces_encodings', 'closest_distances', 'are_matches']


### Baseline score 1: 

### Baseline score 2: 