In [None]:
'''
This notebook takes the Java methods in "preprocessed_final.csv" and tokenizes them. It then splits the data
into train, eval, and test sets and saves each split to a txt file.
'''

In [1]:
import pandas as pd

In [2]:
import sys
import time
from pygments.lexers.jvm import JavaLexer
from pygments.lexers import get_lexer_by_name
from pygments.token import Token

def print_progress(percent):
    """
    Redraw a one-line text progress bar on stdout.

    The line starts with a carriage return so each call overwrites the
    previous bar instead of printing a new line. No trailing newline is
    written.

    Args:
        percent: Completion percentage; the bar is 50 characters wide, so
            each '#' stands for two percentage points (rounded down).
    """
    bar_width = 50
    done = int(bar_width * percent / 100)
    remaining = bar_width - done
    line = "\r[{}{}] {}%".format("#" * done, "-" * remaining, percent)
    sys.stdout.write(line)
    sys.stdout.flush()

def tokenize_methods_from_dataframe(df: pd.DataFrame, method_column: str, language: str) -> pd.DataFrame:
    """
    Tokenize every method in ``method_column`` and store the result in a new column.

    Each method is lexed with the Pygments lexer for ``language`` and then
    normalized:

    * tokens whose type is in ``Token.Text`` (whitespace) are dropped;
    * every ``Token.Name.Function`` token is replaced by the literal ``"function"``;
    * between the ``(`` that follows a function name and the next ``)``, each
      ``Token.Name`` token whose first character is not uppercase is renamed to
      ``arg0``, ``arg1``, ...; later tokens with the same text reuse that
      placeholder;
    * all other tokens are kept verbatim.

    A progress bar is redrawn every 250 methods and once more at 100%.

    Args:
        df (pd.DataFrame): DataFrame containing the methods. It is modified in
            place (the new column is added to this object).
        method_column (str): Column name containing the method source text.
        language (str): Pygments lexer alias for the source language (e.g., 'java').

    Returns:
        pd.DataFrame: The same DataFrame object, with a new column
        'Tokenized Method' holding a list of token strings per row.
    """
    total_methods = len(df[method_column])
    # Honor the `language` argument and build the lexer once; it does not
    # change between rows, so there is no need to look it up per method.
    lexer = get_lexer_by_name(language)
    processed = 0

    def tokenize_method(code):
        tokens = []
        in_function_declaration = False
        in_arguments = False
        arguments = {}  # original argument name -> "argN" placeholder
        arg_num = 0
        for token_type, value in lexer.get_tokens(code):
            if token_type in Token.Text:
                # Whitespace carries no information for the token stream.
                continue
            if token_type in Token.Name.Function:
                tokens.append("function")
                in_function_declaration = True
            elif in_arguments and token_type in Token.Name and not value[0].isupper():
                # Names starting with an uppercase letter are left alone
                # (they fall through to the branches below).
                arguments[value] = "arg" + str(arg_num)
                arg_num += 1
                tokens.append(arguments[value])
            elif value in arguments:
                tokens.append(arguments[value])
            else:
                tokens.append(value)

            # Track whether we are inside the parameter list of a declaration.
            if in_function_declaration and value == '(':
                in_arguments = True
            elif in_arguments and value == ')':
                in_arguments = False
                in_function_declaration = False

        nonlocal processed
        processed += 1
        if processed % 250 == 0:
            print_progress(int((processed / total_methods) * 100))

        return tokens

    # Apply the function to the specified column and add a new column with the results
    df["Tokenized Method"] = df[method_column].apply(tokenize_method)
    print_progress(100)
    print()
    return df

In [3]:
# Load the preprocessed methods; the path is relative to the working directory.
# Later cells read the 'Method No Comments' column from this frame.
df = pd.read_csv("preprocessed_final.csv")

In [4]:
# Plain Python list of the raw (untokenized) method strings.
# NOTE(review): no other visible cell references this name — confirm it is still needed.
all_untokenized_methods = list(df['Method No Comments'])

In [7]:
# Tokenize every method. tokenize_methods_from_dataframe adds the
# 'Tokenized Method' column to `df` itself and returns that same object,
# so `tokenized_df` and `df` refer to the same DataFrame after this cell.
tokenized_df = tokenize_methods_from_dataframe(df, "Method No Comments", "java")

[##################################################] 100%


In [8]:
# Split sizes: 80% train, 10% eval; whatever is left (about 10%) becomes test.
train_size = int(0.8 * len(df))
eval_size = int(0.1 * len(df))

# Shuffle all rows with a fixed seed so the split is reproducible.
df_shuffled = tokenized_df.sample(frac=1, random_state=42)

# Carve the shuffled frame into three consecutive, non-overlapping slices.
train_df = df_shuffled.iloc[:train_size]
eval_df = df_shuffled.iloc[train_size:].iloc[:eval_size]
test_df = df_shuffled.iloc[train_size:].iloc[eval_size:]

In [9]:
def save_tokenized_method_to_txt(df: pd.DataFrame, filename: str):
  """
  Write the 'Tokenized Method' column to a UTF-8 text file, one method per line.

  Args:
      df (pd.DataFrame): Frame whose 'Tokenized Method' column holds a
          sequence of token strings for each row.
      filename (str): Path of the file to create or overwrite.
  """
  # Render every row first so the file is only opened once all lines exist.
  rendered = [" ".join(record["Tokenized Method"]) + "\n" for _, record in df.iterrows()]

  with open(filename, "w",  encoding="utf-8") as sink:
    sink.write("".join(rendered))

In [10]:
# Write the training split: one space-joined token sequence per line.
save_tokenized_method_to_txt(train_df, "student_train.txt")

In [15]:
# Write the test split: one space-joined token sequence per line.
save_tokenized_method_to_txt(test_df, "test.txt")

In [16]:
# Write the eval split: one space-joined token sequence per line.
save_tokenized_method_to_txt(eval_df, "eval.txt")

In [17]:
# Fixed-seed 100-row subsample of the test split.
# sample() draws without replacement by default, so this raises ValueError
# if test_df has fewer than 100 rows.
test_df_100 = test_df.sample(n=100, random_state=42)

In [18]:
# Display the subsample for a quick visual check.
test_df_100

Unnamed: 0,Commit Hash,File Name,Method Name,Method Code,Commit Link,Method No Comments,Tokenized Method
21534,1f89ec7101c872b2d0579d1e24847d69e6664ef8,HeaderRequestExtractor.java,doExtract,protected Optional<String[]> doExtract(fin...,https://www.github.com/dreamhead/moco/commit/1...,protected Optional<String[]> doExtract(fin...,"[protected, Optional, <, String, [, ], >, func..."
8356,13c82bbfdb768c67f364a5e6ab57267777234303,InvokeJCLTest.java,testPrintAPI,public void testPrintAPI() {\n Log log = ...,https://www.github.com/qos-ch/slf4j/commit/13c...,public void testPrintAPI() {\n Log log = ...,"[public, void, function, (, ), {, Log, log, =,..."
11383,89e93009ea58b35e20a89fd896be6d8d97cc6e33,LoggerFactory.java,reportIgnoredStaticLoggerBinders,private static void reportIgnoredStaticLog...,https://www.github.com/qos-ch/slf4j/commit/89e...,private static void reportIgnoredStaticLog...,"[private, static, void, function, (, Set, <, U..."
4760,3afde5830e5bc811dfaf214be86652e2c1f95489,ElementTest.java,testShallowCloneToString,public void testShallowCloneToString() {\n...,https://www.github.com/jhy/jsoup/commit/3afde5...,public void testShallowCloneToString() {\n...,"[public, void, function, (, ), {, Document, do..."
2568,01a62698797658b0f51d323fdfb5e9744d843773,HtmlTreeBuilder.java,clearStackToContext,private void clearStackToContext(String......,https://www.github.com/jhy/jsoup/commit/01a626...,private void clearStackToContext(String......,"[private, void, function, (, String, ., ., ., ..."
...,...,...,...,...,...,...,...
19063,13e373af765416613e66d2ecc1134a6edad57368,Moco.java,proxy,public static ResponseHandler proxy(final ...,https://www.github.com/dreamhead/moco/commit/1...,public static ResponseHandler proxy(final ...,"[public, static, ResponseHandler, function, (,..."
22984,464aa80fb3a3ad6d8c38b6efe5e81663b9ddf737,JsonSupport.java,assertEquals,public static void assertEquals(final Stri...,https://www.github.com/dreamhead/moco/commit/4...,public static void assertEquals(final Stri...,"[public, static, void, function, (, final, Str..."
19528,3ebcd63cf7116f700eae47fcc2429bb23401c14c,AbstractProxyResponseHandler.java,writeToResponse,public void writeToResponse(final SessionC...,https://www.github.com/dreamhead/moco/commit/3...,public void writeToResponse(final SessionC...,"[public, void, function, (, final, SessionCont..."
17082,eae2b0b57cb0e7810750932ac8549515766c5c89,HttpServerParser.java,createHttpServer,private HttpServer createHttpServer(List<S...,https://www.github.com/dreamhead/moco/commit/e...,private HttpServer createHttpServer(List<S...,"[private, HttpServer, function, (, List, <, Se..."


In [19]:
# Write the 100-method test subsample: one space-joined token sequence per line.
save_tokenized_method_to_txt(test_df_100, "test_100.txt")