In [None]:
'''
This notebook was used to preprocess the data collected during the extraction phase (see PyDriller.ipynb) and combine
the extracted methods (from 5 GitHub repositories) into a single CSV file, "preprocessed_final.csv".
'''

In [1]:
import pandas as pd
import re

In [2]:
### Type 1 Clones ###
def remove_duplicates(data):
    """Drop rows whose "Method Code" text repeats that of an earlier row.

    Only the first occurrence of each distinct method body is retained.
    The comparison is on the raw text (comments included), so this is close
    to, but not exactly, Type-1 clone removal.
    """
    deduplicated = data.drop_duplicates(subset=["Method Code"], keep="first")
    return deduplicated

In [3]:
def filter_ascii_methods(data):
    """Keep only rows whose "Method Code" is an ASCII-only string.

    Rows whose "Method Code" is not a string at all (for example a missing
    value read from the CSV) are dropped as well, instead of aborting the
    whole filter with a TypeError.

    Args:
        data (pd.DataFrame): Frame with a "Method Code" column.

    Returns:
        pd.DataFrame: The rows of ``data`` that passed the check; the index
        labels of the surviving rows are unchanged.
    """
    def is_ascii_text(code):
        # str.isascii() is True for the empty string, matching all() over
        # an empty sequence of characters.
        return isinstance(code, str) and code.isascii()

    ascii_mask = data["Method Code"].apply(is_ascii_text)
    return data[ascii_mask]

In [4]:
def remove_outliers(data, lower_percentile=5, upper_percentile=95):
    """Keep rows whose "Method Code" length lies inside a percentile band.

    Length is the character count of the "Method Code" text. Rows shorter
    than the ``lower_percentile`` length or longer than the
    ``upper_percentile`` length are discarded; both bounds are inclusive.
    """
    lengths = data["Method Code"].apply(len)

    # Percentiles are given on a 0-100 scale; quantile() expects 0-1.
    shortest_allowed = lengths.quantile(lower_percentile / 100)
    longest_allowed = lengths.quantile(upper_percentile / 100)

    long_enough = lengths >= shortest_allowed
    short_enough = lengths <= longest_allowed
    return data[long_enough & short_enough]

In [5]:
def remove_boilerplate_methods(data):
    """Drop rows whose "Method Name" looks like a setter or a getter.

    A name counts as boilerplate when ``set`` or ``get``, starting at a word
    boundary, is immediately followed by an uppercase letter (``setName``,
    ``getValue``). Names such as ``settings``, ``reset`` or ``get`` are kept.
    """
    accessor_regex = re.compile(
        "|".join((
            r"\bset[A-Z][a-zA-Z0-9_]*",  # Setter methods
            r"\bget[A-Z][a-zA-Z0-9_]*",  # Getter methods
        ))
    )

    def is_boilerplate(name):
        return accessor_regex.search(name) is not None

    boilerplate_mask = data["Method Name"].apply(is_boilerplate)
    return data[~boilerplate_mask]

In [6]:
def remove_invalid_functions(df):
    """Drop rows whose "Method Code" contains more than one function-name token.

    Each snippet is tokenized with the Pygments Java lexer and the tokens
    that Pygments classifies under ``Token.Name.Function`` are counted. Rows
    with at most one such token are kept.

    Rows are selected by position rather than by index label, so a frame
    with repeated index labels does not lose extra rows that merely share a
    label with a rejected row.

    Args:
        df (pd.DataFrame): Frame with a "Method Code" column of source strings.

    Returns:
        pd.DataFrame: A new frame with the surviving rows, in their original
        order and with their original index labels.
    """
    lexer = get_lexer_by_name("Java")

    def count_function_names(method):
        return sum(
            1
            for token_type, _ in lexer.get_tokens(method)
            if token_type in Token.Name.Function
        )

    keep_positions = [
        position
        for position, method in enumerate(df['Method Code'])
        if count_function_names(method) <= 1
    ]
    # iloc with an (optionally empty) list of positions keeps all columns.
    return df.iloc[keep_positions]

In [7]:
from pygments.lexers.jvm import JavaLexer
from pygments.lexers import get_lexer_by_name
from pygments.token import Token

In [8]:
def remove_comments_from_dataframe(df: pd.DataFrame, method_column: str, language: str) -> pd.DataFrame:
    """
    Strips comments from the methods in one column and stores the result in a new
    "Method No Comments" column.

    The column is added to ``df`` itself (the input frame is modified in place) and the
    same object is returned. The source column is left untouched.

    Args:
        df (pd.DataFrame): DataFrame containing the methods.
        method_column (str): Column name containing the raw methods.
        language (str): Pygments lexer name for the methods' language (e.g., 'java').

    Returns:
        pd.DataFrame: ``df`` with the additional "Method No Comments" column.
    """
    # Build the lexer once: it depends only on `language`, not on the row.
    lexer = get_lexer_by_name(language)

    def remove_comments(code):
        # Re-join the text of every token that is not under Token.Comment.
        # NOTE(review): Pygments' get_tokens may normalise leading/trailing
        # newlines (lexer options stripnl/ensurenl), so the result is not
        # guaranteed to equal the input minus comments byte for byte - confirm
        # this is acceptable downstream.
        return ''.join(
            text
            for token_type, text in lexer.get_tokens(code)
            if token_type not in Token.Comment
        )

    df["Method No Comments"] = df[method_column].apply(remove_comments)
    return df

In [9]:
# Load the per-repository extraction results; malformed CSV lines are skipped
# (on_bad_lines="skip") instead of raising.
df1, df2, df3, df4 = [
    pd.read_csv(csv_path, on_bad_lines="skip")
    for csv_path in (
        "extracted_methods__jhy_jsoup.csv",
        "extracted_methods__mpatric_mp3agic.csv",
        "extracted_methods__qos-ch_slf4j.csv",
        "extracted_methods__tootallnate_java-websocket.csv",
    )
]

In [10]:
def preprocess(df):
    """Run the per-repository cleaning steps and report the row count after each.

    Steps, in order: duplicate removal, ASCII-only filter, setter/getter
    removal, and removal of snippets with more than one function name.
    Returns the filtered frame.
    """
    print("Initial dataset size:", len(df))

    stages = (
        ("After removing duplicates:", remove_duplicates),
        ("After filtering ASCII methods:", filter_ascii_methods),
        ("After removing boilerplate methods:", remove_boilerplate_methods),
        ("After removing invalid functions:", remove_invalid_functions),
    )
    for report_label, stage in stages:
        df = stage(df)
        print(report_label, len(df))

    return df

In [11]:
# jsoup (read from extracted_methods__jhy_jsoup.csv): run the shared cleaning pipeline.
df1 = preprocess(df1)

Initial dataset size: 161917
After removing duplicates: 12481
After filtering ASCII methods: 12310
After removing boilerplate methods: 11932
After removing invalid functions: 7697


In [12]:
# mp3agic (read from extracted_methods__mpatric_mp3agic.csv): run the shared cleaning pipeline.
df2 = preprocess(df2)

Initial dataset size: 11063
After removing duplicates: 3114
After filtering ASCII methods: 3112
After removing boilerplate methods: 1775
After removing invalid functions: 1647


In [13]:
# slf4j (read from extracted_methods__qos-ch_slf4j.csv): run the shared cleaning pipeline.
df3 = preprocess(df3)

Initial dataset size: 36783
After removing duplicates: 8032
After filtering ASCII methods: 8032
After removing boilerplate methods: 6811
After removing invalid functions: 4801


In [14]:
# java-websocket (read from extracted_methods__tootallnate_java-websocket.csv): run the shared cleaning pipeline.
df4 = preprocess(df4)

Initial dataset size: 42066
After removing duplicates: 8198
After filtering ASCII methods: 8194
After removing boilerplate methods: 7489
After removing invalid functions: 4583


In [17]:
# oshi: load the extraction output (malformed CSV lines are skipped) and clean it.
# NOTE(review): df5 does not appear in the list passed to pd.concat later in this
# notebook - confirm whether oshi is intentionally excluded from the final dataset.
df5 = pd.read_csv("extracted_methods__oshi_oshi.csv", on_bad_lines="skip")
df5 = preprocess(df5)

Initial dataset size: 97168
After removing duplicates: 16158
After filtering ASCII methods: 15830
After removing boilerplate methods: 7582
After removing invalid functions: 6210


In [18]:
# moco: load the extraction output (malformed CSV lines are skipped) and clean it.
df6 = pd.read_csv("extracted_methods__dreamhead_moco.csv", on_bad_lines="skip")
df6 = preprocess(df6)

Initial dataset size: 93788
After removing duplicates: 12033
After filtering ASCII methods: 12019
After removing boilerplate methods: 10880
After removing invalid functions: 9852


In [19]:
# Stack the cleaned per-repository frames into one frame with a fresh 0..n-1 index.
# NOTE(review): df5 (oshi) is preprocessed earlier but is not part of this list -
# confirm that leaving it out is intentional.
df_concat = pd.concat([df1, df2, df3, df4, df6], ignore_index=True)

In [20]:
# Preview the combined frame (pandas abbreviates long frames and long cell values).
df_concat

Unnamed: 0,Commit Hash,File Name,Method Name,Method Code,Commit Link
0,2bc420589478b3cb01398cd9eb233be25b73b7c0,Position.java,incOffset,public int incOffset() {\n return o...,https://www.github.com/jhy/jsoup/commit/2bc420...
1,2bc420589478b3cb01398cd9eb233be25b73b7c0,Position.java,incLineNum,public int incLineNum() {\n return ...,https://www.github.com/jhy/jsoup/commit/2bc420...
2,2bc420589478b3cb01398cd9eb233be25b73b7c0,Position.java,incColNum,public int incColNum() {\n return c...,https://www.github.com/jhy/jsoup/commit/2bc420...
3,2bc420589478b3cb01398cd9eb233be25b73b7c0,Position.java,clone,protected Position clone() {\n try ...,https://www.github.com/jhy/jsoup/commit/2bc420...
4,2bc420589478b3cb01398cd9eb233be25b73b7c0,Position.java,equals,public boolean equals(Object o) {\n ...,https://www.github.com/jhy/jsoup/commit/2bc420...
...,...,...,...,...,...
28575,04ae900f27e01321655d258dac14e387dc0b4024,HttpRequestDumper.java,dump,public String dump(final Request request) ...,https://www.github.com/dreamhead/moco/commit/0...
28576,04ae900f27e01321655d258dac14e387dc0b4024,HttpRequestDumper.java,requestProtocolLine,private String requestProtocolLine(final H...,https://www.github.com/dreamhead/moco/commit/0...
28577,04ae900f27e01321655d258dac14e387dc0b4024,MocoExtension.java,afterEach,public void afterEach(final ExtensionConte...,https://www.github.com/dreamhead/moco/commit/0...
28578,5a7405cbd9744bb7c8fecc545e41796186d9b6e0,HttpRequestDumper.java,requestProtocolLine,private String requestProtocolLine(final H...,https://www.github.com/dreamhead/moco/commit/5...


In [21]:
# Add a "Method No Comments" column holding each method with its comments stripped;
# the original "Method Code" column is kept as is.
df_concat = remove_comments_from_dataframe(df_concat, "Method Code", "Java")

In [22]:
# Trim the shortest and longest methods using the default 5th-95th percentile band.
# NOTE(review): remove_outliers measures the length of "Method Code" (comments
# included), not the "Method No Comments" column - confirm that is the intended basis.
df_final = remove_outliers(df_concat)
print("After removing outliers:", len(df_final))

After removing outliers: 25773
