In [None]:
import json
import re
import logging
import os

logger = logging.getLogger(__name__)

STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')
NON_SPACE_MATCHING_PATTERN = re.compile(r'\S')

lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'


def trim_method_name(full_name):
    """
    Extract method/function name from its full name,
    e.g., RpcResponseResolver.resolveResponseObject -> resolveResponseObject

    Args:
        full_name (str): Full name

    Returns:
        str: Method/Function name

    """
    point_pos = full_name.rfind('.')
    if point_pos != -1:
        return full_name[point_pos + 1:]
    else:
        return full_name

def remove_comments_and_docstrings(source):
    """
    Remove docs and comments from source string.
    Thanks to authors of GraphCodeBERT
    from: https://github.com/microsoft/CodeBERT/blob/master/GraphCodeBERT/codesearch/parser/utils.py#L4

    Args:
        source (str): Source code string
        lang (str): Source code language

    Returns:
        str: Source string

    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " "  # note: a space and not an empty string
        else:
            return s

    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    temp = []
    for x in re.sub(pattern, replacer, source).split('\n'):
        if x.strip() != "":
            temp.append(x)
    return '\n'.join(temp)


def remove_unwanted_characters(source):
    return source.replace('\n', '').replace('\t', '').replace('\r', '')

def parse_java_json_file(file_path):
    # sources, codes, names = [], [], []
    sources = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line.strip())
            name = trim_method_name(data['func_name'])
            source = data['code'].strip()
            source = remove_comments_and_docstrings(source)
            clean_code = remove_unwanted_characters(source)
            source = replace_string_literal(clean_code)

            sources.append(source)
            

    return sources


def replace_string_literal(source):
    return re.sub(pattern=STRING_MATCHING_PATTERN, repl='___STR', string=source)

def trim_method_name(full_name):
    return full_name.split('.')[-1]

def iter_all_files(base):
    for root, dirs, files in os.walk(base):
        for f in files:
            yield os.path.join(root, f)

def iter_pre_train_dataset_files(lang_dir, lang="Java"):
    return [file for file in iter_all_files(base=lang_dir) if file.endswith('.jsonl')]

# Iterate over files and print the outputs
file_paths = iter_pre_train_dataset_files(lang_dir)
for file_path in file_paths:
    # sources, codes, names = parse_java_json_file(file_path)
    sources = parse_java_json_file(file_path)
    # for source, code, name in zip(sources, codes, names):
    for source in zip(sources):
    #     print(f"File: {file_path}")
    #     print(f"Method Name: {name}")
          print(f"Source Code: {source}")
        # print(f"Tokenized Code: {code}\n")
    break


For Non-Gen dataset

In [3]:
import json
import re
import logging
import os

logger = logging.getLogger(__name__)

STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')
NON_SPACE_MATCHING_PATTERN = re.compile(r'\S')

lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'


def trim_method_name(full_name):
    """
    Extract method/function name from its full name,
    e.g., RpcResponseResolver.resolveResponseObject -> resolveResponseObject

    Args:
        full_name (str): Full name

    Returns:
        str: Method/Function name

    """
    point_pos = full_name.rfind('.')
    if point_pos != -1:
        return full_name[point_pos + 1:]
    else:
        return full_name

def remove_comments_and_docstrings(source):
    """
    Remove docs and comments from source string.
    Thanks to authors of GraphCodeBERT
    from: https://github.com/microsoft/CodeBERT/blob/master/GraphCodeBERT/codesearch/parser/utils.py#L4

    Args:
        source (str): Source code string
        lang (str): Source code language

    Returns:
        str: Source string

    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " "  # note: a space and not an empty string
        else:
            return s

    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    temp = []
    for x in re.sub(pattern, replacer, source).split('\n'):
        if x.strip() != "":
            temp.append(x)
    return '\n'.join(temp)


def remove_unwanted_characters(source):
    return source.replace('\n', '').replace('\t', '').replace('\r', '')

def parse_java_json_file(file_path):
    # sources, codes, names = [], [], []
    sources = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line.strip())
            name = trim_method_name(data['func_name'])
            source = data['code'].strip()
            source = remove_comments_and_docstrings(source)
            clean_code = remove_unwanted_characters(source)
            # #added
            # if clean_code.startswith('"') and clean_code.endswith('"'):
            #     clean_code = clean_code[1:-1]
            # #-----
            source = clean_code
            sources.append(source)

    return sources


# def replace_string_literal(source):
#     return re.sub(pattern=STRING_MATCHING_PATTERN, repl='___STR', string=source)

def trim_method_name(full_name):
    return full_name.split('.')[-1]

def iter_all_files(base):
    for root, dirs, files in os.walk(base):
        for f in files:
            yield os.path.join(root, f)

def iter_pre_train_dataset_files(lang_dir, lang="Java"):
    return [file for file in iter_all_files(base=lang_dir) if file.endswith('.jsonl')]

# Iterate over files and print the outputs
file_paths = iter_pre_train_dataset_files(lang_dir)
for file_path in file_paths:
    # sources, codes, names = parse_java_json_file(file_path)
    sources = parse_java_json_file(file_path)
    # for source, code, name in zip(sources, codes, names):
    for source in zip(sources):
    #     print(f"File: {file_path}")
    #     print(f"Method Name: {name}")
          print(f"Source Code: {source}")
        # print(f"Tokenized Code: {code}\n")
    break


Source Code: ('@CanIgnoreReturnValue  public long copyTo(CharSink sink) throws IOException {    checkNotNull(sink);    Closer closer = Closer.create();    try {      Reader reader = closer.register(openStream());      Writer writer = closer.register(sink.openStream());      return CharStreams.copy(reader, writer);    } catch (Throwable e) {      throw closer.rethrow(e);    } finally {      closer.close();    }  }',)
Source Code: ('public String read() throws IOException {    Closer closer = Closer.create();    try {      Reader reader = closer.register(openStream());      return CharStreams.toString(reader);    } catch (Throwable e) {      throw closer.rethrow(e);    } finally {      closer.close();    }  }',)
Source Code: ('public ImmutableList<String> readLines() throws IOException {    Closer closer = Closer.create();    try {      BufferedReader reader = closer.register(openBufferedStream());      List<String> result = Lists.newArrayList();      String line;      while ((line = rea

In [4]:

lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'

output_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/spt-code/sources/data/asts/ast_jdt/longmethod.txt'

file_paths = iter_pre_train_dataset_files(lang_dir, 'Java')
with open(output_path, 'w', encoding='utf-8') as output_file:
    for file_path in file_paths:
        sources = parse_java_json_file(file_path)
        for source in sources:
            output_file.write(source + "\n")
        #break  

New Extraction (11/04/2024)

In [2]:
import json
import re
import logging
import os

logger = logging.getLogger(__name__)

STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')
NON_SPACE_MATCHING_PATTERN = re.compile(r'\S')

lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'


def trim_method_name(full_name):
    """
    Extract method/function name from its full name,
    e.g., RpcResponseResolver.resolveResponseObject -> resolveResponseObject

    Args:
        full_name (str): Full name

    Returns:
        str: Method/Function name

    """
    point_pos = full_name.rfind('.')
    if point_pos != -1:
        return full_name[point_pos + 1:]
    else:
        return full_name


def replace_string_literal(source):
    """
    Replace the string literal in source code with ``<STR>``.

    Args:
        source (str): Source code in string

    Returns:
        str: Code after replaced

    """
    return re.sub(pattern=STRING_MATCHING_PATTERN, repl='___STR', string=source)

def remove_comments_and_docstrings(source):
    """
    Remove docs and comments from source string.
    Thanks to authors of GraphCodeBERT
    from: https://github.com/microsoft/CodeBERT/blob/master/GraphCodeBERT/codesearch/parser/utils.py#L4

    Args:
        source (str): Source code string
        lang (str): Source code language

    Returns:
        str: Source string

    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " " 
        else:
            return s

    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    temp = []
    for x in re.sub(pattern, replacer, source).split('\n'):
        if x.strip() != "":
            temp.append(x)
    return '\n'.join(temp)


def remove_unwanted_characters(source):
    return source.replace('\n', '').replace('\t', '').replace('\r', '')

def parse_java_json_file(file_path):
    # sources, codes, names = [], [], []
    sources = []
    codes =[]
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line.strip())
            name = trim_method_name(data['func_name'])
            source = data['code'].strip()
            source = remove_comments_and_docstrings(source)
            source = replace_string_literal(source)
            code = replace_string_literal(' '.join(data['code_tokens']))
            # clean_code = remove_unwanted_characters(source)
            # source = clean_code
            sources.append(source)
            codes.append(code)
    return sources,codes


def trim_method_name(full_name):
    return full_name.split('.')[-1]

def iter_all_files(base):
    for root, dirs, files in os.walk(base):
        for f in files:
            yield os.path.join(root, f)

def iter_pre_train_dataset_files(lang_dir, lang="Java"):
    return [file for file in iter_all_files(base=lang_dir) if file.endswith('.jsonl')]

file_paths = iter_pre_train_dataset_files(lang_dir)
for file_path in file_paths:
    sources, codes = parse_java_json_file(file_path)
    for source, code in zip(sources, codes):
        print(f"Source Code:\n{source}\n")
        print(f"Code Tokens:\n{code}\n")
    break

Source Code:
@CanIgnoreReturnValue
  public long copyTo(CharSink sink) throws IOException {
    checkNotNull(sink);
    Closer closer = Closer.create();
    try {
      Reader reader = closer.register(openStream());
      Writer writer = closer.register(sink.openStream());
      return CharStreams.copy(reader, writer);
    } catch (Throwable e) {
      throw closer.rethrow(e);
    } finally {
      closer.close();
    }
  }

Code Tokens:
public long copyTo ( CharSink sink ) throws IOException { checkNotNull ( sink ) ; Closer closer = Closer . create ( ) ; try { Reader reader = closer . register ( openStream ( ) ) ; Writer writer = closer . register ( sink . openStream ( ) ) ; return CharStreams . copy ( reader , writer ) ; } catch ( Throwable e ) { throw closer . rethrow ( e ) ; } finally { closer . close ( ) ; } }

Source Code:
public String read() throws IOException {
    Closer closer = Closer.create();
    try {
      Reader reader = closer.register(openStream());
      return Char

In [3]:
lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'

output_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/spt-code/sources/data/asts/ast_jdt/pretrain_gen.txt'

file_paths = iter_pre_train_dataset_files(lang_dir, 'Java')
with open(output_path, 'w', encoding='utf-8') as output_file:
    for file_path in file_paths:
        _, codes = parse_java_json_file(file_path)
        for code in codes:
            output_file.write(code + "\n")
        #break  

In [3]:
import re

# Define the input and output file paths
input_file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/asts/pretrain/ast_jdt_output_file.txt'  # Replace with your input file path
output_file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/spt-code/sources/data/asts/ast_jdt/ast_outputs_pretrain.txt'

# Regular expression to match the Asts value
pattern = r'Asts: "(.*?)"'

# Open the input file for reading and output file for writing
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    # Read file line by line
    for line in infile:
        # Search for the Asts value in the current line
        match = re.search(pattern, line)
        if match:
            # Extract the ASTs value
            asts_value = match.group(1)
            # Write the ASTs value to the output file
            outfile.write(asts_value + '\n')

print(f"ASTs values have been extracted and saved to {output_file_path}")


ASTs values have been extracted and saved to /home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/spt-code/sources/data/asts/ast_jdt/ast_outputs_pretrain.txt


For Finetuning part:

In [None]:
import json
import re
import logging
import os

logger = logging.getLogger(__name__)

STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')
NON_SPACE_MATCHING_PATTERN = re.compile(r'\S')

lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json'


def trim_method_name(full_name):
    """
    Extract method/function name from its full name,
    e.g., RpcResponseResolver.resolveResponseObject -> resolveResponseObject

    Args:
        full_name (str): Full name

    Returns:
        str: Method/Function name

    """
    point_pos = full_name.rfind('.')
    if point_pos != -1:
        return full_name[point_pos + 1:]
    else:
        return full_name

def remove_comments_and_docstrings(source):
    """
    Remove docs and comments from source string.
    Thanks to authors of GraphCodeBERT
    from: https://github.com/microsoft/CodeBERT/blob/master/GraphCodeBERT/codesearch/parser/utils.py#L4

    Args:
        source (str): Source code string
        lang (str): Source code language

    Returns:
        str: Source string

    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " "  # note: a space and not an empty string
        else:
            return s

    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    temp = []
    for x in re.sub(pattern, replacer, source).split('\n'):
        if x.strip() != "":
            temp.append(x)
    return '\n'.join(temp)


def remove_unwanted_characters(source):
    return source.replace('\n', '').replace('\t', '').replace('\r', '')

def parse_java_json_file(file_path):
    # sources, codes, names = [], [], []
    sources = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line.strip())
            left_context = replace_string_literal(data['left_context']).replace('\n', '\\n').replace('=', '\\u003d').replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\t", "\\t")
            right_context = replace_string_literal(data['right_context']).replace('\n', '\\n').replace('=', '\\u003d').replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\t", "\\t")
            target_seq = replace_string_literal(data['target_seq']).replace('\n', '\\n').replace('=', '\\u003d').replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\t", "\\t")
            formatted_string = f"{left_context} {target_seq} {right_context}"
            clean_code = remove_comments_and_docstrings(formatted_string)
            source = remove_unwanted_characters(clean_code)

            sources.append(source)
            

    return sources


def replace_string_literal(source):
    return re.sub(pattern=STRING_MATCHING_PATTERN, repl='___STR', string=source)

def trim_method_name(full_name):
    return full_name.split('.')[-1]

def iter_all_files(base):
    for root, dirs, files in os.walk(base):
        for f in files:
            yield os.path.join(root, f)

def iter_pre_train_dataset_files(lang_dir):
    return [file for file in iter_all_files(base=lang_dir) if file.endswith('.json')]

# Iterate over files and print the outputs
file_paths = iter_pre_train_dataset_files(lang_dir)
for file_path in file_paths:
    # sources, codes, names = parse_java_json_file(file_path)
    sources = parse_java_json_file(file_path)
    # for source, code, name in zip(sources, codes, names):
    for source in zip(sources):
    #     print(f"File: {file_path}")
    #     print(f"Method Name: {name}")
          print(f"Source Code: {source}")
        # print(f"Tokenized Code: {code}\n")
    break


In [4]:
output_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune_methods.txt'

file_paths = iter_pre_train_dataset_files(lang_dir)
with open(output_path, 'w', encoding='utf-8') as output_file:
    for file_path in file_paths:
        sources = parse_java_json_file(file_path)
        for source in sources:
            output_file.write(json.dumps(source) + "\n")