In [1]:
import json
import re
import logging
import os

# Module-level logger; nothing in the visible cells writes to it.
logger = logging.getLogger(__name__)

# Matches one quoted string literal:
#   group 1 - any run of the prefix letters b, r, u, f (may be empty)
#   group 2 - the opening delimiter: triple double/single quotes or a single quote character
#   body    - repeated "backslash + one character" escapes or non-backslash characters,
#             each checked (negative lookahead) not to start with the group-2 delimiter
#   \2      - the closing delimiter, identical to the opening one
# The pattern is compiled without DOTALL, so a backslash directly followed by a
# newline is not consumed as an escape.
STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')

# Root directory that is walked for ``.jsonl`` dataset files.
# NOTE(review): absolute, machine-specific path - must be edited to run elsewhere.
lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'

def replace_string_literal(source):
    """
    Mask every string literal in ``source`` with the placeholder ``___STR``.

    A literal is whatever ``STRING_MATCHING_PATTERN`` matches; the whole match,
    including any prefix letters and both quote delimiters, is replaced.

    Args:
        source (str): Source code as a single string.

    Returns:
        str: ``source`` with each matched literal replaced by ``___STR``.

    """
    masked = STRING_MATCHING_PATTERN.sub('___STR', source)
    return masked

def parse_java_json_file(file_path):
    """
    Parse one JSON Lines file into string-masked sources and token strings.

    Every non-blank line is decoded as a JSON object. Its ``code`` field is
    stripped and its ``code_tokens`` list (empty if the key is absent) is joined
    with single spaces; both results are passed through
    ``replace_string_literal``.

    Args:
        file_path (str): Path to the ``.jsonl`` file, read as UTF-8.

    Returns:
        tuple[list[str], list[str]]: ``(sources, codes)``, two lists of equal
        length with one entry per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``code`` field.
    """
    sources = []
    codes = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line (e.g. an extra trailing newline) is not a record;
                # json.loads('') would raise and abort the whole file.
                continue
            data = json.loads(line)
            sources.append(replace_string_literal(data['code'].strip()))
            codes.append(replace_string_literal(' '.join(data.get('code_tokens', []))))
    return sources, codes


def iter_all_files(base):
    """
    Lazily yield the path of every file found beneath ``base``.

    Args:
        base (str): Directory to walk recursively with ``os.walk``.

    Yields:
        str: The directory path joined with the file name, for each file in
        each directory visited.
    """
    for dirpath, _subdirs, filenames in os.walk(base):
        yield from (os.path.join(dirpath, name) for name in filenames)

def iter_pre_train_dataset_files(lang_dir, lang="Java"):
    """
    Collect the paths of all ``.jsonl`` files found beneath ``lang_dir``.

    Args:
        lang_dir (str): Directory searched recursively via ``iter_all_files``.
        lang (str): Accepted but not used by this function.

    Returns:
        list[str]: Paths whose name ends with ``.jsonl``, in the order
        ``iter_all_files`` produced them. Despite the ``iter_`` prefix this is
        a fully built list, not a lazy iterator.
    """
    jsonl_paths = []
    for path in iter_all_files(base=lang_dir):
        if path.endswith('.jsonl'):
            jsonl_paths.append(path)
    return jsonl_paths

# Count the parsed records across every .jsonl file under lang_dir; each JSONL
# record contributes exactly one entry to the list of token strings.
file_paths = iter_pre_train_dataset_files(lang_dir)
total_lines = sum(len(parse_java_json_file(path)[1]) for path in file_paths)

print(f"Total lines of code processed: {total_lines}")
# file_paths = iter_pre_train_dataset_files(lang_dir)
# for file_path in file_paths:
#     sources, codes = parse_java_json_file(file_path)
#     for source, code in zip(sources, codes):
#         print(f"Source Code:\n{source}\n")
#         print(f"Code Tokens:\n{code}\n")
#     break

Total lines of code processed: 181061


In [2]:
lang_dir = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/pre_train/java'
output_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/preprocess/pretrain_code_tokens.txt'

# Every token string already written; a repeat of any of these is skipped so the
# output file holds each distinct entry once, in first-seen order.
unique_codes = set()

with open(output_path, 'w', encoding='utf-8') as output_file:
    file_paths = iter_pre_train_dataset_files(lang_dir)
    for file_path in file_paths:
        for code in parse_java_json_file(file_path)[1]:
            if code in unique_codes:
                continue
            unique_codes.add(code)
            output_file.write(f"{code}\n")

# One line is written per entry added to the set, so its size is the line count.
line_count = len(unique_codes)

print(f"Total unique lines written to output: {line_count}")

Total unique lines written to output: 181060
