Save raw methods (RAW method: leftContext + target_seq + rightContext)

In [1]:
import json
import re

# Matches one quoted literal: an optional run of the letters b/r/u/f, then a
# body delimited by ', ", ''' or """ and closed by the same delimiter that
# opened it. Inside the body a backslash is consumed together with the
# character that follows it, so an escaped quote does not end the literal.
STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')


def replace_string_literal(source):
    """Return ``source`` with every quoted literal collapsed to ``___STR``.

    Args:
        source: Source-code text to scan.

    Returns:
        A copy of ``source`` in which each match of
        ``STRING_MATCHING_PATTERN`` (including any b/r/u/f letters directly
        in front of the opening quote) is replaced by the token ``___STR``.
        Single-quoted literals are matched too, so a character literal such
        as ``'a'`` is also replaced.
    """
    masked = STRING_MATCHING_PATTERN.sub('___STR', source)
    return masked

def process_java_code(code):
    """Mask string literals in ``code`` and strip a fixed set of characters.

    Args:
        code: Java source text for one fragment of a method.

    Returns:
        The text after ``replace_string_literal`` has been applied and every
        newline, ``=``, backslash, double quote, carriage return and tab has
        been deleted.
    """
    cleaned = replace_string_literal(code)
    # The characters are deleted, not replaced by a space, so tokens that were
    # separated only by a newline or tab end up adjacent in the result.
    for unwanted in ('\n', '=', "\\", "\"", "\r", "\t"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

# Build the "raw" validation file: for every JSON record in the input (one
# record per line) emit the cleaned method as
# left_context + target_seq + right_context, wrapped in double quotes.
file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/java-small.val.json'
output_file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune_raw_valid_final.txt'

results = []
line_count = 0  # stays 0 when the input file is empty

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_count, line in enumerate(file, start=1):
        try:
            json_object = json.loads(line)
            left_context = process_java_code(json_object['left_context'])
            right_context = process_java_code(json_object['right_context'])
            target_seq = process_java_code(json_object['target_seq'])
        except json.JSONDecodeError:
            print(f"Error decoding JSON on input line {line_count}")
            results.append(f"\"Error in JSON format on line {line_count}\"")
        except (KeyError, TypeError) as error:
            # The line is valid JSON but not a record with the three string
            # fields (missing key, non-dict value, non-string field). Emit the
            # same placeholder so the output keeps exactly one line per input
            # line instead of aborting the whole run.
            print(f"Unexpected record structure on input line {line_count}: {error!r}")
            results.append(f"\"Error in JSON format on line {line_count}\"")
        else:
            formatted_string = f"{left_context} {target_seq} {right_context}"
            enclosed_string = f"\"{formatted_string}\""
            results.append(enclosed_string)

print(f"Total input lines read: {line_count}")
print(f"Total output lines prepared: {len(results)}")

# Write with the same explicit encoding used for reading.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(f"{item}\n" for item in results)

print(f"Data has been processed and saved to {output_file_path}.")


Total input lines read: 10000
Total output lines prepared: 10000
Data has been processed and saved to /home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune_raw_valid_final.txt.


Save {PRED} methods (PRED method: leftContext + PRED + rightContext)

In [2]:
import json
import re

# Matches one quoted literal: an optional run of the letters b/r/u/f, then a
# body delimited by ', ", ''' or """ and closed by the same delimiter that
# opened it. Inside the body a backslash is consumed together with the
# character that follows it, so an escaped quote does not end the literal.
STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')


def replace_string_literal(source):
    """Return ``source`` with every quoted literal collapsed to ``___STR``.

    Args:
        source: Source-code text to scan.

    Returns:
        A copy of ``source`` in which each match of
        ``STRING_MATCHING_PATTERN`` (including any b/r/u/f letters directly
        in front of the opening quote) is replaced by the token ``___STR``.
        Single-quoted literals are matched too, so a character literal such
        as ``'a'`` is also replaced.
    """
    masked = STRING_MATCHING_PATTERN.sub('___STR', source)
    return masked

def process_java_code(code):
    """Mask string literals in ``code`` and strip a fixed set of characters.

    Args:
        code: Java source text for one fragment of a method.

    Returns:
        The text after ``replace_string_literal`` has been applied and every
        newline, ``=``, backslash, double quote, carriage return and tab has
        been deleted.
    """
    cleaned = replace_string_literal(code)
    # The characters are deleted, not replaced by a space, so tokens that were
    # separated only by a newline or tab end up adjacent in the result.
    for unwanted in ('\n', '=', "\\", "\"", "\r", "\t"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

# Build the "PRED" validation file: for every JSON record in the input (one
# record per line) emit the cleaned method with the target sequence replaced
# by the literal token PRED, wrapped in double quotes.
file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/java-small.val.json'
output_file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune_pred_valid_final.txt'

results = []
line_count = 0  # stays 0 when the input file is empty

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_count, line in enumerate(file, start=1):
        try:
            json_object = json.loads(line)
            # Only the two context fields are needed here; the target
            # sequence is replaced by the PRED placeholder, so it is not
            # processed at all.
            left_context = process_java_code(json_object['left_context'])
            right_context = process_java_code(json_object['right_context'])
        except json.JSONDecodeError:
            print(f"Error decoding JSON on input line {line_count}")
            results.append(f"\"Error in JSON format on line {line_count}\"")
        except (KeyError, TypeError) as error:
            # The line is valid JSON but not a record with the expected string
            # fields (missing key, non-dict value, non-string field). Emit the
            # same placeholder so the output keeps exactly one line per input
            # line instead of aborting the whole run.
            print(f"Unexpected record structure on input line {line_count}: {error!r}")
            results.append(f"\"Error in JSON format on line {line_count}\"")
        else:
            formatted_string = f"{left_context} PRED {right_context}"
            enclosed_string = f"\"{formatted_string}\""
            results.append(enclosed_string)

print(f"Total input lines read: {line_count}")
print(f"Total output lines prepared: {len(results)}")

# Write with the same explicit encoding used for reading.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(f"{item}\n" for item in results)

print(f"Data has been processed and saved to {output_file_path}.")

Total input lines read: 10000
Total output lines prepared: 10000
Data has been processed and saved to /home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune_pred_valid_final.txt.


Save both lines (the complete method and its PRED-masked version) for each training record in a single file:

In [5]:
import json
import re

# Matches one quoted literal: an optional run of the letters b/r/u/f, then a
# body delimited by ', ", ''' or """ and closed by the same delimiter that
# opened it. Inside the body a backslash is consumed together with the
# character that follows it, so an escaped quote does not end the literal.
STRING_MATCHING_PATTERN = re.compile(r'([bruf]*)(\"\"\"|\'\'\'|\"|\')(?:(?!\2)(?:\\.|[^\\]))*\2')


def replace_string_literal(source):
    """Return ``source`` with every quoted literal collapsed to ``___STR``.

    Args:
        source: Source-code text to scan.

    Returns:
        A copy of ``source`` in which each match of
        ``STRING_MATCHING_PATTERN`` (including any b/r/u/f letters directly
        in front of the opening quote) is replaced by the token ``___STR``.
        Single-quoted literals are matched too, so a character literal such
        as ``'a'`` is also replaced.
    """
    masked = STRING_MATCHING_PATTERN.sub('___STR', source)
    return masked

def process_java_code(code):
    """Mask string literals in ``code`` and strip a fixed set of characters.

    Args:
        code: Java source text for one fragment of a method.

    Returns:
        The text after ``replace_string_literal`` has been applied and every
        newline, ``=``, backslash, double quote, carriage return and tab has
        been deleted.
    """
    cleaned = replace_string_literal(code)
    # The characters are deleted, not replaced by a space, so tokens that were
    # separated only by a newline or tab end up adjacent in the result.
    for unwanted in ('\n', '=', "\\", "\"", "\r", "\t"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

# Build the combined training file: for every JSON record in the input (one
# record per line) emit two consecutive lines, first the complete method
# (left_context + target_seq + right_context) and then the same method with
# the target sequence replaced by the literal token PRED.
file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/java-small.train.json'
output_file_path = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune-train-pre-raw.txt'

results = []
line_count = 0  # stays 0 when the input file is empty

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_count, line in enumerate(file, start=1):
        try:
            json_object = json.loads(line)
            left_context = process_java_code(json_object['left_context'])
            right_context = process_java_code(json_object['right_context'])
            target_seq = process_java_code(json_object['target_seq'])
        except json.JSONDecodeError:
            print(f"Error decoding JSON on input line {line_count}")
            # Two placeholders keep the two-lines-per-record layout intact.
            results.append(f"Error in JSON format on line {line_count}")
            results.append(f"Error in JSON format on line {line_count}")
        except (KeyError, TypeError) as error:
            # The line is valid JSON but not a record with the three string
            # fields (missing key, non-dict value, non-string field). Emit the
            # same pair of placeholders instead of aborting the whole run.
            print(f"Unexpected record structure on input line {line_count}: {error!r}")
            results.append(f"Error in JSON format on line {line_count}")
            results.append(f"Error in JSON format on line {line_count}")
        else:
            # Formatted string with target sequence included
            complete_method = f"{left_context} {target_seq} {right_context}"
            # Formatted string with 'PRED' placeholder
            predicted_method = f"{left_context} PRED {right_context}"

            # Append both strings to the results, each on a new line
            results.append(complete_method)
            results.append(predicted_method)

print(f"Total input lines read: {line_count}")
print(f"Total output lines prepared: {len(results)}")

# Write with the same explicit encoding used for reading.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(f"{item}\n" for item in results)

print(f"Data has been processed and saved to {output_file_path}.")


Total input lines read: 1309842
Total output lines prepared: 2619684
Data has been processed and saved to /home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/finetune_raw/java-small-json/finetune-train-pre-raw.txt.
