In [1]:
# imports
import javalang
import os
from app.database import Database
from app.classes import IterationStatus, BatchAttemptStatus, TestStatus, AttemptStatus, Dataset
from typing import List
from app.constants import CONFIG
import pickle
import xml.etree.ElementTree as ET
from app.utils import etree_to_dict
from tqdm import tqdm

In [2]:
# constants
# Absolute Windows path; no other code visible in this notebook reads it.
# NOTE(review): presumably the Java project under test — confirm, and consider
# making it configurable instead of machine-specific.
PROJECT_PATH = r"D:\IDEA_Projects\TestJavaCode"
# Base names of the two pickled datasets. The dataset-loading cell turns each
# into "<CONFIG.dataset_folder>/<name>.pkl".
DATASET_NAME1 = r"jfreechart"
DATASET_NAME2 = r"jfreechart154"

In [3]:
# read dataset
def load_pickled_dataset(name):
    """Load one pickled dataset and wrap it in a ``Dataset``.

    Args:
        name: Base file name (without the ".pkl" suffix) inside
            ``CONFIG.dataset_folder``.

    Returns:
        A ``(raw, dataset)`` tuple: the unpickled object and the ``Dataset``
        built from it via ``Dataset(**raw)``.
    """
    with open(f"{CONFIG.dataset_folder}/{name}.pkl", "rb") as f:
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load dataset files produced by a trusted source.
        raw = pickle.load(f)
    # Build the Dataset after the file handle is closed; construction does not
    # need the open file.
    return raw, Dataset(**raw)


raw_dataset, dataset1 = load_pickled_dataset(DATASET_NAME1)
raw_test_dataset, dataset2 = load_pickled_dataset(DATASET_NAME2)

In [4]:
def handle_files(string):
    """Collapse a multi-line text into one whitespace-trimmed string.

    Every line of ``string`` is stripped of leading and trailing whitespace,
    lines that become empty are dropped, and the remaining pieces are
    concatenated with no separator between them.
    """
    pieces = []
    for raw_line in string.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            pieces.append(trimmed)
    return "".join(pieces)


In [13]:
# NOTE: Database is already imported in the notebook's first cell; this
# re-import only keeps the cell runnable on its own.
from app.database import Database
# Notebook-level handle to the project database.
db = Database()
# Later cells iterate this result and read .dataset, .doc_id and .attempts
# from each element.
all_dataset = db.get_all()

read time: 0.042999267578125
decompress time: 3.7555201053619385
loads time: 4.0817646980285645
assembling time: 3.6815402507781982


In [14]:
# Print one "<dataset> <doc_id>" line per stored batch.
for batch in all_dataset:
    print(batch.dataset, batch.doc_id)

lang_1_fixed 47
commons-csv 58
commons-cli 61
gson 66
jfreechart 82
lang_1_fixed 89
gson 98
commons-csv 99
commons-cli 100
jfreechart 101
jfreechart154 123
jfreechart154 126
lang_1_fixed 127
gson 128
jfreechart154 130
jfreechart154 131


In [7]:
# Select the old batch (doc_id 101) and fail with a clear message if it is
# missing, instead of an anonymous IndexError.
old_batch = next((x for x in all_dataset if x.doc_id == 101), None)
if old_batch is None:
    raise LookupError("no batch with doc_id 101 found in all_dataset")

# Flatten dataset1 into (class_info, method_info) pairs in iteration order so
# that enumerate() yields the same 0-based running method index the manual
# counter produced.
dataset1_methods = (
    (class_info, method_info)
    for class_info in dataset1
    for method_info in class_info
)
# "<class_name>#<signature>#<running index>" -> whitespace-normalised source.
dataset1_idx_map = {
    f"{class_info.class_name}#{method_info.signature}#{count}": handle_files(method_info.content)
    for count, (class_info, method_info) in enumerate(dataset1_methods)
}

# attempt.name -> (normalised method source, attempt).
# NOTE(review): the map is keyed by attempt.name alone, so if two attempts
# share a name the later one replaces the earlier — confirm names are unique.
old_map = {}
for attempt in old_batch.attempts:
    key = f"{attempt.name}#{attempt.idx}"
    if key not in dataset1_idx_map:
        # Explicit error (instead of a bare `assert False`) so the offending
        # key is reported and the check survives `python -O`.
        raise KeyError(f"attempt {key!r} of batch 101 has no matching method in dataset1")
    old_map[attempt.name] = (dataset1_idx_map[key], attempt)

In [8]:
# Select the new batch (doc_id 123) and fail with a clear message if it is
# missing, instead of an anonymous IndexError.
new_batch = next((x for x in all_dataset if x.doc_id == 123), None)
if new_batch is None:
    raise LookupError("no batch with doc_id 123 found in all_dataset")

# Flatten dataset2 into (class_info, method_info) pairs in iteration order so
# that enumerate() yields the same 0-based running method index the manual
# counter produced.
dataset2_methods = (
    (class_info, method_info)
    for class_info in dataset2
    for method_info in class_info
)
# "<class_name>#<signature>#<running index>" -> whitespace-normalised source.
dataset2_idx_map = {
    f"{class_info.class_name}#{method_info.signature}#{count}": handle_files(method_info.content)
    for count, (class_info, method_info) in enumerate(dataset2_methods)
}

# attempt.name -> (normalised method source, attempt).
# NOTE(review): the map is keyed by attempt.name alone, so if two attempts
# share a name the later one replaces the earlier — confirm names are unique.
new_map = {}
for attempt in new_batch.attempts:
    key = f"{attempt.name}#{attempt.idx}"
    if key not in dataset2_idx_map:
        # Explicit error (instead of a bare `assert False`) so the offending
        # key is reported and the check survives `python -O`.
        raise KeyError(f"attempt {key!r} of batch 123 has no matching method in dataset2")
    new_map[attempt.name] = (dataset2_idx_map[key], attempt)

In [83]:
from collections import Counter

# Old-attempt types whose stored result is carried over unchanged.
reusable_types = (
    IterationStatus.Type.PASS,
    IterationStatus.Type.COMPILE_ERROR,
    IterationStatus.Type.SYNTAX_ERROR,
    IterationStatus.Type.FAIL,
)

status = []
generated_attempts = []
save_money_attempt = []
for attempt in new_batch.attempts:
    # Attempts with no counterpart in the old batch are skipped entirely.
    if attempt.name not in old_map:
        continue

    old_content, old_attempt = old_map[attempt.name]
    new_content, new_attempt = new_map[attempt.name]

    if old_content != new_content:
        # Method source changed between datasets: keep the new attempt but
        # clear both of its status fields.
        new_attempt.sub_iteration_status = None
        new_attempt.iteration_status = None
        generated_attempts.append(new_attempt)
        status.append("DIFFERENT -> FAIL")
        continue

    # Same source: reuse the old attempt, re-indexed to the new batch.
    old_attempt.idx = new_attempt.idx
    if old_attempt.type in reusable_types:
        status.append("PASS/CE/SE/FAIL -> DIRECT")
        generated_attempts.append(old_attempt)
    elif old_attempt.type == IterationStatus.Type.RUNTIME_ERROR:
        status.append("RE -> FAIL")
        # Record the old sub-iteration status before it is cleared below.
        save_money_attempt.append((new_attempt.idx, old_attempt.sub_iteration_status))
        old_attempt.sub_iteration_status = None
        old_attempt.iteration_status = None
        generated_attempts.append(old_attempt)
    else:
        # Any other type is unexpected: show it and stop.
        print(old_attempt.type)
        assert False
print(Counter(status))

Counter({'PASS/CE/SE/FAIL -> DIRECT': 3037, 'RE -> FAIL': 1413, 'DIFFERENT -> FAIL': 638})


In [15]:
b1 = [x for x in all_dataset if x.doc_id == 123][0]
b2 = [x for x in all_dataset if x.doc_id == 130][0]
b3 = [x for x in all_dataset if x.doc_id == 131][0]


def get_compile_error(attempts):
    """Return the attempts whose type is COMPILE_ERROR, in their original order."""
    return [a for a in attempts if a.type == IterationStatus.Type.COMPILE_ERROR]


def to_fail(attempt):
    """Clear both status fields of ``attempt`` in place and return the same object."""
    attempt.sub_iteration_status = None
    attempt.iteration_status = None
    return attempt


# Find the idx values of b1.attempts that do not occur in b2.attempts.
# Membership is tested against sets so each lookup is O(1) instead of a list
# scan; this assumes attempt.idx is hashable (it looks like an int) — confirm.
b1_idx = [x.idx for x in b1.attempts]
b2_idx = [x.idx for x in b2.attempts]
b2_idx_set = set(b2_idx)
diff_b1_b2 = [x for x in b1_idx if x not in b2_idx_set]
diff_b1_b2_set = set(diff_b1_b2)

# Add those b1-only attempts to the new batch.
new_attempts = []
new_attempts.extend([x for x in b1.attempts if x.idx in diff_b1_b2_set])
print("add new attempts", len(new_attempts))

# Compare b2 and b3: idx values that are compile errors in b3 but not in b2.
b2_ce = get_compile_error(b2.attempts)
b3_ce = get_compile_error(b3.attempts)
b2_ce_idx = [x.idx for x in b2_ce]
b3_ce_idx = [x.idx for x in b3_ce]
b2_ce_idx_set = set(b2_ce_idx)
diff_b3_b2 = [x for x in b3_ce_idx if x not in b2_ce_idx_set]
diff_b3_b2_set = set(diff_b3_b2)

# Add those b3 attempts to the new batch with their status fields cleared.
# to_fail mutates the attempt objects owned by b3 in place.
new_attempts.extend([to_fail(x) for x in b3.attempts if x.idx in diff_b3_b2_set])
print("now attempts", len(new_attempts))

# Add the rest of b3 to the new batch unchanged.
b3_remain = [x for x in b3.attempts if x.idx not in diff_b3_b2_set]
print("b3 remain", len(b3_remain))
new_attempts.extend(b3_remain)
print("now attempts", len(new_attempts))

add new attempts 684
now attempts 978
b3 remain 4794
now attempts 5772


In [18]:
# Disabled one-off step (kept commented out): it would clear both status fields
# on the first 684 entries of new_attempts, wrap new_attempts in a
# BatchAttemptStatus for dataset "jfreechart154" (reusing b1's flow_run_request
# and mode, with zeroed start/end times), and insert it through db.
# for i in range(684):
#      new_attempts[i].sub_iteration_status = None
#      new_attempts[i].iteration_status = None
#
# batch_attempt_status = BatchAttemptStatus(
#     attempts=new_attempts,
#     flow_run_request=b1.flow_run_request,
#     start_time_ms=0,
#     end_time_ms=0,
#     dataset="jfreechart154",
#     mode=b1.mode,
# )
# db.insert(batch_attempt_status)
# Live statement: passes the id list [129] to db.delete.
# NOTE(review): presumably this permanently removes the stored batch with
# doc_id 129 — confirm against app.database.Database before re-running.
db.delete([129])

read time: 0.04450869560241699
decompress time: 3.7254297733306885
loads time: 4.178632497787476
dumps time: 2.4713973999023438
compress time: 23.007278442382812
write time: 0.1072993278503418


In [92]:
# Reduce each saved (idx, sub_iteration_status) pair to (idx, response).
print(len(save_money_attempt))
# Keep only pairs whose status sequence has exactly one entry, and unwrap it.
save_money_attempt_2 = [
    (attempt_idx, sub_statuses[0])
    for attempt_idx, sub_statuses in save_money_attempt
    if len(sub_statuses) == 1
]
print(len(save_money_attempt_2))
# Swap each unwrapped status for its llm_records.
save_money_attempt_3 = [
    (attempt_idx, sub_status.llm_records)
    for attempt_idx, sub_status in save_money_attempt_2
]
# Keep the response of the first LLM record only.
save_money_attempt_4 = [
    (attempt_idx, llm_records[0].response)
    for attempt_idx, llm_records in save_money_attempt_3
]

1413
1413


In [93]:
from tinydb import TinyDB

# Persist the (attempt idx, LLM response) pairs to a local TinyDB JSON file.
# NOTE: this rebinds the notebook-level name `db`, which an earlier cell binds
# to app.database.Database(); re-run that cell before using the project
# database again.
db = TinyDB("jfreechart154_gpt4.json")
# insert_multiple stores every document in a single call rather than issuing
# one insert() — and therefore one storage write — per pair.
db.insert_multiple(
    [{"idx": idx, "response": response} for idx, response in save_money_attempt_4]
)