In [37]:
from tabulate import tabulate

data = [
    {"Name": "Alice", "City": "New York", "City2": 2},
    {"Name": "Bob", "City": "San Francisco"},
    {"Name": "Charlie", "City": "Seattle"}
]

table = tabulate(data, headers="keys", tablefmt="pipe", stralign='center')

print(table)


|  Name   |     City      |   City2 |
|:-------:|:-------------:|--------:|
|  Alice  |   New York    |       2 |
|   Bob   | San Francisco |         |
| Charlie |    Seattle    |         |


In [47]:
import difflib

text1 = "apple banana cherry date"
text2 = "apple cherry date elderberry"

# 创建 Differ 对象
d = difflib.Differ()

# 通过 Differ 对象的 compare 方法比较两个文本
diff = d.compare(text1.split("\n"), text2.split("\n"))

# 将比较结果转换为字符串并打印出来
print('\n'.join(diff))


- apple banana cherry date
+ apple cherry date elderberry


In [61]:
import pandas as pd

# 原始数据
list_of_dicts = [
    {"name": "Alice", "age": 25, "city": "New York", "money": 35.6},
    {"name": "Bob", "age": 30, "city": "San Francisco", "money": 23.6},
    {"name": "Charlie", "age": 35, "city": "Seattle", "money": 13.6},
    {"name": "David", "age": 28, "city": "New York", "money": 53.6},
    {"name": "Eve", "age": 32, "city": "San Francisco", "money": 3.6}
]

# 转为Dataframe
df = pd.DataFrame(list_of_dicts)

# 识别包含数字的列，对其进行统计
numeric_columns = df.select_dtypes(include=['int', 'float']).columns
statistics = df[numeric_columns].agg(['sum', 'mean', 'median']).reset_index()

# 将结果保存至一个List[list]
result = statistics.values.tolist()
result.insert(0, ['statistic', *numeric_columns])

print(result)


[['statistic', 'age', 'money'], ['sum', 150.0, 130.0], ['mean', 30.0, 26.0], ['median', 30.0, 23.6]]


In [60]:
import javalang


def get_type_string(type):
    if type is None:
        return 'void'
    # 处理基本类型和引用类型
    type_str = ''
    if isinstance(type, javalang.tree.ReferenceType):
        type_str = type.name
        if type.arguments:
            args = ', '.join(get_type_string(arg.type) for arg in type.arguments if arg.type is not None)
            type_str += f"<{args}>"
    elif isinstance(type, javalang.tree.BasicType):
        type_str = type.name
    else:
        type_str = str(type)

    # 处理数组类型
    dimensions = ''.join('[]' for _ in range(len(type.dimensions))) if hasattr(type, 'dimensions') else ''
    return f"{type_str}{dimensions}"


def get_method_signatures(java_code):
    tree = javalang.parse.parse(java_code)
    method_signatures = []

    for _, class_declaration in tree.filter(javalang.tree.ClassDeclaration):
        for method in class_declaration.methods:
            # 获取方法的返回类型，包括泛型信息和数组
            return_type = get_type_string(method.return_type)

            # 获取方法的参数类型，包括泛型信息和数组
            params = []
            for parameter in method.parameters:
                param_type = get_type_string(parameter.type)
                if parameter.varargs:
                    param_type += "..."
                params.append(param_type)

            # 构建方法签名
            method_signature = f"{method.name}({', '.join(params)}) : {return_type}"
            method_signatures.append(method_signature)

    return method_signatures


java_code = open(r"D:\IDEA_Projects\TestJavaCode\src\main\java\org\apache\commons\lang3\Example.java").read()
# 获取方法签名
signatures = get_method_signatures(java_code)
print(signatures)



FileNotFoundError: [Errno 2] No such file or directory: 'D:\\IDEA_Projects\\TestJavaCode\\src\\main\\java\\org\\apache\\commons\\lang3\\Example.java'

In [88]:
from typing import List
import javalang
from javalang.tree import ClassDeclaration, MethodDeclaration

method_filter = ['Hashcode']

from typing import List



def compress_data(content: str, signature: str, method_infos: List[dict]) -> str:
    # 将内容按行分割
    lines = content.split('\n')

    # 为了不影响行号，我们需要从后往前替换
    # 首先对method_infos按照start从大到小排序
    method_infos_sorted = sorted(method_infos, key=lambda x: -x['start'])

    # 遍历每个方法信息
    for method_info in method_infos_sorted:
        # 如果签名不匹配，则压缩方法体
        if method_info['signature'] != signature:
            # 注意，行号从1开始，列表索引从0开始，所以要减1
            start_index = method_info['start'] - 1
            end_index = method_info['end']
            # 将方法体替换为方法签名
            lines[start_index:end_index] = [method_info['signature']]

    # 将修改后的行列表重新组合成一个字符串
    compressed_content = '\n'.join(lines)
    return compressed_content



def remove_comments(input_string):
    import re
    pattern = r'/\*\*?\s.*?\*/'
    return re.sub(pattern, '', input_string, flags=re.DOTALL)


def get_class_info(file: str):
    file = remove_comments(file)

    file_lines = file.splitlines()
    try:
        tree = javalang.parse.parse(file)
    except Exception as e:
        print(file)
        raise e
    assert len(tree.types) == 1  # 一个文件只有一个类
    clazz: ClassDeclaration = tree.types[0]
    if not isinstance(clazz, ClassDeclaration) or 'abstract' in clazz.modifiers:
        return None, None, None
    clazz_name = clazz.name
    methods: List[MethodDeclaration] = clazz.methods

    method_infos = []
    all_method_infos = []
    for method in methods:
        valid_focal_method =  'public' in method.modifiers

        start = method.position[0]
        end = start
        stack1 = []
        flag = False
        while not flag or len(stack1) != 0:
            line = file_lines[end - 1]
            for ch in line:
                if ch == "{":
                    flag = True
                    stack1.append(ch)
                elif ch == "}":
                    stack1.pop()
            end += 1
            if end > 10000:
                assert False
        content = file_lines[start - 1:end]
        # 缩进处理
        indent = len(content[0]) - len(content[0].lstrip())
        content = [line[indent:] for line in content]
        content = "\n".join(content)

        # 获取方法的返回类型，包括泛型信息和数组
        return_type = get_type_string(method.return_type)

        # 获取方法的参数类型，包括泛型信息和数组
        params = []
        for parameter in method.parameters:
            param_type = get_type_string(parameter.type)
            if parameter.varargs:
                param_type += "..."
            params.append(param_type)

        # 构建方法签名
        method_signature = f"{method.name}({', '.join(params)}) : {return_type}"
        info = {
            "signature": method_signature,
            "name": method.name,
            "start": start,
            "end": end,
            "content": content,
        }
        if valid_focal_method:
            method_infos.append(info)
        all_method_infos.append(info)
    
    if clazz_name == 'ArrayUtils':
        print(1)
    for mi in method_infos:
        mi["compressed_content"] = compress_data(file, mi["signature"], all_method_infos)
            
    return clazz_name, file, method_infos

# get_class_info(open(r"D:\IDEA_Projects\TestJavaCode\src\main\java\net\mooctest\Queue.java").read())[2]
import zipfile
import tiktoken
import os
from io import BytesIO

tiktoken_encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
result_list = []

import zipfile
import os
from io import BytesIO
import tiktoken

tiktoken_encoder = tiktoken.encoding_for_model('gpt-3.5-turbo')
# 准备一个列表来存储结果
result_list = []

# 临时内存文件
new_zip_data = BytesIO()

# 打开原始 ZIP 文件
with zipfile.ZipFile(r'D:\jetbrains projects\pycharm\GPT-Java-Tester\dataset\lang_1_fixed.zip', 'r') as zip_ref:
    # 创建一个新的 ZipFile 对象，用于存储修改后的内容
    with zipfile.ZipFile(new_zip_data, 'w', zipfile.ZIP_DEFLATED) as new_zip_ref:
        # 遍历原始 ZIP 文件中的所有文件
        for file_info in zip_ref.infolist():
            filepath = file_info.filename
            file_data = zip_ref.read(filepath)
            # 检查是否是需要修改的文件
            if filepath.endswith(".java") and "src/main/java" in filepath:
                # 对文件内容进行解码
                decoded_data = file_data.decode(errors="replace")
                # 处理文件内容
                clazz_name, new_file_data, method_infos = get_class_info(decoded_data)
                # 如果处理后的文件不为空，则使用修改后的文件数据
                if clazz_name is not None:
                    file_data = new_file_data
                    package_reference = filepath.split("src/main/java/")[1][:-5].replace("/", ".").replace(
                        f".{clazz_name}", "")
                    # 假设 tiktoken_encoder 是您定义的一个对象，用于编码文件数据
                    tokens = len(tiktoken_encoder.encode(file_data))
                    if tokens > 15000:
                        print(clazz_name)
                    result_list.append({
                        "class_name": clazz_name,
                        "content": file_data,  # 再次解码以存储为字符串
                        "methods": sorted(method_infos, key=lambda x: x["start"]),
                        "tokens": tokens,
                        "package_reference": package_reference,
                        "lines": len(file_data.splitlines())
                    })
            # 将原始或修改后的文件数据写入新的 ZIP 文件中
            new_zip_ref.writestr(file_info, file_data)

# 关闭内存中的 ZIP 文件
new_zip_data.seek(0)

# 删除原始 ZIP 文件
os.remove(r'D:\jetbrains projects\pycharm\GPT-Java-Tester\dataset\lang_1_fixed.zip')

# 将新的 ZIP 文件数据写入到原始文件路径
with open(r'D:\jetbrains projects\pycharm\GPT-Java-Tester\dataset\lang_1_fixed.zip', 'wb') as f:
    f.write(new_zip_data.read())

# 关闭内存文件
new_zip_data.close()


1
ArrayUtils
StringUtils


In [89]:
print(f"clazz num: {len(result_list)}")
print(f"method num: {sum([len(clazz['methods']) for clazz in result_list])}")
print(result_list[1]["methods"][0]["compressed_content"])

clazz num: 87
method num: 1728

package org.apache.commons.lang3;

import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.commons.lang3.mutable.MutableInt;


public class ArrayUtils {

    
    public static final Object[] EMPTY_OBJECT_ARRAY = new Object[0];
    
    public static final Class<?>[] EMPTY_CLASS_ARRAY = new Class[0];
    
    public static final String[] EMPTY_STRING_ARRAY = new String[0];
    
    public static final long[] EMPTY_LONG_ARRAY = new long[0];
    
    public static final Long[] EMPTY_LONG_OBJECT_ARRAY = new Long[0];
    
    public static final int[] EMPTY_INT_ARRAY = new int[0];
    
    public static final Integer[] EMPTY_INTEGER_OBJECT_ARRAY =

In [90]:
# 验证数据集正确性
def has_overlap(intervals):
    intervals.sort(key=lambda x: x[0])  # 按照起点进行排序
    for i in range(1, len(intervals)):
        if intervals[i][0] < intervals[i - 1][1]:  # 如果下一个区间的起点小于上一个区间的终点，说明有重合
            return True
    return False


li = []
for class_info in result_list:
    for method_info in class_info["methods"]:
        li.append((class_info["class_name"], method_info["name"], method_info["start"], method_info["end"]))
    if has_overlap([(method_info["start"], method_info["end"]) for method_info in class_info["methods"]]):
        print(class_info["class_name"])
        assert False
        break
import pandas as pd

df = pd.DataFrame(li)
df.to_csv("method_info.csv", index=False)

In [91]:
import pickle
import gzip

# 保存压缩后的对象到文件
with open("lang_1_fixed.pkl", "wb") as f:
    pickle.dump({"data": sorted(result_list, key=lambda x: x["class_name"])}, f)

In [ ]:
with open("lang_1_fixed_info.pkl", "rb") as f:
    dataset_info = pickle.load(f)

with open("lang_1_fixed.pkl", "rb") as f:
    dataset = pickle.load(f)["data"]


# 假设你已经有了 dataset_info 和 dataset

def sort_dataset(dataset, dataset_info):
    # 创建一个字典，用于快速查找class_name和method_name的顺序
    class_order = {info['class_name']: idx for idx, info in enumerate(dataset_info)}
    method_order = {info['method_name']: idx for idx, info in enumerate(dataset_info)}

    # 定义一个辅助函数，用于获取class_name的排序键
    def get_class_order_key(item):
        # 如果class_name不在dataset_info中，返回一个大数使其排在最后
        return class_order.get(item['class_name'], float('inf'))

    # 定义一个辅助函数，用于获取method_name的排序键
    def get_method_order_key(item):
        # 如果method_name不在dataset_info中，返回一个大数使其排在最后
        return method_order.get(item['name'], float('inf'))

    # 首先按照class_name对dataset进行排序
    sorted_dataset = sorted(dataset, key=get_class_order_key)

    # 然后对每个元素的methods按照method_name进行排序
    for item in sorted_dataset:
        item['methods'] = sorted(item['methods'], key=get_method_order_key)

    return sorted_dataset


# 对数据集进行排序
sorted_dataset = sort_dataset(dataset, dataset_info)

# 打印排序后的数据集
for item in sorted_dataset:
    print(item['class_name'])
    for method in item['methods']:
        print('  ', method['name'])

In [44]:
# 写入
with open("lang_1_fixed.pkl", "wb") as f:
    pickle.dump({"data": sorted_dataset}, f)

In [46]:
import os
import re


def remove_comments(input_string: str) -> str:
    pattern = r'/\*\*?\s.*?\*/'
    return re.sub(pattern, '', input_string, flags=re.DOTALL)


def process_java_file(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()
        content = remove_comments(content)
    with open(file_path, 'w', encoding='utf-8', errors='replace') as file:
        file.write(content)


def process_directory(directory_path):
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".java"):
                file_path = os.path.join(root, file)
                process_java_file(file_path)


directory_path = r"D:\jetbrains projects\pycharm\GPT-Java-Tester\dataset\src\main\java\org\apache\commons\lang3"
process_directory(directory_path)


In [56]:
from pydantic import BaseModel
from typing import *


class Dataset(BaseModel):
    class ClassInfo(BaseModel):
        class MethodInfo(BaseModel):
            start: int
            end: int
            content: str

        content: str
        tokens: int
        lines: int
        methods: dict[str, MethodInfo]

        def __iter__(self) -> Iterator[Tuple[str, MethodInfo]]:
            return self

        def __next__(self) -> Tuple[str, MethodInfo]:
            for method_name, method_info in self.methods.items():
                return method_name, method_info

    data: dict[str, ClassInfo]

    def __iter__(self) -> Iterator[Tuple[str, ClassInfo]]:
        return self

    def __next__(self) -> Tuple[str, ClassInfo]:
        for classname, classinfo in self.data.items():
            return classname, classinfo

    def get_item(self, class_name, method_name):
        class_info = self.data[class_name]
        method_info = class_info.methods[method_name]
        return class_info, method_info


d = Dataset(**obj)

In [58]:
for k in d:
    print(k)
    break

('EnumUtils', ClassInfo(content='/*\n * Licensed to the Apache Software Foundation (ASF) under one or more\n * contributor license agreements.  See the NOTICE file distributed with\n * this work for additional information regarding copyright ownership.\n * The ASF licenses this file to You under the Apache License, Version 2.0\n * (the "License"); you may not use this file except in compliance with\n * the License.  You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\npackage org.apache.commons.lang3;\n\nimport java.util.ArrayList;\nimport java.util.Arrays;\nimport java.util.Collections;\nimport java.util.EnumSet;

In [91]:

from collections import defaultdict


def etree_to_dict(t):
    d = {t.tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in map(etree_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        d = {t.tag: {k: v for k, v in dd.items()}}
    if t.attrib:
        d[t.tag].update(('@' + k, v) for k, v in t.attrib.items())
    if t.text:
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[t.tag]['#text'] = text
        else:
            d[t.tag] = text
    return d


import xml.etree.ElementTree as ET

tree = ET.parse(r"D:\IDEA_Projects\TestJavaCode\target\site\clover\clover.xml")
root = tree.getroot()
root_dict = etree_to_dict(root)['coverage']
source_packages: list = root_dict['project'][0]['package']
test_packages: list = root_dict['testproject'][0]['package']



In [22]:
from enum import Enum
from pydantic import BaseModel
from typing import Optional


class Person(BaseModel):
    class Type(Enum):
        STUDENT = "student"
        TEACHER = "teacher"

        def __str__(self):
            return self.value

    type: Type = Type.STUDENT


import json

obj = json.dumps(Person().model_dump(), default=str)
print(obj)
Person(**json.loads(obj))


{"type": "student"}


Person(type=<Type.STUDENT: 'student'>)