In [1]:
import json
import os
import glob
from collections import defaultdict

In [9]:
def extract_mistakes_from_directory(directory_path, output_base_path):
    """Run ``extract_mistakes`` on every ``.json`` file found under a directory tree.

    Args:
        directory_path: Root folder that is walked recursively.
        output_base_path: Root folder for the results; the sub-folder in which
            a file was found (relative to ``directory_path``) is appended to
            it, so the input layout is mirrored on the output side.

    Each matching file name is printed before it is processed.
    """
    for current_dir, _subdirs, names in os.walk(directory_path):
        for name in names:
            # Anything that is not a JSON file is ignored.
            if not name.endswith('.json'):
                continue
            print(name)
            source_path = os.path.join(current_dir, name)
            mirrored_dir = os.path.join(
                output_base_path,
                os.path.relpath(current_dir, directory_path),
            )
            extract_mistakes(source_path, name, mirrored_dir)

def extract_mistakes(file_path, filename, output_directory):
    """Write the entries of a result file whose prediction differs from its reference.

    The input JSON is expected to be an object with a ``"details"`` mapping.
    Every entry of that mapping that is itself an object carrying both a
    ``"predictions"`` and a ``"references"`` field is compared; entries where
    the two differ are collected as single-item ``{key: entry}`` dicts.

    Args:
        file_path: Path of the JSON result file to read.
        filename: Name used to build the output file name,
            ``"<filename>_mismatches.json"``.
        output_directory: Folder the output file is written to; created if it
            does not exist.

    A missing ``"details"`` field produces an empty list in the output file.
    """
    # JSON is read and written as UTF-8 explicitly: the output keeps non-ASCII
    # characters verbatim (ensure_ascii=False), so relying on the platform's
    # default codec could fail or corrupt text.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    details = data.get("details", {})
    mismatches = [
        {key: value}
        for key, value in details.items()
        # Non-dict entries (plain strings, numbers, ...) carry no
        # prediction/reference pair and are skipped instead of raising.
        if isinstance(value, dict)
        and "predictions" in value
        and "references" in value
        and value["predictions"] != value["references"]
    ]

    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, f"{filename}_mismatches.json")

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(mismatches, outfile, indent=2, ensure_ascii=False)

# Collect the mismatching entries of every result file under AGIEval_results,
# mirroring the folder layout into AGIEval_output.
extract_mistakes_from_directory(
    directory_path='AGIEval_results',
    output_base_path='AGIEval_output',
)


AGIEval-hu-MCQ-lsat-rc.json
AGIEval-hu-MCQ-sat-math.json
AGIEval-hu-MCQ-logiqa-hu.json
AGIEval-hu-MCQ-aqua-rat.json
AGIEval-en-MCQ-sat-en-without-passage.json
AGIEval-en-MCQ-lsat-ar.json
AGIEval-hu-MCQ-lsat-ar.json
AGIEval-en-MCQ-sat-en.json
AGIEval-hu-MCQ-sat-hu-without-passage.json
AGIEval-hu-MCQ-sat-hu.json
AGIEval-en-MCQ-logiqa-en.json
AGIEval-en-MCQ-aqua-rat.json
AGIEval-en-MCQ-lsat-lr.json
AGIEval-en-MCQ-lsat-rc.json
AGIEval-en-MCQ-sat-math.json
AGIEval-hu-MCQ-lsat-lr.json
AGIEval-hu-MCQ-lsat-rc.json
AGIEval-hu-MCQ-sat-math.json
AGIEval-hu-MCQ-logiqa-hu.json
AGIEval-hu-MCQ-aqua-rat.json
AGIEval-en-MCQ-sat-en-without-passage.json
AGIEval-en-MCQ-lsat-ar.json
AGIEval-hu-MCQ-lsat-ar.json
AGIEval-en-MCQ-sat-en.json
AGIEval-hu-MCQ-sat-hu-without-passage.json
AGIEval-hu-MCQ-sat-hu.json
AGIEval-en-MCQ-logiqa-en.json
AGIEval-en-MCQ-aqua-rat.json
AGIEval-en-MCQ-lsat-lr.json
AGIEval-en-MCQ-lsat-rc.json
AGIEval-en-MCQ-sat-math.json
AGIEval-hu-MCQ-lsat-lr.json


In [22]:
def replace_newline_in_json_directory(input_directory, output_base_path):
    """Run ``replace_newline_in_json`` on every ``.json`` file under a directory tree.

    Args:
        input_directory: Root folder that is walked recursively.
        output_base_path: Root folder for the results; the sub-folder in which
            a file was found (relative to ``input_directory``) is appended to
            it, so the input layout is mirrored on the output side.

    ``output_base_path`` may be located inside ``input_directory``: that
    folder is excluded from the walk, so files produced by an earlier run are
    never picked up as input again.
    """
    output_root = os.path.abspath(output_base_path)

    for root, dirs, files in os.walk(input_directory):
        # Prune the output folder in place so os.walk does not descend into it.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != output_root
        ]

        for filename in files:
            # Only JSON files can be parsed by replace_newline_in_json.
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(root, input_directory)
            output_directory = os.path.join(output_base_path, relative_path)
            replace_newline_in_json(file_path, output_directory, filename)

def replace_newline_in_json(file_path, output_directory, filename, continuation_indent='\t' * 8):
    """Write a copy of a JSON file in which line breaks inside strings are shown as real lines.

    Two substitutions are applied:

    1. In every string value (at any nesting depth of dicts and lists) the
       two-character sequence backslash + ``n`` is turned into a newline
       character.
    2. After serialising with ``indent=4``, every escaped newline in the JSON
       text is replaced by an actual line break followed by
       ``continuation_indent``.

    Because of step 2 the output holds raw line breaks inside string literals,
    so it is no longer strict JSON.

    Args:
        file_path: Path of the JSON file to read (UTF-8).
        output_directory: Folder the result is written to; created if missing.
        filename: Name of the output file inside ``output_directory``.
        continuation_indent: Text inserted after each line break produced in
            step 2. Defaults to eight tab characters.

    Prints the output path once the file has been written.
    """
    def replace_newlines(obj):
        # Rebuild containers instead of mutating them; only strings change.
        if isinstance(obj, dict):
            return {key: replace_newlines(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [replace_newlines(value) for value in obj]
        if isinstance(obj, str):
            return obj.replace('\\n', '\n')
        return obj

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Render in memory and write once, rather than writing the file, reading
    # it back and overwriting it.
    file_content = json.dumps(replace_newlines(data), indent=4, ensure_ascii=False)
    file_content = file_content.replace('\\n', '\n' + continuation_indent)

    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, filename)

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(file_content)

    print(f"Newlines replaced in {output_file_path}")

# Write newline-expanded copies of the files in current_inspection.
# The output folder is nested inside the input folder.
replace_newline_in_json_directory(
    input_directory='current_inspection',
    output_base_path='current_inspection/reformated',
)


Newlines replaced in current_inspection/reformated/./common_mistakes.json
Newlines replaced in current_inspection/reformated/./hu_unique_file2.json
Newlines replaced in current_inspection/reformated/./en_unique_file1.json


In [21]:

def _group_values_by_key(entries):
    """Merge a list of dicts into one mapping from each key to the list of its values."""
    grouped = defaultdict(list)
    for entry in entries:
        for key, value in entry.items():
            grouped[key].append(value)
    return grouped


def _numeric_first(key):
    """Sort key: integer-like keys in numeric order, then all other keys alphabetically."""
    try:
        return (0, int(key), key)
    except ValueError:
        return (1, 0, key)


def process_json_files(file1_path, file2_path, output_common, output_unique1, output_unique2):
    """Split the keys of two JSON files into shared and file-specific groups.

    Both inputs are JSON lists of dictionaries. All values found under the
    same key are gathered into one list per file, and three JSON files are
    written:

    * ``output_common``: keys present in both files, each mapped to
      ``{"file1": [...], "file2": [...]}``.
    * ``output_unique1``: keys present only in the first file.
    * ``output_unique2``: keys present only in the second file.

    Keys are emitted in numeric order when they look like integers; any other
    key follows in alphabetical order. Keys are looked up and written exactly
    as they appear in the input, so a key such as ``"07"`` keeps its data.

    Args:
        file1_path: Path of the first input file (UTF-8).
        file2_path: Path of the second input file (UTF-8).
        output_common: Path of the file receiving the shared keys.
        output_unique1: Path of the file receiving keys unique to file 1.
        output_unique2: Path of the file receiving keys unique to file 2.
    """
    # Explicit UTF-8 on both sides: the outputs keep non-ASCII characters
    # verbatim (ensure_ascii=False).
    with open(file1_path, 'r', encoding='utf-8') as file1, \
            open(file2_path, 'r', encoding='utf-8') as file2:
        data1 = json.load(file1)  # List of dictionaries
        data2 = json.load(file2)  # List of dictionaries

    flat_data1 = _group_values_by_key(data1)
    flat_data2 = _group_values_by_key(data2)

    keys1 = set(flat_data1)
    keys2 = set(flat_data2)

    common_keys = sorted(keys1 & keys2, key=_numeric_first)
    unique_keys1 = sorted(keys1 - keys2, key=_numeric_first)
    unique_keys2 = sorted(keys2 - keys1, key=_numeric_first)

    # Dicts keep insertion order, so the sorted key order carries into the files.
    common_elements = {
        key: {"file1": flat_data1[key], "file2": flat_data2[key]}
        for key in common_keys
    }
    unique_data1 = {key: flat_data1[key] for key in unique_keys1}
    unique_data2 = {key: flat_data2[key] for key in unique_keys2}

    for output_path, payload in (
        (output_common, common_elements),
        (output_unique1, unique_data1),
        (output_unique2, unique_data2),
    ):
        with open(output_path, 'w', encoding='utf-8') as output_file:
            json.dump(payload, output_file, indent=4, ensure_ascii=False)

# Example usage: compare the English and Hungarian lsat-lr mismatch files of one model.
# NOTE(review): the two input paths are absolute and machine-specific — adjust before running elsewhere.
process_json_files(
    file1_path='/home/recovery/Desktop/AGIEal_SFT_tools/AGIEval_output/internlm2_5-7b-chat-turbomind/AGIEval-en-MCQ-lsat-lr.json_mismatches.json',
    file2_path='/home/recovery/Desktop/AGIEal_SFT_tools/AGIEval_output/internlm2_5-7b-chat-turbomind/AGIEval-hu-MCQ-lsat-lr.json_mismatches.json',
    output_common='common_mistakes.json',
    output_unique1='en_unique_file1.json',
    output_unique2='hu_unique_file2.json',
)
