# Translation dataset parsing kit

In [None]:
import json
import re
import os
import glob
from collections import defaultdict

In [None]:
def extract_content(text, label):
    """Return the stripped text enclosed by ``<label>`` markers in ``text``.

    The section starts at the first ``<label>`` and ends at the next marker
    for the same label, written either as ``<label>`` or as ``</label>``.

    Args:
        text: String to search.
        label: Tag name, matched literally (regex metacharacters are escaped).

    Returns:
        The enclosed text with surrounding whitespace removed, or ``""`` when
        no complete section is found.
    """
    # Escape the label so names containing characters such as "." or "+"
    # are matched literally rather than interpreted as regex syntax.
    tag = re.escape(label)
    # "/?" accepts both the "<label>...<label>" convention and the
    # conventional "<label>...</label>" closing form.
    pattern = f"<{tag}>(.*?)</?{tag}>"
    # DOTALL lets a section span several lines.
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""

def parse_json(input_json):
    """Convert a mapping of raw predictions into a list of parsed records.

    Args:
        input_json: Mapping whose keys are integer-like strings and whose
            values are dicts that may carry ``"prediction"`` and ``"gold"``.

    Returns:
        A list with one dict per entry, holding the keys ``"passage"``,
        ``"question"``, ``"options"``, ``"label"`` and ``"ind"``.

    Raises:
        ValueError: If a key cannot be converted with ``int()``.
    """
    parsed_data = []

    for index, content in input_json.items():
        # "or ''" also covers an explicit null prediction, which would
        # otherwise make the regex search raise a TypeError.
        prediction = content.get("prediction") or ""
        gold = content.get("gold", "")

        passage = extract_content(prediction, "background")
        question = extract_content(prediction, "question")
        options_content = extract_content(prediction, "options")
        # "".split("\n") is [""], so filter blank lines to avoid emitting a
        # bogus empty option; strip removes indentation and stray "\r".
        options = [
            line.strip()
            for line in options_content.split("\n")
            if line.strip()
        ]

        parsed_data.append({
            "passage": passage,
            "question": question,
            "options": options,
            "label": gold,
            "ind": int(index),
        })

    return parsed_data

def read_json_from_file(file_path):
    """Load the JSON document stored at ``file_path`` (read as UTF-8) and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        document = json.load(handle)
    return document

def write_json_to_file(data, file_path):
    """Serialize ``data`` as JSON into ``file_path``, overwriting any existing file.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are kept as-is instead of being escaped.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, **dump_options)

def process_directory(input_directory, output_directory):
    """Parse every ``*.json`` file in ``input_directory`` into ``output_directory``.

    Each input file is loaded, passed through ``parse_json`` and written to a
    file of the same name in ``output_directory``, which is created if
    missing. One progress line is printed per file.
    """
    os.makedirs(output_directory, exist_ok=True)
    json_glob = os.path.join(input_directory, '*.json')

    for source_path in glob.glob(json_glob):
        records = parse_json(read_json_from_file(source_path))

        target_path = os.path.join(output_directory, os.path.basename(source_path))
        write_json_to_file(records, target_path)

        print(f"Processed {source_path} and saved to {target_path}")

# Example usage: parse every *.json file found in `input` and write the parsed
# records to a file of the same name in `output`.
input_directory = 'input'  # Replace with your input directory path
output_directory = 'output'  # Replace with your desired output directory path

# Runs when the cell executes; prints one "Processed ..." line per input file.
process_directory(input_directory, output_directory)


Processed input/AGIEval-translate-en-MCQ-sat-math_5.json and saved to output/AGIEval-translate-en-MCQ-sat-math_5.json
Processed input/AGIEval-translate-en-MCQ-aqua-rat_6.json and saved to output/AGIEval-translate-en-MCQ-aqua-rat_6.json
Processed input/AGIEval-translate-en-MCQ-lsat-ar_1.json and saved to output/AGIEval-translate-en-MCQ-lsat-ar_1.json
Processed input/AGIEval-translate-en-MCQ-sat-en_0.json and saved to output/AGIEval-translate-en-MCQ-sat-en_0.json
Processed input/AGIEval-translate-en-MCQ-logiqa-en_6.json and saved to output/AGIEval-translate-en-MCQ-logiqa-en_6.json
Processed input/AGIEval-translate-en-MCQ-logiqa-en_1.json and saved to output/AGIEval-translate-en-MCQ-logiqa-en_1.json
Processed input/AGIEval-translate-en-MCQ-lsat-rc_2.json and saved to output/AGIEval-translate-en-MCQ-lsat-rc_2.json
Processed input/AGIEval-translate-en-MCQ-lsat-lr_0.json and saved to output/AGIEval-translate-en-MCQ-lsat-lr_0.json
Processed input/AGIEval-translate-en-MCQ-gaokao-english_0.json

In [None]:
def read_json_from_file(file_path):
    """Return the parsed contents of the UTF-8 encoded JSON file at ``file_path``.

    Note: a helper with this name and behavior is also defined in an earlier
    cell of this notebook; this definition rebinds the same name.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = json.load(source)
    return contents

def write_json_to_file(data, file_path):
    """Write ``data`` to ``file_path`` as indented (4 spaces) UTF-8 JSON.

    Non-ASCII characters are written unescaped. Note: a helper with this name
    and behavior is also defined in an earlier cell of this notebook; this
    definition rebinds the same name.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        json.dump(
            data,
            sink,
            indent=4,
            ensure_ascii=False,
        )

def merge_json_files(input_directory, output_directory):
    """Merge sharded ``<name>_<index>.json`` files into one ``<name>.json`` each.

    Every ``*.json`` file in ``input_directory`` is grouped by its name with
    the trailing numeric ``_<index>`` removed. The lists stored in a group's
    files are concatenated in ascending index order and written to
    ``output_directory`` (created if missing). A file whose name has no
    numeric ``_<index>`` suffix forms its own group under its unchanged name.

    Args:
        input_directory: Directory containing the shard files.
        output_directory: Directory that receives the merged files.
    """
    os.makedirs(output_directory, exist_ok=True)
    file_groups = defaultdict(list)

    # Group files by their base name without the index.
    for file_path in glob.glob(os.path.join(input_directory, '*.json')):
        base_name = os.path.basename(file_path)
        shard = re.fullmatch(r'(.+)_(\d+)\.json', base_name)
        if shard:
            group_name = shard.group(1) + '.json'
            shard_index = int(shard.group(2))
        else:
            # No numeric suffix: keep the name intact instead of chopping at
            # the last "_" (which would turn "plain.json" into ".json").
            group_name = base_name
            shard_index = -1
        file_groups[group_name].append((shard_index, file_path))

    # Merge files in each group; sorting makes the result independent of the
    # arbitrary order in which glob returns paths.
    for base_name in sorted(file_groups):
        merged_data = []
        # Numeric sort so shard 10 follows shard 9 rather than shard 1.
        for _, file_path in sorted(file_groups[base_name]):
            merged_data.extend(read_json_from_file(file_path))

        output_file_path = os.path.join(output_directory, base_name)
        write_json_to_file(merged_data, output_file_path)
        print(f"Merged files into {output_file_path}")

# Example usage: read the per-index files from `output` (the directory the
# parsing cell writes to) and write one merged file per group into `parsed`.
# Note: this rebinds the module-level input_directory / output_directory names
# assigned in the earlier parsing cell.
input_directory = 'output'  # Replace with your input directory path
output_directory = 'parsed'  # Replace with your desired output directory path

# Runs when the cell executes; prints one "Merged files into ..." line per group.
merge_json_files(input_directory, output_directory)


Merged files into parsed/AGIEval-translate-en-MCQ-sat-math.json
Merged files into parsed/AGIEval-translate-en-MCQ-aqua-rat.json
Merged files into parsed/AGIEval-translate-en-MCQ-lsat-ar.json
Merged files into parsed/AGIEval-translate-en-MCQ-sat-en.json
Merged files into parsed/AGIEval-translate-en-MCQ-logiqa-en.json
Merged files into parsed/AGIEval-translate-en-MCQ-lsat-rc.json
Merged files into parsed/AGIEval-translate-en-MCQ-lsat-lr.json
Merged files into parsed/AGIEval-translate-en-MCQ-gaokao-english.json
Merged files into parsed/AGIEval-translate-en-MCQ-sat-en-without-passage.json


In [None]:


def rename_files_in_directory(directory, old_tag='-en-', new_tag='-hu-'):
    """Rename entries in ``directory`` by swapping the first ``old_tag`` for ``new_tag``.

    Only the first occurrence of ``old_tag`` in each name is replaced, so a
    later occurrence that belongs to the dataset name (for example the
    ``-en-`` in ``sat-en-without-passage``) is left untouched.

    Args:
        directory: Directory whose entries are renamed in place.
        old_tag: Substring to look for; names without it are skipped.
        new_tag: Replacement for the first occurrence of ``old_tag``.
    """
    for filename in os.listdir(directory):
        if old_tag not in filename:
            continue
        # count=1: replacing every occurrence would also rewrite dataset
        # names that happen to contain the tag.
        new_filename = filename.replace(old_tag, new_tag, 1)
        old_file_path = os.path.join(directory, filename)
        new_file_path = os.path.join(directory, new_filename)
        os.rename(old_file_path, new_file_path)
        print(f"Renamed {filename} to {new_filename}")

# Example usage: rename the files in `parsed` (the merge cell's output
# directory) in place, switching the "-en-" marker in their names to "-hu-".
directory_path = 'parsed'  # Replace with your directory path
rename_files_in_directory(directory_path)


Renamed AGIEval-translate-en-MCQ-sat-math.json to AGIEval-translate-hu-MCQ-sat-math.json
Renamed AGIEval-translate-en-MCQ-aqua-rat.json to AGIEval-translate-hu-MCQ-aqua-rat.json
Renamed AGIEval-translate-en-MCQ-lsat-rc.json to AGIEval-translate-hu-MCQ-lsat-rc.json
Renamed AGIEval-translate-en-MCQ-logiqa-en.json to AGIEval-translate-hu-MCQ-logiqa-en.json
Renamed AGIEval-translate-en-MCQ-lsat-ar.json to AGIEval-translate-hu-MCQ-lsat-ar.json
Renamed AGIEval-translate-en-MCQ-lsat-lr.json to AGIEval-translate-hu-MCQ-lsat-lr.json
Renamed AGIEval-translate-en-MCQ-gaokao-english.json to AGIEval-translate-hu-MCQ-gaokao-english.json
Renamed AGIEval-translate-en-MCQ-sat-en.json to AGIEval-translate-hu-MCQ-sat-en.json
Renamed AGIEval-translate-en-MCQ-sat-en-without-passage.json to AGIEval-translate-hu-MCQ-sat-hu-without-passage.json


In [None]:


def rename_files_in_directory(directory, prefix='AGIEval-translate-'):
    """Strip ``prefix`` from the name of every entry in ``directory`` that has it.

    Entries whose name does not start with ``prefix`` are left alone, so the
    function can be run again safely on a directory that was already
    processed.

    Note: this rebinds the name ``rename_files_in_directory`` that an earlier
    cell of this notebook uses for a different renaming step.

    Args:
        directory: Directory whose entries are renamed in place.
        prefix: Leading text to remove. The default is 18 characters long,
            matching the fixed-width slice this helper originally used.
    """
    for filename in os.listdir(directory):
        # Skip names without the prefix (a blind slice would mangle them) and
        # a name equal to the prefix (stripping it would leave an empty name).
        if not filename.startswith(prefix) or filename == prefix:
            continue

        new_filename = filename[len(prefix):]
        old_file_path = os.path.join(directory, filename)
        new_file_path = os.path.join(directory, new_filename)
        os.rename(old_file_path, new_file_path)
        print(f"Renamed {filename} to {new_filename}")

# Example usage: rename the files in `parsed` in place, removing the leading
# "AGIEval-translate-" part of their names. This calls the
# rename_files_in_directory defined in this cell, which replaces the function
# of the same name from the previous cell.
directory_path = 'parsed'  # Replace with your directory path
rename_files_in_directory(directory_path)


Renamed AGIEval-translate-hu-MCQ-logiqa-en.json to hu-MCQ-logiqa-en.json
Renamed AGIEval-translate-hu-MCQ-gaokao-english.json to hu-MCQ-gaokao-english.json
Renamed AGIEval-translate-hu-MCQ-lsat-rc.json to hu-MCQ-lsat-rc.json
Renamed AGIEval-translate-hu-MCQ-aqua-rat.json to hu-MCQ-aqua-rat.json
Renamed AGIEval-translate-hu-MCQ-sat-en.json to hu-MCQ-sat-en.json
Renamed AGIEval-translate-hu-MCQ-sat-math.json to hu-MCQ-sat-math.json
Renamed AGIEval-translate-hu-MCQ-sat-hu-without-passage.json to hu-MCQ-sat-hu-without-passage.json
Renamed AGIEval-translate-hu-MCQ-lsat-ar.json to hu-MCQ-lsat-ar.json
Renamed AGIEval-translate-hu-MCQ-lsat-lr.json to hu-MCQ-lsat-lr.json
