In [7]:
import ast
import csv
import json
import random

# Paths to the input JSON/CSV files and to the JSON file this script writes
json_file_path = "/home/imai.s/SLRT/CiCo/CLCL/ph_test.json"
csv_file_path = "bt_score_data.csv"
output_path = "sampled_ph_test_with_german.json"

# Upper bound on the number of folders written to the output file
sample_size = 100

# Set the random seed for reproducibility
random.seed(42)

# Load the JSON data. The encoding is explicit so that reading does not
# depend on the platform's locale default.
with open(json_file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Map each folder name (the first element of every "original_video_dic"
# value) to its English transcription, looked up in "original_text_dic"
# under that same folder name.
text_by_folder = data["original_text_dic"]
folder_to_english = {
    entry[0]: text_by_folder.get(entry[0], "No English transcription found")
    for entry in data["original_video_dic"].values()
}

# Parse the CSV file and map folder names to German transcriptions.
# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
folder_to_german = {}
with open(csv_file_path, "r", encoding="utf-8", newline="") as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        # 'hyps' is expected to hold a Python literal collection of folder
        # names (e.g. "{'a', 'b'}"). ast.literal_eval only accepts literals,
        # so unlike eval() it cannot execute code embedded in the CSV file.
        hyp_folders = ast.literal_eval(row['hyps'])
        german_ref = row['refs']
        for folder in hyp_folders:
            # If a folder appears in several rows, the last row wins.
            folder_to_german[folder] = german_ref

# Combine English and German transcriptions; only folders that came from the
# JSON file are kept, with a placeholder when the CSV has no entry for them.
folder_to_transcriptions = {
    folder: {
        "english": english,
        "german": folder_to_german.get(folder, "No German transcription found"),
    }
    for folder, english in folder_to_english.items()
}

# Randomly sample up to `sample_size` items.
# random.sample needs a sequence: passing a dict view is deprecated since
# Python 3.9 and raises TypeError from 3.11, so materialize a list first.
# The size is clamped so a small dataset does not raise ValueError.
population = list(folder_to_transcriptions.items())
sampled_items = random.sample(population, min(sample_size, len(population)))

# Convert the sampled (folder, transcriptions) pairs back into a dictionary
result = dict(sampled_items)

# ensure_ascii=False keeps umlauts readable instead of \uXXXX escapes
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(result, output_file, indent=4, ensure_ascii=False)

# Print the result to confirm proper encoding
print(json.dumps(result, indent=4, ensure_ascii=False))


{
    "03July_2011_Sunday_tagesschau-1665": {
        "english": "In the other parts of the country, it usually gets friendly thanks to a high pressure zone that ranges from the Biscay to the Shetland Islands",
        "german": "in den übrigen landesteilen wird es meist freundlich dank einer hochdruckzone die von der biskaya bis zu den shetlandinseln reicht ."
    },
    "12July_2010_Monday_tagesschau-374": {
        "english": "Tomorrow there will be strong shower or thunderstorms in the east and southeast with a mix of sun and clouds",
        "german": "morgen gibt es im osten und südosten bei einer mischung aus sonne und wolken zum teil kräftige schauer oder gewitter ."
    },
    "02December_2009_Wednesday_tagesschau-4039": {
        "english": "And now the weather forecast for tomorrow Thursday",
        "german": "und nun die wettervorhersage für morgen donnerstag den dritten dezember ."
    },
    "11November_2010_Thursday_tagesschau-3564": {
        "english": "Sometimes heav