In [37]:
import json

In [38]:
# Path of the HOTTP JSON export that the loading cell reads.
input_file = './hottp.json'
# Path of the converted file that the writing cell at the end of the notebook produces.
output_file = 'hottp_updated_format.json'

In [39]:
# Parse the whole input file into memory; the conversion loop later reads the
# entries from data['HOTTP_Entries']['HOTTP_Entry'].
with open(input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [47]:
def find_value(data, text):
    """Depth-first search for the node labelled ``text``; return its content.

    A node is a dict whose "Text" entry equals ``text``. The value stored
    under its "Content" key is returned (None if the key is absent).
    Nested dicts and lists are searched in iteration order.

    Note on truthiness: when ``data`` itself is the matching node, its
    content is returned as-is. A match found deeper down is only reported
    when its content is truthy; an empty/falsy content makes the search
    continue with the remaining siblings, and None is returned if nothing
    truthy turns up.

    Args:
        data: Arbitrarily nested dicts/lists (anything else yields None).
        text: Label to compare against each dict's "Text" entry.

    Returns:
        The matching node's "Content" value, or None when nothing matched.
    """
    if isinstance(data, list):
        candidates = data
    elif isinstance(data, dict):
        if "Text" in data and data["Text"] == text:
            return data.get("Content")
        # A "Versions" entry holding a single dict is wrapped in a list so it
        # is walked the same way as a list of such dicts.
        candidates = (
            [child] if name == "Versions" and isinstance(child, dict) else child
            for name, child in data.items()
        )
    else:
        # Scalars (strings, numbers, None) cannot contain a node.
        return None

    for candidate in candidates:
        found = find_value(candidate, text)
        if found:
            return found
    return None

In [50]:
def find_source(data, key):
    """Return the value stored under ``key`` anywhere inside ``data``.

    The search is depth-first: a dict that contains ``key`` directly wins;
    otherwise its values (and the items of any list encountered) are
    searched in iteration order, matching on the dictionary key name at
    every level.

    Args:
        data: Arbitrarily nested dicts/lists (anything else yields None).
        key: Dictionary key to look for.

    Returns:
        The first value found under ``key``, or None when the key does not
        occur. A nested occurrence whose value is None is treated as
        "not found" and the search continues.
    """
    if isinstance(data, dict):
        if key in data:
            return data[key]
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Scalars cannot hold the key.
        return None

    for child in children:
        # Recurse with the same key-based lookup so a nested occurrence of
        # ``key`` is actually found by name.
        found = find_source(child, key)
        if found is not None:
            return found
    return None

In [51]:
def get_records(versions):
    """Build the record row from the first alternative in ``versions``.

    Only the first alternative contributes to the result, so the function
    returns as soon as it has been processed instead of searching every
    alternative and discarding the rest.

    Args:
        versions: The alternatives of one entry, normally a list of dicts.

    Returns:
        A five-element list ``[source, RSV, NEB, BJ, LUT]``: the value
        found by ``find_source(..., 'Source')`` followed by the
        ``find_value`` results for each translation label. Values that
        are not found are None. An empty ``versions`` yields ``[]``.
    """
    # NOTE(review): a lone alternative may arrive as a bare dict rather than a
    # one-element list (find_value guards the same shape for "Versions") --
    # confirm against the data.
    if isinstance(versions, dict):
        versions = [versions]

    for version in versions:
        record = [find_source(version, 'Source')]
        record.extend(find_value(version, label) for label in ('RSV', 'NEB', 'BJ', 'LUT'))
        # Deliberately return inside the loop: only the first alternative is used.
        return record
    return []

In [44]:
def convert_format(alignment):
    """Convert one HOTTP entry into the "alignment" output structure.

    Args:
        alignment: One entry dict. It must provide 'Id', 'Remark',
            'Suggestion', ``['References']['Reference']`` and
            ``['Alternatives']['Alternative']`` with at least two
            alternatives; the first alternative must provide 'Factors'
            and 'Literal'. 'Remark-FR', 'SuggestionFR' (on the entry) and
            'LiteralFR' (on the first alternative) are optional and
            default to ''.

    Returns:
        A dict with "format", "version" and a single translation group
        holding the document metadata, the role names and the record row
        produced by ``get_records``.

    Raises:
        KeyError: A required key is missing.
        IndexError: Fewer than two alternatives are present (the second
            one is stored under "Misc").
    """
    alternatives = alignment['Alternatives']['Alternative']
    records = get_records(alternatives)
    primary = alternatives[0]

    meta = {
        "Ids": alignment['References']['Reference'],
        "Remark": alignment['Remark'],
        "Remark-FR": alignment.get('Remark-FR', ''),
        "Suggestion": alignment['Suggestion'],
        "SuggestionFR": alignment.get('SuggestionFR', ''),
        "Factors": primary['Factors'],
        "Literal": primary['Literal'],
        # 'LiteralFR' sits next to 'Literal' on the alternative, so that is
        # where its presence has to be checked.
        "LiteralFR": primary.get('LiteralFR', ''),
        "Misc": alternatives[1],
    }

    new_format = {
        "format": "alignment",
        "version": "[add timestamp from filename or git commit if you have it]",
        "groups": [
            {
                "type": "translation",
                "documents": [
                    {
                        "scheme": "ws-token",
                        "docid": alignment['Id'],
                        "meta": meta,
                    },
                ],
                "roles": ["source", "RSV", "NEB", "BJ", "LUT"],
                "records": records,
            }
        ],
    }

    return new_format

In [45]:
import re

In [52]:
# Convert every entry and write the results as a single JSON array, one
# converted entry per line.
entries = data['HOTTP_Entries']['HOTTP_Entry']
# NOTE(review): a file with a single entry may hold a bare dict here instead
# of a list -- confirm against the data.
if isinstance(entries, dict):
    entries = [entries]

skipped = []  # (index, error) for every entry that could not be converted
wrote_any = False

# Mode 'w' (not 'a'): re-running the cell must replace the file rather than
# append a second array to it. The file is opened once, as UTF-8, because
# ensure_ascii=False emits non-ASCII characters verbatim.
with open(output_file, mode='w', encoding='utf-8') as file:
    file.write('[\n')
    for index, entry in enumerate(entries):
        try:
            line = json.dumps(convert_format(entry), ensure_ascii=False)
        except Exception as error:
            # Best effort: keep going, but remember what was dropped so the
            # loss is visible instead of silent.
            skipped.append((index, repr(error)))
            continue
        # Separator goes *between* elements; a comma after the last element
        # would make the file invalid JSON.
        if wrote_any:
            file.write(',\n')
        file.write(line)
        wrote_any = True
    file.write('\n]')

print('Wrote', len(entries) - len(skipped), 'of', len(entries), 'entries to', output_file)
if skipped:
    print('Skipped entries as (index, error), first 10 shown:', skipped[:10])