In [1]:
import xml.etree.ElementTree as ET
# Parse the XML export from the working directory; keep both the tree and its root
# element around because the remaining cells navigate from `root`.
XML_EXPORT_PATH = 'informatica_komahar.XML'
tree = ET.parse(XML_EXPORT_PATH)
root = tree.getroot()

In [2]:
# Show the root element's repr first, then its tag followed by its attribute dict.
print(root)
print(f"{root.tag} {root.attrib}")

<Element 'POWERMART' at 0x0000017CFCF1C540>
POWERMART {'CREATION_DATE': '04/24/2023 15:09:24', 'REPOSITORY_VERSION': '184.93'}


In [3]:
# Walk the root's direct children, printing each tag with its attribute dict.
for top_level_element in root:
    print(f"{top_level_element.tag} {top_level_element.attrib}")

REPOSITORY {'NAME': 'PowerCenterProd2', 'VERSION': '184', 'CODEPAGE': 'MS1252', 'DATABASETYPE': 'Microsoft SQL Server'}


In [4]:
# NAME attribute of the first element nested under the root's first child.
# Indexing `.attrib` directly keeps the KeyError if the attribute is absent.
first_child = root[0]
first_child[0].attrib['NAME']

'AgencyShared'

In [5]:
# Dump the attribute dict of every FOLDER element found while iterating the tree.
for folder_attributes in (folder_el.attrib for folder_el in root.iter('FOLDER')):
    print(folder_attributes)

{'NAME': 'AgencyShared', 'GROUP': '', 'OWNER': 'Administrator', 'SHARED': 'SHARED', 'DESCRIPTION': '', 'PERMISSIONS': 'rwx------', 'UUID': 'a2c28a96-3bd7-4911-a28e-45850c9e195d'}
{'NAME': 'AgencyDataMart', 'GROUP': '', 'OWNER': 'Administrator', 'SHARED': 'NOTSHARED', 'DESCRIPTION': '', 'PERMISSIONS': 'rwx------', 'UUID': '05dccfc6-e007-4b2e-bb03-e60ee90b2330'}
{'NAME': 'aaaWBMIShared', 'GROUP': '', 'OWNER': 'Administrator', 'SHARED': 'SHARED', 'DESCRIPTION': '', 'PERMISSIONS': 'rwx------', 'UUID': '5b9756f6-b675-4c3c-a3f3-6dfaf9123095'}


In [3]:
# Count the MAPPING elements under each FOLDER; the list index is the folder's
# position in iteration order.
mappings_per_folder = [
    sum(1 for _ in folder_el.iter('MAPPING'))
    for folder_el in root.iter('FOLDER')
]

for i, no_of_mapping in enumerate(mappings_per_folder):
    print(f"No. of mappings in Folder {i} : {no_of_mapping}")

# Totals are derived from the per-folder counts gathered above.
no_of_folders = len(mappings_per_folder)
total_no_mappings = sum(mappings_per_folder)

print(f"\nNo. of Folders = {no_of_folders}")
print(f"Total no. of Mappings = {total_no_mappings}")

No. of mappings in Folder 0 : 0
No. of mappings in Folder 1 : 7
No. of mappings in Folder 2 : 3

No. of Folders = 3
Total no. of Mappings = 10


In [15]:
# Locate every FOLDER element once and count each folder's MAPPING descendants once.
# All reported figures are derived from these two lists, so the per-folder lines and
# the totals always describe the same set of folders and the tree is not re-searched.
folders = root.findall('.//FOLDER')
mapping_counts = [len(folder_el.findall('.//MAPPING')) for folder_el in folders]

no_of_folders = len(folders)
total_no_mappings = sum(mapping_counts)

for i, mapping_count in enumerate(mapping_counts):
    print(f"No. of mappings in Folder {i} : {mapping_count}")

print(f"\nNo. of Folders = {no_of_folders}")
print(f"Total no. of Mappings = {total_no_mappings}")

No. of mappings in Folder 0 : 0
No. of mappings in Folder 1 : 7
No. of mappings in Folder 2 : 3

No. of Folders = 3
Total no. of Mappings = 10


In [16]:
# Build a dict keyed by mapping NAME. Each entry records the mapping's name, the NAME
# of the FOLDER it was found under, and its INSTANCE elements ("tasks") keyed by
# instance NAME.
# NOTE(review): entries are keyed by mapping name alone, so a mapping name that occurs
# in more than one folder would keep only the last one seen -- confirm names are unique.
model_basic = {}

for folder_el in root.iter('FOLDER'):
    parent_folder = folder_el.get('NAME')

    for mapping_el in folder_el.iter('MAPPING'):
        mapping_name = mapping_el.get('NAME')

        tasks = {}
        for instance_el in mapping_el.iter('INSTANCE'):
            instance_name = instance_el.get('NAME')
            tasks[instance_name] = {
                'name': instance_name,
                'task_type': instance_el.get('TRANSFORMATION_TYPE'),
                'transformation_name': instance_el.get('TRANSFORMATION_NAME'),
            }

        model_basic[mapping_name] = {
            'name': mapping_name,
            'parent_folder': parent_folder,
            'tasks': tasks,
        }

print(model_basic)


{'m_AGY_DM_LOAD_AgencyRelationshipCurrent': {'name': 'm_AGY_DM_LOAD_AgencyRelationshipCurrent', 'parent_folder': 'AgencyDataMart', 'tasks': {'AgencyRelationshipCurrent': {'name': 'AgencyRelationshipCurrent', 'task_type': 'Target Definition', 'transformation_name': 'Shortcut_to_AgencyRelationshipCurrent'}, 'AgencyRelationshipDim': {'name': 'AgencyRelationshipDim', 'task_type': 'Source Definition', 'transformation_name': 'Shortcut_to_AgencyRelationshipDim'}, 'SQ_Shortcut_to_AgencyRelationshipDim': {'name': 'SQ_Shortcut_to_AgencyRelationshipDim', 'task_type': 'Source Qualifier', 'transformation_name': 'SQ_Shortcut_to_AgencyRelationshipDim'}, 'EXP_Standalone': {'name': 'EXP_Standalone', 'task_type': 'Expression', 'transformation_name': 'EXP_Standalone'}}}, 'm_AGY_DM_LOAD_AgencyRelationshipDim': {'name': 'm_AGY_DM_LOAD_AgencyRelationshipDim', 'parent_folder': 'AgencyDataMart', 'tasks': {'AgencyRelationshipDim_Insert': {'name': 'AgencyRelationshipDim_Insert', 'task_type': 'Target Definition'

In [5]:
import json
# Pretty-print the model as JSON with a 4-space indent.
pretty_model = json.dumps(model_basic, indent=4)
print(pretty_model)

{
    "m_AGY_DM_LOAD_AgencyRelationshipCurrent": {
        "name": "m_AGY_DM_LOAD_AgencyRelationshipCurrent",
        "parent_folder": "AgencyDataMart",
        "tasks": {
            "AgencyRelationshipCurrent": {
                "name": "AgencyRelationshipCurrent",
                "task_type": "Target Definition",
                "transformation_name": "Shortcut_to_AgencyRelationshipCurrent"
            },
            "AgencyRelationshipDim": {
                "name": "AgencyRelationshipDim",
                "task_type": "Source Definition",
                "transformation_name": "Shortcut_to_AgencyRelationshipDim"
            },
            "SQ_Shortcut_to_AgencyRelationshipDim": {
                "name": "SQ_Shortcut_to_AgencyRelationshipDim",
                "task_type": "Source Qualifier",
                "transformation_name": "SQ_Shortcut_to_AgencyRelationshipDim"
            },
            "EXP_Standalone": {
                "name": "EXP_Standalone",
                "task_type

In [17]:
# Render the model as JSON text with a 4-space indent.
json_object = json.dumps(model_basic, indent=4)

# Save that text as model_basic.json; mode "w" replaces any existing file.
# end="" stops print from appending a trailing newline to the file.
with open("model_basic.json", "w") as outfile:
    print(json_object, file=outfile, end="")

## Add transformation fields to tasks

In [20]:
# Rebuild the mapping model from scratch; each task now also carries a 'fields' list
# taken from the transformation it references.
model_basic = {}

for folder_el in root.iter('FOLDER'):
    parent_folder = folder_el.get('NAME')

    for mapping_el in folder_el.iter('MAPPING'):
        mapping_name = mapping_el.get('NAME')

        # Index the TRANSFORMATION elements found under this mapping:
        # transformation NAME -> list of {'name', 'data_type'} per TRANSFORMFIELD.
        fields_by_transformation = {}
        for transformation_el in mapping_el.iter('TRANSFORMATION'):
            fields_by_transformation[transformation_el.get('NAME')] = [
                {'name': field_el.get('NAME'), 'data_type': field_el.get('DATATYPE')}
                for field_el in transformation_el.iter('TRANSFORMFIELD')
            ]

        # An instance whose TRANSFORMATION_NAME is not in the index gets an empty list.
        # NOTE(review): in the captured output the Source/Target Definition instances
        # all have fields == [] -- confirm that is intended.
        tasks = {
            instance_el.get('NAME'): {
                'name': instance_el.get('NAME'),
                'task_type': instance_el.get('TRANSFORMATION_TYPE'),
                'transformation_name': instance_el.get('TRANSFORMATION_NAME'),
                'fields': fields_by_transformation.get(instance_el.get('TRANSFORMATION_NAME'), []),
            }
            for instance_el in mapping_el.iter('INSTANCE')
        }

        model_basic[mapping_name] = {
            'name': mapping_name,
            'parent_folder': parent_folder,
            'tasks': tasks,
        }

print(model_basic)

{'m_AGY_DM_LOAD_AgencyRelationshipCurrent': {'name': 'm_AGY_DM_LOAD_AgencyRelationshipCurrent', 'parent_folder': 'AgencyDataMart', 'tasks': {'AgencyRelationshipCurrent': {'name': 'AgencyRelationshipCurrent', 'task_type': 'Target Definition', 'transformation_name': 'Shortcut_to_AgencyRelationshipCurrent', 'fields': []}, 'AgencyRelationshipDim': {'name': 'AgencyRelationshipDim', 'task_type': 'Source Definition', 'transformation_name': 'Shortcut_to_AgencyRelationshipDim', 'fields': []}, 'SQ_Shortcut_to_AgencyRelationshipDim': {'name': 'SQ_Shortcut_to_AgencyRelationshipDim', 'task_type': 'Source Qualifier', 'transformation_name': 'SQ_Shortcut_to_AgencyRelationshipDim', 'fields': [{'name': 'EDWAgencyAKId', 'data_type': 'integer'}, {'name': 'EDWLegalPrimaryAgencyAKId', 'data_type': 'integer'}, {'name': 'AgencyRelationshipEffectiveDate', 'data_type': 'date/time'}, {'name': 'AgencyRelationshipExpirationDate', 'data_type': 'date/time'}]}, 'EXP_Standalone': {'name': 'EXP_Standalone', 'task_type'

In [21]:
# Serialize the current model to JSON text (4-space indent).
json_object = json.dumps(model_basic, indent=4)

# Overwrite model_basic.json with the serialized model.
with open("model_basic.json", "w") as outfile:
    outfile.writelines([json_object])