In [1]:
import os
import json
import xml.etree.ElementTree as ET

def extract_tableau_data(file_path):
    """Collect datasource, calculation, parameter and worksheet details from an XML file.

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A dict with four list-valued keys:

        * ``'datasources'``  - one entry per ``datasource`` element, holding
          its ``name``, ``caption`` and the attribute dicts of every nested
          ``connection`` element.
        * ``'calculations'`` - one entry per ``column`` element that carries
          ``caption``, ``datatype`` and ``name`` attributes and has
          ``type='calculated'``.
        * ``'parameters'``   - one entry per ``column`` element with
          ``class='parameter'``.
        * ``'worksheets'``   - one entry per ``worksheet`` element, with the
          ``type`` and nested ``encoding`` attributes of each ``mark``.

        Attributes missing from an element are reported as ``None``; a column
        without a nested ``calculation`` element gets ``''`` as its formula.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Prefix map handed to every lookup; none of the paths below actually use
    # the 't' prefix, so it only matters if the paths are changed.
    ns = {'t': 'http://tableausoftware.com/xml/user'}
    root = ET.parse(file_path).getroot()

    def formula_of(node):
        # Formula text of the first nested <calculation>, or '' when absent.
        calc = node.find('.//calculation', ns)
        return '' if calc is None else calc.get('formula')

    datasources = [
        {
            'name': source.get('name'),
            'caption': source.get('caption'),
            'connections': [
                link.attrib for link in source.findall('.//connection', ns)
            ],
        }
        for source in root.findall('.//datasource', ns)
    ]

    calculated_path = ".//column[@caption][@datatype][@name][@type='calculated']"
    calculations = [
        {
            'name': col.get('name'),
            'caption': col.get('caption'),
            'datatype': col.get('datatype'),
            'formula': formula_of(col),
        }
        for col in root.findall(calculated_path, ns)
    ]

    parameters = [
        {
            'name': col.get('name'),
            'datatype': col.get('datatype'),
            'caption': col.get('caption'),
            'formula': formula_of(col),
        }
        for col in root.findall(".//column[@class='parameter']", ns)
    ]

    worksheets = [
        {
            'name': sheet.get('name'),
            'views': [
                {
                    'type': shape.get('type'),
                    'encodings': [
                        channel.attrib
                        for channel in shape.findall('.//encoding', ns)
                    ],
                }
                for shape in sheet.findall('.//mark', ns)
            ],
        }
        for sheet in root.findall('.//worksheet', ns)
    ]

    return {
        'datasources': datasources,
        'calculations': calculations,
        'parameters': parameters,
        'worksheets': worksheets,
    }

def read_xml_files_and_extract(input_folder, output_folder):
    """Convert every ``.twb``/``.xml`` file in a folder to a JSON summary.

    Each matching file in ``input_folder`` is passed to
    ``extract_tableau_data`` and the result is written to ``output_folder``
    under the same base name with a ``.json`` extension. A failure on one
    file is printed and does not stop the remaining files.

    Args:
        input_folder: Directory to scan (not recursive).
        output_folder: Directory for the JSON files; created if missing.

    Returns:
        None.

    Raises:
        OSError: If ``input_folder`` cannot be listed or ``output_folder``
            cannot be created.
    """
    # Do not rely on the caller having created the destination: without it
    # every single file would fail at open() time.
    os.makedirs(output_folder, exist_ok=True)

    # Match extensions case-insensitively, ignore directories that merely
    # look like XML files, and sort so the processing order is deterministic.
    xml_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith(('.twb', '.xml'))
        and os.path.isfile(os.path.join(input_folder, f))
    )
    for xml_file in xml_files:
        input_path = os.path.join(input_folder, xml_file)
        output_file_name = os.path.splitext(xml_file)[0] + '.json'
        output_path = os.path.join(output_folder, output_file_name)

        # Per-file boundary: report the problem and carry on with the batch.
        try:
            tableau_data = extract_tableau_data(input_path)
            # Save extracted data to JSON file
            with open(output_path, 'w', encoding='utf-8') as outfile:
                json.dump(tableau_data, outfile, ensure_ascii=False, indent=4)
            print(f"Processed '{xml_file}' and saved to '{output_file_name}'.")
        except Exception as e:
            print(f"Error processing '{xml_file}': {e}")

if __name__ == "__main__":
    # Source and destination folders; replace both with your own paths.
    input_folder, output_folder = 'Tableau Reports', 'JSON'

    # Make sure the destination exists before any JSON is written.
    os.makedirs(output_folder, exist_ok=True)

    # Convert every workbook found in the input folder.
    read_xml_files_and_extract(input_folder, output_folder)

Error processing 'User Trending S-R1.xml': syntax error: line 1, column 0
