In [1]:
from bs4 import BeautifulSoup, Tag
import json
import os
import re
import base64

def recursive_extract(tag, depth=0):
    # Check if we've hit the bottom
#    if not tag.find_all('ul'):
#        key_value = re.split('[:|,]', tag.text.strip(), 1)
#        if len(key_value) == 2:
#            key = key_value[0].strip().replace(' ', '_')
#            return {key: key_value[1].strip()+"hi", 'depth': depth}
#        else:
#            key = key_value[0].strip().replace(' ', '_')
#            return {key: None, 'depth': depth}

    data = {}
    # Iterate over nested lists
    for li in tag.find_all('li', recursive=False):
        key_value = re.split('[:|,]', li.text.strip(), 1)
        key = key_value[0].strip().replace(' ', '_') if len(key_value) > 0 else ''
        ul = li.find('ul')
        if ul:
            data[key] = recursive_extract(ul, depth+1)
        elif len(key_value) == 2:
            data[key] = {'value': parse_hex_string(key_value[1].strip()), 'depth': depth}
        else:
            data[key] = {'value': None, 'depth': depth}

    return data

def html_to_json(html_file_path, json_file_path):
    # Check if HTML file exists
    if not os.path.isfile(html_file_path):
        print(f"The file {html_file_path} does not exist.")
        return

    with open(html_file_path, 'r') as file:
        # Parse HTML file content with BeautifulSoup
        soup = BeautifulSoup(file, 'html.parser')

    # The data dict will store the parsed HTML as JSON
    data = {}

    # Find all root-level <ul> tags and their content
    for ul in soup.body.find_all('ul',{"class": "ftstree"}, recursive=False):
        frame_key = re.split('[:|,]', ul.attrs.get('id', 'unknown frame'), 1)[0].strip().replace(' ', '_')
        data[frame_key] = recursive_extract(ul)

    # Write the JSON data to the output file
    with open(json_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    print(f"JSON file has been created at {json_file_path}")



def bytes_to_base64(bytes_data):
    return base64.b64encode(bytes_data).decode('utf-8')

def base64_to_bytes(base64_string):
    return base64.b64decode(base64_string.encode('utf-8'))


def parse_hex_string(hex_string):
    # Attempt to split string by comma
    parts = hex_string.rsplit(',', 1)

    text = parts[0]

    # If there is no comma-separated part, return text and None for data
    if len(parts) == 1 or not parts[1].strip():
        return {'text': text, 'data': None}

    data_text = parts[1]

    # Find all two-digit hex digits (in reverse order)
    data_text = re.findall(r'([a-fA-F0-9]{2})', data_text[::-1])

    # Convert to byte array
    data = bytes.fromhex(' '.join(data_text[::-1]))

    # Convert byte array to base64 string
    data_base64 = bytes_to_base64(data)

    # Return dict
    return {'text': text, 'data': data_base64}




In [2]:
html_filename='data/Capture-2023-06-24_154423524117300_subset.htm'
json_filename='data/Capture-2023-06-24_154423524117300_subset.json'

html_filename='data/le_capture_20230625_181228.html'
json_filename='data/le_capture_20230625_181228.json'
# Use the function
html_to_json(html_filename, json_filename)

JSON file has been created at data/le_capture_20230625_181228.json


In [32]:

html_filename='data/Capture-2023-06-24_154423524117300_with_summary.html'
json_filename='data/Capture-2023-06-24_154423524117300_with_summary.json'
# Use the function
html_to_json(html_filename, json_filename)

JSON file has been created at data/Capture-2023-06-24_154423524117300_with_summary.json


In [26]:
# Test the function
result = parse_hex_string("0xa89522,a8 95 22")
print(result)

result = parse_hex_string("0xa89522,")
print(result)

result = parse_hex_string("0xa89522")
print(result)

result = parse_hex_string("0xa89522, test")
print(result)

{'text': '0xa89522', 'data': b'\x8aY"'}
{'text': '0xa89522', 'data': None}
{'text': '0xa89522', 'data': None}
{'text': '0xa89522', 'data': b''}
