In [None]:
def post_process(file):
    with open(file, "r", encoding="utf-8") as f:
        content = f.readlines()

    list_start = []
    for idx, line in enumerate(content):
        if "|Col1|VIETTEL AI RACE" in line:
            list_start.append(idx)
    to_remove = set()
    for s in list_start:
        for j in (s, s + 1, s + 2):
            if 0 <= j < len(content):
                to_remove.add(j)

    content = [line for i, line in enumerate(content) if i not in to_remove]
    new_text = "\n".join(content)
    return new_text

In [95]:
import re
import pandas as pd
from io import StringIO

# --------------------------------------
# Normalize text within each table cell
# --------------------------------------
def normalize_text(s: str) -> str:
    if s is None:
        return ""
    s = str(s)
    # Convert <br> to spaces
    s = re.sub(r'<br\s*/?>', ' ', s, flags=re.IGNORECASE)
    # Remove multiple spaces
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# ---------------------------------------------------------
# Automatically detect the header row of a markdown table
# ---------------------------------------------------------
def detect_header(lines: list):
    """
    Pick the header row based on:
    - First non-empty line containing multiple | delimeters
    """
    for i, line in enumerate(lines):
        if "|" in line and not re.match(r'^\s*\|\s*-', line):  # skip separator lines
            return i
    return 0


# ---------------------------------------------------------
# Convert markdown-like table to READOC-friendly HTML table
# ---------------------------------------------------------
def convert_markdown_to_html_table(md_table: str) -> str:
    # Normalize the incoming text lines
    lines = [l.strip() for l in md_table.strip().split("\n") if l.strip()]

    # Detect header row
    header_idx = detect_header(lines)
    header_line = lines[header_idx]
    # Clean header formatting (remove **bold**)
    clean_header = re.sub(r'\*{2}(.*?)\*{2}', r'\1', header_line)

    # Extract header names
    header_parts = [normalize_text(col) for col in clean_header.split("|") if col.strip()]

    # Extract body rows (skip header + separator)
    body_lines = [
        l for l in lines[header_idx + 1:]
        if "|" in l and not re.match(r'^\s*\|\s*-', l)
    ]

    # Build CSV-like temporary structure for pandas
    csv_str = ";;".join(header_parts) + "\n"
    for row in body_lines:
        cols = [normalize_text(c) for c in row.split("|") if c.strip()]
        # if len(cols) > len(header_parts):
            # cols = cols[:len(header_parts)]
        csv_str += ";;".join(cols) + "\n"

    # Convert to DataFrame

    try:
        df = pd.read_csv(StringIO(csv_str), sep=";;")
    except Exception as e:
        print("Error parsing table:", e)
        print("header_parts:", header_parts)
        print(csv_str)

    # Final normalization
    for col in df.columns:
        df[col] = df[col].apply(normalize_text)

    # ---------------------------
    # Construct HTML output
    # ---------------------------
    html = [
        '<table>',
        '<colgroup><col/><col/><col/><col/></colgroup>',
        '<tbody>',
        '<tr>'
    ]

    # Headers
    for col in df.columns:
        html.append(f'<td><strong>{col}</strong></td>')
    html.append('</tr>')

    # Rows
    for _, row in df.iterrows():
        html.append('<tr>')
        for col in df.columns:
            html.append(f'<td>{row[col]}</td>')
        html.append('</tr>')

    html.append('</tbody></table>')

    return "\n".join(html)


# --------------------- Example Usage ---------------------

input_table = """
|Col1|Col2|Col3|Col4|
|---|---|---|---|
|**Enterprise**<br> <br>|T1562|Impair Defenses|JumbledPath can impair logging<br>on all devices used along its<br>connection path to compromised<br>hosts.|
|**Enterprise**<br> <br>|T1070|Indicator<br>Removal: Clear<br>Linux or Mac<br>System Logs|JumbledPath can clear logs on all<br>devices used along its connection<br>path to compromised network<br>infrastructure.|
|**Enterprise**<br> <br>|T1104|Multi-Stage<br>Channels|JumbledPath can communicate<br>over a unique series of connections<br>to send and retrieve data from<br>exploited devices.|
|**Enterprise**<br> <br>|T1040|Network Sniffing|JumbledPath has the ability to<br>perform packet capture on remote<br>devices via actor-defined jump-<br>hosts.|
"""

html_output = convert_markdown_to_html_table(input_table)
print(html_output)


<table>
<colgroup><col/><col/><col/><col/></colgroup>
<tbody>
<tr>
<td><strong>Col1</strong></td>
<td><strong>Col2</strong></td>
<td><strong>Col3</strong></td>
<td><strong>Col4</strong></td>
</tr>
<tr>
<td>**Enterprise**</td>
<td>T1562</td>
<td>Impair Defenses</td>
<td>JumbledPath can impair logging on all devices used along its connection path to compromised hosts.</td>
</tr>
<tr>
<td>**Enterprise**</td>
<td>T1070</td>
<td>Indicator Removal: Clear Linux or Mac System Logs</td>
<td>JumbledPath can clear logs on all devices used along its connection path to compromised network infrastructure.</td>
</tr>
<tr>
<td>**Enterprise**</td>
<td>T1104</td>
<td>Multi-Stage Channels</td>
<td>JumbledPath can communicate over a unique series of connections to send and retrieve data from exploited devices.</td>
</tr>
<tr>
<td>**Enterprise**</td>
<td>T1040</td>
<td>Network Sniffing</td>
<td>JumbledPath has the ability to perform packet capture on remote devices via actor-defined jump- hosts.</td>
</tr>

  df = pd.read_csv(StringIO(csv_str), sep=";;")


In [None]:
file = "./output/var_train_258/Public_258.md"
md = post_process(file)

In [59]:
def detect_table_and_convert_html(md):
    md_blocks = md.split("\n\n\n\n")
    md_blocks = [i for i in md_blocks if i.strip() != '']
    out_blocks = []
    for block in md_blocks:
        lines = [l.strip() for l in block.strip().split("\n") if l.strip()]
        first_line = lines[0]
        # consider it a table header only if it contains multiple pipes (>=3)
        if first_line.count("|") >= 3 and not re.match(r'^\s*\|\s*-', first_line):
            # print(block)
            html_output = convert_markdown_to_html_table(block)
            out_blocks.append(html_output)
        else:
            out_blocks.append(block)

    out_text = "\n".join(out_blocks)
    return out_text

    # print(f"Detected header line: {header_line}", header_idx)

In [96]:
import os
list_orin_file = "output/var_test_pymu"
list_post_file = list_orin_file+"_post"
os.makedirs(list_post_file, exist_ok=True)
for file in os.listdir(list_orin_file):
    if file.endswith(".md"):
        print(f"Processing file: {file}")
        orin_path = os.path.join(list_orin_file, file)
        post_path = os.path.join(list_post_file, file)
        md = post_process(orin_path)
        md = detect_table_and_convert_html(md)
        with open(post_path, "w", encoding="utf-8") as f:
            f.write(md)

Processing file: Public_299.md
Processing file: Public_289.md
Processing file: Public_288.md
Processing file: Public_320.md
Processing file: Public_371.md
Processing file: Public_382.md
Processing file: Public_293.md
Processing file: Public_355.md
Processing file: Public_361.md
Processing file: Public_341.md
Processing file: Public_375.md


  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), se

Processing file: Public_297.md
Processing file: Public_396.md
Processing file: Public_287.md
Processing file: Public_351.md
Processing file: Public_300.md
Processing file: Public_340.md
Processing file: Public_374.md
Processing file: Public_296.md
Processing file: Public_286.md
Processing file: Public_364.md
Processing file: Public_301.md
Processing file: Public_370.md
Processing file: Public_292.md
Processing file: Public_354.md
Processing file: Public_360.md
Processing file: Public_295.md
Processing file: Public_326.md
Processing file: Public_285.md


  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), se

Processing file: Public_302.md
Processing file: Public_353.md
Processing file: Public_291.md
Processing file: Public_373.md
Processing file: Public_357.md
Processing file: Public_363.md
Processing file: Public_290.md
Processing file: Public_372.md
Processing file: Public_356.md
Processing file: Public_362.md
Processing file: Public_294.md
Processing file: Public_327.md
Processing file: Public_366.md
Processing file: Public_352.md
Processing file: Public_328.md
Processing file: Public_338.md
Processing file: Public_369.md
Processing file: Public_339.md
Processing file: Public_368.md
Processing file: Public_319.md
Processing file: Public_358.md


  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), sep=";;")
  df = pd.read_csv(StringIO(csv_str), se