In [111]:
from bs4 import BeautifulSoup
import re

In [112]:
def split_tag(tag):
  """Split a tag to get the name of the actual tag"""
  try:
    split_tag = lambda tag, point : tag.split(point)
    point_list = ['#', '.', ' ', '\n', '\t']
    finder = [9999,'']
    for index, p in enumerate(point_list):
      pos = tag.find(p)
      if pos < finder[0] and pos > 0:
        finder[0] = pos
        finder[1] = p
    return tag.split(finder[1], 1)[0]
  except:
    return tag

In [113]:
def get_class_id(element, type):
    """Returns class or id attribute in proper HTML form"""
    attr = 'class' if type == '\.' else 'id'
    matches = re.search(f"({type}+\w+\S+)", element)
    if matches is not None:
        split_del = '.' if type =='\.' else type
        attrs = matches.group(0).split(split_del)
        new_element = element.replace(matches.group(0), '')
        return f"{attr}=\"{' '.join(attrs[1:])}\"", new_element
    return None, element

def get_attributes(element: str):
    """
    Get the attributes of the elements.
    1. Get classes
    2. Get id
    3. Other attrs
    """
    combined_attrs = ''
    classes, new_el = get_class_id(element, '\.')
    id, _ = get_class_id(new_el, '#')

    if classes is not None:
        combined_attrs += classes

    if id is not None:
        combined_attrs += id

    if '(' in element:
        combined_attrs += element[element.find('(')+1:element.rfind(')')]

    return combined_attrs

In [114]:
def get_content(element):
    """
    Get the main content of the element.
    The text should be between quotation marks.
    e.g. 
        :div "I am a content"
        <div>I am a content</div>
    """
    matches = re.search('"(.*?)"', element)
    if matches is not None:
        return matches.group(0).replace('"','')
    return ""

In [115]:
def get_childs(pcml: list, i: int, t: int, end="") -> str:
  """
  Recursive function that gets the child elements from a starting position
  i = ith element
  """
  if i >= len(pcml):
    return ""

  tabs_count = pcml[i].count('\t')
  attrs = get_attributes(pcml[i])
  attrs = '' if attrs == '' else ' ' + attrs
  tag = split_tag(pcml[i]) # element tag, e.g. div
  content = get_content(pcml[i]) # content of the element

  if tabs_count > t or i > len(pcml):
    h = get_childs(pcml, i+1, tabs_count, f"</{tag}>")
    return f"<{tag}{attrs}>{content}{h}{end}"
  elif tabs_count == t:
    h = get_childs(pcml, i+1, tabs_count, end)
    return f"<{tag}{attrs}>{content}</{tag}>{h}"
  elif tabs_count < t:
    h = get_childs(pcml, i+1, tabs_count) if i+1 <= len(pcml) else ""
    return f"<{tag}{attrs}>{content}</{tag}>{end}{h}"
  else:
    return ""

In [116]:
def tokenise(pcml: str) -> list:
    return pcml.replace('  ', '\t').split(':')

def get_pretty_html(pcml: str) -> str:
    pcml = get_childs(tokenise(pcml), 1, 0)
    return BeautifulSoup(pcml, 'html.parser').prettify()

In [117]:
test_string = '''
:div
:main.vmainClass1.vmainClass2 (type="input" outlined)
  :div#main-container
    :div#keyboard-shortcut.class2
    :div "I am div content"
    :div "Another content"
  :div.form-container
    :form method=post
  :v-icon
    :v-con'''

print(get_pretty_html(test_string))

<div>
</div>
<main class="vmainClass1 vmainClass2" outlined="" type="input">
 input
 <div id="main-container">
  <div class="class2" id="keyboard-shortcut">
  </div>
  <div>
   I am div content
  </div>
  <div>
   Another content
  </div>
 </div>
 <div class="form-container">
  <form>
  </form>
 </div>
 <v-icon>
  <v-con>
  </v-con>
 </v-icon>
</main>


In [110]:
test_string = '''
:div.vmain (outlined hello)
    :div "No"
    :span "Hello"'''

print(get_pretty_html(test_string))

<div class="vmain" hello="" outlined="">
 <div>
  No
 </div>
 <span>
  Hello
 </span>
</div>
