In [1]:
import json
from collections import defaultdict
from typing import List
import spacy
import os
from bs4 import BeautifulSoup

# Mapping table between DITA element names and the HTML elements they render to.
# The two lists appear to be positionally paired: entry i of "DITA Tags" goes with
# entry i of "HTML Tags". A DITA tag is repeated when it maps to several HTML tags
# (e.g. "title" -> <h1>..<h6>, <title>). An empty string in "HTML Tags" presumably
# marks a DITA tag without an HTML counterpart -- TODO confirm with the mapping owner.
data = {
    "DITA Tags": [
        "apiname", "b", "b", "body", "bodydiv", "chapter", "chdesc", "chdesc", "cmd", "cmd", "cmdname", "cmdname",
        "codeblock", "codeph", "coderef", "component", "conbody", "conbodydiv",
        "concept", "context", "copyrholder", "copyright", "copyryear", "data-about",
        "data", "data", "desc", "div", "dl", "dlentry", "dlentry", "dlhead", "dt", "entry", "entry",
        "glossAbbreviation", "glossBody", "glossPartOfSpeech", "glossdef", "glossentry",
        "glossgroup", "glossref", "glossterm", "hazardstatement", "hazardsymbol",
        "keyword", "keywords", "keywords", "mainbooktitle", "mainbooktitle", "map", "mathml", "mathmlref",
        "menucascade", "msgblock", "msgnum", "note", "note", "notices", "ol", "otherinfo",
        "p", "param", "param", "parameterentity", "parml", "parmname", "part", "ph", "platform",
        "postreq", "prelreqs", "prereq", "prodinfo", "prodname", "prognum", "prolog",
        "propdesc", "properties", "refbody", "refbodydiv", "reference", "resourceid",
        "section", "sectiondiv", "shortdesc", "simpletable", "stentry", "step",
        "stepresult", "steps-informal", "steps-unordered", "steps", "stepsection",
        "steptroubleshooting", "stepxmp", "substep", "substeps", "substeps", "synph",
        "systemoutput", "table", "task", "taskbody", "tasktroubleshooting", "tbody",
        "term", "tested", "title","title","title","title","title","title","title", "titlealts", "topic", "topicref", "topicset",
        "topicsetref", "topicsubject", "troublebody", "troubleshooting",
        "troubleSolution", "tt", "tt", "typeofhazard", "u", "uicontrol", "ul", "userinput",
        "var", "varname", "volume", "vrmlist", "vrmlist", "wintitle", "xmlpi", "xref"
    ],
    "HTML Tags": [
        "<code>", "<b>", "<strong>", "<body>", "<div>",
        "<div>", "<p>", "<div>", "<code>", "<kbd>", "<code>", "<span>",
        "<pre>", "<code>", "<a>", "<div>",
        "<div>", "<div>", "<article>", "<div>", "", "<span>",
        "", "<meta>", "<data>", "<div>", "<p>",
        "<div>", "<dl>", "<dt>", "<dd>", "<dt>", "<dt>", "<td>", "<th>", "<abbr>",
        "<p>", "<span>", "<p>", "<section>", "<div>", "<a>", "<dt>", "<div>",
        "<img>", "<span>", "<meta>", "<span>", "<h1>", "<title>", "",
        "<math>", "<a>", "<span>", "<div>",
        "<span>", "<aside>", "<div>", "<div>", "<ol>", "<div>", "<p>", "<var>", "<code>",
        "", "<dl>", "<dt>", "<div>", "<span>", "<span>",
        "<p>", "<p>", "<p>", "<div>", "<span>", "<span>", "", "<p>",
        "<div>", "<div>", "<div>", "<section>", "", "<section>", "<div>",
        "<p>", "<table>", "<td>", "<li>", "<p>", "<ul>", "<ul>", "<ol>", "<section>",
        "<div>", "<pre>", "<li>", "<ol>","<ul>", "<code>", "<output>", "<table>",
        "<section>", "<div>", "<div>", "<tbody>", "<dt>", "<span>", "<h1>","<h2>","<h3>","<h4>","<h5>","<h6>","<title>",
        "<div>", "<article>", "<a>", "<div>", "<a>", "<div>", "<div>", "<div>", "<div>",
        "<tt>", "<code>", "<span>", "<u>", "<kbd>", "<ul>", "<kbd>", "<var>", "<var>",
        "<span>", "<ol>", "<ul>", "<title>", "", "<a>"
    ]
}

# Every derived column below is sized to the number of DITA rows.
total_rows = len(data['DITA Tags'])

# Scalar defaults are immutable, so list multiplication is safe for them.
data['Attribute'] = ['ai-intent'] * total_rows
data['ai_intent_value'] = [1] * total_rows
data['verbs'] = [''] * total_rows
data['pos'] = [''] * total_rows

# List-valued columns need one *distinct* list per row. `[[]] * n` would repeat a
# reference to a single list, so appending to one row would change all rows.
data['html_files'] = [[] for _ in range(total_rows)]
data['nouns'] = [[] for _ in range(total_rows)]

In [2]:
# spaCy pipeline that extract_entities uses to read part-of-speech tags and named entities.
nlp = spacy.load("en_core_web_sm")

# Empty accumulator; any key that is looked up for the first time starts as an empty list.
tags_contents = defaultdict(list)

# HTML column of the mapping table with the blank ("") placeholders dropped.
# Duplicates are kept and the original order is preserved.
tags_to_extract = list(filter(lambda html_tag: html_tag != "", data['HTML Tags']))

def remove_angle_brackets(tags):
    """Strip surrounding whitespace and angle brackets from tag strings.

    Args:
        tags: A string such as ``"<div>"``, a (possibly nested) list of such
            values, or any other object.

    Returns:
        For a string: the text with outer whitespace removed and then every
        leading/trailing ``<`` or ``>`` character removed. For a list: a new
        list with the same rule applied to each element recursively. Any other
        value (including tuples) is returned unchanged.
    """
    if isinstance(tags, str):
        # Whitespace goes first so that brackets hidden behind padding are reached.
        return tags.strip().strip('<>')

    if not isinstance(tags, list):
        return tags

    stripped = []
    for item in tags:
        stripped.append(remove_angle_brackets(item))
    return stripped

# Bare element names ("div", "code", ...) derived from tags_to_extract; fetch_tags
# checks tag.name against this list. Duplicates from the mapping table are kept.
cleaned_HTML_Tags = remove_angle_brackets(tags_to_extract)

def extract_entities(sentence, area):
    """Run the module-level ``nlp`` pipeline on ``sentence`` and pull out one kind of item.

    Args:
        sentence: Text handed to ``nlp``.
        area: ``"VERB"`` or ``"NOUN"`` to select tokens by part of speech; any
            other value selects named entities. The comparison is case-sensitive.

    Returns:
        A list of token texts when ``area`` is ``"VERB"`` or ``"NOUN"``;
        otherwise a list of ``(entity text, entity label)`` tuples.
    """
    parsed = nlp(sentence)

    # Both part-of-speech modes share one filter: keep tokens whose tag equals `area`.
    if area in ("VERB", "NOUN"):
        return [word.text for word in parsed if word.pos_ == area]

    # Fallback for every other `area` value: named entities with their labels.
    return [(span.text, span.label_) for span in parsed.ents]
    
    
def fetch_tags(tag: BeautifulSoup, all_tags: List):
    """Collect every ``ai-intent``-annotated element at or below ``tag``.

    The tree is walked depth-first in document order. An element is recorded
    when all of the following hold:

    * it carries an ``ai-intent`` attribute,
    * its name is listed in the module-level ``cleaned_HTML_Tags``,
    * its inner markup is not empty after stripping whitespace.

    Children are visited whether or not the parent itself was recorded, so a
    recorded element's content may contain markup of nested elements that are
    recorded again on their own.

    Args:
        tag: Node to start from (the parsed document or any element inside it;
            the recursion passes child elements through this same parameter).
        all_tags: List extended in place with dicts of the form
            ``{'tag': name, 'content': inner markup, 'ai-intent': attribute value}``.

    Returns:
        None. Results are delivered through ``all_tags``.
    """
    if tag.has_attr('ai-intent') and tag.name in cleaned_HTML_Tags:
        # Serialise the subtree once and reuse it for the blank check and the
        # stored content; doing it twice repeats the work for every match.
        inner_html = tag.decode_contents()
        if inner_html.strip() != "":
            all_tags.append({
                'tag': tag.name,
                'content': inner_html,
                'ai-intent': tag['ai-intent'],
            })

    # recursive=False yields direct child elements only; deeper levels are
    # reached through the recursive call, so each element is visited once.
    for child in tag.find_all(recursive=False):
        fetch_tags(child, all_tags)

In [3]:
def extract_content_from_html(file_path):
    """Parse one HTML file and group the content of its annotated elements by tag name.

    Args:
        file_path: Path of the HTML file; it is read as UTF-8.

    Returns:
        A ``defaultdict(list)`` mapping each tag name reported by ``fetch_tags``
        to the list of that tag's content strings, in the order they were found.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle, 'html.parser')
        found = []
        fetch_tags(soup, found)

    # Bucket the flat records by element name; the 'ai-intent' value is not carried over.
    contents_by_tag = defaultdict(list)
    for record in found:
        contents_by_tag[record['tag']].append(record['content'])
    return contents_by_tag

def process_html_files_recursively(folder_path, output_path='tags_contents.json'):
    """Extract annotated content from every ``.html`` file under a folder and save it as JSON.

    The folder is walked recursively. For each file whose name ends in
    ``.html`` (case-sensitive) one record is produced:
    ``{'file_name': <base name>, 'file_contents': <result of extract_content_from_html>}``.
    Only the base name is stored, so files with the same name in different
    sub-folders cannot be told apart in the output.

    Args:
        folder_path: Root folder to scan. A folder that does not exist yields
            no files, so an empty JSON list is written.
        output_path: Destination of the JSON file. Defaults to
            ``'tags_contents.json'``, resolved against the current working
            directory. An existing file is overwritten.

    Returns:
        None. The result is the JSON file written to ``output_path``.
    """
    html_files = []

    for root, dirs, files in os.walk(folder_path):
        # os.walk reports entries in arbitrary filesystem order. Sorting `dirs`
        # in place also fixes the order in which sub-folders are descended into,
        # so repeated runs produce an identical JSON file.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.html'):
                continue
            file_path = os.path.join(root, file_name)
            html_files.append({
                'file_name': file_name,
                'file_contents': extract_content_from_html(file_path),
            })

    # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 output file.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(html_files, json_file, indent=4, ensure_ascii=False)
                

# The input folder is resolved against the current working directory, so the kernel
# must be started from the directory that contains 'Aurigo_HTML_Files'.
folder_path = os.path.join(os.getcwd(), 'Aurigo_HTML_Files')

# Scans the folder tree and writes the extracted content to 'tags_contents.json'.
process_html_files_recursively(folder_path)