In [37]:
import pandas as pd

# Lookup table pairing DITA element names with HTML tags.
# The two lists are positional: entry i of "HTML Tags" belongs to entry i of
# "DITA Tags" (pd.DataFrame(data) below turns each pair into one row).
# An "HTML Tags" entry is one of:
#   - a single tag string such as "<code>",
#   - a list of several tag strings such as ["<b>", "<strong>"],
#   - "" where no HTML tag is given for the DITA element.
data = {
    # DITA element names, one per row.
    "DITA Tags": [
        "apiname", "b", "body", "bodydiv", "chapter", "chdesc", "cmd", "cmdname", 
        "codeblock", "codeph", "coderef", "component", "conbody", "conbodydiv", 
        "concept", "context", "copyrholder", "copyright", "copyryear", "data-about", 
        "data", "desc", "div", "dl", "dlentry", "dlhead", "dt", "entry", 
        "glossAbbreviation", "glossBody", "glossPartOfSpeech", "glossdef", "glossentry", 
        "glossgroup", "glossref", "glossterm", "hazardstatement", "hazardsymbol", 
        "keyword", "keywords", "mainbooktitle", "map", "mathml", "mathmlref", 
        "menucascade", "msgblock", "msgnum", "note", "notices", "ol", "otherinfo", 
        "p", "param", "parameterentity", "parml", "parmname", "part", "ph", "platform", 
        "postreq", "prelreqs", "prereq", "prodinfo", "prodname", "prognum", "prolog", 
        "propdesc", "properties", "refbody", "refbodydiv", "reference", "resourceid", 
        "section", "sectiondiv", "shortdesc", "simpletable", "stentry", "step", 
        "stepresult", "steps-informal", "steps-unordered", "steps", "stepsection", 
        "steptroubleshooting", "stepxmp", "substep", "substeps", "synph", 
        "systemoutput", "table", "task", "taskbody", "tasktroubleshooting", "tbody", 
        "term", "tested", "title", "titlealts", "topic", "topicref", "topicset", 
        "topicsetref", "topicsubject", "troublebody", "troubleshooting", 
        "troubleSolution", "tt", "typeofhazard", "u", "uicontrol", "ul", "userinput", 
        "var", "varname", "volume", "vrmlist", "wintitle", "xmlpi", "xref"
    ],
    # HTML tag(s) for the DITA element at the same position above.
    "HTML Tags": [
        "<code>", ["<b>", "<strong>"], "<body>", "<div>", 
        "<div>", ["<p>", "<div>"], ["<code>", "<kbd>"], ["<code>", "<span>"], 
        "<pre>", "<code>", "<a>", "<div>", 
        "<div>", "<div>", "<article>", "<div>", "", "<span>", 
        "", "<meta>", ["<data>", "<div>"], "<p>", 
        "<div>", "<dl>", ["<dt>", "<dd>"], "<dt>", "<dt>", ["<td>", "<th>"], "<abbr>", 
        "<p>", "<span>", "<p>", "<section>", "<div>", "<a>", "<dt>", "<div>", 
        "<img>", "<span>", ["<meta>", "<span>"], ["<h1>", "<title>"], "", 
        "<math>", "<a>", "<span>", "<div>", 
        "<span>", ["<aside>", "<div>"], "<div>", "<ol>", "<div>", "<p>", ["<var>", "<code>"], 
        "", "<dl>", "<dt>", "<div>", "<span>", "<span>", 
        "<p>", "<p>", "<p>", "<div>", "<span>", "<span>", "", "<p>", 
        "<div>", "<div>", "<div>", "<section>", "", "<section>", "<div>", 
        "<p>", "<table>", "<td>", "<li>", "<p>", "<ul>", "<ul>", "<ol>", "<section>", 
        "<div>", "<pre>", "<li>", ["<ol>","<ul>"], "<code>", "<output>", "<table>", 
        "<section>", "<div>", "<div>", "<tbody>", "<dt>", "<span>", ["<h1>","<h2>","<h3>","<h4>","<h5>","<h6>","<title>"], 
        "<div>", "<article>", "<a>", "<div>", "<a>", "<div>", "<div>", "<div>", "<div>", 
        ["<tt>", "<code>"], "<span>", "<u>", "<kbd>", "<ul>", "<kbd>", "<var>", "<var>", 
        "<span>", ["<ol>", "<ul>"], "<title>", "", "<a>"
    ]
}

# One row per DITA/HTML pair from the lookup table.
df = pd.DataFrame(data)
num_rows = len(df)

# Add the bookkeeping columns in one step; every row starts with the same
# default (the literal 'ai-intent', the integer 1, or an empty string).
df = df.assign(
    Attribute=['ai-intent'] * num_rows,
    ai_intent_value=[1] * num_rows,
    html_files=[''] * num_rows,
    html_file_ids=[''] * num_rows,
    verbs=[''] * num_rows,
    nouns=[''] * num_rows,
    pos=[''] * num_rows,
)

df

Unnamed: 0,DITA Tags,HTML Tags,Attribute,ai_intent_value,html_files,html_file_ids,verbs,nouns,pos
0,apiname,<code>,ai-intent,1,,,,,
1,b,"[<b>, <strong>]",ai-intent,1,,,,,
2,body,<body>,ai-intent,1,,,,,
3,bodydiv,<div>,ai-intent,1,,,,,
4,chapter,<div>,ai-intent,1,,,,,
...,...,...,...,...,...,...,...,...,...
114,volume,<span>,ai-intent,1,,,,,
115,vrmlist,"[<ol>, <ul>]",ai-intent,1,,,,,
116,wintitle,<title>,ai-intent,1,,,,,
117,xmlpi,,ai-intent,1,,,,,


In [39]:
import spacy
import os
import json
from bs4 import BeautifulSoup

# Load the spaCy "en_core_web_sm" pipeline once; extract_verbs_entities() below
# calls this shared `nlp` object on every sentence it is given.
nlp = spacy.load("en_core_web_sm")

def extract_verbs_entities(sentence):
    """Run ``sentence`` through the shared ``nlp`` pipeline and pull out verbs, nouns and entities.

    Args:
        sentence: Text handed to ``nlp``.

    Returns:
        dict with three keys:
            'verbs'    -- ``token.text`` of every token whose ``pos_`` is "VERB"
            'nouns'    -- ``token.text`` of every token whose ``pos_`` is 'NOUN'
            'entities' -- ``(text, label_)`` tuple for every item in ``doc.ents``
    """
    parsed = nlp(sentence)

    # Single pass over the tokens, bucketing by the two POS tags of interest.
    words_by_pos = {"VERB": [], "NOUN": []}
    for tok in parsed:
        bucket = words_by_pos.get(tok.pos_)
        if bucket is not None:
            bucket.append(tok.text)

    named_entities = []
    for ent in parsed.ents:
        named_entities.append((ent.text, ent.label_))

    return {
        'verbs': words_by_pos["VERB"],
        'nouns': words_by_pos["NOUN"],
        'entities': named_entities,
    }


# Every non-empty "HTML Tags" entry, in row order. Both plain tag strings and
# lists of tags are kept; only the "" placeholders are dropped.
tags_to_extract = list(
    filter(lambda html_entry: html_entry != "", df['HTML Tags'].tolist())
)
tags_to_extract

['<code>',
 ['<b>', '<strong>'],
 '<body>',
 '<div>',
 '<div>',
 ['<p>', '<div>'],
 ['<code>', '<kbd>'],
 ['<code>', '<span>'],
 '<pre>',
 '<code>',
 '<a>',
 '<div>',
 '<div>',
 '<div>',
 '<article>',
 '<div>',
 '<span>',
 '<meta>',
 ['<data>', '<div>'],
 '<p>',
 '<div>',
 '<dl>',
 ['<dt>', '<dd>'],
 '<dt>',
 '<dt>',
 ['<td>', '<th>'],
 '<abbr>',
 '<p>',
 '<span>',
 '<p>',
 '<section>',
 '<div>',
 '<a>',
 '<dt>',
 '<div>',
 '<img>',
 '<span>',
 ['<meta>', '<span>'],
 ['<h1>', '<title>'],
 '<math>',
 '<a>',
 '<span>',
 '<div>',
 '<span>',
 ['<aside>', '<div>'],
 '<div>',
 '<ol>',
 '<div>',
 '<p>',
 ['<var>', '<code>'],
 '<dl>',
 '<dt>',
 '<div>',
 '<span>',
 '<span>',
 '<p>',
 '<p>',
 '<p>',
 '<div>',
 '<span>',
 '<span>',
 '<p>',
 '<div>',
 '<div>',
 '<div>',
 '<section>',
 '<section>',
 '<div>',
 '<p>',
 '<table>',
 '<td>',
 '<li>',
 '<p>',
 '<ul>',
 '<ul>',
 '<ol>',
 '<section>',
 '<div>',
 '<pre>',
 '<li>',
 ['<ol>', '<ul>'],
 '<code>',
 '<output>',
 '<table>',
 '<section>',
 '<div>

In [41]:
def extract_content_from_html(file_path):
    """Append the text found under each mapped HTML tag of one file to ``df['html_files']``.

    The file is parsed with BeautifulSoup's ``'html.parser'``. For every distinct
    tag named in the module-level ``tags_to_extract`` the text of all matching
    elements is collected. The collected text (one element per line) is then
    appended to the ``html_files`` cell of every ``df`` row whose ``HTML Tags``
    entry equals that tag or, for list-valued entries, contains it.

    Args:
        file_path: Path of the HTML file to read (opened as UTF-8).

    Returns:
        The placeholder string ``"hi"``. It carries no information; it is kept
        so the existing caller's assignment keeps working.

    Side effects:
        Mutates the module-level ``df`` in place. Calling this twice on the
        same file appends the same text twice.
    """
    # tags_to_extract mixes plain tag strings with lists of tags. Flatten it so
    # tags that occur only inside a list (e.g. "<strong>") are searched as well,
    # and keep each tag once so repeated entries such as "<div>" do not trigger
    # repeated find_all() scans. A dict is used as an insertion-ordered set.
    unique_tags = {}
    for entry in tags_to_extract:
        if isinstance(entry, str):
            members = [entry]
        elif isinstance(entry, (list, tuple)):
            members = entry
        else:
            members = []
        for member in members:
            if isinstance(member, str) and member:
                unique_tags[member] = None

    with open(file_path, 'r', encoding='utf-8') as file:
        # The parser reads the whole stream here, so the handle can be
        # released before the (pure in-memory) extraction below.
        soup = BeautifulSoup(file, 'html.parser')

    content = {}
    for tag in unique_tags:
        tag_name = tag.strip().strip('<>')
        texts = (element.get_text() for element in soup.find_all(tag_name))
        # Drop empty strings and duplicates but keep document order, so the
        # stored text is the same on every run (a set would shuffle it).
        content[tag] = list(dict.fromkeys(text for text in texts if text))

    for tag, texts in content.items():
        if not texts:
            # Nothing to append; skipping also avoids adding a stray separator.
            continue
        addition = "\n".join(texts)
        row_mask = df['HTML Tags'].apply(
            lambda mapped, tag=tag: tag in mapped if isinstance(mapped, list) else tag == mapped
        )
        # Only touch the matching rows, and put a newline between text that
        # came from an earlier file/tag and the text being added now.
        df.loc[row_mask, 'html_files'] = df.loc[row_mask, 'html_files'].apply(
            lambda existing: existing + "\n" + addition if existing else addition
        )

    # TODO: return the extracted content (and verb/noun info from
    # extract_verbs_entities) once the caller is ready to consume it.
    return "hi"

def process_html_files_recursively(folder_path, output_json_file):
    """Run ``extract_content_from_html`` on every ``.html`` file under ``folder_path``.

    Directories and files are visited in sorted order. The extraction step
    appends text to shared state file by file, so a fixed visiting order keeps
    the accumulated result the same from run to run regardless of how the
    file system happens to list entries.

    Args:
        folder_path: Root directory, walked recursively with ``os.walk``.
            A non-existent directory yields no files and the call is a no-op.
        output_json_file: Intended destination of a JSON export. Accepted for
            interface compatibility but currently unused: nothing is written.

    Returns:
        None.
    """
    for root, dirs, files in os.walk(folder_path):
        # In-place sort: os.walk (top-down) descends into `dirs` in this order.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith('.html'):
                extract_content_from_html(os.path.join(root, file_name))

    # TODO: build one record per file (index, file_name, file_path, extracted
    # content) and json.dump the list to `output_json_file` once
    # extract_content_from_html returns real data.
        
        
# Process the 'Aurigo_HTML_Files' directory that sits in the current working
# directory; 'output.json' is passed through as the export target.
output_json_file = 'output.json'
folder_path = os.path.join(os.getcwd(), 'Aurigo_HTML_Files')

process_html_files_recursively(
    folder_path=folder_path,
    output_json_file=output_json_file,
)

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union