In [1]:
import spacy
import os
import json
from bs4 import BeautifulSoup

# Name of the spaCy pipeline package used by every extraction call below.
SPACY_MODEL_NAME = "en_core_web_sm"

# Load the pipeline once at module level so each call reuses the same object.
nlp = spacy.load(SPACY_MODEL_NAME)

def extract_verbs_entities(sentence):
    """Run the module-level ``nlp`` pipeline on ``sentence`` and collect its
    verbs, nouns and named entities.

    Args:
        sentence: Text to analyse.

    Returns:
        dict with three keys:
            'verbs'    -- text of every token whose ``pos_`` is "VERB"
            'nouns'    -- text of every token whose ``pos_`` is 'NOUN'
            'entities' -- ``(text, label_)`` tuples for each item in ``doc.ents``
        All three lists keep the order in which the items occur in the text.
    """
    parsed = nlp(sentence)

    verb_texts = []
    noun_texts = []
    # A token has exactly one pos_ value, so one pass can fill both lists.
    for tok in parsed:
        if tok.pos_ == "VERB":
            verb_texts.append(tok.text)
        elif tok.pos_ == 'NOUN':
            noun_texts.append(tok.text)

    named_entities = []
    for ent in parsed.ents:
        named_entities.append((ent.text, ent.label_))

    result = {}
    result['verbs'] = verb_texts
    result['nouns'] = noun_texts
    result['entities'] = named_entities
    return result

In [2]:
def extract_content_from_html(file_path):
    """Parse one HTML file and extract verbs/nouns/entities from its text.

    Args:
        file_path: Path to an HTML file; it is opened as UTF-8.

    Returns:
        A 4-tuple ``(title, paragraphs, para_info, title_info)``:
            title      -- text of the ``<title>`` element, or 'No title' when
                          the element is missing or has no text
            paragraphs -- unique, non-empty ``<p>`` texts in document order
            para_info  -- one ``extract_verbs_entities`` result per entry of
                          ``paragraphs``, in the same order
            title_info -- ``extract_verbs_entities`` result for ``title``

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Only parsing needs the file handle; close it before the NLP work starts.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Extract title.
    # `.string` is None when <title> is empty or has more than one child, and
    # str(None) would store the literal text 'None'. Fall back to the
    # concatenated text, then to the placeholder.
    title = 'No title'
    title_tag = soup.title
    if title_tag is not None:
        if title_tag.string is not None:
            title = str(title_tag.string)
        else:
            title = title_tag.get_text() or 'No title'

    # Extract paragraphs, dropping empty ones.
    paragraph_texts = (p.get_text() for p in soup.find_all('p'))

    # dict.fromkeys removes duplicates while keeping first-occurrence order;
    # a set would shuffle the paragraphs differently on every run.
    paragraphs = list(dict.fromkeys(text for text in paragraph_texts if text))

    para_info = [extract_verbs_entities(para) for para in paragraphs]
    title_info = extract_verbs_entities(title)

    return title, paragraphs, para_info, title_info

def process_html_files_recursively(folder_path, output_json_file):
    """Walk ``folder_path``, analyse every ``*.html`` file and dump the results
    to ``output_json_file`` as a JSON list.

    Each list element is a dict with the keys 'index', 'file_name',
    'file_path', 'title', 'title_info', 'paragraphs' and 'para_info'.
    'index' is the zero-based position of the record in the list.

    Directories and files are visited in sorted name order so that the record
    order, and therefore 'index', is the same on every run.

    Args:
        folder_path: Root directory to search recursively.
        output_json_file: Path of the JSON file to write (overwritten if it
            exists). It is written even when no HTML file is found, in which
            case it contains an empty list.

    Returns:
        None.
    """
    data = []

    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary order. Sorting `dirs` in place
        # fixes the order in which sub-directories are descended into.
        dirs.sort()

        for file_name in sorted(files):
            if not file_name.endswith('.html'):
                continue

            file_path = os.path.join(root, file_name)
            title, paragraphs, para_info, title_info = extract_content_from_html(file_path)

            data.append({
                # len(data) before the append equals the running record count.
                'index': len(data),
                'file_name': file_name,
                'file_path': file_path,
                'title': title,
                'title_info': title_info,
                'paragraphs': paragraphs,
                'para_info': para_info
            })

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_json_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

In [3]:
# Destination of the aggregated results; a relative path, so it is resolved
# against the current working directory.
output_json_file = 'output.json'

# The HTML corpus is expected in a folder next to the notebook's working directory.
folder_path = os.path.join(os.getcwd(), 'Aurigo_HTML_Files')

process_html_files_recursively(
    folder_path=folder_path,
    output_json_file=output_json_file,
)