# Extract Text from HTML

In [1]:
from bs4 import BeautifulSoup
import os

In [2]:
def remove_non_ascii(s):
    """Return ``s`` with every character outside the ASCII range removed.

    A character is kept only when its code point is below 128; the
    remaining characters are joined back together in their original order.
    """
    kept = [ch for ch in s if ord(ch) <= 127]
    return "".join(kept)

# Function to extract text from an HTML file
def extract_text_from_html(html_file, txt_file):
    """Convert one HTML file into an ASCII-only text file.

    Args:
        html_file: Path of the HTML document to read (decoded as UTF-8).
        txt_file: Path of the text file to write; it is opened in 'w' mode,
            so any existing content is replaced.
    """
    with open(html_file, 'r', encoding='utf-8') as source:
        markup = source.read()

    # 'lxml' is the parser backend used here; 'html.parser' would work as well.
    parsed = BeautifulSoup(markup, 'lxml')

    # get_text() drops the tags; newlines are then turned into spaces and
    # every non-ASCII character is removed.
    plain_text = remove_non_ascii(parsed.get_text().replace('\n', ' '))

    with open(txt_file, 'w', encoding='utf-8') as target:
        target.write(plain_text)

def extract_text_from_html_files_in_directory(input_directory, output_directory):
    """Extract the text of every ``.html`` file found in a directory.

    Each regular file ``<name>.html`` located directly inside
    ``input_directory`` is converted with ``extract_text_from_html`` and
    written to ``<name>.txt`` inside ``output_directory``. One
    "Processed ..." line is printed per converted file.

    Args:
        input_directory: Directory that is scanned (non-recursively) for
            files whose name ends with ``.html``.
        output_directory: Directory that receives the ``.txt`` files; it is
            created, together with missing parents, when it does not exist.
    """
    # exist_ok=True creates the directory when needed without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_directory):
        if not filename.endswith('.html'):
            continue

        html_file = os.path.join(input_directory, filename)

        # A sub-directory that merely happens to be named "*.html" cannot be
        # opened as a file; skip it instead of aborting the whole run.
        if not os.path.isfile(html_file):
            continue

        # Same base name as the HTML file, with a .txt extension.
        stem = os.path.splitext(filename)[0]
        txt_file = os.path.join(output_directory, f"{stem}.txt")

        extract_text_from_html(html_file, txt_file)
        print(f"Processed {html_file} -> {txt_file}")

# Earlier single-directory variant, kept for reference: it wrote each .txt next to its .html.
# def extract_text_from_html_files_in_directory(directory):
#     for filename in os.listdir(directory):
#         if filename.endswith('.html'):
#             html_file = os.path.join(directory, filename)
#             txt_file = os.path.join(directory, f"{os.path.splitext(filename)[0]}.txt")
#             extract_text_from_html(html_file, txt_file)
#             print(f"Processed {html_file} -> {txt_file}")

In [3]:
# Example usage: convert the HTML files under `data` into plain-text files.
directory = 'data'  # Directory containing HTML files
parsed_directory = "/home/cdsw/3_populate_vector_db/parsed"  # Where the .txt files are written
extract_text_from_html_files_in_directory(directory, parsed_directory)

Processed data/privacy_policy.html -> /home/cdsw/3_populate_vector_db/parsed/privacy_policy.txt
Processed data/shipping_policy.html -> /home/cdsw/3_populate_vector_db/parsed/shipping_policy.txt
Processed data/return_policy.html -> /home/cdsw/3_populate_vector_db/parsed/return_policy.txt
