<a href="https://colab.research.google.com/github/rajoriavishal/python-playground/blob/main/Html_To_Text/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Import necessary libraries
# 'os' is for interacting with the operating system, like checking file paths.
# 'BeautifulSoup' from 'bs4' is the key library for parsing HTML.
import os
from bs4 import BeautifulSoup

class UserInterface:
    """
    Handles all interactions with the user, such as getting input.

    Both public methods read one line from standard input via ``input()``
    and return it with surrounding whitespace removed.
    """
    def _ask_for_path(self, prompt_message: str) -> str:
        """
        Shows a prompt and returns the user's reply.

        Args:
            prompt_message (str): The text displayed to the user.

        Returns:
            str: The reply with leading/trailing whitespace stripped, so a
                blank or whitespace-only reply comes back as an empty string.
        """
        # Stray spaces (e.g. from copy-pasting a path) would otherwise make
        # an existing file look like it is missing.
        return input(prompt_message).strip()

    def get_input_filepath(self) -> str:
        """
        Prompts the user to enter a file path and returns the input.

        Returns:
            str: The file path provided by the user (whitespace-trimmed).
        """
        return self._ask_for_path(
            "Please enter the full path to the file you want to clean: "
        )

    def get_output_filepath(self) -> str:
        """
        Prompts the user for the desired output file path.

        Returns:
            str: The output file path provided by the user (whitespace-trimmed).
        """
        return self._ask_for_path(
            "Please enter the output file path (with extension): "
        )

class FileHandler:
    """
    Thin wrapper around text-file reads and writes.

    Failures are reported on standard output and signalled through the
    return value instead of being raised to the caller.
    """
    def read_file(self, filepath, encoding='utf-8'):
        """
        Returns the full text of ``filepath``, or None if it cannot be read.

        A missing file and any other error each print their own message
        before None is returned.
        """
        try:
            with open(filepath, 'r', encoding=encoding) as source:
                return source.read()
        except FileNotFoundError:
            print(f"Error: The file '{filepath}' was not found.")
        except Exception as error:
            print(f"An error occurred while reading the file: {error}")
        # Reached only after one of the handlers above has reported a failure.
        return None

    def write_file(self, filepath, data, encoding='utf-8'):
        """
        Writes ``data`` to ``filepath``, replacing any existing content.

        Returns True when the write succeeded; on any error a message is
        printed and False is returned.
        """
        try:
            with open(filepath, 'w', encoding=encoding) as target:
                target.write(data)
        except Exception as error:
            print(f"An error occurred while writing to the file: {error}")
            return False
        return True

class HtmlTextExtractor:
    """
    Wraps a BeautifulSoup parse of some content and exposes two operations:
    checking whether any tag is present and pulling out the text.
    """
    def __init__(self, content):
        """
        Stores the raw content and parses it once with 'html.parser'.
        """
        self.parser_type = 'html.parser'
        self.content = content
        # Parsed eagerly so both public methods reuse the same tree.
        self.soup_instance = BeautifulSoup(content, self.parser_type)

    def contains_html(self):
        """
        Reports whether the parsed content holds at least one tag.

        Returns:
            bool: True when ``find()`` on the parsed tree yields something,
                False when it yields None.
        """
        return self.soup_instance.find() is not None

    def extract_text(self):
        """
        Returns the text of the parsed content, with the individual text
        pieces joined by newline characters.
        """
        return self.soup_instance.get_text(separator='\n')

class HtmlCleaner:
    """
    Coordinates the read -> validate -> extract -> write pipeline for one file.
    """
    def __init__(self):
        """
        Creates the FileHandler used for all disk access.
        """
        self.file_handler = FileHandler()

    def process(self, input_filepath, output_filepath):
        """
        Converts the HTML file at ``input_filepath`` into plain text stored
        at ``output_filepath``.

        Progress and failures are printed; the method always returns None.
        Each stage that fails ends the run immediately.
        """
        # Stage 1: bail out before any reading if the path does not exist.
        if not os.path.exists(input_filepath):
            print(f"Process stopped: The file '{input_filepath}' does not exist.")
            return

        # Stage 2: load the raw content (None signals a read failure).
        print(f"Reading content from '{input_filepath}'...")
        raw_content = self.file_handler.read_file(input_filepath)
        if raw_content is None:
            print("\nProcess failed because the input file could not be read.")
            return

        # Stage 3: refuse to continue when the content has no tags at all.
        print("Validating file content...")
        extractor = HtmlTextExtractor(raw_content)
        if not extractor.contains_html():
            print("Validation failed: No HTML tags were found in the file.")
            print("Process stopped.")
            return
        print("Validation successful: HTML content detected.")

        # Stage 4: pull the text out of the parsed document.
        print("Parsing HTML and extracting text...")
        plain_text = extractor.extract_text()

        # Stage 5: persist the result and report the outcome.
        print(f"Writing cleaned text to '{output_filepath}'...")
        if not self.file_handler.write_file(output_filepath, plain_text):
            print("\nProcess failed during file writing.")
            return

        print("\nProcess completed successfully.")
        print(f"Cleaned file is available at: '{output_filepath}'")

class Application:
    """
    Entry-point object: collects the two paths and launches the cleaner.
    """
    def run(self):
        """
        Asks for the input path, then the output path, and runs the
        cleaning process. An empty answer to either prompt ends the program
        with a message.
        """
        console = UserInterface()

        # The output prompt is only shown once an input path has been given.
        source_path = console.get_input_filepath()
        if not source_path:
            print("No input file path was entered. Exiting program.")
            return

        destination_path = console.get_output_filepath()
        if not destination_path:
            print("No output file path was entered. Exiting program.")
            return

        HtmlCleaner().process(source_path, destination_path)

# --- Main execution block ---
# This ensures the code inside only runs when the script is executed directly.
if __name__ == "__main__":
    # Build the application and hand control to it in a single step.
    Application().run()



Please enter the full path to the file you want to clean: /content/Lecture 2.html
Please enter the output file with extenstion): Lecture 2 Transcript
Reading content from '/content/Lecture 2.html'...
Validating file content...
Validation successful: HTML content detected.
Parsing HTML and extracting text...
Writing cleaned text to 'Lecture 2 Transcript'...

Process completed successfully.
Cleaned file is available at: 'Lecture 2 Transcript'
