<a href="https://colab.research.google.com/github/rajoriavishal/python-playground/blob/main/Transcript_Timesheet_Remover/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import os

class TranscriptFileHandler:
    """Handles reading from and writing to files.

    Both methods report failures by printing a message and returning a
    sentinel value (``None`` / ``False``) instead of raising, so callers
    only need to check the return value.
    """

    def read_file(self, file_path):
        """
        Reads the content of a file.

        The file is decoded as ``utf-8-sig`` so that a leading UTF-8
        byte-order mark (often added by Windows editors) is discarded.
        With plain ``utf-8`` the BOM would remain as an invisible character
        at the start of the first line, which ``str.strip()`` does not
        remove. Files without a BOM decode exactly as they would with
        ``utf-8``.

        Args:
            file_path (str): The path to the file.

        Returns:
            str: The content of the file, or None if an error occurs.
        """
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()
        except FileNotFoundError:
            print(f"❌ Error: The file was not found at '{file_path}'")
            print("Please check the path and try again. In Google Colab, you can right-click the file and select 'Copy path'.")
            return None
        except Exception as e:
            # Deliberately broad: this is the user-facing I/O boundary and
            # callers rely on a None result rather than an exception.
            print(f"An unexpected error occurred during file reading: {e}")
            return None

    def write_file(self, file_path, content):
        """
        Writes content to a file as UTF-8, replacing any existing file.

        Args:
            file_path (str): The path for the new file.
            content (str): The content to write.

        Returns:
            bool: True if write was successful, False otherwise.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(content)
            return True
        except Exception as e:
            # Same best-effort contract as read_file: report and signal failure.
            print(f"An unexpected error occurred during file writing: {e}")
            return False


class TextProcessor:
    """A class to handle text manipulation and cleaning."""

    # One cue time such as "00:01:05,234" (SRT) or "00:01:05.234" / "01:05.234"
    # (WebVTT, where the hours field is optional).
    _CUE_TIME = r'(?:\d{2,}:)?\d{2}:\d{2}[,.]\d{3}'

    # A whole cue-timing line: "<start> --> <end>", optionally followed by
    # WebVTT cue settings (e.g. "align:start position:0%").
    _CUE_TIMING_PATTERN = re.compile(
        _CUE_TIME + r'\s+-->\s+' + _CUE_TIME + r'(?:\s.*)?'
    )

    # A line consisting solely of a clock-style timestamp: H:MM:SS, MM:SS or
    # M:SS, with one or two digits allowed for the hour and minute fields.
    _BARE_TIMESTAMP_PATTERN = re.compile(r'(?:\d{1,2}:)?\d{1,2}:\d{2}')

    def _is_timestamp_line(self, line):
        """Returns True if the line, ignoring surrounding whitespace, is only a timestamp."""
        candidate = line.strip()
        return bool(
            self._CUE_TIMING_PATTERN.fullmatch(candidate)
            or self._BARE_TIMESTAMP_PATTERN.fullmatch(candidate)
        )

    def remove_timestamps_from_text(self, text_content):
        """
        Removes timestamp-only lines from a string.

        The text is processed line by line. A line is dropped when, after
        stripping surrounding whitespace, it is entirely one of:

        * a subtitle cue-timing line in SRT form
          ("00:01:05,234 --> 00:01:07,890") or WebVTT form
          ("00:01:05.234 --> 00:01:07.890", hours optional, cue settings
          allowed after the end time);
        * a bare clock timestamp such as "1:02:03", "12:34" or "0:15".

        Timestamps embedded in a sentence are left untouched, and kept lines
        retain their original leading/trailing whitespace.

        Args:
            text_content (str): The original text content.

        Returns:
            str: The remaining lines joined with "\\n".
        """
        kept_lines = [
            line
            for line in text_content.splitlines()
            if not self._is_timestamp_line(line)
        ]
        return '\n'.join(kept_lines)

    def clean_extra_whitespace(self, text_content):
        """
        Removes extra whitespace and empty lines from text.

        Every line is stripped of leading and trailing whitespace, and lines
        that are empty after stripping are discarded.

        Args:
            text_content (str): The text to clean.

        Returns:
            str: The cleaned text, lines joined with "\\n".
        """
        stripped_lines = (line.strip() for line in text_content.splitlines())
        return '\n'.join(line for line in stripped_lines if line)


class TranscriptProcessor:
    """Coordinates reading a transcript, stripping its timestamps and saving the result."""

    def __init__(self, input_path):
        # Path of the transcript to clean; the output path is derived from it.
        self.input_path = input_path
        self.file_handler = TranscriptFileHandler()
        self.text_processor = TextProcessor()

    def generate_output_path(self):
        """
        Derives the output file path from the input path.

        The input's extension (if any) is dropped and "_NoTS.txt" is
        appended to what remains.

        Returns:
            str: The new file path.
        """
        stem = os.path.splitext(self.input_path)[0]
        return f"{stem}_NoTS.txt"

    def process(self):
        """
        Runs the full pipeline: read the input, remove timestamps, tidy
        whitespace, and write the cleaned text next to the input file.

        Returns early without writing anything when the input cannot be
        read, and prints a success message only when the write succeeds.
        """
        raw_text = self.file_handler.read_file(self.input_path)
        if raw_text is None:
            return

        cleaner = self.text_processor
        cleaned_text = cleaner.clean_extra_whitespace(
            cleaner.remove_timestamps_from_text(raw_text)
        )

        destination = self.generate_output_path()
        if not self.file_handler.write_file(destination, cleaned_text):
            return

        print("\n✅ Success! Timestamps have been removed.")
        print(f"Cleaned file saved as: {destination}")


class UserInterface:
    """Console front end: prints the banner and asks for the transcript path."""

    def display_header(self):
        """Prints the tool's title followed by the upload instruction."""
        banner_lines = (
            "--- Transcript Timestamp Remover (OOP Version) ---",
            "Upload your transcript file to the Colab environment.",
        )
        for banner_line in banner_lines:
            print(banner_line)

    def get_user_input(self):
        """
        Prompts for the transcript path and tidies up what was typed.

        Surrounding whitespace is trimmed first, then any single or double
        quote characters wrapping the path are removed.

        Returns:
            str: The sanitized file path.
        """
        raw_answer = input("Paste the path to your transcript file here: ")
        return raw_answer.strip().strip("'\"")


def main():
    """Shows the banner, asks for a file path and cleans that transcript.

    Nothing is processed when the user supplies an empty path.
    """
    ui = UserInterface()
    ui.display_header()

    file_path = ui.get_user_input()
    if not file_path:
        return

    TranscriptProcessor(file_path).process()


# Start the interactive tool only when this file is executed as the main
# program; importing it as a module does not call main().
if __name__ == "__main__":
    main()



--- Transcript Timestamp Remover (OOP Version) ---
Upload your transcript file to the Colab environment.
Paste the path to your transcript file here: /content/L2.txt

✅ Success! Timestamps have been removed.
Cleaned file saved as: /content/L2_NoTS.txt
