In [6]:
import os
import re
import html
from html import unescape  # Import the unescape function from the html module

In [7]:
def process_smi_files(input_dir, output_dir):
    """Convert every ``.smi`` file found directly in ``input_dir`` to a ``.txt`` file.

    The output directory is created if it does not exist. Each input file is
    handed to ``process_single_smi_file`` together with an output path that has
    the same base name and a ``.txt`` extension. A failure on one file is
    reported with ``print`` and does not stop the remaining files.

    Args:
        input_dir: Directory that is listed (non-recursively) for ``.smi`` files.
        output_dir: Directory that receives the generated ``.txt`` files.
    """
    smi_suffix = '.smi'

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so that the processing
    # order (and therefore the printed log) is the same on every run.
    smi_files = sorted(f for f in os.listdir(input_dir) if f.endswith(smi_suffix))

    for smi_file_name in smi_files:
        input_path = os.path.join(input_dir, smi_file_name)
        # Swap only the trailing extension. str.replace would also rewrite any
        # earlier '.smi' inside the name (e.g. 'a.smi.v2.smi' -> 'a.txt.v2.txt').
        base_name = smi_file_name[:-len(smi_suffix)]
        output_path = os.path.join(output_dir, base_name + '.txt')

        try:
            process_single_smi_file(input_path, output_path)
        except Exception as e:
            # Deliberate best-effort: report the failure and continue the batch.
            print(f"Error processing {input_path}: {e}")

In [8]:
def process_single_smi_file(input_path, output_path):
    """Extract the caption text of one SMI file and write it to ``output_path``.

    The file is read as UTF-8 text, passed through
    ``smi_file_to_timed_plain_text``, and the result is written as UTF-8.
    When the extracted text is empty or whitespace-only, no output file is
    written and a "Skipping" message is printed instead.

    Args:
        input_path: Path of the ``.smi`` file to read.
        output_path: Path of the ``.txt`` file to write.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Read in text mode rather than binary + decode:
    #  * 'utf-8-sig' drops a leading UTF-8 BOM if one is present,
    #  * universal-newline handling turns CRLF into '\n', so no stray '\r'
    #    ends up in the captions (which would become '\r\r\n' when written
    #    back in text mode on Windows).
    with open(input_path, 'r', encoding='utf-8-sig') as smi_file:
        smi_content = smi_file.read()

    # Process the content and write to the output .txt file
    timed_plain_text = smi_file_to_timed_plain_text(smi_content)

    if timed_plain_text.strip():
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(timed_plain_text)
        print(f"Processed: {input_path} -> {output_path}")
    else:
        print(f"Skipping: {input_path} (empty output)")


In [9]:
def smi_file_to_timed_plain_text(smi_content):
    """Return the caption text of an SMI document, one caption per line.

    A caption is whatever follows a ``<SYNC Start=N><P class='en-IN'>`` prefix
    up to the end of that line. The ``Start`` value is matched but not
    captured, so the returned text carries no timestamps. HTML character
    references in the captions (``&amp;``, ``&nbsp;`` ...) are decoded; any
    inline markup is left as it appears in the input.

    Args:
        smi_content: Full text of the SMI file.

    Returns:
        The captions joined with newlines, or an empty string when nothing
        matches.
    """
    # The caption ends at the first line break. Accept CRLF as well as LF so a
    # trailing '\r' is not captured, and accept end-of-input so the last
    # caption is kept even when the file has no final newline.
    pattern = re.compile(r"<SYNC Start=\d+><P class='en-IN'>(.*?)(?:\r?\n|\Z)")
    captions = [unescape(caption) for caption in pattern.findall(smi_content)]
    return '\n'.join(captions)

In [10]:
# Folder that holds the source .smi subtitle files.
input_directory = 'C:/Users/Admin/Dataset Buisness/'
# Folder that receives the extracted .txt files (created on demand by process_smi_files).
output_directory = 'C:/Users/Admin/Dataset Buisness/subs'

# Run the batch conversion over every .smi file in the input folder.
process_smi_files(input_dir=input_directory, output_dir=output_directory)

Processed: C:/Users/Admin/Dataset Buisness/2022 Finance Budget Key Highlights_Full-HD.smi -> C:/Users/Admin/Dataset Buisness/subs\2022 Finance Budget Key Highlights_Full-HD.txt
Processed: C:/Users/Admin/Dataset Buisness/ABG Shipyard Scams Indian Banks Of Over ₹22,000 Crore _ ISH News_Full-HD.smi -> C:/Users/Admin/Dataset Buisness/subs\ABG Shipyard Scams Indian Banks Of Over ₹22,000 Crore _ ISH News_Full-HD.txt
Processed: C:/Users/Admin/Dataset Buisness/Adani Buys a Port in Israel Worth 1.2Billion Dollars _ ISH News_Full-HD.smi -> C:/Users/Admin/Dataset Buisness/subs\Adani Buys a Port in Israel Worth 1.2Billion Dollars _ ISH News_Full-HD.txt
Processed: C:/Users/Admin/Dataset Buisness/Adani Group Takes Over Management of Mumbai Airport_Full-HD.smi -> C:/Users/Admin/Dataset Buisness/subs\Adani Group Takes Over Management of Mumbai Airport_Full-HD.txt
Processed: C:/Users/Admin/Dataset Buisness/Adani Overtakes Arnault & Becomes the 3rd Richest Man _ ISH News_Full-HD.smi -> C:/Users/Admin/Da