In [1]:
import os
import re
from html import unescape
from datetime import timedelta

In [2]:
def process_smi_files(input_dir, output_dir):
    """Convert every SAMI (.smi) file found directly in ``input_dir``.

    Each matching file is passed to ``process_single_smi_file`` together with
    an output path inside ``output_dir`` that has the same base name and a
    ``.txt`` extension. A failure on one file is reported on stdout and does
    not stop the remaining files from being processed.

    Args:
        input_dir: Directory that is scanned (non-recursively) for .smi files.
        output_dir: Directory that receives the .txt files; it is created if
            it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    smi_extension = '.smi'

    # Sorted so the processing order (and the log) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for smi_file_name in sorted(os.listdir(input_dir)):
        # Compare only the trailing extension, ignoring case, so "CLIP.SMI"
        # is picked up as well.
        if smi_file_name[-len(smi_extension):].lower() != smi_extension:
            continue

        input_path = os.path.join(input_dir, smi_file_name)

        # Swap only the trailing extension. str.replace would also rewrite a
        # ".smi" that occurs earlier in the name (e.g. "my.smith.smi").
        output_name = smi_file_name[:-len(smi_extension)] + '.txt'
        output_path = os.path.join(output_dir, output_name)

        try:
            process_single_smi_file(input_path, output_path)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error processing {input_path}: {e}")

In [3]:
def process_single_smi_file(input_path, output_path):
    """Convert one SAMI file into a timed plain-text file.

    The file is read as UTF-8 text, converted with
    ``smi_file_to_timed_plain_text`` and written to ``output_path`` as UTF-8.
    Nothing is written when the conversion result is empty or whitespace-only;
    either outcome is reported on stdout.

    Args:
        input_path: Path of the .smi file to read.
        output_path: Path of the text file to write.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Text mode normalizes "\r\n" / "\r" line endings to "\n", so no stray
    # carriage returns reach the output; "utf-8-sig" additionally drops a
    # leading byte-order mark when one is present.
    with open(input_path, 'r', encoding='utf-8-sig') as smi_file:
        smi_content = smi_file.read()

    timed_plain_text = smi_file_to_timed_plain_text(smi_content)

    if not timed_plain_text.strip():
        print(f"Skipping: {input_path} (empty output)")
        return

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(timed_plain_text)
    print(f"Processed: {input_path} -> {output_path}")

In [4]:
def smi_file_to_timed_plain_text(smi_content, language_class='en-IN'):
    """Extract timestamped caption lines from SAMI markup.

    A cue is a ``<SYNC Start=ms>`` tag followed by a ``<P class=...>`` tag
    whose class equals ``language_class``; the caption is the rest of that
    line. Tag and attribute names are matched case-insensitively and the
    attribute values may be unquoted or wrapped in single or double quotes.

    Args:
        smi_content: Full text of a SAMI file.
        language_class: Value of the ``class`` attribute of the ``<P>`` tags
            to keep. Defaults to ``'en-IN'``.

    Returns:
        The kept cues joined with newlines, each formatted as ``MM:SS``
        followed by a newline and the caption with HTML entities decoded and
        surrounding whitespace removed. Cues whose caption is empty after
        decoding (for example a lone ``&nbsp;``) are omitted. Minutes are not
        folded into hours, so a cue at 75 minutes is rendered as ``75:..``.
        An empty string is returned when no cue matches.
    """
    optional_quote = "[\"']?"
    cue_pattern = re.compile(
        ''.join([
            r"<SYNC\s+Start\s*=\s*", optional_quote, r"(\d+)", optional_quote, r"\s*>",
            r"\s*",
            r"<P\s+class\s*=\s*", optional_quote, re.escape(language_class),
            optional_quote, r"\s*>",
            # Capture to the end of the line. Not requiring a trailing newline
            # keeps a final cue that sits at the very end of the input.
            r"([^\n]*)",
        ]),
        re.IGNORECASE,
    )

    cue_blocks = []
    for start_ms, raw_text in cue_pattern.findall(smi_content):
        # Strip after decoding so "&nbsp;" placeholders and stray "\r" from
        # CRLF input neither count as captions nor leak into the output.
        text = unescape(raw_text).strip()
        if not text:
            continue

        minutes, seconds = divmod(int(start_ms) // 1000, 60)
        cue_blocks.append(f"{minutes:02}:{seconds:02}\n{text}")

    return '\n'.join(cue_blocks)

In [7]:
# Example usage: convert the .smi files in the dataset folder and write the
# resulting .txt files into its "subs" subfolder.
# NOTE(review): these are machine-specific absolute paths; adjust before running elsewhere.
input_directory = 'D:/Rahul/Dataset/'
output_directory = f'{input_directory}subs'
process_smi_files(input_dir=input_directory, output_dir=output_directory)


Skipping: D:/Rahul/Dataset/$1Billion Pours in to Rebuild Notre Dame_Full-HD.smi (empty output)
Processed: D:/Rahul/Dataset/'Vaccine bus' To Inoculate Vegetable Vendors In Kolkata_Full-HD.smi -> D:/Rahul/Dataset/subs\'Vaccine bus' To Inoculate Vegetable Vendors In Kolkata_Full-HD.txt
Skipping: D:/Rahul/Dataset/1 Arrested in CSMT Bridge Collapse_Full-HD.smi (empty output)
Skipping: D:/Rahul/Dataset/1 in 5 Products Sold by E-Tailers Is Fake_Full-HD.smi (empty output)
Processed: D:/Rahul/Dataset/1 Lakh Russian Soldiers Killed in Ukraine Says a US Report_ ISH News_Full-HD.smi -> D:/Rahul/Dataset/subs\1 Lakh Russian Soldiers Killed in Ukraine Says a US Report_ ISH News_Full-HD.txt
Skipping: D:/Rahul/Dataset/1,000 kg of Fake Amul Butter Seized by FDA_Full-HD.smi (empty output)
Processed: D:/Rahul/Dataset/10 Countries Help India by Sending Oxygen Tanks_Full-HD.smi -> D:/Rahul/Dataset/subs\10 Countries Help India by Sending Oxygen Tanks_Full-HD.txt
Processed: D:/Rahul/Dataset/10 Dead in Brazil 