In [2]:
# Write a Python notebook script that converts .srt files to .csv files and vice versa.
import csv
import re

def srt_to_csv(srt_file, csv_file=None, preserve_linebreaks=True):
    """
    Convert an SRT subtitle file to CSV.

    The CSV starts with the header row ``Segment Start, Segment End, Segment Text``
    followed by one row per subtitle entry. Segment Start and Segment End are the
    SRT timestamps converted to seconds (rounded to the millisecond); Segment Text
    is the subtitle text of the entry.

    Blocks that have fewer than three lines (index, timestamp, text) or whose second
    line does not contain exactly one ``-->`` separator are skipped.

    :param srt_file: Path (string) to the input SRT file.
    :param csv_file: Path to the output CSV file. If None, the name is derived from
        srt_file: a trailing ``.srt`` extension (any letter case) is dropped and
        ``_br_converted.csv`` or ``_converted.csv`` is appended, depending on
        preserve_linebreaks.
    :param preserve_linebreaks: If True, the lines of a subtitle are joined with
        ``<br/>`` tags; otherwise they are joined with single spaces.
    :raises ValueError, IndexError: If a timestamp is not in ``HH:MM:SS,mmm`` form.
    """
    def time_to_seconds(time_str):
        """Convert SRT timestamp format (HH:MM:SS,mmm) to seconds"""
        parts = time_str.replace(',', '.').split(':')
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds = float(parts[2])
        # SRT resolution is one millisecond; rounding strips the binary noise that
        # float addition can leave behind (e.g. a trailing ...0000000001).
        return round(hours * 3600 + minutes * 60 + seconds, 3)

    # Auto-generate CSV filename if not provided
    if csv_file is None:
        suffix = '_br_converted.csv' if preserve_linebreaks else '_converted.csv'
        # Strip only a trailing extension. A plain str.replace('.srt', ...) would
        # touch every occurrence in the path and, when nothing matches (e.g. '.SRT'),
        # would yield the input path itself so the source file got overwritten.
        if srt_file.lower().endswith('.srt'):
            stem = srt_file[:-len('.srt')]
        else:
            stem = srt_file
        csv_file = stem + suffix

    with open(srt_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split content into subtitle blocks (separated by one or more blank lines)
    subtitle_blocks = re.split(r'\n\s*\n', content.strip())

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Segment Start', 'Segment End', 'Segment Text'])

        joiner = '<br/>' if preserve_linebreaks else ' '

        for block in subtitle_blocks:
            lines = block.strip().split('\n')
            # Need at least: index line, timestamp line, one text line
            if len(lines) < 3:
                continue

            # Second line holds "start --> end"; tolerate missing/extra spaces
            times = lines[1].split('-->')
            if len(times) != 2:
                continue

            start_time = time_to_seconds(times[0].strip())
            end_time = time_to_seconds(times[1].strip())
            writer.writerow([start_time, end_time, joiner.join(lines[2:])])


def csv_to_srt(csv_file, srt_file):
    """
    Convert a CSV file to SRT format.

    The CSV must have a header row with the columns ``Segment Start``,
    ``Segment End`` (both in seconds) and ``Segment Text``. Each data row becomes
    one SRT entry, numbered from 1 in file order. ``<br/>`` tags in the text are
    turned back into line breaks.

    :param csv_file: Path to the input CSV file.
    :param srt_file: Path to the output SRT file.
    :raises KeyError: If one of the required columns is missing.
    :raises ValueError: If a start or end value cannot be parsed as a number.
    """
    def seconds_to_time(seconds):
        """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)"""
        # Work in whole milliseconds so rounding carries into the seconds, minutes
        # and hours fields; formatting `seconds % 60` directly can round up to
        # "60,000", which is not a valid timestamp.
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    # newline='' lets the csv module handle line endings inside quoted fields;
    # 'utf-8-sig' drops a leading BOM (as written by some spreadsheet exports) that
    # would otherwise become part of the first column name. Files without a BOM
    # are read unchanged.
    with open(csv_file, 'r', newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)

        with open(srt_file, 'w', encoding='utf-8') as output_f:
            for i, row in enumerate(reader, 1):
                start_time = float(row['Segment Start'])
                end_time = float(row['Segment End'])

                # Convert <br/> tags back to line breaks for SRT format
                text = row['Segment Text'].replace('<br/>', '\n')

                # Write SRT entry: index, time range, text, blank separator line
                output_f.write(f"{i}\n")
                output_f.write(f"{seconds_to_time(start_time)} --> {seconds_to_time(end_time)}\n")
                output_f.write(f"{text}\n\n")

In [3]:
srt_file = r'D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english_SE.srt'

# Example 1: keep subtitle line breaks (the default), encoded as <br/> tags.
# No output name is given, so the result lands next to the input with the
# _br_converted.csv suffix.
srt_to_csv(srt_file, csv_file=None, preserve_linebreaks=True)

# Example 2: flatten each subtitle onto a single line (lines joined by spaces).
# The auto-generated output name uses the _converted.csv suffix instead.
srt_to_csv(srt_file, csv_file=None, preserve_linebreaks=False)

# # Example 3: explicit output filename, line breaks preserved
# csv_file_custom = srt_file.replace('.srt', '_custom.csv')
# srt_to_csv(srt_file, csv_file_custom, preserve_linebreaks=True)

In [None]:
csv_file = r'D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english_SE_reduced.csv'
# Derive the SRT file name from the CSV file name: drop a trailing .csv extension
# (any letter case) and append the _converted.srt suffix. Only the end of the path
# is touched, and the result can never equal csv_file, so the conversion below
# cannot end up truncating its own input.
srt_file = (csv_file[:-len('.csv')] if csv_file.lower().endswith('.csv') else csv_file) + '_converted.srt'

# CSV to SRT conversion
csv_to_srt(csv_file, srt_file)