In [None]:
# Python notebook script that converts .srt files to .csv files and vice versa.
import csv
import os
import re

import pandas as pd

def srt_to_csv(srt_file, csv_file=None, preserve_linebreaks=True):
    """
    Convert an SRT subtitle file to CSV format.

    The CSV starts with a header row and then holds one row per subtitle entry
    with the columns Segment Start, Segment End and Segment Text. Segment Start
    and Segment End are the timestamps in seconds (rounded to milliseconds),
    and Segment Text is the subtitle text.

    Entries are separated by blank lines. An entry is skipped when it has fewer
    than three lines (index, timing, text) or when its second line is not a
    ``start --> end`` timing line.

    :param srt_file: Path to the input SRT file.
    :param csv_file: Path to the output CSV file. If None, it is derived from
        srt_file by replacing its final extension with ``_br_converted.csv``
        (preserve_linebreaks=True) or ``_converted.csv`` (False).
    :param preserve_linebreaks: If True, line breaks inside an entry are kept
        as <br/> tags; otherwise the lines are joined with single spaces.
    """
    def time_to_seconds(time_str):
        """Convert SRT timestamp format (HH:MM:SS,mmm) to seconds"""
        parts = time_str.replace(',', '.').split(':')
        total = int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
        # SRT carries millisecond precision; rounding keeps float addition
        # noise out of the CSV.
        return round(total, 3)

    # Auto-generate CSV filename if not provided. Only the final extension is
    # swapped: a plain str.replace('.srt', ...) would also rewrite '.srt'
    # inside directory names and, for names such as 'clip.SRT', would return
    # the input path unchanged so the source file would be overwritten below.
    if csv_file is None:
        suffix = '_br_converted.csv' if preserve_linebreaks else '_converted.csv'
        csv_file = os.path.splitext(srt_file)[0] + suffix

    with open(srt_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split content into subtitle blocks (separated by blank lines)
    subtitle_blocks = re.split(r'\n\s*\n', content.strip())

    # Capture only the two timestamp tokens so irregular spacing around the
    # arrow and trailing positioning data ("X1:40 X2:600 ...") are tolerated.
    timing_pattern = re.compile(r'\s*(\S+)\s*-->\s*(\S+)')
    separator = '<br/>' if preserve_linebreaks else ' '

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Segment Start', 'Segment End', 'Segment Text'])

        for block in subtitle_blocks:
            lines = block.strip().split('\n')
            if len(lines) < 3:
                continue

            # lines[0] is the entry index, lines[1] the timing line
            timing = timing_pattern.match(lines[1])
            if timing is None:
                continue

            writer.writerow([
                time_to_seconds(timing.group(1)),
                time_to_seconds(timing.group(2)),
                separator.join(lines[2:]),
            ])


def csv_to_srt(csv_file, column, srt_file):
    """
    Convert CSV file to SRT format.

    The CSV must contain a column whose name includes "Segment Start" and one
    whose name includes "Segment End" (matched case-insensitively, values in
    seconds), plus the text column named by ``column``. Column names are
    compared after stripping surrounding whitespace and quote characters.

    :param csv_file: Path to the input CSV file.
    :param column: The column name to use for the subtitle text.
    :param srt_file: Path to the output SRT file.
    :raises ValueError: If the start, end or text column cannot be found, or
        if a start/end value is not a valid number.
    """
    def seconds_to_time(seconds):
        """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)"""
        # Work in whole milliseconds so rounding carries into the minutes and
        # hours; formatting the seconds field alone can print "60,000".
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def clean(name):
        """Strip surrounding whitespace and quote characters from a column name"""
        return name.strip().strip("'\"")

    # Read every cell as text and switch off pandas' default NA parsing, so
    # subtitle text such as "null" or "NA" and empty cells survive as written
    # instead of turning into NaN (which would be emitted as "nan").
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    wanted = clean(column) if column is not None else None

    # Find the correct column names by looking for patterns
    start_col = None
    end_col = None
    text_col = None
    for col in df.columns:
        col_clean = clean(col)
        lowered = col_clean.lower()
        if 'segment start' in lowered:
            start_col = col
        elif 'segment end' in lowered:
            end_col = col
        # The first column matching the requested name wins
        if text_col is None and col_clean == wanted:
            text_col = col

    if start_col is None or end_col is None or text_col is None:
        available_cols = [clean(col) for col in df.columns]
        raise ValueError(f"Could not find required columns. Available columns: {available_cols}")

    print(f"Using columns: Start='{start_col}', End='{end_col}', Text='{text_col}'")

    # Render every entry before opening the output, so a bad row cannot leave
    # a truncated SRT file behind.
    entries = []
    rows = zip(df[start_col], df[end_col], df[text_col])
    for index, (start_value, end_value, raw_text) in enumerate(rows, 1):
        start_time = seconds_to_time(float(start_value))
        end_time = seconds_to_time(float(end_value))
        text = '' if pd.isna(raw_text) else str(raw_text)
        # Convert <br/> tags back to line breaks for SRT format
        text = text.replace('<br/>', '\n')
        entries.append(f"{index}\n{start_time} --> {end_time}\n{text}\n\n")

    with open(srt_file, 'w', encoding='utf-8') as output_f:
        output_f.writelines(entries)

In [None]:
srt_file = r'D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english_SE.srt'

# Example 1: keep subtitle line breaks as <br/> tags (the default behavior).
# No output path is given, so the CSV name gets the _br_converted.csv suffix.
srt_to_csv(srt_file, preserve_linebreaks=True)

# Example 2: join the lines of each subtitle with spaces instead.
# The auto-generated CSV name gets the _converted.csv suffix.
srt_to_csv(srt_file, csv_file=None, preserve_linebreaks=False)

# # Example 3 (disabled): explicit output filename with line breaks preserved
# csv_file_custom = srt_file.replace('.srt', '_custom.csv')
# srt_to_csv(srt_file, csv_file_custom, preserve_linebreaks=True)

In [None]:
csv_file = r'D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english_SE_br_converted_cleaned_gpt_4_1_translated_d_openai_gpt_5_pv2.0_edited.csv'
# The SRT is written next to the CSV under the same name with the extension
# swapped to .srt. splitext only touches the final extension, so other dots in
# the path (e.g. "pv2.0") are left alone and the result can never equal the
# input path.
srt_file = os.path.splitext(csv_file)[0] + '.srt'

# Read the CSV to check available columns
df = pd.read_csv(csv_file)
print("Available columns in CSV:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i}. {col!r}")

# Column names with surrounding whitespace and quote characters removed
cleaned_columns = [col.strip().strip("'\"") for col in df.columns]

# Text column preferences, most specific first: the exact edited column, the
# exact translated column, then looser case-insensitive matches.
preferences = (
    lambda name: name == "Translated Text (edited)",
    lambda name: name == "Translated Text",
    lambda name: 'translated' in name.lower() and 'edited' in name.lower(),
    lambda name: 'translated' in name.lower(),
)
# First column (in file order) satisfying the highest-ranked preference
column = next(
    (name for matches in preferences for name in cleaned_columns if matches(name)),
    None,
)

print(f'Column selected for SRT conversion: {column}')

if column is None:
    # Report the real problem instead of passing None into csv_to_srt
    print("Error during conversion: no translated text column found in the CSV")
else:
    try:
        # csv to srt conversion
        csv_to_srt(csv_file, column, srt_file)
    except Exception as e:
        # Notebook boundary: report the failure instead of raising a traceback
        print(f"Error during conversion: {e}")
    else:
        print(f"Successfully converted CSV to SRT: {srt_file}")