In [None]:
import re
import os
from docx import Document

In [None]:
def formate_transcripts(filename):
    """Extract timestamped ``Speaker: text`` lines from a transcript file.

    The file is scanned line by line. A line that starts with a cue timing
    such as ``00:00:01.000 -->`` becomes the pending timestamp. The first
    following line that contains a colon preceded by at least one character
    is recorded together with that timestamp, after which the pending
    timestamp is cleared until the next timing line.

    Args:
        filename: Path of the transcript file to read.

    Returns:
        A list of dicts with the keys ``"timestamp"`` (the full timing line),
        ``"statement"`` (the stripped ``Speaker: text`` line) and ``"index"``
        (zero-based line number of the statement within the file). An empty
        list is returned when the ISO-8859-1 fallback read fails.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
        # BOM; with plain 'utf-8' the BOM stays glued to the first line and
        # prevents a timestamp on that line from matching.
        with open(filename, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        try:
            with open(filename, 'r', encoding='ISO-8859-1') as f:
                lines = f.readlines()
        except (OSError, UnicodeError):
            # Only I/O and decoding problems are treated as "unreadable";
            # KeyboardInterrupt and programming errors still propagate.
            print(f"Failed to read file {filename} with both UTF-8 and ISO-8859-1 encodings. Skipping this file.")
            return []

    results = []
    timestamp = ""
    for i, line in enumerate(lines):
        line = line.strip()

        # Detect timestamp line
        if re.match(r"\d{2}:\d{2}:\d{2}\.\d{3} -->", line):
            timestamp = line

        # A line with a colon after a timestamp is the speaker and statement
        elif timestamp and re.match(r".+?:", line):
            results.append({
                "timestamp": timestamp,
                "statement": line,
                "index": i  # line number in the file, not the cue number
            })
            timestamp = ""  # reset until next timestamp

    return results

In [None]:
def get_speaker(results, pid, interviewers=("JaeWon Kim", "Thea Klein-Balajee")):
    """Split each ``Speaker: text`` entry and anonymise non-interviewer speakers.

    Args:
        results: Sequence of dicts with the keys ``"statement"``, ``"index"``
            and ``"timestamp"``.
        pid: Label that replaces every speaker name not listed in
            ``interviewers`` (including entries that have no colon at all).
        interviewers: Collection of speaker names that are kept verbatim.

    Returns:
        A dict keyed by the position of the entry in ``results``. Each value
        holds ``"speaker"`` (str) plus the single-element lists
        ``"statement"``, ``"index"`` and ``"timestamp"``. A statement that does
        not end in ``.``, ``?`` or ``!`` gets ``"... "`` appended.
    """
    dct = {}
    for key, value in enumerate(results):
        raw = value['statement']
        if ":" in raw:
            speaker, statement = raw.split(":", 1)
            # Tolerate "Name :" so a stray space does not turn a known
            # interviewer into a participant.
            speaker = speaker.strip()
        else:
            speaker = "NA"
            statement = raw

        if speaker not in interviewers:
            speaker = pid  # Replace the speaker with the PID

        # Check the punctuation on the stripped text so trailing whitespace
        # does not cause a spurious "... " marker.
        statement = statement.strip()
        if not statement.endswith((".", "?", "!")):
            statement += "... "

        dct[key] = {
            "speaker": speaker,
            "statement": [statement],
            'index': [value['index']],
            'timestamp': [value['timestamp']],
        }

    return dct

In [None]:
def formate_statements(dct):
    """Merge consecutive entries spoken by the same speaker, in place.

    Walking the entries in key order, whenever entry ``k`` and entry ``k + 1``
    share a speaker, everything accumulated in ``k`` is moved into ``k + 1``
    and the ``"statement"``, ``"index"`` and ``"timestamp"`` fields of ``k``
    are overwritten with the sentinel string ``"APPENDED"``.

    Args:
        dct: Dict with consecutive integer keys whose values hold
            ``"speaker"`` and the lists ``"statement"``, ``"index"`` and
            ``"timestamp"``.

    Returns:
        The same ``dct`` object. In the surviving entry of each run the
        earlier text is placed before the entry's own statement, while the
        earlier indexes and timestamps are appended after its own, so the
        last element of those lists belongs to the earliest merged entry.
    """
    for key, value in dct.items():
        following = dct.get(key + 1)
        if following is None:
            continue  # last entry: nothing to merge into
        if value['speaker'] != following['speaker']:
            continue

        # Join the accumulated fragments with single spaces. Concatenating
        # them with "" glued sentences together ("Hello.How are you?") once
        # three or more consecutive turns were merged.
        merged = " ".join(part.strip() for part in value['statement'])
        following['statement'].insert(0, merged)
        value['statement'] = "APPENDED"

        following['index'].extend(value['index'])
        value['index'] = "APPENDED"

        following['timestamp'].extend(value['timestamp'])
        value['timestamp'] = "APPENDED"
    return dct

In [None]:
def save_clean_transcript(dct, filename):
    """Write the merged transcript to ``./cleaned/<name>.docx``.

    For every entry whose ``"statement"`` is not the ``"APPENDED"`` sentinel,
    three paragraphs are added: ``"<speaker> <timestamp>"``, the statement
    fragments joined with spaces, and an empty separator paragraph.

    Args:
        dct: Dict of entries holding ``"speaker"``, ``"statement"`` and
            ``"timestamp"``.
        filename: Name of the source transcript. Only its base name without
            the extension is used to name the output document.
    """
    cleaned_directory = os.path.join('.', 'cleaned')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(cleaned_directory, exist_ok=True)

    # Replace extension with .docx (handles both .vtt and .txt). basename()
    # keeps the output inside ./cleaned even when `filename` carries a
    # directory part or is absolute (os.path.join would otherwise discard
    # the ./cleaned prefix for an absolute path).
    base_name = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(cleaned_directory, f"{base_name}.docx")

    # Create a new Document
    doc = Document()

    for value in dct.values():
        if value['statement'] == "APPENDED":
            continue  # content was merged into a later entry

        # Last timestamp in the list, start time only, without milliseconds
        formatted_timestamp = value['timestamp'][-1].split('-->')[0].strip().split('.')[0]

        doc.add_paragraph(f"{value['speaker']} {formatted_timestamp}")
        doc.add_paragraph(" ".join(value['statement']))
        doc.add_paragraph()  # Add separation line

    # Save the Document
    doc.save(output_path)

In [None]:
# Names of the regular files in the current working directory
# (sub-directories are left out).
files = list(filter(os.path.isfile, os.listdir('.')))

In [None]:
# Convert every raw transcript in `files` into a cleaned .docx document.
for filename in files:
    # Only .vtt/.txt transcripts, excluding files named "*_cleaned.vtt".
    if not filename.endswith((".vtt", ".txt")) or filename.endswith("_cleaned.vtt"):
        continue

    # The participant ID is the file name without its extension.
    pid = os.path.splitext(os.path.basename(filename))[0]
    if not pid:
        print(f"Failed to extract PID from filename: {filename}")
        continue

    results = formate_transcripts(filename)
    if not results:
        # An unreadable file or one without any "Speaker: text" cue yields
        # no entries; skip it instead of writing an empty document.
        print(f"No speaker statements found in {filename}; no document written.")
        continue

    dct = get_speaker(results, pid)
    clean = formate_statements(dct)
    save_clean_transcript(clean, filename)
