In [None]:
import os
from claudette import *

In [None]:
# Keep a key that is already exported in the environment; the placeholder below is
# only a fallback and must be replaced with a real key (never commit a real one).
os.environ.setdefault('ANTHROPIC_API_KEY', 'your-api-key')

In [None]:
# NOTE(review): `models` is presumably the model-name list brought in by
# `from claudette import *`; picking by position means the selected model can change
# if that list is reordered between library versions — confirm index 1 is still intended.
model = models[1] #sonnet 3.5

In [None]:
# Module-level conversation object. process_chunk() sends every prompt through it and
# rebinds it (via `global chat`) whenever it starts a fresh conversation.
chat = Chat(model, sp="""You are a helpful and concise assistant that will correct a diarization script.""")

In [None]:
def process_chunk(chunk, index, total_chunks):
    """Ask the model to correct one chunk of a diarization transcript.

    The chunk is wrapped in ``<text-to-correct>`` tags inside a prompt and sent
    through the module-level ``chat`` object.

    Args:
        chunk: Transcript text to correct.
        index: Zero-based position of this chunk; 0 selects the detailed
            first-chunk prompt, any other value selects the continuation prompt.
        total_chunks: Total number of chunks, used only in the prompt text and
            in the progress message.

    Returns:
        Whatever ``contents()`` extracts from the model reply (presumably the
        corrected transcript text).

    Side effects:
        Prints a progress line, and on every even ``index`` greater than 0
        rebinds the global ``chat`` to a new ``Chat`` instance.
    """
    global chat
    if index == 0:
        prompt = f'''In the speaker diarization transcript below, some words are potentially misplaced due to bleeding. As in one speaker is talking then immediately after a second speaker is talking, and the first words of the second speaker are attributed to the last words of the first speaker. I need you to read the meaning and context of the sentences and make your best guess about how the sentences should be re-distributed 
                  They may also be displaced in the following manner-- Two or more speakers speak, and the words are all attributed to one speaker, when actually 2 or three speakers spoke like this: speaker_1 "talking" speaker_2 "talking" speaker_1 "talking", but all the transcription captures is speaker_1 "talking talking talking" 
                  Please correct those words and move them to the right speaker. Directly show the corrected transcript without explaining what changes were made or why you made those changes. 
                  This is the first of {total_chunks} parts. Correct this chunk in isolation. Text to correct is indicated by this tag <text-to-correct></text-to-correct>
                  Feel free to attempt to correct the speaker identified, for example if it says it is speaker_01, but you think it is speaker_02, correct that, also if you know the actual name of a character, replace the "SPEAKER_01" with it's actual character name
                  <text-to-correct>{chunk}</text-to-correct>
                '''
    else:
        if index % 2 == 0:
            # Start a fresh conversation every second chunk.
            # NOTE(review): a new Chat presumably carries no history, so the
            # "previous parts" mentioned in the prompt below are not visible to the
            # model on these chunks — confirm this trade-off is intended.
            chat = Chat(model, sp="""You are a helpful and concise assistant that will correct a diarization script.""")
        # The section to correct is named by the same tag that actually wraps the
        # chunk (<text-to-correct>), so the instruction and the payload agree.
        prompt = f'''In the speaker diarization transcript below, some words are potentially misplaced.
                  Please correct those words and move them to the right speaker. Directly show the corrected transcript without explaining what changes were made or why you made those changes.
                  Continue diarization from previous {index} parts, maintaining consistent speaker attribution and context. The diarization transcript to correct is indicated by this tag <text-to-correct></text-to-correct>
                  This is part {index + 1} of {total_chunks}. Correct the section indicated by <text-to-correct></text-to-correct> only, and directly show the corrected transcript without explaining what changes were made or why you made those changes.
                  Feel free to attempt to correct the speaker identified, for example if it says it is speaker_01, but you think it is speaker_02, correct that, also if you know the actual name of a character, replace the "SPEAKER_01" with it's actual character name
                  <text-to-correct>{chunk}</text-to-correct>
                '''

    r = chat(prompt)
    print(f"Processed chunk {index + 1} of {total_chunks}")
    corrected_text = contents(r)
    return corrected_text

In [None]:
def read_and_chunk(file_path, max_size=20000):
    """Read a UTF-8 text file and group its lines into chunks.

    Lines are joined with ``'\\n'`` and a new chunk is started whenever adding
    the next line would make the current chunk longer than ``max_size``
    characters. Lines are never split, so a single line longer than
    ``max_size`` becomes a chunk of its own.

    Args:
        file_path: Path of the text file to read.
        max_size: Maximum chunk length in characters (default 20000).

    Returns:
        A list of non-empty chunk strings; an empty file yields ``[]``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    chunks = []
    current_chunk = ""

    for line in text.split('\n'):
        # Measure exactly what the chunk would become: the separator only counts
        # when there is already text to separate from.
        candidate = f"{current_chunk}\n{line}" if current_chunk else line
        if current_chunk and len(candidate) > max_size:
            # Only flush a chunk that has content, so an oversized first line
            # does not produce an empty leading chunk.
            chunks.append(current_chunk)
            current_chunk = line
        else:
            current_chunk = candidate

    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [None]:
def process_conversation(file_path):
    """Correct a whole transcript file chunk by chunk.

    The file is split with ``read_and_chunk`` and each chunk is passed, in
    order, to ``process_chunk`` together with its zero-based index and the
    total chunk count.

    Args:
        file_path: Path of the transcript file to correct.

    Returns:
        The corrected chunks joined with ``'\\n'`` (an empty string when the
        file produced no chunks).
    """
    chunks = read_and_chunk(file_path)
    total_chunks = len(chunks)

    results = []
    for index, chunk in enumerate(chunks):
        # process_chunk keeps its conversation state in the module-level `chat`,
        # so only the chunk and its position are passed.
        results.append(process_chunk(chunk, index, total_chunks))

    return '\n'.join(results)

In [None]:
def write_results_to_file(original_file_path, processed_text):
    """Write processed text to an ``improved_``-prefixed copy of a file name.

    The prefix is applied to the file name only, so the output lands in the
    same directory as ``original_file_path`` (the current working directory
    when a bare file name is given).

    Args:
        original_file_path: Path or bare name of the original file.
        processed_text: Text to write, encoded as UTF-8.
    """
    directory, file_name = os.path.split(original_file_path)
    # Prefix the base name rather than the whole path: prefixing the path would
    # turn "dir/file.txt" into "improved_dir/file.txt".
    improved_file_path = os.path.join(directory, f"improved_{file_name}")
    with open(improved_file_path, 'w', encoding='utf-8') as file:
        file.write(processed_text)
    print(f"Results written to {improved_file_path}")

In [None]:
# Sanity check: show where relative paths will resolve and what is available there.
print(f"Current working directory: {os.getcwd()}")
print(f"Files in current directory: {os.listdir()}")

In [None]:
conversation_path = 'diarization/transcription_output_A_Ch.txt'
# write_results_to_file() adds an "improved_" prefix to the name it is given; passing
# the bare file name puts the output in the current working directory. The name is
# derived from conversation_path so the two cannot drift apart.
improved_path = os.path.basename(conversation_path)
processed_text = process_conversation(conversation_path)
write_results_to_file(improved_path, processed_text)