In [25]:
from thefuzz import fuzz
import numpy as np
import os
from collections import defaultdict

### Functions to Standardize Names from OCR

In [2]:
def standardize_name(name):
    """Return ``name`` with all whitespace removed and letters lowercased.

    The result is the comparison key used for fuzzy name matching, so that
    differences in spacing or capitalization do not affect the score.
    """
    tokens = name.split()  # no-arg split() drops every run of whitespace
    squashed = ''.join(tokens)
    return squashed.lower()

def group_similar_speaker_names(pairs, threshold=85):
    """Collapse near-duplicate speaker names onto one canonical spelling.

    Names are compared after ``standardize_name``. The first standardized
    name seen becomes a canonical form; a later name is mapped onto the first
    existing canonical form whose ``fuzz.ratio`` score against it is at least
    ``threshold``, otherwise it becomes a new canonical form itself.

    Args:
        pairs: Sequence of ``(timestamp, name)`` pairs. It is iterated twice,
            so it must be re-iterable (list, array), not a one-shot generator.
        threshold: Minimum ``fuzz.ratio`` score for two standardized names to
            be treated as the same speaker.

    Returns:
        A list of ``(timestamp, canonical_name)`` tuples in input order. The
        canonical name is the *standardized* form (no whitespace, lowercase),
        not the original spelling.
    """
    canonical_names = []
    name_map = {}

    for _, name in pairs:
        # A raw name that was already resolved would resolve to the same
        # canonical form again (the list of canonical forms only grows at the
        # end), so skip the repeated fuzzy scan for it.
        if name in name_map:
            continue

        std_name = standardize_name(name)
        match = next(
            (canon for canon in canonical_names
             if fuzz.ratio(std_name, canon) >= threshold),
            None,
        )
        if match is None:
            canonical_names.append(std_name)
            match = std_name
        name_map[name] = match

    # Replace every raw name with its canonical (standardized) representative.
    return [(timestamp, name_map[name]) for timestamp, name in pairs]

### Functions to merge Diarization and Speaker changes dicts

In [21]:
def merge_consecutive_speaker_segments(segments, custom_end_time=None):
    """Turn ``(timestamp, speaker)`` observations into speaker intervals.

    Consecutive observations with the same speaker are merged. Each interval
    runs from the first timestamp at which a speaker is observed to the
    timestamp at which a different speaker is first observed.

    Args:
        segments: Sequence of entries whose item 0 is a timestamp and item 1
            is a speaker label, in the order they should be merged.
        custom_end_time: Optional end time for the final speaker's interval.
            When it is falsy (``None`` or ``0``) the final interval has zero
            length and is therefore omitted.

    Returns:
        A list of ``{'start', 'end', 'speaker'}`` dicts; empty when
        ``segments`` is empty.
    """
    # len() rather than truthiness so that numpy arrays are handled as well.
    if len(segments) == 0:
        return []

    merged = []
    current_speaker = segments[0][1]
    start = end = segments[0][0]

    for entry in segments:
        timestamp, speaker = entry[0], entry[1]
        if speaker == current_speaker:
            continue
        # Speaker changed: close the running interval at this timestamp and
        # open a new one for the new speaker at the same instant.
        merged.append({'start': start, 'end': timestamp, 'speaker': current_speaker})
        start = end = timestamp
        current_speaker = speaker

    if custom_end_time:
        end = custom_end_time

    # Skip a zero-length trailing interval (no explicit end time was given).
    if start != end:
        merged.append({'start': start, 'end': end, 'speaker': current_speaker})
    return merged

def merge_diarization(diarization):
    """Merge consecutive diarization pieces spoken by the same speaker.

    Args:
        diarization: Sequence of dicts with the keys ``'start'``, ``'end'``,
            ``'speaker'`` and ``'text'``, in the order they should be merged.

    Returns:
        A list of dicts with the same four keys. Each merged entry starts at
        the ``'start'`` of its first piece, ends at the ``'end'`` of its last
        piece, and its text is the pieces' texts joined by single spaces.
        Empty when ``diarization`` is empty.
    """
    # len() rather than truthiness so that numpy arrays are handled as well.
    if len(diarization) == 0:
        return []

    merged = []
    current = None   # merged segment currently being built
    texts = []       # text pieces collected for `current`

    for piece in diarization:
        if current is not None and piece['speaker'] == current['speaker']:
            # Same speaker keeps talking: extend the running segment.
            texts.append(piece['text'])
            current['end'] = piece['end']
            continue

        if current is not None:
            current['text'] = ' '.join(texts)
            merged.append(current)

        # A new speaker's segment begins at this piece's own start time
        # (not at its end time).
        current = {
            'start': piece['start'],
            'end': piece['end'],
            'speaker': piece['speaker'],
            'text': '',
        }
        texts = [piece['text']]

    # Add the last segment
    current['text'] = ' '.join(texts)
    merged.append(current)
    return merged

### Identify top speaker based on query interval

In [35]:
def top_speaker(intervals, query_start, query_end):
    """Find the speaker who talks the most inside a query window.

    Args:
        intervals: Iterable of dicts with ``'start'``, ``'end'`` and
            ``'speaker'``; start/end are converted with ``float()``.
        query_start: Start of the query window.
        query_end: End of the query window.

    Returns:
        ``(speaker, seconds)``. For a window with ``query_start == query_end``
        this is the first interval containing that instant together with that
        interval's full length. Otherwise it is the speaker with the largest
        total overlap with the window and that overlap. ``(None, 0.0)`` is
        returned when nothing matches.
    """
    if query_start == query_end:
        # Zero-length window: look for an interval covering that instant.
        for seg in intervals:
            if float(seg['start']) <= query_start <= float(seg['end']):
                return seg['speaker'], float(seg['end']) - float(seg['start'])
        return None, 0.0  # nobody was speaking at that exact time

    # Accumulate, per speaker, how much of the window their intervals cover.
    totals = defaultdict(float)
    for seg in intervals:
        lo = max(float(seg['start']), query_start)
        hi = min(float(seg['end']), query_end)
        if lo < hi:
            totals[seg['speaker']] += hi - lo

    if not totals:
        return None, 0.0

    # On ties, max() keeps the speaker that was encountered first.
    winner = max(totals, key=totals.get)
    return winner, totals[winner]

### Load Data

In [6]:
# Input directories. Files from the two directories are paired by identical
# file name when the common-file list is built.
# NOTE(review): these are absolute, machine-specific paths — consider making
# them configurable relative to a single data directory.
speaker_changes_path = '/work/users/s/m/smerrill/Albemarle/speakerChanges'
# NOTE(review): "diariaztion" is a misspelling of "diarization"; the name is
# kept as-is because other cells reference it.
whisper_diariaztion_path = '/work/users/s/m/smerrill/Albemarle/whisperDiarization'

In [13]:
# List both input directories and keep only the file names present in both.
speaker_changes_files = os.listdir(speaker_changes_path)
whisper_diariaztion_files = os.listdir(whisper_diariaztion_path)
# NOTE: the list is built from a set, so its order is arbitrary and may differ
# between kernel sessions; wrap in sorted() if a stable order is needed.
common_files = list(set(speaker_changes_files) & set(whisper_diariaztion_files))

### Build transcript by combining Diarization with Visual Speaker Detection

In [44]:
# Output directory for the generated transcripts.
# NOTE(review): assumed to exist already — nothing visible here creates it.
save_path = '/work/users/s/m/smerrill/Albemarle/transcripts'

In [46]:
# For each recording that has both inputs: merge the diarization, turn the
# speaker-change observations into intervals, label every merged diarization
# segment with the speaker who overlaps it most, and save the result.
for file_name in common_files:
    changes_file = os.path.join(speaker_changes_path, file_name)
    whisper_file = os.path.join(whisper_diariaztion_path, file_name)
    
    # SECURITY: allow_pickle=True unpickles arbitrary objects and can execute
    # code from the file — only load files from a trusted source.
    # NOTE(review): keep the variable name `speakerChanges`; helper functions
    # defined earlier in this notebook may read it as a global — verify before
    # renaming.
    speakerChanges = np.load(changes_file, allow_pickle=True)
    diarization = np.load(whisper_file, allow_pickle=True)

    merged_diarization = merge_diarization(diarization)
    # End of the last diarization segment; passed on so the final speaker
    # interval gets an explicit end time.
    custom_end_time = merged_diarization[-1]['end']

    # Unify spelling variants of names, then convert the
    # observations into {'start', 'end', 'speaker'} intervals.
    speakerChanges = group_similar_speaker_names(speakerChanges)
    speakerChanges = merge_consecutive_speaker_segments(speakerChanges, custom_end_time)
    
    transcript = []
    for segment in merged_diarization:
        start, end = segment['start'], segment['end']
        text = segment['text']
        # speak_time (second element of the result) is not used below.
        speaker, speak_time = top_speaker(speakerChanges, start, end)
        if speaker is None:
            # No speaker interval matched this segment.
            # NOTE(review): this looks like leftover debugging — the break
            # drops all remaining segments, yet the partial transcript is
            # still saved below. Confirm this is intended.
            print("HERE")
            break
        transcript.append({'speaker':speaker, 'text':text})
    # `transcript` is a list of dicts, so np.save stores it as a pickled
    # object array; reading it back requires allow_pickle=True.
    np.save(os.path.join(save_path, file_name), transcript)
    # NOTE(review): this unconditional break stops after the first file in
    # common_files — presumably a leftover from testing on a single file.
    break