In [1]:
from pathlib import Path

import pandas as pd

from psifx.io import json, rttm, vtt

In [2]:
# Every input and output of this notebook lives under the dataset root below.
root = Path("/home/guillaume/Datasets/UNIL/CH.102")

# Inputs: the transcription, the diarization and the speaker identification.
transcription_path = root.joinpath("Transcriptions/Mixed.normalized.vtt")
diarization_path = root.joinpath("Diarizations/Mixed.normalized.rttm")
identification_path = root.joinpath("Identifications/Mixed.normalized.json")

# Output: the transcription with a speaker attached to every segment.
enhanced_transcription_path = root.joinpath(
    "Transcriptions/Mixed.normalized.enhanced.vtt"
)

In [3]:
# Read the VTT transcription and tabulate it in one step; the resulting frame
# has one row per segment (start, end, speaker, text in the output below).
transcription = pd.DataFrame.from_records(vtt.VTTReader.read(transcription_path))
transcription

Reading: 100%|██████████| 45/45 [00:00<00:00, 125913.06it/s]


Unnamed: 0,start,end,speaker,text
0,0.0,9.0,,"On peut parler ? Du coup, t'as fait quoi ce we..."
1,9.0,13.0,,"Ce que j'ai fait ce weekend, j'avais la nuance..."
2,13.0,14.0,,Ce qui est Arthur ?
3,14.0,16.0,,"C'est un des potes, tu sais avec qui je vais m..."
4,16.0,18.0,,Super bizarre.
5,18.0,23.0,,"Ok, il était là ou pas au soirée que t'as fait..."
6,23.0,26.0,,"Il était pas là la dernière, non."
7,26.0,29.0,,"Ah c'est ça, t'es pas le pépel ?"
8,30.0,32.0,,"Non, il a jamais pas."
9,32.0,34.0,,"Ah ok, je ne pense pas que je vais à Kissel."


In [4]:
# Read the RTTM diarization into a frame and derive each turn's end time,
# since the file only provides a start and a duration.
diarization = pd.DataFrame.from_records(
    rttm.RTTMReader.read(diarization_path)
).assign(end=lambda turns: turns["start"] + turns["duration"])
diarization

Decoding: 100%|██████████| 26/26 [00:00<00:00, 1109.29it/s]


Unnamed: 0,type,file_stem,channel,start,duration,orthography,speaker_type,speaker_name,confidence_score,signal_lookahead_time,end
0,SPEAKER,Mixed.normalized,1,5.776,4.061,,,SPEAKER_01,,,9.837
1,SPEAKER,Mixed.normalized,1,10.009,1.024,,,SPEAKER_00,,,11.033
2,SPEAKER,Mixed.normalized,1,12.5,3.003,,,SPEAKER_00,,,15.503
3,SPEAKER,Mixed.normalized,1,13.251,0.512,,,SPEAKER_01,,,13.763
4,SPEAKER,Mixed.normalized,1,16.305,0.171,,,SPEAKER_00,,,16.476
5,SPEAKER,Mixed.normalized,1,16.476,6.57,,,SPEAKER_01,,,23.046
6,SPEAKER,Mixed.normalized,1,24.036,3.993,,,SPEAKER_00,,,28.029
7,SPEAKER,Mixed.normalized,1,26.049,0.358,,,SPEAKER_01,,,26.407
8,SPEAKER,Mixed.normalized,1,27.517,1.28,,,SPEAKER_01,,,28.797
9,SPEAKER,Mixed.normalized,1,29.582,1.775,,,SPEAKER_00,,,31.357


In [5]:
# Load the speaker-identification result. Note that `json` here is
# `psifx.io.json` (see the imports cell), not the standard-library module.
identification = json.JSONReader.read(identification_path)
# Lookup from a diarization speaker label (e.g. "SPEAKER_00") to the name it
# was identified as (audio file names in the output below).
mapping = identification["mapping"]
# Display the full result, including the "agreement" entry.
identification

Reading: 100%|██████████| 1/1 [00:00<00:00, 6384.02it/s]


{'mapping': {'SPEAKER_00': 'Left.normalized.wav',
  'SPEAKER_01': 'Right.normalized.wav'},
 'agreement': 0.52}

In [None]:
def interval_iou(start_a, end_a, start_b, end_b):
    """Return the intersection-over-union of two time intervals.

    Each interval is given by its start and end, all four values expressed in
    the same unit. The result is 0.0 when the intervals do not overlap, and
    also when their union has no extent (e.g. two identical zero-length
    intervals), which avoids dividing by zero.
    """
    intersection = max(0.0, min(end_a, end_b) - max(start_a, start_b))
    union = max(0.0, max(end_a, end_b) - min(start_a, start_b))
    return intersection / union if union > 0.0 else 0.0


# Extract the diarization columns once, rather than materialising a row
# Series with `.iloc` for every (segment, turn) pair.
diarization_turns = list(
    zip(diarization["start"], diarization["end"], diarization["speaker_name"])
)

# For every transcription segment, pick the diarization turn with the highest
# temporal IoU and translate its label through `mapping`.
speaker_names = []
for segment in transcription.itertuples(index=False):
    best_turn_index, best_iou = None, 0.0
    for turn_index, (turn_start, turn_end, _) in enumerate(diarization_turns):
        iou = interval_iou(segment.start, segment.end, turn_start, turn_end)
        # Strict comparison: the earliest turn wins ties, and a turn with no
        # overlap at all (IoU of 0) is never selected.
        if iou > best_iou:
            best_turn_index, best_iou = turn_index, iou

    if best_turn_index is not None:
        speaker_label = diarization_turns[best_turn_index][2]
        speaker_names.append(mapping[speaker_label])
    else:
        # Segments overlapping no diarization turn are labelled "NA"; no
        # nearest-turn fallback is applied.
        speaker_names.append("NA")

# Assign the whole column at once. Unlike writing through `.loc` with
# positional indices, this does not rely on the frame having a default
# RangeIndex, and it avoids inserting strings one cell at a time into a
# column that may have been inferred as float (all missing values).
transcription["speaker"] = speaker_names
transcription

In [None]:
# Turn the speaker-annotated transcription back into a list of segment
# records, keeping only the fields that are written out, in this order.
segments = transcription[["start", "end", "speaker", "text"]].to_dict(
    orient="records"
)

# Write to the output location declared in the configuration cell instead of
# repeating the absolute path here, so both always agree.
# NOTE(review): assumes VTTWriter.write accepts a Path, as the readers used in
# this notebook do - confirm.
vtt.VTTWriter.write(
    path=enhanced_transcription_path,
    segments=segments,
    verbose=True,
    overwrite=True,
)