In [1]:
import pandas as pd
import jsonlines
import numpy as np
import re

In [24]:
def rnd(num):
    return round(num * 2) / 2

def get_df(tv_show):
    base_path = "../data"
    speaker_path = f"{base_path}/diarized/{tv_show}.csv"
    transcr_path = f"{base_path}/transcriptions/{tv_show}.jsonl"

    df = pd.read_csv(speaker_path, header=None)
    df.columns = ["start", "end", "speaker"]
    def transform_speaker(speaker_str):
        return int(speaker_str.split("_")[2])
    df["speaker"] = df["speaker"].apply(transform_speaker)

    # round to nearest 0.5 seconds
    max_time = rnd(df.iloc[-1]["end"])

    # create a time series from 0 to max_time
    series = pd.Series(np.arange(0, max_time + 0.5, 0.5))
    series

    new_df = pd.DataFrame()
    new_df["time"] = series

    # DIARIZATION
    # for each speaker, find the time series indices that fall within the speaker's start-end time frame
    df_as_obj = df.to_dict(orient="records")
    for speaker in df_as_obj:
        speaker_id = speaker["speaker"]
        start = rnd(speaker["start"])
        end = rnd(speaker["end"])
        new_df[speaker_id] = new_df["time"].apply(lambda x: 1 if x >= start and x <= end else 0)

    print(df_as_obj[:10])

    # convert from detailed view to a single speaker col, -1 if there's no speaker
    new_df["speaker"] = -1
    for index, row in new_df.iterrows():
        for speaker_id in df["speaker"].unique():
            if row[speaker_id] == 1:
                new_df.at[index, "speaker"] = speaker_id
    new_df = new_df[["time", "speaker"]]

    # TRANSCRIPTIONS
    parsed = []
    with jsonlines.open(transcr_path) as reader:
        for obj in reader:
            timestamp = obj["timestamp"]
            txt = obj["text"]
            start, end = timestamp
            parsed.append({"start": start, "end": end, "text": txt})
    transcript_df = pd.DataFrame(parsed)
    transcript_df["text"] = transcript_df["text"].apply(lambda x: x.strip())
    # replace nan with max_end
    transcript_df["end"] = transcript_df["end"].fillna(max_time)
    # round all times to 0.5
    transcript_df["start"] = transcript_df["start"].apply(lambda x: rnd(x))
    transcript_df["end"] = transcript_df["end"].apply(lambda x: rnd(x))

    pattern = re.compile(r"\d+(?:,\d+)+")
    def filter_start(sent):
        sent = pattern.sub("", sent)
        return re.sub(r"^[^a-zA-Z0-9]+", "", sent)

    transcript_df["text"] = transcript_df["text"].apply(filter_start)

    # expand the start-end to 0.5 second intervals
    for index, row in transcript_df.iterrows():
        start = row["start"]
        end = row["end"]
        text = row["text"]
        for i in np.arange(start, end, 0.5):
            new_df.loc[new_df["time"] == i, "text"] = text
    new_df.fillna("", inplace=True)

    # create a mapping from text -> speaker
    text_to_speaker = {}
    for index, row in new_df.iterrows():
        speaker = row["speaker"]
        text = row["text"]

        if speaker == -1 or text == "":
            continue
        if text not in text_to_speaker:
            text_to_speaker[text] = {}
        if speaker not in text_to_speaker[text]:
            text_to_speaker[text][speaker] = 0
        text_to_speaker[text][speaker] += 1

    text_to_speaker = {k: max(v, key=v.get)
                    for k, v in sorted(text_to_speaker.items(),
                                        key=lambda item: sum(item[1].values()), reverse=True)}

    new_df["speaker"] = new_df["text"].apply(
        lambda x: text_to_speaker[x] if x in text_to_speaker else -1)

    new_df = new_df[new_df["text"] != ""]
    new_df = new_df.drop_duplicates(subset=["text"])
    
    new_df.reset_index(drop=True, inplace=True)
    return new_df

df = get_df("Debatten12okt")

[{'start': 3.9, 'end': 7.0, 'speaker': 5}, {'start': 7.9, 'end': 9.7, 'speaker': 0}, {'start': 10.3, 'end': 11.0, 'speaker': 0}, {'start': 11.6, 'end': 14.2, 'speaker': 0}, {'start': 16.0, 'end': 16.1, 'speaker': 2}, {'start': 17.5, 'end': 25.0, 'speaker': 2}, {'start': 25.6, 'end': 34.9, 'speaker': 5}, {'start': 35.6, 'end': 36.0, 'speaker': 5}, {'start': 36.2, 'end': 38.9, 'speaker': 5}, {'start': 39.1, 'end': 60.0, 'speaker': 5}]


In [23]:
df.head(50)

Unnamed: 0,time,speaker,text
0,4.0,-1,Hei. Denne debatten skal handle om dette.
1,7.5,-1,Målet er at flest mulig skal jobbe. Derfor må ...
2,16.0,-1,Og dette.
3,17.5,-1,Det er litt forstemmende at vi ofte får en dis...
4,22.0,-1,"nivået på ytelser, og altfor lite om hvordan v..."
5,27.0,-1,Selvfølgelig vet disse to dyrkdrevne politiker...
6,30.5,-1,når de sier at flere utenfor arbeid må stå opp...
7,33.0,-1,Selvfølgelig kommer reaksjonene fra de som ikk...
8,36.5,-1,Men i kveld skal vi ta Vedum og Brenna på ordet.
9,39.5,-1,Når de sier at de er så bekymret at over 600 0...


Unnamed: 0,time,speaker
0,4.0,5
1,8.0,0
2,10.5,0
3,11.5,0
4,16.0,2
5,17.5,2
6,25.5,5
7,35.5,5
8,36.0,5
9,39.0,5


In [62]:
tv_show = "Debatten12okt"
base_path = "../data"
speaker_path = f"{base_path}/diarized/{tv_show}.csv"

def rnd(num):
    return round(num * 2) / 2

df = pd.read_csv(speaker_path, header=None)
df.columns = ["start", "end", "speaker"]
# keep only start and speaker. rename start to "time"
df = df[["start", "speaker"]]
df = df.rename(columns={"start": "time"})
df["time"] = df["time"].apply(rnd)

def transform_speaker(speaker_str):
    return int(speaker_str.split("_")[2])

df["speaker"] = df["speaker"].apply(transform_speaker)

# TRANSCRIPTIONS
transcr_path = f"{base_path}/transcriptions/{tv_show}.jsonl"
parsed = []
with jsonlines.open(transcr_path) as reader:
    for obj in reader:
        timestamp = obj["timestamp"]
        txt = obj["text"]
        start, end = timestamp
        parsed.append({"time": rnd(start), "text": txt})

transcript_df = pd.DataFrame(parsed)
transcript_df["text"] = transcript_df["text"].apply(lambda x: x.strip())

pattern = re.compile(r"\d+(?:,\d+)+")
def filter_start(sent):
    sent = pattern.sub("", sent)
    return re.sub(r"^[^a-zA-Z0-9]+", "", sent)

transcript_df["text"] = transcript_df["text"].apply(filter_start)


for index, row in transcript_df.iterrows():
    time = row["time"]
    text = row["text"]
    closest = df.iloc[(df["time"]-time).abs().argsort()[:1]]
    closest_speaker = closest["speaker"].values[0]
    transcript_df.at[index, "speaker"] = closest_speaker
    
    
transcript_df.head(20)

Unnamed: 0,time,text,speaker
0,4.0,Hei. Denne debatten skal handle om dette.,5.0
1,7.5,Målet er at flest mulig skal jobbe. Derfor må ...,0.0
2,16.0,Og dette.,2.0
3,17.5,Det er litt forstemmende at vi ofte får en dis...,2.0
4,22.0,"nivået på ytelser, og altfor lite om hvordan v...",5.0
5,27.0,Selvfølgelig vet disse to dyrkdrevne politiker...,5.0
6,30.5,når de sier at flere utenfor arbeid må stå opp...,5.0
7,33.0,Selvfølgelig kommer reaksjonene fra de som ikk...,5.0
8,36.5,Men i kveld skal vi ta Vedum og Brenna på ordet.,5.0
9,39.5,Når de sier at de er så bekymret at over 600 0...,5.0


In [None]:
# change the "speaker" to iteratively increase. I.e. start from 1 and increment as we observe new speakers.


In [61]:
# from the transcript df, find the closest matching speaker



Unnamed: 0,time,text,speaker
0,4.0,Hei. Denne debatten skal handle om dette.,5.0
1,7.5,Målet er at flest mulig skal jobbe. Derfor må ...,0.0
2,16.0,Og dette.,2.0
3,17.5,Det er litt forstemmende at vi ofte får en dis...,2.0
4,22.0,"nivået på ytelser, og altfor lite om hvordan v...",5.0
5,27.0,Selvfølgelig vet disse to dyrkdrevne politiker...,5.0
6,30.5,når de sier at flere utenfor arbeid må stå opp...,5.0
7,33.0,Selvfølgelig kommer reaksjonene fra de som ikk...,5.0
8,36.5,Men i kveld skal vi ta Vedum og Brenna på ordet.,5.0
9,39.5,Når de sier at de er så bekymret at over 600 0...,5.0
