In [2]:
! pip install speechbrain==1.0.0 -q
! pip install faster_whisper -q
! pip install pyannote.audio -q
! pip install whisper -q
! pip install datasets -q

In [3]:
import librosa
import traceback
from faster_whisper import WhisperModel
import torch
import datasets
from pathlib import Path
import pandas as pd
import re
import time
import os
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import speechbrain
from scipy.spatial.distance import cdist

In [4]:
import datetime

# UPLOAD AUDIO

In [5]:
# Path to the input audio file (MP3) that the notebook processes
audio_file_path="/kaggle/input/dlp-ga/TEST-1.mp3"

## .mp3 to .wav conversion

* Sample rate = 16 kHz
* Channels = 1 (mono)
* Audio codec = pcm_s16le

In [8]:
# mp3 to wav conversion
# ! ffmpeg -i "{audio_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file_path.split('/')[-1][:-4]}.wav"

# SPEAKER DIARIZATION - USING WHISPER SEGMENTS AND AGGLOMERATIVE HIERARCHICAL CLUSTERING

In [9]:
# Path of the WAV file read by the rest of the notebook.
# It is derived from the input file's stem and placed in the Kaggle working
# directory, so changing `audio_file_path` is enough to switch recordings.
# (Previously a relative path was computed and immediately overwritten by a
# hard-coded absolute one.)
audio_file = str(Path("/kaggle/working") / f"{Path(audio_file_path).stem}.wav")

In [10]:
# Whisper model size names; in the visible cells this list is only defined, not read.
whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]

# Speaker-embedding model; placed on CUDA when torch reports it available, else on CPU.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cpu" if not torch.cuda.is_available() else "cuda"),
)

hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  torch.load(path, map_location=device), strict=False
  stats = torch.load(path, map_location=device)


In [12]:
def convert_time(secs):
    """Return ``secs`` as a ``datetime.timedelta`` rounded to whole seconds.

    Rounding uses Python's built-in ``round``.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)

def segment_embedding(segment, duration):
    """Compute the speaker embedding for one transcribed segment.

    Args:
        segment: mapping with ``"start"`` and ``"end"`` entries
            (presumably seconds, as produced by the transcription cell).
        duration: total audio length in the same unit; the segment end is
            clamped to it so the crop never runs past the end of the file.

    Returns:
        The value returned by ``embedding_model`` for the cropped waveform.

    Raises:
        RuntimeError: if cropping or embedding fails. The original exception
            is kept as the second argument and chained as ``__cause__``.
    """
    try:
        audio = Audio()

        start = segment["start"]
        end = min(duration, segment["end"])

        clip = Segment(start, end)
        # The sample rate returned by crop() is not needed here.
        waveform, _ = audio.crop(audio_file, clip)

        # waveform[None] adds a leading axis before the model call.
        return embedding_model(waveform[None])
    except Exception as e:
        traceback.print_exc()
        # `from e` makes the causal link explicit in the traceback.
        raise RuntimeError("Error During Segment Embedding", e) from e

In [13]:
# Load the faster-whisper ASR model ("base" size, int8 compute) and record
# the wall-clock time at which loading finished.
whisper_model = "base"
model = WhisperModel(whisper_model, compute_type="int8")
time_start = time.time()

config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

## ASR with Duration using Whisper

In [14]:
# Duration = number of samples / sample rate, with the file loaded as 16 kHz mono.
audio_data, sampling_rate = librosa.load(audio_file, sr=16000, mono=True)
duration = len(audio_data) / sampling_rate

# Transcribe the audio in English with beam search.
options = {"language": "en", "beam_size": 5, "best_of": 5}
transcribe_options = {"task": "transcribe", **options}
segments_raw, info = model.transcribe(audio_file, **transcribe_options)

# Keep only the start/end time of every returned segment as a plain dict.
segments = [
    {"start": raw_segment.start, "end": raw_segment.end}
    for raw_segment in segments_raw
]

In [15]:
# Q1: number of samples in the waveform loaded by librosa (16 kHz, mono)
print(len(audio_data))

1935244


In [16]:
# Q2
!ffmpeg -i /kaggle/working/TEST-1.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [17]:
# Q3, channel type
!ffmpeg -i /kaggle/working/TEST-1.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [18]:
# Q6: number of segments collected from the transcription
len(segments)

35

In [19]:
# Q7: value of the embedding model's `dimension` attribute
# (presumably the length of each speaker-embedding vector)
embedding_model.dimension

192

In [20]:
# Q8: number of entries in the ASR model's `supported_languages`
print(len(model.supported_languages))

100


## Embeddings for segments with SpeechBrain

In [21]:
# One row per segment. The width is taken from the embedding model instead of
# a hard-coded 192, so a different model still produces a correctly sized matrix.
embeddings = np.zeros(shape=(len(segments), embedding_model.dimension))
print(embeddings.shape)
for i, segment in enumerate(segments):
    embeddings[i] = segment_embedding(segment, duration)
# Replace NaNs before clustering. The result must be bound back to `embeddings`;
# the earlier misspelled target name (`embedddings`) silently discarded it.
embeddings = np.nan_to_num(embeddings)

(35, 192)


## Apply the Clustering Algorithm

In [28]:
# Fit K-Means with `best_num_speaker` clusters on the segment embeddings
# (fixed random_state) and display the per-segment cluster labels.
best_num_speaker = 3
clustering = KMeans(n_clusters=best_num_speaker, random_state=42)
clustering.fit(embeddings)
labels = clustering.labels_
labels



array([0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1], dtype=int32)

In [29]:
# Unpack the fitted cluster centres; this requires exactly three clusters,
# i.e. the 3-cluster K-Means fit must be the most recent one.
c_1, c_2, c_3 = clustering.cluster_centers_

In [30]:
import numpy as np

def euclidean_distance(array1, array2):
    """Return the Euclidean (L2) distance between two equally shaped arrays.

    Both inputs are converted with ``np.array``; the distance is the square
    root of the sum of squared element-wise differences over all elements.

    Raises:
        ValueError: if the two arrays do not have the same shape.
    """
    first = np.array(array1)
    second = np.array(array2)

    if first.shape != second.shape:
        raise ValueError("Arrays must have the same shape")

    delta = first - second
    return np.sqrt(np.square(delta).sum())


In [31]:
# Q9: pairwise Euclidean distances between the three cluster centres
d_12 = euclidean_distance(c_1, c_2)
d_13 = euclidean_distance(c_1, c_3)
d_23 = euclidean_distance(c_2, c_3)

for pair_label, pair_distance in (("1 and 2", d_12), ("1 and 3", d_13), ("2 and 3", d_23)):
    print(f"Distance between centers {pair_label}", pair_distance)

Distance between centers 1 and 2 303.5204511724629
Distance between centers 1 and 3 341.6032296573219
Distance between centers 2 and 3 247.53708017036368


In [32]:
# Re-fit K-Means with two clusters. The cluster count is read from
# `best_num_speaker` (rather than a separate literal) so the two cannot drift apart.
best_num_speaker = 2
clustering = KMeans(best_num_speaker, random_state=42).fit(embeddings)
labels = clustering.labels_
labels



array([0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [33]:
! pip install moviepy pandas pillow -q

In [None]:
import pandas as pd
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
from PIL import Image, ImageDraw, ImageFont

# NOTE(review): `transcription_results` is not defined in any visible cell —
# confirm where it is created before running this cell on a fresh kernel.
df_results = transcription_results

# Step 4: load the video

video_path="/content/videoplayback.mp4" # update with your video path
video = VideoFileClip(video_path)

# functions to create an image with text

def create_text_image(text, font_size=70, img_size=(640, 80), bg_color=(0,0,0), text_color=(255,255,255)):
    """Render ``text`` centred on a solid-colour RGB image.

    Args:
        text: label to draw; converted with ``str`` before drawing.
        font_size: point size used when ``arial.ttf`` can be loaded.
        img_size: ``(width, height)`` of the image in pixels.
        bg_color: RGB background colour.
        text_color: RGB text colour.

    Returns:
        The ``PIL.Image.Image`` with the text drawn on it.
    """
    text = str(text)
    img = Image.new("RGB", img_size, color=bg_color)
    d = ImageDraw.Draw(img)

    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except IOError:
        # Font file not available: fall back to Pillow's built-in font.
        font = ImageFont.load_default()

    # Measure with textbbox: ImageDraw has no `text_size` method, and the
    # older `textsize` was removed in Pillow 10.
    left, top, right, bottom = d.textbbox((0, 0), text, font=font)
    text_width = right - left
    text_height = bottom - top

    # Centre within `img_size` (the Image object itself is not subscriptable);
    # subtracting left/top compensates for the glyph box offset.
    position = (
        (img_size[0] - text_width) / 2 - left,
        (img_size[1] - text_height) / 2 - top,
    )
    d.text(position, text, fill=text_color, font=font)
    return img

# Step 5: Overlay Speaker Labels

def _clock_to_seconds(value):
    """Convert a value parseable by ``pd.to_datetime`` to whole seconds since midnight."""
    clock = pd.to_datetime(value).time()
    return clock.hour * 3600 + clock.minute * 60 + clock.second


# The base video comes first; one label clip per result row is layered on top.
clips = [video]

for _, row in df_results.iterrows():
    start_seconds = _clock_to_seconds(row["Start"])
    end_seconds = _clock_to_seconds(row["End"])

    # A clip needs a positive duration; skip rows that round to zero length.
    if end_seconds <= start_seconds:
        continue

    text_img = create_text_image(row["Speaker"])

    # ImageClip accepts an array directly, so no temporary PNG is needed.
    # set_position takes ONE position argument; a second positional argument
    # would be read as `relative`, so the pair is passed as a tuple.
    txt_clip = (ImageClip(np.array(text_img))
                .set_position(("center", "bottom"))
                .set_start(start_seconds)
                .set_duration(end_seconds - start_seconds))

    clips.append(txt_clip)

# Combine All Clips

final_video = CompositeVideoClip(clips)

final_video_path = "/content/videoplayback_label.mp4"
final_video.write_videofile(final_video_path, codec="libx264")

In [None]:
from IPython.display import HTML
from base64 import b64encode

def show_video(final_video_path, video_width=1000):
    """Return an ``IPython.display.HTML`` video player for an MP4 file.

    The file's bytes are base64-encoded into a ``data:`` URL, so the whole
    video is embedded in the notebook output.

    Args:
        final_video_path: path of the MP4 file to embed.
        video_width: value of the ``width`` attribute on the ``<video>`` tag.
    """
    # Open read-only ("rb", not "r+b") so read-only files work, and use a
    # context manager so the handle is closed after reading.
    with open(final_video_path, "rb") as video_handle:
        video_file = video_handle.read()

    video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"

    return HTML(f"""<video width={video_width} controls><source src="{video_url}"></video>""")

# Display the labelled video inline as this cell's output
show_video(final_video_path)