<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/meeting/meeting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install (or upgrade) yt-dlp, the command-line downloader invoked in the next cell.
!python3 -m pip install -U yt-dlp

In [None]:
!yt-dlp -x --audio-format wav "https://youtu.be/2MQHBPApza8"

In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install pyannote.audio

In [None]:
import whisper
# Load the "large" Whisper checkpoint (the recorded output shows a 2.87G download).
# NOTE(review): the name `model` is rebound to a pyannote embedding model in a
# later cell, so the transcription cell must run before that rebinding.
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.87G/2.87G [00:52<00:00, 58.6MiB/s]


In [None]:
# Hugging Face access token.
# NOTE(review): TOKEN is not referenced by any other cell shown here; the
# pyannote loaders pass use_auth_token=True instead. Never commit a real token.
TOKEN=""

In [None]:
# Interactive Hugging Face login; presumably stores the credential that the
# use_auth_token=True loaders rely on — confirm.
!huggingface-cli login

In [None]:
from pyannote.audio import Audio
from pyannote.audio import Pipeline

# Load the pretrained speaker-diarization pipeline from the Hugging Face hub.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token=True)

# Diarize the whole recording once; the result is iterated segment by segment below.
audio_file = "test.wav"
diarization = pipeline(audio_file)

# Audio helper used to cut each diarized segment out of the file.
audio = Audio(sample_rate=16000, mono=True)

# speaker label -> list of {"start", "end", "text"} entries, in diarization order.
speaker_text_dict = {}

for segment, _, speaker in diarization.itertracks(yield_label=True):
    waveform, sample_rate = audio.crop(audio_file, segment)
    # `model` must still be the Whisper model here (a later cell rebinds the name).
    text = model.transcribe(waveform.squeeze().numpy())["text"]
    # setdefault creates the speaker's list on first sight, then appends to it.
    speaker_text_dict.setdefault(speaker, []).append(
        {"start": segment.start, "end": segment.end, "text": text}
    )
    print(f"[{segment.start:03.1f}s - {segment.end:03.1f}s] {speaker}: {text}")

In [None]:
import json

# Persist the per-speaker transcript as pretty-printed JSON; ensure_ascii=False
# writes non-ASCII text as-is instead of \u escapes.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used for this file.
with open("output.json", "w") as out_file:
  out_file.write(json.dumps(speaker_text_dict, indent=2, ensure_ascii=False))

In [None]:
# Reload the transcript from output.json and show the entries stored under "SPEAKER_00".
with open("output.json", "r") as f:
  dic = json.load(f)
print(dic["SPEAKER_00"])

In [None]:
from pyannote.core import Segment
import scipy
from scipy.io.wavfile import write

# Number of distinct speaker labels found in the reloaded transcript.
print(len(dic.keys()))

# Pick one diarized segment of the chosen speaker to export as a reference clip.
# NOTE(review): index 10 assumes this speaker has at least 11 segments.
speaker_id = "SPEAKER_02"
spec_sp00_dic = dic[speaker_id][10]
segment = Segment(spec_sp00_dic["start"],  spec_sp00_dic["end"])
waveform, sample_rate = audio.crop(audio_file, segment)
wave = waveform.squeeze().numpy()

# Write the clip with the rate audio.crop reported rather than a separately
# hard-coded constant, so the WAV header always matches the samples.
sf = sample_rate

scipy.io.wavfile.write(filename='naretor.wav', rate=sf, data=wave)


In [None]:
# @title Refinement

In [None]:
from pyannote.audio import Model
# Load the pretrained speaker-embedding model.
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the Whisper model.
model = Model.from_pretrained("pyannote/embedding", 
                              use_auth_token=True)  # If this fails, run the Hugging Face login first

Downloading:   0%|          | 0.00/96.4M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

In [None]:
from pyannote.core import Segment
from pyannote.audio import Inference


# Wrap the embedding model for inference; window="whole" presumably yields a
# single embedding per file or excerpt rather than sliding windows — confirm.
inference = Inference(model, window="whole")

In [None]:
# @title change meeting parameters
# Reference recording file name -> display name of the person speaking in it.
speaker_dict = {"kawasaki.wav": "川崎宗則", "darubish.wav": "ダルビッシュ有", "naretor.wav": "ナレーター"}

# One embedding per reference recording, computed in the same order as before.
kawasaki_embed, darubish_embed, naretor_embed = (
    inference(wav_name)
    for wav_name in ("kawasaki.wav", "darubish.wav", "naretor.wav")
)

In [None]:
import numpy as np
from numpy.linalg import norm

def cosine_simularity(A, B):
  """Return the cosine similarity between two embedding vectors.

  Inputs are flattened first, so a flat (D,) vector and a (1, D) row are
  treated the same way; this makes the function usable regardless of which
  of those shapes the embedding extractor returns.

  Args:
    A: Array-like vector of D numbers (any shape holding D elements).
    B: Array-like vector with the same number of elements as ``A``.

  Returns:
    dot(A, B) / (|A| * |B|) as a scalar, or 0.0 when either vector has zero
    norm (the cosine is undefined there; returning 0.0 avoids a NaN that
    would otherwise win an ``argmax`` over similarities).

  Raises:
    ValueError: If the flattened inputs do not have the same length.
  """
  vec_a = np.asarray(A).ravel()
  vec_b = np.asarray(B).ravel()
  if vec_a.shape != vec_b.shape:
    raise ValueError(
        f"embeddings must have the same number of elements, got {vec_a.size} and {vec_b.size}"
    )

  denominator = norm(vec_a) * norm(vec_b)
  if denominator == 0:
    return 0.0

  return np.dot(vec_a, vec_b) / denominator

In [None]:
# Load the saved transcript (speaker label -> list of segment entries) for the refinement step.
with open("output.json", "r") as f:
  output_dic = json.load(f)

In [None]:
def get_speaker_embed(speaker_id, inference, audio_file, output_dic, idx=10):
  """Embed one diarized segment of a speaker.

  Args:
    speaker_id: Key into ``output_dic`` (e.g. "SPEAKER_00").
    inference: Object exposing ``crop(audio_file, segment)``; it is called
      with the selected excerpt and its result is returned unchanged.
    audio_file: Audio file passed through to ``inference.crop``.
    output_dic: Mapping of speaker id to a list of entries that each carry
      "start" and "end" values.
    idx: Position of the entry to use within the speaker's list. Defaults to
      10, the value that was previously hard-coded.

  Returns:
    Whatever ``inference.crop`` returns for the selected excerpt.

  Raises:
    KeyError: If ``speaker_id`` is not in ``output_dic``.
    IndexError: If the speaker has no entry at position ``idx``.
  """
  segments = output_dic[speaker_id]
  try:
    entry = segments[idx]
  except IndexError:
    # Same exception type as before, but with enough context to act on.
    raise IndexError(
        f"speaker {speaker_id!r} has {len(segments)} segment(s); index {idx} is out of range"
    ) from None

  excerpt_speaker = Segment(entry["start"], entry["end"])
  return inference.crop(audio_file, excerpt_speaker)

In [None]:
def calculate_similarity(embed_list, ref_embed):
  """Return the position in ``embed_list`` of the embedding closest to ``ref_embed``.

  Each candidate is scored against ``ref_embed`` with ``cosine_simularity``;
  the index of the highest score is returned (as produced by ``np.argmax``).
  """
  scores = [cosine_simularity(candidate, ref_embed) for candidate in embed_list]
  return np.argmax(scores)

In [None]:
# One embedding per diarized speaker label, taken from that speaker's selected segment.
embedding_speaker00, embedding_speaker01, embedding_speaker02 = (
    get_speaker_embed(label, inference, audio_file, output_dic)
    for label in ("SPEAKER_00", "SPEAKER_01", "SPEAKER_02")
)

In [None]:
import warnings

# Reference key -> display name of that person.
speaker_dict = {"kawasaki": "川崎宗則", "darubish": "ダルビッシュ有", "naretor": "ナレーター"}
# Position in this list is the diarized speaker index (0 -> SPEAKER_00, ...).
embed_list = [embedding_speaker00, embedding_speaker01, embedding_speaker02]

# For each reference voice, find the diarized speaker whose embedding is closest.
darubish_id = calculate_similarity(embed_list, darubish_embed)
kawasaki_id = calculate_similarity(embed_list, kawasaki_embed)
naretor_id = calculate_similarity(embed_list, naretor_embed)

related_dict = {"kawasaki": kawasaki_id, "darubish": darubish_id, "naretor": naretor_id}

# Invert to diarized index -> display name. Keys are converted to plain int so
# the printed dict is readable; int keys hash and compare equal to the numpy
# integers they came from.
ans_speaker_dict = {}
for key, value in related_dict.items():
  matched_index = int(value)
  if matched_index in ans_speaker_dict:
    # Two references picked the same diarized speaker: the later one still
    # wins (as before), but the collision is no longer silent.
    warnings.warn(
        f"diarized speaker {matched_index} matched both "
        f"{ans_speaker_dict[matched_index]!r} and {speaker_dict[key]!r}; keeping the latter"
    )
  ans_speaker_dict[matched_index] = speaker_dict[key]

print(ans_speaker_dict)

darubish: 1  kawasaki:  0  naretor:  2


In [None]:
# @title Voice similarity search
from pyannote.audio import Model
# Load the pretrained speaker-embedding model (this binds the global name `model`).
model = Model.from_pretrained("pyannote/embedding", 
                              use_auth_token=True)  # If this fails, run the Hugging Face login first

In [None]:
from pyannote.core import Segment
from pyannote.audio import Inference


# Wrap the embedding model for inference; window="whole" presumably yields a
# single embedding per file or excerpt rather than sliding windows — confirm.
inference = Inference(model, window="whole")

In [None]:
# Base speaker excerpt (speaker 2 this time); the other excerpts are compared against it.
base_speaker_audio = Segment(287.1, 298)
embedding_base_speaker = inference.crop(audio_file, base_speaker_audio)

In [None]:
# Each candidate speaker gets a hand-picked [start, end] excerpt, which is
# turned into a Segment and embedded from the same audio file.

# speaker 00
speaker00_segment = [310.9, 316.9]
excerpt_speaker00 = Segment(*speaker00_segment)
embedding_speaker00 = inference.crop(audio_file, excerpt_speaker00)

# speaker 01
speaker01_segment = [336.8, 347.2]
excerpt_speaker01 = Segment(*speaker01_segment)
embedding_speaker01 = inference.crop(audio_file, excerpt_speaker01)

# speaker 02
speaker02_segment = [17.2, 20.3]
excerpt_speaker02 = Segment(*speaker02_segment)
embedding_speaker02 = inference.crop(audio_file, excerpt_speaker02)


In [None]:
import numpy as np
from numpy.linalg import norm

def cosine_simularity(A, B):
  """Return the cosine similarity between two embedding vectors.

  Inputs are flattened first, so a flat (D,) vector and a (1, D) row are
  treated the same way; this makes the function usable regardless of which
  of those shapes the embedding extractor returns.

  Args:
    A: Array-like vector of D numbers (any shape holding D elements).
    B: Array-like vector with the same number of elements as ``A``.

  Returns:
    dot(A, B) / (|A| * |B|) as a scalar, or 0.0 when either vector has zero
    norm (the cosine is undefined there; returning 0.0 avoids a NaN that
    would otherwise win an ``argmax`` over similarities).

  Raises:
    ValueError: If the flattened inputs do not have the same length.
  """
  vec_a = np.asarray(A).ravel()
  vec_b = np.asarray(B).ravel()
  if vec_a.shape != vec_b.shape:
    raise ValueError(
        f"embeddings must have the same number of elements, got {vec_a.size} and {vec_b.size}"
    )

  denominator = norm(vec_a) * norm(vec_b)
  if denominator == 0:
    return 0.0

  return np.dot(vec_a, vec_b) / denominator

# Candidate index -> display name of the speaker.
speaker_dict = {0: "川崎宗則", 1: "ダルビッシュ有", 2: "ナレーター"}

# Compute the similarity of every candidate excerpt to the base speaker.
cosine_sim_list = [
    cosine_simularity(candidate, embedding_base_speaker)
    for candidate in (embedding_speaker00, embedding_speaker01, embedding_speaker02)
]
cosine00, cosine01, cosine02 = cosine_sim_list
# print("Cosine Similarity:", cosine00, cosine01, cosine02)

# The best-scoring candidate index selects the name that is reported.
best_index = np.argmax(cosine_sim_list)
speaker = speaker_dict[best_index]
print("speaker: ", speaker)

speaker:  ナレーター


In [None]:
# Example for similarity search
import numpy as np
from pyannote.audio import Inference
inference = Inference(model, window="whole")

embedding1 = inference("speaker1.wav")
embedding2 = inference("speaker2.wav")
# The upstream example describes `embeddingX` as a (1 x D) numpy array extracted
# from the file as a whole.
# NOTE(review): some pyannote.audio releases appear to return a flat (D,)
# vector here instead — confirm for the installed version.

from scipy.spatial.distance import cdist
# cdist only accepts 2-D inputs; atleast_2d leaves a (1, D) array untouched and
# promotes a (D,) vector to (1, D), so either shape works.
distance = cdist(np.atleast_2d(embedding1), np.atleast_2d(embedding2), metric="cosine")[0,0]