<a href="https://colab.research.google.com/github/sode-k/sode-k/blob/main/Rec2txt_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install resemblyzer

Collecting resemblyzer
  Downloading Resemblyzer-0.1.4-py3-none-any.whl.metadata (5.8 kB)
Collecting webrtcvad>=2.0.10 (from resemblyzer)
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting typing (from resemblyzer)
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Resemblyzer-0.1.4-py3-none-any.whl (15.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: webrtcvad, typing
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp311-cp311-linu

In [None]:
import torch
from transformers import pipeline
import librosa
import soundfile as sf
from resemblyzer import VoiceEncoder, preprocess_wav
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
import numpy as np
import os
from google.colab import drive, files

# Mount Google Drive so files under /content/drive are readable from this runtime.
drive.mount('/content/drive')

# Hugging Face model id handed to the speech-recognition pipeline.
model_name = "kotoba-tech/kotoba-whisper-v2.2"
# Use the GPU when one is available to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    # The pipeline is given a device index here: 0 for the first GPU, -1 for CPU.
    transcribe = pipeline("automatic-speech-recognition", model=model_name, device=0 if device=="cuda" else -1)
    print("Model loaded successfully.")
except Exception as e:
    print("Error loading model:", e)
    # Re-raise so the cell stops here with a traceback. In a notebook kernel
    # `exit()` requests a kernel shutdown instead of halting at this line, so
    # later statements could still run with `transcribe` undefined.
    raise

# Path of the recording to transcribe, located on the mounted Google Drive.
audio_file = "/content/drive/MyDrive/奨学金問題/2025年1月23日・増田先生.m4a"

if not os.path.exists(audio_file):
    print(f"Error: File '{audio_file}' does not exist.")
    # Raise rather than call `exit()`: in a notebook kernel `exit()` requests a
    # kernel shutdown instead of halting here, so the load below would still
    # be attempted on a path that is known to be missing.
    raise FileNotFoundError(audio_file)

try:
    # Decode the recording at a 16 kHz sample rate; `y` holds the samples and
    # `sr` the rate, and both are reused by every later stage.
    y, sr = librosa.load(audio_file, sr=16000)
    # Keep a WAV copy of the decoded audio in the working directory.
    sf.write("processed_audio.wav", y, sr)
    print(f"Audio file '{audio_file}' loaded and converted to WAV format.")
except Exception as e:
    print("Error processing audio file:", e)
    # Stop the cell with the original traceback instead of tearing down the kernel.
    raise

try:
    # Resemblyzer speaker encoder, placed on the same device chosen for the
    # transcription model; used later to embed each audio segment.
    encoder = VoiceEncoder(device=device)
    print("Voice encoder initialized.")
except Exception as e:
    print("Error initializing voice encoder:", e)
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown instead of halting at this line, which would let later
    # statements run with `encoder` undefined.
    raise

try:
    # Split the recording on silence. Each entry of `segments` is a
    # (start, end) pair of sample indices into `y`; top_db=20 is the loudness
    # threshold (in dB below the reference) under which audio counts as silence.
    segments = librosa.effects.split(y, top_db=20)
    num_segments = len(segments)
    print(f"Audio split into {num_segments} segments.")
except Exception as e:
    print("Error splitting audio:", e)
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown instead of halting at this line, which would let later
    # statements run with `segments` undefined.
    raise

# Compute one speaker embedding per segment. `embeddings_list[i]` always
# corresponds to `segments[i]`, because the clustering labels are matched back
# to segments by position later on.
embeddings_list = []
failed_indices = []  # positions whose embedding could not be computed
for index, (start, end) in enumerate(tqdm(segments, desc="Extracting embeddings")):
    segment_y = y[start:end]
    try:
        # Pass the samples directly instead of writing a temporary WAV file
        # for every segment; `source_sr` tells Resemblyzer the input rate.
        wav_segment = preprocess_wav(segment_y, source_sr=sr)
        embeddings_list.append(encoder.embed_utterance(wav_segment))
    except Exception as e:
        print("Error embedding segment:", e)
        # Reserve the slot; it is filled with a zero vector below once the
        # embedding shape is known from a segment that did succeed.
        embeddings_list.append(None)
        failed_indices.append(index)

if failed_indices:
    # Take shape and dtype from a real embedding rather than from an encoder
    # attribute, so the placeholder always matches what the encoder produced.
    reference = next((emb for emb in embeddings_list if emb is not None), None)
    if reference is None:
        raise RuntimeError("No segment could be embedded; cannot cluster speakers.")
    for index in failed_indices:
        embeddings_list[index] = np.zeros_like(reference)

try:
    # Group segment embeddings into speakers. With n_clusters=None the number
    # of clusters is not fixed in advance; merging is cut off by
    # distance_threshold instead.
    clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.0)
    # cluster_labels[i] is the cluster id assigned to embeddings_list[i].
    cluster_labels = clustering.fit_predict(embeddings_list)
    print("Clustering completed.")
except Exception as e:
    print("Error during clustering:", e)
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown instead of halting at this line, which would let later
    # statements run with `cluster_labels` undefined.
    raise

try:
    # NOTE: despite its name, `df` is a plain list of dicts (not a DataFrame),
    # one record per segment, in the same order as `segments`.
    df = []
    for i, segment in enumerate(segments):
        # Convert the sample indices of the segment boundaries to seconds.
        start_time = segment[0] / sr
        end_time = segment[1] / sr
        # 'label' is the cluster id of this segment, used as the speaker number.
        df.append({'start': start_time, 'end': end_time, 'label': cluster_labels[i]})
    print("Segments labeled successfully.")
except Exception as e:
    print("Error labeling segments:", e)
    # Re-raise so the cell stops here. In a notebook kernel `exit()` requests a
    # kernel shutdown instead of halting at this line, which would let later
    # statements run on an incomplete `df`.
    raise

# Transcribe every labeled segment and collect "Speaker N: text" lines in
# chronological order. Segments that fail or produce no text are skipped.
results = []
for index, item in enumerate(tqdm(df, desc="Transcribing segments")):
    # round() instead of bare int(): the times were derived as sample / sr, and
    # the float round trip can land just below the original integer index,
    # which truncation would turn into an off-by-one sample.
    start_sample = int(round(item['start'] * sr))
    end_sample = int(round(item['end'] * sr))
    segment_y = y[start_sample:end_sample]
    try:
        # Hand the samples to the pipeline in memory (with their sample rate)
        # rather than through a temporary WAV file per segment. A fresh dict
        # is built for each call.
        transcription = transcribe({"raw": segment_y, "sampling_rate": sr})
        # The result is handled both as a single dict and as a list of dicts.
        if isinstance(transcription, dict):
            text = transcription.get('text', '').strip()
        elif isinstance(transcription, list) and len(transcription) > 0:
            text = transcription[0].get('text', '').strip()
        else:
            text = ''
        speaker = f"Speaker {item['label']}"
        if text:
            results.append(f"{speaker}: {text}")
    except Exception as e:
        # Identify the segment by its position and time range; the cluster
        # label alone is shared by many segments and does not locate the failure.
        print(
            f"Error transcribing segment {index} "
            f"({item['start']:.2f}s-{item['end']:.2f}s, speaker {item['label']}): {e}"
        )

# Persist the collected transcript lines and offer the file as a browser
# download; when nothing was transcribed, only report that fact.
if not results:
    print("No transcription results to save.")
else:
    # One "Speaker N: text" entry per line.
    chat_transcript = "\n".join(results)
    try:
        with open("chat_transcript.txt", mode="w", encoding="utf-8") as transcript_file:
            transcript_file.write(chat_transcript)
        print("Transcript saved to 'chat_transcript.txt'.")
        # Trigger the Colab download of the file that was just written.
        files.download("chat_transcript.txt")
    except Exception as save_error:
        # Best effort: a failed write or download is reported, not raised.
        print("Error saving transcript:", save_error)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully.


  y, sr = librosa.load(audio_file, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio file '/content/drive/MyDrive/奨学金問題/2025年1月23日・増田先生.m4a' loaded and converted to WAV format.
Loaded the voice encoder model on cuda in 0.11 seconds.
Voice encoder initialized.


  checkpoint = torch.load(weights_fpath, map_location="cpu")


Audio split into 2793 segments.


Extracting embeddings: 100%|██████████| 2793/2793 [00:42<00:00, 66.05it/s]


Clustering completed.
Segments labeled successfully.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Transcribing segments: 100%|██████████| 2793/2793 [20:12<00:00,  2.30it/s]

Transcript saved to 'chat_transcript.txt'.





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>