In [1]:
!pip install torch \
    numpy \
    soundfile \
    git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ur7nnijn
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-ur7nnijn
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25ldone
[?25h  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=c9e41d2f0c10dc750977f6564cf1bb11aee891b10528f1fe4d06b831ed01c792
  Stored in directory: /tmp/pip-ephem-wheel-cache-xak2rzqu/wheels/8b/6c/d0/622666868c179f156cf595c8b6f06f88bc5d80c4b31dccaa03
Successfully built openai-whisper
Installing collected pa

In [9]:
import os
import json

import whisper
import torch
import soundfile as sf
import numpy as np

# Input file (stereo WAV)
audio_path = 'dialogue_stereo.wav'

In [11]:
# Stereo split: speaker A is assumed to be on the left channel (0) and
# speaker B on the right channel (1).
# always_2d=True makes sf.read return a (samples, channels) array even for a
# mono file, so the channel count can be validated explicitly instead of
# failing with an opaque IndexError on the second channel.
audio, sr = sf.read(audio_path, always_2d=True)
if audio.shape[1] < 2:
    raise ValueError(
        f"{audio_path!r} has {audio.shape[1]} channel(s); a stereo file is required"
    )
channel_A = audio[:, 0]
channel_B = audio[:, 1]

local_rank = 0
# NOTE(review): torch is already imported at this point, so this setting may
# not influence torch's own CPU thread pool (torch.set_num_threads would);
# it is kept for libraries that read it later -- confirm intent.
os.environ["OMP_NUM_THREADS"] = "2"

# CUDA_VISIBLE_DEVICES is intentionally not set here: it is only honoured
# before CUDA is initialised, and masking to a single GPU would renumber that
# GPU to index 0, which conflicts with addressing it as cuda:{local_rank}.
if torch.cuda.is_available():
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
else:
    # Fall back to CPU so the notebook still runs on machines without a GPU.
    device = torch.device("cpu")

# Load the Whisper model ('small' or 'medium' recommended); the language is
# selected later, when transcribe is called.
model = whisper.load_model('small', device=device)

# Sample rate Whisper assumes when it is handed a raw array instead of a path.
_WHISPER_SAMPLE_RATE = 16000


def _resample_to_rate(samples, sr, target_sr=_WHISPER_SAMPLE_RATE):
    """Band-limited (FFT-based) resampling of a 1-D signal from sr to target_sr."""
    samples = np.asarray(samples)
    if sr <= 0:
        raise ValueError(f"sample rate must be positive, got {sr}")
    n_in = samples.shape[0]
    if sr == target_sr or n_in == 0:
        return samples
    n_out = int(round(n_in * target_sr / sr))
    if n_out == 0:
        return samples[:0]
    # Resize the one-sided spectrum: truncating drops everything above the new
    # Nyquist frequency (anti-aliasing), zero-padding handles upsampling.
    # NOTE: this transforms the whole signal at once, so memory use grows
    # with the recording length.
    spectrum = np.fft.rfft(samples)
    resized = np.zeros(n_out // 2 + 1, dtype=spectrum.dtype)
    n_keep = min(resized.shape[0], spectrum.shape[0])
    resized[:n_keep] = spectrum[:n_keep]
    # irfft normalises by the output length; rescale to keep the amplitude.
    return np.fft.irfft(resized, n=n_out) * (n_out / n_in)


def transcribe_channel(channel, sr, speaker_label):
    """Transcribe one mono channel and return its words with timestamps.

    Args:
        channel: 1-D array of audio samples for a single speaker.
        sr: Sample rate of ``channel`` in Hz. The audio is resampled to
            16 kHz when it differs, because Whisper treats array input as
            16 kHz audio; without this, text and timestamps come out wrong.
        speaker_label: Value stored under the ``'speaker'`` key of every entry.

    Returns:
        A list of dicts with keys ``'speaker'``, ``'word'``, ``'start'`` and
        ``'end'``, in the order the model returned them.

    Raises:
        ValueError: If ``sr`` is not positive.
    """
    mono_16k = _resample_to_rate(channel, sr).astype(np.float32)
    result = model.transcribe(
        mono_16k,
        language='ja',
        word_timestamps=True,  # needs a Whisper release that supports this option
    )
    # Flatten the per-segment word lists into one list of labelled words.
    # Segments without a 'words' entry are skipped rather than raising.
    return [
        {
            'speaker': speaker_label,
            'word': word['word'],
            'start': word['start'],
            'end': word['end'],
        }
        for segment in result['segments']
        for word in segment.get('words', ())
    ]

# Transcribe both channels, tagging every word with its speaker label.
json_A, json_B = (
    transcribe_channel(samples, sr, label)
    for samples, label in ((channel_A, "A"), (channel_B, "B"))
)

# Merge both speakers and order the words by start time so the result reads
# as one chronological conversation.
full_json = [*json_A, *json_B]
full_json_sorted = list(full_json)
full_json_sorted.sort(key=lambda entry: entry['start'])

# Write the merged transcript as JSON (non-ASCII characters kept as-is).
with open('dialogue_transcript.json', mode='w', encoding='utf-8') as f:
    json.dump(full_json_sorted, f, indent=2, ensure_ascii=False)

print('書き出し完了： dialogue_transcript.json')

100%|████████████████████████████████████████| 461M/461M [00:04<00:00, 109MiB/s]


書き出し完了： dialogue_transcript.json
