# 📹 ipywebrtc 麦克风录音示例

本示例演示如何在 JupyterLab/Notebook 中用 `ipywebrtc` 调用浏览器麦克风录音，并把录音数据保存为 wav 文件。

In [2]:
# 安装依赖（只需运行一次）
!pip install ipywidgets ipywebrtc

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting ipywebrtc
  Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl.metadata (825 bytes)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)
Installing collected packages: ipywebrtc, widgetsnbextension, jupyterlab_widgets, ipywidgets
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [ipyw

In [3]:
# 启用通用 widgets 支持
# 安装并启用 ipywebrtc 扩展

# widgets 支持
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

# # webrtc 支持
# !jupyter labextension install jupyter-webrtc

# # 安装完后 rebuild
# !jupyter lab build


In [4]:
# 引入所需组件
from ipywidgets import Button, VBox
from ipywebrtc import AudioRecorder, CameraStream
from IPython.display import display

# 创建并展示录音控件
# 只启用音频，无视频
stream = CameraStream(constraints={"audio": True, "video": False})
recorder = AudioRecorder(stream=stream)

# 点击按钮开始/停止录音
btn = Button(description="开始/停止 录音")

def toggle_record(b):
    if recorder.recording:
        recorder.stop()
        print("✅ 录音完成，数据长度：", len(recorder.audio), "bytes")
    else:
        recorder.record()
        print("🎙️ 录音中…")

btn.on_click(toggle_record)

display(VBox([stream, recorder, btn]))

VBox(children=(CameraStream(constraints={'audio': True, 'video': False}), AudioRecorder(audio=Audio(value=b'',…

In [3]:
# Cell 3：停止后保存为文件
# 点击按钮停止后，再运行此 cell
# Cell 3：停止录音后，保存成 wav 文件
# 停止录音后（recorder.recording == False），直接调用：
recorder.save('browser_mic.wav')
print("✅ 已保存录音到 browser_mic.wav")


✅ 已保存录音到 browser_mic.wav


In [4]:
# Cell 4（可选）：播放回放
from IPython.display import Audio
Audio('browser_mic.wav')

In [25]:
# Cell 4：用 faster-whisper 做 transcription
from pprint import pprint
from faster_whisper import WhisperModel
import time

# 1) 加载模型（根据需求选大小：tiny/base/small/medium/large）
# 2) device="mps" 利用 Apple Silicon GPU，否则 "cpu"
# 3) compute_type="float16" 或 "int8" 降低显存占用
# model = WhisperModel("small", device="mps", compute_type="float16")
# model = WhisperModel("small", device="cpu", compute_type="int8")
model = WhisperModel(
    "small",
    device="cpu",
    compute_type="int8",
    cpu_threads=8      # 调整到你机器的核心数
)
# 读取刚才录好的文件，执行转写

start_time = time.time()

segments, info = model.transcribe(
    "browser_mic.wav",
    beam_size=5,         # 束搜索大小，3-10 之间
    language="en",       # 指定语言,避免检测
    word_timestamps=False
)
end_time = time.time()
python_inference = end_time - start_time


# 打印信息
print(f"⏱️ 音频时长: {info.duration:.2f}s")
print("🖥️ Python 计时推理耗时: {:.2f}s".format(python_inference))

pprint(info, indent=2, width=60, sort_dicts=True)

# 输出每一段
for seg in segments:
    print(f"[{seg.start:.2f} → {seg.end:.2f}] {seg.text}")

⏱️ 音频时长: 4.17s
🖥️ Python 计时推理耗时: 0.04s
TranscriptionInfo(language='en',
                  language_probability=1,
                  duration=4.165,
                  duration_after_vad=4.165,
                  all_language_probs=None,
                  transcription_options=TranscriptionOptions(beam_size=5,
                                                             best_of=5,
                                                             patience=1,
                                                             length_penalty=1,
                                                             repetition_penalty=1,
                                                             no_repeat_ngram_size=0,
                                                             log_prob_threshold=-1.0,
                                                             no_speech_threshold=0.6,
                                                             compression_ratio_threshold=2.4,
                                 

In [28]:
# Cell 5：打印 info 字段，并手动测量整体耗时
import time
from pprint import pprint

# 1️⃣ 把 info 转成字典，看看到底有哪些字段
try:
    info_dict = info._asdict()     # 如果是 namedtuple
except AttributeError:
    info_dict = vars(info)         # 否则用 __dict__

print("ℹ️ TranscriptionInfo 包含这些字段：")
pprint(list(info_dict.keys()), width=60)

# 2️⃣ 手动再跑一次转写计时
start = time.perf_counter()
segments, info2 = model.transcribe(
    "browser_mic.wav",
    beam_size=5,
    language="en",
    word_timestamps=False
)
end = time.perf_counter()
total_time = end - start

# 3️⃣ 再次把 info2 转成 dict
try:
    info2_dict = info2._asdict()
except AttributeError:
    info2_dict = vars(info2)

# 4️⃣ 把 info2 和 Python 层面时间合并，漂亮打印
combined = {
    "audio_duration_s": info2_dict.get("duration"),
    # 如果模型内部提供了 decode_duration 或 total_duration 就拿出来
    "decode_duration_s": info2_dict.get("decode_duration"),
    "model_total_duration_s": info2_dict.get("total_duration"),
    # 手动测得的，从 call 到返回
    "python_wall_time_s": total_time
}

print("\n⏱️ 时间统计：")
pprint(combined, indent=2, width=60, sort_dicts=True)
print(type(segments))

for seg in segments:
    print(f"[{seg.start:.2f} → {seg.end:.2f}] {seg.text}")

ℹ️ TranscriptionInfo 包含这些字段：
['language',
 'language_probability',
 'duration',
 'duration_after_vad',
 'all_language_probs',
 'transcription_options',
 'vad_options']

⏱️ 时间统计：
{ 'audio_duration_s': 4.165,
  'decode_duration_s': None,
  'model_total_duration_s': None,
  'python_wall_time_s': 0.07561479200376198}
<generator object WhisperModel.generate_segments at 0x7fb5b1f55c80>
[0.00 → 2.00]  What's this about?


In [29]:

# Cell 4：完整消费 generator 并计时
import time
from pprint import pprint
from faster_whisper import WhisperModel

model = WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=8)

# 开始「墙钟」计时
start = time.perf_counter()

# 调用 transcribe 得到 segments generator 和 info
segments_gen, info = model.transcribe(
    "browser_mic.wav",
    beam_size=5,
    language="en",
    word_timestamps=False
)

# 强制消费 generator，把所有 segment 都拉出来
segments = []
for seg in segments_gen:
    segments.append(seg)

# 结束计时
end = time.perf_counter()
wall_time = end - start

# 打印统计
print(f"🎧 输入音频时长: {info.duration:.2f}s")
print(f"🖥️ 全流程墙钟耗时: {wall_time:.2f}s")
# 如果 info 里有 decode_duration 或 total_duration，也一并打印
info_dict = info._asdict() if hasattr(info, "_asdict") else vars(info)
if "decode_duration" in info_dict:
    print(f"🔍 模型内部 decode 耗时: {info_dict['decode_duration']:.2f}s")
if "total_duration" in info_dict:
    print(f"⏱️ 模型内部总推理耗时: {info_dict['total_duration']:.2f}s")
print()

# 输出每一段
for seg in segments:
    print(f"[{seg.start:.2f}s → {seg.end:.2f}s] {seg.text}")

🎧 输入音频时长: 4.17s
🖥️ 全流程墙钟耗时: 3.28s

[0.00s → 2.00s]  What's this about?
