In [None]:
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
import torch
import sys

## CTC with Wav2Vec2-BERT

In [None]:
# Use the first CUDA GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech-recognition pipeline backed by the Wav2Vec2-BERT checkpoint.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="hf-audio/wav2vec2-bert-CV16-en",
    device=device,
)

Define a function that records microphone input in windows of `chunk_length_s` seconds, streams the audio to the
model in smaller chunks of `stream_chunk_s` seconds, and prints the transcription as it is updated:

In [36]:
def transcribe(chunk_length_s=10.0, stream_chunk_s=1.0):
    """Transcribe live microphone audio with the notebook-level ``transcriber``.

    The microphone is opened through ``ffmpeg_microphone_live`` at the sampling
    rate of the current ``transcriber``'s feature extractor. Every result
    yielded by the pipeline is printed on a single, continuously overwritten
    line, and the loop stops at the first result whose ``item["partial"][0]``
    is falsy.

    Args:
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live`` as the length
            of each recording window, in seconds.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live`` as the size of
            the streamed sub-chunks, in seconds.

    Returns:
        The ``"text"`` of the last result received, or ``""`` when the stream
        ended without yielding any result.
    """
    sampling_rate = transcriber.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    # Track the latest transcription outside the loop so that an empty stream
    # returns "" instead of failing on a loop variable that was never bound.
    text = ""

    print("Start speaking...")
    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
        # ANSI "erase to end of line": wipes the previous, possibly longer, partial.
        sys.stdout.write("\033[K")
        text = item["text"]
        # "\r" returns the cursor to column 0 so the next partial overwrites this
        # one; flush so the update is visible even when stdout is buffered.
        print(text, end="\r", flush=True)
        # The first element of item["partial"] being falsy marks the last result.
        if not item["partial"][0]:
            break

    return text

In [None]:
# Live transcription with the default arguments (chunk_length_s=10.0, stream_chunk_s=1.0),
# using whichever `transcriber` pipeline is currently defined.
transcribe()

## Encoder-Decoder with Distil-Whisper

Distil-Whisper small (ideal for CPU):

In [None]:
# Choose the compute device in priority order: first CUDA GPU, then the MPS backend, then CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Replace the previous pipeline with the Distil-Whisper small (English) checkpoint.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
    device=device,
)

# Clear the task/language entries of the generation config.
# NOTE(review): presumably needed because this is an English-only checkpoint — confirm.
transcriber.model.generation_config.task = None
transcriber.model.generation_config.language = None

In [None]:
# Live transcription with the default arguments; `transcribe` reads the global
# `transcriber`, so this uses the pipeline assigned most recently.
transcribe()

Distil-Whisper medium (fastest on GPU):

In [37]:
# Replace the pipeline with the Distil-Whisper medium (English) checkpoint,
# reusing the `device` chosen in an earlier cell.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-medium.en",
    device=device,
)

# Clear the task/language entries of the generation config.
# NOTE(review): presumably needed because this is an English-only checkpoint — confirm.
transcriber.model.generation_config.task = None
transcriber.model.generation_config.language = None

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
# Live transcription with the default arguments; the returned text is shown as
# the cell's output because the call is the last expression in the cell.
transcribe()

Start speaking...
[K Hey, I'm running the Distill Whisper model in real time using the Transformers library with a streaming input and a chunk length of one second.

" Hey, I'm running the Distill Whisper model in real time using the Transformers library with a streaming input and a chunk length of one second."