In [12]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf

In [None]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h") #Audio â†’ numbers â†’ shape the model expects
wav2vec = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") #The model itself
wav2vec.eval() #Set to eval mode (we are not training)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [None]:
waveform, sr = sf.read("sample2.flac", dtype="float32") # waveform: np.ndarray, sr: int, each number in waveform signifies amplitude at that time

# Convert stereo â†’ mono
if waveform.ndim == 2:
    waveform = waveform.mean(axis=1)

waveform = torch.from_numpy(waveform) # Convert to torch tensor

# Resample to 16kHz if needed (since the model expects 16kHz audio)
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000

# Prepare input for the model
inputs = processor(
    waveform.squeeze(), # Remove any extra dimensions
    sampling_rate=16000,
    return_tensors="pt", # Return PyTorch tensors
)

# Forward pass through the model
with torch.no_grad():
    logits = wav2vec(**inputs).logits # logit = time_step_1 -> [prob_sound_1, ...]

# for each time slice, choose sound with highest probability
predicted_ids = torch.argmax(logits, dim=-1)[0]
# Convert token IDs to human readable tokens
tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids.tolist())
tokens # each token is 20ms of audio that represents what was spoken
# <pad> = nothing meaningful happened in this 20ms slice
# | = word boundary


['<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 'B',
 'E',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 'F',
 '<pad>',
 'O',
 '<pad>',
 'R',
 'E',
 '<pad>',
 '|',
 '|',
 'H',
 'H',
 'E',
 '<pad>',
 '|',
 '|',
 '<pad>',
 'H',
 '<pad>',
 'A',
 '<pad>',
 'D',
 '<pad>',
 '|',
 '|',
 '<pad>',
 '<pad>',
 'T',
 'T',
 '<pad>',
 'I',
 '<pad>',
 'M',
 'M',
 'E',
 '<pad>',
 '|',
 '|',
 'T',
 'O',
 'O',
 '|',
 '|',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 'A',
 '<pad>',
 '<pad>',
 'N',
 'N',
 '<pad>',
 '<pad>',
 '<pad>',
 'S',
 '<pad>',
 'W',
 '<pad>',
 'E',
 'E',
 'R',
 'R',
 '<pad>',
 '<pad>',
 '|',
 '|',
 '|',
 '|',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad

In [18]:
FRAME_SEC = 0.02  # 20 ms per token

events = []
current = None

for i, tok in enumerate(tokens):
    t = i * FRAME_SEC

    if tok == '<pad>':
        if current:
            current["end"] = t
            events.append(current)
            current = None
        continue

    if current and current["label"] == tok:
        # same sound continuing (elongation)
        continue

    if current:
        current["end"] = t
        events.append(current)

    current = {
        "label": tok,
        "start": t
    }

if current:
    current["end"] = t
    events.append(current)

events

[{'label': 'B', 'start': 0.54, 'end': 0.56},
 {'label': 'E', 'start': 0.56, 'end': 0.58},
 {'label': 'F', 'start': 0.66, 'end': 0.68},
 {'label': 'O', 'start': 0.7000000000000001, 'end': 0.72},
 {'label': 'R', 'start': 0.74, 'end': 0.76},
 {'label': 'E', 'start': 0.76, 'end': 0.78},
 {'label': '|', 'start': 0.8, 'end': 0.84},
 {'label': 'H', 'start': 0.84, 'end': 0.88},
 {'label': 'E', 'start': 0.88, 'end': 0.9},
 {'label': '|', 'start': 0.92, 'end': 0.96},
 {'label': 'H', 'start': 0.98, 'end': 1.0},
 {'label': 'A', 'start': 1.02, 'end': 1.04},
 {'label': 'D', 'start': 1.06, 'end': 1.08},
 {'label': '|', 'start': 1.1, 'end': 1.1400000000000001},
 {'label': 'T', 'start': 1.18, 'end': 1.22},
 {'label': 'I', 'start': 1.24, 'end': 1.26},
 {'label': 'M', 'start': 1.28, 'end': 1.32},
 {'label': 'E', 'start': 1.32, 'end': 1.34},
 {'label': '|', 'start': 1.36, 'end': 1.4000000000000001},
 {'label': 'T', 'start': 1.4000000000000001, 'end': 1.42},
 {'label': 'O', 'start': 1.42, 'end': 1.46},
 {'

In [19]:
pauses = []

for i in range(1, len(events)):
    gap = events[i]["start"] - events[i-1]["end"]
    if gap >= 0.3:  # 300 ms
        pauses.append({
            "type": "pause",
            "start": events[i-1]["end"],
            "end": events[i]["start"],
            "duration": gap
        })

pauses

[{'type': 'pause',
  'start': 2.02,
  'end': 2.36,
  'duration': 0.33999999999999986},
 {'type': 'pause', 'start': 5.48, 'end': 5.9, 'duration': 0.41999999999999993},
 {'type': 'pause',
  'start': 6.28,
  'end': 6.640000000000001,
  'duration': 0.3600000000000003},
 {'type': 'pause', 'start': 7.62, 'end': 8.68, 'duration': 1.0599999999999996}]

In [20]:
stutters = []

for i in range(1, len(events)):
    prev = events[i - 1]
    curr = events[i]

    if (
        curr["label"] == prev["label"]
        and curr["start"] - prev["end"] < 0.05
    ):
        stutters.append({
            "type": "stutter",
            "label": curr["label"],
            "start": prev["start"],
            "end": curr["end"]
        })


stutters

[]

In [23]:
FILLER_PATTERNS = [
    # --- Single vowel fillers (require duration >= threshold) ---
    ["A"],   # ah
    ["E"],   # eh
    ["U"],   # uh
    ["O"],   # oh

    # --- Repeated vowel elongations ---
    ["A", "A"],
    ["E", "E"],
    ["U", "U"],
    ["O", "O"],

    # --- Canonical English fillers ---
    ["U", "H"],   # uh
    ["E", "R"],   # er
    ["A", "H"],   # ah
    ["M", "M"],   # mm

    # --- Nasal thinking sounds ---
    ["M"],
    ["N"],
    ["M", "M", "M"],
    ["N", "N"],

    # --- Mixed vowel + nasal (umm / amm / unn) ---
    ["A", "M"],
    ["U", "M"],
    ["E", "M"],
    ["A", "N"],
    ["U", "N"],

    # --- Breathy hesitation ---
    ["H"],
    ["H", "H"],
]


FRAME_SEC = 0.02  # 20 ms per token

fillers = []

i = 1  # start at 1 so we can look left
while i < len(tokens) - 2:  # stop early so we can look right
    pair = [tokens[i], tokens[i + 1]]

    if pair in FILLER_PATTERNS:
        left = tokens[i - 1]
        right = tokens[i + 2]

        # ðŸ”’ CRITICAL RULE: must be surrounded by silence
        if left == "<pad>" and right == "<pad>":
            start = i * FRAME_SEC
            end = (i + 2) * FRAME_SEC
            duration = end - start

            if duration >= 0.12:  # minimum audible filler
                fillers.append({
                    "type": "filler",
                    "label": "".join(pair).lower(),
                    "start": start,
                    "end": end,
                    "duration": duration
                })
            i += 2
        else:
            # embedded in a real word â†’ ignore
            i += 1
    else:
        i += 1

fillers

[]