In [245]:
import whisper
import pandas as pd
from dotenv import load_dotenv

In [246]:
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 0)
load_dotenv()
audio_file_path = "sample5.flac"

In [247]:
model = whisper.load_model("base", device="cpu")

In [248]:
# result = model.transcribe(
#     audio_file_path,
#     task="transcribe",
#     temperature=0,
#     word_timestamps=True,
#     condition_on_previous_text=False,
#     initial_prompt=(
#         "Transcribe verbatim. Include filler words like um, uh, er, "
#         "false starts, repetitions, and hesitations."
#     ),
#     fp16=False,
# )

result = model.transcribe(
    audio_file_path,
    task="transcribe",
    word_timestamps=True,
    fp16=False,
)

In [249]:
words = []
for seg in result["segments"]:
    for w in seg["words"]:
        words.append({
            "word": w["word"].strip(),
            "start": float(w["start"]),
            "end": float(w["end"]),
            "duration": float(w["end"] - w["start"]),
            "confidence": float(w["probability"])
        })

df_words = pd.DataFrame(words)
df_words.sort_values("confidence").head(5)
df_words.head()

Unnamed: 0,word,start,end,duration,confidence
0,We've,0.0,0.66,0.66,0.682309
1,been,0.66,0.76,0.1,0.990956
2,talking,0.76,1.04,0.28,0.996788
3,about,1.04,1.3,0.26,0.998473
4,a,1.3,1.44,0.14,0.994192


In [250]:
segments = []
for seg in result["segments"]:
    segments.append({
        "text": seg["text"].strip(),
        "start": float(seg["start"]),
        "end": float(seg["end"]),
        "duration": float(float(seg["end"]) - float(seg["start"])),
        "avg_word_confidence": sum([float(w["probability"]) for w in seg["words"]]) / (len(seg["words"]) if len(seg["words"]) > 0 else 0.0)
    })

df_segments = pd.DataFrame(segments)
with pd.option_context('display.max_colwidth', None):
    print(df_segments.head()) 

                                                                                          text  \
0                                We've been talking about a well-known person that you admire.   
1          And I'd like to discuss with you one or two more general questions related to this.   
2                               Now let's consider first of all famous people in your country.   
3                                           What kind of people become famous in Saudi Arabia?   
4  I'm thinking maybe the host they are working in the media or who they usually are presented   

   start    end  duration  avg_word_confidence  
0   0.00   2.84      2.84             0.957558  
1   4.24   8.38      4.14             0.984768  
2   8.80  11.84      3.04             0.855413  
3  13.12  16.00      2.88             0.956398  
4  17.24  28.54     11.30             0.756150  


In [251]:
total_duration = float(df_segments.iloc[-1]['end']) #- df_segments.iloc[0]['start'])
words_per_minute = (len(df_words) * 60) / (total_duration) 

print(f"Total duration (s): {total_duration:.2f}")
print(f"Number of words: {len(df_words)}")
print(f"Words per minute: {words_per_minute:.2f}")

Total duration (s): 93.46
Number of words: 183
Words per minute: 117.48


In [252]:
pauses = df_words["start"].iloc[1:].values - df_words["end"].iloc[:-1].values
long_pauses = pauses[pauses > 1.0]
very_long_pauses = pauses[pauses > 2.0]
pauses

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.4 ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.42, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 1.28, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 1.24, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.3 , 0.  , 0.  , 0.82, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 2.2 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.3 , 0.  , 0.  , 0.18, 0.  , 0.  , 0.  ,
       0.72, 0.  , 0.  , 0.  , 0.  , 0.  , 0.34, 0.  , 0.  , 0.  , 0.62,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.

In [253]:
words_clean = df_words['word'].str.lower()
words_unique = words_clean.nunique()
words_total = len(words_clean)
vocab_richness = words_unique / words_total if words_total > 0 else 0
top_repeats = words_clean.value_counts().head(5)
print(f"Total words: {words_total}")
print(f"Unique words: {words_unique}")
print(f"Vocabulary richness: {vocab_richness:.2f}")

Total words: 183
Unique words: 98
Vocabulary richness: 0.54


In [254]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h") #Audio → numbers → shape the model expects
wav2vec = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h") #The model itself
wav2vec.eval(); #Set to eval mode (we are not training)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [255]:
waveform, sr = sf.read(audio_file_path, dtype="float32") # waveform: np.ndarray, sr: int, each number in waveform signifies amplitude at that time

# Convert stereo → mono
if waveform.ndim == 2:
    waveform = waveform.mean(axis=1)

waveform = torch.from_numpy(waveform) # Convert to torch tensor

# Resample to 16kHz if needed (since the model expects 16kHz audio)
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
    sr = 16000

# Prepare input for the model
inputs = processor(
    waveform.squeeze(), # Remove any extra dimensions
    sampling_rate=16000,
    return_tensors="pt", # Return PyTorch tensors
)

# Forward pass through the model
with torch.no_grad():
    logits = wav2vec(**inputs).logits # logit = time_step_1 -> [prob_sound_1, ...]

# for each time slice, choose sound with highest probability
predicted_ids = torch.argmax(logits, dim=-1)[0]
# Convert token IDs to human readable tokens
tokens = processor.tokenizer.convert_ids_to_tokens(predicted_ids.tolist())
#tokens # each token is 20ms of audio that represents what was spoken
# <pad> = nothing meaningful happened in this 20ms slice
# | = word boundary


In [256]:
FRAME_SEC = 0.02  # 20 ms per token

events = []
current = None

for i, tok in enumerate(tokens):
    t = i * FRAME_SEC

    if tok == '<pad>':
        if current:
            current["end"] = t
            events.append(current)
            current = None
        continue

    if current and current["label"] == tok:
        # same sound continuing (elongation)
        continue

    if current:
        current["end"] = t
        events.append(current)

    current = {
        "label": tok,
        "start": t
    }

if current:
    current["end"] = t
    events.append(current)

events;

In [257]:
MIN_GAP = 0.06       # 60 ms
MIN_EVENT = 0.12     # 120 ms

merged = []
current = None

for e in events:
    label = e["label"]

    # hard boundary: word separator
    if label == "|":
        if current:
            current["end"] = e["start"]
            if current["end"] - current["start"] >= MIN_EVENT:
                merged.append(current)
            current = None
        continue

    if current is None:
        current = {
            "labels": [label],
            "start": e["start"],
            "end": e["end"]
        }
        continue

    gap = e["start"] - current["end"]

    if gap <= MIN_GAP:
        # same acoustic blob
        current["labels"].append(label)
        current["end"] = e["end"]
    else:
        # close previous blob
        if current["end"] - current["start"] >= MIN_EVENT:
            merged.append(current)
        current = {
            "labels": [label],
            "start": e["start"],
            "end": e["end"]
        }

# flush last
if current and (current["end"] - current["start"] >= MIN_EVENT):
    merged.append(current)

df_wav2vec_merged = pd.DataFrame([
    {
        "labels": "".join(m["labels"]),
        "start": m["start"],
        "end": m["end"],
        "duration": m["end"] - m["start"]
    }
    for m in merged
])

df_wav2vec_merged.head()


Unnamed: 0,labels,start,end,duration
0,WE'VE,0.54,0.66,0.12
1,TALKING,0.84,1.14,0.3
2,ABOUT,1.18,1.36,0.18
3,WELL,1.48,1.62,0.14
4,KNOWN,1.66,1.84,0.18


In [258]:
pauses = []

for i in range(1, len(events)):
    gap = events[i]["start"] - events[i-1]["end"]
    if gap >= 0.3:  # 300 ms
        pauses.append({
            "type": "pause",
            "start": events[i-1]["end"],
            "end": events[i]["start"],
            "duration": gap
        })

pauses;

In [259]:
import soundfile as sf
import torchaudio

def load_audio(path):
    audio, sr = sf.read(path, dtype="float32")
    if audio.ndim == 2:
        audio = audio.mean(axis=1)  # stereo → mono
    if sr != 16000:
        audio = torchaudio.functional.resample(
            torch.from_numpy(audio), sr, 16000
        ).numpy()
        sr = 16000
    return audio, sr

audio, sr = load_audio(audio_file_path)

In [260]:
import whisperx
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

align_model, metadata = whisperx.load_align_model(
    language_code="en",  # or result["language"] if available
    device=device
)

aligned = whisperx.align(
    result["segments"],  # <-- your existing Whisper output
    align_model,
    metadata,
    audio,
    device
)

aligned_words = []

for seg in aligned["segments"]:
    for w in seg.get("words", []):
        if w["start"] is not None and w["end"] is not None:
            aligned_words.append({
                "word": w["word"].strip().lower(),
                "start": float(w["start"]),
                "end": float(w["end"])
            })

df_aligned_words = pd.DataFrame(aligned_words)
df_aligned_words = df_aligned_words.sort_values("start").reset_index(drop=True)
df_aligned_words.head()

Unnamed: 0,word,start,end
0,we've,0.0,0.69
1,been,0.71,0.832
2,talking,0.872,1.177
3,about,1.217,1.4
4,a,1.44,1.461


In [261]:
def overlaps_any_word_relaxed(start, end, words, tol=0.05):
    for _, w in words.iterrows():
        if start < w["end"] + tol and end > w["start"] - tol:
            return True
    return False

df_wav2vec_merged["overlaps_word"] = df_wav2vec_merged.apply(
    lambda r: overlaps_any_word_relaxed(
        r["start"], r["end"], df_aligned_words
    ),
    axis=1
)

df_non_word = df_wav2vec_merged[~df_wav2vec_merged["overlaps_word"]]
df_non_word.head()

Unnamed: 0,labels,start,end,duration,overlaps_word
35,ER,19.98,20.2,0.22,False
39,E,24.28,24.54,0.26,False
42,EM,26.08,26.34,0.26,False
54,A,32.3,32.42,0.12,False
60,E,36.02,36.16,0.14,False


In [262]:
import re

# conservative mapping: only sounds we are confident about
FILLER_MAP = {
    "A": "uh",
    "E": "uh",
    "U": "uh",
    "M": "um",
    "N": "um",
    "MM": "um",
    "NN": "um",
}


def classify_non_word_event(row):
    label = row["labels"].upper()
    duration = row["duration"]

    # Normalize repeated letters (e.g. MM → M)
    norm = re.sub(r"(.)\1+", r"\1", label)

    # Case 1: canonical filler sound
    if norm in FILLER_MAP:
        return {
            "type": "filler",
            "raw_label": label,
            "normalized": norm,
            "text": FILLER_MAP[norm],
        }

    # Case 2: looks like a real word fragment → stutter
    if len(label) >= 2 and label.isalpha():
        return {
            "type": "stutter",
            "raw_label": label,
            "normalized": label.lower(),
            "text": label.lower()
        }

    # Case 3: everything else → ignore
    return None


converted = []

for _, row in df_non_word.iterrows():
    result = classify_non_word_event(row)
    if result:
        converted.append({
            "type": result["type"],
            "text": result["text"],
            "raw_label": result["raw_label"],
            "start": row["start"],
            "end": row["end"],
            "duration": row["duration"]
        })

df_filler_events = pd.DataFrame(converted)
df_filler_events.head()

Unnamed: 0,type,text,raw_label,start,end,duration
0,stutter,er,ER,19.98,20.2,0.22
1,filler,uh,E,24.28,24.54,0.26
2,stutter,em,EM,26.08,26.34,0.26
3,filler,uh,A,32.3,32.42,0.12
4,filler,uh,E,36.02,36.16,0.14


In [263]:
filler_model = whisper.load_model("base", device="cpu")
verbatim_result = filler_model.transcribe(
    audio_file_path,
    task="transcribe",
    temperature=0,
    word_timestamps=True,
    condition_on_previous_text=False,
    initial_prompt=(
        "Transcribe verbatim. Include filler words like um, uh, er, "
        "false starts, repetitions, and hesitations."
    ),
    fp16=False,
)

import re

FILLER_PATTERN = re.compile(
    r"^(um+|uh+|erm+|er+|ah+|eh+)$",
    re.IGNORECASE
)

def normalize_whisper_token(token: str) -> str:
    """
    Normalize Whisper word tokens like:
    'um,' -> 'um'
    'uh...' -> 'uh'
    'erm-' -> 'erm'
    """
    token = token.lower().strip()

    # remove leading/trailing punctuation
    token = re.sub(r"^[^\w]+|[^\w]+$", "", token)

    return token


whisper_fillers = []

for seg in verbatim_result.get("segments", []):
    for w in seg.get("words", []):
        raw = w["word"]
        norm = normalize_whisper_token(raw)

        if not norm:
            continue

        if FILLER_PATTERN.match(norm):
            whisper_fillers.append({
                "type": "filler",
                "text": norm,                 # canonical filler
                "raw_text": raw,              # keep original for debugging
                "start": float(w["start"]),
                "end": float(w["end"]),
                "duration": float(w["end"] - w["start"]),
                "confidence": float(w["probability"]),
            })
if whisper_fillers:
    df_whisper_fillers = pd.DataFrame(whisper_fillers)

    # df_whisper_fillers = df_whisper_fillers[
    #     (df_whisper_fillers["duration"] >= 0.25) &
    #     (df_whisper_fillers["confidence"] >= 0.5)
    # ].reset_index(drop=True)
else:
    df_whisper_fillers = pd.DataFrame(
        columns=[
            "type",
            "text",
            "raw_text",
            "start",
            "end",
            "duration",
            "confidence",
        ]
    )
    
df_whisper_fillers

Unnamed: 0,type,text,raw_text,start,end,duration,confidence
0,filler,um,"um,",18.88,20.06,1.18,0.016718
1,filler,uh,"uh,",24.26,24.38,0.12,0.674564
2,filler,um,"um,",26.0,26.22,0.22,0.88938


In [264]:
def overlaps_time(a_start, a_end, b_start, b_end, tol=0.08):
    """
    Returns True if two time intervals overlap within tolerance.
    """
    return (a_start < b_end + tol) and (a_end > b_start - tol)

final_fillers = []

FINAL_FILLER_COLUMNS = [
    "style",
    "type",
    "text",
    "raw_label",
    "start",
    "end",
    "duration",
]

# 1️⃣ Add all wav2vec fillers first (ground truth)
for _, row in df_filler_events.iterrows():
    final_fillers.append({
        "style": "subtle",
        "type": row["type"],
        "text": row["text"],
        "raw_label": row.get("raw_label", None),
        "start": row["start"],
        "end": row["end"],
        "duration": row["duration"],
    })

for _, wf in df_whisper_fillers.iterrows():
    is_duplicate = False

    for af in final_fillers:
        if overlaps_time(
            wf["start"], wf["end"],
            af["start"], af["end"]
        ):
            is_duplicate = True
            break

    if not is_duplicate:
        final_fillers.append({
            "style": "clear",
            "type": "filler",
            "text": wf["text"],
            "raw_label": wf["raw_text"],
            "start": wf["start"],
            "end": wf["end"],
            "duration": wf["duration"],
        })

df_final_fillers = pd.DataFrame(
    final_fillers,
    columns=FINAL_FILLER_COLUMNS
)

if not df_final_fillers.empty:
    df_final_fillers = (
        df_final_fillers
        .sort_values("start")
        .reset_index(drop=True)
    )

df_final_fillers

Unnamed: 0,style,type,text,raw_label,start,end,duration
0,subtle,stutter,er,ER,19.98,20.2,0.22
1,subtle,filler,uh,E,24.28,24.54,0.26
2,subtle,stutter,em,EM,26.08,26.34,0.26
3,subtle,filler,uh,A,32.3,32.42,0.12
4,subtle,filler,uh,E,36.02,36.16,0.14
5,subtle,stutter,speial,SPEIAL,38.7,38.96,0.26
6,subtle,filler,uh,E,41.56,41.8,0.24
7,subtle,stutter,they,THEY,51.68,51.86,0.18
8,subtle,stutter,the,THE,54.88,55.22,0.34
9,subtle,stutter,be,BE,66.62,66.88,0.26
