In [7]:
#!/usr/bin/env python3
# ---------------------------------------------------------------
#  devclean_transcribe_sample.py   –  100-clip demo run
# ---------------------------------------------------------------
import os
import pathlib
import random
import re

import pandas as pd
import soundfile as sf
import torch
import whisper
from tqdm.auto import tqdm

# ----------------------- CONFIG --------------------------------
# Root of the LibriSpeech dev-clean split. Set the LIBRISPEECH_DEV_CLEAN
# environment variable to point at a copy elsewhere; the literal path below is
# only the fallback used when that variable is unset or empty.
ROOT_DIR      = pathlib.Path(
    os.environ.get("LIBRISPEECH_DEV_CLEAN")
    or r"C:/Users/shuwu/OneDrive/Desktop/MLDS/text_analytics/LibriSpeech/dev-clean"
)
WHISPER_MODEL = "small"                    # checkpoint name handed to whisper.load_model
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"
FP16          = (DEVICE == "cuda")         # True only when running on CUDA
SAMPLE_SIZE   = 100                        # ← change here for more / fewer clips
OUT_FILE      = f"devclean_transcripts_{SAMPLE_SIZE}.parquet"   # name tracks SAMPLE_SIZE
# ---------------------------------------------------------------

# Collect every FLAC below ROOT_DIR (recursively) and put the paths in sorted
# order, so the seeded draw below starts from the same list on every run.
flacs = list(ROOT_DIR.rglob("*.flac"))
flacs.sort()
if len(flacs) == 0:
    raise SystemExit("No FLAC files found – check ROOT_DIR")

# Fixed seed -> the same random subset each time; never ask for more clips
# than actually exist.
random.seed(42)
n_wanted = min(SAMPLE_SIZE, len(flacs))
flac_subset = random.sample(flacs, k=n_wanted)
print(f"Selected {len(flac_subset)} random clips out of {len(flacs)} total")


Selected 100 random clips out of 2703 total


In [8]:
# Load the Whisper checkpoint selected by WHISPER_MODEL onto DEVICE.
asr = whisper.load_model(
    WHISPER_MODEL,
    device=DEVICE,
    in_memory=False,
)

def parse_ids(p: pathlib.Path):
    """Extract the numeric IDs encoded in a clip's file name.

    The file name (not the directory part) must have the exact form
    ``<speaker>-<chapter>-<utterance>.flac``, each field being a run of digits.

    Args:
        p: Path to the clip; only its final component is inspected. A plain
            string is accepted as well.

    Returns:
        A ``(speaker, chapter, utterance)`` tuple of ints.

    Raises:
        ValueError: If the file name does not follow the expected pattern.
    """
    file_name = pathlib.Path(p).name
    # fullmatch (not match) so trailing junk such as "1-2-3.flac.bak" is rejected.
    m = re.fullmatch(r"(\d+)-(\d+)-(\d+)\.flac", file_name)
    if m is None:
        # Fail with a readable message instead of AttributeError on None.groups().
        raise ValueError(
            f"Unexpected clip file name {file_name!r}; "
            "expected '<speaker>-<chapter>-<utterance>.flac'"
        )
    spk, chap, utt = m.groups()
    return int(spk), int(chap), int(utt)

def duration(path: str) -> float:
    """Return the length of the audio file at ``path`` in seconds.

    Computed as frame count divided by sample rate, both taken from
    ``sf.info(path)``.
    """
    meta = sf.info(path)
    n_frames, rate = meta.frames, meta.samplerate
    return n_frames / rate

# Transcribe each sampled clip and collect one row per clip.
records = []
for clip in tqdm(flac_subset, unit="file"):
    clip_path = str(clip)

    # Whisper returns a dict; only the stripped "text" field is kept.
    result = asr.transcribe(clip_path, fp16=FP16)
    transcript = result["text"].strip()

    speaker_id, chapter_id, utterance_id = parse_ids(clip)

    row = dict(
        file=clip_path,
        speaker=speaker_id,
        chapter=chapter_id,
        utt_id=utterance_id,
        seconds=duration(clip_path),
        text=transcript,
    )
    records.append(row)

# Build the frame once from the accumulated rows and persist it.
transcripts = pd.DataFrame(records)
transcripts.to_parquet(OUT_FILE, index=False)
print(f"✓ Saved {len(records)} rows to {OUT_FILE}")


100%|██████████| 100/100 [01:25<00:00,  1.17file/s]

✓ Saved 100 rows to devclean_transcripts_100.parquet





In [10]:
import pandas as pd

# Reload the transcripts written by the transcription cell. Reading OUT_FILE
# (instead of a hard-coded file name) keeps this cell pointing at the right
# file when SAMPLE_SIZE is changed in the config.
df = pd.read_parquet(OUT_FILE)

print(df.shape)        # rows, columns
df.head()       # first 5 rows


(100, 6)


Unnamed: 0,file,speaker,chapter,utt_id,seconds,text
0,C:\Users\shuwu\OneDrive\Desktop\MLDS\text_anal...,84,121550,7,9.525,"By his default, short while he sojourned here...."
1,C:\Users\shuwu\OneDrive\Desktop\MLDS\text_anal...,1993,147965,7,7.57,"His face had a look of weariness and pleasure,..."
2,C:\Users\shuwu\OneDrive\Desktop\MLDS\text_anal...,1462,170142,1,9.585,Emerging at Houston at half past three o'clock...
3,C:\Users\shuwu\OneDrive\Desktop\MLDS\text_anal...,3081,166546,20,5.34,He wants me to stand ready to obey any summons...
4,C:\Users\shuwu\OneDrive\Desktop\MLDS\text_anal...,2803,161169,0,11.7,"Once, there was a father who thought he would ..."
