# Import Required Libraries

In [1]:
import os
import re
from functools import lru_cache

import pandas as pd
from pydub import AudioSegment
from transformers import pipeline
from jiwer import wer, cer

# Path Variables

In [2]:
root_path = "../../data/Speech-to-Text/"  # dataset root, relative to the notebook
mix_selection = "Bengali-English_test" # "Bengali-English_test", "Hindi-English_test"
dataset_dir = os.path.join(root_path, mix_selection)  # folder of the selected language pair
file_directory = os.path.join(dataset_dir, "test")  # source .wav recordings are looked up here
segment_file = os.path.join(dataset_dir, "test/transcripts/segments")  # utterance id, file id, start, end
transcript_file = os.path.join(dataset_dir, "test/transcripts/text")  # utterance id followed by its transcript
ground_truth_csv = os.path.join(root_path, f"{mix_selection}en_test.csv")  # cached table; name is the selection + "en_test.csv" with no separator

# Ground Truth Preparation

In [3]:
if not os.path.exists(ground_truth_csv):
    # First run: build the ground-truth table from the segments and transcript
    # files, then cache it as CSV so later runs take the `else` branch.

    # Read-only mode is sufficient (the files are never written), and the
    # encoding is explicit because the transcripts contain non-ASCII script and
    # the platform default may not be UTF-8. Assumes the files are UTF-8 encoded.
    with open(segment_file, "r", encoding="utf-8") as read_file:
        segment_lines = read_file.read().split("\n")

    # Each usable segment line has exactly four fields:
    # <utt_id> <file_id> <start> <end>; anything else (e.g. blank lines) is skipped.
    segment_rows = []
    for line in segment_lines:
        fields = line.split()
        if len(fields) == 4:
            segment_rows.append(
                {
                    # The speaker id is the part of the utterance id before the first "_".
                    "speaker_id": fields[0].split("_")[0],
                    "utt_id": fields[0],
                    "file_id": fields[1],
                    "start": fields[2],
                    "end": fields[3],
                }
            )

    df1 = pd.DataFrame(
        segment_rows,
        columns=["speaker_id", "utt_id", "file_id", "start", "end"],
    )

    with open(transcript_file, "r", encoding="utf-8") as read_file:
        transcript_lines = read_file.read().split("\n")

    # Each usable transcript line is "<utt_id> <word> <word> ..."; lines with
    # an id but no words are skipped.
    transcript_rows = []
    for line in transcript_lines:
        fields = line.split()
        if len(fields) >= 2:
            transcript_rows.append(
                {
                    "utt_id": fields[0],
                    # Column name keeps its historical spelling: it is stored in the
                    # cached CSV and read by later cells.
                    "transript": " ".join(fields[1:]),
                }
            )

    df2 = pd.DataFrame(transcript_rows, columns=["utt_id", "transript"])

    # Inner join: only utterances present in both files are kept.
    df = df1.merge(df2, on=["utt_id"])

    df["file_path"] = df["file_id"].apply(lambda x: os.path.join(file_directory, x + ".wav"))

    df.to_csv(ground_truth_csv, index=False)

else:
    df = pd.read_csv(ground_truth_csv)

In [4]:
df  # display the ground-truth table built or loaded by the previous cell

Unnamed: 0,speaker_id,utt_id,file_id,start,end,transript,file_path
0,108223,108223_rrmCKQYc3DSNqLao_0000,rrmCKQYc3DSNqLao,0.0,8.0,libreoffice impressএর উপর এই কথ্য tutorial এ আ...,../../data/Speech-to-Text/Bengali-English_test...
1,108223,108223_rrmCKQYc3DSNqLao_0001,rrmCKQYc3DSNqLao,8.0,15.0,in tutorial আমরা শিখবো কিভাবে : slides পটভূমি ...,../../data/Speech-to-Text/Bengali-English_test...
2,108223,108223_rrmCKQYc3DSNqLao_0002,rrmCKQYc3DSNqLao,15.0,24.0,আপনি operating system হিসাবে gnu/linux এবং lib...,../../data/Speech-to-Text/Bengali-English_test...
3,108223,108223_rrmCKQYc3DSNqLao_0003,rrmCKQYc3DSNqLao,24.0,32.0,পটভূমি বলতে slideএ প্রয়োগ করা সেইসব রং এবং প্...,../../data/Speech-to-Text/Bengali-English_test...
4,108223,108223_rrmCKQYc3DSNqLao_0004,rrmCKQYc3DSNqLao,32.0,38.0,libreoffice impressএ অনেক পটভূমি বিকল্প থাকে য...,../../data/Speech-to-Text/Bengali-English_test...
...,...,...,...,...,...,...,...
4270,990934,990934_CqG43iY1ZcuxOToG_0156,CqG43iY1ZcuxOToG,807.0,814.0,বিস্তারিত তথ্যের জন্য contact @spokentutorial ...,../../data/Speech-to-Text/Bengali-English_test...
4271,990934,990934_CqG43iY1ZcuxOToG_0157,CqG43iY1ZcuxOToG,814.0,819.0,spoken tutorial প্রকল্প talk to a teacher প্রক...,../../data/Speech-to-Text/Bengali-English_test...
4272,990934,990934_CqG43iY1ZcuxOToG_0158,CqG43iY1ZcuxOToG,819.0,826.0,এটি ভারত সরকারের ict mhrd এর জাতীয় শিক্ষা mis...,../../data/Speech-to-Text/Bengali-English_test...
4273,990934,990934_CqG43iY1ZcuxOToG_0159,CqG43iY1ZcuxOToG,826.0,832.0,এই বিষয়ে বিস্তারিত তথ্য এই লিঙ্কে প্রাপ্তিসাধ্য,../../data/Speech-to-Text/Bengali-English_test...


# Speech-to-Text

In [5]:
split_folder = os.path.join(file_directory, "splits")  # per-utterance .wav chunks are exported into this folder
os.makedirs(split_folder, exist_ok=True)  # create it if needed; no error when it already exists

### Create Chunks from Timestamps

In [6]:
@lru_cache(maxsize=1)
def _load_audio(file_path):
    """Decode one source recording, remembering only the most recent one.

    Many rows reference the same recording, so caching the last decoded file
    avoids re-reading it for every segment. Only a single entry is kept to
    bound memory use; this pays off when rows of the same file are adjacent
    (presumably the case for the segments file -- verify if the table is shuffled).
    """
    return AudioSegment.from_file(file_path)


def process_chunk(row):
    """Cut one utterance out of its source recording and save it as a WAV file.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``file_path``
            (source audio), ``start`` and ``end`` (segment bounds in seconds,
            as numbers or numeric strings) and ``utt_id`` (used as file name).

    Returns:
        Path of the exported chunk inside ``split_folder``, or ``None`` when
        anything goes wrong (the error is printed, not raised, so one bad row
        does not abort processing of the whole table).
    """
    try:
        audio = _load_audio(row["file_path"])

        # Segment bounds are converted from seconds to the millisecond
        # offsets used for slicing the audio.
        start_ms = int(float(row["start"]) * 1000)
        end_ms = int(float(row["end"]) * 1000)

        chunk = audio[start_ms:end_ms]

        # Define output file path
        chunk_filename = f"{row['utt_id']}.wav"
        chunk_path = os.path.join(split_folder, chunk_filename)

        # Export chunk
        chunk.export(chunk_path, format="wav")

        return chunk_path

    except Exception as e:
        print(f"Error processing {row['utt_id']}: {e}")
        return None

In [7]:
if "chunk_path" not in df.columns:
    # Chunking has not been done for this table yet: export one file per row,
    # record the resulting paths and persist them so the step is skipped next time.
    exported_paths = df.apply(process_chunk, axis=1)
    df["chunk_path"] = exported_paths
    df.to_csv(ground_truth_csv, index=False)

### Transcribe

In [8]:
available_models = [
    # (key used as the result column name, Hugging Face model id)
    ("Whisper-Base", "openai/whisper-base"),
    ("Whisper-Medium", "openai/whisper-medium"),
    ("Whisper-Large-v2", "openai/whisper-large-v2"),
    ("Wav2Vec2-Large", "facebook/wav2vec2-large-960h"),
]

# Credentials must never be committed with the notebook: the token is taken
# from the HF_TOKEN environment variable and is None when that is unset.
# NOTE(review): a token that was previously hard-coded here should be treated
# as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

In [None]:
for i in range(3, 4):
    # `choose_model` is a 1-based position in `available_models`;
    # range(3, 4) therefore selects only the third entry.
    choose_model = i
    model_key, model_id = available_models[choose_model - 1]
    df = pd.read_csv(ground_truth_csv)
    # A column named after the model means it was already transcribed and cached.
    if model_key not in df.columns:
        # Every model except Wav2Vec2-Large is called with a language hint.
        # Resolve it once per model instead of once per row, and fail loudly
        # up front: previously an unrecognised selection left `lang` unbound,
        # the resulting error was swallowed per row, and an all-empty column
        # was written to the cache.
        needs_language = model_key != "Wav2Vec2-Large"
        lang = None
        if needs_language:
            selection = mix_selection.lower()
            if "hin" in selection:
                lang = "hi"
            if "ben" in selection:
                lang = "bn"
            if lang is None:
                raise ValueError(
                    f"Cannot infer a language code from mix_selection={mix_selection!r}; "
                    "expected it to contain 'hin' or 'ben'."
                )

        stt_pipeline = pipeline(model=model_id, task="automatic-speech-recognition", token=hf_token, device=0)

        def transcribe(row):
            """Transcribe the audio chunk of one row with the current pipeline.

            Returns the recognised text, or an empty string when transcription
            fails (the error is printed so the remaining rows still run).
            """
            audio_path = row["chunk_path"]
            try:
                if needs_language:
                    result = stt_pipeline(audio_path, return_timestamps=True, generate_kwargs={"language": lang})
                else:
                    result = stt_pipeline(audio_path)
                return result.get("text", "")
            except Exception as e:
                print(f"Error transcribing {audio_path}: {e}")
                return ""

        df[model_key] = df.apply(transcribe, axis=1)
        df.to_csv(ground_truth_csv, index=False)

Device set to use cuda:0
You have passed language=bn, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=bn.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if

In [None]:
df  # display the table after the transcription step

# Evaluation

In [None]:
def normalize(text):
    """Normalize mixed-script text for comparison.

    Lower-cases the text, removes punctuation and collapses runs of
    whitespace. Missing values (``None``, ``""`` or a float NaN) yield ``""``.

    Args:
        text: The text to normalize; non-string values are converted with ``str``.

    Returns:
        The normalized string.
    """
    # `text != text` is only true for NaN, which is truthy and would otherwise
    # reach the string operations below.
    if not text or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower().strip()
    # `\w` does not match the combining vowel signs of Indic scripts, so the
    # script blocks are whitelisted explicitly: Devanagari (Hindi) alongside
    # Bengali. U+0964/U+0965 (danda, double danda) are left out of the
    # Devanagari range so that this punctuation is still removed.
    text = re.sub(r'[^\w\s\u0900-\u0963\u0966-\u097F\u0980-\u09FF]', '', text)
    return re.sub(r'\s+', ' ', text).strip()
    
print(f"Language pair: {mix_selection}")
for i in range(3, 4):
    # `choose_model` is a 1-based position in `available_models`.
    choose_model = i
    model_key, model_id = available_models[choose_model - 1]
    # Cells that are empty in the cached CSV come back as NaN; str() would turn
    # them into the literal word "nan" and they would be scored instead of
    # skipped, so they are mapped to "" before normalization.
    refs = ["" if pd.isna(t) else normalize(str(t)) for t in df.transript]
    hyps = ["" if pd.isna(t) else normalize(str(t)) for t in df[model_key]]

    assert len(refs) == len(hyps), f"Length mismatch: {len(refs)} vs {len(hyps)}"

    validated_refs = []
    validated_hyps = []

    # Pairs with an empty reference or an empty hypothesis are excluded from
    # scoring and only counted.
    counter = 0
    for r, h in zip(refs, hyps):
        if not r or not h:
            counter += 1
        else:
            validated_refs.append(r)
            validated_hyps.append(h)

    # Nothing left to score: report it instead of passing empty lists to jiwer.
    if not validated_refs:
        print(f"Results for {model_key} -> no non-empty reference/hypothesis pairs to score")
        print(f"Skipped chunks: {counter}")
        continue

    # jiwer returns error rates as fractions; scale them so the printed "%"
    # values are real percentages.
    wer_percent = wer(validated_refs, validated_hyps) * 100
    cer_percent = cer(validated_refs, validated_hyps) * 100
    print(f"Results for {model_key} -> WER: {wer_percent:.2f}%, CER: {cer_percent:.2f}%")
    print(f"Skipped chunks: {counter}")