# DEPENDENCY INSTALLATION

In [None]:
!pip install -q transformers accelerate pydub librosa rapidfuzz huggingface_hub

print("‚úÖ Dependencies installed.")

# IMPORTS

In [None]:
import os
import re
import shutil
import zipfile
from pathlib import Path
from typing import Dict, List, Tuple

import librosa
import numpy as np
import pandas as pd
import torch
from huggingface_hub import HfApi, login
from pydub import AudioSegment
from rapidfuzz import fuzz
from tqdm.auto import tqdm
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Visible confirmation in the cell output that every import above succeeded.
print("✅ Imports ready.")

# CONFIGURATION

- All paths, model IDs, and processing parameters live here.

In [None]:
# ── File Range ───────────────────────────────────────────────────────────────
# Files are named train_001.wav … train_124.wav. Set the range to process.
START_FILE = 1    # First file index (inclusive)
NUM_FILES  = 45   # Number of files to process from START_FILE

# ── Dataset Paths ────────────────────────────────────────────────────────────
AUDIO_BASE_PATH = (
    "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition"
    "/transcription/transcription/train/audio"
)
TEXT_BASE_PATH = (
    "/kaggle/input/dl-sprint-4-0-bengali-long-form-speech-recognition"
    "/transcription/transcription/train/annotation"
)

# ── File Naming Patterns ─────────────────────────────────────────────────────
# Both patterns are filled with the integer file index, zero-padded to 3 digits.
AUDIO_FILE_PATTERN = "train_{:03d}.wav"
TEXT_FILE_PATTERN  = "train_{:03d}.txt"

# ── Output ───────────────────────────────────────────────────────────────────
OUTPUT_FOLDER = "./upload_all_25s_1"
ZIP_FILE_NAME = "upload_all_25s_1.zip"

# ── HuggingFace Hub ──────────────────────────────────────────────────────────
# The access token is a secret: it is read from the HF_TOKEN environment
# variable rather than pasted into the notebook. When it is empty the upload
# cell prints a notice and skips the upload.
HF_TOKEN          = os.environ.get("HF_TOKEN", "")
HF_REPO_ID        = "bitwisemind/preprocess_dataset_25s_chunk"
HF_REPO_TYPE      = "dataset"
HF_COMMIT_MESSAGE = "Upload processed Bengali audio chunks"

# ── Chunking Parameters ──────────────────────────────────────────────────────
CHUNK_MAX_DURATION_S = 25.0   # Upper bound on the length of one chunk
MIN_CHUNK_DURATION_S = 0.5    # Chunks shorter than this are dropped

# ── ASR Model ────────────────────────────────────────────────────────────────
MODEL_ID = "bengaliAI/tugstugi_bengaliai-regional-asr_whisper-medium"
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"

# ── Clean and recreate output directory ──────────────────────────────────────
# Start from an empty folder so files from an earlier run cannot leak into
# the archive that is built from this directory later on.
if os.path.exists(OUTPUT_FOLDER):
    shutil.rmtree(OUTPUT_FOLDER)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

print("✅ Configuration loaded.")
print(f"   Files        : {AUDIO_FILE_PATTERN.format(START_FILE)} → "
      f"{AUDIO_FILE_PATTERN.format(START_FILE + NUM_FILES - 1)}")
print(f"   Chunk size   : {CHUNK_MAX_DURATION_S}s  |  Device : {DEVICE}")
print(f"   Output       : {OUTPUT_FOLDER}")


# MODEL LOADING

- Loads the BengaliAI Whisper model fine-tuned for Bengali ASR.

In [None]:
# Load the processor and the Whisper checkpoint named by MODEL_ID, move the
# model to DEVICE and put it in evaluation mode.
print(f"📥 Loading model : {MODEL_ID}")

processor = WhisperProcessor.from_pretrained(MODEL_ID)

# `.to()` and `.eval()` both return the module itself, so the calls chain.
model = (
    WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
    .to(DEVICE)
    .eval()
)

print(f"✅ Model ready on {DEVICE}.")

# AUDIO CHUNKING & TRANSCRIPTION UTILITIES

In [None]:
def transcribe_chunk(audio_array: np.ndarray, sr: int = 16000) -> str:
    """Return the model's transcription of one audio clip.

    Uses the module-level ``processor``, ``model`` and ``DEVICE``. No
    language or task is forced at generation time; per the original author's
    note the checkpoint is already fine-tuned for Bengali.

    Args:
        audio_array: Waveform samples of a single chunk.
        sr: Sampling rate of ``audio_array``, forwarded to the processor.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    encoded = processor(audio_array, sampling_rate=sr, return_tensors="pt")
    features = encoded.input_features.to(DEVICE)

    # Inference only: no gradients are needed.
    # NOTE(review): 444 is presumably chosen to stay below the decoder's
    # maximum sequence length — confirm against the model config.
    with torch.no_grad():
        token_ids = model.generate(features, max_new_tokens=444)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]


def chunk_and_transcribe(
    audio_path: str,
    max_duration: float = 25.0,
    output_dir: str = "./bengali_chunks",
) -> List[Dict]:
    """Cut one audio file into fixed-size windows, transcribe and export them.

    The file is decoded twice: librosa provides the 16 kHz sample array that
    is fed to ``transcribe_chunk``, pydub provides the segment that the WAV
    files are sliced from. Windows are consecutive and non-overlapping, each
    at most ``max_duration`` seconds long. A window is dropped when it is
    shorter than ``MIN_CHUNK_DURATION_S`` or when its transcription is blank.

    Args:
        audio_path: Path of the audio file to split.
        max_duration: Maximum window length in seconds.
        output_dir: Directory that receives the chunk WAV files; created if
            it does not exist.

    Returns:
        One dict per kept window with the keys ``chunk_id``, ``audio_path``,
        ``text``, ``start``, ``end`` and ``duration`` (times in seconds,
        rounded to 3 decimals). ``chunk_id`` is the position of the window in
        the file, so the ids of dropped windows are not reused.
    """
    os.makedirs(output_dir, exist_ok=True)
    stem = Path(audio_path).stem

    source_seg = AudioSegment.from_file(audio_path)      # used for export
    samples, rate = librosa.load(audio_path, sr=16000)   # used for ASR

    window = int(max_duration * rate)
    n_samples = len(samples)
    n_windows = (n_samples + window - 1) // window       # ceiling division

    records = []
    progress = tqdm(
        enumerate(range(0, n_samples, window)),
        desc=f"  {stem}",
        total=n_windows,
        leave=False,
    )

    for chunk_id, lo in progress:
        hi = min(lo + window, n_samples)
        piece = samples[lo:hi]
        seconds = len(piece) / rate

        # Too short to be worth transcribing (typically the file's tail).
        if seconds < MIN_CHUNK_DURATION_S:
            continue

        text = transcribe_chunk(piece, rate)

        # Nothing recognised in this window.
        if not text.strip():
            continue

        # pydub segments are indexed in milliseconds.
        lo_ms = int(lo / rate * 1000)
        hi_ms = int(hi / rate * 1000)
        out_path = os.path.join(output_dir, f"{stem}_{chunk_id:03d}.wav")
        source_seg[lo_ms:hi_ms].export(out_path, format="wav")

        records.append({
            "chunk_id":   chunk_id,
            "audio_path": out_path,
            "text":       text,
            "start":      round(lo / rate, 3),
            "end":        round(hi / rate, 3),
            "duration":   round(seconds, 3),
        })

    return records


# Visible confirmation in the cell output that the helpers above are defined.
print("✅ Chunking and transcription utilities ready.")

# PROCESS ALL FILES

- Iterates over the configured file range, chunks and transcribes each audio file, and accumulates all chunks into a single DataFrame.

In [None]:
# Manifest schema, in the order the chunk records carry their keys. Passing
# it to the DataFrame constructor keeps the columns (and the CSV header)
# present even when not a single chunk was produced.
MANIFEST_COLUMNS = [
    "chunk_id", "audio_path", "text", "start", "end", "duration",
    "file_num", "text_path",
]

all_chunks = []
end_file   = START_FILE + NUM_FILES   # exclusive upper bound

print(f"🚀 Processing files {START_FILE} → {end_file - 1}...\n")

for file_num in range(START_FILE, end_file):
    audio_name = AUDIO_FILE_PATTERN.format(file_num)
    text_name  = TEXT_FILE_PATTERN.format(file_num)
    audio_path = os.path.join(AUDIO_BASE_PATH, audio_name)
    text_path  = os.path.join(TEXT_BASE_PATH,  text_name)

    # A file is only useful when both the recording and its annotation exist.
    if not os.path.exists(audio_path):
        print(f"  ⚠️  Audio not found: {audio_name} — skipped.")
        continue
    if not os.path.exists(text_path):
        print(f"  ⚠️  Annotation not found: {text_name} — skipped.")
        continue

    print(f"[{file_num}] {audio_name}")
    chunks = chunk_and_transcribe(audio_path, max_duration=CHUNK_MAX_DURATION_S,
                                  output_dir=OUTPUT_FOLDER)

    # Tag every chunk with its source file so the alignment step can group
    # chunks per file and find the matching annotation.
    for chunk in chunks:
        chunk["file_num"]  = file_num
        chunk["text_path"] = text_path

    all_chunks.extend(chunks)
    print(f"      → {len(chunks)} chunks extracted.")

df = pd.DataFrame(all_chunks, columns=MANIFEST_COLUMNS)

if df.empty:
    print("\n  ⚠️  No chunks were produced — check the file range and paths.")

# Save a full manifest (all metadata) before alignment
manifest_path = os.path.join(OUTPUT_FOLDER, "chunks_manifest.csv")
df.to_csv(manifest_path, index=False, encoding="utf-8-sig")

print(f"\n✅ Total chunks : {len(df)}")
print(f"   Manifest saved → {manifest_path}")
df.head(10)

# GROUND-TRUTH ALIGNMENT

- Aligns each ASR-transcribed chunk to the corresponding segment of the ground-truth annotation using sequential fuzzy matching.

In [None]:
# ── Text Cleaning Helper ─────────────────────────────────────────────────────

# Punctuation removed before fuzzy comparison: the Bengali danda plus common
# ASCII marks. The danda is written as the escape \u0964 so the pattern
# cannot be corrupted by an encoding round-trip of this file. Compiled once
# because the helper runs for every chunk.
_PUNCTUATION_RE = re.compile(r"[\u0964,\-.?!:;\"'()]")


def clean_text(text: str) -> str:
    """Strip punctuation so texts can be compared on their words alone.

    Args:
        text: Raw ASR output or ground-truth annotation text.

    Returns:
        ``text`` with every danda, comma, hyphen, full stop, question mark,
        exclamation mark, colon, semicolon, quote and parenthesis removed,
        and leading/trailing whitespace trimmed. Inner whitespace is kept.
    """
    return _PUNCTUATION_RE.sub("", text).strip()


# ── Alignment Function ───────────────────────────────────────────────────────

def align_chunks_sequential(file_df: pd.DataFrame, corpus_path: str) -> pd.DataFrame:
    """Attach to every ASR chunk the ground-truth span it most resembles.

    The annotation is cleaned with ``clean_text`` and split into words. A
    word cursor walks through it in chunk order. For each chunk, candidate
    spans are tried at start positions from 5 words before to 9 words after
    the cursor (clamped at 0). Span lengths run from 3 fewer to 4 more words
    than the cleaned ASR text, but never fewer than 5 words, and are clamped
    at the end of the corpus. The candidate with the highest
    ``fuzz.ratio`` against the cleaned ASR text wins; on equal scores the
    one tried first is kept. The cursor then moves to the end of the winner.

    A chunk gets an empty string when its cleaned text has no words or the
    cursor has already reached the end of the corpus.

    Args:
        file_df: Chunks of a single audio file, in reading order, with a
            ``text`` column holding the ASR output.
        corpus_path: Path of the UTF-8 annotation file for that audio file.

    Returns:
        A copy of ``file_df`` with an added ``gt`` column holding the
        aligned (punctuation-free) ground-truth text.
    """
    with open(corpus_path, "r", encoding="utf-8") as handle:
        reference_words = clean_text(handle.read().strip()).split()

    n_ref = len(reference_words)
    cursor = 0          # index of the next unconsumed reference word
    matched = []

    for _, chunk_row in file_df.iterrows():
        hypothesis = clean_text(chunk_row["text"])
        n_hyp = len(hypothesis.split())

        if n_hyp == 0 or cursor >= n_ref:
            matched.append("")
            continue

        # Fallback when no candidate scores above zero: take as many words
        # as the ASR text has, starting right at the cursor.
        top_score = 0
        span_lo, span_hi = cursor, cursor + n_hyp

        for shift in range(-5, 10):
            lo = max(0, cursor + shift)
            for delta in range(-3, 5):
                hi = min(lo + max(5, n_hyp + delta), n_ref)
                if lo >= hi:
                    continue
                score = fuzz.ratio(hypothesis, " ".join(reference_words[lo:hi]))
                if score > top_score:
                    top_score, span_lo, span_hi = score, lo, hi

        span_hi = min(span_hi, n_ref)
        matched.append(" ".join(reference_words[span_lo:span_hi]))
        cursor = span_hi    # continue after the matched span

    result = file_df.copy()
    result["gt"] = matched
    return result


# ── Run Alignment ────────────────────────────────────────────────────────────

print("🔍 Aligning chunks with ground-truth annotations...\n")

aligned_dfs = []

# A manifest built from zero records may have no "file_num" column at all;
# in that case there is nothing to align and the loop is skipped instead of
# failing on the column lookup.
files_to_align = range(START_FILE, end_file) if "file_num" in df.columns else range(0)

for file_num in tqdm(files_to_align, desc="Aligning files"):
    # Boolean indexing already yields a new frame, and the alignment
    # function copies its input before adding the 'gt' column.
    file_chunks = df[df["file_num"] == file_num]
    if file_chunks.empty:
        continue

    text_path = file_chunks.iloc[0]["text_path"]
    if not os.path.exists(text_path):
        print(f"  ⚠️  Annotation missing for file {file_num} — skipped.")
        continue

    aligned_dfs.append(align_chunks_sequential(file_chunks, text_path))

# Combine, filter empty GT rows, and retain only the columns needed for training
if aligned_dfs:
    combined   = pd.concat(aligned_dfs, ignore_index=True)
    df_aligned = combined.loc[combined["gt"].str.len() > 0, ["audio_path", "gt"]]
else:
    # pd.concat raises ValueError on an empty list, so build the empty
    # result explicitly and keep the expected columns.
    print("  ⚠️  No chunks could be aligned — writing an empty table.")
    df_aligned = pd.DataFrame(columns=["audio_path", "gt"])

aligned_path = os.path.join(OUTPUT_FOLDER, "aligned_chunks.csv")
df_aligned.to_csv(aligned_path, index=False, encoding="utf-8-sig")

print(f"✅ Aligned chunks : {len(df_aligned)}")
print(f"   Saved → {aligned_path}")
df_aligned.head(10)

# ZIP PACKAGING

- Compresses the entire output folder (WAVs + CSVs) into a single ZIP archive ready for upload.

In [None]:
print(f"📦 Packaging output folder → {ZIP_FILE_NAME}")

# Archive names are taken relative to the parent of OUTPUT_FOLDER so every
# entry keeps the folder name as its first path component. The path is
# normalised first: with a trailing separator os.path.dirname would return
# the folder itself and that leading component would be lost.
archive_root = os.path.dirname(os.path.normpath(OUTPUT_FOLDER)) or os.curdir

with zipfile.ZipFile(ZIP_FILE_NAME, "w", zipfile.ZIP_DEFLATED) as zf:
    for folder, _, names in os.walk(OUTPUT_FOLDER):
        for name in names:
            full_path = os.path.join(folder, name)
            zf.write(full_path, os.path.relpath(full_path, archive_root))

zip_size_mb = os.path.getsize(ZIP_FILE_NAME) / (1024 ** 2)
print(f"✅ ZIP ready : {ZIP_FILE_NAME}  ({zip_size_mb:.2f} MB)")

# HUGGINGFACE HUB UPLOAD

- Authenticates and uploads the ZIP to the configured dataset repo.

In [None]:
# Upload the ZIP archive to the configured Hub repository. Without a token
# the step is skipped with a notice instead of failing.
if not HF_TOKEN:
    print("⚠️  HF_TOKEN not set — upload skipped.")
else:
    login(token=HF_TOKEN, add_to_git_credential=False)

    print(f"📤 Uploading {ZIP_FILE_NAME} → {HF_REPO_ID}")
    HfApi().upload_file(
        path_or_fileobj = ZIP_FILE_NAME,
        path_in_repo    = ZIP_FILE_NAME,
        repo_id         = HF_REPO_ID,
        repo_type       = HF_REPO_TYPE,
        commit_message  = HF_COMMIT_MESSAGE,
    )
    print("✅ Upload complete.")

    # Dataset and Space pages live under a type prefix on the Hub; model
    # repositories have none. Derive it from HF_REPO_TYPE so the link stays
    # correct if the repo type is changed in the configuration.
    url_prefix = {"dataset": "datasets/", "space": "spaces/"}.get(HF_REPO_TYPE, "")
    print(f"   View at : https://huggingface.co/{url_prefix}{HF_REPO_ID}")

# SUMMARY

In [None]:
# NUM_FILES is only the size of the requested range; files skipped for a
# missing recording/annotation, or that yielded no chunk, never reach the
# manifest. Report the number of files that actually contributed chunks.
files_with_chunks = df["file_num"].nunique() if "file_num" in df.columns else 0

rule = "=" * 60
print(rule)
print("📊  PIPELINE COMPLETE")
print(rule)
print(f"   Files processed  : {files_with_chunks} / {NUM_FILES} requested")
print(f"   Total chunks     : {len(df)}")
print(f"   Aligned chunks   : {len(df_aligned)}")
print(f"   Output folder    : {OUTPUT_FOLDER}")
print(f"   ZIP archive      : {ZIP_FILE_NAME}  ({zip_size_mb:.2f} MB)")
print(f"   HuggingFace repo : {HF_REPO_ID}")
print(rule)