# 🎵 Hindi Audio → Text → NER (CRF) Pipeline

This notebook automatically:
- Transcribes an uploaded audio file (or one given by path); if no audio is provided, falls back to an existing transcription TXT file
- Loads your pre-trained CRF model (`hindi_ner_crf_model_epoch.joblib`)
- Runs NER on the text and displays tagged output and structured entities

Place your files in the same folder as this notebook:
- `hindi_ner_crf_model_epoch.joblib` (required)
- `final_transcription.txt` (optional; used only when no audio is uploaded or given by path)
- Or upload an audio file / enter its path in the widget shown below


In [None]:
# Install dependencies
%pip -q install pandas numpy joblib sklearn-crfsuite scikit-learn pydub faster-whisper ipywidgets

import os
from pathlib import Path
import re
import unicodedata
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import joblib

from pydub import AudioSegment
from pydub.silence import split_on_silence
from faster_whisper import WhisperModel

from IPython.display import display

# Ensure ffmpeg is available for pydub (Windows users: install ffmpeg and add to PATH)



In [None]:
# Simple Hindi spell checker
class HindiSpellChecker:
    """Tiny dictionary-driven corrector for whitespace-separated Hindi tokens.

    A token is considered Hindi when it contains at least one code point in
    the Devanagari block (U+0900-U+097F). Hindi tokens are stripped of every
    character outside that block, NFC-normalised, and then looked up in a
    fixed table of known misspellings. Non-Hindi tokens pass through as-is.
    """

    def __init__(self):
        self.hindi_common_words = self._load_hindi_common_words()
        self.common_corrections = self._load_common_corrections()

    def _load_common_corrections(self) -> Dict[str, str]:
        """Return the misspelling -> replacement table."""
        return {
            "भिहार": "बिहार",
            "भिहारी": "बिहारी",
            "इंडिया": "भारत",
        }

    def _load_hindi_common_words(self) -> set:
        """Return the built-in set of known-good Hindi words."""
        return {"भारत", "है", "हैं", "मैं", "आप", "हम", "दिल्ली", "मुंबई"}

    def is_hindi_word(self, word: str) -> bool:
        """Return True if any character of ``word`` is in the Devanagari block."""
        return any('\u0900' <= symbol <= '\u097F' for symbol in word)

    def clean_word(self, word: str) -> str:
        """Drop non-Devanagari characters, NFC-normalise and trim ``word``."""
        devanagari_only = re.sub(r'[^\u0900-\u097F\u0020]', '', word)
        return unicodedata.normalize('NFC', devanagari_only).strip()

    def check_word(self, word: str) -> Tuple[bool, str]:
        """Return ``(is_correct, replacement)`` for a single token.

        ``is_correct`` is False only when the cleaned token is a key of the
        corrections table; the replacement is then the table value. Empty and
        non-Hindi tokens are returned unchanged; any other Hindi token is
        returned in its cleaned form.
        """
        if word and self.is_hindi_word(word):
            stripped = self.clean_word(word)
            if stripped:
                if stripped in self.common_corrections:
                    return False, self.common_corrections[stripped]
                # Membership in hindi_common_words does not change the
                # outcome: known and unknown words are both accepted.
                return True, stripped
        return True, word

    def correct_text(self, text: str) -> str:
        """Correct every whitespace-separated token and re-join with single spaces."""
        return ' '.join(self.check_word(token)[1] for token in text.split())

# Module-level instance; HindiAudioTranscriber.transcribe and the UI callback
# below call spell_checker.correct_text on it.
spell_checker = HindiSpellChecker()


In [None]:
# CRF model loader and feature extraction
class HindiNERModel:
    """Lazy wrapper around a serialized CRF tagger for whitespace-tokenised text.

    The model file is only read on the first call to :meth:`load` (or
    :meth:`predict`); afterwards the loaded object is cached on ``self.model``.
    """

    def __init__(self, model_path: str):
        self.model_path = model_path
        self.model = None

    def load(self):
        """Load the model from ``model_path`` once and return it."""
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only point model_path at files you trust.
        if self.model is None:
            self.model = joblib.load(self.model_path)
        return self.model

    def word2features(self, sent, i):
        """Build the feature dict for token ``i`` of the token list ``sent``.

        Features cover the token itself plus its immediate left and right
        neighbours; ``BOS``/``EOS`` mark the first/last token.
        """
        # NOTE(review): the feature names/values presumably have to mirror the
        # ones used when the model was trained -- confirm before changing any.
        word = sent[i]
        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-1:]': word[-1:],
            'word[-2:]': word[-2:],
            'word.isdigit()': word.isdigit(),
            'word.len': len(word),
            'word.hasHindiChar': any('\u0900' <= ch <= '\u097F' for ch in word)
        }
        if i > 0:
            word1 = sent[i-1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.isdigit()': word1.isdigit(),
                '-1:word.len': len(word1),
            })
        else:
            features['BOS'] = True
        if i < len(sent)-1:
            word1 = sent[i+1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.isdigit()': word1.isdigit(),
                '+1:word.len': len(word1),
            })
        else:
            features['EOS'] = True
        return features

    def sent2features(self, sent):
        """Return the list of feature dicts for every token in ``sent``."""
        return [self.word2features(sent, i) for i in range(len(sent))]

    def predict(self, text: str):
        """Tag ``text`` and group the tags into entities.

        The text is split on whitespace and tagged as a single sequence.

        Returns:
            ``(preds, entities)`` where ``preds`` is the per-token tag
            sequence returned by the model and ``entities`` is a list of
            dicts with keys ``text`` (tokens joined by a space), ``label``
            (tag without its ``B-``/``I-`` prefix) and ``start``/``end``
            (inclusive token indices). Both are empty lists for blank input.
        """
        self.load()
        words = text.strip().split()
        if not words:
            return [], []
        feats = self.sent2features(words)
        preds = self.model.predict_single(feats)

        # Group BIO tags into entities.
        entities = []
        current = None
        for i, (w, t) in enumerate(zip(words, preds)):
            is_inside = t.startswith('I-')
            label = t[2:] if (is_inside or t.startswith('B-')) else None

            if is_inside and current is not None and label == current['label']:
                # Continuation of the entity that is currently open.
                current['text'] += ' ' + w
                current['end'] = i
                continue

            if current is not None:
                entities.append(current)
            # A "B-" tag opens a new entity. An "I-" tag with nothing matching
            # to continue is also treated as an opening tag, so the token is
            # not silently dropped. Any other tag is outside an entity.
            if label is not None:
                current = {'text': w, 'label': label, 'start': i, 'end': i}
            else:
                current = None
        if current is not None:
            entities.append(current)
        return preds, entities

# Prefer a model file in the working directory (the notebook instructions ask
# for it to sit next to the notebook); otherwise fall back to the Colab-style
# absolute path that was hard-coded before.
_MODEL_FILENAME = "hindi_ner_crf_model_epoch.joblib"
ner_model = HindiNERModel(
    _MODEL_FILENAME if Path(_MODEL_FILENAME).exists() else f"/content/{_MODEL_FILENAME}"
)


In [None]:
# Audio transcription helper
class HindiAudioTranscriber:
    """Split an audio file on silence and transcribe each piece as Hindi.

    The Whisper model is created lazily by :meth:`load_model` so that
    constructing the transcriber is cheap.
    """

    def __init__(self, model_size: str = "medium", enable_spell_check: bool = True):
        self.model_size = model_size
        self.enable_spell_check = enable_spell_check
        self.model = None

    def load_model(self):
        """Create the Whisper model on first use; later calls are no-ops."""
        if self.model is None:
            self.model = WhisperModel(self.model_size, device="auto", compute_type="auto")

    def split_audio(self, file_path: Path, min_silence_len=500, silence_thresh=-40, keep_silence=300):
        """Load ``file_path`` and return the chunks produced by ``split_on_silence``.

        ``silence_thresh`` is an offset added to the clip's own ``dBFS``
        value, not an absolute level.
        """
        audio = AudioSegment.from_file(file_path)
        return split_on_silence(
            audio,
            min_silence_len=min_silence_len,
            silence_thresh=audio.dBFS + silence_thresh,
            keep_silence=keep_silence
        )

    def transcribe_chunk(self, chunk) -> str:
        """Export ``chunk`` to a temporary WAV file and return its transcript.

        The temporary file is always removed, including when exporting or
        transcribing raises.
        """
        import os
        import tempfile

        self.load_model()

        # Reserve a file name, then close our handle right away: the file is
        # rewritten by export() and deleted below, and deleting a file that
        # is still open fails on Windows.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp.name
        tmp.close()
        try:
            chunk.export(tmp_path, format="wav")
            segments, _info = self.model.transcribe(tmp_path, language="hi")
            # Consume the segments before the file is removed; they may be
            # produced lazily while the audio is being decoded.
            return " ".join([seg.text for seg in segments])
        finally:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass

    def transcribe(self, file_path: Path) -> str:
        """Return the transcript of ``file_path``.

        Chunk transcripts are joined with single spaces and trimmed. When
        ``enable_spell_check`` is set and the transcript is non-empty it is
        passed through the module-level ``spell_checker``. Returns ``""``
        when splitting yields no chunks.
        """
        self.load_model()
        chunks = self.split_audio(file_path)
        if not chunks:
            return ""
        full_text = " ".join(self.transcribe_chunk(ch) for ch in chunks).strip()
        if self.enable_spell_check and full_text:
            return spell_checker.correct_text(full_text)
        return full_text

# Module-level transcriber built with the constructor defaults
# (model_size="medium", spell check enabled). No Whisper model is created
# here; that happens on the first load_model() call.
transcriber = HindiAudioTranscriber()


In [None]:
# Audio input UI: Upload or specify a path, then transcribe and run NER

import io
import os
from IPython.display import clear_output
import ipywidgets as widgets

# Transcript file, relative to the working directory. It is read as the
# fallback input and overwritten after every successful transcription.
transcription_txt = Path("final_transcription.txt")

# Widgets
uploader = widgets.FileUpload(
    accept='.wav,.mp3,.m4a,.flac,.ogg',
    multiple=False,
)
path_box = widgets.Text(
    placeholder='path/to/audio.wav or .mp3 ...',
    description='Audio path:',
)
run_btn = widgets.Button(
    description='Transcribe audio',
    button_style='primary',
    icon='play',
)
status_out = widgets.Output()

# Latest input text; rebound by handle_transcription and read by the NER cell.
text = ""

def _first_upload(value):
    """Return ``(filename, content_bytes)`` for the first uploaded file, or None.

    Handles both layouts of ``FileUpload.value``: a dict keyed by file name
    (ipywidgets 7) and a tuple of per-file dicts carrying a ``'name'`` entry
    (ipywidgets 8).
    """
    if not value:
        return None
    if isinstance(value, dict):
        fname, fileinfo = next(iter(value.items()))
    else:
        fileinfo = value[0]
        fname = fileinfo['name']
    # The content may be a memoryview rather than bytes; normalise it.
    return fname, bytes(fileinfo['content'])


def _print_preview(content, limit=400):
    """Print the first ``limit`` characters of ``content`` under a heading."""
    print("\n=== Transcribed / Input Text (first 400 chars) ===\n")
    print(content[:limit] + ("..." if len(content) > limit else ""))


def handle_transcription(_):
    """Button callback: pick an input source and store the text in global ``text``.

    Sources are tried in order: an uploaded file, the typed path (if it
    exists), and finally the existing ``final_transcription.txt``. Audio is
    transcribed, spell-checked and written back to
    ``final_transcription.txt``. All messages go to ``status_out``.
    """
    global text
    with status_out:
        clear_output(wait=True)
        chosen_audio_path = None
        typed_path = (path_box.value or "").strip()

        # 1) Uploaded file
        upload = _first_upload(uploader.value)
        if upload is not None:
            fname, data = upload
            # Keep only the base name so the file is always written into the
            # working directory, whatever the browser reported.
            chosen_audio_path = Path(Path(fname).name or "uploaded_audio")
            chosen_audio_path.write_bytes(data)
            print(f"📁 Saved uploaded audio to: {chosen_audio_path}")

        # 2) Path typed by user
        elif typed_path:
            if Path(typed_path).exists():
                chosen_audio_path = Path(typed_path)
                print(f"📝 Using provided path: {chosen_audio_path}")
            else:
                # Say so explicitly instead of silently switching to the fallback.
                print(f"⚠️ Audio path not found: {typed_path}")

        # 3) Fallback to existing transcription
        if chosen_audio_path is None:
            if not transcription_txt.exists():
                print("❌ No audio provided and final_transcription.txt not found.")
                print("➡️ Upload an audio file above or enter a valid audio path.")
                return
            print("✅ Found final_transcription.txt — using it as input")
            text_local = transcription_txt.read_text(encoding="utf-8").strip()
            if not text_local:
                print("⚠️ final_transcription.txt is empty.")
                return
            text = spell_checker.correct_text(text_local)
            _print_preview(text)
            return

        # If we have an audio path, transcribe it
        try:
            print(f"🎵 Transcribing audio: {chosen_audio_path}")
            text_local = transcriber.transcribe(chosen_audio_path)
            if not text_local:
                print("⚠️ Got empty transcription.")
                return
            # Optional spell check pass
            text = spell_checker.correct_text(text_local)
            # Save to final_transcription.txt for reproducibility
            transcription_txt.write_text(text, encoding="utf-8")
            print("💾 Saved transcription to final_transcription.txt")
            _print_preview(text)
        except Exception as e:
            # UI boundary: report the failure in the output area instead of
            # letting the exception escape the widget callback.
            print(f"❌ Transcription failed: {e}")

# Stack the controls vertically: heading, upload control, path box,
# trigger button, and the output area that receives status messages.
ui = widgets.VBox([
    widgets.HTML('<b>Provide audio input</b> (upload or enter a file path)'),
    uploader,
    path_box,
    run_btn,
    status_out,
])

# Hook up the click handler, then render the panel.
run_btn.on_click(handle_transcription)
display(ui)


VBox(children=(HTML(value='<b>Provide audio input</b> (upload or enter a file path)'), FileUpload(value={}, ac…

In [None]:
# Run NER and display results
if not text:
    print("Skipping NER — no text available.")
else:
    preds, entities = ner_model.predict(text)

    # Inline view: each whitespace token followed by its predicted tag.
    words = text.split()
    tagged = [f"{w} ({t})" for w, t in zip(words, preds)] if preds else []

    print("\n=== Tagged Text (first 200 tokens) ===\n")
    print(" ".join(tagged[:200]))

    # Tabular view: one row per grouped entity, then a per-label tally.
    if not entities:
        print("\n(No entities detected.)")
    else:
        print(f"\n=== Detected Entities: {len(entities)} ===\n")
        entity_df = pd.DataFrame(entities)
        display(entity_df)

        counts = entity_df['label'].value_counts()
        print("\n=== Entity Counts ===\n")
        display(pd.DataFrame(counts))



=== Tagged Text (first 200 tokens) ===

सुने (Out of Vocabulary) की (Out of Vocabulary) चिडिया (Out of Vocabulary) कहा (Out of Vocabulary) जाने (Out of Vocabulary) वाला (Out of Vocabulary) भारत (B-LOCATION) दुनिया (Out of Vocabulary) के (Out of Vocabulary) सबसे (Out of Vocabulary) प्राचिन (Out of Vocabulary) देसों (Out of Vocabulary) मैंसे (Out of Vocabulary) एक (Out of Vocabulary) है (Out of Vocabulary) भारत (B-LOCATION) का (Out of Vocabulary) इत्यास (Out of Vocabulary) बेहद (Out of Vocabulary) पूराना (Out of Vocabulary) है। (Out of Vocabulary) आर्कियलोजिकल (Out of Vocabulary) सर्वेज (Out of Vocabulary) से (Out of Vocabulary) पता (Out of Vocabulary) चलता (Out of Vocabulary) है (Out of Vocabulary) कि (Out of Vocabulary) भारत (B-LOCATION) में (Out of Vocabulary) इन्सानी (Out of Vocabulary) गतिविदी (Out of Vocabulary) दो (B-NUMEX) लाख (I-NUMEX) पचाजार (Out of Vocabulary) सालों (Out of Vocabulary) से (Out of Vocabulary) भी (Out of Vocabulary) ज़ादा (Out of Vocabulary) पूरानी (Out of Voca

Unnamed: 0,text,label,start,end
0,भारत,LOCATION,6,6
1,भारत,LOCATION,15,15
2,भारत,LOCATION,28,28
3,दो लाख,NUMEX,32,33
4,10,NUMEX,59,59
5,12,NUMEX,61,61
6,भारत,LOCATION,105,105
7,आर्केलोजिकल एग्सर्वेशन्स,ORGANIZATION,113,114
8,भारत,LOCATION,138,138



=== Entity Counts ===



Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
LOCATION,5
NUMEX,3
ORGANIZATION,1
